gallery-dl/gallery_dl/extractor/poipiku.py

# -*- coding: utf-8 -*-

# Copyright 2022-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://poipiku.com/"""

from .common import Extractor, Message
from .. import text

BASE_PATTERN = r"(?:https?://)?poipiku\.com"


class PoipikuExtractor(Extractor):
    """Base class for poipiku extractors

    Subclasses provide posts(), which yields post URLs or root-relative
    paths whose last two segments are '<user_id>/<post_id>.html'.
    """
    category = "poipiku"
    root = "https://poipiku.com"
    directory_fmt = ("{category}", "{user_id} {user_name}")
    filename_fmt = "{post_id}_{num}.{extension}"
    archive_fmt = "{post_id}_{num}"
    request_interval = (0.5, 1.5)

    def items(self):
        """Yield one Directory message per post, followed by a Url
        message for each of its images"""
        password = self.config("password", "")

        for post_url in self.posts():
            # IDs are taken from the path before it gets made absolute
            parts = post_url.split("/")
            if post_url.startswith("/"):
                post_url = self.root + post_url
            page = self.request(post_url).text
            extr = text.extract_from(page)

            # 'extr' is called in the order the dict entries are written;
            # do not reorder the entries that call it
            post = {
                "post_category": extr("<title>[", "]"),
                "count"      : extr("(", " "),
                "post_id"    : parts[-1].partition(".")[0],
                "user_id"    : parts[-2],
                "user_name"  : text.unescape(extr(
                    '<h2 class="UserInfoUserName">', '</').rpartition(">")[2]),
                "description": text.unescape(extr(
                    'class="IllustItemDesc" >', '</h1>')),
                # NOTE(review): presumably applied to the file downloads
                # of this post - confirm against the downloader
                "_http_headers": {"Referer": post_url},
            }

            yield Message.Directory, post
            post["num"] = 0

            # images embedded directly in the post page
            while True:
                thumb = extr('class="IllustItemThumbImg" src="', '"')
                if not thumb:
                    break
                if thumb.startswith(("//img.poipiku.com/img/", "/img/")):
                    # static site image, not a file of this post
                    continue
                post["num"] += 1
                url = self._original_url(thumb)
                yield Message.Url, url, text.nameext_from_url(url, post)

            # without a ' show all(+N' marker there is nothing to append
            if not extr(' show all(+', '<'):
                continue

            page = self._appended_html(post, post_url, password)
            for thumb in text.extract_iter(
                    page, 'class="IllustItemThumbImg" src="', '"'):
                post["num"] += 1
                url = self._original_url(thumb)
                yield Message.Url, url, text.nameext_from_url(url, post)

    @staticmethod
    def _original_url(thumb):
        """Return the 'img-org' file URL for a thumbnail 'src' value"""
        # the last 8 characters are dropped; presumably a thumbnail
        # suffix such as '_640.jpg' - TODO confirm
        return text.ensure_http_scheme(thumb[:-8]).replace(
            "//img.", "//img-org.", 1)

    def _appended_html(self, post, post_url, password):
        """Request the 'show all' part of a post and return its HTML

        A negative 'result_num' in the JSON response is reported as a
        warning containing the returned text.
        """
        url = self.root + "/f/ShowAppendFileF.jsp"
        headers = {
            "Accept" : "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "Origin" : self.root,
            "Referer": post_url,
        }
        data = {
            "UID": post["user_id"],
            "IID": post["post_id"],
            "PAS": password,
            "MD" : "0",
            "TWF": "-1",
        }
        resp = self.request(
            url, method="POST", headers=headers, data=data).json()

        page = resp["html"]
        if (resp.get("result_num") or 0) < 0:
            self.log.warning("'%s'", page.replace("<br/>", " "))
        return page


class PoipikuUserExtractor(PoipikuExtractor):
    """Extractor for all posts listed on a poipiku user's pages"""
    subcategory = "user"
    pattern = (BASE_PATTERN + r"/(?:IllustListPcV\.jsp\?PG=(\d+)&ID=)?"
               r"(\d+)/?(?:$|[?&#])")
    test = (
        ("https://poipiku.com/25049/", {
            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
                       r"/\d+_\w+\.(jpe?g|png)$",
            "range": "1-10",
            "count": 10,
        }),
        ("https://poipiku.com/IllustListPcV.jsp?PG=1&ID=25049&KWD=")
    )

    def __init__(self, match):
        PoipikuExtractor.__init__(self, match)
        # group 1: optional page number, group 2: user ID
        page, user_id = match.groups()
        self._page = page
        self.user_id = user_id

    def posts(self):
        """Yield the post paths found on the user's listing pages

        Starts at the page number from the URL (0 if there is none) and
        requests consecutive pages until one lists fewer than 48 posts.
        """
        endpoint = self.root + "/IllustListPcV.jsp"
        query = {
            "PG" : text.parse_int(self._page, 0),
            "ID" : self.user_id,
            "KWD": "",
        }

        while True:
            html = self.request(endpoint, params=query).text

            found = 0
            hrefs = text.extract_iter(
                html, 'class="IllustInfo" href="', '"')
            for found, href in enumerate(hrefs, 1):
                yield href

            # a page with fewer than 48 entries ends the pagination
            if found < 48:
                return
            query["PG"] += 1


class PoipikuPostExtractor(PoipikuExtractor):
    """Extractor for the images of a single poipiku post"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"/(\d+)/(\d+)"
    test = (
        ("https://poipiku.com/25049/5864576.html", {
            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
                       r"/005864576_EWN1Y65gQ\.png$",
            "keyword": {
                "count": "1",
                "description": "",
                "extension": "png",
                "filename": "005864576_EWN1Y65gQ",
                "num": 1,
                "post_category": "DOODLE",
                "post_id": "5864576",
                "user_id": "25049",
                "user_name": "ユキウサギ",
            },
        }),
        ("https://poipiku.com/2166245/6411749.html", {
            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/002166245"
                       r"/006411749_\w+\.jpeg$",
            "count": 4,
            "keyword": {
                "count": "4",
                "description": "絵茶の産物ネタバレあるやつ",
                "num": int,
                "post_category": "SPOILER",
                "post_id": "6411749",
                "user_id": "2166245",
                "user_name": "wadahito",
            },
        }),
        # different warning button style
        ("https://poipiku.com/3572553/5776587.html", {
            "pattern": r"https://img-org\.poipiku.com/user_img\d+/003572553"
                       r"/005776587_(\d+_)?\w+\.jpeg$",
            "count": 3,
            "keyword": {
                "count": "3",
                "description": "ORANGE OASISボスネタバレ<br />曲も大好き<br />"
                               "2枚目以降はほとんど見えなかった1枚目背景"
                               "のヒエログリフ小ネタです𓀀",
                "num": int,
                "post_category": "SPOILER",
                "post_id": "5776587",
                "user_id": "3572553",
                "user_name": "nagakun",
            },
        }),
    )

    def __init__(self, match):
        PoipikuExtractor.__init__(self, match)
        # group 1: user ID, group 2: post ID
        user_id, post_id = match.groups()
        self.user_id = user_id
        self.post_id = post_id

    def posts(self):
        """Return a 1-tuple holding the root-relative path of the post"""
        path = "/%s/%s.html" % (self.user_id, self.post_id)
        return (path,)
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`# -*- coding: utf-8 -*-`

[poipiku] warn about login requirements 2 years ago			`# Copyright 2022-2023 Mike Fährmann`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://poipiku.com/"""`

			`from .common import Extractor, Message`
			`from .. import text`

			`BASE_PATTERN = r"(?:https?://)?poipiku\.com"`


			`class PoipikuExtractor(Extractor):`
			`"""Base class for poipiku extractors"""`
			`category = "poipiku"`
			`root = "https://poipiku.com"`
			`directory_fmt = ("{category}", "{user_id} {user_name}")`
			`filename_fmt = "{post_id}_{num}.{extension}"`
			`archive_fmt = "{post_id}_{num}"`
			`request_interval = (0.5, 1.5)`

			`def items(self):`
[poipiku] add simple password support (#1602) 2 years ago			`password = self.config("password", "")`

[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`for post_url in self.posts():`
			`parts = post_url.split("/")`
			`if post_url[0] == "/":`
			`post_url = self.root + post_url`
			`page = self.request(post_url).text`
			`extr = text.extract_from(page)`

			`post = {`
			`"post_category": extr("<title>[", "]"),`
			`"count" : extr("(", " "),`
			`"post_id" : parts[-1].partition(".")[0],`
			`"user_id" : parts[-2],`
			`"user_name" : text.unescape(extr(`
			`'<h2 class="UserInfoUserName">', '</').rpartition(">")[2]),`
			`"description": text.unescape(extr(`
[poipiku] extract full 'descriptions' (#4066) don't cut it off after the first line 1 year ago			`'class="IllustItemDesc" >', '</h1>')),`
[poipiku] use 'img-org.poipiku.com' as image domain (#2796) 2 years ago			`"_http_headers": {"Referer": post_url},`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`}`

			`yield Message.Directory, post`
			`post["num"] = 0`

			`while True:`
			`thumb = extr('class="IllustItemThumbImg" src="', '"')`
			`if not thumb:`
			`break`
[poipiku] update filter for static images (#2796) 2 years ago			`elif thumb.startswith(("//img.poipiku.com/img/", "/img/")):`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`continue`
			`post["num"] += 1`
[poipiku] use 'img-org.poipiku.com' as image domain (#2796) 2 years ago			`url = text.ensure_http_scheme(thumb[:-8]).replace(`
			`"//img.", "//img-org.", 1)`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`yield Message.Url, url, text.nameext_from_url(url, post)`

[poipiku] fix extraction for a different warning button style 2 years ago			`if not extr(' show all(+', '<'):`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`continue`

			`url = self.root + "/f/ShowAppendFileF.jsp"`
			`headers = {`
			`"Accept" : "application/json, text/javascript, */*; q=0.01",`
			`"X-Requested-With": "XMLHttpRequest",`
			`"Origin" : self.root,`
			`"Referer": post_url,`
			`}`
			`data = {`
			`"UID": post["user_id"],`
			`"IID": post["post_id"],`
[poipiku] add simple password support (#1602) 2 years ago			`"PAS": password,`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`"MD" : "0",`
			`"TWF": "-1",`
			`}`
[poipiku] improve error detection (#4206) 1 year ago			`resp = self.request(`
			`url, method="POST", headers=headers, data=data).json()`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago
[poipiku] improve error detection (#4206) 1 year ago			`page = resp["html"]`
			`if (resp.get("result_num") or 0) < 0:`
			`self.log.warning("'%s'", page.replace("<br/>", " "))`
[poipiku] warn about login requirements 2 years ago
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`for thumb in text.extract_iter(`
			`page, 'class="IllustItemThumbImg" src="', '"'):`
			`post["num"] += 1`
[poipiku] use 'img-org.poipiku.com' as image domain (#2796) 2 years ago			`url = text.ensure_http_scheme(thumb[:-8]).replace(`
			`"//img.", "//img-org.", 1)`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`yield Message.Url, url, text.nameext_from_url(url, post)`


			`class PoipikuUserExtractor(PoipikuExtractor):`
			`"""Extractor for posts from a poipiku user"""`
			`subcategory = "user"`
			`pattern = (BASE_PATTERN + r"/(?:IllustListPcV\.jsp\?PG=(\d+)&ID=)?"`
			`r"(\d+)/?(?:$|[?&#])")`
			`test = (`
			`("https://poipiku.com/25049/", {`
[poipiku] use 'img-org.poipiku.com' as image domain (#2796) 2 years ago			`"pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`r"/\d+_\w+\.(jpe?g|png)$",`
			`"range": "1-10",`
			`"count": 10,`
			`}),`
			`("https://poipiku.com/IllustListPcV.jsp?PG=1&ID=25049&KWD=")`
			`)`

			`def __init__(self, match):`
			`PoipikuExtractor.__init__(self, match)`
			`self._page, self.user_id = match.groups()`

			`def posts(self):`
			`url = self.root + "/IllustListPcV.jsp"`
			`params = {`
			`"PG" : text.parse_int(self._page, 0),`
			`"ID" : self.user_id,`
			`"KWD": "",`
			`}`

			`while True:`
			`page = self.request(url, params=params).text`

			`cnt = 0`
			`for path in text.extract_iter(`
			`page, 'class="IllustInfo" href="', '"'):`
			`yield path`
			`cnt += 1`

			`if cnt < 48:`
			`return`
			`params["PG"] += 1`


			`class PoipikuPostExtractor(PoipikuExtractor):`
			`"""Extractor for a poipiku post"""`
			`subcategory = "post"`
			`pattern = BASE_PATTERN + r"/(\d+)/(\d+)"`
			`test = (`
			`("https://poipiku.com/25049/5864576.html", {`
[poipiku] use 'img-org.poipiku.com' as image domain (#2796) 2 years ago			`"pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`r"/005864576_EWN1Y65gQ\.png$",`
			`"keyword": {`
			`"count": "1",`
			`"description": "",`
			`"extension": "png",`
			`"filename": "005864576_EWN1Y65gQ",`
			`"num": 1,`
			`"post_category": "DOODLE",`
			`"post_id": "5864576",`
			`"user_id": "25049",`
			`"user_name": "ユキウサギ",`
			`},`
			`}),`
			`("https://poipiku.com/2166245/6411749.html", {`
[poipiku] use 'img-org.poipiku.com' as image domain (#2796) 2 years ago			`"pattern": r"https://img-org\.poipiku\.com/user_img\d+/002166245"`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`r"/006411749_\w+\.jpeg$",`
			`"count": 4,`
			`"keyword": {`
			`"count": "4",`
			`"description": "絵茶の産物ネタバレあるやつ",`
			`"num": int,`
			`"post_category": "SPOILER",`
			`"post_id": "6411749",`
			`"user_id": "2166245",`
			`"user_name": "wadahito",`
			`},`
			`}),`
[poipiku] fix extraction for a different warning button style 2 years ago			`# different warning button style`
			`("https://poipiku.com/3572553/5776587.html", {`
			`"pattern": r"https://img-org\.poipiku.com/user_img\d+/003572553"`
			`r"/005776587_(\d+_)?\w+\.jpeg$",`
			`"count": 3,`
			`"keyword": {`
			`"count": "3",`
[poipiku] extract full 'descriptions' (#4066) don't cut it off after the first line 1 year ago			`"description": "ORANGE OASISボスネタバレ<br />曲も大好き<br />"`
			`"2枚目以降はほとんど見えなかった1枚目背景"`
			`"のヒエログリフ小ネタです𓀀",`
[poipiku] fix extraction for a different warning button style 2 years ago			`"num": int,`
			`"post_category": "SPOILER",`
			`"post_id": "5776587",`
			`"user_id": "3572553",`
			`"user_name": "nagakun",`
			`},`
			`}),`
[poipiku] add 'user' and 'post' extractors (#1602) 2 years ago			`)`

			`def __init__(self, match):`
			`PoipikuExtractor.__init__(self, match)`
			`self.user_id, self.post_id = match.groups()`

			`def posts(self):`
			`return ("/{}/{}.html".format(self.user_id, self.post_id),)`