# gallery-dl/gallery_dl/extractor/piczel.py

# -*- coding: utf-8 -*-

# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://piczel.tv/"""

from .common import Extractor, Message
from .. import text
import json


class PiczelExtractor(Extractor):
    """Shared functionality for all piczel extractors"""
    category = "piczel"
    directory_fmt = ("{category}", "{user[username]}")
    filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}"
    archive_fmt = "{id}_{num}"
    root = "https://piczel.tv"

    def items(self):
        """Yield a version message, then directory and URL messages

        Each post returned by posts() gets its 'tags' reduced to a list
        of non-empty titles and a parsed 'date' added, before one
        Message.Url per image is emitted for it.
        """
        date_fmt = "%Y-%m-%dT%H:%M:%S.%f%z"
        yield Message.Version, 1

        for post in self.posts():
            titles = (tag["title"] for tag in post["tags"])
            post["tags"] = [title for title in titles if title]
            post["date"] = text.parse_datetime(post["created_at"], date_fmt)

            if not post["multi"]:
                # single-image post: its only file is always number 0
                yield Message.Directory, post
                post["num"] = 0
                file_url = post["image"]["url"]
                yield Message.Url, file_url, text.nameext_from_url(
                    file_url, post)
                continue

            # multi-image post: take 'images' out of the metadata
            # before it is sent along with the directory message
            entries = post.pop("images")
            yield Message.Directory, post
            for index, entry in enumerate(entries):
                post["num"] = index
                # keep the post's own 'id' from being overwritten below
                entry.pop("id", None)
                post.update(entry)
                file_url = post["image"]["url"]
                yield Message.Url, file_url, text.nameext_from_url(
                    file_url, post)

    def posts(self):
        """Return an iterable of post objects; overridden by subclasses"""

    def _pagination(self, url, folder_id=None):
        """Yield posts from 'url', requesting follow-up pages via 'from_id'

        With a truthy 'folder_id', only posts whose own 'folder_id' is
        equal to it are yielded.
        """
        query = {
            "from_id"  : None,
            "folder_id": folder_id,
        }

        while True:
            batch = self.request(url, params=query).json()
            if not batch:
                # an empty response marks the end of the gallery
                break
            # the next request continues after the last post of this batch
            query["from_id"] = batch[-1]["id"]

            for post in batch:
                if folder_id and folder_id != post["folder_id"]:
                    continue
                yield post


class PiczelUserExtractor(PiczelExtractor):
    """Extractor for every image in a piczel user's gallery"""
    subcategory = "user"
    pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?&#]+)/?$"
    test = ("https://piczel.tv/gallery/Bikupan", {
        "range": "1-100",
        "count": ">= 100",
    })

    def __init__(self, match):
        super().__init__(match)
        # first capture group: the path segment following '/gallery/'
        self.user = match.group(1)

    def posts(self):
        """Return a generator over the posts in the user's gallery"""
        endpoint = "{}/api/users/{}/gallery".format(self.root, self.user)
        return self._pagination(endpoint)


class PiczelFolderExtractor(PiczelExtractor):
    """Extractor for images inside a user's folder

    Matches URLs of the form '/gallery/<user>/<folder-id>' and yields
    only those posts of the user's gallery that belong to that folder.
    """
    subcategory = "folder"
    directory_fmt = ("{category}", "{user[username]}", "{folder[name]}")
    archive_fmt = "f{folder[id]}_{id}_{num}"
    # Only the literal path segment 'image/' is excluded here, since
    # '/gallery/image/<id>' URLs refer to single images and not to a
    # user's folder. A bare '(?!image)' would additionally reject every
    # user name that merely starts with "image".
    pattern = (r"(?:https?://)?(?:www\.)?piczel\.tv"
               r"/gallery/(?!image/)([^/?&#]+)/(\d+)")
    test = ("https://piczel.tv/gallery/Lulena/1114", {
        "count": ">= 4",
    })

    def __init__(self, match):
        PiczelExtractor.__init__(self, match)
        # capture groups: user name and numeric folder ID (as string)
        self.user, self.folder_id = match.groups()

    def posts(self):
        """Return a generator over the posts inside the selected folder"""
        url = "{}/api/users/{}/gallery".format(self.root, self.user)
        # post objects are compared against the folder ID as an integer
        return self._pagination(url, int(self.folder_id))


class PiczelImageExtractor(PiczelExtractor):
    """Extractor for a single piczel image post"""
    subcategory = "image"
    pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/image/(\d+)"
    test = ("https://piczel.tv/gallery/image/7807", {
        "url": "85225dd53a03c3b6028f6c4a45b71eccc07f7066",
        "content": "df9a053a24234474a19bce2b7e27e0dec23bff87",
        "keyword": {
            "created_at": "2018-07-22T05:13:58.000Z",
            "date": "dt:2018-07-22 05:13:58",
            "description": None,
            "extension": "png",
            "favorites_count": int,
            "folder": dict,
            "folder_id": 1113,
            "id": 7807,
            "is_flash": False,
            "is_video": False,
            "multi": False,
            "nsfw": False,
            "num": 0,
            "password_protected": False,
            "tags": ["fanart", "commission", "altair", "recreators"],
            "title": "Altair",
            "user": dict,
            "views": int,
        },
    })

    def __init__(self, match):
        super().__init__(match)
        # numeric image ID, kept as the string captured from the URL
        self.image_id = match.group(1)

    def posts(self):
        """Return a 1-tuple with the post embedded in the image's web page"""
        page_url = "{}/gallery/image/{}".format(self.root, self.image_id)
        html = self.request(page_url).text
        # the post data is part of the JSON found between the
        # 'window.__PRELOADED_STATE__ =' marker and the next '</script>'
        state_json = text.extract(
            html, 'window.__PRELOADED_STATE__ =', '</script>')[0]
        state = json.loads(state_json)
        images_by_id = state["gallery"]["images"]["byId"]
        return (images_by_id[self.image_id],)
[piczel] add user, folder and image extractors 6 years ago			`# -- coding: utf-8 --`

[piczel] fix extraction 5 years ago			`# Copyright 2018-2020 Mike Fährmann`
[piczel] add user, folder and image extractors 6 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 5 years ago			`"""Extractors for https://piczel.tv/"""`
[piczel] add user, folder and image extractors 6 years ago
			`from .common import Extractor, Message`
			`from .. import text`
[piczel] fix extraction - manually filter by folder_id - extract data for single posts from embedded JSON, since the '/api/gallery/image/<id>' endpoint is no longer available 5 years ago			`import json`
[piczel] add user, folder and image extractors 6 years ago

			`class PiczelExtractor(Extractor):`
			`"""Base class for piczel extractors"""`
			`category = "piczel"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`directory_fmt = ("{category}", "{user[username]}")`
[piczel] add user, folder and image extractors 6 years ago			`filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}"`
			`archive_fmt = "{id}_{num}"`
			`root = "https://piczel.tv"`

			`def items(self):`
			`yield Message.Version, 1`
[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 5 years ago			`for post in self.posts():`
			`post["tags"] = [t["title"] for t in post["tags"] if t["title"]]`
			`post["date"] = text.parse_datetime(`
			`post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")`

			`if post["multi"]:`
			`images = post["images"]`
			`del post["images"]`
			`yield Message.Directory, post`
			`for post["num"], image in enumerate(images):`
			`if "id" in image:`
			`del image["id"]`
			`post.update(image)`
			`url = post["image"]["url"]`
			`yield Message.Url, url, text.nameext_from_url(url, post)`

[piczel] add user, folder and image extractors 6 years ago			`else:`
[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 5 years ago			`yield Message.Directory, post`
			`post["num"] = 0`
			`url = post["image"]["url"]`
			`yield Message.Url, url, text.nameext_from_url(url, post)`
[piczel] add user, folder and image extractors 6 years ago
[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 5 years ago			`def posts(self):`
			`"""Return an iterable with all relevant post objects"""`
[piczel] add user, folder and image extractors 6 years ago
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 5 years ago			`def _pagination(self, url, folder_id=None):`
			`params = {`
			`"from_id" : None,`
			`"folder_id": folder_id,`
			`}`

			`while True:`
			`data = self.request(url, params=params).json()`
[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 5 years ago			`if not data:`
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 5 years ago			`return`
			`params["from_id"] = data[-1]["id"]`
[piczel] fix extraction - manually filter by folder_id - extract data for single posts from embedded JSON, since the '/api/gallery/image/<id>' endpoint is no longer available 5 years ago
			`for post in data:`
			`if not folder_id or folder_id == post["folder_id"]:`
			`yield post`
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 5 years ago
[piczel] add user, folder and image extractors 6 years ago
			`class PiczelUserExtractor(PiczelExtractor):`
			`"""Extractor for all images from a user's gallery"""`
			`subcategory = "user"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?&#]+)/?$"`
[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 5 years ago			`test = ("https://piczel.tv/gallery/Bikupan", {`
			`"range": "1-100",`
			`"count": ">= 100",`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`})`
[piczel] add user, folder and image extractors 6 years ago
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 5 years ago			`def __init__(self, match):`
			`PiczelExtractor.__init__(self, match)`
			`self.user = match.group(1)`

[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 5 years ago			`def posts(self):`
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 5 years ago			`url = "{}/api/users/{}/gallery".format(self.root, self.user)`
			`return self._pagination(url)`
[piczel] add user, folder and image extractors 6 years ago

			`class PiczelFolderExtractor(PiczelExtractor):`
			`"""Extractor for images inside a user's folder"""`
			`subcategory = "folder"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`directory_fmt = ("{category}", "{user[username]}", "{folder[name]}")`
[piczel] add user, folder and image extractors 6 years ago			`archive_fmt = "f{folder[id]}_{id}_{num}"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`pattern = (r"(?:https?://)?(?:www\.)?piczel\.tv"`
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 5 years ago			`r"/gallery/(?!image)([^/?&#]+)/(\d+)")`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`test = ("https://piczel.tv/gallery/Lulena/1114", {`
[piczel] add user, folder and image extractors 6 years ago			`"count": ">= 4",`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`})`
[piczel] add user, folder and image extractors 6 years ago
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 5 years ago			`def __init__(self, match):`
			`PiczelExtractor.__init__(self, match)`
			`self.user, self.folder_id = match.groups()`

[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 5 years ago			`def posts(self):`
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 5 years ago			`url = "{}/api/users/{}/gallery".format(self.root, self.user)`
[piczel] fix extraction - manually filter by folder_id - extract data for single posts from embedded JSON, since the '/api/gallery/image/<id>' endpoint is no longer available 5 years ago			`return self._pagination(url, int(self.folder_id))`
[piczel] add user, folder and image extractors 6 years ago

			`class PiczelImageExtractor(PiczelExtractor):`
			`"""Extractor for individual images"""`
			`subcategory = "image"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/image/(\d+)"`
			`test = ("https://piczel.tv/gallery/image/7807", {`
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 5 years ago			`"url": "85225dd53a03c3b6028f6c4a45b71eccc07f7066",`
[piczel] add user, folder and image extractors 6 years ago			`"content": "df9a053a24234474a19bce2b7e27e0dec23bff87",`
			`"keyword": {`
			`"created_at": "2018-07-22T05:13:58.000Z",`
[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 5 years ago			`"date": "dt:2018-07-22 05:13:58",`
[piczel] add user, folder and image extractors 6 years ago			`"description": None,`
			`"extension": "png",`
			`"favorites_count": int,`
			`"folder": dict,`
			`"folder_id": 1113,`
			`"id": 7807,`
			`"is_flash": False,`
			`"is_video": False,`
			`"multi": False,`
			`"nsfw": False,`
			`"num": 0,`
			`"password_protected": False,`
[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 5 years ago			`"tags": ["fanart", "commission", "altair", "recreators"],`
[piczel] add user, folder and image extractors 6 years ago			`"title": "Altair",`
			`"user": dict,`
			`"views": int,`
			`},`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`})`
[piczel] add user, folder and image extractors 6 years ago
[piczel] update and improve - use proper pagination (fixes #396) - update API host and endpoints - "fix" double slash // in image URLs 5 years ago			`def __init__(self, match):`
			`PiczelExtractor.__init__(self, match)`
			`self.image_id = match.group(1)`

[piczel] improve and update - fix tag names - fix a bug in _pagination() - parse datetime in 'created_at' as 'date' - rewrite main loop - replace user profile test 5 years ago			`def posts(self):`
[piczel] fix extraction - manually filter by folder_id - extract data for single posts from embedded JSON, since the '/api/gallery/image/<id>' endpoint is no longer available 5 years ago			`url = "{}/gallery/image/{}".format(self.root, self.image_id)`
			`page = self.request(url).text`
			`data = json.loads(text.extract(`
			`page, 'window.__PRELOADED_STATE__ =', '</script>')[0])`
			`return (data["gallery"]["images"]["byId"][self.image_id],)`