gallery-dl/gallery_dl/extractor/kemonoparty.py

# -*- coding: utf-8 -*-

# Copyright 2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://kemono.party/"""

from .common import Extractor, Message
from .. import text
import re

# Common URL prefix shared by all extractor patterns in this module.
# The scheme is optional; group 1 captures the service name and
# group 2 the user ID (both are any run of characters except '/?#').
BASE_PATTERN = r"(?:https?://)?kemono\.party/([^/?#]+)/user/([^/?#]+)"


class KemonopartyExtractor(Extractor):
    """Base class for kemonoparty extractors

    Subclasses provide posts(), an iterable of post dicts; items()
    emits one Directory message per post, followed by one Url message
    for each of the post's files.
    """
    category = "kemonoparty"
    root = "https://kemono.party"
    # host that file URLs are rewritten to
    data_root = "https://data.kemono.party"
    directory_fmt = ("{category}", "{service}", "{user}")
    filename_fmt = "{id}_{title}_{filename}.{extension}"
    archive_fmt = "{service}_{user}_{id}_{filename}.{extension}"

    def items(self):
        """Yield a Directory message and all file URLs for each post"""
        find_inline = re.compile(r'src="(/inline/[^"]+)').findall

        for post in self.posts():
            files = self._files(post, find_inline)

            post["date"] = text.parse_datetime(
                post["published"], "%a, %d %b %Y %H:%M:%S %Z")
            yield Message.Directory, post

            # the same 'post' dict is sent along with every file;
            # 'num' is the 1-based position of the file within the post
            for post["num"], file in enumerate(files, 1):
                url = self._data_url(file["path"])

                # fall back to the URL when an entry carries no name
                text.nameext_from_url(file.get("name", url), post)
                yield Message.Url, url, post

    @staticmethod
    def _files(post, find_inline):
        """Return the file entries of 'post' that have a path

        The result contains the main file, then all attachments, then
        one entry for every inline image referenced in the post content
        (as found by 'find_inline').
        """
        files = []
        if post["file"]:
            files.append(post["file"])
        if post["attachments"]:
            files.extend(post["attachments"])
        for path in find_inline(post["content"] or ""):
            files.append({"path": path, "name": path})

        # an entry without a path has nothing to download
        return [file for file in files if file.get("path")]

    def _data_url(self, url):
        """Return 'url' rewritten to point at the data host

        Site-relative paths and absolute URLs below 'root' are moved to
        'data_root'; any other URL is returned unchanged.
        """
        if url.startswith("/"):
            return self.data_root + url
        if url.startswith(self.root + "/"):
            # keep the path, including its leading slash
            return self.data_root + url[len(self.root):]
        return url


class KemonopartyUserExtractor(KemonopartyExtractor):
    """Extractor for all posts from a kemono.party user listing"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"/?(?:$|[?#])"
    test = (
        ("https://kemono.party/fanbox/user/6993449", {
            "range": "1-25",
            "count": 25,
        }),
        ("https://kemono.party/subscribestar/user/alcorart"),
    )

    def __init__(self, match):
        """Build the API URL from the service and user ID in 'match'"""
        super().__init__(match)
        service, user_id = match.groups()
        self.api_url = "/".join(
            (self.root, "api", service, "user", user_id))

    def posts(self):
        """Yield the user's posts from consecutive API requests

        The 'o' offset parameter starts at 0 and grows by 25 per
        request; iteration ends after a response with fewer than
        25 entries.
        """
        step = 25
        offset = 0

        while True:
            batch = self.request(
                self.api_url, params={"o": offset}).json()
            yield from batch

            # a short response is treated as the last one
            if len(batch) < step:
                break
            offset += step


class KemonopartyPostExtractor(KemonopartyExtractor):
    """Extractor for a single kemono.party post"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"/post/([^/?#]+)"
    test = (
        ("https://kemono.party/fanbox/user/6993449/post/506575", {
            "pattern": r"https://data\.kemono\.party/files/fanbox"
                       r"/6993449/506575/P058kDFYus7DbqAkGlfWTlOr\.jpeg",
            "keyword": {
                "added": "Wed, 06 May 2020 20:28:02 GMT",
                "content": str,
                "date": "dt:2019-08-11 02:09:04",
                "edited": None,
                "embed": dict,
                "extension": "jpeg",
                "filename": "P058kDFYus7DbqAkGlfWTlOr",
                "id": "506575",
                "num": 1,
                "published": "Sun, 11 Aug 2019 02:09:04 GMT",
                "service": "fanbox",
                "shared_file": False,
                "subcategory": "post",
                "title": "c96取り置き",
                "user": "6993449",
            },
        }),
        # inline image (#1286)
        ("https://kemono.party/fanbox/user/7356311/post/802343", {
            "pattern": r"https://data\.kemono\.party/inline/fanbox"
                       r"/uaozO4Yga6ydkGIJFAQDixfE\.jpeg",
        }),
        # kemono.party -> data.kemono.party
        ("https://kemono.party/gumroad/user/trylsc/post/IURjT", {
            "pattern": r"https://data\.kemono\.party/(file|attachment)s"
                       r"/gumroad/trylsc/IURjT/",
        }),
        ("https://kemono.party/subscribestar/user/alcorart/post/184330"),
    )

    def __init__(self, match):
        """Build the API URL from service, user ID and post ID"""
        super().__init__(match)
        service, user_id, post_id = match.groups()
        self.api_url = "/".join(
            (self.root, "api", service, "user", user_id, "post", post_id))

    def posts(self):
        """Return the requested post as a sequence with one element

        When the API response holds more than one entry, only the
        first is kept; otherwise the response is returned as is.
        """
        results = self.request(self.api_url).json()
        if len(results) > 1:
            return (results[0],)
        return results
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago			`# -- coding: utf-8 --`

			`# Copyright 2021 Mike Fährmann`
			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for https://kemono.party/"""`

			`from .common import Extractor, Message`
[kemonoparty] simplify (#1216) Use metadata from API responses as is and don't try to detect duplicated by their original filename. 4 years ago			`from .. import text`
[kemonoparty] extract inline images (fixes #1286) 4 years ago			`import re`
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago
[kemonoparty] support URLs with non-numeric user and post IDs (fixes #1303) 4 years ago			`BASE_PATTERN = r"(?:https?://)?kemono\.party/([^/?#]+)/user/([^/?#]+)"`

[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago
			`class KemonopartyExtractor(Extractor):`
			`"""Base class for kemonoparty extractors"""`
			`category = "kemonoparty"`
			`root = "https://kemono.party"`
[kemonoparty] include 'service' in directories and archive keys 4 years ago			`directory_fmt = ("{category}", "{service}", "{user}")`
[kemonoparty] simplify (#1216) Use metadata from API responses as is and don't try to detect duplicated by their original filename. 4 years ago			`filename_fmt = "{id}_{title}_{filename}.{extension}"`
[kemonoparty] include 'service' in directories and archive keys 4 years ago			`archive_fmt = "{service}_{user}_{id}_{filename}.{extension}"`
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago
			`def items(self):`
[kemonoparty] extract inline images (fixes #1286) 4 years ago			`find_inline = re.compile(r'src="(/inline/[^"]+)').findall`

[kemonoparty] use API endpoints (#1216) 4 years ago			`for post in self.posts():`

			`files = []`
			`if post["file"]:`
			`files.append(post["file"])`
			`if post["attachments"]:`
			`files.extend(post["attachments"])`
[kemonoparty] extract inline images (fixes #1286) 4 years ago			`for path in find_inline(post["content"] or ""):`
			`files.append({"path": path, "name": path})`

[kemonoparty] simplify (#1216) Use metadata from API responses as is and don't try to detect duplicated by their original filename. 4 years ago			`post["date"] = text.parse_datetime(`
			`post["published"], "%a, %d %b %Y %H:%M:%S %Z")`
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago			`yield Message.Directory, post`
[kemonoparty] use API endpoints (#1216) 4 years ago
[kemonoparty] simplify (#1216) Use metadata from API responses as is and don't try to detect duplicated by their original filename. 4 years ago			`for post["num"], file in enumerate(files, 1):`
[kemonoparty] fix absolute file URLs 4 years ago			`url = file["path"]`
			`if url[0] == "/":`
[kemonoparty] fix file URLs (#1514) files are now hosted on https://data.kemono.party/ 3 years ago			`url = "https://data.kemono.party" + url`
[kemonoparty] update file URLs directly linking to kemono.party (#1514) 3 years ago			`elif url.startswith("https://kemono.party/"):`
			`url = "https://data.kemono.party" + url[20:]`

[kemonoparty] simplify (#1216) Use metadata from API responses as is and don't try to detect duplicated by their original filename. 4 years ago			`text.nameext_from_url(file["name"], post)`
[kemonoparty] fix absolute file URLs 4 years ago			`yield Message.Url, url, post`
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago

			`class KemonopartyUserExtractor(KemonopartyExtractor):`
			`"""Extractor for all posts from a kemono.party user listing"""`
			`subcategory = "user"`
[kemonoparty] support URLs with non-numeric user and post IDs (fixes #1303) 4 years ago			`pattern = BASE_PATTERN + r"/?(?:$\|[?#])"`
			`test = (`
			`("https://kemono.party/fanbox/user/6993449", {`
			`"range": "1-25",`
			`"count": 25,`
			`}),`
			`("https://kemono.party/subscribestar/user/alcorart"),`
			`)`
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago
			`def __init__(self, match):`
			`KemonopartyExtractor.__init__(self, match)`
			`service, user_id = match.groups()`
[kemonoparty] use API endpoints (#1216) 4 years ago			`self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id)`
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago
			`def posts(self):`
[kemonoparty] use API endpoints (#1216) 4 years ago			`url = self.api_url`
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago			`params = {"o": 0}`

			`while True:`
[kemonoparty] use API endpoints (#1216) 4 years ago			`posts = self.request(url, params=params).json()`
			`yield from posts`
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago
[kemonoparty] use API endpoints (#1216) 4 years ago			`if len(posts) < 25:`
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago			`return`
			`params["o"] += 25`


			`class KemonopartyPostExtractor(KemonopartyExtractor):`
			`"""Extractor for a single kemono.party post"""`
			`subcategory = "post"`
[kemonoparty] support URLs with non-numeric user and post IDs (fixes #1303) 4 years ago			`pattern = BASE_PATTERN + r"/post/([^/?#]+)"`
[kemonoparty] extract inline images (fixes #1286) 4 years ago			`test = (`
			`("https://kemono.party/fanbox/user/6993449/post/506575", {`
[kemonoparty] fix file URLs (#1514) files are now hosted on https://data.kemono.party/ 3 years ago			`"pattern": r"https://data\.kemono\.party/files/fanbox"`
[kemonoparty] extract inline images (fixes #1286) 4 years ago			`r"/6993449/506575/P058kDFYus7DbqAkGlfWTlOr\.jpeg",`
			`"keyword": {`
			`"added": "Wed, 06 May 2020 20:28:02 GMT",`
			`"content": str,`
			`"date": "dt:2019-08-11 02:09:04",`
			`"edited": None,`
			`"embed": dict,`
			`"extension": "jpeg",`
			`"filename": "P058kDFYus7DbqAkGlfWTlOr",`
			`"id": "506575",`
			`"num": 1,`
			`"published": "Sun, 11 Aug 2019 02:09:04 GMT",`
			`"service": "fanbox",`
			`"shared_file": False,`
			`"subcategory": "post",`
			`"title": "c96取り置き",`
			`"user": "6993449",`
			`},`
			`}),`
			`# inline image (#1286)`
			`("https://kemono.party/fanbox/user/7356311/post/802343", {`
[kemonoparty] fix file URLs (#1514) files are now hosted on https://data.kemono.party/ 3 years ago			`"pattern": r"https://data\.kemono\.party/inline/fanbox"`
[kemonoparty] extract inline images (fixes #1286) 4 years ago			`r"/uaozO4Yga6ydkGIJFAQDixfE\.jpeg",`
			`}),`
[kemonoparty] update file URLs directly linking to kemono.party (#1514) 3 years ago			`# kemono.party -> data.kemono.party`
			`("https://kemono.party/gumroad/user/trylsc/post/IURjT", {`
			`"pattern": r"https://data\.kemono\.party/(file\|attachment)s"`
			`r"/gumroad/trylsc/IURjT/",`
			`}),`
[kemonoparty] support URLs with non-numeric user and post IDs (fixes #1303) 4 years ago			`("https://kemono.party/subscribestar/user/alcorart/post/184330"),`
[kemonoparty] extract inline images (fixes #1286) 4 years ago			`)`
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago
			`def __init__(self, match):`
			`KemonopartyExtractor.__init__(self, match)`
			`service, user_id, post_id = match.groups()`
[kemonoparty] use API endpoints (#1216) 4 years ago			`self.api_url = "{}/api/{}/user/{}/post/{}".format(`
[kemonoparty] add 'user' and 'post' extractors (#1216) 4 years ago			`self.root, service, user_id, post_id)`

			`def posts(self):`
[kemonoparty] use API endpoints (#1216) 4 years ago			`posts = self.request(self.api_url).json()`
			`return (posts[0],) if len(posts) > 1 else posts`