[kemonoparty] implement a 'files' option (#1991)

similar to 8d676151
pull/2051/head
Mike Fährmann 3 years ago
parent ab8eea1a24
commit d4ec245554
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -1338,6 +1338,18 @@ Description
Extract ``comments`` metadata.
extractor.kemonoparty.files
---------------------------
Type
``list`` of ``strings``
Default
``["postfile", "attachments", "inline"]``
Description
Determines the type and order of files to be downloaded.
Available types are ``postfile``, ``attachments``, and ``inline``.
extractor.kemonoparty.max-posts
-------------------------------
Type

@ -30,11 +30,12 @@ class KemonopartyExtractor(Extractor):
def items(self):
self._prepare_ddosguard_cookies()
find_inline = re.compile(
self._find_inline = re.compile(
r'src="(?:https?://kemono\.party)?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
skip_service = \
self._skip_service = \
"patreon" if self.config("patreon-skip-file", True) else None
generators = self._build_file_generators(self.config("files"))
comments = self.config("comments")
if self.config("metadata"):
@ -51,20 +52,6 @@ class KemonopartyExtractor(Extractor):
for post in posts:
files = []
append = files.append
file = post["file"]
if file:
file["type"] = "file"
if post["service"] != skip_service or not post["attachments"]:
append(file)
for attachment in post["attachments"]:
attachment["type"] = "attachment"
append(attachment)
for path in find_inline(post["content"] or ""):
append({"path": path, "name": path, "type": "inline"})
post["date"] = text.parse_datetime(
post["published"] or post["added"],
"%a, %d %b %Y %H:%M:%S %Z")
@ -74,8 +61,11 @@ class KemonopartyExtractor(Extractor):
post["comments"] = self._extract_comments(post)
yield Message.Directory, post
for post["num"], file in enumerate(files, 1):
post["num"] = 0
for file in itertools.chain.from_iterable(
g(post) for g in generators):
post["type"] = file["type"]
post["num"] += 1
url = file["path"]
if url[0] == "/":
url = self.root + "/data" + url
@ -104,6 +94,36 @@ class KemonopartyExtractor(Extractor):
return {c.name: c.value for c in response.history[0].cookies}
def _postfile(self, post):
    """Return the post's main file as a 1-tuple, or an empty tuple.

    An empty tuple is returned when ``post["file"]`` is falsy, or when
    the post's service equals ``self._skip_service`` and the post also
    has attachments.

    The file dict is tagged in place with ``"type": "file"`` whenever
    it is present, even if it ends up being skipped.
    """
    main = post["file"]
    if main:
        main["type"] = "file"
        # Skip only applies when the service matches AND the post has
        # attachments to fall back on.
        skipped = (post["service"] == self._skip_service and
                   post["attachments"])
        if not skipped:
            return (main,)
    return ()
def _attachments(self, post):
    """Return ``post["attachments"]`` with every entry tagged in place.

    Each attachment dict receives ``"type": "attachment"``; the same
    list object stored in the post is returned.
    """
    attachments = post["attachments"]
    for entry in attachments:
        entry["type"] = "attachment"
    return attachments
def _inline(self, post):
    """Yield one file dict for each inline path found in the post content.

    ``self._find_inline`` is applied to ``post["content"]`` (an empty
    string is used when the content is falsy). Every match is used as
    both ``path`` and ``name`` of the yielded dict, tagged with
    ``"type": "inline"``.
    """
    content = post["content"] or ""
    for match in self._find_inline(content):
        yield {"path": match, "name": match, "type": "inline"}
def _build_file_generators(self, filetypes):
    """Map the configured file types to their generator methods.

    Parameters:
        filetypes: ``None``, a comma-separated string, or an iterable
            of type names. Recognized names are ``"postfile"``,
            ``"attachments"`` and ``"inline"``.

    Returns:
        A sequence of bound methods in the requested order. For
        ``None`` this is the default tuple
        ``(postfile, attachments, inline)``.

    Raises:
        KeyError: if a name is not one of the recognized types.

    For string input, whitespace around each entry is ignored and
    empty entries (e.g. from a trailing comma) are dropped, so
    ``"postfile, inline,"`` is accepted.
    """
    if filetypes is None:
        return (self._postfile, self._attachments, self._inline)

    genmap = {
        "postfile": self._postfile,
        "attachments": self._attachments,
        "inline": self._inline,
    }

    if isinstance(filetypes, str):
        # Tolerate "a, b" and trailing commas in hand-written config values.
        stripped = (part.strip() for part in filetypes.split(","))
        filetypes = [name for name in stripped if name]

    try:
        return [genmap[name] for name in filetypes]
    except KeyError as exc:
        # Same exception type as a plain dict lookup, but tell the user
        # which value was rejected and what the valid choices are.
        raise KeyError(
            "unknown file type {!r} (expected one of: {})".format(
                exc.args[0], ", ".join(genmap))) from None
def _extract_comments(self, post):
url = "{}/{}/user/{}/post/{}".format(
self.root, post["service"], post["user"], post["id"])
@ -189,13 +209,14 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
}),
# inline image (#1286)
("https://kemono.party/fanbox/user/7356311/post/802343", {
"pattern": r"https://kemono\.party/data/inline/fanbox"
r"/uaozO4Yga6ydkGIJFAQDixfE\.jpeg",
"pattern": r"https://kemono\.party/data/47/b5/47b5c014ecdcfabdf2c8"
r"5eec53f1133a76336997ae8596f332e97d956a460ad2\.jpg",
}),
# kemono.party -> data.kemono.party
("https://kemono.party/gumroad/user/trylsc/post/IURjT", {
"pattern": r"https://kemono\.party/data/(file|attachment)s"
r"/gumroad/trylsc/IURjT/",
"pattern": r"https://kemono\.party/data/("
r"files/gumroad/trylsc/IURjT/reward8\.jpg|"
r"c6/04/c6048f5067fd9dbfa7a8be565ac194efdfb6e4.+\.zip)",
}),
# username (#1548, #1652)
("https://kemono.party/gumroad/user/3252870377455/post/aJnAH", {

Loading…
Cancel
Save