[fantia] improve metadata extraction (#4126)

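Extract all metadata and file URLs for each content entry before starting its downloads, so the complete metadata is already available when the directory message is emitted.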
pull/4403/head
Mike Fährmann 1 year ago
parent d50c312ff0
commit c79359eb3a
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -19,13 +19,13 @@ class FantiaExtractor(Extractor):
archive_fmt = "{post_id}_{file_id}"
_warning = True
def items(self):
def _init(self):
self.headers = {
"Accept" : "application/json, text/plain, */*",
"Referer": self.root,
"X-Requested-With": "XMLHttpRequest",
}
_empty_plan = {
self._empty_plan = {
"id" : 0,
"price": 0,
"limit": 0,
@ -33,22 +33,18 @@ class FantiaExtractor(Extractor):
"description": "",
"thumb": self.root + "/images/fallback/plan/thumb_default.png",
}
if self._warning:
if not self.cookies_check(("_session_id",)):
self.log.warning("no '_session_id' cookie set")
FantiaExtractor._warning = False
def items(self):
for post_id in self.posts():
post = self._get_post_data(post_id)
post["num"] = 0
for content in self._get_post_contents(post):
post["content_category"] = content["category"]
post["content_title"] = content["title"]
post["content_filename"] = content.get("filename", "")
post["content_id"] = content["id"]
post["plan"] = content["plan"] or _empty_plan
files = self._process_content(post, content)
yield Message.Directory, post
if content["visible_status"] != "visible":
@ -57,12 +53,12 @@ class FantiaExtractor(Extractor):
"%s#post-content-id-%s", content["visible_status"],
post["post_url"], content["id"])
for url in self._get_content_urls(post, content):
text.nameext_from_url(
post["content_filename"] or url, post)
post["file_url"] = url
for file in files:
post.update(file)
post["num"] += 1
yield Message.Url, url, post
text.nameext_from_url(
post["content_filename"] or file["file_url"], post)
yield Message.Url, file["file_url"], post
def posts(self):
"""Return post IDs"""
@ -132,42 +128,45 @@ class FantiaExtractor(Extractor):
return contents
def _get_content_urls(self, post, content):
"""Extract individual URL data from the response"""
if "comment" in content:
post["content_comment"] = content["comment"]
def _process_content(self, post, content):
post["content_category"] = content["category"]
post["content_title"] = content["title"]
post["content_filename"] = content.get("filename") or ""
post["content_id"] = content["id"]
post["content_comment"] = content.get("comment") or ""
post["plan"] = content["plan"] or self._empty_plan
files = []
if "post_content_photos" in content:
for photo in content["post_content_photos"]:
post["file_id"] = photo["id"]
yield photo["url"]["original"]
files.append({"file_id" : photo["id"],
"file_url": photo["url"]["original"]})
if "download_uri" in content:
post["file_id"] = content["id"]
url = content["download_uri"]
if url[0] == "/":
url = self.root + url
yield url
files.append({"file_id" : content["id"],
"file_url": url})
if content["category"] == "blog" and "comment" in content:
comment_json = util.json_loads(content["comment"])
ops = comment_json.get("ops") or ()
# collect blogpost text first
blog_text = ""
for op in ops:
for op in comment_json.get("ops") or ():
insert = op.get("insert")
if isinstance(insert, str):
blog_text += insert
elif isinstance(insert, dict) and "fantiaImage" in insert:
img = insert["fantiaImage"]
files.append({"file_id" : img["id"],
"file_url": self.root + img["original_url"]})
post["blogpost_text"] = blog_text
else:
post["blogpost_text"] = ""
# collect images
for op in ops:
insert = op.get("insert")
if isinstance(insert, dict) and "fantiaImage" in insert:
img = insert["fantiaImage"]
post["file_id"] = img["id"]
yield self.root + img["original_url"]
return files
class FantiaCreatorExtractor(FantiaExtractor):
@ -200,6 +199,42 @@ class FantiaPostExtractor(FantiaExtractor):
subcategory = "post"
pattern = r"(?:https?://)?(?:www\.)?fantia\.jp/posts/(\d+)"
test = (
("https://fantia.jp/posts/1166373", {
"pattern": r"https://("
r"c\.fantia\.jp/uploads/post/file/1166373/|"
r"cc\.fantia\.jp/uploads/post_content_photo"
r"/file/732549[01]|"
r"fantia\.jp/posts/1166373/album_image\?)",
"keyword": {
"blogpost_text": r"re:^$|"
r"This is a test.\n\nThis is a test.\n\n|"
r"Link to video:\nhttps://www.youtube.com"
r"/watch\?v=5SSdvNcAagI\n\nhtml img from "
r"another site:\n\n\n\n\n\n",
"comment": "\n\n",
"content_category": "re:thumb|blog|photo_gallery",
"content_comment": str,
"content_filename": "re:|",
"content_title": r"re:Test (Blog Content \d+|Image Gallery)"
r"|thumb",
"date": "dt:2022-03-09 16:46:12",
"fanclub_id": 356320,
"fanclub_name": "Test Fantia",
"fanclub_url": "https://fantia.jp/fanclubs/356320",
"fanclub_user_id": 7487131,
"fanclub_user_name": "2022/03/08 15:13:52の名無し",
"file_url": str,
"filename": str,
"num": int,
"plan": dict,
"post_id": 1166373,
"post_title": "Test Fantia Post",
"post_url": "https://fantia.jp/posts/1166373",
"posted_at": "Thu, 10 Mar 2022 01:46:12 +0900",
"rating": "general",
"tags": [],
},
}),
("https://fantia.jp/posts/508363", {
"count": 6,
"keyword": {

Loading…
Cancel
Save