[bcy] fix partial image URLs (#613)

Images from new posts can have incomplete/partial URLs (1)
without any filename extension when fetching their data from
'/apiv3/user/selfPosts', so now all data gets taken from
'/item/detail/ID' pages.

It is currently unknown how to get the non-watermarked original version
of these images, or if that is possible at all. (2)
Images with a watermark will have their 'filter' metadata field set to
"watermark". For original images this field is an empty string "".

Enabling the 'noop' option will, in addition to the watermarked version,
yield the '~noop.image' filter version (3),
where 'filter' is set to "noop".

(1) "https://img-bcy-qn.pstatp.com/banciyuan/3ccdff22479c4060aadc86718209b281"
(2) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~tplv-banciyuan-logo-v3:wqnpnLLlhZLlpKfprZTnjotfCuWNiuasoeWFgyAtIEFDR-eIseWlveiAheekvuWMug==.image"
(3) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~noop.image"
pull/644/head
Mike Fährmann 5 years ago
parent 86c00f9e66
commit 8fbbaa54ff
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -29,42 +29,73 @@ class BcyExtractor(Extractor):
def items(self): def items(self):
sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub
iroot = "https://img-bcy-qn.pstatp.com" iroot = "https://img-bcy-qn.pstatp.com"
noop = self.config("noop")
for post in self.posts(): for post_id in self.posts():
if not post["image_list"]: post = self._parse_post(post_id)
if not post:
continue continue
data = { yield Message.Directory, post
for post["num"], image in enumerate(post["_multi"], 1):
post["id"] = image["mid"]
post["width"] = image["w"]
post["height"] = image["h"]
url = image["path"].partition("~")[0]
text.nameext_from_url(url, post)
if post["extension"]:
if not url.startswith(iroot):
url = sub(iroot, url)
post["filter"] = ""
yield Message.Url, url, post
else:
post["filter"] = "watermark"
yield Message.Url, image["origin"], post
if noop:
post["extension"] = ""
post["filter"] = "noop"
yield Message.Url, image["original_path"], post
def _parse_post(self, post_id):
url = "{}/item/detail/{}".format(self.root, post_id)
response = self.request(url)
if response.status_code >= 400:
return None
data = json.loads(
text.extract(response.text, 'JSON.parse("', '");')[0]
.replace('\\\\u002F', '/')
.replace('\\"', '"')
)["detail"]
post = data["post_data"]
if not post["multi"]:
return None
user = data["detail_user"]
return {
"user": { "user": {
"id" : post["uid"], "id" : user["uid"],
"name" : post["uname"], "name" : user["uname"],
"avatar" : sub(iroot, post["avatar"].partition("~")[0]), "avatar" : user["avatar"],
}, },
"post": { "post": {
"id" : text.parse_int(post["item_id"]), "id" : text.parse_int(post["item_id"]),
"tags" : [t["tag_name"] for t in post["post_tags"]], "tags" : [t["tag_name"] for t in post["post_tags"]],
"date" : text.parse_timestamp(post["ctime"]), "date" : text.parse_timestamp(post["ctime"]),
"parody" : post["work"], "parody" : text.parse_unicode_escapes(post["work"]),
"content": post["plain"], "content": post["plain"],
"likes" : post["like_count"], "likes" : post["like_count"],
"shares" : post["share_count"], "shares" : post["share_count"],
"replies": post["reply_count"], "replies": post["reply_count"],
}, },
"_multi": post["multi"],
} }
yield Message.Directory, data
for data["num"], image in enumerate(post["image_list"], 1):
data["id"] = image["mid"]
data["width"] = image["w"]
data["height"] = image["h"]
url = image["path"]
if not url.startswith(iroot):
url = sub(iroot, url.partition("~")[0])
data["url"] = url
yield Message.Url, url, text.nameext_from_url(url, data)
class BcyUserExtractor(BcyExtractor): class BcyUserExtractor(BcyExtractor):
"""Extractor for user timelines""" """Extractor for user timelines"""
@ -88,7 +119,8 @@ class BcyUserExtractor(BcyExtractor):
item = None item = None
for item in data["data"]["items"]: for item in data["data"]["items"]:
yield item["item_detail"] if item["item_detail"]["multi"]:
yield item["item_detail"]["item_id"]
if not item: if not item:
return return
@ -106,7 +138,7 @@ class BcyPostExtractor(BcyExtractor):
"user": { "user": {
"id" : 1933712, "id" : 1933712,
"name" : "wukloo", "name" : "wukloo",
"avatar" : "re:https://img-bcy-qn.pstatp.com/Public/Upload/", "avatar" : str,
}, },
"post": { "post": {
"id" : 6355835481002893070, "id" : 6355835481002893070,
@ -128,17 +160,4 @@ class BcyPostExtractor(BcyExtractor):
}) })
def posts(self): def posts(self):
url = self.root + "/item/detail/" + self.item_id return (self.item_id,)
page = self.request(url).text
data = json.loads(
text.extract(page, 'JSON.parse("', '");')[0]
.replace('\\\\u002F', '/')
.replace('\\"', '"')
)["detail"]
post = data["post_data"]
post["image_list"] = post["multi"]
post["plain"] = text.parse_unicode_escapes(post["plain"])
post.update(data["detail_user"])
return (post,)

Loading…
Cancel
Save