[bcy] fix partial image URLs (#613)

Images from new posts can have incomplete/partial URLs (1) without any filename extension when fetching their data from '/apiv3/user/selfPosts', so now all data gets taken from '/item/detail/ID' pages. It is currently unknown how to get the non-watermarked original version of these images, or if that is possible at all. (2) Images with a watermark will have their 'filter' metadata field set to "watermark". For original images this field is an empty string "". Enabling the 'noop' option will, in addition to the watermarked version, yield the '~noop.image' filter version (3), where 'filter' is set to "noop". (1) "https://img-bcy-qn.pstatp.com/banciyuan/3ccdff22479c4060aadc86718209b281" (2) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~tplv-banciyuan-logo-v3:wqnpnLLlhZLlpKfprZTnjotfCuWNiuasoeWFgyAtIEFDR-eIseWlveiAheekvuWMug==.image" (3) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~noop.image"
5 years ago · 8fbbaa54ff
parent 86c00f9e66
commit 8fbbaa54ff
1 changed files with 67 additions and 48 deletions
--- a/gallery_dl/extractor/bcy.py
+++ b/gallery_dl/extractor/bcy.py
@ -29,42 +29,73 @@ class BcyExtractor(Extractor):
    def items(self):
        sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub
        iroot = "https://img-bcy-qn.pstatp.com"
        noop = self.config("noop")
-        for post in self.posts():
+        for post_id in self.posts():
-            if not post["image_list"]:
+            post = self._parse_post(post_id)
            if not post:
                continue
-            data = {
+            yield Message.Directory, post
            for post["num"], image in enumerate(post["_multi"], 1):
                post["id"] = image["mid"]
                post["width"] = image["w"]
                post["height"] = image["h"]
                url = image["path"].partition("~")[0]
                text.nameext_from_url(url, post)
                if post["extension"]:
                    if not url.startswith(iroot):
                        url = sub(iroot, url)
                    post["filter"] = ""
                    yield Message.Url, url, post
                else:
                    post["filter"] = "watermark"
                    yield Message.Url, image["origin"], post
                    if noop:
                        post["extension"] = ""
                        post["filter"] = "noop"
                        yield Message.Url, image["original_path"], post
    def _parse_post(self, post_id):
        url = "{}/item/detail/{}".format(self.root, post_id)
        response = self.request(url)
        if response.status_code >= 400:
            return None
        data = json.loads(
            text.extract(response.text, 'JSON.parse("', '");')[0]
            .replace('\\\\u002F', '/')
            .replace('\\"', '"')
        )["detail"]
        post = data["post_data"]
        if not post["multi"]:
            return None
        user = data["detail_user"]
        return {
            "user": {
-                    "id"     : post["uid"],
+                "id"     : user["uid"],
-                    "name"   : post["uname"],
+                "name"   : user["uname"],
-                    "avatar" : sub(iroot, post["avatar"].partition("~")[0]),
+                "avatar" : user["avatar"],
            },
            "post": {
                "id"     : text.parse_int(post["item_id"]),
                "tags"   : [t["tag_name"] for t in post["post_tags"]],
                "date"   : text.parse_timestamp(post["ctime"]),
-                    "parody" : post["work"],
+                "parody" : text.parse_unicode_escapes(post["work"]),
                "content": post["plain"],
                "likes"  : post["like_count"],
                "shares" : post["share_count"],
                "replies": post["reply_count"],
            },
            "_multi": post["multi"],
        }
            yield Message.Directory, data
            for data["num"], image in enumerate(post["image_list"], 1):
                data["id"] = image["mid"]
                data["width"] = image["w"]
                data["height"] = image["h"]
                url = image["path"]
                if not url.startswith(iroot):
                    url = sub(iroot, url.partition("~")[0])
                data["url"] = url
                yield Message.Url, url, text.nameext_from_url(url, data)
 class BcyUserExtractor(BcyExtractor):
    """Extractor for user timelines"""
@ -88,7 +119,8 @@ class BcyUserExtractor(BcyExtractor):
            item = None
            for item in data["data"]["items"]:
-                yield item["item_detail"]
+                if item["item_detail"]["multi"]:
                    yield item["item_detail"]["item_id"]
            if not item:
                return
@ -106,7 +138,7 @@ class BcyPostExtractor(BcyExtractor):
            "user": {
                "id"     : 1933712,
                "name"   : "wukloo",
-                "avatar" : "re:https://img-bcy-qn.pstatp.com/Public/Upload/",
+                "avatar" : str,
            },
            "post": {
                "id"     : 6355835481002893070,
@ -128,17 +160,4 @@ class BcyPostExtractor(BcyExtractor):
    })
    def posts(self):
-        url = self.root + "/item/detail/" + self.item_id
+        return (self.item_id,)
        page = self.request(url).text
        data = json.loads(
            text.extract(page, 'JSON.parse("', '");')[0]
            .replace('\\\\u002F', '/')
            .replace('\\"', '"')
        )["detail"]
        post = data["post_data"]
        post["image_list"] = post["multi"]
        post["plain"] = text.parse_unicode_escapes(post["plain"])
        post.update(data["detail_user"])
        return (post,)