[bcy] fix extraction (#3103)

- fix regex for non-watermarked images
- fetch data from '/item/detail' pages for all other posts,
  since '/apiv3/user/selfPosts' only has incomplete data
pull/3119/head
Mike Fährmann 2 years ago
parent 77173694d5
commit 46b64251eb
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2020-2021 Mike Fährmann # Copyright 2020-2022 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -25,9 +25,12 @@ class BcyExtractor(Extractor):
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.item_id = match.group(1) self.item_id = match.group(1)
self.session.headers["Referer"] = self.root + "/"
def items(self): def items(self):
sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub sub = re.compile(r"^https?://p\d+-bcy"
r"(?:-sign\.bcyimg\.com|\.byteimg\.com/img)"
r"/banciyuan").sub
iroot = "https://img-bcy-qn.pstatp.com" iroot = "https://img-bcy-qn.pstatp.com"
noop = self.config("noop") noop = self.config("noop")
@ -64,19 +67,18 @@ class BcyExtractor(Extractor):
url = image["path"].partition("~")[0] url = image["path"].partition("~")[0]
text.nameext_from_url(url, data) text.nameext_from_url(url, data)
# full-resolution image without watermark
if data["extension"]: if data["extension"]:
if not url.startswith(iroot): if not url.startswith(iroot):
url = sub(iroot, url) url = sub(iroot, url)
data["filter"] = "" data["filter"] = ""
yield Message.Url, url, data yield Message.Url, url, data
# watermarked image & low quality noop filter
else: else:
if not multi: if multi is None:
if len(post["multi"]) < len(post["image_list"]): multi = self._data_from_post(
multi = self._data_from_post(post["item_id"]) post["item_id"])["post_data"]["multi"]
multi = multi["post_data"]["multi"]
else:
multi = post["multi"]
image = multi[data["num"] - 1] image = multi[data["num"] - 1]
if image["origin"]: if image["origin"]:
@ -111,8 +113,8 @@ class BcyUserExtractor(BcyExtractor):
"count": ">= 20", "count": ">= 20",
}), }),
("https://bcy.net/u/109282764041", { ("https://bcy.net/u/109282764041", {
"pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+" "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+"
r"~tplv-banciyuan-logo-v3:.+\.image", r"~tplv-bcyx-yuan-logo-v1:.+\.image",
"range": "1-25", "range": "1-25",
"count": 25, "count": 25,
}), }),
@ -171,13 +173,13 @@ class BcyPostExtractor(BcyExtractor):
}), }),
# only watermarked images available # only watermarked images available
("https://bcy.net/item/detail/6950136331708144648", { ("https://bcy.net/item/detail/6950136331708144648", {
"pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+" "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+"
r"~tplv-banciyuan-logo-v3:.+\.image", r"~tplv-bcyx-yuan-logo-v1:.+\.image",
"count": 8, "count": 10,
"keyword": {"filter": "watermark"}, "keyword": {"filter": "watermark"},
}), }),
# deleted # deleted
("https://bcy.net/item/detail/6780546160802143236", { ("https://bcy.net/item/detail/6780546160802143237", {
"exception": exception.NotFoundError, "exception": exception.NotFoundError,
"count": 0, "count": 0,
}), }),

Loading…
Cancel
Save