|
|
@ -29,42 +29,73 @@ class BcyExtractor(Extractor):
|
|
|
|
def items(self):
|
|
|
|
def items(self):
|
|
|
|
sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub
|
|
|
|
sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub
|
|
|
|
iroot = "https://img-bcy-qn.pstatp.com"
|
|
|
|
iroot = "https://img-bcy-qn.pstatp.com"
|
|
|
|
|
|
|
|
noop = self.config("noop")
|
|
|
|
|
|
|
|
|
|
|
|
for post in self.posts():
|
|
|
|
for post_id in self.posts():
|
|
|
|
if not post["image_list"]:
|
|
|
|
post = self._parse_post(post_id)
|
|
|
|
|
|
|
|
if not post:
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
data = {
|
|
|
|
yield Message.Directory, post
|
|
|
|
|
|
|
|
for post["num"], image in enumerate(post["_multi"], 1):
|
|
|
|
|
|
|
|
post["id"] = image["mid"]
|
|
|
|
|
|
|
|
post["width"] = image["w"]
|
|
|
|
|
|
|
|
post["height"] = image["h"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
url = image["path"].partition("~")[0]
|
|
|
|
|
|
|
|
text.nameext_from_url(url, post)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if post["extension"]:
|
|
|
|
|
|
|
|
if not url.startswith(iroot):
|
|
|
|
|
|
|
|
url = sub(iroot, url)
|
|
|
|
|
|
|
|
post["filter"] = ""
|
|
|
|
|
|
|
|
yield Message.Url, url, post
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
post["filter"] = "watermark"
|
|
|
|
|
|
|
|
yield Message.Url, image["origin"], post
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if noop:
|
|
|
|
|
|
|
|
post["extension"] = ""
|
|
|
|
|
|
|
|
post["filter"] = "noop"
|
|
|
|
|
|
|
|
yield Message.Url, image["original_path"], post
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_post(self, post_id):
|
|
|
|
|
|
|
|
url = "{}/item/detail/{}".format(self.root, post_id)
|
|
|
|
|
|
|
|
response = self.request(url)
|
|
|
|
|
|
|
|
if response.status_code >= 400:
|
|
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data = json.loads(
|
|
|
|
|
|
|
|
text.extract(response.text, 'JSON.parse("', '");')[0]
|
|
|
|
|
|
|
|
.replace('\\\\u002F', '/')
|
|
|
|
|
|
|
|
.replace('\\"', '"')
|
|
|
|
|
|
|
|
)["detail"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
post = data["post_data"]
|
|
|
|
|
|
|
|
if not post["multi"]:
|
|
|
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
user = data["detail_user"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
"user": {
|
|
|
|
"user": {
|
|
|
|
"id" : post["uid"],
|
|
|
|
"id" : user["uid"],
|
|
|
|
"name" : post["uname"],
|
|
|
|
"name" : user["uname"],
|
|
|
|
"avatar" : sub(iroot, post["avatar"].partition("~")[0]),
|
|
|
|
"avatar" : user["avatar"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"post": {
|
|
|
|
"post": {
|
|
|
|
"id" : text.parse_int(post["item_id"]),
|
|
|
|
"id" : text.parse_int(post["item_id"]),
|
|
|
|
"tags" : [t["tag_name"] for t in post["post_tags"]],
|
|
|
|
"tags" : [t["tag_name"] for t in post["post_tags"]],
|
|
|
|
"date" : text.parse_timestamp(post["ctime"]),
|
|
|
|
"date" : text.parse_timestamp(post["ctime"]),
|
|
|
|
"parody" : post["work"],
|
|
|
|
"parody" : text.parse_unicode_escapes(post["work"]),
|
|
|
|
"content": post["plain"],
|
|
|
|
"content": post["plain"],
|
|
|
|
"likes" : post["like_count"],
|
|
|
|
"likes" : post["like_count"],
|
|
|
|
"shares" : post["share_count"],
|
|
|
|
"shares" : post["share_count"],
|
|
|
|
"replies": post["reply_count"],
|
|
|
|
"replies": post["reply_count"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
|
|
|
|
"_multi": post["multi"],
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
yield Message.Directory, data
|
|
|
|
|
|
|
|
for data["num"], image in enumerate(post["image_list"], 1):
|
|
|
|
|
|
|
|
data["id"] = image["mid"]
|
|
|
|
|
|
|
|
data["width"] = image["w"]
|
|
|
|
|
|
|
|
data["height"] = image["h"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
url = image["path"]
|
|
|
|
|
|
|
|
if not url.startswith(iroot):
|
|
|
|
|
|
|
|
url = sub(iroot, url.partition("~")[0])
|
|
|
|
|
|
|
|
data["url"] = url
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
yield Message.Url, url, text.nameext_from_url(url, data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BcyUserExtractor(BcyExtractor):
|
|
|
|
class BcyUserExtractor(BcyExtractor):
|
|
|
|
"""Extractor for user timelines"""
|
|
|
|
"""Extractor for user timelines"""
|
|
|
@ -88,7 +119,8 @@ class BcyUserExtractor(BcyExtractor):
|
|
|
|
|
|
|
|
|
|
|
|
item = None
|
|
|
|
item = None
|
|
|
|
for item in data["data"]["items"]:
|
|
|
|
for item in data["data"]["items"]:
|
|
|
|
yield item["item_detail"]
|
|
|
|
if item["item_detail"]["multi"]:
|
|
|
|
|
|
|
|
yield item["item_detail"]["item_id"]
|
|
|
|
|
|
|
|
|
|
|
|
if not item:
|
|
|
|
if not item:
|
|
|
|
return
|
|
|
|
return
|
|
|
@ -106,7 +138,7 @@ class BcyPostExtractor(BcyExtractor):
|
|
|
|
"user": {
|
|
|
|
"user": {
|
|
|
|
"id" : 1933712,
|
|
|
|
"id" : 1933712,
|
|
|
|
"name" : "wukloo",
|
|
|
|
"name" : "wukloo",
|
|
|
|
"avatar" : "re:https://img-bcy-qn.pstatp.com/Public/Upload/",
|
|
|
|
"avatar" : str,
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"post": {
|
|
|
|
"post": {
|
|
|
|
"id" : 6355835481002893070,
|
|
|
|
"id" : 6355835481002893070,
|
|
|
@ -128,17 +160,4 @@ class BcyPostExtractor(BcyExtractor):
|
|
|
|
})
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
def posts(self):
|
|
|
|
def posts(self):
|
|
|
|
url = self.root + "/item/detail/" + self.item_id
|
|
|
|
return (self.item_id,)
|
|
|
|
page = self.request(url).text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data = json.loads(
|
|
|
|
|
|
|
|
text.extract(page, 'JSON.parse("', '");')[0]
|
|
|
|
|
|
|
|
.replace('\\\\u002F', '/')
|
|
|
|
|
|
|
|
.replace('\\"', '"')
|
|
|
|
|
|
|
|
)["detail"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
post = data["post_data"]
|
|
|
|
|
|
|
|
post["image_list"] = post["multi"]
|
|
|
|
|
|
|
|
post["plain"] = text.parse_unicode_escapes(post["plain"])
|
|
|
|
|
|
|
|
post.update(data["detail_user"])
|
|
|
|
|
|
|
|
return (post,)
|
|
|
|
|
|
|
|