[tsumino] fix extraction

deviantart-rewrite
Mike Fährmann 5 years ago
parent d92802fd37
commit 15632a1570
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@@ -37,22 +37,22 @@ class TsuminoBase():
response = self.request(url, method="POST", headers=headers, data=data)
if not response.history:
raise exception.AuthenticationError()
return {".aotsumino": response.history[0].cookies[".aotsumino"]}
return self.session.cookies
class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
"""Extractor for image galleries on tsumino.com"""
pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
r"/(?:Book/Info|Read/View)/(\d+)")
r"/(?:entry|Book/Info|Read/(?:Index|View))/(\d+)")
test = (
("https://www.tsumino.com/Book/Info/40996", {
"url": "84bf30a86623039fc87855680fada884dc8a1ddd",
("https://www.tsumino.com/entry/40996", {
"pattern": r"https://content.tsumino.com/parts/40996/\d+\?key=\w+",
"keyword": {
"title" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
"title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
"title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
"gallery_id": 40996,
"date" : "2018 June 29",
"date" : "type:datetime",
"count" : 42,
"collection": "",
"artist" : ["Itou Life"],
@@ -65,15 +65,17 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
"uploader" : "sehki",
"lang" : "en",
"language" : "English",
"thumbnail" : "re:https?://www.tsumino.com/Image/Thumb/40996",
"thumbnail" : "https://content.tsumino.com/thumbs/40996/1",
},
}),
("https://www.tsumino.com/Book/Info/40996"),
("https://www.tsumino.com/Read/View/45834"),
("https://www.tsumino.com/Read/Index/45834"),
)
def __init__(self, match):
self.gallery_id = match.group(1)
url = "{}/Book/Info/{}".format(self.root, self.gallery_id)
url = "{}/entry/{}".format(self.root, self.gallery_id)
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -90,7 +92,8 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
"title_jp" : title_jp,
"thumbnail" : extr('"og:image" content="', '"'),
"uploader" : text.remove_html(extr('id="Uploader">', '</div>')),
"date" : extr('id="Uploaded">', '</div>').strip(),
"date" : text.parse_datetime(
extr('id="Uploaded">', '</div>').strip(), "%Y %B %d"),
"rating" : text.parse_float(extr(
'id="Rating">', '</div>').partition(" ")[0]),
"type" : text.remove_html(extr('id="Category">' , '</div>')),
@@ -105,21 +108,24 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
}
def images(self, page):
url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
url = "{}/Read/Index/{}?page=1".format(self.root, self.gallery_id)
headers = {"Referer": self.chapter_url}
response = self.request(url, headers=headers, fatal=False)
if response.status_code >= 400:
url = "{}/Read/View/{}".format(self.root, self.gallery_id)
if "/Auth/" in response.url:
self.log.error(
"Failed to get gallery JSON data. Visit '%s' in a browser "
"and solve the CAPTCHA to continue.", url)
"and solve the CAPTCHA to continue.", response.url)
raise exception.StopExtraction()
base = self.root + "/Image/Object?name="
page = response.text
tpl, pos = text.extract(page, 'data-cdn="', '"')
cnt, pos = text.extract(page, '> of ', '<', pos)
base, _, params = text.unescape(tpl).partition("[PAGE]")
return [
(base + text.quote(name), None)
for name in response.json()["reader_page_urls"]
(base + str(i) + params, None)
for i in range(1, text.parse_int(cnt)+1)
]
@@ -149,13 +155,13 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
def items(self):
yield Message.Version, 1
for gallery in self.galleries():
url = "{}/Book/Info/{}".format(self.root, gallery["Id"])
url = "{}/entry/{}".format(self.root, gallery["id"])
gallery["_extractor"] = TsuminoGalleryExtractor
yield Message.Queue, url, gallery
def galleries(self):
"""Return all gallery results matching 'self.query'"""
url = "{}/Books/Operate".format(self.root)
url = "{}/Search/Operate?type=Book".format(self.root)
headers = {
"Referer": "{}/".format(self.root),
"X-Requested-With": "XMLHttpRequest",
@@ -176,10 +182,10 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
info = self.request(
url, method="POST", headers=headers, data=data).json()
for gallery in info["Data"]:
yield gallery["Entry"]
for gallery in info["data"]:
yield gallery["entry"]
if info["PageNumber"] >= info["PageCount"]:
if info["pageNumber"] >= info["pageCount"]:
return
data["PageNumber"] += 1

Loading…
Cancel
Save