[sankaku] rewrite

- better code structure and extensibility (see the subclass sketch below)
- better metadata (see the example dict after the diff)
pull/54/head
Mike Fährmann 7 years ago
parent e96e1fea5d
commit 595593a35e
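The "extensibility" bullet shows up in the hook methods the new SankakuExtractor base class defines in the diff below: get_metadata(), get_posts(), and get_post_data(), plus a default skip(). A new extractor only has to match a URL and discover post ids; login, wait-based rate limiting, skipping, and per-post metadata extraction are inherited. Here is a minimal sketch of a hypothetical second subclass, assumed to live in the same module as the code below; the pool URL layout and the '" id=p' markup marker are illustrative assumptions, not part of this commit:

# Hypothetical subclass, for illustration only (not part of this commit).
# It inherits login, rate limiting (wait), skipping, and per-post metadata
# extraction from SankakuExtractor; only URL matching and post discovery
# are new. Placed in the same module, so no extra imports are needed.
class SankakuPoolExtractor(SankakuExtractor):
    """Extractor for image pools from chan.sankakucomplex.com (sketch)"""
    subcategory = "pool"
    directory_fmt = ["{category}", "pool", "{pool}"]
    pattern = [r"(?:https?://)?chan\.sankakucomplex\.com/pool/show/(\d+)"]

    def __init__(self, match):
        SankakuExtractor.__init__(self)
        self.pool_id = match.group(1)

    def get_metadata(self):
        # general metadata, emitted once with Message.Directory
        return {"pool": self.pool_id}

    def get_posts(self):
        # return an iterable of post-id strings scraped from the pool page;
        # the '" id=p' marker mirrors the tag extractor and is assumed here
        url = self.root + "/pool/show/" + self.pool_id
        page = self.request(url, retries=10).text
        return text.extract_iter(page, '" id=p', '>')

Dispatch works as before: the frontend instantiates the subclass whose pattern matches the given URL.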

CHANGELOG.md
@@ -1,5 +1,7 @@
 # Changelog
 
+## Unreleased
+
 ## 1.1.0 - 2017-12-08
 - Added the ``-r/--limit-rate`` command-line option to set a maximum download rate
 - Added the ``--sleep`` command-line option to specify the number of seconds to sleep before each download

gallery_dl/extractor/sankaku.py
@@ -8,102 +8,86 @@
 """Extract images from https://chan.sankakucomplex.com/"""
 
-from .common import Extractor, Message
+from .common import SharedConfigExtractor, Message
 from .. import text, util, exception
 from ..cache import cache
 import time
 import random
 
 
-class SankakuTagExtractor(Extractor):
-    """Extractor for images from chan.sankakucomplex.com by search-tags"""
+class SankakuExtractor(SharedConfigExtractor):
+    """Base class for sankaku extractors"""
+    basecategory = "booru"
     category = "sankaku"
-    subcategory = "tag"
-    directory_fmt = ["{category}", "{tags}"]
     filename_fmt = "{category}_{id}_{md5}.{extension}"
-    pattern = [r"(?:https?://)?chan\.sankakucomplex\.com"
-               r"/\?(?:[^&#]*&)*tags=([^&#]+)"]
-    test = [("https://chan.sankakucomplex.com/?tags=bonocho", {
-        "count": 5,
-        "pattern": (r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
-                    r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+"),
-    })]
     root = "https://chan.sankakucomplex.com"
     cookienames = ("login", "pass_hash")
    cookiedomain = "chan.sankakucomplex.com"
 
-    def __init__(self, match):
-        Extractor.__init__(self)
+    def __init__(self):
+        SharedConfigExtractor.__init__(self)
         self.logged_in = True
-        self.pagestart = 1
-        self.tags = text.unquote(match.group(1).replace("+", " "))
+        self.start_post = 0
         self.wait_min = self.config("wait-min", 2)
         self.wait_max = self.config("wait-max", 4)
         if self.wait_max < self.wait_min:
             self.wait_max = self.wait_min
 
-    def skip(self, num):
-        pages = min(num // 20, 49)
-        self.pagestart += pages
-        return pages * 20
-
     def items(self):
         self.login()
-        data = self.get_job_metadata()
         yield Message.Version, 1
-        yield Message.Directory, data
-        for image in self.get_images():
-            image.update(data)
-            yield Message.Url, image["file_url"], image
+        yield Message.Directory, self.get_metadata()
+
+        for post_id in util.advance(self.get_posts(), self.start_post):
+            self.wait()
+            data = self.get_post_data(post_id)
+            url = data["file_url"]
+            yield Message.Url, url, text.nameext_from_url(url, data)
+
+    def skip(self, num):
+        self.start_post += num
+        return num
+
+    def get_metadata(self):
+        """Return general metadata"""
+        return {}
 
-    def get_job_metadata(self):
-        """Collect metadata for extractor-job"""
-        return {"tags": self.tags}
-
-    def get_images(self):
-        """Yield all available images for the given tags"""
-        params = {
-            "tags": self.tags,
-            "page": self.pagestart,
-        }
-        while self.logged_in or params["page"] <= 25:
-            image = None
-            page = self.request(self.root, params=params, retries=10).text
-            pos = text.extract(page, '<div id=more-popular-posts-link>', '')[1]
-            for image_id in text.extract_iter(
-                    page, '<span class="thumb blacklisted" id=p', '>', pos):
-                self.wait()
-                image = self.get_image_metadata(image_id)
-                yield image
-            if not image:
-                return
-            params["page"] += 1
-            params["next"] = image["id"] - 1
-        self.log.warning(
-            "Unauthenticated users may only access the first 500 images / 25 "
-            "pages. (Use '--range 501-' to continue downloading from this "
-            "point onwards after setting up an account.)")
+    def get_posts(self):
+        """Return an iterable containing all relevant post ids"""
 
-    def get_image_metadata(self, image_id):
-        """Collect metadata for a single image"""
-        url = "https://chan.sankakucomplex.com/post/show/" + image_id
+    def get_post_data(self, post_id, extr=text.extract):
+        """Extract metadata of a single post"""
+        url = self.root + "/post/show/" + post_id
         page = self.request(url, retries=10).text
-        file_url, pos = text.extract(page, '<li>Original: <a href="', '"')
+
+        tags   , pos = extr(page, "<title>", " | Sankaku Channel</title>")
+        vavg   , pos = extr(page, "itemprop=ratingValue>", "<", pos)
+        vcnt   , pos = extr(page, "itemprop=reviewCount>", "<", pos)
+        _      , pos = extr(page, "Posted: <", "", pos)
+        created, pos = extr(page, ' title="', '"', pos)
+        rating = extr(page, "<li>Rating: ", "<", pos)[0]
+
+        file_url, pos = extr(page, '<li>Original: <a href="', '"', pos)
         if file_url:
-            width , pos = text.extract(page, '>', 'x', pos)
-            height, pos = text.extract(page, '', ' ', pos)
+            width , pos = extr(page, '>', 'x', pos)
+            height, pos = extr(page, '', ' ', pos)
         else:
-            width , pos = text.extract(page, '<object width=', ' ', pos)
-            height, pos = text.extract(page, 'height=', '>', pos)
-            file_url = text.extract(page, '<embed src="', '"', pos)[0]
-        data = text.nameext_from_url(file_url, {
-            "id": util.safe_int(image_id),
+            width , pos = extr(page, '<object width=', ' ', pos)
+            height, pos = extr(page, 'height=', '>', pos)
+            file_url = extr(page, '<embed src="', '"', pos)[0]
+
+        return {
+            "id": util.safe_int(post_id),
+            "md5": file_url.rpartition("/")[2].partition(".")[0],
+            "tags": tags,
+            "vote_average": float(vavg or 0),
+            "vote_count": util.safe_int(vcnt),
+            "created_at": created,
+            "rating": (rating or "?")[0].lower(),
             "file_url": "https:" + text.unescape(file_url),
             "width": util.safe_int(width),
             "height": util.safe_int(height),
-        })
-        data["md5"] = data["name"]
-        return data
+        }
 
     def wait(self):
         """Wait for a randomly chosen amount of seconds"""
@@ -138,3 +122,66 @@ class SankakuTagExtractor(Extractor):
             raise exception.AuthenticationError()
         cookies = response.history[0].cookies
         return {c: cookies[c] for c in self.cookienames}
+
+
+class SankakuTagExtractor(SankakuExtractor):
+    """Extractor for images from chan.sankakucomplex.com by search-tags"""
+    category = "sankaku"
+    subcategory = "tag"
+    directory_fmt = ["{category}", "{tags}"]
+    pattern = [r"(?:https?://)?chan\.sankakucomplex\.com"
+               r"/\?(?:[^&#]*&)*tags=([^&#]+)"]
+    test = [
+        ("https://chan.sankakucomplex.com/?tags=bonocho", {
+            "count": 5,
+            "pattern": (r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
+                        r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+"),
+        }),
+        ("https://chan.sankakucomplex.com/?tags=bonocho+a+b+c+d", {
+            "options": (("username", None),),
+            "exception": exception.StopExtraction,
+        })
+    ]
+    per_page = 20
+
+    def __init__(self, match):
+        SankakuExtractor.__init__(self)
+        self.tags = text.unquote(match.group(1).replace("+", " "))
+        self.start_page = 1
+
+    def skip(self, num):
+        pages, posts = divmod(num, self.per_page)
+        if pages > 49:
+            self.log.info("Cannot skip more than 50 pages ahead.")
+            pages, posts = 49, self.per_page
+        self.start_page += pages
+        self.start_post += posts
+        return pages * self.per_page + posts
+
+    def get_metadata(self):
+        tags = self.tags.split()
+        if not self.logged_in and len(tags) > 4:
+            self.log.error("Unauthenticated users cannot use "
+                           "more than 4 tags at once.")
+            raise exception.StopExtraction()
+        return {"tags": " ".join(tags)}
+
+    def get_posts(self):
+        params = {"tags": self.tags, "page": self.start_page}
+
+        while self.logged_in or params["page"] <= 25:
+            page = self.request(self.root, params=params, retries=10).text
+            pos = page.find("<div id=more-popular-posts-link>") + 1
+
+            ids = list(text.extract_iter(page, '" id=p', '>', pos))
+            if not ids:
+                return
+            yield from ids
+
+            params["page"] += 1
+            params["next"] = int(ids[-1]) - 1
+
+        self.log.warning(
+            "Unauthenticated users may only access the first 500 images / 25 "
+            "pages. (Use '--range 501-' to continue downloading from this "
+            "point onwards after setting up an account.)")

gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.1.0"
+__version__ = "1.1.1-dev"
