diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
index 0875438e..9c8b410a 100644
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -86,7 +86,7 @@ Reddit https://www.reddit.com/ individual Images, Subm
 rule #34 https://rule34.paheal.net/ Posts, Tag-Searches
 Rule 34 https://rule34.xxx/ Pools, Posts, Tag-Searches
 Safebooru https://safebooru.org/ Pools, Posts, Tag-Searches
-Sankaku Channel https://chan.sankakucomplex.com/ Pools, Posts, Tag-Searches Optional
+Sankaku Channel https://chan.sankakucomplex.com/ Articles, Pools, Posts, Tag-Searches Optional
 Sen Manga https://raw.senmanga.com/ Chapters
 Sense-Scans http://sensescans.com/reader/ Chapters, Manga
 Sex.com https://www.sex.com/ Boards, Pins, Search Results
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 9d3dc7c3..a1e7219d 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -297,3 +297,72 @@ class SankakuPostExtractor(SankakuExtractor):
 
     def get_posts(self):
         return (self.post_id,)
+
+
+class SankakuArticleExtractor(Extractor):
+    """Extractor for articles on www.sankakucomplex.com"""
+    category = "sankaku"
+    subcategory = "article"
+    directory_fmt = ("{category}", "Articles", "{date:%Y-%m-%d} {title}")
+    filename_fmt = "{filename}.{extension}"
+    archive_fmt = "a_{date:%Y%m%d}_{filename}"
+    pattern = (r"(?:https?://)?www\.sankakucomplex\.com"
+               r"/(\d{4}/\d\d/\d\d)/([^/?&#]+)")
+    test = (
+        ("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", {
+            "url": "4a9ecc5ae917fbce469280da5b6a482510cae84d",
+            "keyword": "4ab96f31df9ee95d0dc6eefc2ca4e508c45c8e00",
+        }),
+        ("https://www.sankakucomplex.com/2009/12/01/sexy-goddesses-of-2ch", {
+            "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
+            "keyword": "a7876de642bf3e68fb4743dcd4d4e8778f2c17ab",
+        }),
+    )
+    root = "https://www.sankakucomplex.com"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.date, self.title = match.groups()
+
+    def items(self):
+        url = "{}/{}/{}/?pg=X".format(self.root, self.date, self.title)
+        extr = text.extract_from(self.request(url).text)
+        data = {
+            "title"      : text.unescape(
+                extr('"og:title" content="', '"')),
+            "description": text.unescape(
+                extr('"og:description" content="', '"')),
+            "date"       : text.parse_datetime(
+                extr('"og:updated_time" content="', '"')),
+        }
+        imgs = self.images(extr)
+        data["count"] = len(imgs)
+        data["tags"] = text.split_html(extr('="meta-tags">', '</div>'))[::2]
+
+        yield Message.Directory, data
+        for img in imgs:
+            img.update(data)
+            yield Message.Url, img["url"], img
+
+    def images(self, extr):
+        num = 0
+        imgs = []
+        urls = set()
+        orig = re.compile(r"-\d+x\d+\.")
+
+        extr('<div class="entry-content">', '')
+        while True:
+            url = extr('data-lazy-src="', '"')
+            if not url:
+                return imgs
+            if url in urls:
+                continue
+            if url[0] == "/":
+                url = text.urljoin(self.root, url)
+            url = orig.sub(".", url)
+            num += 1
+            imgs.append(text.nameext_from_url(url, {
+                "url" : url,
+                "num" : num,
+            }))
+            urls.add(url)