From 2cf3f538395c06be2cbd7b20e08374c5460289ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 28 Sep 2018 12:46:39 +0200 Subject: [PATCH] [yuki] add thread extractor (closes #111) --- docs/supportedsites.rst | 2 + gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/yuki.py | 109 +++++++++++++++++++++++++++++++ scripts/build_supportedsites.py | 1 + 4 files changed, 113 insertions(+) create mode 100644 gallery_dl/extractor/yuki.py diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index 5c053ce1..2810949a 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -84,6 +84,7 @@ Warosu https://warosu.org/ Threads World Three http://www.slide.world-three.org/ Chapters, Manga XVideos https://www.xvideos.com/ Images from Users, Galleries Yandere https://yande.re/ Pools, Popular Images, Posts, Tag-Searches +|Site-0| https://yuki.la/ Threads Acidimg https://acidimg.cc/ individual Images Imagetwist https://imagetwist.com/ individual Images Imagevenue http://imagevenue.com/ individual Images @@ -94,6 +95,7 @@ Postimg https://postimg.org/ individual Images Turboimagehost https://turboimagehost.com/ individual Images ==================== =================================== ================================================== ================ +.. |Site-0| replace:: yuki.la 4chan archive .. |Capabilities-0| replace:: Images from Users, Albums, Challenges, individual Images, Likes, Search Results .. |Capabilities-1| replace:: Collections, Deviations, Favorites, Folders, Galleries, Journals, Popular Images .. |Capabilities-2| replace:: Images from Users, Albums, Favorites, Galleries, Groups, individual Images, Search Results diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 82b26a39..1065143f 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -91,6 +91,7 @@ modules = [ "worldthree", "yandere", "xvideos", + "yuki", "imagehosts", "directlink", "recursive", diff --git a/gallery_dl/extractor/yuki.py b/gallery_dl/extractor/yuki.py new file mode 100644 index 00000000..221cf3da --- /dev/null +++ b/gallery_dl/extractor/yuki.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://yuki.la/""" + +from .common import Extractor, Message +from .. import text + + +class YukiThreadExtractor(Extractor): + """Extractor for images from threads on yuki.la""" + category = "yuki" + subcategory = "thread" + directory_fmt = ["{category}", "{board}", "{thread} - {title}"] + filename_fmt = "{time}-{filename}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" + pattern = [r"(?:https?://)?yuki\.la/([^/?&#]+)/(\d+)"] + test = [ + ("https://yuki.la/gd/309639", { + "url": "289e86c5caf673a2515ec5f5f521ac0ae7e189e9", + "keyword": "01cbe29ae207a5cb7556bcbd5ed481ecdaf32727", + "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573", + }), + ("https://yuki.la/a/159767162", { + "url": "cd94d0eb646d279c3b7efb9b7898888e5d44fa93", + "keyword": "7a4ff90e423c74bd3126fb65d13015decec2fa45", + }), + ] + root = "https://yuki.la" + + def __init__(self, match): + Extractor.__init__(self) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/{}/{}".format(self.root, self.board, self.thread) + page = self.request(url).text + data = self.get_metadata(page) + + yield Message.Version, 1 + yield Message.Directory, data + for post in self.posts(page): + if "image" in post: + for key in ("w", "h", "no", "time"): + post[key] = text.parse_int(post[key]) + post.update(data) + yield Message.Url, post["image"], post + + def get_metadata(self, page): + """Collect metadata for extractor-job""" + title = text.extract(page, "", "")[0] + title, boardname, _ = title.rsplit(" - ", 2) + return { + "board": self.board, + "board_name": boardname, + "thread": text.parse_int(self.thread), + "title": text.unescape(title.partition(" - ")[2]), + } + + def posts(self, page): + """Build a list of all post-objects""" + return [ + self.parse(post) for post in text.extract_iter( + page, '
', ''), + ("time", 'data-utc="', '"'), + ("now" , '>', ' <'), + ("com" , '
'), + ))[0] + data["com"] = text.unescape(text.remove_html( + data["com"].partition(">")[2])) + return data + + @staticmethod + def _extract_image(post, data): + text.extract_all(post, ( + (None , '>File:', ''), + ("fullname", '', '<'), + ("fsize" , '(', ', '), + ("w" , '', 'x'), + ("h" , '', ')'), + ), 0, data) + filename = data["fullname"] or data["filename"] + data["filename"] = text.unescape(filename.rpartition(".")[0]) + data["image"] = "https:" + data["image"] + del data["fullname"] diff --git a/scripts/build_supportedsites.py b/scripts/build_supportedsites.py index c1fc6b0b..4f1cfea6 100755 --- a/scripts/build_supportedsites.py +++ b/scripts/build_supportedsites.py @@ -61,6 +61,7 @@ CATEGORY_MAP = { "thebarchive" : "The /b/ Archive", "worldthree" : "World Three", "xvideos" : "XVideos", + "yuki" : "yuki.la 4chan archive", } SUBCATEGORY_MAP = {