diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 3a704cf4..53c88335 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -13,6 +13,12 @@ Consider all listed sites to potentially be NSFW.
 
 
 
+
+    2ch
+    https://2ch.hk/
+    Boards, Threads
+
+
 
     2chen
     https://sturdychan.help/
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
new file mode 100644
index 00000000..f841dd3c
--- /dev/null
+++ b/gallery_dl/extractor/2ch.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://2ch.hk/"""
+
+from .common import Extractor, Message
+from .. import text
+
+# Common prefix of every URL requested or emitted by this module
+BASE_URL = "https://2ch.hk"
+
+
+class _2chThreadExtractor(Extractor):
+    """Extractor for 2ch threads"""
+    category = "2ch"
+    subcategory = "thread"
+    directory_fmt = ("{category}", "{board}", "{thread} {title}")
+    filename_fmt = "{file_id} - {filename}.{extension}"
+    archive_fmt = "{board}_{thread}_{file_id}"
+    pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        # Capture groups of 'pattern': board name and thread number
+        self.board, self.thread = match.groups()
+
+    def items(self):
+        """Yield the thread's directory, then each attached file"""
+        url = f"{BASE_URL}/{self.board}/res/{self.thread}.json"
+        posts = self.request(url).json()["threads"][0]["posts"]
+        if not posts:
+            # No first post to take a title from and nothing to download
+            return
+
+        # The first post names the thread: use its subject, or its
+        # comment with HTML removed when the subject is missing or empty
+        first = posts[0]
+        title = first.get("subject") or text.remove_html(first["comment"])
+
+        thread_metadata = {
+            "board": self.board,
+            "thread": self.thread,
+            "title": text.unescape(title)[:50],
+        }
+
+        yield Message.Directory, thread_metadata
+        for post in posts:
+            for file in post.get("files") or ():
+                # Split at the LAST dot only, so that 'file_id' keeps any
+                # inner dots and stays unique for 'archive_fmt'
+                file_id, dot, extension = file["name"].rpartition(".")
+                if not dot:
+                    # No dot at all: the whole name is the ID
+                    file_id, extension = extension, ""
+
+                file_metadata = {
+                    "post_num": post["num"],
+                    "file_id": file_id,
+                    "filename": file["fullname"].rpartition(".")[0],
+                    "extension": extension,
+                }
+                file_metadata.update(thread_metadata)
+
+                # Drop leading slashes from 'path' so the joined URL
+                # never contains an empty '//' path segment
+                url = f"{BASE_URL}/{file['path'].lstrip('/')}"
+                yield Message.Url, url, file_metadata
+
+
+class _2chBoardExtractor(Extractor):
+    """Extractor for 2ch boards"""
+    category = "2ch"
+    subcategory = "board"
+    pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board = match.group(1)
+
+    def get_pages(self):
+        """Yield the JSON data of every index page of the board"""
+        url = f"{BASE_URL}/{self.board}/index.json"
+        index_page = self.request(url).json()
+        pages_total = len(index_page["pages"])
+
+        # index.json doubles as page 0; the remaining pages are
+        # numbered 1 .. pages_total-1
+        yield index_page
+        for i in range(1, pages_total):
+            url = f"{BASE_URL}/{self.board}/{i}.json"
+            yield self.request(url).json()
+
+    def get_thread_nums(self):
+        """Yield the 'thread_num' of every thread on every index page"""
+        for page in self.get_pages():
+            for thread in page["threads"]:
+                yield thread["thread_num"]
+
+    def items(self):
+        """Queue every thread of the board for _2chThreadExtractor"""
+        for thread_num in self.get_thread_nums():
+            url = f"{BASE_URL}/{self.board}/res/{thread_num}.html"
+            yield Message.Queue, url, {"_extractor": _2chThreadExtractor}
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 13d7b38b..8e712961 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -10,6 +10,7 @@ import sys
 import re
 
 modules = [
+    "2ch",
     "2chan",
     "2chen",
     "35photo",