From 6c4abc982e79b3f7b65bebbeddee01e32ec3f36d Mon Sep 17 00:00:00 2001 From: hunter-gatherer8 Date: Fri, 18 Aug 2023 00:23:22 +0300 Subject: [PATCH 1/2] [2ch] add 'thread' and 'board' extractors - [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites --- docs/supportedsites.md | 6 +++ gallery_dl/extractor/2ch.py | 84 ++++++++++++++++++++++++++++++++ gallery_dl/extractor/__init__.py | 1 + 3 files changed, 91 insertions(+) create mode 100644 gallery_dl/extractor/2ch.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3a704cf4..53c88335 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -13,6 +13,12 @@ Consider all listed sites to potentially be NSFW. + + 2ch + https://2ch.hk/ + Boards, Threads + + 2chen https://sturdychan.help/ diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py new file mode 100644 index 00000000..f841dd3c --- /dev/null +++ b/gallery_dl/extractor/2ch.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.2ch.hk/""" + +from .common import Extractor, Message +from .. import text + + +class _2chThreadExtractor(Extractor): + """Extractor for 2ch threads""" + category = "2ch" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{file_id} - {filename}.{extension}" + archive_fmt = "{board}_{thread}_{file_id}" + pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = f"https://2ch.hk/{self.board}/res/{self.thread}.json" + thread_data = self.request(url).json() + + posts = thread_data["threads"][0]["posts"] + post = posts[0] + title = post.get("subject") or text.remove_html(post["comment"]) + + thread_metadata = { + "board": self.board, + "thread": self.thread, + "title": text.unescape(title)[:50], + } + + yield Message.Directory, thread_metadata + for post in posts: + if "files" in post and post['files']: + for file in post['files']: + file_metadata = { + "post_num": post["num"], + "file_id": file["name"].split('.')[0], + "filename": ".".join(file["fullname"].split('.')[:-1]), + "extension": file["name"].split('.')[-1], + } + file_metadata.update(thread_metadata) + + url = f"https://2ch.hk/{file['path']}" + yield Message.Url, url, file_metadata + + +class _2chBoardExtractor(Extractor): + """Extractor for 2ch boards""" + category = "2ch" + subcategory = "board" + pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board = match.group(1) + + def get_pages(self): + url = f"https://2ch.hk/{self.board}/index.json" + index_page = self.request(url).json() + pages_total = len(index_page['pages']) + + yield index_page + for i in range(1, pages_total): + url = f"https://2ch.hk/{self.board}/{i}.json" + yield self.request(url).json() + + def get_thread_nums(self): + for page in self.get_pages(): + for thread in page["threads"]: + yield thread["thread_num"] + + def items(self): + for thread_num in self.get_thread_nums(): + url = f"https://2ch.hk/{self.board}/res/{thread_num}.html" + yield Message.Queue, url, {"_extractor": _2chThreadExtractor} diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 13d7b38b..8e712961 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -10,6 +10,7 @@ import sys import re modules = [ + "2ch", "2chan", "2chen", "35photo", From 68196589c42bf3fadea2437cf996293da1892176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 8 Jan 2024 02:04:34 +0100 Subject: [PATCH 2/2] [2ch] update - simplify extractor code - more metadata - add tests --- gallery_dl/extractor/2ch.py | 95 ++++++++++++++++++++----------------- test/results/2ch.py | 64 +++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 44 deletions(-) create mode 100644 test/results/2ch.py diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py index f841dd3c..dbbf21b6 100644 --- a/gallery_dl/extractor/2ch.py +++ b/gallery_dl/extractor/2ch.py @@ -4,81 +4,88 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://www.2ch.hk/""" +"""Extractors for https://2ch.hk/""" from .common import Extractor, Message -from .. import text +from .. import text, util class _2chThreadExtractor(Extractor): """Extractor for 2ch threads""" category = "2ch" subcategory = "thread" + root = "https://2ch.hk" directory_fmt = ("{category}", "{board}", "{thread} {title}") - filename_fmt = "{file_id} - {filename}.{extension}" - archive_fmt = "{board}_{thread}_{file_id}" - pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html" + filename_fmt = "{tim}{filename:? //}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" + pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)" + example = "https://2ch.hk/a/res/12345.html" def __init__(self, match): Extractor.__init__(self, match) self.board, self.thread = match.groups() def items(self): - url = f"https://2ch.hk/{self.board}/res/{self.thread}.json" - thread_data = self.request(url).json() + url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread) + posts = self.request(url).json()["threads"][0]["posts"] - posts = thread_data["threads"][0]["posts"] - post = posts[0] - title = post.get("subject") or text.remove_html(post["comment"]) + op = posts[0] + title = op.get("subject") or text.remove_html(op["comment"]) - thread_metadata = { - "board": self.board, + thread = { + "board" : self.board, "thread": self.thread, - "title": text.unescape(title)[:50], + "title" : text.unescape(title)[:50], } - yield Message.Directory, thread_metadata + yield Message.Directory, thread for post in posts: - if "files" in post and post['files']: - for file in post['files']: - file_metadata = { - "post_num": post["num"], - "file_id": file["name"].split('.')[0], - "filename": ".".join(file["fullname"].split('.')[:-1]), - "extension": file["name"].split('.')[-1], - } - file_metadata.update(thread_metadata) + files = post.get("files") + if files: + post["post_name"] = post["name"] + post["date"] = text.parse_timestamp(post["timestamp"]) + del post["files"] + del post["name"] - url = f"https://2ch.hk/{file['path']}" - yield Message.Url, url, file_metadata + for file in files: + file.update(thread) + file.update(post) + + file["filename"] = file["fullname"].rpartition(".")[0] + file["tim"], _, file["extension"] = \ + file["name"].rpartition(".") + + yield Message.Url, self.root + file["path"], file class _2chBoardExtractor(Extractor): """Extractor for 2ch boards""" category = "2ch" subcategory = "board" - pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$" + root = "https://2ch.hk" + pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$" + example = "https://2ch.hk/a/" def __init__(self, match): Extractor.__init__(self, match) self.board = match.group(1) - def get_pages(self): - url = f"https://2ch.hk/{self.board}/index.json" - index_page = self.request(url).json() - pages_total = len(index_page['pages']) - - yield index_page - for i in range(1, pages_total): - url = f"https://2ch.hk/{self.board}/{i}.json" - yield self.request(url).json() - - def get_thread_nums(self): - for page in self.get_pages(): - for thread in page["threads"]: - yield thread["thread_num"] - def items(self): - for thread_num in self.get_thread_nums(): - url = f"https://2ch.hk/{self.board}/res/{thread_num}.html" - yield Message.Queue, url, {"_extractor": _2chThreadExtractor} + # index page + url = "{}/{}/index.json".format(self.root, self.board) + index = self.request(url).json() + index["_extractor"] = _2chThreadExtractor + for thread in index["threads"]: + url = "{}/{}/res/{}.html".format( + self.root, self.board, thread["thread_num"]) + yield Message.Queue, url, index + + # pages 1..n + for n in util.advance(index["pages"], 1): + url = "{}/{}/{}.json".format(self.root, self.board, n) + page = self.request(url).json() + page["_extractor"] = _2chThreadExtractor + for thread in page["threads"]: + url = "{}/{}/res/{}.html".format( + self.root, self.board, thread["thread_num"]) + yield Message.Queue, url, page diff --git a/test/results/2ch.py b/test/results/2ch.py new file mode 100644 index 00000000..5400292c --- /dev/null +++ b/test/results/2ch.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +gallery_dl = __import__("gallery_dl.extractor.2ch") +_2ch = getattr(gallery_dl.extractor, "2ch") + + +__tests__ = ( +{ + "#url" : "https://2ch.hk/a/res/6202876.html", + "#category": ("", "2ch", "thread"), + "#class" : _2ch._2chThreadExtractor, + "#pattern" : r"https://2ch\.hk/a/src/6202876/\d+\.\w+", + "#count" : range(450, 1000), + + "banned" : 0, + "board" : "a", + "closed" : 0, + "comment" : str, + "date" : "type:datetime", + "displayname": str, + "email" : "", + "endless" : 1, + "extension": str, + "filename" : str, + "fullname" : str, + "height" : int, + "lasthit" : 1705273977, + "md5" : r"re:[0-9a-f]{32}", + "name" : r"re:\d+\.\w+", + "num" : int, + "number" : range(1, 1000), + "op" : 0, + "parent" : int, + "path" : r"re:/a/src/6202876/\d+\.\w+", + "post_name": "Аноним", + "size" : int, + "sticky" : 0, + "subject" : str, + "thread" : "6202876", + "thumbnail": str, + "tim" : r"re:\d+", + "timestamp": int, + "title" : "MP4/WEBM", + "tn_height": int, + "tn_width" : int, + "trip" : "", + "type" : int, + "views" : int, + "width" : int, +}, + +{ + "#url" : "https://2ch.hk/a/", + "#category": ("", "2ch", "board"), + "#class" : _2ch._2chBoardExtractor, + "#pattern" : _2ch._2chThreadExtractor.pattern, + "#count" : range(200, 300), +}, + +)