From 6c4abc982e79b3f7b65bebbeddee01e32ec3f36d Mon Sep 17 00:00:00 2001
From: hunter-gatherer8 <hunter.gatherer8@proton.me>
Date: Fri, 18 Aug 2023 00:23:22 +0300
Subject: [PATCH 1/2] [2ch] add 'thread' and 'board' extractors

- [2ch] add thread extractor
- [2ch] add board extractor
- [2ch] add new entry to supported sites
---
 docs/supportedsites.md           |  6 +++
 gallery_dl/extractor/2ch.py      | 84 ++++++++++++++++++++++++++++++++
 gallery_dl/extractor/__init__.py |  1 +
 3 files changed, 91 insertions(+)
 create mode 100644 gallery_dl/extractor/2ch.py
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 3a704cf4..53c88335 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -13,6 +13,12 @@ Consider all listed sites to potentially be NSFW.
 </tr>
 </thead>
 <tbody valign="top">
+<tr>
+    <td>2ch</td>
+    <td>https://2ch.hk/</td>
+    <td>Boards, Threads</td>
+    <td></td>
+</tr>
 <tr>
     <td>2chen</td>
     <td>https://sturdychan.help/</td>
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
new file mode 100644
index 00000000..f841dd3c
--- /dev/null
+++ b/gallery_dl/extractor/2ch.py
@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.2ch.hk/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _2chThreadExtractor(Extractor):
+    """Extractor for 2ch threads"""
+    category = "2ch"
+    subcategory = "thread"
+    directory_fmt = ("{category}", "{board}", "{thread} {title}")
+    filename_fmt = "{file_id} - {filename}.{extension}"
+    archive_fmt = "{board}_{thread}_{file_id}"
+    pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board, self.thread = match.groups()
+
+    def items(self):
+        url = f"https://2ch.hk/{self.board}/res/{self.thread}.json"
+        thread_data = self.request(url).json()
+
+        posts = thread_data["threads"][0]["posts"]
+        post = posts[0]
+        title = post.get("subject") or text.remove_html(post["comment"])
+
+        thread_metadata = {
+            "board": self.board,
+            "thread": self.thread,
+            "title": text.unescape(title)[:50],
+        }
+
+        yield Message.Directory, thread_metadata
+        for post in posts:
+            if "files" in post and post['files']:
+                for file in post['files']:
+                    file_metadata = {
+                        "post_num": post["num"],
+                        "file_id": file["name"].split('.')[0],
+                        "filename": ".".join(file["fullname"].split('.')[:-1]),
+                        "extension": file["name"].split('.')[-1],
+                    }
+                    file_metadata.update(thread_metadata)
+
+                    url = f"https://2ch.hk/{file['path']}"
+                    yield Message.Url, url, file_metadata
+
+
+class _2chBoardExtractor(Extractor):
+    """Extractor for 2ch boards"""
+    category = "2ch"
+    subcategory = "board"
+    pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board = match.group(1)
+
+    def get_pages(self):
+        url = f"https://2ch.hk/{self.board}/index.json"
+        index_page = self.request(url).json()
+        pages_total = len(index_page['pages'])
+
+        yield index_page
+        for i in range(1, pages_total):
+            url = f"https://2ch.hk/{self.board}/{i}.json"
+            yield self.request(url).json()
+
+    def get_thread_nums(self):
+        for page in self.get_pages():
+            for thread in page["threads"]:
+                yield thread["thread_num"]
+
+    def items(self):
+        for thread_num in self.get_thread_nums():
+            url = f"https://2ch.hk/{self.board}/res/{thread_num}.html"
+            yield Message.Queue, url, {"_extractor": _2chThreadExtractor}
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 13d7b38b..8e712961 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -10,6 +10,7 @@ import sys
 import re
 
 modules = [
+    "2ch",
     "2chan",
     "2chen",
     "35photo",

From 68196589c42bf3fadea2437cf996293da1892176 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Mon, 8 Jan 2024 02:04:34 +0100
Subject: [PATCH 2/2] [2ch] update

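- simplify extractor code (board pages are walked inline in items())
- more metadata: expose all post and file fields; add 'tim', 'date',
  and 'post_name'
- use 'tim' instead of 'file_id' in filename_fmt and archive_fmt
- match thread URLs without '.html' and board names beyond [a-z]
- add tests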
---
 gallery_dl/extractor/2ch.py | 95 ++++++++++++++++++++-----------------
 test/results/2ch.py         | 64 +++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 44 deletions(-)
 create mode 100644 test/results/2ch.py

diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
index f841dd3c..dbbf21b6 100644
--- a/gallery_dl/extractor/2ch.py
+++ b/gallery_dl/extractor/2ch.py
@@ -4,81 +4,88 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://www.2ch.hk/"""
+"""Extractors for https://2ch.hk/"""
 
 from .common import Extractor, Message
-from .. import text
+from .. import text, util
 
 
 class _2chThreadExtractor(Extractor):
     """Extractor for 2ch threads"""
     category = "2ch"
     subcategory = "thread"
+    root = "https://2ch.hk"
     directory_fmt = ("{category}", "{board}", "{thread} {title}")
-    filename_fmt = "{file_id} - {filename}.{extension}"
-    archive_fmt = "{board}_{thread}_{file_id}"
-    pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html"
+    filename_fmt = "{tim}{filename:? //}.{extension}"
+    archive_fmt = "{board}_{thread}_{tim}"
+    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
+    example = "https://2ch.hk/a/res/12345.html"
 
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.board, self.thread = match.groups()
 
     def items(self):
-        url = f"https://2ch.hk/{self.board}/res/{self.thread}.json"
-        thread_data = self.request(url).json()
+        url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
+        posts = self.request(url).json()["threads"][0]["posts"]
 
-        posts = thread_data["threads"][0]["posts"]
-        post = posts[0]
-        title = post.get("subject") or text.remove_html(post["comment"])
+        op = posts[0]
+        title = op.get("subject") or text.remove_html(op["comment"])
 
-        thread_metadata = {
-            "board": self.board,
+        thread = {
+            "board" : self.board,
             "thread": self.thread,
-            "title": text.unescape(title)[:50],
+            "title" : text.unescape(title)[:50],
         }
 
-        yield Message.Directory, thread_metadata
+        yield Message.Directory, thread
         for post in posts:
-            if "files" in post and post['files']:
-                for file in post['files']:
-                    file_metadata = {
-                        "post_num": post["num"],
-                        "file_id": file["name"].split('.')[0],
-                        "filename": ".".join(file["fullname"].split('.')[:-1]),
-                        "extension": file["name"].split('.')[-1],
-                    }
-                    file_metadata.update(thread_metadata)
+            files = post.get("files")
+            if files:
+                post["post_name"] = post["name"]
+                post["date"] = text.parse_timestamp(post["timestamp"])
+                del post["files"]
+                del post["name"]
 
-                    url = f"https://2ch.hk/{file['path']}"
-                    yield Message.Url, url, file_metadata
+                for file in files:
+                    file.update(thread)
+                    file.update(post)
+
+                    file["filename"] = file["fullname"].rpartition(".")[0]
+                    file["tim"], _, file["extension"] = \
+                        file["name"].rpartition(".")
+
+                    yield Message.Url, self.root + file["path"], file
 
 
 class _2chBoardExtractor(Extractor):
     """Extractor for 2ch boards"""
     category = "2ch"
     subcategory = "board"
-    pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$"
+    root = "https://2ch.hk"
+    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
+    example = "https://2ch.hk/a/"
 
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.board = match.group(1)
 
-    def get_pages(self):
-        url = f"https://2ch.hk/{self.board}/index.json"
-        index_page = self.request(url).json()
-        pages_total = len(index_page['pages'])
-
-        yield index_page
-        for i in range(1, pages_total):
-            url = f"https://2ch.hk/{self.board}/{i}.json"
-            yield self.request(url).json()
-
-    def get_thread_nums(self):
-        for page in self.get_pages():
-            for thread in page["threads"]:
-                yield thread["thread_num"]
-
     def items(self):
-        for thread_num in self.get_thread_nums():
-            url = f"https://2ch.hk/{self.board}/res/{thread_num}.html"
-            yield Message.Queue, url, {"_extractor": _2chThreadExtractor}
+        # index page
+        url = "{}/{}/index.json".format(self.root, self.board)
+        index = self.request(url).json()
+        index["_extractor"] = _2chThreadExtractor
+        for thread in index["threads"]:
+            url = "{}/{}/res/{}.html".format(
+                self.root, self.board, thread["thread_num"])
+            yield Message.Queue, url, index
+
+        # pages 1..n
+        for n in util.advance(index["pages"], 1):
+            url = "{}/{}/{}.json".format(self.root, self.board, n)
+            page = self.request(url).json()
+            page["_extractor"] = _2chThreadExtractor
+            for thread in page["threads"]:
+                url = "{}/{}/res/{}.html".format(
+                    self.root, self.board, thread["thread_num"])
+                yield Message.Queue, url, page
diff --git a/test/results/2ch.py b/test/results/2ch.py
new file mode 100644
index 00000000..5400292c
--- /dev/null
+++ b/test/results/2ch.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+gallery_dl = __import__("gallery_dl.extractor.2ch")
+_2ch = getattr(gallery_dl.extractor, "2ch")
+
+
+__tests__ = (
+{
+    "#url"     : "https://2ch.hk/a/res/6202876.html",
+    "#category": ("", "2ch", "thread"),
+    "#class"   : _2ch._2chThreadExtractor,
+    "#pattern" : r"https://2ch\.hk/a/src/6202876/\d+\.\w+",
+    "#count"   : range(450, 1000),
+
+    "banned"   : 0,
+    "board"    : "a",
+    "closed"   : 0,
+    "comment"  : str,
+    "date"     : "type:datetime",
+    "displayname": str,
+    "email"    : "",
+    "endless"  : 1,
+    "extension": str,
+    "filename" : str,
+    "fullname" : str,
+    "height"   : int,
+    "lasthit"  : int,  # not pinned; presumably changes with new posts
+    "md5"      : r"re:[0-9a-f]{32}",
+    "name"     : r"re:\d+\.\w+",
+    "num"      : int,
+    "number"   : range(1, 1000),
+    "op"       : 0,
+    "parent"   : int,
+    "path"     : r"re:/a/src/6202876/\d+\.\w+",
+    "post_name": "Аноним",
+    "size"     : int,
+    "sticky"   : 0,
+    "subject"  : str,
+    "thread"   : "6202876",
+    "thumbnail": str,
+    "tim"      : r"re:\d+",
+    "timestamp": int,
+    "title"    : "MP4/WEBM",
+    "tn_height": int,
+    "tn_width" : int,
+    "trip"     : "",
+    "type"     : int,
+    "views"    : int,
+    "width"    : int,
+},
+
+{
+    "#url"     : "https://2ch.hk/a/",
+    "#category": ("", "2ch", "board"),
+    "#class"   : _2ch._2chBoardExtractor,
+    "#pattern" : _2ch._2chThreadExtractor.pattern,
+    "#count"   : range(200, 300),
+},
+
+)