[2ch] add 'thread' and 'board' extractors

- [2ch] add thread extractor - [2ch] add board extractor - [2ch] add new entry to supported sites
1 year ago · 6c4abc982e
parent 69726fc82c
commit 6c4abc982e
3 changed files with 91 additions and 0 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@ -13,6 +13,12 @@ Consider all listed sites to potentially be NSFW.
 </tr>
 </thead>
 <tbody valign="top">
 <tr>
    <td>2ch</td>
    <td>https://2ch.hk/</td>
    <td>Boards, Threads</td>
    <td></td>
 </tr>
 <tr>
    <td>2chen</td>
    <td>https://sturdychan.help/</td>
--- a/gallery_dl/extractor/2ch.py
+++ b/gallery_dl/extractor/2ch.py
@ -0,0 +1,84 @@
 # -*- coding: utf-8 -*-
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Extractors for https://www.2ch.hk/"""
 from .common import Extractor, Message
 from .. import text
 class _2chThreadExtractor(Extractor):
    """Extractor for 2ch threads"""
    category = "2ch"
    subcategory = "thread"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{file_id} - {filename}.{extension}"
    archive_fmt = "{board}_{thread}_{file_id}"
    pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html"
    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()
    def items(self):
        url = f"https://2ch.hk/{self.board}/res/{self.thread}.json"
        thread_data = self.request(url).json()
        posts = thread_data["threads"][0]["posts"]
        post = posts[0]
        title = post.get("subject") or text.remove_html(post["comment"])
        thread_metadata = {
            "board": self.board,
            "thread": self.thread,
            "title": text.unescape(title)[:50],
        }
        yield Message.Directory, thread_metadata
        for post in posts:
            if "files" in post and post['files']:
                for file in post['files']:
                    file_metadata = {
                        "post_num": post["num"],
                        "file_id": file["name"].split('.')[0],
                        "filename": ".".join(file["fullname"].split('.')[:-1]),
                        "extension": file["name"].split('.')[-1],
                    }
                    file_metadata.update(thread_metadata)
                    url = f"https://2ch.hk/{file['path']}"
                    yield Message.Url, url, file_metadata
 class _2chBoardExtractor(Extractor):
    """Extractor for 2ch boards"""
    category = "2ch"
    subcategory = "board"
    pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$"
    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board = match.group(1)
    def get_pages(self):
        url = f"https://2ch.hk/{self.board}/index.json"
        index_page = self.request(url).json()
        pages_total = len(index_page['pages'])
        yield index_page
        for i in range(1, pages_total):
            url = f"https://2ch.hk/{self.board}/{i}.json"
            yield self.request(url).json()
    def get_thread_nums(self):
        for page in self.get_pages():
            for thread in page["threads"]:
                yield thread["thread_num"]
    def items(self):
        for thread_num in self.get_thread_nums():
            url = f"https://2ch.hk/{self.board}/res/{thread_num}.html"
            yield Message.Queue, url, {"_extractor": _2chThreadExtractor}
--- a/gallery_dl/extractor/init.py
+++ b/gallery_dl/extractor/init.py
@ -10,6 +10,7 @@ import sys
 import re
 modules = [
    "2ch",
    "2chan",
    "2chen",
    "35photo",