[2ch] add 'thread' and 'board' extractors

- [2ch] add thread extractor
- [2ch] add board extractor
- [2ch] add new entry to supported sites
pull/4444/head
hunter-gatherer8 1 year ago committed by Mike Fährmann
parent 69726fc82c
commit 6c4abc982e
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -13,6 +13,12 @@ Consider all listed sites to potentially be NSFW.
</tr>
</thead>
<tbody valign="top">
<tr>
<td>2ch</td>
<td>https://2ch.hk/</td>
<td>Boards, Threads</td>
<td></td>
</tr>
<tr>
<td>2chen</td>
<td>https://sturdychan.help/</td>

@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://www.2ch.hk/"""
from .common import Extractor, Message
from .. import text
class _2chThreadExtractor(Extractor):
    """Extractor for 2ch threads

    Requests the JSON document of a single thread and yields one URL
    message for every entry in the 'files' list of each of its posts.
    """
    category = "2ch"
    subcategory = "thread"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{file_id} - {filename}.{extension}"
    archive_fmt = "{board}_{thread}_{file_id}"
    pattern = r"(?:https?://)?2ch\.hk/([^/]+)/res/(\d+)\.html"

    def __init__(self, match):
        """Store the board and thread number captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()

    def items(self):
        """Yield a Directory message followed by one Url message per file

        Every Url message carries the file's own metadata ('post_num',
        'file_id', 'filename', 'extension') merged with the thread-level
        metadata ('board', 'thread', 'title').
        """
        url = f"https://2ch.hk/{self.board}/res/{self.thread}.json"
        posts = self.request(url).json()["threads"][0]["posts"]

        # The first post provides the title: its subject if it has one,
        # otherwise its comment with the HTML markup removed.
        first = posts[0]
        title = first.get("subject") or text.remove_html(first["comment"])

        thread_metadata = {
            "board": self.board,
            "thread": self.thread,
            # limited to 50 characters; it becomes part of the directory
            "title": text.unescape(title)[:50],
        }
        yield Message.Directory, thread_metadata

        for post in posts:
            # a missing or empty 'files' entry means nothing to download
            for file in post.get("files") or ():
                name = file["name"]
                file_metadata = {
                    "post_num": post["num"],
                    # part of 'name' before its first dot
                    "file_id": name.partition(".")[0],
                    # 'fullname' without its last dot-separated component
                    "filename": file["fullname"].rpartition(".")[0],
                    # part of 'name' after its last dot
                    "extension": name.rpartition(".")[2],
                }
                file_metadata.update(thread_metadata)

                # Drop leading slashes from 'path' so that joining it to
                # the site root never produces 'https://2ch.hk//...'.
                url = f"https://2ch.hk/{file['path'].lstrip('/')}"
                yield Message.Url, url, file_metadata
class _2chBoardExtractor(Extractor):
    """Extractor for 2ch boards

    Walks every index page of a board and queues each listed thread
    for _2chThreadExtractor.
    """
    category = "2ch"
    subcategory = "board"
    pattern = r"(?:https?://)?2ch\.hk/([a-z]+)/?$"

    def __init__(self, match):
        """Store the board name captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.board = match.group(1)

    def get_pages(self):
        """Yield the JSON data of each index page of the board

        The first page comes from 'index.json'; the length of its
        'pages' list decides how many numbered pages are requested.
        """
        url = f"https://2ch.hk/{self.board}/index.json"
        first_page = self.request(url).json()
        # the count is read before the first page is handed out
        page_count = len(first_page["pages"])
        yield first_page

        # numbered pages start at 1
        i = 1
        while i < page_count:
            url = f"https://2ch.hk/{self.board}/{i}.json"
            yield self.request(url).json()
            i += 1

    def get_thread_nums(self):
        """Yield the 'thread_num' of every thread on every index page"""
        yield from (
            entry["thread_num"]
            for page_data in self.get_pages()
            for entry in page_data["threads"]
        )

    def items(self):
        """Yield a Queue message for each thread of the board"""
        for thread_num in self.get_thread_nums():
            url = f"https://2ch.hk/{self.board}/res/{thread_num}.html"
            # a fresh metadata dict per message, as consumers may alter it
            data = {"_extractor": _2chThreadExtractor}
            yield Message.Queue, url, data

@ -10,6 +10,7 @@ import sys
import re
modules = [
"2ch",
"2chan",
"2chen",
"35photo",

Loading…
Cancel
Save