From c0714d5585b40bc6c0047569a2783ff7c53048bd Mon Sep 17 00:00:00 2001
From: enduser420 <91022934+enduser420@users.noreply.github.com>
Date: Tue, 24 Oct 2023 23:05:28 +0530
Subject: [PATCH] [4archive] add 'thread' and 'board' extractors

---
 gallery_dl/extractor/4archive.py | 110 +++++++++++++++++++++++++++++++
 gallery_dl/extractor/__init__.py |   1 +
 test/results/4archive.py         |  62 +++++++++++++++++
 3 files changed, 173 insertions(+)
 create mode 100644 gallery_dl/extractor/4archive.py
 create mode 100644 test/results/4archive.py

diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
new file mode 100644
index 00000000..b04a2fda
--- /dev/null
+++ b/gallery_dl/extractor/4archive.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://4archive.org/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _4archiveThreadExtractor(Extractor):
+    """Extractor for 4archive threads"""
+    category = "4archive"
+    subcategory = "thread"
+    directory_fmt = ("{category}", "{board}", "{thread} {title}")
+    filename_fmt = "{no} {filename}.{extension}"
+    archive_fmt = "{board}_{thread}_{no}"
+    pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)"
+    root = "https://4archive.org"
+    example = "https://4archive.org/board/a/thread/12345/"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board, self.thread = match.groups()
+
+    def items(self):
+        url = "{}/board/{}/thread/{}".format(
+            self.root, self.board, self.thread)
+        page = self.request(url).text
+        data = self.metadata(page)
+        posts = self.posts(page)
+
+        if not data["title"]:
+            data["title"] = posts[0]["com"][:50]
+
+        for post in posts:
+            post.update(data)
+            post["time"] = text.parse_int(post["date"].timestamp())
+            yield Message.Directory, data
+            if "url" in post:
+                yield Message.Url, post["url"], text.nameext_from_url(
+                    post["filename"], post)
+
+    def metadata(self, page):
+        return {
+            "board" : self.board,
+            "thread": text.parse_int(self.thread),
+            "title" : text.unescape(text.extr(
+                page, 'class="subject">', "</span>"))
+        }
+
+    def posts(self, page):
+        return [
+            self.parse(post)
+            for post in page.split('class="postContainer')[1:]
+        ]
+
+    @staticmethod
+    def parse(post):
+        extr = text.extract_from(post)
+        data = {
+            "name": extr('class="name">', "</span>"),
+            "date": text.parse_datetime(
+                extr('class="dateTime postNum" >', "<").strip(),
+                "%Y-%m-%d %H:%M:%S"),
+            "no"  : text.parse_int(extr('href="#p', '"')),
+        }
+        if 'class="file"' in post:
+            extr('class="fileText"', ">File: <a")
+            data.update({
+                "url"     : extr('href="', '"'),
+                "filename": extr(
+                    'rel="noreferrer noopener"', "</a>").strip()[1:],
+                "size"    : text.parse_bytes(extr(" (", ", ")[:-1]),
+                "width"   : text.parse_int(extr("", "x")),
+                "height"  : text.parse_int(extr("", "px")),
+            })
+        extr("<blockquote ", "")
+        data["com"] = text.unescape(text.remove_html(
+            extr(">", "</blockquote>")))
+        return data
+
+
+class _4archiveBoardExtractor(Extractor):
+    """Extractor for 4archive boards"""
+    category = "4archive"
+    subcategory = "board"
+    pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$"
+    root = "https://4archive.org"
+    example = "https://4archive.org/board/a/"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board = match.group(1)
+        self.num = text.parse_int(match.group(2), 1)
+
+    def items(self):
+        data = {"_extractor": _4archiveThreadExtractor}
+        while True:
+            url = "{}/board/{}/{}".format(self.root, self.board, self.num)
+            page = self.request(url).text
+            if 'class="thread"' not in page:
+                return
+            for thread in text.extract_iter(page, 'class="thread" id="t', '"'):
+                url = "{}/board/{}/thread/{}".format(
+                    self.root, self.board, thread)
+                yield Message.Queue, url, data
+            self.num += 1
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 1c1473a0..22e4fe34 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -15,6 +15,7 @@ modules = [
     "35photo",
     "3dbooru",
     "4chan",
+    "4archive",
     "4chanarchives",
     "500px",
     "8chan",
diff --git a/test/results/4archive.py b/test/results/4archive.py
new file mode 100644
index 00000000..9b5934a7
--- /dev/null
+++ b/test/results/4archive.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+gallery_dl = __import__("gallery_dl.extractor.4archive")
+_4archive = getattr(gallery_dl.extractor, "4archive")
+import datetime
+
+
+__tests__ = (
+{
+    "#url"     : "https://4archive.org/board/u/thread/2397221",
+    "#category": ("", "4archive", "thread"),
+    "#class"   : _4archive._4archiveThreadExtractor,
+    "#pattern" : r"https://i\.imgur\.com/\w{7}\.\w+$",
+    "#count"   : 16,
+
+    "board" : "u",
+    "com"   : str,
+    "date"  : datetime.datetime,
+    "name"  : "Anonymous",
+    "no"    : range(2397221, 2418158),
+    "thread": 2397221,
+    "time"  : int,
+    "title" : "best anime",
+    "url"   : str,
+    "width" : int,
+    "height": int,
+    "size"  : int,
+},
+
+{
+    "#url"     : "https://4archive.org/board/jp/thread/17611798",
+    "#category": ("", "4archive", "thread"),
+    "#class"   : _4archive._4archiveThreadExtractor,
+    "#pattern" : r"https://i\.imgur\.com/\w{7}\.\w+$",
+    "#count"   : 85,
+},
+
+{
+    "#url"     : "https://4archive.org/board/u",
+    "#category": ("", "4archive", "board"),
+    "#class"   : _4archive._4archiveBoardExtractor,
+    "#pattern" : _4archive._4archiveThreadExtractor.pattern,
+    "#board"   : "u",
+    "#range"   : "1-20",
+    "#count"   : 20,
+},
+
+{
+    "#url"     : "https://4archive.org/board/jp/10",
+    "#category": ("", "4archive", "board"),
+    "#class"   : _4archive._4archiveBoardExtractor,
+    "#pattern" : _4archive._4archiveThreadExtractor.pattern,
+    "#board"   : "jp",
+    "#range"   : "1-50",
+    "#count"   : 50,
+}
+
+)