From c0714d5585b40bc6c0047569a2783ff7c53048bd Mon Sep 17 00:00:00 2001 From: enduser420 <91022934+enduser420@users.noreply.github.com> Date: Tue, 24 Oct 2023 23:05:28 +0530 Subject: [PATCH 1/3] [4archive] add 'thread' and 'board' extractors --- gallery_dl/extractor/4archive.py | 110 +++++++++++++++++++++++++++++++ gallery_dl/extractor/__init__.py | 1 + test/results/4archive.py | 62 +++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 gallery_dl/extractor/4archive.py create mode 100644 test/results/4archive.py diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py new file mode 100644 index 00000000..b04a2fda --- /dev/null +++ b/gallery_dl/extractor/4archive.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://4archive.org/""" + +from .common import Extractor, Message +from .. import text + + +class _4archiveThreadExtractor(Extractor): + """Extractor for 4archive threads""" + category = "4archive" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{no} {filename}.{extension}" + archive_fmt = "{board}_{thread}_{no}" + pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)" + root = "https://4archive.org" + example = "https://4archive.org/board/a/thread/12345/" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/board/{}/thread/{}".format( + self.root, self.board, self.thread) + page = self.request(url).text + data = self.metadata(page) + posts = self.posts(page) + + if not data["title"]: + data["title"] = posts[0]["com"][:50] + + for post in posts: + post.update(data) + post["time"] = text.parse_int(post["date"].timestamp()) + yield Message.Directory, data + if "url" in post: + yield Message.Url, post["url"], text.nameext_from_url( + post["filename"], post) + + def metadata(self, page): + return { + "board" : self.board, + "thread": text.parse_int(self.thread), + "title" : text.unescape(text.extr( + page, 'class="subject">', "")) + } + + def posts(self, page): + return [ + self.parse(post) + for post in page.split('class="postContainer')[1:] + ] + + @staticmethod + def parse(post): + extr = text.extract_from(post) + data = { + "name": extr('class="name">', ""), + "date": text.parse_datetime( + extr('class="dateTime postNum" >', "<").strip(), + "%Y-%m-%d %H:%M:%S"), + "no" : text.parse_int(extr('href="#p', '"')), + } + if 'class="file"' in post: + extr('class="fileText"', ">File: ").strip()[1:], + "size" : text.parse_bytes(extr(" (", ", ")[:-1]), + "width" : text.parse_int(extr("", "x")), + "height" : text.parse_int(extr("", "px")), + }) + extr("
", "
"))) + return data + + +class _4archiveBoardExtractor(Extractor): + """Extractor for 4archive boards""" + category = "4archive" + subcategory = "board" + pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$" + root = "https://4archive.org" + example = "https://4archive.org/board/a/" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board = match.group(1) + self.num = text.parse_int(match.group(2), 1) + + def items(self): + data = {"_extractor": _4archiveThreadExtractor} + while True: + url = "{}/board/{}/{}".format(self.root, self.board, self.num) + page = self.request(url).text + if 'class="thread"' not in page: + return + for thread in text.extract_iter(page, 'class="thread" id="t', '"'): + url = "{}/board/{}/thread/{}".format( + self.root, self.board, thread) + yield Message.Queue, url, data + self.num += 1 diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 1c1473a0..22e4fe34 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -15,6 +15,7 @@ modules = [ "35photo", "3dbooru", "4chan", + "4archive", "4chanarchives", "500px", "8chan", diff --git a/test/results/4archive.py b/test/results/4archive.py new file mode 100644 index 00000000..9b5934a7 --- /dev/null +++ b/test/results/4archive.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +gallery_dl = __import__("gallery_dl.extractor.4archive") +_4archive = getattr(gallery_dl.extractor, "4archive") +import datetime + + +__tests__ = ( +{ + "#url" : "https://4archive.org/board/u/thread/2397221", + "#category": ("", "4archive", "thread"), + "#class" : _4archive._4archiveThreadExtractor, + "#pattern" : r"https://i\.imgur\.com/\w{7}\.\w+$", + "#count" : 16, + + "board" : "u", + "com" : str, + "date" : datetime.datetime, + "name" : "Anonymous", + "no" : range(2397221, 2418158), + "thread": 2397221, + "time" : int, + "title" : "best anime", + "url" : str, + "width" : int, + "height": int, + "size" : int, +}, + +{ + "#url" : "https://4archive.org/board/jp/thread/17611798", + "#category": ("", "4archive", "thread"), + "#class" : _4archive._4archiveThreadExtractor, + "#pattern" : r"https://i\.imgur\.com/\w{7}\.\w+$", + "#count" : 85, +}, + +{ + "#url" : "https://4archive.org/board/u", + "#category": ("", "4archive", "board"), + "#class" : _4archive._4archiveBoardExtractor, + "#pattern" : _4archive._4archiveThreadExtractor.pattern, + "#board" : "u", + "#range" : "1-20", + "#count" : 20, +}, + +{ + "#url" : "https://4archive.org/board/jp/10", + "#category": ("", "4archive", "board"), + "#class" : _4archive._4archiveBoardExtractor, + "#pattern" : _4archive._4archiveThreadExtractor.pattern, + "#board" : "jp", + "#range" : "1-50", + "#count" : 50, +} + +) From acb713b95a6ddcb8553af31e86e2ae0466cda6e9 Mon Sep 17 00:00:00 2001 From: enduser420 <91022934+enduser420@users.noreply.github.com> Date: Wed, 25 Oct 2023 23:08:45 +0530 Subject: [PATCH 2/3] [4archive] update --- gallery_dl/extractor/4archive.py | 11 ++++++----- test/results/4archive.py | 2 -- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py index b04a2fda..d1983697 100644 --- a/gallery_dl/extractor/4archive.py +++ b/gallery_dl/extractor/4archive.py @@ -7,7 +7,7 @@ """Extractors for https://4archive.org/""" from .common import Extractor, Message -from .. import text +from .. import text, util class _4archiveThreadExtractor(Extractor): @@ -17,8 +17,9 @@ class _4archiveThreadExtractor(Extractor): directory_fmt = ("{category}", "{board}", "{thread} {title}") filename_fmt = "{no} {filename}.{extension}" archive_fmt = "{board}_{thread}_{no}" - pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)" root = "https://4archive.org" + referer = False + pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)" example = "https://4archive.org/board/a/thread/12345/" def __init__(self, match): @@ -37,8 +38,8 @@ class _4archiveThreadExtractor(Extractor): for post in posts: post.update(data) - post["time"] = text.parse_int(post["date"].timestamp()) - yield Message.Directory, data + post["time"] = int(util.datetime_to_timestamp(post["date"])) + yield Message.Directory, post if "url" in post: yield Message.Url, post["url"], text.nameext_from_url( post["filename"], post) @@ -87,8 +88,8 @@ class _4archiveBoardExtractor(Extractor): """Extractor for 4archive boards""" category = "4archive" subcategory = "board" - pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$" root = "https://4archive.org" + pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$" example = "https://4archive.org/board/a/" def __init__(self, match): diff --git a/test/results/4archive.py b/test/results/4archive.py index 9b5934a7..ec90b929 100644 --- a/test/results/4archive.py +++ b/test/results/4archive.py @@ -44,7 +44,6 @@ __tests__ = ( "#category": ("", "4archive", "board"), "#class" : _4archive._4archiveBoardExtractor, "#pattern" : _4archive._4archiveThreadExtractor.pattern, - "#board" : "u", "#range" : "1-20", "#count" : 20, }, @@ -54,7 +53,6 @@ __tests__ = ( "#category": ("", "4archive", "board"), "#class" : _4archive._4archiveBoardExtractor, "#pattern" : _4archive._4archiveThreadExtractor.pattern, - "#board" : "jp", "#range" : "1-50", "#count" : 50, } From d2874c77249dbdd74dc89e07f0054a23a32381fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 25 Oct 2023 20:11:14 +0200 Subject: [PATCH 3/3] [4archive] docs/supportedsites --- docs/supportedsites.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3924cd39..ecffc70e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -31,6 +31,12 @@ Consider all sites to be NSFW unless otherwise known. Pools, Popular Images, Posts, Tag Searches + + 4archive + https://4archive.org/ + Boards, Threads + + 4chan https://www.4chan.org/