From 13ce3a9acb2b8bc66fb8a1f51d4b40a6d2f084fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 13 Oct 2023 23:03:39 +0200 Subject: [PATCH] [warosu] fix extraction (#4634) --- gallery_dl/extractor/warosu.py | 34 +++++++++---------- test/results/warosu.py | 62 +++++++++++++++++++++++++++++++--- 2 files changed, 74 insertions(+), 22 deletions(-) diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 6f152ed7..8e6b842a 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -47,7 +47,7 @@ class WarosuThreadExtractor(Extractor): def metadata(self, page): boardname = text.extr(page, "", "") - title = text.extr(page, 'filetitle" itemprop="name">', '<') + title = text.unescape(text.extr(page, "class=filetitle>", "<")) return { "board" : self.board, "board_name": boardname.rpartition(" - ")[2], @@ -57,39 +57,37 @@ class WarosuThreadExtractor(Extractor): def posts(self, page): """Build a list of all post objects""" - page = text.extr(page, '
', '') - needle = '
' + page = text.extr(page, "
") + needle = "
" return [self.parse(post) for post in page.split(needle)] def parse(self, post): """Build post object by extracting data from an HTML post""" data = self._extract_post(post) - if "File:" in post: + if " File:" in post: self._extract_image(post, data) part = data["image"].rpartition("/")[2] data["tim"], _, data["extension"] = part.partition(".") data["ext"] = "." + data["extension"] return data - @staticmethod - def _extract_post(post): + def _extract_post(self, post): extr = text.extract_from(post) return { - "no" : extr('id="p', '"'), - "name": extr('', ""), - "time": extr(''), - "now" : extr("", "<"), + "no" : extr("id=p", ">"), + "name": extr("class=postername>", "<").strip(), + "time": extr("class=posttime title=", "000>"), + "now" : extr("", "<").strip(), "com" : text.unescape(text.remove_html(extr( - '

', '

' - ).strip())), + "
", "
").strip())), } - @staticmethod - def _extract_image(post, data): + def _extract_image(self, post, data): extr = text.extract_from(post) - data["fsize"] = extr("File: ", ", ") + data["fsize"] = extr(" File: ", ", ") data["w"] = extr("", "x") data["h"] = extr("", ", ") - data["filename"] = text.unquote(extr("", "<").rpartition(".")[0]) - extr("
", "") - data["image"] = "https:" + extr('", "") + data["image"] = self.root + extr("") diff --git a/test/results/warosu.py b/test/results/warosu.py index c9273de7..e476b508 100644 --- a/test/results/warosu.py +++ b/test/results/warosu.py @@ -12,17 +12,71 @@ __tests__ = ( "#url" : "https://warosu.org/jp/thread/16656025", "#category": ("", "warosu", "thread"), "#class" : warosu.WarosuThreadExtractor, - "#sha1_url" : "889d57246ed67e491e5b8f7f124e50ea7991e770", - "#sha1_metadata": "c00ea4c5460c5986994f17bb8416826d42ca57c0", + "#urls" : ( + "https://warosu.org/data/jp/img/0166/56/1488487280004.png", + "https://warosu.org/data/jp/img/0166/56/1488493239417.png", + "https://warosu.org/data/jp/img/0166/56/1488493636725.jpg", + "https://warosu.org/data/jp/img/0166/56/1488493700040.jpg", + "https://warosu.org/data/jp/img/0166/56/1488499585168.jpg", + "https://warosu.org/data/jp/img/0166/56/1488530851199.jpg", + "https://warosu.org/data/jp/img/0166/56/1488536072155.jpg", + "https://warosu.org/data/jp/img/0166/56/1488603426484.png", + "https://warosu.org/data/jp/img/0166/56/1488647021253.jpg", + "https://warosu.org/data/jp/img/0166/56/1488866825031.jpg", + "https://warosu.org/data/jp/img/0166/56/1489094956868.jpg", + ), }, { "#url" : "https://warosu.org/jp/thread/16658073", "#category": ("", "warosu", "thread"), "#class" : warosu.WarosuThreadExtractor, - "#sha1_url" : "4500cf3184b067424fd9883249bd543c905fbecd", - "#sha1_metadata": "7534edf4ec51891dbf44d775b73fbbefd52eec71", "#sha1_content" : "d48df0a701e6599312bfff8674f4aa5d4fb8db1c", + "#urls" : "https://warosu.org/data/jp/img/0166/58/1488521824388.jpg", + "#count" : 1, + + "board" : "jp", + "board_name": "Otaku Culture", + "com" : "Is this canon?", + "ext" : ".jpg", + "extension" : "jpg", + "filename" : "sadako-vs-kayako-movie-review", + "fsize" : "55 KB", + "h" : 675, + "image" : "https://warosu.org/data/jp/img/0166/58/1488521824388.jpg", + "name" : "Anonymous", + "no" : 16658073, + "now" : "Fri Mar 3 01:17:04 2017", + "thread" : "16658073", + "tim" : 1488521824388, + "time" : 1488503824, + "title" : "Is this canon?", + "w" : 450, +}, + +{ + "#url" : "https://warosu.org/ic/thread/4604652", + "#category": ("", "warosu", "thread"), + "#class" : warosu.WarosuThreadExtractor, + "#pattern" : r"https://warosu\.org/data/ic/img/0046/04/1590\d{9}\.jpg", + "#count" : 133, + + "board" : "ic", + "board_name": "Artwork/Critique", + "com" : str, + "ext" : ".jpg", + "filename" : str, + "fsize" : str, + "h" : range(200, 3507), + "image" : r"re:https://warosu\.org/data/ic/img/0046/04/1590\d+\.jpg", + "name" : "re:Anonymous|Dhe Specky Spider-Man", + "no" : range(4604652, 4620000), + "now" : r"re:\w\w\w \w\w\w \d\d \d\d:\d\d:\d\d 2020", + "thread" : "4604652", + "tim" : range(1590430159651, 1590755510488), + "time" : range(1590415759, 1590755510), + "title" : "American Classic Comic Artists", + "w" : range(200, 3000), }, )