From 8e01cf0ef85845a9e60f6aab3be6b28ffdc5365a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 7 Jan 2019 16:59:26 +0100 Subject: [PATCH] [reactor] generalize extractors (#148) - support *.reactor.cc domains - combine joyreactor and pornreactor modules --- gallery_dl/extractor/__init__.py | 3 +- gallery_dl/extractor/pornreactor.py | 75 ------- .../extractor/{joyreactor.py => reactor.py} | 199 +++++++++++++----- gallery_dl/extractor/simplyhentai.py | 4 +- 4 files changed, 154 insertions(+), 127 deletions(-) delete mode 100644 gallery_dl/extractor/pornreactor.py rename gallery_dl/extractor/{joyreactor.py => reactor.py} (62%) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index eca704b9..48c4351d 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -47,7 +47,6 @@ modules = [ "imgur", "instagram", "jaiminisbox", - "joyreactor", "khinsider", "kireicake", "kissmanga", @@ -72,8 +71,8 @@ modules = [ "piczel", "pinterest", "pixiv", - "pornreactor", "powermanga", + "reactor", "readcomiconline", "rebeccablacktech", "reddit", diff --git a/gallery_dl/extractor/pornreactor.py b/gallery_dl/extractor/pornreactor.py deleted file mode 100644 index bbf0b8e6..00000000 --- a/gallery_dl/extractor/pornreactor.py +++ /dev/null @@ -1,75 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2018-2019 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for http://pornreactor.cc/""" - -from .joyreactor import ( - JoyreactorTagExtractor, - JoyreactorSearchExtractor, - JoyreactorUserExtractor, - JoyreactorPostExtractor, -) - - -BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor.com)" - - -class PornreactorTagExtractor(JoyreactorTagExtractor): - """Extractor for tag searches on pornreactor.cc""" - category = "pornreactor" - pattern = [BASE_PATTERN + r"/tag/([^/?&#]+)"] - test = [ - ("http://pornreactor.cc/tag/RiceGnat", { - "count": ">= 120", - }), - ("http://fapreactor.com/tag/RiceGnat", None), - ] - - -class PornreactorSearchExtractor(JoyreactorSearchExtractor): - """Extractor for search results on pornreactor.cc""" - category = "pornreactor" - pattern = [BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"] - test = [ - ("http://pornreactor.cc/search?q=ecchi+hentai", { - "range": "1-25", - "count": ">= 20", - }), - ("http://fapreactor.com/search/ecchi+hentai", None), - ] - - -class PornreactorUserExtractor(JoyreactorUserExtractor): - """Extractor for all posts of a user on pornreactor.cc""" - category = "pornreactor" - pattern = [BASE_PATTERN + r"/user/([^/?&#]+)"] - test = [ - ("http://pornreactor.cc/user/Disillusion", { - "url": "7e06f87f8dcce3fc7851b6d13aa55712ab45fb04", - "keyword": "edfefb54ea4863e3731c508ae6caeb4140be0d31", - }), - ("http://fapreactor.com/user/Disillusion", None), - ] - - -class PornreactorPostExtractor(JoyreactorPostExtractor): - """Extractor for single posts on pornreactor.cc""" - category = "pornreactor" - subcategory = "post" - pattern = [BASE_PATTERN + r"/post/(\d+)"] - test = [ - ("http://pornreactor.cc/post/863166", { - "url": "9e5f7b374605cbbd413f4f4babb9d1af6f95b843", - "keyword": "6e9e4bd4e2d4f3f2c7936340ec71f8693129f809", - "content": "3e2a09f8b5e5ed7722f51c5f423ff4c9260fb23e", - }), - ("http://fapreactor.com/post/863166", { - "url": "83ff7c87741c05bcf1de6825e2b4739afeb87ed5", - "keyword": "cf8159224fde59c1dab86677514b4aedeb533d66", - }), - ] diff --git a/gallery_dl/extractor/joyreactor.py b/gallery_dl/extractor/reactor.py similarity index 62% rename from gallery_dl/extractor/joyreactor.py rename to gallery_dl/extractor/reactor.py index 07fca456..a8dae15e 100644 --- a/gallery_dl/extractor/joyreactor.py +++ b/gallery_dl/extractor/reactor.py @@ -1,34 +1,40 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for http://joyreactor.cc/""" +"""Generic extractors for *reactor sites""" -from .common import Extractor, Message +from .common import SharedConfigExtractor, Message from .. import text +import urllib.parse import json -BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))" +BASE_PATTERN = r"(?:https?://)?([^/.]+\.reactor\.cc)" -class JoyreactorExtractor(Extractor): - """Base class for joyreactor extractors""" - category = "joyreactor" +class ReactorExtractor(SharedConfigExtractor): + """Base class for *reactor.cc extractors""" + basecategory = "reactor" directory_fmt = ["{category}"] filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}" archive_fmt = "{post_id}_{num}" def __init__(self, match): - Extractor.__init__(self) + SharedConfigExtractor.__init__(self) self.url = match.group(0) self.root = "http://" + match.group(1) self.session.headers["Referer"] = self.root + if not self.category: + # set category based on domain name + netloc = urllib.parse.urlsplit(self.root).netloc + self.category = netloc.rpartition(".")[0] + def items(self): data = self.metadata() yield Message.Version, 1 @@ -70,6 +76,7 @@ class JoyreactorExtractor(Extractor): data = json.loads(script) except ValueError: try: + # remove control characters and escape backslashes mapping = dict.fromkeys(range(32)) script = script.translate(mapping).replace("\\", "\\\\") data = json.loads(script) @@ -115,36 +122,92 @@ class JoyreactorExtractor(Extractor): } -class JoyreactorTagExtractor(JoyreactorExtractor): - """Extractor for tag searches on joyreactor.cc""" +class ReactorTagExtractor(ReactorExtractor): + """Extractor for tag searches on *reactor.cc sites""" subcategory = "tag" directory_fmt = ["{category}", "{search_tags}"] archive_fmt = "{search_tags}_{post_id}_{num}" pattern = [BASE_PATTERN + r"/tag/([^/?&#]+)"] - test = [ - ("http://joyreactor.com/tag/Cirno", { - "url": "a81382a3146da50b647c475f87427a6ca1d737df", - "keyword": "dcd3b101cae0a93fbb91281235de1410faf88455", - }), - ("http://joyreactor.cc/tag/Advent+Cirno", { - "count": ">= 17", - }), - ] + test = [("http://anime.reactor.cc/tag/Anime+Art", None)] def __init__(self, match): - JoyreactorExtractor.__init__(self, match) + ReactorExtractor.__init__(self, match) self.tag = match.group(2) def metadata(self): return {"search_tags": text.unescape(self.tag).replace("+", " ")} -class JoyreactorSearchExtractor(JoyreactorTagExtractor): - """Extractor for search results on joyreactor.cc""" +class ReactorSearchExtractor(ReactorTagExtractor): + """Extractor for search results on *reactor.cc sites""" subcategory = "search" directory_fmt = ["{category}", "search", "{search_tags}"] archive_fmt = "s_{search_tags}_{post_id}_{num}" pattern = [BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"] + test = [("http://anime.reactor.cc/search?q=Art", None)] + + +class ReactorUserExtractor(ReactorExtractor): + """Extractor for all posts of a user on *reactor.cc sites""" + subcategory = "user" + directory_fmt = ["{category}", "user", "{user}"] + pattern = [BASE_PATTERN + r"/user/([^/?&#]+)"] + test = [("http://anime.reactor.cc/user/Shuster", None)] + + def __init__(self, match): + ReactorExtractor.__init__(self, match) + self.user = match.group(2) + + def metadata(self): + return {"user": text.unescape(self.user).replace("+", " ")} + + +class ReactorPostExtractor(ReactorExtractor): + """Extractor for single posts on *reactor.cc sites""" + subcategory = "post" + pattern = [BASE_PATTERN + r"/post/(\d+)"] + test = [("http://anime.reactor.cc/post/3576250", None)] + + def __init__(self, match): + ReactorExtractor.__init__(self, match) + self.post_id = match.group(2) + + def items(self): + yield Message.Version, 1 + post = self.request(self.url).text + pos = post.find('class="uhead">') + for image in self._parse_post(post[pos:]): + if image["num"] == 1: + yield Message.Directory, image + url = image["file_url"] + yield Message.Url, url, text.nameext_from_url(url, image) + + +# -------------------------------------------------------------------- +# JoyReactor + +JR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))" + + +class JoyreactorTagExtractor(ReactorTagExtractor): + """Extractor for tag searches on joyreactor.cc""" + category = "joyreactor" + pattern = [JR_BASE_PATTERN + r"/tag/([^/?&#]+)"] + test = [ + ("http://joyreactor.com/tag/Cirno", { + "url": "a81382a3146da50b647c475f87427a6ca1d737df", + "keyword": "dcd3b101cae0a93fbb91281235de1410faf88455", + }), + ("http://joyreactor.cc/tag/Advent+Cirno", { + "count": ">= 17", + }), + ] + + +class JoyreactorSearchExtractor(ReactorSearchExtractor): + """Extractor for search results on joyreactor.cc""" + category = "joyreactor" + pattern = [JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"] test = [ ("http://joyreactor.com/search?q=Cirno+Gifs", { "count": 0, # no search results on joyreactor.com @@ -156,11 +219,10 @@ class JoyreactorSearchExtractor(JoyreactorTagExtractor): ] -class JoyreactorUserExtractor(JoyreactorExtractor): +class JoyreactorUserExtractor(ReactorUserExtractor): """Extractor for all posts of a user on joyreactor.cc""" - subcategory = "user" - directory_fmt = ["{category}", "user", "{user}"] - pattern = [BASE_PATTERN + r"/user/([^/?&#]+)"] + category = "joyreactor" + pattern = [JR_BASE_PATTERN + r"/user/([^/?&#]+)"] test = [ ("http://joyreactor.com/user/Tacoman123", { "url": "0444158f17c22f08515ad4e7abf69ad2f3a63b35", @@ -169,18 +231,11 @@ class JoyreactorUserExtractor(JoyreactorExtractor): ("http://joyreactor.cc/user/hemantic", None), ] - def __init__(self, match): - JoyreactorExtractor.__init__(self, match) - self.user = match.group(2) - - def metadata(self): - return {"user": text.unescape(self.user).replace("+", " ")} - -class JoyreactorPostExtractor(JoyreactorExtractor): +class JoyreactorPostExtractor(ReactorPostExtractor): """Extractor for single posts on joyreactor.cc""" - subcategory = "post" - pattern = [BASE_PATTERN + r"/post/(\d+)"] + category = "joyreactor" + pattern = [JR_BASE_PATTERN + r"/post/(\d+)"] test = [ ("http://joyreactor.com/post/3721876", { # single image "url": "904779f6571436f3d5adbce30c2c272f6401e14a", @@ -204,16 +259,64 @@ class JoyreactorPostExtractor(JoyreactorExtractor): }), ] - def __init__(self, match): - JoyreactorExtractor.__init__(self, match) - self.post_id = match.group(2) - def items(self): - yield Message.Version, 1 - post = self.request(self.url).text - pos = post.find('class="uhead">') - for image in self._parse_post(post[pos:]): - if image["num"] == 1: - yield Message.Directory, image - url = image["file_url"] - yield Message.Url, url, text.nameext_from_url(url, image) +# -------------------------------------------------------------------- +# PornReactor + +PR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor.com)" + + +class PornreactorTagExtractor(ReactorTagExtractor): + """Extractor for tag searches on pornreactor.cc""" + category = "pornreactor" + pattern = [PR_BASE_PATTERN + r"/tag/([^/?&#]+)"] + test = [ + ("http://pornreactor.cc/tag/RiceGnat", { + "count": ">= 120", + }), + ("http://fapreactor.com/tag/RiceGnat", None), + ] + + +class PornreactorSearchExtractor(ReactorSearchExtractor): + """Extractor for search results on pornreactor.cc""" + category = "pornreactor" + pattern = [PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"] + test = [ + ("http://pornreactor.cc/search?q=ecchi+hentai", { + "range": "1-25", + "count": ">= 20", + }), + ("http://fapreactor.com/search/ecchi+hentai", None), + ] + + +class PornreactorUserExtractor(ReactorUserExtractor): + """Extractor for all posts of a user on pornreactor.cc""" + category = "pornreactor" + pattern = [PR_BASE_PATTERN + r"/user/([^/?&#]+)"] + test = [ + ("http://pornreactor.cc/user/Disillusion", { + "url": "7e06f87f8dcce3fc7851b6d13aa55712ab45fb04", + "keyword": "edfefb54ea4863e3731c508ae6caeb4140be0d31", + }), + ("http://fapreactor.com/user/Disillusion", None), + ] + + +class PornreactorPostExtractor(ReactorPostExtractor): + """Extractor for single posts on pornreactor.cc""" + category = "pornreactor" + subcategory = "post" + pattern = [PR_BASE_PATTERN + r"/post/(\d+)"] + test = [ + ("http://pornreactor.cc/post/863166", { + "url": "9e5f7b374605cbbd413f4f4babb9d1af6f95b843", + "keyword": "6e9e4bd4e2d4f3f2c7936340ec71f8693129f809", + "content": "3e2a09f8b5e5ed7722f51c5f423ff4c9260fb23e", + }), + ("http://fapreactor.com/post/863166", { + "url": "83ff7c87741c05bcf1de6825e2b4739afeb87ed5", + "keyword": "cf8159224fde59c1dab86677514b4aedeb533d66", + }), + ] diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index 2e1d8cb4..586d75e9 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -25,8 +25,8 @@ class SimplyhentaiGalleryExtractor(ChapterExtractor): test = [ (("https://original-work.simply-hentai.com" "/amazon-no-hiyaku-amazon-elixir"), { - "url": "35f3843d0ea83e6a618df7afaebd2b03f3628db9", - "keyword": "1e22ccbe66412eab844f135ad9cd3424b8b064e8", + "url": "258289249990502c3138719cb89e995a60861e49", + "keyword": "3873c6078ce116e798fac8b7a955e3b3a4f526a6", }), ("https://www.simply-hentai.com/notfound", { "exception": exception.GalleryDLException,