[reactor] generalize extractors (#148)

- support *.reactor.cc domains
- combine joyreactor and pornreactor modules
server
Mike Fährmann 6 years ago
parent 38500ad697
commit 8e01cf0ef8
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -47,7 +47,6 @@ modules = [
"imgur", "imgur",
"instagram", "instagram",
"jaiminisbox", "jaiminisbox",
"joyreactor",
"khinsider", "khinsider",
"kireicake", "kireicake",
"kissmanga", "kissmanga",
@ -72,8 +71,8 @@ modules = [
"piczel", "piczel",
"pinterest", "pinterest",
"pixiv", "pixiv",
"pornreactor",
"powermanga", "powermanga",
"reactor",
"readcomiconline", "readcomiconline",
"rebeccablacktech", "rebeccablacktech",
"reddit", "reddit",

@ -1,75 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2018-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for http://pornreactor.cc/"""
from .joyreactor import (
JoyreactorTagExtractor,
JoyreactorSearchExtractor,
JoyreactorUserExtractor,
JoyreactorPostExtractor,
)
# Optional scheme and "www." prefix, followed by a capture group holding the
# host name.  Both dots in the host names are escaped so that only a literal
# "." matches; an unescaped "." would accept any character in that position.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor\.com)"
class PornreactorTagExtractor(JoyreactorTagExtractor):
    """Tag-search extractor for pornreactor.cc / fapreactor.com

    Defines no methods of its own; it only overrides the category,
    the URL pattern and the test URLs of JoyreactorTagExtractor.
    """
    category = "pornreactor"

    # <host>/tag/<tag>
    pattern = [BASE_PATTERN + r"/tag/([^/?&#]+)"]

    test = [
        ("http://pornreactor.cc/tag/RiceGnat", {"count": ">= 120"}),
        ("http://fapreactor.com/tag/RiceGnat", None),
    ]
class PornreactorSearchExtractor(JoyreactorSearchExtractor):
    """Search-result extractor for pornreactor.cc / fapreactor.com

    Defines no methods of its own; it only overrides the category,
    the URL pattern and the test URLs of JoyreactorSearchExtractor.
    """
    category = "pornreactor"

    # accepts both <host>/search/<query> and <host>/search?q=<query>
    pattern = [BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"]

    test = [
        (
            "http://pornreactor.cc/search?q=ecchi+hentai",
            {"range": "1-25", "count": ">= 20"},
        ),
        ("http://fapreactor.com/search/ecchi+hentai", None),
    ]
class PornreactorUserExtractor(JoyreactorUserExtractor):
    """User-posts extractor for pornreactor.cc / fapreactor.com

    Defines no methods of its own; it only overrides the category,
    the URL pattern and the test URLs of JoyreactorUserExtractor.
    """
    category = "pornreactor"

    # <host>/user/<name>
    pattern = [BASE_PATTERN + r"/user/([^/?&#]+)"]

    test = [
        (
            "http://pornreactor.cc/user/Disillusion",
            {
                "url": "7e06f87f8dcce3fc7851b6d13aa55712ab45fb04",
                "keyword": "edfefb54ea4863e3731c508ae6caeb4140be0d31",
            },
        ),
        ("http://fapreactor.com/user/Disillusion", None),
    ]
class PornreactorPostExtractor(JoyreactorPostExtractor):
    """Single-post extractor for pornreactor.cc / fapreactor.com

    Defines no methods of its own; it only overrides class attributes
    of JoyreactorPostExtractor.
    """
    category = "pornreactor"
    subcategory = "post"

    # <host>/post/<numeric id>
    pattern = [BASE_PATTERN + r"/post/(\d+)"]

    test = [
        (
            "http://pornreactor.cc/post/863166",
            {
                "url": "9e5f7b374605cbbd413f4f4babb9d1af6f95b843",
                "keyword": "6e9e4bd4e2d4f3f2c7936340ec71f8693129f809",
                "content": "3e2a09f8b5e5ed7722f51c5f423ff4c9260fb23e",
            },
        ),
        (
            "http://fapreactor.com/post/863166",
            {
                "url": "83ff7c87741c05bcf1de6825e2b4739afeb87ed5",
                "keyword": "cf8159224fde59c1dab86677514b4aedeb533d66",
            },
        ),
    ]

@ -1,34 +1,40 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2018-2019 Mike Fährmann # Copyright 2019 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
"""Extractors for http://joyreactor.cc/""" """Generic extractors for *reactor sites"""
from .common import Extractor, Message from .common import SharedConfigExtractor, Message
from .. import text from .. import text
import urllib.parse
import json import json
BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))" BASE_PATTERN = r"(?:https?://)?([^/.]+\.reactor\.cc)"
class JoyreactorExtractor(Extractor): class ReactorExtractor(SharedConfigExtractor):
"""Base class for joyreactor extractors""" """Base class for *reactor.cc extractors"""
category = "joyreactor" basecategory = "reactor"
directory_fmt = ["{category}"] directory_fmt = ["{category}"]
filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}" filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}"
archive_fmt = "{post_id}_{num}" archive_fmt = "{post_id}_{num}"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) SharedConfigExtractor.__init__(self)
self.url = match.group(0) self.url = match.group(0)
self.root = "http://" + match.group(1) self.root = "http://" + match.group(1)
self.session.headers["Referer"] = self.root self.session.headers["Referer"] = self.root
if not self.category:
# set category based on domain name
netloc = urllib.parse.urlsplit(self.root).netloc
self.category = netloc.rpartition(".")[0]
def items(self): def items(self):
data = self.metadata() data = self.metadata()
yield Message.Version, 1 yield Message.Version, 1
@ -70,6 +76,7 @@ class JoyreactorExtractor(Extractor):
data = json.loads(script) data = json.loads(script)
except ValueError: except ValueError:
try: try:
# remove control characters and escape backslashes
mapping = dict.fromkeys(range(32)) mapping = dict.fromkeys(range(32))
script = script.translate(mapping).replace("\\", "\\\\") script = script.translate(mapping).replace("\\", "\\\\")
data = json.loads(script) data = json.loads(script)
@ -115,36 +122,92 @@ class JoyreactorExtractor(Extractor):
} }
class JoyreactorTagExtractor(JoyreactorExtractor): class ReactorTagExtractor(ReactorExtractor):
"""Extractor for tag searches on joyreactor.cc""" """Extractor for tag searches on *reactor.cc sites"""
subcategory = "tag" subcategory = "tag"
directory_fmt = ["{category}", "{search_tags}"] directory_fmt = ["{category}", "{search_tags}"]
archive_fmt = "{search_tags}_{post_id}_{num}" archive_fmt = "{search_tags}_{post_id}_{num}"
pattern = [BASE_PATTERN + r"/tag/([^/?&#]+)"] pattern = [BASE_PATTERN + r"/tag/([^/?&#]+)"]
test = [ test = [("http://anime.reactor.cc/tag/Anime+Art", None)]
("http://joyreactor.com/tag/Cirno", {
"url": "a81382a3146da50b647c475f87427a6ca1d737df",
"keyword": "dcd3b101cae0a93fbb91281235de1410faf88455",
}),
("http://joyreactor.cc/tag/Advent+Cirno", {
"count": ">= 17",
}),
]
def __init__(self, match): def __init__(self, match):
JoyreactorExtractor.__init__(self, match) ReactorExtractor.__init__(self, match)
self.tag = match.group(2) self.tag = match.group(2)
def metadata(self): def metadata(self):
return {"search_tags": text.unescape(self.tag).replace("+", " ")} return {"search_tags": text.unescape(self.tag).replace("+", " ")}
class JoyreactorSearchExtractor(JoyreactorTagExtractor): class ReactorSearchExtractor(ReactorTagExtractor):
"""Extractor for search results on joyreactor.cc""" """Extractor for search results on *reactor.cc sites"""
subcategory = "search" subcategory = "search"
directory_fmt = ["{category}", "search", "{search_tags}"] directory_fmt = ["{category}", "search", "{search_tags}"]
archive_fmt = "s_{search_tags}_{post_id}_{num}" archive_fmt = "s_{search_tags}_{post_id}_{num}"
pattern = [BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"] pattern = [BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"]
test = [("http://anime.reactor.cc/search?q=Art", None)]
class ReactorUserExtractor(ReactorExtractor):
    """Extractor for everything a single user posted on *reactor.cc sites"""
    subcategory = "user"
    directory_fmt = ["{category}", "user", "{user}"]
    pattern = [BASE_PATTERN + r"/user/([^/?&#]+)"]
    test = [("http://anime.reactor.cc/user/Shuster", None)]

    def __init__(self, match):
        """Store the user name taken from the second group of 'match'"""
        ReactorExtractor.__init__(self, match)
        self.user = match.group(2)

    def metadata(self):
        """Return a dict holding the user name in human-readable form"""
        # undo escaping first, then turn "+" separators into spaces
        name = text.unescape(self.user)
        return {"user": name.replace("+", " ")}
class ReactorPostExtractor(ReactorExtractor):
    """Extractor for one individual post on *reactor.cc sites"""
    subcategory = "post"
    pattern = [BASE_PATTERN + r"/post/(\d+)"]
    test = [("http://anime.reactor.cc/post/3576250", None)]

    def __init__(self, match):
        """Store the post ID taken from the second group of 'match'"""
        ReactorExtractor.__init__(self, match)
        self.post_id = match.group(2)

    def items(self):
        """Yield the messages for every image of the requested post"""
        yield Message.Version, 1

        page = self.request(self.url).text
        # hand over the page content starting at the 'uhead' marker
        start = page.find('class="uhead">')

        for image in self._parse_post(page[start:]):
            # the directory message is emitted once, with the first image
            if image["num"] == 1:
                yield Message.Directory, image
            file_url = image["file_url"]
            yield Message.Url, file_url, text.nameext_from_url(file_url, image)
# --------------------------------------------------------------------
# JoyReactor
# Optional scheme and "www." prefix, followed by a capture group holding the
# host name: either joyreactor.cc or joyreactor.com.
JR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))"
class JoyreactorTagExtractor(ReactorTagExtractor):
    """Tag-search extractor for joyreactor.cc / joyreactor.com

    Defines no methods of its own; it only overrides the category,
    the URL pattern and the test URLs of ReactorTagExtractor.
    """
    category = "joyreactor"

    # <host>/tag/<tag>
    pattern = [JR_BASE_PATTERN + r"/tag/([^/?&#]+)"]

    test = [
        (
            "http://joyreactor.com/tag/Cirno",
            {
                "url": "a81382a3146da50b647c475f87427a6ca1d737df",
                "keyword": "dcd3b101cae0a93fbb91281235de1410faf88455",
            },
        ),
        ("http://joyreactor.cc/tag/Advent+Cirno", {"count": ">= 17"}),
    ]
class JoyreactorSearchExtractor(ReactorSearchExtractor):
"""Extractor for search results on joyreactor.cc"""
category = "joyreactor"
pattern = [JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"]
test = [ test = [
("http://joyreactor.com/search?q=Cirno+Gifs", { ("http://joyreactor.com/search?q=Cirno+Gifs", {
"count": 0, # no search results on joyreactor.com "count": 0, # no search results on joyreactor.com
@ -156,11 +219,10 @@ class JoyreactorSearchExtractor(JoyreactorTagExtractor):
] ]
class JoyreactorUserExtractor(JoyreactorExtractor): class JoyreactorUserExtractor(ReactorUserExtractor):
"""Extractor for all posts of a user on joyreactor.cc""" """Extractor for all posts of a user on joyreactor.cc"""
subcategory = "user" category = "joyreactor"
directory_fmt = ["{category}", "user", "{user}"] pattern = [JR_BASE_PATTERN + r"/user/([^/?&#]+)"]
pattern = [BASE_PATTERN + r"/user/([^/?&#]+)"]
test = [ test = [
("http://joyreactor.com/user/Tacoman123", { ("http://joyreactor.com/user/Tacoman123", {
"url": "0444158f17c22f08515ad4e7abf69ad2f3a63b35", "url": "0444158f17c22f08515ad4e7abf69ad2f3a63b35",
@ -169,18 +231,11 @@ class JoyreactorUserExtractor(JoyreactorExtractor):
("http://joyreactor.cc/user/hemantic", None), ("http://joyreactor.cc/user/hemantic", None),
] ]
def __init__(self, match):
JoyreactorExtractor.__init__(self, match)
self.user = match.group(2)
def metadata(self):
return {"user": text.unescape(self.user).replace("+", " ")}
class JoyreactorPostExtractor(JoyreactorExtractor): class JoyreactorPostExtractor(ReactorPostExtractor):
"""Extractor for single posts on joyreactor.cc""" """Extractor for single posts on joyreactor.cc"""
subcategory = "post" category = "joyreactor"
pattern = [BASE_PATTERN + r"/post/(\d+)"] pattern = [JR_BASE_PATTERN + r"/post/(\d+)"]
test = [ test = [
("http://joyreactor.com/post/3721876", { # single image ("http://joyreactor.com/post/3721876", { # single image
"url": "904779f6571436f3d5adbce30c2c272f6401e14a", "url": "904779f6571436f3d5adbce30c2c272f6401e14a",
@ -204,16 +259,64 @@ class JoyreactorPostExtractor(JoyreactorExtractor):
}), }),
] ]
def __init__(self, match):
JoyreactorExtractor.__init__(self, match)
self.post_id = match.group(2)
def items(self): # --------------------------------------------------------------------
yield Message.Version, 1 # PornReactor
post = self.request(self.url).text
pos = post.find('class="uhead">') PR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor.com)"
for image in self._parse_post(post[pos:]):
if image["num"] == 1:
yield Message.Directory, image class PornreactorTagExtractor(ReactorTagExtractor):
url = image["file_url"] """Extractor for tag searches on pornreactor.cc"""
yield Message.Url, url, text.nameext_from_url(url, image) category = "pornreactor"
pattern = [PR_BASE_PATTERN + r"/tag/([^/?&#]+)"]
test = [
("http://pornreactor.cc/tag/RiceGnat", {
"count": ">= 120",
}),
("http://fapreactor.com/tag/RiceGnat", None),
]
class PornreactorSearchExtractor(ReactorSearchExtractor):
    """Search-result extractor for pornreactor.cc / fapreactor.com

    Defines no methods of its own; it only overrides the category,
    the URL pattern and the test URLs of ReactorSearchExtractor.
    """
    category = "pornreactor"

    # accepts both <host>/search/<query> and <host>/search?q=<query>
    pattern = [PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"]

    test = [
        (
            "http://pornreactor.cc/search?q=ecchi+hentai",
            {"range": "1-25", "count": ">= 20"},
        ),
        ("http://fapreactor.com/search/ecchi+hentai", None),
    ]
class PornreactorUserExtractor(ReactorUserExtractor):
    """User-posts extractor for pornreactor.cc / fapreactor.com

    Defines no methods of its own; it only overrides the category,
    the URL pattern and the test URLs of ReactorUserExtractor.
    """
    category = "pornreactor"

    # <host>/user/<name>
    pattern = [PR_BASE_PATTERN + r"/user/([^/?&#]+)"]

    test = [
        (
            "http://pornreactor.cc/user/Disillusion",
            {
                "url": "7e06f87f8dcce3fc7851b6d13aa55712ab45fb04",
                "keyword": "edfefb54ea4863e3731c508ae6caeb4140be0d31",
            },
        ),
        ("http://fapreactor.com/user/Disillusion", None),
    ]
class PornreactorPostExtractor(ReactorPostExtractor):
    """Single-post extractor for pornreactor.cc / fapreactor.com

    Defines no methods of its own; it only overrides class attributes
    of ReactorPostExtractor.
    """
    category = "pornreactor"
    subcategory = "post"

    # <host>/post/<numeric id>
    pattern = [PR_BASE_PATTERN + r"/post/(\d+)"]

    test = [
        (
            "http://pornreactor.cc/post/863166",
            {
                "url": "9e5f7b374605cbbd413f4f4babb9d1af6f95b843",
                "keyword": "6e9e4bd4e2d4f3f2c7936340ec71f8693129f809",
                "content": "3e2a09f8b5e5ed7722f51c5f423ff4c9260fb23e",
            },
        ),
        (
            "http://fapreactor.com/post/863166",
            {
                "url": "83ff7c87741c05bcf1de6825e2b4739afeb87ed5",
                "keyword": "cf8159224fde59c1dab86677514b4aedeb533d66",
            },
        ),
    ]

@ -25,8 +25,8 @@ class SimplyhentaiGalleryExtractor(ChapterExtractor):
test = [ test = [
(("https://original-work.simply-hentai.com" (("https://original-work.simply-hentai.com"
"/amazon-no-hiyaku-amazon-elixir"), { "/amazon-no-hiyaku-amazon-elixir"), {
"url": "35f3843d0ea83e6a618df7afaebd2b03f3628db9", "url": "258289249990502c3138719cb89e995a60861e49",
"keyword": "1e22ccbe66412eab844f135ad9cd3424b8b064e8", "keyword": "3873c6078ce116e798fac8b7a955e3b3a4f526a6",
}), }),
("https://www.simply-hentai.com/notfound", { ("https://www.simply-hentai.com/notfound", {
"exception": exception.GalleryDLException, "exception": exception.GalleryDLException,

Loading…
Cancel
Save