provide type information for Queue messages

Child extractors are now constructed directly with Extractor.from_url()
when the extractor class is known beforehand (it is passed along in the
"_extractor" key of the Queue message metadata), instead of calling
extractor.find() and searching through all possible extractor classes.
pull/170/head
Mike Fährmann 6 years ago
parent 2e516a1e3e
commit 61741d7333
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -21,6 +21,7 @@ class BehanceExtractor(Extractor):
def items(self):
yield Message.Version, 1
for gallery in self.galleries():
gallery["_extractor"] = BehanceGalleryExtractor
yield Message.Queue, gallery["url"], self._update(gallery)
def galleries(self):

@ -99,6 +99,7 @@ class BobxIdolExtractor(BobxExtractor):
def items(self):
url = "{}/{}/".format(self.root, self.path)
data = {"_extractor": BobxGalleryExtractor}
page = self.request(url).text
skip = True
@ -108,4 +109,4 @@ class BobxIdolExtractor(BobxExtractor):
skip = not skip
if skip:
continue
yield Message.Queue, "{}photoset/{}".format(url, part), {}
yield Message.Queue, "{}photoset/{}".format(url, part), data

@ -377,10 +377,11 @@ class DeviantartStashExtractor(DeviantartExtractor):
if deviation_id:
yield self.api.deviation(deviation_id)
else:
data = {"_extractor": DeviantartStashExtractor}
page = text.extract(
page, '<div id="stash-body"', '<div class="footer"')[0]
for url in text.extract_iter(page, '<a href="', '"'):
yield url, {}
yield url, data
class DeviantartFavoriteExtractor(DeviantartExtractor):

@ -338,7 +338,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
def __init__(self, match):
ExhentaiExtractor.__init__(self, match)
self.params = text.parse_query(match.group(1) or "")
self.params = text.parse_query(match.group(2) or "")
self.params["page"] = text.parse_int(self.params.get("page"))
self.search_url = self.root
@ -376,6 +376,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
"gallery_id": text.parse_int(parts[1]),
"gallery_token": parts[2],
"title": text.unescape(title),
"_extractor": ExhentaiGalleryExtractor,
key: last,
}

@ -119,7 +119,7 @@ class FlickrAlbumExtractor(FlickrExtractor):
}),
("https://www.flickr.com/photos/shona_s/albums", {
"url": "657d541470482e0d69deec33ab97a6d7d4af6fe4",
"keyword": "736a41a7d702f7fe00edc957ae201d84f745e654",
"keyword": "ef654bfbc4ce7b74ad74e7d772e5466285ffc581",
}),
)
@ -135,6 +135,7 @@ class FlickrAlbumExtractor(FlickrExtractor):
def _album_items(self):
yield Message.Version, 1
data = FlickrExtractor.data(self)
data["_extractor"] = FlickrAlbumExtractor
for albums in self.api.photosets_getList(self.user["nsid"]):
for album in albums["photoset"]:

@ -112,6 +112,7 @@ class HentaifoxSearchExtractor(Extractor):
"thumbnail": text.urljoin(self.root, thumb),
"title": text.unescape(title),
"tags": tags.split(),
"_extractor": HentaifoxGalleryExtractor,
}
pos = page.find('class="current"', gpos)

@ -154,7 +154,11 @@ class ImagefapUserExtractor(ImagefapExtractor):
yield Message.Version, 1
for gid, name in self.get_gallery_data():
url = "{}/gallery/{}".format(self.root, gid)
data = {"gallery_id": text.parse_int(gid), "title": name}
data = {
"gallery_id": text.parse_int(gid),
"title": text.unescape(name),
"_extractor": ImagefapGalleryExtractor,
}
yield Message.Queue, url, data
def get_gallery_data(self):

@ -199,4 +199,5 @@ class LusciousSearchExtractor(LusciousExtractor):
"count": text.parse_int(count),
"date": date,
"tags": text.remove_html(tags.partition(">")[2]),
"_extractor": LusciousAlbumExtractor,
}

@ -173,6 +173,7 @@ class MangadexMangaExtractor(MangadexExtractor):
"date": info["timestamp"],
"lang": lang,
"language": util.code_to_language(lang),
"_extractor": MangadexChapterExtractor,
})
results.sort(key=lambda x: (x["chapter"], x["chapter_minor"]))

@ -106,8 +106,9 @@ class MyportfolioUserExtractor(Extractor):
url = "https://" + self.domain
page = self.request(url).text
main = text.extract(page, "<main>", "</main>")[0]
data = {"_extractor": MyportfolioGalleryExtractor}
yield Message.Version, 1
for path in text.extract_iter(main, ' href="', '"'):
if path and path[0] == "/":
yield Message.Queue, self.prefix + url + path, {}
yield Message.Queue, self.prefix + url + path, data

@ -91,9 +91,10 @@ class NhentaiSearchExtractor(NHentaiExtractor):
def items(self):
yield Message.Version, 1
data = {"_extractor": NhentaiGalleryExtractor}
for gid in self._pagination(self.params):
url = "{}/g/{}/".format(self.root, gid)
yield Message.Queue, url, {}
yield Message.Queue, url, data
def _pagination(self, params):
url = "{}/search/".format(self.root)

@ -64,6 +64,7 @@ class PhotobucketAlbumExtractor(Extractor):
if self.config("subalbums", True):
for album in self.subalbums():
album["_extractor"] = PhotobucketAlbumExtractor
yield Message.Queue, album["url"], album
def images(self):

@ -135,7 +135,7 @@ class SmugmugPathExtractor(SmugmugExtractor):
"pattern": "smugmug:album:ddvxpg$",
}),
("https://acapella.smugmug.com/", {
"pattern": r"smugmug:album:\w+$",
"pattern": SmugmugAlbumExtractor.pattern,
"url": "797eb1cbbf5ad8ecac8ee4eedc6466ed77a65d68",
}),
# gallery node without owner
@ -178,11 +178,13 @@ class SmugmugPathExtractor(SmugmugExtractor):
for node in nodes:
album_id = node["Uris"]["Album"].rpartition("/")[2]
node["_extractor"] = SmugmugAlbumExtractor
yield Message.Queue, "smugmug:album:" + album_id, node
else:
for album in self.api.user_albums(self.user):
uri = "smugmug:album:" + album["AlbumKey"]
album["_extractor"] = SmugmugAlbumExtractor
yield Message.Queue, uri, album
def album_nodes(self, root):

@ -162,6 +162,7 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
yield Message.Version, 1
for gallery in self.galleries():
url = "{}/Book/Info/{}".format(self.root, gallery["Id"])
gallery["_extractor"] = TsuminoGalleryExtractor
yield Message.Queue, url, gallery
def galleries(self):

@ -99,7 +99,7 @@ class XvideosUserExtractor(XvideosExtractor):
test = (
("https://www.xvideos.com/profiles/pervertedcouple", {
"url": "a413f3e60d6d3a2de79bd44fa3b7a9c03db4336e",
"keyword": "a796760d34732adc7ec52a8feb057515209a2ca6",
"keyword": "ef941489354fd8f4754c8a87cffd5e2429a6387c",
}),
("https://www.xvideos.com/profiles/niwehrwhernvh", {
"exception": exception.NotFoundError,
@ -123,9 +123,12 @@ class XvideosUserExtractor(XvideosExtractor):
del data["galleries"]["0"]
galleries = [
{"gallery_id": text.parse_int(gid),
"title": text.unescape(gdata["title"]),
"count": gdata["nb_pics"]}
{
"gallery_id": text.parse_int(gid),
"title": text.unescape(gdata["title"]),
"count": gdata["nb_pics"],
"_extractor": XvideosGalleryExtractor,
}
for gid, gdata in data["galleries"].items()
]
galleries.sort(key=lambda x: x["gallery_id"])

@ -20,13 +20,14 @@ class Job():
"""Base class for Job-types"""
ulog = None
def __init__(self, url, parent=None):
self.url = url
self.extractor = extractor.find(url)
if self.extractor is None:
raise exception.NoExtractorError(url)
def __init__(self, extr, parent=None):
if isinstance(extr, str):
extr = extractor.find(extr)
if not extr:
raise exception.NoExtractorError()
self.extractor = extr
self.extractor.log.debug(
"Using %s for '%s'", self.extractor.__class__.__name__, url)
"Using %s for '%s'", extr.__class__.__name__, extr.url)
# url predicates
self.pred_url = self._prepare_predicates(
@ -56,10 +57,10 @@ class Job():
log.error("Authentication failed: %s", msg)
except exception.AuthorizationError:
log.error("You do not have permission to access the resource "
"at '%s'", self.url)
"at '%s'", self.extractor.url)
except exception.NotFoundError as exc:
res = str(exc) or "resource (gallery/image/user)"
log.error("The %s at '%s' does not exist", res, self.url)
log.error("The %s at '%s' does not exist", res, self.extractor.url)
except exception.HttpError as exc:
err = exc.args[0]
if isinstance(err, Exception):
@ -243,9 +244,13 @@ class DownloadJob(Job):
self.pathfmt.set_directory(keywords)
def handle_queue(self, url, keywords):
try:
self.__class__(url, self).run()
except exception.NoExtractorError:
if "_extractor" in keywords:
extr = keywords["_extractor"].from_url(url)
else:
extr = extractor.find(url)
if extr:
self.__class__(extr, self).run()
else:
self._write_unsupported(url)
def handle_finalize(self):
@ -389,6 +394,8 @@ class KeywordJob(Job):
"""Print key-value pairs with formatting"""
suffix = "]" if prefix else ""
for key, value in sorted(keywords.items()):
if key[0] == "_":
continue
key = prefix + key + suffix
if isinstance(value, dict):
@ -512,7 +519,7 @@ class TestJob(DownloadJob):
if to_list:
self.list_keyword.append(kwdict.copy())
self.hash_keyword.update(
json.dumps(kwdict, sort_keys=True).encode())
json.dumps(kwdict, sort_keys=True, default=str).encode())
def update_archive(self, kwdict):
"""Update the archive-id hash"""
@ -555,7 +562,7 @@ class DataJob(Job):
# dump to 'file'
json.dump(
self.data, self.file,
sort_keys=True, indent=2, ensure_ascii=self.ascii,
sort_keys=True, indent=2, ensure_ascii=self.ascii, default=str,
)
self.file.write("\n")

Loading…
Cancel
Save