provide type information for Queue messages

Child extractors are now directly constructed with Extractor.from_url() if the extractor class is known beforehand, instead of using extractor.find() and searching through all possible extractor classes.
6 years ago · 61741d7333
parent 2e516a1e3e
commit 61741d7333
16 changed files with 53 additions and 25 deletions
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -21,6 +21,7 @@ class BehanceExtractor(Extractor):
    def items(self):
        yield Message.Version, 1
        for gallery in self.galleries():
+            gallery["_extractor"] = BehanceGalleryExtractor
            yield Message.Queue, gallery["url"], self._update(gallery)

    def galleries(self):
--- a/gallery_dl/extractor/bobx.py
+++ b/gallery_dl/extractor/bobx.py
@@ -99,6 +99,7 @@ class BobxIdolExtractor(BobxExtractor):

    def items(self):
        url = "{}/{}/".format(self.root, self.path)
+        data = {"_extractor": BobxGalleryExtractor}
        page = self.request(url).text
        skip = True

@@ -108,4 +109,4 @@ class BobxIdolExtractor(BobxExtractor):
            skip = not skip
            if skip:
                continue
-            yield Message.Queue, "{}photoset/{}".format(url, part), {}
+            yield Message.Queue, "{}photoset/{}".format(url, part), data
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -377,10 +377,11 @@ class DeviantartStashExtractor(DeviantartExtractor):
        if deviation_id:
            yield self.api.deviation(deviation_id)
        else:
+            data = {"_extractor": DeviantartStashExtractor}
            page = text.extract(
                page, '<div id="stash-body"', '<div class="footer"')[0]
            for url in text.extract_iter(page, '<a href="', '"'):
-                yield url, {}
+                yield url, data


 class DeviantartFavoriteExtractor(DeviantartExtractor):
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -338,7 +338,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):

    def __init__(self, match):
        ExhentaiExtractor.__init__(self, match)
-        self.params = text.parse_query(match.group(1) or "")
+        self.params = text.parse_query(match.group(2) or "")
        self.params["page"] = text.parse_int(self.params.get("page"))
        self.search_url = self.root

@@ -376,6 +376,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
            "gallery_id": text.parse_int(parts[1]),
            "gallery_token": parts[2],
            "title": text.unescape(title),
+            "_extractor": ExhentaiGalleryExtractor,
            key: last,
        }

--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -119,7 +119,7 @@ class FlickrAlbumExtractor(FlickrExtractor):
        }),
        ("https://www.flickr.com/photos/shona_s/albums", {
            "url": "657d541470482e0d69deec33ab97a6d7d4af6fe4",
-            "keyword": "736a41a7d702f7fe00edc957ae201d84f745e654",
+            "keyword": "ef654bfbc4ce7b74ad74e7d772e5466285ffc581",
        }),
    )

@@ -135,6 +135,7 @@ class FlickrAlbumExtractor(FlickrExtractor):
    def _album_items(self):
        yield Message.Version, 1
        data = FlickrExtractor.data(self)
+        data["_extractor"] = FlickrAlbumExtractor

        for albums in self.api.photosets_getList(self.user["nsid"]):
            for album in albums["photoset"]:
--- a/gallery_dl/extractor/hentaifox.py
+++ b/gallery_dl/extractor/hentaifox.py
@@ -112,6 +112,7 @@ class HentaifoxSearchExtractor(Extractor):
                    "thumbnail": text.urljoin(self.root, thumb),
                    "title": text.unescape(title),
                    "tags": tags.split(),
+                    "_extractor": HentaifoxGalleryExtractor,
                }

            pos = page.find('class="current"', gpos)
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -154,7 +154,11 @@ class ImagefapUserExtractor(ImagefapExtractor):
        yield Message.Version, 1
        for gid, name in self.get_gallery_data():
            url = "{}/gallery/{}".format(self.root, gid)
-            data = {"gallery_id": text.parse_int(gid), "title": name}
+            data = {
+                "gallery_id": text.parse_int(gid),
+                "title": text.unescape(name),
+                "_extractor": ImagefapGalleryExtractor,
+            }
            yield Message.Queue, url, data

    def get_gallery_data(self):
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -199,4 +199,5 @@ class LusciousSearchExtractor(LusciousExtractor):
            "count": text.parse_int(count),
            "date": date,
            "tags": text.remove_html(tags.partition(">")[2]),
+            "_extractor": LusciousAlbumExtractor,
        }
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -173,6 +173,7 @@ class MangadexMangaExtractor(MangadexExtractor):
                "date": info["timestamp"],
                "lang": lang,
                "language": util.code_to_language(lang),
+                "_extractor": MangadexChapterExtractor,
            })

        results.sort(key=lambda x: (x["chapter"], x["chapter_minor"]))
--- a/gallery_dl/extractor/myportfolio.py
+++ b/gallery_dl/extractor/myportfolio.py
@@ -106,8 +106,9 @@ class MyportfolioUserExtractor(Extractor):
        url = "https://" + self.domain
        page = self.request(url).text
        main = text.extract(page, "<main>", "</main>")[0]
+        data = {"_extractor": MyportfolioGalleryExtractor}

        yield Message.Version, 1
        for path in text.extract_iter(main, ' href="', '"'):
            if path and path[0] == "/":
-                yield Message.Queue, self.prefix + url + path, {}
+                yield Message.Queue, self.prefix + url + path, data
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -91,9 +91,10 @@ class NhentaiSearchExtractor(NHentaiExtractor):

    def items(self):
        yield Message.Version, 1
+        data = {"_extractor": NhentaiGalleryExtractor}
        for gid in self._pagination(self.params):
            url = "{}/g/{}/".format(self.root, gid)
-            yield Message.Queue, url, {}
+            yield Message.Queue, url, data

    def _pagination(self, params):
        url = "{}/search/".format(self.root)
--- a/gallery_dl/extractor/photobucket.py
+++ b/gallery_dl/extractor/photobucket.py
@@ -64,6 +64,7 @@ class PhotobucketAlbumExtractor(Extractor):

        if self.config("subalbums", True):
            for album in self.subalbums():
+                album["_extractor"] = PhotobucketAlbumExtractor
                yield Message.Queue, album["url"], album

    def images(self):
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -135,7 +135,7 @@ class SmugmugPathExtractor(SmugmugExtractor):
            "pattern": "smugmug:album:ddvxpg$",
        }),
        ("https://acapella.smugmug.com/", {
-            "pattern": r"smugmug:album:\w+$",
+            "pattern": SmugmugAlbumExtractor.pattern,
            "url": "797eb1cbbf5ad8ecac8ee4eedc6466ed77a65d68",
        }),
        # gallery node without owner
@@ -178,11 +178,13 @@ class SmugmugPathExtractor(SmugmugExtractor):

            for node in nodes:
                album_id = node["Uris"]["Album"].rpartition("/")[2]
+                node["_extractor"] = SmugmugAlbumExtractor
                yield Message.Queue, "smugmug:album:" + album_id, node

        else:
            for album in self.api.user_albums(self.user):
                uri = "smugmug:album:" + album["AlbumKey"]
+                album["_extractor"] = SmugmugAlbumExtractor
                yield Message.Queue, uri, album

    def album_nodes(self, root):
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -162,6 +162,7 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor):
        yield Message.Version, 1
        for gallery in self.galleries():
            url = "{}/Book/Info/{}".format(self.root, gallery["Id"])
+            gallery["_extractor"] = TsuminoGalleryExtractor
            yield Message.Queue, url, gallery

    def galleries(self):
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -99,7 +99,7 @@ class XvideosUserExtractor(XvideosExtractor):
    test = (
        ("https://www.xvideos.com/profiles/pervertedcouple", {
            "url": "a413f3e60d6d3a2de79bd44fa3b7a9c03db4336e",
-            "keyword": "a796760d34732adc7ec52a8feb057515209a2ca6",
+            "keyword": "ef941489354fd8f4754c8a87cffd5e2429a6387c",
        }),
        ("https://www.xvideos.com/profiles/niwehrwhernvh", {
            "exception": exception.NotFoundError,
@@ -123,9 +123,12 @@ class XvideosUserExtractor(XvideosExtractor):
            del data["galleries"]["0"]

        galleries = [
-            {"gallery_id": text.parse_int(gid),
-             "title": text.unescape(gdata["title"]),
-             "count": gdata["nb_pics"]}
+            {
+                "gallery_id": text.parse_int(gid),
+                "title": text.unescape(gdata["title"]),
+                "count": gdata["nb_pics"],
+                "_extractor": XvideosGalleryExtractor,
+            }
            for gid, gdata in data["galleries"].items()
        ]
        galleries.sort(key=lambda x: x["gallery_id"])
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -20,13 +20,14 @@ class Job():
    """Base class for Job-types"""
    ulog = None

-    def __init__(self, url, parent=None):
-        self.url = url
-        self.extractor = extractor.find(url)
-        if self.extractor is None:
-            raise exception.NoExtractorError(url)
+    def __init__(self, extr, parent=None):
+        if isinstance(extr, str):
+            extr = extractor.find(extr)
+        if not extr:
+            raise exception.NoExtractorError()
+        self.extractor = extr
        self.extractor.log.debug(
-            "Using %s for '%s'", self.extractor.__class__.__name__, url)
+            "Using %s for '%s'", extr.__class__.__name__, extr.url)

        # url predicates
        self.pred_url = self._prepare_predicates(
@@ -56,10 +57,10 @@ class Job():
            log.error("Authentication failed: %s", msg)
        except exception.AuthorizationError:
            log.error("You do not have permission to access the resource "
-                      "at '%s'", self.url)
+                      "at '%s'", self.extractor.url)
        except exception.NotFoundError as exc:
            res = str(exc) or "resource (gallery/image/user)"
-            log.error("The %s at '%s' does not exist", res, self.url)
+            log.error("The %s at '%s' does not exist", res, self.extractor.url)
        except exception.HttpError as exc:
            err = exc.args[0]
            if isinstance(err, Exception):
@@ -243,9 +244,13 @@ class DownloadJob(Job):
            self.pathfmt.set_directory(keywords)

    def handle_queue(self, url, keywords):
-        try:
-            self.__class__(url, self).run()
-        except exception.NoExtractorError:
+        if "_extractor" in keywords:
+            extr = keywords["_extractor"].from_url(url)
+        else:
+            extr = extractor.find(url)
+        if extr:
+            self.__class__(extr, self).run()
+        else:
            self._write_unsupported(url)

    def handle_finalize(self):
@@ -389,6 +394,8 @@ class KeywordJob(Job):
        """Print key-value pairs with formatting"""
        suffix = "]" if prefix else ""
        for key, value in sorted(keywords.items()):
+            if key[0] == "_":
+                continue
            key = prefix + key + suffix

            if isinstance(value, dict):
@@ -512,7 +519,7 @@ class TestJob(DownloadJob):
        if to_list:
            self.list_keyword.append(kwdict.copy())
        self.hash_keyword.update(
-            json.dumps(kwdict, sort_keys=True).encode())
+            json.dumps(kwdict, sort_keys=True, default=str).encode())

    def update_archive(self, kwdict):
        """Update the archive-id hash"""
@@ -555,7 +562,7 @@ class DataJob(Job):
        # dump to 'file'
        json.dump(
            self.data, self.file,
-            sort_keys=True, indent=2, ensure_ascii=self.ascii,
+            sort_keys=True, indent=2, ensure_ascii=self.ascii, default=str,
        )
        self.file.write("\n")