generalize extractor creation code

pull/187/head
Mike Fährmann 6 years ago
parent 8dc6be246b
commit 09d872a2b1
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -337,6 +337,48 @@ class SharedConfigMixin():
return value
def generate_extractors(extractor_data, symtable, classes):
    """Dynamically generate Extractor classes.

    'extractor_data' maps category names to site-info dicts containing
    "root" (required), and optionally "pattern", "name", "extra", and
    per-subcategory "test-<subcategory>" entries.  For every category,
    one subclass of each base class in 'classes' is created and
    registered in 'symtable' (usually the calling module's globals()).
    User-supplied site definitions from the
    ("extractor", <basecategory>) config path are merged in first and
    override the built-in entries.
    """
    extractors = config.get(("extractor", classes[0].basecategory))
    # "_ckey" names the attribute used to link the generated classes of
    # one category to each other (e.g. "chapterclass")
    ckey = extractor_data.get("_ckey")

    if extractors:
        extractor_data.update(extractors)

    for category, info in extractor_data.items():

        # skip non-dict meta entries such as "_ckey" itself
        if not isinstance(info, dict):
            continue

        root = info["root"]
        # drop the URL scheme ("https://" etc.) to get the bare domain
        domain = root[root.index(":") + 3:]
        pattern = info.get("pattern") or re.escape(domain)
        name = (info.get("name") or category).capitalize()

        # reset per category so a class generated for the previous
        # category is never linked into this one through 'ckey'
        prev = None

        for cls in classes:

            class Extr(cls):
                pass
            Extr.__module__ = cls.__module__
            Extr.__name__ = Extr.__qualname__ = \
                name + cls.subcategory.capitalize() + "Extractor"
            Extr.__doc__ = \
                "Extractor for " + cls.subcategory + "s from " + domain
            Extr.category = category
            Extr.pattern = r"(?:https?://)?" + pattern + cls.pattern_fmt
            Extr.test = info.get("test-" + cls.subcategory)
            Extr.root = root

            # per-site attribute overrides, e.g. {"decode": "base64"}
            if "extra" in info:
                for key, value in info["extra"].items():
                    setattr(Extr, key, value)

            # link the class generated just before this one
            # (e.g. MangaExtractor.chapterclass -> ChapterExtractor)
            if prev and ckey:
                setattr(Extr, ckey, prev)

            symtable[Extr.__name__] = prev = Extr
# Reduce strictness of the expected magic string in cookiejar files.
# (This allows the use of Wget-generated cookiejars without modification)

@ -8,11 +8,10 @@
"""Extractors for 4chan archives based on FoolFuuka"""
from .common import Extractor, Message, SharedConfigMixin
from .. import text, config
from .common import Extractor, Message, SharedConfigMixin, generate_extractors
from .. import text
import itertools
import operator
import re
class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
@ -23,12 +22,16 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
"{thread_num}{title:? - //}")
filename_fmt = "{media[media]}"
archive_fmt = "{board[shortname]}_{num}_{timestamp}"
pattern_fmt = r"/([^/]+)/thread/(\d+)"
resolve = "default"
root = ""
def __init__(self, match):
Extractor.__init__(self, match)
self.board, self.thread = match.groups()
self.session.headers["Referer"] = self.root
if self.resolve == "direct":
self.remote = self._remote_direct
def items(self):
op = True
@ -52,6 +55,7 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
yield Message.Url, url, post
def posts(self):
"""Return an iterable with all posts in this thread"""
url = self.root + "/_/api/chan/thread/"
params = {"board": self.board, "num": self.thread}
data = self.request(url, params=params).json()[self.thread]
@ -63,59 +67,28 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
return itertools.chain((data["op"],), posts)
def remote(self, media):
"""Resolve a remote media link"""
needle = '<meta http-equiv="Refresh" content="0; url='
page = self.request(media["remote_media_link"]).text
return text.extract(page, needle, '"')[0]
def _remote_simple(self, media):
@staticmethod
def _remote_direct(media):
return media["remote_media_link"]
def generate_extractors():
    """Dynamically generate Extractor classes for FoolFuuka instances"""
    module_globals = globals()

    user_defined = config.get(("extractor", "foolfuuka"))
    if user_defined:
        EXTRACTORS.update(user_defined)

    for category, info in EXTRACTORS.items():
        # only dict entries describe actual sites
        if not isinstance(info, dict):
            continue

        root = info["root"]
        domain = root[root.index(":") + 3:]
        url_pattern = info.get("pattern") or re.escape(domain)
        base_name = (info.get("name") or category).capitalize()

        class Extr(FoolfuukaThreadExtractor):
            pass

        Extr.__name__ = Extr.__qualname__ = base_name + "ThreadExtractor"
        Extr.__doc__ = "Extractor for threads on " + domain
        Extr.category = category
        Extr.pattern = (r"(?:https?://)?" + url_pattern +
                        r"/([^/]+)/thread/(\d+)")
        Extr.test = info.get("test")
        Extr.root = root

        # some archives link media directly instead of redirecting
        if info.get("remote") == "simple":
            Extr.remote = Extr._remote_simple

        module_globals[Extr.__name__] = Extr
EXTRACTORS = {
"4plebs": {
"name": "fourplebs",
"root": "https://archive.4plebs.org",
"pattern": r"(?:archive\.)?4plebs\.org",
"test": ("https://archive.4plebs.org/tg/thread/54059290", {
"test-thread": ("https://archive.4plebs.org/tg/thread/54059290", {
"url": "07452944164b602502b02b24521f8cee5c484d2a",
}),
},
"archivedmoe": {
"root": "https://archived.moe",
"test": (
"test-thread": (
("https://archived.moe/gd/thread/309639/", {
"url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
"content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
@ -128,41 +101,41 @@ EXTRACTORS = {
"archiveofsins": {
"root": "https://archiveofsins.com",
"pattern": r"(?:www\.)?archiveofsins\.com",
"test": ("https://archiveofsins.com/h/thread/4668813/", {
"test-thread": ("https://archiveofsins.com/h/thread/4668813/", {
"url": "f612d287087e10a228ef69517cf811539db9a102",
"content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
}),
},
"b4k": {
"root": "https://arch.b4k.co",
"remote": "simple",
"test": ("https://arch.b4k.co/meta/thread/196/", {
"extra": {"resolve": "direct"},
"test-thread": ("https://arch.b4k.co/meta/thread/196/", {
"url": "cdd4931ac1cd00264b0b54e2e3b0d8f6ae48957e",
}),
},
"desuarchive": {
"root": "https://desuarchive.org",
"test": ("https://desuarchive.org/a/thread/159542679/", {
"test-thread": ("https://desuarchive.org/a/thread/159542679/", {
"url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
}),
},
"fireden": {
"root": "https://boards.fireden.net",
"test": ("https://boards.fireden.net/a/thread/159803223/", {
"test-thread": ("https://boards.fireden.net/a/thread/159803223/", {
"url": "01b7baacfb0656a68e566368290e3072b27f86c9",
}),
},
"nyafuu": {
"root": "https://archive.nyafuu.org",
"pattern": r"(?:archive\.)?nyafuu\.org",
"test": ("https://archive.nyafuu.org/c/thread/2849220/", {
"test-thread": ("https://archive.nyafuu.org/c/thread/2849220/", {
"url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
}),
},
"rbt": {
"root": "https://rbt.asia",
"pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
"test": (
"test-thread": (
("https://rbt.asia/g/thread/61487650/", {
"url": "61896d9d9a2edb556b619000a308a984307b6d30",
}),
@ -174,11 +147,12 @@ EXTRACTORS = {
"thebarchive": {
"root": "https://thebarchive.com",
"pattern": r"thebarchive\.com",
"test": ("https://thebarchive.com/b/thread/739772332/", {
"test-thread": ("https://thebarchive.com/b/thread/739772332/", {
"url": "e8b18001307d130d67db31740ce57c8561b5d80c",
}),
},
}
generate_extractors()
generate_extractors(EXTRACTORS, globals(), (
FoolfuukaThreadExtractor,
))

@ -9,11 +9,16 @@
"""Extractors for FoOlSlide based sites"""
from .common import (
Extractor, ChapterExtractor, MangaExtractor, Message, SharedConfigMixin)
from .. import text, util, config
Extractor,
ChapterExtractor,
MangaExtractor,
SharedConfigMixin,
Message,
generate_extractors,
)
from .. import text, util
import base64
import json
import re
class FoolslideBase(SharedConfigMixin):
@ -41,6 +46,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
directory_fmt = (
"{category}", "{manga}", "{chapter_string}")
archive_fmt = "{id}"
pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
decode = "default"
def items(self):
@ -92,6 +98,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
"""Base class for manga extractors for FoOlSlide based sites"""
pattern_fmt = r"(/series/[^/?&#]+)"
def chapters(self, page):
manga , pos = text.extract(page, '<h1 class="title">', '</h1>')
@ -116,52 +123,6 @@ class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
})))
def generate_extractors():
    """Dynamically generate Extractor classes for FoOlSlide instances"""
    module_globals = globals()

    user_defined = config.get(("extractor", "foolslide"))
    if user_defined:
        EXTRACTORS.update(user_defined)

    for category, info in EXTRACTORS.items():
        # only dict entries describe actual sites
        if not isinstance(info, dict):
            continue

        root = info["root"]
        domain = root[root.index(":") + 3:]
        url_pattern = info.get("pattern") or re.escape(domain)
        base_name = (info.get("name") or category).capitalize()

        # chapter extractor for this site
        class ChExtr(FoolslideChapterExtractor):
            pass

        ChExtr.__name__ = ChExtr.__qualname__ = base_name + "ChapterExtractor"
        ChExtr.__doc__ = "Extractor for manga-chapters from " + domain
        ChExtr.category = category
        ChExtr.pattern = (r"(?:https?://)?" + url_pattern +
                          r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
        ChExtr.test = info.get("test-chapter")
        ChExtr.root = root
        # per-site page-data decoding override, e.g. "base64"
        if "decode" in info:
            ChExtr.decode = info["decode"]
        module_globals[ChExtr.__name__] = ChExtr

        # manga extractor for this site; linked to the chapter class
        class MaExtr(FoolslideMangaExtractor):
            pass

        MaExtr.__name__ = MaExtr.__qualname__ = base_name + "MangaExtractor"
        MaExtr.__doc__ = "Extractor for manga from " + domain
        MaExtr.category = category
        MaExtr.pattern = r"(?:https?://)?" + url_pattern + r"(/series/[^/?&#]+)"
        MaExtr.test = info.get("test-manga")
        MaExtr.root = root
        MaExtr.chapterclass = ChExtr
        module_globals[MaExtr.__name__] = MaExtr
EXTRACTORS = {
"dokireader": {
"root": "https://kobato.hologfx.com/reader",
@ -180,7 +141,7 @@ EXTRACTORS = {
"jaiminisbox": {
"root": "https://jaiminisbox.com/reader",
"pattern": r"(?:www\.)?jaiminisbox\.com/reader",
"decode": "base64",
"extra": {"decode": "base64"},
"test-chapter": (
("https://jaiminisbox.com/reader/read/uratarou/en/0/1/", {
"keyword": "6009af77cc9c05528ab1fdda47b1ad9d4811c673",
@ -290,7 +251,10 @@ EXTRACTORS = {
"keyword": "3a24f1088b4d7f3b798a96163f21ca251293a120",
}),
},
"_ckey": "chapterclass",
}
generate_extractors()
generate_extractors(EXTRACTORS, globals(), (
FoolslideChapterExtractor,
FoolslideMangaExtractor,
))

@ -8,8 +8,8 @@
"""Extractors for Shopify instances"""
from .common import Extractor, Message, SharedConfigMixin
from .. import text, config
from .common import Extractor, Message, SharedConfigMixin, generate_extractors
from .. import text
import time
import re
@ -63,13 +63,13 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
def products(self):
"""Return an iterable with all relevant product URLs"""
return ()
class ShopifyCollectionExtractor(ShopifyExtractor):
"""Base class for collection extractors for Shopify based sites"""
subcategory = "collection"
directory_fmt = ("{category}", "{collection[title]}")
pattern_fmt = r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)"
def __init__(self, match):
ShopifyExtractor.__init__(self, match)
@ -98,58 +98,23 @@ class ShopifyProductExtractor(ShopifyExtractor):
"""Base class for product extractors for Shopify based sites"""
subcategory = "product"
directory_fmt = ("{category}", "Products")
pattern_fmt = r"((?:/collections/[\w-]+)?/products/[\w-]+)"
def products(self):
return (self.item_url,)
def generate_extractors():
    """Dynamically generate Extractor classes for Shopify instances"""
    module_globals = globals()

    user_defined = config.get(("extractor", "shopify"))
    if user_defined:
        EXTRACTORS.update(user_defined)

    for category, info in EXTRACTORS.items():
        # only dict entries describe actual sites
        if not isinstance(info, dict):
            continue

        root = info["root"]
        domain = root[root.index(":") + 3:]
        url_pattern = info.get("pattern") or re.escape(domain)
        base_name = (info.get("name") or category).capitalize()

        # collection extractor for this site
        class CoExtr(ShopifyCollectionExtractor):
            pass

        CoExtr.__name__ = CoExtr.__qualname__ = \
            base_name + "CollectionExtractor"
        CoExtr.__doc__ = "Extractor for product collections from " + domain
        CoExtr.category = category
        CoExtr.pattern = (r"(?:https?://)?" + url_pattern +
                          r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)")
        CoExtr.test = info.get("test-collection")
        CoExtr.root = root
        module_globals[CoExtr.__name__] = CoExtr

        # single-product extractor for this site
        class PrExtr(ShopifyProductExtractor):
            pass

        PrExtr.__name__ = PrExtr.__qualname__ = base_name + "ProductExtractor"
        PrExtr.__doc__ = "Extractor for individual products from " + domain
        PrExtr.category = category
        PrExtr.pattern = (r"(?:https?://)?" + url_pattern +
                          r"((?:/collections/[\w-]+)?/products/[\w-]+)")
        PrExtr.test = info.get("test-product")
        PrExtr.root = root
        module_globals[PrExtr.__name__] = PrExtr
EXTRACTORS = {
"fashionnova": {
"root": "https://www.fashionnova.com",
"pattern": r"(?:www\.)?fashionnova\.com",
"test-product": (
("https://www.fashionnova.com/products/essential-slide-red", {
"pattern": r"https?://cdn\.shopify.com/",
"count": 3,
}),
("https://www.fashionnova.com/collections/flats/products/name"),
),
"test-collection": (
("https://www.fashionnova.com/collections/mini-dresses", {
"range": "1-20",
@ -158,13 +123,11 @@ EXTRACTORS = {
("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
("https://www.fashionnova.com/collections/mini-dresses#1"),
),
"test-product": (
("https://www.fashionnova.com/products"
"/only-here-tonight-cut-out-dress-black"),
("https://www.fashionnova.com/collections/mini-dresses/products"
"/only-here-tonight-cut-out-dress-black"),
)
},
}
generate_extractors()
generate_extractors(EXTRACTORS, globals(), (
ShopifyProductExtractor,
ShopifyCollectionExtractor,
))

Loading…
Cancel
Save