Added extractor 'danbooru' and split BooruExtractor into XML and JSON variants

10 years ago · 34d13bc906
parent 2a7dbd8868
commit 34d13bc906
3 changed files with 69 additions and 42 deletions
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -0,0 +1,62 @@
+from .common import AsyncExtractor
+from ..util import filename_from_url
+import xml.etree.ElementTree as ET
+import json
+import urllib.parse
+
+class BooruExtractor(AsyncExtractor):
+
+    def __init__(self, match, config):
+        AsyncExtractor.__init__(self, config)
+        self.tags      = urllib.parse.unquote(match.group(1))
+        self.category  = "booru"
+        self.params    = {"tags": self.tags}
+        self.page      = "page"
+        self.directory = self.tags.replace("/", "_")
+
+    def update_page(self, reset=False):
+        # Override this method in derived classes if necessary.
+        # It is usually enough to adjust the 'page' attribute
+        if reset is False:
+            self.params[self.page] += 1
+        else:
+            self.params[self.page]  = 1
+
+class JSONBooruExtractor(BooruExtractor):
+
+    def images(self):
+        self.update_page(reset=True)
+        while True:
+            images = json.loads(
+                self.request(self.api_url, verify=True, params=self.params).text
+            )
+            if len(images) == 0:
+                return
+            for img in images:
+                url  = urllib.parse.urljoin(self.api_url, img["file_url"])
+                name = "{}_{}".format(self.category, filename_from_url(url))
+                yield url, name
+            self.update_page()
+
+class XMLBooruExtractor(BooruExtractor):
+
+    def images(self):
+        self.update_page(reset=True)
+        while True:
+            root = ET.fromstring(
+                self.request(self.api_url, verify=True, params=self.params).text
+            )
+            if len(root) == 0:
+                return
+            for item in root:
+                url  = item.attrib["file_url"]
+                name = "{}_{}".format(self.category, filename_from_url(url))
+                yield url, name
+            self.update_page()
+
+class Extractor(JSONBooruExtractor):
+
+    def __init__(self, match, config):
+        JSONBooruExtractor.__init__(self, match, config)
+        self.category = "danbooru"
+        self.api_url  = "https://danbooru.donmai.us/posts.json"
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -1,8 +1,8 @@
-from .gelbooru import BooruExtractor
+from .danbooru import JSONBooruExtractor

-class Extractor(BooruExtractor):
+class Extractor(JSONBooruExtractor):

    def __init__(self, match, config):
-        BooruExtractor.__init__(self, match, config)
+        JSONBooruExtractor.__init__(self, match, config)
        self.category = "e621"
-        self.api_url  = "https://e621.net/post/index.xml"
+        self.api_url  = "https://e621.net/post/index.json"
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -1,44 +1,9 @@
-from .common import AsyncExtractor
-from ..util import filename_from_url
-import xml.etree.ElementTree as ET
-import urllib.parse
+from .danbooru import XMLBooruExtractor

-class BooruExtractor(AsyncExtractor):
+class Extractor(XMLBooruExtractor):

    def __init__(self, match, config):
-        AsyncExtractor.__init__(self, config)
-        self.tags      = urllib.parse.unquote(match.group(1))
-        self.category  = "booru"
-        self.params    = {"tags": self.tags}
-        self.page      = "page"
-        self.directory = self.tags.replace("/", "_")
-
-    def images(self):
-        self.update_page(reset=True)
-        while True:
-            root = ET.fromstring(
-                self.request(self.api_url, verify=True, params=self.params).text
-            )
-            if len(root) == 0:
-                return
-            for item in root:
-                url  = item.attrib["file_url"]
-                name = "{}_{}".format(self.category, filename_from_url(url))
-                yield url, name
-            self.update_page()
-
-    def update_page(self, reset=False):
-        # Override this method in derived classes if necessary.
-        # It is usually enough to adjust the 'page' attribute
-        if reset is False:
-            self.params[self.page] += 1
-        else:
-            self.params[self.page]  = 1
-
-class Extractor(BooruExtractor):
-
-    def __init__(self, match, config):
-        BooruExtractor.__init__(self, match, config)
+        XMLBooruExtractor.__init__(self, match, config)
        self.category = "gelbooru"
        self.api_url  = "http://gelbooru.com/"
        self.params   = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}