From 34d13bc9061558a4d1688de3be471249dc70a6d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Tue, 30 Dec 2014 21:34:55 +0100
Subject: [PATCH] added extractor 'danbooru' + split BooruExtractor to handle XML and JSON

---
Reviewer notes (commentary area after '---'; not part of the commit message):
  * danbooru.py becomes the home of the shared BooruExtractor base class and
    of its two response-format variants, JSONBooruExtractor (json.loads) and
    XMLBooruExtractor (xml.etree.ElementTree).
  * e621 now derives from JSONBooruExtractor and its api_url changes from
    post/index.xml to post/index.json.
  * gelbooru now derives from XMLBooruExtractor; the base-class code it used
    to define itself is removed from gelbooru.py.
  * Line breaks and indentation in this patch were reconstructed so that they
    agree with the hunk headers and the diffstat; run `git apply --check`
    before relying on it.

 gallery_dl/extractor/danbooru.py | 62 ++++++++++++++++++++++++++++++++
 gallery_dl/extractor/e621.py     |  8 ++---
 gallery_dl/extractor/gelbooru.py | 41 ++-------------------
 3 files changed, 69 insertions(+), 42 deletions(-)
 create mode 100644 gallery_dl/extractor/danbooru.py

diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
new file mode 100644
index 00000000..9579dea2
--- /dev/null
+++ b/gallery_dl/extractor/danbooru.py
@@ -0,0 +1,62 @@
+from .common import AsyncExtractor
+from ..util import filename_from_url
+import xml.etree.ElementTree as ET
+import json
+import urllib.parse
+
+class BooruExtractor(AsyncExtractor):
+
+    def __init__(self, match, config):
+        AsyncExtractor.__init__(self, config)
+        self.tags = urllib.parse.unquote(match.group(1))
+        self.category = "booru"
+        self.params = {"tags": self.tags}
+        self.page = "page"
+        self.directory = self.tags.replace("/", "_")
+
+    def update_page(self, reset=False):
+        # Override this method in derived classes if necessary.
+        # It is usually enough to adjust the 'page' attribute
+        if reset is False:
+            self.params[self.page] += 1
+        else:
+            self.params[self.page] = 1
+
+class JSONBooruExtractor(BooruExtractor):
+
+    def images(self):
+        self.update_page(reset=True)
+        while True:
+            images = json.loads(
+                self.request(self.api_url, verify=True, params=self.params).text
+            )
+            if len(images) == 0:
+                return
+            for img in images:
+                url = urllib.parse.urljoin(self.api_url, img["file_url"])
+                name = "{}_{}".format(self.category, filename_from_url(url))
+                yield url, name
+            self.update_page()
+
+class XMLBooruExtractor(BooruExtractor):
+
+    def images(self):
+        self.update_page(reset=True)
+        while True:
+            root = ET.fromstring(
+                self.request(self.api_url, verify=True, params=self.params).text
+            )
+            if len(root) == 0:
+                return
+            for item in root:
+                url = item.attrib["file_url"]
+                name = "{}_{}".format(self.category, filename_from_url(url))
+                yield url, name
+            self.update_page()
+
+class Extractor(JSONBooruExtractor):
+
+    def __init__(self, match, config):
+        JSONBooruExtractor.__init__(self, match, config)
+        self.category = "danbooru"
+        self.api_url = "https://danbooru.donmai.us/posts.json"
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index 7128d86a..81e2a2c6 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -1,8 +1,8 @@
-from .gelbooru import BooruExtractor
+from .danbooru import JSONBooruExtractor
 
-class Extractor(BooruExtractor):
+class Extractor(JSONBooruExtractor):
 
     def __init__(self, match, config):
-        BooruExtractor.__init__(self, match, config)
+        JSONBooruExtractor.__init__(self, match, config)
         self.category = "e621"
-        self.api_url = "https://e621.net/post/index.xml"
+        self.api_url = "https://e621.net/post/index.json"
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 5295f1d5..43a9ffcd 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -1,44 +1,9 @@
-from .common import AsyncExtractor
-from ..util import filename_from_url
-import xml.etree.ElementTree as ET
-import urllib.parse
+from .danbooru import XMLBooruExtractor
 
-class BooruExtractor(AsyncExtractor):
+class Extractor(XMLBooruExtractor):
 
     def __init__(self, match, config):
-        AsyncExtractor.__init__(self, config)
-        self.tags = urllib.parse.unquote(match.group(1))
-        self.category = "booru"
-        self.params = {"tags": self.tags}
-        self.page = "page"
-        self.directory = self.tags.replace("/", "_")
-
-    def images(self):
-        self.update_page(reset=True)
-        while True:
-            root = ET.fromstring(
-                self.request(self.api_url, verify=True, params=self.params).text
-            )
-            if len(root) == 0:
-                return
-            for item in root:
-                url = item.attrib["file_url"]
-                name = "{}_{}".format(self.category, filename_from_url(url))
-                yield url, name
-            self.update_page()
-
-    def update_page(self, reset=False):
-        # Override this method in derived classes if necessary.
-        # It is usually enough to adjust the 'page' attribute
-        if reset is False:
-            self.params[self.page] += 1
-        else:
-            self.params[self.page] = 1
-
-class Extractor(BooruExtractor):
-
-    def __init__(self, match, config):
-        BooruExtractor.__init__(self, match, config)
+        XMLBooruExtractor.__init__(self, match, config)
         self.category = "gelbooru"
         self.api_url = "http://gelbooru.com/"
         self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}