From a2c74bc6f0ca2cde86dd986efbcd9d361c5a7f79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 3 Jul 2018 20:54:37 +0200 Subject: [PATCH] [gelbooru] inherit from BooruExtractor class Breaks pool functionality when using API calls (for now), but reduces code clutter and enables the `tags` option. --- gallery_dl/extractor/booru.py | 4 +- gallery_dl/extractor/gelbooru.py | 114 ++++++++----------------------- 2 files changed, 30 insertions(+), 88 deletions(-) diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index f702b8d7..eba75452 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -129,8 +129,10 @@ class MoebooruPageMixin(): class GelbooruPageMixin(): """Pagination for Gelbooru-like sites""" + page_start = 0 + def reset_page(self): - self.params["pid"] = self.page_start - 1 + self.params["pid"] = self.page_start def update_page(self, data): self.params["pid"] += 1 diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 110160a6..99ae4162 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -8,55 +8,46 @@ """Extract images from https://gelbooru.com/""" -from .common import SharedConfigExtractor, Message -from .. import text, util, exception -import xml.etree.ElementTree as ET +from . import booru +from .common import Message +from .. import text, exception -class GelbooruExtractor(SharedConfigExtractor): +class GelbooruExtractor(booru.XmlParserMixin, + booru.GelbooruPageMixin, + booru.BooruExtractor): """Base class for gelbooru extractors""" - basecategory = "booru" category = "gelbooru" - filename_fmt = "{category}_{id}_{md5}.{extension}" - api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index" + api_url = "https://gelbooru.com/index.php" + post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}" + + def __init__(self, match): + super().__init__(match) - def __init__(self): - SharedConfigExtractor.__init__(self) - self.start_post = 0 self.use_api = self.config("api", True) if self.use_api: - self.get_post_data = self.get_post_data_api + self.params.update({"page": "dapi", "s": "post", "q": "index"}) + else: + self.items = self.items_noapi - def items(self): + def items_noapi(self): data = self.get_metadata() yield Message.Version, 1 yield Message.Directory, data - for post in util.advance(self.get_posts(), self.start_post): - if isinstance(post, str): - post = self.get_post_data(post) - for key in ("id", "width", "height", "score", "change"): - post[key] = text.parse_int(post[key]) + for post in self.get_posts(): + post = self.get_post_data(post) url = post["file_url"] post.update(data) yield Message.Url, url, text.nameext_from_url(url, post) - def skip(self, num): - self.start_post += num - return num - - def get_metadata(self): - """Return general metadata""" - return {} - def get_posts(self): """Return an iterable containing all relevant post objects""" def get_post_data(self, post_id): """Extract metadata of a single post""" - page = self.request("https://gelbooru.com/index.php?page=post&s=view" - "&id=" + post_id).text + page = self.request(self.post_url.format(post_id)).text data = text.extract_all(page, ( (None , '[^&#]+)"] test = [ ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", { "count": 5, @@ -102,42 +85,13 @@ class GelbooruTagExtractor(GelbooruExtractor): ] def __init__(self, match): - GelbooruExtractor.__init__(self) - self.tags = text.unquote(match.group(1).replace("+", " ")) - self.per_page = 100 if self.use_api else 42 - self.start_page = 0 - - def skip(self, num): - pages, posts = divmod(num, self.per_page) - self.start_page += pages - self.start_post += posts - return num - - def get_metadata(self): - return {"search_tags": self.tags} + super().__init__(match) + if not self.use_api: + self.per_page = 42 def get_posts(self): - if self.use_api: - return self._get_posts_api() - return self._get_posts_manual() - - def _get_posts_api(self): - params = { - # 'pid' is page-id; first page has index 0 - "tags": self.tags, "limit": self.per_page, "pid": self.start_page} - while True: - root = ET.fromstring( - self.request(self.api_url, params=params).text) - for item in root: - yield item.attrib - if len(root) < self.per_page: - return - params["pid"] += 1 - - def _get_posts_manual(self): url = "https://gelbooru.com/index.php?page=post&s=list" - # 'pid' is post-id; values for 'pid' must be multiples of 42 - params = {"tags": self.tags, "pid": self.start_page * self.per_page} + params = {"tags": self.tags, "pid": self.page_start * self.per_page} while True: page = self.request(url, params=params).text @@ -150,20 +104,12 @@ class GelbooruTagExtractor(GelbooruExtractor): class GelbooruPoolExtractor(GelbooruExtractor): """Extractor for image-pools from gelbooru.com""" - subcategory = "pool" - directory_fmt = ["{category}", "pool", "{pool}"] - archive_fmt = "p_{pool}_{id}" pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=pool&s=show&id=(\d+)"] test = [("https://gelbooru.com/index.php?page=pool&s=show&id=761", { "count": 6, })] - def __init__(self, match): - GelbooruExtractor.__init__(self) - self.pool_id = match.group(1) - self.posts = None - def get_metadata(self): page = self.request("https://gelbooru.com/index.php?page=pool&s=show" "&id=" + self.pool_id).text @@ -183,20 +129,14 @@ class GelbooruPoolExtractor(GelbooruExtractor): return self.posts -class GelbooruPostExtractor(GelbooruExtractor): +class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor): """Extractor for single images from gelbooru.com""" - subcategory = "post" - archive_fmt = "{id}" pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" - r"\?page=post&s=view&id=(\d+)"] + r"\?page=post&s=view&id=(?P\d+)"] test = [("https://gelbooru.com/index.php?page=post&s=view&id=313638", { "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", "count": 1, })] - def __init__(self, match): - GelbooruExtractor.__init__(self) - self.post_id = match.group(1) - def get_posts(self): - return (self.post_id,) + return (self.post,)