diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index f7ac1f0c..36b5dfab 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -35,6 +35,7 @@ modules = [
     "furaffinity",
     "fuskator",
     "gelbooru",
+    "gelbooru_v01",
     "gelbooru_v02",
     "gfycat",
     "hbrowse",
diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py
new file mode 100644
index 00000000..09359987
--- /dev/null
+++ b/gallery_dl/extractor/gelbooru_v01.py
@@ -0,0 +1,143 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Gelbooru v0.1 sites"""
+
+from . import booru
+from .. import text
+
+
+class GelbooruV01Extractor(booru.BooruExtractor):
+    """Base class for the Gelbooru v0.1 extractors in this module"""
+    basecategory = "gelbooru_v01"
+    # NOTE(review): presumably the number of posts per result page;
+    # confirm against the pagination code that consumes it
+    per_page = 20
+
+    def _parse_post(self, post_id):
+        """Fetch the view page of post 'post_id' and return its metadata
+
+        The page at '<root>/index.php?page=post&s=view&id=<post_id>' is
+        requested and scanned with text.extract_all(); the first element
+        of its result is used as the metadata dict. 'id', 'md5' and
+        'date' are then added to it, and 'rating' and 'tags' are
+        normalized in place.
+
+        NOTE(review): only 'created_at', 'uploader', 'width', 'height'
+        and 'source' are listed as fields here, yet 'file_url', 'rating'
+        and 'tags' are read below. The marker strings may have lost
+        their HTML parts; verify against the upstream file.
+        """
+        url = "{}/index.php?page=post&s=view&id={}".format(
+            self.root, post_id)
+        page = self.request(url).text
+
+        post = text.extract_all(page, (
+            ("created_at", 'Posted: ', ' <'),
+            ("uploader" , 'By: ', ' <'),
+            ("width" , 'Size: ', 'x'),
+            ("height" , '', ' <'),
+            ("source" , 'Source: ', '<'),
+        ))[0]
+
+        post["id"] = post_id
+        # last path component of the file URL, up to its first "."
+        # (assumes files are named after their MD5 hash - TODO confirm)
+        post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
+        # first character, lowercased; "?" when the rating is empty/None
+        post["rating"] = (post["rating"] or "?")[0].lower()
+        post["tags"] = text.unescape(post["tags"])
+        post["date"] = text.parse_datetime(
+            post["created_at"], "%Y-%m-%d %H:%M:%S")
+
+        return post
+
+
+# Site table passed to update(): one entry per supported site, each with
+# its root URL. The return value is used below as the prefix of the URL
+# pattern.
+# NOTE(review): update() is defined outside this file - confirm what it
+# registers and what pattern it returns
+BASE_PATTERN = GelbooruV01Extractor.update({
+    "thecollection" : {"root": "https://the-collection.booru.org"},
+    "illusioncardsbooru": {"root": "https://illusioncards.booru.org"},
+    "allgirlbooru" : {"root": "https://allgirl.booru.org"},
+})
+
+
+class GelbooruV01TagExtractor(GelbooruV01Extractor):
+    """Extractor for the posts of a tag search"""
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    archive_fmt = "t_{search_tags}_{id}"
+    pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
+    test = (
+        (("https://the-collection.booru.org"
+          "/index.php?page=post&s=list&tags=parody"), {
+            "range": "1-25",
+            "count": 25,
+        }),
+        (("https://illusioncards.booru.org"
+          "/index.php?page=post&s=list&tags=koikatsu"), {
+            "range": "1-25",
+            "count": 25,
+        }),
+        ("https://allgirl.booru.org/index.php?page=post&s=list&tags=dress", {
+            "range": "1-25",
+            "count": 25,
+        }),
+    )
+
+    def __init__(self, match):
+        """Store the tag string captured by the last group of 'match'"""
+        GelbooruV01Extractor.__init__(self, match)
+        # last group = the '([^&#]+)' tags group appended to BASE_PATTERN
+        self.tags = match.group(match.lastindex)
+
+    def metadata(self):
+        """Return 'search_tags': the tags with "+" replaced by spaces,
+        then passed through text.unquote()"""
+        return {"search_tags": text.unquote(self.tags.replace("+", " "))}
+
+    def posts(self):
+        url = "{}/index.php?page=post&s=list&tags={}&pid=".format(
+            self.root, self.tags)
+        pid = self.page_start
+
+        while True:
+            page = self.request(url + str(pid)).text
+
+            cnt = 0
+            for post_id in text.extract_iter(
+                    page, 'class="thumb">