From a9119da4d4ab3e611b6b67ae68c0a173cd8ed97f Mon Sep 17 00:00:00 2001 From: topozorra <78620271+topozorra@users.noreply.github.com> Date: Wed, 3 Mar 2021 09:20:47 -0500 Subject: [PATCH] support `tumblrgallery.xyz` (#1298) * support `tumblrgallery.xyz` * fix format issues * Refactor and add post and search page support * Fix warnings * Few improvments * Better file names * Fix linting errors * move id closer to the begining of the file name Co-authored-by: topozorra --- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/tumblrgallery.py | 178 ++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + 3 files changed, 180 insertions(+) create mode 100644 gallery_dl/extractor/tumblrgallery.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 62233940..a70fbc92 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -113,6 +113,7 @@ modules = [ "subscribestar", "tsumino", "tumblr", + "tumblrgallery", "twitter", "unsplash", "vanillarock", diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py new file mode 100644 index 00000000..c9ef16a7 --- /dev/null +++ b/gallery_dl/extractor/tumblrgallery.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://tumblrgallery.xyz/""" + +from .common import GalleryExtractor +from .. import text + + +BASE_PATTERN = r"(?:https?://)tumblrgallery\.xyz" + + +class TumblrgalleryGalleryExtractor(GalleryExtractor): + """Base class for tumblrgallery extractors""" + category = "tumblrgallery" + cookiedomain = None + + def __init__(self, match): + self.root = "https://tumblrgallery.xyz" + GalleryExtractor.__init__(self, match) + + +class TumblrgalleryTumblrblogExtractor(TumblrgalleryGalleryExtractor): + """Extractor for Tumblrblog on tumblrgallery.xyz""" + subcategory = "tumblrblog" + pattern = BASE_PATTERN + r"(/tumblrblog/gallery/(\d+).html)" + test = ( + "https://tumblrgallery.xyz/tumblrblog/gallery/103975.html", { + "pattern": r"/tumblrblog/gallery/103975.html" + r"103975", + } + ) + + filename_fmt = "{category}_{gallery_id}_{num:>03}_{id}.{extension}" + directory_fmt = ("{category}", "{gallery_id} {title}") + + def __init__(self, match): + TumblrgalleryGalleryExtractor.__init__(self, match) + self.gallery_id = text.parse_int(match.group(2)) + + def metadata(self, page): + """Collect metadata for extractor-job""" + return { + "title" : text.unescape(text.extract(page, "

", "

"))[0], + "gallery_id": self.gallery_id, + } + + def images(self, _): + page_num = 1 + while True: + response = self.request( + "{}/tumblrblog/gallery/{}/{}.html" + .format(self.root, self.gallery_id, page_num), + allow_redirects=False + ) + if response.status_code != 200: + return + + page = response.text + page_num += 1 + + urls = list(text.extract_iter( + page, + '
", "")[0]) + ).replace("_", "-"), + "gallery_id": self.gallery_id, + } + + def images(self, page): + urls = list(text.extract_iter( + page, + '