From a9119da4d4ab3e611b6b67ae68c0a173cd8ed97f Mon Sep 17 00:00:00 2001
From: topozorra <78620271+topozorra@users.noreply.github.com>
Date: Wed, 3 Mar 2021 09:20:47 -0500
Subject: [PATCH] support `tumblrgallery.xyz` (#1298)

* support `tumblrgallery.xyz`

* fix format issues

* Refactor and add post and search page support

* Fix warnings

* Few improvements

* Better file names

* Fix linting errors

* move id closer to the beginning of the file name

Co-authored-by: topozorra <none>
---
 gallery_dl/extractor/__init__.py      |   1 +
 gallery_dl/extractor/tumblrgallery.py | 178 ++++++++++++++++++++++++++
 scripts/supportedsites.py             |   1 +
 3 files changed, 180 insertions(+)
 create mode 100644 gallery_dl/extractor/tumblrgallery.py
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 62233940..a70fbc92 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -113,6 +113,7 @@ modules = [
     "subscribestar",
     "tsumino",
     "tumblr",
+    "tumblrgallery",
     "twitter",
     "unsplash",
     "vanillarock",
diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py
new file mode 100644
index 00000000..c9ef16a7
--- /dev/null
+++ b/gallery_dl/extractor/tumblrgallery.py
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://tumblrgallery.xyz/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+BASE_PATTERN = r"(?:https?://)?tumblrgallery\.xyz"  # scheme is optional
+
+
+class TumblrgalleryGalleryExtractor(GalleryExtractor):
+    """Shared base class of the tumblrgallery.xyz extractors"""
+    cookiedomain = None
+    category = "tumblrgallery"
+
+    def __init__(self, match):
+        self.root = "https://tumblrgallery.xyz"  # set before parent init
+        super().__init__(match)
+
+
+class TumblrgalleryTumblrblogExtractor(TumblrgalleryGalleryExtractor):
+    """Extractor for Tumblrblog galleries on tumblrgallery.xyz"""
+    subcategory = "tumblrblog"
+    pattern = BASE_PATTERN + r"(/tumblrblog/gallery/(\d+)\.html)"
+    test = (
+        "https://tumblrgallery.xyz/tumblrblog/gallery/103975.html", {
+            "pattern": r"/tumblrblog/gallery/103975.html"
+                       r"103975",
+        }
+    )
+
+    filename_fmt = "{category}_{gallery_id}_{num:>03}_{id}.{extension}"
+    directory_fmt = ("{category}", "{gallery_id} {title}")
+
+    def __init__(self, match):
+        TumblrgalleryGalleryExtractor.__init__(self, match)
+        # second regex group holds the numeric gallery ID from the URL
+        self.gallery_id = text.parse_int(match.group(2))
+
+    def metadata(self, page):
+        """Return gallery title (from <h1>) and gallery ID for 'page'"""
+        # pick the extracted string ([0]) *before* unescaping it and
+        # fall back to "" if nothing was extracted
+        title = text.extract(page, "<h1>", "</h1>")[0] or ""
+        return {"title": text.unescape(title), "gallery_id": self.gallery_id}
+
+    def images(self, _):
+        """Yield (URL, metadata) for each image on all gallery pages"""
+        page_num = 1
+        while True:
+            # stop at the first page not answered with status 200
+            response = self.request(
+                "{}/tumblrblog/gallery/{}/{}.html"
+                .format(self.root, self.gallery_id, page_num),
+                allow_redirects=False
+            )
+            if response.status_code != 200:
+                return
+            page_num += 1
+
+            for image_src in text.extract_iter(
+                    response.text,
+                    '<div class="report xx-co-me"> <a href="',
+                    '" data-fancybox="gallery"'):
+                yield image_src, {
+                    # fall back to the file name for non-'tumblr_' URLs
+                    "id": text.extract(image_src, "tumblr_", "_")[0] or
+                    text.nameext_from_url(image_src)["filename"]
+                }
+
+
+class TumblrgalleryPostExtractor(TumblrgalleryGalleryExtractor):
+    """Extractor for Posts on tumblrgallery.xyz"""
+    subcategory = "post"
+    pattern = BASE_PATTERN + r"(/post/(\d+)\.html)"
+    test = (
+        "https://tumblrgallery.xyz/post/405674.html", {
+            "pattern": r"/post/405674.html"
+                       r"405674",
+        }
+    )
+
+    filename_fmt = "{category}_{gallery_id}_{num:>03}_{id}.{extension}"
+    directory_fmt = ("{category}", "{gallery_id} {title}")
+
+    def __init__(self, match):
+        TumblrgalleryGalleryExtractor.__init__(self, match)
+        self.gallery_id = text.parse_int(match.group(2))  # post ID from URL
+
+    def metadata(self, page):
+        """Return post title (from <title>) and post ID for 'page'"""
+        # fall back to "" so unescape() always receives a string
+        title = text.extract(page, "<title>", "</title>")[0] or ""
+        return {
+            # strip markup and use '-' instead of '_' in the title
+            "title": text.remove_html(text.unescape(title)).replace("_", "-"),
+            "gallery_id": self.gallery_id,
+        }
+
+    def images(self, page):
+        """Yield (URL, metadata) pairs for each image linked on 'page'"""
+        for image_src in text.extract_iter(
+                page,
+                '<div class="report xx-co-me"> <a href="',
+                '" data-fancybox="gallery"'):
+            yield image_src, {
+                # fall back to the file name for non-'tumblr_' URLs
+                "id": text.extract(image_src, "tumblr_", "_")[0] or
+                text.nameext_from_url(image_src)["filename"]
+            }
+
+
+class TumblrgallerySearchExtractor(TumblrgalleryGalleryExtractor):
+    """Extractor for Search result on tumblrgallery.xyz"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"(/s\.php\?q=([^&#]+))"
+    test = (
+        "https://tumblrgallery.xyz/s.php?q=everyday-life", {
+            "pattern": r"everyday-life",
+        }
+    )
+
+    filename_fmt = "{category}_{num:>03}_{gallery_id}_{id}_{title}.{extension}"
+    directory_fmt = ("{category}", "{search_term}")
+
+    def __init__(self, match):
+        self.search_term = match.group(2)
+        TumblrgalleryGalleryExtractor.__init__(self, match)
+
+    def metadata(self, page):
+        """Collect metadata for extractor-job"""
+        return {
+            "search_term": self.search_term,
+        }
+
+    def images(self, _):
+        """Yield (URL, metadata) for every image of every search result"""
+        page_num = 1
+        while True:
+            # stop at the first result page not answered with status 200
+            response = self.request(
+                "{}/s.php?q={}&page={}"
+                .format(self.root, self.search_term, page_num),
+                allow_redirects=False
+            )
+            if response.status_code != 200:
+                return
+            page_num += 1
+
+            for gallery_id in text.extract_iter(
+                    response.text,
+                    '<div class="title"><a href="post/',
+                    '.html'):
+                post_page = self.request(
+                    "{}/post/{}.html"
+                    .format(self.root, gallery_id),
+                    allow_redirects=False
+                ).text
+
+                # the title is the same for all images of a post, so it
+                # is computed once; "" is used if no <title> was extracted
+                title = text.remove_html(text.unescape(
+                    text.extract(post_page, "<title>", "</title>")[0] or ""
+                )).replace("_", "-")
+
+                for image_src in TumblrgalleryPostExtractor.images(
+                    self, post_page
+                ):
+                    image_src[1]["title"] = title
+                    image_src[1]["gallery_id"] = gallery_id
+                    yield image_src
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index 25b63e95..1dce3c95 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -90,6 +90,7 @@ CATEGORY_MAP = {
     "speakerdeck"    : "Speaker Deck",
     "subscribestar"  : "SubscribeStar",
     "thebarchive"    : "The /b/ Archive",
+    "tumblrgallery"  : "TumblrGallery",
     "vanillarock"    : "もえぴりあ",
     "vsco"           : "VSCO",
     "webtoons"       : "Webtoon",