[mememuseum] add 'tag' and 'post' extractors (closes #2264)

3 years ago · 79a461a2c1
parent e5f6af6e32
commit 79a461a2c1
3 changed files with 127 additions and 0 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@ -469,6 +469,12 @@ Consider all sites to be NSFW unless otherwise known.
    <td>Albums, Channels</td>
    <td>Supported</td>
 </tr>
+<tr>
+    <td>Mememuseum</td>
+    <td>https://meme.museum/</td>
+    <td>Posts, Tag Searches</td>
+    <td></td>
+</tr>
 <tr>
    <td>My Hentai Gallery</td>
    <td>https://myhentaigallery.com/</td>
--- a/gallery_dl/extractor/init.py
+++ b/gallery_dl/extractor/init.py
@ -81,6 +81,7 @@ modules = [
    "mangapark",
    "mangasee",
    "mangoxo",
+    "mememuseum",
    "myhentaigallery",
    "myportfolio",
    "naver",
--- a/gallery_dl/extractor/mememuseum.py
+++ b/gallery_dl/extractor/mememuseum.py
@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://meme.museum/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class MememuseumExtractor(Extractor):
+    """Base class for meme.museum extractors"""
+    basecategory = "booru"
+    category = "mememuseum"
+    filename_fmt = "{category}_{id}_{md5}.{extension}"
+    archive_fmt = "{id}"
+    root = "https://meme.museum"
+
+    def items(self):
+        data = self.metadata()
+
+        for post in self.posts():
+            url = post["file_url"]
+            for key in ("id", "width", "height"):
+                post[key] = text.parse_int(post[key])
+            post["tags"] = text.unquote(post["tags"])
+            post.update(data)
+            yield Message.Directory, post
+            yield Message.Url, url, text.nameext_from_url(url, post)
+
+    def metadata(self):
+        """Return general metadata"""
+        return ()
+
+    def posts(self):
+        """Return an iterable containing data of all relevant posts"""
+        return ()
+
+
+class MememuseumTagExtractor(MememuseumExtractor):
+    """Extractor for images from meme.museum by search-tags"""
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    pattern = r"(?:https?://)?meme\.museum/post/list/([^/?#]+)"
+    test = ("https://meme.museum/post/list/animated/1", {
+        "pattern": r"https://meme\.museum/_images/\w+/\d+%20-%20",
+        "count": ">= 30"
+    })
+    per_page = 25
+
+    def __init__(self, match):
+        MememuseumExtractor.__init__(self, match)
+        self.tags = text.unquote(match.group(1))
+
+    def metadata(self):
+        return {"search_tags": self.tags}
+
+    def posts(self):
+        pnum = 1
+        while True:
+            url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
+            extr = text.extract_from(self.request(url).text)
+
+            while True:
+                mime = extr("data-mime='", "'")
+                if not mime:
+                    break
+
+                pid = extr("data-post-id='", "'")
+                tags, dimensions, size = extr("title='", "'").split(" // ")
+                md5 = extr("/_thumbs/", "/")
+                width, _, height = dimensions.partition("x")
+
+                yield {
+                    "file_url": "{}/_images/{}/{}%20-%20{}.{}".format(
+                        self.root, md5, pid, text.quote(tags),
+                        mime.rpartition("/")[2]),
+                    "id": pid, "md5": md5, "tags": tags,
+                    "width": width, "height": height,
+                    "size": text.parse_bytes(size[:-1]),
+                }
+
+            if not extr(">Next<", ">"):
+                return
+            pnum += 1
+
+
+class MememuseumPostExtractor(MememuseumExtractor):
+    """Extractor for single images from meme.museum"""
+    subcategory = "post"
+    pattern = r"(?:https?://)?meme\.museum/post/view/(\d+)"
+    test = ("https://meme.museum/post/view/10243", {
+        "pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc4997"
+                   r"1f78/10243%20-%20g%20beard%20open_source%20richard_stallm"
+                   r"an%20stallman%20tagme%20text\.jpg",
+        "keyword": "3c8009251480cf17248c08b2b194dc0c4d59580e",
+        "content": "45565f3f141fc960a8ae1168b80e718a494c52d2",
+    })
+
+    def __init__(self, match):
+        MememuseumExtractor.__init__(self, match)
+        self.post_id = match.group(1)
+
+    def posts(self):
+        url = "{}/post/view/{}".format(self.root, self.post_id)
+        extr = text.extract_from(self.request(url).text)
+
+        return ({
+            "id"      : self.post_id,
+            "tags"    : extr(": ", "<"),
+            "md5"     : extr("/_thumbs/", "/"),
+            "file_url": self.root + extr("id='main_image' src='", "'"),
+            "width"   : extr("data-width=", " ").strip("'\""),
+            "height"  : extr("data-height=", " ").strip("'\""),
+            "size"    : 0,
+        },)