[myhentaigallery] update and fix extraction (#1001)

- extract more metadata
- match "/show/" URLs
- complete test results
- fix missing images for lines starting with " <img"
- fix missing comma in supportedsites.py
4 years ago · c43b3894be
parent 05b9ac8d37
commit c43b3894be
3 changed files with 31 additions and 18 deletions
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -81,6 +81,7 @@ MangaPark            https://mangapark.net/              Chapters, Manga
 Mangareader          https://www.mangareader.net/        Chapters, Manga
 Mangoxo              https://www.mangoxo.com/            Albums, Channels                                   Optional
 mastodon.social      https://mastodon.social/            Images from Statuses, User Profiles                Optional (`OAuth <https://github.com/mikf/gallery-dl#oauth>`__)
+My Hentai Gallery    https://myhentaigallery.com/        Galleries
 Naver                https://blog.naver.com/             Blogs, Posts
 Newgrounds           https://www.newgrounds.com/         |newgrounds-C|                                     Optional
 Ngomik               http://ngomik.in/                   Chapters
--- a/gallery_dl/extractor/myhentaigallery.py
+++ b/gallery_dl/extractor/myhentaigallery.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-

-# Copyright 2018-2020 Mike Fährmann
-#
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
@@ -15,16 +13,30 @@ from .. import text, exception
 class MyhentaigalleryGalleryExtractor(GalleryExtractor):
    """Extractor for image galleries from myhentaigallery.com"""
    category = "myhentaigallery"
-    directory_fmt = ("{category}", "{gallery_id} [{artist}] {title}")
-    pattern = (r"(?:https?://)?(myhentaigallery\.com"
-               r"/gallery/thumbnails/[0-9]+)")
+    directory_fmt = ("{category}", "{gallery_id} {artist:?[/] /J, }{title}")
+    pattern = (r"(?:https?://)?myhentaigallery\.com"
+               r"/gallery/(?:thumbnails|show)/(\d+)")
    test = (
-        ("https://myhentaigallery.com/gallery/thumbnails/16247"),
-        ("https://myhentaigallery.com/gallery/thumbnails/15224"),
+        ("https://myhentaigallery.com/gallery/thumbnails/16247", {
+            "pattern": r"https://images.myhentaigrid.com/imagesgallery/images"
+                       r"/[^/]+/original/\d+\.jpg",
+            "keyword": {
+                "artist"    : list,
+                "count"     : 11,
+                "gallery_id": 16247,
+                "group"     : list,
+                "parodies"  : list,
+                "tags"      : ["Giantess"],
+                "title"     : "Attack Of The 50ft Woman 1",
+            },
+        }),
+        ("https://myhentaigallery.com/gallery/show/16247/1"),
    )
+    root = "https://myhentaigallery.com"

    def __init__(self, match):
-        url = "https://" + match.group(1)
+        self.gallery_id = match.group(1)
+        url = "{}/gallery/thumbnails/{}".format(self.root, self.gallery_id)
        GalleryExtractor.__init__(self, match, url)
        self.session.headers["Referer"] = url

@@ -32,22 +44,22 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
        extr = text.extract_from(page)
        split = text.split_html

-        image = extr('<div class="comic-cover">\n<a href="', '"')
        title = extr('<div class="comic-description">\n<h1>', '</h1>')
        if not title:
            raise exception.NotFoundError("gallery")
-        data = {
+
+        return {
            "title"     : text.unescape(title),
-            "gallery_id": text.parse_int(image.split("/")[-2]),
+            "gallery_id": text.parse_int(self.gallery_id),
            "tags"      : split(extr('<div>\nCategories:', '</div>')),
+            "artist"    : split(extr('<div>\nArtists:'   , '</div>')),
+            "group"     : split(extr('<div>\nGroups:'    , '</div>')),
+            "parodies"  : split(extr('<div>\nParodies:'  , '</div>')),
        }
-        artists = split(extr('<div>\nArtists:', '</div>'))
-        data["artist"] = artists[0] if artists else "Unknown"
-        return data

    def images(self, page):
-        extr = text.extract_iter
        return [
-            (text.unescape(url).replace("/thumbnail/", "/original/"), None)
-            for url in extr(page, 'class="comic-thumb">\n<img src="', '"')
+            (text.unescape(text.extract(url, 'src="', '"')[0]).replace(
+                "/thumbnail/", "/original/"), None)
+            for url in text.extract_iter(page, 'class="comic-thumb"', '</div>')
        ]
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -60,7 +60,7 @@ CATEGORY_MAP = {
    "mangapark"      : "MangaPark",
    "mangastream"    : "Manga Stream",
    "mastodon.social": "mastodon.social",
-    "myhentaigallery": "My Hentai Gallery"
+    "myhentaigallery": "My Hentai Gallery",
    "myportfolio"    : "Adobe Portfolio",
    "nhentai"        : "nhentai",
    "nijie"          : "nijie",