diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1818d51b..6bdd4ea6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -319,6 +319,12 @@ Consider all listed sites to potentially be NSFW. Chapters, Manga + + HentaiNexus + https://hentainexus.com/ + Galleries, Search Results + + Hiperdex https://hiperdex.top/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index da555df3..6aff1f3f 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -62,6 +62,7 @@ modules = [ "hentaifox", "hentaihand", "hentaihere", + "hentainexus", "hiperdex", "hitomi", "hotleak", diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py new file mode 100644 index 00000000..2320c38d --- /dev/null +++ b/gallery_dl/extractor/hentainexus.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019-2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hentainexus.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, util +import binascii + + +class HentainexusGalleryExtractor(GalleryExtractor): + """Extractor for hentainexus galleries""" + category = "hentainexus" + root = "https://hentainexus.com" + pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com" + r"/(?:view|read)/(\d+)") + example = "https://hentainexus.com/view/12345" + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "{}/view/{}".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + rmve = text.remove_html + extr = text.extract_from(page) + data = { + "gallery_id": text.parse_int(self.gallery_id), + "cover" : extr('"og:image" content="', '"'), + "title" : extr('

', '

'), + } + + for key in ("Artist", "Book", "Circle", "Event", "Language", + "Magazine", "Parody", "Publisher", "Description"): + value = rmve(extr('viewcolumn">' + key + '', '')) + value, sep, rest = value.rpartition(" (") + data[key.lower()] = value if sep else rest + + data["tags"] = tags = [] + for k in text.extract_iter(page, '> 1 ^ 0xc + else: + C = C >> 1 + k = primes[C & 0x7] + + x = 0 + S = list(range(256)) + for i in range(256): + x = (x + S[i] + key[i % len(key)]) % 256 + S[i], S[x] = S[x], S[i] + + result = "" + a = c = m = x = 0 + for n in range(64, len(blob)): + a = (a + k) % 256 + x = (c + S[(x + S[a]) % 256]) % 256 + c = (c + a + S[a]) % 256 + + S[a], S[x] = S[x], S[a] + m = S[(x + S[(a + S[(m + c) % 256]) % 256]) % 256] + result += chr(blob[n] ^ m) + + return result + + @staticmethod + def _join_title(data): + event = data['event'] + artist = data['artist'] + circle = data['circle'] + title = data['title'] + parody = data['parody'] + book = data['book'] + magazine = data['magazine'] + + # a few galleries have a large number of artists or parodies, + # which get replaced with "Various" in the title string + if artist.count(',') >= 3: + artist = 'Various' + if parody.count(',') >= 3: + parody = 'Various' + + jt = '' + if event: + jt += '({}) '.format(event) + if circle: + jt += '[{} ({})] '.format(circle, artist) + else: + jt += '[{}] '.format(artist) + jt += title + if parody.lower() != 'original work': + jt += ' ({})'.format(parody) + if book: + jt += ' ({})'.format(book) + if magazine: + jt += ' ({})'.format(magazine) + return jt + + +class HentainexusSearchExtractor(Extractor): + """Extractor for hentainexus search results""" + category = "hentainexus" + subcategory = "search" + root = "https://hentainexus.com" + pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com" + r"(?:/page/\d+)?/?(?:\?(q=[^/?#]+))?$") + example = "https://hentainexus.com/?q=QUERY" + + def items(self): + params = text.parse_query(self.groups[0]) + data = {"_extractor": HentainexusGalleryExtractor} + path = "/" + + while path: + page = self.request(self.root + path, params=params).text + extr = text.extract_from(page) + + while True: + gallery_id = extr('