From 286d0cb098a24916ecccf5a24961bf4847073dca Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Fri, 17 Nov 2023 19:34:34 -0500 Subject: [PATCH 1/6] [tmohentai] add support --- docs/supportedsites.md | 6 +++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/tmohentai.py | 78 +++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 gallery_dl/extractor/tmohentai.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a15566df..94cef0f7 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -829,6 +829,12 @@ Consider all sites to be NSFW unless otherwise known. Galleries + + Tmohentai + https://tmohentai.com/ + Galleries + + Toyhouse https://toyhou.se/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 22e4fe34..efdcde78 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -147,6 +147,7 @@ modules = [ "tapas", "tcbscans", "telegraph", + "tmohentai", "toyhouse", "tsumino", "tumblr", diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py new file mode 100644 index 00000000..462e51dd --- /dev/null +++ b/gallery_dl/extractor/tmohentai.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://tmohentai.com/""" + +from .common import Extractor, Message +from .. 
import text + +BASE_PATTERN = r'(?:https?://)?tmohentai\.com' + + +class TmohentaiExtractor(Extractor): + category = 'tmohentai' + root = 'http://tmohentai.com' + directory_fmt = ('{category}', '{title}') + filename_fmt = '{filename}.{extension}' + archive_fmt = '{title}_{filename}' + pattern = BASE_PATTERN + r'/((contents)|(reader))/(\w+)' + example = 'https://tmohentai.com/contents/12345a67b89c0' + + def __init__(self, match): + Extractor.__init__(self, match) + self.contents = match.group(2) + self.reader = match.group(3) + self.id_string = match.group(4) + + def parse_location(self): + if self.contents: + url = f'{self.root}/reader/{self.id_string}/paginated' + else: + url = self.url + return url + + def items(self): + url = self.parse_location() + page_src = self.request( + text.ensure_http_scheme(url)).text + + data = self.metadata() + yield Message.Directory, data + + page_nums = text.extract_iter(page_src, 'option value="', '"') + pages = [text.extr(page_src, 'data-original="', '"')] + base_page = pages[0].rpartition('/')[0] + for num, page in enumerate(page_nums, start=1): + file = f'{base_page}/{num:>03}.webp' + img = text.nameext_from_url(file, { + 'num': num, + }) + yield Message.Url, file, img + + def metadata(self): + contents = f'{self.root}/contents/{self.id_string}' + contents_src = self.request(text.ensure_http_scheme(contents)).text + + genders_src = text.extr(contents_src, 'Genders', '') + genders_list = text.extract_iter(genders_src, '">', '') + + tags_src = text.extr(contents_src, 'Tags', '') + tags_list = text.extract_iter(tags_src, '">', '') + + upload_src = text.extr(contents_src, 'Uploaded By', '/a>') + data = { + 'title' : text.extr(contents_src, '

', '

'), + 'id_string': self.id_string, + 'artists' : text.remove_html( + text.extr(contents_src, 'tag tag-accepted">', '')), + 'genders' : list(genders_list), + 'tags' : list(tags_list), + 'uploader' : text.extr(upload_src, '">', '<'), + 'language' : text.extr( + contents_src, ' ', ''), + } + return data From dad7ba1d581b0829ccc6c4cd8a44efcb58cb3d51 Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Fri, 17 Nov 2023 21:08:34 -0500 Subject: [PATCH 2/6] [tmohentai] fix edge cases. updated archive_fmt and filename_fmt --- gallery_dl/extractor/tmohentai.py | 34 ++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index 462e51dd..0a56b230 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -16,8 +16,8 @@ class TmohentaiExtractor(Extractor): category = 'tmohentai' root = 'http://tmohentai.com' directory_fmt = ('{category}', '{title}') - filename_fmt = '{filename}.{extension}' - archive_fmt = '{title}_{filename}' + filename_fmt = '{title}_{filename}.{extension}' + archive_fmt = '{id_string}_{filename}' pattern = BASE_PATTERN + r'/((contents)|(reader))/(\w+)' example = 'https://tmohentai.com/contents/12345a67b89c0' @@ -31,9 +31,20 @@ class TmohentaiExtractor(Extractor): if self.contents: url = f'{self.root}/reader/{self.id_string}/paginated' else: - url = self.url + url_str = self.url.rpartition('/') + if url_str[-1].isdigit(): + url = url_str[0] + else: + url = self.url return url + @staticmethod + def get_file_info(page_src): + file = text.extr(page_src, 'data-original="', '"') + file_loc, _, file_name = file.rpartition('/') + start_num, ext = file_name.split('.') + return file_loc, start_num, ext + def items(self): url = self.parse_location() page_src = self.request( @@ -42,13 +53,16 @@ class TmohentaiExtractor(Extractor): data = self.metadata() yield Message.Directory, data - page_nums = text.extract_iter(page_src, 'option 
value="', '"') - pages = [text.extr(page_src, 'data-original="', '"')] - base_page = pages[0].rpartition('/')[0] - for num, page in enumerate(page_nums, start=1): - file = f'{base_page}/{num:>03}.webp' + file_loc, start_num, ext = self.get_file_info(page_src) + page_nums = text.extract_iter( + page_src, 'option value="', '"') + + for num, page in enumerate(page_nums, start=int(start_num)): + file = f'{file_loc}/{num:>03}.{ext}' img = text.nameext_from_url(file, { - 'num': num, + 'num' : num, + 'title' : data['title'], + 'id_string': self.id_string, }) yield Message.Url, file, img @@ -64,7 +78,7 @@ class TmohentaiExtractor(Extractor): upload_src = text.extr(contents_src, 'Uploaded By', '/a>') data = { - 'title' : text.extr(contents_src, '

', '

'), + 'title' : text.extr(contents_src, '

', '

').strip(), 'id_string': self.id_string, 'artists' : text.remove_html( text.extr(contents_src, 'tag tag-accepted">', '')), From ed965eecbb42adaebe9ddbbd4a8bfe48214b1ae7 Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Sat, 18 Nov 2023 14:39:17 -0500 Subject: [PATCH 3/6] [tmohentai] refactor to str.format for backwards compatibility --- gallery_dl/extractor/tmohentai.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index 0a56b230..a02b8e8a 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -29,7 +29,7 @@ class TmohentaiExtractor(Extractor): def parse_location(self): if self.contents: - url = f'{self.root}/reader/{self.id_string}/paginated' + url = '{}/reader/{}/paginated'.format(self.root, self.id_string) else: url_str = self.url.rpartition('/') if url_str[-1].isdigit(): @@ -58,7 +58,7 @@ class TmohentaiExtractor(Extractor): page_src, 'option value="', '"') for num, page in enumerate(page_nums, start=int(start_num)): - file = f'{file_loc}/{num:>03}.{ext}' + file = '{}/{:>03}.{}'.format(file_loc, num, ext) img = text.nameext_from_url(file, { 'num' : num, 'title' : data['title'], @@ -67,7 +67,7 @@ class TmohentaiExtractor(Extractor): yield Message.Url, file, img def metadata(self): - contents = f'{self.root}/contents/{self.id_string}' + contents = '{}/contents/{}'.format(self.root, self.id_string) contents_src = self.request(text.ensure_http_scheme(contents)).text genders_src = text.extr(contents_src, 'Genders', '') From 31963fa9478cbfeb55b8f98e283266b83e819dc1 Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Mon, 20 Nov 2023 21:35:32 -0500 Subject: [PATCH 4/6] [tmohentai] inherit from GalleryExtractor. refactor metadata. 
--- gallery_dl/extractor/tmohentai.py | 75 ++++++++++++------------------- 1 file changed, 28 insertions(+), 47 deletions(-) diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index a02b8e8a..ef05f989 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -6,14 +6,15 @@ """Extractors for https://tmohentai.com/""" -from .common import Extractor, Message +from .common import GalleryExtractor, Message from .. import text BASE_PATTERN = r'(?:https?://)?tmohentai\.com' -class TmohentaiExtractor(Extractor): +class TmohentaiGalleryExtractor(GalleryExtractor): category = 'tmohentai' + subcategory = 'gallery' root = 'http://tmohentai.com' directory_fmt = ('{category}', '{title}') filename_fmt = '{title}_{filename}.{extension}' @@ -22,71 +23,51 @@ class TmohentaiExtractor(Extractor): example = 'https://tmohentai.com/contents/12345a67b89c0' def __init__(self, match): - Extractor.__init__(self, match) + GalleryExtractor.__init__(self, match) self.contents = match.group(2) self.reader = match.group(3) self.id_string = match.group(4) def parse_location(self): - if self.contents: - url = '{}/reader/{}/paginated'.format(self.root, self.id_string) - else: - url_str = self.url.rpartition('/') - if url_str[-1].isdigit(): - url = url_str[0] - else: - url = self.url + url = self.url + if self.reader: + url = '{}/contents/{}'.format(self.root, self.id_string) return url - @staticmethod - def get_file_info(page_src): - file = text.extr(page_src, 'data-original="', '"') - file_loc, _, file_name = file.rpartition('/') - start_num, ext = file_name.split('.') - return file_loc, start_num, ext - def items(self): url = self.parse_location() - page_src = self.request( + page = self.request( text.ensure_http_scheme(url)).text + data = self.metadata(page) - data = self.metadata() yield Message.Directory, data + imgs = self.images(page) - file_loc, start_num, ext = self.get_file_info(page_src) - page_nums = text.extract_iter( - 
page_src, 'option value="', '"') - - for num, page in enumerate(page_nums, start=int(start_num)): - file = '{}/{:>03}.{}'.format(file_loc, num, ext) - img = text.nameext_from_url(file, { - 'num' : num, + cdn = 'https://imgrojo.tmohentai.com/contents' + for num, _ in enumerate(imgs, start=0): + url = ('{}/{}/{:>03}.webp'.format(cdn, self.id_string, num)) + img = text.nameext_from_url(url, { + 'num' : num + 1, 'title' : data['title'], 'id_string': self.id_string, }) - yield Message.Url, file, img - - def metadata(self): - contents = '{}/contents/{}'.format(self.root, self.id_string) - contents_src = self.request(text.ensure_http_scheme(contents)).text + yield Message.Url, url, img - genders_src = text.extr(contents_src, 'Genders', '') - genders_list = text.extract_iter(genders_src, '">', '') + def images(self, page): + pages = text.extract_iter( + page, 'class="lanzador', '>') + return pages - tags_src = text.extr(contents_src, 'Tags', '') - tags_list = text.extract_iter(tags_src, '">', '') + def metadata(self, page): + extr = text.extract_from(page, page.index('tag tag-accepted">')) - upload_src = text.extr(contents_src, 'Uploaded By', '/a>') data = { - 'title' : text.extr(contents_src, '

', '

').strip(), + 'title' : text.extr(page, '

', '

').strip(), 'id_string': self.id_string, - 'artists' : text.remove_html( - text.extr(contents_src, 'tag tag-accepted">', '')), - 'genders' : list(genders_list), - 'tags' : list(tags_list), - 'uploader' : text.extr(upload_src, '">', '<'), - 'language' : text.extr( - contents_src, ' ', ''), + 'artists' : text.remove_html(extr('">', '')), + 'genders' : text.split_html(extr('Genders', '', '')), + 'uploader' : text.remove_html(extr('Uploaded By', '')), + 'language' : extr(' ', '\n'), } return data From 714b1a7089aafdfef4eb2e8b74c7faef4564083a Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Tue, 21 Nov 2023 10:46:48 -0500 Subject: [PATCH 5/6] [tmohentai] simplify url matching --- gallery_dl/extractor/tmohentai.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index ef05f989..d4e16086 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -19,25 +19,17 @@ class TmohentaiGalleryExtractor(GalleryExtractor): directory_fmt = ('{category}', '{title}') filename_fmt = '{title}_{filename}.{extension}' archive_fmt = '{id_string}_{filename}' - pattern = BASE_PATTERN + r'/((contents)|(reader))/(\w+)' + pattern = BASE_PATTERN + r'/(contents)|(reader)/(\w+)' example = 'https://tmohentai.com/contents/12345a67b89c0' def __init__(self, match): - GalleryExtractor.__init__(self, match) - self.contents = match.group(2) - self.reader = match.group(3) - self.id_string = match.group(4) - - def parse_location(self): - url = self.url - if self.reader: - url = '{}/contents/{}'.format(self.root, self.id_string) - return url + self.id_string = match.group(2) + url = '{}/contents/{}'.format(self.root, self.id_string) + GalleryExtractor.__init__(self, match, url) def items(self): - url = self.parse_location() page = self.request( - text.ensure_http_scheme(url)).text + text.ensure_http_scheme(self.url)).text data = self.metadata(page) yield Message.Directory, 
data @@ -61,7 +53,7 @@ class TmohentaiGalleryExtractor(GalleryExtractor): def metadata(self, page): extr = text.extract_from(page, page.index('tag tag-accepted">')) - data = { + return { 'title' : text.extr(page, '

', '

').strip(), 'id_string': self.id_string, 'artists' : text.remove_html(extr('">', '')), @@ -70,4 +62,3 @@ class TmohentaiGalleryExtractor(GalleryExtractor): 'uploader' : text.remove_html(extr('Uploaded By', '')), 'language' : extr(' ', '\n'), } - return data From c4a201ed42e6679d7edc3ce98d75054f574c00fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 21 Nov 2023 20:24:07 +0100 Subject: [PATCH 6/6] [tmohentai] simplify + tests --- docs/supportedsites.md | 2 +- gallery_dl/extractor/tmohentai.py | 66 ++++++++++++------------------- scripts/supportedsites.py | 1 + test/results/tmohentai.py | 54 +++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 42 deletions(-) create mode 100644 test/results/tmohentai.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 94cef0f7..8aadcde5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -830,7 +830,7 @@ Consider all sites to be NSFW unless otherwise known. - Tmohentai + TMOHentai https://tmohentai.com/ Galleries diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index d4e16086..be45702a 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -6,59 +6,43 @@ """Extractors for https://tmohentai.com/""" -from .common import GalleryExtractor, Message +from .common import GalleryExtractor from .. 
import text -BASE_PATTERN = r'(?:https?://)?tmohentai\.com' +BASE_PATTERN = r"(?:https?://)?tmohentai\.com" class TmohentaiGalleryExtractor(GalleryExtractor): - category = 'tmohentai' - subcategory = 'gallery' - root = 'http://tmohentai.com' - directory_fmt = ('{category}', '{title}') - filename_fmt = '{title}_{filename}.{extension}' - archive_fmt = '{id_string}_{filename}' - pattern = BASE_PATTERN + r'/(contents)|(reader)/(\w+)' - example = 'https://tmohentai.com/contents/12345a67b89c0' + category = "tmohentai" + root = "http://tmohentai.com" + directory_fmt = ("{category}", "{title} ({gallery_id})") + pattern = BASE_PATTERN + r"/(?:contents|reader)/(\w+)" + example = "https://tmohentai.com/contents/12345a67b89c0" def __init__(self, match): - self.id_string = match.group(2) - url = '{}/contents/{}'.format(self.root, self.id_string) + self.gallery_id = match.group(1) + url = "{}/contents/{}".format(self.root, self.gallery_id) GalleryExtractor.__init__(self, match, url) - def items(self): - page = self.request( - text.ensure_http_scheme(self.url)).text - data = self.metadata(page) - - yield Message.Directory, data - imgs = self.images(page) - - cdn = 'https://imgrojo.tmohentai.com/contents' - for num, _ in enumerate(imgs, start=0): - url = ('{}/{}/{:>03}.webp'.format(cdn, self.id_string, num)) - img = text.nameext_from_url(url, { - 'num' : num + 1, - 'title' : data['title'], - 'id_string': self.id_string, - }) - yield Message.Url, url, img - def images(self, page): - pages = text.extract_iter( - page, 'class="lanzador', '>') - return pages + fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format( + self.gallery_id).format + cnt = page.count('class="lanzador') + return [(fmt(i), None) for i in range(0, cnt)] def metadata(self, page): - extr = text.extract_from(page, page.index('tag tag-accepted">')) + extr = text.extract_from(page) return { - 'title' : text.extr(page, '

', '

').strip(), - 'id_string': self.id_string, - 'artists' : text.remove_html(extr('">', '')), - 'genders' : text.split_html(extr('Genders', '', '')), - 'uploader' : text.remove_html(extr('Uploaded By', '')), - 'language' : extr(' ', '\n'), + "gallery_id": self.gallery_id, + "title" : text.unescape(extr("

", "<").strip()), + "artists" : text.split_html(extr( + "", "")), + "categories": text.split_html(extr( + "", "")), + "tags" : text.split_html(extr( + "", "")), + "uploader" : text.remove_html(extr( + "", "")), + "language" : extr(" ", "\n"), } diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 470b629d..695108e0 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -121,6 +121,7 @@ CATEGORY_MAP = { "tbib" : "The Big ImageBoard", "tcbscans" : "TCB Scans", "tco" : "Twitter t.co", + "tmohentai" : "TMOHentai", "thatpervert" : "ThatPervert", "thebarchive" : "The /b/ Archive", "thecollection" : "The /co/llection", diff --git a/test/results/tmohentai.py b/test/results/tmohentai.py new file mode 100644 index 00000000..2bae050a --- /dev/null +++ b/test/results/tmohentai.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import tmohentai + + +__tests__ = ( +{ + "#url" : "https://tmohentai.com/contents/653c2aeaa693c", + "#category": ("", "tmohentai", "gallery"), + "#class" : tmohentai.TmohentaiGalleryExtractor, + "#pattern" : r"https://imgrojo\.tmohentai\.com/contents/653c2aeaa693c/\d\d\d\.webp", + "#count" : 46, + + "artists" : ["Andoryu"], + "categories": [ + "Big Breasts", + "BlowJob", + "Cheating", + "Mature", + "Milf", + "Student", + ], + "count" : 46, + "extension" : "webp", + "gallery_id": "653c2aeaa693c", + "language" : "Español", + "num" : int, + "tags" : [ + "milf", + "Madre", + "enormes pechos", + "Peluda", + "nakadashi", + "cheating", + "madura", + "sexo a escondidas", + "Ama de casa", + "mamada", + ], + "title" : "La Mama de mi Novia es tan Pervertida que no Pude Soportarlo mas", + "uploader" : "NekoCreme Fansub", +}, + +{ + "#url" : "https://tmohentai.com/reader/653c2aeaa693c/paginated/1", + "#category": ("", "tmohentai", "gallery"), + "#class" : tmohentai.TmohentaiGalleryExtractor, +}, + +)