From 1384ebf907133ab6ae674a1c949476c7d6647b67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 28 Mar 2019 23:35:11 +0100 Subject: [PATCH] [luscious] fix metadata extraction - remove 'artist', 'language', and 'lang' fields - replace 'section' with 'genre' - provide 'tags' as list - use GalleryExtractor as base class --- gallery_dl/extractor/luscious.py | 116 ++++++++++++++----------------- 1 file changed, 54 insertions(+), 62 deletions(-) diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index 0472bad6..0b82c27f 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -8,12 +8,12 @@ """Extractors for https://luscious.net/""" -from .common import Extractor, Message, AsynchronousMixin -from .. import text, util, exception +from .common import GalleryExtractor, Extractor, Message +from .. import text, exception from ..cache import cache -class LusciousExtractor(Extractor): +class LusciousBase(Extractor): """Base class for luscious extractors""" category = "luscious" cookiedomain = ".luscious.net" @@ -34,7 +34,7 @@ class LusciousExtractor(Extractor): "login": username, "password": password, "remember": "on", - "next": "" "/", + "next": "/", } response = self.request(url, method="POST", headers=headers, data=data) @@ -45,24 +45,29 @@ class LusciousExtractor(Extractor): return {cookie.name: cookie.value} raise exception.AuthenticationError() + @staticmethod + def _parse_tags(tags): + return [ + text.unescape(tag.replace(":_", ":")) + for tag in text.extract_iter(tags or "", "/tagged/+", "/") + ] -class LusciousAlbumExtractor(AsynchronousMixin, LusciousExtractor): + +class LusciousAlbumExtractor(LusciousBase, GalleryExtractor): """Extractor for image albums from luscious.net""" subcategory = "album" - directory_fmt = ("{category}", "{gallery_id} {title}") - filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" archive_fmt = "{gallery_id}_{image_id}" pattern = (r"(?:https?://)?(?:www\.|members\.)?luscious\.net" r"/(?:albums|pictures/c/[^/?&#]+/album)/([^/?&#]+_(\d+))") test = ( ("https://luscious.net/albums/okinami-no-koigokoro_277031/", { "url": "7e4984a271a1072ac6483e4228a045895aff86f3", - "keyword": "5ab53959f25a468455f79149461d26547669e50e", + "keyword": "b5cc69b36689e7360876dd1f8ef2395782eb493f", "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3", }), ("https://luscious.net/albums/virgin-killer-sweater_282582/", { "url": "21cc68a7548f4d71dfd67d8caf96349dde7e791c", - "keyword": "3de82f61ad4afd0f546ab5ae5bf9c5388cc9c3db", + "keyword": "f8e5e7b32a7ff777cae5a89e93d06eb51afe3f48", }), ("https://luscious.net/albums/not-found_277035/", { "exception": exception.NotFoundError, @@ -78,22 +83,11 @@ class LusciousAlbumExtractor(AsynchronousMixin, LusciousExtractor): ) def __init__(self, match): - LusciousExtractor.__init__(self, match) - self.gpart, self.gid = match.groups() - - def items(self): - self.login() - url = "{}/albums/{}/".format(self.root, self.gpart) - page = self.request(url).text - data = self.metadata(page) - yield Message.Version, 1 - yield Message.Directory, data - for url, image in self.images(page): - image.update(data) - yield Message.Url, url, image + path, self.gallery_id = match.groups() + url = "{}/albums/{}/".format(self.root, path) + GalleryExtractor.__init__(self, match, url) def metadata(self, page): - """Collect metadata for extractor-job""" pos = page.find("

404 Not Found

") if pos >= 0: msg = text.extract(page, '
', '
', pos)[0] @@ -101,84 +95,82 @@ class LusciousAlbumExtractor(AsynchronousMixin, LusciousExtractor): raise exception.AuthorizationError() raise exception.NotFoundError("album") - data = text.extract_all(page, ( - ("tags" , '', ''), - ("count" , '

', ' '), - (None , '

Section:', ''), - ("section" , '>', '<'), - ("language", '

Language:', ' '), - ), values={"gallery_id": self.gid})[0] - data["lang"] = util.language_to_code(data["language"]) - try: - data["artist"] = text.extract(data["tags"], "rtist: ", ",")[0] - except AttributeError: - data["artist"] = None - return data + title, pos = text.extract(page, '"og:title" content="', '"') + info , pos = text.extract(page, '

  • ', "", pos) + if info is None: + count, pos = text.extract(page, '>Pages:', '<', pos) + else: + count, pos = text.extract(page, '

    ', ' ', pos) + genre, pos = text.extract(page, '

    Genre:', '

    ', pos) + tags , pos = text.extract(page, '"tag_list static">', '', pos) + + return { + "gallery_id": text.parse_int(self.gallery_id), + "title": text.unescape(title or ""), + "count": text.parse_int(count), + "genre": text.remove_html(genre), + "tags" : self._parse_tags(tags), + } def images(self, page): - """Collect image-URLs and -metadata""" extr = text.extract - num = 1 if 'class="search_filter' in page: url = "{}/pictures/album/x_{}/sorted/oldest/page/1/".format( - self.root, self.gid) + self.root, self.gallery_id) page = self.request(url).text pos = page.find('