From e5e752d928cab4e9063e61348827ecfb844c19a8 Mon Sep 17 00:00:00 2001 From: Wyoh Knott Date: Sat, 11 May 2024 14:52:21 +0200 Subject: [PATCH] [subscribestar] Refactoring extractor and handling audio content - New support for embedded audios - New support for external links compatible with yt-dlp - Add a content_type field at the post level for directory creation - Major rework of the logic - Added a check_if_supported_by_ytdlp helper function in util.py for yt-dlp external links handling --- gallery_dl/extractor/subscribestar.py | 132 +++++++++++++++++--------- gallery_dl/util.py | 11 ++- 2 files changed, 97 insertions(+), 46 deletions(-) diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index d4adfed9..bbf61c18 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2023 Mike Fährmann +# Copyright 2020-2024 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,6 +17,7 @@ BASE_PATTERN = r"(?:https?://)?(?:www\.)?subscribestar\.(com|adult)" class SubscribestarExtractor(Extractor): """Base class for subscribestar extractors""" + category = "subscribestar" root = "https://www.subscribestar.com" directory_fmt = ("{category}", "{author_name}") @@ -74,6 +75,7 @@ class SubscribestarExtractor(Extractor): response = self.request( url, method="POST", headers=headers, data=data, fatal=False) + if response.json().get("errors"): self.log.debug(response.json()["errors"]) raise exception.AuthenticationError() @@ -84,44 +86,98 @@ class SubscribestarExtractor(Extractor): if cookie.name.startswith("auth") } - def _media_from_post(self, html): + def _extract_media(self, html, media_types): media = [] + media_config = { + "gallery": ('data-gallery="', '"', self._process_gallery_item), + "attachments": ( + 'class="uploads-docs"', + 'data-role="post-edit_form"', + self._process_attachment_item, + ), + "link": ('data-href="', '"', self._process_media_item), + "audio": ('', '<')), - "url" : text.unescape(text.extr(att, 'href="', '"')), - "type": "attachment", - }) + for key, config in media_types.items(): + if key in media_config: + start, end, processor = media_config[key] + segments = ( + text.extract_all( + html, + ((key, start, end),), + )[0], + ) + for segment in segments: + if segment[key]: + content = processor(segment, key) + if content: + media.append(content) + return media + + def _process_gallery_item(self, item, media_type): + gallery_list = util.json_loads(text.unescape(item["gallery"])) + for media in gallery_list: + if "/previews" in media["url"]: + self._warn_preview() + return {"url": media["url"], "type": media_type} + + def _process_attachment_item(self, item, media_type): + return { + "id": text.parse_int(text.extr(item, 'data-upload-id="', '"')), + "name": text.unescape(text.extr(item, 'doc_preview-title">', "<")), + "url": text.unescape(text.extr(item, 'href="', '"')), + "type": media_type, + } + + def _process_media_item(self, item, media_type): + if media_type == "link" and util.check_if_supported_by_ytdlp( + item[media_type]): + return {"url": "ytdl:" + item[media_type], "type": media_type} + elif media_type == "audio": + return {"url": item[media_type], "type": media_type} + def _media_from_post(self, html): + media_types = { + "gallery": True, + "attachments": True, + "link": True, + "audio": True, + } + media = self._extract_media(html, media_types) return media def _data_from_post(self, html): extr = text.extract_from(html) + + links = (text.extract_all(html, (("url", 'data-href="', '"'),), )[0],) + audios = (text.extract_all(html, (("url", '', '<')), - "date" : self._parse_datetime(extr( - 'class="post-date">', '")[2]), - "content" : (extr( - '
")[2]), + "post_id" : text.parse_int(extr('data-id="', '"')), + "author_id" : text.parse_int(extr('data-user-id="', '"')), + "author_name" : text.unescape(extr('href="/', '"')), + "author_nick" : text.unescape(extr(">", "<")), + "date" : self._parse_datetime( + extr('class="post-date">', "")[2]), + "content" : extr("\n", "\n"), + "content_type" : content_type, } def _parse_datetime(self, dt): @@ -166,17 +222,3 @@ class SubscribestarPostExtractor(SubscribestarExtractor): def posts(self): url = "{}/posts/{}".format(self.root, self.item) return (self.request(url).text,) - - def _data_from_post(self, html): - extr = text.extract_from(html) - return { - "post_id" : text.parse_int(extr('data-id="', '"')), - "author_name": text.unescape(extr('href="/', '"')), - "author_id" : text.parse_int(extr('data-user-id="', '"')), - "author_nick": text.unescape(extr('alt="', '"')), - "date" : self._parse_datetime(extr( - '', '<')), - "content" : (extr( - '
")[2]), - } diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 861ec7eb..44056aa5 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -24,7 +24,7 @@ import subprocess import urllib.parse from http.cookiejar import Cookie from email.utils import mktime_tz, parsedate_tz -from . import text, version, exception +from . import text, version, exception, ytdl, config def bencode(num, alphabet="0123456789"): @@ -496,6 +496,15 @@ CODES = { } +def check_if_supported_by_ytdlp(url): + ytdl_module = ytdl.import_module( + config.get(("extractor", "ytdl"), "module")) + for ie in ytdl_module.extractor.gen_extractor_classes(): + if ie.suitable(url): + return True + return False + + class HTTPBasicAuth(): __slots__ = ("authorization",)