diff --git a/CHANGELOG.md b/CHANGELOG.md index f84e4231..78f3fb1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ # Changelog +## Unreleased + ## 1.14.2 - 2020-06-27 ### Additions - [artstation] add `date` metadata field ([#839](https://github.com/mikf/gallery-dl/issues/839)) diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index b932ef2a..613d5b54 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -120,6 +120,7 @@ SlickPic https://www.slickpic.com/ Albums, User Profiles SlideShare https://www.slideshare.net/ Presentations SmugMug https://www.smugmug.com/ |smugmug-C| Optional (OAuth) Speaker Deck https://speakerdeck.com/ Presentations +Subscribestar https://www.subscribestar.com/ Posts, User Profiles The /b/ Archive https://thebarchive.com/ Threads Tsumino https://www.tsumino.com/ Galleries, Search Results Optional Tumblr https://www.tumblr.com/ Likes, Posts, Tag Searches, User Profiles Optional (OAuth) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 561b4847..12dd36d4 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -108,6 +108,7 @@ modules = [ "slideshare", "smugmug", "speakerdeck", + "subscribestar", "tsumino", "tumblr", "twitter", diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py new file mode 100644 index 00000000..f6f29cd5 --- /dev/null +++ b/gallery_dl/extractor/subscribestar.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.subscribestar.com/""" + +from .common import Extractor, Message +from .. import text +import json + + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?subscribestar\.(com|adult)" + + +class SubscribestarExtractor(Extractor): + """Base class for subscribestar extractors""" + category = "subscribestar" + root = "https://www.subscribestar.com" + directory_fmt = ("{category}", "{author_name}") + filename_fmt = "{post_id}_{id}.{extension}" + archive_fmt = "{id}" + + def __init__(self, match): + tld, self.item = match.groups() + if tld == "adult": + self.root = "https://subscribestar.adult" + self.subcategory += "-adult" + Extractor.__init__(self, match) + self.metadata = self.config("metadata", False) + + def items(self): + for post_html in self.posts(): + media = self._media_from_post(post_html) + if not media: + continue + data = self._data_from_post(post_html) + yield Message.Directory, data + for item in media: + item.update(data) + url = item["url"] + yield Message.Url, url, text.nameext_from_url(url, item) + + def posts(self): + """Yield HTML content of all relevant posts""" + + @staticmethod + def _media_from_post(html): + gallery = text.extract(html, 'data-gallery="', '"')[0] + if gallery: + return [ + item for item in json.loads(text.unescape(gallery)) + if "/previews/" not in item["url"] + ] + return () + + def _data_from_post(self, html): + extr = text.extract_from(html) + data = { + "post_id" : text.parse_int(extr('data-id="', '"')), + "author_id" : text.parse_int(extr('data-user-id="', '"')), + "author_name": text.unescape(extr('href="/', '"')), + "author_nick": text.unescape(extr('>', '<')), + "content" : (extr( + '