From df0d7d4a12688dc7478b6599c093e692a97ae508 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 10 Sep 2024 22:26:22 +0200 Subject: [PATCH] [cohost] add 'user' and 'post' extractors (#4483) --- docs/configuration.rst | 40 +++++++++ docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/cohost.py | 148 +++++++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + test/results/cohost.py | 26 ++++++ 6 files changed, 222 insertions(+) create mode 100644 gallery_dl/extractor/cohost.py create mode 100644 test/results/cohost.py diff --git a/docs/configuration.rst b/docs/configuration.rst index 9c3b7bd8..ad6ad07f 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1486,6 +1486,46 @@ Description ``gallery``. +extractor.cohost.asks +--------------------- +Type + ``bool`` +Default + ``true`` +Description + Extract ``ask`` posts. + + +extractor.cohost.pinned +----------------------- +Type + ``bool`` +Default + ``false`` +Description + Extract pinned posts. + + +extractor.cohost.replies +------------------------ +Type + ``bool`` +Default + ``true`` +Description + Extract reply posts. + + +extractor.cohost.shares +----------------------- +Type + ``bool`` +Default + ``false`` +Description + Extract shared posts. + + extractor.cyberdrop.domain -------------------------- Type diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1aa707c7..f4eaf06d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -151,6 +151,12 @@ Consider all listed sites to potentially be NSFW. Articles, Creators, Followed Users, Recent Images + + cohost! + https://cohost.org/ + Posts, User Profiles + + Comic Vine https://comicvine.gamespot.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index e103cb1b..b6432cea 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -35,6 +35,7 @@ modules = [ "catbox", "chevereto", "cien", + "cohost", "comicvine", "cyberdrop", "danbooru", diff --git a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py new file mode 100644 index 00000000..a8821bda --- /dev/null +++ b/gallery_dl/extractor/cohost.py @@ -0,0 +1,148 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://cohost.org/""" + +from .common import Extractor, Message +from .. import text, util + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?cohost\.org" + + +class CohostExtractor(Extractor): + """Base class for cohost extractors""" + category = "cohost" + root = "https://cohost.org" + directory_fmt = ("{category}", "{postingProject[handle]}") + filename_fmt = ("{postId}_{headline|plainTextBody:?/_/[:100]}" + "{num}.{extension}") + archive_fmt = "{postId}_{num}" + + def _init(self): + self.replies = self.config("replies", True) + self.pinned = self.config("pinned", False) + self.shares = self.config("shares", False) + self.asks = self.config("asks", True) + + def items(self): + for post in self.posts(): + files = self._extract_files(post) + post["count"] = len(files) + post["date"] = text.parse_datetime( + post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + + yield Message.Directory, post + for post["num"], file in enumerate(files, 1): + url = file["fileURL"] + post.update(file) + text.nameext_from_url(url, post) + yield Message.Url, url, post + + def posts(self): + return () + + def _request_api(self, endpoint, input): + url = "{}/api/v1/trpc/{}".format(self.root, endpoint) + params = {"batch": "1", "input": util.json_dumps({"0": input})} + headers = {"content-type": "application/json"} + + data = self.request(url, params=params, headers=headers).json() + return data[0]["result"]["data"] + + def _extract_files(self, post): + files = [] + + self._extract_blocks(post, files) + if self.shares and post.get("shareTree"): + for share in post["shareTree"]: + self._extract_blocks(share, files, share) + del post["shareTree"] + + return files + + def _extract_blocks(self, post, files, shared=None): + post["content"] = content = [] + + for block in post.pop("blocks") or (): + try: + type = block["type"] + if type == "attachment": + file = block["attachment"].copy() + file["shared"] = shared + files.append(file) + elif type == "markdown": + content.append(block["markdown"]["content"]) + elif type == "ask": + post["ask"] = block["ask"] + else: + self.log.debug("%s: Unsupported block type '%s'", + post["postId"], type) + except Exception as exc: + self.log.debug("%s: %s", exc.__class__.__name__, exc) + + +class CohostUserExtractor(CohostExtractor): + """Extractor for media from a cohost user""" + subcategory = "user" + pattern = BASE_PATTERN + r"/([^/?#]+)/?(?:$|\?|#)" + example = "https://cohost.org/USER" + + def posts(self): + empty = 0 + params = { + "projectHandle": self.groups[0], + "page": 0, + "options": { + "pinnedPostsAtTop" : bool(self.pinned), + "hideReplies" : not self.replies, + "hideShares" : not self.shares, + "hideAsks" : not self.asks, + "viewingOnProjectPage": True, + }, + } + + while True: + data = self._request_api("posts.profilePosts", params) + + posts = data["posts"] + if posts: + empty = 0 + yield from posts + else: + empty += 1 + + pagination = data["pagination"] + if not pagination.get("morePagesForward"): + return + if empty >= 3: + return self.log.debug("Empty API results") + params["page"] = pagination["nextPage"] + + +class CohostPostExtractor(CohostExtractor): + """Extractor for media from a single cohost post""" + subcategory = "post" + pattern = BASE_PATTERN + r"/([^/?#]+)/post/(\d+)" + example = "https://cohost.org/USER/post/12345" + + def posts(self): + endpoint = "posts.singlePost" + params = { + "handle": self.groups[0], + "postId": int(self.groups[1]), + } + + data = self._request_api(endpoint, params) + post = data["post"] + + try: + post["comments"] = data["comments"][self.groups[1]] + except LookupError: + post["comments"] = () + + return (post,) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 630c3ac2..e15bb77e 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -37,6 +37,7 @@ CATEGORY_MAP = { "batoto" : "BATO.TO", "bbc" : "BBC", "cien" : "Ci-en", + "cohost" : "cohost!", "comicvine" : "Comic Vine", "coomerparty" : "Coomer", "deltaporno" : "DeltaPorno", diff --git a/test/results/cohost.py b/test/results/cohost.py new file mode 100644 index 00000000..1184f21c --- /dev/null +++ b/test/results/cohost.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import cohost + + +__tests__ = ( +{ + "#url" : "https://cohost.org/infinitebrians", + "#category": ("", "cohost", "user"), + "#class" : cohost.CohostUserExtractor, + "#range" : "1-20", + "#count" : 20, +}, + +{ + "#url" : "https://cohost.org/infinitebrians/post/4957017-thank-you-akira-tori", + "#category": ("", "cohost", "post"), + "#class" : cohost.CohostPostExtractor, + "#urls" : "https://staging.cohostcdn.org/attachment/58f9aa96-d2b2-4838-b81c-9aa8bac0bea0/march%204%202024.png", +}, + +)