From 0abd9723afd0823bc565346e0c759818307073cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 15 Feb 2024 23:30:16 +0100 Subject: [PATCH] [bluesky] add 'metadata' option (#4438) allow extracting 'user' metadata and make 'facets' extraction optional --- docs/configuration.rst | 22 ++++++++++++- gallery_dl/extractor/bluesky.py | 57 ++++++++++++++++++++++----------- test/results/bluesky.py | 15 +++++++++ 3 files changed, 75 insertions(+), 19 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 4460383c..b626a930 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1185,6 +1185,26 @@ Description Download embedded videos hosted on https://www.blogger.com/ +extractor.bluesky.metadata +-------------------------- +Type + * ``bool`` + * ``string`` + * ``list`` of ``strings`` +Default + ``false`` +Example + * ``"facets,user"`` + * ``["facets", "user"]`` +Description + Extract additional metadata. + + * ``facets``: ``hashtags``, ``mentions``, and ``uris`` + * ``user``: detailed ``user`` metadata for the user referenced in the input URL + (See `app.bsky.actor.getProfile <https://docs.bsky.app/docs/api/app-bsky-actor-get-profile>`__). + + + extractor.bluesky.post.depth ---------------------------- Type @@ -1595,7 +1615,7 @@ Type Default ``false`` Example - * ``notes,pools`` + * ``"notes,pools"`` * ``["notes", "pools"]`` Description Extract additional metadata (notes, pool metadata) if available. 
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index fdb171ae..221598c4 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -29,7 +29,17 @@ class BlueskyExtractor(Extractor): self.user = match.group(1) def _init(self): + meta = self.config("metadata") or () + if meta: + if isinstance(meta, str): + meta = meta.replace(" ", "").split(",") + elif not isinstance(meta, (list, tuple)): + meta = ("user", "facets") + self._metadata_user = ("user" in meta) + self._metadata_facets = ("facets" in meta) + self.api = BlueskyAPI(self) + self._user = None def items(self): for post in self.posts(): @@ -45,20 +55,24 @@ class BlueskyExtractor(Extractor): if "images" in media: images = media["images"] - if "facets" in post: - post["hashtags"] = tags = [] - post["mentions"] = dids = [] - post["uris"] = uris = [] - for facet in post["facets"]: - features = facet["features"][0] - if "tag" in features: - tags.append(features["tag"]) - elif "did" in features: - dids.append(features["did"]) - elif "uri" in features: - uris.append(features["uri"]) - else: - post["hashtags"] = post["mentions"] = post["uris"] = () + if self._metadata_facets: + if "facets" in post: + post["hashtags"] = tags = [] + post["mentions"] = dids = [] + post["uris"] = uris = [] + for facet in post["facets"]: + features = facet["features"][0] + if "tag" in features: + tags.append(features["tag"]) + elif "did" in features: + dids.append(features["did"]) + elif "uri" in features: + uris.append(features["uri"]) + else: + post["hashtags"] = post["mentions"] = post["uris"] = () + + if self._metadata_user: + post["user"] = self._user or post["author"] post["post_id"] = post["uri"].rpartition("/")[2] post["count"] = len(images) @@ -282,9 +296,10 @@ class BlueskyAPI(): index += 1 return posts - def get_profile(self, actor): + @memcache(keyarg=1) + def get_profile(self, did): endpoint = "app.bsky.actor.getProfile" - params = {"actor": self._did_from_actor(actor)} 
+ params = {"actor": did} return self._call(endpoint, params) @memcache(keyarg=1) @@ -295,8 +310,14 @@ class BlueskyAPI(): def _did_from_actor(self, actor): if actor.startswith("did:"): - return actor - return self.resolve_handle(actor) + did = actor + else: + did = self.resolve_handle(actor) + + if self.extractor._metadata_user: + self.extractor._user = self.get_profile(did) + + return did def authenticate(self): self.headers["Authorization"] = self._authenticate_impl(self.username) diff --git a/test/results/bluesky.py b/test/results/bluesky.py index 99644332..6bd6eb38 100644 --- a/test/results/bluesky.py +++ b/test/results/bluesky.py @@ -12,6 +12,7 @@ __tests__ = ( "#url" : "https://bsky.app/profile/bsky.app/post/3kh5rarr3gn2n", "#category": ("", "bluesky", "post"), "#class" : bluesky.BlueskyPostExtractor, + "#options" : {"metadata": True}, "#urls" : "https://bsky.social/xrpc/com.atproto.sync.getBlob?did=did:plc:z72i7hdynmk6r22z27h6tvur&cid=bafkreidypzoaybmfj5h7pnpiyct6ng5yae6ydp4czrm72ocg7ev6vbirri", "#sha1_content": "ffcf25e7c511173a12de5276b85903309fcd8d14", @@ -42,12 +43,26 @@ __tests__ = ( "hashtags" : [], "mentions" : [], "uris" : ["https://blueskyweb.xyz/blog/12-21-2023-butterfly"], + "user" : { + "avatar" : str, + "banner" : str, + "description" : "Official Bluesky account (check domain👆)\n\nFollow for updates and announcements", + "did" : "did:plc:z72i7hdynmk6r22z27h6tvur", + "displayName" : "Bluesky", + "followersCount": int, + "followsCount" : int, + "handle" : "bsky.app", + "indexedAt" : "2023-12-22T18:54:12.339Z", + "labels" : [], + "postsCount" : int, + }, }, { "#url" : "https://bsky.app/profile/mikf.bsky.social/post/3kkzc3xaf5m2w", "#category": ("", "bluesky", "post"), "#class" : bluesky.BlueskyPostExtractor, + "#options" : {"metadata": "facets"}, "#urls" : "https://bsky.social/xrpc/com.atproto.sync.getBlob?did=did:plc:cslxjqkeexku6elp5xowxkq7&cid=bafkreib7ydpe3xxo4cq7nn32w7eqhcanfaanz6caepd2z4kzplxtx2ctgi", "#sha1_content": 
"9cf5748f6d00aae83fbb3cc2c6eb3caa832b90f4",