[bluesky] add 'metadata' option (#4438)

allow extracting 'user' metadata and
make 'facets' extraction optional
pull/5224/head
Mike Fährmann 7 months ago
parent 7e036ea290
commit 0abd9723af
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -1185,6 +1185,26 @@ Description
Download embedded videos hosted on https://www.blogger.com/
extractor.bluesky.metadata
--------------------------
Type
* ``bool``
* ``string``
* ``list`` of ``strings``
Default
``false``
Example
* ``"facets,user"``
* ``["facets", "user"]``
Description
Extract additional metadata.
* ``facets``: ``hashtags``, ``mentions``, and ``uris``
* ``user``: detailed ``user`` metadata for the user referenced in the input URL
(See `app.bsky.actor.getProfile <https://docs.bsky.app/docs/api/app-bsky-actor-get-profile>`__).
extractor.bluesky.post.depth
----------------------------
Type
@ -1595,7 +1615,7 @@ Type
Default
``false``
Example
* ``notes,pools``
* ``"notes,pools"``
* ``["notes", "pools"]``
Description
Extract additional metadata (notes, pool metadata) if available.

@ -29,7 +29,17 @@ class BlueskyExtractor(Extractor):
self.user = match.group(1)
def _init(self):
    """Read the 'metadata' option and prepare per-extractor state.

    The option value may be:
      * a comma-separated string (spaces are ignored),
      * a list or tuple of names, used as given,
      * any other truthy value, which selects both "user" and "facets".
    A falsy or missing value selects nothing.
    """
    selected = self.config("metadata") or ()
    if isinstance(selected, str):
        # "facets, user" -> ["facets", "user"]
        selected = selected.replace(" ", "").split(",")
    elif selected and not isinstance(selected, (list, tuple)):
        # e.g. `true` in the config file: enable every metadata kind
        selected = ("user", "facets")

    self._metadata_user = "user" in selected
    self._metadata_facets = "facets" in selected

    # The API object is created only after the flags above are set.
    self.api = BlueskyAPI(self)
    # Placeholder for profile data; assigned elsewhere when available.
    self._user = None
def items(self):
for post in self.posts():
@ -45,20 +55,24 @@ class BlueskyExtractor(Extractor):
if "images" in media:
images = media["images"]
if "facets" in post:
post["hashtags"] = tags = []
post["mentions"] = dids = []
post["uris"] = uris = []
for facet in post["facets"]:
features = facet["features"][0]
if "tag" in features:
tags.append(features["tag"])
elif "did" in features:
dids.append(features["did"])
elif "uri" in features:
uris.append(features["uri"])
else:
post["hashtags"] = post["mentions"] = post["uris"] = ()
if self._metadata_facets:
if "facets" in post:
post["hashtags"] = tags = []
post["mentions"] = dids = []
post["uris"] = uris = []
for facet in post["facets"]:
features = facet["features"][0]
if "tag" in features:
tags.append(features["tag"])
elif "did" in features:
dids.append(features["did"])
elif "uri" in features:
uris.append(features["uri"])
else:
post["hashtags"] = post["mentions"] = post["uris"] = ()
if self._metadata_user:
post["user"] = self._user or post["author"]
post["post_id"] = post["uri"].rpartition("/")[2]
post["count"] = len(images)
@ -282,9 +296,10 @@ class BlueskyAPI():
index += 1
return posts
def get_profile(self, actor):
@memcache(keyarg=1)
def get_profile(self, did):
endpoint = "app.bsky.actor.getProfile"
params = {"actor": self._did_from_actor(actor)}
params = {"actor": did}
return self._call(endpoint, params)
@memcache(keyarg=1)
@ -295,8 +310,14 @@ class BlueskyAPI():
def _did_from_actor(self, actor):
if actor.startswith("did:"):
return actor
return self.resolve_handle(actor)
did = actor
else:
did = self.resolve_handle(actor)
if self.extractor._metadata_user:
self.extractor._user = self.get_profile(did)
return did
def authenticate(self):
self.headers["Authorization"] = self._authenticate_impl(self.username)

@ -12,6 +12,7 @@ __tests__ = (
"#url" : "https://bsky.app/profile/bsky.app/post/3kh5rarr3gn2n",
"#category": ("", "bluesky", "post"),
"#class" : bluesky.BlueskyPostExtractor,
"#options" : {"metadata": True},
"#urls" : "https://bsky.social/xrpc/com.atproto.sync.getBlob?did=did:plc:z72i7hdynmk6r22z27h6tvur&cid=bafkreidypzoaybmfj5h7pnpiyct6ng5yae6ydp4czrm72ocg7ev6vbirri",
"#sha1_content": "ffcf25e7c511173a12de5276b85903309fcd8d14",
@ -42,12 +43,26 @@ __tests__ = (
"hashtags" : [],
"mentions" : [],
"uris" : ["https://blueskyweb.xyz/blog/12-21-2023-butterfly"],
"user" : {
"avatar" : str,
"banner" : str,
"description" : "Official Bluesky account (check domain👆)\n\nFollow for updates and announcements",
"did" : "did:plc:z72i7hdynmk6r22z27h6tvur",
"displayName" : "Bluesky",
"followersCount": int,
"followsCount" : int,
"handle" : "bsky.app",
"indexedAt" : "2023-12-22T18:54:12.339Z",
"labels" : [],
"postsCount" : int,
},
},
{
"#url" : "https://bsky.app/profile/mikf.bsky.social/post/3kkzc3xaf5m2w",
"#category": ("", "bluesky", "post"),
"#class" : bluesky.BlueskyPostExtractor,
"#options" : {"metadata": "facets"},
"#urls" : "https://bsky.social/xrpc/com.atproto.sync.getBlob?did=did:plc:cslxjqkeexku6elp5xowxkq7&cid=bafkreib7ydpe3xxo4cq7nn32w7eqhcanfaanz6caepd2z4kzplxtx2ctgi",
"#sha1_content": "9cf5748f6d00aae83fbb3cc2c6eb3caa832b90f4",

Loading…
Cancel
Save