|
|
@ -41,6 +41,7 @@ class BlueskyExtractor(Extractor):
|
|
|
|
self.api = BlueskyAPI(self)
|
|
|
|
self.api = BlueskyAPI(self)
|
|
|
|
self._user = self._user_did = None
|
|
|
|
self._user = self._user_did = None
|
|
|
|
self.instance = self.root.partition("://")[2]
|
|
|
|
self.instance = self.root.partition("://")[2]
|
|
|
|
|
|
|
|
self.videos = self.config("videos", True)
|
|
|
|
|
|
|
|
|
|
|
|
def items(self):
|
|
|
|
def items(self):
|
|
|
|
for post in self.posts():
|
|
|
|
for post in self.posts():
|
|
|
@ -55,14 +56,6 @@ class BlueskyExtractor(Extractor):
|
|
|
|
post.update(post["record"])
|
|
|
|
post.update(post["record"])
|
|
|
|
del post["record"]
|
|
|
|
del post["record"]
|
|
|
|
|
|
|
|
|
|
|
|
images = ()
|
|
|
|
|
|
|
|
if "embed" in post:
|
|
|
|
|
|
|
|
media = post["embed"]
|
|
|
|
|
|
|
|
if "media" in media:
|
|
|
|
|
|
|
|
media = media["media"]
|
|
|
|
|
|
|
|
if "images" in media:
|
|
|
|
|
|
|
|
images = media["images"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if self._metadata_facets:
|
|
|
|
if self._metadata_facets:
|
|
|
|
if "facets" in post:
|
|
|
|
if "facets" in post:
|
|
|
|
post["hashtags"] = tags = []
|
|
|
|
post["hashtags"] = tags = []
|
|
|
@ -82,44 +75,88 @@ class BlueskyExtractor(Extractor):
|
|
|
|
if self._metadata_user:
|
|
|
|
if self._metadata_user:
|
|
|
|
post["user"] = self._user or post["author"]
|
|
|
|
post["user"] = self._user or post["author"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
files = self._extract_files(post)
|
|
|
|
post["instance"] = self.instance
|
|
|
|
post["instance"] = self.instance
|
|
|
|
post["post_id"] = pid
|
|
|
|
post["post_id"] = pid
|
|
|
|
post["count"] = len(images)
|
|
|
|
post["count"] = len(files)
|
|
|
|
post["date"] = text.parse_datetime(
|
|
|
|
post["date"] = text.parse_datetime(
|
|
|
|
post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
|
|
|
|
post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
|
|
|
|
|
|
|
|
|
|
|
|
yield Message.Directory, post
|
|
|
|
yield Message.Directory, post
|
|
|
|
|
|
|
|
|
|
|
|
if not images:
|
|
|
|
if not files:
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for post["num"], file in enumerate(files, 1):
|
|
|
|
|
|
|
|
post.update(file)
|
|
|
|
|
|
|
|
yield Message.Url, file["url"], post
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def posts(self):
|
|
|
|
|
|
|
|
return ()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_files(self, post):
|
|
|
|
|
|
|
|
if "embed" not in post:
|
|
|
|
|
|
|
|
return ()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
files = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
media = post["embed"]
|
|
|
|
|
|
|
|
if "media" in media:
|
|
|
|
|
|
|
|
media = media["media"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if "images" in media:
|
|
|
|
base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob"
|
|
|
|
base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob"
|
|
|
|
"?did={}&cid=".format(post["author"]["did"]))
|
|
|
|
"?did={}&cid=".format(post["author"]["did"]))
|
|
|
|
post["num"] = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for file in images:
|
|
|
|
|
|
|
|
post["num"] += 1
|
|
|
|
|
|
|
|
post["description"] = file["alt"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for image in media["images"]:
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
aspect = file["aspectRatio"]
|
|
|
|
aspect = image["aspectRatio"]
|
|
|
|
post["width"] = aspect["width"]
|
|
|
|
width = aspect["width"]
|
|
|
|
post["height"] = aspect["height"]
|
|
|
|
height = aspect["height"]
|
|
|
|
except KeyError:
|
|
|
|
except KeyError:
|
|
|
|
post["width"] = post["height"] = 0
|
|
|
|
width = height = 0
|
|
|
|
|
|
|
|
|
|
|
|
image = file["image"]
|
|
|
|
data = image["image"]
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
cid = image["ref"]["$link"]
|
|
|
|
cid = data["ref"]["$link"]
|
|
|
|
except KeyError:
|
|
|
|
except KeyError:
|
|
|
|
cid = image["cid"]
|
|
|
|
cid = data["cid"]
|
|
|
|
post["filename"] = cid
|
|
|
|
|
|
|
|
post["extension"] = image["mimeType"].rpartition("/")[2]
|
|
|
|
files.append({
|
|
|
|
|
|
|
|
"description": image.get("alt"),
|
|
|
|
yield Message.Url, base + cid, post
|
|
|
|
"width" : width,
|
|
|
|
|
|
|
|
"height" : height,
|
|
|
|
|
|
|
|
"filename" : cid,
|
|
|
|
|
|
|
|
"extension" : data["mimeType"].rpartition("/")[2],
|
|
|
|
|
|
|
|
"url" : base + cid,
|
|
|
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if "video" in media and self.videos:
|
|
|
|
|
|
|
|
base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob"
|
|
|
|
|
|
|
|
"?did={}&cid=".format(post["author"]["did"]))
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
|
|
aspect = media["aspectRatio"]
|
|
|
|
|
|
|
|
width = aspect["width"]
|
|
|
|
|
|
|
|
height = aspect["height"]
|
|
|
|
|
|
|
|
except KeyError:
|
|
|
|
|
|
|
|
width = height = 0
|
|
|
|
|
|
|
|
|
|
|
|
def posts(self):
|
|
|
|
video = media["video"]
|
|
|
|
return ()
|
|
|
|
try:
|
|
|
|
|
|
|
|
cid = video["ref"]["$link"]
|
|
|
|
|
|
|
|
except KeyError:
|
|
|
|
|
|
|
|
cid = video["cid"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
files.append({
|
|
|
|
|
|
|
|
"description": media.get("alt") or "",
|
|
|
|
|
|
|
|
"width" : width,
|
|
|
|
|
|
|
|
"height" : height,
|
|
|
|
|
|
|
|
"filename" : cid,
|
|
|
|
|
|
|
|
"extension" : video["mimeType"].rpartition("/")[2],
|
|
|
|
|
|
|
|
"url" : base + cid,
|
|
|
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return files
|
|
|
|
|
|
|
|
|
|
|
|
def _make_post(self, actor, kind):
|
|
|
|
def _make_post(self, actor, kind):
|
|
|
|
did = self.api._did_from_actor(actor)
|
|
|
|
did = self.api._did_from_actor(actor)
|
|
|
|