add generalized extractors for Mastodon instances (#144)
Extractors for Mastodon instances can now be dynamically generated, based on the instance names in the 'extractor.mastodon.*' config path.

Example:

    {
        "extractor": {
            "mastodon": {
                "pawoo.net": { ... },
                "mastodon.xyz": { ... },
                "tabletop.social": { ... },
                ...
            }
        }
    }

Each entry requires an 'access-token' value, which can be generated with 'gallery-dl oauth:mastodon:<instance URL>'.

An 'access-token' (as well as a 'client-id' and 'client-secret') for pawoo.net is always available, but can be overwritten as necessary.

pull/170/head
parent
4b441c162e
commit
b8fed34548
@ -0,0 +1,175 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Copyright 2019 Mike Fährmann
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
"""Extractors for mastodon instances"""
|
||||||
|
|
||||||
|
from .common import Extractor, Message
|
||||||
|
from .. import text, config, exception
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class MastodonExtractor(Extractor):
    """Base class for mastodon extractors

    Subclasses provide the statuses to process by overriding statuses();
    items() turns every media attachment of those statuses into a
    download message.
    """
    basecategory = "mastodon"
    directory_fmt = ["mastodon", "{category}", "{account[username]}"]
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"
    instance = None

    def __init__(self, match):
        """Store the instance name (first match group) and set up the API"""
        Extractor.__init__(self)
        self.instance = match.group(1)
        self.api = MastodonAPI(self, self.instance)

    def config(self, key, default=None):
        """Look up 'key' below extractor.mastodon.<category>.<subcategory>"""
        path = ("extractor", "mastodon", self.category, self.subcategory, key)
        return config.interpolate(path, default)

    def items(self):
        """Yield version, directory and URL messages for all statuses"""
        yield Message.Version, 1
        for post in self.statuses():
            files = self.prepare(post)
            yield Message.Directory, post
            for attachment in files:
                # the same status dict is reused as metadata for each file;
                # only its "media" entry changes between attachments
                post["media"] = attachment
                url = attachment["url"]
                yield Message.Url, url, text.nameext_from_url(url, post)

    def statuses(self):
        """Return an iterable containing all relevant Status-objects"""
        return ()

    @staticmethod
    def prepare(status):
        """Detach and return the 'media_attachments' list of a status"""
        return status.pop("media_attachments")
|
||||||
|
|
||||||
|
|
||||||
|
class MastodonUserExtractor(MastodonExtractor):
    """Extractor for all images of an account/user"""
    # NOTE: the docstring above is read at runtime (via __base__.__doc__)
    # when per-instance subclasses are generated; keep it as a single line.
    subcategory = "user"

    def __init__(self, match):
        # second match group is the account name without the leading '@'
        MastodonExtractor.__init__(self, match)
        self.account_name = match.group(2)

    def statuses(self):
        # Search for the account and accept only an exact username match;
        # its ID is then used to list that account's statuses.
        wanted = self.account_name
        for candidate in self.api.account_search("@" + wanted, 1):
            if candidate["username"] == wanted:
                return self.api.account_statuses(candidate["id"])
        raise exception.NotFoundError("account")
|
||||||
|
|
||||||
|
|
||||||
|
class MastodonStatusExtractor(MastodonExtractor):
    """Extractor for images from a status"""
    # NOTE: the docstring above is read at runtime (via __base__.__doc__)
    # when per-instance subclasses are generated; keep it as a single line.
    subcategory = "status"

    def __init__(self, match):
        # second match group is the numeric status ID
        MastodonExtractor.__init__(self, match)
        self.status_id = match.group(2)

    def statuses(self):
        # a single status, wrapped in a tuple so items() can iterate over it
        single = self.api.status(self.status_id)
        return (single,)
|
||||||
|
|
||||||
|
|
||||||
|
class MastodonAPI():
    """Minimal interface for the Mastodon API

    https://github.com/tootsuite/mastodon
    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
    """

    def __init__(self, extractor, instance, access_token=None):
        """Bind the API to an extractor and a Mastodon instance

        extractor:    object providing config() and request()
        instance:     host name of the instance, e.g. "pawoo.net"
        access_token: fallback used when the extractor's 'access-token'
                      config value is not set
        """
        self.instance = instance
        self.extractor = extractor

        token = extractor.config("access-token", access_token)
        # Without a token, send no Authorization header at all instead of
        # the literal string "Bearer None".
        self.headers = (
            {"Authorization": "Bearer {}".format(token)} if token else {}
        )

    def account_search(self, query, limit=40):
        """Search for accounts matching 'query' (at most 'limit' results)"""
        params = {"q": query, "limit": limit}
        return self._call("accounts/search", params)

    def account_statuses(self, account_id):
        """Get an account's statuses (media-only), following pagination"""
        endpoint = "accounts/{}/statuses".format(account_id)
        params = {"only_media": "1"}
        return self._pagination(endpoint, params)

    def status(self, status_id):
        """Fetch a Status

        Raises exception.NotFoundError if the API answers with 404.
        """
        # expect=(404,) lets a 404 response reach _parse(), which turns it
        # into NotFoundError.
        # NOTE(review): presumably request() raises its own error for
        # unexpected 4xx codes - confirm against Extractor.request().
        return self._call("statuses/{}".format(status_id), expect=(404,))

    def _call(self, endpoint, params=None, **kwargs):
        """Send one request to 'endpoint' and return the parsed response

        Additional keyword arguments are forwarded to extractor.request().
        """
        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
        response = self.extractor.request(
            url, params=params, headers=self.headers, **kwargs)
        return self._parse(response)

    def _pagination(self, endpoint, params):
        """Yield the results of 'endpoint' from all pages

        The next page's URL is taken from the response's "next" link.
        """
        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
        while url:
            response = self.extractor.request(
                url, params=params, headers=self.headers)
            yield from self._parse(response)
            url = response.links.get("next", {}).get("url")
            # The "next" URL already contains its full query string;
            # sending 'params' again would duplicate those parameters.
            params = None

    @staticmethod
    def _parse(response):
        """Parse an API response

        Raises exception.NotFoundError on status code 404; otherwise
        returns the decoded JSON body.
        """
        if response.status_code == 404:
            raise exception.NotFoundError()
        return response.json()
|
||||||
|
|
||||||
|
|
||||||
|
def generate_extractors():
    """Dynamically generate Extractor classes for Mastodon instances

    For every dict-valued entry below the 'extractor.mastodon' config path
    (plus a built-in 'pawoo.net' entry if none is configured), a user and a
    status extractor class are created and stored in this module's globals.
    """
    namespace = globals()
    instances = config.get(("extractor", "mastodon")) or {}

    # NOTE(review): if no 'extractor.mastodon' config exists, these defaults
    # end up in a fresh local dict; whether the extractors' config lookups
    # can see them depends on config.get()/config.interpolate() - confirm.
    if "pawoo.net" not in instances:
        instances["pawoo.net"] = {
            "access-token" : "286462927198d0cf3e24683e91c8259a"
                             "ac4367233064e0570ca18df2ac65b226",
            "client-id"    : "97b142b6904abf97a1068d51a7bc2f2f"
                             "cf9323cef81f13cb505415716dba7dac",
            "client-secret": "e45bef4bad45b38abf7d9ef88a646b73"
                             "75e7fb2532c31a026327a93549236481",
        }

    for instance, settings in instances.items():

        # skip plain option values stored next to the instance entries
        if not isinstance(settings, dict):
            continue

        host = re.escape(instance)
        # class-name prefix: letters of the instance name only,
        # e.g. "pawoo.net" -> "Pawoonet"
        prefix = re.sub(r"[^A-Za-z]+", "", instance).capitalize()

        class UserExtractor(MastodonUserExtractor):
            pattern = [
                r"(?:https?://)?({})/@([^/?&#]+)(?:/media)?/?$".format(host)
            ]

        class StatusExtractor(MastodonStatusExtractor):
            pattern = [
                r"(?:https?://)?({})/@[^/?&#]+/(\d+)".format(host)
            ]

        for cls in (UserExtractor, StatusExtractor):
            cls.category = instance
            cls.__name__ = prefix + cls.__name__
            cls.__doc__ = "{} on {}".format(cls.__base__.__doc__, instance)
            namespace[cls.__name__] = cls


generate_extractors()
|
@ -1,140 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
# Copyright 2017-2018 Mike Fährmann
|
|
||||||
#
|
|
||||||
# This program is free software; you can redistribute it and/or modify
|
|
||||||
# it under the terms of the GNU General Public License version 2 as
|
|
||||||
# published by the Free Software Foundation.
|
|
||||||
|
|
||||||
"""Extract images from https://pawoo.net"""
|
|
||||||
|
|
||||||
from .common import Extractor, Message
|
|
||||||
from .. import text, exception
|
|
||||||
|
|
||||||
|
|
||||||
class PawooExtractor(Extractor):
    """Base class for pawoo extractors

    Subclasses provide the statuses to process by overriding statuses();
    items() turns every media attachment of those statuses into a
    download message.
    """
    category = "pawoo"
    directory_fmt = ["{category}", "{account[username]}"]
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"

    def __init__(self):
        Extractor.__init__(self)
        self.api = MastodonAPI(self)

    def items(self):
        """Yield version, directory and URL messages for all statuses"""
        yield Message.Version, 1
        for post in self.statuses():
            files = self.prepare(post)
            yield Message.Directory, post
            for attachment in files:
                # the same status dict is reused as metadata for each file;
                # only its "media" entry changes between attachments
                post["media"] = attachment
                url = attachment["url"]
                yield Message.Url, url, text.nameext_from_url(url, post)

    def statuses(self):
        """Return an iterable containing all relevant Status-objects"""
        return list()

    @staticmethod
    def prepare(status):
        """Detach and return the 'media_attachments' list of a status"""
        return status.pop("media_attachments")
|
|
||||||
|
|
||||||
|
|
||||||
class PawooUserExtractor(PawooExtractor):
    """Extractor for all images of an account/user on pawoo.net"""
    subcategory = "user"
    pattern = [r"(?:https?://)?pawoo\.net/@([^/?&#]+)(?:/media)?/?$"]
    test = [
        ("https://pawoo.net/@kuroda", {
            "url": "a3f9e7555f2b024554c0e9b6cbcc7991af13cf99",
        }),
        ("https://pawoo.net/@zZzZz/", {
            "exception": exception.NotFoundError,
        }),
        ("https://pawoo.net/@kuroda/media", None),
    ]

    def __init__(self, match):
        # first match group is the account name without the leading '@'
        PawooExtractor.__init__(self)
        self.account_name = match.group(1)

    def statuses(self):
        # Search for the account and accept only an exact username match;
        # its ID is then used to list that account's statuses.
        wanted = self.account_name
        for candidate in self.api.account_search("@" + wanted, 1):
            if candidate["username"] == wanted:
                return self.api.account_statuses(candidate["id"])
        raise exception.NotFoundError("account")
|
|
||||||
|
|
||||||
|
|
||||||
class PawooStatusExtractor(PawooExtractor):
    """Extractor for images from a status on pawoo.net"""
    subcategory = "status"
    pattern = [r"(?:https?://)?pawoo\.net/@[^/?&#]+/(\d+)"]
    test = [
        ("https://pawoo.net/@takehana_note/559043", {
            "url": "f95cc8c0274c4143e7e21dbdc693b90c65b596e3",
            "content": "3b148cf90174173355fe34179741ce476921b2fc",
        }),
        ("https://pawoo.net/@zZzZz/12346", {
            "exception": exception.NotFoundError,
        }),
    ]

    def __init__(self, match):
        # first match group is the numeric status ID
        PawooExtractor.__init__(self)
        self.status_id = match.group(1)

    def statuses(self):
        # a single status, wrapped in a tuple so items() can iterate over it
        single = self.api.status(self.status_id)
        return (single,)
|
|
||||||
|
|
||||||
|
|
||||||
class MastodonAPI():
    """Minimal interface for the Mastodon API on pawoo.net

    https://github.com/tootsuite/mastodon
    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
    """

    def __init__(self, extractor, root="https://pawoo.net",
                 access_token=("286462927198d0cf3e24683e91c8259a"
                               "ac4367233064e0570ca18df2ac65b226")):
        """Bind the API to an extractor and store the bearer token

        The token comes from the extractor's 'access-token' config value,
        falling back to 'access_token'; it is written into the headers of
        the extractor's session.
        """
        self.root = root
        self.extractor = extractor
        token = extractor.config("access-token", access_token)
        extractor.session.headers["Authorization"] = "Bearer {}".format(token)

    def account_search(self, query, limit=40):
        """Search for accounts matching 'query' (at most 'limit' results)"""
        search_url = "{}/api/v1/accounts/search".format(self.root)
        query_params = {"q": query, "limit": limit}
        return self._parse(
            self.extractor.request(search_url, params=query_params))

    def account_statuses(self, account_id):
        """Yield an account's media-only statuses from all result pages"""
        page_url = "{}/api/v1/accounts/{}/statuses?only_media=1".format(
            self.root, account_id)
        while page_url:
            page = self.extractor.request(page_url)
            yield from self._parse(page)
            # continue with the URL of the response's "next" link, if any
            page_url = page.links.get("next", {}).get("url")

    def status(self, status_id):
        """Fetch a Status; a 404 response becomes NotFoundError"""
        status_url = "{}/api/v1/statuses/{}".format(self.root, status_id)
        return self._parse(
            self.extractor.request(status_url, expect=(404,)))

    @staticmethod
    def _parse(response):
        """Return the decoded JSON body, or raise NotFoundError on 404"""
        if response.status_code == 404:
            raise exception.NotFoundError()
        return response.json()
|
|
@ -1,9 +1,9 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# Copyright 2016-2018 Mike Fährmann
|
# Copyright 2016-2019 Mike Fährmann
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License version 2 as
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
# published by the Free Software Foundation.
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
__version__ = "1.6.3"
|
__version__ = "1.7.0-dev"
|
||||||
|
Loading…
Reference in new issue