add generalized extractors for Mastodon instances (#144)

Extractors for Mastodon instances can now be dynamically generated,
based on the instance names in the 'extractor.mastodon.*' config path.

Example:
{
    "extractor": {
        "mastodon": {
            "pawoo.net": { ... },
            "mastodon.xyz": { ... },
            "tabletop.social": { ... },
            ...
        }
    }
}

Each entry requires an 'access-token' value, which can be generated with
'gallery-dl oauth:mastodon:<instance URL>'.
An 'access-token' (as well as a 'client-id' and 'client-secret') for
pawoo.net is always available, but can be overridden as necessary.
pull/170/head
Mike Fährmann 6 years ago
parent 4b441c162e
commit b8fed34548
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -1,5 +1,7 @@
# Changelog # Changelog
## Unreleased
## 1.6.3 - 2019-01-18 ## 1.6.3 - 2019-01-18
- Added `metadata` post-processor to write image metadata to an external file ([#135](https://github.com/mikf/gallery-dl/issues/135)) - Added `metadata` post-processor to write image metadata to an external file ([#135](https://github.com/mikf/gallery-dl/issues/135))
- Added option to reverse chapter order of manga extractors ([#149](https://github.com/mikf/gallery-dl/issues/149)) - Added option to reverse chapter order of manga extractors ([#149](https://github.com/mikf/gallery-dl/issues/149))

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2018 Mike Fährmann # Copyright 2015-2019 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -67,7 +67,6 @@ modules = [
"nijie", "nijie",
"nyafuu", "nyafuu",
"paheal", "paheal",
"pawoo",
"piczel", "piczel",
"pinterest", "pinterest",
"pixiv", "pixiv",
@ -95,6 +94,7 @@ modules = [
"yandere", "yandere",
"xvideos", "xvideos",
"yuki", "yuki",
"mastodon",
"imagehosts", "imagehosts",
"directlink", "directlink",
"recursive", "recursive",

@ -0,0 +1,175 @@
# -*- coding: utf-8 -*-
# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for mastodon instances"""
from .common import Extractor, Message
from .. import text, config, exception
import re
class MastodonExtractor(Extractor):
    """Common functionality shared by all mastodon extractors

    Subclasses provide the statuses to process through statuses();
    items() then emits one Url message per media attachment.
    """
    basecategory = "mastodon"
    directory_fmt = ["mastodon", "{category}", "{account[username]}"]
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"
    instance = None

    def __init__(self, match):
        Extractor.__init__(self)
        # the first capture group of the URL pattern is the instance name
        self.instance = match.group(1)
        self.api = MastodonAPI(self, self.instance)

    def config(self, key, default=None):
        """Interpolate 'key' along this extractor's mastodon config path"""
        path = ("extractor", "mastodon",
                self.category, self.subcategory, key)
        return config.interpolate(path, default)

    def items(self):
        """Yield Version, Directory and Url messages for all statuses"""
        yield Message.Version, 1

        for status in self.statuses():
            media_list = self.prepare(status)
            yield Message.Directory, status

            for media in media_list:
                # the status dict doubles as metadata for each of its files
                status["media"] = media
                media_url = media["url"]
                yield (Message.Url, media_url,
                       text.nameext_from_url(media_url, status))

    def statuses(self):
        """Return an iterable containing all relevant Status-objects"""
        return ()

    @staticmethod
    def prepare(status):
        """Detach and return the 'media_attachments' list of a status"""
        return status.pop("media_attachments")
class MastodonUserExtractor(MastodonExtractor):
    """Extractor for all images of an account/user"""
    # NOTE: generate_extractors() embeds the docstring above in the
    # docstrings of the per-instance subclasses; keep its wording stable.
    subcategory = "user"

    def __init__(self, match):
        MastodonExtractor.__init__(self, match)
        # second capture group: account name without the leading '@'
        self.account_name = match.group(2)

    def statuses(self):
        # Resolve the account name to an account object by searching for
        # it and accepting only a result with an identical username.
        wanted = self.account_name
        candidates = self.api.account_search("@" + wanted, 1)
        account = next(
            (acc for acc in candidates if acc["username"] == wanted), None)

        if account is None:
            raise exception.NotFoundError("account")
        return self.api.account_statuses(account["id"])
class MastodonStatusExtractor(MastodonExtractor):
    """Extractor for images from a status"""
    # NOTE: generate_extractors() embeds the docstring above in the
    # docstrings of the per-instance subclasses; keep its wording stable.
    subcategory = "status"

    def __init__(self, match):
        MastodonExtractor.__init__(self, match)
        # second capture group: numeric ID of the status
        self.status_id = match.group(2)

    def statuses(self):
        # a single status, wrapped in a tuple so that items() can iterate it
        status = self.api.status(self.status_id)
        return (status,)
class MastodonAPI():
    """Minimal interface for the Mastodon API

    https://github.com/tootsuite/mastodon
    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
    """

    def __init__(self, extractor, instance, access_token=None):
        """Set up API access to 'instance' on behalf of 'extractor'

        The token is taken from the extractor's 'access-token' config
        value, with 'access_token' as fallback.
        """
        self.instance = instance
        self.extractor = extractor

        token = extractor.config("access-token", access_token)
        # Without a token, send no Authorization header at all instead of
        # the literal string "Bearer None".
        self.headers = (
            {"Authorization": "Bearer {}".format(token)} if token else {})

    def account_search(self, query, limit=40):
        """Search for accounts matching 'query'"""
        params = {"q": query, "limit": limit}
        return self._call("accounts/search", params)

    def account_statuses(self, account_id):
        """Get an account's statuses (generator over all result pages)"""
        endpoint = "accounts/{}/statuses".format(account_id)
        params = {"only_media": "1"}
        return self._pagination(endpoint, params)

    def status(self, status_id):
        """Fetch a Status"""
        return self._call("statuses/" + status_id)

    def _call(self, endpoint, params=None):
        """Send one request to an API endpoint and return the parsed JSON"""
        # NOTE(review): the 404 check in _parse() is only reachable if
        # extractor.request() returns 404 responses instead of raising
        # for them -- confirm against Extractor.request().
        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
        response = self.extractor.request(
            url, params=params, headers=self.headers)
        return self._parse(response)

    def _pagination(self, endpoint, params):
        """Yield the items of all pages, following 'next' links"""
        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
        while url:
            response = self.extractor.request(
                url, params=params, headers=self.headers)
            yield from self._parse(response)
            url = response.links.get("next", {}).get("url")
            # The 'next' URL already contains the complete query string;
            # sending 'params' again would duplicate every parameter.
            params = None

    @staticmethod
    def _parse(response):
        """Parse an API response

        Raises exception.NotFoundError for responses with status code 404.
        """
        if response.status_code == 404:
            raise exception.NotFoundError()
        return response.json()
def generate_extractors():
    """Dynamically generate Extractor classes for Mastodon instances

    For every dict-valued entry in 'extractor.mastodon' a user and a
    status extractor class is created and stored in this module's globals.
    """
    namespace = globals()
    instances = config.get(("extractor", "mastodon")) or {}

    # Built-in application credentials for pawoo.net, used unless the
    # configuration already has a 'pawoo.net' entry.
    # NOTE(review): when config.get() returns nothing, 'instances' is a
    # fresh local dict; whether MastodonExtractor.config() can then still
    # find this access-token depends on config internals -- confirm.
    if "pawoo.net" not in instances:
        instances["pawoo.net"] = {
            "access-token" : "286462927198d0cf3e24683e91c8259a"
                             "ac4367233064e0570ca18df2ac65b226",
            "client-id"    : "97b142b6904abf97a1068d51a7bc2f2f"
                             "cf9323cef81f13cb505415716dba7dac",
            "client-secret": "e45bef4bad45b38abf7d9ef88a646b73"
                             "75e7fb2532c31a026327a93549236481",
        }

    for instance, info in instances.items():
        # skip anything that is not a per-instance settings object
        if not isinstance(info, dict):
            continue

        host = re.escape(instance)
        # class-name prefix: letters of the instance name, capitalized
        prefix = re.sub(r"[^A-Za-z]+", "", instance).capitalize()

        class UserExtractor(MastodonUserExtractor):
            pattern = [
                r"(?:https?://)?({})/@([^/?&#]+)(?:/media)?/?$".format(host)]

        class StatusExtractor(MastodonStatusExtractor):
            pattern = [
                r"(?:https?://)?({})/@[^/?&#]+/(\d+)".format(host)]

        for cls in (UserExtractor, StatusExtractor):
            cls.category = instance
            cls.__name__ = prefix + cls.__name__
            cls.__doc__ = "{} on {}".format(cls.__base__.__doc__, instance)
            namespace[cls.__name__] = cls


generate_extractors()

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2017-2018 Mike Fährmann # Copyright 2017-2019 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -10,7 +10,8 @@
from .common import Extractor, Message from .common import Extractor, Message
from . import deviantart, flickr, reddit, smugmug, tumblr from . import deviantart, flickr, reddit, smugmug, tumblr
from .. import text, oauth, config from .. import text, oauth, config, exception
from ..cache import cache
import os import os
import urllib.parse import urllib.parse
@ -82,7 +83,6 @@ class OAuthBase(Extractor):
data = self.open(authorize_url, params) data = self.open(authorize_url, params)
# exchange the request token for an access token # exchange the request token for an access token
# self.session.token = data["oauth_token"]
data = self.session.get(access_token_url, params=data).text data = self.session.get(access_token_url, params=data).text
data = text.parse_query(data) data = text.parse_query(data)
@ -94,7 +94,8 @@ class OAuthBase(Extractor):
def _oauth2_authorization_code_grant( def _oauth2_authorization_code_grant(
self, client_id, client_secret, auth_url, token_url, self, client_id, client_secret, auth_url, token_url,
scope="read", key="refresh_token", auth=True): scope="read", key="refresh_token", auth=True,
message_template=None):
"""Perform an OAuth2 authorization code grant""" """Perform an OAuth2 authorization code grant"""
state = "gallery-dl_{}_{}".format( state = "gallery-dl_{}_{}".format(
@ -147,11 +148,15 @@ class OAuthBase(Extractor):
# display token # display token
part = key.partition("_")[0] part = key.partition("_")[0]
self.send(OAUTH2_MSG_TEMPLATE.format( template = message_template or OAUTH2_MSG_TEMPLATE
self.send(template.format(
category=self.subcategory, category=self.subcategory,
key=part, key=part,
Key=part.capitalize(), Key=part.capitalize(),
token=data[key], token=data[key],
instance=getattr(self, "instance", ""),
client_id=client_id,
client_secret=client_secret,
)) ))
@ -254,6 +259,55 @@ class OAuthTumblr(OAuthBase):
) )
class OAuthMastodon(OAuthBase):
    # Runs the OAuth2 authorization code grant against the Mastodon
    # instance named in an 'oauth:mastodon:<instance URL>' argument.
    subcategory = "mastodon"
    pattern = ["oauth:mastodon:(?:https?://)?([^/?&#]+)"]

    def __init__(self, match):
        OAuthBase.__init__(self, match)
        self.instance = match.group(1)

    def items(self):
        yield Message.Version, 1

        # use the application from oauth_config() if there is one,
        # otherwise register a new application with the instance
        application = (self.oauth_config(self.instance) or
                       self._register(self.instance))

        root = "https://{}".format(self.instance)
        self._oauth2_authorization_code_grant(
            application["client-id"],
            application["client-secret"],
            root + "/oauth/authorize",
            root + "/oauth/token",
            key="access_token",
            message_template=MASTODON_MSG_TEMPLATE,
        )

    @cache(maxage=10*365*24*60*60, keyarg=1)
    def _register(self, instance):
        # Register a new application via POST /api/v1/apps and return the
        # response data with 'client-id' / 'client-secret' keys.
        self.log.info("Registering application for '%s'", instance)

        endpoint = "https://{}/api/v1/apps".format(instance)
        payload = {
            "client_name": "gdl:" + oauth.nonce(8),
            "redirect_uris": self.redirect_uri,
            "scopes": "read",
        }
        app = self.session.post(endpoint, data=payload).json()

        if not ("client_id" in app and "client_secret" in app):
            self.log.error("Failed to register new application: '%s'", app)
            raise exception.StopExtraction()

        # rename to the hyphenated key names used by items()
        for key in ("client_id", "client_secret"):
            app[key.replace("_", "-")] = app.pop(key)

        self.log.info("client-id:\n%s", app["client-id"])
        self.log.info("client-secret:\n%s", app["client-secret"])
        return app
OAUTH1_MSG_TEMPLATE = """ OAUTH1_MSG_TEMPLATE = """
Your Access Token and Access Token Secret are Your Access Token and Access Token Secret are
@ -293,3 +347,29 @@ Example:
}} }}
}} }}
""" """
# Message template passed as 'message_template' by OAuthMastodon.
# It is filled in with str.format(); the fields used here are
# {Key}, {key}, {token}, {instance}, {client_id} and {client_secret}.
# Doubled braces ('{{' / '}}') produce literal braces in the JSON example.
MASTODON_MSG_TEMPLATE = """
Your {Key} Token is
{token}
Put this value into your configuration file as
'extractor.mastodon.{instance}.{key}-token'.
You can also add your 'client-id' and 'client-secret' values
if you want to register another account in the future.
Example:
{{
"extractor": {{
"mastodon": {{
"{instance}": {{
"{key}-token": "{token}",
"client-id": "{client_id}",
"client-secret": "{client_secret}"
}}
}}
}}
}}
"""

@ -1,140 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2017-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://pawoo.net"""
from .common import Extractor, Message
from .. import text, exception
class PawooExtractor(Extractor):
    """Base class for pawoo extractors"""
    category = "pawoo"
    directory_fmt = ["{category}", "{account[username]}"]
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"

    def __init__(self):
        Extractor.__init__(self)
        self.api = MastodonAPI(self)

    def items(self):
        """Yield Version, Directory and Url messages for all statuses"""
        yield Message.Version, 1

        for status in self.statuses():
            media_list = self.prepare(status)
            yield Message.Directory, status

            for media in media_list:
                # the status dict doubles as metadata for each of its files
                status["media"] = media
                media_url = media["url"]
                yield (Message.Url, media_url,
                       text.nameext_from_url(media_url, status))

    def statuses(self):
        """Return an iterable containing all relevant Status-objects"""
        return []

    @staticmethod
    def prepare(status):
        """Detach and return the 'media_attachments' list of a status"""
        return status.pop("media_attachments")
class PawooUserExtractor(PawooExtractor):
    """Extractor for all images of an account/user on pawoo.net"""
    subcategory = "user"
    pattern = [r"(?:https?://)?pawoo\.net/@([^/?&#]+)(?:/media)?/?$"]
    test = [
        ("https://pawoo.net/@kuroda", {
            "url": "a3f9e7555f2b024554c0e9b6cbcc7991af13cf99",
        }),
        ("https://pawoo.net/@zZzZz/", {
            "exception": exception.NotFoundError,
        }),
        ("https://pawoo.net/@kuroda/media", None),
    ]

    def __init__(self, match):
        PawooExtractor.__init__(self)
        # first capture group: account name without the leading '@'
        self.account_name = match.group(1)

    def statuses(self):
        # Resolve the account name to an account object by searching for
        # it and accepting only a result with an identical username.
        wanted = self.account_name
        candidates = self.api.account_search("@" + wanted, 1)
        account = next(
            (acc for acc in candidates if acc["username"] == wanted), None)

        if account is None:
            raise exception.NotFoundError("account")
        return self.api.account_statuses(account["id"])
class PawooStatusExtractor(PawooExtractor):
    """Extractor for images from a status on pawoo.net"""
    subcategory = "status"
    pattern = [r"(?:https?://)?pawoo\.net/@[^/?&#]+/(\d+)"]
    test = [
        ("https://pawoo.net/@takehana_note/559043", {
            "url": "f95cc8c0274c4143e7e21dbdc693b90c65b596e3",
            "content": "3b148cf90174173355fe34179741ce476921b2fc",
        }),
        ("https://pawoo.net/@zZzZz/12346", {
            "exception": exception.NotFoundError,
        }),
    ]

    def __init__(self, match):
        PawooExtractor.__init__(self)
        # first capture group: numeric ID of the status
        self.status_id = match.group(1)

    def statuses(self):
        # a single status, wrapped in a tuple so that items() can iterate it
        status = self.api.status(self.status_id)
        return (status,)
class MastodonAPI():
    """Minimal interface for the Mastodon API on pawoo.net

    https://github.com/tootsuite/mastodon
    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
    """

    def __init__(self, extractor, root="https://pawoo.net",
                 access_token=("286462927198d0cf3e24683e91c8259a"
                               "ac4367233064e0570ca18df2ac65b226")):
        self.root = root
        self.extractor = extractor

        # the token is stored as a header of the extractor's session
        token = extractor.config("access-token", access_token)
        extractor.session.headers["Authorization"] = (
            "Bearer {}".format(token))

    def _url(self, path):
        """Build the full URL for an API path below /api/v1/"""
        return "{}/api/v1/{}".format(self.root, path)

    def account_search(self, query, limit=40):
        """Search for content"""
        params = {"q": query, "limit": limit}
        response = self.extractor.request(
            self._url("accounts/search"), params=params)
        return self._parse(response)

    def account_statuses(self, account_id):
        """Get an account's statuses"""
        url = self._url(
            "accounts/{}/statuses?only_media=1".format(account_id))
        while url:
            response = self.extractor.request(url)
            yield from self._parse(response)
            # continue with the 'next' link, if the response has one
            next_link = response.links.get("next", {})
            url = next_link.get("url")

    def status(self, status_id):
        """Fetch a Status"""
        url = self._url("statuses/{}".format(status_id))
        # 404 is passed as an expected status; _parse() raises for it
        response = self.extractor.request(url, expect=(404,))
        return self._parse(response)

    @staticmethod
    def _parse(response):
        """Parse an API response"""
        if response.status_code == 404:
            raise exception.NotFoundError()
        return response.json()

@ -1,9 +1,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2016-2018 Mike Fährmann # Copyright 2016-2019 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
__version__ = "1.6.3" __version__ = "1.7.0-dev"

@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2018 Mike Fährmann # Copyright 2018-2019 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@ -149,6 +149,8 @@ class TestExtractor(unittest.TestCase):
def capitalize(c): def capitalize(c):
if "-" in c: if "-" in c:
return string.capwords(c.replace("-", " ")).replace(" ", "") return string.capwords(c.replace("-", " ")).replace(" ", "")
if "." in c:
c = c.replace(".", "")
return c.capitalize() return c.capitalize()
mapping = { mapping = {

Loading…
Cancel
Save