add generalized extractors for Mastodon instances (#144)

Extractors for Mastodon instances can now be dynamically generated,
based on the instance names in the 'extractor.mastodon.*' config path.

Example:
{
    "extractor": {
        "mastodon": {
            "pawoo.net": { ... },
            "mastodon.xyz": { ... },
            "tabletop.social": { ... },
            ...
        }
    }
}

Each entry requires an 'access-token' value, which can be generated with
'gallery-dl oauth:mastodon:<instance URL>'.
An 'access-token' (as well as a 'client-id' and 'client-secret') for
pawoo.net is always available, but can be overridden as necessary.
pull/170/head
Mike Fährmann 6 years ago
parent 4b441c162e
commit b8fed34548
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -1,5 +1,7 @@
# Changelog
## Unreleased
## 1.6.3 - 2019-01-18
- Added `metadata` post-processor to write image metadata to an external file ([#135](https://github.com/mikf/gallery-dl/issues/135))
- Added option to reverse chapter order of manga extractors ([#149](https://github.com/mikf/gallery-dl/issues/149))

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2018 Mike Fährmann
# Copyright 2015-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -67,7 +67,6 @@ modules = [
"nijie",
"nyafuu",
"paheal",
"pawoo",
"piczel",
"pinterest",
"pixiv",
@ -95,6 +94,7 @@ modules = [
"yandere",
"xvideos",
"yuki",
"mastodon",
"imagehosts",
"directlink",
"recursive",

@ -0,0 +1,175 @@
# -*- coding: utf-8 -*-
# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for mastodon instances"""
from .common import Extractor, Message
from .. import text, config, exception
import re
class MastodonExtractor(Extractor):
    """Base class for mastodon extractors"""
    basecategory = "mastodon"
    directory_fmt = ["mastodon", "{category}", "{account[username]}"]
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"
    instance = None

    def __init__(self, match):
        Extractor.__init__(self)
        # Group 1 of the matched pattern holds the instance name; it must be
        # stored before the API object is built, since that object receives it.
        self.instance = match.group(1)
        self.api = MastodonAPI(self, self.instance)

    def config(self, key, default=None):
        """Look up 'key' below 'extractor.mastodon.<category>.<subcategory>'"""
        path = ("extractor", "mastodon", self.category, self.subcategory, key)
        return config.interpolate(path, default)

    def items(self):
        """Yield one Directory message per status and one Url per attachment"""
        yield Message.Version, 1

        for status in self.statuses():
            media_list = self.prepare(status)
            yield Message.Directory, status

            for media in media_list:
                # the current attachment is exposed to format strings
                # through the 'media' key of the status itself
                status["media"] = media
                media_url = media["url"]
                yield (Message.Url, media_url,
                       text.nameext_from_url(media_url, status))

    def statuses(self):
        """Provide the Status-objects to process; empty unless overridden"""
        return ()

    @staticmethod
    def prepare(status):
        """Detach the attachment list from 'status' and hand it back"""
        return status.pop("media_attachments")
class MastodonUserExtractor(MastodonExtractor):
    """Extractor for all images of an account/user"""
    # NOTE: the class docstring above is read at runtime when the
    # per-instance subclasses are generated, so its text is part of behavior.
    subcategory = "user"

    def __init__(self, match):
        MastodonExtractor.__init__(self, match)
        self.account_name = match.group(2)

    def statuses(self):
        """Resolve the account by name and return its statuses

        Raises exception.NotFoundError if no search result carries exactly
        the requested username.
        """
        wanted = self.account_name
        candidates = self.api.account_search("@" + wanted, 1)

        account = next(
            (entry for entry in candidates if entry["username"] == wanted),
            None,
        )
        if account is None:
            raise exception.NotFoundError("account")

        return self.api.account_statuses(account["id"])
class MastodonStatusExtractor(MastodonExtractor):
    """Extractor for images from a status"""
    # NOTE: the class docstring above is read at runtime when the
    # per-instance subclasses are generated, so its text is part of behavior.
    subcategory = "status"

    def __init__(self, match):
        MastodonExtractor.__init__(self, match)
        self.status_id = match.group(2)

    def statuses(self):
        """Fetch the single requested status and wrap it in a 1-tuple"""
        status = self.api.status(self.status_id)
        return (status,)
class MastodonAPI():
    """Minimal interface for the Mastodon API

    https://github.com/tootsuite/mastodon
    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
    """

    def __init__(self, extractor, instance, access_token=None):
        """Set up API access to 'instance' on behalf of 'extractor'

        extractor    -- object providing config() and request()
        instance     -- host name of the Mastodon instance
        access_token -- fallback token if the config provides none
        """
        self.instance = instance
        self.extractor = extractor

        token = extractor.config("access-token", access_token)
        # Only send an Authorization header when there is a token;
        # formatting a missing one would send the literal "Bearer None".
        if token:
            self.headers = {"Authorization": "Bearer {}".format(token)}
        else:
            self.headers = {}

    def account_search(self, query, limit=40):
        """Search for accounts matching 'query' (at most 'limit' results)"""
        params = {"q": query, "limit": limit}
        return self._call("accounts/search", params)

    def account_statuses(self, account_id):
        """Iterate over an account's statuses that have media attached"""
        endpoint = "accounts/{}/statuses".format(account_id)
        params = {"only_media": "1"}
        return self._pagination(endpoint, params)

    def status(self, status_id):
        """Fetch a Status"""
        return self._call("statuses/" + status_id)

    def _call(self, endpoint, params=None):
        """Send one request to 'endpoint' and return the parsed response"""
        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
        response = self.extractor.request(
            url, params=params, headers=self.headers)
        return self._parse(response)

    def _pagination(self, endpoint, params):
        """Yield the items of every page, following 'next' links"""
        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
        while url:
            response = self.extractor.request(
                url, params=params, headers=self.headers)
            yield from self._parse(response)

            url = response.links.get("next", {}).get("url")
            # The 'next' link is a complete URL with its own query string.
            # Sending the initial parameters again would duplicate them.
            params = None

    @staticmethod
    def _parse(response):
        """Parse an API response

        Raises exception.NotFoundError for a 404 status code.
        """
        if response.status_code == 404:
            raise exception.NotFoundError()
        return response.json()
def generate_extractors():
    """Dynamically generate Extractor classes for Mastodon instances"""
    # Credentials used for pawoo.net when the config has no entry for it.
    pawoo_defaults = {
        "access-token" : "286462927198d0cf3e24683e91c8259a"
                         "ac4367233064e0570ca18df2ac65b226",
        "client-id"    : "97b142b6904abf97a1068d51a7bc2f2f"
                         "cf9323cef81f13cb505415716dba7dac",
        "client-secret": "e45bef4bad45b38abf7d9ef88a646b73"
                         "75e7fb2532c31a026327a93549236481",
    }

    instances = config.get(("extractor", "mastodon")) or {}
    if "pawoo.net" not in instances:
        instances["pawoo.net"] = pawoo_defaults

    namespace = globals()

    for instance, info in instances.items():
        # entries that are not dicts are ignored
        if not isinstance(info, dict):
            continue

        host = re.escape(instance)

        class UserExtractor(MastodonUserExtractor):
            pattern = [
                r"(?:https?://)?({})/@([^/?&#]+)(?:/media)?/?$".format(host)]

        class StatusExtractor(MastodonStatusExtractor):
            pattern = [r"(?:https?://)?({})/@[^/?&#]+/(\d+)".format(host)]

        # class-name prefix: the letters of the instance name, capitalized
        prefix = re.sub(r"[^A-Za-z]+", "", instance).capitalize()

        for cls in (UserExtractor, StatusExtractor):
            cls.category = instance
            cls.__name__ = prefix + cls.__name__
            cls.__doc__ = "{} on {}".format(cls.__base__.__doc__, instance)
            # publish the class as a module-level name
            namespace[cls.__name__] = cls
# Create the per-instance extractor classes at module level.
generate_extractors()

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2017-2018 Mike Fährmann
# Copyright 2017-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -10,7 +10,8 @@
from .common import Extractor, Message
from . import deviantart, flickr, reddit, smugmug, tumblr
from .. import text, oauth, config
from .. import text, oauth, config, exception
from ..cache import cache
import os
import urllib.parse
@ -82,7 +83,6 @@ class OAuthBase(Extractor):
data = self.open(authorize_url, params)
# exchange the request token for an access token
# self.session.token = data["oauth_token"]
data = self.session.get(access_token_url, params=data).text
data = text.parse_query(data)
@ -94,7 +94,8 @@ class OAuthBase(Extractor):
def _oauth2_authorization_code_grant(
self, client_id, client_secret, auth_url, token_url,
scope="read", key="refresh_token", auth=True):
scope="read", key="refresh_token", auth=True,
message_template=None):
"""Perform an OAuth2 authorization code grant"""
state = "gallery-dl_{}_{}".format(
@ -147,11 +148,15 @@ class OAuthBase(Extractor):
# display token
part = key.partition("_")[0]
self.send(OAUTH2_MSG_TEMPLATE.format(
template = message_template or OAUTH2_MSG_TEMPLATE
self.send(template.format(
category=self.subcategory,
key=part,
Key=part.capitalize(),
token=data[key],
instance=getattr(self, "instance", ""),
client_id=client_id,
client_secret=client_secret,
))
@ -254,6 +259,55 @@ class OAuthTumblr(OAuthBase):
)
class OAuthMastodon(OAuthBase):
    # Obtains an access token for the Mastodon instance named in the
    # 'oauth:mastodon:<instance>' pseudo-URL.
    subcategory = "mastodon"
    pattern = ["oauth:mastodon:(?:https?://)?([^/?&#]+)"]

    def __init__(self, match):
        OAuthBase.__init__(self, match)
        self.instance = match.group(1)

    def items(self):
        yield Message.Version, 1

        instance = self.instance
        # prefer configured client credentials; register a new application
        # on the instance only when none are available
        app = self.oauth_config(instance) or self._register(instance)

        authorize_url = "https://{}/oauth/authorize".format(instance)
        token_url = "https://{}/oauth/token".format(instance)

        self._oauth2_authorization_code_grant(
            app["client-id"],
            app["client-secret"],
            authorize_url,
            token_url,
            key="access_token",
            message_template=MASTODON_MSG_TEMPLATE,
        )

    # maxage is ten years in seconds; keyarg=1 presumably keys the cache
    # on the 'instance' argument -- confirm against the cache decorator
    @cache(maxage=10*365*24*60*60, keyarg=1)
    def _register(self, instance):
        self.log.info("Registering application for '%s'", instance)

        endpoint = "https://{}/api/v1/apps".format(instance)
        payload = {
            "client_name": "gdl:" + oauth.nonce(8),
            "redirect_uris": self.redirect_uri,
            "scopes": "read",
        }
        app = self.session.post(endpoint, data=payload).json()

        if not ("client_id" in app and "client_secret" in app):
            self.log.error("Failed to register new application: '%s'", app)
            raise exception.StopExtraction()

        # rename the credential keys to the dashed form read by items()
        app["client-id"] = app.pop("client_id")
        app["client-secret"] = app.pop("client_secret")

        self.log.info("client-id:\n%s", app["client-id"])
        self.log.info("client-secret:\n%s", app["client-secret"])

        return app
OAUTH1_MSG_TEMPLATE = """
Your Access Token and Access Token Secret are
@ -293,3 +347,29 @@ Example:
}}
}}
"""
# Message template passed as 'message_template' by OAuthMastodon.
# It is filled in with str.format(); the fields it uses are {Key}, {key},
# {token}, {instance}, {client_id} and {client_secret}.
# Doubled braces are literal braces in the printed example.
MASTODON_MSG_TEMPLATE = """
Your {Key} Token is
{token}
Put this value into your configuration file as
'extractor.mastodon.{instance}.{key}-token'.
You can also add your 'client-id' and 'client-secret' values
if you want to register another account in the future.
Example:
{{
"extractor": {{
"mastodon": {{
"{instance}": {{
"{key}-token": "{token}",
"client-id": "{client_id}",
"client-secret": "{client_secret}"
}}
}}
}}
}}
"""

@ -1,140 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2017-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://pawoo.net"""
from .common import Extractor, Message
from .. import text, exception
class PawooExtractor(Extractor):
    """Base class for pawoo extractors"""
    category = "pawoo"
    directory_fmt = ["{category}", "{account[username]}"]
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"

    def __init__(self):
        Extractor.__init__(self)
        self.api = MastodonAPI(self)

    def items(self):
        """Yield one Directory message per status and one Url per attachment"""
        yield Message.Version, 1

        for status in self.statuses():
            media_list = self.prepare(status)
            yield Message.Directory, status

            for media in media_list:
                # the current attachment is exposed to format strings
                # through the 'media' key of the status itself
                status["media"] = media
                media_url = media["url"]
                yield (Message.Url, media_url,
                       text.nameext_from_url(media_url, status))

    def statuses(self):
        """Provide the Status-objects to process; empty unless overridden"""
        return []

    @staticmethod
    def prepare(status):
        """Detach the attachment list from 'status' and hand it back"""
        return status.pop("media_attachments")
class PawooUserExtractor(PawooExtractor):
    """Extractor for all images of an account/user on pawoo.net"""
    subcategory = "user"
    pattern = [r"(?:https?://)?pawoo\.net/@([^/?&#]+)(?:/media)?/?$"]
    test = [
        ("https://pawoo.net/@kuroda", {
            "url": "a3f9e7555f2b024554c0e9b6cbcc7991af13cf99",
        }),
        ("https://pawoo.net/@zZzZz/", {
            "exception": exception.NotFoundError,
        }),
        ("https://pawoo.net/@kuroda/media", None),
    ]

    def __init__(self, match):
        PawooExtractor.__init__(self)
        self.account_name = match.group(1)

    def statuses(self):
        """Resolve the account by name and return its statuses

        Raises exception.NotFoundError if no search result carries exactly
        the requested username.
        """
        wanted = self.account_name
        candidates = self.api.account_search("@" + wanted, 1)

        account = next(
            (entry for entry in candidates if entry["username"] == wanted),
            None,
        )
        if account is None:
            raise exception.NotFoundError("account")

        return self.api.account_statuses(account["id"])
class PawooStatusExtractor(PawooExtractor):
    """Extractor for images from a status on pawoo.net"""
    subcategory = "status"
    pattern = [r"(?:https?://)?pawoo\.net/@[^/?&#]+/(\d+)"]
    test = [
        ("https://pawoo.net/@takehana_note/559043", {
            "url": "f95cc8c0274c4143e7e21dbdc693b90c65b596e3",
            "content": "3b148cf90174173355fe34179741ce476921b2fc",
        }),
        ("https://pawoo.net/@zZzZz/12346", {
            "exception": exception.NotFoundError,
        }),
    ]

    def __init__(self, match):
        PawooExtractor.__init__(self)
        self.status_id = match.group(1)

    def statuses(self):
        """Fetch the single requested status and wrap it in a 1-tuple"""
        status = self.api.status(self.status_id)
        return (status,)
class MastodonAPI():
    """Minimal interface for the Mastodon API on pawoo.net

    https://github.com/tootsuite/mastodon
    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
    """

    def __init__(self, extractor, root="https://pawoo.net",
                 access_token=("286462927198d0cf3e24683e91c8259a"
                               "ac4367233064e0570ca18df2ac65b226")):
        self.root = root
        self.extractor = extractor

        # a configured 'access-token' takes precedence over the default;
        # the header is installed on the extractor's session
        token = extractor.config("access-token", access_token)
        bearer = "Bearer {}".format(token)
        extractor.session.headers["Authorization"] = bearer

    def account_search(self, query, limit=40):
        """Search for accounts matching 'query' (at most 'limit' results)"""
        endpoint = "{}/api/v1/accounts/search".format(self.root)
        query_params = {"q": query, "limit": limit}
        return self._parse(
            self.extractor.request(endpoint, params=query_params))

    def account_statuses(self, account_id):
        """Iterate over an account's media statuses, page by page"""
        next_url = "{}/api/v1/accounts/{}/statuses?only_media=1".format(
            self.root, account_id)

        while next_url:
            page = self.extractor.request(next_url)
            yield from self._parse(page)
            # continue with the 'next' link; stop when there is none
            next_url = page.links.get("next", {}).get("url")

    def status(self, status_id):
        """Fetch a Status"""
        endpoint = "{}/api/v1/statuses/{}".format(self.root, status_id)
        # 404 is passed as an expected status so _parse() sees the response
        return self._parse(self.extractor.request(endpoint, expect=(404,)))

    @staticmethod
    def _parse(response):
        """Decode an API response; a 404 raises exception.NotFoundError"""
        if response.status_code == 404:
            raise exception.NotFoundError()
        return response.json()

@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-
# Copyright 2016-2018 Mike Fährmann
# Copyright 2016-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
__version__ = "1.6.3"
__version__ = "1.7.0-dev"

@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2018 Mike Fährmann
# Copyright 2018-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -149,6 +149,8 @@ class TestExtractor(unittest.TestCase):
def capitalize(c):
    """Capitalize a category string

    Hyphenated names are joined word by word in capitalized form;
    otherwise dots are dropped and the result is capitalized.
    """
    if "-" in c:
        words = c.replace("-", " ")
        return string.capwords(words).replace(" ", "")
    return c.replace(".", "").capitalize()
mapping = {

Loading…
Cancel
Save