add generalized extractors for Mastodon instances (#144)

Extractors for Mastodon instances can now be dynamically generated, based on the instance names in the 'extractor.mastodon.*' config path. Example: { "extractor": { "mastodon": { "pawoo.net": { ... }, "mastodon.xyz": { ... }, "tabletop.social": { ... }, ... } } } Each entry requires an 'access-token' value, which can be generated with 'gallery-dl oauth:mastodon:<instance URL>'. An 'access-token' (as well as a 'client-id' and 'client-secret') for pawoo.net is always available, but can be overridden as necessary.
6 years ago · b8fed34548
parent 4b441c162e
commit b8fed34548
7 changed files with 269 additions and 150 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,7 @@
 # Changelog
 ## Unreleased
 ## 1.6.3 - 2019-01-18
 - Added `metadata` post-processor to write image metadata to an external file ([#135](https://github.com/mikf/gallery-dl/issues/135))
 - Added option to reverse chapter order of manga extractors ([#149](https://github.com/mikf/gallery-dl/issues/149))
--- a/gallery_dl/extractor/init.py
+++ b/gallery_dl/extractor/init.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2015-2018 Mike Fährmann
+# Copyright 2015-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -67,7 +67,6 @@ modules = [
    "nijie",
    "nyafuu",
    "paheal",
    "pawoo",
    "piczel",
    "pinterest",
    "pixiv",
@ -95,6 +94,7 @@ modules = [
    "yandere",
    "xvideos",
    "yuki",
    "mastodon",
    "imagehosts",
    "directlink",
    "recursive",
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@ -0,0 +1,175 @@
 # -*- coding: utf-8 -*-
 # Copyright 2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Extractors for mastodon instances"""
 from .common import Extractor, Message
 from .. import text, config, exception
 import re
 class MastodonExtractor(Extractor):
    """Base class for mastodon extractors"""
    basecategory = "mastodon"
    directory_fmt = ["mastodon", "{category}", "{account[username]}"]
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"
    instance = None
    def __init__(self, match):
        Extractor.__init__(self)
        # group 1 of the matched URL pattern is taken as the instance name
        self.instance = match.group(1)
        self.api = MastodonAPI(self, self.instance)
    def config(self, key, default=None):
        """Interpolate 'key' below the 'extractor.mastodon' config path"""
        path = ("extractor", "mastodon", self.category, self.subcategory, key)
        return config.interpolate(path, default)
    def items(self):
        """Emit Directory and Url messages for every status"""
        yield Message.Version, 1
        for status in self.statuses():
            # attachments are removed from the status before it is emitted
            media_list = self.prepare(status)
            yield Message.Directory, status
            for attachment in media_list:
                # the same status dict is reused as metadata for each file
                status["media"] = attachment
                media_url = attachment["url"]
                metadata = text.nameext_from_url(media_url, status)
                yield Message.Url, media_url, metadata
    def statuses(self):
        """Return an iterable containing all relevant Status-objects"""
        return ()
    @staticmethod
    def prepare(status):
        """Remove the media attachments from a status and return them"""
        return status.pop("media_attachments")
 class MastodonUserExtractor(MastodonExtractor):
    """Extractor for all images of an account/user"""
    subcategory = "user"
    def __init__(self, match):
        MastodonExtractor.__init__(self, match)
        # group 2 of the matched URL pattern is the account name
        self.account_name = match.group(2)
    def statuses(self):
        """Return the statuses of the account named in the URL"""
        handle = "@" + self.account_name
        for candidate in self.api.account_search(handle, 1):
            # only accept a search result whose username matches exactly
            if candidate["username"] == self.account_name:
                return self.api.account_statuses(candidate["id"])
        raise exception.NotFoundError("account")
 class MastodonStatusExtractor(MastodonExtractor):
    """Extractor for images from a status"""
    subcategory = "status"
    def __init__(self, match):
        MastodonExtractor.__init__(self, match)
        # group 2 of the matched URL pattern is the numeric status ID
        self.status_id = match.group(2)
    def statuses(self):
        """Return a 1-tuple holding the status identified by the URL"""
        status = self.api.status(self.status_id)
        return (status,)
 class MastodonAPI():
    """Minimal interface for the Mastodon API
    https://github.com/tootsuite/mastodon
    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
    """
    def __init__(self, extractor, instance, access_token=None):
        """Prepare API access to 'instance' on behalf of 'extractor'

        The bearer token is read from the extractor's 'access-token'
        config value; 'access_token' is only the fallback default.
        """
        self.instance = instance
        self.extractor = extractor
        self.headers = {"Authorization": "Bearer {}".format(
            extractor.config("access-token", access_token))}
    def account_search(self, query, limit=40):
        """Search for accounts matching 'query' (at most 'limit' results)"""
        params = {"q": query, "limit": limit}
        return self._call("accounts/search", params)
    def account_statuses(self, account_id):
        """Get an account's statuses (media-only), across all pages"""
        endpoint = "accounts/{}/statuses".format(account_id)
        params = {"only_media": "1"}
        return self._pagination(endpoint, params)
    def status(self, status_id):
        """Fetch a Status"""
        return self._call("statuses/" + status_id)
    def _call(self, endpoint, params=None):
        """Send a single request to 'endpoint' and return the parsed result"""
        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
        response = self.extractor.request(
            url, params=params, headers=self.headers)
        return self._parse(response)
    def _pagination(self, endpoint, params):
        """Yield the results of 'endpoint' from all of its pages

        Follow-up pages are located via the 'next' relation in the
        response's links.
        """
        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
        while url:
            response = self.extractor.request(
                url, params=params, headers=self.headers)
            yield from self._parse(response)
            url = response.links.get("next", {}).get("url")
            # The 'next' URL already contains its own query string;
            # sending 'params' again would duplicate every parameter.
            params = None
    @staticmethod
    def _parse(response):
        """Parse an API response

        Raises NotFoundError for a 404 response, otherwise returns the
        decoded JSON body.
        """
        if response.status_code == 404:
            raise exception.NotFoundError()
        return response.json()
 def generate_extractors():
    """Dynamically generate Extractor classes for Mastodon instances"""
    namespace = globals()
    instances = config.get(("extractor", "mastodon")) or {}
    # pawoo.net is always available; an existing entry is left untouched
    instances.setdefault("pawoo.net", {
        "access-token" : "286462927198d0cf3e24683e91c8259a"
                         "ac4367233064e0570ca18df2ac65b226",
        "client-id"    : "97b142b6904abf97a1068d51a7bc2f2f"
                         "cf9323cef81f13cb505415716dba7dac",
        "client-secret": "e45bef4bad45b38abf7d9ef88a646b73"
                         "75e7fb2532c31a026327a93549236481",
    })
    for instance, info in instances.items():
        # entries that are not dicts cannot be instance definitions
        if not isinstance(info, dict):
            continue
        host = re.escape(instance)
        class UserExtractor(MastodonUserExtractor):
            pattern = [
                r"(?:https?://)?({})/@([^/?&#]+)(?:/media)?/?$".format(host)]
        class StatusExtractor(MastodonStatusExtractor):
            pattern = [
                r"(?:https?://)?({})/@[^/?&#]+/(\d+)".format(host)]
        # class-name prefix: the instance name reduced to its letters
        prefix = re.sub(r"[^A-Za-z]+", "", instance).capitalize()
        for cls in (UserExtractor, StatusExtractor):
            cls.category = instance
            cls.__name__ = prefix + cls.__name__
            cls.__doc__ = "{} on {}".format(cls.__base__.__doc__, instance)
            # publish the class under its new name in this module
            namespace[cls.__name__] = cls
 # Runs at import time: adds the generated per-instance extractor classes
 # to this module's globals.
 generate_extractors()
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2017-2018 Mike Fährmann
+# Copyright 2017-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -10,7 +10,8 @@
 from .common import Extractor, Message
 from . import deviantart, flickr, reddit, smugmug, tumblr
-from .. import text, oauth, config
+from .. import text, oauth, config, exception
 from ..cache import cache
 import os
 import urllib.parse
@ -82,7 +83,6 @@ class OAuthBase(Extractor):
        data = self.open(authorize_url, params)
        # exchange the request token for an access token
        # self.session.token = data["oauth_token"]
        data = self.session.get(access_token_url, params=data).text
        data = text.parse_query(data)
@ -94,7 +94,8 @@ class OAuthBase(Extractor):
    def _oauth2_authorization_code_grant(
            self, client_id, client_secret, auth_url, token_url,
-            scope="read", key="refresh_token", auth=True):
+            scope="read", key="refresh_token", auth=True,
            message_template=None):
        """Perform an OAuth2 authorization code grant"""
        state = "gallery-dl_{}_{}".format(
@ -147,11 +148,15 @@ class OAuthBase(Extractor):
        # display token
        part = key.partition("_")[0]
-        self.send(OAUTH2_MSG_TEMPLATE.format(
+        template = message_template or OAUTH2_MSG_TEMPLATE
        self.send(template.format(
            category=self.subcategory,
            key=part,
            Key=part.capitalize(),
            token=data[key],
            instance=getattr(self, "instance", ""),
            client_id=client_id,
            client_secret=client_secret,
        ))
@ -254,6 +259,55 @@ class OAuthTumblr(OAuthBase):
        )
 class OAuthMastodon(OAuthBase):
    # Handles 'oauth:mastodon:<instance>' URLs; the instance name is the
    # first capture group of the pattern below.
    subcategory = "mastodon"
    pattern = ["oauth:mastodon:(?:https?://)?([^/?&#]+)"]
    def __init__(self, match):
        OAuthBase.__init__(self, match)
        self.instance = match.group(1)
    def items(self):
        yield Message.Version, 1
        # use configured application data if there is any; otherwise
        # register a new application with the instance
        application = (self.oauth_config(self.instance) or
                       self._register(self.instance))
        root = "https://" + self.instance
        self._oauth2_authorization_code_grant(
            application["client-id"],
            application["client-secret"],
            root + "/oauth/authorize",
            root + "/oauth/token",
            key="access_token",
            message_template=MASTODON_MSG_TEMPLATE,
        )
    @cache(maxage=10*365*24*60*60, keyarg=1)
    def _register(self, instance):
        """Register a new application on 'instance' and return its data"""
        self.log.info("Registering application for '%s'", instance)
        endpoint = "https://{}/api/v1/apps".format(instance)
        payload = {
            "client_name": "gdl:" + oauth.nonce(8),
            "redirect_uris": self.redirect_uri,
            "scopes": "read",
        }
        app = self.session.post(endpoint, data=payload).json()
        if not ("client_id" in app and "client_secret" in app):
            self.log.error("Failed to register new application: '%s'", app)
            raise exception.StopExtraction()
        # switch to the hyphenated key names that items() looks up
        for key in ("client_id", "client_secret"):
            app[key.replace("_", "-")] = app.pop(key)
        self.log.info("client-id:\n%s", app["client-id"])
        self.log.info("client-secret:\n%s", app["client-secret"])
        return app
 OAUTH1_MSG_TEMPLATE = """
 Your Access Token and Access Token Secret are
@ -293,3 +347,29 @@ Example:
    }}
 }}
 """
 # Message template for Mastodon authorizations. It is a str.format()
 # template with the fields 'Key', 'key', 'token', 'instance',
 # 'client_id' and 'client_secret'; doubled braces are literal braces.
 MASTODON_MSG_TEMPLATE = """
 Your {Key} Token is
 {token}
 Put this value into your configuration file as
 'extractor.mastodon.{instance}.{key}-token'.
 You can also add your 'client-id' and 'client-secret' values
 if you want to register another account in the future.
 Example:
 {{
    "extractor": {{
        "mastodon": {{
            "{instance}": {{
                "{key}-token": "{token}",
                "client-id": "{client_id}",
                "client-secret": "{client_secret}"
            }}
        }}
    }}
 }}
 """
--- a/gallery_dl/extractor/pawoo.py
+++ b/gallery_dl/extractor/pawoo.py
@ -1,140 +0,0 @@
 # -*- coding: utf-8 -*-
 # Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Extract images from https://pawoo.net"""
 from .common import Extractor, Message
 from .. import text, exception
 class PawooExtractor(Extractor):
    """Base class for pawoo extractors"""
    category = "pawoo"
    directory_fmt = ["{category}", "{account[username]}"]
    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
    archive_fmt = "{media[id]}"
    def __init__(self):
        Extractor.__init__(self)
        self.api = MastodonAPI(self)
    def items(self):
        yield Message.Version, 1
        for status in self.statuses():
            attachments = self.prepare(status)
            yield Message.Directory, status
            for media in attachments:
                status["media"] = media
                url = media["url"]
                yield Message.Url, url, text.nameext_from_url(url, status)
    def statuses(self):
        """Return an iterable containing all relevant Status-objects"""
        return []
    @staticmethod
    def prepare(status):
        """Prepare a status object"""
        attachments = status["media_attachments"]
        del status["media_attachments"]
        return attachments
 class PawooUserExtractor(PawooExtractor):
    """Extractor for all images of an account/user on pawoo.net"""
    subcategory = "user"
    pattern = [r"(?:https?://)?pawoo\.net/@([^/?&#]+)(?:/media)?/?$"]
    test = [
        ("https://pawoo.net/@kuroda", {
            "url": "a3f9e7555f2b024554c0e9b6cbcc7991af13cf99",
        }),
        ("https://pawoo.net/@zZzZz/", {
            "exception": exception.NotFoundError,
        }),
        ("https://pawoo.net/@kuroda/media", None),
    ]
    def __init__(self, match):
        PawooExtractor.__init__(self)
        self.account_name = match.group(1)
    def statuses(self):
        results = self.api.account_search("@" + self.account_name, 1)
        for account in results:
            if account["username"] == self.account_name:
                break
        else:
            raise exception.NotFoundError("account")
        return self.api.account_statuses(account["id"])
 class PawooStatusExtractor(PawooExtractor):
    """Extractor for images from a status on pawoo.net"""
    subcategory = "status"
    pattern = [r"(?:https?://)?pawoo\.net/@[^/?&#]+/(\d+)"]
    test = [
        ("https://pawoo.net/@takehana_note/559043", {
            "url": "f95cc8c0274c4143e7e21dbdc693b90c65b596e3",
            "content": "3b148cf90174173355fe34179741ce476921b2fc",
        }),
        ("https://pawoo.net/@zZzZz/12346", {
            "exception": exception.NotFoundError,
        }),
    ]
    def __init__(self, match):
        PawooExtractor.__init__(self)
        self.status_id = match.group(1)
    def statuses(self):
        return (self.api.status(self.status_id),)
 class MastodonAPI():
    """Minimal interface for the Mastodon API on pawoo.net
    https://github.com/tootsuite/mastodon
    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
    """
    def __init__(self, extractor, root="https://pawoo.net",
                 access_token=("286462927198d0cf3e24683e91c8259a"
                               "ac4367233064e0570ca18df2ac65b226")):
        self.root = root
        self.extractor = extractor
        extractor.session.headers["Authorization"] = "Bearer {}".format(
            extractor.config("access-token", access_token))
    def account_search(self, query, limit=40):
        """Search for content"""
        url = "{}/api/v1/accounts/search".format(self.root)
        params = {"q": query, "limit": limit}
        response = self.extractor.request(url, params=params)
        return self._parse(response)
    def account_statuses(self, account_id):
        """Get an account's statuses"""
        url = "{}/api/v1/accounts/{}/statuses?only_media=1".format(
            self.root, account_id)
        while url:
            response = self.extractor.request(url)
            yield from self._parse(response)
            url = response.links.get("next", {}).get("url")
    def status(self, status_id):
        """Fetch a Status"""
        url = "{}/api/v1/statuses/{}".format(self.root, status_id)
        response = self.extractor.request(url, expect=(404,))
        return self._parse(response)
    @staticmethod
    def _parse(response):
        """Parse an API response"""
        if response.status_code == 404:
            raise exception.NotFoundError()
        return response.json()
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
-# Copyright 2016-2018 Mike Fährmann
+# Copyright 2016-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
-__version__ = "1.6.3"
+__version__ = "1.7.0-dev"
--- a/test/test_extractor.py
+++ b/test/test_extractor.py
@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-# Copyright 2018 Mike Fährmann
+# Copyright 2018-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -149,6 +149,8 @@ class TestExtractor(unittest.TestCase):
        def capitalize(c):
            # hyphenated names become CamelCase ("foo-bar" -> "FooBar")
            if "-" in c:
                words = c.replace("-", " ")
                return string.capwords(words).replace(" ", "")
            # dots are dropped before capitalizing ("pawoo.net" -> "Pawoonet")
            return c.replace(".", "").capitalize()
        mapping = {