add generalized extractors for Mastodon instances (#144)

Extractors for Mastodon instances can now be dynamically generated, based on the instance names in the 'extractor.mastodon.*' config path.

Example:

{
    "extractor": {
        "mastodon": {
            "pawoo.net": { ... },
            "mastodon.xyz": { ... },
            "tabletop.social": { ... },
            ...
        }
    }
}

Each entry requires an 'access-token' value, which can be generated with 'gallery-dl oauth:mastodon:<instance URL>'.

An 'access-token' (as well as a 'client-id' and 'client-secret') for pawoo.net is always available, but can be overridden as necessary.
6 years ago · b8fed34548
parent 4b441c162e
commit b8fed34548
7 changed files with 269 additions and 150 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,7 @@
 # Changelog

+## Unreleased
+
 ## 1.6.3 - 2019-01-18
 - Added `metadata` post-processor to write image metadata to an external file ([#135](https://github.com/mikf/gallery-dl/issues/135))
 - Added option to reverse chapter order of manga extractors ([#149](https://github.com/mikf/gallery-dl/issues/149))
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2018 Mike Fährmann
+# Copyright 2015-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -67,7 +67,6 @@ modules = [
    "nijie",
    "nyafuu",
    "paheal",
-    "pawoo",
    "piczel",
    "pinterest",
    "pixiv",
@ -95,6 +94,7 @@ modules = [
    "yandere",
    "xvideos",
    "yuki",
+    "mastodon",
    "imagehosts",
    "directlink",
    "recursive",
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for mastodon instances"""
+
+from .common import Extractor, Message
+from .. import text, config, exception
+import re
+
+
+class MastodonExtractor(Extractor):
+    """Base class for mastodon extractors"""
+    basecategory = "mastodon"
+    directory_fmt = ["mastodon", "{category}", "{account[username]}"]
+    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
+    archive_fmt = "{media[id]}"
+    instance = None
+
+    def __init__(self, match):
+        Extractor.__init__(self)
+        # the first pattern group holds the instance name
+        self.instance = match.group(1)
+        self.api = MastodonAPI(self, self.instance)
+
+    def config(self, key, default=None):
+        """Look up 'key' in extractor.mastodon.<category>.<subcategory>"""
+        path = ("extractor", "mastodon", self.category, self.subcategory, key)
+        return config.interpolate(path, default)
+
+    def items(self):
+        """Yield a Directory message per status and a Url message
+        for each of its media attachments"""
+        yield Message.Version, 1
+        for post in self.statuses():
+            files = self.prepare(post)
+            yield Message.Directory, post
+            for attachment in files:
+                # expose the current attachment to the format strings
+                post["media"] = attachment
+                media_url = attachment["url"]
+                yield (Message.Url, media_url,
+                       text.nameext_from_url(media_url, post))
+
+    def statuses(self):
+        """Return an iterable of Status objects to process (none here)"""
+        return ()
+
+    @staticmethod
+    def prepare(status):
+        """Remove the 'media_attachments' entry from a status and return it"""
+        return status.pop("media_attachments")
+
+
+class MastodonUserExtractor(MastodonExtractor):
+    """Extractor for all images of an account/user"""
+    subcategory = "user"
+
+    def __init__(self, match):
+        MastodonExtractor.__init__(self, match)
+        # the second pattern group holds the account name
+        self.account_name = match.group(2)
+
+    def statuses(self):
+        """Search for the account by name and return its media statuses"""
+        wanted = self.account_name
+        candidates = self.api.account_search("@" + wanted, 1)
+        # accept only an exact username match; stop at the first one
+        account = next(
+            (acc for acc in candidates if acc["username"] == wanted), None)
+        if account is None:
+            raise exception.NotFoundError("account")
+        return self.api.account_statuses(account["id"])
+
+
+class MastodonStatusExtractor(MastodonExtractor):
+    """Extractor for images from a status"""
+    subcategory = "status"
+
+    def __init__(self, match):
+        MastodonExtractor.__init__(self, match)
+        # the second pattern group holds the status ID
+        self.status_id = match.group(2)
+
+    def statuses(self):
+        """Return a 1-tuple holding the requested Status object"""
+        status = self.api.status(self.status_id)
+        return (status,)
+
+
+class MastodonAPI():
+    """Minimal interface for the Mastodon API
+
+    https://github.com/tootsuite/mastodon
+    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
+    """
+
+    def __init__(self, extractor, instance, access_token=None):
+        """Set up API access to 'instance' on behalf of 'extractor'
+
+        The token is read from the extractor's 'access-token' option,
+        with 'access_token' as fallback value.
+        """
+        self.instance = instance
+        self.extractor = extractor
+        token = extractor.config("access-token", access_token)
+        # Without a token, send no Authorization header at all
+        # instead of the meaningless literal "Bearer None".
+        self.headers = (
+            {"Authorization": "Bearer {}".format(token)} if token else {})
+
+    def account_search(self, query, limit=40):
+        """Search for accounts matching 'query' (at most 'limit' results)"""
+        params = {"q": query, "limit": limit}
+        return self._call("accounts/search", params)
+
+    def account_statuses(self, account_id):
+        """Return a generator over an account's statuses (only_media=1)"""
+        endpoint = "accounts/{}/statuses".format(account_id)
+        params = {"only_media": "1"}
+        return self._pagination(endpoint, params)
+
+    def status(self, status_id):
+        """Fetch a single Status by its ID"""
+        return self._call("statuses/" + status_id)
+
+    def _call(self, endpoint, params=None):
+        """Send one request to an API v1 endpoint and return its parsed body
+
+        Raises exception.NotFoundError on a 404 response.
+        """
+        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
+        # Accept 404 responses here so that _parse() gets to translate them
+        # into NotFoundError (presumably request() rejects error statuses
+        # that are not listed in 'expect' on its own).
+        response = self.extractor.request(
+            url, params=params, headers=self.headers, expect=(404,))
+        return self._parse(response)
+
+    def _pagination(self, endpoint, params):
+        """Yield the entries of every result page of an API v1 endpoint,
+        following the 'next' link of each response until there is none"""
+        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
+        while url:
+            response = self.extractor.request(
+                url, params=params, headers=self.headers)
+            yield from self._parse(response)
+            url = response.links.get("next", {}).get("url")
+
+    @staticmethod
+    def _parse(response):
+        """Return the JSON body of a response; raise NotFoundError on 404"""
+        if response.status_code == 404:
+            raise exception.NotFoundError()
+        return response.json()
+
+
+def generate_extractors():
+    """Dynamically generate Extractor classes for Mastodon instances
+
+    One user and one status extractor class is created for every
+    instance name found in the 'extractor.mastodon' config section and
+    stored in this module's globals. A default entry for 'pawoo.net'
+    is added if the configuration does not provide one.
+    """
+
+    symtable = globals()
+    mastodon = config.get(("extractor", "mastodon"))
+
+    # A missing (or unusable) section is replaced by a new dict, which
+    # must then be written back to the configuration further below.
+    created = not isinstance(mastodon, dict)
+    if created:
+        mastodon = {}
+
+    if "pawoo.net" not in mastodon:
+        mastodon["pawoo.net"] = {
+            "access-token" : "286462927198d0cf3e24683e91c8259a"
+                             "ac4367233064e0570ca18df2ac65b226",
+            "client-id"    : "97b142b6904abf97a1068d51a7bc2f2f"
+                             "cf9323cef81f13cb505415716dba7dac",
+            "client-secret": "e45bef4bad45b38abf7d9ef88a646b73"
+                             "75e7fb2532c31a026327a93549236481",
+        }
+
+    if created:
+        # Extractors read their options from the global configuration, so
+        # the defaults above are only effective if they are stored there;
+        # a purely local dict would leave pawoo.net without access token.
+        config.set(("extractor", "mastodon"), mastodon)
+
+    for instance, info in mastodon.items():
+
+        # ignore plain option values; only dicts describe an instance
+        if not isinstance(info, dict):
+            continue
+
+        class UserExtractor(MastodonUserExtractor):
+            pattern = [r"(?:https?://)?({})/@([^/?&#]+)(?:/media)?/?$".format(
+                re.escape(instance))]
+
+        class StatusExtractor(MastodonStatusExtractor):
+            pattern = [r"(?:https?://)?({})/@[^/?&#]+/(\d+)".format(
+                re.escape(instance))]
+
+        # class name prefix: the instance name reduced to its letters
+        name = re.sub(r"[^A-Za-z]+", "", instance).capitalize()
+
+        for extr in (UserExtractor, StatusExtractor):
+            extr.category = instance
+            extr.__name__ = name + extr.__name__
+            # derive the docstring from the one of the generic base class
+            extr.__doc__ = "{} on {}".format(extr.__base__.__doc__, instance)
+            symtable[extr.__name__] = extr
+
+
+generate_extractors()
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017-2018 Mike Fährmann
+# Copyright 2017-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -10,7 +10,8 @@

 from .common import Extractor, Message
 from . import deviantart, flickr, reddit, smugmug, tumblr
-from .. import text, oauth, config
+from .. import text, oauth, config, exception
+from ..cache import cache
 import os
 import urllib.parse

@ -82,7 +83,6 @@ class OAuthBase(Extractor):
        data = self.open(authorize_url, params)

        # exchange the request token for an access token
-        # self.session.token = data["oauth_token"]
        data = self.session.get(access_token_url, params=data).text

        data = text.parse_query(data)
@ -94,7 +94,8 @@ class OAuthBase(Extractor):

    def _oauth2_authorization_code_grant(
            self, client_id, client_secret, auth_url, token_url,
-            scope="read", key="refresh_token", auth=True):
+            scope="read", key="refresh_token", auth=True,
+            message_template=None):
        """Perform an OAuth2 authorization code grant"""

        state = "gallery-dl_{}_{}".format(
@ -147,11 +148,15 @@ class OAuthBase(Extractor):

        # display token
        part = key.partition("_")[0]
-        self.send(OAUTH2_MSG_TEMPLATE.format(
+        template = message_template or OAUTH2_MSG_TEMPLATE
+        self.send(template.format(
            category=self.subcategory,
            key=part,
            Key=part.capitalize(),
            token=data[key],
+            instance=getattr(self, "instance", ""),
+            client_id=client_id,
+            client_secret=client_secret,
        ))


@ -254,6 +259,55 @@ class OAuthTumblr(OAuthBase):
        )


+class OAuthMastodon(OAuthBase):
+    """Obtain an access token for an arbitrary Mastodon instance"""
+    subcategory = "mastodon"
+    pattern = ["oauth:mastodon:(?:https?://)?([^/?&#]+)"]
+
+    def __init__(self, match):
+        OAuthBase.__init__(self, match)
+        self.instance = match.group(1)
+
+    def items(self):
+        """Run the authorization code grant against the chosen instance"""
+        yield Message.Version, 1
+
+        host = self.instance
+        # use configured application credentials, or register a new one
+        app = self.oauth_config(host) or self._register(host)
+
+        self._oauth2_authorization_code_grant(
+            app["client-id"],
+            app["client-secret"],
+            "https://{}/oauth/authorize".format(host),
+            "https://{}/oauth/token".format(host),
+            key="access_token",
+            message_template=MASTODON_MSG_TEMPLATE,
+        )
+
+    @cache(maxage=10*365*24*60*60, keyarg=1)
+    def _register(self, instance):
+        """Register a new application on 'instance'
+
+        Returns the instance's JSON response with its 'client_id' and
+        'client_secret' entries renamed to 'client-id' and 'client-secret'.
+        Raises exception.StopExtraction if either entry is missing.
+        """
+        self.log.info("Registering application for '%s'", instance)
+
+        payload = {
+            "client_name": "gdl:" + oauth.nonce(8),
+            "redirect_uris": self.redirect_uri,
+            "scopes": "read",
+        }
+        endpoint = "https://{}/api/v1/apps".format(instance)
+        data = self.session.post(endpoint, data=payload).json()
+
+        if not ("client_id" in data and "client_secret" in data):
+            self.log.error("Failed to register new application: '%s'", data)
+            raise exception.StopExtraction()
+
+        # switch to the key names used in the configuration file
+        for name in ("client_id", "client_secret"):
+            data[name.replace("_", "-")] = data.pop(name)
+
+        self.log.info("client-id:\n%s", data["client-id"])
+        self.log.info("client-secret:\n%s", data["client-secret"])
+
+        return data
+
+
 OAUTH1_MSG_TEMPLATE = """
 Your Access Token and Access Token Secret are

@ -293,3 +347,29 @@ Example:
    }}
 }}
 """
+
+
+# Message template passed as 'message_template' by OAuthMastodon.
+# It is filled in with str.format(); the placeholders used are {Key},
+# {key}, {token}, {instance}, {client_id} and {client_secret}.
+# Literal braces of the JSON example are doubled for that reason.
+MASTODON_MSG_TEMPLATE = """
+Your {Key} Token is
+
+{token}
+
+Put this value into your configuration file as
+'extractor.mastodon.{instance}.{key}-token'.
+
+You can also add your 'client-id' and 'client-secret' values
+if you want to register another account in the future.
+
+Example:
+{{
+    "extractor": {{
+        "mastodon": {{
+            "{instance}": {{
+                "{key}-token": "{token}",
+                "client-id": "{client_id}",
+                "client-secret": "{client_secret}"
+            }}
+        }}
+    }}
+}}
+"""
--- a/gallery_dl/extractor/pawoo.py
+++ b/gallery_dl/extractor/pawoo.py
@ -1,140 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017-2018 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://pawoo.net"""
-
-from .common import Extractor, Message
-from .. import text, exception
-
-
-class PawooExtractor(Extractor):
-    """Base class for pawoo extractors"""
-    category = "pawoo"
-    directory_fmt = ["{category}", "{account[username]}"]
-    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
-    archive_fmt = "{media[id]}"
-
-    def __init__(self):
-        Extractor.__init__(self)
-        self.api = MastodonAPI(self)
-
-    def items(self):
-        yield Message.Version, 1
-        for status in self.statuses():
-            attachments = self.prepare(status)
-            yield Message.Directory, status
-            for media in attachments:
-                status["media"] = media
-                url = media["url"]
-                yield Message.Url, url, text.nameext_from_url(url, status)
-
-    def statuses(self):
-        """Return an iterable containing all relevant Status-objects"""
-        return []
-
-    @staticmethod
-    def prepare(status):
-        """Prepare a status object"""
-        attachments = status["media_attachments"]
-        del status["media_attachments"]
-        return attachments
-
-
-class PawooUserExtractor(PawooExtractor):
-    """Extractor for all images of an account/user on pawoo.net"""
-    subcategory = "user"
-    pattern = [r"(?:https?://)?pawoo\.net/@([^/?&#]+)(?:/media)?/?$"]
-    test = [
-        ("https://pawoo.net/@kuroda", {
-            "url": "a3f9e7555f2b024554c0e9b6cbcc7991af13cf99",
-        }),
-        ("https://pawoo.net/@zZzZz/", {
-            "exception": exception.NotFoundError,
-        }),
-        ("https://pawoo.net/@kuroda/media", None),
-    ]
-
-    def __init__(self, match):
-        PawooExtractor.__init__(self)
-        self.account_name = match.group(1)
-
-    def statuses(self):
-        results = self.api.account_search("@" + self.account_name, 1)
-        for account in results:
-            if account["username"] == self.account_name:
-                break
-        else:
-            raise exception.NotFoundError("account")
-        return self.api.account_statuses(account["id"])
-
-
-class PawooStatusExtractor(PawooExtractor):
-    """Extractor for images from a status on pawoo.net"""
-    subcategory = "status"
-    pattern = [r"(?:https?://)?pawoo\.net/@[^/?&#]+/(\d+)"]
-    test = [
-        ("https://pawoo.net/@takehana_note/559043", {
-            "url": "f95cc8c0274c4143e7e21dbdc693b90c65b596e3",
-            "content": "3b148cf90174173355fe34179741ce476921b2fc",
-        }),
-        ("https://pawoo.net/@zZzZz/12346", {
-            "exception": exception.NotFoundError,
-        }),
-    ]
-
-    def __init__(self, match):
-        PawooExtractor.__init__(self)
-        self.status_id = match.group(1)
-
-    def statuses(self):
-        return (self.api.status(self.status_id),)
-
-
-class MastodonAPI():
-    """Minimal interface for the Mastodon API on pawoo.net
-
-    https://github.com/tootsuite/mastodon
-    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
-    """
-
-    def __init__(self, extractor, root="https://pawoo.net",
-                 access_token=("286462927198d0cf3e24683e91c8259a"
-                               "ac4367233064e0570ca18df2ac65b226")):
-        self.root = root
-        self.extractor = extractor
-        extractor.session.headers["Authorization"] = "Bearer {}".format(
-            extractor.config("access-token", access_token))
-
-    def account_search(self, query, limit=40):
-        """Search for content"""
-        url = "{}/api/v1/accounts/search".format(self.root)
-        params = {"q": query, "limit": limit}
-        response = self.extractor.request(url, params=params)
-        return self._parse(response)
-
-    def account_statuses(self, account_id):
-        """Get an account's statuses"""
-        url = "{}/api/v1/accounts/{}/statuses?only_media=1".format(
-            self.root, account_id)
-        while url:
-            response = self.extractor.request(url)
-            yield from self._parse(response)
-            url = response.links.get("next", {}).get("url")
-
-    def status(self, status_id):
-        """Fetch a Status"""
-        url = "{}/api/v1/statuses/{}".format(self.root, status_id)
-        response = self.extractor.request(url, expect=(404,))
-        return self._parse(response)
-
-    @staticmethod
-    def _parse(response):
-        """Parse an API response"""
-        if response.status_code == 404:
-            raise exception.NotFoundError()
-        return response.json()
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2018 Mike Fährmann
+# Copyright 2016-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-__version__ = "1.6.3"
+__version__ = "1.7.0-dev"
--- a/test/test_extractor.py
+++ b/test/test_extractor.py
@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

-# Copyright 2018 Mike Fährmann
+# Copyright 2018-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -149,6 +149,8 @@ class TestExtractor(unittest.TestCase):
        def capitalize(c):
            if "-" in c:
                return string.capwords(c.replace("-", " ")).replace(" ", "")
+            if "." in c:
+                c = c.replace(".", "")
            return c.capitalize()

        mapping = {