[ao3] add initial support (#6013)

4 days ago · 638a676495
parent 7d6520e15d
commit 638a676495
6 changed files with 418 additions and 0 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -1279,6 +1279,20 @@ Extractor-specific Options
 ==========================


+extractor.ao3.formats
+---------------------
+Type
+    * ``string``
+    * ``list`` of ``strings``
+Default
+    ``"pdf"``
+Example
+    * ``"azw3,epub,mobi,pdf,html"``
+    * ``["azw3", "epub", "mobi", "pdf", "html"]``
+Description
+    Format(s) to download.
+
+
 extractor.artstation.external
 -----------------------------
 Type
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -103,6 +103,12 @@ Consider all listed sites to potentially be NSFW.
    <td>Firms, Projects</td>
    <td></td>
 </tr>
+<tr>
+    <td>Archive of Our Own</td>
+    <td>https://archiveofourown.org/</td>
+    <td>Search Results, Series, Tag Searches, User Profiles, Bookmarks, Works</td>
+    <td></td>
+</tr>
 <tr>
    <td>ArtStation</td>
    <td>https://www.artstation.com/</td>
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -23,6 +23,7 @@ modules = [
    "8muses",
    "adultempire",
    "agnph",
+    "ao3",
    "architizer",
    "artstation",
    "aryion",
--- a/gallery_dl/extractor/ao3.py
+++ b/gallery_dl/extractor/ao3.py
@@ -0,0 +1,200 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://archiveofourown.org/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?archiveofourown\.org"
+
+
+class Ao3Extractor(Extractor):
+    """Base class for ao3 extractors; queues every work of a listing"""
+    category = "ao3"
+    root = "https://archiveofourown.org"
+    categorytransfer = True
+    request_interval = (0.5, 1.5)
+
+    def items(self):
+        """Queue a /works/ URL, tagged for Ao3WorkExtractor, per work ID"""
+        kwdict = {"_extractor": Ao3WorkExtractor}
+        for work_id in self.works():
+            url = self.root + "/works/" + work_id
+            yield Message.Queue, url, kwdict
+
+    def works(self):  # work IDs from the listing path in group 0
+        return self._pagination(self.groups[0])
+
+    def _pagination(self, path, needle='<li id="work_'):
+        while True:  # one request per result page
+            html = self.request(self.root + path).text
+            yield from text.extract_iter(html, needle, '"')
+            href = text.extr(html, '<a rel="next" href="', '"')
+            if not href:  # no rel="next" link found: stop paginating
+                break
+            path = text.unescape(href)
+
+
+class Ao3WorkExtractor(Ao3Extractor):
+    """Extractor for a single AO3 work and its download formats"""
+    subcategory = "work"
+    directory_fmt = ("{category}", "{author}")
+    filename_fmt = "{id} {title}.{extension}"
+    archive_fmt = "{id}.{extension}"
+    pattern = BASE_PATTERN + r"/works/(\d+)"
+    example = "https://archiveofourown.org/works/12345"
+
+    def _init(self):
+        """Normalize the 'formats' option and set the 'view_adult' cookie"""
+        fmts = self.config("formats")
+        if fmts is None:
+            fmts = ("pdf",)  # option unset: default to PDF only
+        elif not fmts:
+            fmts = ()  # empty/false value: no file downloads
+        elif isinstance(fmts, str):
+            fmts = fmts.lower().replace(" ", "").split(",")
+        self.formats = fmts
+        self.cookies.set(
+            "view_adult", "true", domain="archiveofourown.org")
+
+    def items(self):
+        """Yield the work's metadata, then a Url per requested format"""
+        work_id = self.groups[0]
+        page = self.request(self.root + "/works/" + work_id).text
+        extr = text.extract_from(page)
+        paths = {}  # download path, keyed by lowercased format label
+        links = extr(' class="download"', "</ul>")
+        for link in text.extract_iter(links, ' href="', "</"):
+            href, _, label = link.rpartition('">')
+            paths[label.lower()] = href
+
+        data = {
+            "id"           : text.parse_int(work_id),
+            "rating"       : text.split_html(
+                extr('<dd class="rating tags">', "</dd>")),
+            "warnings"     : text.split_html(
+                extr('<dd class="warning tags">', "</dd>")),
+            "categories"   : text.split_html(
+                extr('<dd class="category tags">', "</dd>")),
+            "fandom"       : text.split_html(
+                extr('<dd class="fandom tags">', "</dd>")),
+            "relationships": text.split_html(
+                extr('<dd class="relationship tags">', "</dd>")),
+            "characters"   : text.split_html(
+                extr('<dd class="character tags">', "</dd>")),
+            "tags"         : text.split_html(
+                extr('<dd class="freeform tags">', "</dd>")),
+            "lang"         : extr('<dd class="language" lang="', '"'),
+            "series"       : extr('<dd class="series">', "</dd>"),
+            "date"         : text.parse_datetime(
+                extr('<dd class="published">', "<"), "%Y-%m-%d"),
+            "words"        : text.parse_int(
+                extr('<dd class="words">', "<").replace(",", "")),
+            "chapters"     : text.parse_int(
+                extr('<dd class="chapters">', "/")),
+            "comments"     : text.parse_int(
+                extr('<dd class="comments">', "<").replace(",", "")),
+            "likes"        : text.parse_int(
+                extr('<dd class="kudos">', "<").replace(",", "")),
+            "bookmarks"    : text.parse_int(text.remove_html(
+                extr('<dd class="bookmarks">', "</dd>")).replace(",", "")),
+            "views"        : text.parse_int(
+                extr('<dd class="hits">', "<").replace(",", "")),
+            "title"        : text.unescape(
+                extr(' class="title heading">', "<").strip()),
+            "author"       : text.unescape(text.remove_html(
+                extr(' class="byline heading">', "</h3>"))),
+            "summary"      : text.split_html(
+                extr(' class="heading">Summary:</h3>', "</div>")),
+        }
+        data["language"] = util.code_to_language(data["lang"])
+
+        yield Message.Directory, data
+        for fmt in self.formats:
+            if fmt not in paths:
+                self.log.warning("%s: Format '%s' not available", work_id, fmt)
+                continue
+            # download paths are joined onto the site root
+            url = text.urljoin(self.root, paths[fmt])
+            yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class Ao3SeriesExtractor(Ao3Extractor):
+    """Extractor for the works of an AO3 series (/series/ID)"""
+    subcategory = "series"
+    pattern = BASE_PATTERN + r"(/series/(\d+))"
+    example = "https://archiveofourown.org/series/12345"
+
+
+class Ao3TagExtractor(Ao3Extractor):
+    """Extractor for AO3 works listed under a tag (/tags/TAG/works)"""
+    subcategory = "tag"
+    pattern = BASE_PATTERN + r"(/tags/([^/?#]+)/works(?:/?\?.+)?)"
+    example = "https://archiveofourown.org/tags/TAG/works"
+
+
+class Ao3SearchExtractor(Ao3Extractor):
+    """Extractor for AO3 work search results (/works/search?QUERY)"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"(/works/search/?\?.+)"
+    example = "https://archiveofourown.org/works/search?work_search[query]=air"
+
+
+class Ao3UserExtractor(Ao3Extractor):
+    """Extractor for an AO3 user profile; dispatches to sub-extractors"""
+    subcategory = "user"
+    pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)"
+               r"(?:/profile)?/?(?:$|\?|#)")
+    example = "https://archiveofourown.org/users/USER"
+
+    def initialize(self):
+        pass
+
+    def items(self):
+        prefix = self.root + "/users/" + self.groups[0]
+        return self._dispatch_extractors((
+            (Ao3UserWorksExtractor   , prefix + "/works"),
+            (Ao3UserSeriesExtractor  , prefix + "/series"),
+            (Ao3UserBookmarkExtractor, prefix + "/bookmarks"),
+        ), ("user-works", "user-series"))
+
+
+class Ao3UserWorksExtractor(Ao3Extractor):
+    """Extractor for the works listing of an AO3 user or pseud"""
+    subcategory = "user-works"
+    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
+               r"works(?:/?\?.+)?)")
+    example = "https://archiveofourown.org/users/USER/works"
+
+
+class Ao3UserSeriesExtractor(Ao3Extractor):
+    """Extractor for series of an AO3 user"""
+    subcategory = "user-series"
+    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
+               r"series(?:/?\?.+)?)")
+    example = "https://archiveofourown.org/users/USER/series"
+
+    def items(self):
+        """Queue a /series/ URL for each series ID yielded by series()"""
+        base = self.root + "/series/"
+        data = {"_extractor": Ao3SeriesExtractor}
+        for series_id in self.series():
+            yield Message.Queue, base + series_id, data
+
+    def series(self):
+        # 'pattern' has 3 groups; only the first (the path) is needed
+        return self._pagination(self.groups[0], '<li id="series_')
+
+
+class Ao3UserBookmarkExtractor(Ao3Extractor):
+    """Extractor for the bookmarks listing of an AO3 user or pseud"""
+    subcategory = "user-bookmark"
+    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
+               r"bookmarks(?:/?\?.+)?)")
+    example = "https://archiveofourown.org/users/USER/bookmarks"
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -26,6 +26,7 @@ CATEGORY_MAP = {
    "adultempire"    : "Adult Empire",
    "agnph"          : "AGNPH",
    "allgirlbooru"   : "All girl",
+    "ao3"            : "Archive of Our Own",
    "archivedmoe"    : "Archived.Moe",
    "archiveofsins"  : "Archive of Sins",
    "artstation"     : "ArtStation",
@@ -181,6 +182,11 @@ SUBCATEGORY_MAP = {
    "related-pin"  : "related Pins",
    "related-board": "",

+    "ao3": {
+        "user-works"   : "",
+        "user-series"  : "",
+        "user-bookmark": "Bookmarks",
+    },
    "artstation": {
        "artwork": "Artwork Listings",
        "collections": "",
--- a/test/results/ao3.py
+++ b/test/results/ao3.py
@@ -0,0 +1,191 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import ao3
+
+
+__tests__ = (
+{
+    "#url"     : "https://archiveofourown.org/works/47802076",
+    "#category": ("", "ao3", "work"),
+    "#class"   : ao3.Ao3WorkExtractor,
+    "#urls"    : "https://archiveofourown.org/downloads/47802076/The_Wildcard.pdf?updated_at=1720398424",
+
+    "author"   : "Flowers_for_ghouls",
+    "bookmarks": range(100, 300),
+    "chapters" : 27,
+    "comments" : range(800, 2000),
+    "date"     : "dt:2023-06-11 00:00:00",
+    "extension": "pdf",
+    "filename" : "The_Wildcard",
+    "id"       : 47802076,
+    "lang"     : "en",
+    "language" : "English",
+    "likes"    : range(1000, 2000),
+    "title"    : "The Wildcard",
+    "views"    : range(34000, 50000),
+    "words"    : 217549,
+
+    "categories": [
+        "Gen",
+        "M/M",
+    ],
+    "characters": [
+        "Dewdrop Ghoul | Fire Ghoul",
+        "Aether | Quintessence Ghoul",
+        "Multi Ghoul | Swiss Army Ghoul",
+        "Rain | Water Ghoul",
+        "Cirrus | Air Ghoulette",
+        "Cumulus | Air Ghoulette",
+        "Sunshine Ghoulette",
+        "Mountain | Earth Ghoul",
+        "Cardinal Copia",
+        "Phantom Ghoul",
+        "Aurora Ghoulette",
+        "Sister Imperator (Ghost Sweden Band)",
+    ],
+    "fandom": [
+        "Ghost (Sweden Band)",
+    ],
+    "rating": [
+        "Mature",
+    ],
+    "relationships": [
+        "Aether | Quintessence Ghoul/Dewdrop Ghoul | Fire Ghoul",
+        "Multi Ghoul | Swiss Army Ghoul/Rain | Water Ghoul",
+    ],
+    "summary": [
+        "Aether has been asked to stay at the ministry to manage the renovation of the new infirmary. It couldn’t have been worse timing. Barely days into the new tour, Dew realizes he’s carrying their first kit.",
+    ],
+    "tags": [
+        "Domestic Fluff",
+        "Pack Dynamics",
+        "gratuitous fluff",
+        "How do ghouls work?",
+        "they don't even know",
+        "but it's cute",
+        "Pregnant Dewdrop",
+        "Recreational Drug Use",
+        "Cowbell!",
+        "Protective Ghouls",
+        "no beta we die like Nihil",
+        "sick dewdrop",
+        "TW: Vomiting",
+        "Aether really loves Dew",
+        "Nesting",
+        "Ghoul Piles (Ghost Sweden Band)",
+        "Angst",
+        "Hurt/Comfort",
+        "Original Ghoul Kit(s) (Ghost Sweden Band)",
+        "Kit fic",
+    ],
+    "warnings": [
+        "No Archive Warnings Apply",
+    ],
+},
+
+{
+    "#url"     : "https://archiveofourown.org/works/47802076",
+    "#category": ("", "ao3", "work"),
+    "#class"   : ao3.Ao3WorkExtractor,
+    "#options" : {"formats": ["epub", "mobi", "azw3", "pdf", "html"]},
+    "#urls"    : (
+        "https://archiveofourown.org/downloads/47802076/The_Wildcard.epub?updated_at=1720398424",
+        "https://archiveofourown.org/downloads/47802076/The_Wildcard.mobi?updated_at=1720398424",
+        "https://archiveofourown.org/downloads/47802076/The_Wildcard.azw3?updated_at=1720398424",
+        "https://archiveofourown.org/downloads/47802076/The_Wildcard.pdf?updated_at=1720398424",
+        "https://archiveofourown.org/downloads/47802076/The_Wildcard.html?updated_at=1720398424",
+    ),
+},
+
+{
+    "#url"     : "https://archiveofourown.org/series/1903930",
+    "#category": ("", "ao3", "series"),
+    "#class"   : ao3.Ao3SeriesExtractor,
+    "#urls"    : (
+        "https://archiveofourown.org/works/26131546",
+        "https://archiveofourown.org/works/26291101",
+        "https://archiveofourown.org/works/26325292",
+    ),
+},
+
+{
+    "#url"     : "https://archiveofourown.org/tags/Sunshine%20(Ghost%20Sweden%20Band)/works",
+    "#category": ("", "ao3", "tag"),
+    "#class"   : ao3.Ao3TagExtractor,
+    "#pattern" : ao3.Ao3WorkExtractor.pattern,
+    "#range"   : "1-50",
+    "#count"   : 50,
+},
+
+{
+    "#url"     : "https://archiveofourown.org/works/search?work_search%5Bquery%5D=air+fire+ice+water",
+    "#category": ("", "ao3", "search"),
+    "#class"   : ao3.Ao3SearchExtractor,
+    "#pattern" : ao3.Ao3WorkExtractor.pattern,
+    "#range"   : "1-50",
+    "#count"   : 50,
+},
+
+{
+    "#url"     : "https://archiveofourown.org/users/Fyrelass",
+    "#category": ("", "ao3", "user"),
+    "#class"   : ao3.Ao3UserExtractor,
+    "#urls"    : (
+        "https://archiveofourown.org/users/Fyrelass/works",
+        "https://archiveofourown.org/users/Fyrelass/series",
+    ),
+},
+
+{
+    "#url"     : "https://archiveofourown.org/users/Fyrelass/profile",
+    "#category": ("", "ao3", "user"),
+    "#class"   : ao3.Ao3UserExtractor,
+},
+
+{
+    "#url"     : "https://archiveofourown.org/users/Fyrelass/pseuds/Aileen%20Autarkeia",
+    "#category": ("", "ao3", "user"),
+    "#class"   : ao3.Ao3UserExtractor,
+},
+
+{
+    "#url"     : "https://archiveofourown.org/users/Fyrelass/works",
+    "#category": ("", "ao3", "user-works"),
+    "#class"   : ao3.Ao3UserWorksExtractor,
+    "#urls"    : (
+        "https://archiveofourown.org/works/55035061",
+        "https://archiveofourown.org/works/52704457",
+        "https://archiveofourown.org/works/52502743",
+        "https://archiveofourown.org/works/52170409",
+        "https://archiveofourown.org/works/52078558",
+        "https://archiveofourown.org/works/51699982",
+        "https://archiveofourown.org/works/51975193",
+        "https://archiveofourown.org/works/51633877",
+        "https://archiveofourown.org/works/51591436",
+        "https://archiveofourown.org/works/50860891",
+    ),
+},
+
+{
+    "#url"     : "https://archiveofourown.org/users/Fyrelass/series",
+    "#category": ("", "ao3", "user-series"),
+    "#class"   : ao3.Ao3UserSeriesExtractor,
+    "#urls"    : (
+        "https://archiveofourown.org/series/3821575",
+    ),
+},
+
+{
+    "#url"     : "https://archiveofourown.org/users/Fyrelass/bookmarks",
+    "#category": ("", "ao3", "user-bookmark"),
+    "#class"   : ao3.Ao3UserBookmarkExtractor,
+    "#pattern" : ao3.Ao3WorkExtractor.pattern,
+    "#range"   : "1-50",
+    "#count"   : 50,
+},
+
+)