[ao3] add initial support (#6013)

pull/4791/merge
Mike Fährmann 3 days ago
parent 7d6520e15d
commit 638a676495
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -1279,6 +1279,20 @@ Extractor-specific Options
==========================
extractor.ao3.formats
---------------------
Type
* ``string``
* ``list`` of ``strings``
Default
``"pdf"``
Example
* ``"azw3,epub,mobi,pdf,html"``
* ``["azw3", "epub", "mobi", "pdf", "html"]``
Description
Format(s) to download for each work. Formats that a work does not offer are skipped with a warning.
extractor.artstation.external
-----------------------------
Type

@ -103,6 +103,12 @@ Consider all listed sites to potentially be NSFW.
<td>Firms, Projects</td>
<td></td>
</tr>
<tr>
<td>Archive of Our Own</td>
<td>https://archiveofourown.org/</td>
<td>Search Results, Series, Tag Searches, User Profiles, Bookmarks, Works</td>
<td></td>
</tr>
<tr>
<td>ArtStation</td>
<td>https://www.artstation.com/</td>

@ -23,6 +23,7 @@ modules = [
"8muses",
"adultempire",
"agnph",
"ao3",
"architizer",
"artstation",
"aryion",

@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-
# Copyright 2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://archiveofourown.org/"""
from .common import Extractor, Message
from .. import text, util
# Shared URL prefix for every AO3 extractor pattern: an optional scheme and an
# optional "www." followed by the literal host name. Both dots are escaped so
# they only match a real "." and not an arbitrary character.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?archiveofourown\.org"
class Ao3Extractor(Extractor):
    """Common base for all ao3 extractors

    By default a subclass is treated as a listing: every work ID found on
    the listing's pages is queued for Ao3WorkExtractor.
    """
    category = "ao3"
    root = "https://archiveofourown.org"
    categorytransfer = True
    request_interval = (0.5, 1.5)

    def items(self):
        """Yield one Queue message per work ID returned by works()"""
        work_root = self.root + "/works/"
        # a single metadata dict is shared by all queued work URLs
        queue_data = {"_extractor": Ao3WorkExtractor}

        for wid in self.works():
            yield Message.Queue, work_root + wid, queue_data

    def works(self):
        """Return an iterable of work IDs for the matched listing path"""
        listing_path = self.groups[0]
        return self._pagination(listing_path)

    def _pagination(self, path, needle='<li id="work_'):
        """Yield the text between 'needle' and the next '"' on every page

        Starts at 'path' (relative to the site root) and keeps following
        the 'rel="next"' link until a page has none.
        """
        current = path
        while True:
            html = self.request(self.root + current).text
            yield from text.extract_iter(html, needle, '"')

            next_href = text.extr(html, '<a rel="next" href="', '"')
            if next_href:
                current = text.unescape(next_href)
            else:
                return
class Ao3WorkExtractor(Ao3Extractor):
    """Extractor for an AO3 work"""
    subcategory = "work"
    directory_fmt = ("{category}", "{author}")
    filename_fmt = "{id} {title}.{extension}"
    archive_fmt = "{id}.{extension}"
    pattern = BASE_PATTERN + r"/works/(\d+)"
    example = "https://archiveofourown.org/works/12345"

    def _init(self):
        """Resolve the 'formats' option and set the 'view_adult' cookie

        'formats' may be unset (defaults to PDF only), falsy (download
        nothing), a comma-separated string, or an iterable of format names.
        String and iterable values are normalized the same way (lowercase,
        spaces removed) because available formats are looked up by their
        lowercased name in items().
        """
        formats = self.config("formats")
        if formats is None:
            self.formats = ("pdf",)
        elif not formats:
            self.formats = ()
        elif isinstance(formats, str):
            self.formats = formats.lower().replace(" ", "").split(",")
        else:
            self.formats = [
                str(fmt).lower().replace(" ", "")
                for fmt in formats
            ]

        # presumably bypasses the site's adult-content confirmation page
        self.cookies.set("view_adult", "true", domain="archiveofourown.org")

    def items(self):
        """Yield the work's metadata and one URL per requested format

        Emits a Directory message with the parsed metadata, then a Url
        message for every entry of 'self.formats' that the work's download
        menu offers. Unavailable formats only produce a warning.
        """
        work_id = self.groups[0]
        url = "{}/works/{}".format(self.root, work_id)
        # NOTE: 'extr' is called repeatedly on the same page; the calls below
        # are kept in their original order, which presumably has to follow
        # the order in which these elements appear in the HTML.
        extr = text.extract_from(self.request(url).text)

        # map lowercased format name -> download path from the download menu
        fmts = {}
        download = extr(' class="download"', "</ul>")
        for dl in text.extract_iter(download, ' href="', "</"):
            # each chunk looks like 'PATH">NAME'
            path, _, fmt_name = dl.rpartition('">')
            fmts[fmt_name.lower()] = path

        data = {
            "id"           : text.parse_int(work_id),
            "rating"       : text.split_html(
                extr('<dd class="rating tags">', "</dd>")),
            "warnings"     : text.split_html(
                extr('<dd class="warning tags">', "</dd>")),
            "categories"   : text.split_html(
                extr('<dd class="category tags">', "</dd>")),
            "fandom"       : text.split_html(
                extr('<dd class="fandom tags">', "</dd>")),
            "relationships": text.split_html(
                extr('<dd class="relationship tags">', "</dd>")),
            "characters"   : text.split_html(
                extr('<dd class="character tags">', "</dd>")),
            "tags"         : text.split_html(
                extr('<dd class="freeform tags">', "</dd>")),
            "lang"         : extr('<dd class="language" lang="', '"'),
            "series"       : extr('<dd class="series">', "</dd>"),
            "date"         : text.parse_datetime(
                extr('<dd class="published">', "<"), "%Y-%m-%d"),
            # counters use "," as thousands separator
            "words"        : text.parse_int(
                extr('<dd class="words">', "<").replace(",", "")),
            "chapters"     : text.parse_int(
                extr('<dd class="chapters">', "/")),
            "comments"     : text.parse_int(
                extr('<dd class="comments">', "<").replace(",", "")),
            "likes"        : text.parse_int(
                extr('<dd class="kudos">', "<").replace(",", "")),
            "bookmarks"    : text.parse_int(text.remove_html(
                extr('<dd class="bookmarks">', "</dd>")).replace(",", "")),
            "views"        : text.parse_int(
                extr('<dd class="hits">', "<").replace(",", "")),
            "title"        : text.unescape(
                extr(' class="title heading">', "<").strip()),
            "author"       : text.unescape(text.remove_html(
                extr(' class="byline heading">', "</h3>"))),
            "summary"      : text.split_html(
                extr(' class="heading">Summary:</h3>', "</div>")),
        }
        data["language"] = util.code_to_language(data["lang"])

        yield Message.Directory, data
        for fmt in self.formats:
            try:
                url = text.urljoin(self.root, fmts[fmt])
            except KeyError:
                self.log.warning("%s: Format '%s' not available", work_id, fmt)
            else:
                yield Message.Url, url, text.nameext_from_url(url, data)
class Ao3SeriesExtractor(Ao3Extractor):
    """Extractor for AO3 works of a series

    Only supplies URL matching; items(), works() and pagination are
    inherited from Ao3Extractor.
    """
    subcategory = "series"
    # group 1: the whole "/series/ID" path, group 2: the numeric series ID
    pattern = BASE_PATTERN + r"(/series/(\d+))"
    example = "https://archiveofourown.org/series/12345"
class Ao3TagExtractor(Ao3Extractor):
    """Extractor for AO3 works by tag

    Only supplies URL matching; items(), works() and pagination are
    inherited from Ao3Extractor.
    """
    subcategory = "tag"
    # group 1: the whole "/tags/TAG/works" path including an optional query
    # string, group 2: the tag name
    pattern = BASE_PATTERN + r"(/tags/([^/?#]+)/works(?:/?\?.+)?)"
    example = "https://archiveofourown.org/tags/TAG/works"
class Ao3SearchExtractor(Ao3Extractor):
    """Extractor for AO3 search results

    Only supplies URL matching; items(), works() and pagination are
    inherited from Ao3Extractor.
    """
    subcategory = "search"
    # group 1: "/works/search" plus its query string; the query string is
    # mandatory for this pattern to match
    pattern = BASE_PATTERN + r"(/works/search/?\?.+)"
    example = "https://archiveofourown.org/works/search?work_search[query]=air"
class Ao3UserExtractor(Ao3Extractor):
    """Extractor for an AO3 user profile

    Does not fetch anything itself; it forwards to the works, series and
    bookmark extractors for the matched user (or user pseud).
    """
    subcategory = "user"
    pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)"
               r"(?:/profile)?/?(?:$|\?|#)")
    example = "https://archiveofourown.org/users/USER"

    def initialize(self):
        """Intentionally do nothing"""

    def items(self):
        """Dispatch to the per-user sub-extractors"""
        # groups[0] is "USER" or "USER/pseuds/PSEUD"
        user_url = self.root + "/users/" + self.groups[0] + "/"

        targets = (
            (Ao3UserWorksExtractor   , user_url + "works"),
            (Ao3UserSeriesExtractor  , user_url + "series"),
            (Ao3UserBookmarkExtractor, user_url + "bookmarks"),
        )
        defaults = ("user-works", "user-series")
        return self._dispatch_extractors(targets, defaults)
class Ao3UserWorksExtractor(Ao3Extractor):
    """Extractor for works of an AO3 user

    Only supplies URL matching; items(), works() and pagination are
    inherited from Ao3Extractor.
    """
    subcategory = "user-works"
    # group 1: the whole listing path including an optional query string,
    # group 2: the user name, group 3: the pseud (None when absent)
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"works(?:/?\?.+)?)")
    example = "https://archiveofourown.org/users/USER/works"
class Ao3UserSeriesExtractor(Ao3Extractor):
    """Extractor for series of an AO3 user"""
    subcategory = "user-series"
    # group 1: the whole listing path including an optional query string,
    # group 2: the user name, group 3: the pseud (None when absent).
    # The query string part is non-capturing, so there are exactly 3 groups.
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"series(?:/?\?.+)?)")
    example = "https://archiveofourown.org/users/USER/series"

    def items(self):
        """Yield one Queue message per series ID, handled by
        Ao3SeriesExtractor"""
        base = self.root + "/series/"
        data = {"_extractor": Ao3SeriesExtractor}

        for series_id in self.series():
            yield Message.Queue, base + series_id, data

    def series(self):
        """Return an iterable of series IDs from the user's series listing"""
        # Only the full path (group 1) is needed. Unpacking self.groups into
        # four names would raise ValueError, since the pattern captures
        # only three groups.
        return self._pagination(self.groups[0], '<li id="series_')
class Ao3UserBookmarkExtractor(Ao3Extractor):
    """Extractor for bookmarked works of an AO3 user

    Only supplies URL matching; items(), works() and pagination (with the
    default '<li id="work_' needle) are inherited from Ao3Extractor.
    """
    subcategory = "user-bookmark"
    # group 1: the whole listing path including an optional query string,
    # group 2: the user name, group 3: the pseud (None when absent)
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"bookmarks(?:/?\?.+)?)")
    example = "https://archiveofourown.org/users/USER/bookmarks"

@ -26,6 +26,7 @@ CATEGORY_MAP = {
"adultempire" : "Adult Empire",
"agnph" : "AGNPH",
"allgirlbooru" : "All girl",
"ao3" : "Archive of Our Own",
"archivedmoe" : "Archived.Moe",
"archiveofsins" : "Archive of Sins",
"artstation" : "ArtStation",
@ -181,6 +182,11 @@ SUBCATEGORY_MAP = {
"related-pin" : "related Pins",
"related-board": "",
"ao3": {
"user-works" : "",
"user-series" : "",
"user-bookmark": "Bookmarks",
},
"artstation": {
"artwork": "Artwork Listings",
"collections": "",

@ -0,0 +1,191 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import ao3
# Expected results for the ao3 extractors. Keys starting with "#" describe
# the test itself (URL, extractor class, expected URLs, ...); all other keys
# are expected metadata values.
__tests__ = (
    # single work without options: only the PDF download is expected
    {
        "#url" : "https://archiveofourown.org/works/47802076",
        "#category": ("", "ao3", "work"),
        "#class" : ao3.Ao3WorkExtractor,
        "#urls" : "https://archiveofourown.org/downloads/47802076/The_Wildcard.pdf?updated_at=1720398424",
        "author" : "Flowers_for_ghouls",
        # counters that change over time are given as ranges
        "bookmarks": range(100, 300),
        "chapters" : 27,
        "comments" : range(800, 2000),
        "date" : "dt:2023-06-11 00:00:00",
        "extension": "pdf",
        "filename" : "The_Wildcard",
        "id" : 47802076,
        "lang" : "en",
        "language" : "English",
        "likes" : range(1000, 2000),
        "title" : "The Wildcard",
        "views" : range(34000, 50000),
        "words" : 217549,
        "categories": [
            "Gen",
            "M/M",
        ],
        "characters": [
            "Dewdrop Ghoul | Fire Ghoul",
            "Aether | Quintessence Ghoul",
            "Multi Ghoul | Swiss Army Ghoul",
            "Rain | Water Ghoul",
            "Cirrus | Air Ghoulette",
            "Cumulus | Air Ghoulette",
            "Sunshine Ghoulette",
            "Mountain | Earth Ghoul",
            "Cardinal Copia",
            "Phantom Ghoul",
            "Aurora Ghoulette",
            "Sister Imperator (Ghost Sweden Band)",
        ],
        "fandom": [
            "Ghost (Sweden Band)",
        ],
        "rating": [
            "Mature",
        ],
        "relationships": [
            "Aether | Quintessence Ghoul/Dewdrop Ghoul | Fire Ghoul",
            "Multi Ghoul | Swiss Army Ghoul/Rain | Water Ghoul",
        ],
        "summary": [
            "Aether has been asked to stay at the ministry to manage the renovation of the new infirmary. It couldnt have been worse timing. Barely days into the new tour, Dew realizes hes carrying their first kit.",
        ],
        "tags": [
            "Domestic Fluff",
            "Pack Dynamics",
            "gratuitous fluff",
            "How do ghouls work?",
            "they don't even know",
            "but it's cute",
            "Pregnant Dewdrop",
            "Recreational Drug Use",
            "Cowbell!",
            "Protective Ghouls",
            "no beta we die like Nihil",
            "sick dewdrop",
            "TW: Vomiting",
            "Aether really loves Dew",
            "Nesting",
            "Ghoul Piles (Ghost Sweden Band)",
            "Angst",
            "Hurt/Comfort",
            "Original Ghoul Kit(s) (Ghost Sweden Band)",
            "Kit fic",
        ],
        "warnings": [
            "No Archive Warnings Apply",
        ],
    },
    # same work with all five formats requested; the expected URLs are
    # listed in the same order as the requested formats
    {
        "#url" : "https://archiveofourown.org/works/47802076",
        "#category": ("", "ao3", "work"),
        "#class" : ao3.Ao3WorkExtractor,
        "#options" : {"formats": ["epub", "mobi", "azw3", "pdf", "html"]},
        "#urls" : (
            "https://archiveofourown.org/downloads/47802076/The_Wildcard.epub?updated_at=1720398424",
            "https://archiveofourown.org/downloads/47802076/The_Wildcard.mobi?updated_at=1720398424",
            "https://archiveofourown.org/downloads/47802076/The_Wildcard.azw3?updated_at=1720398424",
            "https://archiveofourown.org/downloads/47802076/The_Wildcard.pdf?updated_at=1720398424",
            "https://archiveofourown.org/downloads/47802076/The_Wildcard.html?updated_at=1720398424",
        ),
    },
    # series: expected to yield the URLs of its works
    {
        "#url" : "https://archiveofourown.org/series/1903930",
        "#category": ("", "ao3", "series"),
        "#class" : ao3.Ao3SeriesExtractor,
        "#urls" : (
            "https://archiveofourown.org/works/26131546",
            "https://archiveofourown.org/works/26291101",
            "https://archiveofourown.org/works/26325292",
        ),
    },
    # tag listing: results are checked against the work URL pattern
    {
        "#url" : "https://archiveofourown.org/tags/Sunshine%20(Ghost%20Sweden%20Band)/works",
        "#category": ("", "ao3", "tag"),
        "#class" : ao3.Ao3TagExtractor,
        "#pattern" : ao3.Ao3WorkExtractor.pattern,
        "#range" : "1-50",
        "#count" : 50,
    },
    # search results: results are checked against the work URL pattern
    {
        "#url" : "https://archiveofourown.org/works/search?work_search%5Bquery%5D=air+fire+ice+water",
        "#category": ("", "ao3", "search"),
        "#class" : ao3.Ao3SearchExtractor,
        "#pattern" : ao3.Ao3WorkExtractor.pattern,
        "#range" : "1-50",
        "#count" : 50,
    },
    # user profile: expected to dispatch to the works and series listings
    {
        "#url" : "https://archiveofourown.org/users/Fyrelass",
        "#category": ("", "ao3", "user"),
        "#class" : ao3.Ao3UserExtractor,
        "#urls" : (
            "https://archiveofourown.org/users/Fyrelass/works",
            "https://archiveofourown.org/users/Fyrelass/series",
        ),
    },
    # ".../profile" URL variant: only the matching extractor class is given
    {
        "#url" : "https://archiveofourown.org/users/Fyrelass/profile",
        "#category": ("", "ao3", "user"),
        "#class" : ao3.Ao3UserExtractor,
    },
    # ".../pseuds/PSEUD" URL variant: only the matching extractor class is
    # given
    {
        "#url" : "https://archiveofourown.org/users/Fyrelass/pseuds/Aileen%20Autarkeia",
        "#category": ("", "ao3", "user"),
        "#class" : ao3.Ao3UserExtractor,
    },
    # works of a user
    {
        "#url" : "https://archiveofourown.org/users/Fyrelass/works",
        "#category": ("", "ao3", "user-works"),
        "#class" : ao3.Ao3UserWorksExtractor,
        "#urls" : (
            "https://archiveofourown.org/works/55035061",
            "https://archiveofourown.org/works/52704457",
            "https://archiveofourown.org/works/52502743",
            "https://archiveofourown.org/works/52170409",
            "https://archiveofourown.org/works/52078558",
            "https://archiveofourown.org/works/51699982",
            "https://archiveofourown.org/works/51975193",
            "https://archiveofourown.org/works/51633877",
            "https://archiveofourown.org/works/51591436",
            "https://archiveofourown.org/works/50860891",
        ),
    },
    # series of a user
    {
        "#url" : "https://archiveofourown.org/users/Fyrelass/series",
        "#category": ("", "ao3", "user-series"),
        "#class" : ao3.Ao3UserSeriesExtractor,
        "#urls" : (
            "https://archiveofourown.org/series/3821575",
        ),
    },
    # bookmarks of a user: results are checked against the work URL pattern
    {
        "#url" : "https://archiveofourown.org/users/Fyrelass/bookmarks",
        "#category": ("", "ao3", "user-bookmark"),
        "#class" : ao3.Ao3UserBookmarkExtractor,
        "#pattern" : ao3.Ao3WorkExtractor.pattern,
        "#range" : "1-50",
        "#count" : 50,
    },
)
Loading…
Cancel
Save