merge #2340: [wikimedia] add 'article' and 'category' extractors (#1443, #2906)

pull/5081/head
Mike Fährmann 8 months ago
commit 34a7afdbc1
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -1478,6 +1478,64 @@ Consider all listed sites to potentially be NSFW.
<td></td>
</tr>
<tr>
<td colspan="4"><strong>Wikimedia Instances</strong></td>
</tr>
<tr>
<td>Wikipedia</td>
<td>https://www.wikipedia.org/</td>
<td>Articles, Categories</td>
<td></td>
</tr>
<tr>
<td>Wiktionary</td>
<td>https://www.wiktionary.org/</td>
<td>Articles, Categories</td>
<td></td>
</tr>
<tr>
<td>Wikiquote</td>
<td>https://www.wikiquote.org/</td>
<td>Articles, Categories</td>
<td></td>
</tr>
<tr>
<td>Wikibooks</td>
<td>https://www.wikibooks.org/</td>
<td>Articles, Categories</td>
<td></td>
</tr>
<tr>
<td>Wikisource</td>
<td>https://www.wikisource.org/</td>
<td>Articles, Categories</td>
<td></td>
</tr>
<tr>
<td>Wikinews</td>
<td>https://www.wikinews.org/</td>
<td>Articles, Categories</td>
<td></td>
</tr>
<tr>
<td>Wikiversity</td>
<td>https://www.wikiversity.org/</td>
<td>Articles, Categories</td>
<td></td>
</tr>
<tr>
<td>Wikispecies</td>
<td>https://species.wikimedia.org/</td>
<td>Articles, Categories</td>
<td></td>
</tr>
<tr>
<td>Wikimedia Commons</td>
<td>https://commons.wikimedia.org/</td>
<td>Articles, Categories</td>
<td></td>
</tr>
<tr>
<td colspan="4"><strong>Moebooru and MyImouto</strong></td>
</tr>

@ -178,6 +178,7 @@ modules = [
"weibo",
"wikiart",
"wikifeet",
"wikimedia",
"xhamster",
"xvideos",
"zerochan",

@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Ailothaen
# Copyright 2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for Wikimedia and Wikipedia"""
from .common import BaseExtractor, Message
from .. import text
class WikimediaExtractor(BaseExtractor):
    """Base class for wikimedia extractors

    Subclasses are expected to set 'self.params' to the MediaWiki query
    parameters selecting the files to fetch. This class sends the query,
    follows its continuation batches and yields one file per result page.
    """
    basecategory = "wikimedia"
    directory_fmt = ("{category}", "{page}")
    archive_fmt = "{sha1}"
    request_interval = (1.0, 2.0)

    def __init__(self, match):
        """Store the page title captured by the last group of 'match'"""
        BaseExtractor.__init__(self, match)

        # Titles taken from a URL are percent-encoded ('Caf%C3%A9').
        # The title is later handed to self.request() as a query parameter,
        # which is expected to URL-encode values itself; decode it here so
        # the API is not asked for a literal '%XX' title.
        from urllib.parse import unquote
        self.title = unquote(match.group(match.lastindex))

    def items(self):
        """Yield a Directory and a Url message for every file found"""
        for info in self._pagination(self.params):
            try:
                image = info["imageinfo"][0]
            except LookupError:
                # Page object without file data (e.g. a referenced file
                # that does not exist, or a batch that does not carry this
                # page's 'imageinfo'): nothing to download
                continue

            # Flatten [{"name": ..., "value": ...}, ...] into a plain dict;
            # the API may deliver null instead of a list
            for key in ("metadata", "commonmetadata"):
                image[key] = {
                    m["name"]: m["value"]
                    for m in image.get(key) or ()}

            # 'canonicaltitle' looks like 'File:NAME.EXT':
            # drop the namespace prefix, then split off the extension
            filename = image["canonicaltitle"]
            image["filename"], _, image["extension"] = \
                filename.partition(":")[2].rpartition(".")
            image["date"] = text.parse_datetime(
                image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
            image["page"] = self.title

            yield Message.Directory, image
            yield Message.Url, image["url"], image

    def _pagination(self, params):
        """Yield every page object of a (possibly multi-batch) API query

        'params' is updated in place: the fixed 'action' and 'format'
        values are added, and the 'continue' values of each response are
        merged in before the next request is sent.

        https://www.mediawiki.org/wiki/API:Query
        https://opendata.stackexchange.com/questions/13381
        """
        url = self.root + "/w/api.php"
        params["action"] = "query"
        params["format"] = "json"

        while True:
            data = self.request(url, params=params).json()

            try:
                pages = data["query"]["pages"]
            except KeyError:
                # response without page results;
                # it may still announce a continuation below
                pass
            else:
                yield from pages.values()

            try:
                continuation = data["continue"]
            except KeyError:
                # no 'continue' object: this was the last batch
                break
            params.update(continuation)
# Register all supported instances with the base class and keep the
# returned value as the common URL prefix for the extractor patterns below.
BASE_PATTERN = WikimediaExtractor.update(dict(
    [
        # Projects reachable under several subdomains ('en.', 'www.', ...):
        # no fixed root, and the host patterns differ only in the
        # project name.
        (project, {
            "root"   : None,
            "pattern": r"[a-z]{2,}\." + project + r"\.org",
        })
        for project in (
            "wikipedia",
            "wiktionary",
            "wikiquote",
            "wikibooks",
            "wikisource",
            "wikinews",
            "wikiversity",
        )
    ] + [
        # Projects living on one fixed wikimedia.org subdomain
        ("wikispecies", {
            "root"   : "https://species.wikimedia.org",
            "pattern": r"species\.wikimedia\.org",
        }),
        ("wikimediacommons", {
            "root"   : "https://commons.wikimedia.org",
            "pattern": r"commons\.wikimedia\.org",
        }),
    ]
))
class WikimediaArticleExtractor(WikimediaExtractor):
    """Extractor for wikimedia articles"""
    subcategory = "article"
    pattern = BASE_PATTERN + r"/wiki/(?!Category:)([^/?#]+)"
    example = "https://en.wikipedia.org/wiki/TITLE"

    def _init(self):
        """Build the API query for the files used on the article"""
        # 'imageinfo' properties to request for each file
        properties = (
            "timestamp", "user", "userid", "comment", "canonicaltitle",
            "url", "size", "sha1", "mime", "metadata", "commonmetadata",
            "extmetadata", "bitdepth",
        )

        self.params = {
            "generator": "images",
            "titles"   : self.title,
            "prop"     : "imageinfo",
            "iiprop"   : "|".join(properties),
        }
class WikimediaCategoryExtractor(WikimediaExtractor):
    """Extractor for wikimedia categories"""
    subcategory = "category"
    pattern = BASE_PATTERN + r"/wiki/(Category:[^/?#]+)"
    example = "https://commons.wikimedia.org/wiki/Category:NAME"

    def _init(self):
        """Build the API query for the file members of the category"""
        # 'imageinfo' properties to request for each file
        properties = (
            "timestamp", "user", "userid", "comment", "canonicaltitle",
            "url", "size", "sha1", "mime", "metadata", "commonmetadata",
            "extmetadata", "bitdepth",
        )

        self.params = {
            "generator": "categorymembers",
            "gcmtitle" : self.title,
            "gcmtype"  : "file",
            "prop"     : "imageinfo",
            "iiprop"   : "|".join(properties),
        }

@ -139,6 +139,7 @@ CATEGORY_MAP = {
"webmshare" : "webmshare",
"webtoons" : "Webtoon",
"wikiart" : "WikiArt.org",
"wikimediacommons": "Wikimedia Commons",
"xbunkr" : "xBunkr",
"xhamster" : "xHamster",
"xvideos" : "XVideos",

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import wikimedia
# Test data for the 'wikibooks' instance: one article URL and one category
# URL, each with the (basecategory, category, subcategory) triple and the
# extractor class it should resolve to.
# NOTE(review): presumably consumed by the project's test runner - confirm.
__tests__ = (
{
"#url" : "https://www.wikibooks.org/wiki/Title",
"#category": ("wikimedia", "wikibooks", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
},
{
"#url" : "https://en.wikibooks.org/wiki/Category:Title",
"#category": ("wikimedia", "wikibooks", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
},
)

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import wikimedia
# Test data for the 'wikimediacommons' instance. The first URL is a 'File:'
# page; since it does not start with 'Category:' it is listed with the
# article extractor. The second URL is a category page.
# NOTE(review): presumably consumed by the project's test runner - confirm.
__tests__ = (
{
"#url" : "https://commons.wikimedia.org/wiki/File:Starr-050516-1367-Pimenta_dioica-flowers-Maunaloa-Molokai_(24762757525).jpg",
"#category": ("wikimedia", "wikimediacommons", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
},
{
"#url" : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro",
"#category": ("wikimedia", "wikimediacommons", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
},
)

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import wikimedia
# Test data for the 'wikinews' instance: one article URL and one category
# URL, each with the (basecategory, category, subcategory) triple and the
# extractor class it should resolve to.
# NOTE(review): presumably consumed by the project's test runner - confirm.
__tests__ = (
{
"#url" : "https://www.wikinews.org/wiki/Title",
"#category": ("wikimedia", "wikinews", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
},
{
"#url" : "https://en.wikinews.org/wiki/Category:Title",
"#category": ("wikimedia", "wikinews", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
},
)

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import wikimedia
# Test data for the 'wikipedia' instance.
# The first and last entries only list the category triple and extractor
# class for a URL. The 'Athena' entry additionally lists a pattern for the
# file URLs, an accepted range for the number of results, and the expected
# type or value of each metadata field.
# NOTE(review): presumably consumed by the project's test runner - confirm.
__tests__ = (
{
"#url" : "https://www.wikipedia.org/wiki/Title",
"#category": ("wikimedia", "wikipedia", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
},
{
"#url" : "https://en.wikipedia.org/wiki/Athena",
"#category": ("wikimedia", "wikipedia", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
"#pattern" : r"https://upload.wikimedia.org/wikipedia/.+",
"#count" : range(50, 100),
"bitdepth" : int,
"canonicaltitle": str,
"comment" : str,
"commonmetadata": dict,
"date" : "type:datetime",
"descriptionshorturl": str,
"descriptionurl": str,
"extension" : str,
"extmetadata" : dict,
"filename" : str,
"height" : int,
"metadata" : dict,
"mime" : r"re:image/\w+",
"page" : "Athena",
"sha1" : r"re:^[0-9a-f]{40}$",
"size" : int,
"timestamp" : str,
"url" : str,
"user" : str,
"userid" : int,
"width" : int,
},
{
"#url" : "https://en.wikipedia.org/wiki/Category:Physics",
"#category": ("wikimedia", "wikipedia", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
},
)

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import wikimedia
# Test data for the 'wikiquote' instance: one article URL and one category
# URL, each with the (basecategory, category, subcategory) triple and the
# extractor class it should resolve to.
# NOTE(review): presumably consumed by the project's test runner - confirm.
__tests__ = (
{
"#url" : "https://www.wikiquote.org/wiki/Title",
"#category": ("wikimedia", "wikiquote", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
},
{
"#url" : "https://en.wikiquote.org/wiki/Category:Title",
"#category": ("wikimedia", "wikiquote", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
},
)

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import wikimedia
# Test data for the 'wikisource' instance: one article URL and one category
# URL, each with the (basecategory, category, subcategory) triple and the
# extractor class it should resolve to.
# NOTE(review): presumably consumed by the project's test runner - confirm.
__tests__ = (
{
"#url" : "https://www.wikisource.org/wiki/Title",
"#category": ("wikimedia", "wikisource", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
},
{
"#url" : "https://en.wikisource.org/wiki/Category:Title",
"#category": ("wikimedia", "wikisource", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
},
)

@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import wikimedia
# Test data for the 'wikispecies' instance. The article entry also lists
# the single file URL expected for the page and a SHA-1 value for that
# file's content; the category entry only lists category triple and class.
# NOTE(review): presumably consumed by the project's test runner - confirm.
__tests__ = (
{
"#url" : "https://species.wikimedia.org/wiki/Geranospiza",
"#category": ("wikimedia", "wikispecies", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
"#urls" : "https://upload.wikimedia.org/wikipedia/commons/0/01/Geranospiza_caerulescens.jpg",
"#sha1_content": "3a17c14b15489928e4154f826af1c42afb5a523e",
},
{
"#url" : "https://species.wikimedia.org/wiki/Category:Names",
"#category": ("wikimedia", "wikispecies", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
},
)

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import wikimedia
# Test data for the 'wikiversity' instance: one article URL and one
# category URL, each with the (basecategory, category, subcategory) triple
# and the extractor class it should resolve to.
# NOTE(review): presumably consumed by the project's test runner - confirm.
__tests__ = (
{
"#url" : "https://www.wikiversity.org/wiki/Title",
"#category": ("wikimedia", "wikiversity", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
},
{
"#url" : "https://en.wikiversity.org/wiki/Category:Title",
"#category": ("wikimedia", "wikiversity", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
},
)

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import wikimedia
# Test data for the 'wiktionary' instance: one article URL and one category
# URL, each with the (basecategory, category, subcategory) triple and the
# extractor class it should resolve to.
# NOTE(review): presumably consumed by the project's test runner - confirm.
__tests__ = (
{
"#url" : "https://www.wiktionary.org/wiki/Word",
"#category": ("wikimedia", "wiktionary", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
},
{
"#url" : "https://en.wiktionary.org/wiki/Category:Words",
"#category": ("wikimedia", "wiktionary", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor,
},
)
Loading…
Cancel
Save