[wikimedia] refactor

pull/6050/head
ClosedPort22 1 month ago
parent 8ea75202ed
commit a3b2c88fbe
No known key found for this signature in database

@ -17,13 +17,11 @@ class WikimediaExtractor(BaseExtractor):
"""Base class for wikimedia extractors"""
basecategory = "wikimedia"
filename_fmt = "{filename} ({sha1[:8]}).{extension}"
directory_fmt = ("{category}", "{page}")
archive_fmt = "{sha1}"
request_interval = (1.0, 2.0)
def __init__(self, match):
BaseExtractor.__init__(self, match)
path = match.group(match.lastindex)
if self.category == "wikimedia":
self.category = self.root.split(".")[-2]
@ -31,32 +29,6 @@ class WikimediaExtractor(BaseExtractor):
self.category = "{}-{}".format(
self.category, self.root.partition(".")[0].rpartition("/")[2])
if path.startswith("wiki/"):
path = path[5:]
pre, sep, _ = path.partition(":")
prefix = pre.lower() if sep else None
self.title = path = text.unquote(path)
if prefix:
self.subcategory = prefix
if prefix == "category":
self.params = {
"generator": "categorymembers",
"gcmtitle" : path,
"gcmtype" : "file",
}
elif prefix == "file":
self.params = {
"titles" : path,
}
else:
self.params = {
"generator": "images",
"titles" : path,
}
def _init(self):
api_path = self.config_instance("api-path")
if api_path:
@ -67,6 +39,22 @@ class WikimediaExtractor(BaseExtractor):
else:
self.api_url = self.root + "/api.php"
@staticmethod
def prepare(image):
    """Normalize an image object in place

    Rewrites the 'metadata' and 'commonmetadata' entries from lists of
    {"name": ..., "value": ...} items into plain name -> value dicts,
    splits 'canonicaltitle' into 'filename' and 'extension', and stores
    the parsed 'timestamp' under 'date'.
    """
    # Both metadata fields share the same list-of-pairs layout;
    # a falsy value (e.g. None) becomes an empty dict.
    for key in ("metadata", "commonmetadata"):
        entries = image[key] or ()
        image[key] = {entry["name"]: entry["value"] for entry in entries}

    # Drop everything up to the first ':' of the title,
    # then split the remainder at its last '.'.
    _, _, name = image["canonicaltitle"].partition(":")
    stem, _, ext = name.rpartition(".")
    image["filename"] = stem
    image["extension"] = ext

    image["date"] = text.parse_datetime(
        image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
def items(self):
for info in self._pagination(self.params):
try:
@ -75,20 +63,7 @@ class WikimediaExtractor(BaseExtractor):
self.log.debug("Missing 'imageinfo' for %s", info)
continue
image["metadata"] = {
m["name"]: m["value"]
for m in image["metadata"] or ()}
image["commonmetadata"] = {
m["name"]: m["value"]
for m in image["commonmetadata"] or ()}
filename = image["canonicaltitle"]
image["filename"], _, image["extension"] = \
filename.partition(":")[2].rpartition(".")
image["date"] = text.parse_datetime(
image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
image["page"] = self.title
self.prepare(image)
yield Message.Directory, image
yield Message.Url, image["url"], image
@ -181,5 +156,40 @@ BASE_PATTERN = WikimediaExtractor.update({
class WikimediaArticleExtractor(WikimediaExtractor):
    """Extractor for wikimedia articles"""
    subcategory = "article"
    directory_fmt = ("{category}", "{page}")
    pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
    example = "https://en.wikipedia.org/wiki/TITLE"

    def __init__(self, match):
        """Derive the page title and API query parameters from the URL

        The last matched group is used as the page path. A leading
        "wiki/" is removed and the rest is stored as 'self.title'.
        When the title contains a ':', the lowercased text before it
        replaces 'self.subcategory' and selects the query:

        - "category": files that are members of the category
        - "file": the file page itself
        - anything else, or no prefix: images used on the page
        """
        WikimediaExtractor.__init__(self, match)

        path = match.group(match.lastindex)
        if path.startswith("wiki/"):
            path = path[5:]

        # Decode before looking for the ':' separator, so that an escaped
        # colon ("Category%3AName") is recognized just like a literal one.
        # NOTE(review): assumes text.unquote() percent-decodes - confirm.
        self.title = path = text.unquote(path)

        pre, sep, _ = path.partition(":")
        prefix = pre.lower() if sep else None
        if prefix:
            self.subcategory = prefix

        if prefix == "category":
            self.params = {
                "generator": "categorymembers",
                "gcmtitle" : path,
                "gcmtype"  : "file",
            }
        elif prefix == "file":
            self.params = {
                "titles"   : path,
            }
        else:
            self.params = {
                "generator": "images",
                "titles"   : path,
            }

    def prepare(self, image):
        """Apply the base class adjustments and add the page title"""
        WikimediaExtractor.prepare(image)
        image["page"] = self.title

Loading…
Cancel
Save