|
|
|
@ -17,13 +17,11 @@ class WikimediaExtractor(BaseExtractor):
|
|
|
|
|
"""Base class for wikimedia extractors"""
|
|
|
|
|
basecategory = "wikimedia"
|
|
|
|
|
filename_fmt = "{filename} ({sha1[:8]}).{extension}"
|
|
|
|
|
directory_fmt = ("{category}", "{page}")
|
|
|
|
|
archive_fmt = "{sha1}"
|
|
|
|
|
request_interval = (1.0, 2.0)
|
|
|
|
|
|
|
|
|
|
def __init__(self, match):
|
|
|
|
|
BaseExtractor.__init__(self, match)
|
|
|
|
|
path = match.group(match.lastindex)
|
|
|
|
|
|
|
|
|
|
if self.category == "wikimedia":
|
|
|
|
|
self.category = self.root.split(".")[-2]
|
|
|
|
@ -31,32 +29,6 @@ class WikimediaExtractor(BaseExtractor):
|
|
|
|
|
self.category = "{}-{}".format(
|
|
|
|
|
self.category, self.root.partition(".")[0].rpartition("/")[2])
|
|
|
|
|
|
|
|
|
|
if path.startswith("wiki/"):
|
|
|
|
|
path = path[5:]
|
|
|
|
|
|
|
|
|
|
pre, sep, _ = path.partition(":")
|
|
|
|
|
prefix = pre.lower() if sep else None
|
|
|
|
|
|
|
|
|
|
self.title = path = text.unquote(path)
|
|
|
|
|
if prefix:
|
|
|
|
|
self.subcategory = prefix
|
|
|
|
|
|
|
|
|
|
if prefix == "category":
|
|
|
|
|
self.params = {
|
|
|
|
|
"generator": "categorymembers",
|
|
|
|
|
"gcmtitle" : path,
|
|
|
|
|
"gcmtype" : "file",
|
|
|
|
|
}
|
|
|
|
|
elif prefix == "file":
|
|
|
|
|
self.params = {
|
|
|
|
|
"titles" : path,
|
|
|
|
|
}
|
|
|
|
|
else:
|
|
|
|
|
self.params = {
|
|
|
|
|
"generator": "images",
|
|
|
|
|
"titles" : path,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def _init(self):
|
|
|
|
|
api_path = self.config_instance("api-path")
|
|
|
|
|
if api_path:
|
|
|
|
@ -67,6 +39,22 @@ class WikimediaExtractor(BaseExtractor):
|
|
|
|
|
else:
|
|
|
|
|
self.api_url = self.root + "/api.php"
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def prepare(image):
|
|
|
|
|
"""Adjust the content of a image object"""
|
|
|
|
|
image["metadata"] = {
|
|
|
|
|
m["name"]: m["value"]
|
|
|
|
|
for m in image["metadata"] or ()}
|
|
|
|
|
image["commonmetadata"] = {
|
|
|
|
|
m["name"]: m["value"]
|
|
|
|
|
for m in image["commonmetadata"] or ()}
|
|
|
|
|
|
|
|
|
|
filename = image["canonicaltitle"]
|
|
|
|
|
image["filename"], _, image["extension"] = \
|
|
|
|
|
filename.partition(":")[2].rpartition(".")
|
|
|
|
|
image["date"] = text.parse_datetime(
|
|
|
|
|
image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
|
|
|
|
|
|
def items(self):
|
|
|
|
|
for info in self._pagination(self.params):
|
|
|
|
|
try:
|
|
|
|
@ -75,20 +63,7 @@ class WikimediaExtractor(BaseExtractor):
|
|
|
|
|
self.log.debug("Missing 'imageinfo' for %s", info)
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
image["metadata"] = {
|
|
|
|
|
m["name"]: m["value"]
|
|
|
|
|
for m in image["metadata"] or ()}
|
|
|
|
|
image["commonmetadata"] = {
|
|
|
|
|
m["name"]: m["value"]
|
|
|
|
|
for m in image["commonmetadata"] or ()}
|
|
|
|
|
|
|
|
|
|
filename = image["canonicaltitle"]
|
|
|
|
|
image["filename"], _, image["extension"] = \
|
|
|
|
|
filename.partition(":")[2].rpartition(".")
|
|
|
|
|
image["date"] = text.parse_datetime(
|
|
|
|
|
image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
|
|
|
|
|
image["page"] = self.title
|
|
|
|
|
|
|
|
|
|
self.prepare(image)
|
|
|
|
|
yield Message.Directory, image
|
|
|
|
|
yield Message.Url, image["url"], image
|
|
|
|
|
|
|
|
|
@ -181,5 +156,40 @@ BASE_PATTERN = WikimediaExtractor.update({
|
|
|
|
|
class WikimediaArticleExtractor(WikimediaExtractor):
|
|
|
|
|
"""Extractor for wikimedia articles"""
|
|
|
|
|
subcategory = "article"
|
|
|
|
|
directory_fmt = ("{category}", "{page}")
|
|
|
|
|
pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
|
|
|
|
|
example = "https://en.wikipedia.org/wiki/TITLE"
|
|
|
|
|
|
|
|
|
|
def __init__(self, match):
|
|
|
|
|
WikimediaExtractor.__init__(self, match)
|
|
|
|
|
|
|
|
|
|
path = match.group(match.lastindex)
|
|
|
|
|
if path.startswith("wiki/"):
|
|
|
|
|
path = path[5:]
|
|
|
|
|
|
|
|
|
|
pre, sep, _ = path.partition(":")
|
|
|
|
|
prefix = pre.lower() if sep else None
|
|
|
|
|
|
|
|
|
|
self.title = path = text.unquote(path)
|
|
|
|
|
if prefix:
|
|
|
|
|
self.subcategory = prefix
|
|
|
|
|
|
|
|
|
|
if prefix == "category":
|
|
|
|
|
self.params = {
|
|
|
|
|
"generator": "categorymembers",
|
|
|
|
|
"gcmtitle" : path,
|
|
|
|
|
"gcmtype" : "file",
|
|
|
|
|
}
|
|
|
|
|
elif prefix == "file":
|
|
|
|
|
self.params = {
|
|
|
|
|
"titles" : path,
|
|
|
|
|
}
|
|
|
|
|
else:
|
|
|
|
|
self.params = {
|
|
|
|
|
"generator": "images",
|
|
|
|
|
"titles" : path,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
def prepare(self, image):
|
|
|
|
|
WikimediaExtractor.prepare(image)
|
|
|
|
|
image["page"] = self.title
|
|
|
|
|