diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index 9370cfb5..29671cb6 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -17,13 +17,11 @@ class WikimediaExtractor(BaseExtractor):
     """Base class for wikimedia extractors"""
     basecategory = "wikimedia"
     filename_fmt = "{filename} ({sha1[:8]}).{extension}"
-    directory_fmt = ("{category}", "{page}")
     archive_fmt = "{sha1}"
     request_interval = (1.0, 2.0)
 
     def __init__(self, match):
         BaseExtractor.__init__(self, match)
-        path = match.group(match.lastindex)
 
         if self.category == "wikimedia":
             self.category = self.root.split(".")[-2]
@@ -31,32 +29,6 @@ class WikimediaExtractor(BaseExtractor):
             self.category = "{}-{}".format(
                 self.category, self.root.partition(".")[0].rpartition("/")[2])
 
-        if path.startswith("wiki/"):
-            path = path[5:]
-
-        pre, sep, _ = path.partition(":")
-        prefix = pre.lower() if sep else None
-
-        self.title = path = text.unquote(path)
-        if prefix:
-            self.subcategory = prefix
-
-        if prefix == "category":
-            self.params = {
-                "generator": "categorymembers",
-                "gcmtitle" : path,
-                "gcmtype"  : "file",
-            }
-        elif prefix == "file":
-            self.params = {
-                "titles"   : path,
-            }
-        else:
-            self.params = {
-                "generator": "images",
-                "titles"   : path,
-            }
-
     def _init(self):
         api_path = self.config_instance("api-path")
         if api_path:
@@ -67,6 +39,22 @@ class WikimediaExtractor(BaseExtractor):
         else:
             self.api_url = self.root + "/api.php"
 
+    @staticmethod
+    def prepare(image):
+        """Adjust the content of a image object"""
+        image["metadata"] = {
+            m["name"]: m["value"]
+            for m in image["metadata"] or ()}
+        image["commonmetadata"] = {
+            m["name"]: m["value"]
+            for m in image["commonmetadata"] or ()}
+
+        filename = image["canonicaltitle"]
+        image["filename"], _, image["extension"] = \
+            filename.partition(":")[2].rpartition(".")
+        image["date"] = text.parse_datetime(
+            image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
+
     def items(self):
         for info in self._pagination(self.params):
             try:
@@ -75,20 +63,7 @@ class WikimediaExtractor(BaseExtractor):
                 self.log.debug("Missing 'imageinfo' for %s", info)
                 continue
 
-            image["metadata"] = {
-                m["name"]: m["value"]
-                for m in image["metadata"] or ()}
-            image["commonmetadata"] = {
-                m["name"]: m["value"]
-                for m in image["commonmetadata"] or ()}
-
-            filename = image["canonicaltitle"]
-            image["filename"], _, image["extension"] = \
-                filename.partition(":")[2].rpartition(".")
-            image["date"] = text.parse_datetime(
-                image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
-            image["page"] = self.title
-
+            self.prepare(image)
             yield Message.Directory, image
             yield Message.Url, image["url"], image
 
@@ -181,5 +156,40 @@ BASE_PATTERN = WikimediaExtractor.update({
 class WikimediaArticleExtractor(WikimediaExtractor):
     """Extractor for wikimedia articles"""
     subcategory = "article"
+    directory_fmt = ("{category}", "{page}")
     pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
     example = "https://en.wikipedia.org/wiki/TITLE"
+
+    def __init__(self, match):
+        WikimediaExtractor.__init__(self, match)
+
+        path = match.group(match.lastindex)
+        if path.startswith("wiki/"):
+            path = path[5:]
+
+        pre, sep, _ = path.partition(":")
+        prefix = pre.lower() if sep else None
+
+        self.title = path = text.unquote(path)
+        if prefix:
+            self.subcategory = prefix
+
+        if prefix == "category":
+            self.params = {
+                "generator": "categorymembers",
+                "gcmtitle" : path,
+                "gcmtype"  : "file",
+            }
+        elif prefix == "file":
+            self.params = {
+                "titles"   : path,
+            }
+        else:
+            self.params = {
+                "generator": "images",
+                "titles"   : path,
+            }
+
+    def prepare(self, image):
+        WikimediaExtractor.prepare(image)
+        image["page"] = self.title