# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Generic information extractor"""

from .common import Extractor, Message
from .. import config, text
import os.path
import re


class GenericExtractor(Extractor):
    """Extractor for images in a generic web page."""

    category = "generic"
    directory_fmt = ("{category}", "{pageurl}")
    archive_fmt = "{imageurl}"

    # By default, the generic extractor is disabled
    # and the "g(eneric):" prefix in the url is required.
    # If the extractor is enabled, make the prefix optional.
    pattern = r"(?i)(?P<generic>g(?:eneric)?:)"
    if config.get(("extractor", "generic"), "enabled"):
        pattern += r"?"

    # The generic extractor pattern should match (almost) any valid url
    # Based on: https://tools.ietf.org/html/rfc3986#appendix-B
    pattern += (
        r"(?P<scheme>https?://)?"    # optional http(s) scheme
        r"(?P<domain>[-\w\.]+)"      # required domain
        r"(?P<path>/[^?#]*)?"        # optional path
        r"(?:\?(?P<query>[^#]*))?"   # optional query
        r"(?:\#(?P<fragment>.*))?"   # optional fragment
    )
    example = "generic:https://www.nongnu.org/lzip/"

    def __init__(self, match):
        Extractor.__init__(self, match)

        # Strip the "g(eneric):" prefix
        # and inform about "forced" or "fallback" mode
        if match.group('generic'):
            self.url = match.group(0).partition(":")[2]
        else:
            self.log.info("Falling back on generic information extractor.")
            self.url = match.group(0)

        # Make sure we have a scheme, or use https
        if match.group('scheme'):
            self.scheme = match.group('scheme')
        else:
            self.scheme = 'https://'
            self.url = self.scheme + self.url

        # Used to resolve relative image urls
        self.root = self.scheme + match.group('domain')

    def items(self):
        """Get page, extract metadata & images, yield them in suitable messages

        Adapted from common.GalleryExtractor.items()
        """
        page = self.request(self.url).text

        data = self.metadata(page)
        imgs = self.images(page)

        try:
            data["count"] = len(imgs)
        except TypeError:
            pass
        images = enumerate(imgs, 1)

        yield Message.Directory, data

        for data["num"], (url, imgdata) in images:
            if imgdata:
                data.update(imgdata)
                if "extension" not in imgdata:
                    text.nameext_from_url(url, data)
            else:
                text.nameext_from_url(url, data)
            yield Message.Url, url, data

    def metadata(self, page):
        """Extract generic webpage metadata, return them in a dict."""
        data = {}
        data['pageurl'] = self.url
        data['title'] = text.extr(page, '<title>', "</title>")
        data['description'] = text.extr(
            page, '<meta name="description" content="', '"')
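
# ---------------------------------------------------------------------------
# Illustration only (not part of the extractor): a minimal sketch of how the
# URL pattern above decomposes an input. The regex is copied here verbatim,
# with the "g(eneric):" prefix left optional, so the snippet does not depend
# on the config-driven `pattern` attribute or on a gallery-dl installation.
#
#   import re
#
#   url_re = re.compile(
#       r"(?i)(?P<generic>g(?:eneric)?:)?"
#       r"(?P<scheme>https?://)?"
#       r"(?P<domain>[-\w\.]+)"
#       r"(?P<path>/[^?#]*)?"
#       r"(?:\?(?P<query>[^#]*))?"
#       r"(?:\#(?P<fragment>.*))?"
#   )
#
#   m = url_re.match("generic:https://www.nongnu.org/lzip/")
#   m.group("generic")   # -> "generic:"  (prefix stripped in __init__)
#   m.group("scheme")    # -> "https://"
#   m.group("domain")    # -> "www.nongnu.org"
#   m.group("path")      # -> "/lzip/"
# ---------------------------------------------------------------------------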