# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Postmill instances"""

import re
from .common import BaseExtractor, Message
from .. import text, exception


class PostmillExtractor(BaseExtractor):
    """Base class for Postmill extractors"""
    basecategory = "postmill"
    directory_fmt = ("{category}", "{instance}", "{forum}")
    filename_fmt = "{id}_{title[:220]}.{extension}"
    archive_fmt = "{filename}"

    def _init(self):
        self.instance = self.root.partition("://")[2]
        self.save_link_post_body = self.config("save-link-post-body", False)
        self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search
        # the regex below and the HTML marker strings used in items() are
        # assumed from typical Postmill markup and may need adjusting
        self._search_image_tag = re.compile(
            r'<a class="submission__image-link"').search

    def items(self):
        for post_url in self.post_urls():
            page = self.request(post_url).text
            extr = text.extract_from(page)

            title = text.unescape(extr(
                '<meta property="og:title" content="', '">'))
            date = text.parse_datetime(extr(
                '<meta property="article:published_time" content="', '">'))
            username = extr(
                '<meta property="article:author" content="', '">')
            post_canonical_url = text.unescape(extr(
                '<link rel="canonical" href="', '">'))

            url = text.unescape(extr(
                '<h1 class="submission__title unheaderize inline"><a href="',
                '"'))
            body = extr(
                '<div class="submission__body break-text text-flow">',
                '</div>')

            match = self._search_canonical_url(post_canonical_url)
            forum = match.group(1)
            id = int(match.group(2))

            # a relative submission URL points back to the post itself,
            # i.e. the submission is a text (self) post
            is_text_post = url.startswith("/")
            is_image_post = self._search_image_tag(page) is not None
            data = {
                "title": title,
                "date": date,
                "username": username,
                "forum": forum,
                "id": id,
                "flair": [text.unescape(i) for i in text.extract_iter(
                    page, '<span class="flair__label">', '</span>')],
                "instance": self.instance,
            }

            urls = []
            if is_text_post or self.save_link_post_body:
                urls.append((Message.Url, "text:" + body))
            if is_image_post:
                urls.append((Message.Url, url))
            elif not is_text_post:
                urls.append((Message.Queue, url))

            data["count"] = len(urls)
            yield Message.Directory, data
            for data["num"], (msg, url) in enumerate(urls, 1):
                if url.startswith("text:"):
                    data["filename"], data["extension"] = "", "htm"
                else:
                    data = text.nameext_from_url(url, data)

                yield msg, url, data
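

# PostmillExtractor.items() iterates over self.post_urls(), which is not
# defined by the base class; a concrete subclass is expected to supply it
# and return the submission URLs to process.  The class below is a minimal
# illustrative sketch of that contract: its name, subcategory, URL layout,
# and match-group indices are assumptions, not part of the original module.
class ExamplePostmillPostExtractor(PostmillExtractor):
    """Illustrative single-post extractor (sketch only)"""
    subcategory = "post"

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        self.forum = match.group(3)    # assumed capture group for the forum
        self.post_id = match.group(4)  # assumed capture group for the ID

    def post_urls(self):
        # a single submission page for items() to fetch and parse
        return (self.root + "/f/" + self.forum + "/" + self.post_id,)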


class PostmillSubmissionsExtractor(PostmillExtractor):
    """Base class for Postmill submissions extractors"""
    whitelisted_parameters = ()

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        groups = match.groups()
        self.base = groups[-3]
        self.sorting_path = groups[-2] or ""
        self.query = {key: value for key, value in text.parse_query(
            groups[-1]).items() if self.acceptable_query(key)}

    def items(self):
        url = self.root + self.base + self.sorting_path

        while url:
            response = self.request(url, params=self.query)
            if response.history:
                redirect_url = response.url
                if redirect_url == self.root + "/login":
                    raise exception.StopExtraction(
                        "HTTP redirect to login page (%s)", redirect_url)
            page = response.text

            # the HTML markers below are assumed from typical Postmill markup
            for nav in text.extract_iter(
                    page, '<nav class="submission__nav', '</nav>'):
                post_url = text.unescape(text.extr(nav, '<a href="', '"'))
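
    # __init__ above keeps only query parameters accepted by
    # acceptable_query(), which is not defined in this part of the module.
    # The method below is an illustrative sketch of that check, assuming it
    # simply consults the subclass whitelist; the original logic may differ.
    def acceptable_query(self, key):
        return key in self.whitelisted_parameters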