# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Postmill instances"""

import re
from .common import BaseExtractor, Message
from .. import text, exception


class PostmillExtractor(BaseExtractor):
    """Base class for Postmill extractors"""
    basecategory = "postmill"
    directory_fmt = ("{category}", "{instance}", "{forum}")
    filename_fmt = "{id}_{title[:220]}.{extension}"
    archive_fmt = "{filename}"

    def _init(self):
        self.instance = self.root.partition("://")[2]
        self.save_link_post_body = self.config("save-link-post-body", False)
        self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search
        # the regex below and the HTML marker strings used in items() are
        # assumed from typical Postmill markup and may need adjusting
        self._search_image_tag = re.compile(
            r'<a class="submission__image-link"').search

    def items(self):
        for post_url in self.post_urls():
            page = self.request(post_url).text
            extr = text.extract_from(page)

            title = text.unescape(extr(
                '<meta property="og:title" content="', '">'))
            date = text.parse_datetime(extr(
                '<meta property="article:published_time" content="', '">'))
            username = extr(
                '<meta property="article:author" content="', '">')
            post_canonical_url = text.unescape(extr(
                '<link rel="canonical" href="', '">'))

            url = text.unescape(extr(
                '<h1 class="submission__title unheaderize inline"><a href="',
                '"'))
            body = extr(
                '<div class="submission__body break-text text-flow">',
                '</div>')

            match = self._search_canonical_url(post_canonical_url)
            forum = match.group(1)
            id = int(match.group(2))

            # a relative submission URL points back to the post itself,
            # i.e. the submission is a text (self) post
            is_text_post = url.startswith("/")
            is_image_post = self._search_image_tag(page) is not None
            data = {
                "title": title,
                "date": date,
                "username": username,
                "forum": forum,
                "id": id,
                "flair": [text.unescape(i) for i in text.extract_iter(
                    page, '<span class="flair__label">', '</span>')],
                "instance": self.instance,
            }

            urls = []
            if is_text_post or self.save_link_post_body:
                urls.append((Message.Url, "text:" + body))
            if is_image_post:
                urls.append((Message.Url, url))
            elif not is_text_post:
                urls.append((Message.Queue, url))

            data["count"] = len(urls)
            yield Message.Directory, data
            for data["num"], (msg, url) in enumerate(urls, 1):
                if url.startswith("text:"):
                    data["filename"], data["extension"] = "", "htm"
                else:
                    data = text.nameext_from_url(url, data)

                yield msg, url, data
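

# PostmillExtractor.items() iterates over self.post_urls(), which is not
# defined by the base class; a concrete subclass is expected to supply it
# and return the submission URLs to process.  The class below is a minimal
# illustrative sketch of that contract: its name, subcategory, URL layout,
# and match-group indices are assumptions, not part of the original module.
class ExamplePostmillPostExtractor(PostmillExtractor):
    """Illustrative single-post extractor (sketch only)"""
    subcategory = "post"

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        self.forum = match.group(3)    # assumed capture group for the forum
        self.post_id = match.group(4)  # assumed capture group for the ID

    def post_urls(self):
        # a single submission page for items() to fetch and parse
        return (self.root + "/f/" + self.forum + "/" + self.post_id,)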


class PostmillSubmissionsExtractor(PostmillExtractor):
    """Base class for Postmill submissions extractors"""
    whitelisted_parameters = ()

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        groups = match.groups()
        self.base = groups[-3]
        self.sorting_path = groups[-2] or ""
        self.query = {key: value for key, value in text.parse_query(
            groups[-1]).items() if self.acceptable_query(key)}

    def items(self):
        url = self.root + self.base + self.sorting_path

        while url:
            response = self.request(url, params=self.query)
            if response.history:
                redirect_url = response.url
                if redirect_url == self.root + "/login":
                    raise exception.StopExtraction(
                        "HTTP redirect to login page (%s)", redirect_url)
            page = response.text

            # the HTML markers below are assumed from typical Postmill markup
            for nav in text.extract_iter(
                    page, '<nav class="submission__nav', '</nav>'):
                post_url = text.unescape(text.extr(nav, '<a href="', '"'))
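
    # __init__ above keeps only query parameters accepted by
    # acceptable_query(), which is not defined in this part of the module.
    # The method below is an illustrative sketch of that check, assuming it
    # simply consults the subclass whitelist; the original logic may differ.
    def acceptable_query(self, key):
        return key in self.whitelisted_parameters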