# -*- coding: utf-8 -*-

# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for 4chan archives based on FoolFuuka"""

from .common import Extractor, Message, SharedConfigMixin, generate_extractors
from .. import text
import itertools
import operator


class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
    """Base extractor for FoolFuuka based boards/archives"""
    basecategory = "foolfuuka"
    subcategory = "thread"
    directory_fmt = ("{category}", "{board[shortname]}",
                     "{thread_num}{title:? - //}")
    archive_fmt = "{board[shortname]}_{num}_{timestamp}"
    pattern_fmt = r"/([^/]+)/thread/(\d+)"
    # "direct" makes __init__ replace self.remote with self._remote_direct
    external = "default"

    def __init__(self, match):
        """Store board and thread ID from the two groups of 'match'."""
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()
        self.session.headers["Referer"] = self.root
        if self.external == "direct":
            # NOTE(review): _remote_direct is not part of the visible source;
            # presumably defined further down in this class -- confirm.
            self.remote = self._remote_direct

    def items(self):
        """Yield the messages for this thread.

        Emits Message.Version once, Message.Directory for the first post
        returned by posts(), and one Message.Url for every post that has
        media with a usable URL.  'filename' and 'extension' are added to
        each post before it is emitted with its URL.
        """
        yield Message.Version, 1

        first = True
        for post in self.posts():
            if first:
                # the first post (the OP) supplies the directory metadata
                yield Message.Directory, post
                first = False

            media = post.get("media")
            if not media:
                continue

            url = media["media_link"]
            if not url and "remote_media_link" in media:
                url = self.remote(media)
            if not url:
                # Neither a direct link nor a resolvable remote link.
                # Calling url.startswith() on None would raise
                # AttributeError, so skip this post instead.
                continue
            if url.startswith("/"):
                # relative link: prefix with the archive's root URL
                url = self.root + url

            post["filename"], _, post["extension"] = \
                media["media"].rpartition(".")
            yield Message.Url, url, post

    def posts(self):
        """Return an iterable with all posts in this thread"""
        url = self.root + "/_/api/chan/thread/"
        params = {"board": self.board, "num": self.thread}
        data = self.request(url, params=params).json()[self.thread]

        # sort post-objects by key
        posts = sorted(data.get("posts", {}).items())
        posts = map(operator.itemgetter(1), posts)

        # the OP comes first, followed by the sorted replies
        return itertools.chain((data["op"],), posts)

    def remote(self, media):
        """Resolve a remote media link"""
        # NOTE(review): the available source is cut off in the middle of
        # this method's first statement; the visible fragment is:
        #     needle = '
        # The rest of the body (and anything following it in this file)
        # was not visible and has deliberately not been reconstructed.
        # Restore it from the complete file before using this module.