# -*- coding: utf-8 -*- # Copyright 2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for 4chan archives based on FoolFuuka""" from .common import Extractor, Message, SharedConfigMixin from .. import text, config import itertools import operator import re class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor): """Base extractor for FoolFuuka based boards/archives""" basecategory = "foolfuuka" subcategory = "thread" directory_fmt = ["{category}", "{board[shortname]}", "{thread_num}{title:? - //}"] filename_fmt = "{media[media]}" archive_fmt = "{board[shortname]}_{num}_{timestamp}" root = "" def __init__(self, match): Extractor.__init__(self) self.board, self.thread = match.groups() self.session.headers["Referer"] = self.root def items(self): op = True yield Message.Version, 1 for post in self.posts(): if op: yield Message.Directory, post op = False if not post["media"]: continue media = post["media"] url = media["media_link"] if not url and "remote_media_link" in media: url = self.remote(media) if url.startswith("/"): url = self.root + url post["extension"] = url.rpartition(".")[2] yield Message.Url, url, post def posts(self): url = self.root + "/_/api/chan/thread/" params = {"board": self.board, "num": self.thread} data = self.request(url, params=params).json()[self.thread] # sort post-objects by key posts = sorted(data.get("posts", {}).items()) posts = map(operator.itemgetter(1), posts) return itertools.chain((data["op"],), posts) def remote(self, media): needle = '