# -*- coding: utf-8 -*-

# Copyright 2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://architizer.com/"""

from .common import GalleryExtractor, Extractor, Message
from .. import text


class ArchitizerProjectExtractor(GalleryExtractor):
    """Extractor for project pages on architizer.com"""
    category = "architizer"
    subcategory = "project"
    root = "https://architizer.com"
    directory_fmt = ("{category}", "{firm}", "{title}")
    filename_fmt = "{filename}.{extension}"
    archive_fmt = "{gid}_{num}"
    pattern = r"(?:https?://)?architizer\.com/projects/([^/?#]+)"
    test = ("https://architizer.com/projects/house-lo/", {
        "pattern": r"https://architizer-prod\.imgix\.net/media/mediadata"
                   r"/uploads/.+\.jpg$",
        "keyword": {
            "count": 27,
            "description": str,
            "firm": "Atelier Lina Bellovicova",
            "gid": "225496",
            "location": "Czechia",
            "num": int,
            "size": "1000 sqft - 3000 sqft",
            "slug": "house-lo",
            "status": "Built",
            "subcategory": "project",
            "title": "House LO",
            "type": "Residential › Private House",
            "year": "2018",
        },
    })

    def __init__(self, match):
        """Set up the extractor for the project slug captured by 'pattern'.

        The gallery URL is rebuilt from 'root' and the first match group,
        so every accepted input form maps to the same
        '<root>/projects/<slug>/' address.
        """
        url = "{}/projects/{}/".format(self.root, match.group(1))
        GalleryExtractor.__init__(self, match, url)

    def metadata(self, page):
        """Return a dict of project metadata extracted from 'page'.

        Keys: title, slug, gid, firm, location, type, status, year, size
        and description. Every value is the text found between a pair of
        begin/end markers.
        """
        # NOTE(review): presumably text.extract_from() returns a function
        # that searches forward from the previous hit, in which case the
        # entries below must stay in page order -- confirm before
        # reordering.
        extr = text.extract_from(page)
        # NOTE(review): the markup-like marker strings from "location"
        # onwards were stripped from the reviewed copy of this file and
        # have been reconstructed; verify each of them against the live
        # page markup.
        return {
            "title"      : extr("data-name='", "'"),
            "slug"       : extr("data-slug='", "'"),
            # keep only the part after the last "." of the data-gid value
            "gid"        : extr("data-gid='", "'").rpartition(".")[2],
            "firm"       : extr("data-firm-leaders-str='", "'"),
            "location"   : extr("<h2>", "<").strip(),
            "type"       : text.unescape(text.remove_html(extr(
                '<div class="title">Type</div>', '<br'))),
            "status"     : text.remove_html(extr(
                '<div class="title">STATUS</div>', '</')),
            "year"       : text.remove_html(extr(
                '<div class="title">YEAR</div>', '</')),
            "size"       : text.remove_html(extr(
                '<div class="title">SIZE</div>', '</')),
            # line-break tags become real newlines in the description
            "description": text.unescape(extr(
                '<span class="copy js-copy">', '</span></div>')
                .replace("<br />", "\n")),
        }

    def images(self, page):
        """Return a list of (url, None) tuples for all images on 'page'.

        URLs are read from the "og:image:secure_url" properties and end
        right before the first "?", i.e. without their query string.
        """
        return [
            (url, None)
            for url in text.extract_iter(
                page, "property='og:image:secure_url' content='", "?")
        ]


class ArchitizerFirmExtractor(Extractor):
    """Extractor for all projects of a firm"""
    category = "architizer"
    subcategory = "firm"
    root = "https://architizer.com"
    pattern = r"(?:https?://)?architizer\.com/firms/([^/?#]+)"
    test = ("https://architizer.com/firms/olson-kundig/", {
        "pattern": ArchitizerProjectExtractor.pattern,
        "count": ">= 90",
    })

    # NOTE(review): the reviewed copy spelled this 'init' and called
    # 'Extractor.init'; restored to '__init__' to match
    # ArchitizerProjectExtractor and so that 'self.firm' is set when the
    # extractor is constructed -- confirm against the Extractor base class.
    def __init__(self, match):
        """Store the firm slug captured by 'pattern' as 'self.firm'."""
        Extractor.__init__(self, match)
        self.firm = match.group(1)

    def items(self):
        """Yield one queue message per project listed on the firm page.

        Each message carries '_extractor': ArchitizerProjectExtractor as
        its data.
        """
        url = "{}/firms/{}/?requesting_merlin=pages".format(
            self.root, self.firm)
        page = self.request(url).text
        data = {"_extractor": ArchitizerProjectExtractor}

        # NOTE(review): the reviewed copy of this file is cut off inside
        # the extract_iter() call below. The marker strings, the "q/"
        # filter and the yield statement are a reconstruction -- verify
        # them against the upstream module before relying on this loop.
        for project in text.extract_iter(page, '<a href="/projects/', '"'):
            if not project.startswith("q/"):
                url = "{}/projects/{}".format(self.root, project)
                yield Message.Queue, url, data