# -*- coding: utf-8 -*- # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://hatenablog.com""" import re from .common import Extractor, Message from .. import text BASE_PATTERN = ( r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?" r"([\w-]+\.(?:hatenablog\.(?:com|jp)" r"|hatenadiary\.com|hateblo\.jp)))" ) QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$" class HatenablogExtractor(Extractor): """Base class for HatenaBlog extractors""" category = "hatenablog" directory_fmt = ("{category}", "{domain}") filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}" archive_fmt = "{filename}" def __init__(self, match): Extractor.__init__(self, match) self.domain = match.group(1) or match.group(2) def _init(self): self._find_img = re.compile(r']+)').finditer def _handle_article(self, article: str): extr = text.extract_from(article) date = text.parse_datetime(extr('