diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d046aad4..188d8294 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -259,6 +259,12 @@ Consider all listed sites to potentially be NSFW. Folders + + HatenaBlog + https://hatenablog.com + Archive, Individual Posts, Home Feed, Search Results + + HBrowse https://www.hbrowse.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9e33f2c3..26ce2093 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -53,6 +53,7 @@ modules = [ "gelbooru_v01", "gelbooru_v02", "gofile", + "hatenablog", "hbrowse", "hentai2read", "hentaicosplays", diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py new file mode 100644 index 00000000..59e2f94e --- /dev/null +++ b/gallery_dl/extractor/hatenablog.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hatenablog.com""" + +import re +from .common import Extractor, Message +from .. import text + + +BASE_PATTERN = ( + r"(?:hatenablog:https?://([^/]+)|(?:https?://)?" + r"([\w-]+\.(?:hatenablog\.com|hatenablog\.jp" + r"|hatenadiary\.com|hateblo\.jp)))" +) +QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$" + + +class HatenaBlogExtractor(Extractor): + """Base class for HatenaBlog extractors""" + category = "hatenablog" + directory_fmt = ("{category}", "{domain}") + filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}" + archive_fmt = "{filename}" + + def __init__(self, match): + Extractor.__init__(self, match) + + self.domain = match.group(1) or match.group(2) + self._find_img = re.compile(r'').finditer + self._is_image = re.compile( + r'(?: |^)class="hatena-fotolife"(?: |$)').search + self._find_img_src = re.compile(r'(?: |^)src="(.+?)"(?: |$)').search + + def _handle_article(self, article: str): + extr = text.extract_from(article) + date = text.parse_datetime(extr('