[livedoor] add blog- and post-extractors (#190)

6 years ago · 35919a9bb8
parent 3f513f1056
commit 35919a9bb8
4 changed files with 163 additions and 0 deletions
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@ -52,6 +52,7 @@ Kirei Cake           https://reader.kireicake.com/       Chapters, Manga
 KissManga            https://kissmanga.com/              Chapters, Manga
 Komikcast            https://komikcast.com/              Chapters, Manga
 Konachan             https://konachan.com/               Pools, Popular Images, Posts, Tag-Searches
+livedoor Blog        http://blog.livedoor.jp/            Blogs, Posts
 Luscious             https://luscious.net/               Albums, Search Results                             Optional
 Manga Fox            https://fanfox.net/                 Chapters
 Manga Here           https://www.mangahere.cc/           Chapters, Manga
--- a/gallery_dl/extractor/init.py
+++ b/gallery_dl/extractor/init.py
@ -47,6 +47,7 @@ modules = [
    "kissmanga",
    "komikcast",
    "konachan",
+    "livedoor",
    "luscious",
    "mangadex",
    "mangafox",
--- a/gallery_dl/extractor/livedoor.py
+++ b/gallery_dl/extractor/livedoor.py
@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for http://blog.livedoor.jp/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class LivedoorExtractor(Extractor):
+    """Base class for livedoor extractors
+
+    Subclasses provide posts(); items() turns each post that contains
+    images into one Directory message followed by one Url message per image.
+    """
+    category = "livedoor"
+    root = "http://blog.livedoor.jp"
+    img_root = "http://livedoor.blogimg.jp"
+    filename_fmt = "{post[id]}_{post[title]}_{num:>02}.{extension}"
+    directory_fmt = ("{category}", "{post[user]}")
+    archive_fmt = "{post[id]}_{hash}"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        # first group of the subclass URL patterns: the blog's name
+        self.user = match.group(1)
+
+    def items(self):
+        """Yield Version, Directory and Url messages for all posts"""
+        yield Message.Version, 1
+        for post in self.posts():
+            images = self._images(post)
+            # posts without any image produce no messages at all
+            if images:
+                yield Message.Directory, {"post": post}
+                for image in images:
+                    yield Message.Url, image["url"], image
+
+    def posts(self):
+        """Return an iterable with post objects"""
+
+    def _load(self, data, body):
+        """Build a post object from its metadata text and HTML body
+
+        'data' is the article metadata text taken from the page's script
+        section, 'body' is the article's HTML content (may be None when
+        the page did not contain the expected markers).
+        The returned dict still carries "body"; _images() removes it.
+        """
+        # the fields are read sequentially, each search starting where
+        # the previous one ended
+        pid  , pos = text.extract(data, "id : '"   , "'")
+        title, pos = text.extract(data, "title : '", "'", pos)
+        cat1 , pos = text.extract(data, "name:'"   , "'", pos)
+        cat2 , pos = text.extract(data, "name:'"   , "'", pos)
+        date , pos = text.extract(data, "date : '" , "'", pos)
+        # tags live in the HTML body, not in 'data'; only the value is
+        # needed, and a missing body is treated as empty text
+        tags = text.extract(body or "", '</dt><dd>', '</dl>')[0]
+
+        return {
+            "id"        : text.parse_int(pid),
+            "title"     : title,
+            "date"      : date,
+            "categories": [cat1, cat2],
+            "tags"      : text.split_html(tags),
+            "user"      : self.user,
+            "body"      : body,
+        }
+
+    def _images(self, post):
+        """Return a list of image objects for all <img> tags of a post
+
+        Removes the "body" entry from 'post'. Every returned dict holds
+        the image's URL and metadata plus a reference to 'post'.
+        <img> tags without a 'src' value are skipped; 'num' keeps the
+        tag's position in the body, so skipped tags leave gaps.
+        """
+        imgs = []
+        # a post whose body could not be located has no images
+        body = post.pop("body") or ""
+
+        for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
+            src = text.extract(img, 'src="', '"')[0]
+            alt = text.extract(img, 'alt="', '"')[0]
+
+            if not src:
+                # nothing to download; also avoids calling
+                # startswith() on None
+                continue
+            if src.startswith(self.img_root):
+                # drop the "-s" filename suffix
+                # (presumably the small/thumbnail variant - TODO confirm)
+                url = src.replace("-s.", ".")
+            else:
+                url = text.urljoin(self.root, src)
+            # split the last path segment into basename and extension
+            name, _, ext = url.rpartition("/")[2].rpartition(".")
+
+            imgs.append({
+                "url"      : url,
+                "num"      : num,
+                "hash"     : name,
+                "filename" : alt or name,
+                "extension": ext,
+                "post"     : post,
+            })
+
+        return imgs
+
+
+class LivedoorBlogExtractor(LivedoorExtractor):
+    """Extractor for a user's blog on blog.livedoor.jp"""
+    subcategory = "blog"
+    pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?&#])"
+    test = ("http://blog.livedoor.jp/zatsu_ke/", {
+        "range": "1-50",
+        "count": 50,
+        "pattern": r"http://livedoor.blogimg.jp/zatsu_ke/imgs/\w/\w/\w+\.\w+",
+        "keyword": {
+            "post": {
+                "categories": list,
+                "date": str,
+                "id": int,
+                "tags": list,
+                "title": str,
+                "user": "zatsu_ke"
+            },
+            "filename": str,
+            "hash": r"re:\w{4,}",
+            "num": int,
+        },
+    })
+
+    def posts(self):
+        """Yield post objects from every page of the blog
+
+        Starts at the blog's front page and follows the 'rel="next"'
+        link until a page no longer provides one.
+        """
+        next_url = "{}/{}".format(self.root, self.user)
+
+        while next_url:
+            html = self.request(next_url).text
+            offset = 0
+
+            # each article's metadata is followed by its HTML body,
+            # so both are read in turn with a shared search offset
+            data, offset = text.extract(
+                html, '.articles.push(', ');', offset)
+            while data:
+                body, offset = text.extract(
+                    html,
+                    '<div class="article-body-inner">',
+                    '<!-- articleBody End -->',
+                    offset,
+                )
+                yield self._load(data, body)
+                data, offset = text.extract(
+                    html, '.articles.push(', ');', offset)
+
+            # the link to the next page is searched for after the last
+            # article that was found
+            next_url = text.extract(
+                html, '<a rel="next" href="', '"', offset)[0]
+
+
+class LivedoorPostExtractor(LivedoorExtractor):
+    """Extractor for images from a blog post on blog.livedoor.jp"""
+    subcategory = "post"
+    pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/archives/(\d+)"
+    test = (
+        ("http://blog.livedoor.jp/zatsu_ke/archives/51493859.html", {
+            "url": "8826fe623f19dc868e7538e8519bf8491e92a0a2",
+            "keyword": "52fcba9253a000c339bcd658572d252e282626af",
+        }),
+        ("http://blog.livedoor.jp/amaumauma/archives/7835811.html", {
+            "url": "fc1d6a9557245b5a27d3a10bf0fa9922ef377215",
+            "keyword": "0229072abb5cd8a221df72e0ffdfc13336c0e9ce",
+        }),
+    )
+
+    def __init__(self, match):
+        LivedoorExtractor.__init__(self, match)
+        # second pattern group: the numeric ID from the archive URL
+        self.post_id = match.group(2)
+
+    def posts(self):
+        """Return a 1-tuple holding the post object of the requested post"""
+        post_url = "{}/{}/archives/{}.html".format(
+            self.root, self.user, self.post_id)
+        html = self.request(post_url).text
+
+        # metadata first; the body is then searched for behind it
+        data, offset = text.extract(html, 'articles :', '</script>')
+        body = text.extract(
+            html,
+            '<div class="article-body-inner">',
+            '<!-- articleBody End -->',
+            offset,
+        )[0]
+
+        post = self._load(data, body)
+        return (post,)
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@ -40,6 +40,7 @@ CATEGORY_MAP = {
    "jaiminisbox"    : "Jaimini's Box",
    "kireicake"      : "Kirei Cake",
    "kissmanga"      : "KissManga",
+    "livedoor"       : "livedoor Blog",
    "mangadex"       : "MangaDex",
    "mangafox"       : "Manga Fox",
    "mangahere"      : "Manga Here",