gallery-dl/gallery_dl/extractor/seiga.py

# -*- coding: utf-8 -*-

# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://seiga.nicovideo.jp/"""

from .common import Extractor, Message
from .. import text, util, exception


class SeigaExtractor(Extractor):
    """Common functionality shared by the seiga extractors"""
    category = "seiga"
    archive_fmt = "{image_id}"
    cookiedomain = ".nicovideo.jp"
    root = "https://seiga.nicovideo.jp"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # handed to util.advance() in items(); raised by skip() in subclasses
        self.start_image = 0

    def items(self):
        """Yield a Directory message followed by one Url message per image"""
        if not self._check_cookies(("user_session",)):
            raise exception.StopExtraction("'user_session' cookie required")

        entries = iter(self.get_images())

        # the first element is the metadata dict; it is reused and
        # updated in place for every image entry that follows
        metadata = next(entries)
        yield Message.Directory, metadata

        for entry in util.advance(entries, self.start_image):
            metadata.update(entry)
            metadata["extension"] = None
            source = self.get_image_url(metadata["image_id"])
            yield Message.Url, source, metadata

    def get_images(self):
        """Return an iterable of metadata followed by image entries

        The base implementation is empty; subclasses provide their own.
        """

    def get_image_url(self, image_id):
        """Resolve the download URL for the image with id 'image_id'

        Sends a HEAD request to the image's '/image/source/' endpoint
        without following redirects and derives the result from the
        'location' response header.

        Raises exception.StopExtraction if that header points to the
        login page.
        """
        endpoint = "{}/image/source/{}".format(self.root, image_id)
        redirect = self.request(
            endpoint, method="HEAD", allow_redirects=False,
            notfound="image").headers["location"]

        if "nicovideo.jp/login" not in redirect:
            # only the first '/o/' segment is exchanged for '/priv/'
            return redirect.replace("/o/", "/priv/", 1)

        raise exception.StopExtraction(
            "HTTP redirect to login page (%s)", redirect.partition("?")[0])


class SeigaUserExtractor(SeigaExtractor):
    """Extractor for all illustrations of a seiga.nicovideo.jp user"""
    subcategory = "user"
    directory_fmt = ("{category}", "{user[id]}")
    filename_fmt = "{category}_{user[id]}_{image_id}.{extension}"
    pattern = (r"(?:https?://)?(?:www\.|(?:sp\.)?seiga\.)?nicovideo\.jp/"
               r"user/illust/(\d+)(?:\?(?:[^&]+&)*sort=([^&#]+))?")
    test = (
        ("https://seiga.nicovideo.jp/user/illust/39537793", {
            "pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+",
            "count": ">= 4",
            "keyword": {
                "user": {
                    "id": 39537793,
                    "message": str,
                    "name": str,
                },
                "clips": int,
                "comments": int,
                "count": int,
                "extension": None,
                "image_id": int,
                "title": str,
                "views": int,
            },
        }),
        ("https://seiga.nicovideo.jp/user/illust/79433", {
            "exception": exception.NotFoundError,
        }),
        ("https://seiga.nicovideo.jp/user/illust/39537793"
         "?sort=image_view&target=illust_all"),
        ("https://sp.seiga.nicovideo.jp/user/illust/39537793"),
    )

    def __init__(self, match):
        SeigaExtractor.__init__(self, match)
        # group 1: numeric user id, group 2: optional 'sort' query value
        self.user_id, self.order = match.groups()
        self.start_page = 1

    def skip(self, num):
        """Skip 'num' images by moving the start page and image offset"""
        # listing pages are handled in units of 40 entries: whole pages
        # are skipped outright, the remainder is dropped in items()
        full_pages, remainder = divmod(num, 40)
        self.start_page += full_pages
        self.start_image += remainder
        return num

    def get_metadata(self, page):
        """Build the user/count metadata dict from a listing 'page'

        Raises exception.NotFoundError if no user name was extracted and
        the page contains the site's "user info unavailable" message.
        """
        rules = (
            ("name", '<img alt="', '"'),
            ("msg", '<li class="user_message">', '</li>'),
            (None, '<span class="target_name">すべて</span>', ''),
            ("count", '<span class="count ">', '</span>'),
        )
        found = text.extract_all(page, rules)[0]

        name = found["name"]
        if not name and "ユーザー情報が取得出来ませんでした" in page:
            raise exception.NotFoundError("user")

        user = {
            "id": text.parse_int(self.user_id),
            "name": name,
            # the message may be missing, hence the fallback to ""
            "message": (found["msg"] or "").strip(),
        }
        return {"user": user, "count": text.parse_int(found["count"])}

    def get_images(self):
        """Yield the user metadata once, then one dict per listed image"""
        url = "{}/user/illust/{}".format(self.root, self.user_id)
        query = {"sort": self.order, "page": self.start_page,
                 "target": "illust_all"}

        rules = (
            ("image_id", '/seiga/im', '"'),
            ("title", '<li class="title">', '</li>'),
            ("views", '</span>', '</li>'),
            ("comments", '</span>', '</li>'),
            ("clips", '</span>', '</li>'),
        )
        numeric = ("image_id", "views", "comments", "clips")

        while True:
            html = self.request(url, params=query).text

            # metadata is taken from the first page that gets requested
            if query["page"] == self.start_page:
                yield self.get_metadata(html)

            seen = 0
            items = text.extract_iter(
                html, '<li class="list_item', '</a></li> ')
            for seen, item in enumerate(items, 1):
                entry = text.extract_all(item, rules)[0]
                for field in numeric:
                    entry[field] = text.parse_int(entry[field])
                yield entry

            # fewer than 40 entries on a page means it was the last one
            if seen < 40:
                return
            query["page"] += 1


class SeigaImageExtractor(SeigaExtractor):
    """Extractor for single images from seiga.nicovideo.jp"""
    subcategory = "image"
    filename_fmt = "{category}_{image_id}.{extension}"
    pattern = (r"(?:https?://)?(?:"
               r"(?:seiga\.|www\.)?nicovideo\.jp/(?:seiga/im|image/source/)"
               r"|sp\.seiga\.nicovideo\.jp/seiga/#!/im"
               r"|lohas\.nicoseiga\.jp/(?:thumb|(?:priv|o)/[^/]+/\d+)/)(\d+)")
    test = (
        ("https://seiga.nicovideo.jp/seiga/im5977527", {
            "keyword": "c8339781da260f7fc44894ad9ada016f53e3b12a",
            "content": "d9202292012178374d57fb0126f6124387265297",
        }),
        ("https://seiga.nicovideo.jp/seiga/im123", {
            "exception": exception.NotFoundError,
        }),
        ("https://seiga.nicovideo.jp/image/source/5977527"),
        ("https://sp.seiga.nicovideo.jp/seiga/#!/im5977527"),
        ("https://lohas.nicoseiga.jp/thumb/5977527i"),
        ("https://lohas.nicoseiga.jp/priv"
         "/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"),
        ("https://lohas.nicoseiga.jp/o"
         "/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"),
    )

    def __init__(self, match):
        SeigaExtractor.__init__(self, match)
        # the single capture group of 'pattern': the numeric image id
        self.image_id = match.group(1)

    def skip(self, num):
        """Skip 'num' images by raising the start offset used in items()"""
        self.start_image += num
        return num

    def get_images(self):
        """Return a 2-tuple holding the image's metadata dict twice

        items() consumes the first element as directory metadata and the
        second one as the only image entry.

        The image page is requested with notfound="image".
        """
        url = "{}/seiga/im{}".format(self.root, self.image_id)
        page = self.request(url, notfound="image").text

        data = text.extract_all(page, (
            ("date"        , '<li class="date"><span class="created">', '<'),
            ("title"       , '<h1 class="title">', '</h1>'),
            ("description" , '<p class="discription">', '</p>'),
        ))[0]

        data["user"] = text.extract_all(page, (
            ("id"  , '<a href="/user/illust/' , '"'),
            ("name", '<span itemprop="title">', '<'),
        ))[0]

        data["description"] = text.remove_html(data["description"])
        data["image_id"] = text.parse_int(self.image_id)

        # An extracted field is presumably None when its markers are not
        # on the page (SeigaUserExtractor.get_metadata() guards against
        # this as well). Appending to None would raise TypeError, so the
        # timestamp is only built and parsed when a date was found.
        date = data["date"]
        if date is not None:
            # the page shows minute precision only; seconds and the
            # +0900 offset are appended to fit the '%S%z' format
            data["date"] = text.parse_datetime(
                date + ":00+0900", "%Y年%m月%d日 %H:%M:%S%z")

        return (data, data)
[seiga] add extractor 8 years ago			`# -- coding: utf-8 --`

[seiga] require authentication with 'user_session' cookie (#2372) Login with username & password would now require entering a 2FA token. see also https://github.com/danbooru/danbooru/commit/7b009cc893b314a0dec65792143aad97a2d6bf92 3 years ago			`# Copyright 2016-2022 Mike Fährmann`
[seiga] add extractor 8 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

[seiga] require authentication with 'user_session' cookie (#2372) Login with username & password would now require entering a 2FA token. see also https://github.com/danbooru/danbooru/commit/7b009cc893b314a0dec65792143aad97a2d6bf92 3 years ago			`"""Extractors for https://seiga.nicovideo.jp/"""`
[seiga] add extractor 8 years ago
			`from .common import Extractor, Message`
[seiga] better metadata + 'skip()' support 7 years ago			`from .. import text, util, exception`
[seiga] add extractor 8 years ago
code adjustments according to pep8 nr2 8 years ago
[seiga] add user extractor 8 years ago			`class SeigaExtractor(Extractor):`
			`"""Base class for seiga extractors"""`
[seiga] add extractor 8 years ago			`category = "seiga"`
set 'archive_fmt' values These are going to be used to create an unique id for each image. 7 years ago			`archive_fmt = "{image_id}"`
use 'cookiedomain' for cookies set by object-config-values otherwise these cookies would not be picked up by the _check_cookies() method. 7 years ago			`cookiedomain = ".nicovideo.jp"`
[seiga] use HTTPS 6 years ago			`root = "https://seiga.nicovideo.jp"`
[seiga] add extractor 8 years ago
propagate 'match' to base extractor constructor 6 years ago			`def __init__(self, match):`
			`Extractor.__init__(self, match)`
implement 'util.advance()' 7 years ago			`self.start_image = 0`
[seiga] better metadata + 'skip()' support 7 years ago
[seiga] add extractor 8 years ago			`def items(self):`
[seiga] require authentication with 'user_session' cookie (#2372) Login with username & password would now require entering a 2FA token. see also https://github.com/danbooru/danbooru/commit/7b009cc893b314a0dec65792143aad97a2d6bf92 3 years ago			`if not self._check_cookies(("user_session",)):`
			`raise exception.StopExtraction("'user_session' cookie required")`

[seiga] better metadata + 'skip()' support 7 years ago			`images = iter(self.get_images())`
			`data = next(images)`

[seiga] add extractor 8 years ago			`yield Message.Directory, data`
implement 'util.advance()' 7 years ago			`for image in util.advance(images, self.start_image):`
[seiga] add user extractor 8 years ago			`data.update(image)`
[seiga] fix file extension and xml parsing - The file extension of the first image had been used for all further images - API responses can contain invalid characters, which cause the XML parser to fail (http://seiga.nicovideo.jp/user/illust/26377934 contains several \x08 characters) 8 years ago			`data["extension"] = None`
[seiga] better metadata + 'skip()' support 7 years ago			`yield Message.Url, self.get_image_url(data["image_id"]), data`
[seiga] add user extractor 8 years ago
			`def get_images(self):`
[seiga] better metadata + 'skip()' support 7 years ago			`"""Return iterable containing metadata and images"""`
[seiga] add extractor 8 years ago
			`def get_image_url(self, image_id):`
			`"""Get url for an image with id 'image_id'"""`
[seiga] use HTTPS 6 years ago			`url = "{}/image/source/{}".format(self.root, image_id)`
use 'extractor.request()' for more HTTP requests 6 years ago			`response = self.request(`
replace extractor.request() 'expect' argument with - 'fatal': allow 4xx status codes - 'notfound': raise NotFoundError on 404 5 years ago			`url, method="HEAD", allow_redirects=False, notfound="image")`
[seiga] raise error when redirected to login page (#3401) 2 years ago			`location = response.headers["location"]`
			`if "nicovideo.jp/login" in location:`
			`raise exception.StopExtraction(`
			`"HTTP redirect to login page (%s)", location.partition("?")[0])`
			`return location.replace("/o/", "/priv/", 1)`
[seiga] add extractor 8 years ago
[seiga] add user extractor 8 years ago
			`class SeigaUserExtractor(SeigaExtractor):`
			`"""Extractor for images of a user from seiga.nicovideo.jp"""`
			`subcategory = "user"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`directory_fmt = ("{category}", "{user[id]}")`
[seiga] better metadata + 'skip()' support 7 years ago			`filename_fmt = "{category}_{user[id]}_{image_id}.{extension}"`
[seiga] support mobile URLs (closes #401) 5 years ago			`pattern = (r"(?:https?://)?(?:www\.\|(?:sp\.)?seiga\.)?nicovideo\.jp/"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`r"user/illust/(\d+)(?:\?(?:[^&]+&)*sort=([^&#]+))?")`
			`test = (`
[seiga] use HTTPS 6 years ago			`("https://seiga.nicovideo.jp/user/illust/39537793", {`
[seiga] support more than 200 images Due to API restrictions and/or missing knowledge about and documentation of API usage, it was only possible to retrieve the latest 200 images of a niconico seiga user with said API. The new approach manually visits each HTML page and gets its information from there. 7 years ago			`"pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+",`
update extractor tests 7 years ago			`"count": ">= 4",`
[seiga] update tests 7 years ago			`"keyword": {`
			`"user": {`
			`"id": 39537793,`
			`"message": str,`
[booru] call update_page() with correct dict (closes #82) 7 years ago			`"name": str,`
[seiga] update tests 7 years ago			`},`
			`"clips": int,`
			`"comments": int,`
			`"count": int,`
			`"extension": None,`
			`"image_id": int,`
			`"title": str,`
			`"views": int,`
			`},`
[seiga] add user extractor 8 years ago			`}),`
[seiga] use HTTPS 6 years ago			`("https://seiga.nicovideo.jp/user/illust/79433", {`
[seiga] better metadata + 'skip()' support 7 years ago			`"exception": exception.NotFoundError,`
[seiga] add user extractor 8 years ago			`}),`
[seiga] use HTTPS 6 years ago			`("https://seiga.nicovideo.jp/user/illust/39537793"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`"?sort=image_view&target=illust_all"),`
[seiga] support mobile URLs (closes #401) 5 years ago			`("https://sp.seiga.nicovideo.jp/user/illust/39537793"),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`)`
[seiga] add user extractor 8 years ago
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 6 years ago			`SeigaExtractor.__init__(self, match)`
[seiga] better metadata + 'skip()' support 7 years ago			`self.user_id, self.order = match.groups()`
implement 'util.advance()' 7 years ago			`self.start_page = 1`
[seiga] better metadata + 'skip()' support 7 years ago
			`def skip(self, num):`
			`pages, images = divmod(num, 40)`
implement 'util.advance()' 7 years ago			`self.start_page += pages`
			`self.start_image += images`
[seiga] better metadata + 'skip()' support 7 years ago			`return num`

			`def get_metadata(self, page):`
			`"""Collect metadata from 'page'"""`
			`data = text.extract_all(page, (`
			`("name" , '<img alt="', '"'),`
			`("msg" , '<li class="user_message">', '</li>'),`
			`(None , '<span class="target_name">すべて</span>', ''),`
			`("count", '<span class="count ">', '</span>'),`
			`))[0]`

			`if not data["name"] and "ユーザー情報が取得出来ませんでした" in page:`
			`raise exception.NotFoundError("user")`

			`return {`
			`"user": {`
rename safe_int to parse_int; move parse_* to text module 7 years ago			`"id": text.parse_int(self.user_id),`
[seiga] better metadata + 'skip()' support 7 years ago			`"name": data["name"],`
			`"message": (data["msg"] or "").strip(),`
			`},`
rename safe_int to parse_int; move parse_* to text module 7 years ago			`"count": text.parse_int(data["count"]),`
[seiga] better metadata + 'skip()' support 7 years ago			`}`
[seiga] add user extractor 8 years ago
			`def get_images(self):`
[seiga] use HTTPS 6 years ago			`url = "{}/user/illust/{}".format(self.root, self.user_id)`
implement 'util.advance()' 7 years ago			`params = {"sort": self.order, "page": self.start_page,`
[seiga] better metadata + 'skip()' support 7 years ago			`"target": "illust_all"}`
[seiga] support more than 200 images Due to API restrictions and/or missing knowledge about and documentation of API usage, it was only possible to retrieve the latest 200 images of a niconico seiga user with said API. The new approach manually visits each HTML page and gets its information from there. 7 years ago
			`while True:`
			`cnt = 0`
			`page = self.request(url, params=params).text`

implement 'util.advance()' 7 years ago			`if params["page"] == self.start_page:`
[seiga] better metadata + 'skip()' support 7 years ago			`yield self.get_metadata(page)`

[seiga] support more than 200 images Due to API restrictions and/or missing knowledge about and documentation of API usage, it was only possible to retrieve the latest 200 images of a niconico seiga user with said API. The new approach manually visits each HTML page and gets its information from there. 7 years ago			`for info in text.extract_iter(`
			`page, '<li class="list_item', '</a></li> '):`
[seiga] better metadata + 'skip()' support 7 years ago			`data = text.extract_all(info, (`
[seiga] support more than 200 images Due to API restrictions and/or missing knowledge about and documentation of API usage, it was only possible to retrieve the latest 200 images of a niconico seiga user with said API. The new approach manually visits each HTML page and gets its information from there. 7 years ago			`("image_id", '/seiga/im', '"'),`
			`("title" , '<li class="title">', '</li>'),`
			`("views" , '</span>', '</li>'),`
			`("comments", '</span>', '</li>'),`
			`("clips" , '</span>', '</li>'),`
			`))[0]`
[seiga] better metadata + 'skip()' support 7 years ago			`for key in ("image_id", "views", "comments", "clips"):`
rename safe_int to parse_int; move parse_* to text module 7 years ago			`data[key] = text.parse_int(data[key])`
[seiga] better metadata + 'skip()' support 7 years ago			`yield data`
[seiga] support more than 200 images Due to API restrictions and/or missing knowledge about and documentation of API usage, it was only possible to retrieve the latest 200 images of a niconico seiga user with said API. The new approach manually visits each HTML page and gets its information from there. 7 years ago			`cnt += 1`

			`if cnt < 40:`
			`return`
			`params["page"] += 1`
[seiga] add user extractor 8 years ago

			`class SeigaImageExtractor(SeigaExtractor):`
			`"""Extractor for single images from seiga.nicovideo.jp"""`
			`subcategory = "image"`
change keyword names to valid Python identifiers This commit mostly replaces all minus-signs ('-') in keyword names with underscores ('_') to allow them to be used in filter-expressions. For example 'gallery-id' got renamed to 'gallery_id'. (It is theoretically possible to access any variable, regardless of its name, with 'locals()["NAME"]', but that seems a bit too convoluted if just 'NAME' could be enough) 7 years ago			`filename_fmt = "{category}_{image_id}.{extension}"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`pattern = (r"(?:https?://)?(?:"`
[seiga] recognize /thumb/ URLs https://lohas.nicoseiga.jp/thumb/5977527i 6 years ago			`r"(?:seiga\.\|www\.)?nicovideo\.jp/(?:seiga/im\|image/source/)"`
[seiga] support mobile URLs (closes #401) 5 years ago			`r"\|sp\.seiga\.nicovideo\.jp/seiga/#!/im"`
[seiga] recognize /thumb/ URLs https://lohas.nicoseiga.jp/thumb/5977527i 6 years ago			`r"\|lohas\.nicoseiga\.jp/(?:thumb\|(?:priv\|o)/[^/]+/\d+)/)(\d+)")`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`test = (`
[seiga] use HTTPS 6 years ago			`("https://seiga.nicovideo.jp/seiga/im5977527", {`
[seiga] Add metadata for single image downloads (#1063) * [seiga] Support image metadata. * [seiga] Update test data. * [seiga] Fix cookie check. * [test_cookies] [seiga] Fit test_cookies.py to the last commit. 4 years ago			`"keyword": "c8339781da260f7fc44894ad9ada016f53e3b12a",`
[seiga] add user extractor 8 years ago			`"content": "d9202292012178374d57fb0126f6124387265297",`
			`}),`
[seiga] use HTTPS 6 years ago			`("https://seiga.nicovideo.jp/seiga/im123", {`
[seiga] add user extractor 8 years ago			`"exception": exception.NotFoundError,`
			`}),`
[seiga] use HTTPS 6 years ago			`("https://seiga.nicovideo.jp/image/source/5977527"),`
[seiga] support mobile URLs (closes #401) 5 years ago			`("https://sp.seiga.nicovideo.jp/seiga/#!/im5977527"),`
[seiga] recognize /thumb/ URLs https://lohas.nicoseiga.jp/thumb/5977527i 6 years ago			`("https://lohas.nicoseiga.jp/thumb/5977527i"),`
			`("https://lohas.nicoseiga.jp/priv"`
			`"/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"),`
			`("https://lohas.nicoseiga.jp/o"`
			`"/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"),`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`)`
[seiga] add user extractor 8 years ago
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 6 years ago			`SeigaExtractor.__init__(self, match)`
[seiga] add user extractor 8 years ago			`self.image_id = match.group(1)`

[seiga] better metadata + 'skip()' support 7 years ago			`def skip(self, num):`
implement 'util.advance()' 7 years ago			`self.start_image += num`
[seiga] better metadata + 'skip()' support 7 years ago			`return num`

[seiga] add user extractor 8 years ago			`def get_images(self):`
[seiga] Add metadata for single image downloads (#1063) * [seiga] Support image metadata. * [seiga] Update test data. * [seiga] Fix cookie check. * [test_cookies] [seiga] Fit test_cookies.py to the last commit. 4 years ago			`url = "{}/seiga/im{}".format(self.root, self.image_id)`
[seiga] fix flake8 and cookie test (#1063) 4 years ago			`page = self.request(url, notfound="image").text`
[seiga] Add metadata for single image downloads (#1063) * [seiga] Support image metadata. * [seiga] Update test data. * [seiga] Fix cookie check. * [test_cookies] [seiga] Fit test_cookies.py to the last commit. 4 years ago
			`data = text.extract_all(page, (`
[seiga] fix flake8 and cookie test (#1063) 4 years ago			`("date" , '<li class="date"><span class="created">', '<'),`
			`("title" , '<h1 class="title">', '</h1>'),`
[seiga] Add metadata for single image downloads (#1063) * [seiga] Support image metadata. * [seiga] Update test data. * [seiga] Fix cookie check. * [test_cookies] [seiga] Fit test_cookies.py to the last commit. 4 years ago			`("description" , '<p class="discription">', '</p>'),`
			`))[0]`

			`data["user"] = text.extract_all(page, (`
[seiga] fix flake8 and cookie test (#1063) 4 years ago			`("id" , '<a href="/user/illust/' , '"'),`
			`("name", '<span itemprop="title">', '<'),`
[seiga] Add metadata for single image downloads (#1063) * [seiga] Support image metadata. * [seiga] Update test data. * [seiga] Fix cookie check. * [test_cookies] [seiga] Fit test_cookies.py to the last commit. 4 years ago			`))[0]`

			`data["description"] = text.remove_html(data["description"])`
			`data["image_id"] = text.parse_int(self.image_id)`
[seiga] fix flake8 and cookie test (#1063) 4 years ago			`data["date"] = text.parse_datetime(`
			`data["date"] + ":00+0900", "%Y年%m月%d日 %H:%M:%S%z")`
[seiga] Add metadata for single image downloads (#1063) * [seiga] Support image metadata. * [seiga] Update test data. * [seiga] Fix cookie check. * [test_cookies] [seiga] Fit test_cookies.py to the last commit. 4 years ago
			`return (data, data)`