gallery-dl/gallery_dl/extractor/sankaku.py

# -*- coding: utf-8 -*-

# Copyright 2014-2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://chan.sankakucomplex.com/"""

from .common import AsynchronousExtractor, Message
from .. import text


class SankakuTagExtractor(AsynchronousExtractor):
    """Extractor for images from chan.sankakucomplex.com by search-tags"""
    category = "sankaku"
    subcategory = "tag"
    directory_fmt = ["{category}", "{tags}"]
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    pattern = [r"(?:https?://)?chan\.sankakucomplex\.com/\?tags=([^&]+)"]
    test = [("https://chan.sankakucomplex.com/?tags=bonocho", {
        "count": 5,
        "pattern": (r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
                    r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+"),
    })]
    url = "https://chan.sankakucomplex.com/"

    def __init__(self, match):
        """Store the search tags from 'match' and set a browser User-Agent

        'match' is expected to be a match object whose first group holds
        the (URL-quoted) value of the 'tags' query parameter.
        """
        AsynchronousExtractor.__init__(self)
        self.tags = text.unquote(match.group(1))
        self.session.headers["User-Agent"] = (
            "Mozilla/5.0 Gecko/20100101 Firefox/40.0"
        )

    def items(self):
        """Yield the messages for this job

        Emits a Version message, a Directory message carrying the job
        metadata and one Url message per image returned by get_images().
        """
        data = self.get_job_metadata()
        yield Message.Version, 1
        yield Message.Directory, data
        for image in self.get_images():
            # make the job-wide keys ("tags") available for every image
            image.update(data)
            yield Message.Url, image["file-url"], image

    def get_job_metadata(self):
        """Collect metadata for extractor-job"""
        return {"tags": self.tags}

    def get_images(self):
        """Yield a metadata dictionary for each image of the tag search

        Result pages are requested one after another, starting at page 1,
        until a page lists fewer than 20 thumbnails. Posts for which
        get_image_metadata() cannot find an original file URL are skipped.
        """
        params = {
            "tags": self.tags,
            "page": 1,
        }
        while True:
            count = 0
            page = self.request(self.url, params=params).text
            # begin scanning for thumbnails at the position reported for
            # the 'more-popular-posts-link' marker
            pos = text.extract(page, '<div id=more-popular-posts-link>', '')[1]
            while True:
                image_id, pos = text.extract(
                    page, '<span class="thumb blacklisted" id=p', '>', pos
                )
                if not image_id:
                    break
                # count every listed thumbnail, including posts skipped
                # below, so a skipped post cannot end pagination early
                count += 1
                image = self.get_image_metadata(image_id)
                if image is not None:
                    yield image
            # a page with fewer than 20 thumbnails is treated as the last one
            if count < 20:
                return
            params["page"] += 1

    def get_image_metadata(self, image_id):
        """Collect metadata for the post with the given 'image_id'

        Returns a dictionary with the keys "id", "file-url", "width",
        "height" and "md5" in addition to whatever
        text.nameext_from_url() provides, or None if the post page
        contains no 'Original' file link.
        """
        url = "https://chan.sankakucomplex.com/post/show/" + image_id
        page = self.request(url).text
        image_url, pos = text.extract(page, '<li>Original: <a href="', '"')
        if not image_url:
            # without the 'Original' link there is nothing to download, and
            # building "file-url" from a missing value would raise
            return None
        # the link text following the URL is parsed as "<width>x<height> "
        width, pos = text.extract(page, '>', 'x', pos)
        height, pos = text.extract(page, '', ' ', pos)
        data = text.nameext_from_url(image_url, {
            "id": image_id,
            # the extracted link has no scheme of its own
            # (presumably protocol-relative, i.e. "//host/path")
            "file-url": "https:" + text.unescape(image_url),
            "width": width,
            "height": height,
        })
        # the file name without extension doubles as the "md5" value
        # (presumably the file's md5 hash - verify against the site)
        data["md5"] = data["name"]
        return data
[sankaku] re-enable extractor 9 years ago			`# -- coding: utf-8 --`
initial commit 10 years ago
share extractor and downloader sessions There was never any "good" reason for the strict separation between extractors and downloaders. This change allows for reduced resource usage (probably unnoticeable) and less lines of code at the "cost" of tighter coupling. 7 years ago			`# Copyright 2014-2017 Mike Fährmann`
[sankaku] re-enable extractor 9 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extract images from https://chan.sankakucomplex.com/"""`

[sankaku] always use correct file-url 9 years ago			`from .common import AsynchronousExtractor, Message`
[sankaku] re-enable extractor 9 years ago			`from .. import text`

code adjustments according to pep8 nr2 8 years ago
consistent extractor naming scheme + docstrings 8 years ago			`class SankakuTagExtractor(AsynchronousExtractor):`
code adjustments according to pep8 nr2 8 years ago			`"""Extractor for images from chan.sankakucomplex.com by search-tags"""`
update all other extractors 9 years ago			`category = "sankaku"`
consistent extractor naming scheme + docstrings 8 years ago			`subcategory = "tag"`
update all other extractors 9 years ago			`directory_fmt = ["{category}", "{tags}"]`
			`filename_fmt = "{category}_{id}_{md5}.{extension}"`
			`pattern = [r"(?:https?://)?chan\.sankakucomplex\.com/\?tags=([^&]+)"]`
add missing tests 8 years ago			`test = [("https://chan.sankakucomplex.com/?tags=bonocho", {`
[sankaku] unescape image URLs 7 years ago			`"count": 5,`
			`"pattern": (r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"`
			`r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+"),`
add missing tests 8 years ago			`})]`
initial commit 10 years ago			`url = "https://chan.sankakucomplex.com/"`

[sankaku] re-enable extractor 9 years ago			`def __init__(self, match):`
[sankaku] always use correct file-url 9 years ago			`AsynchronousExtractor.__init__(self)`
[sankaku] re-enable extractor 9 years ago			`self.tags = text.unquote(match.group(1))`
			`self.session.headers["User-Agent"] = (`
			`"Mozilla/5.0 Gecko/20100101 Firefox/40.0"`
			`)`
initial commit 10 years ago
[sankaku] re-enable extractor 9 years ago			`def items(self):`
			`data = self.get_job_metadata()`
[sankaku] always use correct file-url 9 years ago			`yield Message.Version, 1`
[sankaku] re-enable extractor 9 years ago			`yield Message.Directory, data`
			`for image in self.get_images():`
[sankaku] always use correct file-url 9 years ago			`image.update(data)`
			`yield Message.Url, image["file-url"], image`
[sankaku] re-enable extractor 9 years ago
			`def get_job_metadata(self):`
			`"""Collect metadata for extractor-job"""`
remove explicit (sub)category keywords 8 years ago			`return {"tags": self.tags}`
[sankaku] re-enable extractor 9 years ago
			`def get_images(self):`
			`params = {`
			`"tags": self.tags,`
			`"page": 1,`
			`}`
initial commit 10 years ago			`while True:`
[sankaku] re-enable extractor 9 years ago			`count = 0`
			`page = self.request(self.url, params=params).text`
[sankaku] always use correct file-url 9 years ago			`pos = text.extract(page, '<div id=more-popular-posts-link>', '')[1]`
initial commit 10 years ago			`while True:`
code adjustments according to pep8 nr2 8 years ago			`image_id, pos = text.extract(`
			`page, '<span class="thumb blacklisted" id=p', '>', pos`
			`)`
[sankaku] always use correct file-url 9 years ago			`if not image_id:`
initial commit 10 years ago			`break`
[sankaku] always use correct file-url 9 years ago			`image = self.get_image_metadata(image_id)`
[sankaku] re-enable extractor 9 years ago			`count += 1`
			`yield image`
			`if count < 20:`
			`return`
initial commit 10 years ago			`params["page"] += 1`
[sankaku] re-enable extractor 9 years ago
[sankaku] always use correct file-url 9 years ago			`def get_image_metadata(self, image_id):`
			`url = "https://chan.sankakucomplex.com/post/show/" + image_id`
			`page = self.request(url).text`
			`image_url, pos = text.extract(page, '<li>Original: <a href="', '"')`
			`width , pos = text.extract(page, '>', 'x', pos)`
			`height , pos = text.extract(page, '', ' ', pos)`
code cleanup to use nameext_from_url 9 years ago			`data = text.nameext_from_url(image_url, {`
[sankaku] always use correct file-url 9 years ago			`"id": image_id,`
[sankaku] unescape image URLs 7 years ago			`"file-url": "https:" + text.unescape(image_url),`
[sankaku] always use correct file-url 9 years ago			`"width": width,`
			`"height": height,`
code cleanup to use nameext_from_url 9 years ago			`})`
			`data["md5"] = data["name"]`
			`return data`