[xhamster] add gallery & user extractor (#281)

5 years ago · 096009367b
parent 208202b962
commit 096009367b
4 changed files with 174 additions and 0 deletions
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -104,6 +104,7 @@ Warosu               https://warosu.org/                 Threads
 Weibo                https://www.weibo.com/              Images from Users, Images from Statuses
 WikiArt.org          https://www.wikiart.org/            Artists, Artworks
 World Three          http://www.slide.world-three.org/   Chapters, Manga
 xHamster             https://xhamster.com/               Images from Users, Galleries
 XVideos              https://www.xvideos.com/            Images from Users, Galleries
 Yandere              https://yande.re/                   Pools, Popular Images, Posts, Tag-Searches
 yaplog!              https://yaplog.jp/                  Blogs, Posts
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -93,6 +93,7 @@ modules = [
    "warosu",
    "weibo",
    "wikiart",
    "xhamster",
    "xvideos",
    "yandere",
    "yaplog",
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -0,0 +1,171 @@
 # -*- coding: utf-8 -*-
 # Copyright 2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Extractors for https://xhamster.com/"""
 from .common import Extractor, Message
 from .. import text
 import json
 # URL prefix shared by the extractor patterns in this module: an optional
 # http(s) scheme, an optional single subdomain label, then "xhamster"
 # under the "com", "one", or "desi" top-level domain.
 BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?xhamster\.(?:com|one|desi)"
 class XhamsterExtractor(Extractor):
    """Common parent of all xhamster extractors"""
    # base URL the subclasses prepend to site-relative paths
    root = "https://xhamster.com"
    category = "xhamster"
 class XhamsterGalleryExtractor(XhamsterExtractor):
    """Extractor for image galleries on xhamster.com"""
    subcategory = "gallery"
    directory_fmt = ("{category}", "{user[name]}",
                     "{gallery[id]} {gallery[title]}")
    filename_fmt = "{num:>03}_{id}.{extension}"
    archive_fmt = "{id}"
    # group(1) captures the site-relative gallery path
    pattern = BASE_PATTERN + r"(/photos/gallery/[^/?&#]+)"
    test = (
        ("https://xhamster.com/photos/gallery/11748968", {
            "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$",
            "count": 143,
            "keyword": {
                "comments": int,
                "count": 143,
                "favorite": bool,
                "id": int,
                "num": int,
                "height": int,
                "width": int,
                "imageURL": str,
                "pageURL": str,
                "thumbURL": str,
                "gallery": {
                    "date": "type:datetime",
                    "description": "",
                    "dislikes": int,
                    "id": 11748968,
                    "likes": int,
                    "tags": ["NON-Porn"],
                    "thumbnail": str,
                    "title": "Make the world better.",
                    "views": int,
                },
                "user": {
                    "id": 16874672,
                    "name": "Anonymousrants",
                    "retired": bool,
                    "subscribers": int,
                    "url": "https://xhamster.com/users/anonymousrants",
                    "verified": bool,
                },
            },
        }),
        ("https://xhamster.com/photos/gallery/make-the-world-better-11748968"),
        ("https://xhamster.com/photos/gallery/11748968"),
        ("https://xhamster.one/photos/gallery/11748968"),
        ("https://xhamster.desi/photos/gallery/11748968"),
        ("https://en.xhamster.com/photos/gallery/11748968"),
    )

    def __init__(self, match):
        """Remember the gallery path captured by 'pattern'"""
        XhamsterExtractor.__init__(self, match)
        self.path = match.group(1)
        # decoded first-page data; set by metadata(), consumed by images()
        self.data = None

    def items(self):
        """Yield Version, Directory, and one Url message per image"""
        data = self.metadata()
        yield Message.Version, 1
        yield Message.Directory, data
        for num, image in enumerate(self.images(), 1):
            url = image["imageURL"]
            # merge the gallery-wide metadata into the image's own fields
            image.update(data)
            image["num"] = num
            yield Message.Url, url, text.nameext_from_url(url, image)

    def metadata(self):
        """Load the gallery's first page and return its metadata dict

        The decoded page data is stored in 'self.data' so that images()
        can start from it without requesting the first page again.
        The result has the keys "user", "gallery", and "count".
        """
        self.data = self._data(self.root + self.path)
        user = self.data["authorModel"]
        imgs = self.data["photosGalleryModel"]
        return {
            "user":
            {
                "id"         : text.parse_int(user["id"]),
                "url"        : user["pageURL"],
                "name"       : user["name"],
                "retired"    : user["retired"],
                "verified"   : user["verified"],
                "subscribers": user["subscribers"],
            },
            "gallery":
            {
                "id"         : text.parse_int(imgs["id"]),
                "tags"       : [c["name"] for c in imgs["categories"]],
                "date"       : text.parse_timestamp(imgs["created"]),
                "views"      : text.parse_int(imgs["views"]),
                "likes"      : text.parse_int(imgs["rating"]["likes"]),
                "dislikes"   : text.parse_int(imgs["rating"]["dislikes"]),
                "title"      : imgs["title"],
                "description": imgs["description"],
                "thumbnail"  : imgs["thumbURL"],
            },
            "count": text.parse_int(imgs["quantity"]),
        }

    def images(self):
        """Yield each photo dict of the gallery, following its pagination

        Must be called after metadata(), which provides the first page's
        data through 'self.data'.
        """
        data = self.data
        self.data = None  # the instance no longer needs the first page

        while True:
            for image in data["photosGalleryModel"]["photos"]:
                # 'modelName' is only being discarded from the output, so
                # a photo entry lacking it must not abort the whole
                # gallery with a KeyError
                image.pop("modelName", None)
                yield image

            pgntn = data["pagination"]
            if pgntn["active"] == pgntn["maxPage"]:
                return
            # replace the template's last three characters (presumably a
            # page-number placeholder) with the number of the next page
            url = pgntn["pageLinkTemplate"][:-3] + str(pgntn["next"])
            data = self._data(url)

    def _data(self, url):
        """Request 'url' and return its decoded "window.initials" JSON"""
        page = self.request(url).text
        # The first element returned by text.extract() is taken to be the
        # text between the two markers; trailing newlines and the closing
        # semicolon are stripped before decoding.
        # NOTE(review): that element is used unguarded here -- confirm
        # what text.extract() returns when a marker is missing.
        return json.loads(text.extract(
            page, "window.initials =", "</script>")[0].rstrip("\n\r;"))
 class XhamsterUserExtractor(XhamsterExtractor):
    """Extractor queueing every photo gallery of an xhamster user"""
    subcategory = "user"
    # group(1) captures the user name
    pattern = BASE_PATTERN + r"/users/([^/?&#]+)(?:/photos)?/?(?:$|[?#])"
    test = (
        ("https://xhamster.com/users/nickname68/photos", {
            "pattern": XhamsterGalleryExtractor.pattern,
            "count": 50,
            "range": "1-50",
        }),
        ("https://xhamster.com/users/nickname68"),
    )

    def __init__(self, match):
        """Remember the user name captured by 'pattern'"""
        XhamsterExtractor.__init__(self, match)
        self.user = match.group(1)

    def items(self):
        """Yield a Version message, then a Queue message per gallery link

        Starts at the user's "/photos" listing and keeps following the
        page's "next" link for as long as one is found.
        """
        yield Message.Version, 1

        page_url = "{}/users/{}/photos".format(self.root, self.user)
        # every queued URL is handed to the gallery extractor
        queue_data = {"_extractor": XhamsterGalleryExtractor}

        while page_url:
            extr = text.extract_from(self.request(page_url).text)

            # pull gallery links until the extractor returns a falsy value
            gallery_url = extr('thumb-image-container" href="', '"')
            while gallery_url:
                yield Message.Queue, gallery_url, queue_data
                gallery_url = extr('thumb-image-container" href="', '"')

            # a falsy result here ends the outer loop
            page_url = extr('data-page="next" href="', '"')
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -69,6 +69,7 @@ CATEGORY_MAP = {
    "thebarchive"    : "The /b/ Archive",
    "wikiart"        : "WikiArt.org",
    "worldthree"     : "World Three",
    "xhamster"       : "xHamster",
    "xvideos"        : "XVideos",
    "yaplog"         : "yaplog!",
    "yuki"           : "yuki.la 4chan archive",