From c9290d8212f704c39aa4b070dd47c926f3634152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 17 Sep 2018 21:19:25 +0200 Subject: [PATCH] [wallhaven] add wallpaper and search extractors todo: - login support to gain access to NSFW wallpapers - extractors for tag-, similar-, latest-listings - skip() support --- CHANGELOG.md | 2 + docs/supportedsites.rst | 1 + gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/wallhaven.py | 141 ++++++++++++++++++++++++++++++ 4 files changed, 145 insertions(+) create mode 100644 gallery_dl/extractor/wallhaven.py diff --git a/CHANGELOG.md b/CHANGELOG.md index cce02eaa..49475a1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ # Changelog ## Unreleased +- Added support for: + - `wallhaven` - https://alpha.wallhaven.cc/ ## 1.5.3 - 2018-09-14 - Added support for: diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index 56c49740..eb64b447 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -79,6 +79,7 @@ SmugMug https://www.smugmug.com/ |Albums, individ-5| The /b/ Archive https://thebarchive.com/ Threads Tumblr https://www.tumblr.com/ Images from Users, Likes, Posts, Tag-Searches Optional (OAuth) Twitter https://twitter.com/ Media Timelines, Timelines, Tweets +Wallhaven https://alpha.wallhaven.cc/ individual Images, Search Results Warosu https://warosu.org/ Threads World Three http://www.slide.world-three.org/ Chapters, Manga XVideos https://www.xvideos.com/ Images from Users, Galleries diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 7036a6be..82b26a39 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -86,6 +86,7 @@ modules = [ "thebarchive", "tumblr", "twitter", + "wallhaven", "warosu", "worldthree", "yandere", diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py new file mode 100644 index 00000000..cd70ef4f --- /dev/null +++ 
b/gallery_dl/extractor/wallhaven.py
@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://alpha.wallhaven.cc/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class WallhavenExtractor(Extractor):
+    """Base class for wallhaven extractors"""
+    category = "wallhaven"
+    filename_fmt = "{category}_{id}_{width}x{height}.{extension}"
+    root = "https://alpha.wallhaven.cc"
+
+    def get_wallpaper_data(self, wallpaper_id):
+        """Extract url and metadata for a wallpaper
+
+        Fetches the wallpaper's page and scrapes it in a single forward
+        pass: each text.extract() call is handed the position returned
+        by the previous one, so the calls must stay in page order.
+
+        Returns a (url, metadata) tuple; 'url' is the og:image address
+        joined onto self.root, 'metadata' is the dict built below.
+        """
+        url = "{}/wallpaper/{}".format(self.root, wallpaper_id)
+        page = self.request(url).text
+
+        # NOTE(review): the tag markers for resolution, color palette and
+        # the Category/Size/Views/Favorites list mirror the site's HTML;
+        # confirm them against a live wallpaper page before merging.
+        title, pos = text.extract(page, 'name="title" content="', '"')
+        url, pos = text.extract(
+            page, 'property="og:image" content="', '"', pos)
+        resolution, pos = text.extract(
+            page, '<h3 class="showcase-resolution">', '</h3>', pos)
+        colors, pos = text.extract(
+            page, '<ul class="color-palette">', '</ul>', pos)
+        uploader, pos = text.extract(page, 'alt="', '"', pos)
+        date, pos = text.extract(page, 'datetime="', '"', pos)
+        category, pos = text.extract(page, 'Category</dt><dd>', '<', pos)
+        size, pos = text.extract(page, 'Size</dt><dd>', '<', pos)
+        views, pos = text.extract(page, 'Views</dt><dd>', '<', pos)
+        favs, pos = text.extract(page, 'Favorites</dt><dd>', '</dd>', pos)
+
+        # keep only what follows the last '>' and split it at the 'x'
+        width, _, height = resolution.rpartition(">")[2].partition("x")
+
+        return text.urljoin(self.root, url), {
+            "id": text.parse_int(wallpaper_id),
+            "width": text.parse_int(width),
+            "height": text.parse_int(height),
+            "colors": list(text.extract_iter(colors, '#', '"')),
+            # everything before the last " | " is split at ", #" into tags
+            "tags": title.rpartition(" | ")[0].lstrip("#").split(", #"),
+            "uploader": text.unescape(uploader),
+            "wh_category": category,
+            "date": date,
+            "size": size,
+            # strip ',' from the counter before it is parsed as an int
+            "views": text.parse_int(views.replace(",", "")),
+            "favorites": text.parse_int(
+                text.remove_html(favs).partition(" ")[0]),
+        }
+
+
+class WallhavenSearchExtractor(WallhavenExtractor):
+    """Extractor for search results on wallhaven.cc"""
+    subcategory = "search"
+    directory_fmt = ["{category}", "{search[q]}"]
+    archive_fmt = "s_{search[q]}_{id}"
+    pattern = [r"(?:https?://)?alpha\.wallhaven\.cc/search\?([^/?#]+)"]
+    test = [
+        ("https://alpha.wallhaven.cc/search?q=id%3A87", {
+            "url": "0a8ba15e6eb94178a8720811c4bdcca0e20d537a",
+            "keyword": "7e5840cff08ca53cab1963002c4c1c5868f16020",
+            "range": (1, 3),
+        }),
+    ]
+    # a result page yielding fewer IDs than this ends the pagination
+    per_page = 24
+
+    def __init__(self, match):
+        WallhavenExtractor.__init__(self)
+        self.params = text.parse_query(match.group(1))
+
+    def items(self):
+        yield Message.Version, 1
+        yield Message.Directory, {"search": self.params}
+
+        for wp_id in self.wallpapers():
+            wp_url, wp_data = self.get_wallpaper_data(wp_id)
+            wp_data["search"] = self.params
+            yield Message.Url, wp_url, wp_data
+
+    def wallpapers(self):
+        """Yield wallpaper IDs from search results"""
+        url = "{}/search".format(self.root)
+        # work on a copy so that self.params keeps the original query
+        params = self.params.copy()
+        headers = {
+            "Referer": url,
+            "X-Requested-With": "XMLHttpRequest",
+        }
+
+        params["page"] = 1
+        while True:
+            page = self.request(url, params=params, headers=headers).text
+
+            ids = list(text.extract_iter(page, 'data-wallpaper-id="', '"'))
+            yield from ids
+
+            # a short page means there is nothing left to request
+            if len(ids) < self.per_page:
+                return
+            params["page"] += 1
+
+
+class WallhavenImageExtractor(WallhavenExtractor):
+    """Extractor for individual wallpaper on wallhaven.cc"""
+    subcategory = "image"
+    archive_fmt = "{id}"
+    pattern = [r"(?:https?://)?(?:alpha\.wallhaven\.cc/wallpaper"
+               r"|whvn\.cc)/(\d+)"]
+    test = [
+        ("https://alpha.wallhaven.cc/wallpaper/8114", {
+            "pattern": (r"https://[^.]+\.wallhaven\.cc"
+                        r"/[^/]+/full/[^-]+-8114\.jpg"),
+            "content": "497212679383a465da1e35bd75873240435085a2",
+            "keyword": {
+                "id": 8114,
+                "width": 1920,
+                "height": 1200,
+                "colors": list,
+                "tags": list,
+                "uploader": "AksumkA",
+                "date": "2014-08-31T06:17:19+00:00",
+                "wh_category": "Anime",
+                "size": "272.3 KiB",
+                "views": int,
+                "favorites": int,
+            },
+        }),
+        ("https://whvn.cc/8114", None),
+    ]
+
+    def __init__(self, match):
+        WallhavenExtractor.__init__(self)
+        self.wallpaper_id = match.group(1)
+
+    def items(self):
+        url, data = self.get_wallpaper_data(self.wallpaper_id)
+        yield Message.Version, 1
+        yield Message.Directory, data
+        yield Message.Url, url, data