diff --git a/README.rst b/README.rst index cbdfdfc6..4302e457 100644 --- a/README.rst +++ b/README.rst @@ -194,7 +194,7 @@ OAuth ----- *gallery-dl* supports user authentication via OAuth_ for -``deviantart``, ``flickr``, ``reddit`` and ``tumblr``. +``deviantart``, ``flickr``, ``reddit``, ``smugmug`` and ``tumblr``. This is entirely optional, but grants *gallery-dl* the ability to issue requests on your account's behalf and enables it to access resources which would otherwise be unavailable to a public user. diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf index 16445e2e..a3ca7f63 100644 --- a/docs/gallery-dl-example.conf +++ b/docs/gallery-dl-example.conf @@ -100,7 +100,21 @@ "output": { "mode": "terminal", - "logfile": "~/gallery-dl/log.txt" + "log": { + "format": "{name}: {message}", + "level": "info" + }, + "logfile": { + "path": "~/gallery-dl/log.txt", + "mode": "w", + "level": "debug" + }, + "unsupportedfile": { + "path": "~/gallery-dl/unsupported.txt", + "mode": "a", + "format": "{asctime} {message}", + "format-date": "%Y-%m-%d-%H-%M-%S" + } }, "cache": { diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index 3a7a8fcf..cdf5c697 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -68,7 +68,7 @@ Sea Otter Scans https://reader.seaotterscans.com/ Chapters, Manga Sen Manga http://raw.senmanga.com/ Chapters Sense-Scans http://sensescans.com/ Chapters, Manga SlideShare https://www.slideshare.net/ Presentations -SmugMug https://www.smugmug.com/ |Albums, individ-5| +SmugMug https://www.smugmug.com/ |Albums, individ-5| Optional (OAuth) Subapics https://subapics.com/ Chapters, Manga The /b/ Archive https://thebarchive.com/ Threads Tumblr https://www.tumblr.com/ Images from Users, Likes, Posts, Tag-Searches Optional (OAuth) @@ -88,8 +88,8 @@ Turboimagehost https://turboimagehost.com/ individual Images ==================== =================================== ================================================== 
================ .. |Images from Use-0| replace:: Images from Users, Albums, Challenges, individual Images, Likes, Search Results -.. |Collections, De-1| replace:: Collections, Deviations, Favorites, Folders, Galleries, Journals +.. |Collections, De-1| replace:: Collections, Deviations, Favorites, Folders, Galleries, Journals, Popular Images .. |Images from Use-2| replace:: Images from Users, Albums, Favorites, Galleries, Groups, individual Images, Search Results .. |Images from Use-3| replace:: Images from Users, Doujin, Favorites, individual Images -.. |Images from Use-4| replace:: Images from Users, Bookmarks, Favorites, pixiv.me Links, Rankings, Individual Images +.. |Images from Use-4| replace:: Images from Users, Bookmarks, Favorites, Follows, pixiv.me Links, Rankings, Search Results, Individual Images .. |Albums, individ-5| replace:: Albums, individual Images, Images from Users and Folders diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 8e62f9c7..e698de62 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -27,20 +27,72 @@ from . 
import version, config, option, extractor, job, util, exception __version__ = version.__version__ log = logging.getLogger("gallery-dl") -def initialize_logging(loglevel, formatter): +LOG_FORMAT = "[{name}][{levelname}] {message}" +LOG_FORMAT_DATE = "%Y-%m-%d %H:%M:%S" +LOG_LEVEL = logging.INFO + +def initialize_logging(loglevel): """Setup basic logging functionality before configfiles have been loaded""" # convert levelnames to lowercase for level in (10, 20, 30, 40, 50): name = logging.getLevelName(level) logging.addLevelName(level, name.lower()) # setup basic logging to stderr + formatter = logging.Formatter(LOG_FORMAT, LOG_FORMAT_DATE, "{") handler = logging.StreamHandler() handler.setFormatter(formatter) + handler.setLevel(loglevel) root = logging.getLogger() - root.setLevel(loglevel) + root.setLevel(logging.NOTSET) root.addHandler(handler) +def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL): + """Setup a new logging handler""" + opts = config.interpolate(("output", key)) + if not opts: + return None + if isinstance(opts, str): + opts = {"path": opts} + + path = opts.get("path") + mode = opts.get("mode", "w") + try: + path = util.expand_path(path) + handler = logging.FileHandler(path, mode) + except (OSError, ValueError) as exc: + log.warning("%s: %s", key, exc) + return None + except TypeError as exc: + log.warning("%s: missing or invalid path (%s)", key, exc) + return None + + level = opts.get("level", lvl) + logfmt = opts.get("format", fmt) + datefmt = opts.get("format-date", LOG_FORMAT_DATE) + formatter = logging.Formatter(logfmt, datefmt, "{") + handler.setFormatter(formatter) + handler.setLevel(level) + + return handler + + +def configure_logging_handler(key, handler): + """Configure a logging handler""" + opts = config.interpolate(("output", key)) + if not opts: + return + if isinstance(opts, str): + opts = {"format": opts} + if handler.level == LOG_LEVEL and "level" in opts: + handler.setLevel(opts["level"]) + if "format" in opts or 
"format-date" in opts: + logfmt = opts.get("format", LOG_FORMAT) + datefmt = opts.get("format-date", LOG_FORMAT_DATE) + formatter = logging.Formatter(logfmt, datefmt, "{") + handler.setFormatter(formatter) + + def replace_std_streams(errors="replace"): """Replace standard streams and set their error handlers to 'errors'""" for name in ("stdout", "stdin", "stderr"): @@ -159,8 +211,7 @@ def main(): parser = option.build_parser() args = parser.parse_args() - formatter = logging.Formatter("[%(name)s][%(levelname)s] %(message)s") - initialize_logging(args.loglevel, formatter) + initialize_logging(args.loglevel) # configuration if args.load_config: @@ -173,17 +224,13 @@ def main(): config.set(key, value) config.set(("_",), {}) - # logfile - logfile = config.interpolate(("output", "logfile")) - if logfile: - try: - path = util.expand_path(logfile) - handler = logging.FileHandler(path, "w") - except OSError as exc: - log.warning("log file: %s", exc) - else: - handler.setFormatter(formatter) - logging.getLogger().addHandler(handler) + # stream logging handler + configure_logging_handler("log", logging.getLogger().handlers[0]) + + # file logging handler + handler = setup_logging_handler("logfile", lvl=args.loglevel) + if handler: + logging.getLogger().addHandler(handler) # loglevels if args.loglevel >= logging.ERROR: @@ -243,13 +290,13 @@ def main(): except OSError as exc: log.warning("input file: %s", exc) - unsupportedfile = config.interpolate(("output", "unsupportedfile")) - if unsupportedfile: - try: - path = util.expand_path(unsupportedfile) - job.Job.ufile = open(path, "w") - except OSError as exc: - log.warning("unsupported-URL file: %s", exc) + # unsupported file logging handler + handler = setup_logging_handler("unsupportedfile", fmt="{message}") + if handler: + ulog = logging.getLogger("unsupported") + ulog.addHandler(handler) + ulog.propagate = False + job.Job.ulog = ulog prepare_range(args.image_range, "image") prepare_range(args.chapter_range, "chapter") diff 
--git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py index 53cc58e8..414a461d 100644 --- a/gallery_dl/cloudflare.py +++ b/gallery_dl/cloudflare.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2017 Mike Fährmann +# Copyright 2015-2018 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,6 +8,7 @@ """Methods to access sites behind Cloudflare protection""" +import re import time import operator import urllib.parse @@ -30,6 +31,7 @@ def request_func(self, *args, **kwargs): def solve_challenge(session, response): + session.headers["Referer"] = response.url page = response.text params = text.extract_all(page, ( @@ -37,58 +39,74 @@ def solve_challenge(session, response): ('pass' , 'name="pass" value="', '"'), ))[0] params["jschl_answer"] = solve_jschl(response.url, page) + time.sleep(4) - url = urllib.parse.urljoin(response.url, "/cdn-cgi/l/chk_jschl") + url = text.urljoin(response.url, "/cdn-cgi/l/chk_jschl") return session.get(url, params=params) def solve_jschl(url, page): """Solve challenge to get 'jschl_answer' value""" + + # build variable name + # e.g. '...f, wqnVscP={"DERKbJk":+(...' 
--> wqnVscP.DERKbJk data, pos = text.extract_all(page, ( ('var' , ',f, ', '='), ('key' , '"', '"'), ('expr', ':', '}'), )) - solution = evaluate_expression(data["expr"]) variable = "{}.{}".format(data["var"], data["key"]) vlength = len(variable) + + # evaluate the initial expression + solution = evaluate_expression(data["expr"]) + + # iterate over all remaining expressions + # and combine their values in 'solution' expressions = text.extract( - page, "'challenge-form');", "f.submit();", pos - )[0] + page, "'challenge-form');", "f.submit();", pos)[0] for expr in expressions.split(";")[1:]: + if expr.startswith(variable): + # select arithmetic function based on operator (+, -, *) func = operator_functions[expr[vlength]] + # evaluate the rest of the expression value = evaluate_expression(expr[vlength+2:]) + # combine the expression value with our current solution solution = func(solution, value) + elif expr.startswith("a.value"): + # add length of the hostname, i.e. add 11 for 'example.org' solution += len(urllib.parse.urlsplit(url).netloc) + if ".toFixed(" in expr: + # trim the solution to 10 decimal places + # and strip trailing zeros solution = "{:.10f}".format(solution).rstrip("0") + return solution -def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")): """Evaluate a Javascript expression for the challenge""" + if "/" in expr: + # split the expression into numerator and denominator subexpressions, + # evaluate them separately, + # and return their fraction-result num, _, denom = expr.partition("/") return evaluate_expression(num) / evaluate_expression(denom) - stack = [] - ranges = [] - value = "" - for index, char in enumerate(expr): - if char == "(": - stack.append(index+1) - elif char == ")": - begin = stack.pop() - if stack: - ranges.append((begin, index)) - for subexpr in [expr[begin:end] for begin, end in ranges] or (expr,): - num = 0 - for part in subexpr.split("[]"): - num += expression_values[part] - value += 
str(num) - return int(value) + # iterate over all subexpressions, + # evaluate them, + # and accumulate their values in 'result' + result = "" + for subexpr in split_re.findall(expr): + result += str(sum( + expression_values[part] + for part in subexpr.split("[]") + )) + return int(result) operator_functions = { diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index bf461ae2..b590485f 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -12,7 +12,7 @@ import time import mimetypes from requests.exceptions import ConnectionError, Timeout from .common import DownloaderBase -from .. import util, exception +from .. import text, exception class Downloader(DownloaderBase): @@ -28,7 +28,7 @@ class Downloader(DownloaderBase): self.chunk_size = 16384 if self.rate: - self.rate = util.parse_bytes(self.rate) + self.rate = text.parse_bytes(self.rate) if not self.rate: self.log.warning("Invalid rate limit specified") elif self.rate < self.chunk_size: @@ -61,7 +61,7 @@ class Downloader(DownloaderBase): else: self.response.raise_for_status() - return offset, util.safe_int(size) + return offset, text.parse_int(size) def receive(self, file): if self.rate: diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index 39e6fe6b..80ed0295 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -73,7 +73,7 @@ class ArtstationExtractor(Extractor): def get_user_info(self, username): """Return metadata for a specific user""" url = "{}/users/{}/quick.json".format(self.root, username.lower()) - response = self.request(url, fatal=False, allow_empty=True) + response = self.request(url, fatal=False) if response.status_code == 404: raise exception.NotFoundError("user") return response.json() @@ -158,7 +158,7 @@ class ArtstationAlbumExtractor(ArtstationExtractor): def __init__(self, match): ArtstationExtractor.__init__(self, match) - self.album_id = util.safe_int(match.group(2)) + 
self.album_id = text.parse_int(match.group(2)) def metadata(self): userinfo = self.get_user_info(self.user) @@ -256,7 +256,7 @@ class ArtstationChallengeExtractor(ArtstationExtractor): def _id_from_url(url): """Get an image's submission ID from its URL""" parts = url.split("/") - return util.safe_int("".join(parts[7:10])) + return text.parse_int("".join(parts[7:10])) class ArtstationSearchExtractor(ArtstationExtractor): diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index aaddbaac..0113f62c 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -10,7 +10,6 @@ from .common import SharedConfigExtractor, Message from .. import text -from urllib.parse import urljoin from xml.etree import ElementTree import datetime import operator @@ -52,7 +51,7 @@ class BooruExtractor(SharedConfigExtractor): try: url = image["file_url"] if url.startswith("/"): - url = urljoin(self.api_url, url) + url = text.urljoin(self.api_url, url) image.update(data) yield Message.Url, url, text.nameext_from_url(url, image) except KeyError: diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 8e42c32c..72a53369 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -52,34 +52,34 @@ class Extractor(): ("extractor", self.category, self.subcategory, key), default) def request(self, url, method="GET", encoding=None, fatal=True, retries=3, - allow_empty=False, *args, **kwargs): - max_retries = retries + *args, **kwargs): + max_tries = retries while True: try: - response = None response = self.session.request(method, url, *args, **kwargs) - if fatal: - response.raise_for_status() - if encoding: - response.encoding = encoding - if response.content or allow_empty: - return response - msg = "empty response body" - except requests.exceptions.HTTPError as exc: + except (requests.ConnectionError, requests.Timeout) as exc: msg = exc - code = response.status_code - if 400 <= code < 500 and code != 
429: # Client Error - retries = 0 except requests.exceptions.RequestException as exc: - msg = exc - if not retries: - raise exception.HttpError(msg) - if response and response.status_code == 429: # Too Many Requests - waittime = float(response.headers.get("Retry-After", 10.0)) + raise exception.HttpError(exc) else: - waittime = 1 + if 200 <= response.status_code < 400 or not fatal: + if encoding: + response.encoding = encoding + return response + + msg = "{} HTTP Error: {} for url: {}".format( + response.status_code, response.reason, url) + if response.status_code < 500 and response.status_code != 429: + break + + if not retries: + break + tries = max_tries - retries retries -= 1 - time.sleep(waittime * (max_retries - retries)) + self.log.debug("%s (%d/%d)", msg, tries, max_tries) + time.sleep(2 ** tries) + + raise exception.HttpError(msg) def _get_auth_info(self): """Return authentication information as (username, password) tuple""" diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 7f63bc3d..41641052 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -9,7 +9,7 @@ """Extract images from https://www.deviantart.com/""" from .common import Extractor, Message -from .. import text, util, exception +from .. 
import text, exception from ..cache import cache, memcache import itertools import datetime @@ -62,7 +62,7 @@ class DeviantartExtractor(Extractor): if "videos" in deviation: video = max(deviation["videos"], - key=lambda x: util.safe_int(x["quality"][:-1])) + key=lambda x: text.parse_int(x["quality"][:-1])) yield self.commit(deviation, video) if "flash" in deviation: diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index bd9107ac..d63ddc0a 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -9,7 +9,7 @@ """Extract manga-chapters from https://dynasty-scans.com/""" from .common import ChapterExtractor -from .. import text, util +from .. import text import re import json @@ -53,7 +53,7 @@ class DynastyscansChapterExtractor(ChapterExtractor): return { "manga": text.unescape(match.group(1)), - "chapter": util.safe_int(match.group(2)), + "chapter": text.parse_int(match.group(2)), "chapter_minor": match.group(3) or "", "title": text.unescape(match.group(4) or ""), "author": text.remove_html(author), diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 41eaeca1..4b07abcb 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -120,7 +120,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.key = {} self.count = 0 self.version, self.gid, self.token = match.groups() - self.gid = util.safe_int(self.gid) + self.gid = text.parse_int(self.gid) def items(self): self.login() @@ -163,8 +163,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["lang"] = util.language_to_code(data["language"]) data["title"] = text.unescape(data["title"]) data["title_jp"] = text.unescape(data["title_jp"]) - data["count"] = util.safe_int(data["count"]) - data["gallery_size"] = util.parse_bytes( + data["count"] = text.parse_int(data["count"]) + data["gallery_size"] = text.parse_bytes( data["gallery_size"].rstrip("Bb")) return data @@ -245,18 
+245,18 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): def _parse_image_info(url): parts = url.split("/")[4].split("-") return { - "width": util.safe_int(parts[2]), - "height": util.safe_int(parts[3]), - "size": util.safe_int(parts[1]), + "width": text.parse_int(parts[2]), + "height": text.parse_int(parts[3]), + "size": text.parse_int(parts[1]), } @staticmethod def _parse_original_info(info): parts = info.lstrip().split(" ") return { - "width": util.safe_int(parts[0]), - "height": util.safe_int(parts[2]), - "size": util.parse_bytes(parts[3] + parts[4][0]), + "width": text.parse_int(parts[0]), + "height": text.parse_int(parts[2]), + "size": text.parse_bytes(parts[3] + parts[4][0]), } @@ -274,7 +274,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): def __init__(self, match): ExhentaiExtractor.__init__(self) self.params = text.parse_query(match.group(1) or "") - self.params["page"] = util.safe_int(self.params.get("page")) + self.params["page"] = text.parse_int(self.params.get("page")) self.url = self.root def items(self): @@ -308,7 +308,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): return Message.Queue, url, { "type": gtype, "date": date, - "gallery_id": util.safe_int(parts[1]), + "gallery_id": text.parse_int(parts[1]), "gallery_token": parts[2], "title": text.unescape(title), key: last, diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py index a4ea6f58..3cd3f7a2 100644 --- a/gallery_dl/extractor/fallenangels.py +++ b/gallery_dl/extractor/fallenangels.py @@ -98,8 +98,8 @@ class FallenangelsMangaExtractor(MangaExtractor): chapter, dot, minor = chapter.partition(".") results.append((url, { "manga": manga, "title": title, - "volume": util.safe_int(volume), - "chapter": util.safe_int(chapter), + "volume": text.parse_int(volume), + "chapter": text.parse_int(chapter), "chapter_minor": dot + minor, "lang": self.lang, "language": language, })) diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py 
index 150b2a28..cea46b1d 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -9,7 +9,7 @@ """Extract images from https://www.flickr.com/""" from .common import Extractor, Message -from .. import text, util, exception +from .. import text, oauth, util, exception class FlickrExtractor(Extractor): @@ -243,7 +243,7 @@ class FlickrSearchExtractor(FlickrExtractor): return self.api.photos_search(self.search) -class FlickrAPI(): +class FlickrAPI(oauth.OAuth1API): """Minimal interface for the flickr API""" API_URL = "https://api.flickr.com/services/rest/" API_KEY = "ac4fd7aa98585b9eee1ba761c209de68" @@ -264,17 +264,7 @@ class FlickrAPI(): ] def __init__(self, extractor): - self.api_key = extractor.config("api-key", self.API_KEY) - self.api_secret = extractor.config("api-secret", self.API_SECRET) - token = extractor.config("access-token") - token_secret = extractor.config("access-token-secret") - if token and token_secret: - self.session = util.OAuthSession( - extractor.session, - self.api_key, self.api_secret, token, token_secret) - self.api_key = None - else: - self.session = extractor.session + oauth.OAuth1API.__init__(self, extractor) self.maxsize = extractor.config("size-max") if isinstance(self.maxsize, str): diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 117c3bdc..cf92b3cf 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -50,8 +50,8 @@ class FoolslideExtractor(SharedConfigExtractor): lang = info[1].partition("-")[0] data["lang"] = lang data["language"] = util.code_to_language(lang) - data["volume"] = util.safe_int(info[2]) - data["chapter"] = util.safe_int(info[3]) + data["volume"] = text.parse_int(info[2]) + data["chapter"] = text.parse_int(info[3]) data["chapter_minor"] = "." 
+ info[4] if len(info) >= 5 else "" return data @@ -75,7 +75,7 @@ class FoolslideChapterExtractor(FoolslideExtractor): imgs = self.get_images(page) data["count"] = len(imgs) - data["chapter_id"] = util.safe_int(imgs[0]["chapter_id"]) + data["chapter_id"] = text.parse_int(imgs[0]["chapter_id"]) yield Message.Version, 1 yield Message.Directory, data @@ -88,7 +88,7 @@ class FoolslideChapterExtractor(FoolslideExtractor): except KeyError: pass for key in ("height", "id", "size", "width"): - image[key] = util.safe_int(image[key]) + image[key] = text.parse_int(image[key]) data.update(image) text.nameext_from_url(data["filename"], data) yield Message.Url, url, data diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 33abdbd4..110160a6 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -37,7 +37,7 @@ class GelbooruExtractor(SharedConfigExtractor): if isinstance(post, str): post = self.get_post_data(post) for key in ("id", "width", "height", "score", "change"): - post[key] = util.safe_int(post[key]) + post[key] = text.parse_int(post[key]) url = post["file_url"] post.update(data) yield Message.Url, url, text.nameext_from_url(url, post) @@ -174,7 +174,7 @@ class GelbooruPoolExtractor(GelbooruExtractor): raise exception.NotFoundError("pool") return { - "pool": util.safe_int(self.pool_id), + "pool": text.parse_int(self.pool_id), "pool_name": text.unescape(name), "count": len(self.posts), } diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py index 03232799..f1f04ed2 100644 --- a/gallery_dl/extractor/hbrowse.py +++ b/gallery_dl/extractor/hbrowse.py @@ -9,8 +9,7 @@ """Extract images from http://www.hbrowse.com/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, util -from urllib.parse import urljoin +from .. 
import text import json @@ -30,7 +29,7 @@ class HbrowseExtractor(): ), values=data) data["manga"] = text.unescape(data["manga"]) - data["total"] = util.safe_int(data["total"]) + data["total"] = text.parse_int(data["total"]) data["artist"] = text.remove_html(data["artist"]) data["origin"] = text.remove_html(data["origin"]) return data @@ -48,7 +47,7 @@ class HbrowseMangaExtractor(HbrowseExtractor, MangaExtractor): def chapters(self, page): results = [] data = self.parse_page(page, { - "manga_id": util.safe_int( + "manga_id": text.parse_int( self.url.rstrip("/").rpartition("/")[2]) }) @@ -59,9 +58,9 @@ class HbrowseMangaExtractor(HbrowseExtractor, MangaExtractor): if not url: return results title, pos = text.extract(page, '>View ', '<', pos) - data["chapter"] = util.safe_int(url.rpartition("/")[2][1:]) + data["chapter"] = text.parse_int(url.rpartition("/")[2][1:]) data["title"] = title - results.append((urljoin(self.root, url), data.copy())) + results.append((text.urljoin(self.root, url), data.copy())) class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor): @@ -84,8 +83,8 @@ class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor): def get_metadata(self, page): return self.parse_page(page, { - "manga_id": util.safe_int(self.gid), - "chapter": util.safe_int(self.chapter) + "manga_id": text.parse_int(self.gid), + "chapter": text.parse_int(self.chapter) }) def get_images(self, page): diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py index 6d2cd75f..34a7749c 100644 --- a/gallery_dl/extractor/hentai2read.py +++ b/gallery_dl/extractor/hentai2read.py @@ -9,7 +9,7 @@ """Extract hentai-manga from https://hentai2read.com/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, util +from .. 
import text import re import json @@ -36,7 +36,8 @@ class Hentai2readMangaExtractor(MangaExtractor): page, '', '') mtype, pos = text.extract( page, '[', ']', pos) - manga_id = util.safe_int(text.extract(page, 'data-mid="', '"', pos)[0]) + manga_id = text.parse_int(text.extract( + page, 'data-mid="', '"', pos)[0]) while True: chapter_id, pos = text.extract(page, ' data-cid="', '"', pos) @@ -49,8 +50,8 @@ class Hentai2readMangaExtractor(MangaExtractor): chapter, _, title = text.unescape(chapter).strip().partition(" - ") results.append((url, { "manga_id": manga_id, "manga": manga, "type": mtype, - "chapter_id": util.safe_int(chapter_id), - "chapter": util.safe_int(chapter), + "chapter_id": text.parse_int(chapter_id), + "chapter": text.parse_int(chapter), "title": title, "lang": "en", "language": "English", })) @@ -78,9 +79,9 @@ class Hentai2readChapterExtractor(ChapterExtractor): r"(\d+): (.+) . Page 1 ", title) return { "manga": match.group(1), - "manga_id": util.safe_int(manga_id), - "chapter": util.safe_int(self.chapter), - "chapter_id": util.safe_int(chapter_id), + "manga_id": text.parse_int(manga_id), + "chapter": text.parse_int(self.chapter), + "chapter_id": text.parse_int(chapter_id), "type": match.group(2), "author": match.group(3), "title": match.group(5), diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 63f37274..23713fc0 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -10,7 +10,6 @@ from .common import Extractor, Message from .. 
import text, util, exception -from urllib.parse import urljoin class HentaifoundryExtractor(Extractor): @@ -47,13 +46,13 @@ class HentaifoundryExtractor(Extractor): def get_image_metadata(self, url): """Collect metadata for an image""" - page = self.request(urljoin(self.root, url)).text + page = self.request(text.urljoin(self.root, url)).text index = url.rsplit("/", 2)[1] title, pos = text.extract( page, 'Pictures » ', '<') part, pos = text.extract( page, '//pictures.hentai-foundry.com', '"', pos) - data = {"index": util.safe_int(index), "title": text.unescape(title)} + data = {"index": text.parse_int(index), "title": text.unescape(title)} text.nameext_from_url(part, data) return "https://pictures.hentai-foundry.com" + part, data @@ -77,7 +76,7 @@ class HentaifoundryUserExtractor(HentaifoundryExtractor): def __init__(self, match): HentaifoundryExtractor.__init__(self, match.group(1) or match.group(3)) - self.start_page = util.safe_int(match.group(2), 1) + self.start_page = text.parse_int(match.group(2), 1) self._skipped = (self.start_page - 1) * self.per_page def skip(self, num): @@ -104,7 +103,7 @@ class HentaifoundryUserExtractor(HentaifoundryExtractor): if response.status_code == 404: raise exception.NotFoundError("user") - count = util.safe_int(text.extract( + count = text.parse_int(text.extract( response.text, 'class="active" >Pictures (', ')')[0]) if self._skipped >= count: raise exception.StopExtraction() @@ -142,7 +141,7 @@ class HentaifoundryUserExtractor(HentaifoundryExtractor): "filter_type": 0, } self.request("https://www.hentai-foundry.com/site/filters", - method="post", data=formdata, allow_empty=True) + method="post", data=formdata) class HentaifoundryImageExtractor(HentaifoundryExtractor): @@ -171,4 +170,4 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor): return ("{}/{}?enterAgree=1".format(self.artist_url, self.index),) def get_job_metadata(self): - return {"artist": self.artist, "index": util.safe_int(self.index)} + return {"artist": 
self.artist, "index": text.parse_int(self.index)} diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py index d1c5c932..50150cb8 100644 --- a/gallery_dl/extractor/hentaihere.py +++ b/gallery_dl/extractor/hentaihere.py @@ -9,7 +9,7 @@ """Extract hentai-manga from https://hentaihere.com/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, util +from .. import text import re import json @@ -32,7 +32,7 @@ class HentaihereMangaExtractor(MangaExtractor): def chapters(self, page): results = [] - manga_id = util.safe_int( + manga_id = text.parse_int( self.url.rstrip("/").rpartition("/")[2][1:]) manga, pos = text.extract( page, '', '') @@ -50,8 +50,8 @@ class HentaihereMangaExtractor(MangaExtractor): chapter, _, title = text.unescape(chapter).strip().partition(" - ") results.append((url, { "manga_id": manga_id, "manga": manga, "type": mtype, - "chapter_id": util.safe_int(chapter_id), - "chapter": util.safe_int(chapter), + "chapter_id": text.parse_int(chapter_id), + "chapter": text.parse_int(chapter), "title": title, "lang": "en", "language": "English", })) @@ -79,9 +79,9 @@ class HentaihereChapterExtractor(ChapterExtractor): match = re.match(pattern, title) return { "manga": match.group(1), - "manga_id": util.safe_int(self.manga_id), - "chapter": util.safe_int(self.chapter), - "chapter_id": util.safe_int(chapter_id), + "manga_id": text.parse_int(self.manga_id), + "chapter": text.parse_int(self.chapter), + "chapter_id": text.parse_int(chapter_id), "type": match.group(2), "title": match.group(3), "author": match.group(4), diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index a7014fd7..7048021e 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -30,7 +30,7 @@ class HitomiGalleryExtractor(ChapterExtractor): ] def __init__(self, match): - self.gid = util.safe_int(match.group(1)) + self.gid = text.parse_int(match.group(1)) url = 
"https://hitomi.la/galleries/{}.html".format(self.gid) ChapterExtractor.__init__(self, url) diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 97d8cb68..75f2e623 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -9,7 +9,7 @@ """Extract images from http://imagefap.com/""" from .common import Extractor, Message -from .. import text, util +from .. import text import json @@ -159,7 +159,7 @@ class ImagefapUserExtractor(ImagefapExtractor): yield Message.Version, 1 for gid, name in self.get_gallery_data(): url = "http://www.imagefap.com/gallery/" + gid - data = {"gallery_id": util.safe_int(gid), "title": name} + data = {"gallery_id": text.parse_int(gid), "title": name} yield Message.Queue, url, data def get_gallery_data(self): diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index d3d07826..1b029d33 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -12,7 +12,6 @@ from .common import Extractor, Message from .. import text, exception from ..cache import memcache from os.path import splitext -from urllib.parse import urljoin class ImagehostImageExtractor(Extractor): @@ -142,8 +141,7 @@ class ImagevenueImageExtractor(ImagehostImageExtractor): def get_info(self, page): url = text.extract(page, 'SRC="', '"')[0] - url = urljoin(self.url, url) - return url, url + return text.urljoin(self.url, url), url class ImagetwistImageExtractor(ImagehostImageExtractor): diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index ddbe56c3..cde65827 100644 --- a/gallery_dl/extractor/khinsider.py +++ b/gallery_dl/extractor/khinsider.py @@ -10,7 +10,6 @@ from .common import AsynchronousExtractor, Message from .. 
import text, exception -from urllib.parse import urljoin class KhinsiderSoundtrackExtractor(AsynchronousExtractor): @@ -63,7 +62,8 @@ class KhinsiderSoundtrackExtractor(AsynchronousExtractor): page = text.extract(page, '', '
')[0] for num, url in enumerate(text.extract_iter( page, '', '