diff --git a/README.rst b/README.rst
index cbdfdfc6..4302e457 100644
--- a/README.rst
+++ b/README.rst
@@ -194,7 +194,7 @@ OAuth
-----
*gallery-dl* supports user authentication via OAuth_ for
-``deviantart``, ``flickr``, ``reddit`` and ``tumblr``.
+``deviantart``, ``flickr``, ``reddit``, ``smugmug`` and ``tumblr``.
This is entirely optional, but grants *gallery-dl* the ability
to issue requests on your account's behalf and enables it to access resources
which would otherwise be unavailable to a public user.
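
Linking an account is done the same way as for the other modules: assuming
SmugMug follows the established ``oauth:<site>`` pattern, running

    gallery-dl oauth:smugmug

starts an interactive authorization flow and prints the tokens to store in
your configuration file.
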
diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf
index 16445e2e..a3ca7f63 100644
--- a/docs/gallery-dl-example.conf
+++ b/docs/gallery-dl-example.conf
@@ -100,7 +100,21 @@
"output":
{
"mode": "terminal",
- "logfile": "~/gallery-dl/log.txt"
+ "log": {
+ "format": "{name}: {message}",
+ "level": "info"
+ },
+ "logfile": {
+ "path": "~/gallery-dl/log.txt",
+ "mode": "w",
+ "level": "debug"
+ },
+ "unsupportedfile": {
+ "path": "~/gallery-dl/unsupported.txt",
+ "mode": "a",
+ "format": "{asctime} {message}",
+ "format-date": "%Y-%m-%d-%H-%M-%S"
+ }
},
"cache": {
diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
index 3a7a8fcf..cdf5c697 100644
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -68,7 +68,7 @@ Sea Otter Scans https://reader.seaotterscans.com/ Chapters, Manga
Sen Manga http://raw.senmanga.com/ Chapters
Sense-Scans http://sensescans.com/ Chapters, Manga
SlideShare https://www.slideshare.net/ Presentations
-SmugMug https://www.smugmug.com/ |Albums, individ-5|
+SmugMug https://www.smugmug.com/ |Albums, individ-5| Optional (OAuth)
Subapics https://subapics.com/ Chapters, Manga
The /b/ Archive https://thebarchive.com/ Threads
Tumblr https://www.tumblr.com/ Images from Users, Likes, Posts, Tag-Searches Optional (OAuth)
@@ -88,8 +88,8 @@ Turboimagehost https://turboimagehost.com/ individual Images
==================== =================================== ================================================== ================
.. |Images from Use-0| replace:: Images from Users, Albums, Challenges, individual Images, Likes, Search Results
-.. |Collections, De-1| replace:: Collections, Deviations, Favorites, Folders, Galleries, Journals
+.. |Collections, De-1| replace:: Collections, Deviations, Favorites, Folders, Galleries, Journals, Popular Images
.. |Images from Use-2| replace:: Images from Users, Albums, Favorites, Galleries, Groups, individual Images, Search Results
.. |Images from Use-3| replace:: Images from Users, Doujin, Favorites, individual Images
-.. |Images from Use-4| replace:: Images from Users, Bookmarks, Favorites, pixiv.me Links, Rankings, Individual Images
+.. |Images from Use-4| replace:: Images from Users, Bookmarks, Favorites, Follows, pixiv.me Links, Rankings, Search Results, Individual Images
.. |Albums, individ-5| replace:: Albums, individual Images, Images from Users and Folders
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 8e62f9c7..e698de62 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -27,20 +27,72 @@ from . import version, config, option, extractor, job, util, exception
__version__ = version.__version__
log = logging.getLogger("gallery-dl")
-def initialize_logging(loglevel, formatter):
+LOG_FORMAT = "[{name}][{levelname}] {message}"
+LOG_FORMAT_DATE = "%Y-%m-%d %H:%M:%S"
+LOG_LEVEL = logging.INFO
+
+def initialize_logging(loglevel):
"""Setup basic logging functionality before configfiles have been loaded"""
# convert levelnames to lowercase
for level in (10, 20, 30, 40, 50):
name = logging.getLevelName(level)
logging.addLevelName(level, name.lower())
# setup basic logging to stderr
+ formatter = logging.Formatter(LOG_FORMAT, LOG_FORMAT_DATE, "{")
handler = logging.StreamHandler()
handler.setFormatter(formatter)
+ handler.setLevel(loglevel)
root = logging.getLogger()
- root.setLevel(loglevel)
+ root.setLevel(logging.NOTSET)
root.addHandler(handler)
+def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL):
+ """Setup a new logging handler"""
+ opts = config.interpolate(("output", key))
+ if not opts:
+ return None
+ if isinstance(opts, str):
+ opts = {"path": opts}
+
+ path = opts.get("path")
+ mode = opts.get("mode", "w")
+ try:
+ path = util.expand_path(path)
+ handler = logging.FileHandler(path, mode)
+ except (OSError, ValueError) as exc:
+ log.warning("%s: %s", key, exc)
+ return None
+ except TypeError as exc:
+ log.warning("%s: missing or invalid path (%s)", key, exc)
+ return None
+
+ level = opts.get("level", lvl)
+ logfmt = opts.get("format", fmt)
+ datefmt = opts.get("format-date", LOG_FORMAT_DATE)
+ formatter = logging.Formatter(logfmt, datefmt, "{")
+ handler.setFormatter(formatter)
+ handler.setLevel(level)
+
+ return handler
+
+
+def configure_logging_handler(key, handler):
+ """Configure a logging handler"""
+ opts = config.interpolate(("output", key))
+ if not opts:
+ return
+ if isinstance(opts, str):
+ opts = {"format": opts}
+ if handler.level == LOG_LEVEL and "level" in opts:
+ handler.setLevel(opts["level"])
+ if "format" in opts or "format-date" in opts:
+ logfmt = opts.get("format", LOG_FORMAT)
+ datefmt = opts.get("format-date", LOG_FORMAT_DATE)
+ formatter = logging.Formatter(logfmt, datefmt, "{")
+ handler.setFormatter(formatter)
+
+
def replace_std_streams(errors="replace"):
"""Replace standard streams and set their error handlers to 'errors'"""
for name in ("stdout", "stdin", "stderr"):
@@ -159,8 +211,7 @@ def main():
parser = option.build_parser()
args = parser.parse_args()
- formatter = logging.Formatter("[%(name)s][%(levelname)s] %(message)s")
- initialize_logging(args.loglevel, formatter)
+ initialize_logging(args.loglevel)
# configuration
if args.load_config:
@@ -173,17 +224,13 @@ def main():
config.set(key, value)
config.set(("_",), {})
- # logfile
- logfile = config.interpolate(("output", "logfile"))
- if logfile:
- try:
- path = util.expand_path(logfile)
- handler = logging.FileHandler(path, "w")
- except OSError as exc:
- log.warning("log file: %s", exc)
- else:
- handler.setFormatter(formatter)
- logging.getLogger().addHandler(handler)
+ # stream logging handler
+ configure_logging_handler("log", logging.getLogger().handlers[0])
+
+ # file logging handler
+ handler = setup_logging_handler("logfile", lvl=args.loglevel)
+ if handler:
+ logging.getLogger().addHandler(handler)
# loglevels
if args.loglevel >= logging.ERROR:
@@ -243,13 +290,13 @@ def main():
except OSError as exc:
log.warning("input file: %s", exc)
- unsupportedfile = config.interpolate(("output", "unsupportedfile"))
- if unsupportedfile:
- try:
- path = util.expand_path(unsupportedfile)
- job.Job.ufile = open(path, "w")
- except OSError as exc:
- log.warning("unsupported-URL file: %s", exc)
+ # unsupported file logging handler
+ handler = setup_logging_handler("unsupportedfile", fmt="{message}")
+ if handler:
+ ulog = logging.getLogger("unsupported")
+ ulog.addHandler(handler)
+ ulog.propagate = False
+ job.Job.ulog = ulog
prepare_range(args.image_range, "image")
prepare_range(args.chapter_range, "chapter")
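
A minimal sketch of the handler layout this refactor produces at runtime;
filenames and levels are illustrative, taken from the example config above:

    import logging

    root = logging.getLogger()
    root.setLevel(logging.NOTSET)        # handlers do the filtering now

    stderr = logging.StreamHandler()     # tweaked via ("output", "log")
    stderr.setLevel(logging.INFO)
    root.addHandler(stderr)

    logfile = logging.FileHandler("log.txt", "w")   # ("output", "logfile")
    logfile.setLevel(logging.DEBUG)
    root.addHandler(logfile)

    ulog = logging.getLogger("unsupported")  # ("output", "unsupportedfile")
    ulog.addHandler(logging.FileHandler("unsupported.txt", "a"))
    ulog.propagate = False               # keep these out of stderr/log.txt
    ulog.info("https://example.org/some/unsupported/url")
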
diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py
index 53cc58e8..414a461d 100644
--- a/gallery_dl/cloudflare.py
+++ b/gallery_dl/cloudflare.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,6 +8,7 @@
"""Methods to access sites behind Cloudflare protection"""
+import re
import time
import operator
import urllib.parse
@@ -30,6 +31,7 @@ def request_func(self, *args, **kwargs):
def solve_challenge(session, response):
+
session.headers["Referer"] = response.url
page = response.text
params = text.extract_all(page, (
@@ -37,58 +39,74 @@ def solve_challenge(session, response):
('pass' , 'name="pass" value="', '"'),
))[0]
params["jschl_answer"] = solve_jschl(response.url, page)
+
time.sleep(4)
- url = urllib.parse.urljoin(response.url, "/cdn-cgi/l/chk_jschl")
+ url = text.urljoin(response.url, "/cdn-cgi/l/chk_jschl")
return session.get(url, params=params)
def solve_jschl(url, page):
"""Solve challenge to get 'jschl_answer' value"""
+
+ # build variable name
+ # e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk
data, pos = text.extract_all(page, (
('var' , ',f, ', '='),
('key' , '"', '"'),
('expr', ':', '}'),
))
- solution = evaluate_expression(data["expr"])
variable = "{}.{}".format(data["var"], data["key"])
vlength = len(variable)
+
+ # evaluate the initial expression
+ solution = evaluate_expression(data["expr"])
+
+ # iterate over all remaining expressions
+ # and combine their values in 'solution'
expressions = text.extract(
- page, "'challenge-form');", "f.submit();", pos
- )[0]
+ page, "'challenge-form');", "f.submit();", pos)[0]
for expr in expressions.split(";")[1:]:
+
if expr.startswith(variable):
+ # select arithmetic function based on operator (+, -, *)
func = operator_functions[expr[vlength]]
+ # evaluate the rest of the expression
value = evaluate_expression(expr[vlength+2:])
+ # combine the expression value with our current solution
solution = func(solution, value)
+
elif expr.startswith("a.value"):
+ # add length of the hostname, i.e. add 11 for 'example.org'
solution += len(urllib.parse.urlsplit(url).netloc)
+
if ".toFixed(" in expr:
+ # trim the solution to 10 decimal places
+ # and strip trailing zeros
solution = "{:.10f}".format(solution).rstrip("0")
+
return solution
-def evaluate_expression(expr):
+def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")):
"""Evaluate a Javascript expression for the challenge"""
+
if "/" in expr:
+ # split the expression into numerator and denominator subexpressions,
+ # evaluate them separately,
+ # and return their quotient
num, _, denom = expr.partition("/")
return evaluate_expression(num) / evaluate_expression(denom)
- stack = []
- ranges = []
- value = ""
- for index, char in enumerate(expr):
- if char == "(":
- stack.append(index+1)
- elif char == ")":
- begin = stack.pop()
- if stack:
- ranges.append((begin, index))
- for subexpr in [expr[begin:end] for begin, end in ranges] or (expr,):
- num = 0
- for part in subexpr.split("[]"):
- num += expression_values[part]
- value += str(num)
- return int(value)
+ # iterate over all subexpressions,
+ # evaluate them,
+ # and accumulate their values in 'result'
+ result = ""
+ for subexpr in split_re.findall(expr):
+ result += str(sum(
+ expression_values[part]
+ for part in subexpr.split("[]")
+ ))
+ return int(result)
operator_functions = {
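
A worked example of how solve_jschl() assembles the final answer, with
made-up expression values (the real ones come from the expression_values
table):

    solution = 2.5                    # initial expression
    solution = solution * 3           # 'wqnVscP.DERKbJk*=(...)'
    solution = solution + 1.25        # 'wqnVscP.DERKbJk+=(...)'
    solution += len("example.org")    # 'a.value = ... + t.length' -> +11
    # '.toFixed(10)' equivalent: 10 decimal places, trailing zeros stripped
    print("{:.10f}".format(solution).rstrip("0"))   # '19.75'
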
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index bf461ae2..b590485f 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -12,7 +12,7 @@ import time
import mimetypes
from requests.exceptions import ConnectionError, Timeout
from .common import DownloaderBase
-from .. import util, exception
+from .. import text, exception
class Downloader(DownloaderBase):
@@ -28,7 +28,7 @@ class Downloader(DownloaderBase):
self.chunk_size = 16384
if self.rate:
- self.rate = util.parse_bytes(self.rate)
+ self.rate = text.parse_bytes(self.rate)
if not self.rate:
self.log.warning("Invalid rate limit specified")
elif self.rate < self.chunk_size:
@@ -61,7 +61,7 @@ class Downloader(DownloaderBase):
else:
self.response.raise_for_status()
- return offset, util.safe_int(size)
+ return offset, text.parse_int(size)
def receive(self, file):
if self.rate:
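
The renamed helper keeps its semantics; a hedged sketch of how a configured
rate limit is applied, with the option value assumed for illustration:

    from gallery_dl import text

    rate = text.parse_bytes("500k")   # e.g. "rate": "500k" in the config
    chunk_size = 16384
    if not rate:
        print("Invalid rate limit specified")
    elif rate < chunk_size:
        chunk_size = rate   # never read more per iteration than the limit
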
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index 39e6fe6b..80ed0295 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -73,7 +73,7 @@ class ArtstationExtractor(Extractor):
def get_user_info(self, username):
"""Return metadata for a specific user"""
url = "{}/users/{}/quick.json".format(self.root, username.lower())
- response = self.request(url, fatal=False, allow_empty=True)
+ response = self.request(url, fatal=False)
if response.status_code == 404:
raise exception.NotFoundError("user")
return response.json()
@@ -158,7 +158,7 @@ class ArtstationAlbumExtractor(ArtstationExtractor):
def __init__(self, match):
ArtstationExtractor.__init__(self, match)
- self.album_id = util.safe_int(match.group(2))
+ self.album_id = text.parse_int(match.group(2))
def metadata(self):
userinfo = self.get_user_info(self.user)
@@ -256,7 +256,7 @@ class ArtstationChallengeExtractor(ArtstationExtractor):
def _id_from_url(url):
"""Get an image's submission ID from its URL"""
parts = url.split("/")
- return util.safe_int("".join(parts[7:10]))
+ return text.parse_int("".join(parts[7:10]))
class ArtstationSearchExtractor(ArtstationExtractor):
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index aaddbaac..0113f62c 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -10,7 +10,6 @@
from .common import SharedConfigExtractor, Message
from .. import text
-from urllib.parse import urljoin
from xml.etree import ElementTree
import datetime
import operator
@@ -52,7 +51,7 @@ class BooruExtractor(SharedConfigExtractor):
try:
url = image["file_url"]
if url.startswith("/"):
- url = urljoin(self.api_url, url)
+ url = text.urljoin(self.api_url, url)
image.update(data)
yield Message.Url, url, text.nameext_from_url(url, image)
except KeyError:
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 8e42c32c..72a53369 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -52,34 +52,34 @@ class Extractor():
("extractor", self.category, self.subcategory, key), default)
def request(self, url, method="GET", encoding=None, fatal=True, retries=3,
- allow_empty=False, *args, **kwargs):
- max_retries = retries
+ *args, **kwargs):
+ max_tries = retries
while True:
try:
- response = None
response = self.session.request(method, url, *args, **kwargs)
- if fatal:
- response.raise_for_status()
- if encoding:
- response.encoding = encoding
- if response.content or allow_empty:
- return response
- msg = "empty response body"
- except requests.exceptions.HTTPError as exc:
+ except (requests.ConnectionError, requests.Timeout) as exc:
msg = exc
- code = response.status_code
- if 400 <= code < 500 and code != 429: # Client Error
- retries = 0
except requests.exceptions.RequestException as exc:
- msg = exc
- if not retries:
- raise exception.HttpError(msg)
- if response and response.status_code == 429: # Too Many Requests
- waittime = float(response.headers.get("Retry-After", 10.0))
+ raise exception.HttpError(exc)
else:
- waittime = 1
+ if 200 <= response.status_code < 400 or not fatal:
+ if encoding:
+ response.encoding = encoding
+ return response
+
+ msg = "{} HTTP Error: {} for url: {}".format(
+ response.status_code, response.reason, url)
+ if response.status_code < 500 and response.status_code != 429:
+ break
+
+ if not retries:
+ break
+ tries = max_tries - retries
retries -= 1
- time.sleep(waittime * (max_retries - retries))
+ self.log.debug("%s (%d/%d)", msg, tries, max_tries)
+ time.sleep(2 ** tries)
+
+ raise exception.HttpError(msg)
def _get_auth_info(self):
"""Return authentication information as (username, password) tuple"""
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 7f63bc3d..41641052 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -9,7 +9,7 @@
"""Extract images from https://www.deviantart.com/"""
from .common import Extractor, Message
-from .. import text, util, exception
+from .. import text, exception
from ..cache import cache, memcache
import itertools
import datetime
@@ -62,7 +62,7 @@ class DeviantartExtractor(Extractor):
if "videos" in deviation:
video = max(deviation["videos"],
- key=lambda x: util.safe_int(x["quality"][:-1]))
+ key=lambda x: text.parse_int(x["quality"][:-1]))
yield self.commit(deviation, video)
if "flash" in deviation:
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index bd9107ac..d63ddc0a 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -9,7 +9,7 @@
"""Extract manga-chapters from https://dynasty-scans.com/"""
from .common import ChapterExtractor
-from .. import text, util
+from .. import text
import re
import json
@@ -53,7 +53,7 @@ class DynastyscansChapterExtractor(ChapterExtractor):
return {
"manga": text.unescape(match.group(1)),
- "chapter": util.safe_int(match.group(2)),
+ "chapter": text.parse_int(match.group(2)),
"chapter_minor": match.group(3) or "",
"title": text.unescape(match.group(4) or ""),
"author": text.remove_html(author),
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 41eaeca1..4b07abcb 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -120,7 +120,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
self.key = {}
self.count = 0
self.version, self.gid, self.token = match.groups()
- self.gid = util.safe_int(self.gid)
+ self.gid = text.parse_int(self.gid)
def items(self):
self.login()
@@ -163,8 +163,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["lang"] = util.language_to_code(data["language"])
data["title"] = text.unescape(data["title"])
data["title_jp"] = text.unescape(data["title_jp"])
- data["count"] = util.safe_int(data["count"])
- data["gallery_size"] = util.parse_bytes(
+ data["count"] = text.parse_int(data["count"])
+ data["gallery_size"] = text.parse_bytes(
data["gallery_size"].rstrip("Bb"))
return data
@@ -245,18 +245,18 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def _parse_image_info(url):
parts = url.split("/")[4].split("-")
return {
- "width": util.safe_int(parts[2]),
- "height": util.safe_int(parts[3]),
- "size": util.safe_int(parts[1]),
+ "width": text.parse_int(parts[2]),
+ "height": text.parse_int(parts[3]),
+ "size": text.parse_int(parts[1]),
}
@staticmethod
def _parse_original_info(info):
parts = info.lstrip().split(" ")
return {
- "width": util.safe_int(parts[0]),
- "height": util.safe_int(parts[2]),
- "size": util.parse_bytes(parts[3] + parts[4][0]),
+ "width": text.parse_int(parts[0]),
+ "height": text.parse_int(parts[2]),
+ "size": text.parse_bytes(parts[3] + parts[4][0]),
}
@@ -274,7 +274,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
def __init__(self, match):
ExhentaiExtractor.__init__(self)
self.params = text.parse_query(match.group(1) or "")
- self.params["page"] = util.safe_int(self.params.get("page"))
+ self.params["page"] = text.parse_int(self.params.get("page"))
self.url = self.root
def items(self):
@@ -308,7 +308,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
return Message.Queue, url, {
"type": gtype,
"date": date,
- "gallery_id": util.safe_int(parts[1]),
+ "gallery_id": text.parse_int(parts[1]),
"gallery_token": parts[2],
"title": text.unescape(title),
key: last,
diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py
index a4ea6f58..3cd3f7a2 100644
--- a/gallery_dl/extractor/fallenangels.py
+++ b/gallery_dl/extractor/fallenangels.py
@@ -98,8 +98,8 @@ class FallenangelsMangaExtractor(MangaExtractor):
chapter, dot, minor = chapter.partition(".")
results.append((url, {
"manga": manga, "title": title,
- "volume": util.safe_int(volume),
- "chapter": util.safe_int(chapter),
+ "volume": text.parse_int(volume),
+ "chapter": text.parse_int(chapter),
"chapter_minor": dot + minor,
"lang": self.lang, "language": language,
}))
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index 150b2a28..cea46b1d 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -9,7 +9,7 @@
"""Extract images from https://www.flickr.com/"""
from .common import Extractor, Message
-from .. import text, util, exception
+from .. import text, oauth, util, exception
class FlickrExtractor(Extractor):
@@ -243,7 +243,7 @@ class FlickrSearchExtractor(FlickrExtractor):
return self.api.photos_search(self.search)
-class FlickrAPI():
+class FlickrAPI(oauth.OAuth1API):
"""Minimal interface for the flickr API"""
API_URL = "https://api.flickr.com/services/rest/"
API_KEY = "ac4fd7aa98585b9eee1ba761c209de68"
@@ -264,17 +264,7 @@ class FlickrAPI():
]
def __init__(self, extractor):
- self.api_key = extractor.config("api-key", self.API_KEY)
- self.api_secret = extractor.config("api-secret", self.API_SECRET)
- token = extractor.config("access-token")
- token_secret = extractor.config("access-token-secret")
- if token and token_secret:
- self.session = util.OAuthSession(
- extractor.session,
- self.api_key, self.api_secret, token, token_secret)
- self.api_key = None
- else:
- self.session = extractor.session
+ oauth.OAuth1API.__init__(self, extractor)
self.maxsize = extractor.config("size-max")
if isinstance(self.maxsize, str):
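
The deleted block is what the shared base class now centralizes; a sketch of
what oauth.OAuth1API.__init__ presumably does, reconstructed from the removed
lines rather than from the actual class body:

    from gallery_dl import util

    class OAuth1API:
        def __init__(self, extractor):
            api_key = extractor.config("api-key", self.API_KEY)
            api_secret = extractor.config("api-secret", self.API_SECRET)
            token = extractor.config("access-token")
            token_secret = extractor.config("access-token-secret")
            if token and token_secret:
                # sign all requests on behalf of the authenticated user
                self.session = util.OAuthSession(
                    extractor.session, api_key, api_secret,
                    token, token_secret)
                self.api_key = None
            else:
                # anonymous, key-only access
                self.session = extractor.session
                self.api_key = api_key
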
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index 117c3bdc..cf92b3cf 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -50,8 +50,8 @@ class FoolslideExtractor(SharedConfigExtractor):
lang = info[1].partition("-")[0]
data["lang"] = lang
data["language"] = util.code_to_language(lang)
- data["volume"] = util.safe_int(info[2])
- data["chapter"] = util.safe_int(info[3])
+ data["volume"] = text.parse_int(info[2])
+ data["chapter"] = text.parse_int(info[3])
data["chapter_minor"] = "." + info[4] if len(info) >= 5 else ""
return data
@@ -75,7 +75,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
imgs = self.get_images(page)
data["count"] = len(imgs)
- data["chapter_id"] = util.safe_int(imgs[0]["chapter_id"])
+ data["chapter_id"] = text.parse_int(imgs[0]["chapter_id"])
yield Message.Version, 1
yield Message.Directory, data
@@ -88,7 +88,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
except KeyError:
pass
for key in ("height", "id", "size", "width"):
- image[key] = util.safe_int(image[key])
+ image[key] = text.parse_int(image[key])
data.update(image)
text.nameext_from_url(data["filename"], data)
yield Message.Url, url, data
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 33abdbd4..110160a6 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -37,7 +37,7 @@ class GelbooruExtractor(SharedConfigExtractor):
if isinstance(post, str):
post = self.get_post_data(post)
for key in ("id", "width", "height", "score", "change"):
- post[key] = util.safe_int(post[key])
+ post[key] = text.parse_int(post[key])
url = post["file_url"]
post.update(data)
yield Message.Url, url, text.nameext_from_url(url, post)
@@ -174,7 +174,7 @@ class GelbooruPoolExtractor(GelbooruExtractor):
raise exception.NotFoundError("pool")
return {
- "pool": util.safe_int(self.pool_id),
+ "pool": text.parse_int(self.pool_id),
"pool_name": text.unescape(name),
"count": len(self.posts),
}
diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py
index 03232799..f1f04ed2 100644
--- a/gallery_dl/extractor/hbrowse.py
+++ b/gallery_dl/extractor/hbrowse.py
@@ -9,8 +9,7 @@
"""Extract images from http://www.hbrowse.com/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
-from urllib.parse import urljoin
+from .. import text
import json
@@ -30,7 +29,7 @@ class HbrowseExtractor():
), values=data)
data["manga"] = text.unescape(data["manga"])
- data["total"] = util.safe_int(data["total"])
+ data["total"] = text.parse_int(data["total"])
data["artist"] = text.remove_html(data["artist"])
data["origin"] = text.remove_html(data["origin"])
return data
@@ -48,7 +47,7 @@ class HbrowseMangaExtractor(HbrowseExtractor, MangaExtractor):
def chapters(self, page):
results = []
data = self.parse_page(page, {
- "manga_id": util.safe_int(
+ "manga_id": text.parse_int(
self.url.rstrip("/").rpartition("/")[2])
})
@@ -59,9 +58,9 @@ class HbrowseMangaExtractor(HbrowseExtractor, MangaExtractor):
if not url:
return results
title, pos = text.extract(page, '>View ', '<', pos)
- data["chapter"] = util.safe_int(url.rpartition("/")[2][1:])
+ data["chapter"] = text.parse_int(url.rpartition("/")[2][1:])
data["title"] = title
- results.append((urljoin(self.root, url), data.copy()))
+ results.append((text.urljoin(self.root, url), data.copy()))
class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor):
@@ -84,8 +83,8 @@ class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor):
def get_metadata(self, page):
return self.parse_page(page, {
- "manga_id": util.safe_int(self.gid),
- "chapter": util.safe_int(self.chapter)
+ "manga_id": text.parse_int(self.gid),
+ "chapter": text.parse_int(self.chapter)
})
def get_images(self, page):
diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py
index 6d2cd75f..34a7749c 100644
--- a/gallery_dl/extractor/hentai2read.py
+++ b/gallery_dl/extractor/hentai2read.py
@@ -9,7 +9,7 @@
"""Extract hentai-manga from https://hentai2read.com/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
+from .. import text
import re
import json
@@ -36,7 +36,8 @@ class Hentai2readMangaExtractor(MangaExtractor):
page, '', '')
mtype, pos = text.extract(
page, '[', ']', pos)
- manga_id = util.safe_int(text.extract(page, 'data-mid="', '"', pos)[0])
+ manga_id = text.parse_int(text.extract(
+ page, 'data-mid="', '"', pos)[0])
while True:
chapter_id, pos = text.extract(page, ' data-cid="', '"', pos)
@@ -49,8 +50,8 @@ class Hentai2readMangaExtractor(MangaExtractor):
chapter, _, title = text.unescape(chapter).strip().partition(" - ")
results.append((url, {
"manga_id": manga_id, "manga": manga, "type": mtype,
- "chapter_id": util.safe_int(chapter_id),
- "chapter": util.safe_int(chapter),
+ "chapter_id": text.parse_int(chapter_id),
+ "chapter": text.parse_int(chapter),
"title": title, "lang": "en", "language": "English",
}))
@@ -78,9 +79,9 @@ class Hentai2readChapterExtractor(ChapterExtractor):
r"(\d+): (.+) . Page 1 ", title)
return {
"manga": match.group(1),
- "manga_id": util.safe_int(manga_id),
- "chapter": util.safe_int(self.chapter),
- "chapter_id": util.safe_int(chapter_id),
+ "manga_id": text.parse_int(manga_id),
+ "chapter": text.parse_int(self.chapter),
+ "chapter_id": text.parse_int(chapter_id),
"type": match.group(2),
"author": match.group(3),
"title": match.group(5),
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index 63f37274..23713fc0 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -10,7 +10,6 @@
from .common import Extractor, Message
from .. import text, util, exception
-from urllib.parse import urljoin
class HentaifoundryExtractor(Extractor):
@@ -47,13 +46,13 @@ class HentaifoundryExtractor(Extractor):
def get_image_metadata(self, url):
"""Collect metadata for an image"""
- page = self.request(urljoin(self.root, url)).text
+ page = self.request(text.urljoin(self.root, url)).text
index = url.rsplit("/", 2)[1]
title, pos = text.extract(
page, 'Pictures » ', '<')
part, pos = text.extract(
page, '//pictures.hentai-foundry.com', '"', pos)
- data = {"index": util.safe_int(index), "title": text.unescape(title)}
+ data = {"index": text.parse_int(index), "title": text.unescape(title)}
text.nameext_from_url(part, data)
return "https://pictures.hentai-foundry.com" + part, data
@@ -77,7 +76,7 @@ class HentaifoundryUserExtractor(HentaifoundryExtractor):
def __init__(self, match):
HentaifoundryExtractor.__init__(self, match.group(1) or match.group(3))
- self.start_page = util.safe_int(match.group(2), 1)
+ self.start_page = text.parse_int(match.group(2), 1)
self._skipped = (self.start_page - 1) * self.per_page
def skip(self, num):
@@ -104,7 +103,7 @@ class HentaifoundryUserExtractor(HentaifoundryExtractor):
if response.status_code == 404:
raise exception.NotFoundError("user")
- count = util.safe_int(text.extract(
+ count = text.parse_int(text.extract(
response.text, 'class="active" >Pictures (', ')')[0])
if self._skipped >= count:
raise exception.StopExtraction()
@@ -142,7 +141,7 @@ class HentaifoundryUserExtractor(HentaifoundryExtractor):
"filter_type": 0,
}
self.request("https://www.hentai-foundry.com/site/filters",
- method="post", data=formdata, allow_empty=True)
+ method="post", data=formdata)
class HentaifoundryImageExtractor(HentaifoundryExtractor):
@@ -171,4 +170,4 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor):
return ("{}/{}?enterAgree=1".format(self.artist_url, self.index),)
def get_job_metadata(self):
- return {"artist": self.artist, "index": util.safe_int(self.index)}
+ return {"artist": self.artist, "index": text.parse_int(self.index)}
diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py
index d1c5c932..50150cb8 100644
--- a/gallery_dl/extractor/hentaihere.py
+++ b/gallery_dl/extractor/hentaihere.py
@@ -9,7 +9,7 @@
"""Extract hentai-manga from https://hentaihere.com/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
+from .. import text
import re
import json
@@ -32,7 +32,7 @@ class HentaihereMangaExtractor(MangaExtractor):
def chapters(self, page):
results = []
- manga_id = util.safe_int(
+ manga_id = text.parse_int(
self.url.rstrip("/").rpartition("/")[2][1:])
manga, pos = text.extract(
page, '', '')
@@ -50,8 +50,8 @@ class HentaihereMangaExtractor(MangaExtractor):
chapter, _, title = text.unescape(chapter).strip().partition(" - ")
results.append((url, {
"manga_id": manga_id, "manga": manga, "type": mtype,
- "chapter_id": util.safe_int(chapter_id),
- "chapter": util.safe_int(chapter),
+ "chapter_id": text.parse_int(chapter_id),
+ "chapter": text.parse_int(chapter),
"title": title, "lang": "en", "language": "English",
}))
@@ -79,9 +79,9 @@ class HentaihereChapterExtractor(ChapterExtractor):
match = re.match(pattern, title)
return {
"manga": match.group(1),
- "manga_id": util.safe_int(self.manga_id),
- "chapter": util.safe_int(self.chapter),
- "chapter_id": util.safe_int(chapter_id),
+ "manga_id": text.parse_int(self.manga_id),
+ "chapter": text.parse_int(self.chapter),
+ "chapter_id": text.parse_int(chapter_id),
"type": match.group(2),
"title": match.group(3),
"author": match.group(4),
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index a7014fd7..7048021e 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -30,7 +30,7 @@ class HitomiGalleryExtractor(ChapterExtractor):
]
def __init__(self, match):
- self.gid = util.safe_int(match.group(1))
+ self.gid = text.parse_int(match.group(1))
url = "https://hitomi.la/galleries/{}.html".format(self.gid)
ChapterExtractor.__init__(self, url)
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
index 97d8cb68..75f2e623 100644
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -9,7 +9,7 @@
"""Extract images from http://imagefap.com/"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text
import json
@@ -159,7 +159,7 @@ class ImagefapUserExtractor(ImagefapExtractor):
yield Message.Version, 1
for gid, name in self.get_gallery_data():
url = "http://www.imagefap.com/gallery/" + gid
- data = {"gallery_id": util.safe_int(gid), "title": name}
+ data = {"gallery_id": text.parse_int(gid), "title": name}
yield Message.Queue, url, data
def get_gallery_data(self):
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index d3d07826..1b029d33 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -12,7 +12,6 @@ from .common import Extractor, Message
from .. import text, exception
from ..cache import memcache
from os.path import splitext
-from urllib.parse import urljoin
class ImagehostImageExtractor(Extractor):
@@ -142,8 +141,7 @@ class ImagevenueImageExtractor(ImagehostImageExtractor):
def get_info(self, page):
url = text.extract(page, 'SRC="', '"')[0]
- url = urljoin(self.url, url)
- return url, url
+ return text.urljoin(self.url, url), url
class ImagetwistImageExtractor(ImagehostImageExtractor):
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
index ddbe56c3..cde65827 100644
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -10,7 +10,6 @@
from .common import AsynchronousExtractor, Message
from .. import text, exception
-from urllib.parse import urljoin
class KhinsiderSoundtrackExtractor(AsynchronousExtractor):
@@ -63,7 +62,8 @@ class KhinsiderSoundtrackExtractor(AsynchronousExtractor):
page = text.extract(page, '')[0]
for num, url in enumerate(text.extract_iter(
page, '
', '')[0]
return [
(url, {
- "width": util.safe_int(width),
- "height": util.safe_int(height),
+ "width": text.parse_int(width),
+ "height": text.parse_int(height),
})
for url, width, height in re.findall(
r" ]*? src=[\"']([^\"']+)[\"']"
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index bb1ecefe..58841859 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -10,7 +10,6 @@
from .common import ChapterExtractor, MangaExtractor
from .. import text, util, exception
-from urllib.parse import urljoin
import json
import re
@@ -69,8 +68,8 @@ class MangadexChapterExtractor(MangadexExtractor, ChapterExtractor):
"manga": data["manga_title"],
"manga_id": data["manga_id"],
"title": data["chapter_title"],
- "volume": util.safe_int(match.group(1)),
- "chapter": util.safe_int(match.group(2)),
+ "volume": text.parse_int(match.group(1)),
+ "chapter": text.parse_int(match.group(2)),
"chapter_minor": match.group(3) or "",
"chapter_id": data["chapter_id"],
"chapter_string": info.replace(" - MangaDex", ""),
@@ -84,7 +83,7 @@ class MangadexChapterExtractor(MangadexExtractor, ChapterExtractor):
pagelist, pos = text.extract(page, "var page_array = [", "]", pos)
server , pos = text.extract(page, "var server = '", "'", pos)
- base = urljoin(self.root, server + dataurl + "/")
+ base = text.urljoin(self.root, server + dataurl + "/")
return [
(base + page, None)
@@ -130,7 +129,7 @@ class MangadexMangaExtractor(MangadexExtractor, MangaExtractor):
manga = text.unescape(extr(
page, '"og:title" content="', '"')[0].rpartition(" (")[0])
- manga_id = util.safe_int(extr(
+ manga_id = text.parse_int(extr(
page, '/images/manga/', '.')[0])
while True:
@@ -151,15 +150,15 @@ class MangadexMangaExtractor(MangadexExtractor, MangaExtractor):
results.append((self.root + "/chapter/" + chid, {
"manga": manga,
- "manga_id": util.safe_int(manga_id),
+ "manga_id": text.parse_int(manga_id),
"title": text.unescape(title),
- "volume": util.safe_int(volume),
- "chapter": util.safe_int(chapter),
+ "volume": text.parse_int(volume),
+ "chapter": text.parse_int(chapter),
"chapter_minor": sep + minor,
- "chapter_id": util.safe_int(chid),
+ "chapter_id": text.parse_int(chid),
"group": text.unescape(text.remove_html(group)),
"contributor": text.remove_html(user),
- "views": util.safe_int(views),
+ "views": text.parse_int(views),
"date": date,
"lang": util.language_to_code(language),
"language": language,
diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py
index 906c2372..38eefa12 100644
--- a/gallery_dl/extractor/mangafox.py
+++ b/gallery_dl/extractor/mangafox.py
@@ -9,7 +9,7 @@
"""Extract manga-chapters and entire manga from http://fanfox.net/"""
from .common import ChapterExtractor
-from .. import text, util, exception
+from .. import text, exception
import re
@@ -47,7 +47,7 @@ class MangafoxChapterExtractor(ChapterExtractor):
data["chapter_minor"] = match.group(4) or ""
data["manga"] = data["manga"].rpartition(" ")[0]
for key in ("sid", "cid", "count", "volume", "chapter"):
- data[key] = util.safe_int(data[key])
+ data[key] = text.parse_int(data[key])
return data
def get_images(self, page):
diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py
index 0596c64e..8cfe373b 100644
--- a/gallery_dl/extractor/mangahere.py
+++ b/gallery_dl/extractor/mangahere.py
@@ -9,9 +9,8 @@
"""Extract manga-chapters and entire manga from http://www.mangahere.co/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
+from .. import text
from ..cache import memcache
-from urllib.parse import urljoin
import re
@@ -58,10 +57,10 @@ class MangahereMangaExtractor(MangahereBase, MangaExtractor):
volume, pos = text.extract(page, 'span class="mr6">', '<', pos)
title, pos = text.extract(page, '/span>', '<', pos)
date, pos = text.extract(page, 'class="right">', '', pos)
- results.append((urljoin("http:", url), {
+ results.append((text.urljoin("http:", url), {
"manga": manga, "title": title, "date": date,
- "volume": util.safe_int(volume.rpartition(" ")[2]),
- "chapter": util.safe_int(chapter),
+ "volume": text.parse_int(volume.rpartition(" ")[2]),
+ "chapter": text.parse_int(chapter),
"chapter_minor": dot + minor,
"lang": "en", "language": "English",
}))
@@ -98,12 +97,12 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
return {
"manga": text.unescape(manga),
- "manga_id": util.safe_int(mid),
+ "manga_id": text.parse_int(mid),
"title": self._get_title_map(mid).get(self.chapter),
- "volume": util.safe_int(self.volume),
- "chapter": util.safe_int(chapter),
+ "volume": text.parse_int(self.volume),
+ "chapter": text.parse_int(chapter),
"chapter_minor": dot + minor,
- "count": util.safe_int(count),
+ "count": text.parse_int(count),
"lang": "en",
"language": "English",
}
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py
index 4752d1f5..9f62cbaa 100644
--- a/gallery_dl/extractor/mangapark.py
+++ b/gallery_dl/extractor/mangapark.py
@@ -9,8 +9,7 @@
"""Extract manga-chapters and entire manga from https://mangapark.me/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
-from urllib.parse import urljoin
+from .. import text
class MangaparkExtractor():
@@ -25,12 +24,12 @@ class MangaparkExtractor():
for part in path.split("/")[3:]:
key, value = part[0], part[1:]
if key == "s":
- data["version"] = util.safe_int(value)
+ data["version"] = text.parse_int(value)
elif key == "v":
- data["volume"] = util.safe_int(value)
+ data["volume"] = text.parse_int(value)
elif key == "c":
chapter, dot, minor = value.partition(".")
- data["chapter"] = util.safe_int(chapter)
+ data["chapter"] = text.parse_int(chapter)
data["chapter_minor"] = dot + minor
elif key == "e":
data["chapter_minor"] = "v" + value
@@ -64,7 +63,7 @@ class MangaparkMangaExtractor(MangaparkExtractor, MangaExtractor):
self.parse_chapter_path(path, data)
data["title"] = title[3:].strip()
data["date"] = date
- data["count"] = util.safe_int(count)
+ data["count"] = text.parse_int(count)
results.append((self.root + path, data.copy()))
@@ -107,7 +106,7 @@ class MangaparkChapterExtractor(MangaparkExtractor, ChapterExtractor):
data["manga"], _, data["type"] = data["manga"].rpartition(" ")
data["manga"] = text.unescape(data["manga"])
data["title"] = data["title"].partition(": ")[2]
- data["count"] = util.safe_int(data["count"])
+ data["count"] = text.parse_int(data["count"])
return data
def get_images(self, page):
@@ -120,7 +119,7 @@ class MangaparkChapterExtractor(MangaparkExtractor, ChapterExtractor):
num += 1
width , pos = text.extract(page, ' width="', '"', pos)
height, pos = text.extract(page, ' _heighth="', '"', pos)
- yield urljoin(self.root, url), {
+ yield text.urljoin(self.root, url), {
"page": num,
"width": width,
"height": height,
diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py
index 11714a6d..3aabb153 100644
--- a/gallery_dl/extractor/mangareader.py
+++ b/gallery_dl/extractor/mangareader.py
@@ -9,7 +9,7 @@
"""Extract manga-chapters and entire manga from https://www.mangareader.net/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
+from .. import text
class MangareaderBase():
@@ -53,7 +53,7 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
return results
data["title"], pos = text.extract(page, ' : ', ' | ', pos)
data["date"] , pos = text.extract(page, '', ' | ', pos)
- data["chapter"] = util.safe_int(url.rpartition("/")[2])
+ data["chapter"] = text.parse_int(url.rpartition("/")[2])
results.append((self.root + url, data.copy()))
@@ -79,7 +79,7 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
"""Collect metadata for extractor-job"""
page = self.request(self.root + self.url_title).text
data = self.parse_page(page, {
- "chapter": util.safe_int(self.chapter),
+ "chapter": text.parse_int(self.chapter),
"lang": "en",
"language": "English",
})
@@ -87,7 +87,7 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
('title', ' ' + self.chapter + ' : ', ''),
('date', '', ' | '),
), page.index(''), data)
- data["count"] = util.safe_int(text.extract(
+ data["count"] = text.parse_int(text.extract(
chapter_page, ' of ', '<')[0]
)
return data
@@ -118,6 +118,6 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
height, pos = extr(page, ' height="', '"', pos)
image, pos = extr(page, ' src="', '"', pos)
return self.root + url, image, {
- "width": util.safe_int(width),
- "height": util.safe_int(height),
+ "width": text.parse_int(width),
+ "height": text.parse_int(height),
}
diff --git a/gallery_dl/extractor/mangastream.py b/gallery_dl/extractor/mangastream.py
index 51a469a3..12ae717a 100644
--- a/gallery_dl/extractor/mangastream.py
+++ b/gallery_dl/extractor/mangastream.py
@@ -9,8 +9,7 @@
"""Extract manga-chapters from https://mangastream.com/"""
from .common import ChapterExtractor
-from .. import text, util
-from urllib.parse import urljoin
+from .. import text
class MangastreamChapterExtractor(ChapterExtractor):
@@ -35,9 +34,9 @@ class MangastreamChapterExtractor(ChapterExtractor):
return {
"manga": manga,
"chapter": text.unquote(self.chapter),
- "chapter_id": util.safe_int(self.ch_id),
+ "chapter_id": text.parse_int(self.ch_id),
"title": title,
- "count": util.safe_int(count, 1),
+ "count": text.parse_int(count, 1),
"lang": "en",
"language": "English",
}
@@ -47,5 +46,5 @@ class MangastreamChapterExtractor(ChapterExtractor):
pos = page.index(' class="page"')
next_url = text.extract(page, ' href="', '"', pos)[0]
image_url = text.extract(page, ' src="', '"', pos)[0]
- yield urljoin(self.base_url, image_url), None
- page = self.request(urljoin(self.base_url, next_url)).text
+ yield text.urljoin(self.base_url, image_url), None
+ page = self.request(text.urljoin(self.base_url, next_url)).text
diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py
index 35816e60..a0c8abc1 100644
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -9,7 +9,7 @@
"""Extract images from https://nhentai.net/"""
from .common import Extractor, Message
-from .. import text, util
+from .. import text
class NHentaiExtractor(Extractor):
@@ -95,7 +95,7 @@ class NhentaiSearchExtractor(NHentaiExtractor):
def _pagination(self, endpoint, params):
"""Pagination over API responses"""
url = "{}/api/{}".format(self.root, endpoint)
- params["page"] = util.safe_int(params.get("page"), 1)
+ params["page"] = text.parse_int(params.get("page"), 1)
while True:
data = self.request(url, params=params, fatal=False).json()
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 56e1e521..db2ce317 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -9,7 +9,7 @@
"""Extract images from https://nijie.info/"""
from .common import AsynchronousExtractor, Message
-from .. import text, util, exception
+from .. import text, exception
from ..cache import cache
@@ -44,7 +44,7 @@ class NijieExtractor(AsynchronousExtractor):
def get_job_metadata(self):
"""Collect metadata for extractor-job"""
- return {"user_id": util.safe_int(self.user_id)}
+ return {"user_id": text.parse_int(self.user_id)}
def get_image_ids(self):
"""Collect all relevant image-ids"""
@@ -63,8 +63,8 @@ class NijieExtractor(AsynchronousExtractor):
images = list(text.extract_iter(page, '
_ugoira1920x1080.zip",
- data["url"]
- )
-
- # build framelist
- framelist = re.sub(
- r'\{"file":"([^"]+)","delay":(\d+)\},?',
- r'\1 \2\n', frames
- )
-
- return url, framelist
def get_metadata(self, user=None):
"""Collect metadata for extractor-job"""
if not user:
- user = self.api.user(self.user_id)[0]
+ user = self.api.user_detail(self.user_id)
return {"user": user}
@@ -106,17 +89,23 @@ class PixivUserExtractor(PixivExtractor):
"""Extractor for works of a pixiv-user"""
subcategory = "user"
pattern = [(r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
- r"/member(?:_illust)?\.php\?id=(\d+)(?:.*&tag=([^]+))?"),
+ r"/member(?:_illust)?\.php\?id=(\d+)(?:&([^#]+))?"),
(r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
r"/(?:u(?:ser)?/|(?:mypage\.php)?#id=)(\d+)()")]
test = [
("http://www.pixiv.net/member_illust.php?id=173530", {
"url": "852c31ad83b6840bacbce824d85f2a997889efb7",
}),
+ # illusts with specific tag
(("https://www.pixiv.net/member_illust.php?id=173530"
"&tag=%E6%89%8B%E3%81%B6%E3%82%8D"), {
"url": "25b1cd81153a8ff82eec440dd9f20a4a22079658",
}),
+ # all sorts of query parameters
+ (("https://www.pixiv.net/member_illust.php?id=3137110"
+ "&tag=%E3%83%96%E3%82%A4%E3%82%BA&type=illust&p=2"), {
+ "count": ">= 55",
+ }),
("http://www.pixiv.net/member_illust.php?id=173531", {
"exception": exception.NotFoundError,
}),
@@ -129,18 +118,32 @@ class PixivUserExtractor(PixivExtractor):
def __init__(self, match):
PixivExtractor.__init__(self)
- self.user_id, tag = match.groups()
- if tag:
- self.tag = text.unquote(tag).lower()
- self.works = self._tagged_works
+ self.user_id, self.query = match.groups()
def works(self):
- return self.api.user_works(self.user_id)
+ works = self.api.user_illusts(self.user_id)
- def _tagged_works(self):
- for work in self.api.user_works(self.user_id):
- if self.tag in [tag.lower() for tag in work["tags"]]:
- yield work
+ if self.query:
+ qdict = text.parse_query(self.query)
+ if "type" in qdict:
+ type_ = qdict["type"].lower()
+ works = filter(self._is_type(type_), works)
+ if "tag" in qdict:
+ tag = text.unquote(qdict["tag"]).lower()
+ works = filter(self._has_tag(tag), works)
+ if "p" in qdict: # apply page-offset last
+ offset = (text.parse_int(qdict["p"], 1) - 1) * 20
+ works = util.advance(works, offset)
+
+ return works
+
+ @staticmethod
+ def _has_tag(tag):
+ return lambda work: tag in [t["name"].lower() for t in work["tags"]]
+
+ @staticmethod
+ def _is_type(type_):
+ return lambda work: work["type"] == type_
class PixivMeExtractor(PixivExtractor):
@@ -188,14 +191,6 @@ class PixivWorkExtractor(PixivExtractor):
"?mode=medium&illust_id=966411"), {
"exception": exception.NotFoundError,
}),
- (("http://i1.pixiv.net/c/600x600/img-master/"
- "img/2008/06/13/00/29/13/966412_p0_master1200.jpg"), {
- "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba",
- }),
- (("https://i.pximg.net/img-original/"
- "img/2017/04/25/07/33/29/62568267_p0.png"), {
- "url": "71b8bbd070d6b03a75ca4afb89f64d1445b2278d",
- }),
# ugoira
(("https://www.pixiv.net/member_illust.php"
"?mode=medium&illust_id=66806629"), {
@@ -203,6 +198,10 @@ class PixivWorkExtractor(PixivExtractor):
r"66806629_ugoira1920x1080\.zip|text:.+"),
"count": 2,
}),
+ (("http://i1.pixiv.net/c/600x600/img-master/"
+ "img/2008/06/13/00/29/13/966412_p0_master1200.jpg"), None),
+ (("https://i.pximg.net/img-original/"
+ "img/2017/04/25/07/33/29/62568267_p0.png"), None),
("https://www.pixiv.net/i/966412", None),
("http://img.pixiv.net/img/soundcross/42626136.jpg", None),
("http://i2.pixiv.net/img76/img/snailrin/42672235.jpg", None),
@@ -218,214 +217,318 @@ class PixivWorkExtractor(PixivExtractor):
return (self.work,)
def get_metadata(self, user=None):
- self.work = self.api.work(self.illust_id)[0]
+ self.work = self.api.illust_detail(self.illust_id)
return PixivExtractor.get_metadata(self, self.work["user"])
class PixivFavoriteExtractor(PixivExtractor):
"""Extractor for all favorites/bookmarks of a pixiv-user"""
subcategory = "favorite"
- directory_fmt = ["{category}", "bookmarks", "{user[id]} {user[account]}"]
- archive_fmt = "f_{bookmark[id]}{num}.{extension}"
+ directory_fmt = ["{category}", "bookmarks",
+ "{user_bookmark[id]} {user_bookmark[account]}"]
+ archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}"
pattern = [r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
- r"/bookmark\.php\?id=(\d+)"]
+ r"/bookmark\.php(?:\?([^#]*))?"]
test = [
("https://www.pixiv.net/bookmark.php?id=173530", {
"url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
}),
+ # bookmarks with specific tag
+ (("https://www.pixiv.net/bookmark.php?id=3137110"
+ "&tag=%E3%81%AF%E3%82%93%E3%82%82%E3%82%93&p=1"), {
+ "count": 2,
+ }),
+ # own bookmarks
+ ("https://www.pixiv.net/bookmark.php", {
+ "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba",
+ }),
+ # touch URLs
("https://touch.pixiv.net/bookmark.php?id=173530", None),
+ ("https://touch.pixiv.net/bookmark.php", None),
]
def __init__(self, match):
PixivExtractor.__init__(self)
- self.user_id = match.group(1)
+ self.query = text.parse_query(match.group(1))
+ if "id" not in self.query:
+ self.subcategory = "bookmark"
def works(self):
- return self.api.user_favorite_works(self.user_id)
-
- def prepare_work(self, work):
- work["work"]["bookmark"] = {
- key: work[key]
- for key in ("id", "comment", "tags", "publicity")
- }
- return PixivExtractor.prepare_work(self, work["work"])
+ tag = None
+ restrict = "public"
+ offset = 0
+ if "tag" in self.query:
+ tag = text.unquote(self.query["tag"])
+ if "rest" in self.query and self.query["rest"] == "hide":
+ restrict = "private"
+ if "p" in self.query:
+ offset = (text.parse_int(self.query["p"], 1) - 1) * 20
-class PixivBookmarkExtractor(PixivFavoriteExtractor):
- """Extractor for all favorites/bookmarks of your own account"""
- subcategory = "bookmark"
- pattern = [r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/bookmark\.php()$"]
- test = [
- ("https://www.pixiv.net/bookmark.php", None),
- ("https://touch.pixiv.net/bookmark.php", None),
- ]
+ works = self.api.user_bookmarks_illust(self.user_id, tag, restrict)
+ return util.advance(works, offset)
def get_metadata(self, user=None):
- self.api.login()
- user = self.api.user_info
+ if "id" in self.query:
+ user = self.api.user_detail(self.query["id"])
+ else:
+ self.api.login()
+ user = self.api.user
+
self.user_id = user["id"]
- return PixivExtractor.get_metadata(self, user)
+ return {"user_bookmark": user}
class PixivRankingExtractor(PixivExtractor):
"""Extractor for pixiv ranking pages"""
subcategory = "ranking"
archive_fmt = "r_{ranking[mode]}_{ranking[date]}_{id}{num}.{extension}"
- directory_fmt = ["{category}", "rankings", "{mode}", "{date}"]
+ directory_fmt = ["{category}", "rankings",
+ "{ranking[mode]}", "{ranking[date]}"]
pattern = [r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
r"/ranking\.php(?:\?([^#]*))?"]
test = [
- (("https://www.pixiv.net/ranking.php"
- "?mode=daily&content=illust&date=20170818"), None),
+ ("https://www.pixiv.net/ranking.php?mode=daily&date=20170818", None),
("https://www.pixiv.net/ranking.php", None),
("https://touch.pixiv.net/ranking.php", None),
]
def __init__(self, match):
PixivExtractor.__init__(self)
- self.ranking_info = None
- self._iter = None
- self._first = None
-
- query = text.parse_query(match.group(1))
- self.mode = query.get("mode", "daily")
- self.content = query.get("content", "all")
- self.date = query.get("date")
-
- if self.date:
- if len(self.date) == 8 and self.date.isdecimal():
- self.date = (self.date[0:4] + "-" +
- self.date[4:6] + "-" +
- self.date[6:8])
+ self.query = match.group(1)
+ self.mode = self.date = None
+
+ def works(self):
+ return self.api.illust_ranking(self.mode, self.date)
+
+ def get_metadata(self, user=None):
+ query = text.parse_query(self.query)
+
+ mode = query.get("mode", "daily").lower()
+ mode_map = {
+ "daily": "day",
+ "daily_r18": "day_r18",
+ "weekly": "week",
+ "weekly_r18": "week_r18",
+ "monthly": "month",
+ "male": "day_male",
+ "male_r18": "day_male_r18",
+ "female": "day_female",
+ "female_r18": "day_female_r18",
+ "original": "week_original",
+ "rookie": "week_rookie",
+ "r18g": "week_r18g",
+ }
+ if mode not in mode_map:
+ self.log.warning("invalid mode '%s'", mode)
+ mode = "daily"
+ self.mode = mode_map[mode]
+
+ date = query.get("date")
+ if date:
+ if len(date) == 8 and date.isdecimal():
+ date = "{}-{}-{}".format(date[0:4], date[4:6], date[6:8])
else:
- self.log.warning("invalid date '%s'", self.date)
- self.date = None
+ self.log.warning("invalid date '%s'", date)
+ date = None
+ if not date:
+ date = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d")
+ self.date = date
+
+ return {"ranking": {
+ "mode": mode,
+ "date": self.date,
+ }}
+
+
+class PixivSearchExtractor(PixivExtractor):
+ """Extractor for pixiv search results"""
+ subcategory = "search"
+ archive_fmt = "s_{search[word]}_{id}{num}.{extension}"
+ directory_fmt = ["{category}", "search", "{search[word]}"]
+ pattern = [r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+ r"/search\.php\?([^#]+)"]
+ test = [
+ ("https://www.pixiv.net/search.php?s_mode=s_tag&word=Original", None),
+ ("https://touch.pixiv.net/search.php?word=Original", None),
+ ]
- if self.content not in ("all", "illust", "manga", "ugoira"):
- self.log.warning("unrecognized content value '%s' - "
- "falling back to 'all'", self.content)
- self.content = "all"
+ def __init__(self, match):
+ PixivExtractor.__init__(self)
+ self.query = match.group(1)
+ self.word = self.sort = self.target = None
def works(self):
- yield from self._first["works"]
- for page in self._iter:
- yield from page["works"]
+ return self.api.search_illust(self.word, self.sort, self.target)
def get_metadata(self, user=None):
- self._iter = self.api.ranking(self.mode, self.content, self.date)
- self._first = next(self._iter)
- self.ranking_info = {
- key: self._first[key]
- for key in ("mode", "content", "date")
+ query = text.parse_query(self.query)
+
+ if "word" in query:
+ self.word = text.unescape(query["word"])
+ else:
+ self.log.error("missing search term")
+ raise exception.StopExtraction()
+
+ sort = query.get("order", "date_d")
+ sort_map = {
+ "date": "date_asc",
+ "date_d": "date_desc",
}
- return self.ranking_info.copy()
+ if sort not in sort_map:
+ self.log.warning("invalid sort order '%s'", sort)
+ sort = "date_d"
+ self.sort = sort_map[sort]
+
+ target = query.get("s_mode", "s_tag")
+ target_map = {
+ "s_tag": "partial_match_for_tags",
+ "s_tag_full": "exact_match_for_tags",
+ "s_tc": "title_and_caption",
+ }
+ if target not in target_map:
+ self.log.warning("invalid search target '%s'", target)
+ target = "s_tag"
+ self.target = target_map[target]
+
+ return {"search": {
+ "word": self.word,
+ "sort": self.sort,
+ "target": self.target,
+ }}
+
+
+class PixivFollowExtractor(PixivExtractor):
+ """Extractor for new illustrations from your followed artists"""
+ subcategory = "follow"
+ archive_fmt = "F_{user_follow[id]}_{id}{num}.{extension}"
+ directory_fmt = ["{category}", "following"]
+ pattern = [r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+ r"/bookmark_new_illust\.php"]
+ test = [
+ ("https://www.pixiv.net/bookmark_new_illust.php", None),
+ ("https://touch.pixiv.net/bookmark_new_illust.php", None),
+ ]
- def prepare_work(self, work):
- work["work"]["rank"] = work["rank"]
- work["work"]["ranking"] = self.ranking_info
- return PixivExtractor.prepare_work(self, work["work"])
+ def __init__(self, _):
+ PixivExtractor.__init__(self)
+
+ def works(self):
+ return self.api.illust_follow()
+
+ def get_metadata(self, user=None):
+ self.api.login()
+ return {"user_follow": self.api.user}
-class PixivAPI():
- """Minimal interface for the Pixiv Public-API for mobile devices
+class PixivAppAPI():
+ """Minimal interface for the Pixiv App API for mobile devices
- For a better and more complete implementation, see
+ For a more complete implementation or documentation, see
- https://github.com/upbit/pixivpy
- For in-depth information regarding the Pixiv Public-API, see
- - http://blog.imaou.com/opensource/2014/10/09/pixiv_api_for_ios_update.html
- - https://gist.github.com/ZipFile/e14ff1a7e6d01456188a
+ - https://gist.github.com/ZipFile/3ba99b47162c23f8aea5d5942bb557b1
"""
+ CLIENT_ID = "MOBrBDS8blbauoSck0ZfDbtuzpyT"
+ CLIENT_SECRET = "lsACyCD94FhDUtGTXi3QzcFE2uU1hqtDaKeqrdwj"
+
def __init__(self, extractor):
self.session = extractor.session
self.log = extractor.log
self.username, self.password = extractor._get_auth_info()
- self.user_info = None
+ self.user = None
+
+ self.client_id = extractor.config(
+ "client-id", self.CLIENT_ID)
+ self.client_secret = extractor.config(
+ "client-secret", self.CLIENT_SECRET)
+
self.session.headers.update({
- "Referer": "https://www.pixiv.net/",
- 'App-OS': 'ios',
- 'App-OS-Version': '10.3.1',
- 'App-Version': '6.7.1',
- 'User-Agent': 'PixivIOSApp/6.7.1 (iOS 10.3.1; iPhone8,1)',
+ "App-OS": "ios",
+ "App-OS-Version": "10.3.1",
+ "App-Version": "6.7.1",
+ "User-Agent": "PixivIOSApp/6.7.1 (iOS 10.3.1; iPhone8,1)",
+ "Referer": "https://app-api.pixiv.net/",
})
- def user(self, user_id):
- """Query information about a pixiv user"""
- endpoint = "users/" + user_id
- return self._call(endpoint, {})["response"]
-
- def work(self, illust_id):
- """Query information about a single pixiv work/illustration"""
- endpoint = "works/" + illust_id
- params = {"image_sizes": "large"}
- return self._call(endpoint, params)["response"]
-
- def user_works(self, user_id):
- """Query information about the works of a pixiv user"""
- endpoint = "users/{user}/works".format(user=user_id)
- params = {"image_sizes": "large"}
- return self._pagination(endpoint, params)
-
- def user_favorite_works(self, user_id):
- """Query information about the favorite works of a pixiv user"""
- endpoint = "users/{user}/favorite_works".format(user=user_id)
- params = {"image_sizes": "large", "include_stats": False}
- return self._pagination(endpoint, params)
-
- def ranking(self, mode, content="all", date=None):
- """Query pixiv's ranking lists"""
- endpoint = "ranking/" + content
- params = {"image_sizes": "large", "mode": mode, "date": date}
- return self._pagination(endpoint, params)
-
def login(self):
- """Login and gain a Pixiv Public-API access token"""
- self.user_info, access_token = self._login_impl(
+ """Login and gain an access token"""
+ self.user, auth = self._login_impl(
self.username, self.password)
- self.session.headers["Authorization"] = access_token
+ self.session.headers["Authorization"] = auth
- @cache(maxage=50*60, keyarg=1)
+ @cache(maxage=3590, keyarg=1)
def _login_impl(self, username, password):
- """Actual login implementation"""
self.log.info("Logging in as %s", username)
+
+ url = "https://oauth.secure.pixiv.net/auth/token"
data = {
+ "client_id": self.client_id,
+ "client_secret": self.client_secret,
+ "grant_type": "password",
"username": username,
"password": password,
- "grant_type": "password",
- "client_id": "bYGKuGVw91e0NMfPGp44euvGt59s",
- "client_secret": "HP3RmkgAmEGro0gn1x9ioawQE8WMfvLXDz3ZqxpK",
"get_secure_url": 1,
}
- response = self.session.post(
- "https://oauth.secure.pixiv.net/auth/token", data=data
- )
- if response.status_code != 200:
+
+ response = self.session.post(url, data=data)
+ if response.status_code >= 400:
raise exception.AuthenticationError()
- try:
- response = response.json()["response"]
- token = response["access_token"]
- user = response["user"]
- except KeyError:
- raise Exception("Get token error! Response: %s" % (response))
- return user, "Bearer " + token
- def _call(self, endpoint, params, _empty=[None]):
- url = "https://public-api.secure.pixiv.net/v1/" + endpoint + ".json"
+ data = response.json()["response"]
+ return data["user"], "Bearer " + data["access_token"]
+
+ def illust_detail(self, illust_id):
+ params = {"illust_id": illust_id}
+ return self._call("v1/illust/detail", params)["illust"]
+
+ def illust_follow(self, restrict="all"):
+ params = {"restrict": restrict}
+ return self._pagination("v2/illust/follow", params)
+
+ def illust_ranking(self, mode="day", date=None):
+ params = {"mode": mode, "date": date}
+ return self._pagination("v1/illust/ranking", params)
+
+ def search_illust(self, word, sort=None, target=None, duration=None):
+ params = {"word": word, "search_target": target,
+ "sort": sort, "duration": duration}
+ return self._pagination("v1/search/illust", params)
+
+ def user_bookmarks_illust(self, user_id, tag=None, restrict="public"):
+ params = {"user_id": user_id, "tag": tag, "restrict": restrict}
+ return self._pagination("v1/user/bookmarks/illust", params)
+
+ def user_detail(self, user_id):
+ params = {"user_id": user_id}
+ return self._call("v1/user/detail", params)["user"]
+
+ def user_illusts(self, user_id):
+ params = {"user_id": user_id}
+ return self._pagination("v1/user/illusts", params)
+
+ def ugoira_metadata(self, illust_id):
+ params = {"illust_id": illust_id}
+ return self._call("v1/ugoira/metadata", params)["ugoira_metadata"]
+
+ def _call(self, endpoint, params=None):
+ url = "https://app-api.pixiv.net/" + endpoint
self.login()
- data = self.session.get(url, params=params).json()
+ response = self.session.get(url, params=params)
- status = data.get("status")
- response = data.get("response", _empty)
- if status == "failure" or response == _empty:
+ if 200 <= response.status_code < 400:
+ return response.json()
+ if response.status_code == 404:
raise exception.NotFoundError()
- return data
+ self.log.error("API request failed: %s", response.text)
+ raise exception.StopExtraction()
def _pagination(self, endpoint, params):
while True:
data = self._call(endpoint, params)
- yield from data["response"]
+ yield from data["illusts"]
- pinfo = data["pagination"]
- if pinfo["current"] == pinfo["pages"]:
+ if not data["next_url"]:
return
- params["page"] = pinfo["next"]
+ query = data["next_url"].rpartition("?")[2]
+ params = text.parse_query(query)
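
A short usage sketch for the new interface; the ID is illustrative, and the
field names follow the filter code above (tags are objects with a "name" key):

    api = PixivAppAPI(extractor)
    for illust in api.user_illusts("173530"):   # pagination is transparent
        print(illust["id"], illust["type"],
              [tag["name"] for tag in illust["tags"]])
    # each response carries an 'illusts' list plus a 'next_url'; its query
    # string (e.g. 'offset=...') becomes the params of the follow-up request
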
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index 2d8b6222..ca26b5e7 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -9,7 +9,7 @@
"""Extract comic-issues and entire comics from http://readcomiconline.to/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, util, cloudflare
+from .. import text, cloudflare
import re
@@ -56,7 +56,7 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
issue = issue[7:]
results.append((self.root + url, {
"comic": comic, "issue": issue,
- "issue_id": util.safe_int(url.rpartition("=")[2]),
+ "issue_id": text.parse_int(url.rpartition("=")[2]),
"lang": "en", "language": "English",
}))
return results
@@ -84,7 +84,7 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
return {
"comic": comic,
"issue": match.group(1) or match.group(2),
- "issue_id": util.safe_int(self.issue_id),
+ "issue_id": text.parse_int(self.issue_id),
"lang": "en",
"language": "English",
}
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 81de60c2..001cbb2c 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -82,16 +82,16 @@ class SankakuExtractor(SharedConfigExtractor):
file_url = extr(page, '