improve 'extractor.request'

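- add 'fatal' argument (HTTP error status codes only raise if it is true; with 'fatal=False' callers check 'response.status_code' themselves)
- improve internal logic and flow (the retry loop of 'safe_request' is now part of 'request'; 'safe_request' is removed)
- raise a known exception ('exception.HttpError') on error
- update exception hierarchy (all exceptions now derive from 'GalleryDLException')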
pull/40/head
Mike Fährmann 7 years ago
parent dcd573806e
commit 915a0137de
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -6,22 +6,49 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
"""Exception classes used by gallery-dl
class NoExtractorError(Exception): Class Hierarchy:
"""No extractor can handle the given URL"""
Exception
+-- GalleryDLException
+-- ExtractionError
| +-- AuthenticationError
| +-- AuthorizationError
| +-- NotFoundError
| +-- HttpError
+-- NoExtractorError
+-- StopExtraction
"""
class GalleryDLException(Exception):
"""Base class for GalleryDL exceptions"""
class ExtractionError(GalleryDLException):
"""Base class for exceptions during information extraction"""
class AuthenticationError(Exception): class AuthenticationError(ExtractionError):
"""Invalid or missing login information""" """Invalid or missing login information"""
class AuthorizationError(Exception): class AuthorizationError(ExtractionError):
"""Insufficient privileges to access a resource""" """Insufficient privileges to access a resource"""
class NotFoundError(Exception): class NotFoundError(ExtractionError):
"""Requested resource (gallery/image) does not exist""" """Requested resource (gallery/image) does not exist"""
class StopExtraction(Exception): class HttpError(ExtractionError):
"""HTTP request during extraction failed"""
class NoExtractorError(GalleryDLException):
"""No extractor can handle the given URL"""
class StopExtraction(GalleryDLException):
"""Extraction should stop""" """Extraction should stop"""

@ -107,7 +107,7 @@ class BatotoChapterExtractor(BatotoExtractor, AsynchronousExtractor):
"p": 1, "p": 1,
"supress_webtoon": "t", "supress_webtoon": "t",
} }
response = self.session.get(self.reader_url, params=params) response = self.request(self.reader_url, params=params, fatal=False)
if response.status_code == 405: if response.status_code == 405:
error = text.extract(response.text, "ERROR [", "]")[0] error = text.extract(response.text, "ERROR [", "]")[0]
if error == "10030": if error == "10030":

@ -18,7 +18,7 @@ import requests
import threading import threading
import http.cookiejar import http.cookiejar
from .message import Message from .message import Message
from .. import config from .. import config, exception
class Extractor(): class Extractor():
@ -47,11 +47,22 @@ class Extractor():
return config.interpolate( return config.interpolate(
("extractor", self.category, self.subcategory, key), default) ("extractor", self.category, self.subcategory, key), default)
def request(self, url, encoding=None, *args, **kwargs): def request(self, url, method="GET", encoding=None, fatal=True, retries=3,
response = safe_request(self.session, url, *args, **kwargs) *args, **kwargs):
if encoding: while True:
response.encoding = encoding try:
return response response = self.session.request(method, url, *args, **kwargs)
if fatal:
response.raise_for_status()
if encoding:
response.encoding = encoding
return response
except requests.exceptions.RequestException as exc:
msg = exc
retries -= 1
if not retries:
raise exception.HttpError(msg)
time.sleep(1)
def _get_auth_info(self): def _get_auth_info(self):
"""Return authentication information as (username, password) tuple""" """Return authentication information as (username, password) tuple"""
@ -164,33 +175,8 @@ class MangaExtractor(Extractor):
return [] return []
def safe_request(session, url, method="GET", *args, **kwargs): # Reduce strictness of the expected magic string in cookiejar files.
tries = 0 # (This allows the use of Wget-generated cookiejars without modification)
while True:
# try to connect to remote source
try:
r = session.request(method, url, *args, **kwargs)
except requests.exceptions.ConnectionError:
tries += 1
time.sleep(1)
if tries == 5:
raise
continue
# reject error-status-codes
if r.status_code != requests.codes.ok:
tries += 1
time.sleep(1)
if tries == 5:
r.raise_for_status()
continue
# everything ok -- proceed to download
return r
# Reduce strictness of the expected magic string in cookie jar files.
# (This allows the use of Wget-generated cookiejar files without modification)
http.cookiejar.MozillaCookieJar.magic_re = re.compile( http.cookiejar.MozillaCookieJar.magic_re = re.compile(
"#( Netscape)? HTTP Cookie File", re.IGNORECASE) "#( Netscape)? HTTP Cookie File", re.IGNORECASE)

@ -242,7 +242,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
self.url = "https://" + match.group(1) self.url = "https://" + match.group(1)
def deviations(self): def deviations(self):
response = self.session.get(self.url) response = self.request(self.url, fatal=False)
deviation_id = text.extract(response.text, '//deviation/', '"')[0] deviation_id = text.extract(response.text, '//deviation/', '"')[0]
if response.status_code != 200 or not deviation_id: if response.status_code != 200 or not deviation_id:
raise exception.NotFoundError("image") raise exception.NotFoundError("image")

@ -56,7 +56,7 @@ class ExhentaiGalleryExtractor(Extractor):
yield Message.Version, 1 yield Message.Version, 1
url = "{}/g/{}/{}/".format(self.root, self.gid, self.token) url = "{}/g/{}/{}/".format(self.root, self.gid, self.token)
response = self.session.get(url) response = self.request(url, fatal=False)
page = response.text page = response.text
if response.status_code == 404 and "Gallery Not Available" in page: if response.status_code == 404 and "Gallery Not Available" in page:
raise exception.AuthorizationError() raise exception.AuthorizationError()
@ -196,7 +196,7 @@ class ExhentaiGalleryExtractor(Extractor):
"""Actual login implementation""" """Actual login implementation"""
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01" url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"
params = { data = {
"CookieDate": "1", "CookieDate": "1",
"b": "d", "b": "d",
"bt": "1-1", "bt": "1-1",
@ -206,7 +206,7 @@ class ExhentaiGalleryExtractor(Extractor):
} }
referer = "https://e-hentai.org/bounce_login.php?b=d&bt=1-1" referer = "https://e-hentai.org/bounce_login.php?b=d&bt=1-1"
self.session.headers["Referer"] = referer self.session.headers["Referer"] = referer
response = self.session.post(url, data=params) response = self.request(url, method="POST", data=data)
if "You are now logged in as:" not in response.text: if "You are now logged in as:" not in response.text:
raise exception.AuthenticationError() raise exception.AuthenticationError()

@ -65,7 +65,7 @@ class GfycatImageExtractor(GfycatExtractor):
def _get_info(self, gfycat_id): def _get_info(self, gfycat_id):
url = "https://gfycat.com/cajax/get/" + gfycat_id url = "https://gfycat.com/cajax/get/" + gfycat_id
data = self.session.get(url).json() data = self.request(url).json()
if "error" in data: if "error" in data:
raise exception.NotFoundError() raise exception.NotFoundError("animation")
return data["gfyItem"] return data["gfyItem"]

@ -62,7 +62,7 @@ class HentaifoundryUserExtractor(Extractor):
def get_job_metadata(self): def get_job_metadata(self):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
url = self.url_base + self.artist + "?enterAgree=1" url = self.url_base + self.artist + "?enterAgree=1"
response = self.session.get(url) response = self.request(url, fatal=False)
if response.status_code == 404: if response.status_code == 404:
raise exception.NotFoundError("user") raise exception.NotFoundError("user")
page = response.text page = response.text
@ -150,7 +150,7 @@ class HentaifoundryImageExtractor(Extractor):
"""Collect metadata for an image""" """Collect metadata for an image"""
url = "https://www.hentai-foundry.com/pictures/user/{}/{}".format( url = "https://www.hentai-foundry.com/pictures/user/{}/{}".format(
self.artist, self.index) self.artist, self.index)
response = self.session.get(url + "?enterAgree=1") response = self.request(url + "?enterAgree=1", fatal=False)
if response.status_code == 404: if response.status_code == 404:
raise exception.NotFoundError("image") raise exception.NotFoundError("image")
extr = text.extract extr = text.extract

@ -23,7 +23,7 @@ class ImgurExtractor(Extractor):
self.mp4 = self.config("mp4", True) self.mp4 = self.config("mp4", True)
def _get_data(self, urlpart): def _get_data(self, urlpart):
response = self.session.get("https://imgur.com/" + urlpart) response = self.request("https://imgur.com/" + urlpart, fatal=False)
if response.status_code == 404: if response.status_code == 404:
raise exception.NotFoundError(self.subcategory) raise exception.NotFoundError(self.subcategory)
data = text.extract(response.text, "image : ", ",\n")[0] data = text.extract(response.text, "image : ", ",\n")[0]

@ -71,9 +71,9 @@ class NijieExtractor(AsynchronousExtractor):
def _login_impl(self, username, password): def _login_impl(self, username, password):
"""Actual login implementation""" """Actual login implementation"""
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
params = {"email": username, "password": password} data = {"email": username, "password": password}
page = self.session.post("https://nijie.info/login_int.php", page = self.request("https://nijie.info/login_int.php",
data=params).text method="POST", data=data).text
if "//nijie.info/login.php" in page: if "//nijie.info/login.php" in page:
raise exception.AuthenticationError() raise exception.AuthenticationError()
return self.session.cookies return self.session.cookies
@ -102,7 +102,7 @@ class NijieUserExtractor(NijieExtractor):
params = {"id": self.artist_id, "p": 1} params = {"id": self.artist_id, "p": 1}
url = "https://nijie.info/members_illust.php" url = "https://nijie.info/members_illust.php"
while True: while True:
response = self.session.get(url, params=params) response = self.request(url, params=params, fatal=False)
if response.status_code == 404: if response.status_code == 404:
raise exception.NotFoundError("artist") raise exception.NotFoundError("artist")
ids = list(text.extract_iter(response.text, ' illust_id="', '"')) ids = list(text.extract_iter(response.text, ' illust_id="', '"'))
@ -133,8 +133,8 @@ class NijieImageExtractor(NijieExtractor):
self.page = "" self.page = ""
def get_job_metadata(self): def get_job_metadata(self):
response = self.session.get(self.popup_url + self.image_id, response = self.request(self.popup_url + self.image_id,
allow_redirects=False) allow_redirects=False)
if 300 <= response.status_code < 400: if 300 <= response.status_code < 400:
raise exception.NotFoundError("image") raise exception.NotFoundError("image")
self.page = response.text self.page = response.text

@ -135,12 +135,10 @@ class MastodonAPI():
"""Get an account's statuses""" """Get an account's statuses"""
url = "{}/api/v1/accounts/{}/statuses?only_media=1".format( url = "{}/api/v1/accounts/{}/statuses?only_media=1".format(
self.root, account_id) self.root, account_id)
while True: while url:
response = self.session.get(url) response = self.session.get(url)
yield from self._parse(response) yield from self._parse(response)
url = response.links.get("next", {}).get("url") url = response.links.get("next", {}).get("url")
if not url:
break
@staticmethod @staticmethod
def _parse(response): def _parse(response):

@ -57,8 +57,8 @@ class SeigaExtractor(Extractor):
"""Actual login implementation""" """Actual login implementation"""
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
url = "https://account.nicovideo.jp/api/v1/login" url = "https://account.nicovideo.jp/api/v1/login"
params = {"mail_tel": username, "password": password} data = {"mail_tel": username, "password": password}
self.session.post(url, data=params).close() self.request(url, method="POST", data=data)
if "user_session" not in self.session.cookies: if "user_session" not in self.session.cookies:
raise exception.AuthenticationError() raise exception.AuthenticationError()
del self.session.cookies["nicosid"] del self.session.cookies["nicosid"]

@ -49,9 +49,11 @@ class Job():
except exception.AuthorizationError: except exception.AuthorizationError:
log.error("You do not have permission to access the resource " log.error("You do not have permission to access the resource "
"at '%s'", self.url) "at '%s'", self.url)
except exception.NotFoundError as err: except exception.NotFoundError as exc:
res = str(err) or "resource (gallery/image/user)" res = str(exc) or "resource (gallery/image/user)"
log.error("The %s at '%s' does not exist", res, self.url) log.error("The %s at '%s' does not exist", res, self.url)
except exception.HttpError as exc:
log.error("HTTP request failed:\n%s", exc)
except exception.StopExtraction: except exception.StopExtraction:
pass pass
except Exception as exc: except Exception as exc:

Loading…
Cancel
Save