improve 'extractor.request'

- add 'fatal' argument
- improve internal logic and flow
- raise known exception on error
- update exception hierarchy
7 years ago · 915a0137de
parent dcd573806e
commit 915a0137de
12 changed files with 75 additions and 62 deletions
--- a/gallery_dl/exception.py
+++ b/gallery_dl/exception.py
@ -6,22 +6,49 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

+"""Exception classes used by gallery-dl

-class NoExtractorError(Exception):
-    """No extractor can handle the given URL"""
+Class Hierarchy:
+
+Exception
+ +-- GalleryDLException
+      +-- ExtractionError
+      |    +-- AuthenticationError
+      |    +-- AuthorizationError
+      |    +-- NotFoundError
+      |    +-- HttpError
+      +-- NoExtractorError
+      +-- StopExtraction
+"""
+
+
+class GalleryDLException(Exception):
+    """Base class for GalleryDL exceptions"""
+
+
+class ExtractionError(GalleryDLException):
+    """Base class for exceptions during information extraction"""


-class AuthenticationError(Exception):
+class AuthenticationError(ExtractionError):
    """Invalid or missing login information"""


-class AuthorizationError(Exception):
+class AuthorizationError(ExtractionError):
    """Insufficient privileges to access a resource"""


-class NotFoundError(Exception):
+class NotFoundError(ExtractionError):
    """Requested resource (gallery/image) does not exist"""


-class StopExtraction(Exception):
+class HttpError(ExtractionError):
+    """HTTP request during extraction failed"""
+
+
+class NoExtractorError(GalleryDLException):
+    """No extractor can handle the given URL"""
+
+
+class StopExtraction(GalleryDLException):
    """Extraction should stop"""
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@ -107,7 +107,7 @@ class BatotoChapterExtractor(BatotoExtractor, AsynchronousExtractor):
            "p": 1,
            "supress_webtoon": "t",
        }
-        response = self.session.get(self.reader_url, params=params)
+        response = self.request(self.reader_url, params=params, fatal=False)
        if response.status_code == 405:
            error = text.extract(response.text, "ERROR [", "]")[0]
            if error == "10030":
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@ -18,7 +18,7 @@ import requests
 import threading
 import http.cookiejar
 from .message import Message
-from .. import config
+from .. import config, exception


 class Extractor():
@ -47,11 +47,22 @@ class Extractor():
        return config.interpolate(
            ("extractor", self.category, self.subcategory, key), default)

-    def request(self, url, encoding=None, *args, **kwargs):
-        response = safe_request(self.session, url, *args, **kwargs)
-        if encoding:
-            response.encoding = encoding
-        return response
+    def request(self, url, method="GET", encoding=None, fatal=True, retries=3,
+                *args, **kwargs):
+        while True:
+            try:
+                response = self.session.request(method, url, *args, **kwargs)
+                if fatal:
+                    response.raise_for_status()
+                if encoding:
+                    response.encoding = encoding
+                return response
+            except requests.exceptions.RequestException as exc:
+                msg = exc
+            retries -= 1
+            if not retries:
+                raise exception.HttpError(msg)
+            time.sleep(1)

    def _get_auth_info(self):
        """Return authentication information as (username, password) tuple"""
@ -164,33 +175,8 @@ class MangaExtractor(Extractor):
        return []


-def safe_request(session, url, method="GET", *args, **kwargs):
-    tries = 0
-    while True:
-        # try to connect to remote source
-        try:
-            r = session.request(method, url, *args, **kwargs)
-        except requests.exceptions.ConnectionError:
-            tries += 1
-            time.sleep(1)
-            if tries == 5:
-                raise
-            continue
-
-        # reject error-status-codes
-        if r.status_code != requests.codes.ok:
-            tries += 1
-            time.sleep(1)
-            if tries == 5:
-                r.raise_for_status()
-            continue
-
-        # everything ok -- proceed to download
-        return r
-
-
-# Reduce strictness of the expected magic string in cookie jar files.
-# (This allows the use of Wget-generated cookiejar files without modification)
+# Reduce strictness of the expected magic string in cookiejar files.
+# (This allows the use of Wget-generated cookiejars without modification)

 http.cookiejar.MozillaCookieJar.magic_re = re.compile(
    "#( Netscape)? HTTP Cookie File", re.IGNORECASE)
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@ -242,7 +242,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
        self.url = "https://" + match.group(1)

    def deviations(self):
-        response = self.session.get(self.url)
+        response = self.request(self.url, fatal=False)
        deviation_id = text.extract(response.text, '//deviation/', '"')[0]
        if response.status_code != 200 or not deviation_id:
            raise exception.NotFoundError("image")
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@ -56,7 +56,7 @@ class ExhentaiGalleryExtractor(Extractor):
        yield Message.Version, 1

        url = "{}/g/{}/{}/".format(self.root, self.gid, self.token)
-        response = self.session.get(url)
+        response = self.request(url, fatal=False)
        page = response.text
        if response.status_code == 404 and "Gallery Not Available" in page:
            raise exception.AuthorizationError()
@ -196,7 +196,7 @@ class ExhentaiGalleryExtractor(Extractor):
        """Actual login implementation"""
        self.log.info("Logging in as %s", username)
        url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"
-        params = {
+        data = {
            "CookieDate": "1",
            "b": "d",
            "bt": "1-1",
@ -206,7 +206,7 @@ class ExhentaiGalleryExtractor(Extractor):
        }
        referer = "https://e-hentai.org/bounce_login.php?b=d&bt=1-1"
        self.session.headers["Referer"] = referer
-        response = self.session.post(url, data=params)
+        response = self.request(url, method="POST", data=data)

        if "You are now logged in as:" not in response.text:
            raise exception.AuthenticationError()
--- a/gallery_dl/extractor/gfycat.py
+++ b/gallery_dl/extractor/gfycat.py
@ -65,7 +65,7 @@ class GfycatImageExtractor(GfycatExtractor):

    def _get_info(self, gfycat_id):
        url = "https://gfycat.com/cajax/get/" + gfycat_id
-        data = self.session.get(url).json()
+        data = self.request(url).json()
        if "error" in data:
-            raise exception.NotFoundError()
+            raise exception.NotFoundError("animation")
        return data["gfyItem"]
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@ -62,7 +62,7 @@ class HentaifoundryUserExtractor(Extractor):
    def get_job_metadata(self):
        """Collect metadata for extractor-job"""
        url = self.url_base + self.artist + "?enterAgree=1"
-        response = self.session.get(url)
+        response = self.request(url, fatal=False)
        if response.status_code == 404:
            raise exception.NotFoundError("user")
        page = response.text
@ -150,7 +150,7 @@ class HentaifoundryImageExtractor(Extractor):
        """Collect metadata for an image"""
        url = "https://www.hentai-foundry.com/pictures/user/{}/{}".format(
            self.artist, self.index)
-        response = self.session.get(url + "?enterAgree=1")
+        response = self.request(url + "?enterAgree=1", fatal=False)
        if response.status_code == 404:
            raise exception.NotFoundError("image")
        extr = text.extract
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@ -23,7 +23,7 @@ class ImgurExtractor(Extractor):
        self.mp4 = self.config("mp4", True)

    def _get_data(self, urlpart):
-        response = self.session.get("https://imgur.com/" + urlpart)
+        response = self.request("https://imgur.com/" + urlpart, fatal=False)
        if response.status_code == 404:
            raise exception.NotFoundError(self.subcategory)
        data = text.extract(response.text, "image               : ", ",\n")[0]
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@ -71,9 +71,9 @@ class NijieExtractor(AsynchronousExtractor):
    def _login_impl(self, username, password):
        """Actual login implementation"""
        self.log.info("Logging in as %s", username)
-        params = {"email": username, "password": password}
-        page = self.session.post("https://nijie.info/login_int.php",
-                                 data=params).text
+        data = {"email": username, "password": password}
+        page = self.request("https://nijie.info/login_int.php",
+                            method="POST", data=data).text
        if "//nijie.info/login.php" in page:
            raise exception.AuthenticationError()
        return self.session.cookies
@ -102,7 +102,7 @@ class NijieUserExtractor(NijieExtractor):
        params = {"id": self.artist_id, "p": 1}
        url = "https://nijie.info/members_illust.php"
        while True:
-            response = self.session.get(url, params=params)
+            response = self.request(url, params=params, fatal=False)
            if response.status_code == 404:
                raise exception.NotFoundError("artist")
            ids = list(text.extract_iter(response.text, ' illust_id="', '"'))
@ -133,8 +133,8 @@ class NijieImageExtractor(NijieExtractor):
        self.page = ""

    def get_job_metadata(self):
-        response = self.session.get(self.popup_url + self.image_id,
-                                    allow_redirects=False)
+        response = self.request(self.popup_url + self.image_id,
+                                allow_redirects=False)
        if 300 <= response.status_code < 400:
            raise exception.NotFoundError("image")
        self.page = response.text
--- a/gallery_dl/extractor/pawoo.py
+++ b/gallery_dl/extractor/pawoo.py
@ -135,12 +135,10 @@ class MastodonAPI():
        """Get an account's statuses"""
        url = "{}/api/v1/accounts/{}/statuses?only_media=1".format(
            self.root, account_id)
-        while True:
+        while url:
            response = self.session.get(url)
            yield from self._parse(response)
            url = response.links.get("next", {}).get("url")
-            if not url:
-                break

    @staticmethod
    def _parse(response):
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@ -57,8 +57,8 @@ class SeigaExtractor(Extractor):
        """Actual login implementation"""
        self.log.info("Logging in as %s", username)
        url = "https://account.nicovideo.jp/api/v1/login"
-        params = {"mail_tel": username, "password": password}
-        self.session.post(url, data=params).close()
+        data = {"mail_tel": username, "password": password}
+        self.request(url, method="POST", data=data)
        if "user_session" not in self.session.cookies:
            raise exception.AuthenticationError()
        del self.session.cookies["nicosid"]
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@ -49,9 +49,11 @@ class Job():
        except exception.AuthorizationError:
            log.error("You do not have permission to access the resource "
                      "at '%s'", self.url)
-        except exception.NotFoundError as err:
-            res = str(err) or "resource (gallery/image/user)"
+        except exception.NotFoundError as exc:
+            res = str(exc) or "resource (gallery/image/user)"
            log.error("The %s at '%s' does not exist", res, self.url)
+        except exception.HttpError as exc:
+            log.error("HTTP request failed:\n%s", exc)
        except exception.StopExtraction:
            pass
        except Exception as exc: