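replace the 'expect' argument of extractor.request()

with two keyword arguments:
- 'fatal': when set to False, return 4xx responses (except 429 and 430) instead of treating them as errors
- 'notfound': when set, raise NotFoundError with this value on a 404 response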
pull/359/head
Mike Fährmann 5 years ago
parent 2ff73873f0
commit fdec59f8e2
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -79,9 +79,7 @@ class ArtstationExtractor(Extractor):
def get_user_info(self, username):
"""Return metadata for a specific user"""
url = "{}/users/{}/quick.json".format(self.root, username.lower())
response = self.request(url, expect=(404,))
if response.status_code == 404:
raise exception.NotFoundError("user")
response = self.request(url, notfound="user")
return response.json()
def _pagination(self, url, params=None):

@ -66,8 +66,8 @@ class Extractor():
return config.interpolate(
("extractor", self.category, self.subcategory, key), default)
def request(self, url, method="GET", *, session=None,
encoding=None, expect=(), retries=None, **kwargs):
def request(self, url, method="GET", *, session=None, retries=None,
encoding=None, fatal=True, notfound=None, **kwargs):
tries = 1
retries = self._retries if retries is None else retries
session = self.session if session is None else session
@ -86,10 +86,13 @@ class Extractor():
raise exception.HttpError(exc)
else:
code = response.status_code
if 200 <= code < 400 or code in expect:
if 200 <= code < 400 or not fatal and \
(400 <= code < 429 or 431 <= code < 500):
if encoding:
response.encoding = encoding
return response
if notfound and code == 404:
raise exception.NotFoundError(notfound)
if cloudflare.is_challenge(response):
self.log.info("Solving Cloudflare challenge")
url, domain, cookies = cloudflare.solve_challenge(
@ -98,7 +101,7 @@ class Extractor():
continue
msg = "{}: {} for url: {}".format(code, response.reason, url)
if code < 500 and code != 429:
if code < 500 and code != 429 and code != 430:
break
self.log.debug("%s (%s/%s)", msg, tries, retries+1)

@ -416,7 +416,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
def deviations(self):
url = "{}/{}/{}".format(self.root, self.user, self.path)
response = self._html_request(url, expect=range(400, 500))
response = self._html_request(url, fatal=False)
deviation_id = text.extract(response.text, '//deviation/', '"')[0]
if response.status_code >= 400 or not deviation_id:
raise exception.NotFoundError("image")
@ -767,7 +767,7 @@ class DeviantartAPI():
def user_profile(self, username):
"""Get user profile information"""
endpoint = "user/profile/" + username
return self._call(endpoint, expect_error=True)
return self._call(endpoint, fatal=False)
def authenticate(self, refresh_token):
"""Authenticate the application by requesting an access token"""
@ -797,7 +797,7 @@ class DeviantartAPI():
_refresh_token_cache.update(refresh_token, data["refresh_token"])
return "Bearer " + data["access_token"]
def _call(self, endpoint, params=None, expect_error=False, public=True):
def _call(self, endpoint, params=None, fatal=True, public=True):
"""Call an API endpoint"""
url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint
while True:
@ -806,11 +806,7 @@ class DeviantartAPI():
self.authenticate(None if public else self.refresh_token)
response = self.extractor.request(
url,
params=params,
headers=self.headers,
expect=range(400, 500),
)
url, headers=self.headers, params=params, fatal=False)
data = response.json()
status = response.status_code
@ -818,7 +814,7 @@ class DeviantartAPI():
if self.delay > self.delay_min:
self.delay -= 1
return data
if expect_error:
if not fatal:
return None
if data.get("error_description") == "User not found.":
raise exception.NotFoundError("user or group")

@ -259,7 +259,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def _gallery_page(self):
url = "{}/g/{}/{}/".format(
self.root, self.gallery_id, self.gallery_token)
response = self.request(url, expect=range(400, 500))
response = self.request(url, fatal=False)
page = response.text
if response.status_code == 404 and "Gallery Not Available" in page:
@ -271,7 +271,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def _image_page(self):
url = "{}/s/{}/{}-{}".format(
self.root, self.image_token, self.gallery_id, self.image_num)
page = self.request(url, expect=range(400, 500)).text
page = self.request(url, fatal=False).text
if page.startswith(("Invalid page", "Keep trying")):
raise exception.NotFoundError("image page")

@ -16,16 +16,15 @@ import json
class ImgurExtractor(Extractor):
"""Base class for imgur extractors"""
category = "imgur"
root = "https://imgur.com"
def __init__(self, match):
Extractor.__init__(self, match)
self.item_id = match.group(1)
self.mp4 = self.config("mp4", True)
def _get_data(self, urlpart):
response = self.request("https://imgur.com/" + urlpart, expect=(404,))
if response.status_code == 404:
raise exception.NotFoundError(self.subcategory)
def _get_data(self, path):
response = self.request(self.root + path, notfound=self.subcategory)
data = text.extract(response.text, "image : ", ",\n")[0]
return self._clean(json.loads(data))
@ -102,7 +101,7 @@ class ImgurImageExtractor(ImgurExtractor):
)
def items(self):
image = self._get_data(self.item_id)
image = self._get_data("/" + self.item_id)
url = self._prepare(image)
yield Message.Version, 1
@ -165,13 +164,13 @@ class ImgurAlbumExtractor(ImgurExtractor):
)
def items(self):
album = self._get_data("a/" + self.item_id + "/all")
album = self._get_data("/a/" + self.item_id + "/all")
images = album["album_images"]["images"]
del album["album_images"]
if int(album["num_images"]) > len(images):
url = ("https://imgur.com/ajaxalbums/getimages/" +
self.item_id + "/hit.json")
url = "{}/ajaxalbums/getimages/{}/hit.json".format(
self.root, self.item_id)
images = self.request(url).json()["data"]["images"]
yield Message.Version, 1

@ -106,13 +106,8 @@ class NijieExtractor(AsynchronousMixin, Extractor):
params = {"id": self.user_id, "p": 1}
while True:
response = self.request(url, params=params, expect=(404,))
if response.status_code == 404:
raise exception.NotFoundError("artist")
page = response.text
ids = list(text.extract_iter(page, ' illust_id="', '"'))
yield from ids
page = self.request(url, params=params, notfound="artist").text
yield from text.extract_iter(page, 'illust_id="', '"')
if '<a rel="next"' not in page:
return
@ -190,10 +185,8 @@ class NijieImageExtractor(NijieExtractor):
self.page = ""
def get_job_metadata(self):
response = self.request(self.view_url + self.image_id, expect=(404,))
if response.status_code == 404:
raise exception.NotFoundError("image")
self.page = response.text
self.page = self.request(
self.view_url + self.image_id, notfound="image").text
self.user_id = text.extract(
self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0]
return NijieExtractor.get_job_metadata(self)

@ -228,14 +228,14 @@ class PinterestAPI():
params = {"data": json.dumps({"options": options}), "source_url": ""}
response = self.extractor.request(
url, params=params, headers=self.HEADERS, expect=range(400, 500))
url, params=params, headers=self.HEADERS, fatal=False)
try:
data = response.json()
except ValueError:
data = {}
if 200 <= response.status_code < 400 and not response.history:
if response.status_code < 400 and not response.history:
return data
if response.status_code == 404 or response.history:

@ -143,9 +143,7 @@ class PixivMeExtractor(PixivExtractor):
def items(self):
url = "https://pixiv.me/" + self.account
response = self.request(
url, method="HEAD", allow_redirects=False, expect=(404,))
if response.status_code == 404:
raise exception.NotFoundError("user")
url, method="HEAD", allow_redirects=False, notfound="user")
yield Message.Version, 1
yield Message.Queue, response.headers["Location"], {}
@ -445,7 +443,7 @@ class PixivAppAPI():
data["password"] = password
response = self.extractor.request(
url, method="POST", data=data, expect=(400,))
url, method="POST", data=data, fatal=False)
if response.status_code >= 400:
raise exception.AuthenticationError()
@ -491,10 +489,9 @@ class PixivAppAPI():
url = "https://app-api.pixiv.net/" + endpoint
self.login()
response = self.extractor.request(
url, params=params, expect=range(400, 500))
response = self.extractor.request(url, params=params, fatal=False)
if 200 <= response.status_code < 400:
if response.status_code < 400:
return response.json()
if response.status_code == 404:
raise exception.NotFoundError()

@ -235,8 +235,7 @@ class RedditAPI():
url = "https://oauth.reddit.com" + endpoint
params["raw_json"] = 1
self.authenticate()
response = self.extractor.request(
url, params=params, expect=range(400, 500))
response = self.extractor.request(url, params=params, fatal=False)
remaining = response.headers.get("x-ratelimit-remaining")
if remaining and float(remaining) < 2:
wait = int(response.headers["x-ratelimit-reset"])

@ -110,8 +110,8 @@ class SankakucomplexTagExtractor(SankakucomplexExtractor):
yield Message.Version, 1
while True:
url = "{}/{}/page/{}/".format(self.root, self.path, pnum)
response = self.request(url, expect=(404,))
if response.status_code == 404:
response = self.request(url, fatal=False)
if response.status_code >= 400:
return
for url in text.extract_iter(response.text, 'data-direct="', '"'):
if url != last:

@ -43,9 +43,7 @@ class SeigaExtractor(Extractor):
"""Get url for an image with id 'image_id'"""
url = "{}/image/source/{}".format(self.root, image_id)
response = self.request(
url, method="HEAD", allow_redirects=False, expect=(404,))
if response.status_code == 404:
raise exception.NotFoundError("image")
url, method="HEAD", allow_redirects=False, notfound="image")
return response.headers["Location"].replace("/o/", "/priv/", 1)
def login(self):

@ -49,10 +49,10 @@ class SexcomExtractor(Extractor):
return
url = text.urljoin(self.root, url)
def _parse_pin(self, url, expect=range(400, 429)):
response = self.request(url, expect=expect)
def _parse_pin(self, url):
response = self.request(url, fatal=False)
if response.status_code >= 400:
self.log.warning("Unable to fetch %s (%s: %s)",
self.log.warning('Unable to fetch %s ("%s: %s")',
url, response.status_code, response.reason)
return None
extr = text.extract_from(response.text)

@ -10,7 +10,6 @@
from .common import Extractor, Message, SharedConfigMixin, generate_extractors
from .. import text
import time
import re
@ -24,19 +23,9 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
Extractor.__init__(self, match)
self.item_url = self.root + match.group(1)
def request(self, url, method="GET", expect=range(400, 500), **kwargs):
tries = 0
kwargs["expect"] = expect
while True:
response = Extractor.request(self, url, method, **kwargs)
if response.status_code not in (429, 430):
return response
tries += 1
waittime = 2 ** (tries + 2)
self.log.warning(
"HTTP status %s: %s - Waiting for %d seconds",
response.status_code, response.reason, waittime)
time.sleep(waittime)
def request(self, url, **kwargs):
kwargs["retries"] = float("inf")
return Extractor.request(self, url, **kwargs)
def items(self):
data = self.metadata()
@ -45,9 +34,10 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
headers = {"X-Requested-With": "XMLHttpRequest"}
for url in self.products():
response = self.request(url + ".json", headers=headers)
response = self.request(
url + ".json", headers=headers, fatal=False)
if response.status_code >= 400:
self.log.warning('Skipping %s ("%d: %s")',
self.log.warning('Skipping %s ("%s: %s")',
url, response.status_code, response.reason)
continue
product = response.json()["product"]

@ -107,9 +107,9 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
def images(self, page):
url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
headers = {"Referer": self.chapter_url}
response = self.request(url, headers=headers, expect=(404,))
response = self.request(url, headers=headers, fatal=False)
if response.status_code == 404:
if response.status_code >= 400:
url = "{}/Read/View/{}".format(self.root, self.gallery_id)
self.log.error(
"Failed to get gallery JSON data. Visit '%s' in a browser "

@ -18,12 +18,6 @@ class XvideosExtractor(Extractor):
category = "xvideos"
root = "https://www.xvideos.com"
def get_page(self, url, codes=(403, 404)):
response = self.request(url, expect=codes)
if response.status_code in codes:
raise exception.NotFoundError(self.subcategory)
return response.text
class XvideosGalleryExtractor(XvideosExtractor):
"""Extractor for user profile galleries from xvideos.com"""
@ -50,7 +44,7 @@ class XvideosGalleryExtractor(XvideosExtractor):
def items(self):
url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid)
page = self.get_page(url)
page = self.request(url, notfound=self.subcategory).text
data = self.get_metadata(page)
imgs = self.get_images(page)
data["count"] = len(imgs)
@ -113,7 +107,7 @@ class XvideosUserExtractor(XvideosExtractor):
def items(self):
url = "{}/profiles/{}".format(self.root, self.user)
page = self.get_page(url)
page = self.request(url, notfound=self.subcategory).text
data = json.loads(text.extract(
page, "xv.conf=", ";</script>")[0])["data"]

@ -126,7 +126,7 @@ class OAuth1API():
self.session = extractor.session
self.api_key = api_key
def request(self, url, method="GET", *, expect=range(400, 500), **kwargs):
kwargs["expect"] = expect
def request(self, url, method="GET", **kwargs):
kwargs["fatal"] = False
kwargs["session"] = self.session
return self.extractor.request(url, method, **kwargs)

Loading…
Cancel
Save