add 'extractor.*.user-agent' config option

pull/54/head
Mike Fährmann 7 years ago
parent 6913eeaa40
commit e6814aebe2
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -289,6 +289,19 @@ Description Source to read additional cookies from.
=========== =====
extractor.*.user-agent
----------------------
=========== =====
Type ``string``
Default ``"Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0"``
Description User-Agent header value to be used for HTTP requests.
Note that this option has no effect on the `pixiv` and
`readcomiconline` extractors, as these need specific values to
function correctly.
=========== =====
Extractor-specific Options
==========================

@ -17,7 +17,6 @@ class ThreedeebooruExtractor(booru.JSONBooruExtractor):
api_url = "http://behoimi.org/post/index.json"
headers = {
"Referer": "http://behoimi.org/post/show/",
"User-Agent": "Mozilla/5.0",
"Accept-Encoding": "identity",
}

@ -75,7 +75,6 @@ class FoolfuukaThreadExtractor(SharedConfigExtractor):
def __init__(self, match):
SharedConfigExtractor.__init__(self)
self.board, self.thread = match.groups()
self.session.headers["User-Agent"] = "Mozilla 5.0"
if self.referer:
self.session.headers["Referer"] = self.root

@ -34,6 +34,7 @@ class Extractor():
self.session = requests.Session()
self.log = logging.getLogger(self.category)
self._set_cookies(self.config("cookies"))
self._set_headers()
def __iter__(self):
# Iterating over an extractor object delegates to its items() method.
return self.items()
@ -96,6 +97,13 @@ class Extractor():
return username, password
def _set_headers(self):
"""Set additional headers for the 'session' object"""
# Fixed Accept-Language value stored on the session's header mapping.
self.session.headers["Accept-Language"] = "en-US,en;q=0.5"
# User-Agent is looked up through the 'user-agent' config option; the
# Firefox 54 string is passed as the second argument, presumably the
# fallback used when the option is not set (confirm against config()).
self.session.headers["User-Agent"] = self.config(
"user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:54.0) "
"Gecko/20100101 Firefox/54.0"))
def _set_cookies(self, cookies):
"""Populate the cookiejar with 'cookies'"""
if cookies:

@ -50,11 +50,7 @@ class ExhentaiGalleryExtractor(Extractor):
self.wait_max = self.config("wait-max", 6)
if self.wait_max < self.wait_min:
self.wait_max = self.wait_min
self.session.headers.update({
"User-Agent": "Mozilla/5.0",
"Accept-Language": "en-US,en;q=0.5",
"Referer": self.root + "/",
})
self.session.headers["Referer"] = self.root + "/"
def items(self):
self.login()

@ -20,6 +20,10 @@ class ReadcomiconlineExtractor(kissmanga.KissmangaExtractor):
filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
root = "http://readcomiconline.to"
def __init__(self, match):
# Run the kissmanga base initializer first, then replace the session's
# User-Agent with a fixed Wget string.
# NOTE(review): presumably this overrides the value from the
# 'user-agent' config option because this site needs a specific
# User-Agent to work -- confirm against the base-class setup.
kissmanga.KissmangaExtractor.__init__(self, match)
self.session.headers["User-Agent"] = "Wget/1.19.2 (linux-gnu)"
class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor,
kissmanga.KissmangaMangaExtractor):

@ -41,9 +41,6 @@ class SankakuTagExtractor(Extractor):
self.wait_max = self.config("wait-max", 4)
if self.wait_max < self.wait_min:
self.wait_max = self.wait_min
self.session.headers["User-Agent"] = (
"Mozilla/5.0 Gecko/20100101 Firefox/40.0"
)
def skip(self, num):
pages = min(num // 20, 49)

@ -38,7 +38,6 @@ class SenmangaChapterExtractor(Extractor):
self.chapter_url = "{}/{}/".format(self.root, part)
self.img_url = "{}/viewer/{}/".format(self.root, part)
self.session.headers["Referer"] = self.chapter_url
self.session.headers["User-Agent"] = "Mozilla 5.0"
def items(self):
data = self.get_job_metadata()

@ -37,10 +37,6 @@ class TwitterTweetExtractor(Extractor):
self.path, self.user, self.tid = match.groups()
def items(self):
self.session.headers["User-Agent"] = (
"Mozilla/5.0 (X11; Linux x86_64; rv:48.0) "
"Gecko/20100101 Firefox/48.0"
)
page = self.request("https://twitter.com/" + self.path).text
data = self.get_job_metadata()
imgs = self.get_image_urls(page)

Loading…
Cancel
Save