improve '--write-pages' (#737)

- move code into its own function
- add enumeration index to filenames
- dump responses regardless of status code
pull/754/head
Mike Fährmann 4 years ago
parent dba87ca99e
commit f8f95e68a7
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -40,6 +40,7 @@ class Extractor():
self._cookiefile = None
self._cookiejar = self.session.cookies
self._parentdir = ""
self._write_pages = self.config("write-pages", False)
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
@ -91,22 +92,13 @@ class Extractor():
raise exception.HttpError(exc)
else:
code = response.status_code
if self._write_pages:
self._dump_response(response)
if 200 <= code < 400 or fatal is None and \
(400 <= code < 500) or not fatal and \
(400 <= code < 429 or 431 <= code < 500):
if encoding:
response.encoding = encoding
if config.get((), "write_pages", False):
# Write the response content to a .dump file
# in the current directory.
# The file name is derived from the response
# url, replacing special characters with "_"
r = re.compile(r"[\\\\|/<>:\"?*&=#]+")
outfilename = r.sub('_', response.url) + '.dump'
with open(outfilename, 'wb') as outfile:
outfile.write(response.content)
return response
if notfound and code == 404:
raise exception.NotFoundError(notfound)
@ -321,6 +313,24 @@ class Extractor():
result.append((Message.Queue, url, {"_extractor": extr}))
return iter(result)
@staticmethod
def _dump_response(response):
    """Write the response content to a .dump file in the current directory.

    The file name consists of a running index (zero-padded to at least
    three digits) followed by the response URL in which every run of
    special characters is replaced with "_".  The index and the compiled
    sanitizer are stored as attributes on the Extractor class, so the
    numbering is shared by all extractors.

    The name is truncated before the ".dump" suffix is appended, so that
    very long URLs do not produce a file name the filesystem rejects.

    response -- object providing 'url' (str) and 'content' (bytes)
    """
    if hasattr(Extractor, "_dump_index"):
        Extractor._dump_index += 1
    else:
        # First dump: create the shared counter and compile the
        # sanitizing pattern once, keeping only its bound 'sub' method.
        Extractor._dump_index = 1
        Extractor._dump_sanitize = re.compile(r"[\\\\|/<>:\"?*&=#]+").sub

    stem = "{:>03}_{}".format(
        Extractor._dump_index, Extractor._dump_sanitize('_', response.url))

    # File names are commonly limited to 255 bytes; keep 5 of them for
    # the ".dump" suffix. The index prefix sits at the front of the stem
    # and therefore survives truncation.
    # NOTE(review): counts characters, not bytes -- assumes the URL is
    # (percent-encoded) ASCII; confirm if raw non-ASCII URLs can occur.
    outfilename = stem[:250] + ".dump"

    with open(outfilename, 'wb') as outfile:
        outfile.write(response.content)
@classmethod
def _get_tests(cls):
"""Yield an extractor's test cases as (URL, RESULTS) tuples"""

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2017-2019 Mike Fährmann
# Copyright 2017-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -175,7 +175,7 @@ def build_parser():
)
output.add_argument(
"--write-pages",
dest="write_pages", nargs=0, action=ConfigConstAction, const=True,
dest="write-pages", nargs=0, action=ConfigConstAction, const=True,
help=("Write downloaded intermediary pages to files "
"in the current directory to debug problems"),
)

Loading…
Cancel
Save