allow specifying a minimum/maximum for 'sleep-*' options (#1835)

for example '"sleep-request": [5.0, 10.0]' to wait between 5 and 10
seconds between each HTTP request
pull/1853/head
Mike Fährmann 3 years ago
parent bd845303ad
commit c9e6693530
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -314,7 +314,7 @@ Description
extractor.*.sleep extractor.*.sleep
----------------- -----------------
Type Type
``float`` |Duration|_
Default Default
``0`` ``0``
Description Description
@ -324,7 +324,7 @@ Description
extractor.*.sleep-extractor extractor.*.sleep-extractor
--------------------------- ---------------------------
Type Type
``float`` |Duration|_
Default Default
``0`` ``0``
Description Description
@ -335,7 +335,7 @@ Description
extractor.*.sleep-request extractor.*.sleep-request
------------------------- -------------------------
Type Type
``float`` |Duration|_
Default Default
``0`` ``0``
Description Description
@ -3167,7 +3167,8 @@ Custom Types
Date Date
---- ----
Type Type
``string`` or ``integer`` * ``string``
* ``integer``
Example Example
* ``"2019-01-01T00:00:00"`` * ``"2019-01-01T00:00:00"``
* ``"2019"`` with ``"%Y"`` as `date-format`_ * ``"2019"`` with ``"%Y"`` as `date-format`_
@ -3179,10 +3180,28 @@ Description
* If given as ``integer``, it is interpreted as UTC timestamp. * If given as ``integer``, it is interpreted as UTC timestamp.
Duration
--------
Type
* ``float``
* ``list`` with 2 ``floats``
Example
* ``2.85``
* ``[1.5, 3.0]``
Description
A |Duration|_ represents a span of time in seconds.
* If given as a single ``float``, it will be used as that exact value.
* If given as a ``list`` with 2 floating-point numbers ``a`` & ``b``,
a value ``N`` will be randomly chosen with uniform distribution such that ``a <= N <= b``.
(see `random.uniform() <https://docs.python.org/3/library/random.html#random.uniform>`_)
Path Path
---- ----
Type Type
``string`` or ``list`` of ``strings`` * ``string``
* ``list`` of ``strings``
Example Example
* ``"file.ext"`` * ``"file.ext"``
* ``"~/path/to/file.ext"`` * ``"~/path/to/file.ext"``
@ -3328,6 +3347,7 @@ Description
.. |datetime| replace:: ``datetime`` .. |datetime| replace:: ``datetime``
.. |datetime.max| replace:: ``datetime.max`` .. |datetime.max| replace:: ``datetime.max``
.. |Date| replace:: ``Date`` .. |Date| replace:: ``Date``
.. |Duration| replace:: ``Duration``
.. |Path| replace:: ``Path`` .. |Path| replace:: ``Path``
.. |Last-Modified| replace:: ``Last-Modified`` .. |Last-Modified| replace:: ``Last-Modified``
.. |Logging Configuration| replace:: ``Logging Configuration`` .. |Logging Configuration| replace:: ``Logging Configuration``

@ -54,13 +54,13 @@ class Extractor():
self._retries = self.config("retries", 4) self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30) self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True) self._verify = self.config("verify", True)
self.request_interval = self.config( self._interval = util.build_duration_func(
"sleep-request", self.request_interval) self.config("sleep-request", self.request_interval),
self.request_interval_min,
)
if self._retries < 0: if self._retries < 0:
self._retries = float("inf") self._retries = float("inf")
if self.request_interval < self.request_interval_min:
self.request_interval = self.request_interval_min
self._init_session() self._init_session()
self._init_cookies() self._init_cookies()
@ -114,8 +114,8 @@ class Extractor():
response = None response = None
tries = 1 tries = 1
if self.request_interval: if self._interval:
seconds = (self.request_interval - seconds = (self._interval() -
(time.time() - Extractor.request_timestamp)) (time.time() - Extractor.request_timestamp))
if seconds > 0.0: if seconds > 0.0:
self.log.debug("Sleeping for %.5s seconds", seconds) self.log.debug("Sleeping for %.5s seconds", seconds)

@ -72,9 +72,9 @@ class Job():
log = extractor.log log = extractor.log
msg = None msg = None
sleep = extractor.config("sleep-extractor") sleep = util.build_duration_func(extractor.config("sleep-extractor"))
if sleep: if sleep:
time.sleep(sleep) time.sleep(sleep())
try: try:
for msg in extractor: for msg in extractor:
@ -236,7 +236,7 @@ class DownloadJob(Job):
return return
if self.sleep: if self.sleep:
time.sleep(self.sleep) time.sleep(self.sleep())
# download from URL # download from URL
if not self.download(url): if not self.download(url):
@ -398,7 +398,7 @@ class DownloadJob(Job):
if kwdict: if kwdict:
pathfmt.set_directory(kwdict) pathfmt.set_directory(kwdict)
self.sleep = cfg("sleep") self.sleep = util.build_duration_func(cfg("sleep"))
self.fallback = cfg("fallback", True) self.fallback = cfg("fallback", True)
if not cfg("download", True): if not cfg("download", True):
# monkey-patch method to do nothing and always return True # monkey-patch method to do nothing and always return True
@ -541,7 +541,7 @@ class SimulationJob(DownloadJob):
self.pathfmt.set_filename(kwdict) self.pathfmt.set_filename(kwdict)
self.out.skip(self.pathfmt.path) self.out.skip(self.pathfmt.path)
if self.sleep: if self.sleep:
time.sleep(self.sleep) time.sleep(self.sleep())
if self.archive: if self.archive:
self.archive.add(kwdict) self.archive.add(kwdict)
@ -695,9 +695,10 @@ class DataJob(Job):
self.filter = util.identity if private else util.filter_dict self.filter = util.identity if private else util.filter_dict
def run(self): def run(self):
sleep = self.extractor.config("sleep-extractor") sleep = util.build_duration_func(
self.extractor.config("sleep-extractor"))
if sleep: if sleep:
time.sleep(sleep) time.sleep(sleep())
# collect data # collect data
try: try:

@ -409,6 +409,24 @@ def compile_expression(expr, name="<expr>", globals=GLOBALS):
return functools.partial(eval, code_object, globals) return functools.partial(eval, code_object, globals)
def build_duration_func(duration, min=0.0):
    """Build a zero-argument callable that yields a duration in seconds.

    Parameters:
        duration: A single number, a pair ``(lower, upper)`` of numbers,
            or a falsy value (``None``, ``0``) meaning "no duration".
        min: Lower bound applied to every value taken from 'duration'.

    Returns:
        ``None`` if 'duration' is falsy and no positive 'min' is required.
        Otherwise a callable without arguments that returns either a fixed
        value or, for a pair, a value drawn by random.uniform() from the
        clamped bounds.

    Raises:
        ValueError: if 'duration' is an iterable whose length is not 2.
    """
    if not duration:
        # An unset or zero duration must still honor a required minimum;
        # returning None here would let callers skip the wait entirely.
        if min > 0.0:
            return lambda: min
        return None

    try:
        lower, upper = duration
    except TypeError:
        # not iterable: a single number, used as a fixed duration
        value = duration if duration > min else min
        return lambda: value

    # clamp both bounds so that no drawn value falls below 'min'
    return functools.partial(
        random.uniform,
        lower if lower > min else min,
        upper if upper > min else min,
    )
def build_predicate(predicates): def build_predicate(predicates):
if not predicates: if not predicates:
return lambda url, kwdict: True return lambda url, kwdict: True

Loading…
Cancel
Save