fix bug when processing input file comments (#2808)

and move 'parse_inputfile()' to util.py
pull/3028/head
Mike Fährmann 2 years ago
parent 14717f3fc9
commit a6e2d96dde
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -7,7 +7,6 @@
# published by the Free Software Foundation.
import sys
import json
import logging
from . import version, config, option, output, extractor, job, util, exception
@ -32,81 +31,6 @@ def progress(urls, pformat):
yield pinfo["url"]
def parse_inputfile(file, log):
    """Filter and process strings from an input file.

    Lines starting with '#' and empty lines will be ignored.

    Lines starting with '-' will be interpreted as a key-value pair
    separated by an '=', where 'key' is a dot-separated option name and
    'value' is a JSON-parsable value. These configuration options are
    collected and attached to the next URL.

    Lines starting with '-G' are collected the same way, but are passed
    on as *global* options, i.e. they are meant to apply to all following
    URLs.

    Everything else will be used as a potential URL. A trailing comment,
    introduced by a '#' that is preceded by a space or a tab, is removed
    from URL lines.

    Example input file:

    # setting global options
    -G base-directory = "/tmp/"
    -G skip = false

    # setting local options for the next URL
    -filename="spaces_are_optional.jpg"
    -skip = true

    https://example.org/

    # next URL uses default filename and 'skip' is false.
    https://example.com/index.htm # comment1
    https://example.com/404.htm   # comment2

    Arguments:
        file: iterable of text lines (e.g. an open text file or stdin)
        log: logger-like object; only its 'warning()' method is used

    Yields:
        Either the plain URL string, or a 'util.ExtendedUrl' wrapping the
        URL together with the global and local options collected so far.
    """
    gconf = []
    lconf = []

    for line in file:
        line = line.strip()

        if not line or line[0] == "#":
            # empty line or comment
            continue

        elif line[0] == "-":
            # config spec
            if len(line) >= 2 and line[1] == "G":
                conf = gconf
                line = line[2:]
            else:
                conf = lconf
                line = line[1:]

            key, sep, value = line.partition("=")
            if not sep:
                log.warning(
                    "input file: invalid <key>=<value> pair: %s", line)
                continue

            try:
                value = json.loads(value.strip())
            except ValueError as exc:
                log.warning(
                    "input file: unable to parse '%s': %s", value, exc)
                continue

            # split 'a.b.c' into the option path ['a', 'b'] and name 'c'
            key = key.strip().split(".")
            conf.append((key[:-1], key[-1], value))

        else:
            # url
            # Cut at the *earliest* comment marker. Checking " #" and
            # "\t#" one after the other would leave part of the comment
            # behind when both occur and the later-tested one comes first.
            markers = [
                pos
                for pos in (line.find(" #"), line.find("\t#"))
                if pos >= 0
            ]
            if markers:
                line = line[:min(markers)].rstrip()

            if gconf or lconf:
                yield util.ExtendedUrl(line, gconf, lconf)
                # the collected options now belong to the yielded object;
                # start over with fresh lists for the next URL
                gconf = []
                lconf = []
            else:
                yield line
def main():
try:
if sys.stdout and sys.stdout.encoding.lower() != "utf-8":
@ -275,12 +199,12 @@ def main():
try:
if inputfile == "-":
if sys.stdin:
urls += parse_inputfile(sys.stdin, log)
urls += util.parse_inputfile(sys.stdin, log)
else:
log.warning("input file: stdin is not readable")
else:
with open(inputfile, encoding="utf-8") as file:
urls += parse_inputfile(file, log)
urls += util.parse_inputfile(file, log)
except OSError as exc:
log.warning("input file: %s", exc)

@ -418,6 +418,82 @@ CODES = {
}
def parse_inputfile(file, log):
    """Turn the lines of an input file into URLs and per-URL options.

    Line types, decided after stripping surrounding whitespace:

    * empty lines and lines starting with '#' are skipped
    * lines starting with '-' are '<key>=<value>' option specs, where
      'key' is a dot-separated option name and 'value' must be valid
      JSON; they are collected for the next URL
    * lines starting with '-G' are option specs as well, but are
      collected as *global* options
    * every other line is treated as a potential URL; a trailing
      comment (whitespace followed by '#') is removed from it

    Example input file:

    # setting global options
    -G base-directory = "/tmp/"
    -G skip = false

    # setting local options for the next URL
    -filename="spaces_are_optional.jpg"
    -skip = true

    https://example.org/

    # next URL uses default filename and 'skip' is false.
    https://example.com/index.htm # comment1
    https://example.com/404.htm   # comment2

    Malformed option specs are reported through 'log.warning()' and
    otherwise ignored.

    Yields plain URL strings, or 'ExtendedUrl' objects when options
    were collected since the previously yielded URL.
    """
    global_opts = []
    local_opts = []
    remove_comment = None  # compiled lazily, only if a comment shows up

    for raw in file:
        text = raw.strip()

        # nothing to do for blank lines and full-line comments
        if not text or text.startswith("#"):
            continue

        if text.startswith("-"):
            # option spec: pick the target list and drop the prefix
            if text[1:2] == "G":
                target, spec = global_opts, text[2:]
            else:
                target, spec = local_opts, text[1:]

            name, equals, raw_value = spec.partition("=")
            if not equals:
                log.warning(
                    "input file: invalid <key>=<value> pair: %s", spec)
                continue

            try:
                parsed = json.loads(raw_value.strip())
            except ValueError as exc:
                log.warning(
                    "input file: unable to parse '%s': %s", raw_value, exc)
                continue

            # 'a.b.c' -> option path ['a', 'b'] and option name 'c'
            parts = name.strip().split(".")
            target.append((parts[:-1], parts[-1], parsed))
            continue

        # anything else is a URL, possibly followed by a comment
        if " #" in text or "\t#" in text:
            if remove_comment is None:
                remove_comment = re.compile(r"\s+#.*").sub
            text = remove_comment("", text)

        if not (global_opts or local_opts):
            yield text
            continue

        yield ExtendedUrl(text, global_opts, local_opts)
        # hand over the collected options and start with fresh lists
        global_opts = []
        local_opts = []
class UniversalNone():
"""None-style object that supports more operations than None itself"""
__slots__ = ()

Loading…
Cancel
Save