|
|
@ -857,15 +857,20 @@ class InfoJob(Job):
|
|
|
|
class DataJob(Job):
|
|
|
|
class DataJob(Job):
|
|
|
|
"""Collect extractor results and dump them"""
|
|
|
|
"""Collect extractor results and dump them"""
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True):
|
|
|
|
def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True,
|
|
|
|
|
|
|
|
resolve=False):
|
|
|
|
Job.__init__(self, url, parent)
|
|
|
|
Job.__init__(self, url, parent)
|
|
|
|
self.file = file
|
|
|
|
self.file = file
|
|
|
|
self.data = []
|
|
|
|
self.data = []
|
|
|
|
self.ascii = config.get(("output",), "ascii", ensure_ascii)
|
|
|
|
self.ascii = config.get(("output",), "ascii", ensure_ascii)
|
|
|
|
|
|
|
|
self.resolve = 128 if resolve is True else resolve
|
|
|
|
|
|
|
|
|
|
|
|
private = config.get(("output",), "private")
|
|
|
|
private = config.get(("output",), "private")
|
|
|
|
self.filter = dict.copy if private else util.filter_dict
|
|
|
|
self.filter = dict.copy if private else util.filter_dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if resolve:
|
|
|
|
|
|
|
|
self.handle_queue = self.handle_queue_resolve
|
|
|
|
|
|
|
|
|
|
|
|
def run(self):
|
|
|
|
def run(self):
|
|
|
|
self._init()
|
|
|
|
self._init()
|
|
|
|
|
|
|
|
|
|
|
@ -891,6 +896,7 @@ class DataJob(Job):
|
|
|
|
for msg in self.data:
|
|
|
|
for msg in self.data:
|
|
|
|
util.transform_dict(msg[-1], util.number_to_string)
|
|
|
|
util.transform_dict(msg[-1], util.number_to_string)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if self.file:
|
|
|
|
# dump to 'file'
|
|
|
|
# dump to 'file'
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
util.dump_json(self.data, self.file, self.ascii, 2)
|
|
|
|
util.dump_json(self.data, self.file, self.ascii, 2)
|
|
|
@ -908,3 +914,17 @@ class DataJob(Job):
|
|
|
|
|
|
|
|
|
|
|
|
def handle_queue(self, url, kwdict):
|
|
|
|
def handle_queue(self, url, kwdict):
|
|
|
|
self.data.append((Message.Queue, url, self.filter(kwdict)))
|
|
|
|
self.data.append((Message.Queue, url, self.filter(kwdict)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def handle_queue_resolve(self, url, kwdict):
|
|
|
|
|
|
|
|
cls = kwdict.get("_extractor")
|
|
|
|
|
|
|
|
if cls:
|
|
|
|
|
|
|
|
extr = cls.from_url(url)
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
extr = extractor.find(url)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if not extr:
|
|
|
|
|
|
|
|
return self.data.append((Message.Queue, url, self.filter(kwdict)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
job = self.__class__(extr, self, None, self.ascii, self.resolve-1)
|
|
|
|
|
|
|
|
job.data = self.data
|
|
|
|
|
|
|
|
job.run()
|
|
|
|