update download-infrastructure

pull/13/head
Mike Fährmann 10 years ago
parent 513808d156
commit 0abbee3710

@ -6,14 +6,13 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
""" """
import os import os
import sys import sys
import re import re
import sqlite3 import sqlite3
import importlib import importlib
from extractor.common import Message
class DownloadManager(): class DownloadManager():
@ -21,35 +20,94 @@ class DownloadManager():
self.opts = opts self.opts = opts
self.conf = conf self.conf = conf
self.downloaders = {} self.downloaders = {}
self.extractors = ExtractorFinder(conf)
def add(self, url):
    """Create a download job for 'url' and run it right away"""
    DownloadJob(self, url).run()
def get_downloader_module(self, scheme):
    """Return a downloader module suitable for 'scheme'

    Modules are imported on first use from the 'downloader' subpackage
    of this package and kept in self.downloaders, keyed by scheme, so
    each one is imported through this method only once.
    """
    cached = self.downloaders.get(scheme)
    if cached is not None:
        return cached

    # relative import: resolved against the package this module lives in
    loaded = importlib.import_module(".downloader." + scheme, __package__)
    self.downloaders[scheme] = loaded
    return loaded
def add(self, extr): def get_base_directory(self):
if self.opts.dest: if self.opts.dest:
dest = self.opts.dest return self.opts.dest
elif extr.category in self.conf:
dest = self.conf[extr.category].get("destination", "/tmp/")
else: else:
dest = self.conf["general"].get("destination", "/tmp/") return self.conf["general"].get("destination", "/tmp/")
dest = os.path.join(dest, extr.category, extr.directory)
os.makedirs(dest, exist_ok=True)
class DownloadJob():
def __init__(self, mngr, url):
    """Set up a job for 'url' using the manager's extractor lookup

    Stores the manager, the (extractor, info) pair returned by
    mngr.extractors.get_for_url(url), the manager's base directory as
    the initial target directory, and an empty per-job downloader cache.
    """
    self.mngr = mngr
    extractor, info = mngr.extractors.get_for_url(url)
    self.extractor = extractor
    self.info = info
    self.directory = mngr.get_base_directory()
    self.downloaders = {}
def run(self):
"""Execute/Run the downlaod job"""
if self.extractor is None:
return # TODO: error msg
for msg in self.extractor:
print(msg)
print(type(msg))
if msg[0] == Message.Url:
self.download(msg)
for url, filename in extr: elif msg[0] == Message.Directory:
path = os.path.join(dest, filename) self.set_directory(msg)
elif msg[0] == Message.Version:
if msg[1] != 1:
raise "unsupported message-version ({}, {})".format(
self.info.category, msg[1]
)
# TODO: support for multiple message versions
def download(self, msg):
"""Download the resource specified in 'msg'"""
_, url, metadata = msg
filename = self.info["filename"].format(**metadata)
path = os.path.join(self.directory, filename)
if os.path.exists(path): if os.path.exists(path):
self.print_skip(path) self.print_skip(path)
continue return
dl = self.get_downloader(extr, url) dl = self.get_downloader(url)
self.print_start(path) self.print_start(path)
tries = dl.download(url, path) tries = dl.download(url, path)
self.print_success(path, tries) self.print_success(path, tries)
def get_downloader(self, extr, url): def set_directory(self, msg):
end = url.find("://") """Set and create the target directory for downloads"""
proto = url[:end] if end != -1 else "http" path = []
if proto not in self.downloaders: for segment in self.info["directory"]:
# import downloader path.append(segment.format(**msg[1]))
module = importlib.import_module("."+proto, __package__) self.directory = os.path.join(
self.downloaders[proto] = module.Downloader self.mngr.get_base_directory(),
return self.downloaders[proto](extr) *path
)
os.makedirs(self.directory, exist_ok=True)
def get_downloader(self, url):
    """Return, and possibly construct, a downloader suitable for 'url'

    The text in front of the first ':' is used as scheme only if it is
    a syntactically valid URL scheme (a letter followed by letters,
    digits, '+', '-' or '.'); it is lower-cased because schemes are
    case-insensitive. Anything else - no colon at all, an empty prefix,
    or a prefix such as '//host/path' - falls back to "http".
    "https" shares the "http" downloader.

    Downloader objects are built once per scheme from the module
    returned by self.mngr.get_downloader_module(scheme) and cached in
    self.downloaders.
    """
    head, colon, _ = url.partition(":")
    if colon and re.match(r"[A-Za-z][A-Za-z0-9+.-]*\Z", head):
        scheme = head.lower()
    else:
        scheme = "http"

    # NOTE(review): 'host:8080/path' without a scheme still yields
    # 'host' here, exactly as before; it cannot be told apart from a
    # real scheme by syntax alone.
    if scheme == "https":
        scheme = "http"

    downloader = self.downloaders.get(scheme)
    if downloader is None:
        module = self.mngr.get_downloader_module(scheme)
        downloader = module.Downloader(self.extractor)
        self.downloaders[scheme] = downloader
    return downloader
@staticmethod @staticmethod
def print_start(path): def print_start(path):
@ -78,6 +136,17 @@ class ExtractorFinder():
self.load_from_database(conn) self.load_from_database(conn)
self.load_from_config(config) self.load_from_config(config)
def get_for_url(self, url):
    """Return an (extractor, info) pair for 'url'

    Each regex in the extractor module's info["pattern"] is tried
    against 'url' with re.match; on the first hit the class named by
    info["extractor"] is instantiated with the match object and
    self.config and returned together with the module's info dict.

    If no pattern matches, a message is written to stderr and
    (None, None) is returned, so the caller can unpack the result and
    test the extractor for None instead of the process being
    terminated from inside this lookup.
    """
    # TODO: implement general case
    module = importlib.import_module(".extractor.8chan", __package__)
    info = module.info
    for pattern in info["pattern"]:
        match = re.match(pattern, url)
        if match:
            klass = getattr(module, info["extractor"])
            return klass(match, self.config), info
    print("pattern mismatch", file=sys.stderr)
    return None, None
def match(self, url): def match(self, url):
for category, regex in self.match_list: for category, regex in self.match_list:
match = regex.match(url) match = regex.match(url)

Loading…
Cancel
Save