update fallback URL handling

remove Message.Urllist; fallback URLs are now passed as a list in a '_fallback' field of the URL's kwdict
pull/2278/head
Mike Fährmann 4 years ago
parent 43dab3a228
commit a3ca2f6080
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -40,7 +40,7 @@ class Message():
- 2nd element is the (external) URL as a string
- 3rd element is a dictionary containing URL-specific metadata
- Message.Urllist:
- Message.Urllist: # obsolete
- Same as Message.Url, but its 2nd element is a list of multiple URLs
- The additional URLs serve as a fallback if the primary one fails
"""
@ -51,5 +51,5 @@ class Message():
# Headers = 4
# Cookies = 5
Queue = 6
Urllist = 7
# Urllist = 7
Metadata = 8

@ -27,7 +27,6 @@ class TwitterExtractor(Extractor):
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
cookiedomain = ".twitter.com"
root = "https://twitter.com"
sizes = (":orig", ":large", ":medium", ":small")
def __init__(self, match):
Extractor.__init__(self, match)
@ -95,9 +94,10 @@ class TwitterExtractor(Extractor):
elif "media_url_https" in media:
url = media["media_url_https"]
urls = [url + size for size in self.sizes]
tdata["_fallback"] = [
url + size for size in (":large", ":medium", ":small")]
text.nameext_from_url(url, tdata)
yield Message.Urllist, urls, tdata
yield Message.Url, url + ":orig", tdata
else:
url = media["media_url"]
@ -249,7 +249,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
test = (
("https://twitter.com/supernaturepics", {
"range": "1-40",
"url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
"url": "2b7814162028fcd238da4ff4072cf6390efe40b0",
}),
("https://mobile.twitter.com/supernaturepics?p=i"),
("https://www.twitter.com/id:2976459548"),
@ -273,7 +273,7 @@ class TwitterMediaExtractor(TwitterExtractor):
test = (
("https://twitter.com/supernaturepics/media", {
"range": "1-40",
"url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
"url": "2b7814162028fcd238da4ff4072cf6390efe40b0",
}),
("https://mobile.twitter.com/supernaturepics/media#t"),
("https://www.twitter.com/id:2976459548/media"),

@ -110,12 +110,6 @@ class Job():
if self.pred_queue(url, kwds):
self.handle_queue(url, kwds)
elif msg[0] == Message.Urllist:
_, urls, kwds = msg
if self.pred_url(urls[0], kwds):
self.update_kwdict(kwds)
self.handle_urllist(urls, kwds)
elif msg[0] == Message.Metadata:
self.update_kwdict(msg[1])
self.handle_metadata(msg[1])
@ -130,10 +124,6 @@ class Job():
def handle_url(self, url, kwdict):
"""Handle Message.Url"""
def handle_urllist(self, urls, kwdict):
"""Handle Message.Urllist"""
self.handle_url(urls[0], kwdict)
def handle_directory(self, kwdict):
"""Handle Message.Directory"""
@ -215,7 +205,7 @@ class DownloadJob(Job):
else:
self.visited = set()
def handle_url(self, url, kwdict, fallback=None):
def handle_url(self, url, kwdict):
"""Download the resource specified in 'url'"""
postprocessors = self.postprocessors
pathfmt = self.pathfmt
@ -246,7 +236,7 @@ class DownloadJob(Job):
if not self.download(url):
# use fallback URLs if available
for num, url in enumerate(fallback or (), 1):
for num, url in enumerate(kwdict.get("_fallback", ()), 1):
util.remove_file(pathfmt.temppath)
self.log.info("Trying fallback URL #%d", num)
if self.download(url):
@ -279,12 +269,6 @@ class DownloadJob(Job):
pp.run_after(pathfmt)
self._skipcnt = 0
def handle_urllist(self, urls, kwdict):
"""Download the resource specified in 'url'"""
fallback = iter(urls)
url = next(fallback)
self.handle_url(url, kwdict, fallback)
def handle_directory(self, kwdict):
"""Set and create the target directory for downloads"""
if not self.pathfmt:
@ -563,15 +547,11 @@ class UrlJob(Job):
self.handle_queue = self.handle_url
@staticmethod
def handle_url(url, _):
def handle_url(url, kwdict):
print(url)
@staticmethod
def handle_urllist(urls, _):
prefix = ""
for url in urls:
print(prefix, url, sep="")
prefix = "| "
if "_fallback" in kwdict:
for url in kwdict["_fallback"]:
print("|", url)
def handle_queue(self, url, _):
try:
@ -625,9 +605,6 @@ class DataJob(Job):
def handle_url(self, url, kwdict):
self.data.append((Message.Url, url, self.filter(kwdict)))
def handle_urllist(self, urls, kwdict):
self.data.append((Message.Urllist, list(urls), self.filter(kwdict)))
def handle_directory(self, kwdict):
self.data.append((Message.Directory, self.filter(kwdict)))

Loading…
Cancel
Save