[reddit] some small fixes

- filter or complete some URLs
- remove the 'nofollow:' scheme before printing URLs
- (#15)
pull/17/head
Mike Fährmann 7 years ago
parent a22892f494
commit e425243b1e
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015, 2016 Mike Fährmann
# Copyright 2015-2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -14,9 +14,9 @@ from .. import adapter
class RecursiveExtractor(Extractor):
"""Extractor that fetches URLs from a remote or local source"""
category = "recursive"
pattern = ["r(?:ecursive)?:(.+)"]
pattern = [r"r(?:ecursive)?:(.+)"]
test = [("recursive:https://pastebin.com/raw/FLwrCYsT", {
"url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
})]

@ -34,7 +34,11 @@ class RedditExtractor(Extractor):
)
)
for url in urls:
if regex.match(url):
if url[0] == "#":
continue
elif url[0] == "/":
url = "nofollow:https://www.reddit.com" + url
elif regex.match(url):
url = "nofollow:" + url
yield Message.Queue, url
@ -61,7 +65,8 @@ class RedditSubmissionExtractor(RedditExtractor):
"""Extractor for images from a submission on reddit.com"""
subcategory = "subreddit"
pattern = [(r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/[^/]+"
r"/comments/([^/]+)")]
r"/comments/([a-z0-9]+)"),
(r"(?:https?://)?redd\.it/([a-z0-9]+)")]
def __init__(self, match):
RedditExtractor.__init__(self)

@ -214,9 +214,10 @@ class UrlJob(Job):
Job.__init__(self, url)
self.depth = depth
if depth == self.maxdepth:
self.handle_queue = print
self.handle_queue = self._print
def handle_url(self, url, _):
@staticmethod
def handle_url(url, _):
print(url)
def handle_queue(self, url):
@ -225,6 +226,12 @@ class UrlJob(Job):
except exception.NoExtractorError:
pass
@staticmethod
def _print(url):
    """Print 'url' without its 'nofollow:' scheme prefix, if any"""
    scheme = "nofollow:"
    # Only a leading scheme marker is dropped; the rest of the URL is
    # written to stdout unchanged.
    target = url[len(scheme):] if url.startswith(scheme) else url
    print(target)
class TestJob(DownloadJob):
"""Generate test-results for extractor runs"""

Loading…
Cancel
Save