[reddit] some small fixes

- filter or complete some URLs
- remove the 'nofollow:' scheme before printing URLs
- (#15)
7 years ago · e425243b1e
parent a22892f494
commit e425243b1e
3 changed files with 19 additions and 7 deletions
--- a/gallery_dl/extractor/recursive.py
+++ b/gallery_dl/extractor/recursive.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015, 2016 Mike Fährmann
+# Copyright 2015-2017 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -14,9 +14,9 @@ from .. import adapter


 class RecursiveExtractor(Extractor):
-
+    """Extractor that fetches URLs from a remote or local source"""
    category = "recursive"
-    pattern = ["r(?:ecursive)?:(.+)"]
+    pattern = [r"r(?:ecursive)?:(.+)"]
    test = [("recursive:https://pastebin.com/raw/FLwrCYsT", {
        "url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
    })]
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -34,7 +34,11 @@ class RedditExtractor(Extractor):
                )
            )
            for url in urls:
-                if regex.match(url):
+                if url[0] == "#":
+                    continue
+                elif url[0] == "/":
+                    url = "nofollow:https://www.reddit.com" + url
+                elif regex.match(url):
                    url = "nofollow:" + url
                yield Message.Queue, url

@@ -61,7 +65,8 @@ class RedditSubmissionExtractor(RedditExtractor):
    """Extractor for images from a submission on reddit.com"""
    subcategory = "subreddit"
    pattern = [(r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/[^/]+"
-                r"/comments/([^/]+)")]
+                r"/comments/([a-z0-9]+)"),
+               (r"(?:https?://)?redd\.it/([a-z0-9]+)")]

    def __init__(self, match):
        RedditExtractor.__init__(self)
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -214,9 +214,10 @@ class UrlJob(Job):
        Job.__init__(self, url)
        self.depth = depth
        if depth == self.maxdepth:
-            self.handle_queue = print
+            self.handle_queue = self._print

-    def handle_url(self, url, _):
+    @staticmethod
+    def handle_url(url, _):
        print(url)

    def handle_queue(self, url):
@@ -225,6 +226,12 @@ class UrlJob(Job):
        except exception.NoExtractorError:
            pass

+    @staticmethod
+    def _print(url):
+        if url.startswith("nofollow:"):
+            url = url[9:]
+        print(url)
+

 class TestJob(DownloadJob):
    """Generate test-results for extractor runs"""