[reddit] support comment embeds (#5366)

6 months ago · 095e5ded6f
parent 64948f2c09
commit 095e5ded6f
2 changed files with 48 additions and 3 deletions
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -74,8 +74,8 @@ class RedditExtractor(Extractor):
                        yield Message.Url, url, submission

                    elif "gallery_data" in media:
-                        for submission["num"], url in enumerate(
-                                self._extract_gallery(media), 1):
+                        for url in self._extract_gallery(media):
+                            submission["num"] += 1
                            text.nameext_from_url(url, submission)
                            yield Message.Url, url, submission

@@ -99,7 +99,10 @@ class RedditExtractor(Extractor):
                            urls.append((url, submission))
                    for comment in comments:
                        html = comment["body_html"] or ""
-                        if ' href="' in html:
+                        href = (' href="' in html)
+                        media = ("media_metadata" in comment)
+
+                        if media or href:
                            comment["date"] = text.parse_timestamp(
                                comment["created_utc"])
                            if submission:
@@ -107,6 +110,14 @@ class RedditExtractor(Extractor):
                                data["comment"] = comment
                            else:
                                data = comment
+
+                        if media:
+                            for embed in self._extract_embed(comment):
+                                submission["num"] += 1
+                                text.nameext_from_url(embed, submission)
+                                yield Message.Url, embed, submission
+
+                        if href:
                            for url in text.extract_iter(html, ' href="', '"'):
                                urls.append((url, data))

@@ -118,6 +129,7 @@ class RedditExtractor(Extractor):
                    if url.startswith((
                        "https://www.reddit.com/message/compose",
                        "https://reddit.com/message/compose",
+                        "https://preview.redd.it/",
                    )):
                        continue

@@ -172,6 +184,27 @@ class RedditExtractor(Extractor):
                    submission["id"], item["media_id"])
                self.log.debug(src)

+    def _extract_embed(self, submission):
+        meta = submission["media_metadata"]
+        if not meta:
+            return
+
+        for mid, data in meta.items():
+            if data["status"] != "valid" or "s" not in data:
+                self.log.warning(
+                    "embed %s: skipping item %s (status: %s)",
+                    submission["id"], mid, data.get("status"))
+                continue
+            src = data["s"]
+            url = src.get("u") or src.get("gif") or src.get("mp4")
+            if url:
+                yield url.partition("?")[0].replace("/preview.", "/i.", 1)
+            else:
+                self.log.error(
+                    "embed %s: unable to fetch download URL for item %s",
+                    submission["id"], mid)
+                self.log.debug(src)
+
    def _extract_video_ytdl(self, submission):
        return "https://www.reddit.com" + submission["permalink"]

--- a/test/results/reddit.py
+++ b/test/results/reddit.py
@@ -168,6 +168,18 @@ __tests__ = (
    "#count"   : 0,
 },

+{
+    "#url"     : "https://www.reddit.com/r/RobloxArt/comments/15ko0qu/",
+    "#comment" : "comment embeds (#5366)",
+    "#category": ("", "reddit", "submission"),
+    "#class"   : reddit.RedditSubmissionExtractor,
+    "#options" : {"comments": 10},
+    "#urls"    : (
+        "https://i.redd.it/ppt5yciyipgb1.jpg",
+        "https://i.redd.it/u0ojzd69kpgb1.png",
+    ),
+},
+
 {
    "#url"     : "https://www.reddit.com/user/TheSpiritTree/comments/srilyf/",
    "#comment" : "user page submission (#2301)",