From ae3706286a7f394044b09ccbd6fd1b82db920515 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 15 Jun 2024 21:56:51 +0200 Subject: [PATCH] [speakerdeck] inherit from GalleryExtractor --- gallery_dl/extractor/speakerdeck.py | 42 +++++++++++------------------ test/results/speakerdeck.py | 8 ++++++ 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/gallery_dl/extractor/speakerdeck.py b/gallery_dl/extractor/speakerdeck.py index c4ef723f..3210fd8b 100644 --- a/gallery_dl/extractor/speakerdeck.py +++ b/gallery_dl/extractor/speakerdeck.py @@ -8,46 +8,35 @@ """Extractors for https://speakerdeck.com/""" -from .common import Extractor, Message +from .common import GalleryExtractor from .. import text import re -class SpeakerdeckPresentationExtractor(Extractor): +class SpeakerdeckPresentationExtractor(GalleryExtractor): """Extractor for images from a presentation on speakerdeck.com""" category = "speakerdeck" subcategory = "presentation" directory_fmt = ("{category}", "{user}") filename_fmt = "{presentation}-{num:>02}.{extension}" archive_fmt = "{presentation}_{num}" + root = "https://speakerdeck.com" pattern = r"(?:https?://)?(?:www\.)?speakerdeck\.com/([^/?#]+)/([^/?#]+)" example = "https://speakerdeck.com/USER/PRESENTATION" def __init__(self, match): - Extractor.__init__(self, match) + GalleryExtractor.__init__(self, match, "") self.user, self.presentation = match.groups() - self.presentation_id = None - def items(self): - data = self.get_job_metadata() - imgs = self.get_image_urls() - data["count"] = len(imgs) - yield Message.Directory, data - for data["num"], url in enumerate(imgs, 1): - yield Message.Url, url, text.nameext_from_url(url, data) - - def get_job_metadata(self): - """Collect metadata for extractor-job""" - url = "https://speakerdeck.com/oembed.json" + def metadata(self, _): + url = self.root + "/oembed.json" params = { - "url": "https://speakerdeck.com/" + self.user + - "/" + self.presentation, + "url": "{}/{}/{}".format(self.root, self.user, self.presentation), } - data = self.request(url, params=params).json() - self.presentation_id, pos = \ - text.extract(data["html"], 'src="//speakerdeck.com/player/', '"') + self.presentation_id = text.extr( + data["html"], 'src="//speakerdeck.com/player/', '"') return { "user": self.user, @@ -57,9 +46,10 @@ class SpeakerdeckPresentationExtractor(Extractor): "author": data["author_name"], } - def get_image_urls(self): - """Extract and return a list of all image-urls""" - page = self.request("https://speakerdeck.com/player/" + - self.presentation_id).text - page = re.sub(r"\s+", " ", page) - return list(text.extract_iter(page, 'js-sd-slide" data-url="', '"')) + def images(self, _): + url = "{}/player/{}".format(self.root, self.presentation_id) + page = re.sub(r"\s+", " ", self.request(url).text) + return [ + (url, None) + for url in text.extract_iter(page, 'js-sd-slide" data-url="', '"') + ] diff --git a/test/results/speakerdeck.py b/test/results/speakerdeck.py index 5e63a4cb..b0893233 100644 --- a/test/results/speakerdeck.py +++ b/test/results/speakerdeck.py @@ -15,6 +15,14 @@ __tests__ = ( "#pattern" : r"https://files.speakerdeck.com/presentations/50021f75cf1db900020005e7/slide_\d+.jpg", "#count" : 6, "#sha1_content": "75c7abf0969b0bcab23e0da9712c95ee5113db3a", + + "author" : "Speaker Deck", + "count" : 6, + "num" : range(1, 6), + "presentation" : "introduction-to-speakerdeck", + "presentation_id": "50021f75cf1db900020005e7", + "title" : "Introduction to SpeakerDeck", + "user" : "speakerdeck", }, )