[slideshare] improve metadata; flake8

- added 'views' and 'published' keywords
- fixed longer titles and descriptions
7 years ago · 0a9a07a6e1
parent a8d2dde8b2
commit 0a9a07a6e1
5 changed files with 40 additions and 14 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,6 +1,8 @@
 # Changelog

 ## Unreleased
+- Added support for:
+  - `slideshare` - https://www.slideshare.net/ ([#54](https://github.com/mikf/gallery-dl/issues/54))
 - Added pool- and post-extractors for `sankaku`

 ## 1.1.0 - 2017-12-08
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@ -66,6 +66,7 @@ Sankaku Channel      https://chan.sankakucomplex.com/    Pools, Posts, Tag-Searc
 Sea Otter Scans      https://reader.seaotterscans.com/   Chapters, Manga
 Sen Manga            http://raw.senmanga.com/            Chapters
 Sense-Scans          http://sensescans.com/              Chapters, Manga
+SlideShare           https://www.slideshare.net/         Presentations
 Spectrum Nexus       |http://www.thes-0|                 Chapters, Manga
 The /b/ Archive      https://thebarchive.com/            Threads
 Tumblr               https://www.tumblr.com/             Images from Users, Posts, Tag-Searches
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@ -9,7 +9,7 @@
 """Extract images from https://www.slideshare.net/"""

 from .common import Extractor, Message
-from .. import text
+from .. import text, util


 class SlideshareExtractor(Extractor):
@ -17,14 +17,21 @@ class SlideshareExtractor(Extractor):
    category = "slideshare"
    subcategory = "presentation"
    directory_fmt = ["{category}", "{user}"]
-    filename_fmt = "{presentation}-{num}.{extension}"
+    filename_fmt = "{presentation}-{num:>02}.{extension}"
    pattern = [r"(?:https?://)?(?:www\.)?slideshare\.net/"
-               r"([^/]+)/([^/]+)"]
+               r"([^/?&#]+)/([^/?&#]+)"]
    test = [
-        ("https://www.slideshare.net/Slideshare/get-started-with-slide-share", {
+        (("https://www.slideshare.net/"
+          "Slideshare/get-started-with-slide-share"), {
            "url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18",
            "content": "ee54e54898778e92696a7afec3ffabdbd98eb0cc",
        }),
+
+        # long title
+        (("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren"
+          "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), {
+            "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7",
+        }),
    ]

    def __init__(self, match):
@ -32,7 +39,8 @@ class SlideshareExtractor(Extractor):
        self.user, self.presentation = match.groups()

    def items(self):
-        page = self.request("https://www.slideshare.net/" + self.user + "/" + self.presentation).text
+        page = self.request("https://www.slideshare.net/" + self.user +
+                            "/" + self.presentation).text
        data = self.get_job_metadata(page)
        imgs = self.get_image_urls(page)
        data["count"] = len(imgs)
@ -43,17 +51,29 @@ class SlideshareExtractor(Extractor):

    def get_job_metadata(self, page):
        """Collect metadata for extractor-job"""
-        metadata = {}
-
-        text.extract_all(page, (
-            ('title', '<title>', '</title>'),
-            ('description', '<meta name="description" content="', '">'),
-        ), values=metadata)
+        descr, pos = text.extract(
+            page, '<meta name="description" content="', '"')
+        title, pos = text.extract(
+            page, '<span class="j-title-breadcrumb">', '</span>', pos)
+        views, pos = text.extract(
+            page, '<span class="notranslate pippin-data">', 'views<', pos)
+        published, pos = text.extract(
+            page, '<time datetime="', '"', pos)
+        alt_descr, pos = text.extract(
+            page, 'id="slideshow-description-paragraph" class="notranslate">',
+            '</p>', pos)

-        metadata["presentation"] = self.presentation
-        metadata["user"] = self.user
+        if descr.endswith("…") and alt_descr:
+            descr = text.remove_html(alt_descr).strip()

-        return metadata
+        return {
+            "user": self.user,
+            "presentation": self.presentation,
+            "title": text.unescape(title.strip()),
+            "description": text.unescape(descr),
+            "views": util.safe_int(views.replace(",", "")),
+            "published": published,
+        }

    @staticmethod
    def get_image_urls(page):
--- a/scripts/build_supportedsites.py
+++ b/scripts/build_supportedsites.py
@ -52,6 +52,7 @@ CATEGORY_MAP = {
    "seiga"          : "Niconico Seiga",
    "senmanga"       : "Sen Manga",
    "sensescans"     : "Sense-Scans",
+    "slideshare"     : "SlideShare",
    "spectrumnexus"  : "Spectrum Nexus",
    "thebarchive"    : "The /b/ Archive",
    "worldthree"     : "World Three",
--- a/test/test_extractors.py
+++ b/test/test_extractors.py
@ -82,6 +82,8 @@ skip = [
    "exhentai", "kissmanga", "mangafox", "dynastyscans", "nijie",
    "archivedmoe", "archiveofsins", "thebarchive",
    # temporary issues
+    "mangareader",
+    "mangapanda",
 ]
 # enable selective testing for direct calls
 if __name__ == '__main__' and len(sys.argv) > 1: