gallery-dl/gallery_dl/extractor/myportfolio.py

# -*- coding: utf-8 -*-

# Copyright 2018-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.myportfolio.com/"""

from .common import Extractor, Message
from .. import text, exception


class MyportfolioGalleryExtractor(Extractor):
    """Extractor for galleries and gallery listings on myportfolio.com

    A URL alone does not reveal whether it points to a single gallery
    or to a listing of galleries, so the decision is made from the
    content of the downloaded page (see items()).
    """
    category = "myportfolio"
    subcategory = "gallery"
    directory_fmt = ("{category}", "{user}", "{title}")
    filename_fmt = "{num:>02}.{extension}"
    archive_fmt = "{user}_{filename}"
    # group 1: any domain given with an explicit "myportfolio:" prefix
    # group 2: a regular "<name>.myportfolio.com" domain
    # group 3: optional single path segment
    pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"
               r"(?:https?://)?([\w-]+\.myportfolio\.com))"
               r"(/[^/?#]+)?")
    example = "https://USER.myportfolio.com/TITLE"

    def __init__(self, match):
        """Store domain, path and URL prefix taken from 'match'"""
        Extractor.__init__(self, match)
        prefixed, regular, self.path = match.groups()
        self.domain = prefixed or regular
        # re-apply the "myportfolio:" prefix to queued URLs only if the
        # input URL was given with it
        self.prefix = "myportfolio:" if prefixed else ""

    def items(self):
        """Yield queue messages for a listing or files for a gallery"""
        response = self.request(
            "https://" + self.domain + (self.path or ""))

        # a redirect ending at ".adobe.com/missing" means
        # the requested page does not exist
        if response.history and \
                response.url.endswith(".adobe.com/missing"):
            raise exception.NotFoundError()
        page = response.text

        covers = text.extr(
            page, '<section class="project-covers', '</section>')

        if not covers:
            # no project covers: treat the page as a single gallery
            info = self.metadata(page)
            urls = self.images(page)
            info["count"] = len(urls)
            yield Message.Directory, info
            for num, url in enumerate(urls, 1):
                info["num"] = num
                yield Message.Url, url, text.nameext_from_url(url, info)
            return

        # gallery listing: hand every linked project back to this extractor
        queue_data = {"_extractor": MyportfolioGalleryExtractor}
        root = self.prefix + "https://" + self.domain
        for href in text.extract_iter(covers, ' href="', '"'):
            yield Message.Queue, root + href, queue_data

    @staticmethod
    def metadata(page):
        """Return a dict with 'user', 'title' and 'description' of 'page'

        Raises NotFoundError if neither an og:title value
        nor an <h1> element could be extracted.
        """
        # og:title contains data as "<user> - <title>", but both parts
        # may contain a "-" themselves. When an <h1> is available, its
        # text is used as title and a matching number of characters
        # is removed from the end of the og:title content.

        extr = text.extract_from(page)

        # the order of these extr() calls is kept as is
        og_title = extr('property="og:title" content="', '"')
        if not og_title:
            og_title = extr('property=og:title content="', '"')

        og_descr = extr('property="og:description" content="', '"')
        if not og_descr:
            og_descr = extr('property=og:description content="', '"')

        heading = extr('<h1 ', '</h1>')

        if not heading and not og_title:
            raise exception.NotFoundError()

        if heading:
            # drop the remaining <h1> attributes up to the closing ">"
            title = heading.partition(">")[2]
            # cut the title plus the 3-character " - " separator
            user = og_title[:-(len(title) + 3)]
        else:
            user, _, title = og_title.partition(" - ")

        result = {}
        result["user"] = text.unescape(user)
        result["title"] = text.unescape(title)
        result["description"] = text.unescape(og_descr)
        return result

    @staticmethod
    def images(page):
        """Return a list of all image URLs found in 'page'"""
        urls = list(text.extract_iter(
            page, 'js-lightbox" data-src="', '"'))
        if urls:
            return urls
        # fallback: no lightbox images found, collect every 'data-src' value
        return list(text.extract_iter(page, 'data-src="', '"'))
[myportfolio] add user and gallery extractors (#95) 6 years ago			`# -- coding: utf-8 --`

remove '&' from URL patterns part 2 follow-up on 968d3e8465d70bf589b87ff79182ee9cae3ce4fb 1 year ago			`# Copyright 2018-2023 Mike Fährmann`
[myportfolio] add user and gallery extractors (#95) 6 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

remove '&' from URL patterns part 2 follow-up on 968d3e8465d70bf589b87ff79182ee9cae3ce4fb 1 year ago			`"""Extractors for https://www.myportfolio.com/"""`
[myportfolio] add user and gallery extractors (#95) 6 years ago
			`from .common import Extractor, Message`
[myportfolio] raise 'NotFoundError' for deleted posts 4 years ago			`from .. import text, exception`
[myportfolio] add user and gallery extractors (#95) 6 years ago

			`class MyportfolioGalleryExtractor(Extractor):`
			`"""Extractor for an image gallery on www.myportfolio.com"""`
			`category = "myportfolio"`
			`subcategory = "gallery"`
simplify extractor constants - single strings for URL patterns - tuples instead of lists for 'directory_fmt' and 'test' - single-tuple tests where applicable 6 years ago			`directory_fmt = ("{category}", "{user}", "{title}")`
[myportfolio] add user and gallery extractors (#95) 6 years ago			`filename_fmt = "{num:>02}.{extension}"`
change results of text.nameext_from_url() Instead of getting a complete 'filename' from an URL and splitting that into 'name' and 'extension', the new approach gets rid of the complete version and renames 'name' to 'filename'. (Using anything other than {extension} for a filename extension doesn't really work anyway) Example: "https://example.org/path/filename.ext" before: - filename : filename.ext - name : filename - extension: ext now: - filename : filename - extension: ext 6 years ago			`archive_fmt = "{user}_{filename}"`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 6 years ago			`pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"`
generic extractor (#735) * Generic extractor, see issue #683 * Fix failed test_names test, no subcategory needed * Prefix directory_fmt with "generic" * Relax regex (would break some urls) * Flake8 compliance * pattern: don't require a scheme This fixes a bug when we force the generic extractor on urls without a scheme (that are allowed by all other extractors). * Fix using g: and r: on urls without http(s) scheme Almost all extractors accept urls without an initial http(s) scheme. Many extractors also allow for generic subdomains in their "pattern" variable; some of them implement this with the regex character class "[^.]+" (everything but a dot). This leads to a problem when the extractor is given a url starting with g: or r: (to force using the generic or recursive extractor) and without the http(s) scheme: e.g. with "r:foobar.tumblr.com" the "r:" is wrongly considered part of the subdomain. This commit fixes the bug, replacing the too generic "[^.]+" with the more specific "[\w-]+" (letters, digits and "-", the only characters allowed in domain names), which is already used by some extractors. * Relax imageurl_pattern_ext: allow relative urls * First round of small suggested changes * Support image urls starting with "//" * self.baseurl: remove trailing slash * Relax regexp (didn't catch some image urls) * Some fixes and cleanup * Fix domain pattern; option to enable extractor Fixed the domain section for "pattern", to pass "test_add" and "test_add_module" tests. Added the "enabled" configuration option (default False) to enable the generic extractor. Using "g(eneric):URL" forces using the extractor. 3 years ago			`r"(?:https?://)?([\w-]+\.myportfolio\.com))"`
remove '&' from URL patterns part 2 follow-up on 968d3e8465d70bf589b87ff79182ee9cae3ce4fb 1 year ago			`r"(/[^/?#]+)?")`
remove test results in extractor modules and add generic example URLs 1 year ago			`example = "https://USER.myportfolio.com/TITLE"`
[myportfolio] add user and gallery extractors (#95) 6 years ago
			`def __init__(self, match):`
propagate 'match' to base extractor constructor 6 years ago			`Extractor.__init__(self, match)`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 6 years ago			`domain1, domain2, self.path = match.groups()`
			`self.domain = domain1 or domain2`
			`self.prefix = "myportfolio:" if domain1 else ""`
[myportfolio] add user and gallery extractors (#95) 6 years ago
			`def items(self):`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 6 years ago			`url = "https://" + self.domain + (self.path or "")`
[myportfolio] fix extraction 3 years ago			`response = self.request(url)`
			`if response.history and response.url.endswith(".adobe.com/missing"):`
			`raise exception.NotFoundError()`
			`page = response.text`
[myportfolio] add user and gallery extractors (#95) 6 years ago
replace 'text.extract()' with 'text.extr()' where possible 2 years ago			`projects = text.extr(`
			`page, '<section class="project-covers', '</section>')`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 6 years ago
			`if projects:`
			`data = {"_extractor": MyportfolioGalleryExtractor}`
			`base = self.prefix + "https://" + self.domain`
			`for path in text.extract_iter(projects, ' href="', '"'):`
			`yield Message.Queue, base + path, data`
			`else:`
			`data = self.metadata(page)`
			`imgs = self.images(page)`
			`data["count"] = len(imgs)`
			`yield Message.Directory, data`
			`for data["num"], url in enumerate(imgs, 1):`
			`yield Message.Url, url, text.nameext_from_url(url, data)`
[myportfolio] add user and gallery extractors (#95) 6 years ago
			`@staticmethod`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 6 years ago			`def metadata(page):`
			`"""Collect general image metadata"""`
[myportfolio] add user and gallery extractors (#95) 6 years ago			`# og:title contains data as "<user> - <title>", but both`
			`# <user> and <title> can contain a "-" as well, so we get the title`
			`# from somewhere else and cut that amount from the og:title content`

[myportfolio] fix extraction 3 years ago			`extr = text.extract_from(page)`
			`user = extr('property="og:title" content="', '"') or \`
			`extr('property=og:title content="', '"')`
			`descr = extr('property="og:description" content="', '"') or \`
			`extr('property=og:description content="', '"')`
			`title = extr('<h1 ', '</h1>')`
[myportfolio] add user and gallery extractors (#95) 6 years ago
[myportfolio] fix extraction of galleries without title 5 years ago			`if title:`
			`title = title.partition(">")[2]`
			`user = user[:-len(title)-3]`
[myportfolio] raise 'NotFoundError' for deleted posts 4 years ago			`elif user:`
[myportfolio] fix extraction of galleries without title 5 years ago			`user, _, title = user.partition(" - ")`
[myportfolio] raise 'NotFoundError' for deleted posts 4 years ago			`else:`
			`raise exception.NotFoundError()`
[myportfolio] add user and gallery extractors (#95) 6 years ago
			`return {`
			`"user": text.unescape(user),`
			`"title": text.unescape(title),`
[myportfolio] fix extraction 3 years ago			`"description": text.unescape(descr),`
[myportfolio] add user and gallery extractors (#95) 6 years ago			`}`

			`@staticmethod`
[myportfolio] combine gallery and user extractors An URL alone isn't good enough to distinguish between a gallery or a gallery-listing, so the new extractor decides what to do based on the page's content. 6 years ago			`def images(page):`
[myportfolio] add user and gallery extractors (#95) 6 years ago			`"""Extract and return a list of all image-urls"""`
[myportfolio] use fallback when no images are found (#2959) 2 years ago			`return (`
			`list(text.extract_iter(page, 'js-lightbox" data-src="', '"')) or`
			`list(text.extract_iter(page, 'data-src="', '"'))`
			`)`