gallery-dl/gallery_dl/extractor/shopify.py

# -*- coding: utf-8 -*-

# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Shopify instances"""

from .common import BaseExtractor, Message
from .. import text
import re


class ShopifyExtractor(BaseExtractor):
    """Base class for Shopify extractors"""
    # Shared driver for the Shopify extractors in this module: subclasses
    # provide products() (and optionally metadata()); items() turns every
    # product URL into one message per product image.
    basecategory = "shopify"
    filename_fmt = "{product[title]}_{num:>02}_{id}.{extension}"
    archive_fmt = "{id}"

    def __init__(self, match):
        """Initialize the extractor and remember the matched item URL.

        'match' is the regular-expression match object for the input URL.
        """
        BaseExtractor.__init__(self, match)
        # match.lastindex is the index of the last capturing group that
        # matched; the subclass patterns put the item path in that group.
        # NOTE(review): self.root is presumably set by BaseExtractor - confirm
        self.item_url = self.root + match.group(match.lastindex)

    def items(self):
        """Yield a directory message, then one URL message per image.

        Every URL returned by products() is requested as '<url>.json'.
        Responses with a status code of 400 or higher are logged with a
        warning and skipped; all other responses are expected to contain a
        JSON object with a "product" entry that holds an "images" list.
        """
        data = self.metadata()
        yield Message.Directory, data

        headers = {"X-Requested-With": "XMLHttpRequest"}
        for url in self.products():
            # fatal=False presumably keeps request() from raising on HTTP
            # error codes, so they can be handled right below
            response = self.request(
                url + ".json", headers=headers, fatal=False)
            if response.status_code >= 400:
                self.log.warning('Skipping %s ("%s: %s")',
                                 url, response.status_code, response.reason)
                continue
            product = response.json()["product"]
            # Drop the single "image" entry from the product metadata; only
            # the "images" list is processed. pop() with a default does not
            # raise KeyError when a payload has no "image" key at all.
            product.pop("image", None)

            # "images" is removed from 'product' as well, so the product
            # dict attached to each image does not contain the image list
            for num, image in enumerate(product.pop("images"), 1):
                # presumably adds filename/extension fields derived from
                # the image URL to 'image' - see text.nameext_from_url
                text.nameext_from_url(image["src"], image)
                image.update(data)
                # all images of one product share the same 'product' dict
                image["product"] = product
                image["num"] = num
                yield Message.Url, image["src"], image

    def metadata(self):
        """Return general metadata"""
        return {}

    def products(self):
        """Return an iterable with all relevant product URLs"""


# Pass the known Shopify sites to ShopifyExtractor.update(). Its return
# value is used as the leading part of the URL patterns of the collection
# and product extractors defined further down in this module.
BASE_PATTERN = ShopifyExtractor.update({
    "fashionnova": {
        "root": "https://www.fashionnova.com",
        # the "www." prefix is optional in matched URLs
        "pattern": r"(?:www\.)?fashionnova\.com",
    },
    "omgmiamiswimwear": {
        # no explicit "pattern" entry here - presumably update() derives
        # one from "root"; confirm in BaseExtractor
        "root": "https://www.omgmiamiswimwear.com"
    },
})


class ShopifyCollectionExtractor(ShopifyExtractor):
    """Base class for collection extractors for Shopify based sites"""
    # Handles '/collections/<name>' URLs: collects the product links from
    # the paginated collection pages.
    subcategory = "collection"
    directory_fmt = ("{category}", "{collection[title]}")
    pattern = BASE_PATTERN + r"(/collections/[\w-]+)/?(?:$|[?#])"
    test = (
        ("https://www.fashionnova.com/collections/mini-dresses", {
            "range": "1-20",
            "count": 20,
            "archive": False,
        }),
        ("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
        ("https://www.fashionnova.com/collections/mini-dresses#1"),
        ("https://www.omgmiamiswimwear.com/collections/fajas"),
    )

    def metadata(self):
        """Request '<item_url>.json' and return the decoded JSON body"""
        response = self.request(self.item_url + ".json")
        return response.json()

    def products(self):
        """Yield product URLs found on the collection's numbered pages

        Two link regexes are tried in order. A page with fewer than three
        matches counts as "nothing found": if a URL has already been
        yielded, iteration stops; otherwise the next regex is applied to
        the page that was already downloaded.
        """
        query = {"page": 1}
        need_fetch = True
        previous = None

        finders = [
            re.compile(expression).findall
            for expression in (
                r"/collections/[\w-]+/products/[\w-]+",
                r"href=[\"'](/products/[\w-]+)",
            )
        ]

        for find_paths in finders:
            while True:
                if need_fetch:
                    html = self.request(self.item_url, params=query).text
                paths = find_paths(html)

                if len(paths) < 3:
                    if previous:
                        # results were produced earlier; this is the end
                        return
                    # reuse the current page for the next regex
                    need_fetch = False
                    break
                need_fetch = True

                for path in paths:
                    # skip a link that directly repeats the previous one
                    if path != previous:
                        previous = path
                        yield self.root + path
                query["page"] += 1


class ShopifyProductExtractor(ShopifyExtractor):
    """Base class for product extractors for Shopify based sites"""
    # Handles single-product URLs, optionally prefixed by a collection
    # path ('/collections/<name>/products/<name>').
    subcategory = "product"
    directory_fmt = ("{category}", "Products")
    pattern = BASE_PATTERN + r"((?:/collections/[\w-]+)?/products/[\w-]+)"
    test = (
        ("https://www.fashionnova.com/products/essential-slide-red", {
            # the dot before "com" is escaped so it only matches a literal
            # '.', like in the second test entry below
            "pattern": r"https?://cdn\d*\.shopify\.com/",
            "count": 3,
        }),
        ("https://www.omgmiamiswimwear.com/products/la-medusa-maxi-dress", {
            "pattern": r"https://cdn\.shopify\.com/s/files/1/1819/6171/",
            "count": 5,
        }),
        ("https://www.fashionnova.com/collections/flats/products/name"),
    )

    def products(self):
        """Return a 1-tuple holding the URL of the matched product"""
        return (self.item_url,)
[shopify] add generic collection and product extractors (#175) with fashionnova.com as a default domain 6 years ago			`# -- coding: utf-8 --`

[shopify] use BaseExtractor 4 years ago			`# Copyright 2019-2021 Mike Fährmann`
[shopify] add generic collection and product extractors (#175) with fashionnova.com as a default domain 6 years ago			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

			`"""Extractors for Shopify instances"""`

[shopify] use BaseExtractor 4 years ago			`from .common import BaseExtractor, Message`
generalize extractor creation code 6 years ago			`from .. import text`
[shopify] add generic collection and product extractors (#175) with fashionnova.com as a default domain 6 years ago			`import re`


[shopify] use BaseExtractor 4 years ago			`class ShopifyExtractor(BaseExtractor):`
[shopify] cosmetic changes in shopify.py (#181) Glanced over the commits, randomly spotted some minor things. 6 years ago			`"""Base class for Shopify extractors"""`
[shopify] add generic collection and product extractors (#175) with fashionnova.com as a default domain 6 years ago			`basecategory = "shopify"`
			`filename_fmt = "{product[title]}_{num:>02}_{id}.{extension}"`
			`archive_fmt = "{id}"`

			`def __init__(self, match):`
[shopify] use BaseExtractor 4 years ago			`BaseExtractor.__init__(self, match)`
			`self.item_url = self.root + match.group(match.lastindex)`
[shopify] add custom retry logic for 430 status codes (#175) 6 years ago
[shopify] add generic collection and product extractors (#175) with fashionnova.com as a default domain 6 years ago			`def items(self):`
			`data = self.metadata()`
			`yield Message.Directory, data`

			`headers = {"X-Requested-With": "XMLHttpRequest"}`
			`for url in self.products():`
replace extractor.request() 'expect' argument with - 'fatal': allow 4xx status codes - 'notfound': raise NotFoundError on 404 5 years ago			`response = self.request(`
			`url + ".json", headers=headers, fatal=False)`
[shopify] skip deleted products (#175) Product pages which return a 4xx status code will now be skipped instead of raising an exception. 5 years ago			`if response.status_code >= 400:`
replace extractor.request() 'expect' argument with - 'fatal': allow 4xx status codes - 'notfound': raise NotFoundError on 404 5 years ago			`self.log.warning('Skipping %s ("%s: %s")',`
[shopify] skip deleted products (#175) Product pages which return a 4xx status code will now be skipped instead of raising an exception. 5 years ago			`url, response.status_code, response.reason)`
			`continue`
			`product = response.json()["product"]`
[shopify] add generic collection and product extractors (#175) with fashionnova.com as a default domain 6 years ago			`del product["image"]`

			`for num, image in enumerate(product.pop("images"), 1):`
			`text.nameext_from_url(image["src"], image)`
			`image.update(data)`
			`image["product"] = product`
			`image["num"] = num`
			`yield Message.Url, image["src"], image`

			`def metadata(self):`
			`"""Return general metadata"""`
			`return {}`

			`def products(self):`
			`"""Return an iterable with all relevant product URLs"""`


[shopify] use BaseExtractor 4 years ago			`BASE_PATTERN = ShopifyExtractor.update({`
			`"fashionnova": {`
			`"root": "https://www.fashionnova.com",`
			`"pattern": r"(?:www\.)?fashionnova\.com",`
			`},`
[shopify] support omgmiamiswimwear.com (closes #1280) 3 years ago			`"omgmiamiswimwear": {`
			`"root": "https://www.omgmiamiswimwear.com"`
			`},`
[shopify] use BaseExtractor 4 years ago			`})`


[shopify] add generic collection and product extractors (#175) with fashionnova.com as a default domain 6 years ago			`class ShopifyCollectionExtractor(ShopifyExtractor):`
			`"""Base class for collection extractors for Shopify based sites"""`
			`subcategory = "collection"`
			`directory_fmt = ("{category}", "{collection[title]}")`
[shopify] use BaseExtractor 4 years ago			`pattern = BASE_PATTERN + r"(/collections/[\w-]+)/?(?:$\|[?#])"`
			`test = (`
			`("https://www.fashionnova.com/collections/mini-dresses", {`
			`"range": "1-20",`
			`"count": 20,`
			`"archive": False,`
			`}),`
			`("https://www.fashionnova.com/collections/mini-dresses/?page=1"),`
			`("https://www.fashionnova.com/collections/mini-dresses#1"),`
[shopify] support omgmiamiswimwear.com (closes #1280) 3 years ago			`("https://www.omgmiamiswimwear.com/collections/fajas"),`
[shopify] use BaseExtractor 4 years ago			`)`
[shopify] add generic collection and product extractors (#175) with fashionnova.com as a default domain 6 years ago
			`def metadata(self):`
			`return self.request(self.item_url + ".json").json()`

			`def products(self):`
[shopify] use BaseExtractor 4 years ago			`params = {"page": 1}`
[shopify] use alternate regex for products on collection pages when the first on doesn't yield any results 4 years ago			`fetch = True`
			`last = None`

			`for pattern in (`
			`r"/collections/[\w-]+/products/[\w-]+",`
			`r"href=[\"'](/products/[\w-]+)",`
			`):`
			`search_re = re.compile(pattern)`

			`while True:`
			`if fetch:`
			`page = self.request(self.item_url, params=params).text`
			`urls = search_re.findall(page)`

			`if len(urls) < 3:`
			`if last:`
			`return`
			`fetch = False`
			`break`
			`fetch = True`

			`for path in urls:`
			`if last == path:`
			`continue`
			`last = path`
			`yield self.root + path`
			`params["page"] += 1`
[shopify] add generic collection and product extractors (#175) with fashionnova.com as a default domain 6 years ago

			`class ShopifyProductExtractor(ShopifyExtractor):`
			`"""Base class for product extractors for Shopify based sites"""`
			`subcategory = "product"`
			`directory_fmt = ("{category}", "Products")`
[shopify] use BaseExtractor 4 years ago			`pattern = BASE_PATTERN + r"((?:/collections/[\w-]+)?/products/[\w-]+)"`
			`test = (`
			`("https://www.fashionnova.com/products/essential-slide-red", {`
			`"pattern": r"https?://cdn\d*\.shopify.com/",`
			`"count": 3,`
			`}),`
[shopify] support omgmiamiswimwear.com (closes #1280) 3 years ago			`("https://www.omgmiamiswimwear.com/products/la-medusa-maxi-dress", {`
			`"pattern": r"https://cdn\.shopify\.com/s/files/1/1819/6171/",`
			`"count": 5,`
			`}),`
[shopify] use BaseExtractor 4 years ago			`("https://www.fashionnova.com/collections/flats/products/name"),`
			`)`
[shopify] add generic collection and product extractors (#175) with fashionnova.com as a default domain 6 years ago
			`def products(self):`
			`return (self.item_url,)`