From d06ad148c70cd211409a7990fc6cff0d16b7ea20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 15 Aug 2020 18:24:14 +0200 Subject: [PATCH] [shopify] use alternate regex for products on collection pages when the first one doesn't yield any results --- gallery_dl/extractor/shopify.py | 43 +++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py index 28ee46c2..9d1df18c 100644 --- a/gallery_dl/extractor/shopify.py +++ b/gallery_dl/extractor/shopify.py @@ -74,21 +74,33 @@ class ShopifyCollectionExtractor(ShopifyExtractor): def products(self): params = text.parse_query(self.params) params["page"] = text.parse_int(params.get("page"), 1) - search_re = re.compile(r"/collections/[\w-]+/products/[\w-]+") - - while True: - page = self.request(self.item_url, params=params).text - urls = search_re.findall(page) - last = None - - if not urls: - return - for path in urls: - if last == path: - continue - last = path - yield self.root + path - params["page"] += 1 + fetch = True + last = None + + for pattern in ( + r"/collections/[\w-]+/products/[\w-]+", + r"href=[\"'](/products/[\w-]+)", + ): + search_re = re.compile(pattern) + + while True: + if fetch: + page = self.request(self.item_url, params=params).text + urls = search_re.findall(page) + + if len(urls) < 3: + if last: + return + fetch = False + break + fetch = True + + for path in urls: + if last == path: + continue + last = path + yield self.root + path + params["page"] += 1 class ShopifyProductExtractor(ShopifyExtractor): @@ -121,7 +133,6 @@ EXTRACTORS = { ("https://www.fashionnova.com/collections/mini-dresses/?page=1"), ("https://www.fashionnova.com/collections/mini-dresses#1"), ), - }, }