[shopify] use alternate regex for products on collection pages

when the first one doesn't yield any results
pull/960/head
Mike Fährmann 4 years ago
parent 7619152988
commit d06ad148c7
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -74,21 +74,33 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
def products(self):
# Generator: requests self.item_url page by page, regex-scans the returned
# text for product paths, and yields each one as `self.root + path`.
#
# NOTE(review): this span is a diff hunk whose +/- markers and indentation
# were stripped. The lines from the first `search_re = re.compile(...)`
# through the first `params["page"] += 1` look like the removed (old)
# implementation; the lines from `fetch = True` onward look like the added
# (new) one. Confirm against the repository before relying on either.
#
# Query parameters come from self.params; "page" is coerced to an int
# (presumably defaulting to 1 when absent or invalid -- verify text.parse_int).
params = text.parse_query(self.params)
params["page"] = text.parse_int(params.get("page"), 1)
# --- presumably the old implementation: a single pattern ---
search_re = re.compile(r"/collections/[\w-]+/products/[\w-]+")
while True:
page = self.request(self.item_url, params=params).text
urls = search_re.findall(page)
last = None
# No match on this page: stop generating.
if not urls:
return
for path in urls:
# Skip a path identical to the one handled immediately before it
# (only consecutive repeats are filtered).
if last == path:
continue
last = path
yield self.root + path
# Move on to the next page number.
params["page"] += 1
# --- presumably the new implementation: primary pattern plus fallback ---
# fetch: whether a page has to be requested before scanning; it is set to
# False so the already-downloaded text can be re-scanned with the next pattern.
fetch = True
# last: most recently yielded path; used for the consecutive-duplicate check
# and, via its truthiness, as an "anything yielded so far" flag.
last = None
# Patterns are tried in order. The second one has a capture group, so
# findall() returns only the captured "/products/..." part of each match.
for pattern in (
r"/collections/[\w-]+/products/[\w-]+",
r"href=[\"'](/products/[\w-]+)",
):
search_re = re.compile(pattern)
while True:
if fetch:
page = self.request(self.item_url, params=params).text
urls = search_re.findall(page)
# Fewer than 3 matches is handled as "no usable results" (the reason
# for the threshold of 3 is not stated here).
if len(urls) < 3:
# Something was yielded earlier: stop generating.
if last:
return
# Nothing yielded yet: keep the current page text and leave this
# loop so the next pattern can be tried on it.
fetch = False
break
# Enough matches: a page request is needed again on the next pass.
fetch = True
for path in urls:
# Skip a path identical to the one yielded immediately before it.
if last == path:
continue
last = path
yield self.root + path
# Move on to the next page number.
params["page"] += 1
class ShopifyProductExtractor(ShopifyExtractor):
@ -121,7 +133,6 @@ EXTRACTORS = {
("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
("https://www.fashionnova.com/collections/mini-dresses#1"),
),
},
}

Loading…
Cancel
Save