[photos18] implement pagination

Photos18ListExtractor stops paginating as soon as a page contains no
posts, because (as of this commit) some pages have incorrect navigation
bars. For example, https://www.photos18.com/cat/2?page=75 still links
to a next page even though page 75 is far beyond the last page that
actually lists posts.
pull/5081/head
blankie 8 months ago
parent 2b3b08b2b8
commit 3f82852650
No known key found for this signature in database
GPG Key ID: CC15FC822C7F61F5

@ -22,9 +22,19 @@ class Photos18Extractor(Extractor):
archive_fmt = "{filename}"
root = "https://www.photos18.com"
class Photos18AlbumExtractor(Photos18Extractor):
"""Extractor for a single album URL"""
subcategory = "album"
pattern = BASE_PATTERN + r"/v/(\w+)"
example = "https://www.photos18.com/v/ID"
def __init__(self, match):
Photos18Extractor.__init__(self, match)
self.post_id = match.group(1)
def items(self):
for post_id in self.posts():
url = self.root + "/v/" + post_id
url = self.root + "/v/" + self.post_id
page = self.request(url).text
extr = text.extract_from(page)
@ -45,7 +55,7 @@ class Photos18Extractor(Extractor):
urls.append(url)
data = {
"post_id": post_id,
"post_id": self.post_id,
"title": title,
"category_id": category_id,
"category_name": category_name,
@ -58,20 +68,6 @@ class Photos18Extractor(Extractor):
yield Message.Url, url, text.nameext_from_url(url, data)
class Photos18AlbumExtractor(Photos18Extractor):
"""Extractor for a single album URL"""
subcategory = "album"
pattern = BASE_PATTERN + r"/v/(\w+)"
example = "https://www.photos18.com/v/ID"
def __init__(self, match):
Photos18Extractor.__init__(self, match)
self.post_id = match.group(1)
def posts(self):
return (self.post_id,)
class Photos18ListExtractor(Photos18Extractor):
"""Extractor for a list of posts"""
subcategory = "list"
@ -86,9 +82,9 @@ class Photos18ListExtractor(Photos18Extractor):
self.q = text.unquote(match.group(4) or "") or query.get("q")
self.category_id = match.group(1) or query.get("category_id")
self.sort = match.group(2) or match.group(3) or query.get("sort")
self.page = query.get("page")
self.page = int(query.get("page") or 1)
def posts(self):
def items(self):
query = {}
if self.q:
query["q"] = self.q
@ -99,5 +95,17 @@ class Photos18ListExtractor(Photos18Extractor):
if self.page:
query["page"] = self.page
while True:
has_post = False
page = self.request(self.root, params=query).text
return text.extract_iter(page, '<a class="visited" href="/v/', '"')
for i in text.extract_iter(
page, '<a class="visited" href="/v/', '"'):
has_post = True
url = self.root + "/v/" + i
data = {"_extractor": Photos18AlbumExtractor}
yield Message.Queue, url, data
if not has_post or '<li class="page-item next">' not in page:
break
query["page"] += 1

@ -31,4 +31,12 @@ __tests__ = (
"category_name": "歐美寫真",
},
{
"#url" : "https://www.photos18.com",
"#category": ("", "photos18", "list"),
"#class" : photos18.Photos18ListExtractor,
"#range" : "1-200",
"#count" : 200,
},
)

Loading…
Cancel
Save