From c19e762fdf888a0e65c8124029da960b80d6586c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 23 Oct 2021 00:46:20 +0200 Subject: [PATCH] [vk] add 'album' extractor (#474, fixes #1952) todo: better metadata for albums --- docs/supportedsites.md | 2 +- gallery_dl/extractor/vk.py | 132 +++++++++++++++++++++++++------------ 2 files changed, 92 insertions(+), 42 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 56802401..94fc492a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -778,7 +778,7 @@ Consider all sites to be NSFW unless otherwise known. VK https://vk.com/ - Photos + Albums, Photos diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index 9dd2d472..9724c4b7 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -12,18 +12,67 @@ from .common import Extractor, Message from .. import text import re +BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com" -class VkPhotosExtractor(Extractor): - """Extractor for photos from a vk user""" + +class VkExtractor(Extractor): + """Base class for vk extractors""" category = "vk" - subcategory = "photos" directory_fmt = ("{category}", "{user[name]|user[id]}") filename_fmt = "{id}.{extension}" archive_fmt = "{id}" root = "https://vk.com" request_interval = 1.0 - pattern = (r"(?:https://)?(?:www\.|m\.)?vk\.com/(?:" - r"(?:albums|photos|id)(-?\d+)|([^/?#]+))") + + def items(self): + data = self.metadata() + yield Message.Directory, data + for photo in self.photos(): + photo.update(data) + yield Message.Url, photo["url"], photo + + def _pagination(self, photos_url, user_id): + sub = re.compile(r"/imp[fg]/").sub + needle = 'data-id="{}_'.format(user_id) + cnt = 0 + + headers = { + "X-Requested-With": "XMLHttpRequest", + "Origin" : self.root, + "Referer" : photos_url, + } + params = { + "al" : "1", + "al_ad" : "0", + "offset": 0, + "part" : "1", + } + + while True: + payload = self.request( + photos_url, method="POST", headers=headers, data=params + ).json()["payload"][1] + + offset = payload[0] + html = payload[1] + + for cnt, photo in enumerate(text.extract_iter(html, needle, ')')): + pid = photo[:photo.find('"')] + url = photo[photo.rindex("(")+1:] + url = sub("/", url.partition("?")[0]) + yield text.nameext_from_url(url, {"url": url, "id": pid}) + + if cnt <= 20 or offset == params["offset"]: + return + params["offset"] = offset + + +class VkPhotosExtractor(VkExtractor): + """Extractor for photos from a vk user""" + subcategory = "photos" + pattern = (BASE_PATTERN + r"/(?:" + r"(?:albums|photos|id)(-?\d+)" + r"|(?!album-?\d+_)([^/?#]+))") test = ( ("https://vk.com/id398982326", { "pattern": r"https://sun\d+-\d+\.userapi\.com/c\d+/v\d+" @@ -58,10 +107,14 @@ class VkPhotosExtractor(Extractor): ) def __init__(self, match): - Extractor.__init__(self, match) + VkExtractor.__init__(self, match) self.user_id, self.user_name = match.groups() - def items(self): + def photos(self): + url = "{}/photos{}".format(self.root, self.user_id) + return self._pagination(url, self.user_id) + + def metadata(self): if self.user_id: user_id = self.user_id prefix = "public" if user_id[0] == "-" else "id" @@ -70,40 +123,8 @@ class VkPhotosExtractor(Extractor): else: url = "{}/{}".format(self.root, self.user_name) data = self._extract_profile(url) - user_id = data["user"]["id"] - - photos_url = "{}/photos{}".format(self.root, user_id) - headers = { - "X-Requested-With": "XMLHttpRequest", - "Origin" : self.root, - "Referer" : photos_url, - } - params = { - "al" : "1", - "al_ad" : "0", - "offset": 0, - "part" : "1", - } - - yield Message.Directory, data - sub = re.compile(r"/imp[fg]/").sub - needle = 'data-id="{}_'.format(user_id) - cnt = 0 - - while True: - offset, html = self.request( - photos_url, method="POST", headers=headers, data=params - ).json()["payload"][1] - - for cnt, photo in enumerate(text.extract_iter(html, needle, ')')): - data["id"] = photo[:photo.find('"')] - url = photo[photo.rindex("(")+1:] - url = sub("/", url.partition("?")[0]) - yield Message.Url, url, text.nameext_from_url(url, data) - - if cnt <= 40 or offset == params["offset"]: - return - params["offset"] = offset + self.user_id = data["user"]["id"] + return data def _extract_profile(self, url): extr = text.extract_from(self.request(url).text) @@ -116,3 +137,32 @@ class VkPhotosExtractor(Extractor): '', '