From ec3d5d58a817744167b6f41cdea7554543f2c653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 1 Apr 2021 14:35:56 +0200 Subject: [PATCH] [vk] improve extractor (#474) - fetch all photos - add 'metadata' option - fix extracting photos without '?' in URL --- docs/supportedsites.md | 2 +- gallery_dl/extractor/vk.py | 77 +++++++++++++++++++++++++++----------- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d727b140..d9f3c372 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -694,7 +694,7 @@ Consider all sites to be NSFW unless otherwise known. VK https://vk.com/ - Albums + Photos diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index 22c840e8..1ce1140f 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -8,20 +8,21 @@ """Extractors for https://vk.com/""" -from .common import GalleryExtractor +from .common import Extractor, Message from .. import text import re -class VkAlbumExtractor(GalleryExtractor): - """Extractor for vkontakte albums""" +class VkPhotosExtractor(Extractor): + """Extractor for photos from a vk user""" category = "vk" - subcategory = "album" - directory_fmt = ("{category}", "{album_id}") + subcategory = "photos" + directory_fmt = ("{category}", "{user[id]}") filename_fmt = "{id}.{extension}" archive_fmt = "{id}" - root = "https://vk.com/" - pattern = r"(?:https://)?(?:www\.|m\.)?vk\.com/(?:albums|id)(\d+)" + root = "https://vk.com" + request_interval = 1.0 + pattern = r"(?:https://)?(?:www\.|m\.)?vk\.com/(?:albums|photos|id)(\d+)" test = ( ("https://vk.com/id398982326", { "pattern": r"https://sun\d+-\d+\.userapi\.com/c\d+/v\d+" @@ -29,27 +30,59 @@ class VkAlbumExtractor(GalleryExtractor): "count": ">= 35", }), ("https://m.vk.com/albums398982326"), - ("https://www.vk.com/id398982326"), + ("https://www.vk.com/id398982326?profile=1"), ) def __init__(self, match): - self.album_id = match.group(1) - url = "{}/albums{}".format(self.root, self.album_id) - GalleryExtractor.__init__(self, match, url) + Extractor.__init__(self, match) + self.user_id = match.group(1) - def metadata(self, page): - return { - "album_id": self.album_id, + def items(self): + user_id = self.user_id + + if self.config("metadata"): + url = "{}/id{}".format(self.root, user_id) + extr = text.extract_from(self.request(url).text) + data = {"user": { + "id" : user_id, + "nick": text.unescape(extr( + "", " | VK<")), + "name": text.unescape(extr( + '<h1 class="page_name">', "<")).replace(" ", " "), + "info": text.unescape(text.remove_html(extr( + '<span class="current_text">', '</span'))) + }} + else: + data = {"user": {"id": user_id}} + + photos_url = "{}/photos{}".format(self.root, user_id) + headers = { + "X-Requested-With": "XMLHttpRequest", + "Origin" : self.root, + "Referer" : photos_url, + } + params = { + "al" : "1", + "al_ad" : "0", + "offset": 0, + "part" : "1", } - def images(self, page): - results = [] + yield Message.Directory, data sub = re.compile(r"/imp[fg]/").sub - needle = 'data-id="{}_'.format(self.album_id) + needle = 'data-id="{}_'.format(user_id) + + while True: + offset, html = self.request( + photos_url, method="POST", headers=headers, data=params + ).json()["payload"][1] - for photo in text.extract_iter(page, needle, '?'): - photo_id = photo.partition('"')[0] - url = sub("/", photo.rpartition("(")[2]) - results.append((url, {"id": photo_id})) + for cnt, photo in enumerate(text.extract_iter(html, needle, ')')): + data["id"] = photo[:photo.find('"')] + url = photo[photo.rindex("(")+1:] + url = sub("/", url.partition("?")[0]) + yield Message.Url, url, text.nameext_from_url(url, data) - return results + if cnt <= 40 or offset == params["offset"]: + return + params["offset"] = offset