From 62b65e59d094e2fef36a7d21757f52ff1da010b7 Mon Sep 17 00:00:00 2001 From: Vrihub Date: Thu, 28 May 2020 21:58:24 +0200 Subject: [PATCH] Add instagram metadata: post_pageurl, post_tags (#743) * Add instagram metadata: post_pageurl, post_tags Add the following metadata for instagram: - post_pageurl: json string with url of the post page - post_tags: json array with instagram tags extracted from the post description * Oops: rename post_tags to tags for --write-tags This way, --write-tags will pick up the post tags. * Rename to post_url, improve regex * Add post_url and tags to tests * Remove duplicate tags and sort them * Bugfix: don't create empty tag lists * Metadata: add location * Metadata: add tagged_users for each media * Move self._find_tags() to base class * Make flake happy --- gallery_dl/extractor/instagram.py | 57 ++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index ea39cab1..92ded0ff 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -14,6 +14,7 @@ from .. import text, exception from ..cache import cache import itertools import json +import re class InstagramExtractor(Extractor): @@ -26,6 +27,10 @@ class InstagramExtractor(Extractor): cookiedomain = ".instagram.com" cookienames = ("sessionid",) + def __init__(self, match): + Extractor.__init__(self, match) + self._find_tags = re.compile(r'#\w+').findall + def get_metadata(self): return {} @@ -133,12 +138,28 @@ class InstagramExtractor(Extractor): 'fullname': media['owner']['full_name'], 'post_id': media['id'], 'post_shortcode': media['shortcode'], + 'post_url': url, 'description': text.parse_unicode_escapes('\n'.join( edge['node']['text'] for edge in media['edge_media_to_caption']['edges'] )), } + if self._find_tags(common['description']): + common['tags'] = sorted( + set(self._find_tags(common['description']))) + + if media['location']: + common['location_id'] = media['location']['id'] + common['location_slug'] = media['location']['slug'] + common['location_url'] = ( + 'https://www.instagram.com/explore/locations/' + + media['location']['id'] + + '/' + + media['location']['slug'] + + '/' + ) + medias = [] if media['__typename'] == 'GraphSidecar': for num, edge in enumerate( @@ -156,6 +177,7 @@ class InstagramExtractor(Extractor): 'sidecar_media_id': media['id'], 'sidecar_shortcode': media['shortcode'], } + self._extract_tagged_users(children, media_data) media_data.update(common) medias.append(media_data) @@ -169,6 +191,7 @@ class InstagramExtractor(Extractor): 'height': text.parse_int(media['dimensions']['height']), 'width': text.parse_int(media['dimensions']['width']), } + self._extract_tagged_users(media, media_data) media_data.update(common) medias.append(media_data) @@ -305,6 +328,19 @@ class InstagramExtractor(Extractor): variables, psdf['query_hash'], csrf, ) + def _extract_tagged_users(self, src_media, dest_dict): + if src_media['edge_media_to_tagged_user']['edges']: + tagged_users = [] + for num, edge in enumerate( + src_media['edge_media_to_tagged_user']['edges'], 1): + tagged = edge['node'] + tagged_data = { + 'username': tagged['user']['username'], + 'full_name': tagged['user']['full_name'], + } + tagged_users.append(tagged_data) + dest_dict['tagged_users'] = tagged_users + class InstagramImageExtractor(InstagramExtractor): """Extractor for PostPage""" @@ -321,10 +357,14 @@ class InstagramImageExtractor(InstagramExtractor): "description": str, "height": int, "likes": int, + "location_id": "214424288", + "location_slug": "hong-kong", "media_id": "1922949326347663701", "shortcode": "BqvsDleB3lV", "post_id": "1922949326347663701", "post_shortcode": "BqvsDleB3lV", + "post_url": "https://www.instagram.com/p/BqvsDleB3lV/", + "tags": ["#WHPsquares"], "typename": "GraphImage", "username": "instagram", "width": int, @@ -339,6 +379,7 @@ class InstagramImageExtractor(InstagramExtractor): "sidecar_shortcode": "BoHk1haB5tM", "post_id": "1875629777499953996", "post_shortcode": "BoHk1haB5tM", + "post_url": "https://www.instagram.com/p/BoHk1haB5tM/", "num": int, "likes": int, "username": "instagram", @@ -354,7 +395,9 @@ class InstagramImageExtractor(InstagramExtractor): "height": int, "likes": int, "media_id": "1923502432034620000", + "post_url": "https://www.instagram.com/p/Bqxp0VSBgJg/", "shortcode": "Bqxp0VSBgJg", + "tags": ["#ASMR"], "typename": "GraphVideo", "username": "instagram", "width": int, @@ -370,6 +413,7 @@ class InstagramImageExtractor(InstagramExtractor): "height": int, "likes": int, "media_id": "1806097553666903266", + "post_url": "https://www.instagram.com/p/BkQjCfsBIzi/", "shortcode": "BkQjCfsBIzi", "typename": "GraphVideo", "username": "instagram", @@ -381,11 +425,22 @@ class InstagramImageExtractor(InstagramExtractor): ("https://www.instagram.com/p/BtOvDOfhvRr/", { "count": 2, "keyword": { + "post_url": "https://www.instagram.com/p/BtOvDOfhvRr/", "sidecar_media_id": "1967717017113261163", "sidecar_shortcode": "BtOvDOfhvRr", "video_url": str, } - }) + }), + + # GraphImage with tagged user + ("https://www.instagram.com/p/B_2lf3qAd3y/", { + "keyword": { + "tagged_users": [{ + "full_name": "Call Me Kay", + "username": "kaaymbl" + }] + } + }), ) def __init__(self, match):