From 62b65e59d094e2fef36a7d21757f52ff1da010b7 Mon Sep 17 00:00:00 2001
From: Vrihub <Vrihub@users.noreply.github.com>
Date: Thu, 28 May 2020 21:58:24 +0200
Subject: [PATCH] Add instagram metadata: post_pageurl, post_tags (#743)

* Add instagram metadata: post_pageurl, post_tags

Add the following metadata for instagram:
- post_pageurl: json string with url of the post page
- post_tags: json array with instagram tags extracted from the post description

* Oops: rename post_tags to tags for --write-tags

This way, --write-tags will pick up the post tags.

* Rename to post_url, improve regex

* Add post_url and tags to tests

* Remove duplicate tags and sort them

* Bugfix: don't create empty tag lists

* Metadata: add location

* Metadata: add tagged_users for each media

* Move self._find_tags() to base class

* Make flake happy
---
 gallery_dl/extractor/instagram.py | 57 ++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index ea39cab1..92ded0ff 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -14,6 +14,7 @@ from .. import text, exception
 from ..cache import cache
 import itertools
 import json
+import re
 
 
 class InstagramExtractor(Extractor):
@@ -26,6 +27,10 @@ class InstagramExtractor(Extractor):
     cookiedomain = ".instagram.com"
     cookienames = ("sessionid",)
 
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self._find_tags = re.compile(r'#\w+').findall  # lists "#word" runs
+
     def get_metadata(self):
         return {}
 
@@ -133,12 +138,28 @@ class InstagramExtractor(Extractor):
             'fullname': media['owner']['full_name'],
             'post_id': media['id'],
             'post_shortcode': media['shortcode'],
+            'post_url': url,
             'description': text.parse_unicode_escapes('\n'.join(
                 edge['node']['text']
                 for edge in media['edge_media_to_caption']['edges']
             )),
         }
 
+        if self._find_tags(common['description']):
+            common['tags'] = sorted(
+                set(self._find_tags(common['description'])))
+
+        if media['location']:
+            common['location_id'] = media['location']['id']
+            common['location_slug'] = media['location']['slug']
+            common['location_url'] = (
+                'https://www.instagram.com/explore/locations/' +
+                media['location']['id'] +
+                '/' +
+                media['location']['slug'] +
+                '/'
+            )
+
         medias = []
         if media['__typename'] == 'GraphSidecar':
             for num, edge in enumerate(
@@ -156,6 +177,7 @@ class InstagramExtractor(Extractor):
                     'sidecar_media_id': media['id'],
                     'sidecar_shortcode': media['shortcode'],
                 }
+                self._extract_tagged_users(children, media_data)
                 media_data.update(common)
                 medias.append(media_data)
 
@@ -169,6 +191,7 @@ class InstagramExtractor(Extractor):
                 'height': text.parse_int(media['dimensions']['height']),
                 'width': text.parse_int(media['dimensions']['width']),
             }
+            self._extract_tagged_users(media, media_data)
             media_data.update(common)
             medias.append(media_data)
 
@@ -305,6 +328,19 @@ class InstagramExtractor(Extractor):
                 variables, psdf['query_hash'], csrf,
             )
 
+    def _extract_tagged_users(self, src_media, dest_dict):
+        """Set dest_dict['tagged_users'] to all users tagged in src_media.
+
+        'dest_dict' is left unchanged when the media has no tagged users.
+        """
+        tagged_users = [
+            {'username': edge['node']['user']['username'],
+             'full_name': edge['node']['user']['full_name']}
+            for edge in src_media['edge_media_to_tagged_user']['edges']
+        ]
+        if tagged_users:
+            dest_dict['tagged_users'] = tagged_users
+
 
 class InstagramImageExtractor(InstagramExtractor):
     """Extractor for PostPage"""
@@ -321,10 +357,14 @@ class InstagramImageExtractor(InstagramExtractor):
                 "description": str,
                 "height": int,
                 "likes": int,
+                "location_id": "214424288",
+                "location_slug": "hong-kong",
                 "media_id": "1922949326347663701",
                 "shortcode": "BqvsDleB3lV",
                 "post_id": "1922949326347663701",
                 "post_shortcode": "BqvsDleB3lV",
+                "post_url": "https://www.instagram.com/p/BqvsDleB3lV/",
+                "tags": ["#WHPsquares"],
                 "typename": "GraphImage",
                 "username": "instagram",
                 "width": int,
@@ -339,6 +379,7 @@ class InstagramImageExtractor(InstagramExtractor):
                 "sidecar_shortcode": "BoHk1haB5tM",
                 "post_id": "1875629777499953996",
                 "post_shortcode": "BoHk1haB5tM",
+                "post_url": "https://www.instagram.com/p/BoHk1haB5tM/",
                 "num": int,
                 "likes": int,
                 "username": "instagram",
@@ -354,7 +395,9 @@ class InstagramImageExtractor(InstagramExtractor):
                 "height": int,
                 "likes": int,
                 "media_id": "1923502432034620000",
+                "post_url": "https://www.instagram.com/p/Bqxp0VSBgJg/",
                 "shortcode": "Bqxp0VSBgJg",
+                "tags": ["#ASMR"],
                 "typename": "GraphVideo",
                 "username": "instagram",
                 "width": int,
@@ -370,6 +413,7 @@ class InstagramImageExtractor(InstagramExtractor):
                 "height": int,
                 "likes": int,
                 "media_id": "1806097553666903266",
+                "post_url": "https://www.instagram.com/p/BkQjCfsBIzi/",
                 "shortcode": "BkQjCfsBIzi",
                 "typename": "GraphVideo",
                 "username": "instagram",
@@ -381,11 +425,22 @@ class InstagramImageExtractor(InstagramExtractor):
         ("https://www.instagram.com/p/BtOvDOfhvRr/", {
             "count": 2,
             "keyword": {
+                "post_url": "https://www.instagram.com/p/BtOvDOfhvRr/",
                 "sidecar_media_id": "1967717017113261163",
                 "sidecar_shortcode": "BtOvDOfhvRr",
                 "video_url": str,
             }
-        })
+        }),
+
+        # GraphImage with tagged user
+        ("https://www.instagram.com/p/B_2lf3qAd3y/", {
+            "keyword": {
+                "tagged_users": [{
+                    "full_name": "Call Me Kay",
+                    "username": "kaaymbl"
+                }]
+            }
+        }),
     )
 
     def __init__(self, match):