Add support for instagram.com user profiles and pages (#134)

* [instagram] Add extractor for instagram.com user profiles and pages. The extractor scrapes `instagram.com/<user>' timelines and `instagram.com/p/<shortcode>' pages by mimicking the behaviour of a web browser and extracting the sharedData JSON of each single page. Please note that this means that for user timelines we also do an extra request to the `instagram.com/p/<shortcode>' page for every post, but this gives us consistent (and complete) information about the media fetched. The MD5 logic used for X-Instagram-GIS was documented in <https://stackoverflow.com/questions/49786980/> * [instagram] Test for keywords, not URLs, for GraphImage and GraphSidecar. The URLs returned by Instagram do not seem to be stable, so avoid testing for them and test for the keywords returned instead. * [instagram] Improve test of InstagramProfilepageExtractor. Also check the count of media returned. * [instagram] Several cleanups and improvements - Change description and subcategories to generate a better description in docs/supportedsite.rst - Remove the unneeded InstagramExtractor.__init__() - Use text.parse_int() instead of directly using int() (the former is more robust) - Use self.request().json() instead of calling json.loads() on self.request().text - Add `pattern:' to check the URLs where we do not have stable URLs. It seems that only the subdomain is not stable. Thanks to @mikf!
6 years ago · 2655a2ea02
parent e80ee77d71
commit 2655a2ea02
2 changed files with 195 additions and 0 deletions
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -45,6 +45,7 @@ modules = [
    "imgbox",
    "imgth",
    "imgur",
+    "instagram",
    "jaiminisbox",
    "joyreactor",
    "khinsider",
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -0,0 +1,194 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Leonardo Taccari
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.instagram.com/"""
+
+import hashlib
+import json
+from .common import Extractor, Message
+from .. import text
+
+
+class InstagramExtractor(Extractor):
+    """Base class for instagram extractors
+
+    Holds the shared message loop (items) and the helpers that scrape the
+    `window._sharedData` JSON out of post and profile pages.  items() iterates
+    over self.instagrams(), which is not defined in this class; the
+    subclasses in this module provide it.
+    """
+    category = "instagram"
+    # `username` and `media_id` are keys of the dict built by
+    # _extract_postpage(); `extension` is presumably filled in by
+    # text.nameext_from_url() -- TODO confirm against the text module.
+    directory_fmt = ["{category}", "{username}"]
+    filename_fmt = "{media_id}.{extension}"
+    archive_fmt = "{media_id}"
+    root = "https://www.instagram.com"
+
+    def items(self):
+        """Yield the extractor messages for every media from instagrams()
+
+        Emits one Version message, then for each metadata dict a Directory
+        message followed by a Url message chosen by its 'typename'.  A
+        typename other than the three handled below only gets the Directory
+        message.
+        """
+        yield Message.Version, 1
+
+        for data in self.instagrams():
+            yield Message.Directory, data
+
+            if data['typename'] == 'GraphImage':
+                yield Message.Url, data['display_url'], \
+                    text.nameext_from_url(data['display_url'], data)
+            elif data['typename'] == 'GraphSidecar':
+                # TODO: Extract all images in edge_sidecar_to_children
+                # TODO: instead of just extracting the main one!
+                yield Message.Url, data['display_url'], \
+                    text.nameext_from_url(data['display_url'], data)
+            elif data['typename'] == 'GraphVideo':
+                # Videos are not downloaded from display_url: the URL of the
+                # post page itself is emitted with a 'ytdl:' prefix
+                # (presumably handing the download over to youtube-dl --
+                # TODO confirm).
+                yield Message.Url, \
+                    'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data
+
+    def _extract_shared_data(self, page):
+        """Return the parsed JSON assigned to `window._sharedData` in 'page'
+
+        The JSON text is taken from between 'window._sharedData = ' and
+        ';</script>'.
+        NOTE(review): if the markers are missing, text.extract() presumably
+        returns None as first element and json.loads() would then raise --
+        verify against the text module.
+        """
+        return json.loads(text.extract(page,
+                          'window._sharedData = ', ';</script>')[0])
+
+    def _extract_postpage(self, url):
+        """Fetch the post page at 'url' and return its metadata as a dict
+
+        The values are read from
+        entry_data -> PostPage[0] -> graphql -> shortcode_media
+        of the page's shared data; the numeric fields go through
+        text.parse_int().
+        """
+        page = self.request(url).text
+        shared_data = self._extract_shared_data(page)
+        media = shared_data['entry_data']['PostPage'][0]['graphql']['shortcode_media']
+
+        return {
+            'media_id': media['id'],
+            'shortcode': media['shortcode'],
+            'typename': media['__typename'],
+            'display_url': media['display_url'],
+            'height': text.parse_int(media['dimensions']['height']),
+            'width': text.parse_int(media['dimensions']['width']),
+            'comments': text.parse_int(media['edge_media_to_comment']['count']),
+            'likes': text.parse_int(media['edge_media_preview_like']['count']),
+            'owner_id': media['owner']['id'],
+            'username': media['owner']['username'],
+            'fullname': media['owner']['full_name'],
+        }
+
+    def _extract_profilepage(self, url):
+        """Generate the metadata dict of every post in a profile timeline
+
+        Starts from the profile page at 'url' and keeps requesting further
+        timeline pages from the graphql endpoint for as long as the current
+        page reports 'has_next_page'.  Every shortcode found costs one extra
+        request, because the metadata comes from _extract_postpage().
+        """
+        page = self.request(url).text
+        shared_data = self._extract_shared_data(page)
+
+        while True:
+            # Deal with different structure of profile pages: the first page
+            # has interesting data in `entry_data', next pages in `data'.
+            if 'entry_data' in shared_data:
+                base_shared_data = shared_data['entry_data']['ProfilePage'][0]['graphql']
+
+                # `rhx_gis' and `user_id' are available only in the first page
+                rhx_gis = shared_data['rhx_gis']
+                user_id = base_shared_data['user']['id']
+            else:
+                base_shared_data = shared_data['data']
+
+            timeline = base_shared_data['user']['edge_owner_to_timeline_media']
+            has_next_page = timeline['page_info']['has_next_page']
+            shortcodes = [n['node']['shortcode'] for n in timeline['edges']]
+
+            # NOTE: `url' (the parameter) is rebound from here on; the
+            # profile URL itself is not needed after the first request.
+            for s in shortcodes:
+                url = '{}/p/{}/'.format(self.root, s)
+                yield self._extract_postpage(url)
+
+            if not has_next_page:
+                break
+
+            # Build the query for the next timeline page.  The very same
+            # `variables' string is used both in the URL and in the signature.
+            end_cursor = timeline['page_info']['end_cursor']
+            variables = '{{"id":"{}","first":12,"after":"{}"}}'.format(
+                user_id,
+                end_cursor,
+            )
+            # X-Instagram-GIS is the hex MD5 digest of "<rhx_gis>:<variables>"
+            xigis = '{}:{}'.format(rhx_gis, variables)
+            headers = {
+                "X-Requested-With": "XMLHttpRequest",
+                "X-Instagram-GIS": hashlib.md5(xigis.encode()).hexdigest(),
+            }
+            # NOTE(review): the query_hash is a hard-coded constant;
+            # presumably it identifies the timeline query on Instagram's
+            # side -- TODO confirm it is still valid.
+            url = '{}/graphql/query/?query_hash={}&variables={}'.format(
+                self.root,
+                '66eb9403e44cc12e5b5ecda48b667d41',
+                variables,
+            )
+            shared_data = self.request(url, headers=headers).json()
+
+
+class InstagramImageExtractor(InstagramExtractor):
+    """Extractor for PostPage
+
+    Handles a single `instagram.com/p/<shortcode>' URL and produces exactly
+    one metadata dict for it.
+    """
+    subcategory = "image"
+    # Group 1 captures the shortcode that follows `/p/'.
+    pattern = [r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/]+)/?"]
+    # One test entry per typename handled by InstagramExtractor.items().
+    test = [
+        # GraphImage
+        ("https://www.instagram.com/p/BqvsDleB3lV/", {
+            "pattern": r"https://[^/.]+\.cdninstagram\.com/vp"
+                       r"/5043db33a998e32fb5713411be1d466e"
+                       r"/5C8DAF92/t51.2885-15/e35"
+                       r"/44877605_725955034447492_3123079845831750529_n.jpg",
+            "keyword": {
+                "comments": int,
+                "height": int,
+                "likes": int,
+                "media_id": "1922949326347663701",
+                "shortcode": "BqvsDleB3lV",
+                "typename": "GraphImage",
+                "username": "instagram",
+                "width": int,
+            }
+        }),
+
+        # GraphSidecar
+        # NOTE(review): only the first fragment below is a raw string,
+        # unlike the GraphImage pattern above; the other fragments contain
+        # no backslashes, so the resulting regex is the same either way.
+        ("https://www.instagram.com/p/BoHk1haB5tM/", {
+            "pattern": r"https://[^/.]+\.cdninstagram\.com/vp"
+                       "/fd70fa8d5775ce1c297a95d3800f4b7c"
+                       "/5C935FCB/t51.2885-15/e35"
+                       "/40758827_2138611023072230_4073975203662780931_n.jpg",
+            "keyword": {
+                "comments": int,
+                "height": int,
+                "likes": int,
+                "media_id": "1875629777499953996",
+                "shortcode": "BoHk1haB5tM",
+                "typename": "GraphSidecar",
+                "username": "instagram",
+                "width": int,
+            }
+        }),
+
+        # GraphVideo
+        ("https://www.instagram.com/p/Bqxp0VSBgJg/", {
+            "url": "8f38c1cf460c9804842f7306c487410f33f82e7e",
+            "keyword": {
+                "comments": int,
+                "height": int,
+                "likes": int,
+                "media_id": "1923502432034620000",
+                "shortcode": "Bqxp0VSBgJg",
+                "typename": "GraphVideo",
+                "username": "instagram",
+                "width": int,
+            }
+        }),
+    ]
+
+    def __init__(self, match):
+        """Store the shortcode captured by group 1 of the URL pattern"""
+        InstagramExtractor.__init__(self)
+        self.shortcode = match.group(1)
+
+    def instagrams(self):
+        """Return a 1-tuple with the metadata dict of the requested post"""
+        url = '{}/p/{}/'.format(self.root, self.shortcode)
+        return (self._extract_postpage(url),)
+
+
+class InstagramUserExtractor(InstagramExtractor):
+    """Extractor for ProfilePage
+
+    Handles an `instagram.com/<user>' URL and walks through the posts of
+    that user's timeline.
+    """
+    subcategory = "user"
+    # The negative lookahead `(?!p/)' keeps post URLs (`/p/<shortcode>') from
+    # matching here; group 1 captures the username.
+    pattern = [r"(?:https?://)?(?:www\.)?instagram\.com/(?!p/)([^/?&#]+)"]
+    # NOTE(review): presumably limits the test run to the first 12 results
+    # and expects at least 12 of them -- confirm against the test runner.
+    test = [
+        ("https://www.instagram.com/instagram/", {
+            "range": "1-12",
+            "count": ">= 12",
+        }),
+    ]
+
+    def __init__(self, match):
+        """Store the username captured by group 1 of the URL pattern"""
+        InstagramExtractor.__init__(self)
+        self.username = match.group(1)
+
+    def instagrams(self):
+        """Return a generator over the metadata dicts of the user's posts"""
+        url = '{}/{}/'.format(self.root, self.username)
+        return self._extract_profilepage(url)