Add support for instagram.com user profiles and pages (#134)

* [instagram] Add extractor for instagram.com user profiles and pages The extractor scrapes `instagram.com/<user>' timelines and `instagram.com/p/<shortcode>' pages by mimicking the behaviour of a web browser and extracting the sharedData JSON of each page. Please note that this means that for user timelines we also do an extra request to the `instagram.com/p/<shortcode>' page, but this allows us to have consistent (and complete) information about the media fetched. The MD5 logic used for X-Instagram-GIS was documented in <https://stackoverflow.com/questions/49786980/> * [instagram] Test for keywords, not url for GraphImage and GraphSidecar URLs returned by instagram do not seem to be stable, so avoid testing for them and instead test the keywords returned. * [instagram] Improve test of InstagramProfilepageExtractor Also check the count of media returned. * [instagram] Several cleanups and improvements - Change description and subcategories to generate a better description in docs/supportedsite.rst - Remove the unneeded InstagramExtractor.__init__() - Use text.parse_int() instead of directly using int() (the former is more robust) - Use self.request().json() instead of calling json.loads() on self.request().text - Add `pattern:' to check the URLs where we do not have stable URLs. It seems that only the subdomain is not stable. Thanks to @mikf!
6 years ago · 2655a2ea02
parent e80ee77d71
commit 2655a2ea02
2 changed files with 195 additions and 0 deletions
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -45,6 +45,7 @@ modules = [
    "imgbox",
    "imgth",
    "imgur",
    "instagram",
    "jaiminisbox",
    "joyreactor",
    "khinsider",
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -0,0 +1,194 @@
 # -*- coding: utf-8 -*-
 # Copyright 2018 Leonardo Taccari
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 """Extract images from https://www.instagram.com/"""
 import hashlib
 import json
 from .common import Extractor, Message
 from .. import text
 class InstagramExtractor(Extractor):
    """Shared scraping logic for the instagram.com extractors

    Subclasses supply instagrams(), an iterable of metadata dicts shaped
    like the return value of _extract_postpage().
    """
    category = "instagram"
    directory_fmt = ["{category}", "{username}"]
    filename_fmt = "{media_id}.{extension}"
    archive_fmt = "{media_id}"
    root = "https://www.instagram.com"

    def items(self):
        """Yield extractor messages for every media from instagrams()

        A Version message is emitted once, then for each media a Directory
        message followed by at most one Url message; media with a typename
        other than GraphImage, GraphSidecar or GraphVideo get no Url.
        """
        yield Message.Version, 1

        for media in self.instagrams():
            yield Message.Directory, media

            kind = media['typename']
            if kind in ('GraphImage', 'GraphSidecar'):
                # TODO: For GraphSidecar, extract all images in
                # TODO: edge_sidecar_to_children instead of just
                # TODO: extracting the main one!
                image_url = media['display_url']
                yield (Message.Url, image_url,
                       text.nameext_from_url(image_url, media))
            elif kind == 'GraphVideo':
                # Videos are delegated through a `ytdl:'-prefixed post URL.
                post_url = 'ytdl:{}/p/{}/'.format(
                    self.root, media['shortcode'])
                yield Message.Url, post_url, media

    def _extract_shared_data(self, page):
        """Return the decoded `window._sharedData' JSON embedded in 'page'"""
        found = text.extract(page, 'window._sharedData = ', ';</script>')
        return json.loads(found[0])

    def _extract_postpage(self, url):
        """Request the post page at 'url' and return its metadata dict"""
        shared_data = self._extract_shared_data(self.request(url).text)
        post = shared_data['entry_data']['PostPage'][0]
        media = post['graphql']['shortcode_media']

        def number(*keys):
            """Look up a nested value in 'media' and parse it as an int"""
            value = media
            for key in keys:
                value = value[key]
            return text.parse_int(value)

        return {
            'media_id': media['id'],
            'shortcode': media['shortcode'],
            'typename': media['__typename'],
            'display_url': media['display_url'],
            'height': number('dimensions', 'height'),
            'width': number('dimensions', 'width'),
            'comments': number('edge_media_to_comment', 'count'),
            'likes': number('edge_media_preview_like', 'count'),
            'owner_id': media['owner']['id'],
            'username': media['owner']['username'],
            'fullname': media['owner']['full_name'],
        }

    def _extract_profilepage(self, url):
        """Yield the metadata of every post on the profile page at 'url'

        The first batch of posts comes from the profile page itself; further
        batches are requested from the graphql endpoint for as long as the
        timeline reports `has_next_page'. Each post costs one additional
        request, because its metadata is taken from its own post page.
        """
        query_hash = '66eb9403e44cc12e5b5ecda48b667d41'
        shared_data = self._extract_shared_data(self.request(url).text)

        while True:
            # The profile page keeps its payload under `entry_data', while
            # the graphql responses used for later pages keep it in `data'.
            if 'entry_data' in shared_data:
                profile = shared_data['entry_data']['ProfilePage'][0]
                graphql = profile['graphql']
                # Only the profile page carries `rhx_gis' and the user id;
                # both are kept around for the follow-up requests.
                rhx_gis = shared_data['rhx_gis']
                user_id = graphql['user']['id']
            else:
                graphql = shared_data['data']

            timeline = graphql['user']['edge_owner_to_timeline_media']
            page_info = timeline['page_info']
            has_next_page = page_info['has_next_page']

            shortcodes = [
                edge['node']['shortcode'] for edge in timeline['edges']
            ]
            for shortcode in shortcodes:
                yield self._extract_postpage(
                    '{}/p/{}/'.format(self.root, shortcode))

            if not has_next_page:
                return

            variables = '{{"id":"{}","first":12,"after":"{}"}}'.format(
                user_id,
                page_info['end_cursor'],
            )
            # X-Instagram-GIS is the MD5 hex digest of "<rhx_gis>:<variables>"
            signature = hashlib.md5(
                '{}:{}'.format(rhx_gis, variables).encode()).hexdigest()
            headers = {
                "X-Requested-With": "XMLHttpRequest",
                "X-Instagram-GIS": signature,
            }
            query_url = '{}/graphql/query/?query_hash={}&variables={}'.format(
                self.root,
                query_hash,
                variables,
            )
            shared_data = self.request(query_url, headers=headers).json()
 class InstagramImageExtractor(InstagramExtractor):
    """Extractor for a single post page (instagram.com/p/<shortcode>)"""
    subcategory = "image"
    # The shortcode group stops at `/', `?', `&' and `#' (the same character
    # class the user-profile pattern uses) so that a query string or fragment
    # directly after the shortcode is not captured as part of it.
    pattern = [r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/?&#]+)/?"]
    test = [
        # GraphImage
        ("https://www.instagram.com/p/BqvsDleB3lV/", {
            "pattern": r"https://[^/.]+\.cdninstagram\.com/vp"
                       r"/5043db33a998e32fb5713411be1d466e"
                       r"/5C8DAF92/t51.2885-15/e35"
                       r"/44877605_725955034447492_3123079845831750529_n.jpg",
            "keyword": {
                "comments": int,
                "height": int,
                "likes": int,
                "media_id": "1922949326347663701",
                "shortcode": "BqvsDleB3lV",
                "typename": "GraphImage",
                "username": "instagram",
                "width": int,
            }
        }),
        # GraphSidecar
        ("https://www.instagram.com/p/BoHk1haB5tM/", {
            "pattern": r"https://[^/.]+\.cdninstagram\.com/vp"
                       r"/fd70fa8d5775ce1c297a95d3800f4b7c"
                       r"/5C935FCB/t51.2885-15/e35"
                       r"/40758827_2138611023072230_4073975203662780931_n.jpg",
            "keyword": {
                "comments": int,
                "height": int,
                "likes": int,
                "media_id": "1875629777499953996",
                "shortcode": "BoHk1haB5tM",
                "typename": "GraphSidecar",
                "username": "instagram",
                "width": int,
            }
        }),
        # GraphVideo
        ("https://www.instagram.com/p/Bqxp0VSBgJg/", {
            "url": "8f38c1cf460c9804842f7306c487410f33f82e7e",
            "keyword": {
                "comments": int,
                "height": int,
                "likes": int,
                "media_id": "1923502432034620000",
                "shortcode": "Bqxp0VSBgJg",
                "typename": "GraphVideo",
                "username": "instagram",
                "width": int,
            }
        }),
    ]

    def __init__(self, match):
        """Store the shortcode captured by 'pattern' from the matched URL"""
        InstagramExtractor.__init__(self)
        self.shortcode = match.group(1)

    def instagrams(self):
        """Return a one-element tuple with the metadata of the post"""
        url = '{}/p/{}/'.format(self.root, self.shortcode)
        return (self._extract_postpage(url),)
 class InstagramUserExtractor(InstagramExtractor):
    """Extractor for the posts on a user's profile page"""
    subcategory = "user"
    # The negative lookahead leaves `/p/...' post URLs unmatched here.
    pattern = [r"(?:https?://)?(?:www\.)?instagram\.com/(?!p/)([^/?&#]+)"]
    test = [
        ("https://www.instagram.com/instagram/", {
            "range": "1-12",
            "count": ">= 12",
        }),
    ]

    def __init__(self, match):
        """Store the username captured by 'pattern' from the matched URL"""
        InstagramExtractor.__init__(self)
        username = match.group(1)
        self.username = username

    def instagrams(self):
        """Return a generator over the metadata of the user's posts"""
        profile_url = '{}/{}/'.format(self.root, self.username)
        posts = self._extract_profilepage(profile_url)
        return posts