Add support for instagram.com user profiles and pages (#134)

* [instagram] Add extractor for instagram.com user profiles and pages

The extractor scrapes `instagram.com/<user>' timelines and
`instagram.com/p/<shortcode>' by mimicking the behaviour of a web
browser and extracting the sharedData JSON of the single pages.

Please note that this means that for user timelines we also make an
extra request to the `instagram.com/p/<shortcode>' page, but this
makes it possible to have consistent (and complete) information about
the media fetched.

The MD5 logic used for X-Instagram-GIS was documented in

 <https://stackoverflow.com/questions/49786980/>

* [instagram] Test for keywords, not url for GraphImage and GraphSidecar

URLs returned by instagram do not seem to be stable, so avoid testing
for them and instead test for the keywords returned.

* [instagram] Improve test of InstagramProfilepageExtractor

Also check the count of media returned.

* [instagram] Several cleanups and improvements

- Change description, subcategories to generate a better description in
  docs/supportedsite.rst
- Remove unneeded InstagramExtractor.__init__()
- Use text.parse_int() instead of directly using int() (the former is more
  robust)
- Use self.request().json() instead of calling json.loads() on
  self.request().text
- Add `pattern:' to check the URLs where we do not have stable URLs.
  It seems that only the subdomain is not stable.

Thanks to @mikf!
server
Leonardo Taccari 6 years ago committed by Mike Fährmann
parent e80ee77d71
commit 2655a2ea02

@ -45,6 +45,7 @@ modules = [
"imgbox", "imgbox",
"imgth", "imgth",
"imgur", "imgur",
"instagram",
"jaiminisbox", "jaiminisbox",
"joyreactor", "joyreactor",
"khinsider", "khinsider",

@ -0,0 +1,194 @@
# -*- coding: utf-8 -*-
# Copyright 2018 Leonardo Taccari
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://www.instagram.com/"""
import hashlib
import json
from .common import Extractor, Message
from .. import text
class InstagramExtractor(Extractor):
    """Base class for instagram extractors"""
    # Subclasses are expected to provide an `instagrams()` method that
    # returns an iterable of media dicts shaped like the return value of
    # `_extract_postpage()`.
    category = "instagram"
    directory_fmt = ["{category}", "{username}"]
    filename_fmt = "{media_id}.{extension}"
    archive_fmt = "{media_id}"
    root = "https://www.instagram.com"

    def items(self):
        """Yield extractor messages for all media from self.instagrams()

        The version message is emitted once; every media dict then
        produces a Directory message followed by at most one Url message.
        Media whose typename is none of GraphImage, GraphSidecar or
        GraphVideo produce no Url message.
        """
        yield Message.Version, 1

        for media in self.instagrams():
            yield Message.Directory, media

            kind = media['typename']
            if kind in ('GraphImage', 'GraphSidecar'):
                # TODO: Extract all images in edge_sidecar_to_children
                # TODO: instead of just extracting the main one!
                # Until then a sidecar is handled exactly like a single
                # image: only its main `display_url` is emitted.
                yield (
                    Message.Url,
                    media['display_url'],
                    text.nameext_from_url(media['display_url'], media),
                )
            elif kind == 'GraphVideo':
                # Videos are emitted as a `ytdl:`-prefixed post page URL
                # rather than as a direct media URL.
                post_url = 'ytdl:{}/p/{}/'.format(
                    self.root, media['shortcode'])
                yield Message.Url, post_url, media

    def _extract_shared_data(self, page):
        """Return the decoded `window._sharedData` JSON embedded in 'page'"""
        raw_json = text.extract(
            page, 'window._sharedData = ', ';</script>')[0]
        return json.loads(raw_json)

    def _extract_postpage(self, url):
        """Request the post page at 'url' and return its media metadata

        The result is a flat dict holding the media ID, shortcode,
        typename, display URL, dimensions, comment/like counts and
        owner information.
        """
        shared_data = self._extract_shared_data(self.request(url).text)
        post = shared_data['entry_data']['PostPage'][0]
        media = post['graphql']['shortcode_media']

        dimensions = media['dimensions']
        owner = media['owner']
        comment_count = media['edge_media_to_comment']['count']
        like_count = media['edge_media_preview_like']['count']

        return {
            'media_id': media['id'],
            'shortcode': media['shortcode'],
            'typename': media['__typename'],
            'display_url': media['display_url'],
            'height': text.parse_int(dimensions['height']),
            'width': text.parse_int(dimensions['width']),
            'comments': text.parse_int(comment_count),
            'likes': text.parse_int(like_count),
            'owner_id': owner['id'],
            'username': owner['username'],
            'fullname': owner['full_name'],
        }

    def _extract_profilepage(self, url):
        """Generate media metadata for every post of a profile timeline

        Starts with the profile page at 'url' and keeps requesting
        further timeline pages through the GraphQL endpoint until
        `has_next_page` is false. Each post found is fetched separately
        via `_extract_postpage()`.
        """
        shared_data = self._extract_shared_data(self.request(url).text)

        while True:
            # Profile data comes in two layouts: the initial HTML page
            # keeps it under `entry_data', GraphQL responses under `data'.
            if 'entry_data' in shared_data:
                profile = shared_data['entry_data']['ProfilePage'][0]
                graphql = profile['graphql']
                # `rhx_gis' and the user ID are only present in the
                # initial page; they are reused for all later requests.
                rhx_gis = shared_data['rhx_gis']
                user_id = graphql['user']['id']
            else:
                graphql = shared_data['data']

            timeline = graphql['user']['edge_owner_to_timeline_media']
            has_next_page = timeline['page_info']['has_next_page']

            # Collect all shortcodes up front, then fetch post by post.
            shortcodes = [
                edge['node']['shortcode'] for edge in timeline['edges']
            ]
            for shortcode in shortcodes:
                yield self._extract_postpage(
                    '{}/p/{}/'.format(self.root, shortcode))

            if not has_next_page:
                break

            cursor = timeline['page_info']['end_cursor']
            variables = '{{"id":"{}","first":12,"after":"{}"}}'.format(
                user_id, cursor)

            # X-Instagram-GIS is the MD5 hex digest of
            # "<rhx_gis>:<variables>".
            signature = hashlib.md5(
                '{}:{}'.format(rhx_gis, variables).encode()).hexdigest()
            headers = {
                "X-Requested-With": "XMLHttpRequest",
                "X-Instagram-GIS": signature,
            }
            query_url = '{}/graphql/query/?query_hash={}&variables={}'.format(
                self.root,
                '66eb9403e44cc12e5b5ecda48b667d41',
                variables,
            )
            shared_data = self.request(query_url, headers=headers).json()
class InstagramImageExtractor(InstagramExtractor):
    """Extractor for PostPage"""
    subcategory = "image"
    # The shortcode group stops at '/', '?', '&' and '#' so that a query
    # string or fragment appended to a post URL (e.g. `/p/<code>?foo=bar`)
    # is not captured as part of the shortcode. This is the same character
    # class the user-profile pattern uses for its username group.
    pattern = [r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/?&#]+)/?"]
    test = [
        # GraphImage
        ("https://www.instagram.com/p/BqvsDleB3lV/", {
            "pattern": r"https://[^/.]+\.cdninstagram\.com/vp"
                       r"/5043db33a998e32fb5713411be1d466e"
                       r"/5C8DAF92/t51.2885-15/e35"
                       r"/44877605_725955034447492_3123079845831750529_n.jpg",
            "keyword": {
                "comments": int,
                "height": int,
                "likes": int,
                "media_id": "1922949326347663701",
                "shortcode": "BqvsDleB3lV",
                "typename": "GraphImage",
                "username": "instagram",
                "width": int,
            }
        }),

        # GraphSidecar
        ("https://www.instagram.com/p/BoHk1haB5tM/", {
            "pattern": r"https://[^/.]+\.cdninstagram\.com/vp"
                       r"/fd70fa8d5775ce1c297a95d3800f4b7c"
                       r"/5C935FCB/t51.2885-15/e35"
                       r"/40758827_2138611023072230_4073975203662780931_n.jpg",
            "keyword": {
                "comments": int,
                "height": int,
                "likes": int,
                "media_id": "1875629777499953996",
                "shortcode": "BoHk1haB5tM",
                "typename": "GraphSidecar",
                "username": "instagram",
                "width": int,
            }
        }),

        # GraphVideo
        ("https://www.instagram.com/p/Bqxp0VSBgJg/", {
            "url": "8f38c1cf460c9804842f7306c487410f33f82e7e",
            "keyword": {
                "comments": int,
                "height": int,
                "likes": int,
                "media_id": "1923502432034620000",
                "shortcode": "Bqxp0VSBgJg",
                "typename": "GraphVideo",
                "username": "instagram",
                "width": int,
            }
        }),
    ]

    def __init__(self, match):
        """Store the post shortcode captured by the first pattern group"""
        InstagramExtractor.__init__(self)
        self.shortcode = match.group(1)

    def instagrams(self):
        """Return a 1-tuple holding the metadata of the requested post"""
        url = '{}/p/{}/'.format(self.root, self.shortcode)
        return (self._extract_postpage(url),)
class InstagramUserExtractor(InstagramExtractor):
    """Extractor for ProfilePage"""
    subcategory = "user"
    # The negative lookahead rejects paths starting with `p/`, i.e. the
    # single-post URLs of the form /p/<shortcode>.
    pattern = [r"(?:https?://)?(?:www\.)?instagram\.com/(?!p/)([^/?&#]+)"]
    test = [(
        "https://www.instagram.com/instagram/",
        {"range": "1-12", "count": ">= 12"},
    )]

    def __init__(self, match):
        """Store the username captured by the first pattern group"""
        super().__init__()
        self.username = match.group(1)

    def instagrams(self):
        """Return a generator over the media of the user's timeline"""
        profile_url = '/'.join((self.root, self.username, ''))
        return self._extract_profilepage(profile_url)
Loading…
Cancel
Save