Add support for instagram.com user profiles and pages (#134)
* [instagram] Add extractor for instagram.com user profiles and pages The extractor scrapes `instagram.com/<user>' timelines and `instagram.com/p/<shortcode>' by mimicking the behaviour of a web browser and extracting the sharedData JSON of the single pages. Please note that this means that for user timelines we also do an extra request to the `instagram.com/p/<shortcode>' page, but this permits us to have consistent (and complete) information about the fetched media. The MD5 logic used for X-Instagram-GIS was documented in <https://stackoverflow.com/questions/49786980/>. * [instagram] Test for keywords, not url for GraphImage and GraphSidecar URLs returned by instagram do not seem to be stable, so avoid testing for them and instead test for the keywords returned. * [instagram] Improve test of InstagramProfilepageExtractor Also check the count of media returned. * [instagram] Several cleanup and improvements - Change description, subcategories to generate a better description in docs/supportedsite.rst - Remove the unneeded InstagramExtractor.__init__() - Use text.parse_int() instead of directly using int() (the former is more robust) - Use self.request().json() instead of using json.loads() on self.request().text - Add `pattern:' to check the URLs where we do not have stable URLs. It seems that only the subdomain is not stable. Thanks to @mikf!
server
parent
e80ee77d71
commit
2655a2ea02
@ -0,0 +1,194 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2018 Leonardo Taccari
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extract images from https://www.instagram.com/"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from .common import Extractor, Message
|
||||
from .. import text
|
||||
|
||||
|
||||
class InstagramExtractor(Extractor):
    """Base class for instagram extractors"""
    # Shared configuration for every instagram extractor.  Subclasses are
    # expected to provide `instagrams()`, an iterable of metadata dicts
    # shaped like the return value of `_extract_postpage()`.
    category = "instagram"
    directory_fmt = ["{category}", "{username}"]
    filename_fmt = "{media_id}.{extension}"
    archive_fmt = "{media_id}"
    root = "https://www.instagram.com"

    def items(self):
        """Yield the message stream for all media from `instagrams()`.

        A single `Message.Version` message comes first.  Each metadata dict
        then produces one `Message.Directory` message followed by at most
        one `Message.Url` message; dicts with an unrecognised `typename`
        produce no URL message.
        """
        yield Message.Version, 1

        for data in self.instagrams():
            yield Message.Directory, data

            typename = data['typename']
            if typename in ('GraphImage', 'GraphSidecar'):
                # TODO: Extract all images in edge_sidecar_to_children
                # TODO: instead of just extracting the main one!
                image_url = data['display_url']
                yield Message.Url, image_url, \
                    text.nameext_from_url(image_url, data)
            elif typename == 'GraphVideo':
                # Videos are not fetched directly: a `ytdl:`-prefixed post
                # URL is emitted instead.
                video_url = 'ytdl:{}/p/{}/'.format(
                    self.root, data['shortcode'])
                yield Message.Url, video_url, data

    def _extract_shared_data(self, page):
        """Return the decoded `window._sharedData` JSON object of `page`."""
        extracted = text.extract(
            page, 'window._sharedData = ', ';</script>')
        return json.loads(extracted[0])

    def _extract_postpage(self, url):
        """Fetch the post page at `url` and return its metadata as a dict."""
        shared_data = self._extract_shared_data(self.request(url).text)
        post = shared_data['entry_data']['PostPage'][0]
        media = post['graphql']['shortcode_media']

        info = {
            'media_id': media['id'],
            'shortcode': media['shortcode'],
            'typename': media['__typename'],
            'display_url': media['display_url'],
        }

        dimensions = media['dimensions']
        info['height'] = text.parse_int(dimensions['height'])
        info['width'] = text.parse_int(dimensions['width'])

        info['comments'] = text.parse_int(
            media['edge_media_to_comment']['count'])
        info['likes'] = text.parse_int(
            media['edge_media_preview_like']['count'])

        owner = media['owner']
        info['owner_id'] = owner['id']
        info['username'] = owner['username']
        info['fullname'] = owner['full_name']
        return info

    def _extract_profilepage(self, url):
        """Yield the metadata dict of every post on the profile at `url`.

        The first timeline page is read from the profile's HTML; further
        pages are requested from the `graphql/query/` endpoint for as long
        as the timeline reports `has_next_page`.  Every post found costs
        one extra request, because its metadata comes from
        `_extract_postpage()`.
        """
        shared_data = self._extract_shared_data(self.request(url).text)

        while True:
            # Deal with different structure of profile pages: the first page
            # has interesting data in `entry_data', next pages in `data'.
            if 'entry_data' in shared_data:
                profile = shared_data['entry_data']['ProfilePage'][0]
                base_shared_data = profile['graphql']

                # `rhx_gis' and `user_id' are available only in the first
                # page, so they are kept for all following iterations.
                rhx_gis = shared_data['rhx_gis']
                user_id = base_shared_data['user']['id']
            else:
                base_shared_data = shared_data['data']

            user = base_shared_data['user']
            timeline = user['edge_owner_to_timeline_media']
            has_next_page = timeline['page_info']['has_next_page']
            shortcodes = [
                edge['node']['shortcode'] for edge in timeline['edges']
            ]

            for shortcode in shortcodes:
                post_url = '{}/p/{}/'.format(self.root, shortcode)
                yield self._extract_postpage(post_url)

            if not has_next_page:
                return

            end_cursor = timeline['page_info']['end_cursor']
            variables = '{{"id":"{}","first":12,"after":"{}"}}'.format(
                user_id, end_cursor)

            # The X-Instagram-GIS header is the MD5 hex digest of
            # "<rhx_gis>:<variables>".
            xigis = '{}:{}'.format(rhx_gis, variables)
            signature = hashlib.md5(xigis.encode()).hexdigest()
            headers = {
                "X-Requested-With": "XMLHttpRequest",
                "X-Instagram-GIS": signature,
            }

            next_url = '{}/graphql/query/?query_hash={}&variables={}'.format(
                self.root, '66eb9403e44cc12e5b5ecda48b667d41', variables)
            shared_data = self.request(next_url, headers=headers).json()
|
||||
|
||||
|
||||
class InstagramImageExtractor(InstagramExtractor):
    """Extractor for PostPage"""
    subcategory = "image"
    # The shortcode ends at the first '/', '?', '&' or '#'.  Without the
    # extra delimiters a URL such as ".../p/<shortcode>?taken-by=<user>"
    # would capture the query string as part of the shortcode and produce
    # a wrong post URL in `instagrams()`.
    pattern = [r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/?&#]+)/?"]
    test = [
        # GraphImage
        ("https://www.instagram.com/p/BqvsDleB3lV/", {
            "pattern": r"https://[^/.]+\.cdninstagram\.com/vp"
                       r"/5043db33a998e32fb5713411be1d466e"
                       r"/5C8DAF92/t51.2885-15/e35"
                       r"/44877605_725955034447492_3123079845831750529_n.jpg",
            "keyword": {
                "comments": int,
                "height": int,
                "likes": int,
                "media_id": "1922949326347663701",
                "shortcode": "BqvsDleB3lV",
                "typename": "GraphImage",
                "username": "instagram",
                "width": int,
            }
        }),

        # GraphSidecar
        ("https://www.instagram.com/p/BoHk1haB5tM/", {
            "pattern": r"https://[^/.]+\.cdninstagram\.com/vp"
                       "/fd70fa8d5775ce1c297a95d3800f4b7c"
                       "/5C935FCB/t51.2885-15/e35"
                       "/40758827_2138611023072230_4073975203662780931_n.jpg",
            "keyword": {
                "comments": int,
                "height": int,
                "likes": int,
                "media_id": "1875629777499953996",
                "shortcode": "BoHk1haB5tM",
                "typename": "GraphSidecar",
                "username": "instagram",
                "width": int,
            }
        }),

        # GraphVideo
        ("https://www.instagram.com/p/Bqxp0VSBgJg/", {
            "url": "8f38c1cf460c9804842f7306c487410f33f82e7e",
            "keyword": {
                "comments": int,
                "height": int,
                "likes": int,
                "media_id": "1923502432034620000",
                "shortcode": "Bqxp0VSBgJg",
                "typename": "GraphVideo",
                "username": "instagram",
                "width": int,
            }
        }),
    ]

    def __init__(self, match):
        """Store the shortcode captured by `pattern` from `match`."""
        InstagramExtractor.__init__(self)
        self.shortcode = match.group(1)

    def instagrams(self):
        """Return a 1-tuple holding the metadata dict of the single post."""
        url = '{}/p/{}/'.format(self.root, self.shortcode)
        return (self._extract_postpage(url),)
|
||||
|
||||
|
||||
class InstagramUserExtractor(InstagramExtractor):
    """Extractor for ProfilePage"""
    subcategory = "user"
    # The negative lookahead keeps `/p/<shortcode>` post URLs from being
    # captured as a user name.
    pattern = [r"(?:https?://)?(?:www\.)?instagram\.com/(?!p/)([^/?&#]+)"]
    test = [
        ("https://www.instagram.com/instagram/", {
            "range": "1-12",
            "count": ">= 12",
        }),
    ]

    def __init__(self, match):
        """Store the user name captured by `pattern` from `match`."""
        super().__init__()
        self.username = match.group(1)

    def instagrams(self):
        """Return an iterator over the metadata dicts of the user's posts."""
        return self._extract_profilepage(
            '{}/{}/'.format(self.root, self.username))
|
Loading…
Reference in new issue