|
|
@ -243,8 +243,8 @@ class TwitterExtractor(Extractor):
|
|
|
|
|
|
|
|
|
|
|
|
# collect URLs from entities
|
|
|
|
# collect URLs from entities
|
|
|
|
for url in tweet["entities"].get("urls") or ():
|
|
|
|
for url in tweet["entities"].get("urls") or ():
|
|
|
|
url = url["expanded_url"]
|
|
|
|
url = url.get("expanded_url") or url.get("url") or ""
|
|
|
|
if "//twitpic.com/" not in url or "/photos/" in url:
|
|
|
|
if not url or "//twitpic.com/" not in url or "/photos/" in url:
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|
if url.startswith("http:"):
|
|
|
|
if url.startswith("http:"):
|
|
|
|
url = "https" + url[4:]
|
|
|
|
url = "https" + url[4:]
|
|
|
@ -336,7 +336,10 @@ class TwitterExtractor(Extractor):
|
|
|
|
urls = entities.get("urls")
|
|
|
|
urls = entities.get("urls")
|
|
|
|
if urls:
|
|
|
|
if urls:
|
|
|
|
for url in urls:
|
|
|
|
for url in urls:
|
|
|
|
|
|
|
|
try:
|
|
|
|
content = content.replace(url["url"], url["expanded_url"])
|
|
|
|
content = content.replace(url["url"], url["expanded_url"])
|
|
|
|
|
|
|
|
except KeyError:
|
|
|
|
|
|
|
|
pass
|
|
|
|
txt, _, tco = content.rpartition(" ")
|
|
|
|
txt, _, tco = content.rpartition(" ")
|
|
|
|
tdata["content"] = txt if tco.startswith("https://t.co/") else content
|
|
|
|
tdata["content"] = txt if tco.startswith("https://t.co/") else content
|
|
|
|
|
|
|
|
|
|
|
@ -403,7 +406,10 @@ class TwitterExtractor(Extractor):
|
|
|
|
urls = entities["description"].get("urls")
|
|
|
|
urls = entities["description"].get("urls")
|
|
|
|
if urls:
|
|
|
|
if urls:
|
|
|
|
for url in urls:
|
|
|
|
for url in urls:
|
|
|
|
|
|
|
|
try:
|
|
|
|
descr = descr.replace(url["url"], url["expanded_url"])
|
|
|
|
descr = descr.replace(url["url"], url["expanded_url"])
|
|
|
|
|
|
|
|
except KeyError:
|
|
|
|
|
|
|
|
pass
|
|
|
|
udata["description"] = descr
|
|
|
|
udata["description"] = descr
|
|
|
|
|
|
|
|
|
|
|
|
if "url" in entities:
|
|
|
|
if "url" in entities:
|
|
|
|