[urlshortener] update

pull/3935/head
Mike Fährmann 1 year ago
parent 875485313f
commit 5e63942b37
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -1276,13 +1276,13 @@ Consider all sites to be NSFW unless otherwise known.
<tr> <tr>
<td>Bitly</td> <td>Bitly</td>
<td>https://bit.ly/</td> <td>https://bit.ly/</td>
<td></td> <td>Links</td>
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
<td>Twitter t.co</td> <td>Twitter t.co</td>
<td>https://t.co/</td> <td>https://t.co/</td>
<td></td> <td>Links</td>
<td></td> <td></td>
</tr> </tr>

@ -4,57 +4,66 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
"""Extractor for general-purpose URL shorteners""" """Extractors for general-purpose URL shorteners"""
from .common import BaseExtractor, Message from .common import BaseExtractor, Message
from .. import exception from .. import exception
class UrlshortenerExtractor(BaseExtractor): class UrlshortenerExtractor(BaseExtractor):
"""Extractor for general-purpose URL shorteners""" """Base class for URL shortener extractors"""
basecategory = "urlshortener" basecategory = "urlshortener"
# Table of the built-in URL shortener sites. The link extractor indexes it
# with INSTANCES[self.category], so the top-level keys act as category names.
# Each entry carries a "root" URL and a regex "pattern" for the site's
# domain; "headers" is optional and, when present, is sent as the request
# headers of the HEAD lookup.
INSTANCES = {
"bitly": {
"root": "https://bit.ly",
"pattern": r"bit\.ly",
},
"tco": {
# t.co sends 'http-equiv="refresh"' (200) when using browser UA
# NOTE(review): the None value presumably suppresses the default
# User-Agent so t.co answers with a real redirect -- confirm against
# the request helper.
"headers": {"User-Agent": None},
"root": "https://t.co",
"pattern": r"t\.co",
},
}
# update() is defined outside this view; its result is used as the prefix
# of the link extractor's URL pattern.
BASE_PATTERN = UrlshortenerExtractor.update(INSTANCES)
class UrlshortenerLinkExtractor(UrlshortenerExtractor):
"""Extractor for general-purpose URL shorteners"""
subcategory = "link"
pattern = BASE_PATTERN + r"/([^/?&#]+)"
test = ( test = (
("https://bit.ly/3cWIUgq", { ("https://bit.ly/3cWIUgq", {
"count": 1, "count": 1,
"pattern": "^https://gumroad.com/l/storm_b1" "pattern": "^https://gumroad.com/l/storm_b1",
}), }),
("https://t.co/bCgBY8Iv5n", { ("https://t.co/bCgBY8Iv5n", {
"count": 1, "count": 1,
"pattern": ("^https://twitter.com/elonmusk/status/" "pattern": "^https://twitter.com/elonmusk/status/"
"1421395561324896257/photo/1") "1421395561324896257/photo/1",
}),
("https://t.co/abcdefghij", {
"exception": exception.NotFoundError,
}), }),
) )
def __init__(self, match): def __init__(self, match):
BaseExtractor.__init__(self, match) UrlshortenerExtractor.__init__(self, match)
self.headers = INSTANCES[self.category].get("headers")
self.id = match.group(match.lastindex) self.id = match.group(match.lastindex)
def request(self, url, **kwargs): try:
kwargs["headers"] = self.headers self.headers = INSTANCES[self.category]["headers"]
return BaseExtractor.request(self, url, **kwargs) except Exception:
self.headers = None
def items(self): def items(self):
response = self.request( response = self.request(
"{}/{}".format(self.root, self.id), method="HEAD", "{}/{}".format(self.root, self.id), headers=self.headers,
allow_redirects=False, notfound="URL") method="HEAD", allow_redirects=False, notfound="URL")
if "location" not in response.headers: try:
yield Message.Queue, response.headers["location"], {}
except KeyError:
raise exception.StopExtraction("Unable to resolve short URL") raise exception.StopExtraction("Unable to resolve short URL")
yield Message.Queue, response.headers["location"], {}
# Table of the built-in URL shortener sites, indexed by extractor category
# (see INSTANCES[self.category] in __init__). Each entry carries a "root"
# URL and a regex "pattern" for the site's domain; "headers" is optional
# and is used as the request headers.
INSTANCES = {
"bitly": {
"root": "https://bit.ly",
"pattern": r"bit\.ly",
},
"tco": {
# t.co sends 'http-equiv="refresh"' (200) when using browser UA
# NOTE(review): the None value presumably suppresses the default
# User-Agent so t.co answers with a real redirect -- confirm against
# the request helper.
"headers": {"User-Agent": None},
"root": "https://t.co",
"pattern": r"t\.co",
},
}
# update() is defined outside this view; its result is extended with a
# capture group for the path segment after the domain (any run of
# characters other than '/', '?', '#', '&').
UrlshortenerExtractor.pattern = \
UrlshortenerExtractor.update(INSTANCES) + r"/([^/?#&]+)"

Loading…
Cancel
Save