From a783c4f0fe69b52181c0e58fcc3636b13adb760d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 29 Aug 2023 19:34:27 +0200 Subject: [PATCH] [pornhub] add 'gif' support (#4463) --- docs/supportedsites.md | 2 +- gallery_dl/extractor/pornhub.py | 158 ++++++++++++++++++++++++++------ scripts/supportedsites.py | 3 + 3 files changed, 133 insertions(+), 30 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 676ba938..02b55230 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -694,7 +694,7 @@ Consider all sites to be NSFW unless otherwise known. Pornhub https://www.pornhub.com/ - Galleries, User Profiles + Galleries, Gifs, Photos, User Profiles diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index d3619da6..6cb2063b 100644 --- a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -19,6 +19,35 @@ class PornhubExtractor(Extractor): category = "pornhub" root = "https://www.pornhub.com" + def _init(self): + self.cookies.set( + "accessAgeDisclaimerPH", "1", domain=".pornhub.com") + + def _pagination(self, user, path): + if "/" not in path: + path += "/public" + + url = "{}/{}/{}/ajax".format(self.root, user, path) + params = {"page": 1} + headers = { + "Referer": url[:-5], + "X-Requested-With": "XMLHttpRequest", + } + + while True: + response = self.request( + url, method="POST", headers=headers, params=params, + allow_redirects=False) + + if 300 <= response.status_code < 400: + url = "{}{}/{}/ajax".format( + self.root, response.headers["location"], path) + continue + + yield response.text + + params["page"] += 1 + class PornhubGalleryExtractor(PornhubExtractor): """Extractor for image galleries on pornhub.com""" @@ -58,9 +87,6 @@ class PornhubGalleryExtractor(PornhubExtractor): self._first = None def items(self): - self.cookies.set( - "accessAgeDisclaimerPH", "1", domain=".pornhub.com") - data = self.metadata() yield Message.Directory, data for num, image in enumerate(self.images(), 1): @@ -116,17 +142,83 @@ class PornhubGalleryExtractor(PornhubExtractor): return +class PornhubGifExtractor(PornhubExtractor): + """Extractor for pornhub.com gifs""" + subcategory = "gif" + directory_fmt = ("{category}", "{user}", "gifs") + filename_fmt = "{id} {title}.{extension}" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/gif/(\d+)" + test = ( + ("https://www.pornhub.com/gif/33643461", { + "pattern": r"https://\w+\.phncdn\.com/pics/gifs" + r"/033/643/461/33643461a\.webm", + "keyword": { + "date": "dt:2020-10-31 00:00:00", + "extension": "webm", + "filename": "33643461a", + "id": "33643461", + "tags": ["big boobs", "lana rhoades"], + "title": "Big boobs", + "url": str, + "user": "Lana Rhoades", + }, + }), + ) + + def __init__(self, match): + PornhubExtractor.__init__(self, match) + self.gallery_id = match.group(1) + + def items(self): + url = "{}/gif/{}".format(self.root, self.gallery_id) + extr = text.extract_from(self.request(url).text) + + gif = { + "id" : self.gallery_id, + "tags" : extr("data-context-tag='", "'").split(","), + "title": extr('"name": "', '"'), + "url" : extr('"contentUrl": "', '"'), + "date" : text.parse_datetime( + extr('"uploadDate": "', '"'), "%Y-%m-%d"), + "user" : extr('data-mxptext="', '"'), + } + + yield Message.Directory, gif + yield Message.Url, gif["url"], text.nameext_from_url(gif["url"], gif) + + class PornhubUserExtractor(PornhubExtractor): - """Extractor for all galleries of a pornhub user""" + """Extractor for a pornhub user""" subcategory = "user" - pattern = (BASE_PATTERN + r"/(users|model|pornstar)/([^/?#]+)" - "(?:/photos(?:/(public|private|favorites))?)?/?$") + pattern = BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)/?$" + test = ("https://www.pornhub.com/pornstar/danika-mori",) + + def __init__(self, match): + PornhubExtractor.__init__(self, match) + self.user = match.group(1) + + def initialize(self): + pass + + def items(self): + base = "{}/{}/".format(self.root, self.user) + return self._dispatch_extractors(( + (PornhubPhotosExtractor, base + "photos"), + (PornhubGifsExtractor , base + "gifs"), + ), ("photos",)) + + +class PornhubPhotosExtractor(PornhubExtractor): + """Extractor for all galleries of a pornhub user""" + subcategory = "photos" + pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)" + "/(photos(?:/[^/?#]+)?)") test = ( ("https://www.pornhub.com/pornstar/danika-mori/photos", { "pattern": PornhubGalleryExtractor.pattern, "count": ">= 6", }), - ("https://www.pornhub.com/users/flyings0l0/"), ("https://www.pornhub.com/users/flyings0l0/photos/public"), ("https://www.pornhub.com/users/flyings0l0/photos/private"), ("https://www.pornhub.com/users/flyings0l0/photos/favorites"), @@ -135,33 +227,41 @@ class PornhubUserExtractor(PornhubExtractor): def __init__(self, match): PornhubExtractor.__init__(self, match) - self.type, self.user, self.cat = match.groups() + self.user, self.path = match.groups() def items(self): - url = "{}/{}/{}/photos/{}/ajax".format( - self.root, self.type, self.user, self.cat or "public") - params = {"page": 1} - headers = { - "Referer": url[:-5], - "X-Requested-With": "XMLHttpRequest", - } - data = {"_extractor": PornhubGalleryExtractor} - while True: - response = self.request( - url, method="POST", headers=headers, params=params, - allow_redirects=False) - - if 300 <= response.status_code < 400: - url = "{}{}/photos/{}/ajax".format( - self.root, response.headers["location"], - self.cat or "public") - continue - + for page in self._pagination(self.user, self.path): gid = None - for gid in text.extract_iter(response.text, 'id="albumphoto', '"'): + for gid in text.extract_iter(page, 'id="albumphoto', '"'): yield Message.Queue, self.root + "/album/" + gid, data if gid is None: return - params["page"] += 1 + +class PornhubGifsExtractor(PornhubExtractor): + """Extractor for a pornhub user's gifs""" + subcategory = "gifs" + pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)" + "/(gifs(?:/[^/?#]+)?)") + test = ( + ("https://www.pornhub.com/pornstar/danika-mori/gifs", { + "pattern": PornhubGifExtractor.pattern, + "count": ">= 42", + }), + ("https://www.pornhub.com/users/flyings0l0/gifs"), + ("https://www.pornhub.com/model/bossgirl/gifs/video"), + ) + + def __init__(self, match): + PornhubExtractor.__init__(self, match) + self.user, self.path = match.groups() + + def items(self): + data = {"_extractor": PornhubGifExtractor} + for page in self._pagination(self.user, self.path): + gid = None + for gid in text.extract_iter(page, 'id="gif', '"'): + yield Message.Queue, self.root + "/gif/" + gid, data + if gid is None: + return diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 68925caa..9077a036 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -224,6 +224,9 @@ SUBCATEGORY_MAP = { "sketch": "Sketch", "work": "individual Images", }, + "pornhub": { + "gifs": "", + }, "reddit": { "home": "Home Feed", },