[pornhub] add 'gif' support (#4463)

1 year ago · a783c4f0fe
parent ba842981af
commit a783c4f0fe
3 changed files with 133 additions and 30 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@ -694,7 +694,7 @@ Consider all sites to be NSFW unless otherwise known.
 <tr>
    <td>Pornhub</td>
    <td>https://www.pornhub.com/</td>
-    <td>Galleries, User Profiles</td>
+    <td>Galleries, Gifs, Photos, User Profiles</td>
    <td></td>
 </tr>
 <tr>
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@ -19,6 +19,35 @@ class PornhubExtractor(Extractor):
    category = "pornhub"
    root = "https://www.pornhub.com"
    def _init(self):
        self.cookies.set(
            "accessAgeDisclaimerPH", "1", domain=".pornhub.com")
    def _pagination(self, user, path):
        if "/" not in path:
            path += "/public"
        url = "{}/{}/{}/ajax".format(self.root, user, path)
        params = {"page": 1}
        headers = {
            "Referer": url[:-5],
            "X-Requested-With": "XMLHttpRequest",
        }
        while True:
            response = self.request(
                url, method="POST", headers=headers, params=params,
                allow_redirects=False)
            if 300 <= response.status_code < 400:
                url = "{}{}/{}/ajax".format(
                    self.root, response.headers["location"], path)
                continue
            yield response.text
            params["page"] += 1
 class PornhubGalleryExtractor(PornhubExtractor):
    """Extractor for image galleries on pornhub.com"""
@ -58,9 +87,6 @@ class PornhubGalleryExtractor(PornhubExtractor):
        self._first = None
    def items(self):
        self.cookies.set(
            "accessAgeDisclaimerPH", "1", domain=".pornhub.com")
        data = self.metadata()
        yield Message.Directory, data
        for num, image in enumerate(self.images(), 1):
@ -116,17 +142,83 @@ class PornhubGalleryExtractor(PornhubExtractor):
                return
 class PornhubGifExtractor(PornhubExtractor):
    """Extractor for pornhub.com gifs"""
    subcategory = "gif"
    # Format templates; the fields they reference ("user", "id", "title")
    # are keys of the metadata dict built in items(). "category" and
    # "extension" are not set in this class -- presumably provided by the
    # framework and by text.nameext_from_url(); confirm.
    directory_fmt = ("{category}", "{user}", "gifs")
    filename_fmt = "{id} {title}.{extension}"
    archive_fmt = "{id}"
    # URL pattern for a single gif page; the trailing group captures the
    # numeric gif ID.
    pattern = BASE_PATTERN + r"/gif/(\d+)"
    # Test fixture: an example URL together with the expected file URL
    # pattern and the expected metadata ("keyword") values.
    test = (
        ("https://www.pornhub.com/gif/33643461", {
            "pattern": r"https://\w+\.phncdn\.com/pics/gifs"
                       r"/033/643/461/33643461a\.webm",
            "keyword": {
                "date": "dt:2020-10-31 00:00:00",
                "extension": "webm",
                "filename": "33643461a",
                "id": "33643461",
                "tags": ["big boobs", "lana rhoades"],
                "title": "Big boobs",
                "url": str,
                "user": "Lana Rhoades",
            },
        }),
    )

    def __init__(self, match):
        """Initialize the base extractor and store group 1 of 'match'.

        NOTE(review): group 1 is presumably the numeric gif ID captured by
        'pattern' (assumes BASE_PATTERN adds no capturing groups of its
        own -- confirm). It is stored as 'gallery_id' although it
        identifies a gif.
        """
        PornhubExtractor.__init__(self, match)
        self.gallery_id = match.group(1)

    def items(self):
        """Fetch the gif page and yield its directory and URL messages.

        Yields one Message.Directory with the collected metadata, then one
        Message.Url for the gif's content URL.
        """
        url = "{}/gif/{}".format(self.root, self.gallery_id)
        extr = text.extract_from(self.request(url).text)
        # NOTE(review): 'extr' presumably scans the page forward from the
        # end of the previous match, so the order of the calls below would
        # have to follow the order of these markers in the page -- confirm
        # before reordering any entries.
        gif = {
            "id"   : self.gallery_id,
            # comma-separated tag string split into a list
            "tags" : extr("data-context-tag='", "'").split(","),
            "title": extr('"name": "', '"'),
            "url"  : extr('"contentUrl": "', '"'),
            # upload date is parsed with a date-only format
            "date" : text.parse_datetime(
                extr('"uploadDate": "', '"'), "%Y-%m-%d"),
            "user" : extr('data-mxptext="', '"'),
        }
        yield Message.Directory, gif
        # text.nameext_from_url() receives the metadata dict as well;
        # presumably it adds "filename"/"extension" derived from the URL
        # (see the test fixture) -- confirm.
        yield Message.Url, gif["url"], text.nameext_from_url(gif["url"], gif)
 class PornhubUserExtractor(PornhubExtractor):
-    """Extractor for all galleries of a pornhub user"""
+    """Extractor for a pornhub user"""
    subcategory = "user"
-    pattern = (BASE_PATTERN + r"/(users|model|pornstar)/([^/?#]+)"
+    pattern = BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)/?$"
-               "(?:/photos(?:/(public|private|favorites))?)?/?$")
+    test = ("https://www.pornhub.com/pornstar/danika-mori",)
    def __init__(self, match):
        """Initialize the base extractor and remember group 1 of 'match'.

        NOTE(review): group 1 is presumably the '<type>/<name>' part of the
        profile URL (e.g. 'pornstar/danika-mori') -- confirm against
        BASE_PATTERN.
        """
        PornhubExtractor.__init__(self, match)
        user_path = match.group(1)
        self.user = user_path
    def initialize(self):
        pass
    def items(self):
        """Dispatch to the photos and gifs extractors for this user.

        Builds '<root>/<user>/photos' and '<root>/<user>/gifs' and hands
        them, paired with their extractor classes, to
        self._dispatch_extractors(). The second argument, ("photos",), is
        presumably the default selection -- confirm.
        """
        prefix = "{}/{}/".format(self.root, self.user)
        targets = (
            (PornhubPhotosExtractor, prefix + "photos"),
            (PornhubGifsExtractor, prefix + "gifs"),
        )
        default = ("photos",)
        return self._dispatch_extractors(targets, default)
 class PornhubPhotosExtractor(PornhubExtractor):
    """Extractor for all galleries of a pornhub user"""
    subcategory = "photos"
    pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)"
               "/(photos(?:/[^/?#]+)?)")
    test = (
        ("https://www.pornhub.com/pornstar/danika-mori/photos", {
            "pattern": PornhubGalleryExtractor.pattern,
            "count": ">= 6",
        }),
        ("https://www.pornhub.com/users/flyings0l0/"),
        ("https://www.pornhub.com/users/flyings0l0/photos/public"),
        ("https://www.pornhub.com/users/flyings0l0/photos/private"),
        ("https://www.pornhub.com/users/flyings0l0/photos/favorites"),
@ -135,33 +227,41 @@ class PornhubUserExtractor(PornhubExtractor):
    def __init__(self, match):
        PornhubExtractor.__init__(self, match)
-        self.type, self.user, self.cat = match.groups()
+        self.user, self.path = match.groups()
    def items(self):
        url = "{}/{}/{}/photos/{}/ajax".format(
            self.root, self.type, self.user, self.cat or "public")
        params = {"page": 1}
        headers = {
            "Referer": url[:-5],
            "X-Requested-With": "XMLHttpRequest",
        }
        data = {"_extractor": PornhubGalleryExtractor}
-        while True:
+        for page in self._pagination(self.user, self.path):
            response = self.request(
                url, method="POST", headers=headers, params=params,
                allow_redirects=False)
            if 300 <= response.status_code < 400:
                url = "{}{}/photos/{}/ajax".format(
                    self.root, response.headers["location"],
                    self.cat or "public")
                continue
            gid = None
-            for gid in text.extract_iter(response.text, 'id="albumphoto', '"'):
+            for gid in text.extract_iter(page, 'id="albumphoto', '"'):
                yield Message.Queue, self.root + "/album/" + gid, data
            if gid is None:
                return
-            params["page"] += 1
+
 class PornhubGifsExtractor(PornhubExtractor):
    """Extractor for the gifs listed on a pornhub user's profile"""
    subcategory = "gifs"
    # group 1: '<type>/<name>' of the profile, group 2: 'gifs[/<section>]'
    pattern = (BASE_PATTERN + r"/((?:users|model|pornstar)/[^/?#]+)"
               "/(gifs(?:/[^/?#]+)?)")
    test = (
        ("https://www.pornhub.com/pornstar/danika-mori/gifs", {
            "pattern": PornhubGifExtractor.pattern,
            "count": ">= 42",
        }),
        ("https://www.pornhub.com/users/flyings0l0/gifs"),
        ("https://www.pornhub.com/model/bossgirl/gifs/video"),
    )

    def __init__(self, match):
        """Initialize the base extractor; store the user and listing path."""
        PornhubExtractor.__init__(self, match)
        groups = match.groups()
        self.user, self.path = groups

    def items(self):
        """Yield a Message.Queue entry for every gif ID found.

        Pages come from self._pagination(); iteration stops at the first
        page that contains no 'id="gif...' marker.
        """
        child = {"_extractor": PornhubGifExtractor}
        for html in self._pagination(self.user, self.path):
            found_any = False
            for gif_id in text.extract_iter(html, 'id="gif', '"'):
                found_any = True
                yield Message.Queue, self.root + "/gif/" + gif_id, child
            if not found_any:
                # an empty page marks the end of the listing
                return
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@ -224,6 +224,9 @@ SUBCATEGORY_MAP = {
        "sketch": "Sketch",
        "work": "individual Images",
    },
    "pornhub": {
        "gifs": "",
    },
    "reddit": {
        "home": "Home Feed",
    },