From a247c94c3432a8a5dd25ac71f640c4261f993881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 23 Apr 2019 22:10:39 +0200 Subject: [PATCH] [sexcom] add pin and board extractors (#147) --- CHANGELOG.md | 13 ++- docs/supportedsites.rst | 1 + gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/sexcom.py | 159 +++++++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + 5 files changed, 170 insertions(+), 5 deletions(-) create mode 100644 gallery_dl/extractor/sexcom.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 19b05bb9..ee05c688 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,10 @@ ## Unreleased ### Additions - Support for - - `plurk` - https://www.plurk.com/ (#212) + - `plurk` - https://www.plurk.com/ (#212) + - `sexcom` - https://www.sex.com/ (#147) +### Changes +- Standalone Windows executables use PyInstaller and Python 3.7 ## 1.8.2 - 2019-04-12 ### Additions @@ -25,7 +28,7 @@ ### Additions - Support for: - `35photo` - https://35photo.pro/ ([#162](https://github.com/mikf/gallery-dl/issues/162)) - - `500px` - https://500px.com/ ([#185](https://github.com/mikf/gallery-dl/issues/185)) + - `500px` - https://500px.com/ ([#185](https://github.com/mikf/gallery-dl/issues/185)) - `instagram` extractor for hashtags ([#202](https://github.com/mikf/gallery-dl/issues/202)) - Option to get more metadata on `deviantart` ([#189](https://github.com/mikf/gallery-dl/issues/189)) - Man pages and bash completion ([#150](https://github.com/mikf/gallery-dl/issues/150)) @@ -41,9 +44,9 @@ ## 1.8.0 - 2019-03-15 ### Additions - Support for: - - `weibo` - https://www.weibo.com/ - - `pururin` - https://pururin.io/ ([#174](https://github.com/mikf/gallery-dl/issues/174)) - - `fashionnove` - https://www.fashionnova.com ([#175](https://github.com/mikf/gallery-dl/issues/175)) + - `weibo` - https://www.weibo.com/ + - `pururin` - https://pururin.io/ ([#174](https://github.com/mikf/gallery-dl/issues/174)) + - `fashionnova` - https://www.fashionnova.com/ ([#175](https://github.com/mikf/gallery-dl/issues/175)) - `shopify` sites in general ([#175](https://github.com/mikf/gallery-dl/issues/175)) - Snap packaging ([#169](https://github.com/mikf/gallery-dl/issues/169), [#170](https://github.com/mikf/gallery-dl/issues/170), [#187](https://github.com/mikf/gallery-dl/issues/187), [#188](https://github.com/mikf/gallery-dl/issues/188)) - Automatic Cloudflare DDoS protection bypass diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index df8f5790..f18990cc 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -87,6 +87,7 @@ Safebooru https://safebooru.org/ Pools, Posts, Tag-Searc Sankaku Channel https://chan.sankakucomplex.com/ Pools, Posts, Tag-Searches Optional Sen Manga https://raw.senmanga.com/ Chapters Sense-Scans http://sensescans.com/reader/ Chapters, Manga +Sex.com https://www.sex.com/ Boards, Pins Simply Hentai https://www.simply-hentai.com/ Galleries, individual Images, Videos SlideShare https://www.slideshare.net/ Presentations SmugMug https://www.smugmug.com/ |smugmug-C| Optional (OAuth) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index cf1cbcd9..60d1f64d 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -78,6 +78,7 @@ modules = [ "sankaku", "seiga", "senmanga", + "sexcom", "simplyhentai", "slideshare", "smugmug", diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py new file mode 100644 index 00000000..a2f40dd1 --- /dev/null +++ b/gallery_dl/extractor/sexcom.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.sex.com/""" + +from .common import Extractor, Message +from .. import text + + +class SexcomExtractor(Extractor): + """Base class for sexcom extractors""" + category = "sexcom" + directory_fmt = ("{category}") + filename_fmt = "{pin_id}{title:? //}.{extension}" + archive_fmt = "{pin_id}" + root = "https://www.sex.com" + + def items(self): + yield Message.Version, 1 + yield Message.Directory, self.metadata() + for url in self.pins(): + pin = self._parse_pin(url) + yield Message.Url, pin["url"], pin + + def metadata(self): + return {} + + def pins(self): + return () + + def _pagination(self, url): + while True: + extr = text.extract_from(self.request(url).text) + + while True: + href = extr('') + url = text.extract(pager, ' href="', '"')[0] + if not url: + return + url = text.urljoin(self.root, url) + + def _parse_pin(self, pin_url): + extr = text.extract_from(self.request(pin_url).text) + data = {} + + data["thumbnail"] = extr('itemprop="thumbnail" content="', '"') + data["type"] = extr('

' , '<').rstrip(" -").strip().lower() + data["title"] = text.unescape(extr('itemprop="name">' , '<')) + data["repins"] = text.parse_int(text.extract( + extr('"btn-group"', ''), '"btn btn-primary">' , '<')[0]) + data["likes"] = text.parse_int(text.extract( + extr('"btn-group"', ''), '"btn btn-default">' , '<')[0]) + data["pin_id"] = text.parse_int(extr('data-id="', '"')) + + if data["type"] == "video": + info = extr("player.updateSrc(", ");") + + if info: + path = text.extract(info, "src: '", "'")[0] + data["filename"] = path.rpartition("/")[2] + data["extension"] = "mp4" + if "'HD'" in info: + path += "/hd" + data["url"] = self.root + path + else: + data["url"] = "ytdl:" + text.extract( + extr(''), ' src="', '"')[0] + else: + data["url"] = extr(' src="', '"') + text.nameext_from_url(data["url"], data) + + data["uploader"] = extr('itemprop="author">', '<') + data["date"] = extr('datetime="', '"') + data["tags"] = text.split_html(extr('class="tags"> Tags', '')) + data["comments"] = text.parse_int(extr('Comments (', ')')) + + return data + + +class SexcomPinExtractor(SexcomExtractor): + """Extractor a pinned image or video on www.sex.com""" + subcategory = "pin" + directory_fmt = ("{category}",) + pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)" + test = ( + # picture + ("https://www.sex.com/pin/56714360/", { + "url": "599190d6e3d79f9f49dda194a0a58cb0ffa3ab86", + "keyword": { + "comments": int, + "date": "2018-10-02T21:18:17-04:00", + "extension": "jpg", + "filename": "20037816", + "likes": int, + "pin_id": 56714360, + "repins": int, + "tags": list, + "thumbnail": str, + "title": "Pin #56714360", + "type": "picture", + "uploader": "alguem", + "url": str, + }, + }), + # gif + ("https://www.sex.com/pin/11465040-big-titted-hentai-gif/", { + "url": "98a82c5ae7a65c8228e1405ac740f80d4d556de1", + }), + # video + ("https://www.sex.com/pin/55748381/", { + "pattern": "https://www.sex.com/video/stream/776238/hd", + }), + # pornhub embed + ("https://www.sex.com/pin/55847384-very-nicely-animated/", { + "pattern": "ytdl:https://www.pornhub.com/embed/ph56ef24b6750f2", + }), + ) + + def __init__(self, match): + SexcomExtractor.__init__(self, match) + self.pin_id = match.group(1) + + def pins(self): + return ("{}/pin/{}/".format(self.root, self.pin_id),) + + +class SexcomBoardExtractor(SexcomExtractor): + """Extractor for pins from a board on www.sex.com""" + subcategory = "board" + directory_fmt = ("{category}", "{user}", "{board}") + pattern = (r"(?:https?://)?(?:www\.)?sex\.com/user" + r"/([^/?&#]+)/(?!(?:following|pins|repins|likes)/)([^/?&#]+)") + test = ("https://www.sex.com/user/ronin17/exciting-hentai/", { + "count": ">= 15", + }) + + def __init__(self, match): + SexcomExtractor.__init__(self, match) + self.user, self.board = match.groups() + + def metadata(self): + return { + "user" : text.unquote(self.user), + "board": text.unquote(self.board), + } + + def pins(self): + url = "{}/user/{}/{}/".format(self.root, self.user, self.board) + return self._pagination(url) diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 9194ea46..3d103a08 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -61,6 +61,7 @@ CATEGORY_MAP = { "seiga" : "Niconico Seiga", "senmanga" : "Sen Manga", "sensescans" : "Sense-Scans", + "sexcom" : "Sex.com", "simplyhentai" : "Simply Hentai", "slideshare" : "SlideShare", "smugmug" : "SmugMug",