diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py index aa6e7db5..b21709a9 100644 --- a/gallery_dl/extractor/urlgalleries.py +++ b/gallery_dl/extractor/urlgalleries.py @@ -6,7 +6,7 @@ """Extractors for https://urlgalleries.net/""" -from .common import GalleryExtractor +from .common import GalleryExtractor, Message from .. import text @@ -14,30 +14,42 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor): """Base class for Urlgalleries extractors""" category = "urlgalleries" root = "urlgalleries.net" - directory_fmt = ("{category}", "{title}") - pattern = r"(?:https?://)([^/?#]+)?\.urlgalleries\.net/([^/?#]+)/([^/?#]+)" - example = "https://blog.urlgalleries.net/gallery-1234567/a-title--1234" + request_interval = (0.5, 1.0) + pattern = r"(?:https?://)(?:(\w+)\.)?urlgalleries\.net/(?:[\w-]+-)?(\d+)" + example = "https://blog.urlgalleries.net/gallery-12345/TITLE" def __init__(self, match): - self.blog = match.group(1) - self.gallery_id = match.group(2) - self.title = match.group(3) - url = "{}.urlgalleries.net/{}/{}&a=10000".format( - self.blog, self.gallery_id, self.title) - GalleryExtractor.__init__(self, match, text.ensure_http_scheme(url)) - - def images(self, page): - extr = text.extr(page, 'id="wtf"', "") - url = "{}{{}}".format(self.root).format - return [ - (text.ensure_http_scheme(url(i)), None) - for i in text.extract_iter(extr, "href='", "'") - ] + self.blog, self.gallery_id = match.groups() + url = "https://{}.urlgalleries.net/porn-gallery-{}/?a=10000".format( + self.blog, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def items(self): + page = self.request(self.gallery_url).text + imgs = self.images(page) + data = self.metadata(page) + data["count"] = len(imgs) + del page + + root = "https://{}.urlgalleries.net".format(self.blog) + yield Message.Directory, data + for data["num"], img in enumerate(imgs, 1): + response = self.request( + root + img, method="HEAD", allow_redirects=False) + yield Message.Queue, response.headers["Location"], data def metadata(self, page): - date = text.extr( - page, "float:left;'> ", '').split(" | ")[-1] + extr = text.extract_from(page) return { - 'title': self.title, - 'date': text.parse_datetime(date, format='%B %d, %Y T%H:%M') + "gallery_id": self.gallery_id, + "_site": extr(' title="', '"'), # site name + "blog" : text.unescape(extr(' title="', '"')), + "_rprt": extr(' title="', '"'), # report button + "title": text.unescape(extr(' title="', '"').strip()), + "date" : text.parse_datetime( + extr(" images in gallery | ", "<"), "%B %d, %Y %H:%M"), } + + def images(self, page): + imgs = text.extr(page, 'id="wtf"', "") + return list(text.extract_iter(imgs, " href='", "'"))