# -*- coding: utf-8 -*-

# Copyright 2017-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://www.xvideos.com/"""

from .common import Extractor, Message
from .. import text, exception
import json


class XvideosExtractor(Extractor):
    """Base class for xvideos extractors"""
    category = "xvideos"
    root = "https://www.xvideos.com"


class XvideosGalleryExtractor(XvideosExtractor):
    """Extractor for user profile galleries from xvideos.com"""
    subcategory = "gallery"
    directory_fmt = ("{category}", "{user[name]}", "{title}")
    filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
    archive_fmt = "{gallery_id}_{num}"
    pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com"
               r"/profiles/([^/?&#]+)/photos/(\d+)")
    test = (
        (("https://www.xvideos.com/profiles"
          "/pervertedcouple/photos/751031/random_stuff"), {
            "url": "4f0d992e5dc39def2c3ac8e099d17bf09e76e3c7",
            "keyword": "65979d63a69576cf692b41d5fbbd995cc40a51b9",
        }),
        ("https://www.xvideos.com/profiles/pervertedcouple/photos/751032/", {
            "exception": exception.NotFoundError,
        }),
    )

    def __init__(self, match):
        """Store the user name and gallery ID captured by 'pattern'"""
        XvideosExtractor.__init__(self, match)
        self.user, self.gid = match.groups()

    def items(self):
        """Yield the version, directory and per-image URL messages

        The same metadata dict is yielded with every message; its "num"
        and "extension" entries are overwritten for each image.
        """
        url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid)
        page = self.request(url, notfound=self.subcategory).text
        data = self.get_metadata(page)
        imgs = self.get_images(page)
        data["count"] = len(imgs)

        yield Message.Version, 1
        yield Message.Directory, data
        for image_url in imgs:
            # the image number is the second-to-last '_'-separated field
            data["num"] = text.parse_int(image_url.rsplit("_", 2)[1])
            data["extension"] = image_url.rpartition(".")[2]
            yield Message.Url, image_url, data

    def get_metadata(self, page):
        """Collect metadata for extractor-job"""
        # NOTE(review): the "descr" delimiters are empty strings and the
        # "tags" begin marker looks like it lost surrounding markup; both
        # were most likely HTML tags stripped from the reviewed copy.
        # They are kept exactly as found - confirm against upstream.
        data = text.extract_all(page, (
            ("userid" , '"id_user":', ','),
            ("display", '"display":"', '"'),
            ("title"  , '"title":"', '"'),
            ("descr"  , '', ''),
            ("tags"   , 'Tagged:', '<'),
        ))[0]

        return {
            "user": {
                "id": text.parse_int(data["userid"]),
                "name": self.user,
                "display": data["display"],
                "description": data["descr"].strip(),
            },
            "tags": text.unescape(data["tags"] or "").strip().split(", "),
            "title": text.unescape(data["title"]),
            "gallery_id": text.parse_int(self.gid),
        }

    @staticmethod
    def get_images(page):
        """Return a list of all image urls for this gallery"""
        # NOTE(review): both delimiter strings of this call were missing
        # from the reviewed copy (they contained HTML markup). The values
        # below are a best-effort reconstruction - confirm against upstream.
        return list(text.extract_iter(
            page, '<a class="embed-responsive-item" href="', '"'))


# NOTE(review): the reviewed copy lost everything between the first
# delimiter in 'get_images' above and the '[0])["data"]' expression in
# 'items' below. The class name, 'subcategory', 'pattern', '__init__' and
# the first three statements of 'items' are therefore reconstructed and
# must be confirmed against upstream. The original 'test' tuple (and any
# further class attributes) could not be recovered and is NOT restored.
class XvideosUserExtractor(XvideosExtractor):
    """Extractor for all galleries listed on an xvideos.com user profile"""
    subcategory = "user"
    pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com"
               r"/profiles/([^/?&#]+)/?(?:#.*)?$")

    def __init__(self, match):
        """Store the user name captured by 'pattern'"""
        XvideosExtractor.__init__(self, match)
        self.user = match.group(1)

    def items(self):
        """Yield one queue message per gallery, ordered by gallery ID

        Nothing is yielded when the "galleries" entry of the profile data
        is not a dict.
        """
        url = "{}/profiles/{}".format(self.root, self.user)
        page = self.request(url, notfound=self.subcategory).text
        data = json.loads(text.extract(
            page, "xv.conf=", ";</script>")[0])["data"]

        entries = data["galleries"]
        if not isinstance(entries, dict):
            return
        # the entry with key "0" is dropped and never queued
        entries.pop("0", None)

        galleries = [
            {
                "gallery_id": text.parse_int(gid),
                "title": text.unescape(gdata["title"]),
                "count": gdata["nb_pics"],
                "_extractor": XvideosGalleryExtractor,
            }
            for gid, gdata in entries.items()
        ]
        galleries.sort(key=lambda x: x["gallery_id"])

        yield Message.Version, 1
        for gallery in galleries:
            # build the URL from 'self.root' like the gallery extractor does
            url = "{}/profiles/{}/photos/{}".format(
                self.root, self.user, gallery["gallery_id"])
            yield Message.Queue, url, gallery