')[0] if not notes_data: return note_iter = text.extract_iter(notes_data, '') extr = text.extract for note_data in note_iter: note = { "width": int(extr(note_data, 'data-width="', '"')[0]), "height": int(extr(note_data, 'data-height="', '"')[0]), "x": int(extr(note_data, 'data-x="', '"')[0]), "y": int(extr(note_data, 'data-y="', '"')[0]), "body": extr(note_data, 'data-body="', '"')[0], } notes.append(note) post["notes"] = notes INSTANCES = { "realbooru": { "root": "https://realbooru.com", "pattern": r"realbooru\.com", }, "rule34": { "root": "https://rule34.xxx", "pattern": r"rule34\.xxx", "api_root": "https://api.rule34.xxx", }, "safebooru": { "root": "https://safebooru.org", "pattern": r"safebooru\.org", }, "tbib": { "root": "https://tbib.org", "pattern": r"tbib\.org", }, "hypnohub": { "root": "https://hypnohub.net", "pattern": r"hypnohub\.net", }, } BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES) class GelbooruV02TagExtractor(GelbooruV02Extractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)" test = ( ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", { "content": "5c6ae9ee13e6d4bc9cb8bdce224c84e67fbfa36c", "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg", "count": 2, }), ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", { "url": "17c61b386530cf4c30842c9f580d15ef1cd09586", "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb", }), ("https://realbooru.com/index.php?page=post&s=list&tags=wine", { "count": ">= 64", }), ("https://tbib.org/index.php?page=post&s=list&tags=yuyaiyaui", { "count": ">= 120", }), ("https://hypnohub.net/index.php?page=post&s=list&tags=gonoike_biwa", { "url": "fe662b86d38c331fcac9c62af100167d404937dc", }), ) def __init__(self, match): GelbooruV02Extractor.__init__(self, match) tags = match.group(match.lastindex) self.tags = text.unquote(tags.replace("+", " ")) def metadata(self): return {"search_tags": self.tags} def posts(self): return self._pagination({"tags": self.tags}) class GelbooruV02PoolExtractor(GelbooruV02Extractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool}") archive_fmt = "p_{pool}_{id}" pattern = BASE_PATTERN + r"/index\.php\?page=pool&s=show&id=(\d+)" test = ( ("https://rule34.xxx/index.php?page=pool&s=show&id=179", { "count": 3, }), ("https://safebooru.org/index.php?page=pool&s=show&id=11", { "count": 5, }), ("https://realbooru.com/index.php?page=pool&s=show&id=1", { "count": 3, }), ("https://hypnohub.net/index.php?page=pool&s=show&id=61", { "url": "d314826280073441a2da609f70ee814d1f4b9407", "count": 3, }), ) def __init__(self, match): GelbooruV02Extractor.__init__(self, match) self.pool_id = match.group(match.lastindex) self.post_ids = () def skip(self, num): self.page_start += num return num def metadata(self): url = "{}/index.php?page=pool&s=show&id={}".format( self.root, self.pool_id) page = self.request(url).text name, pos = text.extract(page, "

Pool: ", "

") if not name: raise exception.NotFoundError("pool") self.post_ids = text.extract_iter( page, 'class="thumb" id="p', '"', pos) return { "pool": text.parse_int(self.pool_id), "pool_name": text.unescape(name), } def posts(self): params = {} for params["id"] in util.advance(self.post_ids, self.page_start): for post in self._api_request(params): yield post.attrib class GelbooruV02FavoriteExtractor(GelbooruV02Extractor): subcategory = "favorite" directory_fmt = ("{category}", "favorites", "{favorite_id}") archive_fmt = "f_{favorite_id}_{id}" per_page = 50 pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)" test = ( ("https://rule34.xxx/index.php?page=favorites&s=view&id=1030218", { "count": 3, }), ("https://safebooru.org/index.php?page=favorites&s=view&id=17567", { "count": 2, }), ("https://realbooru.com/index.php?page=favorites&s=view&id=274", { "count": 2, }), ("https://tbib.org/index.php?page=favorites&s=view&id=7881", { "count": 3, }), ("https://hypnohub.net/index.php?page=favorites&s=view&id=43546", { "count": 3, }), ) def __init__(self, match): GelbooruV02Extractor.__init__(self, match) self.favorite_id = match.group(match.lastindex) def metadata(self): return {"favorite_id": text.parse_int(self.favorite_id)} def posts(self): url = self.root + "/index.php" params = { "page": "favorites", "s" : "view", "id" : self.favorite_id, "pid" : self.page_start * self.per_page, } data = {} while True: num_ids = 0 page = self.request(url, params=params).text for data["id"] in text.extract_iter(page, '" id="p', '"'): num_ids += 1 for post in self._api_request(data): yield post.attrib if num_ids < self.per_page: return params["pid"] += self.per_page class GelbooruV02PostExtractor(GelbooruV02Extractor): subcategory = "post" archive_fmt = "{id}" pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" test = ( ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", { "content": "97e4bbf86c3860be18de384d02d544251afe1d45", "options": (("tags", True),), "keyword": { "tags_artist": "danraku", "tags_character": "kashima_(kantai_collection)", "tags_copyright": "kantai_collection", "tags_general": str, "tags_metadata": str, }, }), ("https://safebooru.org/index.php?page=post&s=view&id=1169132", { "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b", "content": "93b293b27dabd198afafabbaf87c49863ac82f27", "options": (("tags", True),), "keyword": { "tags_artist": "kawanakajima", "tags_character": "heath_ledger ronald_mcdonald the_joker", "tags_copyright": "dc_comics mcdonald's the_dark_knight", "tags_general": str, }, }), ("https://realbooru.com/index.php?page=post&s=view&id=668483", { "pattern": r"https://realbooru\.com/images/dc/b5" r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg", "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da", }), ("https://tbib.org/index.php?page=post&s=view&id=9233957", { "url": "5a6ebe07bfff8e6d27f7c30b5480f27abcb577d2", "content": "1c3831b6fbaa4686e3c79035b5d98460b1c85c43", }), ("https://hypnohub.net/index.php?page=post&s=view&id=73964", { "pattern": r"https://hypnohub\.net/images/7a/37" r"/7a37c0ba372f35767fb10c904a398831\.png", "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee", }), ) def __init__(self, match): GelbooruV02Extractor.__init__(self, match) self.post_id = match.group(match.lastindex) def posts(self): return self._pagination({"id": self.post_id})