diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 12d98b12..0d7d13d4 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -25,6 +25,7 @@ class BooruExtractor(BaseExtractor): data = self.metadata() tags = self.config("tags", False) notes = self.config("notes", False) + fetch_html = tags or notes for post in self.posts(): try: @@ -36,11 +37,13 @@ class BooruExtractor(BaseExtractor): "(md5: %s)", post.get("id"), post.get("md5")) continue - page_html = None - if tags: - page_html = self._extended_tags(post) - if notes: - self._notes(post, page_html) + if fetch_html: + html = self._html(post) + if tags: + self._tags(post, html) + if notes: + self._notes(post, html) + text.nameext_from_url(url, post) post.update(data) self._prepare(post) @@ -67,16 +70,13 @@ class BooruExtractor(BaseExtractor): _file_url = operator.itemgetter("file_url") def _prepare(self, post): - """Prepare the 'post's metadata""" + """Prepare a 'post's metadata""" - def _extended_tags(self, post, page=None): - """Generate extended tag information + def _html(self, post): + """Return HTML content of a post""" - The return value of this function will be - passed to the _notes function as the page parameter. - This makes it possible to reuse the same HTML both for - extracting tags and notes. - """ + def _tags(self, post, page): + """Extract extended tag metadata""" - def _notes(self, post, page=None): - """Generate information about notes""" + def _notes(self, post, page): + """Extract notes metadata""" diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index a2cf0c03..63450c04 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -68,6 +68,22 @@ class GelbooruBase(): yield "https://img2.gelbooru.com" + path yield "https://img1.gelbooru.com" + path + def _notes(self, post, page): + notes_data = text.extract(page, '
')[0] + if not notes_data: + return + + post["notes"] = notes = [] + extr = text.extract + for note in text.extract_iter(notes_data, ''): + notes.append({ + "width" : int(extr(note, 'data-width="', '"')[0]), + "height": int(extr(note, 'data-height="', '"')[0]), + "x" : int(extr(note, 'data-x="', '"')[0]), + "y" : int(extr(note, 'data-y="', '"')[0]), + "body" : extr(note, 'data-body="', '"')[0], + }) + class GelbooruTagExtractor(GelbooruBase, gelbooru_v02.GelbooruV02TagExtractor): @@ -182,21 +198,21 @@ class GelbooruPostExtractor(GelbooruBase, "keywords": { "notes": [ { - "height": 553, "body": "Look over this way when you talk~", + "height": 553, "width": 246, "x": 35, - "y": 72 + "y": 72, }, { - "height": 557, "body": "Hey~\nAre you listening~?", + "height": 557, "width": 246, "x": 1233, - "y": 109 - } - ] - } + "y": 109, + }, + ], + }, }), ) diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 82146146..c3b04459 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -93,11 +93,11 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2]) return url - def _extended_tags(self, post, page=None): - if not page: - url = "{}/index.php?page=post&s=view&id={}".format( - self.root, post["id"]) - page = self.request(url).text + def _html(self, post): + return self.request("{}/index.php?page=post&s=view&id={}".format( + self.root, post["id"])).text + + def _tags(self, post, page): html = text.extract(page, '