[booru] adjust/match '_tags' and '_notes' code

2 years ago · 4fd3c893fa
parent 88954aa2e4
commit 4fd3c893fa
2 changed files with 26 additions and 23 deletions
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -98,14 +98,15 @@ class GelbooruV02Extractor(booru.BooruExtractor):
            self.root, post["id"])).text

    def _tags(self, post, page):
-        html = text.extract(page, '<ul id="tag-', '</ul>')[0]
-        if not html:
-            html = text.extract(page, '<ul class="tag-', '</ul>')[0]
-        if html:
+        tag_container = (text.extract(page, '<ul id="tag-', '</ul>')[0] or
+                         text.extract(page, '<ul class="tag-', '</ul>')[0])
+        if not tag_container:
+            return
+
        tags = collections.defaultdict(list)
        pattern = re.compile(
            r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
-            for tag_type, tag_name in pattern.findall(html):
+        for tag_type, tag_name in pattern.findall(tag_container):
            tags[tag_type].append(text.unquote(tag_name))
        for key, value in tags.items():
            post["tags_" + key] = " ".join(value)
--- a/gallery_dl/extractor/moebooru.py
+++ b/gallery_dl/extractor/moebooru.py
@@ -31,22 +31,24 @@ class MoebooruExtractor(BooruExtractor):
            self.root, post["id"])).text

    def _tags(self, post, page):
-        html = text.extract(page, '<ul id="tag-', '</ul>')[0]
-        if html:
+        tag_container = text.extract(page, '<ul id="tag-', '</ul>')[0]
+        if not tag_container:
+            return
+
        tags = collections.defaultdict(list)
        pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'+]+)")
-            for tag_type, tag_name in pattern.findall(html):
+        for tag_type, tag_name in pattern.findall(tag_container):
            tags[tag_type].append(text.unquote(tag_name))
        for key, value in tags.items():
            post["tags_" + key] = " ".join(value)

    def _notes(self, post, page):
-        notes_container = text.extract(page, 'id="note-container"', "<img ")[0]
-        if not notes_container:
+        note_container = text.extract(page, 'id="note-container"', "<img ")[0]
+        if not note_container:
            return

        post["notes"] = notes = []
-        for note in notes_container.split('class="note-box"')[1:]:
+        for note in note_container.split('class="note-box"')[1:]:
            extr = text.extract_from(note)
            notes.append({
                "width" : int(extr("width:", "p")),
@@ -54,7 +56,7 @@ class MoebooruExtractor(BooruExtractor):
                "y"     : int(extr("top:", "p")),
                "x"     : int(extr("left:", "p")),
                "id"    : int(extr('id="note-body-', '"')),
-                "body"  : text.remove_html(extr(">", "</div>")),
+                "body"  : text.unescape(text.remove_html(extr(">", "</div>"))),
            })

    def _pagination(self, url, params):