[booru] refactor 'tags' and 'notes' extraction

- move HTML request for post pages into its own function
- move gelbooru_v02.py notes extraction to gelbooru.py
  since it only works there
- clean up some code
pull/3155/head
Mike Fährmann 2 years ago
parent 48bbe1ccf6
commit 775895f44b
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -25,6 +25,7 @@ class BooruExtractor(BaseExtractor):
data = self.metadata()
tags = self.config("tags", False)
notes = self.config("notes", False)
fetch_html = tags or notes
for post in self.posts():
try:
@ -36,11 +37,13 @@ class BooruExtractor(BaseExtractor):
"(md5: %s)", post.get("id"), post.get("md5"))
continue
page_html = None
if tags:
page_html = self._extended_tags(post)
if notes:
self._notes(post, page_html)
if fetch_html:
html = self._html(post)
if tags:
self._tags(post, html)
if notes:
self._notes(post, html)
text.nameext_from_url(url, post)
post.update(data)
self._prepare(post)
@ -67,16 +70,13 @@ class BooruExtractor(BaseExtractor):
_file_url = operator.itemgetter("file_url")
def _prepare(self, post):
"""Prepare the 'post's metadata"""
"""Prepare a 'post's metadata"""
def _extended_tags(self, post, page=None):
"""Generate extended tag information
def _html(self, post):
"""Return HTML content of a post"""
The return value of this function will be
passed to the _notes function as the page parameter.
This makes it possible to reuse the same HTML both for
extracting tags and notes.
"""
def _tags(self, post, page):
"""Extract extended tag metadata"""
def _notes(self, post, page=None):
"""Generate information about notes"""
def _notes(self, post, page):
"""Extract notes metadata"""

@ -68,6 +68,22 @@ class GelbooruBase():
yield "https://img2.gelbooru.com" + path
yield "https://img1.gelbooru.com" + path
def _notes(self, post, page):
    """Extract notes metadata from the HTML 'page' of a 'post'

    Looks for the content between '<section id="notes"' and '</section>'.
    If nothing is extracted, 'post' is left unchanged. Otherwise
    post["notes"] becomes a list with one dict per '<article' element,
    built from its 'data-*' attributes.
    """
    section = text.extract(page, '<section id="notes"', '</section>')[0]
    if not section:
        return

    def attr(article, marker):
        # text found between 'marker' and the next double quote
        return text.extract(article, marker, '"')[0]

    # bind the list to 'post' before filling it, so notes parsed so far
    # stay attached to the post even if a later conversion raises
    post["notes"] = collected = []
    for article in text.extract_iter(section, '<article', '</article>'):
        collected.append({
            "width" : int(attr(article, 'data-width="')),
            "height": int(attr(article, 'data-height="')),
            "x"     : int(attr(article, 'data-x="')),
            "y"     : int(attr(article, 'data-y="')),
            "body"  : attr(article, 'data-body="'),
        })
class GelbooruTagExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02TagExtractor):
@ -182,21 +198,21 @@ class GelbooruPostExtractor(GelbooruBase,
"keywords": {
"notes": [
{
"height": 553,
"body": "Look over this way when you talk~",
"height": 553,
"width": 246,
"x": 35,
"y": 72
"y": 72,
},
{
"height": 557,
"body": "Hey~\nAre you listening~?",
"height": 557,
"width": 246,
"x": 1233,
"y": 109
}
]
}
"y": 109,
},
],
},
}),
)

@ -93,11 +93,11 @@ class GelbooruV02Extractor(booru.BooruExtractor):
self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
return url
def _extended_tags(self, post, page=None):
if not page:
url = "{}/index.php?page=post&s=view&id={}".format(
self.root, post["id"])
page = self.request(url).text
def _html(self, post):
    """Request the view page of 'post' and return its HTML text"""
    # the page is addressed by the post's "id" below the extractor's root
    url = "{}/index.php?page=post&s=view&id={}".format(
        self.root, post["id"])
    response = self.request(url)
    return response.text
def _tags(self, post, page):
html = text.extract(page, '<ul id="tag-', '</ul>')[0]
if not html:
html = text.extract(page, '<ul class="tag-', '</ul>')[0]
@ -109,31 +109,6 @@ class GelbooruV02Extractor(booru.BooruExtractor):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
post["tags_" + key] = " ".join(value)
return page
def _notes(self, post, page=None):
    """Collect the notes of a 'post' and store them as post["notes"]

    When 'page' is empty or None, the post's view page is requested
    first. If no content is extracted between '<section id="notes"' and
    '</section>', 'post' is left unchanged.
    """
    if not page:
        page = self.request("{}/index.php?page=post&s=view&id={}".format(
            self.root, post["id"])).text

    section = text.extract(page, '<section id="notes"', '</section>')[0]
    if not section:
        return

    def attr(article, marker):
        # text found between 'marker' and the next double quote
        return text.extract(article, marker, '"')[0]

    # assigned only after every note has been parsed successfully
    post["notes"] = [
        {
            "width": int(attr(article, 'data-width="')),
            "height": int(attr(article, 'data-height="')),
            "x": int(attr(article, 'data-x="')),
            "y": int(attr(article, 'data-y="')),
            "body": attr(article, 'data-body="'),
        }
        for article in text.extract_iter(section, '<article', '</article>')
    ]
INSTANCES = {

@ -26,10 +26,11 @@ class MoebooruExtractor(BooruExtractor):
def _prepare(post):
post["date"] = text.parse_timestamp(post["created_at"])
def _extended_tags(self, post, page=None):
if not page:
url = "{}/post/show/{}".format(self.root, post["id"])
page = self.request(url).text
def _html(self, post):
    """Request the 'post/show' page of 'post' and return its HTML text"""
    url = "{}/post/show/{}".format(self.root, post["id"])
    response = self.request(url)
    return response.text
def _tags(self, post, page):
html = text.extract(page, '<ul id="tag-', '</ul>')[0]
if html:
tags = collections.defaultdict(list)
@ -38,30 +39,24 @@ class MoebooruExtractor(BooruExtractor):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
post["tags_" + key] = " ".join(value)
return page
def _notes(self, post, page=None):
if not page:
url = "{}/post/show/{}".format(self.root, post["id"])
page = self.request(url).text
notes = []
def _notes(self, post, page):
notes_container = text.extract(page, 'id="note-container"', "<img ")[0]
if not notes_container:
return
post["notes"] = notes = []
for note in notes_container.split('class="note-box"')[1:]:
extr = text.extract_from(note)
notes.append({
"width" : int(extr("width: ", "p")),
"height": int(extr("height: ", "p")),
"y" : int(extr("top: ", "p")),
"x" : int(extr("left: ", "p")),
"width" : int(extr("width:", "p")),
"height": int(extr("height:", "p")),
"y" : int(extr("top:", "p")),
"x" : int(extr("left:", "p")),
"id" : int(extr('id="note-body-', '"')),
"body" : text.remove_html(extr('>', "</div>")),
"body" : text.remove_html(extr(">", "</div>")),
})
post["notes"] = notes
def _pagination(self, url, params):
params["page"] = self.page_start
params["limit"] = self.per_page

@ -27,10 +27,6 @@ class PhilomenaExtractor(BooruExtractor):
def _prepare(post):
post["date"] = text.parse_datetime(post["created_at"])
@staticmethod
def _extended_tags(post):
pass
def _pagination(self, url, params):
params["page"] = 1
params["per_page"] = self.per_page

@ -63,7 +63,7 @@ class SankakuExtractor(BooruExtractor):
def _check_expired(self, response):
return not response.history or '.com/expired.png' not in response.url
def _extended_tags(self, post):
def _tags(self, post, page):
tags = collections.defaultdict(list)
types = self.TAG_TYPES
for tag in post["tags"]:

@ -83,7 +83,7 @@ class TwibooruPostExtractor(TwibooruExtractor):
"tag_ids": list,
"tags": list,
"thumbnails_generated": True,
"updated_at": "2022-05-13T00:43:19.791Z",
"updated_at": "2022-09-21T14:31:50.441Z",
"upvotes": int,
"view_url": "https://cdn.twibooru.org/img/2020/7/8/1/full.png",
"width": 576,

Loading…
Cancel
Save