[szurubooru] add 'tag' and 'post' extractors (#3583, #3713)

pull/3717/head
Mike Fährmann 2 years ago
parent 075c965512
commit 0d142e403c
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -2647,6 +2647,17 @@ Description
Download video files.
extractor.[szurubooru].username & .token
----------------------------------------
Type
``string``
Description
Username and login token of your account to access private resources.
To generate a token, visit ``/user/USERNAME/list-tokens``
and click ``Create Token``.
extractor.tumblr.avatar
-----------------------
Type

@ -1232,6 +1232,22 @@ Consider all sites to be NSFW unless otherwise known.
<td></td>
</tr>
<tr>
<td colspan="4"><strong>szurubooru Instances</strong></td>
</tr>
<tr>
<td>Foalcon</td>
<td>https://booru.foalcon.com/</td>
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr>
<td>Bcbnsfw</td>
<td>https://booru.bcbnsfw.space/</td>
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr>
<td colspan="4"><strong>vichan Imageboards</strong></td>
</tr>

@ -140,6 +140,7 @@ modules = [
"soundgasm",
"speakerdeck",
"subscribestar",
"szurubooru",
"tapas",
"tcbscans",
"telegraph",

@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
# Copyright 2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for szurubooru instances"""
from . import booru
from .. import text
import collections
import binascii
class SzurubooruExtractor(booru.BooruExtractor):
    """Base class for szurubooru extractors"""
    basecategory = "szurubooru"
    filename_fmt = "{id}_{version}_{checksumMD5}.{extension}"
    per_page = 100

    def __init__(self, match):
        """Set up the request headers, including optional token auth"""
        booru.BooruExtractor.__init__(self, match)

        headers = self.headers = {
            "Accept": "application/json",
            "Content-Type": "application/json",
        }

        # Token authentication is only enabled when both the 'username'
        # and the 'token' option hold a truthy value.
        username = self.config("username")
        if not username:
            return
        token = self.config("token")
        if not token:
            return

        credentials = (username + ":" + token).encode()
        # b2a_base64() appends a newline; '[:-1]' removes it again.
        encoded = binascii.b2a_base64(credentials)[:-1]
        headers["Authorization"] = "Token " + encoded.decode()

    def _api_request(self, endpoint, params=None):
        """Request 'endpoint' below '<root>/api' and return the parsed JSON"""
        response = self.request(
            self.root + "/api" + endpoint,
            headers=self.headers,
            params=params,
        )
        return response.json()

    def _pagination(self, endpoint, params):
        """Yield all 'results' entries of 'endpoint', page by page

        'params' is modified in place: its 'offset' and 'limit' entries
        are (re)set here and 'offset' advances after every full page.
        """
        params["offset"] = 0
        params["limit"] = self.per_page

        while True:
            page = self._api_request(endpoint, params)["results"]
            yield from page

            # A page shorter than 'per_page' is treated as the last one.
            received = len(page)
            if received < self.per_page:
                break
            params["offset"] += received

    def _file_url(self, post):
        """Return the content URL of 'post', made absolute if needed"""
        content_url = post["contentUrl"]
        if content_url.startswith("http"):
            return content_url
        return self.root + "/" + content_url

    @staticmethod
    def _prepare(post):
        """Add 'date' and rewrite the tag metadata of 'post' in place

        'tags' becomes a flat list holding the first name of every tag;
        each tag category additionally gets its own 'tags_<category>' list.
        """
        post["date"] = text.parse_datetime(
            post["creationTime"], "%Y-%m-%dT%H:%M:%S.%fZ")

        names = []
        by_category = collections.defaultdict(list)

        for tag in post["tags"]:
            # Keep only the part after the last '_' of the category
            # (the whole category name when it contains no '_').
            category = tag["category"].rpartition("_")[2]
            name = tag["names"][0]
            by_category[category].append(name)
            names.append(name)

        post["tags"] = names
        for category, members in by_category.items():
            post["tags_" + category] = members
# Instance table handed to SzurubooruExtractor.update(); the value it
# returns is used as the leading part of each extractor's URL 'pattern'.
BASE_PATTERN = SzurubooruExtractor.update({
    "foalcon": dict(
        root="https://booru.foalcon.com",
        pattern=r"booru\.foalcon\.com"
    ),
    "bcbnsfw": dict(
        root="https://booru.bcbnsfw.space",
        pattern=r"booru\.bcbnsfw\.space"
    ),
})
class SzurubooruTagExtractor(SzurubooruExtractor):
    """Extractor for the posts of a szurubooru search query"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    archive_fmt = "t_{search_tags}_{id}_{version}"
    pattern = BASE_PATTERN + r"/posts/query=([^/?#]+)"
    test = (
        ("https://booru.foalcon.com/posts/query=simple_background", {
            "pattern": (
                r"https://booru\.foalcon\.com/data/posts"
                r"/\d+_[0-9a-f]{16}\.\w+"
            ),
            "range": "1-150",
            "count": 150,
        }),
        "https://booru.bcbnsfw.space/posts/query=simple_background",
    )

    def __init__(self, match):
        """Store the decoded search query taken from the matched URL"""
        SzurubooruExtractor.__init__(self, match)
        # The query is the last capture group of the match; '+' stands
        # for a space and is replaced before percent-decoding.
        raw_query = match.group(match.lastindex)
        spaced = raw_query.replace("+", " ")
        self.query = text.unquote(spaced)

    def metadata(self):
        """Return the search query as 'search_tags' metadata"""
        return {"search_tags": self.query}

    def posts(self):
        """Return an iterator over all posts matching the search query"""
        params = {"query": self.query}
        return self._pagination("/posts/", params)
class SzurubooruPostExtractor(SzurubooruExtractor):
    """Extractor for a single szurubooru post"""
    subcategory = "post"
    archive_fmt = "{id}_{version}"
    pattern = BASE_PATTERN + r"/post/(\d+)"
    test = (
        ("https://booru.foalcon.com/post/30092", {
            "pattern": (
                r"https://booru\.foalcon\.com/data/posts"
                r"/30092_b7d56e941888b624\.png"
            ),
            "url": "dad4d4c67d87cd9a4ac429b3414747c27a95d5cb",
            "content": "86d1514c0ca8197950cc4b74e7a59b2dc76ebf9c",
        }),
        ("https://booru.bcbnsfw.space/post/1599", {
            "pattern": (
                r"https://booru\.bcbnsfw\.space/data/posts"
                r"/1599_53784518e92086bd\.png"
            ),
            "content": "0c38fc612ba1f03950fad31c4f80a1fccdab1096",
        }),
    )

    def __init__(self, match):
        """Store the post ID (last capture group of the matched URL)"""
        SzurubooruExtractor.__init__(self, match)
        self.post_id = match.group(match.lastindex)

    def posts(self):
        """Return a 1-tuple holding the API data of the requested post"""
        endpoint = "/post/" + self.post_id
        post = self._api_request(endpoint)
        return (post,)

@ -264,6 +264,7 @@ BASE_MAP = {
"lolisafe" : "lolisafe and chibisafe",
"lynxchan" : "LynxChan Imageboards",
"moebooru" : "Moebooru and MyImouto",
"szurubooru" : "szurubooru Instances",
"vichan" : "vichan Imageboards",
}

Loading…
Cancel
Save