[newgrounds] support 'imageData' files (#4642)

pull/2432/merge
Mike Fährmann 11 months ago
parent b52fd91ac6
commit 7958ab1946
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -54,27 +54,30 @@ class NewgroundsExtractor(Extractor):
if metadata:
post.update(metadata)
yield Message.Directory, post
post["num"] = 0
yield Message.Url, url, text.nameext_from_url(url, post)
ext = post["extension"]
for num, url in enumerate(text.extract_iter(
post["_images"] + post["_comment"],
'data-smartload-src="', '"'), 1):
post["num"] = num
post["_index"] = "{}_{:>02}".format(post["index"], num)
if "_multi" in post:
for data in post["_multi"]:
post["num"] += 1
post["_index"] = "{}_{:>02}".format(
post["index"], post["num"])
post.update(data)
url = data["image"]
text.nameext_from_url(url, post)
yield Message.Url, url, post
if "_fallback" in post:
del post["_fallback"]
for url in text.extract_iter(
post["_comment"], 'data-smartload-src="', '"'):
post["num"] += 1
post["_index"] = "{}_{:>02}".format(
post["index"], post["num"])
url = text.ensure_http_scheme(url)
text.nameext_from_url(url, post)
if "_fallback" in post:
del post["_fallback"]
if "/comments/" not in url:
url = url.replace("/medium_views/", "/images/", 1)
if post["extension"] == "webp":
post["_fallback"] = (url,)
post["extension"] = ext
url = url.replace(".webp", "." + ext)
yield Message.Url, url, post
else:
self.log.warning(
@ -149,7 +152,6 @@ class NewgroundsExtractor(Extractor):
extr = text.extract_from(page)
data = extract_data(extr, post_url)
data["_images"] = extr('<div class="art-images', '\n</div>')
data["_comment"] = extr(
'id="author_comments"', '</div>').partition(">")[2]
data["comment"] = text.unescape(text.remove_html(
@ -168,8 +170,7 @@ class NewgroundsExtractor(Extractor):
data["post_url"] = post_url
return data
@staticmethod
def _extract_image_data(extr, url):
def _extract_image_data(self, extr, url):
full = text.extract_from(util.json_loads(extr(
'"full_image_text":', '});')))
data = {
@ -187,8 +188,34 @@ class NewgroundsExtractor(Extractor):
index = data["url"].rpartition("/")[2].partition("_")[0]
data["index"] = text.parse_int(index)
data["_index"] = index
image_data = extr("let imageData =", "\n];")
if image_data:
data["_multi"] = self._extract_images_multi(image_data)
else:
art_images = extr('<div class="art-images', '\n</div>')
if art_images:
data["_multi"] = self._extract_images_art(art_images, data)
return data
def _extract_images_multi(self, html):
data = util.json_loads(html + "]")
yield from data[1:]
def _extract_images_art(self, html, data):
ext = text.ext_from_url(data["url"])
for url in text.extract_iter(html, 'data-smartload-src="', '"'):
url = text.ensure_http_scheme(url)
url = url.replace("/medium_views/", "/images/", 1)
if text.ext_from_url(url) == "webp":
yield {
"image" : url.replace(".webp", "." + ext),
"_fallback": (url,),
}
else:
yield {"image": url}
@staticmethod
def _extract_audio_data(extr, url):
index = url.split("/")[5]

@ -82,6 +82,17 @@ __tests__ = (
),
},
{
"#url" : "https://www.newgrounds.com/art/view/bacun/kill-la-kill-10th-anniversary",
"#comment" : "extra files in 'imageData' block (#4642)",
"#category": ("", "newgrounds", "image"),
"#class" : newgrounds.NewgroundsImageExtractor,
"#urls" : (
"https://art.ngfiles.com/images/5127000/5127150_93307_bacun_kill-la-kill-10th-anniversary.61adfe309bec342f9db55fd44397235b.png?f1697310027",
"https://art.ngfiles.com/images/5127000/5127150_94250_bacun_kill-la-kill-10th-anniversary.64fdf525fa38c1ab34defac4b354bc7a.png?f1697332109",
),
},
{
"#url" : "https://www.newgrounds.com/art/view/kekiiro/red",
"#comment" : "'adult' rated (#2456)",

Loading…
Cancel
Save