|
|
|
@ -54,27 +54,30 @@ class NewgroundsExtractor(Extractor):
|
|
|
|
|
if metadata:
|
|
|
|
|
post.update(metadata)
|
|
|
|
|
yield Message.Directory, post
|
|
|
|
|
post["num"] = 0
|
|
|
|
|
yield Message.Url, url, text.nameext_from_url(url, post)
|
|
|
|
|
|
|
|
|
|
ext = post["extension"]
|
|
|
|
|
for num, url in enumerate(text.extract_iter(
|
|
|
|
|
post["_images"] + post["_comment"],
|
|
|
|
|
'data-smartload-src="', '"'), 1):
|
|
|
|
|
post["num"] = num
|
|
|
|
|
post["_index"] = "{}_{:>02}".format(post["index"], num)
|
|
|
|
|
if "_multi" in post:
|
|
|
|
|
for data in post["_multi"]:
|
|
|
|
|
post["num"] += 1
|
|
|
|
|
post["_index"] = "{}_{:>02}".format(
|
|
|
|
|
post["index"], post["num"])
|
|
|
|
|
post.update(data)
|
|
|
|
|
url = data["image"]
|
|
|
|
|
|
|
|
|
|
text.nameext_from_url(url, post)
|
|
|
|
|
yield Message.Url, url, post
|
|
|
|
|
|
|
|
|
|
if "_fallback" in post:
|
|
|
|
|
del post["_fallback"]
|
|
|
|
|
|
|
|
|
|
for url in text.extract_iter(
|
|
|
|
|
post["_comment"], 'data-smartload-src="', '"'):
|
|
|
|
|
post["num"] += 1
|
|
|
|
|
post["_index"] = "{}_{:>02}".format(
|
|
|
|
|
post["index"], post["num"])
|
|
|
|
|
url = text.ensure_http_scheme(url)
|
|
|
|
|
text.nameext_from_url(url, post)
|
|
|
|
|
|
|
|
|
|
if "_fallback" in post:
|
|
|
|
|
del post["_fallback"]
|
|
|
|
|
|
|
|
|
|
if "/comments/" not in url:
|
|
|
|
|
url = url.replace("/medium_views/", "/images/", 1)
|
|
|
|
|
if post["extension"] == "webp":
|
|
|
|
|
post["_fallback"] = (url,)
|
|
|
|
|
post["extension"] = ext
|
|
|
|
|
url = url.replace(".webp", "." + ext)
|
|
|
|
|
|
|
|
|
|
yield Message.Url, url, post
|
|
|
|
|
else:
|
|
|
|
|
self.log.warning(
|
|
|
|
@ -149,7 +152,6 @@ class NewgroundsExtractor(Extractor):
|
|
|
|
|
extr = text.extract_from(page)
|
|
|
|
|
data = extract_data(extr, post_url)
|
|
|
|
|
|
|
|
|
|
data["_images"] = extr('<div class="art-images', '\n</div>')
|
|
|
|
|
data["_comment"] = extr(
|
|
|
|
|
'id="author_comments"', '</div>').partition(">")[2]
|
|
|
|
|
data["comment"] = text.unescape(text.remove_html(
|
|
|
|
@ -168,8 +170,7 @@ class NewgroundsExtractor(Extractor):
|
|
|
|
|
data["post_url"] = post_url
|
|
|
|
|
return data
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _extract_image_data(extr, url):
|
|
|
|
|
def _extract_image_data(self, extr, url):
|
|
|
|
|
full = text.extract_from(util.json_loads(extr(
|
|
|
|
|
'"full_image_text":', '});')))
|
|
|
|
|
data = {
|
|
|
|
@ -187,8 +188,34 @@ class NewgroundsExtractor(Extractor):
|
|
|
|
|
index = data["url"].rpartition("/")[2].partition("_")[0]
|
|
|
|
|
data["index"] = text.parse_int(index)
|
|
|
|
|
data["_index"] = index
|
|
|
|
|
|
|
|
|
|
image_data = extr("let imageData =", "\n];")
|
|
|
|
|
if image_data:
|
|
|
|
|
data["_multi"] = self._extract_images_multi(image_data)
|
|
|
|
|
else:
|
|
|
|
|
art_images = extr('<div class="art-images', '\n</div>')
|
|
|
|
|
if art_images:
|
|
|
|
|
data["_multi"] = self._extract_images_art(art_images, data)
|
|
|
|
|
|
|
|
|
|
return data
|
|
|
|
|
|
|
|
|
|
def _extract_images_multi(self, html):
|
|
|
|
|
data = util.json_loads(html + "]")
|
|
|
|
|
yield from data[1:]
|
|
|
|
|
|
|
|
|
|
def _extract_images_art(self, html, data):
|
|
|
|
|
ext = text.ext_from_url(data["url"])
|
|
|
|
|
for url in text.extract_iter(html, 'data-smartload-src="', '"'):
|
|
|
|
|
url = text.ensure_http_scheme(url)
|
|
|
|
|
url = url.replace("/medium_views/", "/images/", 1)
|
|
|
|
|
if text.ext_from_url(url) == "webp":
|
|
|
|
|
yield {
|
|
|
|
|
"image" : url.replace(".webp", "." + ext),
|
|
|
|
|
"_fallback": (url,),
|
|
|
|
|
}
|
|
|
|
|
else:
|
|
|
|
|
yield {"image": url}
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _extract_audio_data(extr, url):
|
|
|
|
|
index = url.split("/")[5]
|
|
|
|
|