From 8a6e20860524a177d051a21312b8cf6bb1b8154b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 29 Jul 2024 11:19:10 +0200 Subject: [PATCH] [zerochan] fix 'Invalid control character' errors (#5892) --- gallery_dl/extractor/zerochan.py | 10 +++++++++- test/results/zerochan.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 49b06a77..126ef494 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -12,6 +12,7 @@ from .booru import BooruExtractor from ..cache import cache from .. import text, util, exception import collections +import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net" @@ -92,7 +93,14 @@ class ZerochanExtractor(BooruExtractor): def _parse_entry_api(self, entry_id): url = "{}/{}?json".format(self.root, entry_id) - item = self.request(url).json() + text = self.request(url).text + try: + item = util.json_loads(text) + except ValueError as exc: + if " control character " not in str(exc): + raise + text = re.sub(r"[\x00-\x1f\x7f]", "", text) + item = util.json_loads(text) data = { "id" : item["id"], diff --git a/test/results/zerochan.py b/test/results/zerochan.py index f1788c2b..ec8eddf6 100644 --- a/test/results/zerochan.py +++ b/test/results/zerochan.py @@ -177,4 +177,15 @@ __tests__ = ( "width" : 750, }, +{ + "#url" : "https://www.zerochan.net/1395035", + "#comment" : "Invalid control character '\r' in 'source' field (#5892)", + "#category": ("booru", "zerochan", "image"), + "#class" : zerochan.ZerochanImageExtractor, + "#auth" : True, + "#options" : {"metadata": True}, + + "source": "http://www.youtube.com/watch?v=0vodqkGPxt8", +}, + )