From f64fb8f239744c912ad4b15c3accf9e3bc6b7018 Mon Sep 17 00:00:00 2001 From: Johann Hong <57867081+986569200-johann-Hong@users.noreply.github.com> Date: Mon, 29 Jan 2024 00:23:09 +0900 Subject: [PATCH] [naver] Fix EUC-KR encoding issue in old image URLs Around October 2010, the image server URL format and file name encoding changed from EUC-KR to UTF-8. Detect the old URL format and decode those image URLs as EUC-KR. - (lint with flake8) Customize conditions Wrap lines to at most 79 characters - (lint with flake8) Customize conditions (2nd try) - One import per line - Indent continuation lines - (lint with flake8) Customize conditions (3rd try) - E128 continuation line under-indented for visual indent - E123 closing bracket does not match indentation of opening bracket's line - Update naver.py Check encoding for all image URLs --- gallery_dl/extractor/naver.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py index 55faf9e7..25801c7e 100644 --- a/gallery_dl/extractor/naver.py +++ b/gallery_dl/extractor/naver.py @@ -10,6 +10,7 @@ from .common import GalleryExtractor, Extractor, Message from .. import text +from urllib.parse import unquote class NaverBase(): @@ -63,7 +64,13 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): def images(self, page): return [ - (url.replace("://post", "://blog", 1).partition("?")[0], None) + (unquote(url, encoding="EUC-KR") + .replace("://post", "://blog", 1) + .partition("?")[0], None) + if "\ufffd" in unquote(url) + else + (url.replace("://post", "://blog", 1) + .partition("?")[0], None) for url in text.extract_iter(page, 'data-lazy-src="', '"') ]