From 39cd389679bb67a76ff38d6f1af08ffd4ee9fd1d Mon Sep 17 00:00:00 2001
From: Leonardo Taccari
Date: Mon, 18 May 2020 19:04:20 +0200
Subject: [PATCH] [webtoons] Add a new extractor for webtoons.com (#761)

The webtoons extractor can extract a single episode or an entire comic
(all of its episodes) from webtoons.com.  The logic of the extractors
is straightforward, except for a couple of kludges:

- The `ageGatePass' cookie is always set, to avoid a possible redirect
  that would stop extraction, especially in the comic extractor
- The image URLs returned by the episode extractor cannot be fetched
  directly; the `Referer:' HTTP header needs to be passed along to
  fetch them

Close #593.
---
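
A minimal standalone sketch of the two workarounds above, in plain
requests; the image URL below is a hypothetical placeholder, real ones
are extracted from the viewer page markup:

    import requests

    session = requests.Session()

    # Kludge 1: pre-set the age-gate cookie so that rated comics are
    # served directly instead of redirecting (which would stop
    # extraction, especially while paging through a comic's list).
    session.cookies.set("ageGatePass", "true", domain="www.webtoons.com")

    # Kludge 2: the image servers reject plain requests; a
    # webtoons.com Referer is needed to fetch the episode images.
    viewer = ("https://www.webtoons.com/en/comedy/safely-endangered"
              "/ep-572-earth/viewer?title_no=352&episode_no=572")
    img = "https://example.invalid/ep-572/001.jpg"  # hypothetical URL
    response = session.get(img, headers={"Referer": viewer})
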
 gallery_dl/extractor/__init__.py |   1 +
 gallery_dl/extractor/webtoons.py | 138 +++++++++++++++++++++++++++++++
 scripts/supportedsites.py        |   1 +
 3 files changed, 140 insertions(+)
 create mode 100644 gallery_dl/extractor/webtoons.py

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index cde59d61..561b4847 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -115,6 +115,7 @@ modules = [
     "vsco",
     "wallhaven",
     "warosu",
+    "webtoons",
     "weibo",
     "wikiart",
     "xhamster",
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
new file mode 100644
index 00000000..bebac770
--- /dev/null
+++ b/gallery_dl/extractor/webtoons.py
@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Leonardo Taccari
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.webtoons.com/"""
+
+from .common import Extractor, Message
+from .. import exception, text
+
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?webtoons\.com/(en|fr)"
+
+
+class WebtoonsExtractor(Extractor):
+    category = "webtoons"
+    cookiedomain = "www.webtoons.com"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.session.cookies.set("ageGatePass", "true",
+                                 domain=self.cookiedomain)
+
+
+class WebtoonsEpisodeExtractor(WebtoonsExtractor):
+    """Extractor for an episode on webtoons.com"""
+    subcategory = "episode"
+    directory_fmt = ("{category}", "{comic}")
+    filename_fmt = "{episode}-{num:>02}.{extension}"
+    archive_fmt = "{episode}_{num}"
+    pattern = (BASE_PATTERN + r"/([^/?&#]+)/([^/?&#]+)/(?:[^/?&#]+)"
+               r"/viewer(?:\?([^#]+))")
+    test = (
+        (("https://www.webtoons.com/en/comedy/safely-endangered"
+          "/ep-572-earth/viewer?title_no=352&episode_no=572"), {
+            "url": "11041d71a3f92728305c11a228e77cf0f7aa02ef",
+            "content": "4f7701a750368e377d65900e6e8f64a5f9cb9c86",
+            "count": 5,
+        }),
+    )
+
+    def __init__(self, match):
+        WebtoonsExtractor.__init__(self, match)
+        self.lang, self.genre, self.comic, query = match.groups()
+        query = text.parse_query(query)
+        self.title_no = query.get("title_no")
+        if not self.title_no:
+            raise exception.NotFoundError("title_no")
+        self.episode = query.get("episode_no")
+        if not self.episode:
+            raise exception.NotFoundError("episode_no")
+        self.session.headers["Referer"] = self.url
+
+    def items(self):
+        page = self.request(self.url).text
+        data = self.get_job_metadata(page)
+        imgs = self.get_image_urls(page)
+        data["count"] = len(imgs)
+        yield Message.Version, 1
+        yield Message.Directory, data
+        for data["num"], url in enumerate(imgs, 1):
+            yield Message.Url, url, text.nameext_from_url(url, data)
+
+    def get_job_metadata(self, page):
+        """Collect metadata for extractor-job"""
+        title, pos = text.extract(
+            page, '<meta property="og:title" content="', '"')
+        descr, pos = text.extract(
+            page, '<meta property="og:description" content="', '"', pos)
+        return {
+            "genre": self.genre,
+            "comic": self.comic,
+            "title_no": self.title_no,
+            "episode": self.episode,
+            "title": text.unescape(title),
+            "description": text.unescape(descr),
+        }
+
+    @staticmethod
+    def get_image_urls(page):
+        """Extract and return a list of all image urls"""
+        return list(text.extract_iter(
+            page, 'class="_images" data-url="', '"'))
+
+
+class WebtoonsComicExtractor(WebtoonsExtractor):
+    """Extractor for an entire comic on webtoons.com"""
+    subcategory = "comic"
+    categorytransfer = True
+    pattern = (BASE_PATTERN + r"/([^/?&#]+)/([^/?&#]+)"
+               r"/list(?:\?([^#]+))")
+    test = (
+        (("https://www.webtoons.com/en/comedy/live-with-yourself/"
+          "list?title_no=919"), {
+            "count": ">= 15",
+        }),
+    )
+
+    def __init__(self, match):
+        WebtoonsExtractor.__init__(self, match)
+        self.lang, self.genre, self.comic, query = match.groups()
+        query = text.parse_query(query)
+        self.title_no = query.get("title_no")
+        if not self.title_no:
+            raise exception.NotFoundError("title_no")
+        self.page_no = int(query.get("page", 1))
+
+    def items(self):
+        data = {}
+        data["_extractor"] = WebtoonsEpisodeExtractor
+        while True:
+            page = self.request(
+                "https://www.webtoons.com/" + self.lang + "/" +
+                self.genre + "/" + self.comic + "/list?title_no=" +
+                self.title_no + "&page=" + str(self.page_no)).text
+            data["page"] = self.page_no
+
+            for url in self.get_episode_urls(page):
+                yield Message.Queue, url, data
+
+            if not self.has_next_page(page):
+                break
+
+            self.page_no += 1
+
+    def has_next_page(self, page):
+        return ("/" + self.lang + "/" + self.genre + "/" + self.comic +
+                "/list?title_no=" + self.title_no +
+                "&page=" + str(self.page_no + 1)) in page
+
+    @staticmethod
+    def get_episode_urls(page):
+        """Extract and return a list of all episode urls"""
+        return list(text.extract_iter(page, '<a href="', '"'))
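
Example invocation, using the episode URL from the test above; comic
/list?title_no=... URLs are handled the same way and queue all of
their episodes:

    $ gallery-dl "https://www.webtoons.com/en/comedy/safely-endangered/ep-572-earth/viewer?title_no=352&episode_no=572"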