From e2157f594ee3864d0cdd7de5d3508e293f48294d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 6 May 2018 17:43:50 +0200 Subject: [PATCH 1/6] [mangadex] fix manga extraction (closes #84) Chapter listings for manga now use https://mangadex.org/manga//_/chapters/2/ as URL instead of https://mangadex.org/manga//_//2/ --- CHANGELOG.md | 3 +++ gallery_dl/extractor/komikcast.py | 4 ++-- gallery_dl/extractor/mangadex.py | 5 ++++- gallery_dl/version.py | 2 +- test/test_results.py | 1 + 5 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0765ef0d..ae0aead7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# Unreleased +- Fixed extraction of `mangadex` manga with more than 100 chapters (#84) + ## 1.3.5 - 2018-05-04 - Added support for: - `smugmug` - https://www.smugmug.com/ diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index 04805001..e21a13f5 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -91,8 +91,8 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): """Extractor for manga from komikcast.com""" pattern = [r"(?:https?://)?(?:www\.)?(komikcast\.com/[^/?&#]+/?)$"] test = [("https://komikcast.com/tonari-no-kashiwagi-san/", { - "url": "ae862f22eb17fa97adf49222377ebe4412719771", - "keyword": "cd1ec571feacbc32e3b72c437d2e93e6b825be60", + "url": "c3c7a9233904d1c9e12dbb20911934af4b255ff8", + "keyword": "a4c7c24c87df41ff1d11da21e65df13d3a912691", })] def chapters(self, page): diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 1d2101a2..bb1ecefe 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -116,6 +116,9 @@ class MangadexMangaExtractor(MangadexExtractor, MangaExtractor): "language": str, }, }), + ("https://mangadex.org/manga/13318/dagashi-kashi/chapters/2/", { + "count": ">= 100", + }), ] scheme = "https" per_page = 100 @@ -166,4 +169,4 @@ class MangadexMangaExtractor(MangadexExtractor, MangaExtractor): return results num += 1 - page = self.request("{}/_/{}/".format(self.url, num)).text + page = self.request("{}/_/chapters/{}/".format(self.url, num)).text diff --git a/gallery_dl/version.py b/gallery_dl/version.py index fb73168e..d97166e3 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.3.5" +__version__ = "1.3.6-dev" diff --git a/test/test_results.py b/test/test_results.py index 2e13ebea..fb0cf4f9 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -21,6 +21,7 @@ TRAVIS_SKIP = { # temporary issues, etc. BROKEN = { + "pixiv", # API requests sometimes fail } From ec158776ed08c965bef97220fcd2feb61740f1ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 8 May 2018 18:10:50 +0200 Subject: [PATCH 2/6] [deviantart] add extractor for popular listings --- gallery_dl/extractor/deviantart.py | 67 ++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 29411445..7f63bc3d 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -167,7 +167,7 @@ class DeviantartGalleryExtractor(DeviantartExtractor): subcategory = "gallery" archive_fmt = "g_{username}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$"] test = [ ("http://shimoda7.deviantart.com/gallery/", { @@ -197,7 +197,7 @@ class DeviantartFolderExtractor(DeviantartExtractor): subcategory = "folder" directory_fmt = ["{category}", "{folder[owner]}", "{folder[title]}"] archive_fmt = "F_{folder[uuid]}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/gallery/(\d+)/([^/?&#]+)"] test = [ ("http://shimoda7.deviantart.com/gallery/722019/Miscellaneous", { @@ -232,8 +232,8 @@ class DeviantartDeviationExtractor(DeviantartExtractor): """Extractor for single deviations""" subcategory = "deviation" archive_fmt = "{index}.{extension}" - pattern = [(r"(?:https?://)?([^.]+\.deviantart\.com/" - r"(?:art|journal)/[^/?&#]+-\d+)"), + pattern = [(r"(?:https?://)?(?!www\.)([\w-]+\.deviantart\.com" + r"/(?:art|journal)/[^/?&#]+-\d+)"), (r"(?:https?://)?(sta\.sh/[a-z0-9]+)")] test = [ (("http://shimoda7.deviantart.com/art/" @@ -276,7 +276,7 @@ class DeviantartFavoriteExtractor(DeviantartExtractor): subcategory = "favorite" directory_fmt = ["{category}", "{username}", "Favourites"] archive_fmt = "f_{username}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/favourites/?(?:\?catpath=/)?$"] test = [ ("http://h3813067.deviantart.com/favourites/", { @@ -304,7 +304,7 @@ class DeviantartCollectionExtractor(DeviantartExtractor): directory_fmt = ["{category}", "{collection[owner]}", "Favourites", "{collection[title]}"] archive_fmt = "C_{collection[uuid]}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/favourites/(\d+)/([^/?&#]+)"] test = [(("https://pencilshadings.deviantart.com" "/favourites/70595441/3D-Favorites"), { @@ -334,7 +334,7 @@ class DeviantartJournalExtractor(DeviantartExtractor): subcategory = "journal" directory_fmt = ["{category}", "{username}", "Journal"] archive_fmt = "j_{username}_{index}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" + pattern = [r"(?:https?://)?(?!www\.)([\w-]+)\.deviantart\.com" r"/(?:journal|blog)/?(?:\?catpath=/)?$"] test = [ ("https://angrywhitewanker.deviantart.com/journal/", { @@ -348,6 +348,50 @@ class DeviantartJournalExtractor(DeviantartExtractor): return self.api.browse_user_journals(self.user, self.offset) +class DeviantartPopularExtractor(DeviantartExtractor): + """Extractor for popular deviations""" + subcategory = "popular" + directory_fmt = ["{category}", "Popular", + "{popular[range]}", "{popular[search]}"] + archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}" + pattern = [r"(?:https?://)?www\.deviantart\.com" + r"((?:/\w+)*)/(?:popular-([^/?&#]+))?/?(?:\?([^#]*))?"] + test = [ + ("https://www.deviantart.com/popular-8-hours/?q=tree+house", { + "options": (("original", False),), + }), + ("https://www.deviantart.com/artisan/popular-all-time/?q=tree", None), + ("https://www.deviantart.com/?q=tree", None), + ("https://www.deviantart.com/", None), + ] + + def __init__(self, match): + DeviantartExtractor.__init__(self) + self.search_term = self.time_range = self.category_path = None + + path, trange, query = match.groups() + if path: + self.category_path = path.lstrip("/") + if trange: + self.time_range = trange.replace("-", "").replace("hours", "hr") + if query: + self.search_term = text.parse_query(query).get("q") + + self.popular = { + "search": self.search_term or "", + "range": trange or "24-hours", + "path": self.category_path, + } + + def deviations(self): + return self.api.browse_popular( + self.search_term, self.time_range, self.category_path, self.offset) + + def prepare(self, deviation): + DeviantartExtractor.prepare(self, deviation) + deviation["popular"] = self.popular + + class DeviantartAPI(): """Minimal interface for the deviantart API""" CLIENT_ID = "5388" @@ -368,6 +412,15 @@ class DeviantartAPI(): self.client_secret = extractor.config( "client-secret", self.CLIENT_SECRET) + def browse_popular(self, query=None, timerange=None, + category_path=None, offset=0): + """Yield popular deviations""" + endpoint = "browse/popular" + params = {"q": query, "offset": offset, "limit": 120, + "timerange": timerange, "category_path": category_path, + "mature_content": self.mature} + return self._pagination(endpoint, params) + def browse_user_journals(self, username, offset=0): """Yield all journal entries of a specific user""" endpoint = "browse/user/journals" From 789608c1074b97cbf50cf82c5dde53d740c0e831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 11 May 2018 17:11:52 +0200 Subject: [PATCH 3/6] [imagebam] fix extraction for certain galleries --- gallery_dl/extractor/imagebam.py | 148 +++++++++++++++++-------------- gallery_dl/extractor/xvideos.py | 2 +- 2 files changed, 81 insertions(+), 69 deletions(-) diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index 90164d24..7e88eead 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -8,91 +8,103 @@ """Extract images from http://www.imagebam.com/""" -from .common import Extractor, AsynchronousExtractor, Message +from .common import Extractor, Message from .. import text -class ImagebamGalleryExtractor(AsynchronousExtractor): - """Extractor for image galleries from imagebam.com""" +class ImagebamExtractor(Extractor): + """Base class for imagebam extractors""" category = "imagebam" + root = "http://www.imagebam.com" + + def get_image_data(self, page_url, data): + """Fill 'data' and return image URL""" + page = self.request(page_url).text + image_url = text.extract(page, 'property="og:image" content="', '"')[0] + data["extension"] = image_url.rpartition(".")[2] + data["image_key"] = page_url.rpartition("/")[2] + data["image_id"] = data["image_key"][6:] + return image_url + + +class ImagebamGalleryExtractor(ImagebamExtractor): + """Extractor for image galleries from imagebam.com""" subcategory = "gallery" directory_fmt = ["{category}", "{title} - {gallery_key}"] - filename_fmt = "{num:>03}-{name}.{extension}" - archive_fmt = "{gallery_key}_{image_id}" - pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+)"] - test = [(("http://www.imagebam.com/" - "gallery/adz2y0f9574bjpmonaismyrhtjgvey4o"), { - "url": "fb01925129a1ff1941762eaa3a2783a66de6847f", - "keyword": "2541078f61ce50714715e21757176dd69126f804", - "content": "596e6bfa157f2c7169805d50075c2986549973a8", - })] - root = "http://www.imagebam.com" + filename_fmt = "{num:>03}-{image_key}.{extension}" + archive_fmt = "{gallery_key}_{image_key}" + pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)"] + test = [ + ("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", { + "url": "fb01925129a1ff1941762eaa3a2783a66de6847f", + "keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a", + "content": "596e6bfa157f2c7169805d50075c2986549973a8", + }), + ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", { + "url": "7d54178cecddfd46025cc9759f5b675fbb8f65af", + "keyword": "7d7db9664061132be50aa0d98e9602e98eb581ce", + }), + ] def __init__(self, match): - AsynchronousExtractor.__init__(self) - self.gkey = match.group(1) + ImagebamExtractor.__init__(self) + self.gallery_key = match.group(1) def items(self): - data, url = self.get_job_metadata() + url = "{}/gallery/{}".format(self.root, self.gallery_key) + page = text.extract( + self.request(url).text, "
", "
")[0] + + data = self.get_metadata(page) + imgs = self.get_image_pages(page) + data["count"] = len(imgs) + data["gallery_key"] = self.gallery_key + yield Message.Version, 1 yield Message.Directory, data - data["num"] = 0 - for image_url, image_id in self.get_images(url): - data["image_id"] = image_id - data["num"] += 1 - text.nameext_from_url(image_url, data) - yield Message.Url, image_url, data.copy() - - def get_job_metadata(self): - """Collect metadata for extractor-job""" - url = self.root + "/gallery/" + self.gkey - page = self.request(url, encoding="utf-8").text - data, pos = text.extract_all(page, ( - (None , " ", " <"), - ("count" , "'>", " images"), - ), values={"gallery_key": self.gkey}) - url, pos = text.extract( - page, "', '' - )[1] - if pos == 0: - done = True - else: - url, pos = text.extract(page, ' href="', '"', pos-70) - image_id , pos = text.extract(page, 'class="image" id="', '"', pos) - image_url, pos = text.extract(page, 'src="', '"', pos) - yield image_url, image_id - - -class ImagebamImageExtractor(Extractor): + for data["num"], page_url in enumerate(imgs, 1): + image_url = self.get_image_data(page_url, data) + yield Message.Url, image_url, data + + @staticmethod + def get_metadata(page): + """Return gallery metadata""" + return text.extract_all(page, ( + ("title" , "'> ", " ", ""), + ("description", ":#FCFCFC;'>", ""), + ))[0] + + @staticmethod + def get_image_pages(page): + """Return a list of all image pages""" + return list(text.extract_iter(page, " Date: Fri, 11 May 2018 17:28:34 +0200 Subject: [PATCH 4/6] [pinterest] catch JSON decode errors --- gallery_dl/extractor/pinterest.py | 9 +++++++-- test/test_results.py | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index a244cf9e..1583e03c 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -72,7 +72,7 @@ class PinterestBoardExtractor(PinterestExtractor): "url": "85911dfca313f3f7f48c2aa0bc684f539d1d80a6", }), ("https://www.pinterest.com/g1952848/test/", { - "exception": exception.NotFoundError, + "exception": exception.GalleryDLException, }), ] @@ -161,7 +161,11 @@ class PinterestAPI(): response = self.extractor.request( url, params=params, headers=self.HEADERS, fatal=False) - data = response.json() + + try: + data = response.json() + except ValueError: + data = {} if 200 <= response.status_code < 400 and not response.history: return data @@ -169,6 +173,7 @@ class PinterestAPI(): if response.status_code == 404 or response.history: raise exception.NotFoundError(self.extractor.subcategory) self.extractor.log.error("API request failed") + self.extractor.log.debug("%s", response.text) raise exception.StopExtraction() def _pagination(self, resource, options): diff --git a/test/test_results.py b/test/test_results.py index fb0cf4f9..aee4913f 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -21,6 +21,7 @@ TRAVIS_SKIP = { # temporary issues, etc. BROKEN = { + "dokireader", # SSL cert expired "pixiv", # API requests sometimes fail } From 4cea8861779025ec2fa84ce6d8e590f99e09da06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 13 May 2018 11:19:10 +0200 Subject: [PATCH 5/6] [imgur] allow longer album hashes --- gallery_dl/extractor/imgur.py | 5 ++++- test/test_results.py | 6 +++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 621282e0..a3e332cb 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -118,7 +118,7 @@ class ImgurAlbumExtractor(ImgurExtractor): filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}" archive_fmt = "{album[hash]}_{hash}" pattern = [r"(?:https?://)?(?:www\.|m\.)?imgur\.com" - r"/(?:a|gallery)/(\w{5})"] + r"/(?:a|gallery)/(\w{7}|\w{5})"] test = [ ("https://imgur.com/a/TcBmP", { "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563", @@ -152,6 +152,9 @@ class ImgurAlbumExtractor(ImgurExtractor): ("https://imgur.com/gallery/eD9CT", { # large album "url": "4ee94de31ff26be416271bc0b1ea27b9349c9937", }), + ("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash + "url": "695ef0c950023362a0163ee5041796300db76674", + }), ("https://imgur.com/a/TcBmQ", { "exception": exception.NotFoundError, }), diff --git a/test/test_results.py b/test/test_results.py index aee4913f..90fc1b96 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -16,13 +16,13 @@ from gallery_dl import extractor, job, config, exception # these don't work on travis-ci TRAVIS_SKIP = { "exhentai", "kissmanga", "mangafox", "dynastyscans", "nijie", - "archivedmoe", "archiveofsins", "thebarchive", "sankaku", "idolcomplex", + "archivedmoe", "archiveofsins", "thebarchive", "fireden", + "sankaku", "idolcomplex", } # temporary issues, etc. BROKEN = { - "dokireader", # SSL cert expired - "pixiv", # API requests sometimes fail + "pixiv", # /users//favorite_works API endpoint is gone } From 92fc199b0795e2d54a125f932525db50c3a6b5d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 13 May 2018 11:23:23 +0200 Subject: [PATCH 6/6] [reddit] allow arbitrary subdomains --- gallery_dl/extractor/reddit.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 55f15f1d..6794b165 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -72,12 +72,15 @@ class RedditExtractor(Extractor): class RedditSubredditExtractor(RedditExtractor): """Extractor for images from subreddits on reddit.com""" subcategory = "subreddit" - pattern = [r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/([^/?&#]+)" + pattern = [r"(?:https?://)?(?:\w+\.)?reddit\.com/r/([^/?&#]+)" r"(/[a-z]+)?/?" r"(?:\?.*?(?:\bt=([a-z]+))?)?$"] test = [ ("https://www.reddit.com/r/lavaporn/", None), ("https://www.reddit.com/r/lavaporn/top/?sort=top&t=month", None), + ("https://old.reddit.com/r/lavaporn/", None), + ("https://np.reddit.com/r/lavaporn/", None), + ("https://m.reddit.com/r/lavaporn/", None), ] def __init__(self, match): @@ -94,7 +97,7 @@ class RedditSubmissionExtractor(RedditExtractor): """Extractor for images from a submission on reddit.com""" subcategory = "submission" pattern = [(r"(?:https?://)?(?:" - r"(?:m\.|www\.)?reddit\.com/r/[^/]+/comments|" + r"(?:\w+\.)?reddit\.com/r/[^/?&#]+/comments|" r"redd\.it" r")/([a-z0-9]+)")] test = [ @@ -102,6 +105,8 @@ class RedditSubmissionExtractor(RedditExtractor): "pattern": r"https?://i\.imgur\.com/AaAUCgy\.jpg", "count": 1, }), + ("https://old.reddit.com/r/lavaporn/comments/2a00np/", None), + ("https://np.reddit.com/r/lavaporn/comments/2a00np/", None), ("https://m.reddit.com/r/lavaporn/comments/2a00np/", None), ("https://redd.it/2a00np/", None), ]