From 5008e105ee6d864ba91bd7b7532c72b97e9e7b1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 1 Mar 2018 17:40:31 +0100 Subject: [PATCH] update archive IDs ... to behave in a more straightforward way when dealing with bookmarks/favourites/etc. specific IDs are now grouped by their owner, album-id, ... to allow for duplicates when it would be expected. --- gallery_dl/extractor/booru.py | 20 +++++++++++++------- gallery_dl/extractor/deviantart.py | 10 ++++++++-- gallery_dl/extractor/flickr.py | 8 +++++++- gallery_dl/extractor/gelbooru.py | 13 +++++++++---- gallery_dl/extractor/imagebam.py | 2 +- gallery_dl/extractor/imgbox.py | 3 ++- gallery_dl/extractor/imgchili.py | 7 ++++--- gallery_dl/extractor/sankaku.py | 19 ++++++++++++------- gallery_dl/extractor/tumblr.py | 1 + gallery_dl/version.py | 2 +- test/test_extractors.py | 1 + 11 files changed, 59 insertions(+), 27 deletions(-) diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 9caaaa30..f0378803 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -20,7 +20,6 @@ class BooruExtractor(SharedConfigExtractor): """Base class for all booru extractors""" basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" - archive_fmt = "{id}" api_url = "" per_page = 50 page_start = 1 @@ -39,20 +38,23 @@ class BooruExtractor(SharedConfigExtractor): return pages * self.per_page def items(self): + data = self.get_metadata() + yield Message.Version, 1 - yield Message.Directory, self.get_metadata() + yield Message.Directory, data self.reset_page() while True: images = self.parse_response( self.request(self.api_url, params=self.params)) - for data in images: + for image in images: try: - url = data["file_url"] + url = image["file_url"] if url.startswith("/"): url = urljoin(self.api_url, url) - yield Message.Url, url, text.nameext_from_url(url, data) + image.update(data) + yield Message.Url, url, text.nameext_from_url(url, image) except 
KeyError: continue @@ -115,7 +117,8 @@ class GelbooruPageMixin(): class TagMixin(): """Extraction of images based on search-tags""" subcategory = "tag" - directory_fmt = ["{category}", "{tags}"] + directory_fmt = ["{category}", "{search_tags}"] + archive_fmt = "t_{search_tags}_{id}" def __init__(self, match): super().__init__(match) @@ -124,13 +127,14 @@ class TagMixin(): self.params["limit"] = self.per_page def get_metadata(self): - return {"tags": self.tags} + return {"search_tags": self.tags} class PoolMixin(): """Extraction of image-pools""" subcategory = "pool" directory_fmt = ["{category}", "pool", "{pool}"] + archive_fmt = "p_{pool}_{id}" def __init__(self, match): super().__init__(match) @@ -145,6 +149,7 @@ class PoolMixin(): class PostMixin(): """Extraction of a single image-post""" subcategory = "post" + archive_fmt = "{id}" def __init__(self, match): super().__init__(match) @@ -156,6 +161,7 @@ class PopularMixin(): """Extraction and metadata handling for Danbooru v2""" subcategory = "popular" directory_fmt = ["{category}", "popular", "{scale}", "{date}"] + archive_fmt = "P_{scale[0]}_{date}_{id}" page_start = None sort = True diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 801e9e8d..5747fe85 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -22,7 +22,6 @@ class DeviantartExtractor(Extractor): category = "deviantart" directory_fmt = ["{category}", "{author[username]!l}"] filename_fmt = "{category}_{index}_{title}.{extension}" - archive_fmt = "{index}.{extension}" def __init__(self, match=None): Extractor.__init__(self) @@ -166,6 +165,8 @@ class DeviantartExtractor(Extractor): class DeviantartGalleryExtractor(DeviantartExtractor): """Extractor for all deviations from an artist's gallery""" subcategory = "gallery" + archive_fmt = "g_{username}_{index}.{extension}" + pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$"] test = [ @@ 
-192,6 +193,7 @@ class DeviantartFolderExtractor(DeviantartExtractor): """Extractor for deviations inside an artist's gallery folder""" subcategory = "folder" directory_fmt = ["{category}", "{folder[owner]}", "{folder[title]}"] + archive_fmt = "F_{folder[index]}_{index}.{extension}" pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" r"/gallery/(\d+)/([^/?&#]+)"] test = [ @@ -225,6 +227,7 @@ class DeviantartFolderExtractor(DeviantartExtractor): class DeviantartDeviationExtractor(DeviantartExtractor): """Extractor for single deviations""" subcategory = "deviation" + archive_fmt = "{index}.{extension}" pattern = [(r"(?:https?://)?([^.]+\.deviantart\.com/" r"(?:art|journal)/[^/?&#]+-\d+)"), (r"(?:https?://)?(sta\.sh/[a-z0-9]+)")] @@ -268,6 +271,7 @@ class DeviantartFavoriteExtractor(DeviantartExtractor): """Extractor for an artist's favorites""" subcategory = "favorite" directory_fmt = ["{category}", "{username}", "Favourites"] + archive_fmt = "f_{username}_{index}.{extension}" pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" r"/favourites/?(?:\?catpath=/)?$"] test = [ @@ -295,12 +299,13 @@ class DeviantartCollectionExtractor(DeviantartExtractor): subcategory = "collection" directory_fmt = ["{category}", "{collection[owner]}", "Favourites", "{collection[title]}"] + archive_fmt = "C_{collection[index]}_{index}.{extension}" pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com" r"/favourites/(\d+)/([^/?&#]+)"] test = [(("https://pencilshadings.deviantart.com" "/favourites/70595441/3D-Favorites"), { "url": "742f92199d5bc6a89cda6ec6133d46c7a523824d", - "keyword": "9210c976b5274eff6ea1d2b8a4f891c9f35ce340", + "keyword": "5da3a16e85150d2a09e074b2b2ee916099b52737", "options": (("original", False),), })] @@ -324,6 +329,7 @@ class DeviantartJournalExtractor(DeviantartExtractor): """Extractor for an artist's journals""" subcategory = "journal" directory_fmt = ["{category}", "{username}", "Journal"] + archive_fmt = "j_{username}_{index}.{extension}" pattern = 
[r"(?:https?://)?([^.]+)\.deviantart\.com" r"/(?:journal|blog)/?(?:\?catpath=/)?$"] test = [ diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index f818d756..79b3622f 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -16,7 +16,6 @@ class FlickrExtractor(Extractor): """Base class for flickr extractors""" category = "flickr" filename_fmt = "{category}_{id}.{extension}" - archive_fmt = "{id}" def __init__(self, match): Extractor.__init__(self) @@ -45,6 +44,7 @@ class FlickrExtractor(Extractor): class FlickrImageExtractor(FlickrExtractor): """Extractor for individual images from flickr.com""" subcategory = "image" + archive_fmt = "{id}" pattern = [r"(?:https?://)?(?:www\.|m\.)?flickr\.com/photos/[^/]+/(\d+)", r"(?:https?://)?[^.]+\.static\.?flickr\.com/(?:\d+/)+(\d+)_", r"(?:https?://)?flic\.kr/(p)/([A-Za-z1-9]+)"] @@ -108,6 +108,7 @@ class FlickrAlbumExtractor(FlickrExtractor): subcategory = "album" directory_fmt = ["{category}", "{subcategory}s", "{album[id]} - {album[title]}"] + archive_fmt = "a_{album[id]}_{id}" pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/" r"photos/([^/]+)/(?:album|set)s/(\d+)"] test = [(("https://www.flickr.com/photos/" @@ -143,6 +144,7 @@ class FlickrGalleryExtractor(FlickrExtractor): subcategory = "gallery" directory_fmt = ["{category}", "galleries", "{user[username]} {gallery[id]}"] + archive_fmt = "g_{gallery[id]}_{id}" pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/" r"photos/([^/]+)/galleries/(\d+)"] test = [(("https://www.flickr.com/photos/flickr/" @@ -171,6 +173,7 @@ class FlickrGroupExtractor(FlickrExtractor): """Extractor for group pools from flickr.com""" subcategory = "group" directory_fmt = ["{category}", "{subcategory}s", "{group[groupname]}"] + archive_fmt = "G_{group[nsid]}_{id}" pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"] test = [("https://www.flickr.com/groups/bird_headshots/", { "pattern": (r"https?://farm\d+\.staticflickr\.com" @@ -189,6 
+192,7 @@ class FlickrUserExtractor(FlickrExtractor): """Extractor for the photostream of a flickr user""" subcategory = "user" directory_fmt = ["{category}", "{user[username]}"] + archive_fmt = "u_{user[nsid]}_{id}" pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"] test = [("https://www.flickr.com/photos/shona_s/", { "url": "d125b536cd8c4229363276b6c84579c394eec3a2", @@ -203,6 +207,7 @@ class FlickrFavoriteExtractor(FlickrExtractor): """Extractor for favorite photos of a flickr user""" subcategory = "favorite" directory_fmt = ["{category}", "{subcategory}s", "{user[username]}"] + archive_fmt = "f_{user[nsid]}_{id}" pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"] test = [("https://www.flickr.com/photos/shona_s/favorites", { "url": "5129b3f5bfa83cc25bdae3ce476036de1488dad2", @@ -217,6 +222,7 @@ class FlickrSearchExtractor(FlickrExtractor): """Extractor for flickr photos based on search results""" subcategory = "search" directory_fmt = ["{category}", "{subcategory}", "{search[text]}"] + archive_fmt = "s_{search}_{id}" pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)"] test = [ (("https://flickr.com/search/?text=mountain"), None), diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index f9a30e7c..33abdbd4 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -18,7 +18,6 @@ class GelbooruExtractor(SharedConfigExtractor): basecategory = "booru" category = "gelbooru" filename_fmt = "{category}_{id}_{md5}.{extension}" - archive_fmt = "{id}" api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index" def __init__(self): @@ -29,8 +28,10 @@ class GelbooruExtractor(SharedConfigExtractor): self.get_post_data = self.get_post_data_api def items(self): + data = self.get_metadata() + yield Message.Version, 1 - yield Message.Directory, self.get_metadata() + yield Message.Directory, data for post in util.advance(self.get_posts(), self.start_post): 
if isinstance(post, str): @@ -38,6 +39,7 @@ class GelbooruExtractor(SharedConfigExtractor): for key in ("id", "width", "height", "score", "change"): post[key] = util.safe_int(post[key]) url = post["file_url"] + post.update(data) yield Message.Url, url, text.nameext_from_url(url, post) def skip(self, num): @@ -85,7 +87,8 @@ class GelbooruExtractor(SharedConfigExtractor): class GelbooruTagExtractor(GelbooruExtractor): """Extractor for images from gelbooru.com based on search-tags""" subcategory = "tag" - directory_fmt = ["{category}", "{tags}"] + directory_fmt = ["{category}", "{search_tags}"] + archive_fmt = "t_{search_tags}_{id}" pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=post&s=list&tags=([^&]+)"] test = [ @@ -111,7 +114,7 @@ class GelbooruTagExtractor(GelbooruExtractor): return num def get_metadata(self): - return {"tags": self.tags} + return {"search_tags": self.tags} def get_posts(self): if self.use_api: @@ -149,6 +152,7 @@ class GelbooruPoolExtractor(GelbooruExtractor): """Extractor for image-pools from gelbooru.com""" subcategory = "pool" directory_fmt = ["{category}", "pool", "{pool}"] + archive_fmt = "p_{pool}_{id}" pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=pool&s=show&id=(\d+)"] test = [("https://gelbooru.com/index.php?page=pool&s=show&id=761", { @@ -182,6 +186,7 @@ class GelbooruPoolExtractor(GelbooruExtractor): class GelbooruPostExtractor(GelbooruExtractor): """Extractor for single images from gelbooru.com""" subcategory = "post" + archive_fmt = "{id}" pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" 
r"\?page=post&s=view&id=(\d+)"] test = [("https://gelbooru.com/index.php?page=post&s=view&id=313638", { diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index b365484f..90164d24 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -18,7 +18,7 @@ class ImagebamGalleryExtractor(AsynchronousExtractor): subcategory = "gallery" directory_fmt = ["{category}", "{title} - {gallery_key}"] filename_fmt = "{num:>03}-{name}.{extension}" - archive_fmt = "{image_id}" + archive_fmt = "{gallery_key}_{image_id}" pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+)"] test = [(("http://www.imagebam.com/" "gallery/adz2y0f9574bjpmonaismyrhtjgvey4o"), { diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py index 07e7564d..f92e65b8 100644 --- a/gallery_dl/extractor/imgbox.py +++ b/gallery_dl/extractor/imgbox.py @@ -16,7 +16,6 @@ import re class ImgboxExtractor(Extractor): """Base class for imgbox extractors""" category = "imgbox" - archive_fmt = "{image_key}" root = "https://imgbox.com" def items(self): @@ -64,6 +63,7 @@ class ImgboxGalleryExtractor(AsynchronousExtractor, ImgboxExtractor): subcategory = "gallery" directory_fmt = ["{category}", "{title} - {gallery_key}"] filename_fmt = "{num:>03}-{name}.{extension}" + archive_fmt = "{gallery_key}_{image_key}" pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})"] test = [ ("https://imgbox.com/g/JaX5V5HX7g", { @@ -106,6 +106,7 @@ class ImgboxGalleryExtractor(AsynchronousExtractor, ImgboxExtractor): class ImgboxImageExtractor(ImgboxExtractor): """Extractor for single images from imgbox.com""" subcategory = "image" + archive_fmt = "{image_key}" pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/([A-Za-z0-9]{8})"] test = [ ("https://imgbox.com/qHhw7lpG", { diff --git a/gallery_dl/extractor/imgchili.py b/gallery_dl/extractor/imgchili.py index bdb0434c..d7541ecc 100644 --- a/gallery_dl/extractor/imgchili.py +++ 
b/gallery_dl/extractor/imgchili.py @@ -15,7 +15,6 @@ from .. import text class ImgchiliExtractor(Extractor): """Base class for imgchili extractors""" category = "imgchili" - archive_fmt = "{image_id}" root = "https://imgchili.net" def __init__(self, match): @@ -45,6 +44,7 @@ class ImgchiliExtractor(Extractor): class ImgchiliImageExtractor(ImgchiliExtractor): """Extractor for single images from imgchili.net""" subcategory = "image" + archive_fmt = "{image_id}" pattern = [r"(?:https?://)?(?:www\.)?imgchili\.net/show/\d+/(\d+)_[^/]+"] test = [(("http://imgchili.net/show/89427/" "89427136_test___quot;___gt;.png"), { @@ -71,7 +71,8 @@ class ImgchiliImageExtractor(ImgchiliExtractor): class ImgchiliAlbumExtractor(ImgchiliExtractor): """Extractor for image-albums from imgchili.net""" subcategory = "album" - directory_fmt = ["{category}", "{title} - {key}"] + directory_fmt = ["{category}", "{title} - {album_id}"] + archive_fmt = "{album_id}_{image_id}" filename_fmt = "{num:>03} {filename}" pattern = [r"(?:https?://)?(?:www\.)?imgchili\.net/album/([^/]+)"] test = [("http://imgchili.net/album/7a3824c59f77c8d39b260f9168d4b49b", { @@ -83,7 +84,7 @@ class ImgchiliAlbumExtractor(ImgchiliExtractor): title = text.extract(page, "<h1>", "</h1>")[0] return { "title": text.unescape(title), - "key": self.match.group(1), + "album_id": self.match.group(1), } def get_images(self, page): diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 3b4627ec..81de60c2 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -20,7 +20,6 @@ class SankakuExtractor(SharedConfigExtractor): basecategory = "booru" category = "sankaku" filename_fmt = "{category}_{id}_{md5}.{extension}" - archive_fmt = "{id}" cookienames = ("login", "pass_hash") cookiedomain = "chan.sankakucomplex.com" subdomain = "chan" @@ -38,14 +37,17 @@ class SankakuExtractor(SharedConfigExtractor): def items(self): self.login() + data = self.get_metadata() + yield Message.Version, 1 - yield Message.Directory, self.get_metadata() + yield Message.Directory, data for post_id in util.advance(self.get_posts(), self.start_post): self.wait() - data = self.get_post_data(post_id) - url = data["file_url"] - yield Message.Url, url, text.nameext_from_url(url, data) + post = self.get_post_data(post_id) + url = post["file_url"] + post.update(data) + yield Message.Url, url, text.nameext_from_url(url, post) def skip(self, num): self.start_post += num @@ -131,7 +133,8 @@ class SankakuExtractor(SharedConfigExtractor): class SankakuTagExtractor(SankakuExtractor): """Extractor for images from chan.sankakucomplex.com by search-tags""" subcategory = "tag" - directory_fmt = ["{category}", "{tags}"] + directory_fmt = ["{category}", "{search_tags}"] + archive_fmt = "t_{search_tags}_{id}" pattern = [r"(?:https?://)?chan\.sankakucomplex\.com/\?([^#]*)"] test = [ ("https://chan.sankakucomplex.com/?tags=bonocho", { @@ -188,7 +191,7 @@ class SankakuTagExtractor(SankakuExtractor): self.log.error("Unauthenticated users cannot use " "more than 4 tags at once.") raise exception.StopExtraction() - return {"tags": " ".join(tags)} + return {"search_tags": " ".join(tags)} def get_posts(self): params = {"tags": self.tags} @@ -216,6 
+219,7 @@ class SankakuPoolExtractor(SankakuExtractor): """Extractor for image-pools from chan.sankakucomplex.com""" subcategory = "pool" directory_fmt = ["{category}", "pool", "{pool}"] + archive_fmt = "p_{pool}_{id}" pattern = [r"(?:https?://)?chan\.sankakucomplex\.com/pool/show/(\d+)"] test = [("https://chan.sankakucomplex.com/pool/show/90", { "count": 5, @@ -253,6 +257,7 @@ class SankakuPoolExtractor(SankakuExtractor): class SankakuPostExtractor(SankakuExtractor): """Extractor for single images from chan.sankakucomplex.com""" subcategory = "post" + archive_fmt = "{id}" pattern = [r"(?:https?://)?chan\.sankakucomplex\.com/post/show/(\d+)"] test = [("https://chan.sankakucomplex.com/post/show/360451", { "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 2752ec32..f8f65add 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -229,6 +229,7 @@ class TumblrLikesExtractor(TumblrExtractor): """Extractor for images from a tumblr-user by tag""" subcategory = "likes" directory_fmt = ["{category}", "{name}", "likes"] + archive_fmt = "f_{blog[name]}_{id}_{offset}" pattern = [BASE_PATTERN + r"/likes"] test = [("http://mikf123.tumblr.com/likes", { "count": 1, diff --git a/gallery_dl/version.py b/gallery_dl/version.py index f7d4f9e3..55c480ce 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.2.1-dev" +__version__ = "1.3.0-dev" diff --git a/test/test_extractors.py b/test/test_extractors.py index 1f7239bb..80cd4f4e 100644 --- a/test/test_extractors.py +++ b/test/test_extractors.py @@ -18,6 +18,7 @@ SKIP = { "archivedmoe", "archiveofsins", "thebarchive", # temporary issues + "powermanga", }