From 5008e105ee6d864ba91bd7b7532c72b97e9e7b1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Thu, 1 Mar 2018 17:40:31 +0100
Subject: [PATCH] update archive IDs

Archive IDs now behave in a more straightforward way when dealing with
bookmarks/favourites/etc.

Specific IDs are now grouped by their owner, album ID, etc., to
allow for duplicates where they would be expected.
---
 gallery_dl/extractor/booru.py      | 20 +++++++++++++-------
 gallery_dl/extractor/deviantart.py | 10 ++++++++--
 gallery_dl/extractor/flickr.py     |  8 +++++++-
 gallery_dl/extractor/gelbooru.py   | 13 +++++++++----
 gallery_dl/extractor/imagebam.py   |  2 +-
 gallery_dl/extractor/imgbox.py     |  3 ++-
 gallery_dl/extractor/imgchili.py   |  7 ++++---
 gallery_dl/extractor/sankaku.py    | 19 ++++++++++++-------
 gallery_dl/extractor/tumblr.py     |  1 +
 gallery_dl/version.py              |  2 +-
 test/test_extractors.py            |  1 +
 11 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 9caaaa30..f0378803 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -20,7 +20,6 @@ class BooruExtractor(SharedConfigExtractor):
     """Base class for all booru extractors"""
     basecategory = "booru"
     filename_fmt = "{category}_{id}_{md5}.{extension}"
-    archive_fmt = "{id}"
     api_url = ""
     per_page = 50
     page_start = 1
@@ -39,20 +38,23 @@ class BooruExtractor(SharedConfigExtractor):
         return pages * self.per_page
 
     def items(self):
+        data = self.get_metadata()
+
         yield Message.Version, 1
-        yield Message.Directory, self.get_metadata()
+        yield Message.Directory, data
 
         self.reset_page()
         while True:
             images = self.parse_response(
                 self.request(self.api_url, params=self.params))
 
-            for data in images:
+            for image in images:
                 try:
-                    url = data["file_url"]
+                    url = image["file_url"]
                     if url.startswith("/"):
                         url = urljoin(self.api_url, url)
-                    yield Message.Url, url, text.nameext_from_url(url, data)
+                    image.update(data)
+                    yield Message.Url, url, text.nameext_from_url(url, image)
                 except KeyError:
                     continue
 
@@ -115,7 +117,8 @@ class GelbooruPageMixin():
 class TagMixin():
     """Extraction of images based on search-tags"""
     subcategory = "tag"
-    directory_fmt = ["{category}", "{tags}"]
+    directory_fmt = ["{category}", "{search_tags}"]
+    archive_fmt = "t_{search_tags}_{id}"
 
     def __init__(self, match):
         super().__init__(match)
@@ -124,13 +127,14 @@ class TagMixin():
         self.params["limit"] = self.per_page
 
     def get_metadata(self):
-        return {"tags": self.tags}
+        return {"search_tags": self.tags}
 
 
 class PoolMixin():
     """Extraction of image-pools"""
     subcategory = "pool"
     directory_fmt = ["{category}", "pool", "{pool}"]
+    archive_fmt = "p_{pool}_{id}"
 
     def __init__(self, match):
         super().__init__(match)
@@ -145,6 +149,7 @@ class PoolMixin():
 class PostMixin():
     """Extraction of a single image-post"""
     subcategory = "post"
+    archive_fmt = "{id}"
 
     def __init__(self, match):
         super().__init__(match)
@@ -156,6 +161,7 @@ class PopularMixin():
     """Extraction and metadata handling for Danbooru v2"""
     subcategory = "popular"
     directory_fmt = ["{category}", "popular", "{scale}", "{date}"]
+    archive_fmt = "P_{scale[0]}_{date}_{id}"
     page_start = None
     sort = True
 
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 801e9e8d..5747fe85 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -22,7 +22,6 @@ class DeviantartExtractor(Extractor):
     category = "deviantart"
     directory_fmt = ["{category}", "{author[username]!l}"]
     filename_fmt = "{category}_{index}_{title}.{extension}"
-    archive_fmt = "{index}.{extension}"
 
     def __init__(self, match=None):
         Extractor.__init__(self)
@@ -166,6 +165,8 @@ class DeviantartExtractor(Extractor):
 class DeviantartGalleryExtractor(DeviantartExtractor):
     """Extractor for all deviations from an artist's gallery"""
     subcategory = "gallery"
+    archive_fmt = "g_{username}_{index}.{extension}"
+
     pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com"
                r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$"]
     test = [
@@ -192,6 +193,7 @@ class DeviantartFolderExtractor(DeviantartExtractor):
     """Extractor for deviations inside an artist's gallery folder"""
     subcategory = "folder"
     directory_fmt = ["{category}", "{folder[owner]}", "{folder[title]}"]
+    archive_fmt = "F_{folder[index]}_{index}.{extension}"
     pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com"
                r"/gallery/(\d+)/([^/?&#]+)"]
     test = [
@@ -225,6 +227,7 @@ class DeviantartFolderExtractor(DeviantartExtractor):
 class DeviantartDeviationExtractor(DeviantartExtractor):
     """Extractor for single deviations"""
     subcategory = "deviation"
+    archive_fmt = "{index}.{extension}"
     pattern = [(r"(?:https?://)?([^.]+\.deviantart\.com/"
                 r"(?:art|journal)/[^/?&#]+-\d+)"),
                (r"(?:https?://)?(sta\.sh/[a-z0-9]+)")]
@@ -268,6 +271,7 @@ class DeviantartFavoriteExtractor(DeviantartExtractor):
     """Extractor for an artist's favorites"""
     subcategory = "favorite"
     directory_fmt = ["{category}", "{username}", "Favourites"]
+    archive_fmt = "f_{username}_{index}.{extension}"
     pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com"
                r"/favourites/?(?:\?catpath=/)?$"]
     test = [
@@ -295,12 +299,13 @@ class DeviantartCollectionExtractor(DeviantartExtractor):
     subcategory = "collection"
     directory_fmt = ["{category}", "{collection[owner]}",
                      "Favourites", "{collection[title]}"]
+    archive_fmt = "C_{collection[index]}_{index}.{extension}"
     pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com"
                r"/favourites/(\d+)/([^/?&#]+)"]
     test = [(("https://pencilshadings.deviantart.com"
               "/favourites/70595441/3D-Favorites"), {
         "url": "742f92199d5bc6a89cda6ec6133d46c7a523824d",
-        "keyword": "9210c976b5274eff6ea1d2b8a4f891c9f35ce340",
+        "keyword": "5da3a16e85150d2a09e074b2b2ee916099b52737",
         "options": (("original", False),),
     })]
 
@@ -324,6 +329,7 @@ class DeviantartJournalExtractor(DeviantartExtractor):
     """Extractor for an artist's journals"""
     subcategory = "journal"
     directory_fmt = ["{category}", "{username}", "Journal"]
+    archive_fmt = "j_{username}_{index}.{extension}"
     pattern = [r"(?:https?://)?([^.]+)\.deviantart\.com"
                r"/(?:journal|blog)/?(?:\?catpath=/)?$"]
     test = [
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index f818d756..79b3622f 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -16,7 +16,6 @@ class FlickrExtractor(Extractor):
     """Base class for flickr extractors"""
     category = "flickr"
     filename_fmt = "{category}_{id}.{extension}"
-    archive_fmt = "{id}"
 
     def __init__(self, match):
         Extractor.__init__(self)
@@ -45,6 +44,7 @@ class FlickrExtractor(Extractor):
 class FlickrImageExtractor(FlickrExtractor):
     """Extractor for individual images from flickr.com"""
     subcategory = "image"
+    archive_fmt = "{id}"
     pattern = [r"(?:https?://)?(?:www\.|m\.)?flickr\.com/photos/[^/]+/(\d+)",
                r"(?:https?://)?[^.]+\.static\.?flickr\.com/(?:\d+/)+(\d+)_",
                r"(?:https?://)?flic\.kr/(p)/([A-Za-z1-9]+)"]
@@ -108,6 +108,7 @@ class FlickrAlbumExtractor(FlickrExtractor):
     subcategory = "album"
     directory_fmt = ["{category}", "{subcategory}s",
                      "{album[id]} - {album[title]}"]
+    archive_fmt = "a_{album[id]}_{id}"
     pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/"
                r"photos/([^/]+)/(?:album|set)s/(\d+)"]
     test = [(("https://www.flickr.com/photos/"
@@ -143,6 +144,7 @@ class FlickrGalleryExtractor(FlickrExtractor):
     subcategory = "gallery"
     directory_fmt = ["{category}", "galleries",
                      "{user[username]} {gallery[id]}"]
+    archive_fmt = "g_{gallery[id]}_{id}"
     pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/"
                r"photos/([^/]+)/galleries/(\d+)"]
     test = [(("https://www.flickr.com/photos/flickr/"
@@ -171,6 +173,7 @@ class FlickrGroupExtractor(FlickrExtractor):
     """Extractor for group pools from flickr.com"""
     subcategory = "group"
     directory_fmt = ["{category}", "{subcategory}s", "{group[groupname]}"]
+    archive_fmt = "G_{group[nsid]}_{id}"
     pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"]
     test = [("https://www.flickr.com/groups/bird_headshots/", {
         "pattern": (r"https?://farm\d+\.staticflickr\.com"
@@ -189,6 +192,7 @@ class FlickrUserExtractor(FlickrExtractor):
     """Extractor for the photostream of a flickr user"""
     subcategory = "user"
     directory_fmt = ["{category}", "{user[username]}"]
+    archive_fmt = "u_{user[nsid]}_{id}"
     pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"]
     test = [("https://www.flickr.com/photos/shona_s/", {
         "url": "d125b536cd8c4229363276b6c84579c394eec3a2",
@@ -203,6 +207,7 @@ class FlickrFavoriteExtractor(FlickrExtractor):
     """Extractor for favorite photos of a flickr user"""
     subcategory = "favorite"
     directory_fmt = ["{category}", "{subcategory}s", "{user[username]}"]
+    archive_fmt = "f_{user[nsid]}_{id}"
     pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"]
     test = [("https://www.flickr.com/photos/shona_s/favorites", {
         "url": "5129b3f5bfa83cc25bdae3ce476036de1488dad2",
@@ -217,6 +222,7 @@ class FlickrSearchExtractor(FlickrExtractor):
     """Extractor for flickr photos based on search results"""
     subcategory = "search"
     directory_fmt = ["{category}", "{subcategory}", "{search[text]}"]
+    archive_fmt = "s_{search}_{id}"
     pattern = [r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)"]
     test = [
         (("https://flickr.com/search/?text=mountain"), None),
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index f9a30e7c..33abdbd4 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -18,7 +18,6 @@ class GelbooruExtractor(SharedConfigExtractor):
     basecategory = "booru"
     category = "gelbooru"
     filename_fmt = "{category}_{id}_{md5}.{extension}"
-    archive_fmt = "{id}"
     api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index"
 
     def __init__(self):
@@ -29,8 +28,10 @@ class GelbooruExtractor(SharedConfigExtractor):
             self.get_post_data = self.get_post_data_api
 
     def items(self):
+        data = self.get_metadata()
+
         yield Message.Version, 1
-        yield Message.Directory, self.get_metadata()
+        yield Message.Directory, data
 
         for post in util.advance(self.get_posts(), self.start_post):
             if isinstance(post, str):
@@ -38,6 +39,7 @@ class GelbooruExtractor(SharedConfigExtractor):
             for key in ("id", "width", "height", "score", "change"):
                 post[key] = util.safe_int(post[key])
             url = post["file_url"]
+            post.update(data)
             yield Message.Url, url, text.nameext_from_url(url, post)
 
     def skip(self, num):
@@ -85,7 +87,8 @@ class GelbooruExtractor(SharedConfigExtractor):
 class GelbooruTagExtractor(GelbooruExtractor):
     """Extractor for images from gelbooru.com based on search-tags"""
     subcategory = "tag"
-    directory_fmt = ["{category}", "{tags}"]
+    directory_fmt = ["{category}", "{search_tags}"]
+    archive_fmt = "t_{search_tags}_{id}"
     pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
                r"\?page=post&s=list&tags=([^&]+)"]
     test = [
@@ -111,7 +114,7 @@ class GelbooruTagExtractor(GelbooruExtractor):
         return num
 
     def get_metadata(self):
-        return {"tags": self.tags}
+        return {"search_tags": self.tags}
 
     def get_posts(self):
         if self.use_api:
@@ -149,6 +152,7 @@ class GelbooruPoolExtractor(GelbooruExtractor):
     """Extractor for image-pools from gelbooru.com"""
     subcategory = "pool"
     directory_fmt = ["{category}", "pool", "{pool}"]
+    archive_fmt = "p_{pool}_{id}"
     pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
                r"\?page=pool&s=show&id=(\d+)"]
     test = [("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
@@ -182,6 +186,7 @@ class GelbooruPoolExtractor(GelbooruExtractor):
 class GelbooruPostExtractor(GelbooruExtractor):
     """Extractor for single images from gelbooru.com"""
     subcategory = "post"
+    archive_fmt = "{id}"
     pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
                r"\?page=post&s=view&id=(\d+)"]
     test = [("https://gelbooru.com/index.php?page=post&s=view&id=313638", {
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
index b365484f..90164d24 100644
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@@ -18,7 +18,7 @@ class ImagebamGalleryExtractor(AsynchronousExtractor):
     subcategory = "gallery"
     directory_fmt = ["{category}", "{title} - {gallery_key}"]
     filename_fmt = "{num:>03}-{name}.{extension}"
-    archive_fmt = "{image_id}"
+    archive_fmt = "{gallery_key}_{image_id}"
     pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+)"]
     test = [(("http://www.imagebam.com/"
               "gallery/adz2y0f9574bjpmonaismyrhtjgvey4o"), {
diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py
index 07e7564d..f92e65b8 100644
--- a/gallery_dl/extractor/imgbox.py
+++ b/gallery_dl/extractor/imgbox.py
@@ -16,7 +16,6 @@ import re
 class ImgboxExtractor(Extractor):
     """Base class for imgbox extractors"""
     category = "imgbox"
-    archive_fmt = "{image_key}"
     root = "https://imgbox.com"
 
     def items(self):
@@ -64,6 +63,7 @@ class ImgboxGalleryExtractor(AsynchronousExtractor, ImgboxExtractor):
     subcategory = "gallery"
     directory_fmt = ["{category}", "{title} - {gallery_key}"]
     filename_fmt = "{num:>03}-{name}.{extension}"
+    archive_fmt = "{gallery_key}_{image_key}"
     pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})"]
     test = [
         ("https://imgbox.com/g/JaX5V5HX7g", {
@@ -106,6 +106,7 @@ class ImgboxGalleryExtractor(AsynchronousExtractor, ImgboxExtractor):
 class ImgboxImageExtractor(ImgboxExtractor):
     """Extractor for single images from imgbox.com"""
     subcategory = "image"
+    archive_fmt = "{image_key}"
     pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/([A-Za-z0-9]{8})"]
     test = [
         ("https://imgbox.com/qHhw7lpG", {
diff --git a/gallery_dl/extractor/imgchili.py b/gallery_dl/extractor/imgchili.py
index bdb0434c..d7541ecc 100644
--- a/gallery_dl/extractor/imgchili.py
+++ b/gallery_dl/extractor/imgchili.py
@@ -15,7 +15,6 @@ from .. import text
 class ImgchiliExtractor(Extractor):
     """Base class for imgchili extractors"""
     category = "imgchili"
-    archive_fmt = "{image_id}"
     root = "https://imgchili.net"
 
     def __init__(self, match):
@@ -45,6 +44,7 @@ class ImgchiliExtractor(Extractor):
 class ImgchiliImageExtractor(ImgchiliExtractor):
     """Extractor for single images from imgchili.net"""
     subcategory = "image"
+    archive_fmt = "{image_id}"
     pattern = [r"(?:https?://)?(?:www\.)?imgchili\.net/show/\d+/(\d+)_[^/]+"]
     test = [(("http://imgchili.net/show/89427/"
               "89427136_test___quot;___gt;.png"), {
@@ -71,7 +71,8 @@ class ImgchiliImageExtractor(ImgchiliExtractor):
 class ImgchiliAlbumExtractor(ImgchiliExtractor):
     """Extractor for image-albums from imgchili.net"""
     subcategory = "album"
-    directory_fmt = ["{category}", "{title} - {key}"]
+    directory_fmt = ["{category}", "{title} - {album_id}"]
+    archive_fmt = "{album_id}_{image_id}"
     filename_fmt = "{num:>03} {filename}"
     pattern = [r"(?:https?://)?(?:www\.)?imgchili\.net/album/([^/]+)"]
     test = [("http://imgchili.net/album/7a3824c59f77c8d39b260f9168d4b49b", {
@@ -83,7 +84,7 @@ class ImgchiliAlbumExtractor(ImgchiliExtractor):
         title = text.extract(page, "<h1>", "</h1>")[0]
         return {
             "title": text.unescape(title),
-            "key": self.match.group(1),
+            "album_id": self.match.group(1),
         }
 
     def get_images(self, page):
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 3b4627ec..81de60c2 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -20,7 +20,6 @@ class SankakuExtractor(SharedConfigExtractor):
     basecategory = "booru"
     category = "sankaku"
     filename_fmt = "{category}_{id}_{md5}.{extension}"
-    archive_fmt = "{id}"
     cookienames = ("login", "pass_hash")
     cookiedomain = "chan.sankakucomplex.com"
     subdomain = "chan"
@@ -38,14 +37,17 @@ class SankakuExtractor(SharedConfigExtractor):
 
     def items(self):
         self.login()
+        data = self.get_metadata()
+
         yield Message.Version, 1
-        yield Message.Directory, self.get_metadata()
+        yield Message.Directory, data
 
         for post_id in util.advance(self.get_posts(), self.start_post):
             self.wait()
-            data = self.get_post_data(post_id)
-            url = data["file_url"]
-            yield Message.Url, url, text.nameext_from_url(url, data)
+            post = self.get_post_data(post_id)
+            url = post["file_url"]
+            post.update(data)
+            yield Message.Url, url, text.nameext_from_url(url, post)
 
     def skip(self, num):
         self.start_post += num
@@ -131,7 +133,8 @@ class SankakuExtractor(SharedConfigExtractor):
 class SankakuTagExtractor(SankakuExtractor):
     """Extractor for images from chan.sankakucomplex.com by search-tags"""
     subcategory = "tag"
-    directory_fmt = ["{category}", "{tags}"]
+    directory_fmt = ["{category}", "{search_tags}"]
+    archive_fmt = "t_{search_tags}_{id}"
     pattern = [r"(?:https?://)?chan\.sankakucomplex\.com/\?([^#]*)"]
     test = [
         ("https://chan.sankakucomplex.com/?tags=bonocho", {
@@ -188,7 +191,7 @@ class SankakuTagExtractor(SankakuExtractor):
             self.log.error("Unauthenticated users cannot use "
                            "more than 4 tags at once.")
             raise exception.StopExtraction()
-        return {"tags": " ".join(tags)}
+        return {"search_tags": " ".join(tags)}
 
     def get_posts(self):
         params = {"tags": self.tags}
@@ -216,6 +219,7 @@ class SankakuPoolExtractor(SankakuExtractor):
     """Extractor for image-pools  from chan.sankakucomplex.com"""
     subcategory = "pool"
     directory_fmt = ["{category}", "pool", "{pool}"]
+    archive_fmt = "p_{pool}_{id}"
     pattern = [r"(?:https?://)?chan\.sankakucomplex\.com/pool/show/(\d+)"]
     test = [("https://chan.sankakucomplex.com/pool/show/90", {
         "count": 5,
@@ -253,6 +257,7 @@ class SankakuPoolExtractor(SankakuExtractor):
 class SankakuPostExtractor(SankakuExtractor):
     """Extractor for single images from chan.sankakucomplex.com"""
     subcategory = "post"
+    archive_fmt = "{id}"
     pattern = [r"(?:https?://)?chan\.sankakucomplex\.com/post/show/(\d+)"]
     test = [("https://chan.sankakucomplex.com/post/show/360451", {
         "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 2752ec32..f8f65add 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -229,6 +229,7 @@ class TumblrLikesExtractor(TumblrExtractor):
     """Extractor for images from a tumblr-user by tag"""
     subcategory = "likes"
     directory_fmt = ["{category}", "{name}", "likes"]
+    archive_fmt = "f_{blog[name]}_{id}_{offset}"
     pattern = [BASE_PATTERN + r"/likes"]
     test = [("http://mikf123.tumblr.com/likes", {
         "count": 1,
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index f7d4f9e3..55c480ce 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.2.1-dev"
+__version__ = "1.3.0-dev"
diff --git a/test/test_extractors.py b/test/test_extractors.py
index 1f7239bb..80cd4f4e 100644
--- a/test/test_extractors.py
+++ b/test/test_extractors.py
@@ -18,6 +18,7 @@ SKIP = {
     "archivedmoe", "archiveofsins", "thebarchive",
 
     # temporary issues
+    "powermanga",
 }