From 34bab080ae60980602aae1bb2ad901bab1b08e1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 8 Feb 2019 12:03:10 +0100 Subject: [PATCH] rewrite URL patterns to use only 1 per extractor --- gallery_dl/extractor/artstation.py | 18 ++++++++---------- gallery_dl/extractor/e621.py | 6 ++---- gallery_dl/extractor/flickr.py | 9 +++++---- gallery_dl/extractor/imagefap.py | 14 ++++---------- gallery_dl/extractor/imagehosts.py | 5 ++--- gallery_dl/extractor/imgur.py | 5 ++--- gallery_dl/extractor/mangapanda.py | 6 +----- gallery_dl/extractor/mangareader.py | 6 +----- gallery_dl/extractor/pixiv.py | 23 ++++++++++------------- gallery_dl/extractor/seiga.py | 7 +++---- 10 files changed, 38 insertions(+), 61 deletions(-) diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index eee42d07..2018dbc0 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -24,7 +24,7 @@ class ArtstationExtractor(Extractor): def __init__(self, match=None): Extractor.__init__(self) - self.user = match.group(1) if match else None + self.user = match.group(1) or match.group(2) if match else None self.external = self.config("external", False) def items(self): @@ -123,10 +123,9 @@ class ArtstationExtractor(Extractor): class ArtstationUserExtractor(ArtstationExtractor): """Extractor for all projects of an artstation user""" subcategory = "user" - pattern = [r"(?:https?://)?(?:www\.)?artstation\.com" - r"/(?!artwork|projects|search)([^/?&#]+)(?:/albums/all)?/?$", - r"(?:https?://)?((?!www)\w+)\.artstation\.com" - r"(?:/(?:projects/?)?)?$"] + pattern = [r"(?:https?://)?(?:(?:www\.)?artstation\.com" + r"/(?!artwork|projects|search)([^/?&#]+)(?:/albums/all)?" 
+ r"|((?!www)\w+)\.artstation\.com(?:/projects)?)/?$"] test = [ ("https://www.artstation.com/gaerikim/", { "pattern": r"https://\w+\.artstation\.com/p/assets" @@ -149,10 +148,9 @@ class ArtstationAlbumExtractor(ArtstationExtractor): directory_fmt = ["{category}", "{userinfo[username]}", "Albums", "{album[id]} - {album[title]}"] archive_fmt = "a_{album[id]}_{asset[id]}" - pattern = [r"(?:https?://)?(?:www\.)?artstation\.com" - r"/(?!artwork|projects|search)([^/?&#]+)/albums/(\d+)", - r"(?:https?://)?((?!www)\w+)\.artstation\.com" - r"/albums/(\d+)"] + pattern = [r"(?:https?://)?(?:(?:www\.)?artstation\.com" + r"/(?!artwork|projects|search)([^/?&#]+)" + r"|((?!www)\w+)\.artstation\.com)/albums/(\d+)"] test = [ ("https://www.artstation.com/huimeiye/albums/770899", { "count": 2, @@ -165,7 +163,7 @@ class ArtstationAlbumExtractor(ArtstationExtractor): def __init__(self, match): ArtstationExtractor.__init__(self, match) - self.album_id = text.parse_int(match.group(2)) + self.album_id = text.parse_int(match.group(3)) def metadata(self): userinfo = self.get_user_info(self.user) diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index 726f6c36..71bd1584 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -21,10 +21,8 @@ class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor): class E621TagExtractor(booru.TagMixin, E621Extractor): """Extractor for images from e621.net based on search-tags""" - pattern = [ - r"(?:https?://)?(?:www\.)?e621\.net/post/index/\d+/(?P<tags>[^/?&#]+)", - r"(?:https?://)?(?:www\.)?e621\.net/post\?tags=(?P<tags>[^&#]+)", - ] + pattern = [r"(?:https?://)?(?:www\.)?e621\.net/post" + r"(?:/index/\d+/|\?tags=)(?P<tags>[^/?&#]+)"] test = [ ("https://e621.net/post/index/1/anry", { "url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba", diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index 9de50c74..58175290 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ 
-45,9 +45,10 @@ class FlickrImageExtractor(FlickrExtractor): """Extractor for individual images from flickr.com""" subcategory = "image" archive_fmt = "{id}" - pattern = [r"(?:https?://)?(?:www\.|m\.)?flickr\.com/photos/[^/]+/(\d+)", - r"(?:https?://)?[^.]+\.static\.?flickr\.com/(?:\d+/)+(\d+)_", - r"(?:https?://)?flic\.kr/(p)/([A-Za-z1-9]+)"] + pattern = [r"(?:https?://)?(?:" + r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/" + r"|[^.]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)" + r"|flic\.kr/p/([A-Za-z1-9]+))"] test = [ ("https://www.flickr.com/photos/departingyyz/16089302239", { "url": "7f0887f5953f61c8b79a695cb102ea309c0346b0", @@ -66,7 +67,7 @@ class FlickrImageExtractor(FlickrExtractor): def __init__(self, match): FlickrExtractor.__init__(self, match) - if self.item_id == "p": + if not self.item_id: alphabet = ("123456789abcdefghijkmnopqrstu" "vwxyzABCDEFGHJKLMNPQRSTUVWXYZ") self.item_id = util.bdecode(match.group(2), alphabet) diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 9e3b8997..f4fec0f9 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -133,10 +133,9 @@ class ImagefapUserExtractor(ImagefapExtractor): """Extractor for all galleries from a user at imagefap.com""" subcategory = "user" categorytransfer = True - pattern = [(r"(?:https?://)?(?:www\.)?imagefap\.com" - r"/profile(?:\.php\?user=|/)([^/?&#]+)"), - (r"(?:https?://)?(?:www\.)?imagefap\.com" - r"/usergallery\.php\?userid=(\d+)")] + pattern = [r"(?:https?://)?(?:www\.)?imagefap\.com/" + r"(?:profile(?:\.php\?user=|/)([^/?&#]+)" + r"|usergallery\.php\?userid=(\d+))"] test = [ ("https://www.imagefap.com/profile/LucyRae/galleries", { "url": "d941aa906f56a75972a7a5283030eb9a8d27a4fd", @@ -149,12 +148,7 @@ class ImagefapUserExtractor(ImagefapExtractor): def __init__(self, match): ImagefapExtractor.__init__(self) - try: - self.user_id = int(match.group(1)) - self.user = None - except ValueError: - self.user_id = None - self.user = 
match.group(1) + self.user, self.user_id = match.groups() def items(self): yield Message.Version, 1 diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 8efd8b0b..0a9d9b45 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -70,9 +70,8 @@ class ImagehostImageExtractor(SharedConfigMixin, Extractor): class ImxtoImageExtractor(ImagehostImageExtractor): """Extractor for single images from imx.to""" category = "imxto" - pattern = [r"(?:https?://)?(?:www\.)?(imx\.to/i/(\w+))", - r"(?:https?://)?(?:www\.)?((?:imx\.to|img\.yt)" - r"/img-([a-z0-9]+)\.html)"] + pattern = [r"(?:https?://)?(?:www\.)?((?:imx\.to|img\.yt)" + r"/(?:i/|img-)(\w+)(\.html)?)"] test = ( ("https://imx.to/i/1qdeva", { # new-style URL "url": "ab2173088a6cdef631d7a47dec4a5da1c6a00130", diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 8e77713b..184c8606 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -53,9 +53,8 @@ class ImgurImageExtractor(ImgurExtractor): subcategory = "image" filename_fmt = "{category}_{hash}{title:?_//}.{extension}" archive_fmt = "{hash}" - pattern = [(r"(?:https?://)?(?:www\.|m\.)?imgur\.com" - r"/(?!gallery)(\w{7}|\w{5})"), - (r"(?:https?://)?i\.imgur\.com/(\w{7}|\w{5})[sbtmlh]?\.")] + pattern = [r"(?:https?://)?(?:www\.|[im]\.|)?imgur\.com" + r"/(?!gallery)(\w{7}|\w{5})[sbtmlh]?\.?"] test = [ ("https://imgur.com/21yMxCS", { "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2", diff --git a/gallery_dl/extractor/mangapanda.py b/gallery_dl/extractor/mangapanda.py index 426fc8ff..ed621b96 100644 --- a/gallery_dl/extractor/mangapanda.py +++ b/gallery_dl/extractor/mangapanda.py @@ -28,11 +28,7 @@ class MangapandaMangaExtractor(MangapandaBase, MangareaderMangaExtractor): class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor): """Extractor for manga-chapters from mangapanda.com""" - pattern = [ - 
(r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?&#]+)/(\d+))"), - (r"(?:https?://)?(?:www\.)?mangapanda\.com" - r"(/\d+-\d+-\d+(/[^/]+)/chapter-(\d+)\.html)"), - ] + pattern = [r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?&#]+)/(\d+))"] test = [("https://www.mangapanda.com/red-storm/2", { "url": "1f633f776e950531ba9b1e81965316458e785261", "keyword": "32b5e84017c2bf5f122b339ecf40899e41f18cc9", diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py index 3aabb153..ef074ecb 100644 --- a/gallery_dl/extractor/mangareader.py +++ b/gallery_dl/extractor/mangareader.py @@ -60,11 +60,7 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): """Extractor for manga-chapters from mangareader.net""" archive_fmt = "{manga}_{chapter}_{page}" - pattern = [ - (r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"), - (r"(?:https?://)?(?:www\.)?mangareader\.net" - r"(/\d+-\d+-\d+(/[^/]+)/chapter-(\d+)\.html)"), - ] + pattern = [r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"] test = [(("https://www.mangareader.net/" "karate-shoukoushi-kohinata-minoru/11"), { "url": "061cc92a07edf17bb991ce0821fa4c77a147a860", diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index c7d0eacd..fa84ff37 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -82,10 +82,9 @@ class PixivExtractor(Extractor): class PixivUserExtractor(PixivExtractor): """Extractor for works of a pixiv-user""" subcategory = "user" - pattern = [(r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/member(?:_illust)?\.php\?id=(\d+)(?:&([^#]+))?"), - (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/(?:u(?:ser)?/|(?:mypage\.php)?#id=)(\d+)()")] + pattern = [r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/" + r"(?:member(?:_illust)?\.php\?id=(\d+)(?:&([^#]+))?" 
+ r"|(?:u(?:ser)?/|(?:mypage\.php)?#id=)(\d+))"] test = [ ("http://www.pixiv.net/member_illust.php?id=173530", { "url": "852c31ad83b6840bacbce824d85f2a997889efb7", @@ -107,7 +106,7 @@ class PixivUserExtractor(PixivExtractor): def __init__(self, match): PixivExtractor.__init__(self) - self.user_id = match.group(1) + self.user_id = match.group(1) or match.group(3) self.query = text.parse_query(match.group(2)) def works(self): @@ -153,13 +152,11 @@ class PixivMeExtractor(PixivExtractor): class PixivWorkExtractor(PixivExtractor): """Extractor for a single pixiv work/illustration""" subcategory = "work" - pattern = [(r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/member(?:_illust)?\.php\?(?:[^&]+&)*illust_id=(\d+)"), - (r"(?:https?://)?i(?:\d+\.pixiv|\.pximg)\.net" - r"/(?:(?:.*/)?img-[^/]+/img/\d{4}(?:/\d\d){5}" - r"|img\d+/img/[^/]+)/(\d+)"), - (r"(?:https?://)?img\d*\.pixiv\.net/img/[^/]+/(\d+)"), - (r"(?:https?://)?(?:www\.)?pixiv\.net/i/(\d+)")] + pattern = [r"(?:https?://)?(?:(?:www\.|touch\.)?pixiv\.net" + r"/member(?:_illust)?\.php\?(?:[^&]+&)*illust_id=(\d+)" + r"|(?:i(?:\d+\.pixiv|\.pximg)\.net" + r"/(?:(?:.*/)?img-[^/]+/img/\d{4}(?:/\d\d){5}|img\d+/img/[^/]+)" + r"|img\d*\.pixiv\.net/img/[^/]+|(?:www\.)?pixiv\.net/i)/(\d+))"] test = [ (("http://www.pixiv.net/member_illust.php" "?mode=medium&illust_id=966412"), { @@ -187,7 +184,7 @@ class PixivWorkExtractor(PixivExtractor): def __init__(self, match): PixivExtractor.__init__(self) - self.illust_id = match.group(1) + self.illust_id = match.group(1) or match.group(2) self.load_ugoira = True self.work = None diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py index 8a913b42..28695324 100644 --- a/gallery_dl/extractor/seiga.py +++ b/gallery_dl/extractor/seiga.py @@ -166,10 +166,9 @@ class SeigaImageExtractor(SeigaExtractor): """Extractor for single images from seiga.nicovideo.jp""" subcategory = "image" filename_fmt = "{category}_{image_id}.{extension}" - pattern = 
[(r"(?:https?://)?(?:www\.|seiga\.)?nicovideo\.jp/" - r"(?:seiga/im|image/source/)(\d+)"), - (r"(?:https?://)?lohas\.nicoseiga\.jp/" - r"(?:priv|o)/[^/]+/\d+/(\d+)")] + pattern = [r"(?:https?://)?(?:" + r"(?:www\.|seiga\.)?nicovideo\.jp/(?:seiga/im|image/source/)" + r"|lohas\.nicoseiga\.jp/(?:priv|o)/[^/]+/\d+/)(\d+)"] test = [ ("http://seiga.nicovideo.jp/seiga/im5977527", { "keyword": "f66ba5de33d4ce2cb57f23bb37e1e847e0771c10",