# -*- coding: utf-8 -*-

# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

# NOTE: recovered from a whitespace-collapsed git patch that creates
# gallery_dl/extractor/blogger.py.  The same patch also
#   - adds the row "Blogger  https://www.blogger.com/  Blogs, Posts"
#     to docs/supportedsites.rst (between "Behance" and "BobX"), and
#   - adds "blogger" to the `modules` list in
#     gallery_dl/extractor/__init__.py (between "behance" and "bobx").

"""Extractors for Blogger blogs"""

from .common import Extractor, Message
from .. import text
import re

# Two alternatives, each capturing the blog's host name:
#   group 1: any host given with an explicit "blogger:" prefix
#   group 2: a "<name>.blogspot.com" host
BASE_PATTERN = (
    r"(?:blogger:(?:https?://)?([^/]+)|"
    r"(?:https?://)?([^.]+\.blogspot\.com))")


class BloggerExtractor(Extractor):
    """Base class for blogger extractors

    Subclasses implement posts() to select which posts are processed;
    items() turns those posts into Directory/Url messages.
    """
    category = "blogger"
    directory_fmt = ("{category}", "{blog[name]}",
                     "{post[date]:%Y-%m-%d} {post[title]}")
    filename_fmt = "{num:>03}.{extension}"
    archive_fmt = "{post[id]}_{num}"
    root = "https://www.blogger.com"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # Exactly one of the two BASE_PATTERN host groups is non-empty.
        self.blog = match.group(1) or match.group(2)
        self.api = BloggerAPI(self)

    def items(self):
        """Yield a Version message, then per post one Directory message
        followed by one Url message for every image found in that post.

        Posts whose HTML content contains no matching image URL are skipped.
        """
        yield Message.Version, 1

        blog = self.api.blog_by_url("http://" + self.blog)
        # Flatten nested API objects into plain values for use as metadata.
        blog["pages"] = blog["pages"]["totalItems"]
        blog["posts"] = blog["posts"]["totalItems"]
        blog["date"] = text.parse_datetime(blog["published"])
        del blog["selfLink"]

        # Compile once, outside the per-post loop.
        sub = re.compile(r"/s\d+/").sub
        findall = re.compile(
            r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)"').findall

        for post in self.posts(blog):
            # Image URLs must be collected from the raw HTML, i.e. before
            # post["content"] is replaced by its tag-stripped version below.
            images = findall(post["content"])
            if not images:
                continue

            post["author"] = post["author"]["displayName"]
            post["replies"] = post["replies"]["totalItems"]
            post["content"] = text.remove_html(post["content"])
            post["date"] = text.parse_datetime(post["published"])
            del post["selfLink"]
            del post["blog"]

            yield Message.Directory, {"blog": blog, "post": post}
            for num, url in enumerate(images, 1):
                # Replace the "/s<digits>/" path segment with "/s0/"
                # (presumably selects the original-size image -- TODO
                # confirm) and force the https scheme.
                url = sub("/s0/", url).replace("http:", "https:", 1)
                yield Message.Url, url, text.nameext_from_url(url, {
                    "blog": blog,
                    "post": post,
                    "url" : url,
                    "num" : num,
                })

    def posts(self, blog):
        """Return an iterable with all relevant post objects"""


class BloggerPostExtractor(BloggerExtractor):
    """Extractor for a single blog post"""
    subcategory = "post"
    # Group 3 captures the post path: /YYYY/MM/<slug>.html
    pattern = BASE_PATTERN + r"(/\d{4}/\d\d/[^/?&#]+\.html)"
    test = (
        ("https://julianbphotography.blogspot.com/2010/12/moon-rise.html", {
            "url": "9928429fb62f712eb4de80f53625eccecc614aae",
            "pattern": r"https://3.bp.blogspot.com/.*/s0/Icy-Moonrise-.*.jpg",
            "keyword": {
                "blog": {
                    "date" : "type:datetime",
                    "description": "",
                    "id" : "5623928067739466034",
                    "kind" : "blogger#blog",
                    "locale" : dict,
                    "name" : "Julian Bunker Photography",
                    "pages" : int,
                    "posts" : int,
                    "published" : "2010-11-21T10:19:42-08:00",
                    "updated" : str,
                    "url" : "http://www.julianbunker.com/",
                },
                "post": {
                    "author" : "Julian Bunker",
                    "content" : str,
                    "date" : "type:datetime",
                    "etag" : str,
                    "id" : "6955139236418998998",
                    "kind" : "blogger#post",
                    "published" : "2010-12-25T17:08:00-08:00",
                    "replies" : "0",
                    "title" : "Moon Rise",
                    "updated" : "2011-12-06T05:21:24-08:00",
                    "url" : "re:.+/2010/12/moon-rise.html$",
                },
                "num": int,
                "url": str,
            },
        }),
        ("blogger:http://www.julianbunker.com/2010/12/moon-rise.html", {
            "url": "9928429fb62f712eb4de80f53625eccecc614aae",
        }),
    )

    def __init__(self, match):
        BloggerExtractor.__init__(self, match)
        self.path = match.group(3)

    def posts(self, blog):
        """Return a 1-tuple holding the post found at self.path"""
        return (self.api.post_by_path(blog["id"], self.path),)


class BloggerBlogExtractor(BloggerExtractor):
    """Extractor for an entire Blogger blog"""
    subcategory = "blog"
    # Only the bare host, optionally followed by a single slash.
    pattern = BASE_PATTERN + "/?$"
    test = (
        ("https://julianbphotography.blogspot.com/", {
            "range": "1-25",
            "count": 25,
            "pattern": r"https://\d\.bp\.blogspot\.com/.*/s0/[^.]+\.jpg",
        }),
        ("blogger:http://www.julianbunker.com/", {
            "range": "1-25",
            "count": 25,
        }),
    )

    def posts(self, blog):
        """Return a generator over every post of the given blog"""
        return self.api.blog_posts(blog["id"])


class BloggerAPI:
    """Minimal interface for the Blogger v3 API

    Ref: https://developers.google.com/blogger
    """
    # Built-in key; passed as the fallback when reading the "api-key"
    # config option in __init__.
    API_KEY = "AIzaSyCN9ax34oMMyM07g_M-5pjeDp_312eITK8"

    def __init__(self, extractor):
        # The extractor supplies both configuration lookup and HTTP requests.
        self.extractor = extractor
        self.api_key = extractor.config("api-key", self.API_KEY)

    def blog_by_url(self, url):
        """Return the decoded 'blogs/byurl' response for the given URL"""
        return self._call("blogs/byurl", {"url": url})

    def blog_posts(self, blog_id):
        """Return a generator yielding all posts of a blog, page by page"""
        return self._pagination("blogs/{}/posts".format(blog_id), {})

    def post_by_path(self, blog_id, path):
        """Return the decoded response for the post at 'path' in a blog"""
        endpoint = "blogs/{}/posts/bypath".format(blog_id)
        return self._call(endpoint, {"path": path})

    def _call(self, endpoint, params):
        """Send a request to an API endpoint and return the decoded JSON

        The API key is added to a copy of 'params', so the dict passed in
        by the caller is left untouched.
        """
        url = "https://www.googleapis.com/blogger/v3/" + endpoint
        query = dict(params, key=self.api_key)
        return self.extractor.request(url, params=query).json()

    def _pagination(self, endpoint, params):
        """Yield the entries of every result page of a list endpoint

        Follows "nextPageToken" until a response no longer contains one.
        """
        while True:
            data = self._call(endpoint, params)
            # A page without an "items" key (e.g. a blog with no posts)
            # contributes nothing instead of raising KeyError.
            yield from data.get("items", ())

            if "nextPageToken" not in data:
                return
            params["pageToken"] = data["nextPageToken"]