[blogger] add blog and post extractors (closes #364)

pull/465/head v1.11.0-dev.1
Mike Fährmann 5 years ago
parent 244d396b0b
commit 109718a5e3
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -19,6 +19,7 @@ Archive of Sins https://archiveofsins.com/ Threads
Archived.Moe https://archived.moe/ Threads Archived.Moe https://archived.moe/ Threads
ArtStation https://www.artstation.com/ |artstation-C| ArtStation https://www.artstation.com/ |artstation-C|
Behance https://www.behance.net/ Collections, Galleries, User Profiles Behance https://www.behance.net/ Collections, Galleries, User Profiles
Blogger https://www.blogger.com/ Blogs, Posts
BobX http://www.bobx.com/dark/ Galleries, Idols BobX http://www.bobx.com/dark/ Galleries, Idols
Danbooru https://danbooru.donmai.us/ Pools, Popular Images, Posts, Tag-Searches Optional Danbooru https://danbooru.donmai.us/ Pools, Popular Images, Posts, Tag-Searches Optional
Desuarchive https://desuarchive.org/ Threads Desuarchive https://desuarchive.org/ Threads

@ -20,6 +20,7 @@ modules = [
"adultempire", "adultempire",
"artstation", "artstation",
"behance", "behance",
"blogger",
"bobx", "bobx",
"danbooru", "danbooru",
"deviantart", "deviantart",

@ -0,0 +1,178 @@
# -*- coding: utf-8 -*-
# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for Blogger blogs"""
from .common import Extractor, Message
from .. import text
import re
# URL prefix shared by the extractor patterns in this module.
# Exactly one of two capture groups is filled on a match:
#   group 1 - host name written after an explicit "blogger:" prefix
#   group 2 - a "<name>.blogspot.com" host name
BASE_PATTERN = (
    r"(?:"
    r"blogger:(?:https?://)?([^/]+)"
    r"|"
    r"(?:https?://)?([^.]+\.blogspot\.com)"
    r")"
)
class BloggerExtractor(Extractor):
    """Shared implementation for the Blogger extractors

    Subclasses decide which posts are processed by overriding posts();
    items() converts those posts into directory and URL messages.
    """
    category = "blogger"
    root = "https://www.blogger.com"
    directory_fmt = ("{category}", "{blog[name]}",
                     "{post[date]:%Y-%m-%d} {post[title]}")
    filename_fmt = "{num:>03}.{extension}"
    archive_fmt = "{post[id]}_{num}"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # Whichever of the first two pattern groups matched is the blog host
        self.blog = match.group(1) or match.group(2)
        self.api = BloggerAPI(self)

    def items(self):
        """Yield version, directory and URL messages for every image

        Posts whose content contains no matching image URL are skipped.
        """
        yield Message.Version, 1

        blog_info = self.api.blog_by_url("http://" + self.blog)
        # Collapse the nested counter objects to their 'totalItems' value
        for counter in ("pages", "posts"):
            blog_info[counter] = blog_info[counter]["totalItems"]
        blog_info["date"] = text.parse_datetime(blog_info["published"])
        del blog_info["selfLink"]

        find_images = re.compile(
            r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)"').findall
        replace_size = re.compile(r"/s\d+/").sub

        for entry in self.posts(blog_info):
            # Search the raw content before it is rewritten below
            image_urls = find_images(entry["content"])
            if not image_urls:
                continue

            entry["author"] = entry["author"]["displayName"]
            entry["replies"] = entry["replies"]["totalItems"]
            entry["content"] = text.remove_html(entry["content"])
            entry["date"] = text.parse_datetime(entry["published"])
            del entry["selfLink"]
            del entry["blog"]

            yield Message.Directory, {"blog": blog_info, "post": entry}

            for num, url in enumerate(image_urls, 1):
                # Swap every "/s<digits>/" segment for "/s0/"
                # (presumably selects the full-size image - TODO confirm)
                url = replace_size("/s0/", url)
                # Upgrade only the leading scheme to HTTPS
                url = url.replace("http:", "https:", 1)
                # A fresh metadata dict is built for each file
                metadata = {
                    "blog": blog_info,
                    "post": entry,
                    "url" : url,
                    "num" : num,
                }
                yield Message.Url, url, text.nameext_from_url(url, metadata)

    def posts(self, blog):
        """Return an iterable with all relevant post objects"""
class BloggerPostExtractor(BloggerExtractor):
    """Extractor for one individual blog post"""
    subcategory = "post"
    # Group 3 captures the "/YYYY/MM/<name>.html" path of the post
    pattern = BASE_PATTERN + r"(/\d{4}/\d\d/[^/?&#]+\.html)"
    test = (
        ("https://julianbphotography.blogspot.com/2010/12/moon-rise.html", {
            "url": "9928429fb62f712eb4de80f53625eccecc614aae",
            "pattern": r"https://3.bp.blogspot.com/.*/s0/Icy-Moonrise-.*.jpg",
            "keyword": {
                "blog": {
                    "date": "type:datetime",
                    "description": "",
                    "id": "5623928067739466034",
                    "kind": "blogger#blog",
                    "locale": dict,
                    "name": "Julian Bunker Photography",
                    "pages": int,
                    "posts": int,
                    "published": "2010-11-21T10:19:42-08:00",
                    "updated": str,
                    "url": "http://www.julianbunker.com/",
                },
                "post": {
                    "author": "Julian Bunker",
                    "content": str,
                    "date": "type:datetime",
                    "etag": str,
                    "id": "6955139236418998998",
                    "kind": "blogger#post",
                    "published": "2010-12-25T17:08:00-08:00",
                    "replies": "0",
                    "title": "Moon Rise",
                    "updated": "2011-12-06T05:21:24-08:00",
                    "url": "re:.+/2010/12/moon-rise.html$",
                },
                "num": int,
                "url": str,
            },
        }),
        ("blogger:http://www.julianbunker.com/2010/12/moon-rise.html", {
            "url": "9928429fb62f712eb4de80f53625eccecc614aae",
        }),
    )

    def __init__(self, match):
        BloggerExtractor.__init__(self, match)
        # Path of the post as captured by the third pattern group
        self.path = match.group(3)

    def posts(self, blog):
        """Return a 1-tuple holding the post found at the stored path"""
        post = self.api.post_by_path(blog["id"], self.path)
        return (post,)
class BloggerBlogExtractor(BloggerExtractor):
    """Extractor for all posts of a Blogger blog"""
    subcategory = "blog"
    # Only the blog root matches: host plus an optional trailing slash
    pattern = BASE_PATTERN + "/?$"
    test = (
        ("https://julianbphotography.blogspot.com/", {
            "range": "1-25",
            "count": 25,
            "pattern": r"https://\d\.bp\.blogspot\.com/.*/s0/[^.]+\.jpg",
        }),
        ("blogger:http://www.julianbunker.com/", {
            "range": "1-25",
            "count": 25,
        }),
    )

    def posts(self, blog):
        """Return an iterable over the posts of the blog with blog['id']"""
        blog_id = blog["id"]
        return self.api.blog_posts(blog_id)
class BloggerAPI():
    """Minimal interface for the Blogger v3 API

    Ref: https://developers.google.com/blogger
    """
    # Passed to extractor.config() as the fallback for the "api-key" option
    API_KEY = "AIzaSyCN9ax34oMMyM07g_M-5pjeDp_312eITK8"

    def __init__(self, extractor):
        """Bind the API to 'extractor', which performs the HTTP requests"""
        self.extractor = extractor
        self.api_key = extractor.config("api-key", self.API_KEY)

    def blog_by_url(self, url):
        """Return the decoded 'blogs/byurl' response for 'url'"""
        return self._call("blogs/byurl", {"url": url})

    def blog_posts(self, blog_id):
        """Return a generator over all posts of the blog 'blog_id'"""
        return self._pagination("blogs/{}/posts".format(blog_id), {})

    def post_by_path(self, blog_id, path):
        """Return the decoded 'posts/bypath' response for 'path'"""
        endpoint = "blogs/{}/posts/bypath".format(blog_id)
        return self._call(endpoint, {"path": path})

    def _call(self, endpoint, params):
        """Request 'endpoint' and return the JSON-decoded response

        The API key is stored in 'params' itself, so the dict passed
        in by the caller is modified.
        """
        url = "https://www.googleapis.com/blogger/v3/" + endpoint
        params["key"] = self.api_key
        return self.extractor.request(url, params=params).json()

    def _pagination(self, endpoint, params):
        """Yield the entries of every result page of 'endpoint'

        Pages are requested until a response carries no 'nextPageToken'.
        """
        while True:
            data = self._call(endpoint, params)
            # A response may lack "items" altogether (for example a blog
            # without posts); treat that as an empty page instead of
            # failing with a KeyError.
            if "items" in data:
                yield from data["items"]
            if "nextPageToken" not in data:
                return
            # Reuse the same dict so the next _call() requests the next page
            params["pageToken"] = data["nextPageToken"]
Loading…
Cancel
Save