|
|
@ -7,7 +7,6 @@
|
|
|
|
"""Extractors for Postmill instances"""
|
|
|
|
"""Extractors for Postmill instances"""
|
|
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
import re
|
|
|
|
import urllib.parse
|
|
|
|
|
|
|
|
from .common import BaseExtractor, Message
|
|
|
|
from .common import BaseExtractor, Message
|
|
|
|
from .. import text, exception
|
|
|
|
from .. import text, exception
|
|
|
|
|
|
|
|
|
|
|
@ -28,8 +27,8 @@ class PostmillExtractor(BaseExtractor):
|
|
|
|
|
|
|
|
|
|
|
|
def items(self):
|
|
|
|
def items(self):
|
|
|
|
for post_url in self.post_urls():
|
|
|
|
for post_url in self.post_urls():
|
|
|
|
response = self.request(post_url)
|
|
|
|
page = self.request(post_url).text
|
|
|
|
extr = text.extract_from(response.text)
|
|
|
|
extr = text.extract_from(page)
|
|
|
|
|
|
|
|
|
|
|
|
title = text.unescape(extr(
|
|
|
|
title = text.unescape(extr(
|
|
|
|
'<meta property="og:title" content="', '">'))
|
|
|
|
'<meta property="og:title" content="', '">'))
|
|
|
@ -52,7 +51,7 @@ class PostmillExtractor(BaseExtractor):
|
|
|
|
id = int(match.group(2))
|
|
|
|
id = int(match.group(2))
|
|
|
|
|
|
|
|
|
|
|
|
is_text_post = url.startswith("/")
|
|
|
|
is_text_post = url.startswith("/")
|
|
|
|
is_image_post = self._search_image_tag(response.text) is not None
|
|
|
|
is_image_post = self._search_image_tag(page) is not None
|
|
|
|
data = {
|
|
|
|
data = {
|
|
|
|
"title": title,
|
|
|
|
"title": title,
|
|
|
|
"date": date,
|
|
|
|
"date": date,
|
|
|
@ -60,7 +59,7 @@ class PostmillExtractor(BaseExtractor):
|
|
|
|
"forum": forum,
|
|
|
|
"forum": forum,
|
|
|
|
"id": id,
|
|
|
|
"id": id,
|
|
|
|
"flair": [text.unescape(i) for i in text.extract_iter(
|
|
|
|
"flair": [text.unescape(i) for i in text.extract_iter(
|
|
|
|
response.text, '<span class="flair__label">', '</span>')],
|
|
|
|
page, '<span class="flair__label">', '</span>')],
|
|
|
|
"instance": self.instance,
|
|
|
|
"instance": self.instance,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -90,32 +89,32 @@ class PostmillSubmissionsExtractor(PostmillExtractor):
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, match):
|
|
|
|
def __init__(self, match):
|
|
|
|
PostmillExtractor.__init__(self, match)
|
|
|
|
PostmillExtractor.__init__(self, match)
|
|
|
|
self.base = match.group(3)
|
|
|
|
groups = match.groups()
|
|
|
|
self.sorting_path = match.group(4) or ""
|
|
|
|
self.base = groups[-3]
|
|
|
|
|
|
|
|
self.sorting_path = groups[-2] or ""
|
|
|
|
self.query = {key: value for key, value in text.parse_query(
|
|
|
|
self.query = {key: value for key, value in text.parse_query(
|
|
|
|
match.group(5) or "").items() if self.acceptable_query(key)}
|
|
|
|
groups[-1]).items() if self.acceptable_query(key)}
|
|
|
|
|
|
|
|
|
|
|
|
def items(self):
|
|
|
|
def items(self):
|
|
|
|
url = self.root + self.base + self.sorting_path
|
|
|
|
url = self.root + self.base + self.sorting_path
|
|
|
|
if self.query:
|
|
|
|
|
|
|
|
url += "?" + urllib.parse.urlencode(self.query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
while url:
|
|
|
|
while url:
|
|
|
|
response = self.request(url)
|
|
|
|
response = self.request(url, params=self.query)
|
|
|
|
if response.history:
|
|
|
|
if response.history:
|
|
|
|
redirect_url = response.url
|
|
|
|
redirect_url = response.url
|
|
|
|
if redirect_url == self.root + "/login":
|
|
|
|
if redirect_url == self.root + "/login":
|
|
|
|
raise exception.StopExtraction(
|
|
|
|
raise exception.StopExtraction(
|
|
|
|
"HTTP redirect to login page (%s)", redirect_url)
|
|
|
|
"HTTP redirect to login page (%s)", redirect_url)
|
|
|
|
|
|
|
|
page = response.text
|
|
|
|
|
|
|
|
|
|
|
|
for nav in text.extract_iter(response.text,
|
|
|
|
for nav in text.extract_iter(page,
|
|
|
|
'<nav class="submission__nav">',
|
|
|
|
'<nav class="submission__nav">',
|
|
|
|
'</nav>'):
|
|
|
|
'</nav>'):
|
|
|
|
post_url = text.unescape(text.extr(nav, '<a href="', '"'))
|
|
|
|
post_url = text.unescape(text.extr(nav, '<a href="', '"'))
|
|
|
|
yield Message.Queue, text.urljoin(url, post_url), \
|
|
|
|
yield Message.Queue, text.urljoin(url, post_url), \
|
|
|
|
{"_extractor": PostmillPostExtractor}
|
|
|
|
{"_extractor": PostmillPostExtractor}
|
|
|
|
|
|
|
|
|
|
|
|
url = text.unescape(text.extr(response.text,
|
|
|
|
url = text.unescape(text.extr(page,
|
|
|
|
'<link rel="next" href="', '">'))
|
|
|
|
'<link rel="next" href="', '">'))
|
|
|
|
|
|
|
|
|
|
|
|
def acceptable_query(self, key):
|
|
|
|
def acceptable_query(self, key):
|
|
|
@ -131,14 +130,15 @@ BASE_PATTERN = PostmillExtractor.update({
|
|
|
|
r"\.onion)"),
|
|
|
|
r"\.onion)"),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
})
|
|
|
|
})
|
|
|
|
SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?"
|
|
|
|
QUERY_RE = r"(?:\?([^#]+))?$"
|
|
|
|
QUERY_RE = r"(?:\?([^#]+))?"
|
|
|
|
SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" + \
|
|
|
|
|
|
|
|
QUERY_RE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PostmillPostExtractor(PostmillExtractor):
|
|
|
|
class PostmillPostExtractor(PostmillExtractor):
|
|
|
|
"""Extractor for a single submission URL"""
|
|
|
|
"""Extractor for a single submission URL"""
|
|
|
|
subcategory = "post"
|
|
|
|
subcategory = "post"
|
|
|
|
pattern = BASE_PATTERN + r"/f/([\w\d_]+)/(\d+)(?:/.+)?$"
|
|
|
|
pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)"
|
|
|
|
example = "https://raddle.me/f/FORUM/123/TITLE"
|
|
|
|
example = "https://raddle.me/f/FORUM/123/TITLE"
|
|
|
|
|
|
|
|
|
|
|
|
def __init__(self, match):
|
|
|
|
def __init__(self, match):
|
|
|
@ -170,29 +170,28 @@ class PostmillShortURLExtractor(PostmillExtractor):
|
|
|
|
class PostmillHomeExtractor(PostmillSubmissionsExtractor):
|
|
|
|
class PostmillHomeExtractor(PostmillSubmissionsExtractor):
|
|
|
|
"""Extractor for the home page"""
|
|
|
|
"""Extractor for the home page"""
|
|
|
|
subcategory = "home"
|
|
|
|
subcategory = "home"
|
|
|
|
pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE \
|
|
|
|
pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE
|
|
|
|
+ QUERY_RE + "$"
|
|
|
|
|
|
|
|
example = "https://raddle.me/"
|
|
|
|
example = "https://raddle.me/"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PostmillForumExtractor(PostmillSubmissionsExtractor):
|
|
|
|
class PostmillForumExtractor(PostmillSubmissionsExtractor):
|
|
|
|
"""Extractor for submissions on a forum"""
|
|
|
|
"""Extractor for submissions on a forum"""
|
|
|
|
subcategory = "forum"
|
|
|
|
subcategory = "forum"
|
|
|
|
pattern = BASE_PATTERN + r"(/f/[\w\d_]+)" + SORTING_RE + QUERY_RE + "$"
|
|
|
|
pattern = BASE_PATTERN + r"(/f/\w+)" + SORTING_RE
|
|
|
|
example = "https://raddle.me/f/FORUM"
|
|
|
|
example = "https://raddle.me/f/FORUM"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
|
|
|
|
class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
|
|
|
|
"""Extractor for submissions made by a user"""
|
|
|
|
"""Extractor for submissions made by a user"""
|
|
|
|
subcategory = "usersubmissions"
|
|
|
|
subcategory = "usersubmissions"
|
|
|
|
pattern = BASE_PATTERN + r"(/user/[\w\d_]+/submissions)()" + QUERY_RE + "$"
|
|
|
|
pattern = BASE_PATTERN + r"(/user/\w+/submissions)()" + QUERY_RE
|
|
|
|
example = "https://raddle.me/user/USER/submissions"
|
|
|
|
example = "https://raddle.me/user/USER/submissions"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PostmillTagExtractor(PostmillSubmissionsExtractor):
|
|
|
|
class PostmillTagExtractor(PostmillSubmissionsExtractor):
|
|
|
|
"""Extractor for submissions on a forum with a specific tag"""
|
|
|
|
"""Extractor for submissions on a forum with a specific tag"""
|
|
|
|
subcategory = "tag"
|
|
|
|
subcategory = "tag"
|
|
|
|
pattern = BASE_PATTERN + r"(/tag/[\w\d_]+)" + SORTING_RE + QUERY_RE + "$"
|
|
|
|
pattern = BASE_PATTERN + r"(/tag/\w+)" + SORTING_RE
|
|
|
|
example = "https://raddle.me/tag/TAG"
|
|
|
|
example = "https://raddle.me/tag/TAG"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|