[postmill] implement suggestions

pull/4919/head
blankie 9 months ago
parent fbe14a2745
commit 8a42ea736a
No known key found for this signature in database
GPG Key ID: CC15FC822C7F61F5

@ -2735,7 +2735,7 @@ Description
extractor.[postmill].save-link-post-body extractor.[postmill].save-link-post-body
------------------------ ----------------------------------------
Type Type
``bool`` ``bool``
Default Default

@ -7,7 +7,6 @@
"""Extractors for Postmill instances""" """Extractors for Postmill instances"""
import re import re
import urllib.parse
from .common import BaseExtractor, Message from .common import BaseExtractor, Message
from .. import text, exception from .. import text, exception
@ -28,8 +27,8 @@ class PostmillExtractor(BaseExtractor):
def items(self): def items(self):
for post_url in self.post_urls(): for post_url in self.post_urls():
response = self.request(post_url) page = self.request(post_url).text
extr = text.extract_from(response.text) extr = text.extract_from(page)
title = text.unescape(extr( title = text.unescape(extr(
'<meta property="og:title" content="', '">')) '<meta property="og:title" content="', '">'))
@ -52,7 +51,7 @@ class PostmillExtractor(BaseExtractor):
id = int(match.group(2)) id = int(match.group(2))
is_text_post = url.startswith("/") is_text_post = url.startswith("/")
is_image_post = self._search_image_tag(response.text) is not None is_image_post = self._search_image_tag(page) is not None
data = { data = {
"title": title, "title": title,
"date": date, "date": date,
@ -60,7 +59,7 @@ class PostmillExtractor(BaseExtractor):
"forum": forum, "forum": forum,
"id": id, "id": id,
"flair": [text.unescape(i) for i in text.extract_iter( "flair": [text.unescape(i) for i in text.extract_iter(
response.text, '<span class="flair__label">', '</span>')], page, '<span class="flair__label">', '</span>')],
"instance": self.instance, "instance": self.instance,
} }
@ -90,32 +89,32 @@ class PostmillSubmissionsExtractor(PostmillExtractor):
def __init__(self, match): def __init__(self, match):
PostmillExtractor.__init__(self, match) PostmillExtractor.__init__(self, match)
self.base = match.group(3) groups = match.groups()
self.sorting_path = match.group(4) or "" self.base = groups[-3]
self.sorting_path = groups[-2] or ""
self.query = {key: value for key, value in text.parse_query( self.query = {key: value for key, value in text.parse_query(
match.group(5) or "").items() if self.acceptable_query(key)} groups[-1]).items() if self.acceptable_query(key)}
def items(self): def items(self):
url = self.root + self.base + self.sorting_path url = self.root + self.base + self.sorting_path
if self.query:
url += "?" + urllib.parse.urlencode(self.query)
while url: while url:
response = self.request(url) response = self.request(url, params=self.query)
if response.history: if response.history:
redirect_url = response.url redirect_url = response.url
if redirect_url == self.root + "/login": if redirect_url == self.root + "/login":
raise exception.StopExtraction( raise exception.StopExtraction(
"HTTP redirect to login page (%s)", redirect_url) "HTTP redirect to login page (%s)", redirect_url)
page = response.text
for nav in text.extract_iter(response.text, for nav in text.extract_iter(page,
'<nav class="submission__nav">', '<nav class="submission__nav">',
'</nav>'): '</nav>'):
post_url = text.unescape(text.extr(nav, '<a href="', '"')) post_url = text.unescape(text.extr(nav, '<a href="', '"'))
yield Message.Queue, text.urljoin(url, post_url), \ yield Message.Queue, text.urljoin(url, post_url), \
{"_extractor": PostmillPostExtractor} {"_extractor": PostmillPostExtractor}
url = text.unescape(text.extr(response.text, url = text.unescape(text.extr(page,
'<link rel="next" href="', '">')) '<link rel="next" href="', '">'))
def acceptable_query(self, key): def acceptable_query(self, key):
@ -131,14 +130,15 @@ BASE_PATTERN = PostmillExtractor.update({
r"\.onion)"), r"\.onion)"),
} }
}) })
SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" QUERY_RE = r"(?:\?([^#]+))?$"
QUERY_RE = r"(?:\?([^#]+))?" SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" + \
QUERY_RE
class PostmillPostExtractor(PostmillExtractor): class PostmillPostExtractor(PostmillExtractor):
"""Extractor for a single submission URL""" """Extractor for a single submission URL"""
subcategory = "post" subcategory = "post"
pattern = BASE_PATTERN + r"/f/([\w\d_]+)/(\d+)(?:/.+)?$" pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)"
example = "https://raddle.me/f/FORUM/123/TITLE" example = "https://raddle.me/f/FORUM/123/TITLE"
def __init__(self, match): def __init__(self, match):
@ -170,29 +170,28 @@ class PostmillShortURLExtractor(PostmillExtractor):
class PostmillHomeExtractor(PostmillSubmissionsExtractor): class PostmillHomeExtractor(PostmillSubmissionsExtractor):
"""Extractor for the home page""" """Extractor for the home page"""
subcategory = "home" subcategory = "home"
pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE \ pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE
+ QUERY_RE + "$"
example = "https://raddle.me/" example = "https://raddle.me/"
class PostmillForumExtractor(PostmillSubmissionsExtractor): class PostmillForumExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions on a forum""" """Extractor for submissions on a forum"""
subcategory = "forum" subcategory = "forum"
pattern = BASE_PATTERN + r"(/f/[\w\d_]+)" + SORTING_RE + QUERY_RE + "$" pattern = BASE_PATTERN + r"(/f/\w+)" + SORTING_RE
example = "https://raddle.me/f/FORUM" example = "https://raddle.me/f/FORUM"
class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor): class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions made by a user""" """Extractor for submissions made by a user"""
subcategory = "usersubmissions" subcategory = "usersubmissions"
pattern = BASE_PATTERN + r"(/user/[\w\d_]+/submissions)()" + QUERY_RE + "$" pattern = BASE_PATTERN + r"(/user/\w+/submissions)()" + QUERY_RE
example = "https://raddle.me/user/USER/submissions" example = "https://raddle.me/user/USER/submissions"
class PostmillTagExtractor(PostmillSubmissionsExtractor): class PostmillTagExtractor(PostmillSubmissionsExtractor):
"""Extractor for submissions on a forum with a specific tag""" """Extractor for submissions on a forum with a specific tag"""
subcategory = "tag" subcategory = "tag"
pattern = BASE_PATTERN + r"(/tag/[\w\d_]+)" + SORTING_RE + QUERY_RE + "$" pattern = BASE_PATTERN + r"(/tag/\w+)" + SORTING_RE
example = "https://raddle.me/tag/TAG" example = "https://raddle.me/tag/TAG"

Loading…
Cancel
Save