[myportfolio] fix extraction

pull/1529/head
Mike Fährmann 3 years ago
parent 8b22d4e667
commit d108421461
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2018-2019 Mike Fährmann # Copyright 2018-2021 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -51,9 +51,11 @@ class MyportfolioGalleryExtractor(Extractor):
self.prefix = "myportfolio:" if domain1 else "" self.prefix = "myportfolio:" if domain1 else ""
def items(self): def items(self):
yield Message.Version, 1
url = "https://" + self.domain + (self.path or "") url = "https://" + self.domain + (self.path or "")
page = self.request(url).text response = self.request(url)
if response.history and response.url.endswith(".adobe.com/missing"):
raise exception.NotFoundError()
page = response.text
projects = text.extract( projects = text.extract(
page, '<section class="project-covers', '</section>')[0] page, '<section class="project-covers', '</section>')[0]
@@ -78,12 +80,12 @@ class MyportfolioGalleryExtractor(Extractor):
# <user> and <title> can contain a "-" as well, so we get the title # <user> and <title> can contain a "-" as well, so we get the title
# from somewhere else and cut that amount from the og:title content # from somewhere else and cut that amount from the og:title content
user, pos = text.extract( extr = text.extract_from(page)
page, 'property=og:title content="', '"') user = extr('property="og:title" content="', '"') or \
desc, pos = text.extract( extr('property=og:title content="', '"')
page, 'property=og:description content="', '"', pos) descr = extr('property="og:description" content="', '"') or \
title, pos = text.extract( extr('property=og:description content="', '"')
page, '<h1 ', '</h1>', pos) title = extr('<h1 ', '</h1>')
if title: if title:
title = title.partition(">")[2] title = title.partition(">")[2]
@@ -96,7 +98,7 @@ class MyportfolioGalleryExtractor(Extractor):
return { return {
"user": text.unescape(user), "user": text.unescape(user),
"title": text.unescape(title), "title": text.unescape(title),
"description": text.unescape(desc or ""), "description": text.unescape(descr),
} }
@staticmethod @staticmethod

Loading…
Cancel
Save