[2chan] fix metadata extraction

pull/511/head
Mike Fährmann 5 years ago
parent 173a93454e
commit 71acbdabf4
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@@ -68,6 +68,8 @@ class _2chanThreadExtractor(Extractor):
def parse(self, post):
"""Build post-object by extracting data from an HTML post"""
data = self._extract_post(post)
+if data["name"]:
+data["name"] = data["name"].strip()
if '<a href="/' in post:
self._extract_image(post, data)
data["tim"], _, data["extension"] = data["filename"].partition(".")
@@ -78,10 +80,10 @@ class _2chanThreadExtractor(Extractor):
@staticmethod
def _extract_post(post):
return text.extract_all(post, (
-("no" , 'name="', '"'),
-("post", '<b>', '</b>'),
-("name", '<b>', ' </b>'),
-("now" , '</font> ', ' '),
+("post", 'class="csb">' , '<'),
+("name", 'class="cnm">' , '<'),
+("now" , 'class="cnw">' , '<'),
+("no" , 'class="cno">No.', '<'),
(None , '<blockquote', ''),
("com" , '>', '</blockquote>'),
))[0]

@@ -93,7 +93,7 @@ class WikiartArtworksExtractor(WikiartExtractor):
directory_fmt = ("{category}", "Artworks by {group!c}", "{type}")
pattern = BASE_PATTERN + r"/paintings-by-([\w-]+)/([\w-]+)"
test = ("https://www.wikiart.org/en/paintings-by-media/grisaille", {
-"url": "f92d55669fa949491c26a5437527adb14b35b8cc",
+"url": "228426a9d32b5bba9d659944c6b0ba73883af33f",
})
def __init__(self, match):

Loading…
Cancel
Save