[deviantart] fix issue with small images

pull/13/head
Mike Fährmann 9 years ago
parent 3ebd126b35
commit e4a661fd6b
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -13,11 +13,11 @@ from .. import text
import re import re
class DeviantArtExtractor(AsynchronousExtractor): class DeviantArtExtractor(AsynchronousExtractor):
"""Extract all works of an artist on deviantart"""
category = "deviantart" category = "deviantart"
directory_fmt = ["{category}", "{artist}"] directory_fmt = ["{category}", "{artist}"]
filename_fmt = "{category}_{index}_{title}.{extension}" filename_fmt = "{category}_{index}_{title}.{extension}"
pattern = [r"(?:https?://)?([^\.]+)\.deviantart\.com/gallery/.*"] pattern = [r"(?:https?://)?([^\.]+)\.deviantart\.com(?:/gallery)?/?$"]
def __init__(self, match): def __init__(self, match):
AsynchronousExtractor.__init__(self) AsynchronousExtractor.__init__(self)
@ -57,39 +57,36 @@ class DeviantArtExtractor(AsynchronousExtractor):
def get_image_metadata(self, image): def get_image_metadata(self, image):
"""Collect metadata for an image""" """Collect metadata for an image"""
match = self.extract_data(image, 'title', tmatch = self.extract_data(image, 'title',
r'(.+) by (.+), ([A-Z][a-z]{2} \d+, \d{4}) in') r'(.+) by (.+), ([A-Z][a-z]{2} \d+, \d{4}) in')
if image.startswith(" ismature"): hmatch = self.extract_data(image, 'href', r'[^"]+-(\d+)')
# adult image
url, _ = text.extract(image, 'href="', '"') url, pos = text.extract(image, ' data-super-full-img="', '"', tmatch.end())
page = self.request(url).text
_ , pos = text.extract(page, ' class="dev-content-normal "', '')
url , pos = text.extract(page, ' src="', '"', pos)
index , pos = text.extract(page, ' data-embed-id="', '"', pos)
width , pos = text.extract(page, ' width="', '"', pos)
height, pos = text.extract(page, ' height="', '"', pos)
else:
# normal image
index = self.extract_data(image, 'href', r'[^"]+-(\d+)').group(1)
url, pos = text.extract(image, ' data-super-full-img="', '"', match.end())
if url: if url:
width , pos = text.extract(image, ' data-super-full-width="', '"', pos) width , pos = text.extract(image, ' data-super-full-width="', '"', pos)
height, pos = text.extract(image, ' data-super-full-height="', '"', pos) height, pos = text.extract(image, ' data-super-full-height="', '"', pos)
else: else:
url, pos = text.extract(image, ' data-super-img="', '"', pos) url, pos = text.extract(image, ' data-super-img="', '"', pos)
if url:
width , pos = text.extract(image, ' data-super-width="', '"', pos) width , pos = text.extract(image, ' data-super-width="', '"', pos)
height, pos = text.extract(image, ' data-super-height="', '"', pos) height, pos = text.extract(image, ' data-super-height="', '"', pos)
data = { else:
"index": index, page = self.request(hmatch.group(0)).text
"title": match.group(1), _ , pos = text.extract(page, ' class="dev-content-normal "', '')
"artist": match.group(2), url , pos = text.extract(page, ' src="', '"', pos)
"date": match.group(3), width , pos = text.extract(page, ' width="', '"', pos)
height, pos = text.extract(page, ' height="', '"', pos)
return url, text.nameext_from_url(url, {
"index": hmatch.group(1),
"title": text.unescape(tmatch.group(1)),
"artist": tmatch.group(2),
"date": tmatch.group(3),
"width": width, "width": width,
"height": height, "height": height,
} })
return url, text.nameext_from_url(url, data)
@staticmethod @staticmethod
def extract_data(txt, attr, pattern): def extract_data(txt, attr, pattern):
"""Extract a HTML attribute and apply a regex to it"""
txt, _ = text.extract(txt, ' %s="' % attr, '"') txt, _ = text.extract(txt, ' %s="' % attr, '"')
return re.match(pattern, txt) return re.match(pattern, txt)

Loading…
Cancel
Save