[seiga] support more than 200 images

Because of API restrictions and/or missing knowledge and documentation
about how to use the API, only the latest 200 images of a niconico
seiga user could be retrieved through it.

The new approach instead requests the user's HTML listing pages one by
one and extracts the image information from each page's markup.
pull/54/head
Mike Fährmann 7 years ago
parent baf8094868
commit f72318e593
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@ -11,7 +11,6 @@
from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
from xml.etree import ElementTree
class SeigaExtractor(Extractor):
@ -74,11 +73,12 @@ class SeigaUserExtractor(SeigaExtractor):
r"user/illust/(\d+)")]
test = [
("http://seiga.nicovideo.jp/user/illust/39537793", {
"keyword": "a716bf534b4191dc58ddbff51494b72a9cf58285",
"pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+",
"count": 2,
}),
("http://seiga.nicovideo.jp/user/illust/79433", {
"url": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"keyword": "187b77728381d072466af7f7ebcc479a0830ce25",
"count": 0,
}),
]
@ -90,25 +90,27 @@ class SeigaUserExtractor(SeigaExtractor):
return {"user_id": self.user_id}
def get_images(self):
# Collect per-image metadata dicts for the user identified by self.user_id.
#
# NOTE(review): this span comes from a flattened diff view (indentation and
# +/- markers are lost), so two implementations appear back to back. Going
# by the commit message, the XML-API variant is presumably the removed code
# and the HTML-paging variant the added code -- confirm against the real diff.

# --- Variant 1 (presumably removed): single request to the XML user API ---
# Maps child-element positions of an image node to metadata key names.
keymap = {0: "image_id", 2: "title", 3: "description",
7: "summary", 8: "genre", 18: "date"}
url = "http://seiga.nicovideo.jp/api/user/data?id=" + self.user_id
response = self.request(url)
try:
root = ElementTree.fromstring(response.text)
except ElementTree.ParseError:
# Raw response was not well-formed XML; retry on the text.clean_xml() output.
self.log.debug("xml parsing error; removing control characters")
xmldata = text.clean_xml(response.text)
root = ElementTree.fromstring(xmldata)
# First child holding the text "0" yields an empty result
# (presumably an image counter -- TODO confirm).
if root[0].text == "0":
return []
# One dict per child of the second root element, built via keymap.
return [
{
key: image[index].text
for index, key in keymap.items()
}
for image in root[1]
]
# --- Variant 2 (presumably added): walk the HTML listing page by page ---
url = "http://seiga.nicovideo.jp/user/illust/" + self.user_id
params = {"target": "illust_all", "page": 1}
while True:
# Number of list items found on the current page.
cnt = 0
page = self.request(url, params=params).text
for info in text.extract_iter(
page, '<li class="list_item', '</a></li> '):
# Yield the first element of extract_all()'s result for this list item.
yield text.extract_all(info, (
("image_id", '/seiga/im', '"'),
("title" , '<li class="title">', '</li>'),
("views" , '</span>', '</li>'),
("comments", '</span>', '</li>'),
("clips" , '</span>', '</li>'),
))[0]
cnt += 1
# Fewer than 40 items ends the iteration
# (40 is presumably the site's page size -- TODO confirm).
if cnt < 40:
return
params["page"] += 1
class SeigaImageExtractor(SeigaExtractor):

@ -83,6 +83,7 @@ skip = [
"archivedmoe", "archiveofsins", "thebarchive",
# temporary issues
"mangazuki",
"hentaifoundry", # invalid SSL cert
]
# enable selective testing for direct calls
if __name__ == '__main__' and len(sys.argv) > 1:

Loading…
Cancel
Save