replace 'text.extract()' with 'text.extr()' where possible

pull/3177/head
Mike Fährmann 2 years ago
parent eb33e6cf2d
commit b0cb4a1b9c

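For context: 'text.extract()' returns a '(substring, position)' tuple, which is why nearly every call site below ends in '[0]'. 'text.extr()' returns the substring directly and yields "" instead of None when nothing matches. A minimal sketch of the two helpers, inferred from how this diff uses them rather than quoted from gallery_dl/text.py:

    # Sketch only (assumed behavior, not copied from gallery_dl/text.py):
    # extract() -> (value, end_pos) tuple, extr() -> value alone.
    def extract(txt, begin, end, pos=0):
        """Return the text between 'begin' and 'end', plus the offset after 'end'."""
        try:
            first = txt.index(begin, pos) + len(begin)
            last = txt.index(end, first)
            return txt[first:last], last + len(end)
        except (ValueError, AttributeError):
            return None, pos

    def extr(txt, begin, end, default=""):
        """Like extract(...)[0], but return 'default' instead of None."""
        try:
            first = txt.index(begin) + len(begin)
            return txt[first:txt.index(end, first)]
        except (ValueError, AttributeError):
            return default

    title = extr("<title>demo - /b/</title>", "<title>", "</title>")
    # 'demo - /b/' -- no trailing [0] needed, and "" instead of None on a miss

Since a failed 'extr()' produces "" rather than None, chained calls like '.strip()' or '.rpartition()' stay safe, and explicit 'or ""' fallbacks become redundant; several hunks below drop them for exactly that reason.
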
@ -60,8 +60,8 @@ class _2chanThreadExtractor(Extractor):
def metadata(self, page):
"""Collect metadata for extractor-job"""
title = text.extract(page, "<title>", "</title>")[0]
title, _, boardname = title.rpartition(" - ")
title, _, boardname = text.extr(
page, "<title>", "</title>").rpartition(" - ")
return {
"server": self.server,
"title": title,
@ -72,8 +72,8 @@ class _2chanThreadExtractor(Extractor):
def posts(self, page):
"""Build a list of all post-objects"""
page = text.extract(
page, '<div class="thre"', '<div style="clear:left"></div>')[0]
page = text.extr(
page, '<div class="thre"', '<div style="clear:left"></div>')
return [
self.parse(post)
for post in page.split('<table border=0>')
@ -84,7 +84,7 @@ class _2chanThreadExtractor(Extractor):
data = self._extract_post(post)
if data["name"]:
data["name"] = data["name"].strip()
path = text.extract(post, '<a href="/', '"')[0]
path = text.extr(post, '<a href="/', '"')
if path and not path.startswith("bin/jump"):
self._extract_image(post, data)
data["tim"], _, data["extension"] = data["filename"].partition(".")

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2019-2021 Mike Fährmann
# Copyright 2019-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -124,7 +124,7 @@ class _35photoUserExtractor(_35photoExtractor):
def metadata(self):
url = "{}/{}/".format(self.root, self.user)
page = self.request(url).text
self.user_id = text.parse_int(text.extract(page, "/user_", ".xml")[0])
self.user_id = text.parse_int(text.extr(page, "/user_", ".xml"))
return {
"user": self.user,
"user_id": self.user_id,
@ -189,10 +189,10 @@ class _35photoGenreExtractor(_35photoExtractor):
def metadata(self):
url = "{}/genre_{}{}".format(self.root, self.genre_id, self.new or "/")
page = self.request(url).text
self.photo_ids = self._photo_ids(text.extract(
page, ' class="photo', '\n')[0])
self.photo_ids = self._photo_ids(text.extr(
page, ' class="photo', '\n'))
return {
"genre": text.extract(page, " genre - ", ". ")[0],
"genre": text.extr(page, " genre - ", ". "),
"genre_id": text.parse_int(self.genre_id),
}

@ -76,9 +76,9 @@ class _8musesAlbumExtractor(Extractor):
url = self.root + self.path + self.params
while True:
data = self._unobfuscate(text.extract(
data = self._unobfuscate(text.extr(
self.request(url).text,
'id="ractive-public" type="text/plain">', '</script>')[0])
'id="ractive-public" type="text/plain">', '</script>'))
images = data.get("pictures")
if images:

@ -41,8 +41,8 @@ class ArtstationExtractor(Extractor):
if adict["has_embedded_player"] and self.external:
player = adict["player_embedded"]
url = text.extract(player, 'src="', '"')[0] or \
text.extract(player, "src='", "'")[0]
url = (text.extr(player, 'src="', '"') or
text.extr(player, "src='", "'"))
if url and not url.startswith(self.root):
asset["extension"] = None
yield Message.Url, "ytdl:" + url, asset

@ -128,8 +128,7 @@ class AryionExtractor(Extractor):
# get filename from 'Content-Disposition' header
cdis = headers["content-disposition"]
fname, _, ext = text.extract(
cdis, 'filename="', '"')[0].rpartition(".")
fname, _, ext = text.extr(cdis, 'filename="', '"').rpartition(".")
if not fname:
fname, ext = ext, fname

@ -38,8 +38,8 @@ class BbcGalleryExtractor(GalleryExtractor):
)
def metadata(self, page):
data = json.loads(text.extract(
page, '<script type="application/ld+json">', '</script>')[0])
data = json.loads(text.extr(
page, '<script type="application/ld+json">', '</script>'))
return {
"programme": self.gallery_url.split("/")[4],
"path": list(util.unique_sequence(

@ -97,7 +97,7 @@ class BcyExtractor(Extractor):
url = "{}/item/detail/{}".format(self.root, post_id)
page = self.request(url, notfound="post").text
return json.loads(
text.extract(page, 'JSON.parse("', '");')[0]
text.extr(page, 'JSON.parse("', '");')
.replace('\\\\u002F', '/')
.replace('\\"', '"')
)["detail"]

@ -119,8 +119,8 @@ class BehanceGalleryExtractor(BehanceExtractor):
}
page = self.request(url, cookies=cookies).text
data = json.loads(text.extract(
page, 'id="beconfig-store_state">', '</script>')[0])
data = json.loads(text.extr(
page, 'id="beconfig-store_state">', '</script>'))
return self._update(data["project"]["project"])
def get_images(self, data):
@ -137,7 +137,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
elif mtype == "video":
page = self.request(module["src"]).text
url = text.extract(page, '<source src="', '"')[0]
url = text.extr(page, '<source src="', '"')
if text.ext_from_url(url) == "m3u8":
url = "ytdl:" + url
append((url, module))
@ -150,8 +150,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
elif mtype == "embed":
embed = module.get("original_embed") or module.get("embed")
if embed:
url = "ytdl:" + text.extract(embed, 'src="', '"')[0]
append((url, module))
append(("ytdl:" + text.extr(embed, 'src="', '"'), module))
return result

@ -61,8 +61,8 @@ class BloggerExtractor(Extractor):
page = self.request(post["url"]).text
for url in findall_video(page):
page = self.request(url).text
video_config = json.loads(text.extract(
page, 'var VIDEO_CONFIG =', '\n')[0])
video_config = json.loads(text.extr(
page, 'var VIDEO_CONFIG =', '\n'))
files.append(max(
video_config["streams"],
key=lambda x: x["format_id"],

@ -68,9 +68,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
url = self.root + "/a/" + self.album_id
try:
data = json.loads(text.extract(
data = json.loads(text.extr(
self.request(url).text,
'id="__NEXT_DATA__" type="application/json">', '<')[0])
'id="__NEXT_DATA__" type="application/json">', '<'))
album = data["props"]["pageProps"]["album"]
files = album["files"]
except Exception as exc:

@ -603,22 +603,22 @@ class DeviantartStashExtractor(DeviantartExtractor):
page = self._limited_request(url).text
if stash_id[0] == "0":
uuid = text.extract(page, '//deviation/', '"')[0]
uuid = text.extr(page, '//deviation/', '"')
if uuid:
deviation = self.api.deviation(uuid)
deviation["index"] = text.parse_int(text.extract(
page, 'gmi-deviationid="', '"')[0])
deviation["index"] = text.parse_int(text.extr(
page, 'gmi-deviationid="', '"'))
yield deviation
return
for item in text.extract_iter(
page, 'class="stash-thumb-container', '</div>'):
url = text.extract(item, '<a href="', '"')[0]
url = text.extr(item, '<a href="', '"')
if url:
stash_id = url.rpartition("/")[2]
else:
stash_id = text.extract(item, 'gmi-stashid="', '"')[0]
stash_id = text.extr(item, 'gmi-stashid="', '"')
stash_id = "2" + util.bencode(text.parse_int(
stash_id), "0123456789abcdefghijklmnopqrstuvwxyz")
@ -1484,8 +1484,8 @@ class DeviantartEclipseAPI():
def _fetch_csrf_token(self, page=None):
if page is None:
page = self.request(self.extractor.root + "/").text
self.csrf_token = token = text.extract(
page, "window.__CSRF_TOKEN__ = '", "'")[0]
self.csrf_token = token = text.extr(
page, "window.__CSRF_TOKEN__ = '", "'")
return token

@ -30,7 +30,7 @@ class DynastyscansBase():
src = extr("class='btn-group'>", "</div>")
url = extr(' src="', '"')
src = text.extract(src, 'href="', '"')[0] if "Source<" in src else ""
src = text.extr(src, 'href="', '"') if "Source<" in src else ""
return {
"url" : self.root + url,
@ -75,7 +75,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
"title" : text.unescape(match.group(4) or ""),
"author" : text.remove_html(author),
"group" : (text.remove_html(group) or
text.extract(group, ' alt="', '"')[0] or ""),
text.extr(group, ' alt="', '"')),
"date" : text.parse_datetime(extr(
'"icon-calendar"></i> ', '<'), "%b %d, %Y"),
"lang" : "en",
@ -83,7 +83,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
}
def images(self, page):
data = text.extract(page, "var pages = ", ";\n")[0]
data = text.extr(page, "var pages = ", ";\n")
return [
(self.root + img["image"], None)
for img in json.loads(data)

@ -55,8 +55,8 @@ class EromeExtractor(Extractor):
yield Message.Directory, data
groups = page.split('<div class="media-group"')
for data["num"], group in enumerate(util.advance(groups, 1), 1):
url = (text.extract(group, '<source src="', '"')[0] or
text.extract(group, 'data-src="', '"')[0])
url = (text.extr(group, '<source src="', '"') or
text.extr(group, 'data-src="', '"'))
if url:
yield Message.Url, url, text.nameext_from_url(url, data)

@ -185,7 +185,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
if self.gallery_token:
gpage = self._gallery_page()
self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0]
self.image_token = text.extr(gpage, 'hentai.org/s/', '"')
if not self.image_token:
self.log.error("Failed to extract initial image token")
self.log.debug("Page content:\n%s", gpage)
@ -193,7 +193,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
ipage = self._image_page()
else:
ipage = self._image_page()
part = text.extract(ipage, 'hentai.org/g/', '"')[0]
part = text.extr(ipage, 'hentai.org/g/', '"')
if not part:
self.log.error("Failed to extract gallery token")
self.log.debug("Page content:\n%s", ipage)
@ -271,8 +271,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
}
if data["uploader"].startswith("<"):
data["uploader"] = text.unescape(text.extract(
data["uploader"], ">", "<")[0])
data["uploader"] = text.unescape(text.extr(
data["uploader"], ">", "<"))
f = data["favorites"][0]
if f == "N":
@ -400,7 +400,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
}
page = self.request(url, cookies=cookies).text
current = text.extract(page, "<strong>", "</strong>")[0]
current = text.extr(page, "<strong>", "</strong>")
self.log.debug("Image Limits: %s/%s", current, self.limits)
self._remaining = self.limits - text.parse_int(current)

@ -57,7 +57,7 @@ class FallenangelsChapterExtractor(ChapterExtractor):
return [
(img["page_image"], None)
for img in json.loads(
text.extract(page, "var pages = ", ";")[0]
text.extr(page, "var pages = ", ";")
)
]

@ -56,7 +56,7 @@ class FoolfuukaExtractor(BaseExtractor):
"""Resolve a remote media link"""
needle = '<meta http-equiv="Refresh" content="0; url='
page = self.request(media["remote_media_link"]).text
return text.extract(page, needle, '"')[0]
return text.extr(page, needle, '"')
@staticmethod
def _remote_direct(media):

@ -114,7 +114,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
})
def images(self, page):
return json.loads(text.extract(page, "var pages = ", ";")[0])
return json.loads(text.extr(page, "var pages = ", ";"))
class FoolslideMangaExtractor(FoolslideExtractor):

@ -160,7 +160,7 @@ class FuraffinityExtractor(Extractor):
while path:
page = self.request(self.root + path).text
yield from text.extract_iter(page, 'id="sid-', '"')
path = text.extract(page, 'right" href="', '"')[0]
path = text.extr(page, 'right" href="', '"')
def _pagination_search(self, query):
url = self.root + "/search/"

@ -58,7 +58,7 @@ class FuskatorGalleryExtractor(GalleryExtractor):
self.root + "/ajax/gal.aspx", params=params, headers=headers,
).json()
title = text.extract(page, "<title>", "</title>")[0].strip()
title = text.extr(page, "<title>", "</title>").strip()
title, _, gallery_id = title.rpartition("#")
return {
@ -104,7 +104,7 @@ class FuskatorSearchExtractor(Extractor):
page, 'class="pic_pad"><a href="', '"'):
yield Message.Queue, self.root + path, data
pages = text.extract(page, 'class="pages"><span>', '>&gt;&gt;<')[0]
pages = text.extr(page, 'class="pages"><span>', '>&gt;&gt;<')
if not pages:
return
url = self.root + text.rextract(pages, 'href="', '"')[0]

@ -69,7 +69,7 @@ class GelbooruBase():
yield "https://img1.gelbooru.com" + path
def _notes(self, post, page):
notes_data = text.extract(page, '<section id="notes"', '</section>')[0]
notes_data = text.extr(page, '<section id="notes"', '</section>')
if not notes_data:
return

@ -98,8 +98,8 @@ class GelbooruV02Extractor(booru.BooruExtractor):
self.root, post["id"])).text
def _tags(self, post, page):
tag_container = (text.extract(page, '<ul id="tag-', '</ul>')[0] or
text.extract(page, '<ul class="tag-', '</ul>')[0])
tag_container = (text.extr(page, '<ul id="tag-', '</ul>') or
text.extr(page, '<ul class="tag-', '</ul>'))
if not tag_container:
return
@ -112,7 +112,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
post["tags_" + key] = " ".join(value)
def _notes(self, post, page):
note_container = text.extract(page, 'id="note-container"', "<img ")[0]
note_container = text.extr(page, 'id="note-container"', "<img ")
if not note_container:
return

@ -87,25 +87,25 @@ class GenericExtractor(Extractor):
"""Extract generic webpage metadata, return them in a dict."""
data = {}
data['pageurl'] = self.url
data['title'] = text.extract(page, '<title>', "</title>")[0] or ""
data['description'] = text.extract(
page, '<meta name="description" content="', '"')[0] or ""
data['keywords'] = text.extract(
page, '<meta name="keywords" content="', '"')[0] or ""
data['language'] = text.extract(
page, '<meta name="language" content="', '"')[0] or ""
data['name'] = text.extract(
page, '<meta itemprop="name" content="', '"')[0] or ""
data['copyright'] = text.extract(
page, '<meta name="copyright" content="', '"')[0] or ""
data['og_site'] = text.extract(
page, '<meta property="og:site" content="', '"')[0] or ""
data['og_site_name'] = text.extract(
page, '<meta property="og:site_name" content="', '"')[0] or ""
data['og_title'] = text.extract(
page, '<meta property="og:title" content="', '"')[0] or ""
data['og_description'] = text.extract(
page, '<meta property="og:description" content="', '"')[0] or ""
data['title'] = text.extr(page, '<title>', "</title>")
data['description'] = text.extr(
page, '<meta name="description" content="', '"')
data['keywords'] = text.extr(
page, '<meta name="keywords" content="', '"')
data['language'] = text.extr(
page, '<meta name="language" content="', '"')
data['name'] = text.extr(
page, '<meta itemprop="name" content="', '"')
data['copyright'] = text.extr(
page, '<meta name="copyright" content="', '"')
data['og_site'] = text.extr(
page, '<meta property="og:site" content="', '"')
data['og_site_name'] = text.extr(
page, '<meta property="og:site_name" content="', '"')
data['og_title'] = text.extr(
page, '<meta property="og:title" content="', '"')
data['og_description'] = text.extr(
page, '<meta property="og:description" content="', '"')
data = {k: text.unescape(data[k]) for k in data if data[k] != ""}
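
The hunk above also drops every 'or ""' guard: 'text.extract(...)[0] or ""' and plain 'text.extr(...)' agree for missing metadata fields because extr() already defaults to "". A quick check, reusing the extract()/extr() sketch from the top of this diff:

    page = "<html><head><title>t</title></head></html>"
    old = extract(page, '<meta name="keywords" content="', '"')[0] or ""  # guard needed
    new = extr(page, '<meta name="keywords" content="', '"')              # already ""
    assert old == new == ""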

@ -60,7 +60,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor):
self.session.headers["Referer"] = url
def metadata(self, page):
title = text.extract(page, "<title>", "</title>")[0]
title = text.extr(page, "<title>", "</title>")
return {
"title": text.unescape(title.rpartition(" Story Viewer - ")[0]),
"slug" : self.slug,

@ -156,8 +156,8 @@ class HentaifoundryExtractor(Extractor):
"filter_media" : "A",
"filter_order" : "date_new",
"filter_type" : "0",
"YII_CSRF_TOKEN" : text.unquote(text.extract(
csrf_token, "%22", "%22")[0]),
"YII_CSRF_TOKEN" : text.unquote(text.extr(
csrf_token, "%22", "%22")),
}
self.request(url, method="POST", data=data)

@ -57,8 +57,8 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
ChapterExtractor.__init__(self, match, url)
def metadata(self, page):
title = text.extract(page, "<title>", "</title>")[0]
chapter_id = text.extract(page, 'report/C', '"')[0]
title = text.extr(page, "<title>", "</title>")
chapter_id = text.extr(page, 'report/C', '"')
chapter, sep, minor = self.chapter.partition(".")
pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at "
match = re.match(pattern, title)
@ -77,7 +77,7 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
@staticmethod
def images(page):
images = text.extract(page, "var rff_imageList = ", ";")[0]
images = text.extr(page, "var rff_imageList = ", ";")
return [
("https://hentaicdn.com/hentai" + part, None)
for part in json.loads(images)

@ -139,7 +139,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
self.manga_data(self.manga, page)
results = []
shortlink = text.extract(page, "rel='shortlink' href='", "'")[0]
shortlink = text.extr(page, "rel='shortlink' href='", "'")
data = {
"action" : "manga_get_reading_nav",
"manga" : shortlink.rpartition("=")[2],
@ -182,6 +182,6 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
def chapters(self, page):
results = []
for info in text.extract_iter(page, 'id="manga-item-', '<img'):
url = text.extract(info, 'href="', '"')[0]
url = text.extr(info, 'href="', '"')
results.append((url, {}))
return results

@ -44,7 +44,7 @@ class HotleakExtractor(Extractor):
for item in text.extract_iter(
page, '<article class="movie-item', '</article>'):
yield text.extract(item, '<a href="', '"')[0]
yield text.extr(item, '<a href="', '"')
params["page"] += 1
@ -87,8 +87,8 @@ class HotleakPostExtractor(HotleakExtractor):
url = "{}/{}/{}/{}".format(
self.root, self.creator, self.type, self.id)
page = self.request(url).text
page = text.extract(
page, '<div class="movie-image thumb">', '</article>')[0]
page = text.extr(
page, '<div class="movie-image thumb">', '</article>')
data = {
"id" : text.parse_int(self.id),
"creator": self.creator,
@ -96,12 +96,12 @@ class HotleakPostExtractor(HotleakExtractor):
}
if self.type == "photo":
data["url"] = text.extract(page, 'data-src="', '"')[0]
data["url"] = text.extr(page, 'data-src="', '"')
text.nameext_from_url(data["url"], data)
elif self.type == "video":
data["url"] = "ytdl:" + text.extract(
text.unescape(page), '"src":"', '"')[0]
data["url"] = "ytdl:" + text.extr(
text.unescape(page), '"src":"', '"')
text.nameext_from_url(data["url"], data)
data["extension"] = "mp4"

@ -115,7 +115,7 @@ class IdolcomplexExtractor(SankakuExtractor):
if self.extags:
tags = collections.defaultdict(list)
tags_html = text.extract(page, '<ul id=tag-sidebar>', '</ul>')[0]
tags_html = text.extr(page, '<ul id=tag-sidebar>', '</ul>')
pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)')
for tag_type, tag_name in pattern.findall(tags_html or ""):
tags[tag_type].append(text.unquote(tag_name))

@ -83,8 +83,8 @@ class ImagebamGalleryExtractor(ImagebamExtractor):
@staticmethod
def metadata(page):
return {"title": text.unescape(text.extract(
page, 'id="gallery-name">', '<')[0].strip())}
return {"title": text.unescape(text.extr(
page, 'id="gallery-name">', '<').strip())}
def images(self, page):
findall = re.compile(r'<a href="https://www\.imagebam\.com'

@ -36,8 +36,8 @@ class ImagechestGalleryExtractor(GalleryExtractor):
return {
"gallery_id": self.gallery_id,
"title": text.unescape(text.extract(
page, 'property="og:title" content="', '"')[0].strip())
"title": text.unescape(text.extr(
page, 'property="og:title" content="', '"').strip())
}
def images(self, page):

@ -202,7 +202,7 @@ class ImagefapUserExtractor(ImagefapExtractor):
response = self.request(url)
self.user = response.url.split("/")[-2]
folders = text.extract(response.text, ' id="tgl_all" value="', '"')[0]
folders = text.extr(response.text, ' id="tgl_all" value="', '"')
return folders.rstrip("|").split("|")
def galleries(self, folder_id):

@ -259,7 +259,7 @@ class ViprImageExtractor(ImagehostImageExtractor):
})
def get_info(self, page):
url = text.extract(page, '<img src="', '"')[0]
url = text.extr(page, '<img src="', '"')
return url, url

@ -71,7 +71,7 @@ class ImgbbExtractor(Extractor):
url = self.root + "/login"
page = self.request(url).text
token = text.extract(page, 'PF.obj.config.auth_token="', '"')[0]
token = text.extr(page, 'PF.obj.config.auth_token="', '"')
headers = {"Referer": url}
data = {
@ -154,7 +154,7 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
}
def images(self, page):
url = text.extract(page, '"og:url" content="', '"')[0]
url = text.extr(page, '"og:url" content="', '"')
album_id = url.rpartition("/")[2].partition("?")[0]
return self._pagination(page, "https://ibb.co/json", {
@ -185,7 +185,7 @@ class ImgbbUserExtractor(ImgbbExtractor):
return {"user": self.user}
def images(self, page):
user = text.extract(page, '.obj.resource={"id":"', '"')[0]
user = text.extr(page, '.obj.resource={"id":"', '"')
return self._pagination(page, self.page_url + "json", {
"from" : "user",
"userid" : user,

@ -53,7 +53,7 @@ class ImgboxExtractor(Extractor):
@staticmethod
def get_image_url(page):
"""Extract download-url"""
return text.extract(page, 'property="og:image" content="', '"')[0]
return text.extr(page, 'property="og:image" content="', '"')
class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
@ -89,7 +89,7 @@ class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
raise exception.NotFoundError("gallery")
self.image_keys = re.findall(r'<a href="/([^"]+)"><img alt="', page)
title = text.extract(page, "<h1>", "</h1>")[0]
title = text.extr(page, "<h1>", "</h1>")
title, _, count = title.rpartition(" - ")
return {
"gallery_key": self.gallery_key,

@ -41,7 +41,7 @@ class ImgthGalleryExtractor(Extractor):
"""Yield all image urls for this gallery"""
pnum = 0
while True:
thumbs = text.extract(page, '<ul class="thumbnails">', '</ul>')[0]
thumbs = text.extr(page, '<ul class="thumbnails">', '</ul>')
for url in text.extract_iter(thumbs, '<img src="', '"'):
yield "https://imgth.com/images" + url[24:]
if '<li class="next">' not in page:

@ -236,7 +236,7 @@ class InkbunnySearchExtractor(InkbunnyExtractor):
# get user_id from user profile
url = "{}/{}".format(self.root, favsby)
page = self.request(url).text
user_id = text.extract(page, "?user_id=", "'")[0]
user_id = text.extr(page, "?user_id=", "'")
params["favs_user_id"] = user_id.partition("&")[0]
return self.api.search(params)

@ -54,8 +54,8 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
})
def metadata(self, page):
data = json.loads(text.extract(
page, '<script data-json="', '"')[0].replace("&quot;", '"'))
data = json.loads(text.extr(
page, '<script data-json="', '"').replace("&quot;", '"'))
doc = data["initialDocumentData"]["document"]
doc["date"] = text.parse_datetime(

@ -62,7 +62,7 @@ class KabeuchiUserExtractor(Extractor):
response = self.request(url)
if response.history and response.url == self.root + "/":
raise exception.NotFoundError("user")
target_id = text.extract(response.text, 'user_friend_id = "', '"')[0]
target_id = text.extr(response.text, 'user_friend_id = "', '"')
return self._pagination(target_id)
def _pagination(self, target_id):

@ -96,7 +96,7 @@ class KeenspotComicExtractor(Extractor):
self._image = '<div id="comic">'
return "http://brawlinthefamily.keenspot.com/comic/theshowdown/"
url = text.extract(page, '<link rel="first" href="', '"')[0]
url = text.extr(page, '<link rel="first" href="', '"')
if url:
if self.comic == "porcelain":
self._needle = 'id="porArchivetop_"'
@ -144,7 +144,7 @@ class KeenspotComicExtractor(Extractor):
@staticmethod
def _next_link(page):
return text.extract(page, '<link rel="next" href="', '"')[0]
return text.extr(page, '<link rel="next" href="', '"')
@staticmethod
def _next_id(page):

@ -192,7 +192,7 @@ class KemonopartyExtractor(Extractor):
"body": text.unescape(text.extract(
dm, "<pre>", "</pre></",
)[0].strip()),
"date": text.extract(dm, 'datetime="', '"')[0],
"date": text.extr(dm, 'datetime="', '"'),
})
return dms

@ -76,7 +76,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
else:
fmt = fmt.lower().split(",")
page = text.extract(page, '<table id="songlist">', '</table>')[0]
page = text.extr(page, '<table id="songlist">', '</table>')
for num, url in enumerate(text.extract_iter(
page, '<td class="clickable-row"><a href="', '"'), 1):
url = text.urljoin(self.root, url)

@ -35,8 +35,8 @@ class KissgoddessGalleryExtractor(GalleryExtractor):
def metadata(self, page):
return {
"gallery_id": text.parse_int(self.gallery_id),
"title" : text.extract(
page, '<title>', "<")[0].rpartition(" | ")[0],
"title" : text.extr(
page, '<title>', "<").rpartition(" | ")[0],
}
def images(self, page):

@ -62,13 +62,13 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
)
def metadata(self, page):
info = text.extract(page, "<title>", " Komikcast<")[0]
info = text.extr(page, "<title>", " Komikcast<")
return self.parse_chapter_string(info)
@staticmethod
def images(page):
readerarea = text.extract(
page, '<div class="main-reading-area', '</div')[0]
readerarea = text.extr(
page, '<div class="main-reading-area', '</div')
return [
(text.unescape(url), None)
for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea)

@ -47,7 +47,7 @@ class LightroomGalleryExtractor(Extractor):
url = "https://lightroom.adobe.com/shares/" + self.href
response = self.request(url)
album = json.loads(
text.extract(response.text, "albumAttributes: ", "\n")[0]
text.extr(response.text, "albumAttributes: ", "\n")
)
images = self.images(album)

@ -22,8 +22,8 @@ class LineblogBase():
body = post.pop("body")
for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
src = text.extract(img, 'src="', '"')[0]
alt = text.extract(img, 'alt="', '"')[0]
src = text.extr(img, 'src="', '"')
alt = text.extr(img, 'alt="', '"')
if not src:
continue

@ -37,7 +37,7 @@ class LivedoorExtractor(Extractor):
def _load(self, data, body):
extr = text.extract_from(data)
tags = text.extract(body, 'class="article-tags">', '</dl>')[0]
tags = text.extr(body, 'class="article-tags">', '</dl>')
about = extr('rdf:about="', '"')
return {
@ -57,8 +57,8 @@ class LivedoorExtractor(Extractor):
body = post.pop("body")
for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
src = text.extract(img, 'src="', '"')[0]
alt = text.extract(img, 'alt="', '"')[0]
src = text.extr(img, 'src="', '"')
alt = text.extr(img, 'alt="', '"')
if not src:
continue

@ -63,8 +63,8 @@ class ManganeloChapterExtractor(ChapterExtractor):
}
def images(self, page):
page = text.extract(
page, 'class="container-chapter-reader', '\n<div')[0]
page = text.extr(
page, 'class="container-chapter-reader', '\n<div')
return [
(url, None)
for url in text.extract_iter(page, '<img src="', '"')

@ -104,7 +104,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
return data
def images(self, page):
data = json.loads(text.extract(page, "var _load_pages =", ";")[0])
data = json.loads(text.extr(page, "var _load_pages =", ";"))
return [
(text.urljoin(self.root, item["u"]), {
"width": text.parse_int(item["w"]),
@ -136,10 +136,10 @@ class MangaparkMangaExtractor(MangaparkBase, MangaExtractor):
results = []
data = {"lang": "en", "language": "English"}
data["manga"] = text.unescape(
text.extract(page, '<title>', ' Manga - ')[0])
text.extr(page, '<title>', ' Manga - '))
for stream in page.split('<div id="stream_')[1:]:
data["stream"] = text.parse_int(text.extract(stream, '', '"')[0])
data["stream"] = text.parse_int(text.extr(stream, '', '"'))
for chapter in text.extract_iter(stream, '<li ', '</li>'):
path , pos = text.extract(chapter, 'href="', '"')

@ -38,7 +38,7 @@ class MangoxoExtractor(Extractor):
url = self.root + "/login"
page = self.request(url).text
token = text.extract(page, 'id="loginToken" value="', '"')[0]
token = text.extr(page, 'id="loginToken" value="', '"')
url = self.root + "/api/login"
headers = {
@ -115,7 +115,7 @@ class MangoxoAlbumExtractor(MangoxoExtractor):
data["extension"] = None
for data["num"], path in enumerate(imgs, 1):
data["id"] = text.parse_int(text.extract(path, "=", "&")[0])
data["id"] = text.parse_int(text.extr(path, "=", "&"))
url = self.root + "/external/" + path.rpartition("url=")[2]
yield Message.Url, url, text.nameext_from_url(url, data)

@ -31,7 +31,7 @@ class MoebooruExtractor(BooruExtractor):
self.root, post["id"])).text
def _tags(self, post, page):
tag_container = text.extract(page, '<ul id="tag-', '</ul>')[0]
tag_container = text.extr(page, '<ul id="tag-', '</ul>')
if not tag_container:
return
@ -43,7 +43,7 @@ class MoebooruExtractor(BooruExtractor):
post["tags_" + key] = " ".join(value)
def _notes(self, post, page):
note_container = text.extract(page, 'id="note-container"', "<img ")[0]
note_container = text.extr(page, 'id="note-container"', "<img ")
if not note_container:
return

@ -59,7 +59,7 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
def images(self, page):
return [
(text.unescape(text.extract(url, 'src="', '"')[0]).replace(
(text.unescape(text.extr(url, 'src="', '"')).replace(
"/thumbnail/", "/original/"), None)
for url in text.extract_iter(page, 'class="comic-thumb"', '</div>')
]

@ -57,8 +57,8 @@ class MyportfolioGalleryExtractor(Extractor):
raise exception.NotFoundError()
page = response.text
projects = text.extract(
page, '<section class="project-covers', '</section>')[0]
projects = text.extr(
page, '<section class="project-covers', '</section>')
if projects:
data = {"_extractor": MyportfolioGalleryExtractor}

@ -44,10 +44,10 @@ class NanaGalleryExtractor(GalleryExtractor):
def metadata(self, page):
title = text.unescape(
text.extract(page, '</a>&nbsp; ', '</div>')[0])
artist = text.unescape(text.extract(
page, '<title>', '</title>')[0])[len(title):-10]
tags = text.extract(page, 'Reader.tags = "', '"')[0]
text.extr(page, '</a>&nbsp; ', '</div>'))
artist = text.unescape(text.extr(
page, '<title>', '</title>'))[len(title):-10]
tags = text.extr(page, 'Reader.tags = "', '"')
return {
"gallery_id": self.gallery_id,
@ -59,7 +59,7 @@ class NanaGalleryExtractor(GalleryExtractor):
}
def images(self, page):
data = json.loads(text.extract(page, "Reader.pages = ", ".pages")[0])
data = json.loads(text.extr(page, "Reader.pages = ", ".pages"))
return [
("https://nana.my.id" + image, None)
for image in data["pages"]
@ -108,8 +108,8 @@ class NanaSearchExtractor(Extractor):
for gallery in text.extract_iter(
page, '<div class="id3">', '</div>'):
url = "https://nana.my.id" + text.extract(
gallery, '<a href="', '"')[0]
url = "https://nana.my.id" + text.extr(
gallery, '<a href="', '"')
yield Message.Queue, url, data
self.params["p"] += 1

@ -76,7 +76,7 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
@staticmethod
def images(page):
view_area = text.extract(page, 'id="comic_view_area"', '</div>')[0]
view_area = text.extr(page, 'id="comic_view_area"', '</div>')
return [
(url, None)
for url in text.extract_iter(view_area, '<img src="', '"')

@ -88,8 +88,8 @@ class NewgroundsExtractor(Extractor):
return self.session.cookies
headers = {"Origin": self.root, "Referer": url}
url = text.urljoin(self.root, text.extract(
response.text, 'action="', '"')[0])
url = text.urljoin(self.root, text.extr(
response.text, 'action="', '"'))
data = {
"username": username,
"password": password,
@ -140,7 +140,7 @@ class NewgroundsExtractor(Extractor):
data["score"] = text.parse_float(extr('id="score_number">', '<'))
data["tags"] = text.split_html(extr('<dd class="tags">', '</dd>'))
data["artist"] = [
text.extract(user, '//', '.')[0]
text.extr(user, '//', '.')
for user in text.extract_iter(page, '<div class="item-user">', '>')
]
@ -275,7 +275,7 @@ class NewgroundsExtractor(Extractor):
for year, items in items.items():
for item in items:
page_url = text.extract(item, 'href="', '"')[0]
page_url = text.extr(item, 'href="', '"')
if page_url[0] == "/":
page_url = self.root + page_url
yield page_url

@ -107,7 +107,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
"""Extract image URLs from 'page'"""
images = text.extract_iter(page, "/view_popup.php", "</a>")
for num, image in enumerate(images):
src = text.extract(image, 'src="', '"')[0]
src = text.extr(image, 'src="', '"')
if not src:
continue
url = ("https:" + src).replace("/__rs_l120x120/", "/")
@ -118,7 +118,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
@staticmethod
def _extract_user_name(page):
return text.unescape(text.extract(page, "<br />", "<")[0] or "")
return text.unescape(text.extr(page, "<br />", "<"))
def login(self):
"""Login and obtain session cookies"""
@ -322,8 +322,7 @@ class NijieNuitaExtractor(NijieExtractor):
@staticmethod
def _extract_user_name(page):
return text.unescape(text.extract(
page, "<title>", "さんの抜いた")[0] or "")
return text.unescape(text.extr(page, "<title>", "さんの抜いた"))
class NijieFeedExtractor(NijieExtractor):

@ -95,7 +95,7 @@ class PatreonExtractor(Extractor):
if content:
for img in text.extract_iter(
content, '<img data-media-id="', '>'):
url = text.extract(img, 'src="', '"')[0]
url = text.extr(img, 'src="', '"')
if url:
yield "content", url, self._filename(url) or url
@ -181,7 +181,7 @@ class PatreonExtractor(Extractor):
"""Fetch filename from an URL's Content-Disposition header"""
response = self.request(url, method="HEAD", fatal=False)
cd = response.headers.get("Content-Disposition")
return text.extract(cd, 'filename="', '"')[0]
return text.extr(cd, 'filename="', '"')
@staticmethod
def _filehash(url):
@ -284,7 +284,7 @@ class PatreonCreatorExtractor(PatreonExtractor):
url = "{}/{}/posts".format(self.root, self.creator)
page = self.request(url, notfound="creator").text
campaign_id = text.extract(page, "/campaign/", "/")[0]
campaign_id = text.extr(page, "/campaign/", "/")
if not campaign_id:
raise exception.NotFoundError("creator")

@ -75,7 +75,7 @@ class PhotobucketAlbumExtractor(Extractor):
page = self.request(url, params=params).text
json_data = text.extract(page, "collectionData:", ",\n")[0]
if not json_data:
msg = text.extract(page, 'libraryPrivacyBlock">', "</div>")[0]
msg = text.extr(page, 'libraryPrivacyBlock">', "</div>")
msg = ' ("{}")'.format(text.remove_html(msg)) if msg else ""
self.log.error("Unable to get JSON data%s", msg)
return

@ -98,7 +98,7 @@ class PillowfortExtractor(Extractor):
url = "https://www.pillowfort.social/users/sign_in"
page = self.request(url).text
auth = text.extract(page, 'name="authenticity_token" value="', '"')[0]
auth = text.extr(page, 'name="authenticity_token" value="', '"')
headers = {"Origin": self.root, "Referer": url}
data = {

@ -638,7 +638,7 @@ class PixivPixivisionExtractor(PixivExtractor):
headers = {"User-Agent": "Mozilla/5.0"}
self.page = self.request(url, headers=headers).text
title = text.extract(self.page, '<title>', '<')[0]
title = text.extr(self.page, '<title>', '<')
return {
"pixivision_id" : self.pixivision_id,
"pixivision_title": text.unescape(title),
@ -692,7 +692,7 @@ class PixivSeriesExtractor(PixivExtractor):
series = body["extraData"]["meta"]
series["id"] = self.series_id
series["total"] = page["total"]
series["title"] = text.extract(series["title"], '"', '"')[0]
series["title"] = text.extr(series["title"], '"', '"')
for info in page["series"]:
work = self.api.illust_detail(info["workId"])

@ -30,7 +30,7 @@ class PixnetExtractor(Extractor):
def items(self):
url = self.url_fmt.format(self.root, self.item_id)
page = self.request(url, encoding="utf-8").text
user = text.extract(page, '<meta name="author" content="', '";')[0]
user = text.extr(page, '<meta name="author" content="', '";')
data = {
"blog": self.blog,
"user": user.rpartition(" (")[0],
@ -52,13 +52,13 @@ class PixnetExtractor(Extractor):
while True:
yield from text.extract_iter(page, '<li id="', '</li>')
pnext = text.extract(page, 'class="nextBtn"', '>')[0]
pnext = text.extr(page, 'class="nextBtn"', '>')
if pnext is None and 'name="albumpass">' in page:
raise exception.StopExtraction(
"Album %s is password-protected.", self.item_id)
if "href" not in pnext:
return
url = self.root + text.extract(pnext, 'href="', '"')[0]
url = self.root + text.extr(pnext, 'href="', '"')
page = self.request(url, encoding="utf-8").text

@ -73,8 +73,8 @@ class PururinGalleryExtractor(GalleryExtractor):
url = "{}/read/{}/01/x".format(self.root, self.gallery_id)
page = self.request(url).text
info = json.loads(binascii.a2b_base64(text.extract(
page, '<gallery-read encoded="', '"')[0]).decode())
info = json.loads(binascii.a2b_base64(text.extr(
page, '<gallery-read encoded="', '"')).decode())
self._ext = info["image_extension"]
self._cnt = info["total_pages"]

@ -109,13 +109,13 @@ class ReactorExtractor(BaseExtractor):
tags.sort()
for image in images:
url = text.extract(image, ' src="', '"')[0]
url = text.extr(image, ' src="', '"')
if not url:
continue
if url.startswith("//"):
url = "http:" + url
width = text.extract(image, ' width="', '"')[0]
height = text.extract(image, ' height="', '"')[0]
width = text.extr(image, ' width="', '"')
height = text.extr(image, ' height="', '"')
image_id = url.rpartition("-")[2].partition(".")[0]
num += 1
@ -125,7 +125,7 @@ class ReactorExtractor(BaseExtractor):
url = url.replace("/post/", "/post/full/")
if self.gif and ("/post/webm/" in url or "/post/mp4/" in url):
gif_url = text.extract(image, '<a href="', '"')[0]
gif_url = text.extr(image, '<a href="', '"')
if not gif_url:
continue
url = gif_url

@ -306,7 +306,7 @@ class SankakuAPI():
url = post["file_url"]
if url:
expires = text.parse_int(
text.extract(url, "e=", "&")[0]) - 60
text.extr(url, "e=", "&")) - 60
if 0 < expires <= time():
self.extractor.log.debug("Refreshing download URLs")

@ -43,7 +43,7 @@ class SexcomExtractor(Extractor):
yield self.root + href
pager = extr('id="pagenum"', '</div>')
url = text.extract(pager, ' href="', '"')[0]
url = text.extr(pager, ' href="', '"')
if not url:
return
url = text.urljoin(self.root, url)
@ -71,7 +71,7 @@ class SexcomExtractor(Extractor):
info = extr("player.updateSrc(", ");")
if info:
path = text.extract(info, "src: '", "'")[0]
path = text.extr(info, "src: '", "'")
data["filename"] = path.rpartition("/")[2]
data["extension"] = "mp4"
if "'HD'" in info:
@ -79,8 +79,8 @@ class SexcomExtractor(Extractor):
data["url"] = self.root + path
else:
iframe = extr('<iframe', '>')
src = (text.extract(iframe, ' src="', '"')[0] or
text.extract(iframe, " src='", "'")[0])
src = (text.extr(iframe, ' src="', '"') or
text.extr(iframe, " src='", "'"))
if not src:
self.log.warning("Unable to fetch media from %s", url)
return None

@ -111,7 +111,7 @@ class SimplyhentaiImageExtractor(Extractor):
url = extr('&quot;image&quot;:&quot;' , '&')
url = extr("&quot;content&quot;:&quot;", "&") or url
tags = text.extract(descr, " tagged with ", " online for free ")[0]
tags = text.extr(descr, " tagged with ", " online for free ")
if tags:
tags = tags.split(", ")
tags[-1] = tags[-1].partition(" ")[2]
@ -176,7 +176,7 @@ class SimplyhentaiVideoExtractor(Extractor):
embed_url = text.extract(page, 'src="', '"', pos)[0].replace(
"embedplayer.php?link=", "embed.php?name=")
embed_page = self.request(embed_url).text
video_url = text.extract(embed_page, '"file":"', '"')[0]
video_url = text.extr(embed_page, '"file":"', '"')
title, _, episode = title.rpartition(" Episode ")
if video_url.startswith("//"):

@ -89,23 +89,23 @@ class SubscribestarExtractor(Extractor):
def _media_from_post(html):
media = []
gallery = text.extract(html, 'data-gallery="', '"')[0]
gallery = text.extr(html, 'data-gallery="', '"')
if gallery:
media.extend(
item for item in json.loads(text.unescape(gallery))
if "/previews/" not in item["url"]
)
attachments = text.extract(
html, 'class="uploads-docs"', 'data-role="post-edit_form"')[0]
attachments = text.extr(
html, 'class="uploads-docs"', 'data-role="post-edit_form"')
if attachments:
for att in attachments.split('class="doc_preview"')[1:]:
media.append({
"id" : text.parse_int(text.extract(
att, 'data-upload-id="', '"')[0]),
"name": text.unescape(text.extract(
att, 'doc_preview-title">', '<')[0] or ""),
"url" : text.unescape(text.extract(att, 'href="', '"')[0]),
"id" : text.parse_int(text.extr(
att, 'data-upload-id="', '"')),
"name": text.unescape(text.extr(
att, 'doc_preview-title">', '<')),
"url" : text.unescape(text.extr(att, 'href="', '"')),
"type": "attachment",
})
@ -175,7 +175,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
return
yield from posts
url = text.extract(posts[-1], needle_next_page, '"')[0]
url = text.extr(posts[-1], needle_next_page, '"')
if not url:
return
page = self.request(self.root + text.unescape(url)).json()["html"]

@ -257,7 +257,7 @@ class TumblrExtractor(Extractor):
except Exception:
return resized, True
else:
updated = text.extract(response.text, '" src="', '"')[0]
updated = text.extr(response.text, '" src="', '"')
return updated, (resized == updated)
def _original_image_fallback(self, url, post_id):

@ -46,7 +46,7 @@ class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
def metadata(self, page):
return {
"title" : text.unescape(text.extract(page, "<h1>", "</h1>"))[0],
"title" : text.unescape(text.extr(page, "<h1>", "</h1>")),
"gallery_id": self.gallery_id,
}
@ -82,7 +82,7 @@ class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
def metadata(self, page):
return {
"title" : text.remove_html(
text.unescape(text.extract(page, "<title>", "</title>")[0])
text.unescape(text.extr(page, "<title>", "</title>"))
).replace("_", "-"),
"gallery_id": self.gallery_id,
}
@ -127,12 +127,12 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
data = self._data_from_url(url)
data["gallery_id"] = gallery_id
data["title"] = text.remove_html(text.unescape(
text.extract(post_page, "<title>", "</title>")[0]
text.extr(post_page, "<title>", "</title>")
)).replace("_", "-")
yield url, data
next_url = text.extract(
page, '</span> <a class="btn btn-primary" href="', '"')[0]
next_url = text.extr(
page, '</span> <a class="btn btn-primary" href="', '"')
if not next_url or page_url == next_url:
return
page_url = next_url

@ -227,8 +227,8 @@ class TwitterExtractor(Extractor):
response = self.request(url, fatal=False)
if response.status_code >= 400:
continue
url = text.extract(
response.text, 'name="twitter:image" value="', '"')[0]
url = text.extr(
response.text, 'name="twitter:image" value="', '"')
if url:
files.append({"url": url})

@ -44,7 +44,7 @@ class VanillarockPostExtractor(VanillarockExtractor):
img = extr('<div class="main-img">', '</div>')
if not img:
break
imgs.append(text.extract(img, 'href="', '"')[0])
imgs.append(text.extr(img, 'href="', '"'))
data = {
"count": len(imgs),
@ -89,5 +89,5 @@ class VanillarockTagExtractor(VanillarockExtractor):
post = extr('<h2 class="entry-title">', '</h2>')
if not post:
break
yield Message.Queue, text.extract(post, 'href="', '"')[0], data
yield Message.Queue, text.extr(post, 'href="', '"'), data
url = text.unescape(extr('class="next page-numbers" href="', '"'))

@ -69,7 +69,7 @@ class VscoExtractor(Extractor):
def _extract_preload_state(self, url):
page = self.request(url, notfound=self.subcategory).text
return json.loads(text.extract(page, "__PRELOADED_STATE__ = ", "<")[0])
return json.loads(text.extr(page, "__PRELOADED_STATE__ = ", "<"))
def _pagination(self, url, params, token, key, extra=None):
headers = {

@ -57,8 +57,8 @@ class WarosuThreadExtractor(Extractor):
def get_metadata(self, page):
"""Collect metadata for extractor-job"""
boardname = text.extract(page, "<title>", "</title>")[0]
title = text.extract(page, 'filetitle" itemprop="name">', '<')[0]
boardname = text.extr(page, "<title>", "</title>")
title = text.extr(page, 'filetitle" itemprop="name">', '<')
return {
"board": self.board,
"board_name": boardname.rpartition(" - ")[2],
@ -68,7 +68,7 @@ class WarosuThreadExtractor(Extractor):
def posts(self, page):
"""Build a list of all post-objects"""
page = text.extract(page, '<div class="content">', '<table>')[0]
page = text.extr(page, '<div class="content">', '<table>')
needle = '<table itemscope itemtype="http://schema.org/Comment">'
return [self.parse(post) for post in page.split(needle)]

@ -225,7 +225,7 @@ class WeasylFavoriteExtractor(WeasylExtractor):
pos = page.index('id="favorites-content"')
if not owner_login:
owner_login = text.extract(page, '<a href="/~', '"')[0]
owner_login = text.extr(page, '<a href="/~', '"')
for submitid in text.extract_iter(page, "/submissions/", "/", pos):
if submitid == lastid:

@ -169,7 +169,7 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
@staticmethod
def get_episode_urls(page):
"""Extract and return all episode urls in 'page'"""
page = text.extract(page, 'id="_listUl"', '</ul>')[0]
page = text.extr(page, 'id="_listUl"', '</ul>')
return [
match.group(0)
for match in WebtoonsEpisodeExtractor.pattern.finditer(page)

@ -173,7 +173,7 @@ class WeiboExtractor(Extractor):
page = Extractor.request(
self, passport_url, method="POST", headers=headers, data=data).text
data = json.loads(text.extract(page, "(", ");")[0])["data"]
data = json.loads(text.extr(page, "(", ");"))["data"]
passport_url = "https://passport.weibo.com/visitor/visitor"
params = {

@ -144,8 +144,8 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
def _data(self, url):
page = self.request(url).text
return json.loads(text.extract(
page, "window.initials=", "</script>")[0].rstrip("\n\r;"))
return json.loads(text.extr(
page, "window.initials=", "</script>").rstrip("\n\r;"))
class XhamsterUserExtractor(XhamsterExtractor):

@ -113,8 +113,8 @@ class XvideosUserExtractor(XvideosBase, Extractor):
def items(self):
url = "{}/profiles/{}".format(self.root, self.user)
page = self.request(url, notfound=self.subcategory).text
data = json.loads(text.extract(
page, "xv.conf=", ";</script>")[0])["data"]
data = json.loads(text.extr(
page, "xv.conf=", ";</script>"))["data"]
if not isinstance(data["galleries"], dict):
return

@ -127,7 +127,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
while True:
page = self.request(url, params=params).text
thumbs = text.extract(page, '<ul id="thumbs', '</ul>')[0]
thumbs = text.extr(page, '<ul id="thumbs', '</ul>')
extr = text.extract_from(thumbs)
while True:
