allow BaseExtractors to use the domain of the matched URL

pull/2301/head
Mike Fährmann 3 years ago
parent c0fddcefc5
commit b4f8e15a1f
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

@@ -607,6 +607,9 @@ class BaseExtractor(Extractor):
if group is not None: if group is not None:
if index: if index:
self.category, self.root = self.instances[index-1] self.category, self.root = self.instances[index-1]
if not self.root:
url = text.ensure_http_scheme(match.group(0))
self.root = url[:url.index("/", 8)]
else: else:
self.root = group self.root = group
self.category = group.partition("://")[2] self.category = group.partition("://")[2]
@@ -624,7 +627,9 @@ class BaseExtractor(Extractor):
pattern_list = [] pattern_list = []
instance_list = cls.instances = [] instance_list = cls.instances = []
for category, info in instances.items(): for category, info in instances.items():
root = info["root"].rstrip("/") root = info["root"]
if root:
root = root.rstrip("/")
instance_list.append((category, root)) instance_list.append((category, root))
pattern = info.get("pattern") pattern = info.get("pattern")

@@ -349,6 +349,12 @@ def build_extractor_list():
for category, root in extr.instances: for category, root in extr.instances:
base[category].append(extr.subcategory) base[category].append(extr.subcategory)
if category not in domains: if category not in domains:
if not root:
# use domain from first matching test
for url, _ in extr._get_tests():
if extr.from_url(url).category == category:
root = url[:url.index("/", 8)]
break
domains[category] = root + "/" domains[category] = root + "/"
# sort subcategory lists # sort subcategory lists

Loading…
Cancel
Save