commit 0f2dc855b1
@@ -0,0 +1,56 @@
name: docker

on:
  push:
    tags:
      - v[0-9]+.[0-9]+.[0-9]+

permissions:
  packages: write

jobs:
  docker:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4

    # https://github.com/docker/setup-buildx-action
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    # https://github.com/docker/login-action
    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GHCR_TOKEN }}

    - name: Login to DockerHub
      uses: docker/login-action@v3
      with:
        username: ${{ secrets.DOCKERHUB_USERNAME }}
        password: ${{ secrets.DOCKERHUB_TOKEN }}

    # https://github.com/docker/metadata-action
    - name: Generate Docker tags
      uses: docker/metadata-action@v5
      id: metadata
      with:
        images: |
          mikf123/gallery-dl
          ghcr.io/mikf/gallery-dl
        tags: |
          type=sha,format=long,prefix=
          type=ref,event=tag

    # https://github.com/docker/build-push-action
    - name: Build image
      uses: docker/build-push-action@v5
      with:
        push: true
        tags: ${{ steps.metadata.outputs.tags }}
        labels: ${{ steps.metadata.outputs.labels }}
        platforms: linux/amd64
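
# Note: publishing requires the GHCR_TOKEN, DOCKERHUB_USERNAME, and
# DOCKERHUB_TOKEN repository secrets referenced above to be configured.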
@@ -0,0 +1,5 @@
FROM python:alpine
RUN python3 -m pip install -U gallery-dl yt-dlp
RUN apk update
RUN apk add ffmpeg
ENTRYPOINT [ "gallery-dl" ]
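
# Example usage (a sketch; the image tag and mount point are placeholders):
#   docker build -t gallery-dl .
#   docker run --rm -v "$PWD":/downloads -w /downloads gallery-dl URL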
@@ -0,0 +1,9 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>gallery-dl</title>
</head>
<body>
</body>
</html>
@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>gallery-dl - OAuth Redirect</title>
<script>
window.location.href = "http://localhost:6414/" + window.location.search;
</script>
</head>
<body>
</body>
</html>
@@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://2ch.hk/"""

from .common import Extractor, Message
from .. import text, util


class _2chThreadExtractor(Extractor):
    """Extractor for 2ch threads"""
    category = "2ch"
    subcategory = "thread"
    root = "https://2ch.hk"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{tim}{filename:? //}.{extension}"
    archive_fmt = "{board}_{thread}_{tim}"
    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
    example = "https://2ch.hk/a/res/12345.html"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()

    def items(self):
        url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
        posts = self.request(url).json()["threads"][0]["posts"]

        op = posts[0]
        title = op.get("subject") or text.remove_html(op["comment"])

        thread = {
            "board" : self.board,
            "thread": self.thread,
            "title" : text.unescape(title)[:50],
        }

        yield Message.Directory, thread
        for post in posts:
            files = post.get("files")
            if files:
                post["post_name"] = post["name"]
                post["date"] = text.parse_timestamp(post["timestamp"])
                del post["files"]
                del post["name"]

                for file in files:
                    file.update(thread)
                    file.update(post)

                    file["filename"] = file["fullname"].rpartition(".")[0]
                    file["tim"], _, file["extension"] = \
                        file["name"].rpartition(".")

                    yield Message.Url, self.root + file["path"], file


class _2chBoardExtractor(Extractor):
    """Extractor for 2ch boards"""
    category = "2ch"
    subcategory = "board"
    root = "https://2ch.hk"
    pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
    example = "https://2ch.hk/a/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board = match.group(1)

    def items(self):
        # index page
        url = "{}/{}/index.json".format(self.root, self.board)
        index = self.request(url).json()
        index["_extractor"] = _2chThreadExtractor
        for thread in index["threads"]:
            url = "{}/{}/res/{}.html".format(
                self.root, self.board, thread["thread_num"])
            yield Message.Queue, url, index

        # pages 1..n
        for n in util.advance(index["pages"], 1):
            url = "{}/{}/{}.json".format(self.root, self.board, n)
            page = self.request(url).json()
            page["_extractor"] = _2chThreadExtractor
            for thread in page["threads"]:
                url = "{}/{}/res/{}.html".format(
                    self.root, self.board, thread["thread_num"])
                yield Message.Queue, url, page
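
A quick sanity check of the two URL patterns above against their documented example URLs (a sketch; assumes a gallery_dl checkout is importable):

from gallery_dl import extractor

for url in ("https://2ch.hk/a/res/12345.html",   # thread
            "https://2ch.hk/a/"):                # board
    ex = extractor.find(url)
    print(url, "->", ex.__class__.__name__ if ex else "no match")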
@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://4archive.org/"""

from .common import Extractor, Message
from .. import text, util


class _4archiveThreadExtractor(Extractor):
    """Extractor for 4archive threads"""
    category = "4archive"
    subcategory = "thread"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{no} {filename}.{extension}"
    archive_fmt = "{board}_{thread}_{no}"
    root = "https://4archive.org"
    referer = False
    pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)"
    example = "https://4archive.org/board/a/thread/12345/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board, self.thread = match.groups()

    def items(self):
        url = "{}/board/{}/thread/{}".format(
            self.root, self.board, self.thread)
        page = self.request(url).text
        data = self.metadata(page)
        posts = self.posts(page)

        if not data["title"]:
            data["title"] = posts[0]["com"][:50]

        for post in posts:
            post.update(data)
            post["time"] = int(util.datetime_to_timestamp(post["date"]))
            yield Message.Directory, post
            if "url" in post:
                yield Message.Url, post["url"], text.nameext_from_url(
                    post["filename"], post)

    def metadata(self, page):
        return {
            "board" : self.board,
            "thread": text.parse_int(self.thread),
            "title" : text.unescape(text.extr(
                page, 'class="subject">', "</span>"))
        }

    def posts(self, page):
        return [
            self.parse(post)
            for post in page.split('class="postContainer')[1:]
        ]

    @staticmethod
    def parse(post):
        extr = text.extract_from(post)
        data = {
            "name": extr('class="name">', "</span>"),
            "date": text.parse_datetime(
                extr('class="dateTime postNum" >', "<").strip(),
                "%Y-%m-%d %H:%M:%S"),
            "no"  : text.parse_int(extr('href="#p', '"')),
        }
        if 'class="file"' in post:
            extr('class="fileText"', ">File: <a")
            data.update({
                "url"     : extr('href="', '"'),
                "filename": extr(
                    'rel="noreferrer noopener"', "</a>").strip()[1:],
                "size"    : text.parse_bytes(extr(" (", ", ")[:-1]),
                "width"   : text.parse_int(extr("", "x")),
                "height"  : text.parse_int(extr("", "px")),
            })
        extr("<blockquote ", "")
        data["com"] = text.unescape(text.remove_html(
            extr(">", "</blockquote>")))
        return data


class _4archiveBoardExtractor(Extractor):
    """Extractor for 4archive boards"""
    category = "4archive"
    subcategory = "board"
    root = "https://4archive.org"
    pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$"
    example = "https://4archive.org/board/a/"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.board = match.group(1)
        self.num = text.parse_int(match.group(2), 1)

    def items(self):
        data = {"_extractor": _4archiveThreadExtractor}
        while True:
            url = "{}/board/{}/{}".format(self.root, self.board, self.num)
            page = self.request(url).text
            if 'class="thread"' not in page:
                return
            for thread in text.extract_iter(page, 'class="thread" id="t', '"'):
                url = "{}/board/{}/thread/{}".format(
                    self.root, self.board, thread)
                yield Message.Queue, url, data
            self.num += 1
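
The `post["time"]` value above is produced by two gallery_dl helpers; a standalone sketch of that conversion (the date string is a placeholder):

from gallery_dl import text, util

date = text.parse_datetime("2023-01-15 12:34:56", "%Y-%m-%d %H:%M:%S")
print(int(util.datetime_to_timestamp(date)))  # seconds since the epoch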
@@ -0,0 +1,123 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://bato.to/"""

from .common import Extractor, ChapterExtractor, MangaExtractor
from .. import text, exception
import re

BASE_PATTERN = (r"(?:https?://)?(?:"
                r"(?:ba|d|h|m|w)to\.to|"
                r"(?:(?:manga|read)toto|batocomic|[xz]bato)\.(?:com|net|org)|"
                r"comiko\.(?:net|org)|"
                r"bat(?:otoo|o?two)\.com)")


class BatotoBase():
    """Base class for batoto extractors"""
    category = "batoto"
    root = "https://bato.to"

    def request(self, url, **kwargs):
        kwargs["encoding"] = "utf-8"
        return Extractor.request(self, url, **kwargs)


class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
    """Extractor for bato.to manga chapters"""
    pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)"
    example = "https://bato.to/title/12345-MANGA/54321"

    def __init__(self, match):
        self.root = text.root_from_url(match.group(0))
        self.chapter_id = match.group(1)
        url = "{}/title/0/{}".format(self.root, self.chapter_id)
        ChapterExtractor.__init__(self, match, url)

    def metadata(self, page):
        extr = text.extract_from(page)
        manga, info, _ = extr("<title>", "<").rsplit(" - ", 3)
        manga_id = text.extr(
            extr('rel="canonical" href="', '"'), "/title/", "/")

        match = re.match(
            r"(?:Volume\s+(\d+) )?"
            r"\w+\s+(\d+)(.*)", info)
        if match:
            volume, chapter, minor = match.groups()
            title = text.remove_html(extr(
                "selected>", "</option")).partition(" : ")[2]
        else:
            volume = chapter = 0
            minor = ""
            title = info

        return {
            "manga"        : text.unescape(manga),
            "manga_id"     : text.parse_int(manga_id),
            "title"        : text.unescape(title),
            "volume"       : text.parse_int(volume),
            "chapter"      : text.parse_int(chapter),
            "chapter_minor": minor,
            "chapter_id"   : text.parse_int(self.chapter_id),
            "date"         : text.parse_timestamp(extr(' time="', '"')[:-3]),
        }

    def images(self, page):
        images_container = text.extr(page, 'pageOpts', ':[0,0]}"')
        images_container = text.unescape(images_container)
        return [
            (url, None)
            for url in text.extract_iter(images_container, r"\"", r"\"")
        ]


class BatotoMangaExtractor(BatotoBase, MangaExtractor):
    """Extractor for bato.to manga"""
    reverse = False
    chapterclass = BatotoChapterExtractor
    pattern = (BASE_PATTERN +
               r"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$")
    example = "https://bato.to/title/12345-MANGA/"

    def __init__(self, match):
        self.root = text.root_from_url(match.group(0))
        self.manga_id = match.group(1) or match.group(2)
        url = "{}/title/{}".format(self.root, self.manga_id)
        MangaExtractor.__init__(self, match, url)

    def chapters(self, page):
        extr = text.extract_from(page)

        warning = extr(' class="alert alert-warning">', "</div><")
        if warning:
            raise exception.StopExtraction("'%s'", text.remove_html(warning))

        data = {
            "manga_id": text.parse_int(self.manga_id),
            "manga"   : text.unescape(extr(
                "<title>", "<").rpartition(" - ")[0]),
        }

        extr('<div data-hk="0-0-0-0"', "")
        results = []
        while True:
            href = extr('<a href="/title/', '"')
            if not href:
                break

            chapter = href.rpartition("-ch_")[2]
            chapter, sep, minor = chapter.partition(".")

            data["chapter"] = text.parse_int(chapter)
            data["chapter_minor"] = sep + minor
            data["date"] = text.parse_datetime(
                extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ")

            url = "{}/title/{}".format(self.root, href)
            results.append((url, data.copy()))
        return results
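
The chapter-number handling in chapters() can be shown in isolation (the href value below is hypothetical):

href = "12345-some-manga-ch_12.5"  # hypothetical chapter link
chapter, sep, minor = href.rpartition("-ch_")[2].partition(".")
print(chapter, sep + minor)  # 12 .5  ->  chapter=12, chapter_minor=".5"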
@@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-

# Copyright 2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Chevereto galleries"""

from .common import BaseExtractor, Message
from .. import text


class CheveretoExtractor(BaseExtractor):
    """Base class for chevereto extractors"""
    basecategory = "chevereto"
    directory_fmt = ("{category}", "{user}", "{album}",)
    archive_fmt = "{id}"

    def __init__(self, match):
        BaseExtractor.__init__(self, match)
        self.path = match.group(match.lastindex)

    def _pagination(self, url):
        while url:
            page = self.request(url).text

            for item in text.extract_iter(
                    page, '<div class="list-item-image ', 'image-container'):
                yield text.extr(item, '<a href="', '"')

            url = text.extr(page, '<a data-pagination="next" href="', '" ><')


BASE_PATTERN = CheveretoExtractor.update({
    "jpgfish": {
        "root": "https://jpg4.su",
        "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
    },
    "imgkiwi": {
        "root": "https://img.kiwi",
        "pattern": r"img\.kiwi",
    },
    "deltaporno": {
        "root": "https://gallery.deltaporno.com",
        "pattern": r"gallery\.deltaporno\.com",
    },
})


class CheveretoImageExtractor(CheveretoExtractor):
    """Extractor for chevereto Images"""
    subcategory = "image"
    pattern = BASE_PATTERN + r"(/im(?:g|age)/[^/?#]+)"
    example = "https://jpg2.su/img/TITLE.ID"

    def items(self):
        url = self.root + self.path
        extr = text.extract_from(self.request(url).text)

        image = {
            "id"   : self.path.rpartition(".")[2],
            "url"  : extr('<meta property="og:image" content="', '"'),
            "album": text.extr(extr("Added to <a", "/a>"), ">", "<"),
            "user" : extr('username: "', '"'),
        }

        text.nameext_from_url(image["url"], image)
        yield Message.Directory, image
        yield Message.Url, image["url"], image


class CheveretoAlbumExtractor(CheveretoExtractor):
    """Extractor for chevereto Albums"""
    subcategory = "album"
    pattern = BASE_PATTERN + r"(/a(?:lbum)?/[^/?#]+(?:/sub)?)"
    example = "https://jpg2.su/album/TITLE.ID"

    def items(self):
        url = self.root + self.path
        data = {"_extractor": CheveretoImageExtractor}

        if self.path.endswith("/sub"):
            albums = self._pagination(url)
        else:
            albums = (url,)

        for album in albums:
            for image in self._pagination(album):
                yield Message.Queue, image, data


class CheveretoUserExtractor(CheveretoExtractor):
    """Extractor for chevereto Users"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"(/(?!img|image|a(?:lbum)?)[^/?#]+(?:/albums)?)"
    example = "https://jpg2.su/USER"

    def items(self):
        url = self.root + self.path

        if self.path.endswith("/albums"):
            data = {"_extractor": CheveretoAlbumExtractor}
        else:
            data = {"_extractor": CheveretoImageExtractor}

        for url in self._pagination(url):
            yield Message.Queue, url, data
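
The combined pattern built by CheveretoExtractor.update() puts each site's domain in its own group, so `match.group(match.lastindex)` always yields the path, whichever domain matched. A simplified, hypothetical pattern illustrates the trick:

import re

pattern = re.compile(r"(?:(jpg4\.su)|(img\.kiwi))(/im(?:g|age)/[^/?#]+)")
m = pattern.match("img.kiwi/img/TITLE.ID")
print(m.group(m.lastindex))  # /img/TITLE.ID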
@@ -0,0 +1,167 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://hatenablog.com"""

import re
from .common import Extractor, Message
from .. import text


BASE_PATTERN = (
    r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?"
    r"([\w-]+\.(?:hatenablog\.(?:com|jp)"
    r"|hatenadiary\.com|hateblo\.jp)))"
)
QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$"


class HatenablogExtractor(Extractor):
    """Base class for HatenaBlog extractors"""
    category = "hatenablog"
    directory_fmt = ("{category}", "{domain}")
    filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}"
    archive_fmt = "{filename}"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.domain = match.group(1) or match.group(2)

    def _init(self):
        self._find_img = re.compile(r'<img +([^>]+)').finditer

    def _handle_article(self, article: str):
        extr = text.extract_from(article)
        date = text.parse_datetime(extr('<time datetime="', '"'))
        entry_link = text.unescape(extr('<a href="', '"'))
        entry = entry_link.partition("/entry/")[2]
        title = text.unescape(extr('>', '<'))
        content = extr(
            '<div class="entry-content hatenablog-entry">', '</div>')

        images = []
        for i in self._find_img(content):
            attributes = i.group(1)
            if 'class="hatena-fotolife"' not in attributes:
                continue
            image = text.unescape(text.extr(attributes, 'src="', '"'))
            images.append(image)

        data = {
            "domain": self.domain,
            "date": date,
            "entry": entry,
            "title": title,
            "count": len(images),
        }
        yield Message.Directory, data
        for data["num"], url in enumerate(images, 1):
            yield Message.Url, url, text.nameext_from_url(url, data)


class HatenablogEntriesExtractor(HatenablogExtractor):
    """Base class for a list of entries"""
    allowed_parameters = ()

    def __init__(self, match):
        HatenablogExtractor.__init__(self, match)
        self.path = match.group(3)
        self.query = {key: value for key, value in text.parse_query(
            match.group(4)).items() if self._acceptable_query(key)}

    def _init(self):
        HatenablogExtractor._init(self)
        self._find_pager_url = re.compile(
            r' class="pager-next">\s*<a href="([^"]+)').search

    def items(self):
        url = "https://" + self.domain + self.path
        query = self.query

        while url:
            page = self.request(url, params=query).text

            extr = text.extract_from(page)
            attributes = extr('<body ', '>')
            if "page-archive" in attributes:
                yield from self._handle_partial_articles(extr)
            else:
                yield from self._handle_full_articles(extr)

            match = self._find_pager_url(page)
            url = text.unescape(match.group(1)) if match else None
            query = None

    def _handle_partial_articles(self, extr):
        while True:
            section = extr('<section class="archive-entry', '</section>')
            if not section:
                break

            url = "hatenablog:" + text.unescape(text.extr(
                section, '<a class="entry-title-link" href="', '"'))
            data = {"_extractor": HatenablogEntryExtractor}
            yield Message.Queue, url, data

    def _handle_full_articles(self, extr):
        while True:
            attributes = extr('<article ', '>')
            if not attributes:
                break
            if "no-entry" in attributes:
                continue

            article = extr('', '</article>')
            yield from self._handle_article(article)

    def _acceptable_query(self, key):
        return key == "page" or key in self.allowed_parameters


class HatenablogEntryExtractor(HatenablogExtractor):
    """Extractor for a single entry URL"""
    subcategory = "entry"
    pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE
    example = "https://BLOG.hatenablog.com/entry/PATH"

    def __init__(self, match):
        HatenablogExtractor.__init__(self, match)
        self.path = match.group(3)

    def items(self):
        url = "https://" + self.domain + "/entry/" + self.path
        page = self.request(url).text

        extr = text.extract_from(page)
        while True:
            attributes = extr('<article ', '>')
            if "no-entry" in attributes:
                continue
            article = extr('', '</article>')
            return self._handle_article(article)


class HatenablogHomeExtractor(HatenablogEntriesExtractor):
    """Extractor for a blog's home page"""
    subcategory = "home"
    pattern = BASE_PATTERN + r"(/?)" + QUERY_RE
    example = "https://BLOG.hatenablog.com"


class HatenablogArchiveExtractor(HatenablogEntriesExtractor):
    """Extractor for a blog's archive page"""
    subcategory = "archive"
    pattern = (BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?"
               r"|/category/[^?#]+)?)" + QUERY_RE)
    example = "https://BLOG.hatenablog.com/archive/2024"


class HatenablogSearchExtractor(HatenablogEntriesExtractor):
    """Extractor for a blog's search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"(/search)" + QUERY_RE
    example = "https://BLOG.hatenablog.com/search?q=QUERY"
    allowed_parameters = ("q",)
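
BASE_PATTERN accepts both plain blog URLs and the internal "hatenablog:" prefix used when queueing entries from archive pages; a quick standalone check (sketch, reusing the pattern verbatim):

import re

BASE_PATTERN = (
    r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?"
    r"([\w-]+\.(?:hatenablog\.(?:com|jp)"
    r"|hatenadiary\.com|hateblo\.jp)))"
)
for url in ("https://BLOG.hatenablog.com/entry/PATH",
            "hatenablog:https://example.com/entry/PATH"):
    m = re.match(BASE_PATTERN, url)
    print(url, "->", m.group(1) or m.group(2))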
@@ -1,92 +0,0 @@
# -*- coding: utf-8 -*-

# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.hbrowse.com/"""

from .common import ChapterExtractor, MangaExtractor
from .. import text, util, exception


class HbrowseBase():
    """Base class for hbrowse extractors"""
    category = "hbrowse"
    root = "https://www.hbrowse.com"

    def parse_page(self, page, data):
        """Parse metadata on 'page' and add it to 'data'"""
        data, pos = text.extract_all(page, (
            ('manga' , '<td class="listLong">', '</td>'),
            ('artist', '<td class="listLong">', '</td>'),
            ('total' , '<td class="listLong">', ' '),
            ('origin', '<td class="listLong">', '</td>'),
        ), values=data)

        if not data["manga"] and "<b>Warning</b>" in page:
            msg = page.rpartition(">")[2].strip()
            raise exception.StopExtraction("Site is not accessible: '%s'", msg)

        tags = text.extract(page, 'class="listTable"', '</table>', pos)[0]

        data["manga"] = text.unescape(data["manga"])
        data["total"] = text.parse_int(data["total"])
        data["artist"] = text.remove_html(data["artist"])
        data["origin"] = text.remove_html(data["origin"])
        data["tags"] = list(text.extract_iter(tags, 'href="/browse/', '"'))
        return data


class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor):
    """Extractor for manga-chapters from hbrowse.com"""
    directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}")
    filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
                    "{page:>03}.{extension}")
    archive_fmt = "{manga_id}_{chapter}_{page}"
    pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))"
    example = "https://www.hbrowse.com/12345/c00000"

    def __init__(self, match):
        self.path, self.gid, self.chapter = match.groups()
        self.path += "/"
        ChapterExtractor.__init__(self, match)

    def metadata(self, page):
        return self.parse_page(page, {
            "manga_id": text.parse_int(self.gid),
            "chapter": text.parse_int(self.chapter)
        })

    def images(self, page):
        base = self.root + "/data" + self.path
        json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
        return [(base + name, None) for name in util.json_loads(json_data)]


class HbrowseMangaExtractor(HbrowseBase, MangaExtractor):
    """Extractor for manga from hbrowse.com"""
    chapterclass = HbrowseChapterExtractor
    reverse = False
    pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$"
    example = "https://www.hbrowse.com/12345"

    def chapters(self, page):
        results = []
        data = self.parse_page(page, {
            "manga_id": text.parse_int(
                self.manga_url.rstrip("/").rpartition("/")[2])
        })

        pos = 0
        needle = '<td class="listMiddle">\n<a class="listLink" href="'
        while True:
            url, pos = text.extract(page, needle, '"', pos)
            if not url:
                return results
            title, pos = text.extract(page, '>View ', '<', pos)
            data["chapter"] = text.parse_int(url.rpartition("/")[2][1:])
            data["title"] = title
            results.append((text.urljoin(self.root, url), data.copy()))
@@ -1,105 +0,0 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://jpg1.su/"""

from .common import Extractor, Message
from .. import text

BASE_PATTERN = r"(?:https?://)?jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)"


class JpgfishExtractor(Extractor):
    """Base class for jpgfish extractors"""
    category = "jpgfish"
    root = "https://jpg1.su"
    directory_fmt = ("{category}", "{user}", "{album}",)
    archive_fmt = "{id}"

    def _pagination(self, url):
        while url:
            page = self.request(url).text

            for item in text.extract_iter(
                    page, '<div class="list-item-image ', 'image-container'):
                yield text.extract(item, '<a href="', '"')[0]

            url = text.extract(
                page, '<a data-pagination="next" href="', '" ><')[0]


class JpgfishImageExtractor(JpgfishExtractor):
    """Extractor for jpgfish Images"""
    subcategory = "image"
    pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))"
    example = "https://jpg1.su/img/TITLE.ID"

    def __init__(self, match):
        JpgfishExtractor.__init__(self, match)
        self.path, self.image_id = match.groups()

    def items(self):
        url = "{}/img/{}".format(self.root, self.path)
        extr = text.extract_from(self.request(url).text)

        image = {
            "id"   : self.image_id,
            "url"  : extr('<meta property="og:image" content="', '"'),
            "album": text.extract(extr(
                "Added to <a", "/a>"), ">", "<")[0] or "",
            "user" : extr('username: "', '"'),
        }

        text.nameext_from_url(image["url"], image)
        yield Message.Directory, image
        yield Message.Url, image["url"], image


class JpgfishAlbumExtractor(JpgfishExtractor):
    """Extractor for jpgfish Albums"""
    subcategory = "album"
    pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?"
    example = "https://jpg1.su/album/TITLE.ID"

    def __init__(self, match):
        JpgfishExtractor.__init__(self, match)
        self.album, self.sub_albums = match.groups()

    def items(self):
        url = "{}/a/{}".format(self.root, self.album)
        data = {"_extractor": JpgfishImageExtractor}

        if self.sub_albums:
            albums = self._pagination(url + "/sub")
        else:
            albums = (url,)

        for album in albums:
            for image in self._pagination(album):
                yield Message.Queue, image, data


class JpgfishUserExtractor(JpgfishExtractor):
    """Extractor for jpgfish Users"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?"
    example = "https://jpg1.su/USER"

    def __init__(self, match):
        JpgfishExtractor.__init__(self, match)
        self.user, self.albums = match.groups()

    def items(self):
        url = "{}/{}".format(self.root, self.user)

        if self.albums:
            url += "/albums"
            data = {"_extractor": JpgfishAlbumExtractor}
        else:
            data = {"_extractor": JpgfishImageExtractor}

        for url in self._pagination(url):
            yield Message.Queue, url, data
@@ -1,87 +0,0 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://nudecollect.com/"""

from .common import GalleryExtractor
from .. import text


class NudecollectExtractor(GalleryExtractor):
    """Base class for Nudecollect extractors"""
    category = "nudecollect"
    directory_fmt = ("{category}", "{title}")
    filename_fmt = "{slug}_{num:>03}.{extension}"
    archive_fmt = "{slug}_{num}"
    root = "https://www.nudecollect.com"

    def request(self, url, **kwargs):
        kwargs["allow_redirects"] = False
        return GalleryExtractor.request(self, url, **kwargs)

    @staticmethod
    def get_title(page):
        return text.unescape(text.extr(page, "<title>", "</title>"))[31:]

    @staticmethod
    def get_image(page):
        return text.extr(page, '<img src="', '"')


class NudecollectImageExtractor(NudecollectExtractor):
    """Extractor for individual images from nudecollect.com"""
    subcategory = "image"
    pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
               r"(/content/([^/?#]+)/image-(\d+)-pics-(\d+)"
               r"-mirror-(\d+)\.html)")
    example = ("https://www.nudecollect.com/content/12345_TITLE"
               "/image-1-pics-108-mirror-1.html")

    def __init__(self, match):
        NudecollectExtractor.__init__(self, match)
        _, self.slug, self.num, self.count, self.mirror = match.groups()

    def metadata(self, page):
        return {
            "slug"  : self.slug,
            "title" : self.get_title(page),
            "count" : text.parse_int(self.count),
            "mirror": text.parse_int(self.mirror),
        }

    def images(self, page):
        return ((self.get_image(page), {"num": text.parse_int(self.num)}),)


class NudecollectAlbumExtractor(NudecollectExtractor):
    """Extractor for image albums on nudecollect.com"""
    subcategory = "album"
    pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
               r"/content/([^/?#]+)/(?:index-mirror-(\d+)-(\d+)"
               r"|page-\d+-pics-(\d+)-mirror-(\d+))\.html")
    example = ("https://www.nudecollect.com/content/12345_TITLE"
               "/index-mirror-01-123.html")

    def __init__(self, match):
        self.slug = match.group(1)
        self.mirror = match.group(2) or match.group(5)
        self.count = text.parse_int(match.group(3) or match.group(4))
        url = "{}/content/{}/image-1-pics-{}-mirror-{}.html".format(
            self.root, self.slug, self.count, self.mirror)
        NudecollectExtractor.__init__(self, match, url)

    def metadata(self, page):
        return {
            "slug"  : self.slug,
            "title" : self.get_title(page),
            "mirror": text.parse_int(self.mirror),
        }

    def images(self, page):
        url = self.get_image(page)
        p1, _, p2 = url.partition("/image0")
        ufmt = p1 + "/image{:>05}" + p2[4:]
        return [(ufmt.format(num), None) for num in range(1, self.count + 1)]
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-

# Copyright 2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://pixeldrain.com/"""

from .common import Extractor, Message
from .. import text, util

BASE_PATTERN = r"(?:https?://)?pixeldrain\.com"


class PixeldrainExtractor(Extractor):
    """Base class for pixeldrain extractors"""
    category = "pixeldrain"
    root = "https://pixeldrain.com"
    archive_fmt = "{id}"

    def _init(self):
        api_key = self.config("api-key")
        if api_key:
            self.session.auth = util.HTTPBasicAuth("", api_key)

    def parse_datetime(self, date_string):
        return text.parse_datetime(
            date_string, "%Y-%m-%dT%H:%M:%S.%fZ")


class PixeldrainFileExtractor(PixeldrainExtractor):
    """Extractor for pixeldrain files"""
    subcategory = "file"
    filename_fmt = "{filename[:230]} ({id}).{extension}"
    pattern = BASE_PATTERN + r"/(?:u|api/file)/(\w+)"
    example = "https://pixeldrain.com/u/abcdefgh"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.file_id = match.group(1)

    def items(self):
        url = "{}/api/file/{}".format(self.root, self.file_id)
        file = self.request(url + "/info").json()

        file["url"] = url + "?download"
        file["date"] = self.parse_datetime(file["date_upload"])

        text.nameext_from_url(file["name"], file)
        yield Message.Directory, file
        yield Message.Url, file["url"], file


class PixeldrainAlbumExtractor(PixeldrainExtractor):
    """Extractor for pixeldrain albums"""
    subcategory = "album"
    directory_fmt = ("{category}",
                     "{album[date]:%Y-%m-%d} {album[title]} ({album[id]})")
    filename_fmt = "{num:>03} {filename[:230]} ({id}).{extension}"
    pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)"
    example = "https://pixeldrain.com/l/abcdefgh"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.album_id = match.group(1)

    def items(self):
        url = "{}/api/list/{}".format(self.root, self.album_id)
        album = self.request(url).json()

        files = album["files"]
        album["count"] = album["file_count"]
        album["date"] = self.parse_datetime(album["date_created"])

        del album["files"]
        del album["file_count"]

        yield Message.Directory, {"album": album}
        for num, file in enumerate(files, 1):
            file["album"] = album
            file["num"] = num
            file["url"] = url = "{}/api/file/{}?download".format(
                self.root, file["id"])
            file["date"] = self.parse_datetime(file["date_upload"])
            text.nameext_from_url(file["name"], file)
            yield Message.Url, url, file
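
The two endpoints these extractors rely on can be exercised directly; a minimal sketch using requests (the file id is a placeholder):

import requests

FILE_ID = "abcdefgh"  # placeholder
info = requests.get(
    "https://pixeldrain.com/api/file/{}/info".format(FILE_ID)).json()
print(info.get("name"), info.get("date_upload"))
# the file bytes themselves come from /api/file/{id}?download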
@@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for http://www.poringa.net/"""

from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
import itertools

BASE_PATTERN = r"(?:https?://)?(?:www\.)?poringa\.net"


class PoringaExtractor(Extractor):
    category = "poringa"
    directory_fmt = ("{category}", "{user}", "{post_id}")
    filename_fmt = "{post_id}_{title}_{num:>03}_{filename}.{extension}"
    archive_fmt = "{post_id}_{num}"
    root = "http://www.poringa.net"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.item = match.group(1)
        self.__cookies = True

    def items(self):
        for post_id in self.posts():
            url = "{}/posts/imagenes/{}".format(self.root, post_id)

            try:
                response = self.request(url)
            except exception.HttpError as exc:
                self.log.warning(
                    "Unable to fetch posts for '%s' (%s)", post_id, exc)
                continue

            if "/registro-login?" in response.url:
                self.log.warning("Private post '%s'", post_id)
                continue

            page = response.text
            title, pos = text.extract(
                page, 'property="og:title" content="', '"')

            try:
                pos = page.index('<div class="main-info', pos)
                user, pos = text.extract(
                    page, 'href="http://www.poringa.net/', '"', pos)
            except ValueError:
                user = None

            if not user:
                user = "poringa"

            data = {
                "post_id"      : post_id,
                "title"        : text.unescape(title),
                "user"         : text.unquote(user),
                "_http_headers": {"Referer": url},
            }

            main_post = text.extr(
                page, 'property="dc:content" role="main">', '</div>')
            urls = list(text.extract_iter(
                main_post, '<img class="imagen" border="0" src="', '"'))
            data["count"] = len(urls)

            yield Message.Directory, data
            for data["num"], url in enumerate(urls, 1):
                yield Message.Url, url, text.nameext_from_url(url, data)

    def posts(self):
        return ()

    def request(self, url, **kwargs):
        if self.__cookies:
            self.__cookies = False
            self.cookies_update(_cookie_cache())

        for _ in range(5):
            response = Extractor.request(self, url, **kwargs)
            if response.cookies:
                _cookie_cache.update("", response.cookies)
            if response.content.find(
                    b"<title>Please wait a few moments</title>", 0, 600) < 0:
                return response
            self.sleep(5.0, "check")

    def _pagination(self, url, params):
        for params["p"] in itertools.count(1):
            page = self.request(url, params=params).text

            posts_ids = PoringaPostExtractor.pattern.findall(page)
            posts_ids = list(dict.fromkeys(posts_ids))
            yield from posts_ids

            if len(posts_ids) < 19:
                return


class PoringaPostExtractor(PoringaExtractor):
    """Extractor for posts on poringa.net"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"/posts/imagenes/(\d+)"
    example = "http://www.poringa.net/posts/imagenes/12345/TITLE.html"

    def posts(self):
        return (self.item,)


class PoringaUserExtractor(PoringaExtractor):
    subcategory = "user"
    pattern = BASE_PATTERN + r"/(\w+)$"
    example = "http://www.poringa.net/USER"

    def posts(self):
        url = self.root + "/buscar/"
        params = {"q": self.item}
        return self._pagination(url, params)


class PoringaSearchExtractor(PoringaExtractor):
    subcategory = "search"
    pattern = BASE_PATTERN + r"/buscar/\?&?q=([^&#]+)"
    example = "http://www.poringa.net/buscar/?q=QUERY"

    def posts(self):
        url = self.root + "/buscar/"
        params = {"q": self.item}
        return self._pagination(url, params)


@cache()
def _cookie_cache():
    return ()
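
_pagination() deduplicates post ids with dict.fromkeys, which, unlike a set, keeps first-seen order:

posts_ids = ["123", "456", "123", "789"]
print(list(dict.fromkeys(posts_ids)))  # ['123', '456', '789']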
@@ -0,0 +1,203 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Postmill instances"""

import re
from .common import BaseExtractor, Message
from .. import text, exception


class PostmillExtractor(BaseExtractor):
    """Base class for Postmill extractors"""
    basecategory = "postmill"
    directory_fmt = ("{category}", "{instance}", "{forum}")
    filename_fmt = "{id}_{title[:220]}.{extension}"
    archive_fmt = "{filename}"

    def _init(self):
        self.instance = self.root.partition("://")[2]
        self.save_link_post_body = self.config("save-link-post-body", False)
        self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search
        self._search_image_tag = re.compile(
            r'<a href="[^"]+"\n +class="submission__image-link"').search

    def items(self):
        for post_url in self.post_urls():
            page = self.request(post_url).text
            extr = text.extract_from(page)

            title = text.unescape(extr(
                '<meta property="og:title" content="', '">'))
            date = text.parse_datetime(extr(
                '<meta property="og:article:published_time" content="', '">'))
            username = extr(
                '<meta property="og:article:author" content="', '">')
            post_canonical_url = text.unescape(extr(
                '<link rel="canonical" href="', '">'))

            url = text.unescape(extr(
                '<h1 class="submission__title unheaderize inline"><a href="',
                '"'))
            body = extr(
                '<div class="submission__body break-text text-flow">',
                '</div>')

            match = self._search_canonical_url(post_canonical_url)
            forum = match.group(1)
            id = int(match.group(2))

            is_text_post = url.startswith("/")
            is_image_post = self._search_image_tag(page) is not None
            data = {
                "title": title,
                "date": date,
                "username": username,
                "forum": forum,
                "id": id,
                "flair": [text.unescape(i) for i in text.extract_iter(
                    page, '<span class="flair__label">', '</span>')],
                "instance": self.instance,
            }

            urls = []
            if is_text_post or self.save_link_post_body:
                urls.append((Message.Url, "text:" + body))

            if is_image_post:
                urls.append((Message.Url, url))
            elif not is_text_post:
                urls.append((Message.Queue, url))

            data["count"] = len(urls)
            yield Message.Directory, data
            for data["num"], (msg, url) in enumerate(urls, 1):
                if url.startswith("text:"):
                    data["filename"], data["extension"] = "", "htm"
                else:
                    data = text.nameext_from_url(url, data)

                yield msg, url, data


class PostmillSubmissionsExtractor(PostmillExtractor):
    """Base class for Postmill submissions extractors"""
    whitelisted_parameters = ()

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        groups = match.groups()
        self.base = groups[-3]
        self.sorting_path = groups[-2] or ""
        self.query = {key: value for key, value in text.parse_query(
            groups[-1]).items() if self.acceptable_query(key)}

    def items(self):
        url = self.root + self.base + self.sorting_path

        while url:
            response = self.request(url, params=self.query)
            if response.history:
                redirect_url = response.url
                if redirect_url == self.root + "/login":
                    raise exception.StopExtraction(
                        "HTTP redirect to login page (%s)", redirect_url)
            page = response.text

            for nav in text.extract_iter(page,
                                         '<nav class="submission__nav">',
                                         '</nav>'):
                post_url = text.unescape(text.extr(nav, '<a href="', '"'))
                yield Message.Queue, text.urljoin(url, post_url), \
                    {"_extractor": PostmillPostExtractor}

            url = text.unescape(text.extr(page,
                                          '<link rel="next" href="', '">'))

    def acceptable_query(self, key):
        return key in self.whitelisted_parameters or key == "t" or \
            (key.startswith("next[") and key.endswith("]"))


BASE_PATTERN = PostmillExtractor.update({
    "raddle": {
        "root" : None,
        "pattern": (r"(?:raddle\.me|"
                    r"c32zjeghcp5tj3kb72pltz56piei66drc63vkhn5yixiyk4cmerrjtid"
                    r"\.onion)"),
    }
})
QUERY_RE = r"(?:\?([^#]+))?$"
SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" + \
    QUERY_RE


class PostmillPostExtractor(PostmillExtractor):
    """Extractor for a single submission URL"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)"
    example = "https://raddle.me/f/FORUM/123/TITLE"

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        self.forum = match.group(3)
        self.post_id = match.group(4)

    def post_urls(self):
        return (self.root + "/f/" + self.forum + "/" + self.post_id,)


class PostmillShortURLExtractor(PostmillExtractor):
    """Extractor for short submission URLs"""
    subcategory = "shorturl"
    pattern = BASE_PATTERN + r"/(\d+)$"
    example = "https://raddle.me/123"

    def __init__(self, match):
        PostmillExtractor.__init__(self, match)
        self.post_id = match.group(3)

    def items(self):
        url = self.root + "/" + self.post_id
        response = self.request(url, method="HEAD", allow_redirects=False)
        full_url = text.urljoin(url, response.headers["Location"])
        yield Message.Queue, full_url, {"_extractor": PostmillPostExtractor}


class PostmillHomeExtractor(PostmillSubmissionsExtractor):
    """Extractor for the home page"""
    subcategory = "home"
    pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE
    example = "https://raddle.me/"


class PostmillForumExtractor(PostmillSubmissionsExtractor):
    """Extractor for submissions on a forum"""
    subcategory = "forum"
    pattern = BASE_PATTERN + r"(/f/\w+)" + SORTING_RE
    example = "https://raddle.me/f/FORUM"


class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
    """Extractor for submissions made by a user"""
    subcategory = "usersubmissions"
    pattern = BASE_PATTERN + r"(/user/\w+/submissions)()" + QUERY_RE
    example = "https://raddle.me/user/USER/submissions"


class PostmillTagExtractor(PostmillSubmissionsExtractor):
    """Extractor for submissions on a forum with a specific tag"""
    subcategory = "tag"
    pattern = BASE_PATTERN + r"(/tag/\w+)" + SORTING_RE
    example = "https://raddle.me/tag/TAG"


class PostmillSearchExtractor(PostmillSubmissionsExtractor):
    """Extractor for search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"(/search)()\?(q=[^#]+)$"
    example = "https://raddle.me/search?q=QUERY"
    whitelisted_parameters = ("q",)
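
The query whitelist in acceptable_query() keeps the time filter "t", Postmill's "next[...]" pagination cursors, and anything a subclass whitelists; the same logic in isolation:

def acceptable_query(key, whitelisted=("q",)):
    return (key in whitelisted or key == "t"
            or (key.startswith("next[") and key.endswith("]")))

print([k for k in ("q", "t", "next[id]", "page", "utm_source")
       if acceptable_query(k)])  # ['q', 't', 'next[id]']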
@ -0,0 +1,211 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
"""Extractors for https://www.steamgriddb.com"""
|
||||||
|
|
||||||
|
from .common import Extractor, Message
|
||||||
|
from .. import text, exception
|
||||||
|
|
||||||
|
|
||||||
|
BASE_PATTERN = r"(?:https?://)?(?:www\.)?steamgriddb\.com"
|
||||||
|
LANGUAGE_CODES = (
|
||||||
|
"aa", "ab", "ae", "af", "ak", "am", "an", "ar", "as", "av", "ay", "az",
|
||||||
|
"ba", "be", "bg", "bh", "bi", "bm", "bn", "bo", "br", "bs", "ca", "ce",
|
||||||
|
"ch", "co", "cr", "cs", "cu", "cv", "cy", "da", "de", "dv", "dz", "ee",
|
||||||
|
"el", "en", "eo", "es", "et", "eu", "fa", "ff", "fi", "fj", "fo", "fr",
|
||||||
|
"fy", "ga", "gd", "gl", "gn", "gu", "gv", "ha", "he", "hi", "ho", "hr",
|
||||||
|
"ht", "hu", "hy", "hz", "ia", "id", "ie", "ig", "ii", "ik", "io", "is",
|
||||||
|
"it", "iu", "ja", "jv", "ka", "kg", "ki", "kj", "kk", "kl", "km", "kn",
|
||||||
|
"ko", "kr", "ks", "ku", "kv", "kw", "ky", "la", "lb", "lg", "li", "ln",
|
||||||
|
"lo", "lt", "lu", "lv", "mg", "mh", "mi", "mk", "ml", "mn", "mr", "ms",
|
||||||
|
"mt", "my", "na", "nb", "nd", "ne", "ng", "nl", "nn", "no", "nr", "nv",
|
||||||
|
"ny", "oc", "oj", "om", "or", "os", "pa", "pi", "pl", "ps", "pt", "qu",
|
||||||
|
"rm", "rn", "ro", "ru", "rw", "sa", "sc", "sd", "se", "sg", "si", "sk",
|
||||||
|
"sl", "sm", "sn", "so", "sq", "sr", "ss", "st", "su", "sv", "sw", "ta",
|
||||||
|
"te", "tg", "th", "ti", "tk", "tl", "tn", "to", "tr", "ts", "tt", "tw",
|
||||||
|
"ty", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi",
|
||||||
|
"yo", "za", "zh", "zu",
|
||||||
|
)
|
||||||
|
FILE_EXT_TO_MIME = {
|
||||||
|
"png": "image/png",
|
||||||
|
"jpeg": "image/jpeg",
|
||||||
|
"jpg": "image/jpeg",
|
||||||
|
"webp": "image/webp",
|
||||||
|
"ico": "image/vnd.microsoft.icon",
|
||||||
|
"all": "all",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class SteamgriddbExtractor(Extractor):
    """Base class for SteamGridDB"""
    category = "steamgriddb"
    directory_fmt = ("{category}", "{subcategory}", "{game[id]}")
    filename_fmt = "{game[id]}_{id}_{num:>02}.{extension}"
    archive_fmt = "{filename}"
    root = "https://www.steamgriddb.com"

    def _init(self):
        # URL-encoded {"adult":false} -- keep adult results disabled
        self.cookies_update({
            "userprefs": "%7B%22adult%22%3Afalse%7D",
        })

    def items(self):
        download_fake_png = self.config("download-fake-png", True)

        for asset in self.assets():
            if download_fake_png and asset.get("fake_png"):
                urls = (asset["url"], asset["fake_png"])
            else:
                urls = (asset["url"],)

            asset["count"] = len(urls)
            yield Message.Directory, asset
            for asset["num"], url in enumerate(urls, 1):
                yield Message.Url, url, text.nameext_from_url(url, asset)

    def _call(self, endpoint, **kwargs):
        data = self.request(self.root + endpoint, **kwargs).json()
        if not data["success"]:
            raise exception.StopExtraction(data["error"])
        return data["data"]

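items() follows gallery-dl's usual extractor protocol: one Message.Directory tuple carrying shared metadata, then one Message.Url tuple per file. A minimal sketch of a consumer (hypothetical; gallery-dl's real job classes additionally handle naming, archives, and postprocessors):

from gallery_dl.extractor.message import Message

def consume(extractor):
    # Print what a downloader job would receive (simplified sketch)
    for msg in extractor.items():
        if msg[0] == Message.Directory:
            print("directory metadata for game", msg[1]["game"]["id"])
        elif msg[0] == Message.Url:
            _, url, kwdict = msg
            print("file %s: %s" % (kwdict["num"], url))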
class SteamgriddbAssetsExtractor(SteamgriddbExtractor):
    """Base class for extracting a list of assets"""

    def __init__(self, match):
        SteamgriddbExtractor.__init__(self, match)
        list_type = match.group(1)
        id = int(match.group(2))
        self.game_id = id if list_type == "game" else None
        self.collection_id = id if list_type == "collection" else None
        self.page = int(match.group(3) or 1)

    def assets(self):
        limit = 48
        # page numbers in the URL are 1-based; the API expects a
        # 0-based index, clamped at 0
        page = max(self.page - 1, 0)

        sort = self.config("sort", "score_desc")
        if sort not in ("score_desc", "score_asc", "score_old_desc",
                        "score_old_asc", "age_desc", "age_asc"):
            raise exception.StopExtraction("Invalid sort '%s'", sort)

        json = {
            "static"  : self.config("static", True),
            "animated": self.config("animated", True),
            "humor"   : self.config("humor", True),
            "nsfw"    : self.config("nsfw", True),
            "epilepsy": self.config("epilepsy", True),
            "untagged": self.config("untagged", True),

            "asset_type": self.asset_type,
            "limit": limit,
            "order": sort,
        }
        if self.valid_dimensions:
            json["dimensions"] = self.config_list(
                "dimensions", "dimension", self.valid_dimensions)
        json["styles"] = self.config_list("styles", "style", self.valid_styles)
        json["languages"] = self.config_list(
            "languages", "language", LANGUAGE_CODES)
        file_types = self.config_list(
            "file-types", "file type", self.valid_file_types)
        json["mime"] = [FILE_EXT_TO_MIME[i] for i in file_types]

        if self.game_id:
            json["game_id"] = [self.game_id]
        else:
            json["collection_id"] = self.collection_id

        while True:
            json["page"] = page

            data = self._call(
                "/api/public/search/assets", method="POST", json=json)
            for asset in data["assets"]:
                if not asset.get("game"):
                    asset["game"] = data["game"]
                yield asset

            if data["total"] <= limit * page:
                break
            page += 1

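For reference, the search call above can be reproduced outside the extractor. A sketch with requests (assumption: the public endpoint answers plain anonymous POSTs the same way, session cookies aside; payload field names are taken from the dict built in assets(), the values are examples):

import requests

payload = {
    "static": True, "animated": True, "humor": True,
    "nsfw": True, "epilepsy": True, "untagged": True,
    "asset_type": "grid",
    "limit": 48, "order": "score_desc", "page": 0,
    "styles": ["all"], "dimensions": ["all"],
    "languages": ["all"], "mime": ["all"],
    "game_id": [1234],
}
resp = requests.post(
    "https://www.steamgriddb.com/api/public/search/assets",
    json=payload).json()
if resp["success"]:
    print(resp["data"]["total"], "assets total,",
          len(resp["data"]["assets"]), "on this page")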
    def config_list(self, key, type_name, valid_values):
        value = self.config(key)
        if isinstance(value, str):
            value = value.split(",")

        if value is None or "all" in value:
            return ["all"]

        for i in value:
            if i not in valid_values:
                raise exception.StopExtraction("Invalid %s '%s'", type_name, i)

        return value

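config_list accepts either a comma-separated string or a list, falls back to "all" when unset, and rejects unknown entries. A stand-alone sketch of the same normalization (hypothetical helper, minus gallery-dl's config lookup and exception type):

def normalize(value, valid_values):
    # Same logic as config_list above, outside the extractor
    if isinstance(value, str):
        value = value.split(",")
    if value is None or "all" in value:
        return ["all"]
    for i in value:
        if i not in valid_values:
            raise ValueError("Invalid value '%s'" % i)
    return value

print(normalize("png,webp", ("png", "jpeg", "jpg", "webp")))  # ['png', 'webp']
print(normalize(None, ("png", "ico")))                        # ['all']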
class SteamgriddbAssetExtractor(SteamgriddbExtractor):
    """Extractor for a single asset"""
    subcategory = "asset"
    pattern = BASE_PATTERN + r"/(grid|hero|logo|icon)/(\d+)"
    example = "https://www.steamgriddb.com/grid/1234"

    def __init__(self, match):
        SteamgriddbExtractor.__init__(self, match)
        self.asset_type = match.group(1)
        self.asset_id = match.group(2)

    def assets(self):
        endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id
        asset = self._call(endpoint)["asset"]
        return (asset,)

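The single-asset endpoint wraps its result under data["asset"], as _call's success/data envelope suggests. A direct request sketch (assumption: the endpoint is reachable without the extractor's session cookies):

import requests

resp = requests.get(
    "https://www.steamgriddb.com/api/public/asset/grid/1234").json()
if resp["success"]:
    print(resp["data"]["asset"]["url"])
else:
    print("error:", resp["error"])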
class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor):
    subcategory = "grids"
    asset_type = "grid"
    pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/grids(?:/(\d+))?"
    example = "https://www.steamgriddb.com/game/1234/grids"
    valid_dimensions = ("460x215", "920x430", "600x900", "342x482", "660x930",
                        "512x512", "1024x1024")
    valid_styles = ("alternate", "blurred", "no_logo", "material",
                    "white_logo")
    valid_file_types = ("png", "jpeg", "jpg", "webp")


class SteamgriddbHeroesExtractor(SteamgriddbAssetsExtractor):
    subcategory = "heroes"
    asset_type = "hero"
    pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/heroes(?:/(\d+))?"
    example = "https://www.steamgriddb.com/game/1234/heroes"
    valid_dimensions = ("1920x620", "3840x1240", "1600x650")
    valid_styles = ("alternate", "blurred", "material")
    valid_file_types = ("png", "jpeg", "jpg", "webp")


class SteamgriddbLogosExtractor(SteamgriddbAssetsExtractor):
    subcategory = "logos"
    asset_type = "logo"
    pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/logos(?:/(\d+))?"
    example = "https://www.steamgriddb.com/game/1234/logos"
    valid_dimensions = None
    valid_styles = ("official", "white", "black", "custom")
    valid_file_types = ("png", "webp")


class SteamgriddbIconsExtractor(SteamgriddbAssetsExtractor):
    subcategory = "icons"
    asset_type = "icon"
    pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/icons(?:/(\d+))?"
    example = "https://www.steamgriddb.com/game/1234/icons"
    valid_dimensions = ["{0}x{0}".format(i) for i in (8, 10, 14, 16, 20, 24,
                        28, 32, 35, 40, 48, 54, 56, 57, 60, 64, 72, 76, 80, 90,
                        96, 100, 114, 120, 128, 144, 150, 152, 160, 180, 192,
                        194, 256, 310, 512, 768, 1024)]
    valid_styles = ("official", "custom")
    valid_file_types = ("png", "ico")
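Each subclass differs only in its URL pattern and the valid_* whitelists it feeds into assets(). A quick regex check using the grids pattern verbatim, to show how the three groups map onto list type, ID, and page number in SteamgriddbAssetsExtractor.__init__:

import re

pattern = re.compile(
    r"(?:https?://)?(?:www\.)?steamgriddb\.com"
    r"/(game|collection)/(\d+)/grids(?:/(\d+))?")

m = pattern.match("https://www.steamgriddb.com/game/1234/grids/2")
print(m.groups())  # ('game', '1234', '2') -> list type, ID, page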
@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://tmohentai.com/"""

from .common import GalleryExtractor
from .. import text

BASE_PATTERN = r"(?:https?://)?tmohentai\.com"


class TmohentaiGalleryExtractor(GalleryExtractor):
    category = "tmohentai"
    root = "http://tmohentai.com"
    directory_fmt = ("{category}", "{title} ({gallery_id})")
    pattern = BASE_PATTERN + r"/(?:contents|reader)/(\w+)"
    example = "https://tmohentai.com/contents/12345a67b89c0"

    def __init__(self, match):
        self.gallery_id = match.group(1)
        url = "{}/contents/{}".format(self.root, self.gallery_id)
        GalleryExtractor.__init__(self, match, url)

    def images(self, page):
        # The first .format() inserts the gallery ID; the escaped
        # {:>03} slot is filled with a zero-padded index per image
        fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format(
            self.gallery_id).format
        # one "lanzador" element per page image
        cnt = page.count('class="lanzador')
        return [(fmt(i), None) for i in range(cnt)]

    def metadata(self, page):
        extr = text.extract_from(page)

        return {
            "gallery_id": self.gallery_id,
            "title"     : text.unescape(extr("<h3>", "<").strip()),
            "artists"   : text.split_html(extr(
                "<label>Artists and Artists Groups</label>", "</ul>")),
            # the site labels its genre list "Genders"
            "genres"    : text.split_html(extr(
                "<label>Genders</label>", "</ul>")),
            "tags"      : text.split_html(extr(
                "<label>Tags</label>", "</ul>")),
            "uploader"  : text.remove_html(extr(
                "<label>Uploaded By</label>", "</ul>")),
            "language"  : extr(" ", "\n"),
        }
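metadata() relies on gallery-dl's text.extract_from helper, which returns a closure that scans forward through the page; each call consumes text up to its end marker, so the extraction order above must match the document order. A minimal stand-in to illustrate that behavior (assumption: simplified, the real helper handles defaults and more edge cases):

def extract_from(page):
    pos = 0
    def extr(begin, end):
        # Find begin/end starting from where the last call stopped
        nonlocal pos
        try:
            first = page.index(begin, pos) + len(begin)
            last = page.index(end, first)
        except ValueError:
            return ""
        pos = last + len(end)
        return page[first:last]
    return extr

extr = extract_from("<h3>Title</h3><label>Tags</label><ul>a, b</ul>")
print(extr("<h3>", "<"))                      # Title
print(extr("<label>Tags</label>", "</ul>"))   # <ul>a, b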