[2chen] Add 2chen.moe extractor (#2707)

* [2chen] Add 2chen.moe extractor

* change "==" to is

* fix for "test_unique_pattern_matches"

* fix regex pattern and group matching

* fix regex again

* [2chen] add 'reply_no' and 'hash' metadata and change 'filename_fmt'

also made an entry in supportedsites.md

* [2chen] unescape 'title'

* [2chen] partition() -> rpartition()

* [2chen] extract 'date' and 'name' metadata

* [2chen] remove 'offset' argument

* [2chen] do some changes

* [2chen] do some more changes

* [2chen] unescape 'name' and 'filename'
pull/3028/head
enduser420 2 years ago committed by GitHub
parent f7ba19a1c0
commit f0321f423d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -13,6 +13,12 @@ Consider all sites to be NSFW unless otherwise known.
</tr>
</thead>
<tbody valign="top">
<tr>
<td>2chen</td>
<td>https://2chen.moe/</td>
<td>Boards, Threads</td>
<td></td>
</tr>
<tr>
<td>35PHOTO</td>
<td>https://35photo.pro/</td>

@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://2chen.moe/"""
from .common import Extractor, Message
from .. import text
class _2chenThreadExtractor(Extractor):
    """Extractor for 2chen threads"""
    category = "2chen"
    subcategory = "thread"
    directory_fmt = ("{category}", "{board}", "{thread} {title}")
    filename_fmt = "{time} {filename}.{extension}"
    archive_fmt = "{hash}"
    root = "https://2chen.moe"
    # group 1 -> board name, group 2 -> numeric thread id
    pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)"
    test = (
        ("https://2chen.moe/jp/303786", {
            "count": ">= 10",
        }),
    )

    def __init__(self, match):
        """Store the board name and thread id captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.board = match.group(1)
        self.thread = match.group(2)

    def items(self):
        """Yield one Directory message, then a Url message per post file"""
        thread_url = "{}/{}/{}".format(self.root, self.board, self.thread)
        html = self.request(thread_url, encoding="utf-8").text
        thread_info = self.metadata(html)
        yield Message.Directory, thread_info

        for post in self.posts(html):
            # only posts that carry a file link are emitted
            if post["url"]:
                post.update(thread_info)
                # the extracted link is joined onto the site root
                file_url = self.root + post["url"]
                post["url"] = file_url
                # 'time' is the post date as a timestamp passed through
                # text.parse_int(); it feeds "{time}" in 'filename_fmt'
                post["time"] = text.parse_int(post["date"].timestamp())
                yield Message.Url, file_url, text.nameext_from_url(
                    post["filename"], post)

    def metadata(self, page):
        """Return the board, thread id and title shared by all posts"""
        board, offset = text.extract(page, 'class="board">/', '/<')
        # the title is searched for starting at the position returned above
        raw_title = text.extract(page, "<h3>", "</h3>", offset)[0]
        return {
            "board": board,
            "thread": self.thread,
            "title": text.unescape(raw_title),
        }

    def posts(self, page):
        """Return an iterable of parsed post dicts found in 'page'"""
        fragments = text.extract_iter(
            page, 'class="glass media', '</article>')
        return (self.parse(fragment) for fragment in fragments)

    def parse(self, post):
        """Build a metadata dict from the markup of a single post"""
        # NOTE(review): text.extract_from() presumably returns a stateful
        # extractor that advances through 'post' -- the call order below
        # mirrors the order of the fields in the markup and must be kept
        extr = text.extract_from(post)

        name = extr("<span>", "</span>")
        # keep only what follows the closing '>' of the <time ...> tag
        date_string = extr("<time", "<").partition(">")[2]
        number = extr('href="#p', '"')
        url = extr('</span><a href="', '"')
        filename = extr('download="', '"')
        file_hash = extr('data-hash="', '"')

        return {
            "name": text.unescape(name),
            "date": text.parse_datetime(
                date_string, "%d %b %Y (%a) %H:%M:%S"),
            "no": number,
            "url": url,
            "filename": text.unescape(filename),
            "hash": file_hash,
        }
class _2chenBoardExtractor(Extractor):
    """Extractor for 2chen boards"""
    category = "2chen"
    subcategory = "board"
    root = "https://2chen.moe"
    # group 1 -> board name; an optional "/catalog" suffix and a trailing
    # slash are accepted
    pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)(?:/catalog)?/?$"
    test = (
        ("https://2chen.moe/co/", {
            "pattern": _2chenThreadExtractor.pattern
        }),
        ("https://2chen.moe/co"),
        ("https://2chen.moe/co/catalog")
    )

    def __init__(self, match):
        """Store the board name captured by 'pattern'"""
        Extractor.__init__(self, match)
        self.board = match.group(1)

    def items(self):
        """Yield a Queue message for every thread link in the catalog"""
        catalog_url = "{}/{}/catalog".format(self.root, self.board)
        html = self.request(catalog_url).text
        # the same dict is attached to every queued thread URL; it names
        # the extractor class associated with those URLs
        queue_info = {"_extractor": _2chenThreadExtractor}

        thread_paths = text.extract_iter(html, '<figure><a href="', '"')
        for path in thread_paths:
            yield Message.Queue, self.root + path, queue_info

@ -10,6 +10,7 @@ import re
modules = [
"2chan",
"2chen",
"35photo",
"3dbooru",
"420chan",

Loading…
Cancel
Save