[huaban] - formatting (flake8)

pull/3685/head
Frank Tang 2 years ago
parent 74c58e8e76
commit 855a872bc3

@ -7,18 +7,17 @@
"""Extractors for https://www.pinterest.com/""" """Extractors for https://www.pinterest.com/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import itertools
# Shared URL prefix for all huaban extractors: optional http(s) scheme,
# optional single subdomain label, then the literal host "huaban.com".
BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?huaban\.com"
class BaseExtractor(Extractor): class BaseExtractor(Extractor):
''' Base class for other extractors''' """Base class for other extractors"""
category = "huaban" category = "huaban"
root="https://huaban.com" root = "https://huaban.com"
api_root = "https://api.huaban.com" api_root = "https://api.huaban.com"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.url = match.group(0) self.url = match.group(0)
@ -31,97 +30,125 @@ class BaseExtractor(Extractor):
def api_request(self, url, *args, **kwargs):
    """Send a request to the huaban API.

    ``url`` is a path that is appended to ``self.api_root``.  All other
    positional and keyword arguments are forwarded to ``self.request``,
    whose return value is returned unchanged.

    A fixed set of default headers is always sent.  Headers supplied by
    the caller through ``headers=`` are merged on top of these defaults
    (caller values win) instead of being replaced by them, so that a
    ``Referer`` passed by the caller is actually part of the request.
    """
    headers = {
        "Accept": "application/json, text/plain, */*",
        # Implicit concatenation instead of a backslash continuation
        # inside the literal: with a backslash, the indentation of the
        # continuation line would become part of the header value.
        "Accept-Language": (
            "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,"
            "en-US;q=0.3,en;q=0.2"
        ),
        "Accept-Encoding": "gzip, deflate, br",
        "Origin": "https://huaban.com",
    }
    # Merge into a fresh dict so the caller's mapping is never mutated.
    headers.update(kwargs.get("headers") or {})
    kwargs["headers"] = headers
    return self.request(self.api_root + url, *args, **kwargs)
def pin_img_url(self, pin):
    """Build the image URL of a pin from its ``file`` entry.

    The ``bucket`` value becomes the subdomain of huaban.com and the
    ``key`` value becomes the path.
    """
    file_info = pin["file"]
    bucket, key = file_info["bucket"], file_info["key"]
    return "https://%s.huaban.com/%s" % (bucket, key)
class PinExtractor(BaseExtractor):
    """Extractor for the image of a single pin"""

    subcategory = "pin"
    pattern = BASE_PATTERN + "/pins/([0-9]+)"
    directory_fmt = ("{category}",)
    test = ("https://huaban.com/pins/4535272056",)

    def __init__(self, match):
        BaseExtractor.__init__(self, match)

    def items(self):
        """Yield the directory message, then the image URL of the pin."""
        response = self.api_request(
            "/pins/" + self.id, headers={"Referer": self.url}
        )
        pin = response.json()["pin"]
        yield Message.Directory, pin

        # filename and extension are derived from the pin's file entry
        # and added to the pin data before the URL message is emitted
        file_info = pin["file"]
        pin["filename"] = file_info["key"]
        pin["extension"] = file_info["type"].rpartition("/")[2]
        yield Message.Url, self.pin_img_url(pin), pin
class BoardExtractor(BaseExtractor):
    """Extractor for all images pinned to a board"""

    subcategory = "board"
    pattern = BASE_PATTERN + "/boards/([0-9]+)"
    directory_fmt = (
        "{category}",
        "{user[user_id]} {user[username]}",
        "{board_id} {title}",
    )
    test = ("https://huaban.com/boards/76275185",)

    def __init__(self, match):
        BaseExtractor.__init__(self, match)

    def items(self):
        """Yield the board's directory message, then one URL per pin.

        Pins are requested 20 at a time; every request after the first
        passes the ``pin_id`` of the last pin received as ``max``.
        Iteration stops at the first response with an empty pin list.
        """
        board_info = self.api_request(
            "/boards/" + self.id, headers={"Referer": self.url}
        ).json()
        yield Message.Directory, board_info["board"]

        # path of the first page; later pages add a "max" parameter
        page_path = "/boards/%s/pins?limit=20" % (self.id)
        while True:
            page = self.api_request(
                page_path, headers={"Referer": self.url}
            ).json()
            pins = page["pins"]
            if len(pins) == 0:
                return

            for pin in pins:
                file_info = pin["file"]
                pin["filename"] = file_info["key"]
                pin["extension"] = file_info["type"].rpartition("/")[2]
                yield Message.Url, self.pin_img_url(pin), pin

            # continue from the last pin of the page just processed
            page_path = "/boards/%s/pins?limit=20&max=%s" % (
                self.id,
                pins[-1]["pin_id"],
            )
class UserExtractor(BaseExtractor):
    """Extractor for the images of all boards of a user"""

    subcategory = "user"
    pattern = BASE_PATTERN + r"/user/([\w_]+[_\w\d]*)"
    directory_fmt = ("{category}", "{user_id} {username}")
    test = ("https://huaban.com/user/huaban",)

    def items(self):
        """Yield the user's directory message, then queue every board.

        Boards are requested 30 at a time; every request after the first
        passes the ``board_id`` of the last board received as ``max``.
        Iteration stops at the first response with an empty board list.
        """
        page = self.api_request(
            "/%s/boards?limit=30&urlname=%s" % (self.id, self.id),
            headers={"Referer": self.url},
        ).json()
        # the first response also carries the user's own metadata
        yield Message.Directory, page["user"]

        while True:
            boards = page["boards"]
            if len(boards) == 0:
                return

            for board in boards:
                board["_extractor"] = BoardExtractor
                board_url = "%s/boards/%s" % (self.root, board["board_id"])
                yield Message.Queue, board_url, {"_extractor": BoardExtractor}

            # continue from the last board of the page just processed
            next_path = "/%s/boards?max=%s&limit=30&urlname=%s" % (
                self.id,
                boards[-1]["board_id"],
                self.id,
            )
            page = self.api_request(
                next_path, headers={"Referer": self.url}
            ).json()

Loading…
Cancel
Save