class ZhihuSpider(scrapy.Spider):
    """Spider that signs in to Zhihu through Selenium and crawls with the session cookies.

    ``start_requests`` attaches Selenium to a Chrome instance that is already
    running with remote debugging enabled, drives the sign-in form (solving
    the captcha when one is shown), stores the browser cookies in
    ``cookie_path`` and issues the first Scrapy request with those cookies.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']
    # Pickle file that receives the cookies exported from the browser.
    cookie_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "cookies/zhihu.cookie")
    # Location of the chromedriver binary; override per machine.
    chromedriver_path = "/Users/alan/Desktop/demo/chromedriver"
    # host:port of the Chrome remote-debugging endpoint to attach to.
    debugger_address = "127.0.0.1:9000"
    # Seconds to wait between two login-state checks in login_success().
    login_poll_interval = 0.5

    def parse(self, response):
        """Default response callback; intentionally a no-op for now."""
        pass

    def start_requests(self):
        """Log in through the browser and return the initial authenticated request.

        Returns:
            A one-element list holding a ``scrapy.Request`` for
            ``start_urls[0]`` that carries the browser's cookies.
        """
        from selenium.common.exceptions import NoSuchElementException
        from selenium.webdriver.chrome.options import Options

        # Chrome must already be running with remote debugging enabled, e.g.:
        # /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9000
        chrome_option = Options()
        chrome_option.add_argument("--disable-extensions")
        chrome_option.add_experimental_option("debuggerAddress", self.debugger_address)
        browser = webdriver.Chrome(executable_path=self.chromedriver_path, chrome_options=chrome_option)
        browser.get("https://www.zhihu.com/signin")
        try:
            # Switch to the second sign-in tab.
            browser.find_element_by_css_selector(".SignFlow-tabs div:nth-child(2)").click()
        except NoSuchElementException:
            # NOTE(review): presumably the tab is missing when the attached
            # browser already holds a session; login_success() decides below.
            self.logger.debug("Sign-in tab not found; continuing with the login check")
        # TODO: load the stored cookies first and only log in again (refreshing
        # the cookie file) when they are no longer valid.
        self.login_success(browser)
        cookie_dict = self.get_cookies(browser)
        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]

    @staticmethod
    def _find_by_class(browser, class_name):
        """Return the first element with ``class_name``, or ``None`` when absent."""
        from selenium.common.exceptions import NoSuchElementException

        try:
            return browser.find_element_by_class_name(class_name)
        except NoSuchElementException:
            return None

    @staticmethod
    def _save_captcha_image(captcha_element, file_name):
        """Write the captcha's inline base64 image to ``file_name``.

        Returns:
            ``True`` when an image was written, ``False`` when the element's
            ``src`` is missing, empty or the placeholder ``'null'``.
        """
        import base64

        src = captcha_element.get_attribute("src")
        if not src:
            return False
        code = src.replace("data:image/jpg;base64,", "").replace("%0A", "")
        if not code or code == 'null':
            return False
        with open(file_name, "wb") as image_file:
            image_file.write(base64.b64decode(code))
        return True

    def login_success(self, browser):
        """Block until the browser shows a logged-in page, logging in as needed.

        The logged-in state is detected through the ``AppHeader-notifications``
        element. While it is absent, a visible Chinese or English captcha is
        solved and the credentials are submitted.

        NOTE(review): when no captcha is shown nothing is submitted and the
        loop only keeps polling; confirm whether ``auto_login`` should be
        called in that case.
        """
        from selenium.common.exceptions import WebDriverException

        try:
            # Works around element positions being off in a small window.
            browser.maximize_window()
        except WebDriverException:
            # Best effort only: the login flow still works without it.
            self.logger.debug("Could not maximize the browser window", exc_info=True)

        while self._find_by_class(browser, "AppHeader-notifications") is None:
            # Work out which captcha flavour (if any) is on screen.
            english_captcha_element = self._find_by_class(browser, "Captcha-englishImg")
            chinese_captcha_element = self._find_by_class(browser, "Captcha-chineseImg")
            if chinese_captcha_element is not None:
                self.chinese_captcha(chinese_captcha_element, browser)
            elif english_captcha_element is not None:
                self.english_captcha(english_captcha_element, browser)
            # Avoid hammering the driver while the page is loading.
            time.sleep(self.login_poll_interval)

    def auto_login(self, browser, x=715, y=540):
        """Type the account name and password into the sign-in form and submit it.

        Args:
            browser: Selenium WebDriver showing the sign-in form.
            x: Unused; kept for backward compatibility with existing callers.
            y: Unused; kept for backward compatibility with existing callers.
        """
        # A fresh ActionChains per gesture: depending on the Selenium version a
        # chain may keep already-performed actions and replay them on the next
        # perform().
        user_element = browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input")
        # Presumably the double-click selects any existing text so that
        # send_keys replaces it instead of appending.
        ActionChains(browser).double_click(user_element).perform()
        time.sleep(0.5)
        user_element.send_keys(ZHIHU_USER)

        pwd_element = browser.find_element_by_css_selector(".SignFlow-password input[name='password']")
        ActionChains(browser).double_click(pwd_element).perform()
        time.sleep(0.5)
        # Looked up again rather than reusing pwd_element, matching the
        # original flow in case the field was re-rendered.
        browser.find_element_by_css_selector(".SignFlow-password input[name='password']").send_keys(ZHIHU_PWD)

        browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()

    def get_cookies(self, browser):
        """Export the browser's cookies to ``cookie_path`` and return them.

        Args:
            browser: Selenium WebDriver holding the logged-in session.

        Returns:
            A dict mapping each cookie name to its value.
        """
        browser.get("https://www.zhihu.com/")
        cookies = browser.get_cookies()
        # Make sure the target directory exists before writing the file.
        os.makedirs(os.path.dirname(self.cookie_path), exist_ok=True)
        with open(self.cookie_path, "wb") as cookie_file:
            pickle.dump(cookies, cookie_file)
        return {cookie["name"]: cookie["value"] for cookie in cookies}

    def chinese_captcha(self, chinese_captcha_element, browser):
        """Solve the Chinese captcha by clicking the recognized positions, then log in.

        The captcha image is saved to ``yzm_cn.jpeg`` and passed to ``zheye``.
        At most the first two recognized positions are clicked, left to right.
        Returns without submitting when no image or no position is available,
        so that the caller's loop can try again.

        Args:
            chinese_captcha_element: The captcha ``<img>`` WebElement.
            browser: Selenium WebDriver showing the sign-in form.
        """
        if not self._save_captcha_image(chinese_captcha_element, "yzm_cn.jpeg"):
            return

        from zheye import zheye
        positions = zheye().Recognize('yzm_cn.jpeg')
        if len(positions) == 0:
            self.logger.warning("No position recognized in the Chinese captcha")
            return

        # Each recognized entry is swapped into [x, y] order (presumably zheye
        # reports (y, x) - TODO confirm) and ordered left to right; the sort is
        # stable, so entries with equal x keep their order.
        points = sorted(([pos[1], pos[0]] for pos in positions[:2]), key=lambda point: point[0])

        # TODO: derive the factor from the real and the displayed image size.
        # The image is displayed at half of its actual size.
        display_scale = 2
        for point_x, point_y in points:
            time.sleep(1)
            offset_x = int(point_x / display_scale)
            offset_y = int(point_y / display_scale)
            # NOTE(review): offsets are used as-is; confirm which origin
            # (top-left or center) the installed Selenium version applies.
            # A new chain per click so an earlier click is never replayed.
            ActionChains(browser).move_to_element_with_offset(
                chinese_captcha_element, offset_x, offset_y
            ).click().perform()

        self.auto_login(browser, 710, 590)

    def english_captcha(self, english_captcha_element, browser):
        """Solve the English captcha through the YunDaMa service, then log in.

        The captcha image is saved to ``yzm_en.jpeg`` and decoded remotely;
        decoding is retried for as long as the service returns an empty string.
        Returns without submitting when no image is available.

        Args:
            english_captcha_element: The captcha ``<img>`` WebElement.
            browser: Selenium WebDriver showing the sign-in form.
        """
        if not self._save_captcha_image(english_captcha_element, "yzm_en.jpeg"):
            return

        # Uses the wrapped YunDaMa platform. TODO: switch to an open-source OCR.
        # Zhihu's captcha appears to be post-processed; Baidu OCR and tesseract
        # both gave poor results on it.
        from ArticleSpider.utils.yudama_requests import YDMHttp
        yundama = YDMHttp()
        # NOTE(review): 5000 and 60 are presumably the code type and the
        # timeout in seconds - confirm against YDMHttp.decode.
        code = yundama.decode("yzm_en.jpeg", 5000, 60)
        while code == "":
            code = yundama.decode("yzm_en.jpeg", 5000, 60)

        captcha_element = browser.find_element_by_css_selector('.SignFlow-captchaContainer input[name="captcha"]')
        ActionChains(browser).double_click(captcha_element).perform()
        captcha_element.send_keys(code)
        self.auto_login(browser)