I have recently been revisiting web scraping; this post is a follow-up to 【Python开发】利用Scrapy开发爬虫.
ChromeDriver download: http://chromedriver.chromium.org/
Mirror for users in China: https://registry.npmmirror.com/binary.html
Fixing the permission problem on macOS: copy chromedriver into your Python environment's bin directory, then run xattr -d com.apple.quarantine chromedriver in that directory
Full list of Chrome command-line switches: https://peter.sh/experiments/chromium-command-line-switches/#load-extension
Browse Chrome's internal pages: chrome://about/
Scraping Content with Cookies
After logging in once through Selenium, you can persist the cookies and load them directly on the next visit to scrape content without logging in again.
Update: the full code is posted below (tested on macOS):
After starting the browser in debug mode you can visit http://127.0.0.1:9000/json to verify it is listening; note that any already-open Chrome instances must be closed before starting it this way.
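As a quick sanity check, a small script along these lines should print the open DevTools targets once Chrome is running with the flag (the port matches the --remote-debugging-port value used in the code below):

```python
import json
from urllib.request import urlopen

# Assumes Chrome was started with --remote-debugging-port=9000
with urlopen("http://127.0.0.1:9000/json") as rsp:
    targets = json.loads(rsp.read())

# Each entry describes an open tab/page that Selenium can attach to
for target in targets:
    print(target.get("type"), target.get("url"))
```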
```python
import os
import pickle
import time

import scrapy
from selenium import webdriver
from selenium.webdriver import ActionChains

# ZHIHU_USER and ZHIHU_PWD are assumed to be defined elsewhere (e.g. in settings.py)


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']
    cookie_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
                               "cookies/zhihu.cookie")

    def parse(self, response):
        pass

    def start_requests(self):
        from selenium.webdriver.chrome.options import Options
        # Start Chrome first:
        # /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9000
        chrome_option = Options()
        chrome_option.add_argument("--disable-extensions")
        chrome_option.add_experimental_option("debuggerAddress", "127.0.0.1:9000")
        browser = webdriver.Chrome(executable_path="/Users/alan/Desktop/demo/chromedriver",
                                   chrome_options=chrome_option)
        browser.get("https://www.zhihu.com/signin")
        browser.find_element_by_css_selector(".SignFlow-tabs div:nth-child(2)").click()
        # TODO: read the cookies first and check whether they are still valid;
        # only log in and refresh the cookie file when they have expired
        self.login_success(browser)
        self.get_cookies(browser)
        cookies = pickle.load(open(self.cookie_path, "rb"))
        cookie_dict = {}
        for cookie in cookies:
            cookie_dict[cookie["name"]] = cookie["value"]
        return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]

    def login_success(self, browser):
        # Check whether we are logged in; if not, perform the login
        try:
            # Work around window-size problems
            browser.maximize_window()
        except:
            pass
        login_success = False
        while not login_success:
            # An element that only appears after login tells us whether we succeeded
            try:
                profile_ele = browser.find_element_by_class_name("AppHeader-notifications")
                login_success = True
            except:
                pass
            if not login_success:
                # Determine whether the captcha is Chinese or English
                try:
                    english_captcha_element = browser.find_element_by_class_name("Captcha-englishImg")
                except:
                    english_captcha_element = None
                try:
                    chinese_captcha_element = browser.find_element_by_class_name("Captcha-chineseImg")
                except:
                    chinese_captcha_element = None
                if chinese_captcha_element:
                    self.chinese_captcha(chinese_captcha_element, browser)
                elif english_captcha_element:
                    self.english_captcha(english_captcha_element, browser)

    def auto_login(self, browser, x=715, y=540):
        # Fill in and submit the login form
        user_element = browser.find_element_by_css_selector(".SignFlow-accountInput.Input-wrapper input")
        action_chains = ActionChains(browser)
        action_chains.double_click(user_element).perform()
        time.sleep(0.5)
        user_element.send_keys(ZHIHU_USER)
        pwd_element = browser.find_element_by_css_selector(".SignFlow-password input[name='password']")
        action_chains.double_click(pwd_element).perform()
        time.sleep(0.5)
        browser.find_element_by_css_selector(".SignFlow-password input[name='password']").send_keys(ZHIHU_PWD)
        # m.click(x, y, 1)
        browser.find_element_by_css_selector(".Button.SignFlow-submitButton").click()

    def get_cookies(self, browser):
        # Fetch the cookies and persist them to disk
        browser.get("https://www.zhihu.com/")
        cookies = browser.get_cookies()
        pickle.dump(cookies, open(self.cookie_path, "wb"))
        cookie_dict = {}
        for cookie in cookies:
            cookie_dict[cookie["name"]] = cookie["value"]

    def chinese_captcha(self, chinese_captcha_element, browser):
        # Captcha position relative to the top-left corner of the viewport
        ele_position = chinese_captcha_element.location
        x_relative = ele_position["x"]
        y_relative = ele_position["y"]
        # Run a JS snippet to get the height of the browser toolbar and address bar
        browser_navigation_panel_height = browser.execute_script(
            # 'return window.outerHeight - window.innerHeight'  # excludes the macOS menu bar
            'return window.screen.height - window.innerHeight'
        )
        base64_text = chinese_captcha_element.get_attribute("src")
        import base64
        code = base64_text.replace("data:image/jpg;base64,", "").replace("%0A", "")
        if code != 'null':
            fh = open("yzm_cn.jpeg", "wb")
            fh.write(base64.b64decode(code))
            fh.close()
            from zheye import zheye
            z = zheye()
            positions = z.Recognize('yzm_cn.jpeg')
            last_position = []
            action_chains = ActionChains(browser)
            if len(positions) >= 2:
                # Fix the x/y order of the coordinates and the order of the characters
                if positions[0][1] > positions[1][1]:
                    last_position.append([positions[1][1], positions[1][0]])
                    last_position.append([positions[0][1], positions[0][0]])
                else:
                    last_position.append([positions[0][1], positions[0][0]])
                    last_position.append([positions[1][1], positions[1][0]])
                # TODO: scale by the actual ratio of image size to displayed size
                # (the on-screen image is displayed at half its actual size)
                first_position = [int(last_position[0][0] / 2), int(last_position[0][1] / 2)]
                second_position = [int(last_position[1][0] / 2), int(last_position[1][1] / 2)]
                time.sleep(1)
                action_chains.move_to_element_with_offset(
                    chinese_captcha_element, first_position[0], first_position[1]).click().perform()
                # m.click((x_relative + first_position[0]),
                #         (y_relative + browser_navigation_panel_height + first_position[1]))
                time.sleep(1)
                action_chains.move_to_element_with_offset(
                    chinese_captcha_element, second_position[0], second_position[1]).click().perform()
                # m.click((x_relative + second_position[0]),
                #         (y_relative + browser_navigation_panel_height + second_position[1]))
            else:
                last_position.append([positions[0][1], positions[0][0]])
                first_position = [int(last_position[0][0] / 2), int(last_position[0][1] / 2)]
                action_chains.move_to_element_with_offset(
                    chinese_captcha_element, first_position[0], first_position[1]).click().perform()
                # m.click(x_relative + first_position[0],
                #         y_relative + browser_navigation_panel_height + first_position[1])
            self.auto_login(browser, 710, 590)

    def english_captcha(self, english_captcha_element, browser):
        base64_text = english_captcha_element.get_attribute("src")
        import base64
        code = base64_text.replace("data:image/jpg;base64,", "").replace("%0A", "")
        if code != 'null':
            fh = open("yzm_en.jpeg", "wb")
            fh.write(base64.b64decode(code))
            fh.close()
            # Solve the English captcha through a wrapper around the YunDaMa service.
            # TODO: switch to an open-source OCR API. Zhihu's captcha appears to be
            # pre-processed; neither Baidu OCR nor tesseract recognizes it reliably.
            from ArticleSpider.utils.yudama_requests import YDMHttp
            yundama = YDMHttp()
            code = yundama.decode("yzm_en.jpeg", 5000, 60)
            while True:
                if code == "":
                    code = yundama.decode("yzm_en.jpeg", 5000, 60)
                else:
                    break
            captcha_element = browser.find_element_by_css_selector(
                '.SignFlow-captchaContainer input[name="captcha"]')
            action_chains = ActionChains(browser)
            action_chains.double_click(captcha_element).perform()
            captcha_element.send_keys(code)
            self.auto_login(browser)
```
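For the TODO above (reuse saved cookies and only log in when they have expired), a validity check could look roughly like this. This is an untested sketch added as a method on ZhihuSpider; the name cookies_still_valid is mine, and it reuses the .AppHeader-notifications marker from login_success:

```python
def cookies_still_valid(self, browser):
    # Hypothetical helper: load the saved cookies, reload the home page,
    # and look for an element that only exists for logged-in users
    if not os.path.exists(self.cookie_path):
        return False
    # add_cookie() only works once the browser is on the cookie's domain
    browser.get("https://www.zhihu.com/")
    for cookie in pickle.load(open(self.cookie_path, "rb")):
        browser.add_cookie({"name": cookie["name"], "value": cookie["value"]})
    browser.get("https://www.zhihu.com/")
    try:
        browser.find_element_by_class_name("AppHeader-notifications")
        return True
    except Exception:
        return False
```

start_requests would then call self.login_success(browser) and self.get_cookies(browser) only when this returns False.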
For the cookies to work, settings.py needs the following configuration:
```python
COOKIES_ENABLED = True
COOKIES_DEBUG = True  # optional

DOWNLOADER_MIDDLEWARES = {
    # 'ArticleSpider.middlewares.ArticlespiderDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
}

USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
```
The Zhihu login code above still has room for optimization, but both the Chinese and English captcha paths work; it is split into several methods to make the source easier to read and reuse. While writing it, my macOS machine's support for PyUserInput appears to have changed: simulated clicks stopped working, and open-source alternatives such as pynput also failed to do the job in my tests.
In the end I fell back on Selenium's built-in ActionChains, using offsets to click the upside-down Chinese characters and double-clicks to avoid re-typing over pre-filled input. As a bonus, this removes the need to add the browser toolbar height and the macOS menu bar height to the coordinate calculations. The PyUserInput and toolbar-height code is kept above (commented out) in case it still works on other machines.
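For reference, the two ActionChains patterns mentioned here reduce to something like the following (element and the offsets are placeholders; note that in the Selenium 3 API used in this post, the offset is measured from the element's top-left corner):

```python
from selenium.webdriver import ActionChains

# Click at an (x, y) offset from the element's top-left corner; no
# browser-toolbar or macOS-menu-bar heights enter the calculation
ActionChains(browser).move_to_element_with_offset(element, 30, 45).click().perform()

# Double-click an input to select its pre-filled text so that
# send_keys() replaces it instead of appending to it
ActionChains(browser).double_click(element).perform()
element.send_keys("new value")
```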
Headless Mode
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless")
# Optional: set the window size / resolution
chrome_options.add_argument("--window-size=1920,1080")
# Optional: skip loading images to speed things up
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option('prefs', prefs)

driver = webdriver.Chrome(chrome_options=chrome_options)
```
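A quick way to confirm the headless setup works (the URL is just an example):

```python
driver.get("https://www.zhihu.com/")
print(driver.title)  # the title resolves even without a visible window
driver.save_screenshot("headless.png")  # useful for seeing what headless Chrome renders
driver.quit()
```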
Recognizing Zhihu's Upside-Down Characters
https://github.com/muchrooms/zheye
Download zheye, place it in the project directory, and install its dependencies.
For the integration, see the chinese_captcha method above; a minimal standalone sketch follows.
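The standalone usage follows the project's README (the file name here is an example):

```python
from zheye import zheye

z = zheye()
# Returns the positions of the upside-down characters in the image;
# judging from the chinese_captcha code above, each entry is a (y, x)
# coordinate in the image's actual pixel size
positions = z.Recognize('yzm_cn.jpeg')
print(positions)
```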
Common Issues
1. fake_useragent.errors.FakeUserAgentError: Maximum amount of retries reached
There are many ways to fix this. One is to download the JSON file manually and point fake_useragent at it (here using the directory /home/alan/):
```python
location = '/home/alan/fake_useragent_%s.json' % fake_useragent.VERSION
self.ua = fake_useragent.UserAgent(path=location)
```
Alternatively, download the file, rename it to fake_useragent_<version>.json (e.g. fake_useragent_0.1.11.json), and place it in the temporary directory, which Python's built-in tempfile.gettempdir() returns; the path argument is then no longer needed, as the sketch below shows.
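For example, the expected cache path can be computed like this (the exact layout applies to fake_useragent 0.1.x):

```python
import os
import tempfile

import fake_useragent

# fake_useragent looks for its cached JSON at this path when no
# explicit path argument is given
cache_path = os.path.join(
    tempfile.gettempdir(),
    'fake_useragent_%s.json' % fake_useragent.VERSION,
)
print(cache_path)  # e.g. /tmp/fake_useragent_0.1.11.json
```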
Building on the second approach, we can also write a few lines that perform the download automatically when our custom middleware is initialized:
```python
try:
    self.ua = fake_useragent.UserAgent()
except:
    # fake_useragent.settings.DB is the cache path in the temp directory
    file_name = fake_useragent.settings.DB
    import urllib.request
    rsp = urllib.request.urlopen(fake_useragent.settings.CACHE_SERVER)
    json_content = rsp.read()
    with open(file_name, 'wb') as f:
        f.write(json_content)
    # Retry now that the cache file is in place
    self.ua = fake_useragent.UserAgent()
```
Note that on macOS this may fail with:
```
urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate
```
In that case, run the following command and try again (replace 3.6 with your Python version):
```bash
/Applications/Python\ 3.6/Install\ Certificates.command
```
In essence this installs/upgrades the certifi package, so a command like the following achieves the same effect:
```bash
python3.6 -m pip install --upgrade certifi
```
If the error is instead

```
ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1108)
```

you can disable certificate verification globally (use with care):

```python
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
```
2. Chrome version must be between 70 and 73
chromedriver does not match the locally installed Chrome; upgrade Chrome or download the matching chromedriver version.
Download: https://sites.google.com/a/chromium.org/chromedriver/downloads
Then place the chromedriver binary in the corresponding Python bin directory.
3. AttributeError: module 'lib' has no attribute 'X509_V_FLAG_CB_ISSUER_CHECK'
```bash
python3 -m pip install pip --upgrade
pip install pyopenssl --upgrade
```