利用Python爬取简书

1.不要频繁运行程序模拟登录

频繁模拟登录并识别验证码后,验证码会变得越来越模糊、难以识别,并且识别后点击"确认"按钮也无法登录(或者说登录按钮失效)。如图所示的位置失效:

sure_button

2.超级鹰

超级鹰打码平台的打码效率可以达到90%以上。在平台上注册并绑定微信后会赠送1000积分,基本够用了。如图是我的积分情况:

jifen

3.超级鹰软件ID和验证码类型

软件ID相当于工作牌(或护照),每次打码都必须携带;验证码类型需要你去超级鹰平台确认。例如,该项目的验证码类型属于 9004(坐标多选,返回1~4个坐标)。

4.识别思路(简要)

首先,获取验证码位置并获取网页截图;然后,裁剪获取验证码图像并以字节流的格式发送给超级鹰打码平台;最后,转化识别结果并使用Selenium点击登录。

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.common.exceptions import NoSuchElementException
from time import sleep
from PIL import Image
from io import BytesIO
from utils.chaojiying import Chaojiying_Client
from utils.config import *
'''
想要学习Python?Python学习交流群:984632579满足你的需求,资料都已经上传群文件,可以自行下载!
'''
class Crack_Jianshu(object):
    """Log in to Jianshu through Selenium, solving the click captcha via Chaojiying.

    Flow: open the login page and fill in the credentials, press the login
    button, screenshot the page and crop the captcha image, send the image
    bytes to the Chaojiying recognition service, click the returned
    coordinates on the captcha, then press the captcha confirm button.
    """

    def __init__(self):
        """Start Chrome and build the Chaojiying client from ``utils.config`` values."""
        self.url = URL
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, TIME_OUT)
        # Jianshu login account and password
        self.email = EMAIL
        self.password = PASSWORD
        # Chaojiying client object.
        # NOTE(review): the "CHAIJIYING_*" spelling differs from
        # "CHAOJIYING_PASSWORD"; the names are kept as-is because they must
        # match whatever utils.config defines -- confirm against that module.
        self.chaojiying = Chaojiying_Client(CHAIJIYING_USERNAME, CHAOJIYING_PASSWORD, CHAIJIYING_SOFT_ID)

    def __del__(self):
        """Shut the browser down when the object is garbage collected."""
        # __init__ may have failed before self.browser was assigned.
        browser = getattr(self, 'browser', None)
        if browser is not None:
            # quit() ends the whole driver session; close() would only close
            # the current window and leave the driver process running.
            browser.quit()

    def open(self):
        """Open the Jianshu web login page and type in the email and password.

        :return: None
        """
        self.browser.get(self.url)
        # Email input
        email = self.wait.until(EC.presence_of_element_located((By.ID, 'session_email_or_mobile_number')))
        # Password input
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'session_password')))
        # Type the email
        email.clear()
        email.send_keys(self.email)
        sleep(2)
        # Type the password
        password.clear()
        password.send_keys(self.password)
        sleep(2)

    def get_submit_btn(self):
        """Wait for the login button to become clickable.

        :return: the login button element
        """
        button = self.wait.until(EC.element_to_be_clickable((By.ID, 'sign-in-form-submit-btn')))
        return button

    def get_touclick_element(self):
        """Wait for the captcha image element to be present.

        :return: the captcha image element
        """
        element = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_item_img')))
        return element

    def get_code_position(self):
        """Compute the bounding box of the captcha image from its location and size.

        :return: ``[left, top, right, bottom]``
        """
        element = self.get_touclick_element()
        # Pause before reading the geometry (presumably to let the captcha
        # finish rendering -- TODO confirm).
        sleep(3)
        location = element.location
        size = element.size
        top, bottom = location['y'], location['y'] + size['height']
        left, right = location['x'], location['x'] + size['width']
        # Top-left and bottom-right corners of the captcha
        return [left, top, right, bottom]

    def get_screenshot(self):
        """Take a screenshot of the browser.

        :return: the screenshot as a PIL ``Image``
        """
        screenshot = self.browser.get_screenshot_as_png()
        return Image.open(BytesIO(screenshot))

    def get_touclick_image(self, name='captcha.png'):
        """Crop the captcha out of a page screenshot and save it to disk.

        :param name: file path the cropped captcha image is saved to
        :return: the cropped captcha as a PIL ``Image``
        """
        left, top, right, bottom = self.get_code_position()
        print('验证码位置:', left, top, right, bottom)
        # Image object of the page screenshot
        screenshot = self.get_screenshot()
        # NOTE(review): the crop uses element coordinates directly as
        # screenshot pixels; this presumably assumes a device pixel ratio of
        # 1 and an unscrolled page -- confirm on the target machine.
        jianshu_code = screenshot.crop((left, top, right, bottom))
        # Keep a copy on disk
        jianshu_code.save(name)
        return jianshu_code

    def get_points(self, captcha_result):
        """Parse the Chaojiying recognition result into integer coordinates.

        The ``pic_str`` value is split on ``|`` into groups and each group on
        ``,`` into integers, e.g. ``'10,20|30,40'`` -> ``[[10, 20], [30, 40]]``.

        :param captcha_result: recognition result mapping holding ``pic_str``
        :return: list of integer lists, one per group
        :raises ValueError: if ``pic_str`` is missing or empty, or a value is
            not an integer
        """
        pic_str = captcha_result.get('pic_str')
        if not pic_str:
            # Fail with a clear message instead of an opaque AttributeError
            # or int('') error further down.
            raise ValueError('captcha result has no pic_str: {!r}'.format(captcha_result))
        groups = pic_str.split('|')
        # Convert the string coordinates to integer coordinates
        locations = [[int(number) for number in group.split(',')] for group in groups]
        return locations

    def touch_click_words(self, locations):
        """Click each recognised position on the captcha image.

        :param locations: click positions as ``[x, y]`` pairs
        :return: None
        """
        for cnt, location in enumerate(locations, start=1):
            print('坐标点{}: {}'.format(cnt, location))
            # NOTE(review): the origin used by move_to_element_with_offset
            # (element top-left vs. element centre) depends on the installed
            # Selenium version -- confirm it matches the coordinate origin of
            # the recognition result.
            ActionChains(self.browser).move_to_element_with_offset(
                self.get_touclick_element(), location[0], location[1]).click().perform()
            sleep(1)

    def get_verifi_button(self):
        """Click the captcha confirm button.

        :return: None
        """
        # Wait until the button is clickable, not merely present in the DOM,
        # consistent with get_submit_btn.
        submit = self.wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@class="geetest_commit_tip"]')))
        submit.click()

    def get_article_info(self):
        """Placeholder; not implemented."""
        pass

    def connect_db(self):
        """Placeholder; not implemented."""
        pass

    def save_to_db(self):
        """Placeholder; not implemented."""
        pass

    def _login_once(self):
        """Run a single login attempt; raises if any step fails."""
        # Open the Jianshu login page
        self.open()
        # Click the login button
        button = self.get_submit_btn()
        button.click()
        # Grab the captcha image
        image = self.get_touclick_image()
        bytes_array = BytesIO()
        # Serialise it as a PNG byte stream
        image.save(bytes_array, format='PNG')
        # Recognise the captcha
        result = self.chaojiying.PostPic(bytes_array.getvalue(), CHAOJIYING_KIND)
        print("\n超级鹰识别结果:{}\n".format(result))
        locations = self.get_points(result)
        self.touch_click_words(locations)
        sleep(3)
        # Click the confirm button
        self.get_verifi_button()
        # Login success is judged by the presence of the "logo" element.
        # NOTE(review): confirm that this element is absent from the login
        # page itself, otherwise the check always passes.
        sleep(5)
        self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'logo')))

    def crack_login(self, max_retries=3):
        """Log in, retrying the whole flow when an attempt fails.

        ``WebDriverWait.until`` raises ``TimeoutException`` instead of
        returning a falsy value, so a failed attempt is detected by catching
        that exception (or the ``ValueError`` raised for an unusable
        recognition result) rather than by testing a return value.

        :param max_retries: number of extra attempts after the first failure
        :return: None
        :raises TimeoutException: if the last attempt timed out
        :raises ValueError: if the last attempt got an unusable recognition result
        """
        from selenium.common.exceptions import TimeoutException

        attempt = 0
        while True:
            try:
                self._login_once()
            except (TimeoutException, ValueError):
                if attempt >= max_retries:
                    raise
                attempt += 1
                # Retry after a failure
                print("-" * 50)
            else:
                print('\nSuccessful login!\n')
                return


if __name__ == '__main__':
    crack = Crack_Jianshu()
    crack.crack_login()

 


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部