Selenium获取PTA平台所有题集和答案存为json文件
注意cv2库是opencv-python

使用方法:在代码最下方的url列表中填写要爬取的题集链接,该题集必须已经提交完毕。
最后的json文件自己建,里面存一对花括号
网站的网页结构变了,有些css类名也变了,所以代码稍微改了一下。
爬取代码 2021-12-12更新可用
import json
import os
import time
import traceback

import cv2
import numpy
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Global delay (in seconds) to wait before each page request, so that the
# site does not reject us for requesting too fast.
my_time = 3.5  # seconds

# CSS selectors shared by the question-list pages.
QUESTION_ITEM_SELECTOR = "[class='pc-x pc-dtfd-ipt min-w-0']"
MARKDOWN_SELECTOR = "[class='rendered-markdown']"
CHOICE_INPUT_SELECTOR = "[class='mr-2 mt-1 focus:outline-none']"
VERDICT_SELECTOR = "[class='pc-text inline']"
TEXT_RAW_SELECTOR = "[class='pc-text-raw']"

# Tab caption shown on the site -> key used in the question-type dictionary.
QUESTION_TYPE_KEYS = {
    '单选题': "single_choice_url",
    '判断题': "judgment_url",
    '填空题': "fill_in_the_blanks_url",
    '程序填空题': "program_fill_in_the_blanks_url",
    '函数题': "function_url",
    '编程题': "programming_url",
}


def login_PTA(my_account, my_password):
    """Fill in the login form, submit it and solve the slider captcha.

    Uses the module-level ``web`` driver and ``login_url`` that the
    ``__main__`` section creates. After submitting, the captcha solver is
    retried up to five times, stopping as soon as the browser has left the
    login page.

    Args:
        my_account: account name typed into the first form input.
        my_password: password typed into the second form input.
    """
    account = web.find_element(
        By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/form/div[1]/div/div/div[1]/div/div/div/input')
    password = web.find_element(
        By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/form/div[1]/div/div/div[2]/div/div/div/input')
    account.send_keys(my_account)
    password.send_keys(my_password)
    web.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/form/div[2]/button').click()
    # Locate the login button and click it.
    web.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/form/div[2]/button/div/div').click()
    print("ok")
    for _ in range(5):
        # Wait for the captcha to load; tune the interval to the network speed.
        time.sleep(3)
        # A changed URL means the login already succeeded.
        if web.current_url != login_url:
            break
        cracking_captcha()


def cracking_captcha():
    """Solve the slider captcha by template-matching the piece on the background.

    Downloads both captcha images to ``bg.jpg`` / ``front.jpg`` in the working
    directory, finds the best match position with OpenCV and drags the slider
    horizontally by the matched column. The match is not always exact, which
    is why the caller retries in a loop.
    """
    # Background image.
    bg_img_src = web.find_element(
        By.XPATH, '/html/body/div[3]/div[2]/div/div/div[2]/div/div[1]/div/div[1]/img[1]').get_attribute('src')
    # Draggable foreground piece.
    front_img_src = web.find_element(
        By.XPATH, '/html/body/div[3]/div[2]/div/div/div[2]/div/div[1]/div/div[1]/img[2]').get_attribute('src')

    # Save both images to disk.
    with open("bg.jpg", mode="wb") as f:
        f.write(requests.get(bg_img_src).content)
    with open("front.jpg", mode="wb") as f:
        f.write(requests.get(front_img_src).content)

    # Load them back and reduce both to a single grayscale channel.
    bg = cv2.cvtColor(cv2.imread("bg.jpg"), cv2.COLOR_BGR2GRAY)
    front = cv2.cvtColor(cv2.imread("front.jpg"), cv2.COLOR_BGR2GRAY)
    # Keep only the rows of the piece that contain at least one non-zero pixel.
    front = front[front.any(1)]

    # Best-scoring position of the piece inside the background.
    result = cv2.matchTemplate(bg, front, cv2.TM_CCOEFF_NORMED)
    # unravel_index yields (row, column); the column is the horizontal offset.
    _row, col = numpy.unravel_index(numpy.argmax(result), result.shape)

    # Draggable slider handle.
    div = web.find_element(By.XPATH, '/html/body/div[3]/div[2]/div/div/div[2]/div/div[2]/div[2]')
    # NOTE(review): 0.946 looks like an image-pixel to on-screen-pixel scale
    # factor - confirm. int() makes the offset a plain Python integer.
    ActionChains(web).drag_and_drop_by_offset(div, xoffset=int(col // 0.946), yoffset=0).perform()


def get_question_type_url(headers_collection_url):
    """Return the URL of every question-type tab of one problem set.

    Args:
        headers_collection_url: URL of any page of the problem set.

    Returns:
        A dict with the keys ``single_choice_url``, ``judgment_url``,
        ``fill_in_the_blanks_url``, ``program_fill_in_the_blanks_url``,
        ``function_url`` and ``programming_url``; a type that the problem set
        does not have maps to an empty string.
    """
    time.sleep(my_time)
    web.get(headers_collection_url)

    question_type_dict = {key: "" for key in QUESTION_TYPE_KEYS.values()}
    tabs = web.find_element(
        By.CSS_SELECTOR, "[class='pc-h container_3U5RB pc-gap-default']"
    ).find_elements(By.CSS_SELECTOR, "a")
    for tab in tabs:
        type_name = tab.find_element(By.CSS_SELECTOR, TEXT_RAW_SELECTOR).text
        key = QUESTION_TYPE_KEYS.get(type_name)
        if key is not None:
            question_type_dict[key] = tab.get_attribute('href')
    return question_type_dict


def infer_judgment_answer(is_select_T, verdict):
    """Derive the correct answer of a true/false question from a graded submission.

    Args:
        is_select_T: whether the submitted choice was "T".
        verdict: grading text shown by the site; ``"答案正确"`` means the
            submitted choice was right, anything else means it was wrong.

    Returns:
        ``"T"`` or ``"F"``: the submitted choice when it was graded correct,
        the opposite choice otherwise.
    """
    submitted_was_right = verdict == "答案正确"
    return "T" if is_select_T == submitted_was_right else "F"


def _print_skipped(last_ok, url):
    """Print the traceback of the current error and the 'question skipped' notice."""
    traceback.print_exc()
    print(f"当前题目获取失败, 上一题序号: {last_ok}-序号, 当前题集:{url}, 程序跳过该题继续执行")


def get_judgment(judgment_url):
    """Scrape the true/false questions of a problem set, auto-correcting wrong answers.

    Args:
        judgment_url: URL of the true/false tab.

    Returns:
        A dict mapping the question text to ``"T"`` or ``"F"``.
    """
    time.sleep(my_time)
    web.get(judgment_url)
    questions_dict = {}
    success_num = 0
    fail_num = 0
    for judgment in web.find_elements(By.CSS_SELECTOR, QUESTION_ITEM_SELECTOR):
        try:
            question = judgment.find_element(
                By.CSS_SELECTOR, MARKDOWN_SELECTOR).find_element(By.XPATH, 'p').text
            is_select_T = judgment.find_element(By.CSS_SELECTOR, CHOICE_INPUT_SELECTOR).is_selected()
            verdict = judgment.find_element(
                By.CSS_SELECTOR, VERDICT_SELECTOR).find_element(By.CSS_SELECTOR, TEXT_RAW_SELECTOR).text
            answer = infer_judgment_answer(is_select_T, verdict)
            print(question)
            print(answer)
            questions_dict[question] = answer
            success_num += 1
        except Exception:
            fail_num += 1
            _print_skipped(success_num, judgment_url)
    print(f"判断题--题集: {judgment_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict


def get_single_choice(single_choice_url):
    """Scrape the single-choice questions of a problem set.

    Args:
        single_choice_url: URL of the single-choice tab.

    Returns:
        A dict mapping the question text to
        ``[option_texts, selected_option_text, verdict_text]``.
    """
    time.sleep(my_time)
    web.get(single_choice_url)
    questions_dict = {}
    success_num = 0
    fail_num = 0
    for single_choice in web.find_elements(By.CSS_SELECTOR, QUESTION_ITEM_SELECTOR):
        try:
            question = single_choice.find_element(By.CSS_SELECTOR, "[class='pc-x min-w-0 shrink']").text
            option_labels = single_choice.find_elements(
                By.CSS_SELECTOR,
                "[class='flex items-start p-2 rounded hover:bg-gray-100 focus:bg-gray-200 focus-within:bg-gray-100 focus-within:ring focus-within:ring-blue-300 multiple-choice-label min-w-0 items-baseline']")
            option = []
            answer = ""
            for label in option_labels:
                this_answer = label.find_element(By.CSS_SELECTOR, MARKDOWN_SELECTOR).text
                option.append(this_answer)
                # The checked radio button is the answer that was submitted.
                if label.find_element(By.CSS_SELECTOR, CHOICE_INPUT_SELECTOR).is_selected():
                    answer = this_answer
            is_true = single_choice.find_element(
                By.CSS_SELECTOR, VERDICT_SELECTOR).find_element(By.CSS_SELECTOR, TEXT_RAW_SELECTOR).text
            print(question)
            print(option)
            print(answer)
            print(is_true)
            questions_dict[question] = [option, answer, is_true]
            success_num += 1
        except Exception:
            fail_num += 1
            _print_skipped(success_num, single_choice_url)
    print(f"选择题--题集: {single_choice_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict


def get_fill_or_program_in_the_blanks(fill_or_program_in_the_blanks_url):
    """Scrape the fill-in-the-blank (or program fill-in) questions of a problem set.

    Args:
        fill_or_program_in_the_blanks_url: URL of the fill-in tab.

    Returns:
        A dict mapping the question text to ``[input_values, verdict_text]``.
    """
    time.sleep(my_time)
    web.get(fill_or_program_in_the_blanks_url)
    questions_dict = {}
    success_num = 0
    fail_num = 0
    for item in web.find_elements(By.CSS_SELECTOR, QUESTION_ITEM_SELECTOR):
        try:
            body = item.find_element(By.CSS_SELECTOR, MARKDOWN_SELECTOR)
            question = body.text
            # The submitted answers are the values of the inputs inside the question.
            answer = [blank.get_attribute("value") for blank in body.find_elements(By.CSS_SELECTOR, "input")]
            is_true = item.find_element(
                By.CSS_SELECTOR, VERDICT_SELECTOR).find_element(By.CSS_SELECTOR, TEXT_RAW_SELECTOR).text
            print(question)
            print(answer)
            print(is_true)
            questions_dict[question] = [answer, is_true]
            success_num += 1
        except Exception:
            fail_num += 1
            _print_skipped(success_num, fill_or_program_in_the_blanks_url)
    print(f"填空/程序填空题--题集: {fill_or_program_in_the_blanks_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict


def get_function_or_programming(function_or_programming_url):
    """Scrape the function / programming problems of a problem set.

    Every problem page is requested up to three times, because requests that
    come too fast may be refused; a problem only counts as failed when all
    three attempts raise.

    Args:
        function_or_programming_url: URL of the function or programming tab.

    Returns:
        A dict mapping the problem title to ``[problem_text, submitted_code]``.
    """
    time.sleep(my_time)
    web.get(function_or_programming_url)
    questions_dict = {}
    # All problem rows of the table.
    rows = web.find_elements(
        By.XPATH, '/html/body/div/div[2]/div[1]/div/div[2]/div[2]/div/div[1]/table//tbody/tr')
    # Collect the links first: navigating away invalidates the row elements.
    problems_href = [row.find_element(By.XPATH, 'td[3]/a').get_attribute('href') for row in rows]

    success_num = 0
    fail_num = 0
    for problem in problems_href:
        for _ in range(3):
            try:
                # Tune to the network speed; requesting too fast gets refused.
                time.sleep(my_time)
                web.get(problem)
                problem_title = web.find_element(
                    By.CSS_SELECTOR, "[class='text-center text-light text-base font-bold my-4']").text
                answer = web.find_element(
                    By.CSS_SELECTOR, "[class='codeEditor_2kCM6 grow shrink']"
                ).find_element(By.CSS_SELECTOR, 'textarea').get_attribute('value')
                problem_content = web.find_element(By.CSS_SELECTOR, MARKDOWN_SELECTOR).text
                questions_dict[problem_title] = [problem_content, answer]
                print(problem_title)
                print(problem_content)
                print(answer)
                success_num += 1
                break
            except Exception:
                traceback.print_exc()
        else:
            # Reached only when none of the three attempts succeeded.
            fail_num += 1
    print(f"函数/编程题--题集: {function_or_programming_url}获取题目数量--成功: {success_num} 失败: {fail_num}")
    return questions_dict


def merge_questions_into_json(file_name, questions):
    """Merge ``questions`` into the JSON object stored in ``file_name``.

    A missing or empty file is treated as an empty object, and the parent
    directory is created when needed. Existing keys are overwritten by the
    new entries.

    Args:
        file_name: path of the JSON file to update.
        questions: dict of scraped questions to merge in.

    Returns:
        The number of entries in the file after the merge.
    """
    parent = os.path.dirname(file_name)
    if parent:
        os.makedirs(parent, exist_ok=True)

    file_dict = {}
    if os.path.exists(file_name):
        with open(file_name, 'r', encoding="utf-8") as f:
            content = f.read()
        if content.strip():
            file_dict = json.loads(content)

    file_dict.update(questions)
    with open(file_name, mode='w', encoding="utf-8") as f:
        f.write(json.dumps(file_dict, ensure_ascii=False))
    print("-----当前题记长度-----------------------------------------------------" + str(len(questions)))
    print("-----写入文件--总长度-------------------------------------------------" + str(len(file_dict)))
    return len(file_dict)


def write_question_file(url_list, judgment_file_name, single_choice_file_name, fill_in_the_blanks_name,
                        program_fill_in_the_blanks_name, function_name, programming_name):
    """Scrape every problem set in ``url_list`` and store the questions by type.

    Args:
        url_list: URLs of the problem sets to scrape.
        judgment_file_name: JSON file for true/false questions.
        single_choice_file_name: JSON file for single-choice questions.
        fill_in_the_blanks_name: JSON file for fill-in-the-blank questions.
        program_fill_in_the_blanks_name: JSON file for program fill-in questions.
        function_name: JSON file for function problems.
        programming_name: JSON file for programming problems.
    """
    # (question-type key, scraper, output file), in the order they are processed.
    plan = (
        ('judgment_url', get_judgment, judgment_file_name),
        ('single_choice_url', get_single_choice, single_choice_file_name),
        ('fill_in_the_blanks_url', get_fill_or_program_in_the_blanks, fill_in_the_blanks_name),
        ('program_fill_in_the_blanks_url', get_fill_or_program_in_the_blanks, program_fill_in_the_blanks_name),
        ('function_url', get_function_or_programming, function_name),
        ('programming_url', get_function_or_programming, programming_name),
    )
    for url in url_list:
        this_question_type_dict = get_question_type_url(url)
        for key, scrape, file_name in plan:
            type_url = this_question_type_dict[key]
            if type_url != "":
                merge_questions_into_json(file_name, scrape(type_url))


if __name__ == '__main__':
    # Create the WebDriver object, using the Chrome driver.
    web = webdriver.Chrome(service=Service(r'C:\Users\Cat\AppData\Local\Google\Chrome\Application\chromedriver.exe'))
    try:
        web.implicitly_wait(5)
        login_url = 'https://pintia.cn/auth/login'
        # Open the login page in the browser.
        web.get(login_url)
        login_PTA('zzz@qq.com', 'xxx')

        # Problem-set types (database course):
        #   1 true/false; single choice; multiple choice (!!!); fill-in
        #   2 true/false; single choice; fill-in
        #   3 true/false; single choice
        #   4 single choice
        #   5 single choice
        #   6 true/false; single choice
        #   7 true/false; single choice; fill-in
        #   8 true/false; single choice
        #   9 true/false; single choice
        # sql_url_list_walking = [
        #     'https://pintia.cn/problem-sets/1343789975057166336/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343794588401487872/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343799990153117696/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343806731523719168/problems/type/2',
        #     'https://pintia.cn/problem-sets/1343807501140754432/problems/type/2',
        #     'https://pintia.cn/problem-sets/1343808640402018304/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343811518420176896/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343798231569530880/problems/type/1',
        #     'https://pintia.cn/problem-sets/1343819242272718848/problems/type/1'
        # ]
        #
        # Problem-set types (Java course):
        #   1 true/false; single choice
        #   2 true/false; single choice; fill-in; function; programming
        #   3 true/false; single choice; fill-in; program fill-in; function; programming
        #   4 true/false; single choice; fill-in; program fill-in; function; programming
        #   5 true/false; single choice; fill-in; function; programming
        #   6 true/false; single choice; fill-in; program fill-in; programming
        #   7 true/false; single choice; fill-in; programming
        #   8 true/false; single choice; fill-in; program fill-in; function; programming
        # java_url_list_lgr = [
        #     'https://pintia.cn/problem-sets/1368832382463172608/problems/type/1',
        #     'https://pintia.cn/problem-sets/1368833022220361728/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369164346714021888/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369165326734123008/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369165872660537344/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166179822002176/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166486127828992/problems/type/1',
        #     'https://pintia.cn/problem-sets/1369166803779248128/problems/type/1'
        # ]
        java_url_list_lxf = ['https://pintia.cn/problem-sets/1468315811752116224/problems/type/1']
        output_dir = "dataSql"
        write_question_file(
            java_url_list_lxf,
            os.path.join(output_dir, "judgment.json"),
            os.path.join(output_dir, "single_choice.json"),
            os.path.join(output_dir, "fill_in_the_blanks.json"),
            os.path.join(output_dir, "program_fill_in_the_blanks.json"),
            os.path.join(output_dir, "function.json"),
            os.path.join(output_dir, "programming.json"),
        )
    finally:
        # Always close the browser, even when scraping fails half-way.
        web.quit()
本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
