Selenium操作CNKI(中国知网)网站的远见搜索

最近想对单位发表的期刊情况做一个文献计量学分析,数据倒也不多,但是手动保存有点麻烦。两年前用spynner.py做过抓取CNKI管理后台,spynner.py是基于qtwebkit的一个包,跟ghost.py基本差不多,这也算跟selenium有一定的渊源,spynner基本弃坑了,近几年一直没再动过。CNKI网站为了反爬,做得太复杂,平时自己访问都感觉慢,一看源代码,加载了一堆js,能不慢吗?

Python2,selenium操作如下(仅关键部分,代码不全):

# coding: utf-8
import datetime
import os
import random
import sys
import time

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

'''
manipulate
'''
# Left-click
def common_click(driver, element_id, sleep_time=3):
    """Move the pointer onto an element, left-click it, then pause.

    Args:
        driver: WebDriver instance the action chain is bound to.
        element_id: Target of the move and the click. It is handed straight
            to ``move_to_element``/``click``; callers in this file pass the
            object returned by ``driver.find_element_*``, not an id string.
        sleep_time: Seconds to sleep after the actions have been performed.
    """
    actions = ActionChains(driver)
    actions.move_to_element(element_id)
    actions.click(element_id)
    actions.perform()
    # Fixed pause after the click; presumably gives the page time to react.
    time.sleep(sleep_time)
# Mouse hover
def common_hover(driver, element_id, sleep_time=3):
    """Move the pointer onto an element without clicking, then pause.

    Args:
        driver: WebDriver instance the action chain is bound to.
        element_id: Element to hover over. It is handed straight to
            ``move_to_element``; callers in this file pass the object
            returned by ``driver.find_element_*``, not an id string.
        sleep_time: Seconds to sleep after the move has been performed.
    """
    actions = ActionChains(driver)
    actions.move_to_element(element_id)
    actions.perform()
    # Fixed pause after the hover; presumably lets a hover menu appear.
    time.sleep(sleep_time)


def print_scr(driver, filename):
    """Save a screenshot of the current page to ``filename``.

    Args:
        driver: Object exposing ``get_screenshot_as_file``.
        filename: Destination path passed through unchanged.

    Returns:
        Whatever ``driver.get_screenshot_as_file`` returns. The result used
        to be discarded, which hid a failed write from the caller.
    """
    return driver.get_screenshot_as_file(filename)
# Fill in a form field
def fill_text(driver, element, content, sleep_time=0.5):
    """Clear an input element, type ``content`` into it, then pause.

    Args:
        driver: Unused; kept so existing call sites keep working.
        element: Object exposing ``clear()`` and ``send_keys()``.
        content: Text passed to ``element.send_keys``.
        sleep_time: Seconds to sleep after typing. Defaults to the 0.5 s
            that used to be hard-coded, so existing callers see no change.
    """
    element.clear()
    element.send_keys(content)
    time.sleep(sleep_time)
#---------------------------------
def page_config():
    """Run one search on the CNKI Yuanjian page and save two result pages.

    Steps, in order: open the search page, switch the search type to "unit",
    type the organisation name and submit, switch the list display style,
    sort by time, save a screenshot plus the page source as ``1.html``,
    then click the next-page link and save the page source as ``2.html``.

    Relies on the module-level ``driver`` created in the ``__main__`` block
    and on ``SaveAsLocalFile``, which is not defined in the visible part of
    this file.
    """
    driver.get('http://yuanjian.cnki.com.cn/home/?type=corpus')
    # Timestamp taken right after the page load; used as the screenshot name.
    addtime = time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())
    time.sleep(3)

    # Open the search-type selector.
    search_type = driver.find_element_by_class_name("item")
    common_click(driver, search_type)
    print('发现搜索类型!')

    # Choose the "unit" entry.
    search_type_unit = driver.find_element_by_xpath('//li[@val="unit"]')
    common_click(driver, search_type_unit)
    print('设置搜索类型为单位!')

    # NOTE(review): these byte literals are decoded as gb18030, which only
    # yields the intended text if this file is really saved in a GB-family
    # encoding. The file's coding declaration says utf-8 -- confirm which one
    # is right before changing either.
    search_input = driver.find_element_by_class_name('MulSerchKey')
    fill_text(driver, search_input, '你的单位'.decode('gb18030'))
    print('填写搜索框内容!')

    search_btn = driver.find_element_by_class_name('search')
    common_click(driver, search_btn)
    print('获得搜索结果!')
    time.sleep(10)

    # Switch the result list display style.
    list_style = driver.find_element_by_class_name('zyxz')
    common_click(driver, list_style)
    print('切换显示样式!')

    # Hover the ranking control first, then click its Order(2) entry.
    list_order_btn = driver.find_element_by_class_name('rank')
    common_hover(driver, list_order_btn)
    list_order = driver.find_element_by_xpath('//a[@onclick="Order(2);"]')
    common_click(driver, list_order, 10)
    print('切换为按时间排序!')

    print('配置完成,开始抓取页面')
    print('-' * 60)

    # Save the first result page.
    driver.get_screenshot_as_file(addtime + '.png')
    SaveAsLocalFile('1.html', driver.page_source, write_type='a+')

    # Go to the next page and save it as well.
    next_page = driver.find_element_by_link_text("下一页>".decode('gb18030'))
    print(next_page.text)
    common_click(driver, next_page, 10)
    SaveAsLocalFile('2.html', driver.page_source, write_type='a+')


if __name__ == '__main__':
    # Init driver: skip image loading and set a desktop Firefox user agent.
    DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.loadImages'] = False
    DesiredCapabilities.PHANTOMJS['phantomjs.page.settings.userAgent'] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0 "
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
    try:
        driver.set_script_timeout(30)
        driver.set_page_load_timeout(30)
        print('******System initialized!******')
        # Main
        page_config()
    finally:
        # Exit: always shut the browser down, even when a step above raises,
        # so no PhantomJS process is left behind.
        driver.quit()


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部