Selenium 自动化获取起点排行榜
import re
import openpyxl
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import etree
from html import unescape# 获取HTML源码,普通方式无法抓取到源码,所以需要使用代理访问后在提取内容
def getHtml(url):
    """Download ``url`` while presenting a desktop Chrome User-Agent.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection problem, timeout after 5 seconds, or a 4xx/5xx status).
    """
    # The header name must be "User-Agent" (hyphen). With an underscore,
    # requests sends an unrelated "User_Agent" header and keeps its default
    # "python-requests/..." User-Agent, so the spoofing has no effect.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }
    try:
        r = requests.get(url, timeout=5, headers=headers)
        r.raise_for_status()
    except requests.RequestException as e:
        # Report the actual error; callers treat None as "page unavailable".
        print(f"Failed to fetch {url}: {e!r}")
        return None
    return r.text
# Parse the page content
def _first_meta_content(tree, selector):
    """Return the non-empty ``content`` of the first ``<meta>`` matching ``selector``.

    The result is a list with zero or one string so it can be fed to ``zip``.
    """
    values = tree.xpath('(//meta[' + selector + '])[1]/@content')
    return [str(value) for value in values if value]


def getInfo(text):
    """Extract one book record from the HTML of a book detail page.

    Args:
        text: HTML source of the page, or ``None``/empty when the download
            failed.

    Returns:
        An iterator of 7-tuples
        ``(nofollow, category, title, author, status, count, intro)``.
        It is empty when ``text`` is missing or when any field other than
        ``intro`` cannot be found, because the fields are combined with ``zip``.
    """
    if not text:
        # The download failed upstream; there is nothing to parse.
        return zip()
    tree = etree.HTML(text)
    if tree is None:
        return zip()

    # Category one: the text between "创作的" and "小说" in the description meta.
    description = _first_meta_content(tree, '@name="description"')
    nofollow = re.findall('创作的(.+?)小说', description[0]) if description else []
    print(nofollow)

    # Category two, title, author and status come straight from Open Graph
    # meta tags. Reading @content via XPath avoids depending on the order of
    # attributes in the serialized tag.
    category = _first_meta_content(tree, '@property="og:novel:category"')
    print(category)
    title = _first_meta_content(tree, '@property="og:title"')
    print(title)
    author = _first_meta_content(tree, '@property="og:novel:author"')
    print(author)
    status = _first_meta_content(tree, '@property="og:novel:status"')
    print(status)

    # Word count: text of the first <em> inside <p class="count">.
    count = tree.xpath('//p[@class="count"]/em[1]/text()')
    print(count)

    # Synopsis: all text nodes of the intro paragraph joined into one string.
    intro = ["".join(tree.xpath('//p[@id="book-intro-detail"]/text()'))]
    print(intro)

    return zip(nofollow, category, title, author, status, count, intro)
# Save the data
def save(info, text, y, data, ws):
    """Append one worksheet row per scraped record.

    Args:
        info: Label describing where the book was found (first column).
        text: URL of the book detail page (second column).
        y: Page number; kept for backward compatibility, not used.
        data: Iterable of 7-tuples
            ``(nofollow, category, title, author, status, count, intro)``.
        ws: Worksheet-like object exposing ``append(row)``.
    """
    for nofollow, category, title, author, status, count, intro in data:
        ws.append([info, text, nofollow, category, title, author, status, count, intro])


_RANK_URL = "https://www.qidian.com/rank/yuepiao/year2022/"


def _scrape_book(x, y, z, ws):
    """Open the ranking, go to month ``x`` / page ``y`` / item ``z`` and save that book."""
    options = webdriver.ChromeOptions()
    options.add_argument("disable-blink-features=AutomationControlled")
    browser = webdriver.Chrome(options=options)
    try:
        browser.maximize_window()
        browser.get(_RANK_URL)

        # Select the month from the drop-down.
        browser.find_element(by=By.ID, value='month').click()
        months = browser.find_elements(by=By.CLASS_NAME, value='lbf-combobox-item')
        # NOTE(review): the original author notes that indexing from the end
        # with -x lines up with the month number; confirm against the live
        # page, since the combobox may contain items other than months.
        months[-x].click()
        print(len(months))

        # Select the result page.
        pagination = browser.find_elements(by=By.CLASS_NAME, value='lbf-pagination-page ')
        print(len(pagination))
        pagination[y - 1].click()

        # Click the z-th book on the page.
        element = browser.find_elements(by=By.CLASS_NAME, value='book-img-box')
        print(len(element))
        element[z - 1].click()

        # Switch to the second window handle and read the book page URL there.
        browser.switch_to.window(browser.window_handles[1])
        currentPageUrl = browser.current_url
        print(currentPageUrl)

        data = getInfo(getHtml(currentPageUrl))
        label = "year2022-month" + str(x) + "-page" + str(y) + "-option" + str((y - 1) * 20 + z)
        save(label, currentPageUrl, y, data, ws)
    finally:
        # quit() closes every window and stops the driver process; close()
        # would only close the current tab and leak the rest.
        browser.quit()


def main():
    """Scrape the 2022 monthly-ticket ranking into ``qidian.xlsx``."""
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = '起点中文网月票榜'
    ws.append(['标题', "链接", '品类一', '品类二', '书名', '作者', '状态', '字数', '简介'])

    # x: 10..12, y: pages 1..3, z: positions 1..20 (only 1..10 on page 3).
    for x in range(10, 13):
        for y in range(1, 4):
            for z in range(1, 21):
                if y == 3 and z > 10:
                    continue
                _scrape_book(x, y, z, ws)
                # Save after every book so progress survives a later failure.
                wb.save('qidian.xlsx')


if __name__ == '__main__':
    main()

参考:【爬虫实战】起点中文网排行榜(XPath)
本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
