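"""Scrape the data of posts already published on Yiban (yiban.cn).

(There was only one page when this code was written, so page turning
was not set up.)
Libraries used: selenium, csv, pandas, bs4, time, re
Final result: a CSV file containing the Yiban post data
Author: UPC.故里
Note: the script only succeeds some of the time, because Yiban sometimes
asks for login verification and sometimes does not.
"""

import time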
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup

"""
The By implementation.
"""
class By(object):
    """Set of supported locator strategies.

    Each attribute is the strategy string that identifies how an element
    is looked up (by id, by XPath expression, by CSS selector, ...).
    NOTE(review): this looks like a copy of Selenium's own
    ``selenium.webdriver.common.by.By``; the values are meant to be passed
    as the ``by`` argument of ``find_element`` / ``find_elements``.
    """

    ID = "id"
    XPATH = "xpath"
    LINK_TEXT = "link text"
    PARTIAL_LINK_TEXT = "partial link text"
    NAME = "name"
    TAG_NAME = "tag name"
    CLASS_NAME = "class name"
    CSS_SELECTOR = "css selector"
# ---------------------------------------------------------------------------
# Scraping script: log in to Yiban, walk through the post list page by page,
# collect the text of every <a> and <span> in the list container, and write
# the result to 'Yiban.csv'.
# ---------------------------------------------------------------------------

url = 'https://www.yiban.cn/Org/orglistShow/type/forum/puid/5370538#'

# Account credentials; fill these in before running the script.
stu_number = ''
stu_password = ''

# The login inputs are located by element id. The equivalent XPaths are
# '//*[@id="account-txt"]' (account) and '//*[@id="password-txt"]' (password).
sign_in_xpath = '//*[@id="login-btn"]'
# Third entry of the tab list, clicked right after logging in.
# NOTE(review): presumably the tab that lists published posts -- confirm.
tab_xpath = '/html/body/main/div/div[2]/div[2]/ul/li[3]/a'
# Container whose inner HTML holds the post list of the current page.
table_xpath = '/html/body/main/div/div[2]/div[3]'
# Pager link used to advance to the next page.
next_page_xpath = '/html/body/main/div/div[2]/div[4]/div/div/a[2]'
# Upper bound on the number of pages to scrape.
page_count = 6

a_text_list = []
span_text_list = []

driver = webdriver.Firefox()
try:
    driver.get(url)
    time.sleep(1)

    # find_element(By..., value) replaces the legacy find_element_by_* helpers,
    # which newer Selenium releases no longer provide.
    driver.find_element(By.ID, 'account-txt').send_keys(stu_number)
    time.sleep(1)
    driver.find_element(By.ID, 'password-txt').send_keys(stu_password)
    time.sleep(1)

    sign_in_button = driver.find_element(By.XPATH, sign_in_xpath)
    ActionChains(driver).click(sign_in_button).perform()
    time.sleep(5)  # give the login round trip time to complete

    tab_link = driver.find_element(By.XPATH, tab_xpath)
    ActionChains(driver).click(tab_link).perform()
    time.sleep(2)

    for page_index in range(page_count):
        table_html = driver.find_element(
            By.XPATH, table_xpath
        ).get_attribute('innerHTML')
        soup = BeautifulSoup(table_html, 'html.parser')
        a_text_list.extend(tag.text for tag in soup.find_all('a'))
        span_text_list.extend(tag.text for tag in soup.find_all('span'))

        # Nothing to navigate to after the final page.
        if page_index == page_count - 1:
            break
        time.sleep(2)

        # find_elements returns an empty list instead of raising when the
        # pager link is missing, so a shorter listing ends the loop cleanly
        # and the data collected so far is still written out.
        next_links = driver.find_elements(By.XPATH, next_page_xpath)
        if not next_links:
            break
        # Click through JavaScript rather than a native click.
        driver.execute_script("arguments[0].click();", next_links[0])
        time.sleep(2)
finally:
    # Always end the browser session, even when a step above fails.
    driver.quit()

# The slices assume a fixed layout: six <span> texts per post, in the column
# order below, and one title among every nine <a> texts (offset 1).
# pandas raises ValueError if the resulting columns differ in length.
data = {
    '活动院系': span_text_list[::6],
    '推文标题': a_text_list[1::9],
    '推送板块': span_text_list[1::6],
    '推送时间': span_text_list[2::6],
    '阅读量': span_text_list[3::6],
    '点赞量': span_text_list[4::6],
    '评论量': span_text_list[5::6],
}
dataframe = pd.DataFrame(data)
# 'utf-8-sig' writes a BOM at the start of the file.
dataframe.to_csv('Yiban.csv', index=False, sep=',', encoding='utf-8-sig')
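# This article was submitted by an internet user. The views in it are the
# author's own, do not represent this site's position, and the site assumes
# no related legal liability. If you repost it, please credit the source.
# If the content is infringing, illegal or non-compliant, or factually
# inaccurate, please click [Report Content] to file a complaint.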