Python Web Scraping Workflow Template
Whatever the target site, scraping with Python follows the same basic workflow. First, send a request to the website, using the requests module or urllib.request; for dynamic sites, Selenium can drive a real browser and simulate logging in. Once the server returns the page data, parse it to extract the information you care about: the re module for regular-expression matching, XPath or BeautifulSoup for selecting nodes, or, when the response is JSON, the json module to parse it directly. Collect the parsed records in an iterable such as a list of tuples, and finally save the data. Common options are writing straight to a CSV file or inserting into a MySQL database with pymysql.
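Before going through each stage in detail, here is a minimal sketch of how the three stages fit together. The URL, headers, file name, and placeholder parsing logic are illustrative assumptions, not taken from any real site:

import csv
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # placeholder User-Agent

def get_page(url):
    # 1. Request the page and decode the body
    return requests.get(url, headers=HEADERS).content.decode('utf-8')

def parse_page(html):
    # 2. Parse the HTML (regex / XPath / BeautifulSoup / json would go here)
    return [(len(html),)]  # placeholder record

def save_page(rows):
    # 3. Persist the parsed records
    with open('result.csv', 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(rows)

if __name__ == '__main__':
    save_page(parse_page(get_page('https://example.com')))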
Requesting Data
###### I. Sending a data request to the website ######
# 1. requests
import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder request headers

def get_page(url):
    html = requests.get(url=url, headers=headers).content.decode('utf-8')
    # html = requests.get(url=url, headers=headers).text    # .text is a property, not a method
    # html = requests.get(url=url, headers=headers).json()  # when the response body is JSON
    return html

# 2. Using Selenium to simulate a browser
from selenium import webdriver
import time

# Run Chrome headless (no visible window)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# Create the browser object, passing in the headless options
browser = webdriver.Chrome(options=options)
browser.get('https://www.baidu.com')
# Type into the search box
word = input('Enter what you want to search for: ')
browser.find_element_by_id('kw').send_keys(word)   # Selenium 4 renames this to find_element(By.ID, 'kw')
# Simulate clicking the "百度一下" search button
browser.find_element_by_id('su').click()
# Give the page time to load
time.sleep(2)
browser.find_element_by_class_name('n').click()

# Simulate scrolling the page down to the very bottom
def scroll_to_bottom(driver):
    js = "return document.body.scrollHeight"
    # Current scrollbar position starts at 0
    height = 0
    # Total height of the current page
    new_height = driver.execute_script(js)
    while height < new_height:
        # Move the scrollbar towards the bottom of the page in 100px steps
        for i in range(height, new_height, 100):
            driver.execute_script('window.scrollTo(0, {})'.format(i))
        time.sleep(3)
        height = new_height
        time.sleep(3)
        new_height = driver.execute_script(js)

# 3. urllib
# Request module
from urllib import request
# URL-encoding module
from urllib import parse

# Define the variables
word = input('Enter what you want to search for: ')
url = "http://www.baidu.com/s?"
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)"}

# URL-encode the query and build the full URL
query_string = parse.urlencode({'wd': word})
url = url + query_string
# url = "http://www.baidu.com/s?{}".format(parse.urlencode({'wd': word}))
# url = "http://www.baidu.com/s?%s" % parse.urlencode({'wd': word})

# Create the request object to wrap the request
req = request.Request(url=url, headers=headers)
# Get the response object
respond = request.urlopen(req)
# Read the content
html = respond.read().decode('utf-8')

# Save to a file
filename = '{}.html'.format(word)
with open(filename, 'w', encoding='utf-8') as f:
    f.write(html)
Parsing Data
###### II. Parsing the data ######
# 1. The re module
import re

def parse_page(html):
    pattern1 = re.compile('data-fav-favortype="1" data-name="(.*?)">收藏', re.S)
    shop_list = pattern1.findall(html)
    pattern2 = re.compile('口味(.*?) ', re.S)
    taste_list = pattern2.findall(html)
    pattern3 = re.compile('环境(.*?) ', re.S)
    envir_list = pattern3.findall(html)
    pattern4 = re.compile('服务(.*?) ', re.S)
    serve_list = pattern4.findall(html)
    info_list = list(zip(shop_list, taste_list, envir_list, serve_list))
    return info_list

# 2. Selecting nodes with XPath
from lxml import etree

def parse_page(html):
    parse_html = etree.HTML(html)
    # Dict used to hold the fields of interest
    house_dict = {}
    # List collecting every record for later storage
    house_list = []
    # Nodes that wrap each listing on the page
    dd_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"]|//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
    for dd in dd_list:
        house_dict['address'] = dd.xpath('.//div[@class="title"]/a/text()')[0].strip()
        house_dict['introduction'] = dd.xpath('.//div[@class="houseInfo"]/text()')[0].strip()
        unit_price = dd.xpath('.//div[@class="unitPrice"]/span/text()')[0].strip()
        house_dict['unit_price'] = unit_price[2:]
        total_price = dd.xpath('.//div[@class="totalPrice totalPrice2"]/span/text()')[0].strip()
        total_price = float(total_price) * 10000
        house_dict['total_price'] = total_price
        house = tuple(house_dict.values())
        house_list.append(house)
    return house_list

# 3. The json module
import json

def parse_page(html, comment_type):
    # json.loads() turns the JSON string (after stripping the JSONP wrapper) into a Python dict
    json_html = json.loads(html.split('fetchJSON_comment98(')[1][:-2])
    comments_dict = {}
    comments_list = []
    for comment in json_html['comments']:
        comments_dict['referenceName'] = comment['referenceName']
        comments_dict['creationTime'] = comment['creationTime']
        comments_dict['nickname'] = comment['nickname']
        comments_dict['content'] = comment['content']
        # The review type is passed in by the caller (the original code referenced an undefined `type`)
        comments_dict['type'] = comment_type
        comments_list.append(list(comments_dict.values()))
    return comments_list

# 4. BeautifulSoup
from bs4 import BeautifulSoup

def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    img_ul = soup.find_all('div', {'class': 'li_img'})
    soup.select('.panel .panel-heading')  # nodes with class="panel-heading" inside class="panel"
    soup.select('ul li')                  # li nodes under ul tags
    soup.select('#list-2 .element')       # class="element" nodes under id="list-2"
    soup.select('ul')[0]                  # the first ul tag
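The selectors above only locate nodes. As a small follow-up, here is a sketch of how text and attribute values can be read out of the selected tags; the HTML fragment and class names are made up for illustration:

from bs4 import BeautifulSoup

# Hypothetical HTML fragment just to show how data is read from selected nodes
html = '<div class="li_img"><img src="/a.jpg"><p class="name">shop</p></div>'
soup = BeautifulSoup(html, 'lxml')
for div in soup.find_all('div', {'class': 'li_img'}):
    print(div.find('img')['src'])                         # attribute access
    print(div.select_one('.name').get_text(strip=True))   # text content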
Saving Data
###### III. Storing the data ######
# 1. Saving to a CSV file
import csv

def save_page(info_list):
    with open('file.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['brand', 'rank', 'lowest price', 'highest price', 'sales'])
        writer.writerows(info_list)

# 2. Saving to a MySQL database
import pymysql

def save_page(info_list):
    db = pymysql.connect(host='localhost',
                         port=3306,
                         user='root',
                         passwd='password',
                         database='stu',
                         charset='utf8')
    # Get a cursor
    cursor = db.cursor()
    ins = 'insert into phone values (%s,%s,%s)'
    cursor.executemany(ins, info_list)
    db.commit()

# Storing images
# # Connect to the database
# db = pymysql.connect(host='localhost',
#                      port=3306,
#                      user='root',
#                      passwd='password',
#                      database='stu',
#                      charset='utf8')
# # Get a cursor
# cur = db.cursor()
#
# # Store the file
# with open('mysql.jpg', 'rb') as fd:
#     data = fd.read()
# try:
#     # SQL statement
#     sql = "insert into images values (1,'mysql.jpg',%s);"
#     # Let execute() pass the binary content into the statement as a parameter
#     cur.execute(sql, [data])
#     db.commit()
# except Exception as e:
#     # Roll back on error
#     db.rollback()
#     print(e)
# cur.close()
# db.close()
#
# # Retrieve the file
# sql = "select * from images where filename='mysql.jpg'"
# cur.execute(sql)
# image = cur.fetchone()
# with open(image[1], 'wb') as fd:
#     fd.write(image[2])
# cur.close()
# db.close()
A Complete Scraper Example
To tie everything together, here is a case study that scrapes Lianjia second-hand housing listings, reviewing the overall workflow and packaging all of the code into a single class.
###### Example: putting it all together ######
# Scraping Lianjia second-hand housing listings
import requests
import pymysql
import time
import random
from lxml import etree

class LianjiaSpider:
    def __init__(self):
        self.url = 'https://wh.lianjia.com/ershoufang/pg{}/'
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
        ]
        self.page = 1
        self.db = pymysql.connect(host='localhost',
                                  user='root',
                                  password='password',
                                  database='lianjiadb',
                                  charset='utf8')
        self.cursor = self.db.cursor()

    def get_page(self, url):
        # Rotate the User-Agent to reduce the chance of being blocked
        headers = {'User-Agent': random.choice(self.ua_list)}
        html = requests.get(url=url, headers=headers).content.decode('utf-8')
        self.parse_page(html)

    def parse_page(self, html):
        parse_html = etree.HTML(html)
        house_dict = {}
        house_list = []
        ins = 'insert into lianjia values (%s,%s,%s,%s)'
        dd_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"]|//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
        for dd in dd_list:
            house_dict['address'] = dd.xpath('.//div[@class="title"]/a/text()')[0].strip()
            house_dict['introduction'] = dd.xpath('.//div[@class="houseInfo"]/text()')[0].strip()
            unit_price = dd.xpath('.//div[@class="unitPrice"]/span/text()')[0].strip()
            house_dict['unit_price'] = unit_price[2:]
            total_price = dd.xpath('.//div[@class="totalPrice totalPrice2"]/span/text()')[0].strip()
            total_price = float(total_price) * 10000
            house_dict['total_price'] = total_price
            house = tuple(house_dict.values())
            house_list.append(house)
        print(house_list)
        self.cursor.executemany(ins, house_list)
        self.db.commit()

    def main(self):
        # Lianjia page numbers start at 1
        for page in range(1, 101):
            url = self.url.format(page)
            self.get_page(url)
            time.sleep(random.randint(1, 3))
            print('Finished scraping page %s' % self.page)
            self.page += 1
        self.cursor.close()
        self.db.close()

if __name__ == '__main__':
    start = time.time()
    spider = LianjiaSpider()
    spider.main()
    end = time.time()
    print('Elapsed time: %.2f' % (end - start))
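The spider assumes that a lianjia table with four columns already exists in the lianjiadb database. The column names and types below are an illustrative assumption matching the four values inserted per row, not a schema from the original article:

import pymysql

# Hypothetical schema for (address, introduction, unit_price, total_price)
db = pymysql.connect(host='localhost', user='root', password='password',
                     database='lianjiadb', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS lianjia (
        address      VARCHAR(255),
        introduction VARCHAR(255),
        unit_price   VARCHAR(50),
        total_price  DOUBLE
    ) CHARACTER SET utf8
""")
db.commit()
cursor.close()
db.close()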
