Python Web Scraping Workflow Template
Whatever the target site, scraping with Python follows the same basic workflow. First, send a request to the website, using the requests module or urllib.request; for dynamic sites, Selenium can drive a real browser and simulate logging in. Once the server returns the page data, parse it to extract the information you care about: the re module for regular-expression matching, XPath or BeautifulSoup for selecting nodes, or, when the response is JSON, the json module to parse it directly. Collect the parsed records in an iterable such as a list of tuples, and finally save the data. Common options are writing straight to a CSV file or inserting into a MySQL database with pymysql.
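Before going through each stage in detail, here is a minimal sketch of how the three stages fit together. The URL, headers, file name, and placeholder parsing logic are illustrative assumptions, not taken from any real site:

import csv
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # placeholder User-Agent

def get_page(url):
    # 1. Request the page and decode the body
    return requests.get(url, headers=HEADERS).content.decode('utf-8')

def parse_page(html):
    # 2. Parse the HTML (regex / XPath / BeautifulSoup / json would go here)
    return [(len(html),)]  # placeholder record

def save_page(rows):
    # 3. Persist the parsed records
    with open('result.csv', 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(rows)

if __name__ == '__main__':
    save_page(parse_page(get_page('https://example.com')))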
Requesting Data
###### I. Sending a data request to the website ######
# 1. requests
import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder request headers

def get_page(url):
    html = requests.get(url=url, headers=headers).content.decode('utf-8')
    # html = requests.get(url=url, headers=headers).text    # .text is a property, not a method
    # html = requests.get(url=url, headers=headers).json()  # when the response body is JSON
    return html

# 2. Using Selenium to simulate a browser
from selenium import webdriver
import time

# Run Chrome headless (no visible window)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# Create the browser object, passing in the headless options
browser = webdriver.Chrome(options=options)
browser.get('https://www.baidu.com')
# Type into the search box
word = input('Enter what you want to search for: ')
browser.find_element_by_id('kw').send_keys(word)   # Selenium 4 renames this to find_element(By.ID, 'kw')
# Simulate clicking the "百度一下" search button
browser.find_element_by_id('su').click()
# Give the page time to load
time.sleep(2)
browser.find_element_by_class_name('n').click()

# Simulate scrolling the page down to the very bottom
def scroll_to_bottom(driver):
    js = "return document.body.scrollHeight"
    # Current scrollbar position starts at 0
    height = 0
    # Total height of the current page
    new_height = driver.execute_script(js)
    while height < new_height:
        # Move the scrollbar towards the bottom of the page in 100px steps
        for i in range(height, new_height, 100):
            driver.execute_script('window.scrollTo(0, {})'.format(i))
        time.sleep(3)
        height = new_height
        time.sleep(3)
        new_height = driver.execute_script(js)

# 3. urllib
# Request module
from urllib import request
# URL-encoding module
from urllib import parse

# Define the variables
word = input('Enter what you want to search for: ')
url = "http://www.baidu.com/s?"
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)"}

# URL-encode the query and build the full URL
query_string = parse.urlencode({'wd': word})
url = url + query_string
# url = "http://www.baidu.com/s?{}".format(parse.urlencode({'wd': word}))
# url = "http://www.baidu.com/s?%s" % parse.urlencode({'wd': word})

# Create the request object to wrap the request
req = request.Request(url=url, headers=headers)
# Get the response object
respond = request.urlopen(req)
# Read the content
html = respond.read().decode('utf-8')

# Save to a file
filename = '{}.html'.format(word)
with open(filename, 'w', encoding='utf-8') as f:
    f.write(html)
Parsing Data
###### II. Parsing the data ######
# 1. The re module
import re

def parse_page(html):
    pattern1 = re.compile('data-fav-favortype="1" data-name="(.*?)">收藏', re.S)
    shop_list = pattern1.findall(html)
    pattern2 = re.compile('口味(.*?) ', re.S)
    taste_list = pattern2.findall(html)
    pattern3 = re.compile('环境(.*?) ', re.S)
    envir_list = pattern3.findall(html)
    pattern4 = re.compile('服务(.*?) ', re.S)
    serve_list = pattern4.findall(html)
    info_list = list(zip(shop_list, taste_list, envir_list, serve_list))
    return info_list

# 2. Selecting nodes with XPath
from lxml import etree

def parse_page(html):
    parse_html = etree.HTML(html)
    # Dict used to hold the fields of interest
    house_dict = {}
    # List collecting every record for later storage
    house_list = []
    # Nodes that wrap each listing on the page
    dd_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"]|//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
    for dd in dd_list:
        house_dict['address'] = dd.xpath('.//div[@class="title"]/a/text()')[0].strip()
        house_dict['introduction'] = dd.xpath('.//div[@class="houseInfo"]/text()')[0].strip()
        unit_price = dd.xpath('.//div[@class="unitPrice"]/span/text()')[0].strip()
        house_dict['unit_price'] = unit_price[2:]
        total_price = dd.xpath('.//div[@class="totalPrice totalPrice2"]/span/text()')[0].strip()
        total_price = float(total_price) * 10000
        house_dict['total_price'] = total_price
        house = tuple(house_dict.values())
        house_list.append(house)
    return house_list

# 3. The json module
import json

def parse_page(html, comment_type):
    # json.loads() turns the JSON string (after stripping the JSONP wrapper) into a Python dict
    json_html = json.loads(html.split('fetchJSON_comment98(')[1][:-2])
    comments_dict = {}
    comments_list = []
    for comment in json_html['comments']:
        comments_dict['referenceName'] = comment['referenceName']
        comments_dict['creationTime'] = comment['creationTime']
        comments_dict['nickname'] = comment['nickname']
        comments_dict['content'] = comment['content']
        # The review type is passed in by the caller (the original code referenced an undefined `type`)
        comments_dict['type'] = comment_type
        comments_list.append(list(comments_dict.values()))
    return comments_list

# 4. BeautifulSoup
from bs4 import BeautifulSoup

def parse_page(html):
    soup = BeautifulSoup(html, 'lxml')
    img_ul = soup.find_all('div', {'class': 'li_img'})
    soup.select('.panel .panel-heading')  # nodes with class="panel-heading" inside class="panel"
    soup.select('ul li')                  # li nodes under ul tags
    soup.select('#list-2 .element')       # class="element" nodes under id="list-2"
    soup.select('ul')[0]                  # the first ul tag
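The selectors above only locate nodes. As a small follow-up, here is a sketch of how text and attribute values can be read out of the selected tags; the HTML fragment and class names are made up for illustration:

from bs4 import BeautifulSoup

# Hypothetical HTML fragment just to show how data is read from selected nodes
html = '<div class="li_img"><img src="/a.jpg"><p class="name">shop</p></div>'
soup = BeautifulSoup(html, 'lxml')
for div in soup.find_all('div', {'class': 'li_img'}):
    print(div.find('img')['src'])                         # attribute access
    print(div.select_one('.name').get_text(strip=True))   # text content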
Saving Data
###### III. Storing the data ######
# 1. Saving to a CSV file
import csv

def save_page(info_list):
    with open('file.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['brand', 'rank', 'lowest price', 'highest price', 'sales'])
        writer.writerows(info_list)

# 2. Saving to a MySQL database
import pymysql

def save_page(info_list):
    db = pymysql.connect(host='localhost',
                         port=3306,
                         user='root',
                         passwd='password',
                         database='stu',
                         charset='utf8')
    # Get a cursor
    cursor = db.cursor()
    ins = 'insert into phone values (%s,%s,%s)'
    cursor.executemany(ins, info_list)
    db.commit()

# Storing images
# # Connect to the database
# db = pymysql.connect(host='localhost',
#                      port=3306,
#                      user='root',
#                      passwd='password',
#                      database='stu',
#                      charset='utf8')
# # Get a cursor
# cur = db.cursor()
#
# # Store the file
# with open('mysql.jpg', 'rb') as fd:
#     data = fd.read()
# try:
#     # SQL statement
#     sql = "insert into images values (1,'mysql.jpg',%s);"
#     # Let execute() pass the binary content into the statement as a parameter
#     cur.execute(sql, [data])
#     db.commit()
# except Exception as e:
#     # Roll back on error
#     db.rollback()
#     print(e)
# cur.close()
# db.close()
#
# # Retrieve the file
# sql = "select * from images where filename='mysql.jpg'"
# cur.execute(sql)
# image = cur.fetchone()
# with open(image[1], 'wb') as fd:
#     fd.write(image[2])
# cur.close()
# db.close()
A Complete Scraper Example
To tie everything together, here is a case study that scrapes Lianjia second-hand housing listings, reviewing the overall workflow and packaging all of the code into a single class.
###### Example: putting it all together ######
# Scraping Lianjia second-hand housing listings
import requests
import pymysql
import time
import random
from lxml import etree

class LianjiaSpider:
    def __init__(self):
        self.url = 'https://wh.lianjia.com/ershoufang/pg{}/'
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
        ]
        self.page = 1
        self.db = pymysql.connect(host='localhost',
                                  user='root',
                                  password='password',
                                  database='lianjiadb',
                                  charset='utf8')
        self.cursor = self.db.cursor()

    def get_page(self, url):
        # Rotate the User-Agent to reduce the chance of being blocked
        headers = {'User-Agent': random.choice(self.ua_list)}
        html = requests.get(url=url, headers=headers).content.decode('utf-8')
        self.parse_page(html)

    def parse_page(self, html):
        parse_html = etree.HTML(html)
        house_dict = {}
        house_list = []
        ins = 'insert into lianjia values (%s,%s,%s,%s)'
        dd_list = parse_html.xpath('//ul[@class="sellListContent"]/li[@class="clear LOGCLICKDATA"]|//ul[@class="sellListContent"]/li[@class="clear LOGVIEWDATA LOGCLICKDATA"]')
        for dd in dd_list:
            house_dict['address'] = dd.xpath('.//div[@class="title"]/a/text()')[0].strip()
            house_dict['introduction'] = dd.xpath('.//div[@class="houseInfo"]/text()')[0].strip()
            unit_price = dd.xpath('.//div[@class="unitPrice"]/span/text()')[0].strip()
            house_dict['unit_price'] = unit_price[2:]
            total_price = dd.xpath('.//div[@class="totalPrice totalPrice2"]/span/text()')[0].strip()
            total_price = float(total_price) * 10000
            house_dict['total_price'] = total_price
            house = tuple(house_dict.values())
            house_list.append(house)
        print(house_list)
        self.cursor.executemany(ins, house_list)
        self.db.commit()

    def main(self):
        # Lianjia page numbers start at 1
        for page in range(1, 101):
            url = self.url.format(page)
            self.get_page(url)
            time.sleep(random.randint(1, 3))
            print('Finished scraping page %s' % self.page)
            self.page += 1
        self.cursor.close()
        self.db.close()

if __name__ == '__main__':
    start = time.time()
    spider = LianjiaSpider()
    spider.main()
    end = time.time()
    print('Elapsed time: %.2f' % (end - start))
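The spider assumes that a lianjia table with four columns already exists in the lianjiadb database. The column names and types below are an illustrative assumption matching the four values inserted per row, not a schema from the original article:

import pymysql

# Hypothetical schema for (address, introduction, unit_price, total_price)
db = pymysql.connect(host='localhost', user='root', password='password',
                     database='lianjiadb', charset='utf8')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS lianjia (
        address      VARCHAR(255),
        introduction VARCHAR(255),
        unit_price   VARCHAR(50),
        total_price  DOUBLE
    ) CHARACTER SET utf8
""")
db.commit()
cursor.close()
db.close()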
