01_urllib

1. Basic usage of urllib

import urllib.request

# (1) Define a url: the address we want to visit
url = 'http://www.baidu.com'

# (2) Simulate a browser sending a request to the server; "response" is the server's response
response = urllib.request.urlopen(url)

# (3) Get the page source from the response ("content" means the page content)
# The read method returns binary data as bytes (printed with a b prefix).
# To turn the bytes into a string we decode them: bytes -> str via decode('encoding format')
content = response.read().decode('utf-8')

# (4) Print the data
print(content)
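Hardcoding 'utf-8' happens to work for Baidu, but not every site uses that encoding. A minimal variant, assuming the server declares a charset in its Content-Type header:

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
# read the charset declared in the Content-Type header, falling back to utf-8
charset = response.headers.get_content_charset() or 'utf-8'
content = response.read().decode(charset)
print(charset)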

2. urllib: one type and six methods

import urllib.request

url = 'http://www.baidu.com'

# the one type: urlopen returns an HTTPResponse object
response = urllib.request.urlopen(url)
# content = type(response)
# print(content)

# read the whole body, byte by byte
# content = response.read()
# print(content)

# read a given number of bytes
# content = response.read(5)
# print(content)

# read one line
# content = response.readline()
# print(content)

# read all remaining lines
# content = response.readlines()
# print(content)

# the status code
# print(response.getcode())

# the url that was fetched
# print(response.geturl())

# the status info: the response headers
# print(response.getheaders())
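Since each read call consumes part of the response stream, the commented lines above cannot all be run against one response. A minimal runnable sketch of the non-destructive methods plus a single short read:

import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
print(response.getcode())      # status code, e.g. 200
print(response.geturl())       # the url that was fetched
print(response.getheaders())   # list of (name, value) header tuples
print(response.read(5))        # first five bytes, e.g. b'<!DOC'
# note: the body is a stream; once read, those bytes are gone,
# so a later read() returns only what remains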

3. urllib: downloading

import urllib.request

# Download a web page
# urlretrieve(url, filename): url is the address to download from, filename is the name of the saved file
# (in Python you can pass a variable or write the value directly)
# url_page = 'http://www.baidu.com'
# urllib.request.urlretrieve(url_page, 'baidu.html')

# Download an image
# url_img = 'https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fc-ssl.duitang.com%2Fuploads%2Fitem%2F202005%2F22%2F20200522131408_obupx.jpeg&refer=http%3A%2F%2Fc-ssl.duitang.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=auto?sec=1655520388&t=d7f6d94a25d1d6e3cdbaa03a87fb1c59'
# urllib.request.urlretrieve(url=url_img, filename='lisa.jpg')

# Download a video
url_video = 'https://vd4.bdstatic.com/mda-kfsm974k6xwx9y8j/v1-cae/sc/mda-kfsm974k6xwx9y8j.mp4?v_from_s=hkapp-haokan-nanjing&auth_key=1652930638-0-0-dce2ef63e5e2c4d4cbafeea2dd68cff1&bcevod_channel=searchbox_feed&pd=1&cd=0&pt=3&logid=3238681392&vid=2292765971694816622&abtest=102148_2-17451_1&klogid=3238681392'
urllib.request.urlretrieve(url_video, 'news.mp4')
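urlretrieve is documented as a legacy interface and cannot send custom headers. An equivalent sketch using urlopen (which also lets you attach a User-Agent, as the next section explains):

import urllib.request

url_page = 'http://www.baidu.com'
request = urllib.request.Request(url=url_page, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(request)
# open in 'wb' mode because read() returns bytes
with open('baidu.html', 'wb') as fp:
    fp.write(response.read())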

4. urllib: customizing the request object

import urllib.request
url = 'https://www.baidu.com'

# Anatomy of a URL:
# https://www.baidu.com/s?ie=UTF-8&wd=周杰伦
#   protocol: http / https
#   host:     www.baidu.com
#   port:     80 (http) / 443 (https)
#   path:     s
#   query:    ie=UTF-8&wd=周杰伦
#   anchor:   #
#
# Common default ports:
#   http    80
#   https   443
#   mysql   3306
#   oracle  1521
#   redis   6379
#   mongodb 27017

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}   # a dict

# urlopen cannot take a dict, so the headers cannot be passed to it directly.
# Instead we customize a request object.
# Note: because of the parameter order of Request (url, data, headers), url and headers
# cannot both be passed positionally (data sits between them), so we use keyword arguments.
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf8')
print(content)
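As a cross-check of the URL anatomy sketched in the comments above, urllib.parse.urlparse splits a URL into the same components (the #result anchor below is added purely for illustration):

from urllib.parse import urlparse

parts = urlparse('https://www.baidu.com/s?ie=UTF-8&wd=周杰伦#result')
print(parts.scheme)    # https  (protocol)
print(parts.netloc)    # www.baidu.com  (host; port 443 is implied for https)
print(parts.path)      # /s
print(parts.query)     # ie=UTF-8&wd=周杰伦
print(parts.fragment)  # result  (anchor)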

5. GET requests

(1) The quote method: percent-encoding a single parameter

# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
# %E5%91%A8%E6%9D%B0%E4%BC%A6 is 周杰伦 percent-encoded (the URL-safe encoding of its UTF-8 bytes)
# Goal: fetch the page source of https://www.baidu.com/s?wd=周杰伦

import urllib.request
import urllib.parse

url = 'https://www.baidu.com/s?wd='

# Customizing the request object is the first line of defense against anti-scraping checks
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

# Percent-encode the three characters 周杰伦; this relies on urllib.parse
name = urllib.parse.quote('周杰伦')
url = url + name

# Customize the request object
request = urllib.request.Request(url=url, headers=headers)

# Simulate a browser sending the request to the server
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
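A quick round-trip shows what quote actually produces, and unquote reverses it:

from urllib.parse import quote, unquote

encoded = quote('周杰伦')
print(encoded)           # %E5%91%A8%E6%9D%B0%E4%BC%A6
print(unquote(encoded))  # 周杰伦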

(2) The urlencode method: multiple parameters

# urlencode use case: several parameters at once
# e.g. https://www.baidu.com/s?wd=周杰伦&sex=男

# import urllib.parse
# data = {
#     'wd': '周杰伦',
#     'sex': '男',
#     'location': '中国台湾省'
# }
# a = urllib.parse.urlencode(data)
# print(a)

# Goal: fetch the page source of
# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E5%8F%B0%E6%B9%BE%E7%9C%81

import urllib.request
import urllib.parse

base_url = 'https://www.baidu.com/s?'
data = {
    'wd': '周杰伦',
    'sex': '男',
    'location': '中国台湾省'
}
new_data = urllib.parse.urlencode(data)
# print(new_data)

# Build the request path
url = base_url + new_data
print(url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

# Customize the request object
request = urllib.request.Request(url=url, headers=headers)

# Simulate a browser sending the request to the server
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)

6. POST requests

(1) Requesting Baidu Translate

import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/sug'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
data = {
    'kw': 'spider',
}

# POST parameters must be urlencoded AND then encoded to bytes
data = urllib.parse.urlencode(data).encode('utf-8')

# POST parameters are not appended to the URL; they go into the data argument
# of the customized request object
request = urllib.request.Request(url=url, data=data, headers=headers)

# Simulate a browser sending the request to the server
response = urllib.request.urlopen(request)

# Get the response data
content = response.read().decode('utf-8')
print(content)
print(type(content))

import json
obj = json.loads(content)
print(obj)

"""
Summary:
1. POST parameters must be urlencoded: data = urllib.parse.urlencode(data)
2. After urlencoding, encode must be called: data = urllib.parse.urlencode(data).encode('utf-8')
3. The parameters go into the customized request object: request = urllib.request.Request(url=url, data=data, headers=headers)
"""
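If print(obj) is hard to read, json.dumps with ensure_ascii=False renders the Chinese directly. A minimal sketch; the sample string below only assumes the general shape of the sug response:

import json

content = '{"errno": 0, "data": [{"k": "spider", "v": "n. 蜘蛛"}]}'  # assumed shape
obj = json.loads(content)
print(json.dumps(obj, ensure_ascii=False, indent=2))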

(2) Requesting Baidu Translate in detail

import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'

# Of all the browser headers captured, only Cookie turned out to be required here;
# the rest are left commented out.
headers = {
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36',
    # 'Accept': '*/*',
    # 'Accept-Encoding': 'gzip, deflate, br',
    # 'Accept-Language': 'zh-CN,zh;q=0.9',
    # 'Connection': 'keep-alive',
    # 'Content-Length': '136',
    # 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'BIDUPSID=9C33D9AF842C65882B22D090B46042A7; PSTM=1610940896; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; __yjs_duid=1_0eb9831c9922fbeed75979b5e0cfd7231619776633784; BAIDUID=9139193BF8529188B5B5F4741D0FD50D:FG=1; APPGUIDE_10_0_2=1; BDSFRCVID_BFESS=tP8OJeC62mn-gd3DoJHSKm8DOg3F4AQTH6aoOpQpj2wA39Ni7uPrEG0PHU8g0KubT5mFogKKy2OTH9DF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tbuJ_KDyJKP3fP36qR6VMPIHqxby26nL3jn9aJ5nJDoCVnTojTJUXfk_jNoThlvMtDCj2qkaQpP-HJ7yM-5HbfPwbbbTJbcz3CrPKl0MLPOYbb0xynoDLT0B5xnMBMnGamOnanra3fAKftnOM46JehL3346-35543bRTLnLy5KJtMDFljTu2DjQyeU5eetjK2CntsJOOaCvVHlQOy4oWK441Dh7MQt6R36chWqvEfp-WDqvoD-Jc3M04X-o9-hvT-54e2p3FBUQJHC33Qft20b0m3gFq3q8La5bD-R7jWhvBhl72y5rUQlRX5q79atTMfNTJ-qcH0KQpsIJM5-DWbT8IjHCDt5FjtRIeV-35b5rfjJrT5-QSMICShUFsWlOCB2Q-XPoO3KJWeUokQMo-b4CUhH7RqPriWbRM2MbgylRpjM3K0462XjKBKPOpK-DfX2TxoUJ2XM0VMloMqtnWKqLebPRih6j9Qg-8KpQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hC09j68MjjoM5h5K5-LXJR6BWb_8Kb7VbPbMeMnkbfJBDxcXe4bbt2nyaqnRWbCWsh7TDUDMLq_7yajK2MRjL6RBot3ptKTFjD3G0b5pQT8ryb_OK5OibCrn_R6Iab3vOPI4XpO1ef0zBN5thURB2DkO-4bCWJ5TMl5jDh3Mb6ksD-FtqtJHKbDt_II-JfK; BDUSS=NOUXhyMjAwblBqV2NpfkF0dU9Ca1Z3cGNhOTh0R0lBMHU1TWl-bjU0ck1aNUJpRVFBQUFBJCQAAAAAAAAAAAEAAAAe0iJm0ru49tXm1qqw9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMzaaGLM2mhiV3; BDUSS_BFESS=NOUXhyMjAwblBqV2NpfkF0dU9Ca1Z3cGNhOTh0R0lBMHU1TWl-bjU0ck1aNUJpRVFBQUFBJCQAAAAAAAAAAAEAAAAe0iJm0ru49tXm1qqw9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMzaaGLM2mhiV3; BAIDUID_BFESS=875209CD6EF1B6A11342D5227CD66391:FG=1; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1650515716,1650872384,1651827471,1652925735; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; av1_switch_v3=0; PSINO=7; BA_HECTOR=a4210hal8la101ag821h8brmv0r; RT="z=1&dm=baidu.com&si=6304f9glu2&ss=l3cf5iav&sl=0&tt=0&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=46b&ul=c3gs0&hd=c3gvq"; H_PS_PSSID=36425_36367_34812_35914_36166_34584_35978_36055_26350_36315_36447; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1652952606; ab_sr=1.0.1_ZWI2ZWRiZWI1Y2JiODRlYWRjZTlkNThjZjAyNDI5NDFhZDg1MGFmN2VkYzI1ZmVlMGIxYjVmZGNhOTEyOWU5ODJlMDI5ZDUxYjc2ODJhOGRlZWZmMzk1NmZjM2U2NDVjNWE3YWIwNjVkOTQ3Zjc2Mjk3Nzg0ZmQ3MmZmNmJlODkzMTQzMjE4OWZiMWExODJiMzZlOTA2NGJkMTkxZmNhZDZjNWQxMzgwZWE1YmM4ZjI5OGViNzM0Y2EyNmRjMjVl',
    # 'Host': 'fanyi.baidu.com',
    # 'Origin': 'https://fanyi.baidu.com',
    # 'Referer': 'https://fanyi.baidu.com/translate?aldtype=16047&query=&keyfrom=baidu&smartresult=dict&lang=auto2zh',
    # 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
    # 'sec-ch-ua-mobile': '?0',
    # 'sec-ch-ua-platform': '"Windows"',
    # 'Sec-Fetch-Dest': 'empty',
    # 'Sec-Fetch-Mode': 'cors',
    # 'Sec-Fetch-Site': 'same-origin',
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36',
    # 'X-Requested-With': 'XMLHttpRequest'
}

data = {
    'from': 'en',
    'to': 'zh',
    'query': 'spider',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    # sign and token appear to be generated by the page's JavaScript and tied to the cookie above
    'sign': '63766.268839',
    'token': '92389bc1e4d32b64ec36f56fb41f03db',
    'domain': 'common'
}

# POST parameters must be urlencoded and then encoded to bytes
data = urllib.parse.urlencode(data).encode('utf-8')

request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')

import json
obj = json.loads(content)
print(obj)

7. AJAX GET request: the first page of Douban movies


# GET request
# Fetch the first page of Douban movie data and save it

import urllib.request

url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=0&genres=%E5%96%9C%E5%89%A7'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

# (1) Customize the request object
request = urllib.request.Request(url=url, headers=headers)

# (2) Get the response data
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')

# (3) Save the data locally
# open() uses the platform default encoding (gbk on Chinese Windows); to save
# Chinese characters, pass encoding='utf-8' to open()
# fp = open('douban.json', 'w', encoding='utf-8')
# fp.write(content)
with open('douban1.json', 'w', encoding='utf-8') as fp:
    fp.write(content)

8. AJAX GET request: the first 10 pages of Douban movies

# The captured page URLs:
# https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=0&genres=%E5%96%9C%E5%89%A7
# https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=20&genres=%E5%96%9C%E5%89%A7
# https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=40&genres=%E5%96%9C%E5%89%A7
# https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=80&genres=%E5%96%9C%E5%89%A7
# rule: start increases by 20 per page, i.e. start = (page - 1) * 20

# Download the first ten pages:
# (1) customize the request object
# (2) get the response data
# (3) save the data

import urllib.parse
import urllib.request


def create_request(page):
    base_url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&'
    data = {
        'start': (page - 1) * 20,
        'limit': 20
    }
    data = urllib.parse.urlencode(data)
    url = base_url + data
    print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request


def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content


def down_load(page, content):
    with open('douban_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)


if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        # each page gets its own customized request object
        request = create_request(page)
        # get the response data
        content = get_content(request)
        # save it
        down_load(page, content)

9. AJAX POST request: the KFC official site

# Page 1
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
# POST
# cname:
# pid:
# keyword: 北京
# pageIndex: 1
# pageSize: 10

# Page 2
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
# POST
# cname:
# pid:
# keyword: 北京
# pageIndex: 2
# pageSize: 10

import urllib.request
import urllib.parse


def create_request(page):
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    data = {
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': page,
        'pageSize': '10'
    }
    # POST parameters must be urlencoded and then encoded to bytes
    data = urllib.parse.urlencode(data).encode('utf-8')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
    }
    request = urllib.request.Request(url=base_url, data=data, headers=headers)
    return request


def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content


def down_load(page, content):
    with open('kfc_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)


if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        # customize the request object
        request = create_request(page)
        # get the page source
        content = get_content(request)
        # save it
        down_load(page, content)

10. urllib exceptions

import urllib.request
import urllib.error

# A nonexistent article id on an existing host raises HTTPError:
# url = 'https://blog.csdn.net/JHXL_/article/details/1246767841'
# A nonexistent host raises URLError:
url = 'http://www.goudan111.com'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

try:
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    print(content)
# HTTPError is a subclass of URLError, so it must be caught first
except urllib.error.HTTPError:
    print('The system is being upgraded...')
except urllib.error.URLError:
    print('I already told you, the system is being upgraded...')
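Both exception classes carry details worth printing: HTTPError has a .code, and both have a .reason. A sketch (the CSDN path below is made up just to force a 404):

import urllib.request
import urllib.error

try:
    # hypothetical nonexistent path on a real host: raises HTTPError
    urllib.request.urlopen('https://blog.csdn.net/this-path-does-not-exist')
except urllib.error.HTTPError as e:
    print('HTTP error:', e.code, e.reason)   # e.g. 404 Not Found
except urllib.error.URLError as e:
    print('URL error:', e.reason)            # e.g. name resolution failure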

11. Basic use of handlers

import urllib.request

url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)

# (1) get a handler object
handler = urllib.request.HTTPHandler()

# (2) build an opener from the handler
opener = urllib.request.build_opener(handler)

# (3) call the opener's open method
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)
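On its own, HTTPHandler just reproduces what urlopen does; the point of the handler/opener chain is that other handlers (proxies in section 12, cookie processors, etc.) can be swapped in. One more standard-library trick: install_opener makes a custom opener the global default, so plain urlopen calls route through it. A minimal sketch:

import urllib.request

handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)
urllib.request.install_opener(opener)   # from now on urlopen() uses this opener
response = urllib.request.urlopen('http://www.baidu.com')
print(response.getcode())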

12. Proxies

import urllib.request

url = 'https://www.baidu.com/s?ie=UTF-8&wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

# Customize the request object
request = urllib.request.Request(url=url, headers=headers)

# Simulate a browser visiting the server
# response = urllib.request.urlopen(request)

# A proxy we found (paid proxies are also an option)
proxies = {
    'http': '118.24.219.151:16817'
}
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)

# Get the response data
content = response.read().decode('utf-8')
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)

13. A proxy pool

import random
import urllib.request

proxies_pool = [
    {'http': '14.215.212.37:9168'},
    {'http': '14.215.212.37:9168'},
    {'http': '14.215.212.37:9168'}
]
# pick a proxy at random (in a real pool the entries would differ)
proxies = random.choice(proxies_pool)

url = 'https://www.baidu.com/s?ie=UTF-8&wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('dailichi.html', 'w', encoding='utf-8') as fp:
    fp.write(content)

