Tool: fetching the latest proxy IPs (66ip.cn)

Scrape proxy IPs from the 66ip free proxy site (www.66ip.cn).

Technologies used (a minimal fetch-and-parse sketch follows the list):
- Python 3
- requests
- XPath (via lxml)
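
Before the full script, here is a minimal sketch of the core step: download one listing page with requests and pull the IP and port columns out with XPath. This is only a sketch under the assumption that 66ip.cn still serves its table inside a div whose id contains "main"; the XPath expressions are the same ones the full script uses, and the hard-coded User-Agent string is just a stand-in for the userAgent helper module used later.

import requests
from lxml import etree

def fetch_page_proxies(page=2):
    """Return (ip, port) tuples scraped from one 66ip.cn listing page."""
    url = 'http://www.66ip.cn/' + str(page) + '.html'
    headers = {'User-Agent': 'Mozilla/5.0'}   # stand-in UA; the full script uses the userAgent helper
    html = requests.get(url, headers=headers, timeout=5).text
    tree = etree.HTML(html)
    ips = tree.xpath('//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[1]/text()')
    ports = tree.xpath('//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[2]/text()')
    return list(zip(ips, ports))

print(fetch_page_proxies(2))    # e.g. [('1.2.3.4', '8080'), ...]

The full script below wraps the same idea in a class and adds validation of each scraped proxy.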


import userAgent   # local helper module that returns a random User-Agent header dict
import requests
from lxml import etree


class Proxies:
    """Scrape proxies from the 66ip free proxy site.

    Example: proDicList = Proxies.get_proxies(4)
    """
    url = 'http://www.66ip.cn/'                  # nationwide listing
    # url = 'http://www.66ip.cn/areaindex_1/'    # alternative: per-area listing
    proxies = []    # validated proxy dicts collected so far
    geshu = 0       # number of proxies requested

    @staticmethod
    def get_one_proxy(count=1):
        """Return a single validated proxy dict."""
        proxy = Proxies.get_proxies(count)
        return proxy[0]

    @staticmethod
    def get_proxies(count):
        """Return a list of `count` validated proxy dicts."""
        Proxies.geshu = count
        page = 1
        while True:      # keep crawling pages until enough proxies have been collected
            page += 1    # start from page 2
            url_new = Proxies.url + str(page) + '.html'
            content = Proxies.get_content(url_new)
            Proxies.get_info(content)            # parse and validate one page of proxies
            if len(Proxies.proxies) > count:
                return Proxies.proxies[:count]

    @staticmethod
    def get_content(url):
        """Fetch the page content for the given url."""
        headers = userAgent.User_Agent.get_user_agent('pc')
        response = requests.get(url=url, headers=headers)
        return response.text

    @staticmethod
    def get_info(content):
        """Extract the IP and port columns and validate each pair."""
        datas_ips = etree.HTML(content).xpath(
            '//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[1]/text()')
        datas_ports = etree.HTML(content).xpath(
            '//div[contains(@id,"main")]/div/div[1]/table/tr[position()>1]/td[2]/text()')
        for i in range(len(datas_ips)):
            Proxies.verif_ip(datas_ips[i], datas_ports[i])   # keep only proxies that pass validation
            if len(Proxies.proxies) > Proxies.geshu:
                break

    @staticmethod
    def verif_ip(ip, port):
        """Validate a proxy by sending a request through it."""
        user_agent = userAgent.User_Agent.get_user_agent('pc')
        url = 'http://www.baidu.com'
        proxies = {'http': 'http://' + ip + ':' + port}
        try:
            # the short timeout deliberately filters out slow proxies
            res = requests.get(url=url, proxies=proxies, headers=user_agent, timeout=0.1)
            if res.status_code == 200:
                Proxies.proxies.append(proxies)
                return True
        except Exception:
            try:
                proxies = {'https': 'https://' + ip + ':' + port}
                res = requests.get(url=url, proxies=proxies, headers=user_agent, timeout=0.1)
                if res.status_code == 200:
                    Proxies.proxies.append(proxies)   # keep the proxy if it works
                    return True
            except Exception:
                return False
        return False
if __name__ == '__main__':
    one_proxy = Proxies.get_one_proxy()
    print(one_proxy)
    proxies = Proxies.get_proxies(10)
    print(proxies)
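
The dictionaries collected by Proxies are already in the shape requests expects for its proxies argument, so a harvested proxy can be plugged straight into another request. A usage sketch (http://httpbin.org/ip is just a placeholder target):

proxy = Proxies.get_one_proxy()      # e.g. {'http': 'http://1.2.3.4:8080'}
resp = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
print(resp.text)                     # should report the proxy's IP rather than your own

Note that verif_ip uses a 0.1-second timeout on purpose to discard slow proxies; if too few proxies survive validation, raise that value.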

