Advanced Crawling (Introductory)

The point of a crawler is, of course, not just downloading images, pages, and videos. Most of the time you need to extract data, and extracting data means processing HTML, XML, JSON, and similar documents:

  • XPath selectors and BeautifulSoup, for selecting page nodes and then pulling out data
  • the requests library, a replacement for urllib.request, for sending requests and working through proxies

Their detailed usage is not recorded here, but the sketch below gives the flavor.
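For orientation only, a minimal sketch of the XPath half of that workflow; the HTML snippet here is made up for the example:

from lxml import etree

html = etree.HTML('<div id="container"><img src="a.jpg" alt="demo"/></div>')
# XPath walks the node tree; @attr pulls out attribute values
for img in html.xpath('//div[@id="container"]//img'):
    print(img.xpath('./@src')[0], img.xpath('./@alt')[0])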

1. Proxies

Proxy types

  • Transparent: the request goes out from a proxy IP, but the target server can still see your real IP
  • Anonymous: your real IP is hidden; the server can tell a proxy is in use but cannot identify your address
  • Elite (high anonymity): the proxy behaves like an ordinary browser client, so the server cannot even tell a proxy is in use

Use case

Target sites' anti-scraping mechanisms restrict by IP (banning the address), so when one IP gets blocked, switching to a proxy keeps the crawl going.

Usage

import urllib.request
import urllib.parse

# Configure the proxy object: the protocol is the key, host:port the value
handler = urllib.request.ProxyHandler(proxies={'http': '183.33.192.247:9797'})

url = 'https://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('proxy.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
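The requests library mentioned earlier does the same job with much less ceremony. A minimal sketch; the proxy address is the same placeholder as above and is almost certainly dead, so substitute a live one:

import requests

# Placeholder proxy; substitute a live address before running
proxies = {'http': 'http://183.33.192.247:9797'}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'}

# httpbin echoes the caller's IP, which should now be the proxy's, not yours
r = requests.get('http://httpbin.org/ip', headers=headers, proxies=proxies, timeout=10)
print(r.text)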

2. Tools

Packet-capture tools

Fiddler

Download Fiddler directly from the official site.

On macOS you also need to install Mono first.

Installation guide

Launching on macOS:

mono --arch=32 Fiddler.exe

Note: after trying it out, Fiddler turned out to be quite unfriendly on macOS; it gets far better reviews on Windows.

On macOS, use Charles (青花瓷) instead.

Charles

Download

Installation and configuration

XPath Helper

Using the XPath plugin

  1. Open Chrome's extensions page (Settings → More tools → Extensions)
  2. Drag Xpath.crx onto it
  3. Quit the browser and restart it
  4. Ctrl+Shift+X opens the XPath helper (the black input box at the top of the page)
  5. Hold Shift and move the mouse over any element on the page; its XPath is located automatically

Regex replacement

In Sublime Text:

Alt+Command+F opens find-and-replace (Ctrl+H on Windows).

In the Find field, enter (.*?):\s(.*)

In the Replace field, enter "\1":"\2"; this rewrites key: value lines into proper JSON pairs.
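The same substitution can be scripted with Python's re module; a minimal sketch on made-up input:

import re

raw = 'name: dancer\nage: 10'
# \1 captures the key, \2 the value; quoting both yields JSON-style pairs
fixed = re.sub(r'(.*?):\s(.*)', r'"\1":"\2"', raw)
print(fixed)
# "name":"dancer"
# "age":"10"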

3. Hands-on practice

站长素材 image gallery (XPath)
import urllib.request
import urllib.parse
from lxml import etree
import os


def get_url(page):
    # Page 1 has a different URL pattern from the rest
    if page == 1:
        url = 'http://sc.chinaz.com/tupian/shuaigetupian.html'
    else:
        url = 'http://sc.chinaz.com/tupian/shuaigetupian_{}.html'.format(page)
    return url


def request_data(url):
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    response = opener.open(request)
    return response


def parse_data(response):
    html_str = response.read().decode('utf-8')
    # etree.HTML parses a string (etree.parse would read a file instead)
    tree = etree.HTML(html_str)
    img_xpath = '//div[@id="container"]/div//img'
    img_nodes = tree.xpath(img_xpath)
    items = []
    for img_node in img_nodes:
        # The gallery lazy-loads images: the real URL sits in src2, not src
        src = img_node.xpath('./@src2')[0]
        alt = img_node.xpath('./@alt')[0]
        item = {'src': src, 'alt': alt}
        items.append(item)
    return items


def download_data(items):
    for item in items:
        src = item['src']
        alt = item['alt']
        suffix = src.split('.')[-1]
        file_name = alt + '.' + suffix
        file_path = os.path.join('shuaige', file_name)
        urllib.request.urlretrieve(src, file_path)
        print(file_name + ' downloaded')
    print('All downloads finished')


if __name__ == '__main__':
    page = int(input('Enter the page number to fetch: '))
    url = get_url(page)
    response = request_data(url)
    items = parse_data(response)
    download_data(items)
Stock quotes (BeautifulSoup)
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup

url = 'http://quote.stockstar.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
# The site is encoded as gb2312, not utf-8
content = response.read().decode('gb2312')

# Locate the table rows with a BeautifulSoup CSS selector
soup = BeautifulSoup(content, 'lxml')
tr_list = soup.select('#datalist tr')
for tr in tr_list:
    td_list = tr.find_all('td')
    for td in td_list:
        # If .string does not work, get_text() always will
        print(td.string)
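The comment above deserves unpacking: Tag.string returns None whenever a tag has more than one child node, while get_text() concatenates all descendant text. A small self-contained illustration:

from bs4 import BeautifulSoup

snippet = '<td>plain</td><td><a>nested</a> text</td>'
tds = BeautifulSoup(snippet, 'html.parser').find_all('td')
print(tds[0].string)      # 'plain'  (single text child)
print(tds[1].string)      # None     (multiple children)
print(tds[1].get_text())  # 'nested text'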
智联招聘 job listings (BeautifulSoup)
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
from Item import GSXX
import json


def request_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    data = {
        'jl': '北京',    # location
        'kw': 'python',  # keyword
        'p': 1,          # page
    }
    data = urllib.parse.urlencode(data)
    url = url + data
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    return response


def parse_data(response):
    content = response.read().decode('utf-8')
    soup = BeautifulSoup(content, 'lxml')
    table_list = soup.select('#newlist_list_content_table table')[1:]
    items = []
    for table in table_list:
        zwmc = table.select('.zwmc')[0].get_text().strip('\n')  # job title
        gsmc = table.select('.gsmc')[0].get_text().strip('\n')  # company
        zwyx = table.select('.zwyx')[0].get_text().strip('\n')  # salary
        gzdd = table.select('.gzdd')[0].get_text().strip('\n')  # location
        item = GSXX(zwmc, gsmc, zwyx, gzdd)
        items.append(item)
    return items


def transform_data(item):
    # Helper to turn an item object into a plain dict
    return item.__dict__


def save_data(items):
    for item in items:
        json_obj = item.__dict__
        json.dump(json_obj, open('zhilian.json', 'a', encoding='utf-8'), ensure_ascii=False)


def study_json():
    # Notes on the json module (see also pickle for binary serialization):
    # json.load()  reads a JSON object from a file
    # json.loads() parses a string into a JSON object
    # json.dump()  writes a JSON object to a file
    # json.dumps() serializes a JSON object into a string
    dic = {'name': 'dancer', 'age': 10}
    json_str = json.dumps(dic)
    print(type(json_str))
    json.dump(dic, open('test.json', 'w', encoding='utf-8'))
    json_obj = json.load(open('test.json', 'r', encoding='utf-8'))
    print(json_obj, type(json_obj))
    json_obj2 = json.loads(json_str)
    print(type(json_obj2), json_obj2)


if __name__ == '__main__':
    # GET request with query-string parameters
    url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?'
    response = request_data(url)
    items = parse_data(response)
    save_data(items)

Using Selenium

  • Selenium drives a real browser, so pages behave exactly as a user would see them, JavaScript included
  • PhantomJS is a headless browser that executes page JavaScript without opening a window (a minimal sketch of the pattern follows this list)
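A minimal sketch of the pattern, assuming Chrome and a matching chromedriver are installed (PhantomJS is no longer maintained, but the flow is identical):

from selenium import webdriver
import time

driver = webdriver.Chrome()           # or webdriver.PhantomJS() on older setups
driver.get('https://www.baidu.com/')
time.sleep(2)                         # crude wait for the page's JS to finish
html = driver.page_source             # the DOM *after* JavaScript ran
print(html[:200])
driver.quit()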
阳光宽频网 (365yg.com) video download
import requests
from lxml import etree
import json
from selenium import webdriver
import time


def handle_video(name, url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    # A plain requests.get() fails here: the <video> tag is injected by
    # JavaScript. So let PhantomJS/Selenium load the page, execute the JS,
    # and then parse the rendered source.
    driver = webdriver.PhantomJS('phantomjs.exe')
    driver.get(url)
    # Sleep briefly so the page's JS has time to run
    time.sleep(5)
    # page_source now holds the DOM *after* JavaScript execution
    html_tree = etree.HTML(driver.page_source)
    video_src = html_tree.xpath('//video[@class="vjs-tech"]/source/@src')[0]
    r = requests.get(url=video_src, headers=headers)
    filename = './video/' + name + '.mp4'
    with open(filename, 'wb') as fp:
        fp.write(r.content)
    print(filename + ' downloaded')


def handle_index(url, json_url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    # The index is also populated by JS, so rather than scraping its HTML we
    # hit the JSON data endpoint discovered with a packet-capture tool
    r = requests.get(url=json_url, headers=headers)
    obj = json.loads(r.text)
    print('Starting downloads...')
    for video_obj in obj['data']:
        # Video name and detail-page link
        name = video_obj['video_id']
        video_url = 'https://365yg.com' + video_obj['source_url']
        handle_video(name, video_url)
    print('Downloads finished')


def main():
    # Home page URL plus the captured JSON feed URL
    url = 'https://365yg.com/'
    json_url = 'https://365yg.com/api/pc/feed/?category=video&utm_source=toutiao&widen=3&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A1253AF48A0FCE5&cp=5A4A4F4CAEE59E1&_signature='
    handle_index(url, json_url)


if __name__ == '__main__':
    main()

阳光宽频网 video download (Selenium scrolling variant)

from selenium import webdriver
from handless import share_browser
import time
from lxml import etree
import os
import requests

shouye_url = "http://www.365yg.com/"
# share_browser() comes from the author's own helper module (handless.py)
driver = share_browser()
driver.get(shouye_url)
time.sleep(3)
# Scroll to the bottom so the index lazy-loads more video entries
driver.execute_script("document.documentElement.scrollTop=1000000")
time.sleep(3)

detail_xpath = '//a[@ga_event="video_comment_click"]/@href'
tree = etree.HTML(driver.page_source)
detail_list = tree.xpath(detail_xpath)

for detail in detail_list:
    detail_url = 'http://www.365yg.com' + detail
    driver.get(detail_url)
    time.sleep(1)
    tree1 = etree.HTML(driver.page_source)
    src = tree1.xpath('//video/@src')[0]
    name = tree1.xpath('//h2/text()')[0]
    # Protocol-relative URLs need the scheme prepended
    if not src.startswith("http:"):
        src = 'http:' + src
    file_path = 'ygvideo'
    file_name = name + '.mp4'
    path = os.path.join(file_path, file_name)
    r = requests.get(src)
    with open(path, 'wb') as fp:
        fp.write(r.content)
CAPTCHA recognition

Use a third-party platform's integrated recognition service.

云打码 (YunDaMa)

  • Register a developer account
  • Log in and create a "software" entry
  • Note down the software id and key
  • Download the Python debugging docs
  • Put the two DLL files and the .py file into the project directory

YDpython3.x.py

# -*- coding: cp936 -*-
import sys
import os
from ctypes import *


class Recgonizier():
    def __init__(self):
        self.YDMApi = windll.LoadLibrary('yundamaAPI')
        self.appId = '软件id'        # software id from the developer console
        self.appKey = b'软件key'     # software key
        self.username = b'普通账号'  # an ordinary (non-developer) account
        self.password = b'密码'      # its password
        # Step 1: initialize YunDaMa; call only once
        self.YDMApi.YDM_SetAppInfo(self.appId, self.appKey)
        # Step 2: log in to the YunDaMa account; call only once
        self.uid = self.YDMApi.YDM_Login(self.username, self.password)
        if self.username == b'test':
            exit('\r\n>>>Set a real username and password first')

    # One-call recognition: YDM_EasyDecodeByPath
    def esay_recognition(self, file_path):
        # 1004 means "4 alphanumeric characters"; pricing differs per type,
        # so fill it in accurately or recognition accuracy suffers.
        # All types: http://www.yundama.com/price.html
        codetype = 1004
        # Reserve 30 bytes for the recognition result
        result = c_char_p(b"                              ")
        # Recognition timeout, in seconds
        timeout = 60
        filename = file_path.encode('utf-8')
        # The one-call helper needs no prior YDM_SetAppInfo/YDM_Login,
        # which makes it convenient for scripts
        captchaId = self.YDMApi.YDM_EasyDecodeByPath(self.username, self.password, self.appId, self.appKey, filename, codetype, timeout, result)
        return result.value

    # Ordinary recognition: YDM_DecodeByPath
    def normal_recognition(self, file_path):
        print('\r\n>>>Logging in...')
        if self.uid > 0:
            balance = self.YDMApi.YDM_GetBalance(self.username, self.password)
        print('\r\n>>>Running ordinary recognition...')
        codetype = 1004
        result = c_char_p(b"                              ")
        filename = file_path.encode('utf-8')
        # Requires YDM_SetAppInfo and YDM_Login to have been called first
        captchaId = self.YDMApi.YDM_DecodeByPath(filename, codetype, result)
        return result.value
古诗文网 (gushiwen.org) login
import requests
from bs4 import BeautifulSoup
from RecognizerFile import Recgonizier

# Request this page to obtain the CAPTCHA image
login_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
# The login form posts to this URL
post_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
}

s = requests.Session()
response = s.get(login_url, headers=headers)
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
img_url = 'https://so.gushiwen.org' + soup.select('#imgCode')[0].attrs.get('src')

# Note: the image must be fetched through the same session, so that the
# CAPTCHA matches the cookies used by the login POST below
img_response = s.get(img_url)
with open('getCode.jpg', 'wb') as fp:
    fp.write(img_response.content)

r = Recgonizier()
code = r.esay_recognition('getCode.jpg')
print('CAPTCHA recognition result: ' + str(code))

# ASP.NET pages require these two hidden form fields to be posted back
a = soup.select('#__VIEWSTATE')[0].attrs.get('value')
b = soup.select('#__VIEWSTATEGENERATOR')[0].attrs.get('value')
print(a, b)

data = {
    '__VIEWSTATE': a,
    '__VIEWSTATEGENERATOR': b,
    'from': 'http://so.gushiwen.org/user/collect.aspx',
    'email': '邮箱@qq.com',  # your account email
    'pwd': '密码',           # your password
    'code': code,
    'denglu': '登录',
}
response = s.post(url=post_url, data=data, headers=headers)
response.encoding = 'utf-8'
with open('gushi.html', 'w', encoding='utf-8') as fp:
    fp.write(response.text)

4. Simulated login

QQ Mail
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

# Simulate logging in to QQ Mail
driver = webdriver.Chrome()
driver.get("https://mail.qq.com/")
time.sleep(5)
# The login form lives inside an iframe; switch into it first
driver.switch_to.frame("login_frame")
# Fill in the username and password fields
driver.find_element_by_name("u").send_keys("******@qq.com")
driver.find_element_by_name("p").send_keys("your password")
time.sleep(5)
driver.find_element_by_id("login_button").click()
# Alternative: driver.find_element_by_id("login_button").send_keys(Keys.RETURN)
time.sleep(5)
print(driver.get_cookies())
driver.save_screenshot('qq.png')
driver.quit()
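Printing get_cookies() is not just for show: the logged-in session can be reused outside the browser. A minimal sketch of carrying those cookies into a requests session (run before the driver.quit() above; the target URL is only an example):

import requests

session = requests.Session()
# get_cookies() returns a list of dicts with 'name' and 'value' keys
for cookie in driver.get_cookies():
    session.cookies.set(cookie['name'], cookie['value'])

# Subsequent requests go out as the logged-in user
r = session.get('https://mail.qq.com/', headers={'User-Agent': 'Mozilla/5.0'})
print(r.status_code)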
Zhihu
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time

# Simulate logging in to Zhihu
driver = webdriver.Chrome()
url = 'https://www.zhihu.com/signin'
driver.get(url)
driver.find_element_by_name('username').send_keys('189********')
time.sleep(3)
driver.find_element_by_name('password').send_keys('your password')
time.sleep(3)
driver.find_element_by_xpath('//form/button').click()
driver.save_screenshot('zhihu.png')
print(driver.get_cookies())
driver.quit()

