NetEase Cloud Classroom Python Web Crawler: Video Course Source Code and Implementation

(Code screenshots omitted; the full source code from the course is below.)

import requests
res = requests.get('http://www.sina.com.cn/')
res.encoding = 'utf-8'
print(res.text)
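Before moving on, it can help to confirm that the request actually succeeded and to let requests detect the page encoding rather than hard-coding it. A minimal sketch of that check (my addition, not part of the course code):

# Sketch (not from the course): verify the response and auto-detect
# the encoding instead of assuming utf-8.
import requests

res = requests.get('http://www.sina.com.cn/')
res.raise_for_status()                # raises HTTPError on 4xx/5xx status codes
res.encoding = res.apparent_encoding  # guess the encoding from the body
print(res.status_code, res.encoding)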
# Parse web page elements with BeautifulSoup4
from bs4 import BeautifulSoup
html_sample = ' \
<html> \
 <body> \
  <h1 id="title">Hello World</h1> \
  <a href="#" class="link">This is link1</a> \
  <a href="# link2" class="link">This is link2</a> \
 </body> \
</html>'

soup = BeautifulSoup(html_sample, 'html.parser')
print(soup.text)

# Basic BeautifulSoup operations
# Use select to find elements with the h1 tag
soup = BeautifulSoup(html_sample, 'html.parser')
header = soup.select('h1')  # returns a list
print(header)
print(header[0])
print(header[0].text)

# Use select to find elements with the a tag
soup = BeautifulSoup(html_sample, 'html.parser')
alink = soup.select('a')
print(alink)
for link in alink:
    print(link)
    print(link.text)
    print(link['href'])

# Use select to find all elements with id "title" (prefix the id with #)
alink = soup.select('#title')
print(alink)

# Use select to find all elements with class "link" (prefix the class with .)
soup = BeautifulSoup(html_sample, 'html.parser')
for link in soup.select('.link'):
    print(link)

# Use select to collect the href of every a tag
alinks = soup.select('a')
for link in alinks:
    print(link['href'])  # a tag's attributes are exposed as a dict

# See how to scrape the Sina news list page
# Grab the time, title, and content of each item
import requests
from bs4 import BeautifulSoup

res = requests.get('http://news.sina.com.cn/china')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('.news-item'):
    if len(news.select('h2')) > 0:
        h2 = news.select('h2')[0].text
        time = news.select('.time')[0].text
        timesource = news.select('.time')[0].contents[0].strip()
        a = news.select('a')[0]['href']
        print(time, h2, a)

# Scrape a news article page
res = requests.get('http://news.sina.com.cn/c/gat/2018-10-24/doc-ifxeuwws7683546.shtml')
res.encoding = 'utf-8'
# print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')

# Article title
# print(soup.select('#artibodyTitle')[0].text)

# Publication time
timesource = soup.select('.date')[0].contents[0].strip().replace(' ', '')
print(timesource)
print(type(timesource))
from datetime import datetime
dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')  # parse the string with this exact format into a datetime
print(dt)
print(type(dt))
print(type(dt.strftime('%Y-%m-%d')))  # format the datetime back into a string

# News source
# print(soup.select('.time-source span a')[0].text)

# Assemble the article body and get the editor's name
' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])  # join the paragraphs with spaces

# Editor's name
editor = soup.select('.show_author')[0].text.strip('责任编辑:')
print(editor)

# Comment count
print(soup.select('#commentCount1'))

comments = requests.get('http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-fypnyqi1126795&group=&compress=0&ie=gbk&oe=gbk&page=1&page_size=20&jsvar=loader_1540784047769_15941830')
comments.encoding = 'utf-8'
# print(comments.text.lstrip('var loader_1540784047769_15941830='))
import json
jd = json.loads(comments.text.lstrip('var loader_1540784047769_15941830='))
print(jd)
print(jd['result']['count']['total'])

# Parameterize the comment URL with the news id
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=gbk&oe=gbk&page=1&page_size=20&jsvar=loader_1540784047769_15941830'
comments = requests.get(commentURL.format('fypnyqi1126795'))
comments.encoding = 'utf-8'
# print(comments.text.lstrip('var loader_1540784047769_15941830='))
jd = json.loads(comments.text.lstrip('var loader_1540784047769_15941830='))
print(jd)
print(jd['result']['count']['total'])
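One caveat before wrapping this logic in a function: str.lstrip removes any leading characters drawn from the set you pass, not a literal prefix, so lstrip('var loader_1540784047769_15941830=') only works because the JSON body happens to start with '{', which is not in that set. A sturdier sketch (my suggestion, not the course's code) pulls the JSON object out with a regex:

# Sketch: extract the JSON payload of a 'var xxx = {...}' response
# with a regex instead of relying on lstrip's character-set semantics.
# extract_jsvar_payload is a hypothetical helper, not course code.
import re
import json

def extract_jsvar_payload(text):
    m = re.search(r'=\s*(\{.*\})', text, re.DOTALL)  # from the '=' to the last '}'
    return json.loads(m.group(1))

# usage: jd = extract_jsvar_payload(comments.text)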
# Wrap the comment count logic in a function
import re
import json
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=gbk&oe=gbk&page=1&page_size=20&jsvar=loader_1540784047769_15941830'

def getCommentCounts(newsurl):
    m = re.search('doc-i(.*).shtml', newsurl)
    newsid = m.group(1)  # group(1) is what (.*) captured; group(0) is the whole match
    comments = requests.get(commentURL.format(newsid))
    jd = json.loads(comments.text.lstrip('var loader_1540784047769_15941830='))
    return jd['result']['count']['total']

news = 'http://news.sina.com.cn/o/2017-12-06/doc-ifypnyqi1126795.shtml'
getCommentCounts(news)

# Build a function that extracts all the details from an article page
def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # result['title'] = soup.select('#artibodyTitle')[0].text
    result['title'] = soup.select('.main-title')[0].text
    # result['newssource'] = soup.select('.time-source span a')[0].text
    # timesource = soup.select('.time-source')[0].contents[0].strip()
    timesource = soup.select('.date')[0].contents[0].strip().replace(' ', '')
    result['dt'] = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    result['article'] = ' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])  # join the paragraphs with spaces
    result['editor'] = soup.select('.show_author')[0].text.strip('责任编辑:')
    result['comment'] = getCommentCounts(newsurl)
    return result

# Aside: the difference between response.text and response.content
import requests
from bs4 import BeautifulSoup
response = requests.get('https://www.baidu.com')
re_text = response.text        # str, decoded with the guessed encoding
re_content = response.content  # raw bytes
print(re_text)
print(type(re_text))
print(re_content)
print(type(re_content))
response.encoding = 'utf-8'    # set the encoding explicitly, then .text decodes correctly
re_text = response.text
print(re_text)

# getNewsDetail('http://news.sina.com.cn/o/2017-12-06/doc-ifypnyqi1126795.shtml')
getNewsDetail('http://news.sina.com.cn/o/2018-10-30/doc-ifxeuwws9358830.shtml')

# Fetch the news list from the JSONP roll API
res = requests.get('https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page=1&encode=utf-8&callback=feedCardJsonpCallback&_=1540805221737')
jd = json.loads(res.text.lstrip('try{').lstrip('feedCardJsonpCallback(').rstrip('tch(e){};').rstrip(';}ca').rstrip(')'))
for entry in jd['result']['data']:
    print(entry['url'])
# the raw response ends with: "comment_total":3}]}});}catch(e){};

# Parse every link on a list page into news details
def parseListLinks(url):
    newsdetails = []
    res = requests.get(url)
    jd = json.loads(res.text.lstrip('try{').lstrip('feedCardJsonpCallback(').rstrip('tch(e){};').rstrip(';}ca').rstrip(')'))
    for entry in jd['result']['data']:
        newsdetails.append(getNewsDetail(entry['url']))
    return newsdetails

# Loop over the list pages and collect all the news
url = 'https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page={}&encode=utf-8&callback=feedCardJsonpCallback&_=1540805221737'
news_total = []
for i in range(1, 3):
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)
print(news_total)

# Organize the data
import pandas
df = pandas.DataFrame(news_total)
df.head(40)

# Save the data to Excel and to a SQLite database
df.to_excel('news.xlsx')
import sqlite3
with sqlite3.connect('news.sqlite') as db:
    df.to_sql('news', con=db)

# Read the data back
import sqlite3
with sqlite3.connect('news.sqlite') as db:
    df2 = pandas.read_sql_query('SELECT * FROM news', con=db)
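A note on the editor extraction in getNewsDetail above: str.strip('责任编辑:'), like lstrip, strips any of those individual characters from both ends of the string, so an editor name that begins or ends with one of them would be mangled. A safer sketch (my suggestion, not the course's code) removes the label as a literal substring:

# Sketch: remove the '责任编辑:' label as a substring, not a character set.
editor_text = '责任编辑:张三 '  # example input in the page's format
editor = editor_text.replace('责任编辑:', '').strip()
print(editor)  # 张三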
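Also note that DataFrame.to_sql raises a ValueError if the news table already exists, so the save step fails on a second run. Passing if_exists makes it re-runnable; a sketch (my addition, with if_exists='replace' chosen as an assumption):

# Sketch: make the SQLite save step re-runnable. if_exists='replace'
# drops and recreates the table; 'append' would add rows instead.
# index=False skips writing the DataFrame index column.
import sqlite3
with sqlite3.connect('news.sqlite') as db:
    df.to_sql('news', con=db, if_exists='replace', index=False)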

Course link: https://study.163.com/course/introduction.htm?courseId=1003285002

