NetEase Cloud Classroom Python Web Crawler Video: Source Code and Implementation

import requests
res = requests.get('http://www.sina.com.cn/')
res.encoding = 'utf-8'
print(res.text)
######################################
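# A quick status check before parsing (a sketch, not part of the course code):
if res.status_code == 200:  # 200 means the request succeeded
    print(res.text[:200])  # show just the first 200 characters
else:
    print('request failed:', res.status_code)
######################################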
# Parse web page elements with BeautifulSoup4
from bs4 import BeautifulSoup
html_sample = ' \
<html> \
<body> \
<h1 id="title">Hello World</h1> \
<a href="#" class="link">This is link1</a> \
<a href="# link2" class="link">This is link2</a> \
</body> \
</html>'
soup = BeautifulSoup(html_sample, 'html.parser')
print(soup.text)
########################################
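# To inspect how the fragment was parsed, prettify() prints the tree indented (a small illustration):
print(soup.prettify())  # indented view of the parsed HTML
########################################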
# Basic BeautifulSoup operations
# Use select to find the h1 elements
soup = BeautifulSoup(html_sample, 'html.parser')
header = soup.select('h1')  # select returns a list
print(header)
print(header[0])
print(header[0].text)
#######################################
# Use select to find the a tags
soup = BeautifulSoup(html_sample, 'html.parser')
alink = soup.select('a')
print(alink)
for link in alink:
    print(link)
    print(link.text)
    print(link['href'])
######################################
# Use select to find the element whose id is title (prefix the id with #)
alink = soup.select('#title')
print(alink)
#####################################
# Use select to find all elements whose class is link (prefix the class with .)
soup = BeautifulSoup(html_sample, 'html.parser')
for link in soup.select('.link'):
    print(link)
####################################
# Use select to get the href of every a tag
alinks = soup.select('a')
for link in alinks:
    print(link['href'])  # works because a tag's attributes are wrapped as a dictionary
####################################
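# Since the attributes really are a dict, attrs and get() also work (a small sketch):
first = soup.select('a')[0]
print(first.attrs)  # e.g. {'href': '#', 'class': ['link']}
print(first.get('href'))  # get() returns None instead of raising KeyError when the attribute is missing
####################################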
# See how to scrape the Sina news listing
# Grab each item's time, title, and content
import requests
from bs4 import BeautifulSoup
res = requests.get('http://news.sina.com.cn/china')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('.news-item'):
    if len(news.select('h2')) > 0:
        h2 = news.select('h2')[0].text
        time = news.select('.time')[0].text
        timesource = news.select('.time')[0].contents[0].strip()
        a = news.select('a')[0]['href']
        print(time, h2, a)
###################################
# Scrape a news article page
res = requests.get('http://news.sina.com.cn/c/gat/2018-10-24/doc-ifxeuwws7683546.shtml')
res.encoding = 'utf-8'
# print (res.text)
soup = BeautifulSoup(res.text, 'html.parser')
# Get the article title
#print (soup.select('#artibodyTitle')[0].text)
# Get the article timestamp
timesource = soup.select('.date')[0].contents[0].strip().replace(' ','')
print (timesource)
print (type(timesource))
from datetime import datetime
dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')  # parse a string with the given format into a datetime
print (dt)
print (type(dt))
print(type(dt.strftime('%Y-%m-%d')))  # format the datetime back into a string (%m is the month code)
# Get the news source
#print (soup.select('.time-source span a')[0].text)
##################################
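# Note that %m (month) and %M (minute) are different format codes; a quick demonstration:
from datetime import datetime
d = datetime(2018, 10, 24, 9, 5)
print(d.strftime('%Y-%m-%d'))  # 2018-10-24
print(d.strftime('%Y-%M-%d'))  # 2018-05-24 -- %M wrongly inserts the minute
##################################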
# Clean up the article body and get the editor's name
' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])  # join the body paragraphs with spaces
# Get the editor's name
editor = soup.select('.show_author')[0].text.strip('责任编辑:')  # strip off the '责任编辑:' label
print(editor)
#################################
# Get the comment count
print(soup.select('#commentCount1'))
comments = requests.get('http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-fypnyqi1126795&group=&compress=0&ie=gbk&oe=gbk&page=1&page_size=20&jsvar=loader_1540784047769_15941830')
comments.encoding = 'utf-8'
# print (comments.text.lstrip('var loader_1540784047769_15941830='))
import json
jd = json.loads(comments.text.lstrip('var loader_1540784047769_15941830='))
print (jd)
print(jd['result']['count']['total'])
#################################
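# lstrip() strips a *set* of characters, not a prefix; the call above only works because
# the JSON body starts with '{', which is not in that set. A more robust sketch:
payload = comments.text.split('=', 1)[1]  # everything after 'var loader_...='
jd = json.loads(payload)
print(jd['result']['count']['total'])
#################################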
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=gbk&oe=gbk&page=1&page_size=20&jsvar=loader_1540784047769_15941830'
comments = requests.get(commentURL.format('fypnyqi1126795'))  # insert the news id into the URL template
comments.encoding = 'utf-8'
#print (comments.text.lstrip('var loader_1540784047769_15941830='))
import json
jd = json.loads(comments.text.lstrip('var loader_1540784047769_15941830='))
print (jd)
print(jd['result']['count']['total'])
##################################
import re
import json
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=gbk&oe=gbk&page=1&page_size=20&jsvar=loader_1540784047769_15941830'
def getCommentCounts(newsurl):
    m = re.search('doc-i(.*).shtml', newsurl)
    newsid = m.group(1)  # group(1) returns what (.*) captured; group(0) returns the whole match
    comments = requests.get(commentURL.format(newsid))
    jd = json.loads(comments.text.strip('var loader_1540784047769_15941830='))
    return jd['result']['count']['total']
news = 'http://news.sina.com.cn/o/2017-12-06/doc-ifypnyqi1126795.shtml'
getCommentCounts(news)
#################################
# Build a function that extracts the article details
def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # result['title'] = soup.select('#artibodyTitle')[0].text
    result['title'] = soup.select('.main-title')[0].text
    # result['newssource'] = soup.select('.time-source span a')[0].text
    # timesource = soup.select('.time-source')[0].contents[0].strip()
    timesource = soup.select('.date')[0].contents[0].strip().replace(' ', '')
    result['dt'] = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    result['article'] = ' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])  # join the body paragraphs with spaces
    result['editor'] = soup.select('.show_author')[0].text.strip('责任编辑:')
    result['comment'] = getCommentCounts(newsurl)
    return result
################################
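# Some pages lack one of the selectors above (e.g. no .show_author element), which raises
# an IndexError; a hedged wrapper that skips such pages instead of crashing:
def safeGetNewsDetail(newsurl):
    try:
        return getNewsDetail(newsurl)
    except (IndexError, ValueError):  # missing element / unexpected date format
        return None
################################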
# Aside: how text and content differ
import requests
from bs4 import BeautifulSoup
response = requests.get('https://www.baidu.com')
re_text = response.text
re_content = response.content
print (re_text)
print (type(re_text))
print (re_content)
print (type(re_content))
response.encoding = 'utf-8'
re_text = response.text
print(re_text)
###############################
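# text is a str decoded with the guessed (or assigned) encoding, while content is the raw
# bytes; you can also decode the bytes yourself (a sketch):
print(response.content[:60].decode('utf-8', errors='replace'))  # manual decode of the first bytes
###############################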
# getNewsDetail('http://news.sina.com.cn/o/2017-12-06/doc-ifypnyqi1126795.shtml')
getNewsDetail('http://news.sina.com.cn/o/2018-10-30/doc-ifxeuwws9358830.shtml')
###############################
res = requests.get('https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page=1&encode=utf-8&callback=feedCardJsonpCallback&_=1540805221737')
jd = json.loads(res.text.lstrip('try{').lstrip('feedCardJsonpCallback(').rstrip('tch(e){};').rstrip(';}ca').rstrip(')'))
for url in jd['result']['data']:
    print(url['url'])
# the raw response ends with: "comment_total":3}]}});}catch(e){};
###############################
def parseListLinks(url):
    newsdetails = []
    res = requests.get(url)
    jd = json.loads(res.text.lstrip('try{').lstrip('feedCardJsonpCallback(').rstrip('tch(e){};').rstrip(';}ca').rstrip(')'))
    for entry in jd['result']['data']:  # each entry carries the article URL
        newsdetails.append(getNewsDetail(entry['url']))
    return newsdetails
###############################
url = 'https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page={}&encode=utf-8&callback=feedCardJsonpCallback&_=1540805221737'
news_total = []
for i in range(1, 3):
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)
print(news_total)
##############################
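# Each list page fans out into 20 detail requests; pausing between pages goes easier on
# the server (time.sleep is my addition, not part of the course code):
import time
news_total = []
for i in range(1, 3):
    news_total.extend(parseListLinks(url.format(i)))
    time.sleep(2)  # wait two seconds between list pages
##############################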
# Organize the data
import pandas
df = pandas.DataFrame(news_total)
df.head(40)
##############################
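# Since getNewsDetail stores the publication time under dt, the frame can be ordered
# newest-first (a small illustration):
df = df.sort_values(by='dt', ascending=False)
print(df[['dt', 'title']].head())
##############################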
# Save the data to Excel and to a database
df.to_excel('news.xlsx')
import sqlite3
with sqlite3.connect('news.sqlite') as db:
    df.to_sql('news', con=db)
##############################
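# Running to_sql twice fails once the table exists; the if_exists parameter controls this:
with sqlite3.connect('news.sqlite') as db:
    df.to_sql('news', con=db, if_exists='append')  # or 'replace' to overwrite
##############################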
# Read the data back
import sqlite3
with sqlite3.connect('news.sqlite') as db:
    df2 = pandas.read_sql_query('SELECT * FROM news', con=db)
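# A quick check that the round trip worked (a sketch):
print(df2.shape)  # row count should match len(news_total)
print(df2.head())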
Course link: https://study.163.com/course/introduction.htm?courseId=1003285002
