NetEase Cloud Classroom Python Web Crawler Video: Source Code and Implementation

import requests
res = requests.get('http://www.sina.com.cn/')
res.encoding = 'utf-8'
print(res.text)
######################################
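# A quick status check before parsing (a sketch, not part of the course code):
if res.status_code == 200:  # 200 means the request succeeded
    print(res.text[:200])  # show just the first 200 characters
else:
    print('request failed:', res.status_code)
######################################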
# Parse web page elements with BeautifulSoup4
from bs4 import BeautifulSoup
html_sample = ' \
<html> \
<body> \
<h1 id="title">Hello World</h1> \
<a href="#" class="link">This is link1</a> \
<a href="# link2" class="link">This is link2</a> \
</body> \
</html>'
soup = BeautifulSoup(html_sample, 'html.parser')
print(soup.text)
########################################
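# To inspect how the fragment was parsed, prettify() prints the tree indented (a small illustration):
print(soup.prettify())  # indented view of the parsed HTML
########################################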
# Basic BeautifulSoup operations
# Use select to find the h1 elements
soup = BeautifulSoup(html_sample, 'html.parser')
header = soup.select('h1')  # select returns a list
print(header)
print(header[0])
print(header[0].text)
#######################################
# Use select to find the a tags
soup = BeautifulSoup(html_sample, 'html.parser')
alink = soup.select('a')
print(alink)
for link in alink:
    print(link)
    print(link.text)
    print(link['href'])
######################################
# Use select to find the element whose id is title (prefix the id with #)
alink = soup.select('#title')
print(alink)
#####################################
# Use select to find all elements whose class is link (prefix the class with .)
soup = BeautifulSoup(html_sample, 'html.parser')
for link in soup.select('.link'):
    print(link)
####################################
# Use select to get the href of every a tag
alinks = soup.select('a')
for link in alinks:
    print(link['href'])  # works because a tag's attributes are wrapped as a dictionary
####################################
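# Since the attributes really are a dict, attrs and get() also work (a small sketch):
first = soup.select('a')[0]
print(first.attrs)  # e.g. {'href': '#', 'class': ['link']}
print(first.get('href'))  # get() returns None instead of raising KeyError when the attribute is missing
####################################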
# See how to scrape the Sina news listing
# Grab each item's time, title, and content
import requests
from bs4 import BeautifulSoup
res = requests.get('http://news.sina.com.cn/china')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('.news-item'):
    if len(news.select('h2')) > 0:
        h2 = news.select('h2')[0].text
        time = news.select('.time')[0].text
        timesource = news.select('.time')[0].contents[0].strip()
        a = news.select('a')[0]['href']
        print(time, h2, a)
###################################
# Scrape a news article page
res = requests.get('http://news.sina.com.cn/c/gat/2018-10-24/doc-ifxeuwws7683546.shtml')
res.encoding = 'utf-8'
# print (res.text)
soup = BeautifulSoup(res.text, 'html.parser')
# Get the article title
#print (soup.select('#artibodyTitle')[0].text)
# Get the article timestamp
timesource = soup.select('.date')[0].contents[0].strip().replace(' ','')
print (timesource)
print (type(timesource))
from datetime import datetime
dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')  # parse a string with the given format into a datetime
print (dt)
print (type(dt))
print(type(dt.strftime('%Y-%m-%d')))  # format the datetime back into a string (%m is the month code)
# Get the news source
#print (soup.select('.time-source span a')[0].text)
##################################
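# Note that %m (month) and %M (minute) are different format codes; a quick demonstration:
from datetime import datetime
d = datetime(2018, 10, 24, 9, 5)
print(d.strftime('%Y-%m-%d'))  # 2018-10-24
print(d.strftime('%Y-%M-%d'))  # 2018-05-24 -- %M wrongly inserts the minute
##################################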
# Clean up the article body and get the editor's name
' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])  # join the body paragraphs with spaces
# Get the editor's name
editor = soup.select('.show_author')[0].text.strip('责任编辑:')  # strip off the '责任编辑:' label
print(editor)
#################################
# Get the comment count
print(soup.select('#commentCount1'))
comments = requests.get('http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-fypnyqi1126795&group=&compress=0&ie=gbk&oe=gbk&page=1&page_size=20&jsvar=loader_1540784047769_15941830')
comments.encoding = 'utf-8'
# print (comments.text.lstrip('var loader_1540784047769_15941830='))
import json
jd = json.loads(comments.text.lstrip('var loader_1540784047769_15941830='))
print (jd)
print(jd['result']['count']['total'])
#################################
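# lstrip() strips a *set* of characters, not a prefix; the call above only works because
# the JSON body starts with '{', which is not in that set. A more robust sketch:
payload = comments.text.split('=', 1)[1]  # everything after 'var loader_...='
jd = json.loads(payload)
print(jd['result']['count']['total'])
#################################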
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=gbk&oe=gbk&page=1&page_size=20&jsvar=loader_1540784047769_15941830'
comments = requests.get(commentURL.format('fypnyqi1126795'))  # insert the news id into the URL template
comments.encoding = 'utf-8'
#print (comments.text.lstrip('var loader_1540784047769_15941830='))
import json
jd = json.loads(comments.text.lstrip('var loader_1540784047769_15941830='))
print (jd)
print(jd['result']['count']['total'])
##################################
import re
import json
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=gn&newsid=comos-{}&group=&compress=0&ie=gbk&oe=gbk&page=1&page_size=20&jsvar=loader_1540784047769_15941830'
def getCommentCounts(newsurl):
    m = re.search('doc-i(.*).shtml', newsurl)
    newsid = m.group(1)  # group(1) returns what (.*) captured; group(0) returns the whole match
    comments = requests.get(commentURL.format(newsid))
    jd = json.loads(comments.text.strip('var loader_1540784047769_15941830='))
    return jd['result']['count']['total']
news = 'http://news.sina.com.cn/o/2017-12-06/doc-ifypnyqi1126795.shtml'
getCommentCounts(news)
#################################
# Build a function that extracts the article details
def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # result['title'] = soup.select('#artibodyTitle')[0].text
    result['title'] = soup.select('.main-title')[0].text
    # result['newssource'] = soup.select('.time-source span a')[0].text
    # timesource = soup.select('.time-source')[0].contents[0].strip()
    timesource = soup.select('.date')[0].contents[0].strip().replace(' ', '')
    result['dt'] = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    result['article'] = ' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])  # join the body paragraphs with spaces
    result['editor'] = soup.select('.show_author')[0].text.strip('责任编辑:')
    result['comment'] = getCommentCounts(newsurl)
    return result
################################
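# Some pages lack one of the selectors above (e.g. no .show_author element), which raises
# an IndexError; a hedged wrapper that skips such pages instead of crashing:
def safeGetNewsDetail(newsurl):
    try:
        return getNewsDetail(newsurl)
    except (IndexError, ValueError):  # missing element / unexpected date format
        return None
################################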
# Aside: how text and content differ
import requests
from bs4 import BeautifulSoup
response = requests.get('https://www.baidu.com')
re_text = response.text
re_content = response.content
print (re_text)
print (type(re_text))
print (re_content)
print (type(re_content))
response.encoding = 'utf-8'
re_text = response.text
print(re_text)
###############################
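# text is a str decoded with the guessed (or assigned) encoding, while content is the raw
# bytes; you can also decode the bytes yourself (a sketch):
print(response.content[:60].decode('utf-8', errors='replace'))  # manual decode of the first bytes
###############################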
# getNewsDetail('http://news.sina.com.cn/o/2017-12-06/doc-ifypnyqi1126795.shtml')
getNewsDetail('http://news.sina.com.cn/o/2018-10-30/doc-ifxeuwws9358830.shtml')
###############################
res = requests.get('https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page=1&encode=utf-8&callback=feedCardJsonpCallback&_=1540805221737')
jd = json.loads(res.text.lstrip('try{').lstrip('feedCardJsonpCallback(').rstrip('tch(e){};').rstrip(';}ca').rstrip(')'))
for url in jd['result']['data']:
    print(url['url'])
# the raw response ends with: "comment_total":3}]}});}catch(e){};
###############################
def parseListLinks(url):
    newsdetails = []
    res = requests.get(url)
    jd = json.loads(res.text.lstrip('try{').lstrip('feedCardJsonpCallback(').rstrip('tch(e){};').rstrip(';}ca').rstrip(')'))
    for entry in jd['result']['data']:  # each entry carries the article URL
        newsdetails.append(getNewsDetail(entry['url']))
    return newsdetails
###############################
url = 'https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page={}&encode=utf-8&callback=feedCardJsonpCallback&_=1540805221737'
news_total = []
for i in range(1, 3):
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)
print(news_total)
##############################
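# Each list page fans out into 20 detail requests; pausing between pages goes easier on
# the server (time.sleep is my addition, not part of the course code):
import time
news_total = []
for i in range(1, 3):
    news_total.extend(parseListLinks(url.format(i)))
    time.sleep(2)  # wait two seconds between list pages
##############################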
# Organize the data
import pandas
df = pandas.DataFrame(news_total)
df.head(40)
##############################
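# Since getNewsDetail stores the publication time under dt, the frame can be ordered
# newest-first (a small illustration):
df = df.sort_values(by='dt', ascending=False)
print(df[['dt', 'title']].head())
##############################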
# Save the data to Excel and to a database
df.to_excel('news.xlsx')
import sqlite3
with sqlite3.connect('news.sqlite') as db:
    df.to_sql('news', con=db)
##############################
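# Running to_sql twice fails once the table exists; the if_exists parameter controls this:
with sqlite3.connect('news.sqlite') as db:
    df.to_sql('news', con=db, if_exists='append')  # or 'replace' to overwrite
##############################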
# Read the data back
import sqlite3
with sqlite3.connect('news.sqlite') as db:
    df2 = pandas.read_sql_query('SELECT * FROM news', con=db)
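# A quick check that the round trip worked (a sketch):
print(df2.shape)  # row count should match len(news_total)
print(df2.head())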
Course link: https://study.163.com/course/introduction.htm?courseId=1003285002
