Python web crawler: implementation and principles (using BeautifulSoup)

I've been learning Python lately. After working through the basic syntax I practiced by writing a small crawler demo; here is a summary.

The main logic:

1) Initialize the URL manager, i.e. add the root URL to it.

2) Fetch a new URL, new_url, from the URL manager.

3) Download the page content html_cont for that new_url (tool: urllib.request.urlopen(url)).

4) Parse html_cont to extract new child URLs, and save the parsed data (using BeautifulSoup).

5) Add the newly found child URLs back into the URL manager.

6) Repeat steps 2-5 until the number of crawled pages reaches some threshold (100 in the code below), then stop.

7) Output the crawl results.

Watch out for encoding issues: standardize on UTF-8 throughout, i.e. .decode('UTF-8').
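Hard-coding UTF-8 works for Baidu Baike, but other sites may declare other charsets. A minimal sketch (my own addition, not part of the demo) that asks the HTTP response for its declared charset and falls back to UTF-8:

import urllib.request

def download_decoded(url):
    # Ask the server which charset it declared in the Content-Type header;
    # fall back to UTF-8 when the header does not say.
    response = urllib.request.urlopen(url)
    charset = response.headers.get_content_charset() or 'UTF-8'
    return response.read().decode(charset)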

Installing BeautifulSoup: go into the Python3.x\Scripts directory and run pip install beautifulsoup4.
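To confirm the installation worked, import the package and print its version:

import bs4
print(bs4.__version__)   # prints something like 4.x.x if the install succeeded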

Main module spider_main.py:

'''Created on 2016-3-30  @author: rongyu'''
from bike_spider import url_manager, html_downloader, html_parser, html_outputer

class SpiderMain(object):
    def __init__(self):
        # wire the four collaborators together
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d: %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 100:    # stop after 100 pages
                    break
                count = count + 1
            except:
                print('craw failed')
        self.outputer.output_html()

# program entry point
if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)    # start crawling from the root URL

URL manager module url_manager.py (the filename must match the import above):

'''Created on 2016-3-30  @author: rongyu'''

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)
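A quick throwaway check of the manager's de-duplication behavior (assuming UrlManager from above is in scope; the URL is just the demo's root URL reused for illustration):

manager = UrlManager()
manager.add_new_url("http://baike.baidu.com/view/21087.htm")
manager.add_new_url("http://baike.baidu.com/view/21087.htm")  # duplicate, ignored
print(manager.has_new_url())    # True: one URL is waiting
url = manager.get_new_url()     # moves the URL into old_urls
manager.add_new_url(url)        # already crawled, so not re-added
print(manager.has_new_url())    # False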

Downloader module html_downloader.py:

import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        return response.read().decode('UTF-8')
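This downloader assumes every request succeeds. A hedged variant (my own extension, not in the original demo) that rejects non-200 responses and adds a timeout so one dead link cannot hang the whole crawl:

import urllib.request
import urllib.error

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        try:
            # the 10-second timeout is an arbitrary choice for this sketch
            response = urllib.request.urlopen(url, timeout=10)
        except urllib.error.URLError:
            return None
        if response.getcode() != 200:   # only accept a successful response
            return None
        return response.read().decode('UTF-8')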

Parser module html_parser.py:

from bs4 import BeautifulSoup
import re
import urllib.parse

class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # child links look like: <a href="/view/234.htm">
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            # turn the relative href into an absolute URL
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # title: <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # summary: <div class="lemma-summary"> ... </div>
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        # html_cont is already a decoded str, so no from_encoding argument is needed
        soup = BeautifulSoup(html_cont, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
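If the find_all/find calls look opaque, here is a self-contained snippet exercising the same BeautifulSoup patterns on a tiny hand-written page (the HTML below is made up to mirror Baike's structure):

import re
from bs4 import BeautifulSoup

html = '''
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is a programming language.</div>
<a href="/view/123.htm">link one</a>
<a href="/other/456.htm">ignored</a>
'''
soup = BeautifulSoup(html, 'html.parser')

# find_all with a compiled regex keeps only hrefs matching /view/<digits>.htm
for link in soup.find_all('a', href=re.compile(r"/view/\d+\.htm")):
    print(link['href'])     # -> /view/123.htm

# find + class_ locates a single node; get_text() strips the tags
title = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
print(title.get_text())     # -> Python
print(soup.find('div', class_="lemma-summary").get_text())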

Outputter module html_outputer.py:

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # open as UTF-8 text so str values can be written directly; in Python 3,
        # writing .encode('UTF-8') bytes into a text file would produce b'...'
        # literals in the output
        fout = open('output.html', 'w', encoding='UTF-8')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()

Results:

Console output (a screenshot of the crawl log, one "craw N: <url>" line per page, was shown here).

