Python 抓取东方财富网股票公告及其链接
使用正则表达式匹配 http://data.eastmoney.com/notice/ 页面上的公告标题及其对应链接,并自动保存到本地文本文件中。
# -*- coding: UTF-8 -*-
__author__ = 'Intgrp'

import re  # regular expressions
import urllib  # web access: fetches the page data

try:  # Python 3 exposes urlopen from urllib.request
    import urllib.request
    _urlopen = urllib.request.urlopen
except ImportError:  # Python 2 keeps it on urllib itself
    _urlopen = urllib.urlopen

# On Python 2 the page is matched as the raw byte string the server sent
# (the original behaviour); on Python 3 it is decoded to text first.
_PY2 = str is bytes

# NOTE(review): the byte escapes quoted in the original comment are the GBK
# encoding of the "go to last page" phrase, so the site presumably serves
# GBK. Used only when the response does not declare a charset -- confirm.
_FALLBACK_ENCODING = "gbk"

# The Chinese phrase for "go to last page" that marks the last-page anchor:
# GBK bytes for Python 2 byte-string matching, code points for Python 3 text.
if _PY2:
    _LAST_PAGE_MARK = "\xd7\xaa\xb5\xbd\xd7\xee\xba\xf3\xd2\xbb\xd2\xb3"
else:
    _LAST_PAGE_MARK = "\u8f6c\u5230\u6700\u540e\u4e00\u9875"

# NOTE(review): the original link/title pattern was lost when this file was
# extracted (its HTML-like body was stripped). This is a generic
# reconstruction that captures (href, text) for EVERY anchor with plain
# text; narrow it to the notice table's markup before relying on the output.
_LINK_TITLE_RE = re.compile(r'<a[^>]*\shref="([^"]*)"[^>]*>([^<]+)</a>')

# NOTE(review): also reconstructed. What survived of the original is the
# `(\d+)` group and the remark that the pattern contained the "go to last
# page" phrase, so this matches a numeric anchor whose tag carries that
# phrase. Confirm against the live page source.
_LAST_PAGE_RE = re.compile(r'<a[^>]*' + _LAST_PAGE_MARK + r'[^>]*>(\d+)</a>')


def _fetch_html(url):
    """Download *url* and return its body as a native ``str``."""
    response = _urlopen(url)
    try:
        raw = response.read()
        if isinstance(raw, str):  # Python 2: keep the byte string untouched
            return raw
        charset = response.headers.get_content_charset() or _FALLBACK_ENCODING
    finally:
        response.close()  # the original never released the connection
    return raw.decode(charset, "replace")


def get_content_link_title(url):
    """Return the ``(link, title)`` pairs found on one notice-list page.

    Args:
        url: Address of the notice-list page to download.

    Returns:
        A list of 2-tuples ``(link, title)``, in page order; empty when
        nothing matches.
    """
    # Match each stock's [link, notice title].
    return _LINK_TITLE_RE.findall(_fetch_html(url))


def get_content_num(url):
    """Return the page numbers shown on the "go to last page" anchor(s).

    Args:
        url: Address of a notice-list page.

    Returns:
        A list of digit strings; callers use element 0 as the total page
        count. The list is empty when the anchor is not found.
    """
    return _LAST_PAGE_RE.findall(_fetch_html(url))


def get_next_page_link(i):
    """Return the URL of notice-list page number *i*.

    The page source shows links of the form
    ``.../Noticelist.aspx?type=0&market=all&date=&page=5``, so the address
    of any page is that prefix followed by the page number.
    """
    return ("http://data.eastmoney.com/Notice/Noticelist.aspx"
            "?type=0&market=all&date=&page=%s" % (i,))
# Disabled earlier variant of content_Write2Txt, kept inside a bare string
# literal so it is never executed. Unlike the live version defined after it,
# this one takes the output `filename`, opens it in append mode itself,
# writes the "page_num" separator line followed by one "<item[0]>\t<item[1]>"
# line per entry of `content`, and closes the file before returning.
'''
def content_Write2Txt(filename,content,i):f = open(filename,'a')f.writelines("----------page_num=%s----------------\n"%i)for i in content:f.writelines(i[0])f.writelines("\t")f.writelines(i[1])f.write("\n")f.close()
'''
def content_Write2Txt(content, i, out=None):
    """Append one page of (link, title) rows to the output text file.

    Args:
        content: Iterable of indexable rows; element 0 (the link) and
            element 1 (the title) of each row are written.
        i: Page number, used only in the separator line.
        out: Writable file-like object. When omitted, the module-level
            handle ``f`` is used, which is how this function originally
            located its output file.

    Raises:
        NameError: If ``out`` is omitted and no global ``f`` exists.
    """
    if out is None:
        out = f  # noqa: F821 -- legacy contract: caller opened a global `f`
    # Separator line marking where each page starts (optional).
    out.write("----------page_num=%s----------------\n" % (i,))
    for row in content:
        out.write("%s\t%s\n" % (row[0], row[1]))


def main():
    """Download every notice-list page and append its rows to test.txt."""
    # Total number of pages, read from the "go to last page" link.
    page_num = get_content_num("http://data.eastmoney.com/notice/")
    if not page_num:
        # Previously this surfaced as a bare IndexError on page_num[0].
        raise SystemExit("could not determine the total number of pages")
    last_page = int(page_num[0])
    print("总共%s页" % page_num[0])

    # `with` closes the file even if one of the downloads fails.
    with open("test.txt", 'a') as out:
        # +1 so the last page is fetched too; range(1, n) stopped at n - 1.
        for page in range(1, last_page + 1):
            url = get_next_page_link(page)  # URL of page number `page`
            content = get_content_link_title(url)  # rows found on that page
            content_Write2Txt(content, page, out)  # append them to the file
            print("第%s页已经完成" % page)
    print("write success!")


if __name__ == "__main__":
    main()
本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
