requests库+re正则表达式爬取并解析古诗文网
# requests + re
# requests: 数据爬取
# re: data parsing
import re

import requests

# Browser-like User-Agent sent with every request.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}

# Seconds to wait for the server before giving up on a request.
_TIMEOUT = 10

# Core of the script: regular expressions that pick the data out of the HTML.
# A '?' after a quantifier makes it non-greedy.  re.DOTALL lets '.' match
# '\n' as well, which it does not do by default.
#
# NOTE(review): the HTML tag anchors were missing from these patterns in the
# text this script was recovered from (e.g. r'.*?(.*?)'), which made every
# pattern match only empty strings.  The anchors below are a reconstruction
# that is consistent with the surviving '.*?' / '(.*?)' fragments; the tag and
# class names are an assumption about the site's markup -- verify them against
# the live page before relying on the output.
_TITLE_RE = re.compile(r'<div\sclass="cont">.*?<b>(.*?)</b>', re.DOTALL)
_DYNASTY_RE = re.compile(r'<p\sclass="source">.*?<a.*?>(.*?)</a>', re.DOTALL)
_AUTHOR_RE = re.compile(
    r'<p\sclass="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', re.DOTALL)
_CONTENT_RE = re.compile(r'<div\sclass="contson".*?>(.*?)</div>', re.DOTALL)

# Any remaining markup inside a poem body (e.g. line-break tags).
_TAG_RE = re.compile(r'<.*?>')


def _extract_poems(text):
    """Return one dict per poem found in the page HTML *text*.

    Each dict has the keys 'title', 'dynasties', 'authors' and 'contents'.
    The four fields are matched independently and paired up by position, so
    the result is truncated to the shortest of the four match lists.
    """
    titles = _TITLE_RE.findall(text)
    dynasties = _DYNASTY_RE.findall(text)
    authors = _AUTHOR_RE.findall(text)
    # Poem bodies still contain inline tags; drop them and trim whitespace.
    contents = [
        _TAG_RE.sub('', raw).strip() for raw in _CONTENT_RE.findall(text)
    ]
    return [
        {
            'title': title,
            'dynasties': dynasty,
            'authors': author,
            'contents': content,
        }
        for title, dynasty, author, content
        in zip(titles, dynasties, authors, contents)
    ]


def parse(url):
    """Download the page at *url* and print every poem found on it.

    Each poem is printed as a dict, followed by a separator line of 100 '~'.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict in the query string
    # instead of sending it as HTTP headers.
    response = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    for poem in _extract_poems(response.text):
        print(poem)
        print('~' * 100)


def main():
    """Scrape and print listing pages 1 through 50."""
    for page in range(1, 51):
        url = 'https://www.gushiwen.org/default_{}.aspx'.format(page)
        parse(url)


if __name__ == '__main__':
    main()
本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
