python爬虫训练:动漫网10000部动漫信息抓取
本文只供学习,不做他用!
import re
import threading
import requests
import os
import time
import openpyxl

# Crawl every listing page's cover images and metadata, and store them.
# Browser-like User-Agent so the server does not reject the requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36'
                  ' (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
lst_name = []  # NOTE(review): unused at module level; get_url shadows it locally.


def get_url(n):
    """Fetch listing page *n* and persist its metadata (and optionally covers).

    NOTE(review): the three regex patterns below were destroyed when this
    script was scraped off a blog (HTML tags inside the patterns were
    stripped).  The patterns here are placeholders reconstructed from how
    their matches are used — rebuild them against the site's real markup
    before running.
    """
    url = f'https://mox.moe/l/all,all,all,sortpoint,all,all/{n}.htm'
    resp = requests.get(url, headers=headers)

    # Cover thumbnail URLs — TODO confirm against the real page HTML.
    pic_url_re = re.compile(r'<img src="(.*?)"')
    pic_url = pic_url_re.findall(resp.text)

    # (detail-page href, title, author) triples; downstream code indexes
    # entry[0]=href, entry[1]=title, entry[2]=author — TODO confirm pattern.
    name_re = re.compile(r'<a href="(.*?)"[^>]*>(.*?)</a>.*?\[(.*?)\]', re.S)
    pic_name = name_re.findall(resp.text)

    # Rating text for each title — TODO confirm pattern.
    score_re = re.compile(r'<span class="score">(.*?)</span>')
    pic_score = score_re.findall(resp.text)

    titles = [entry[1] for entry in pic_name]
    # save_page(pic_url, titles)  # uncomment to also download the cover images
    save_xlsx(pic_name, pic_score)  # write this page's details to Excel


def save_page(url, name):
    """Download each cover image in *url* into the '动漫封面' folder.

    *url* and *name* are parallel sequences: image URL and file title.
    """
    for img_url, title in zip(url, name):
        print(img_url, title)
        file = os.path.join('动漫封面', title)
        content = requests.get(img_url, headers=headers).content
        with open(file + '.jpg', 'wb') as f:  # with-block guarantees the file is closed
            f.write(content)
        print(title + '已保存')


# Serialise workbook access: save_xlsx is called concurrently from threads.
_xlsx_lock = threading.Lock()


def save_xlsx(name, score):
    """Append one page of (href, title, author, score) rows to 动漫信息.xlsx.

    Fix: the original built a fresh Workbook on every call, so each page
    overwrote the previous page's file, and concurrent threads raced on the
    same file.  Load-and-append under a lock instead.
    """
    with _xlsx_lock:
        if os.path.exists('动漫信息.xlsx'):
            wb = openpyxl.load_workbook('动漫信息.xlsx')
            sheet = wb.active
        else:
            wb = openpyxl.Workbook()
            sheet = wb.active
            sheet.append(['序号', '名称', '作者', '详情页', '评分'])
        for index, entry in enumerate(name):
            sheet.append([index, entry[1], entry[2], entry[0], score[index]])
        wb.save('动漫信息.xlsx')


if __name__ == '__main__':
    if not os.path.exists('动漫封面'):
        os.mkdir('动漫封面')
    else:
        print('文件夹已建立')

    # Fix: the semaphore existed in the original but was never acquired;
    # it now actually caps the number of simultaneous page fetches at 5.
    sem = threading.BoundedSemaphore(5)

    def _worker(page):
        # Acquire a slot before hitting the site; released automatically.
        with sem:
            get_url(page)

    threads = []
    for n in range(1, 10):  # crawl pages 1..9; widen the range for more pages
        r = threading.Thread(target=_worker, args=(n,))
        r.start()
        threads.append(r)

    # Fix: the original checked threading.active_count() immediately after
    # starting the threads, so the completion message almost never printed.
    # Join every worker, then report.
    for r in threads:
        r.join()
    print('全部保存完毕')


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
