# Download ten pages of the maoyan.com "board/4" listing, advancing the
# ``offset`` query parameter by 10 per request, and keep each page's HTML
# text in ``server``.
server = []
for i in range(10):
    url = "https://maoyan.com/board/4?offset=" + str(i * 10)
    # NOTE(review): the proxies mapping only has an "http" entry while the
    # URL scheme is https, so this proxy is presumably not used for these
    # requests -- confirm the intent before changing it.
    cat = requests.get(
        url,
        headers=headers,
        proxies={"http": "58.253.157.136"},
        # Without a timeout a stalled connection would block the script
        # indefinitely.
        timeout=10,
    )
    server.append(cat.text)
# Create an empty table (建立空表)
# Start from an empty DataFrame; the extraction step accumulates rows into it.
maoyan = pd.DataFrame()
# Information extraction (信息提取)
# Parse every downloaded page in ``server`` and collect the extracted movie
# fields into ``maoyan`` (one DataFrame per board element, combined at the
# end).
page_frames = []
for page_html in server:
    data = etree.HTML(page_html)
    pos = data.xpath('//dl[@class="board-wrapper"]')
    for board in pos:
        # NOTE(review): expressions starting with "//" are absolute, so they
        # search the whole document rather than only ``board``. This
        # presumably relies on each page having a single board-wrapper <dl>
        # -- confirm against the live markup.
        ranking = board.xpath('//*[@id="app"]/div/div/div[1]/dl/dd/i/text()')
        picture = board.xpath('.//a/img/@data-src')
        movie_web = board.xpath('.//div/div[1]/div[1]/p[1]/a/@href')
        movie_name = board.xpath('//*[@id="app"]/div/div/div[1]/dl/dd/div/div/div[1]/p[1]/a/text()')
        performer = board.xpath('//*[@id="app"]/div/div/div[1]/dl/dd/div/div/div[1]/p[2]/text()')
        release_date = board.xpath('//*[@id="app"]/div/div/div[1]/dl/dd/div/div/div[1]/p[3]/text()')
        grades1 = board.xpath('//*[@id="app"]/div/div/div[1]/dl/dd/div/div/div[2]/p/i[1]/text()')
        grades2 = board.xpath('//*[@id="app"]/div/div/div[1]/dl/dd/div/div/div[2]/p/i[2]/text()')

        # Each score is split across two <i> text nodes; join them pairwise.
        score = [first + second for first, second in zip(grades1, grades2)]

        result = pd.DataFrame({
            "电影排名": ranking,
            "图片地址": picture,
            "电影详情页地址": movie_web,
            "电影名称": movie_name,
            "电影主演": performer,
            "首映时间": release_date,
            "评分": score,
        })
        page_frames.append(result)

# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copies the growing frame every time; concatenate once instead. Empty frames
# are left out so they do not influence the resulting dtypes.
frames = [frame for frame in (maoyan, *page_frames) if not frame.empty]
if frames:
    maoyan = pd.concat(frames, ignore_index=True)