Scraping Amazon product reviews with Scrapy
This post uses the Scrapy crawling framework to scrape Amazon product reviews.
1. Create a crawler project:
scrapy startproject mySpiderTest
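The command generates Scrapy's standard project skeleton. Roughly, the files touched in the later steps live here (layout follows Scrapy's default template; middlewares.py is generated but not modified in this post):

mySpiderTest/
├── scrapy.cfg                # deploy/config entry point
└── mySpiderTest/
    ├── __init__.py
    ├── items.py              # item definitions (step 2)
    ├── middlewares.py
    ├── pipelines.py          # item pipelines (step 4)
    ├── settings.py           # project settings (step 5)
    └── spiders/
        └── __init__.py       # amazon_review.py goes here (step 3)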
2. Define the data items in items.py:
import scrapy
from scrapy.item import Field, Item


class ItcastItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = Field()
    title = Field()
    info = Field()


# review information
class AmazonReviewItem(Item):
    user_id = Field()
    user_name = Field()
    data_asin = Field()
    name = Field()                  # product name
    review_title = Field()
    review_star_rating = Field()    # star rating
    review_date = Field()           # review date
    review_info = Field()


# product information
class AmazonGoodsItem(scrapy.Item):
    # define the fields for your item here like:
    # collection = 'amazon'         # collection / table name
    s_href = scrapy.Field()         # sub-category url
    data_asin = scrapy.Field()      # product ASIN
    name = scrapy.Field()           # product name
    goods_url = scrapy.Field()      # product url
    brand = scrapy.Field()          # brand
    price = scrapy.Field()          # price
    freight = scrapy.Field()        # shipping fee
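A Scrapy Item behaves like a dict: fields are assigned by key, and dict(item) converts it to a plain dictionary, which is exactly what the pipeline in step 4 relies on. A quick illustration with made-up values (the user_id comes from the sample review id shown in step 3):

from mySpiderTest.items import AmazonReviewItem

item = AmazonReviewItem()
item['user_id'] = 'R35WB3S3WWC9DN'                  # hypothetical value for illustration
item['review_star_rating'] = '5.0 out of 5 stars'   # hypothetical value for illustration
print(dict(item))   # {'user_id': 'R35WB3S3WWC9DN', 'review_star_rating': '5.0 out of 5 stars'}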
3. Create the spider amazon_review.py in the spiders directory:
# -*- coding: utf-8 -*-
import scrapy
from urllib import parse as url_parse
from mySpiderTest.items import AmazonGoodsItem, AmazonReviewItem
import re
from copy import deepcopy


# Scrape Amazon review information, starting from the search-result list
# for a keyword, e.g. k=phone
class AmazonReviewSpider(scrapy.Spider):
    name = 'amazon_review'
    allowed_domains = ['www.amazon.com']
    # start_urls = ['https://www.amazon.com/s?k=phone&ref=nb_sb_noss']

    def __init__(self, category=None, *args, **kwargs):
        super(AmazonReviewSpider, self).__init__(*args, **kwargs)
        self.start_urls = []
        if category is not None:
            keys = category.split(",")
            for key in keys:
                self.start_urls.append('https://www.amazon.com/s?k=' + key + '&ref=nb_sb_noss')
        else:
            # default to searching for "phone"
            self.start_urls = ['https://www.amazon.com/s?k=phone&ref=nb_sb_noss']
        self.log("category = %s" % category)

    def parse(self, response):
        item = AmazonGoodsItem()
        div_list = response.xpath("//*[@id='search']//div[@class='s-result-list s-search-results sg-row']/div")
        self.log("div_list_len=%s" % str(len(div_list)))
        for each_div in div_list:
            # data_asin = each_div.xpath("@data-asin").extract_first()
            # item['data_asin'] = data_asin
            goods_url = each_div.xpath(".//h2/a/@href").extract_first()
            item['goods_url'] = url_parse.unquote(goods_url)
            item['name'] = self.get_goods_name(item['goods_url'])
            item['data_asin'] = self.get_data_asin(item['goods_url'])
            # self.log("************* item[name]: %s" % item)
            # first page of the product's review details
            review_url = 'https://www.amazon.com/' + item['name'] \
                         + '/product-reviews/' + item['data_asin'] \
                         + '/ref=cm_cr_getr_d_paging_btm_next_1?ie=UTF8' \
                         + '&reviewerType=all_reviews&pageNumber=1'
            yield scrapy.Request(review_url,
                                 callback=self.parse_review_detail,
                                 meta={"item": deepcopy(item)})
        # next page of search results
        # xpath=//*[@id="search"]/div[1]/div[2]/div/span[7]/div/div/div/ul/li[7]/a
        next_url = response.xpath(
            "//*[@id='search']/div[1]/div[2]/div/span[7]/div/div/div/ul/li[7]/a/@href").extract_first()
        if next_url is not None:
            next_url = 'https://www.amazon.com' + next_url
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_review_detail(self, response):
        goods_item = response.meta["item"]
        # each review block, e.g. //*[@id="customer_review-R35WB3S3WWC9DN"]/div[4]/span
        for each in response.xpath("//*[starts-with(@id, 'customer_review-')]"):
            item = AmazonReviewItem()
            item['data_asin'] = goods_item['data_asin']
            item['name'] = goods_item['name']
            item['user_id'] = each.xpath("@id").extract_first().split("-")[1]
            item['user_name'] = each.xpath(".//span[@class='a-profile-name']") \
                .xpath('string(.)').extract()[0]
            item['review_title'] = each.xpath(".//a[@data-hook='review-title']") \
                .xpath('string(.)').extract()[0]
            item['review_star_rating'] = each.xpath(".//i[@data-hook='review-star-rating']") \
                .xpath('string(.)').extract()[0]
            item['review_date'] = each.xpath(".//span[@data-hook='review-date']") \
                .xpath('string(.)').extract()[0]
            item['review_info'] = each.xpath(".//span[@data-hook='review-body']") \
                .xpath('string(.)').extract()[0]
            yield item
        # is there a next page of reviews?
        next_page = response.xpath("//*[@id='cm_cr-pagination_bar']/ul/li[2]/a/@href").extract_first()
        self.log("-------next_page = %s" % next_page)
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page,
                                 callback=self.parse_review_detail,
                                 meta={"item": deepcopy(goods_item)})

    @staticmethod
    def get_goods_name(url):
        name = ''
        if url is None:
            return name
        regex1 = re.compile(r"url=\/.*?\/")
        is_contain_url = re.search(regex1, url)
        if is_contain_url:
            # sponsored results go through a redirect URL, e.g.
            # /gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_2?ie=UTF8&adId=A0805009AF1ES0KA13RD&url=/VTech-CS6529-4-Answering-Cordless-Handsets/dp/B00WHYS0R2/ref=sr_1_2_sspa?keywords=phone&qid=1561524478&s=gateway&sr=8-2-spons&psc=1&qualifier=1561524478&id=96741371645175&widgetName=sp_atf
            name = re.findall(regex1, url)[0].split("/")[1]
        else:
            # normal results link directly to the product, e.g.
            # /Panasonic-KX-TGD532W-Expandable-Cordless-Answering/dp/B071GQB94T/ref=sr_1_3?keywords=phone&qid=1561524478&s=gateway&sr=8-3
            name = url.split("/")[1]
        return name

    @staticmethod
    def get_data_asin(url):
        asin = ''
        if url is None:
            return asin
        regex1 = re.compile(r"dp\/.*?\/")
        is_contain_dp = re.search(regex1, url)
        if is_contain_dp:
            # url contains a /dp/<ASIN>/ segment (see the example URLs above)
            asin = re.findall(regex1, url)[0].split("/")[1]
        else:
            asin = url.split("/")[1]
        return asin
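Because get_goods_name and get_data_asin only operate on the URL string, they can be sanity-checked outside a crawl. A minimal sketch using shortened versions of the two example URLs from the comments above; the expected values in the trailing comments are inferred from those URLs, and the import path assumes the project layout from step 1:

from mySpiderTest.spiders.amazon_review import AmazonReviewSpider

# sponsored-result redirect URL (shortened)
spon = ('/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_2?ie=UTF8'
        '&adId=A0805009AF1ES0KA13RD&url=/VTech-CS6529-4-Answering-Cordless-Handsets'
        '/dp/B00WHYS0R2/ref=sr_1_2_sspa?keywords=phone')
# normal result URL (shortened)
plain = '/Panasonic-KX-TGD532W-Expandable-Cordless-Answering/dp/B071GQB94T/ref=sr_1_3?keywords=phone'

print(AmazonReviewSpider.get_goods_name(spon))    # VTech-CS6529-4-Answering-Cordless-Handsets
print(AmazonReviewSpider.get_data_asin(spon))     # B00WHYS0R2
print(AmazonReviewSpider.get_goods_name(plain))   # Panasonic-KX-TGD532W-Expandable-Cordless-Answering
print(AmazonReviewSpider.get_data_asin(plain))    # B071GQB94T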
4. Define pipelines.py:
import codecs
import json


class MyspidertestPipeline(object):
    def __init__(self):
        # append each review as a JSON line to a local file
        self.review_file = codecs.open('amazon_reviews.json', 'a', encoding="utf-8")

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.review_file.write(line)
        return item

    def close_spider(self, spider):
        self.review_file.close()
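The pipeline writes one JSON object per line (JSON Lines) rather than a single JSON array, so the output is easiest to read back line by line. A small sketch, assuming the amazon_reviews.json file produced above:

import json

reviews = []
with open('amazon_reviews.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            reviews.append(json.loads(line))   # one review dict per line

print(len(reviews))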
5. Enable the pipeline in settings.py:
ITEM_PIPELINES = {
    'mySpiderTest.pipelines.MyspidertestPipeline': 300,
}
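Besides ITEM_PIPELINES, a few other stock Scrapy settings are usually worth adjusting when crawling a site like Amazon. The values below are illustrative assumptions, not part of the original project:

ROBOTSTXT_OBEY = False       # the default project template sets this to True
DOWNLOAD_DELAY = 1           # throttle requests to reduce the chance of being blocked
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'   # example UA; replace Scrapy's default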
6. Run the spider:
# category passes the search keyword; separate multiple keywords with commas, e.g. category=huawei,oppo,vivo
scrapy crawl amazon_review -a category=phone
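If you prefer launching the crawl from a Python script instead of the command line, Scrapy's CrawlerProcess accepts the same spider argument. A minimal sketch (the script name is hypothetical; run it from the project root so the project settings are picked up):

# run_review_spider.py (hypothetical helper script)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('amazon_review', category='huawei,oppo,vivo')   # same as -a category=...
process.start()   # blocks until the crawl finishes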
