Amazon + IP Proxies + Multithreading

Preface

This article is for learning purposes only. It must not be used for commercial or illegal purposes; the author bears no responsibility for any consequences arising from such use. If anything here infringes your rights, please contact the author by private message to have it removed.

Main Content
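The full scraper is below. It is organized as a producer/consumer pipeline built on four Queue instances: one thread keeps a fresh proxy IP available, one discovers category URLs from Amazon's navigation ajax endpoint, two worker threads walk each category's result pages and collect product detail URLs, three workers parse the title and price from each detail page, and a final thread batches the rows into MySQL.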

import random
import time
from loguru import logger
from queue import Queue
import threading
from retrying import retry
from feapder.network.user_agent import get
import pymysql
import requests
from lxml import etree


class Amazon:
    def __init__(self):
        self.db = pymysql.connect(host='localhost', user='root', password='root', db='spiders')
        self.cursor = self.db.cursor()
        self.index_url = ('https://www.amazon.cn/gp/navigation/ajax/generic.html?ajaxTemplate=hamburgerMainContent'
                          '&pageType=Gateway&hmDataAjaxHint=1&navDeviceType=desktop&isSmile=0&isPrime=0&isBackup=false'
                          '&hashCustomerAndSessionId=c108bde04b677f19f2e5d7df74ff6ce0cad515fc&isExportMode=false'
                          '&languageCode=zh_CN&environmentVFI=AmazonNavigationCards%2Fdevelopment%40B6099827072-AL2_x86_64'
                          '&secondLayerTreeName=apparel_shoes%2Bcomputer_office%2Bhome_kitchen%2Bbeauty_pca%2Bkindle_ebook'
                          '%2Bsports_outdoor%2Bgrocery%2Bbaby_toy%2Bphones_elec%2Bjewelry_watch%2Bhome_improvement'
                          '%2Bvideo_game%2Bmusical_instrument%2Bcamera')
        self.headers = {
            "Referer": "https://www.amazon.cn/ref=nav_logo",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
            "downlink": "10",
            "ect": "4g",
            "rtt": "200",
        }
        self.ip_url = ('http://v2.api.juliangip.com/dynamic/getips?num=1&pt=1&result_type=text&split=1'
                       '&trade_no=1731739714589664&sign=bd9ff26a4b9dcd0cf0142bcd6fcc25af')
        self.ip_queue = Queue()
        self.classify_url_queue = Queue()
        self.detail_url_queue = Queue()
        self.data_queue = Queue()

    def create_table(self):
        # Create the result table if it does not already exist
        sql = '''CREATE TABLE IF NOT EXISTS amazon(
                     id INT PRIMARY KEY AUTO_INCREMENT NOT NULL,
                     price VARCHAR(255) NOT NULL,
                     title VARCHAR(255) NOT NULL,
                     goods_url VARCHAR(255) NOT NULL,
                     classify VARCHAR(255) NOT NULL)'''
        try:
            self.cursor.execute(sql)
            print("CREATE TABLE SUCCESS.")
        except Exception as ex:
            print(f"CREATE TABLE FAILED, CAUSE: {ex}")

    def get_ip(self):
        # Keep exactly one fresh proxy in the pool; bad proxies are discarded
        # by get_data, which makes this thread fetch a replacement.
        while True:
            if self.ip_queue.empty():
                res = requests.get(url=self.ip_url)
                self.ip_queue.put(res.text.strip())
                print(res.text)
            else:
                time.sleep(0.1)  # avoid a busy spin while the pool is full

    @retry(stop_max_attempt_number=3)
    def get_data(self, url):
        ip = self.ip_queue.get()
        # Route both http and https traffic through the proxy
        proxies = {'http': 'http://' + ip, 'https': 'http://' + ip}
        self.headers['User-Agent'] = get()  # rotate the User-Agent per request
        resp = requests.get(url, headers=self.headers, timeout=5, proxies=proxies)
        if resp.status_code == 200:
            # The proxy still works: put it back for the next request
            self.ip_queue.put(ip)
        else:
            # Drop the proxy and raise so @retry tries again with a fresh one
            logger.error('bad status code: {}'.format(resp.status_code))
            raise ValueError('bad status code')
        return resp

    def get_info_url(self):
        # Pull the category list from the hamburger-menu ajax endpoint
        response = self.get_data(self.index_url)
        html_object = etree.HTML(response.text)
        # The first two <li> entries carry no request URL
        li_list = html_object.xpath('//ul/li[position() > 2]')
        for li in li_list:
            item = {}
            if li.xpath('./a/text()'):
                # Entries containing '全部' ("All ...") are category overview pages; skip them
                if '全部' in li.xpath('./a/text()')[0]:
                    continue
                # Absolute links containing 'http' are ad pages; skip them
                if 'http' in li.xpath('./a/@href')[0]:
                    continue
                item['title'] = li.xpath('./a/text()')[0]
                # Keep only the node id from the href query string
                item['href'] = li.xpath('./a/@href')[0].split('=')[1].split('&')[0]
                print(item)
                self.classify_url_queue.put(item)

    def detail_url_get(self):
        while True:
            info_url = self.classify_url_queue.get()
            time.sleep(random.randint(500, 800) / 1000)
            try:
                # Request the category's "all products" listing page
                response = self.get_data('https://www.amazon.cn/s?rh=n%3A' + info_url['href'] + '&fs=true')
            except Exception:
                logger.error('https://www.amazon.cn/s?rh=n%3A' + info_url['href'] + '&fs=true')
                self.classify_url_queue.task_done()  # mark the item done even on failure
                continue
            html_data = etree.HTML(response.text)
            if html_data.xpath('//span[@class="s-pagination-strip"]/span[last()]/text()'):
                # Total number of result pages for this category
                max_page = html_data.xpath('//span[@class="s-pagination-strip"]/span[last()]/text()')[0]
                for page in range(1, int(max_page) + 1):
                    # Build and request each page of the category
                    new_url = 'https://www.amazon.cn/s?rh=n%3A' + info_url['href'] + '&fs=true&page=' + str(page)
                    try:
                        res = self.get_data(new_url)
                    except Exception:
                        logger.error(new_url)
                        continue
                    html = etree.HTML(res.text)
                    detail_href_list = html.xpath('//div[@class="sg-col-inner"]/span/div[1]/div/div/div//h2/a/@href')
                    # Queue each product's detail URL together with its category
                    for detail_href in detail_href_list:
                        item = {}
                        item['detail_href'] = detail_href
                        item['classify_data'] = info_url['title']
                        self.detail_url_queue.put(item)
            self.classify_url_queue.task_done()

    def parses_data(self):
        while True:
            goods_data = self.detail_url_queue.get()
            # Build the product detail page URL
            goods_url = 'https://www.amazon.cn' + goods_data['detail_href']
            try:
                response = self.get_data(goods_url)
            except Exception:
                logger.error(goods_url)
                self.detail_url_queue.task_done()  # mark the item done even on failure
                continue
            html_data = etree.HTML(response.text)
            # Product title
            title_nodes = html_data.xpath('//div[@id="centerCol"]//h1/span/text()')
            title = title_nodes[0] if title_nodes else '空'  # '空' = "empty" placeholder
            # Product price: prefer the whole-price node, fall back to the offer range
            if html_data.xpath('//div[@id="centerCol"]//div[@id="apex_desktop"]//span[@class="a-price-whole"]/text()'):
                price = '¥' + html_data.xpath('//div[@id="centerCol"]//div[@id="apex_desktop"]//span[@class="a-price-whole"]/text()')[0]
            else:
                price = '-'.join(html_data.xpath('//td[@class="a-span12"]//span[@class="a-offscreen"]/text()'))
            print([goods_data['classify_data'], title.strip(), price, goods_url])
            self.data_queue.put((goods_data['classify_data'], title.strip(), price, goods_url))
            self.detail_url_queue.task_done()

    def save_data(self):
        while True:
            data_list = []
            # Insert in batches of 30 rows; note a final partial batch may be
            # left unsaved when the main thread exits
            for i in range(30):
                data = self.data_queue.get()
                logger.info(data)
                data_list.append((0,) + data)  # id 0 lets AUTO_INCREMENT assign the key
                self.data_queue.task_done()
            # Column order matches the (id, classify, title, price, goods_url) tuples above
            sql = 'INSERT INTO amazon(id, classify, title, price, goods_url) values(%s, %s, %s, %s, %s)'
            try:
                self.cursor.executemany(sql, data_list)
                # Commit the batch
                self.db.commit()
                print('Insert succeeded...')
            except Exception as e:
                print(f'Insert failed: {e}')
                # Roll back on error
                self.db.rollback()

    def main(self):
        self.create_table()
        t_list = []
        # Proxy pool thread
        t_ip = threading.Thread(target=self.get_ip)
        t_list.append(t_ip)
        # Category discovery thread
        t_info = threading.Thread(target=self.get_info_url)
        t_list.append(t_info)
        # Detail URL collector threads
        for i in range(2):
            t_detail_url = threading.Thread(target=self.detail_url_get)
            t_list.append(t_detail_url)
        # Parser threads
        for i in range(3):
            t_paras = threading.Thread(target=self.parses_data)
            t_list.append(t_paras)
        # Saver thread
        t_save = threading.Thread(target=self.save_data)
        t_list.append(t_save)
        for t in t_list:
            t.daemon = True  # daemon threads die with the main thread
            t.start()
        time.sleep(3)  # give the producers a head start before joining
        for q in [self.classify_url_queue, self.detail_url_queue, self.data_queue]:
            q.join()


if __name__ == '__main__':
    logger.add('runtime{time}.log', rotation='500 MB')
    ama = Amazon()
    ama.main()
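One practical note: the script assumes a local MySQL server with a `spiders` database already created and reachable with the root/root credentials hardcoded in __init__ (create_table only creates the table, not the database). A minimal one-off bootstrap sketch, assuming those same credentials:

import pymysql

# One-off bootstrap: create the 'spiders' database the scraper expects.
# Assumes a local MySQL server reachable with the same root/root
# credentials hardcoded in Amazon.__init__.
conn = pymysql.connect(host='localhost', user='root', password='root')
try:
    with conn.cursor() as cursor:
        cursor.execute("CREATE DATABASE IF NOT EXISTS spiders DEFAULT CHARACTER SET utf8mb4")
    conn.commit()
finally:
    conn.close()

Also note that the proxy API is called with result_type=text&split=1, so its response body is plain ip:port text, which is why get_data can prefix it with 'http://' directly.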

Final Words

Feel free to contact the author to discuss further.


