Web scraper: collecting data analysis job postings from Lagou (拉勾网)

Main program. For each city in the hot-city list, the spider first loads the listing page to obtain cookies and the total page count, then posts to the positionAjax.json endpoint page by page, parses the job fields out of the JSON response, and stores each record in MySQL through MysqlHelper. Random 4-6 second delays are inserted between requests.

import requests
import json
from lxml import etree
from mysqlhelper import MysqlHelper
import time
import random


class LaGouSpider():
    def __init__(self):
        # popular cities to crawl
        self.sqlHelper = MysqlHelper()
        self.insertSql = '''INSERT INTO lagou VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''
        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Referer": "https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city=%E4%B8%8A%E6%B5%B7",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }
        self.area_list = ['北京', '上海', '深圳', '广州', '杭州', '成都', '南京', '武汉', '西安', '厦门', '长沙', '苏州', '天津']
        self.index_url = "https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city={}"
        self.start_url = "https://www.lagou.com/jobs/positionAjax.json?"

    def delay_time(self):
        # random pause between requests to avoid being blocked
        randtime = random.randint(4, 6)
        time.sleep(randtime)

    def get_cookies(self, s, city):
        # visit the listing page with the session first so Lagou issues valid cookies for the Ajax request
        response = s.get(self.index_url.format(city), headers=self.headers)
        return response.cookies

    def get_num(self, city):
        # read the total number of result pages from the listing page
        response = requests.get(self.index_url.format(city), headers=self.headers)
        html_ele = etree.HTML(response.text)
        pageNum = html_ele.xpath('//span[@class="span totalNum"]/text()')
        if pageNum:
            return pageNum[0]
        else:
            return 1

    def get_response(self, s, url, city, pagenum):
        start_params = {
            "city": city,
            "needAddtionalResult": "false"
        }
        start_data = {
            "first": "false",
            "pn": pagenum,  # page number
            "kd": "数据分析"  # search keyword
        }
        response = s.post(url, headers=self.headers, cookies=self.cookies,
                          params=start_params, data=start_data)
        return response

    def save_data(self, city, positionName, district, companyShortName, salary, workYear, education, financeStage,
                  companySize, companyLabel, positionLable, longitude, latitude, formatCreateTime, companyFullName,
                  hitags, firstType, thirdType, skillLables):
        data = (city, positionName, district, companyShortName, salary, workYear, education, financeStage,
                companySize, companyLabel, positionLable, longitude, latitude, formatCreateTime, companyFullName,
                hitags, firstType, thirdType, skillLables)
        self.sqlHelper.execute_modify_sql(self.insertSql, data)

    def main(self):
        for city in self.area_list:
            pageNum = self.get_num(city)
            self.delay_time()
            for num in range(1, int(pageNum) + 1):
                s = requests.Session()
                self.cookies = self.get_cookies(s, city)
                self.delay_time()
                response = self.get_response(s, self.start_url, city, num)
                print(response.text)
                json_ele = json.loads(response.text)
                zhaopin_list = json_ele['content']['positionResult']['result']
                for info in zhaopin_list:
                    positionName = info['positionName']
                    district = info['district']
                    companyShortName = info['companyShortName']
                    salary = info['salary']
                    workYear = info['workYear']
                    education = info['education']
                    financeStage = info['financeStage']  # financing stage
                    companySize = info['companySize']
                    companyLabelList = info['companyLabelList']  # company labels
                    companyLabel = ",".join(companyLabelList)
                    positionLablesList = info['positionLables']  # position tags
                    if type(positionLablesList) == list:
                        positionLable = ",".join(positionLablesList)
                    else:
                        positionLable = info['positionLables']
                    longitude = info['longitude']
                    latitude = info['latitude']
                    formatCreateTime = info['formatCreateTime']  # posting time
                    companyFullName = info['companyFullName']
                    hitagsList = info['hitags']  # benefits
                    if type(hitagsList) == list:
                        hitags = ",".join(hitagsList)
                    else:
                        hitags = info['hitags']
                    firstType = info['firstType']
                    thirdType = info['thirdType']
                    skillLablesList = info['skillLables']
                    if type(skillLablesList) == list:
                        skillLables = ",".join(skillLablesList)
                    else:
                        skillLables = info['skillLables']
                    positionId = str(info['positionId'])
                    print("[INFO]:", city, num)
                    # save the record
                    self.save_data(city, positionName, district, companyShortName, salary, workYear, education,
                                   financeStage, companySize, companyLabel, positionLable, longitude, latitude,
                                   formatCreateTime, companyFullName, hitags, firstType, thirdType, skillLables)
                self.delay_time()


if __name__ == '__main__':
    spider = LaGouSpider()
    spider.main()
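The INSERT statement assumes a lagou table with 19 columns already exists in the test database. The original post does not show the schema; the snippet below is a minimal sketch of one possible layout, with column names taken from the fields passed to save_data and all types/lengths chosen as assumptions you should adjust to your own needs.

import pymysql

# Hypothetical schema: column order mirrors the 19 values passed to save_data();
# the VARCHAR types/lengths are assumptions, not from the original post.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='666666',
                       db='test', charset='utf8mb4')
create_sql = '''
CREATE TABLE IF NOT EXISTS lagou (
    city VARCHAR(32),
    positionName VARCHAR(128),
    district VARCHAR(64),
    companyShortName VARCHAR(128),
    salary VARCHAR(32),
    workYear VARCHAR(32),
    education VARCHAR(32),
    financeStage VARCHAR(64),
    companySize VARCHAR(64),
    companyLabel VARCHAR(255),
    positionLable VARCHAR(255),
    longitude VARCHAR(32),
    latitude VARCHAR(32),
    formatCreateTime VARCHAR(64),
    companyFullName VARCHAR(255),
    hitags VARCHAR(255),
    firstType VARCHAR(64),
    thirdType VARCHAR(64),
    skillLables VARCHAR(255)
) DEFAULT CHARSET=utf8mb4
'''
with conn.cursor() as cursor:
    cursor.execute(create_sql)
conn.commit()
conn.close()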

MysqlHelper class file. A thin wrapper around pymysql: it opens a connection to the local test database and commits each statement as soon as it is executed.

import pymysql


class MysqlHelper(object):
    def __init__(self):
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='666666',
                                    db='test', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def execute_modify_sql(self, sql, data):
        # run a parameterized INSERT/UPDATE and commit immediately
        self.cursor.execute(sql, data)
        self.conn.commit()

    def __del__(self):
        self.cursor.close()
        self.conn.close()
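A quick way to sanity-check the helper on its own is to insert one dummy row. The values below are placeholders only (not real scraped data), matching the 19-column layout assumed above.

from mysqlhelper import MysqlHelper

# Smoke test with a placeholder record; adjust to your actual table schema.
helper = MysqlHelper()
sql = "INSERT INTO lagou VALUES (" + ", ".join(["%s"] * 19) + ")"
row = ("Shanghai", "Data Analyst", "Pudong", "DemoCo", "15k-25k", "3-5 yrs", "Bachelor",
       "Series A", "50-150", "", "", "0", "0", "just now", "Demo Company Ltd.",
       "", "Dev", "Data Analysis", "SQL,Python")
helper.execute_modify_sql(sql, row)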

