import requests_html
import json
import time
def num(num):
    """Convert a search engine's result-count text into an integer.

    The wording around the number ('找到相关结果约', '个', '找到约',
    '条结果'), thousands separators (',') and spaces are stripped first.
    The Chinese magnitude markers '亿' (10**8) and '万' (10**4) are then
    expanded, e.g. '1亿2000万' -> 120000000.

    Args:
        num: Raw text of the result-count element.

    Returns:
        The result count as an int; 0 when nothing is left after stripping.

    Raises:
        ValueError: If a numeric part cannot be parsed as a number.
    """
    text = num
    # Applied in the same order as the original chained replace() calls.
    for noise in ('找到相关结果约', '个', '找到约', '条结果', ',', ' '):
        text = text.replace(noise, '')

    total = 0
    # Larger unit first so that '1亿2000万' is consumed left to right.
    for unit, factor in (('亿', 10000 * 10000), ('万', 10000)):
        if unit in text:
            parts = text.split(unit)
            total += _scaled_count(parts[0], factor)
            text = parts[1]

    # Whatever remains is a plain count; an empty remainder contributes 0.
    if text:
        total += int(text)
    return total


def _scaled_count(piece, factor):
    """Return ``piece * factor`` as an int, accepting fractions like '1.5'."""
    try:
        return int(piece) * factor
    except ValueError:
        # Fractional magnitudes such as '1.5万'; round() absorbs float noise.
        # float() still raises ValueError for genuinely non-numeric text.
        return int(round(float(piece) * factor))
# Search URL templates; the single %s placeholder receives the query text.

# Desktop result pages.
QueryUrlBaidu = "https://www.baidu.com/s?ie=utf-8&wd=%s"
QueryUrlHaosou = "https://www.so.com/s?q=%s"
QueryUrlSogou = "https://www.sogou.com/web?query=%s"

# Mobile result pages (hosts on the m. subdomain).
QueryUrlHaosouM = "https://m.so.com/s?q=%s"
QueryUrlShenma = "https://m.sm.cn/s?q=%s"
# Shared HTTP session used for every search request in this module.
s = requests_html.HTMLSession()
s.headers.update({
    # Desktop Chrome user agent sent with every request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    # Three hard-coded cookie groups, sent to every host. They must be joined
    # with '; ' (the Cookie header pair separator); concatenating them
    # directly fuses the last pair of one group with the first of the next.
    # NOTE(review): these look like values captured from a browser session
    # and are presumably stale by now - confirm whether they are still needed.
    'Cookie': '; '.join((
        # Appears to be the Baidu group (BAIDUID, BIDUPSID, ...).
        'PSTM=1578469724; BAIDUID=0401106C23030C7C6758A2EEDC00E967:FG=1; delPer=0; BD_CK_SAM=1; PSINO=3; H_PS_PSSID=1459_21103_30210_30496_26350_22159; BIDUPSID=50F03331BB9348FA0982DB22EE7F245E; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BD_UPN=12314353; H_PS_645EC=077dTzRh%2FuK7a64A8%2BHXcVlxSDZpdSuxDYLrTquT1QClk0BRAmSqjmPn5ek',
        # Appears to be the 360/so.com group (QiHooGUID, __huid, ...).
        '__guid=34870781.1471611762366714400.1578476949585.0554; QiHooGUID=2FFB2A68E2B5180C7D8EDC555794A7EE.1578477208864; _S=d0c11butf4e2pg07naoo0j4b91; opqopq=b94e4818125a8e53b6c4c258100f3e23.1578477208; count=1; dpr=1; webp=1; __huid=11NUowBiKSdaKDraeZ71uFZk%2FmOKYjfZl0CjTkGIqGMzw%3D; gtHuid=1',
        # Appears to be the Sogou group (SNUID, SUID, SUV, ...).
        'ABTEST=4|1578477296|v17; SNUID=F484FCC2D8DC43453A49B7D8D93A1423; IPLOC=CN4406; SUID=2C5C241B2513910A000000005E15A6F0; ld=VZllllllll2WVzbZlllllVcqIsllllllphStalllll9llllljllll5@@@@@@@@@@; SUV=00AD855B1B245C2C5E15A6F13B70B719; browerV=3; osV=1',
    )),
})
def getnum(url, sname=''):
    """Fetch a search result page and return the reported result count.

    Args:
        url: Fully formatted search URL to request.
        sname: Engine key selecting the CSS selector of the count element;
            one of 'Baidu', 'Haosou', 'HaosouM', 'Sogou', 'Shenma'.

    Returns:
        The result count parsed by ``num()``, or 0 when the URL is empty,
        the engine key is unknown, the count element is not found, or the
        request/parsing fails. Failures are reported on stdout, not raised.
    """
    selectors = {
        'Baidu': '.c-border b',
        'Haosou': 'span.nums',
        'HaosouM': 'p.site-s-result strong',
        'Sogou': 'p.sr-num',
        'Shenma': 'div.site-body p i',
    }

    if not url:
        print('[warnning]', '请设置url')
        # Nothing to request; previously this fell through and the request
        # itself failed on the empty URL.
        return 0

    selector = selectors.get(sname)
    if selector is None:
        # Reject unknown engines before spending an HTTP request on them.
        print('[warnning]', '没有匹配到结果', url)
        return 0

    try:
        r = s.get(url)
        element = r.html.find(selector, first=True)
        if element:
            return num(element.text)
        print('[warnning]', '没有匹配到结果,通常为反爬限制', url)
        return 0
    except Exception as e:
        # Best effort: one failing engine must not abort the whole run.
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
        print('[error]', '抽取失败', url)
        print('[error]', '抽取失败详情', e)
        return 0
def checksite(domainarr=None, sleep=0, filename=''):
    """Query each search engine for the ``site:`` result count of each domain.

    For every domain, Baidu, Sogou, Shenma and Haosou (360) are queried in
    that order and one record per engine is collected. Progress and the final
    JSON are printed to stdout.

    Args:
        domainarr: Sequence of domain names to check. When empty or None a
            warning is printed and nothing else happens.
        sleep: Seconds to pause after each domain; 0 disables the pause.
        filename: If non-empty, path the JSON result list is written to
            (UTF-8, overwriting any existing file).

    Returns:
        None.
    """
    if not domainarr:
        print('[warnning]', '请设置domain')
        return

    print('[running]', '共获取域名数:', len(domainarr))
    data = []
    for domain in domainarr:
        query = 'site:%s' % domain

        # Domains whose first label is 'm' are looked up on Haosou's mobile
        # site; both variants are reported under the same 'Haosou' label.
        if domain.split('.')[0] == 'm':
            haosou = (QueryUrlHaosouM, 'HaosouM')
        else:
            haosou = (QueryUrlHaosou, 'Haosou')

        # (printed label, URL template, getnum() engine key)
        engines = (
            ('Baidu', QueryUrlBaidu, 'Baidu'),
            ('Sogou', QueryUrlSogou, 'Sogou'),
            ('Shenma', QueryUrlShenma, 'Shenma'),
            ('Haosou',) + haosou,
        )
        for label, template, sname in engines:
            count = getnum(template % query, sname)
            print('[running]', domain, label, count)
            data.append({
                'domain': domain,
                'search': label.lower(),
                'value': count,
            })

        if sleep:
            print('[running]', '休眠:', sleep)
            time.sleep(sleep)

    payload = json.dumps(data)
    print('[running]', payload)
    if filename:
        print('[running]', '执行保存文件', filename)
        with open(filename, 'w', encoding='utf-8') as filejson:
            filejson.write(payload)


if __name__ == '__main__':
    domainr = ['www.seowhy.com']
    sleep = 0
    # strftime() without a time tuple formats the current local time.
    date = time.strftime("%Y%m%d")
    filename = 'site.%s.json' % date
    checksite(domainr, sleep, filename)
# 本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!