爬虫模拟POST请求https (爬中国银行汇率----中文)

 

2020.06.26

https://srh.bankofchina.com/search/whpj/search_cn.jsp

因为中行汇率页面改版(改为了https请求,页面也有很大变化),需要重新爬取中行汇率。原代码请求到的数据总与页面返回的不同,最后感觉应该是模拟的POST失败了,打断点发现实际发出的是GET请求,可能是HTTPS的影响吧。

试了网上各种方法,测试的结果都不是POST请求,仍然是GET,比如:

 

 

HttpURLConnection  , HttpsURLConnection  
// 发送POST请求必须设置如下两行
conn.setDoOutput(true);
conn.setDoInput(true);
httpURLConnection.setRequestMethod("POST");

 

或ssl

javax.net.ssl.X509TrustManager
SSLContext sslContext = SSLContext.getInstance("SSL");
TrustManager[] tm = { new MyX509TrustManager() };
// 设置当前实例使用的SSLSocketFactory
conn.setSSLSocketFactory(ssf);

 

以上设置都没效果!!!!!!!

如果是http大家可以试网上的通用方法。

----------------------------------------------------------------

 

 

 


import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.joda.time.DateTime;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;/*** 爬取中国银行汇率-----------中** @author lm*/
@Service
public class CrawlingExchangeRateCNService {public static void main(String[] args) {CrawlingExchangeRateCNService crawlingExchangeRateService = new CrawlingExchangeRateCNService();crawlingExchangeRateService.execute();}public void execute() {
//        List queryList = getExchangeRate("USD", "");List queryList = getExchangeRate("美元", "");System.out.println("长度:" + queryList.size());System.out.println("汇总:" + queryList);}/*** 获取当日传入币别汇率信息** @param sourceCurrency 币别* @param date           日期* @return*/private List getExchangeRate(String sourceCurrency, String date) {/***判断入参lsDate是否为空,若为空则赋值为当前时间**/String lsToday = StringUtils.isEmpty(date) ? new DateTime().toString("yyyy-MM-dd") : date;List list = new ArrayList();for (int page = 1; page <= 10; page++) {/**抓取时间为lsToday,币别为sourceCurrency,页数为page的中国银行网页信息*/String searchEnHtml = getSearchEnHtml(lsToday, sourceCurrency, String.valueOf(page));/**开始解析html中的汇率列表信息**/Map map = assembleObjByHtml(searchEnHtml, sourceCurrency, lsToday);String flag = (String) map.get("flag");String htmlPage = (String) map.get("page");list.add (map.get("list"));/**当flag为1执行成功时,或总页数等于循环查询到的页数时,则不需要再次进行查询**/if ("1".equals(flag) || Integer.parseInt(htmlPage) < page) {break;}}return list;}/*** 获取整个网页的内容** @param lsToday          传入当前时间或空* @param lsSourceCurrency 币种* @param liPage           当前查询页数* @return*/private String getSearchEnHtml(String lsToday, String lsSourceCurrency, String liPage) {//        StringBuilder url = new StringBuilder("https://srh.bankofchina.com/search/whpj/searchen.jsp?");StringBuilder url = new StringBuilder("https://srh.bankofchina.com/search/whpj/search_cn.jsp?");url.append("erectDate=").append(lsToday);url.append("¬hing=").append(lsToday);url.append("&pjname=").append(lsSourceCurrency);url.append("&page=").append(liPage);System.out.println("拼接好的url:" + url);CloseableHttpClient httpClient = HttpClients.createDefault();CloseableHttpResponse response = null;HttpPost httpPost = new HttpPost(url.toString());httpPost.addHeader("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");httpPost.setHeader("Accept", "Accept: text/plain, */*");httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/74.0.3724.8 Safari/537.36");httpPost.addHeader("x-amazon-user-agent", "AmazonJavascriptScratchpad/1.0 (Language=Javascript)");httpPost.addHeader("X-Requested-With", "XMLHttpRequest");String html = "";try {response = httpClient.execute(httpPost);/**判断响应状态为200,进行处理**/if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {HttpEntity httpEntity = response.getEntity();html = EntityUtils.toString(httpEntity, "utf-8");} else {System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));}} catch (ClientProtocolException e) {e.printStackTrace();} catch (IOException e) {e.printStackTrace();} finally {HttpClientUtils.closeQuietly(response);HttpClientUtils.closeQuietly(httpClient);}/***返回请求得到的页面**/return html;}/*** 根据取得的网页,解析html中的内容 先不做业务逻辑,全部查询** @param html             要解析的html* @param lsSourceCurrency 币种* @param lsToday          日期* @return*/private Map assembleObjByHtml(String html, String lsSourceCurrency, String lsToday) {/**存储数据**/Map map = new HashMap(5);/**使用Jsoup将html解析为Document对象**/Document document = Jsoup.parse(html);/**获取页面隐藏域中存放的当前页数**/Elements pageItem = document.getElementsByAttributeValue("name", "page");String pageItemValue = "";pageItemValue = pageItem.select("input[name=page]").val();map.put("page", pageItemValue);/**获取页面的整个table信息,这个返回的页面基本上是返回多个table,下方需要细化处理**/Elements tables = document.getElementsByTag("table");/**设置存放汇率信息的table下标为-1(默认不存在)**/int tableIndex = -1;/**从table中循环获取,查找含有Currency Name字段的table**/for (int i = 0; i < tables.size(); i++) {Element element = tables.get(i);String text = element.text();/**找到含有汇率信息的table,给tableIndex赋值,跳出循环**/if (text.indexOf("货币名称") > -1) {tableIndex = i;break;}}List list = new ArrayList();/**如果找到汇率列表信息**/if (tableIndex > -1) {Element table = tables.get(tableIndex);/**遍历该表格内的所有的 */Elements trs = table.select("tr");for (int i = 1; i < trs.size(); ++i) {TerstEntity terstEntity = new TerstEntity();Element tr = trs.get(i);/**将数据放入实体对象中*/Elements tds = tr.select("td");//过滤  if(tds !=null && 
tds.size() == 7){System.out.println(tds.get(0).text() + " "+i);terstEntity.setCurrencyName(tds.get(0).text());terstEntity.setBuyingRate(tds.get(1).text());terstEntity.setCashBuyingRate(tds.get(2).text());terstEntity.setSellingRate(tds.get(3).text());terstEntity.setCashSellingRate(tds.get(4).text());terstEntity.setMiddleRate(tds.get(5).text());terstEntity.setPubTime(tds.get(6).text());list.add(terstEntity);}}map.put("list", list);}else{map.put("flag", "1");}return map;}}

 

 

 

import lombok.Data;/*** 测试使用*/
/**
 * Holder for one row of the exchange-rate table; for testing use.
 *
 * <p>All values are kept as the raw cell text. Accessors are generated by
 * Lombok's {@code @Data}.
 */
@Data
public class TerstEntity {

    /** Text of the first cell: the currency name. */
    private String currencyName;

    /** Text of the second cell. */
    private String buyingRate;

    /** Text of the third cell. */
    private String cashBuyingRate;

    /** Text of the fourth cell. */
    private String sellingRate;

    /** Text of the fifth cell. */
    private String cashSellingRate;

    /** Text of the sixth cell. */
    private String middleRate;

    /**
     * Text of the seventh cell. Renamed from {@code PubTime} to follow
     * lowerCamelCase; the generated accessors are still
     * {@code getPubTime()} / {@code setPubTime(String)}.
     */
    private String pubTime;
}

 

		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.12.1</version>
		</dependency>

 


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部