JAVA——基于HttpComponents(HttpClient)的简单网络爬虫DEMO

基本概念

HttpComponents(HttpClient)

超文本传输协议(HTTP)可能是当今Internet上使用的最重要的协议。Web服务、支持网络的设备以及网络计算的增长,不断将HTTP协议的作用扩展到用户驱动的Web浏览器之外,同时也增加了需要HTTP支持的应用程序的数量。

HttpComponents是为扩展而设计的,同时提供了对基本HTTP协议的强大支持。任何需要构建HTTP感知的客户端和服务器应用程序(例如Web浏览器、Web Spider、HTTP代理、Web服务传输库,或者利用、扩展HTTP协议进行分布式通信的系统)的开发者,都可能会对它感兴趣。

官网

官网地址:http://hc.apache.org/ 

Maven

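        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
            <version>4.4.10</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-collections4</artifactId>
            <version>4.1</version>
        </dependency>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>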

源代码

HTTPClientPool 

package club.zstuca.httpclient;import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.TrustStrategy;/*** Https忽略证书*/
public class HTTPClientPool {private static final String HTTP = "http";private static final String HTTPS = "https";private static SSLConnectionSocketFactory sslConnectionSocketFactory = null;private static PoolingHttpClientConnectionManager poolingHttpClientConnectionManager = null;//连接池管理类private static SSLContextBuilder sslContextBuilder = null;//管理Https连接的上下文类static {try {sslContextBuilder = new SSLContextBuilder().loadTrustMaterial(null,new TrustStrategy() {@Overridepublic boolean isTrusted(X509Certificate[] x509Certificates, String s)throws CertificateException {//                    信任所有站点 直接返回truereturn true;}});//"SSLv2Hello", "SSLv3", "TLSv1"sslConnectionSocketFactory = new SSLConnectionSocketFactory(sslContextBuilder.build(),new String[]{"TLSv1.2"},null,NoopHostnameVerifier.INSTANCE);Registry registryBuilder = RegistryBuilder.create().register(HTTP, new PlainConnectionSocketFactory()).register(HTTPS, sslConnectionSocketFactory).build();poolingHttpClientConnectionManager = new PoolingHttpClientConnectionManager(registryBuilder);poolingHttpClientConnectionManager.setMaxTotal(200);} catch (NoSuchAlgorithmException e) {e.printStackTrace();} catch (KeyStoreException e) {e.printStackTrace();} catch (KeyManagementException e) {e.printStackTrace();}}/*** 获取连接** @return* @throws Exception*/public static CloseableHttpClient getHttpClient() throws Exception {CloseableHttpClient httpClient = HttpClients.custom().setSSLSocketFactory(sslConnectionSocketFactory).setConnectionManager(poolingHttpClientConnectionManager).setConnectionManagerShared(true).setDefaultCookieStore(new BasicCookieStore()).setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36").build();return httpClient;}
}

Web Crawler

package club.zstuca.httpclient;import org.apache.http.*;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StringUtils;import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Map;/*** Http/Https请求的工具类*/
public class HTTPClientUtil {// 日志private static Logger logger = LoggerFactory.getLogger(HTTPClientUtil.class);// Request params default Configprivate static RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(5000).setConnectionRequestTimeout(5000).setSocketTimeout(5000).setRedirectsEnabled(false).build();// HttpClientprivate static CloseableHttpClient httpClient = null;// HTTP Requestprivate static HttpRequestBase httpRequest = null;// HTTP Responseprivate static CloseableHttpResponse httpResponse = null;/**** @param HttpRequestType* @param url* @param header* @param params* @param httpEntity* @return*/public static String doRequest(String HttpRequestType,String url,Map header,Map params,HttpEntity httpEntity) {String resultStr = "";if (StringUtils.isEmpty(url)) {return resultStr;}try {// Set GET paramssetHttpURIParams(url,params);// Set POST paramsif("POST".equals(HttpRequestType)&&httpEntity != null){((HttpPost)httpRequest).setEntity(httpEntity);}// Set HTTP headersetHttpHeader(header);// Send POSTsendHttpRequest();// ResponseresultStr = dealWithHttpResponse();} catch (Exception e) {e.printStackTrace();} finally {closeConnection();}return resultStr;}/*** 发送POST请求** @param url:请求地址* @param header:请求头参数* @param httpEntity:表单参数  form提交 json/xml参数* @return*/public static String doPostRequest(String url, Map header, HttpEntity httpEntity) {String resultStr = "";if (StringUtils.isEmpty(url)) {return resultStr;}try {getHttpRequest("POST");HttpPost httpPost = (HttpPost)httpRequest;httpPost.setURI(new URIBuilder(url).build());// Set HTTP headersetHttpHeader(header);// Set POST paramsif (httpEntity != null) {httpPost.setEntity(httpEntity);}sendHttpRequest();// ResponseresultStr = dealWithHttpResponse();} catch (Exception e) {e.printStackTrace();} finally {closeConnection();}return resultStr;}/*** 发送GET请求* @param url URL* @param header HTTP header info* @param params GET params* @return*/public static String doGetRequest(String url, Map header, Map 
params) {String resultStr = "";if (StringUtils.isEmpty(url)) {return resultStr;}try {// getHttpRequestgetHttpRequest("GET");// Set GET paramssetHttpURIParams(url,params);// Set HTTP headersetHttpHeader(header);// Send POSTsendHttpRequest();// ResponseresultStr = dealWithHttpResponse();} catch (Exception e) {e.printStackTrace();} finally {closeConnection();}return resultStr;}/**** @param HttpRequestType* @throws Exception*/private static void getHttpRequest(String HttpRequestType) throws Exception {httpClient = HTTPClientPool.getHttpClient();if("GET".equals(HttpRequestType)){httpRequest = new HttpGet();}else if("POST".equals(HttpRequestType)){httpRequest = new HttpPost();}}/**** @param header*/private static void setHttpHeader(Map header){if (!(header == null || header.isEmpty())) {for (Map.Entry headerEntry : header.entrySet()) {httpRequest.setHeader(headerEntry.getKey(), headerEntry.getValue());}}}/**** @param url* @param params* @throws URISyntaxException*/private static void setHttpURIParams(String url,Map params) throws URISyntaxException {// URIBuilderURIBuilder urlbuilder = new URIBuilder(url);if (!(params == null || params.isEmpty())) {// Set GET paramsfor (Map.Entry stringStringEntry : params.entrySet()) {urlbuilder.setParameter(stringStringEntry.getKey(), stringStringEntry.getValue());}}httpRequest.setURI(urlbuilder.build());}/**** @throws IOException*/private static void  sendHttpRequest() throws IOException {// Request ConfighttpRequest.setConfig(requestConfig);// Send POSThttpResponse = httpClient.execute(httpRequest);return ;}/**** @return Response String UTF-8*/private static String dealWithHttpResponse(){String resultStr = "";try{if (httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {resultStr = EntityUtils.toString(httpResponse.getEntity(),"UTF-8");} else {StringBuffer stringBuffer = new StringBuffer();HeaderIterator headerIterator = httpResponse.headerIterator();while (headerIterator.hasNext()) {stringBuffer.append("\t" + 
headerIterator.next());}}}catch (IOException e) {e.printStackTrace();}return resultStr;}/*** 关掉连接释放资源*/private static void closeConnection() {if (httpClient != null) {try {httpClient.close();} catch (IOException e) {e.printStackTrace();}}if (httpResponse != null) {try {httpResponse.close();} catch (IOException e) {e.printStackTrace();}}}}

TEST 

// NOTE(review): this package is "clua.zstuca" while the utility lives under "club.zstuca" -
// possibly a typo; confirm against the project layout before renaming.
package clua.zstuca;

import club.zstuca.httpclient.HTTPClientUtil;

import java.util.HashMap;

/**
 * Manual smoke test: issues two GET requests through {@link HTTPClientUtil} and prints
 * the returned bodies (an empty line means the request did not yield a 200 response).
 */
public class HTTPTEST {

    public static void main(String[] args) {
        // Plain GET without headers or query parameters.
        System.out.println(HTTPClientUtil.doGetRequest("http://www.baidu.com", null, null));

        // GET with one query parameter; a plain map avoids the anonymous HashMap
        // subclass that double-brace initialisation creates.
        HashMap<String, String> weatherParams = new HashMap<String, String>();
        weatherParams.put("id", "101060101");
        System.out.println(
                HTTPClientUtil.doGetRequest("http://api.help.bj.cn/apis/weather/", null, weatherParams));
    }
}

教学资源

https://www.bilibili.com/video/av68932809 

参考文章

https://blog.csdn.net/qwe86314/article/details/91450098


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部