记录一次使用 Java 语言通过 HttpClient 爬取网站接口数据的经历
需要用到的依赖:
httpclient 和 httpcore 是封装了 HTTP 请求的工具库
jsoup 可以解析返回的网页 HTML,并从中找到你需要的节点(元素),很方便
<dependency><groupId>org.apache.httpcomponents</groupId><artifactId>httpclient</artifactId><version>4.5.13</version> <!-- 请检查并使用最新版本 --></dependency><dependency><groupId>org.apache.httpcomponents</groupId><artifactId>httpcore</artifactId><version>4.4.14</version> <!-- 请检查并使用最新版本 --></dependency><dependency><groupId>org.jsoup</groupId><artifactId>jsoup</artifactId><version>1.13.1</version></dependency>
java类:
需要将网站请求中的 Cookie 配置到 BasicClientCookie 对象中,然后添加到请求中去。如何获取 Cookie,文章最后有截图。
package com.utils;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;import java.io.IOException;public class HttpClientWithCookieExample {public static void main(String[] args) throws InterruptedException {// 创建一个Cookie存储对象(支持多个cookie)CookieStore cookieStore = new BasicCookieStore();// 创建一个Cookie并设置属性BasicClientCookie cookie = new BasicClientCookie("ASP.NET_SessionId", "mkuq512333ljwcqkfq4i");cookie.setDomain("abc.com");cookie.setPath("/");BasicClientCookie cookie1 = new BasicClientCookie("Email", "abc@qq.com");cookie1.setDomain("abc.com");cookie1.setPath("/");BasicClientCookie cookie2 = new BasicClientCookie("Password", "511B0D5F341BDDBD9A5348923B48D14C");cookie2.setDomain("abc.com");cookie2.setPath("/");// 将Cookie添加到Cookie存储中cookieStore.addCookie(cookie);cookieStore.addCookie(cookie1);cookieStore.addCookie(cookie2);// 创建一个HttpClientContext对象,并将Cookie存储设置进去HttpClientContext context = HttpClientContext.create();context.setCookieStore(cookieStore);// 创建HttpClientHttpClient httpClient = HttpClients.custom().setDefaultCookieStore(cookieStore).build();extracted_area( context, httpClient);}/*** 爬取区域信息* @param context* @param httpClient* @throws InterruptedException*/private static void extracted_area(HttpClientContext context, HttpClient httpClient) throws InterruptedException {int page = 1;HttpGet request = null;for (int i = 1; i<= page; i++){// 创建一个HttpGet请求,用于发送HTTP GET请求request = new HttpGet("https://abc.com/adminKdUser/GuanLi/AreaList.aspx");// 设置请求头try {// 使用HttpClient发送请求HttpResponse response = httpClient.execute(request, context);String result = "";if (response != null) {int statusCode = response.getStatusLine().getStatusCode();result = EntityUtils.toString(response.getEntity(), "utf-8");//System.out.println("\n返回码:" + statusCode + "\n返内容:" + result);Document doc = Jsoup.parse(result);Elements tables = doc.select("table");if (tables == null){System.out.println("第"+i+"页===终止");break;}System.out.println("第"+i+"页==="+tables.html());/* if (result.contains("<div class=\"content\">")){int s = 
result.indexOf("<div class=\"content\">");result = result.substring(s);System.out.println("截取后返内容:" + result);}*/JDBCBean.executeUpdate(i,"<table>"+tables.html()+"</table>");}} catch (IOException e) {System.out.println(i+"解析失败");}finally{Thread.sleep(1000);}}}private static void extracted_fanyi(HttpClientContext context, HttpClient httpClient) throws InterruptedException {int page = 984;HttpGet request = null;for (int i = 1; i<= page; i++){// 创建一个HttpGet请求,用于发送HTTP GET请求request = new HttpGet("https://abc.com/123/GuanLi/FanYiList.aspx?page="+i);// 设置请求头try {// 使用HttpClient发送请求HttpResponse response = httpClient.execute(request, context);String result = "";if (response != null) {int statusCode = response.getStatusLine().getStatusCode();result = EntityUtils.toString(response.getEntity(), "utf-8");//System.out.println("\n返回码:" + statusCode + "\n返内容:" + result);Document doc = Jsoup.parse(result);Elements tables = doc.select("table");if (tables == null){System.out.println("第"+i+"页===终止");break;}System.out.println("第"+i+"页==="+tables.html());/* if (result.contains("<div class=\"content\">")){int s = result.indexOf("<div class=\"content\">");result = result.substring(s);System.out.println("截取后返内容:" + result);}*/JDBCBean.executeUpdate(i,"<table>"+tables.html()+"</table>");}} catch (IOException e) {System.out.println(i+"解析失败");}finally{Thread.sleep(1000);}}}private static void extracted( HttpClientContext context, HttpClient httpClient) throws InterruptedException {int page = 2415;HttpGet request = null;for (int i = 1; i<= page; i++){// 创建一个HttpGet请求,用于发送HTTP GET请求request = new HttpGet("https://abc.com/123/User/GoodRecordList.aspx?page="+i);// 设置请求头try {// 使用HttpClient发送请求HttpResponse response = httpClient.execute(request, context);String result = "";if (response != null) {int statusCode = response.getStatusLine().getStatusCode();result = EntityUtils.toString(response.getEntity(), "utf-8");//System.out.println("\n返回码:" + statusCode + "\n返内容:" + result);Document doc = 
Jsoup.parse(result);Elements tables = doc.select("table");if (tables == null){System.out.println("第"+i+"页===终止");break;}System.out.println("第"+i+"页==="+tables.html());/* if (result.contains("<div class=\"content\">")){int s = result.indexOf("<div class=\"content\">");result = result.substring(s);System.out.println("截取后返内容:" + result);}*/JDBCBean.executeUpdate(i,"<table>"+tables.html()+"</table>");}} catch (IOException e) {System.out.println(i+"解析失败");}finally{Thread.sleep(1000);}}}
}
此处不方便透露实际网站,就用百度来作例子:取请求标头中的 Cookie 内容,并逐项填入 BasicClientCookie 中即可。