目录
准备工作
yml
1.入门程序(获取到静态页面)
2.HttpClient---Get
2.1 修改成连接池
3.HttpClient---Get带参数
3.1 修改成连接池
4.HttpClient---Post
4.1 修改成连接池
5.HttpClient---Post带参数
6.HttpClient-连接池
7.设置请求信息
8.jsoup介绍.
9.jsoup解析url
10.jsoup解析字符串
11.jsoup解析文件
12.所有dom方式获取元素
13.元素中获取数据
准备工作
导入依赖
<dependency><groupId>org.apache.httpcomponents</groupId><artifactId>httpclient</artifactId><version>4.5.2</version></dependency>
yml
logging:
  level:
    root: info
    com.lrm: debug
1.入门程序(获取到静态页面)
package com.itheima.reggie.utils;import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;import java.io.IOException;/*** @Author lpc**/
public class CrawlerFirst {public static void main(String[] args) throws Exception {//1.打开浏览器,创建Httpclient对象CloseableHttpClient httpClient = HttpClients.createDefault();//2.输入网址,发起get请求创建HttpGet对象HttpGet httpGet = new HttpGet("https://www.itcast.cn/");//3.按回车,发起请求,返回响应,使用Httpclient对象发起请求CloseableHttpResponse response = httpClient.execute(httpGet);//4.解析响应,获取数据//判斯状态码是否是200if (response.getStatusLine().getStatusCode()==200){HttpEntity httpEntity = response.getEntity();//获取前端静态页面String content = EntityUtils.toString(httpEntity,"utf8");System.out.println(content);}}
}
2.HttpClient---Get
package com.itheima.reggie.utils;import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;import java.io.IOException;/*** @Author lpc* @Date 2024 03 12 00 23**/
public class CrawlerFirst {public static void main(String[] args){//1.打开浏览器,创建Httpclient对象CloseableHttpClient httpClient = HttpClients.createDefault();//2.输入网址,发起get请求创建HttpGet对象HttpGet httpGet = new HttpGet("https://www.itcast.cn/");//3.按回车,发起请求,返回响应,使用Httpclient对象发起请求CloseableHttpResponse response = null;try {response = httpClient.execute(httpGet);//4.解析响应,获取数据//判斯状态码是否是200if (response.getStatusLine().getStatusCode()==200){HttpEntity httpEntity = response.getEntity();//获取前端静态页面String content = EntityUtils.toString(httpEntity,"utf8");System.out.println(content.length());}} catch (IOException e) {throw new RuntimeException(e);}finally {try {//关闭responseresponse.close();} catch (IOException e) {throw new RuntimeException(e);}try {//关闭浏览器httpClient.close();} catch (IOException e) {throw new RuntimeException(e);}}}
}
2.1 修改成连接池
package org.example;import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;import java.io.IOException;/*** @Author lpc* @Date 2024 03 14 09 38**/
public class Test {public static void main(String[] args) {//创建连接池PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();//设置最大连接数cm.setMaxTotal(100);//设置每个主机的最大连接数cm.setDefaultMaxPerRoute(10);//使用连接池管理器发起请求doGet(cm);}public static void doGet(PoolingHttpClientConnectionManager cm){//不是每次创建新的httpClient,而是从连接池中获取HttpClient对象CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();HttpGet httpGet = new HttpGet("http://www.itcast.cn");CloseableHttpResponse response=null;try {response = httpClient.execute(httpGet);if (response.getStatusLine().getStatusCode()==200){String content = EntityUtils.toString(response.getEntity(), "utf8");System.out.println(content.length());}} catch (IOException e) {throw new RuntimeException(e);}finally {if (response!=null){try {response.close();} catch (IOException e) {throw new RuntimeException(e);}//不能关闭,由连接池管理// httpClient.close();}}}
}
3.HttpClient---Get带参数
package org.example;import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;import java.io.IOException;
import java.net.URISyntaxException;/*** @Author lpc* @Date 2024 03 13 20 44**/
public class Test2 {public static void main(String[] args) throws Exception {//1.打开浏览器CloseableHttpClient httpClient = HttpClients.createDefault();//设置请求地址是: http://yun.itheima.com/search?keys=Java//带参数的get方法设置//创建URIBuilderURIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");//设置参数 可以设置多个uriBuilder.setParameter("keys","Java");//2.输入网址,发起get请求创建HttpGet对象HttpGet httpGet = new HttpGet(uriBuilder.build());System.out.println("发起请求的信息"+httpGet);//3.CloseableHttpResponse response=null;try {response = httpClient.execute(httpGet);if (response.getStatusLine().getStatusCode()==200){HttpEntity httpEntity = response.getEntity();//String s = EntityUtils.toString(httpEntity, "utf8");System.out.println(s);}} catch (IOException e) {throw new RuntimeException(e);}finally {try {response.close();} catch (IOException e) {throw new RuntimeException(e);}try {httpClient.close();} catch (IOException e) {throw new RuntimeException(e);}}}
}
3.1 修改成连接池
4.HttpClient---Post
package org.example;import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;import java.io.IOException;/*** @Author lpc* @Date 2024 03 13 20 59**/
public class Post {public static void main(String[] args) {//1.打开浏览器CloseableHttpClient httpClient = HttpClients.createDefault();//2.输入网址,发起get请求创建HttpGet对象//HttpGet httpGet = new HttpGet("https://www.itcast.cn/");HttpPost httpPost = new HttpPost("https://www.itcast.cn/");//3.CloseableHttpResponse response=null;try {// response = httpClient.execute(httpGet);response = httpClient.execute(httpPost);if (response.getStatusLine().getStatusCode()==200){HttpEntity httpEntity = response.getEntity();//String s = EntityUtils.toString(httpEntity, "utf8");System.out.println(s);}} catch (IOException e) {throw new RuntimeException(e);}finally {try {response.close();} catch (IOException e) {throw new RuntimeException(e);}try {httpClient.close();} catch (IOException e) {throw new RuntimeException(e);}}}
}
4.1 修改成连接池
package org.example;import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;import java.io.IOException;/*** @Author lpc* @Date 2024 03 14 10 02**/
public class Postl {public static void main(String[] args){//创建连接池管理器PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();//设置最大连接数cm.setMaxTotal(100);//设置每个主机最大连接数cm.setDefaultMaxPerRoute(10);//发起请求doPost(cm);}private static void doPost(PoolingHttpClientConnectionManager cm) {//不是每次创建新的httpClient,而是从连接池中获取HttpClient对象CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();//2.输入网址 发起Post请求HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");CloseableHttpResponse response=null;try {response = httpClient.execute(httpPost);if (response.getStatusLine().getStatusCode()==200){String s = EntityUtils.toString(response.getEntity());System.out.println(s.length());}} catch (IOException e) {throw new RuntimeException(e);}finally {if (response!=null){try {response.close();} catch (IOException e) {throw new RuntimeException(e);}}//不用关闭,由连接池管理// httpClient.close();}}
}
5.HttpClient---Post带参数
package org.example;import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;/*** @Author lpc* @Date 2024 03 13 20 59**/
public class Post {public static void main(String[] args) throws Exception {//1.打开浏览器CloseableHttpClient httpClient = HttpClients.createDefault();//2.输入网址 发起Post请求HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");//声明List集合,封装表单中的参数List<NameValuePair> params =new ArrayList<NameValuePair>();//设置请求地址是: http://yun.itheima.com/search?keys=Javaparams.add(new BasicNameValuePair("keys","Java"));//创建表单的Entity对象,第一个参数就是封装的表单数据,第二个参数就是编码UrlEncodedFormEntity urlEncodedFormEntity = new UrlEncodedFormEntity(params,"utf8");//设置表单的Entity对象到Post请求中httpPost.setEntity(urlEncodedFormEntity);CloseableHttpResponse response=null;try {// response = httpClient.execute(httpGet);response = httpClient.execute(httpPost);if (response.getStatusLine().getStatusCode()==200){HttpEntity httpEntity = response.getEntity();//String s = EntityUtils.toString(httpEntity, "utf8");System.out.println(s);}} catch (IOException e) {throw new RuntimeException(e);}finally {try {response.close();} catch (IOException e) {throw new RuntimeException(e);}try {httpClient.close();} catch (IOException e) {throw new RuntimeException(e);}}}
}
6.HttpClient-连接池
如果每次请求都要创建HttpClient,会有频繁创建和销毁的问题,可以使用连接池来解决这个问题。
测试以下代码,并断点查看每次获取的HttpClient都是不一样的。
package org.example;import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;import java.io.IOException;/*** @Author lpc* @Date 2024 03 14 09 38**/
public class Test {public static void main(String[] args) {//创建连接池PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();//设置最大连接数cm.setMaxTotal(100);//设置每个主机的最大连接数cm.setDefaultMaxPerRoute(10);//使用连接池管理器发起请求doGet(cm);}public static void doGet(PoolingHttpClientConnectionManager cm){//不是每次创建新的httpClient,而是从连接池中获取HttpClient对象CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();HttpGet httpGet = new HttpGet("http://www.itcast.cn");CloseableHttpResponse response=null;try {response = httpClient.execute(httpGet);if (response.getStatusLine().getStatusCode()==200){String content = EntityUtils.toString(response.getEntity(), "utf8");System.out.println(content.length());}} catch (IOException e) {throw new RuntimeException(e);}finally {if (response!=null){try {response.close();} catch (IOException e) {throw new RuntimeException(e);}//不能关闭,由连接池管理// httpClient.close();}}}
}
7.设置请求信息
package org.example;import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;import java.io.IOException;/*** @Author lpc* @Date 2024 03 14 09 38**/
public class Test {public static void main(String[] args) {//创建连接池PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();//设置最大连接数cm.setMaxTotal(100);//设置每个主机的最大连接数cm.setDefaultMaxPerRoute(10);//使用连接池管理器发起请求doGet(cm);}public static void doGet(PoolingHttpClientConnectionManager cm){//不是每次创建新的httpClient,而是从连接池中获取HttpClient对象CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();HttpGet httpGet = new HttpGet("http://www.itcast.cn");//配置请求信息RequestConfig config=RequestConfig.custom().setConnectTimeout(1000) //创建连接的最长时间,单位是毫秒.setConnectionRequestTimeout(500)//设置获取连接的最长时间.setSocketTimeout(10*1000)//设置数据传输的最长时间.build();//给请求设置请求信息httpGet.setConfig(config);CloseableHttpResponse response=null;try {response = httpClient.execute(httpGet);if (response.getStatusLine().getStatusCode()==200){String content = EntityUtils.toString(response.getEntity(), "utf8");System.out.println(content.length());}} catch (IOException e) {throw new RuntimeException(e);}finally {if (response!=null){try {response.close();} catch (IOException e) {throw new RuntimeException(e);}//不能关闭,由连接池管理// httpClient.close();}}}
}
8.jsoup介绍.
jsoup是一款Java的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM、CSS以及类似于jQuery的操作方法来取出和操作数据。
jsoup 的主要功能如下:
1.从一个 URL,文件或字符串中解析HTML;
2.使用DOM或CSS选择器来查找、取出数据;
3.可操作HTML元素、属性、文本;
依赖
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency><groupId>org.jsoup</groupId><artifactId>jsoup</artifactId><version>1.13.1</version>
</dependency><dependency><groupId>junit</groupId><artifactId>junit</artifactId><version>4.12</version><scope>test</scope>
</dependency><!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency><groupId>commons-io</groupId><artifactId>commons-io</artifactId><version>2.4</version>
</dependency><!-- lang3 --><dependency><groupId>org.apache.commons</groupId><artifactId>commons-lang3</artifactId><version>3.8.1</version></dependency>
9.jsoup解析url
package jsoup;import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;import java.net.MalformedURLException;
import java.net.URL;/*** @Author lpc* @Date 2024 03 14 10 44**/
public class jsoupTestFirst {@Testpublic void testJsoupUrl() throws Exception {//解析URL地址Document parse = Jsoup.parse(new URL("http://www.itcast.cn"), 10*1000);//获取title的内容Element title = parse.getElementsByTag("title").first();System.out.println(title.text());}}
10.jsoup解析字符串
package jsoup;import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;/*** @Author lpc* @Date 2024 03 14 10 44**/
public class jsoupTestFirst {@Testpublic void testString() throws Exception {//使用工具读取文件,获取字符串String file = FileUtils.readFileToString(new File("D:\\file.html"), "utf8");//解析字符串Document document = Jsoup.parse(file);//获取title的内容String title = document.getElementsByTag("title").first().text();System.out.println(title);}}
11.jsoup解析文件
@Testpublic void testFile() throws Exception {//解析文件Document parse = Jsoup.parse(new File("D:\\file.html"), "utf8");String title = parse.getElementsByTag("title").first().text();System.out.println(title);}
12.所有dom方式获取元素
@Testpublic void testDom() throws Exception {//解析文件,获取Document对象Document parse = Jsoup.parse(new File("D:\\file.html"), "utf8");//获取元素//1.//Element elementById = parse.getElementById("popupMenu");//2.//Element elementById=parse.getElementsByTag("span").first();//3.// Elements elementById = parse.getElementsByClass("city_nav");//4.Elements elementById=parse.getElementsByAttribute("abc");System.out.println(elementById.text());}
13.元素中获取数据
@Testpublic void testData() throws Exception {//解析文件Document parse = Jsoup.parse(new File("D:\\file.html"), "utf8");//根据id获取元素Element elementById = parse.getElementById("test");System.out.println(elementById);//1.从元素中获取idString str1=elementById.id();System.out.println(str1);//2.从元素中获取classNameString str2=elementById.className();System.out.println(str2);//3.从元素获取attr的值String str3=elementById.attr("id");System.out.println(str3);//4。从元素中获取所有属性Attributes attributes = elementById.attributes();System.out.println(attributes);//5.从元素中获取文本内容String str4=elementById.text();System.out.println(str4);}