1.Jsoup可以使用类似于CSS或jQuery的语法来查找和操作元素.
2.实例如下:
public static void main(String[] args) throws Exception{// 创建httpClient实例CloseableHttpClient httpClient = HttpClients.createDefault();// 创建httpGet实例HttpGet httpGet = new HttpGet("http://www.cnblogs.com");httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0");CloseableHttpResponse response = httpClient.execute(httpGet);String content = null;if(response != null){HttpEntity entity = response.getEntity(); content = EntityUtils.toString(entity, "UTF-8"); // 获取网页内容Document document = Jsoup.parse(content); // 解析网页,得到文档对象// 1.查找所有帖子DOMElements elements = document.select(".post_item .post_item_body h3 a");for(Element ele : elements){System.out.println("博客标题:" + ele.text());}System.out.println("------------------------分割线------------------------");// 2.查找带有href属性的a元素Elements hrefElements = document.select("a[href]");for(Element ele : hrefElements){System.out.println(ele.toString());}System.out.println("------------------------分割线------------------------");// 3.查找扩展名为.png的图片DOM节点Elements imgElements = document.select("img[src$=.png]");for(Element ele : imgElements){System.out.println(ele.toString());}System.out.println("------------------------分割线------------------------");// 4.获取tag为title的第一个DOM元素Element titleEle = document.getElementsByTag("title").first();System.out.println("标题为:" + titleEle.text());}if(response != null){response.close();}if(httpClient != null){httpClient.close();}}
3.Jsoup学习地址
开源博客系统-Jsoup