Java Spring Boot + jsoup:写一段爬虫脚本,将指定地址页面中的图片链接、文本和超链接地址存入自定义的属性类对象中

首先,还是最基本的一步:在 pom.xml 中引入 jsoup 依赖。

<!-- jsoup: HTML fetching and parsing. Pinned to 1.15.3 because earlier
     releases (including 1.14.1) carry published security advisories
     (CVE-2021-37714, CVE-2022-36033); the connect/select/attr/text API used
     by the crawler is unchanged. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>

然后,我们可以在项目中创建一个属性类,我这里就叫 WebContent 了。
参考代码如下:

package com.example.webdom.domain;public class WebContent {private String url;private String text;private String image;public WebContent(String url, String text, String image) {this.url = url;this.text = text;this.image = image;}public String getUrl() {return url;}public String getText() {return text;}public String getImage() {return image;}public void setUrl(String url) {this.url = url;}public void setText(String text) {this.text = text;}public void setImage(String image) {this.image = image;}@Overridepublic String toString() {return "WebContent{" +"a标签链接='" + url + '\'' +", 文本内容='" + text + '\'' +", 图片路径='" + image + '\'' +'}';}
}

这里我们定义了三个变量:url 用来存 a 标签的链接地址;text 用来存 a 标签的文本信息,或者图片的 alt 内容;image 用来存图片的 URL。
同时定义了它们的 get/set 方法;为了方便大家查看输出结果,这里还重写了 toString 方法。
(原文此处有一张配图。)
然后,我们在逻辑类中编写如下代码:

package com.example.webdom.controller;

import com.example.webdom.domain.WebContent;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Minimal jsoup crawler example: downloads one page, collects every
 * {@code <a href>} and {@code <img src>} element into {@link WebContent}
 * objects and prints them to standard output.
 */
public class WebCrawler {

    /**
     * Fetches the hard-coded page and prints one {@link WebContent} per link
     * and per image found on it.
     *
     * <p>Links are stored with their {@code href} and text (image field
     * {@code null}); images are stored with their {@code src} and {@code alt}
     * (url field {@code null}). Attribute values are stored exactly as
     * {@code attr(...)} returns them. If the page cannot be fetched, the
     * {@link IOException} stack trace is printed and nothing else is output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String url = "https://www.baidu.com/?tn=48021271_25_hao_pg"; // URL of the page to crawl

        try {
            // Connect and download the page content through jsoup.
            Document doc = Jsoup.connect(url).get();

            // Result list of property-class objects.
            List<WebContent> webContents = new ArrayList<>();

            // Select every <a> element that carries an href attribute.
            Elements links = doc.select("a[href]");
            for (Element link : links) {
                String linkText = link.text();       // link text
                String linkHref = link.attr("href"); // link URL
                WebContent webContent = new WebContent(linkHref, linkText, null);
                webContents.add(webContent);
            }

            // Select every <img> element that carries a src attribute.
            Elements images = doc.select("img[src]");
            for (Element image : images) {
                String imageUrl = image.attr("src"); // image URL
                String imageAlt = image.attr("alt"); // image alt text
                WebContent webContent = new WebContent(null, imageAlt, imageUrl);
                webContents.add(webContent);
            }

            // Print each collected item, preceded by a separator line.
            for (WebContent webContent : webContents) {
                System.out.println("----------------");
                System.out.println(webContent);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}