【爬虫】自动下载指定网站全部图片（Java版）

爬虫是一种自动化程序，能够模拟人类的浏览行为，访问网络资源并提取所需数据。它可以通过发送HTTP请求获取网页内容，并对网页进行解析和数据提取。

在大多数时候，提到爬虫我们就会想到 Python，其实 Java 也是可以实现爬虫的。

Java提供了很多网络编程相关的类库，但为了方便我们编写爬虫程序，可以引入一些第三方库，如HttpClient、Jsoup等。这些库提供了更简洁、易用的接口，帮助我们快速实现爬虫功能。

Tips：

下方代码中的网址请自行更替
图片下载到指定文件夹，文件夹需要是存在的
图片名称（前缀）可以自定义，程序会自动编号
本项目使用 Maven 管理依赖，若是不使用 Maven 则需要自行下载 jsoup 的 jar 包

pom 文件：

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>JavaCrawler</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- Java网络爬虫工具 -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.14.3</version>
        </dependency>
    </dependencies>
</project>

完整代码：

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;/*** 爬取指定网址上的图片** Tips：* 若出现 403 错误则可能是由于“写入”访问被禁止而造成的，* 当试图将文件上载到目录或在目录中修改文件，但该目录不允许"写"访问时就会出现此种错误** @author 秋玄* @version 1.0* @since 1.0*/
public class App {public static void main(String[] args) {// 网站地址String site = "https://xxx.xx.com/";// 图片保存路径String filePath = "F://test";// 自定义图片名称String fileName = "img";downloadImg(site,filePath,fileName);}/*** 获取指定网站上所有图片* @param website       指定网站的完整域名 包括请求协议，例如：www.xxx.com* @param filePath      图片存放路径 例如：F://test* @param fileName      图片名称 例如：xxx*/private static void downloadImg(String website,String filePath,String fileName) {List<String> urlList = new ArrayList<>();try {// 获取网站图片的 src// 连接到指定网站Connection connection = Jsoup.connect(website);// 获取网站页面上所有的 DOM 元素Document document = connection.get();// 获取所有的 img 元素Elements imgs = document.getElementsByTag("img");// 遍历 imgsfor (int i = 0; i < imgs.size(); i++) {// 获取 img 元素的 src 属性String src = imgs.get(i).attr("src");// url地址以 “//” 开始，需要拼接请求协议if (src.startsWith("//")){src = "https:" + src;}// 路径为 空 或 “about:blank” 则不添加到 List 中if (src.length() != 0 && !"about:blank".equals(src)) {urlList.add(src);}// 下载图片getImg(urlList,filePath,fileName);}} catch (IOException e) {throw new RuntimeException(e);}}/*** 下载指定 URL 的图片* @param imgURL        图片地址的 list 集合* @param filePath      图片存放路径* @param fileName      图片文件名称*/private static void getImg(List<String> imgURL,String filePath,String fileName){InputStream in = null;FileOutputStream fos = null;// 遍历图片地址 list 集合for (int i = 0; i < imgURL.size(); i++) {try {URL url = new URL(imgURL.get(i));in = url.openStream();// 拼接文件存放路径及文件名String path = appendPath(filePath,fileName,i);// 将图片写入本地fos = new FileOutputStream(path);byte[] bytes = new byte[1024];int count = in.read(bytes);while(count != -1){fos.write(bytes,0,count);fos.flush();count = in.read(bytes);}} catch (IOException e) {throw new RuntimeException(e);}finally {// 释放资源if (in != null) {try {in.close();} catch (IOException e) {throw new RuntimeException(e);}}if (fos != null) {try {fos.close();} catch (IOException e) {throw new RuntimeException(e);}}}}}/*** 拼接文件存放路径及文件名* @param filePath      文件路径* @param fileName      文件名* 
@param i             文件编号* @return              文件完整路径* 格式：文件路径 + 文件名称 + _ + 文件编号 + 文件后缀（.jpg）*/private static String appendPath(String filePath,String fileName,Integer i) {return filePath + "//" + fileName + "_" + (i + 1) + ".jpg";}
}

一叶知秋，奥妙玄心

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.mzph.cn/news/202574.shtml

如若内容造成侵权/违法违规/事实不符，请联系多彩编程网进行投诉反馈email:809451989@qq.com，一经查实，立即删除！