java读读取取html文文件件,并并获获取取body中中所所有有的的标标签签及及内内容容的的案案例例
这里的获取的是html文件中body 中的所有标签以及内容
package com.lmt.service.file;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import org.springframework.stereotype.Component;
import com.lmt.config.UrlConstants;
Component
public class ParseFile {
/**
* 解析html文件
* param file
* return
*/
public String readHtml(File file){
String body = "";
try {
FileInputStream iStream = new FileInputStream(file);
Reader reader = new InputStreamReader(iStream);
BufferedReader htmlReader = new BufferedReader(reader);
String line;
boolean found = false;
while (!found && (line = htmlReader.readLine()) != null) {
if (line.toLowerCase().indexOf("
的前面可能存在空格found = true;
}
}
found = false;
while (!found && (line = htmlReader.readLine()) != null) {
if (line.toLowerCase().indexOf("
found = true;
} else {
// 果存在图片,则将相对路径转换为绝对路径
String lowerCaseLine = line.toLowerCase();
if (lowerCaseLine.contains("src")) {
//这里是定义图片的访问路径
String directory = "D:/test";
// 果路径名不以反斜杠结尾,则手动添加反斜杠
/*if (!directory.endsWith("\\")) {
directory = directory + "\\";
}*/
// line = line.substring(0, lowerCaseLine.indexOf("src") + 5) + directory +
line.substring(lowerCaseLine.indexOf("src") + 5);
/*String filename = extractFilename(line);
line = line.substri