文章目录
- 前言
- 简介
- 第一步:引入依赖
- 第二步:编写文件解析处理类
- 第三步:Word解析类
- 第四步:PDF解析类
- 第五步:Txt解析类
- 总结
前言
请各位网友尊重本人的原创知识分享,谨记本人博客:南国以南i。
提示:以下是本篇文章正文内容,下面案例可供参考
简介
在日常开发中,我们经常需要解析各类文件,如:.doc、.docx、.pdf、.txt,
读取文件内容后再进行下一步逻辑处理。本文列举了解析上述几类文件的方法。
第一步:引入依赖
友情链接:创建 Spring Boot 项目请移步 点我!点我!点我!
<dependency><groupId>cn.hutool</groupId><artifactId>hutool-all</artifactId><version>5.8.5</version></dependency><!-- 提取pdf中的文字--><dependency><groupId>com.itextpdf</groupId><artifactId>itextpdf</artifactId><version>5.5.6</version></dependency><!--获取pdf文件的总页数--><dependency><groupId>org.apache.pdfbox</groupId><artifactId>pdfbox</artifactId><version>1.8.11</version></dependency><!-- poi --><dependency><groupId>org.apache.poi</groupId><artifactId>poi</artifactId><version>4.0.0</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>4.0.0</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml-schemas</artifactId><version>4.0.0</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-scratchpad</artifactId><version>4.0.0</version></dependency><!-- FileUtils依赖--><dependency><groupId>commons-io</groupId><artifactId>commons-io</artifactId><version>2.4</version></dependency>
第二步:编写文件解析处理类
// 定义静态的文件后缀private static final String SUFFIX_DOC = ".doc";private static final String SUFFIX_DOCX = ".docx";private static final String SUFFIX_PDF = ".pdf";private static final String SUFFIX_TXT = ".txt";/*** .* 根据文件类型解析文件内容** @param file 文件* @return 解析内容*/private static String readFileToString(File file) {StringBuilder readStr = new StringBuilder();String fileType = file.getName().substring(file.getName().lastIndexOf("."));log.debug("解析文件类型为[{}]文件", fileType);switch (fileType) {case SUFFIX_DOC: {readStr.append(WordUtil.readWord(SUFFIX_DOC, file.getAbsolutePath()));break;}case SUFFIX_DOCX: {readStr.append(WordUtil.readWord(SUFFIX_DOCX, file.getAbsolutePath()));break;}case SUFFIX_PDF: {readStr.append(PDFToWordUtil.getTextFromPdf(file.getAbsolutePath()));break;}case SUFFIX_TXT: {readStr.append(TxtUtil.readTxtFile(file.getAbsolutePath()));break;}default: {log.error("文件类型不正确,请上传.doc、.docx、.pdf、.txt后缀文件");throw new RuntimeException("文件类型不正确,请上传.doc、.docx、.pdf、.txt后缀文件");}}return readStr.toString();}
第三步:Word解析类
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;import java.io.FileInputStream;
import java.io.InputStream;/*** Word 操作工具类*/
@Slf4j
public class WordUtil {// 定义静态的文件后缀public static final String SUFFIX_DOC = ".doc";public static final String SUFFIX_DOCX = ".docx";/*** 读取 Word 入口方法,根据后缀,调用方法** @param suffix 文件后缀* @param filePath 文件路径* @return*/public static String readWord(String suffix, String filePath) {String wordStr = "";try (InputStream input = new FileInputStream(filePath)) {// docx 类型if (SUFFIX_DOCX.equals(suffix)) {wordStr = readDocx(input);// doc 类型} else if (SUFFIX_DOC.equals(suffix)) {wordStr = readDoc(input);}} catch (Exception e) {log.error("readWord [{}] is error", filePath);}wordStr = wordStr.replace("\n", "");wordStr = wordStr.replace("\\\\r", "");wordStr = wordStr.replace("\\\\t", "");return wordStr;}/*** 读取 doc 类型,使用 WordExtractor 对象,传递输入流** @param inputStream* @return*/private static String readDoc(InputStream inputStream) {try {String content = "";WordExtractor ex = new WordExtractor(inputStream);content = ex.getText();ex.close();return content;} catch (Exception e) {return null;}}/*** 读取 docx 类型,使用 XWPFDocument 对象,传递输入流** @param inputStream* @return*/private static String readDocx(InputStream inputStream) {try {String content = "";XWPFDocument xdoc = new XWPFDocument(inputStream);XWPFWordExtractor extractor = new XWPFWordExtractor(xdoc);content = extractor.getText();extractor.close();return content;} catch (Exception e) {return null;}}
}
第四步:PDF解析类
@Slf4j
public class PDFToWordUtil {/*** @Description: 提取pdf中的文字 第一种方法* @Param: fileUrlList:地址* @Param: pages:页码* @return: content:提取的文字*/public static String PDFToWord(String fileUrlList) throws IOException {//linux---start---fileUrlList = fileUrlList.replaceAll("\\\\", File.separator);//linux--end----Integer pages = PDFToPage(fileUrlList);String fileName = fileUrlList;//源文件的位置PdfReader reader = null;//PDF读取器reader = new PdfReader(fileName);String content = "";for (int i = 1; i <= pages; i++) {content += PdfTextExtractor.getTextFromPage(reader, i); // 读取PDF中第i页(用哪一页就写几)的文档内容,并转成String}content = content.replace("\n", "");content = content.replace("\\\\r", "");content = content.replace("\\\\t", "");log.debug(content);//控制台打印PDF第一页的内容return content;}/*** @Description: 提取pdf中的页码* @Param: fileUrlList:地址* @return: pages:页码数*/public static int PDFToPage(String fileUrlList) {//linux---start---fileUrlList = fileUrlList.replaceAll("\\\\", File.separator);//linux--end----File file = new File(fileUrlList);PdfReader pdfReader = null;try {pdfReader = new PdfReader(new FileInputStream(file));} catch (IOException e) {e.printStackTrace();}int pages = pdfReader.getNumberOfPages();log.debug("pdf文件的总页数为:" + pages);return pages;}/*** @Description: 提取pdf中的文字 第二种方法* @Param: pdfPath:地址* @return: content:提取的文字*/public static String getTextFromPdf(String pdfPath) {String content = null;try {// 是否排序boolean sort = false;// 开始提取页数int startPage = 1;// 结束提取页数int endPage = Integer.MAX_VALUE;//InputStream input = null;//linux---start---// pdfPath = pdfPath.replaceAll("\\\\", File.separator);//linux--end----File pdfFile = new File(pdfPath);PDDocument document = null;try (InputStream input = new FileInputStream(pdfFile)) {// 加载 pdf 文档PDFParser parser = new PDFParser(input);parser.parse();document = parser.getPDDocument();// 获取内容信息PDFTextStripper pts = new PDFTextStripper();pts.setSortByPosition(sort);endPage = document.getNumberOfPages();log.debug("Total Page: " + 
endPage);pts.setStartPage(startPage);pts.setEndPage(endPage);try {content = pts.getText(document);} catch (Exception e) {throw e;}log.debug("Get PDF Content ...");} catch (Exception e) {throw e;} finally {if (null != document)document.close();}content = content.replace("\n", "");content = content.replace("\\\\r", "");content = content.replace("\\\\t", "");} catch (Exception e) {log.error("getTextFromPdf [{}] is error", pdfPath);}return content;}}
第五步:Txt解析类
@Slf4j
public class TxtUtil {/*** .* 获取文本内容** @return 文件内容*/public static String readTxtFile(String filePath) {String txtStr = "";try {File file = new File(filePath);if (!file.exists()) {log.error("可读文件不存在[{}]", file.getAbsolutePath());}txtStr = FileUtils.readFileToString(file, CharsetUtil.UTF_8);} catch (IOException e) {log.error("readTxtFile [{}] is error", filePath);}txtStr = txtStr.replace("\n", "");txtStr = txtStr.replace("\\\\r", "");txtStr = txtStr.replace("\\\\t", "");return txtStr;}/*** .* 写入数据** @param outPath 输出路径* @param context 内容*/public static void writeFile(String outPath, String context) {try {File file = new File(outPath);FileUtils.write(file, context, CharsetUtil.UTF_8, false);} catch (IOException e) {log.error("writeFile is error {}", e);}}
}
总结
我是南国以南i,记录点滴,每天成长一点点,学习是永无止境的!转载请附原文链接!!!