读取 txt、doc、docx 三种格式文档的文本内容。不同格式采用不同的读取逻辑,以避免文本内容出现乱码。
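这里需要在 Maven 的 pom.xml 中添加以下 Apache POI 依赖(三个依赖的版本最好保持一致):
注意:代码中还使用了 UniversalDetector 来检测 txt 文件编码(通常来自 juniversalchardet 库),下面的 POI 依赖并不包含它,需要另外引入对应依赖。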
<dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>5.2.3</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi</artifactId><version>5.2.3</version></dependency><dependency><groupId>org.apache.poi</groupId><artifactId>poi-scratchpad</artifactId><version>5.2.3</version></dependency>
/*** 根据文本不同的编码格式拿到文本内容* @param file* @return* @throws IOException*/
private String getContent(MultipartFile file) throws IOException {String fileName = file.getOriginalFilename();if (fileName != null) {if (fileName.endsWith(".txt")) {return readTextFile(file.getBytes());} else if (fileName.endsWith(".doc")) {return readDocFile(file);} else if (fileName.endsWith(".docx")) {return readDocxFile(file);}}return "";}/*** 文本编码格式*/private static final List<Charset> FALLBACK_ENCODINGS = Arrays.asList(StandardCharsets.UTF_8,Charset.forName("GBK"),Charset.forName("GB2312"),StandardCharsets.ISO_8859_1);/*** 读取txt格式的文件* @param fileBytes* @return*/private String readTextFile(byte[] fileBytes) {// 使用 UniversalDetector 检测文件编码UniversalDetector detector = new UniversalDetector(null);detector.handleData(fileBytes, 0, fileBytes.length);detector.dataEnd();String encoding = detector.getDetectedCharset();if (encoding != null) {String content = new String(fileBytes, Charset.forName(encoding));if (isValidContent(content)) {return content;}}// 尝试使用多种常见编码解析文件内容for (Charset charset : FALLBACK_ENCODINGS) {String content = new String(fileBytes, charset);if (isValidContent(content)) {return content;}}// 如果所有尝试都失败,返回默认的 UTF-8 编码内容return new String(fileBytes, StandardCharsets.UTF_8);}/*** 读取doc格式的文件* @param file* @return* @throws IOException*/private static String readDocFile(MultipartFile file) throws IOException {try (InputStream inputStream = file.getInputStream();HWPFDocument doc = new HWPFDocument(inputStream)) {WordExtractor extractor = new WordExtractor(doc);return extractor.getText();}}/*** 读取docx格式的文件* @param file* @return* @throws IOException*/private String readDocxFile(MultipartFile file) throws IOException {InputStream inputStream = file.getInputStream();XWPFDocument docx = new XWPFDocument(inputStream);XWPFWordExtractor extractor = new XWPFWordExtractor(docx);String content = extractor.getText();docx.close();inputStream.close();return content;}