一、 关于设计
(一)数据库
确定外键标识时,需先判断该字段是否有可能被修改,只有不会被修改的字段才适合做外键。例如菜单 id、菜单 code、菜单名三者中,前两者都可做外键;菜单名可能被修改,不应做外键。
二、关于组件
(一)POI
1. 文档页数统计
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.ofdrw.reader.OFDReader;
import org.springframework.web.multipart.MultipartFile;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
@Slf4j
public class LvDocPageCounter { public static final String DOCUMENT_PAGE_TEMP = "DOCUMENT_PAGE_TEMP" ; public static int getPageCount ( String filePath) { String fileType = getFileType ( filePath) ; try { switch ( fileType) { case "pdf" : return getPdfPageCount ( filePath) ; case "docx" : return getDocxPageCount ( filePath) ; case "doc" : return getDocPageCount ( filePath) ; case "ofd" : return getOfdPageCount ( filePath) ; default : log. warn ( "不支持的文件类型:{}" , filePath) ; return 1 ;
} } catch ( Exception e) { log. warn ( "读取文件异常:{},{}" , filePath, e) ; return 0 ; } } private static String getFileType ( String filePath) { int dotIndex = filePath. lastIndexOf ( '.' ) ; if ( dotIndex == - 1 || dotIndex == filePath. length ( ) - 1 ) { log. warn ( "文件名中没有找到扩展名:{}" , filePath) ; return "" ; } return filePath. substring ( dotIndex + 1 ) . toLowerCase ( ) ; } private static int getPdfPageCount ( String filePath) throws IOException { try ( PDDocument document = Loader . loadPDF ( new File ( filePath) ) ) {
int numberOfPages = document. getNumberOfPages ( ) ; document. close ( ) ; return numberOfPages; } } private static int getDocPageCount ( String filePath) throws IOException {
try ( InputStream inputStream = new FileInputStream ( filePath) ) { com. aspose. words. Document doc = new com. aspose. words. Document( inputStream) ; int num = doc. getPageCount ( ) ; doc. cleanup ( ) ; return num; } catch ( Exception e) { e. printStackTrace ( ) ; return 0 ; } } private static int getDocxPageCount ( String filePath) throws IOException {
try ( InputStream inputStream = new FileInputStream ( filePath) ) { com. aspose. words. Document doc = new com. aspose. words. Document( inputStream) ; int num = doc. getPageCount ( ) ; doc. cleanup ( ) ; return num; } catch ( Exception e) { e. printStackTrace ( ) ; return 0 ; } } private static int getOfdPageCount ( String filePath) throws IOException { Path ofdFile = Paths . get ( filePath) ; OFDReader ofdReader = new OFDReader ( ofdFile) ; int numberOfPages = ofdReader. getNumberOfPages ( ) ; ofdReader. close ( ) ; return numberOfPages; } public static Integer getPageCount ( MultipartFile inputStream, String originalFilename) { try ( InputStream inputStream1 = inputStream. getInputStream ( ) ) { return getPageCount ( inputStream1, originalFilename) ; } catch ( IOException e) { log. warn ( "读取文件异常:{},{}" , originalFilename, e) ; return 0 ; } }
}
2. 文本提取
import cn. hutool. core. io. FileUtil ;
import lombok. extern. slf4j. Slf4j ;
import org. apache. commons. io. FilenameUtils ;
import org. apache. pdfbox. pdmodel. PDDocument ;
import org. apache. pdfbox. text. PDFTextStripper ;
import org. apache. poi. hwpf. HWPFDocument ;
import org. apache. poi. hwpf. extractor. WordExtractor ;
import org. apache. poi. xwpf. extractor. XWPFWordExtractor ;
import org. apache. poi. xwpf. usermodel. XWPFDocument ;
import org. ofdrw. converter. export. TextExporter ; import java. io. File ;
import java. io. FileInputStream ;
import java. io. IOException ;
import java. io. InputStream ;
import java. nio. file. Path ;
import java. nio. file. Paths ;
import java. util. concurrent. atomic. AtomicInteger ;
/**
 * Extracts plain text from PDF, Word ({@code doc}/{@code docx}) and OFD documents.
 *
 * <p>The document format is chosen purely from the file-name extension. Extraction is
 * best-effort: failures are logged and an empty string is returned.
 */
@Slf4j
public class LvDocTxTHunter {

    /** Documents with more pages than this limit are skipped. */
    private static final AtomicInteger UPPER_LIMIT = new AtomicInteger(50);

    /**
     * Extracts the text of the document stored at {@code filePath}.
     *
     * @param filePath path of the document; its extension selects the extractor
     * @return the extracted text, or an empty string when the document has more pages
     *         than the limit, has a missing or unsupported extension, or cannot be read
     */
    public static String readText(String filePath) {
        int pageCount = LvDocPageCounter.getPageCount(filePath);
        if (pageCount > UPPER_LIMIT.get()) {
            log.warn("文件过大:{},{}", filePath, pageCount);
            return "";
        }
        String fileType = getFileType(filePath);
        try {
            switch (fileType) {
                case "pdf":
                    return readPdfText(filePath);
                case "doc":
                    return readDocText(filePath);
                case "docx":
                    return readDocxText(filePath);
                case "ofd":
                    return readOfdText(filePath);
                default:
                    log.warn("不支持的文件类型:{}", filePath);
                    return "";
            }
        } catch (IOException | RuntimeException e) {
            // Parsers may reject malformed files with unchecked exceptions; treat those like
            // I/O failures so the method stays best-effort. Only one placeholder is used so
            // the trailing Throwable is logged as the exception rather than as an argument.
            log.warn("读取文件异常:{}", filePath, e);
            return "";
        }
    }

    /**
     * Extracts the text of {@code tempFile}; see {@link #readText(String)}.
     *
     * @param tempFile the document to read
     * @return the extracted text, or an empty string on failure
     */
    public static String readText(File tempFile) {
        return readText(tempFile.getPath());
    }

    /**
     * Returns the lower-cased text after the last {@code '.'} of {@code filePath}, or an
     * empty string (after logging a warning) when there is no dot or the name ends with one.
     */
    private static String getFileType(String filePath) {
        int dotIndex = filePath.lastIndexOf('.');
        if (dotIndex == -1 || dotIndex == filePath.length() - 1) {
            log.warn("文件名中没有找到扩展名:{}", filePath);
            return "";
        }
        return filePath.substring(dotIndex + 1).toLowerCase();
    }

    /** Extracts the text of a PDF; the document is closed by try-with-resources. */
    private static String readPdfText(String filePath) throws IOException {
        // NOTE(review): Loader is not among the imports shown for this class; presumably
        // org.apache.pdfbox.Loader - confirm. It is given a File, as in LvDocPageCounter.
        try (PDDocument document = Loader.loadPDF(new File(filePath))) {
            return new PDFTextStripper().getText(document);
        }
    }

    /** Extracts the text of a binary Word ({@code doc}) document. */
    private static String readDocText(String filePath) throws IOException {
        try (InputStream inputStream = new FileInputStream(filePath);
             HWPFDocument document = new HWPFDocument(inputStream)) {
            return new WordExtractor(document).getText();
        }
    }

    /** Extracts the text of an OOXML Word ({@code docx}) document. */
    private static String readDocxText(String filePath) throws IOException {
        try (InputStream inputStream = new FileInputStream(filePath);
             XWPFDocument document = new XWPFDocument(inputStream)) {
            return new XWPFWordExtractor(document).getText();
        }
    }

    /**
     * Extracts the text of an OFD document by exporting it to a temporary {@code .txt}
     * file, reading that file back as UTF-8 and deleting it afterwards.
     */
    private static String readOfdText(String filePath) throws IOException {
        // NOTE(review): the temporary name is derived from the source base name only, so
        // concurrent calls for equally named files would share one file - confirm callers.
        File txtFile = Paths.get(LvDocPageCounter.DOCUMENT_PAGE_TEMP,
                FilenameUtils.getBaseName(filePath) + ".txt").toFile();
        // Harmless when the directory already exists.
        txtFile.getParentFile().mkdirs();
        try {
            try (TextExporter textExporter = new TextExporter(Paths.get(filePath), txtFile.toPath())) {
                textExporter.export();
            }
            // Read only after the exporter is closed so everything it wrote is on disk.
            return FileUtil.readUtf8String(txtFile);
        } finally {
            if (txtFile.exists() && !txtFile.delete()) {
                log.warn("删除临时文件失败:{}", txtFile);
            }
        }
    }
}
3. 文档转换
/**
 * Scans the {@code font} directory under {@code FileUtil.local} with the preloaded
 * {@code FontLoader}, then prints the names of the fonts it has registered.
 *
 * <p>The names are read reflectively from the loader's {@code fontNamePathMapping} field.
 */
private static void systemInit() {
    FontLoader fontLoader = FontLoader.Preload();
    fontLoader.scanFontDir(Paths.get(FileUtil.local, "font"));

    // Read the loader's font-name mapping through reflection.
    Field mappingField = ReflectUtil.getField(FontLoader.class, "fontNamePathMapping");
    Object rawMapping = ReflectUtil.getFieldValue(fontLoader, mappingField);
    @SuppressWarnings("unchecked")
    Map<String, String> loadedFonts = (Map<String, String>) rawMapping;

    String fontNames = JSONUtil.toJsonStr(loadedFonts.keySet());
    System.out.println("加载字体:" + fontNames);
}
/**
 * Converts an OFD document to PDF with {@code PDFExporterIText}.
 *
 * @param ofdPath  path of the source OFD file
 * @param distPath not used by the conversion; kept so existing callers keep compiling
 * @param pdfPath  path of the PDF file to produce
 * @throws IOException propagated from creating, running or closing the exporter
 */
public static void convertOfdToPDFByBridge(String ofdPath, String distPath, String pdfPath) throws IOException {
    log.debug("解析文件:{}", ofdPath);
    Path ofdFilePath = Paths.get(ofdPath);
    // try-with-resources closes the exporter even when export() throws.
    try (PDFExporterIText exporter = new PDFExporterIText(ofdFilePath, Paths.get(pdfPath))) {
        exporter.export();
    }
}