之前用的是旧版的分词包,现在官方出了新版,建议优先使用新版。
从官网下载相应的开发包,然后把下面这几样东西添加到项目工程里面: 1. Data文件夹 2. NLPIR_JNI.dll 3. NLPIR.jar 4. nlpir.properties
添加完那些东西后,需要配置的东西主要为nlpir.properties文件,大概内容如下:
dll_or_so_path=D\:\\Spiliter\\NLPIR_JNI.dll
data_dir_parent_path=.
主要需要配置dll_or_so_path,其值为项目里面NLPIR_JNI.dll的绝对路径。注意在properties文件中反斜杠是转义字符,所以路径里的每个反斜杠都要写成两个反斜杠(如上面的例子)。
搞定后就可以开始写代码了:
package shell;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Properties;
import kevin.zhang.NLPIR;
/**
 * Small wrapper around the NLPIR JNI word segmenter.
 *
 * <p>The constructor initialises the native library; {@link #getSplitString(String, int)}
 * segments a single piece of text and {@link #SliptFile(String, int)} segments a UTF-8 text
 * file line by line into a sibling file whose name carries a {@code _new} suffix.
 */
public class Spliter {
    /** Suffix inserted before the extension of the input file name to form the output name. */
    private static final String OUTPUT_SUFFIX = "_new";

    /** Encoding used to read the input file and write the output file. */
    private static final String FILE_ENCODING = "utf8";

    /** Encoding of the byte arrays exchanged with the native segmenter. */
    private static final String NATIVE_ENCODING = "gbk";

    NLPIR nlpir = null;

    /**
     * Creates the segmenter and initialises the native library.
     *
     * <p>A failed initialisation is only reported on standard output; the instance is still
     * returned, matching the historical behaviour of this class.
     */
    public Spliter() {
        this.nlpir = new NLPIR();
        String argu = "././";
        try {
            // NOTE(review): the second argument (0) presumably selects the GBK encoding and the
            // third is the licence code - confirm against the NLPIR documentation.
            if (!this.nlpir.NLPIR_Init(argu.getBytes("GB2312"), 0, "0".getBytes("GB2312"))) {
                System.out.println("init failed");
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }

    /**
     * Segments every line of a UTF-8 text file and writes the result, one CRLF-terminated line
     * per input line, to a sibling file (for example {@code w.txt} becomes {@code w_new.txt}).
     *
     * <p>I/O errors are printed to standard error and not rethrown. Both files are always closed.
     *
     * @param file path of the UTF-8 input file
     * @param tag  value forwarded unchanged to {@link #getSplitString(String, int)}
     */
    public void SliptFile(String file, int tag) {
        BufferedReader reader = null;
        BufferedWriter writer = null;
        try {
            // Open the input first so that a missing input file does not leave an empty
            // output file behind.
            reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), FILE_ENCODING));
            writer = new BufferedWriter(
                    new OutputStreamWriter(
                            new FileOutputStream(new File(buildOutputPath(file))), FILE_ENCODING));

            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
                String segmented = getSplitString(line, tag);
                if (segmented == null) {
                    // Segmentation failed: keep the raw line rather than writing the text "null".
                    segmented = line;
                }
                writer.write(segmented);
                writer.write("\r\n");
            }
            // Close explicitly on the success path so that flush errors are reported below.
            writer.close();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing an already closed stream is a no-op; this covers the failure paths.
            closeQuietly(writer);
            closeQuietly(reader);
        }
    }

    /**
     * Segments one piece of text with the native library.
     *
     * @param text text to segment; {@code null} yields {@code null}
     * @param tag  value passed straight to {@code NLPIR_ParagraphProcess}
     *             (presumably the part-of-speech tagging switch - confirm against the NLPIR docs)
     * @return the segmented text, or {@code null} when the input is {@code null}, the native call
     *         returns nothing, or the encoding is unavailable
     */
    public String getSplitString(String text, int tag) {
        if (text == null) {
            return null;
        }
        String splitStr = null;
        try {
            byte[] nativeBytes = nlpir.NLPIR_ParagraphProcess(text.getBytes(NATIVE_ENCODING), tag);
            if (nativeBytes != null) {
                int length = nativeBytes.length;
                // The segmenter is written in C++, so its result ends with a '\0' terminator
                // that must not be decoded. Only drop it when it is actually there, so an empty
                // or unterminated result cannot break the conversion.
                if (length > 0 && nativeBytes[length - 1] == 0) {
                    length--;
                }
                splitStr = new String(nativeBytes, 0, length, NATIVE_ENCODING);
                System.out.println("分词结果为: " + splitStr);
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return splitStr;
    }

    /**
     * Derives the output path by inserting {@code _new} before the extension of the file name.
     * Dots in directory names are left alone; a name without an extension gets the suffix
     * appended, so the output never coincides with the input.
     */
    private static String buildOutputPath(String file) {
        int lastSeparator = Math.max(file.lastIndexOf('/'), file.lastIndexOf('\\'));
        int lastDot = file.lastIndexOf('.');
        if (lastDot > lastSeparator) {
            return file.substring(0, lastDot) + OUTPUT_SUFFIX + file.substring(lastDot);
        }
        return file + OUTPUT_SUFFIX;
    }

    /** Closes a resource, ignoring {@code null} and reporting (not propagating) close errors. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Demo entry point: segments {@code d:\w.txt} into {@code d:\w_new.txt}. */
    public static void main(String[] args) {
        Spliter s = new Spliter();
        s.SliptFile("d:\\w.txt", 0);
    }
}