1、首先在pom中引入ansj_seg和nlp-lang的依赖包,
ansj_seg包的作用:
这是一个基于n-Gram+CRF+HMM的中文分词的java实现;
分词速度达到每秒钟大约200万字左右(mac air下测试),准确率能达到96%以上;
目前实现了中文分词、中文姓名识别、用户自定义词典、关键字提取、自动摘要、关键字标记等功能;
可以应用到自然语言处理等方面,适用于对分词效果要求高的各种项目;
nlp-lang包的作用(nlp常用工具和组件):
工具:词语标准化、Trie树结构、双数组Trie树、文本断句、HTML标签清理、Viterbi算法;
组件:汉字转拼音、简繁体转换、bloomfilter、指纹去重、SimHash文章相似度计算、词贡献统计、基于内存的搜索提示、WordWeight词频统计,词idf统计,词类别相关度统计;
Maven:
<!-- nlp-lang -->
<dependency><groupId>org.nlpcn</groupId><artifactId>nlp-lang</artifactId><version>1.7.2</version>
</dependency>
<!-- ansj_seg -->
<dependency><groupId>org.ansj</groupId><artifactId>ansj_seg</artifactId><version>5.1.2</version>
</dependency>
2、创建WordUtil类,如下:
package com.mengyao.nlp.util;import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;import org.ansj.app.keyword.KeyWordComputer;
import org.ansj.app.keyword.Keyword;
import org.ansj.app.summary.SummaryComputer;
import org.ansj.app.summary.pojo.Summary;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.NlpAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.commons.lang3.StringUtils;
import org.nlpcn.commons.lang.jianfan.JianFan;
import org.nlpcn.commons.lang.pinyin.Pinyin;
import org.nlpcn.commons.lang.util.WordAlert;
import org.nlpcn.commons.lang.util.WordWeight;/*** * @author mengyao**/
/**
 * Static helpers built on ansj_seg and nlp-lang: article summary, keyword
 * extraction, word segmentation, simplified/traditional Chinese conversion,
 * pinyin conversion, word counting and a simple English check.
 *
 * @author mengyao
 */
public class WordUtil {

    /** Number of keywords requested from {@link KeyWordComputer}. */
    private static final int KEYWORD_LIMIT = 20;

    /**
     * Ad-hoc demo entry point: prints whether two sample date strings match
     * their respective regular expressions.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Four digits, a separator (-, / or .), 1-2 digits, the SAME separator, 1-2 digits.
        System.out.println("2016/06/25".matches("^\\d{4}(\\-|\\/|\\.)\\d{1,2}\\1\\d{1,2}$"));
        // Exactly eight digits.
        System.out.println("20160625".matches("^\\d{8}$"));
    }

    /**
     * Builds an article summary.
     *
     * @param title   article title
     * @param content article body
     * @return the summary text produced by {@link SummaryComputer}
     */
    public static String getSummary(String title, String content) {
        SummaryComputer summaryComputer = new SummaryComputer(title, content);
        Summary summary = summaryComputer.toSummary();
        return summary.getSummary();
    }

    /**
     * Extracts keywords from an article that has a title.
     *
     * @param title   article title
     * @param content article body
     * @return a new mutable list holding the keywords computed by {@link KeyWordComputer}
     */
    public static List<Keyword> getKeyWord(String title, String content) {
        KeyWordComputer<NlpAnalysis> kwc = new KeyWordComputer<NlpAnalysis>(KEYWORD_LIMIT);
        Collection<Keyword> result = kwc.computeArticleTfidf(title, content);
        return new ArrayList<Keyword>(result);
    }

    /**
     * Extracts keywords from an article that has no title.
     *
     * @param content article body
     * @return a new mutable list holding the keywords computed by {@link KeyWordComputer}
     */
    public static List<Keyword> getKeyWord2(String content) {
        KeyWordComputer<NlpAnalysis> kwc = new KeyWordComputer<NlpAnalysis>(KEYWORD_LIMIT);
        Collection<Keyword> result = kwc.computeArticleTfidf(content);
        return new ArrayList<Keyword>(result);
    }

    /**
     * Standard segmentation via {@link ToAnalysis}.
     *
     * @param text text to segment
     * @return the parsed terms whose name is neither null nor blank
     */
    public static List<Term> getToSeg(String text) {
        return nonBlankTerms(ToAnalysis.parse(text));
    }

    /**
     * NLP segmentation via {@link NlpAnalysis}.
     *
     * @param text text to segment
     * @return the parsed terms whose name is neither null nor blank
     */
    public static List<Term> getNlpSeg(String text) {
        return nonBlankTerms(NlpAnalysis.parse(text));
    }

    /**
     * Index segmentation via {@link IndexAnalysis}.
     *
     * @param text text to segment
     * @return the parsed terms whose name is neither null nor blank
     */
    public static List<Term> getIndexSeg(String text) {
        return nonBlankTerms(IndexAnalysis.parse(text));
    }

    /**
     * Converts simplified Chinese to traditional Chinese.
     *
     * @param text text to convert
     * @return the result of {@link JianFan#j2f(String)}
     */
    public static String jian2fan(String text) {
        return JianFan.j2f(text);
    }

    /**
     * Converts traditional Chinese to simplified Chinese.
     *
     * @param text text to convert
     * @return the result of {@link JianFan#f2j(String)}
     */
    public static String fan2jian(String text) {
        return JianFan.f2j(text);
    }

    /**
     * Pinyin without tone marks.
     *
     * @param text text to convert
     * @return every non-null pinyin entry followed by a single space
     *         (so a non-empty result ends with a space)
     */
    public static String pinyin(String text) {
        return joinWithTrailingSpace(Pinyin.pinyin(text));
    }

    /**
     * Pinyin without tone marks, first letter of each entry upper-cased and
     * entries concatenated without any separator.
     *
     * @param text text to convert
     * @return the concatenated, capitalised pinyin entries; null or empty entries are skipped
     */
    public static String pinyinUp(String text) {
        StringBuilder builder = new StringBuilder();
        for (String pinyin : Pinyin.pinyin(text)) {
            if (StringUtils.isEmpty(pinyin)) {
                continue;
            }
            // Locale.ROOT keeps the capitalisation independent of the JVM default
            // locale (e.g. a Turkish locale would otherwise map "i" to a dotted "İ").
            builder.append(pinyin.substring(0, 1).toUpperCase(java.util.Locale.ROOT))
                   .append(pinyin.substring(1));
        }
        return builder.toString();
    }

    /**
     * Pinyin with numeric tone marks.
     *
     * @param text text to convert
     * @return every non-null pinyin entry followed by a single space
     */
    public static String tonePinyin(String text) {
        return joinWithTrailingSpace(Pinyin.tonePinyin(text));
    }

    /**
     * Pinyin with symbolic (diacritic) tone marks.
     *
     * @param text text to convert
     * @return every non-null pinyin entry followed by a single space
     */
    public static String unicodePinyin(String text) {
        return joinWithTrailingSpace(Pinyin.unicodePinyin(text));
    }

    /**
     * Word-frequency statistics.
     *
     * @param words words to count
     * @return the map exported by {@link WordWeight} after every word has been added
     */
    public static Map<String, Double> wordCount(List<String> words) {
        WordWeight ww = new WordWeight();
        for (String word : words) {
            ww.add(word);
        }
        return ww.export();
    }

    /**
     * Word-frequency statistics rendered as strings.
     *
     * @param words words to count
     * @return one {@code "word:value"} string per entry of {@link #wordCount(List)}
     */
    public static List<String> wordCount1(List<String> words) {
        List<String> wcs = new ArrayList<String>();
        for (Entry<String, Double> entry : wordCount(words).entrySet()) {
            wcs.add(entry.getKey() + ":" + entry.getValue());
        }
        return wcs;
    }

    /**
     * Simple language check.
     *
     * @param word text to inspect
     * @return 1 when {@link WordAlert#isEnglish(String)} is true, otherwise 0
     *         (the original author treats 0 as "Chinese")
     */
    public static int language(String word) {
        return WordAlert.isEnglish(word) ? 1 : 0;
    }

    /** Collects the terms of a parse result whose name is neither null nor blank. */
    private static List<Term> nonBlankTerms(Result parse) {
        List<Term> words = new ArrayList<Term>();
        for (Term term : parse) {
            String name = term.getName();
            if (name != null && !name.trim().isEmpty()) {
                words.add(term);
            }
        }
        return words;
    }

    /** Appends every non-null entry followed by one space; the trailing space is intentional (legacy output). */
    private static String joinWithTrailingSpace(List<String> pinyins) {
        StringBuilder builder = new StringBuilder();
        for (String pinyin : pinyins) {
            if (pinyin != null) {
                builder.append(pinyin).append(' ');
            }
        }
        return builder.toString();
    }
}