java set<int>_java使用Nagao算法实现新词发现、热门词的挖掘

采用Nagao算法统计各个子字符串的频次,然后基于这些频次统计每个字符串的词频、左右邻个数、左右熵、交互信息(内部凝聚度)。

名词解释:

Nagao算法:一种快速的统计文本里所有子字符串频次的算法。详细算法可见http://www.doc88.com/p-664123446503.html

词频:该字符串在文档中出现的次数。出现次数越多越重要。

左右邻个数:文档中该字符串的左边和右边出现的不同的字的个数。左右邻越多,说明字符串成词概率越高。

左右熵:文档中该字符串的左边和右边出现的不同的字的数量分布的熵。类似上面的指标,有一定区别。

交互信息:每次将某字符串分成两部分,左半部分字符串和右半部分字符串,计算其同时出现的概率除以其各自独立出现的概率的乘积,最后取所有的划分里面该比值的最小值。这个值越大,说明字符串内部凝聚度越高,越可能成词。

算法具体流程:

1.  将输入文件逐行读入,按照非汉字([^\u4E00-\u9FA5]+)以及停用词“的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说”,

分成一个个字符串,代码如下:

String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");

停用词可以修改。

2.  获取所有切分后的字符串的左子串和右子串,分别加入左、右PTable

3.  对PTable排序,并计算LTable。LTable记录的是,排序后的PTable中,每个子串与其前一个子串的公共前缀长度(即两者开头连续相同的字符个数)

4.  遍历PTable和LTable,即可得到所有子字符串的词频、左右邻

5.  根据所有子字符串的词频、左右邻结果,输出字符串的词频、左右邻个数、左右熵、交互信息

1.  NagaoAlgorithm.java

package com.algo.word;

import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.FileNotFoundException;

import java.io.FileReader;

import java.io.FileWriter;

import java.io.IOException;

import java.util.ArrayList;

import java.util.Arrays;

import java.util.Collections;

import java.util.HashMap;

import java.util.HashSet;

import java.util.List;

import java.util.Map;

import java.util.Set;

/**
 * New-word / hot-word discovery built on the Nagao substring-counting algorithm.
 *
 * <p>Pipeline:
 * <ol>
 *   <li>split every input line into phrases of Chinese characters and collect every suffix of
 *       each phrase (right PTable) and of each reversed phrase (left PTable);</li>
 *   <li>sort both tables and compute the LTables (common-prefix length with the previous entry);</li>
 *   <li>walk the tables to obtain, for every substring of at most N characters, its term
 *       frequency and its left/right neighbor distributions;</li>
 *   <li>write the words that pass the thresholds, together with neighbor entropies and
 *       mutual information, to the output file.</li>
 * </ol>
 *
 * <p>NOTE(review): files are read and written through FileReader/FileWriter, i.e. with the
 * JVM default charset; confirm that this matches the encoding of the corpus.
 */
public class NagaoAlgorithm {

    /** Thresholds "TF,left neighbors,right neighbors,MI" used by the 3-argument entry point. */
    private static final String DEFAULT_FILTER = "20,3,3,5";

    /** Single characters that, like non-Chinese characters, act as phrase separators. */
    private final static String stopwords = "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说";

    /** Maximum length (in characters) of the substrings that are counted. */
    private int N;

    /** Suffixes of the reversed phrases; sorted in {@link #countLTable()}. */
    private final List<String> leftPTable;

    /** leftLTable[i] = common-prefix length of leftPTable[i-1] and leftPTable[i]. */
    private int[] leftLTable;

    /** Suffixes of the phrases; sorted in {@link #countLTable()}. */
    private final List<String> rightPTable;

    /** rightLTable[i] = common-prefix length of rightPTable[i-1] and rightPTable[i]. */
    private int[] rightLTable;

    /** Total number of characters in all phrases; denominator of the probabilities in countMI. */
    private double wordNumber;

    /** Statistics (TF and neighbor counts) for every substring of length 1..N. */
    private final Map<String, TFNeighbor> wordTFNeighbor;

    private NagaoAlgorithm(){
        //default N = 5
        N = 5;
        leftPTable = new ArrayList<>();
        rightPTable = new ArrayList<>();
        wordTFNeighbor = new HashMap<>();
    }

    /**
     * Returns {@code phrase} with its characters in reverse order.
     * Phrases only contain characters in \u4E00-\u9FA5 (see addToPTable), so no surrogate
     * pairs are involved and StringBuilder.reverse() equals a char-by-char reversal.
     */
    private String reverse(String phrase) {
        return new StringBuilder(phrase).reverse().toString();
    }

    /** Returns the length of the longest common prefix of {@code s1} and {@code s2}. */
    private int coPrefixLength(String s1, String s2){
        int limit = Math.min(s1.length(), s2.length());
        int coPrefixLength = 0;
        while(coPrefixLength < limit && s1.charAt(coPrefixLength) == s2.charAt(coPrefixLength)){
            coPrefixLength++;
        }
        return coPrefixLength;
    }

    /**
     * Splits {@code line} into phrases and adds all suffixes of each phrase to the right PTable
     * and all suffixes of each reversed phrase to the left PTable.
     */
    private void addToPTable(String line){
        //split line on runs of non-Chinese characters and on every single stop character
        String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
        for(String phrase : phrases){
            for(int i = 0; i < phrase.length(); i++){
                rightPTable.add(phrase.substring(i));
            }
            String reversePhrase = reverse(phrase);
            for(int i = 0; i < reversePhrase.length(); i++){
                leftPTable.add(reversePhrase.substring(i));
            }
            wordNumber += phrase.length();
        }
    }

    /** Sorts {@code pTable} in place and returns its LTable (entry 0 is always 0). */
    private int[] sortAndBuildLTable(List<String> pTable){
        Collections.sort(pTable);
        int[] lTable = new int[pTable.size()];
        for(int i = 1; i < pTable.size(); i++){
            lTable[i] = coPrefixLength(pTable.get(i-1), pTable.get(i));
        }
        return lTable;
    }

    /** Sorts both PTables and computes the corresponding LTables. */
    private void countLTable(){
        rightLTable = sortAndBuildLTable(rightPTable);
        leftLTable = sortAndBuildLTable(leftPTable);
        System.out.println("Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable");
    }

    /**
     * Walks the sorted PTables and LTables and fills {@link #wordTFNeighbor} with the term
     * frequency and the left/right neighbor distribution of every substring of length 1..N.
     */
    private void countTFNeighbor(){
        //get TF and right neighbor
        for(int pIndex = 0; pIndex < rightPTable.size(); pIndex++){
            String phrase = rightPTable.get(pIndex);
            //prefixes no longer than rightLTable[pIndex] were already handled at an earlier entry
            for(int length = 1 + rightLTable[pIndex]; length <= N && length <= phrase.length(); length++){
                String word = phrase.substring(0, length);
                TFNeighbor tfNeighbor = new TFNeighbor();
                tfNeighbor.incrementTF();
                if(phrase.length() > length){
                    tfNeighbor.addToRightNeighbor(phrase.charAt(length));
                }
                //the following entries share this prefix as long as their LTable value >= length
                for(int lIndex = pIndex+1; lIndex < rightLTable.length; lIndex++){
                    if(rightLTable[lIndex] < length) break;
                    tfNeighbor.incrementTF();
                    String coPhrase = rightPTable.get(lIndex);
                    if(coPhrase.length() > length){
                        tfNeighbor.addToRightNeighbor(coPhrase.charAt(length));
                    }
                }
                wordTFNeighbor.put(word, tfNeighbor);
            }
        }
        //get left neighbor
        for(int pIndex = 0; pIndex < leftPTable.size(); pIndex++){
            String phrase = leftPTable.get(pIndex);
            for(int length = 1 + leftLTable[pIndex]; length <= N && length <= phrase.length(); length++){
                //a prefix of a reversed phrase, reversed back, is a substring of the phrase,
                //so the loop above has already registered it
                String word = reverse(phrase.substring(0, length));
                TFNeighbor tfNeighbor = wordTFNeighbor.get(word);
                if(phrase.length() > length){
                    tfNeighbor.addToLeftNeighbor(phrase.charAt(length));
                }
                for(int lIndex = pIndex + 1; lIndex < leftLTable.length; lIndex++){
                    if(leftLTable[lIndex] < length) break;
                    String coPhrase = leftPTable.get(lIndex);
                    if(coPhrase.length() > length){
                        tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length));
                    }
                }
            }
        }
        System.out.println("Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor");
    }

    /**
     * Returns the mutual information of {@code word}: the minimum, over every split of the word
     * into a left and a right part, of P(word) / (P(left) * P(right)), where each probability
     * is TF / wordNumber. Words of length 0 or 1 yield 0.
     */
    private double countMI(String word){
        if(word.length() <= 1) return 0;
        double coProbability = wordTFNeighbor.get(word).getTF()/wordNumber;
        double minMI = Double.POSITIVE_INFINITY;
        for(int pos = 1; pos < word.length(); pos++){
            double leftProbability = wordTFNeighbor.get(word.substring(0, pos)).getTF()/wordNumber;
            double rightProbability = wordTFNeighbor.get(word.substring(pos)).getTF()/wordNumber;
            minMI = Math.min(minMI, coProbability/(leftProbability*rightProbability));
        }
        return minMI;
    }

    /**
     * Writes one CSV line per accepted word to {@code out}:
     * word, TF, left neighbor number, right neighbor number, left neighbor entropy,
     * right neighbor entropy, mutual information.
     *
     * <p>A word is accepted when it is longer than one character, is not listed in the stop-word
     * file and each of TF, left neighbor number, right neighbor number and MI is strictly greater
     * than the corresponding threshold.
     *
     * @param out       path of the output file
     * @param stopList  path of the stop-word file, one word per line (lines of length <= 1 are ignored)
     * @param threshold four integers as strings: TF, left neighbors, right neighbors, MI
     * @throws NumberFormatException if a threshold is not an integer (checked before any file is opened)
     * @throws RuntimeException wrapping the IOException if reading or writing fails
     */
    private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold){
        //parse once instead of once per word
        int minTF = Integer.parseInt(threshold[0]);
        int minLeftNeighbor = Integer.parseInt(threshold[1]);
        int minRightNeighbor = Integer.parseInt(threshold[2]);
        int minMI = Integer.parseInt(threshold[3]);
        try {
            //read stop words file
            Set<String> stopWords = new HashSet<>();
            try (BufferedReader br = new BufferedReader(new FileReader(stopList))) {
                String line;
                while((line = br.readLine()) != null){
                    if(line.length() > 1){
                        stopWords.add(line);
                    }
                }
            }
            //output words TF, neighbor info, MI
            try (BufferedWriter bw = new BufferedWriter(new FileWriter(out))) {
                for(Map.Entry<String, TFNeighbor> entry : wordTFNeighbor.entrySet()){
                    String word = entry.getKey();
                    if(word.length() <= 1 || stopWords.contains(word)) continue;
                    TFNeighbor tfNeighbor = entry.getValue();
                    int tf = tfNeighbor.getTF();
                    int leftNeighborNumber = tfNeighbor.getLeftNeighborNumber();
                    int rightNeighborNumber = tfNeighbor.getRightNeighborNumber();
                    //cheap checks first; MI is only computed for words that can still be accepted
                    if(tf <= minTF || leftNeighborNumber <= minLeftNeighbor || rightNeighborNumber <= minRightNeighbor) continue;
                    double mi = countMI(word);
                    if(mi <= minMI) continue;
                    StringBuilder sb = new StringBuilder();
                    sb.append(word);
                    sb.append(",").append(tf);
                    sb.append(",").append(leftNeighborNumber);
                    sb.append(",").append(rightNeighborNumber);
                    sb.append(",").append(tfNeighbor.getLeftNeighborEntropy());
                    sb.append(",").append(tfNeighbor.getRightNeighborEntropy());
                    sb.append(",").append(mi).append("\n");
                    bw.write(sb.toString());
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        System.out.println("Info: [Nagao Algorithm Step 4]: having saved to file");
    }

    /** Runs steps 1-4 of the pipeline on {@code nagao}; shared by both applyNagao overloads. */
    private static void run(NagaoAlgorithm nagao, String[] inputs, String out, String stopList, String[] threshold){
        //step 1: add phrases to PTable
        for(String in : inputs){
            try (BufferedReader br = new BufferedReader(new FileReader(in))) {
                String line;
                while((line = br.readLine()) != null){
                    nagao.addToPTable(line);
                }
            } catch (IOException e) {
                //keep the cause and the file name so the failure can be diagnosed
                throw new RuntimeException("Failed to read input file: " + in, e);
            }
        }
        System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable");
        //step 2: sort PTable and count LTable
        nagao.countLTable();
        //step3: count TF and Neighbor
        nagao.countTFNeighbor();
        //step4: save TF NeighborInfo and MI
        nagao.saveTFNeighborInfoMI(out, stopList, threshold);
    }

    /**
     * Applies the Nagao algorithm with the default N (5) and the default thresholds "20,3,3,5".
     *
     * @param inputs   paths of the input text files
     * @param out      path of the output file
     * @param stopList path of the stop-word file
     * @throws RuntimeException if a file cannot be read or written
     */
    public static void applyNagao(String[] inputs, String out, String stopList){
        run(new NagaoAlgorithm(), inputs, out, stopList, DEFAULT_FILTER.split(","));
    }

    /**
     * Applies the Nagao algorithm with a caller-supplied N and thresholds.
     * If {@code filter} does not consist of exactly 4 comma-separated fields, an error is
     * printed and nothing is processed.
     *
     * @param inputs   paths of the input text files
     * @param out      path of the output file
     * @param stopList path of the stop-word file
     * @param n        maximum substring length to count
     * @param filter   thresholds "TF,left neighbors,right neighbors,MI", e.g. "20,3,3,5"
     * @throws RuntimeException if a file cannot be read or written
     */
    public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter){
        NagaoAlgorithm nagao = new NagaoAlgorithm();
        nagao.setN(n);
        String[] threshold = filter.split(",");
        if(threshold.length != 4){
            System.out.println("ERROR: filter must have 4 numbers, seperated with ',' ");
            return;
        }
        run(nagao, inputs, out, stopList, threshold);
    }

    private void setN(int n){
        N = n;
    }

    /** Sample run on hard-coded local paths. */
    public static void main(String[] args) {
        String[] ins = {"E://test//ganfen.txt"};
        applyNagao(ins, "E://test//out.txt", "E://test//stoplist.txt");
    }
}

2. TFNeighbor.java

package com.algo.word;

import java.util.HashMap;

import java.util.Map;

/**
 * Per-word statistics: term frequency plus how often each character occurs immediately
 * to the left and to the right of the word.
 */
public class TFNeighbor {

    /** Number of occurrences of the word. */
    private int tf;

    /** Character seen directly left of the word -> number of times it was seen. */
    private final Map<Character, Integer> leftNeighbor;

    /** Character seen directly right of the word -> number of times it was seen. */
    private final Map<Character, Integer> rightNeighbor;

    TFNeighbor(){
        leftNeighbor = new HashMap<Character, Integer>();
        rightNeighbor = new HashMap<Character, Integer>();
    }

    /** Increments the count of {@code word} in {@code neighbor}, starting at 1 when absent. */
    private static void addTo(Map<Character, Integer> neighbor, char word){
        Integer number = neighbor.get(word);
        neighbor.put(word, number == null ? 1 : 1 + number);
    }

    /**
     * Returns the Shannon entropy (natural logarithm) of the count distribution in
     * {@code neighbor}, computed as log(sum) - (sum of n*log(n)) / sum; 0 when the map is empty.
     */
    private static double entropy(Map<Character, Integer> neighbor){
        double weightedLog = 0;
        int sum = 0;
        for(int number : neighbor.values()){
            weightedLog += number*Math.log(number);
            sum += number;
        }
        if(sum == 0) return 0;
        return Math.log(sum) - weightedLog/sum;
    }

    /** Records one occurrence of {@code word} as a left neighbor. */
    public void addToLeftNeighbor(char word){
        addTo(leftNeighbor, word);
    }

    /** Records one occurrence of {@code word} as a right neighbor. */
    public void addToRightNeighbor(char word){
        addTo(rightNeighbor, word);
    }

    /** Increments the term frequency by one. */
    public void incrementTF(){
        tf++;
    }

    /** @return the number of distinct left-neighbor characters */
    public int getLeftNeighborNumber(){
        return leftNeighbor.size();
    }

    /** @return the number of distinct right-neighbor characters */
    public int getRightNeighborNumber(){
        return rightNeighbor.size();
    }

    /** @return the entropy of the left-neighbor distribution, 0 if there are no left neighbors */
    public double getLeftNeighborEntropy(){
        return entropy(leftNeighbor);
    }

    /** @return the entropy of the right-neighbor distribution, 0 if there are no right neighbors */
    public double getRightNeighborEntropy(){
        return entropy(rightNeighbor);
    }

    /** @return the term frequency */
    public int getTF(){
        return tf;
    }
}

3. Main.java

package com.algo.word;

/**
 * Command-line entry point for the Nagao new-word discovery tool.
 */
public class Main {

    private static final String USAGE =
        "Usage: Main <input1[,input2,...]> <output> <stopList> [<N> <filter>]\n"
        + "  filter: 4 comma-separated thresholds \"TF,leftNeighbors,rightNeighbors,MI\", e.g. \"20,3,3,5\"";

    /**
     * Runs the algorithm with 3 or 5 command-line arguments; any other argument count prints
     * a usage message to stderr and does nothing else.
     *
     * @param args see the comments in the body
     * @throws NumberFormatException if the 4th argument (N) is not an integer
     */
    public static void main(String[] args) {
        //if 3 arguments, first argument is input files splitting with ','
        //second argument is output file
        //output 7 columns split with ',' , like below:
        //word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information
        //third argument is stop words list
        if(args.length == 3) {
            NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2]);
        }
        //if 5 arguments, 4th argument is the NGram parameter N
        //5th argument is threshold of output words, default is "20,3,3,5"
        //output TF > 20 && left neighbor number > 3 && right neighbor number > 3 && MI > 5
        else if(args.length == 5) {
            NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2], Integer.parseInt(args[3]), args[4]);
        }
        //previously a wrong argument count was a silent no-op
        else {
            System.err.println(USAGE);
        }
    }
}

以上所述就是本文的全部内容了,希望大家能够喜欢。

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/244059.shtml

如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

朋友生日聚会上的演唱者

这是在北京簋街参加一个朋友的生日聚会上见到的一个歌者。他很幽默也很敬业,只要10元钱,你就可以任意点一首歌请他唱。

草原上的云

草原上的云是有生命的。因为天空蓝得纯正,云便白得干净璀璨。它像细胞一样随着风可以分裂和变幻出各种不同的形态,有时是男人和女人,有时是牛羊马群,有时是移动的山,有时是凝固的海。所以,云常常是故乡孩子…

java实体设置扩展属性setextattributes_transactionAttributes各属性意义及配置

在使用spring提供的JpaTemplate进行查询时,如果数据量超过100 条,查询效率就会明显降低。由于开始时使用JPA内部的双向关联,造成各实体内部关联过多,从而影响所有的操作,因此怀疑是因为JPA的关联关系所致。但是去掉关联…

旅途中的人物之一:卖菜老人

这是我在去往呼伦贝尔的路上,见到的一位卖菜的老人。他卖的是油豆角和旱黄瓜。这两种蔬菜在北京很难见到。前者炖排骨,很香;后者有真正的黄瓜味道。当看我要给他照相的时候,他竟然忽地站起来,立正,就这样看…

旅途人物之二:幸福的孩子们

我的外甥女紫薇&#xff0c;北京国际少年合唱团的歌手。她与我们一同去往内蒙。在去乌兰浩特的乡村——太平山的路上&#xff0c;我在山坡的草丛里抓到一只蝈蝈&#xff0c;送给她。这是我童年最喜欢的一种昆虫&#xff0c;与中原的蝈蝈的叫声不同&#xff0c;且会飞&#xff0…

java 可忽略参数_如何使用“&var =”忽略URL参数

我在控制器中作为可选参数&#xff1a;GetPostalCodeLocation(string postalCode, string postalCodeExt, string place "", string district "")如果在URL我没有把它区域工作&#xff0c;但在应用程序中生成此URL&#xff1a;Url:"http://localhos…

也说电影《魔比斯环》

让人期待已久迄今为止投资规模最大(1.3亿人民币)的国产动画大片《魔比斯环》终于登场,可是观众的反应却不甚理想。这是我在看首映式之前就有所预料的,因为好莱坞的势力实在强悍,别说是中国的动画,就算是欧洲…

有一种小说叫“纯爱”:为“纯爱小说系列写的序言

一谈到女性文学&#xff0c;便总与“女权”、“身体”、“性”&#xff0c;甚至“胸口”沾边&#xff0c;让人不禁嗅到浓浓的火药味儿。从女性身份的隐藏和丧失&#xff0c;到夸大女性的欲望和权力&#xff0c;女性写作一路走来&#xff0c;恰恰缺少的是两极之间有关女人“原质…

java教程菜鸟教程组合模式,组合实体模式

组合实体模式组合实体模式(Composite Entity Pattern)用在 EJB 持久化机制中。一个组合实体是一个 EJB 实体 bean&#xff0c;代表了对象的图解。当更新一个组合实体时&#xff0c;内部依赖对象 beans 会自动更新&#xff0c;因为它们是由 EJB 实体 bean 管理的。以下是组合实体…

女垒姑娘最漂亮,青年女足最顽强

昨天中午看了一场世界女子垒球锦标赛&#xff0c;发现中国的垒球姑娘真是个个长得都漂亮呢&#xff0c;在女子集体运动队里&#xff0c;我敢说女垒恐怕是整体最美的队伍。昨天对手是实力最弱的南非队&#xff0c;因此美女们的发挥更潇洒&#xff0c;姿态也最优美&#xff0c;就…

大漠印象:鄂尔多斯

近日再去内蒙古西部&#xff0c;到了辉腾锡勒草原、包头、鄂尔多斯等地&#xff0c;拍了些照片。大漠落日圆。可在我眼里这仿佛是外太空的某个无生物星球&#xff0c;荒凉而又陌生。天体学家说&#xff1a;地球的未来就是这样的景象。沙漠的纹路像是骤然间凝固的水面&#xff0…

php 命令行 php.ini,php web环境和命令行环境下查找php.ini的方法分享

php web环境和命令行环境下查找php.ini的位置php.ini 是php运行的配置文件&#xff0c;不是一个必须使用的文件。不过通常php程序运行都需要加载php.ini文件。 php.ini 提供了一些重要的参数。比如运行的时候出错的时候是否报错&#xff0c;运行的内存最大是多少 和一些php扩展…

在辉腾锡勒我见到了狼

9月初在辉腾锡勒的黄花沟见到了两只野狼。起初我有点不相信&#xff0c;因为我已经30来年没见过野狼了。在我的家乡呼伦贝尔草原狼几乎绝迹。此刻&#xff0c;它们悠闲地从我们身边走过&#xff0c;完全无视人类的存在。带路的当地老乡告诉我&#xff0c;由于这几年禁止打狼&am…

不同时期的同学的聚会

北京图图蒙古食府。大学同学。内蒙呼伦贝尔。小学同学。内蒙呼伦贝尔。初中同学。内蒙包头固阳村农家菜。北京高中同学。

java 张量运算,博客 | Tensorflow_01_运算符与张量值

feed_dict 方法它不止是一个方法&#xff0c;同时还是一个观念&#xff0c;让我们可以更加明确的了解到节点创立的时候&#xff0c;并不包含了让节点执行动作的过程&#xff0c;也因为 Tensorflow 这样的特性&#xff0c;我们可以让流程先创立好&#xff0c;最后等到要运算真正…

老外肖像(一)

在北京朝阳公园见到的加拿大小女孩。这是那个女孩的弟弟&#xff0c;非常顽皮。一个叫奥利沃的匈牙利人&#xff0c;却在中国教授汉语。我说他有欧洲古典诗人的气质&#xff0c;他说他的梦想是做飞行员。这个女孩是第一次来中国的德国人&#xff0c;很像鲁本斯画中的人物。奥利…

哲学是一种生活

《庄子》里有一则寓言&#xff0c;说到一棵巨大的树&#xff0c;这棵树既不能做家具&#xff0c;也不能盖房子。有人问&#xff1a;这样的树有什么用&#xff1f;庄子回答&#xff1a;它的用处就是没有用。如果它能够做家具、盖房子&#xff0c;它早就被人砍伐了。正是因为无用…

php如何解决并发问题,PHP如何解决并发问题

PHP如何解决并发问题有个问题&#xff1a;一个进程开启事务对表的某一行做了修改&#xff0c;但还没有提交&#xff0c;另一个进程查询该行数据&#xff0c;获取到的是原始的&#xff0c;这时候上面的事物提交了&#xff0c;我再用这个原始数据的时候就有问题……那我们该怎么解…