TF-IDF + K-Means Chinese Text Clustering Example - Scala

This demo is for reference only.

  • Uses Spark 1.6
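
The overall flow is: read one document per line from HDFS, segment it with Ansj and drop stop words, turn the tokens into term-frequency vectors with HashingTF, rescale them with IDF, and cluster the resulting vectors with MLlib's KMeans. Before the full listing, here is a minimal standalone sketch of just the HashingTF + IDF feature steps; the toy sentences, column names and local master are illustrative assumptions, not taken from the original demo.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.{HashingTF, IDF}
import org.apache.spark.sql.SQLContext

object TfIdfSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("tfidf-sketch").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Pre-tokenized "documents"; in the real demo tokenizer2 (Ansj) produces these Seq[String]s
    val docs = Seq(
      (0, Seq("spark", "机器学习", "聚类")),
      (1, Seq("spark", "流", "计算")),
      (2, Seq("机器学习", "特征", "向量"))
    ).toDF("label", "sentence")

    // Term frequencies via the hashing trick, then IDF rescaling
    val tf = new HashingTF().setInputCol("sentence").setOutputCol("rawFeatures").setNumFeatures(1 << 18)
    val tfData = tf.transform(docs)
    val idfModel = new IDF().setInputCol("rawFeatures").setOutputCol("features").fit(tfData)
    idfModel.transform(tfData).select("label", "features").show(false)

    sc.stop()
  }
}

The complete demo, including Ansj segmentation, stop-word loading from HDFS and the K-Means step, follows.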

import java.io.{BufferedReader, InputStreamReader}
import java.util.Arrays

import org.ansj.splitWord.analysis.ToAnalysis
import org.apache.hadoop.fs.FSDataInputStream
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.{SparseVector, Vectors}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.storage.StorageLevel

import scala.collection.mutable.ArrayBuffer
/** Created by Zsh on 1/22 0022. */
object tfid {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TF-IDF Clustering").setMaster("yarn-client")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Inline test data used during development; the HDFS text file below replaces it:
    //    val sentenceData = sqlContext.createDataFrame(Seq(
    //      (1, tokenizer2("利用特征向量和标签产生一个预测模型。 MLlib使用Pipeline代表这个工作流")),
    //      (2, tokenizer2("这些变化发生在ML包里面。MLlib模块下现在有两个包:MLlib和ML。ML把整个机器学")),
    //      (3, tokenizer2("Mahout是hadoop的一个机器学习库,主要的编程模型是MapReduce;Spark ML则是基于Spark的机器学习,Spark自身拥有MLlib作为机器学习库。")),
    //      (3, tokenizer2("日本东京电视台的人气综艺节目《开运鉴定团》主要对古董进行鉴定不过偶尔也会发生失误的状况节目开播以来最重大的发现日前他们在节目里鉴定")),
    //      (3, tokenizer2("对许多人来说,看着老爸老妈现在的样子,大概很难想象他们曾经也是青春靓丽,甚至颜值惊人。然而,谁没年轻过呢?对于这个话题,最近又有不"))
    //    )).toDF("label", "sentence")

    // One document per line; every row gets a dummy label of 0
    val rawTrainingData = sc.textFile("/wcc.txt")
    val dataFrame = rawTrainingData
      .map(x => (0, tokenizer2(x)))
      .persist(StorageLevel.MEMORY_AND_DISK)
      .toDF("label", "sentence")
    // (the ml Tokenizer could be used here instead for whitespace-delimited text)

    val numClusters = 10     // number of clusters (k)
    val numIterations = 30
    val runTimes = 3
    var clusterIndex: Int = 0

    // Term frequencies via the hashing trick
    val hashingTF = new HashingTF()
      .setInputCol("sentence")
      .setOutputCol("rawFeatures")
      .setNumFeatures(100000)
    val featurizedData = hashingTF.transform(dataFrame)
    featurizedData.show(false)
    println(featurizedData.count())

    // Rescale term frequencies by inverse document frequency
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    println(rescaledData)
    rescaledData.select("features", "label").show(false)

    // Pull the feature vectors (plus the original sentence) out of the DataFrame
    val value = rescaledData.select("features", "label", "sentence").map {
      case Row(features: org.apache.spark.mllib.linalg.Vector, label: Int, sentence) =>
        (features, sentence)
    }

    // Train
    val clusters: KMeansModel =
      KMeans.train(value.map(_._1), numClusters, numIterations, runTimes)
    println("Cluster Number:" + clusters.clusterCenters.length)
    println("Cluster Centers Information Overview:")
    clusters.clusterCenters.foreach(x => {
      println("cluster centroid vector " + clusterIndex + ":")
      println(x)
      clusterIndex += 1
    })

    // Cost (within-set sum of squared distances) of this run; lower is better
    val kMeansCost = clusters.computeCost(value.map(_._1))
    println("K-Means Cost: " + kMeansCost)

    // Predict which cluster each document belongs to and save "clusterId:sentence"
    value.map(x => {
      clusters.predict(x._1) + ":" + x._2.toString
    }).saveAsTextFile("/0123")
  }

  // Segment a line with Ansj, then drop @mentions, URLs and stop words
  def tokenizer2(line: String): Seq[String] = {
    val reg1 = "@\\w{2,20}:".r
    val reg2 = "http://[0-9a-zA-Z/\\?&#%$@\\=\\\\]+".r
    AnsjSegment(line).split(",")
      .filter(_ != null)
      .filter(token => !reg1.pattern.matcher(token).matches)
      .filter(token => !reg2.pattern.matcher(token).matches)
      .filter(token => !stopwordSet.contains(token))
      .toSeq
  }

  // Ansj segmentation: keep only nouns/verbs/adjectives/numerals/time words of length >= 2
  def AnsjSegment(line: String): String = {
    val StopNatures = """w","",null,"s", "f", "b", "z", "r", "q", "d", "p", "c", "uj", "ul","en", "y", "o", "h", "k", "x"""
    val KeepNatures = List("n", "v", "a", "m", "t")
    val StopWords = Arrays.asList("的", "是", "了")
    // A FilterRecognition loaded with the stop words / stop natures above could be applied here instead
    val words = ToAnalysis.parse(line)
    val word = ArrayBuffer[String]()
    for (i <- Range(0, words.size())) {
      if (KeepNatures.contains(words.get(i).getNatureStr.substring(0, 1)) && words.get(i).getName.length() >= 2)
        word += words.get(i).getName
    }
    word.mkString(",")
  }

  var stopwordSet: Set[String] = getStopFile()

  // Load the stop-word list from HDFS; on the cluster use the configured path below.
  // ConfigurationManager, Constants and HDFSUtil are utility classes from the author's project (not shown in this post).
  def getStopFile(): Set[String] = {
    var inputStream: FSDataInputStream = null
    var bufferedReader: BufferedReader = null
    val stopword = ArrayBuffer[String]()
    try {
      // Open an HDFS input stream (see the previous post)
      val stopWordsCn = ConfigurationManager.getProperty(Constants.STOP_WORDS_CN)
      inputStream = HDFSUtil.getFSDataInputStream(stopWordsCn)
      // Wrap it in a buffered reader and read one stop word per line
      bufferedReader = new BufferedReader(new InputStreamReader(inputStream))
      var lineTxt: String = bufferedReader.readLine()
      while (lineTxt != null) {
        stopword += lineTxt
        lineTxt = bufferedReader.readLine()
      }
      stopwordSet = stopword.toSet
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      if (bufferedReader != null) bufferedReader.close()
      if (inputStream != null) HDFSUtil.close(inputStream)
    }
    stopwordSet
  }
}
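
The demo fixes numClusters = 10. Since computeCost always decreases as k grows, a common way to pick k is to compare the cost over several candidate values and look for the "elbow". Below is a hedged sketch of that idea; the helper name, the candidate values, and the call with value.map(_._1) are assumptions for illustration, not part of the original post.

import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

def chooseK(features: RDD[Vector], candidates: Seq[Int], iterations: Int = 30): Unit = {
  features.cache()   // KMeans makes many passes over the data
  candidates.foreach { k =>
    val model = KMeans.train(features, k, iterations)
    // Lower cost is better, but it always drops as k grows; look for the elbow
    println(s"k = $k, cost = ${model.computeCost(features)}")
  }
}

// e.g. chooseK(value.map(_._1), Seq(5, 10, 20, 40))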

Full configuration file (pom.xml) - take only the dependencies you need

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.izhonghong</groupId>
  <artifactId>mission-center-new</artifactId>
  <version>1.0-SNAPSHOT</version>

  <properties>
    <maven.compiler.source>1.6</maven.compiler.source>
    <maven.compiler.target>1.6</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <scala.tools.version>2.10</scala.tools.version>
    <scala.version>2.10.6</scala.version>
    <hbase.version>1.2.2</hbase.version>
  </properties>

  <dependencies>
    <!-- <dependency><groupId>org.apache.spark</groupId><artifactId>spark-mllib_2.11</artifactId><version>2.1.0</version></dependency> -->
    <!-- <dependency><groupId>org.apache.spark</groupId><artifactId>spark-mllib_2.11</artifactId><version>1.6.0</version></dependency> -->
    <!-- <dependency><groupId>com.hankcs</groupId><artifactId>hanlp</artifactId><version>portable-1.5.0</version></dependency> -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-mllib_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
    <dependency>
      <groupId>org.ansj</groupId>
      <artifactId>ansj_seg</artifactId>
      <version>5.0.4</version>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>2.10.6</version>
    </dependency>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka-clients</artifactId>
      <version>0.10.0.0</version>
    </dependency>
    <dependency>
      <groupId>net.sf.json-lib</groupId>
      <artifactId>json-lib</artifactId>
      <classifier>jdk15</classifier>
      <version>2.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka_2.10</artifactId>
      <version>1.6.2</version>
    </dependency>
    <!-- <dependency><groupId>org.apache.spark</groupId><artifactId>spark-streaming-kafka-0-10_2.10</artifactId><version>2.1.1</version></dependency> -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>1.6.2</version>
      <exclusions>
        <exclusion>
          <artifactId>scala-library</artifactId>
          <groupId>org.scala-lang</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <!-- <dependency><groupId>org.apache.spark</groupId><artifactId>spark-streaming_2.10</artifactId><version>2.1.1</version><scope>provided</scope></dependency> -->
    <dependency>
      <groupId>com.huaban</groupId>
      <artifactId>jieba-analysis</artifactId>
      <version>1.0.2</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.14</version>
    </dependency>
    <dependency>
      <groupId>redis.clients</groupId>
      <artifactId>jedis</artifactId>
      <version>2.9.0</version>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>1.2.2</version>
      <exclusions>
        <exclusion>
          <artifactId>servlet-api-2.5</artifactId>
          <groupId>org.mortbay.jetty</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <!-- <dependency><groupId>com.alibaba</groupId><artifactId>fastjson</artifactId><version>1.2.18</version></dependency> -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.6.2</version>
      <!-- <version>2.1.1</version> -->
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.7.0</version>
      <exclusions>
        <exclusion>
          <groupId>javax.servlet.jsp</groupId>
          <artifactId>*</artifactId>
        </exclusion>
        <exclusion>
          <artifactId>servlet-api</artifactId>
          <groupId>javax.servlet</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.10</artifactId>
      <version>1.6.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.10</artifactId>
      <version>1.6.2</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.39</version>
    </dependency>
    <!-- <dependency><groupId>org.apache.hbase</groupId><artifactId>hbase-server</artifactId><version>1.2.2</version></dependency> -->

    <!-- Test -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs2</groupId>
      <artifactId>specs2_${scala.tools.version}</artifactId>
      <version>1.13</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_${scala.tools.version}</artifactId>
      <version>2.0.M6-SNAP8</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.0</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <addClasspath>true</addClasspath>
              <classpathPrefix>lib/</classpathPrefix>
              <mainClass></mainClass>
            </manifest>
          </archive>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-dependency-plugin</artifactId>
        <executions>
          <execution>
            <id>copy</id>
            <phase>package</phase>
            <goals>
              <goal>copy-dependencies</goal>
            </goals>
            <configuration>
              <outputDirectory>${project.build.directory}/lib</outputDirectory>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

  <!-- Alternative build using maven-assembly-plugin (jar-with-dependencies), bound to the package phase.
       Replace <mainClass> with the class that contains the main method.
  <build>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <archive>
            <manifest><mainClass>com.sf.pps.client.IntfClientCall</mainClass></manifest>
            <manifestEntries><Class-Path>.</Class-Path></manifestEntries>
          </archive>
          <descriptorRefs><descriptorRef>jar-with-dependencies</descriptorRef></descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals><goal>single</goal></goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
  -->
</project>

 
