一、分组求TOP N最小值
读取文本中的 key–value 数据,先按 key 分组求出每组的最小值,再按该最小值升序排序,输出最小的前 N 组结果(TOP N)。
二、maven设置
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the spark-test example project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.mk</groupId>
  <artifactId>spark-test</artifactId>
  <version>1.0</version>
  <name>spark-test</name>
  <url>http://spark.mk.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <!-- NOTE(review): the Spark artifacts below are the _2.11 builds; confirm this
         Scala patch version is the one you intend to pin alongside them. -->
    <scala.version>2.11.1</scala.version>
    <spark.version>2.4.4</spark.version>
    <hadoop.version>2.6.0</hadoop.version>
  </properties>

  <dependencies>
    <!-- Scala dependency -->
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <!-- Spark dependencies -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <!-- Pin plugin versions so the build is reproducible. -->
    <pluginManagement>
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
三、编程代码
public class TopNApp implements SparkConfInfo {public static class SortData implements Comparable<SortData>, Serializable {private String key;private Integer value;public SortData(String key, Integer value) {this.key = key;this.value = value;}public String getKey() {return key;}public void setKey(String key) {this.key = key;}public Integer getValue() {return value;}public void setValue(Integer value) {this.value = value;}@Overridepublic int compareTo(SortData o) {if (o == null) {return 1;}int diff = this.value - o.value;if (diff != 0)return diff;if(key == o.key)return 0 ;if(key == null)return -1;if(o.key == null)return 1;return this.key.compareTo(o.key);}}public static void main(String[] args) {String filePath = "E:\\spark\\groubByNumber.txt";SparkSession sparkSession = new TopNApp().getSparkConf("groubByNumber");JavaPairRDD<String, Integer> numbers = sparkSession.sparkContext().textFile(filePath, 4).toJavaRDD().flatMap(v -> Arrays.asList(v.split("\n")).iterator()).mapToPair(v -> {String[] data = v.split("\\s+");if (data.length != 2) {return null;}if (!data[1].matches("-?[0-9]+(.[0-9]+)?"))return null;return new Tuple2<>(data[0], Integer.valueOf(data[1]));}).filter(v -> v != null).cache();//数据量大会溢出内存无法计算
// numbers.groupByKey()
// .sortByKey(true)
// .mapValues(v -> {
//
// Integer value = null;
// Iterator<Integer> it = v.iterator();
// while (it.hasNext()) {
// Integer val = it.next();
// if(value==null || value>val){
// value = val;
// }
// }
// return value;
// })
// .map(v-> new SortData(v._1, v._2))
// .sortBy(v->v, true, 3)
// .take(3)
// .forEach(v -> System.out.println(v._1 + ":" + v._2));//这种聚合数据再计算numbers.combineByKey(min -> min, // 将val映射为一个元组,作为分区内聚合初始值(min,val) -> {if (min > val) {min = val;}return min;}, //分区内聚合,(a, b) -> Math.min(a, b)) //分区间聚合.map(v-> new SortData(v._1, v._2)).sortBy(v->v, true, 3).take(3).forEach(v -> System.out.println(v.key + ":" + v.value));sparkSession.stop();}
}public interface SparkConfInfo {default SparkSession getSparkConf(String appName){SparkConf sparkConf = new SparkConf();if(System.getProperty("os.name").toLowerCase().contains("win")) {sparkConf.setMaster("local[4]");System.out.println("使用本地模拟是spark");}else{sparkConf.setMaster("spark://hadoop01:7077,hadoop02:7077,hadoop03:7077");sparkConf.set("spark.driver.host","192.168.150.1");//本地ip,必须与spark集群能够相互访问,如:同一个局域网sparkConf.setJars(new String[] {".\\out\\artifacts\\spark_test\\spark-test.jar"});//项目构建生成的路径}SparkSession session = SparkSession.builder().appName(appName).config(sparkConf).config(sparkConf).getOrCreate();return session;}
}
groubByNumber.txt文件内容
A 100
A 24
B 43
C 774
D 43
D 37
D 78
E 42
C 68
F 89
G 49
F 543
H 36
E 888
A 258
A 538
B 79
B 6
H 67
C 99
输出
B:6
A:24
H:36
四、take方法
List<T> take(int num);
返回 RDD 中的前 num 个元素;结果会以 List 的形式全部返回到 Driver 端,因此 num 不宜过大。