第十一站：Java翡翠绿——大数据处理的力量

在Java大数据处理领域，Hadoop和Spark是两个至关重要的框架，它们充分展示了Java在处理大规模数据集方面的实力。下面我将通过简化的范例来讲解这两个框架的基本使用。

Hadoop MapReduce 示例

Hadoop MapReduce 是一种编程模型，用于处理和生成大数据集。下面是一个基本的Word Count程序示例，用于统计文本文件中每个单词出现的次数。

步骤简述:

Mapper 阶段：读取输入文件的每一行，将其分割成单词，并为每个单词生成键值对（单词，1）。
Reducer 阶段：对相同单词的键值对进行汇总，计算每个单词的总次数。

Java代码示例（简略版）：

import java.io.IOException;
import java.util.StringTokenizer;import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;public class WordCount {public static class TokenizerMapperextends Mapper<LongWritable, Text, Text, IntWritable>{private final static IntWritable one = new IntWritable(1);private Text word = new Text();public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException {StringTokenizer itr = new StringTokenizer(value.toString());while (itr.hasMoreTokens()) {word.set(itr.nextToken());context.write(word, one);}}}public static class IntSumReducerextends Reducer<Text,IntWritable,Text,IntWritable> {private IntWritable result = new IntWritable();public void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {int sum = 0;for (IntWritable val : values) {sum += val.get();}result.set(sum);context.write(key, result);}}public static void main(String[] args) throws Exception {Configuration conf = new Configuration();Job job = Job.getInstance(conf, "word count");job.setJarByClass(WordCount.class);job.setMapperClass(TokenizerMapper.class);job.setCombinerClass(IntSumReducer.class);job.setReducerClass(IntSumReducer.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);FileInputFormat.addInputPath(job, new Path(args[0]));FileOutputFormat.setOutputPath(job, new Path(args[1]));System.exit(job.waitForCompletion(true) ? 0 : 1);}
}

Apache Spark 示例

Spark 提供了更高级的数据处理能力，支持内存计算，可以更高效地执行迭代算法和交互式数据分析。下面是一个使用Spark Java API进行Word Count的简单示例。

Java代码示例（使用Spark）:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;
// Required by Arrays.asList in main(); this snippet's import list did not include it.
import java.util.Arrays;

/**
 * Word count implemented with the Spark Java API.
 *
 * <p>Usage: {@code SparkWordCount [input path] [output path]}. When an argument is
 * omitted, the corresponding default HDFS location below is used.
 */
public class SparkWordCount {

    private static final String DEFAULT_INPUT = "hdfs://localhost:9000/path/to/input.txt";
    private static final String DEFAULT_OUTPUT = "hdfs://localhost:9000/path/to/output";

    /**
     * Reads the input text, counts occurrences of each word and saves the
     * {@code (word, count)} pairs as text.
     *
     * @param args optional: {@code args[0]} input path, {@code args[1]} output path
     */
    public static void main(String[] args) {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        SparkConf conf = new SparkConf().setAppName("Spark Word Count");

        // try-with-resources stops the context even when a transformation or the save fails.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaRDD<String> textFile = sc.textFile(inputPath);

            // Split on any whitespace run and drop the empty token that split() yields
            // for lines with leading whitespace, so "" is never counted as a word.
            JavaRDD<String> words = textFile
                    .flatMap(line -> Arrays.asList(line.split("\\s+")).iterator())
                    .filter(word -> !word.isEmpty());

            JavaPairRDD<String, Integer> counts = words
                    .mapToPair(word -> new Tuple2<>(word, 1))
                    .reduceByKey(Integer::sum);

            counts.saveAsTextFile(outputPath);
        }
    }
}

在这两个示例中，我们看到了如何使用Java语言在Hadoop MapReduce和Apache Spark框架下进行大数据处理。Hadoop提供了基础的分布式存储和计算能力，而Spark在此基础上借助内存计算提供了更快的处理速度和更高的灵活性，特别适合需要快速迭代处理和实时分析的场景。

Apache Hadoop MapReduce 进阶示例：二次排序(Secondary Sort)

在Hadoop MapReduce中，有时我们需要对输出的键进行排序，同时还要保证相同键下的值也按照一定的顺序排列，这就是所谓的“二次排序”。实现思路是：把原始键和需要排序的字段组合成一个复合键（下例中的MyKey），分区和分组时只比较原始键，排序时则比较整个复合键。下面是一个实现二次排序的Java示例。

Java代码示例：

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Comparator;import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;public class SecondarySort extends Configured implements Tool {public static class MyKey implements WritableComparable<MyKey> {protected Text first = new Text();protected IntWritable second = new IntWritable();public MyKey() {}public MyKey(Text first, IntWritable second) {this.first = first;this.second = second;}public void write(DataOutput out) throws IOException {first.write(out);second.write(out);}public void readFields(DataInput in) throws IOException {first.readFields(in);second.readFields(in);}public int compareTo(MyKey other) {int cmp = this.first.compareTo(other.first);if(cmp != 0) {return cmp;} else {return this.second.compareTo(other.second);}}@Overridepublic String toString() {return first + "\t" + second;}}public static class MyValue implements Writable {private Text value = new Text();public MyValue() {}public MyValue(String value) {this.value.set(value);}public void write(DataOutput out) throws IOException {value.write(out);}public void readFields(DataInput in) throws IOException {value.readFields(in);}@Overridepublic String toString() {return value.toString();}}public static class MyMapper extends Mapper<LongWritable, Text, MyKey, MyValue> {private final MyKey key = new MyKey();private final MyValue value = new MyValue();public void map(LongWritable offset, Text lineText, Context context)throws IOException, InterruptedException {String line = lineText.toString();String[] parts = line.split("\t");key.set(new Text(parts[0]), new IntWritable(Integer.parseInt(parts[1])));value.set(new Text(parts[2]));context.write(key, value);}}public static class MyPartitioner extends Partitioner<MyKey, MyValue> {@Overridepublic int getPartition(MyKey key, MyValue value, int numPartitions) {return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;}}public static class MyGroupComparator extends WritableComparator {protected MyGroupComparator() {super(MyKey.class, true);}@Overridepublic int compare(WritableComparable 
a, WritableComparable b) {MyKey keyA = (MyKey) a;MyKey keyB = (MyKey) b;return keyA.getFirst().compareTo(keyB.getFirst());}}public static class MyComparator extends WritableComparator {protected MyComparator() {super(MyKey.class, true);}@Overridepublic int compare(WritableComparable a, WritableComparable b) {MyKey keyA = (MyKey) a;MyKey keyB = (MyKey) b;int cmp = keyA.getFirst().compareTo(keyB.getFirst());if(cmp != 0) {return cmp;} else {return keyA.getSecond().compareTo(keyB.getSecond());}}}public static class MyReducer extends Reducer<MyKey, MyValue, Text, IntWritable> {private IntWritable result = new IntWritable();public void reduce(MyKey key, Iterable<MyValue> values, Context context)throws IOException, InterruptedException {for (MyValue val : values) {context.write(key.getFirst(), result);}}}public int run(String[] args) throws Exception {Job job = Job.getInstance(getConf(), "Secondary Sort");job.setJarByClass(SecondarySort.class);job.setMapperClass(MyMapper.class);job.setReducerClass(MyReducer.class);job.setPartitionerClass(MyPartitioner.class);job.setGroupingComparatorClass(MyGroupComparator.class);job.setSortComparatorClass(MyComparator.class);job.setOutputKeyClass(MyKey.class);job.setOutputValueClass(MyValue.class);FileInputFormat.addInputPath(job, new Path(args[0]));FileOutputFormat.setOutputPath(job, new Path(args[1]));return job.waitForCompletion(true) ? 0 : 1;}public static void main(String[] args) throws Exception {int exitCode = ToolRunner.run(new SecondarySort(), args);System.exit(exitCode);}
}