SemiJoin,也叫半连接,是从分布式数据库中借鉴过来的方法。它的产生动机是:对于reduce side join,跨机器的数据传输量非常大,这成了join操作的一个瓶颈。如果能够在map端过滤掉不会参加join操作的数据,则可以大大节省网络IO。实现方法很简单:选取一个小表,假设是File1,将其参与join的key抽取出来,保存到文件File3中,File3文件一般很小,可以放到内存中。在map阶段,使用DistributedCache将File3复制到各个TaskTracker上,然后将File2中不在File3中的key对应的记录过滤掉,剩下的reduce阶段的工作与reduce side join相同。此实例中,还是采用第一个实例中的数据,假如我们只保留sex为1的user,并将其key存于user_id文件中(注意:每行的数据一定要带上双引号),如下:
"ID"
 "1"
 "2"
 "3"
 "5"
 "6"
 "8"
 "9"
完整代码如下,此实例中我们采用新的API来写:
public class SemiJoin extends Configured implements Tool
{public static class MapClass extends Mapper<LongWritable, Text, Text, Text>{// 用于缓存user_id文件中的数据private Set<String> userIds = new HashSet<String>();private Text key = new Text();private Text value = new Text();private String[] keyValue;// 此方法会在map方法执行之前执行@Overrideprotected void setup(Context context) throws IOException, InterruptedException{BufferedReader in = null;try{// 从当前作业中获取要缓存的文件Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());String userId = null;for (Path path : paths){if (path.toString().contains("user_id")){in = new BufferedReader(new FileReader(path.toString()));while (null != (userId = in.readLine())){userIds.add(userId);}}}}catch (IOException e){e.printStackTrace();}finally{try{if(in != null){in.close(); }}catch (IOException e){e.printStackTrace();}}}public void map(LongWritable key, Text value, Context context)throws IOException, InterruptedException{// 在map阶段过滤掉不需要的数据this.keyValue = value.toString().split(",");if(userIds.contains(keyValue[0])){this.key.set(keyValue[0]);this.value.set(keyValue[1]);context.write(this.key, this.value);}}}public static class Reduce extends Reducer<Text, Text, Text, Text>{private Text value = new Text();private StringBuilder sb;public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{sb = new StringBuilder();for(Text val : values){sb.append(val.toString());sb.append(",");}this.value.set(sb.deleteCharAt(sb.length()-1).toString());context.write(key, this.value);}}public int run(String[] args) throws Exception{Job job = new Job(getConf(), "SemiJoin");job.setJobName("SemiJoin");job.setJarByClass(SemiJoin.class);job.setMapperClass(MapClass.class);job.setReducerClass(Reduce.class);job.setInputFormatClass(TextInputFormat.class);job.setOutputFormatClass(TextOutputFormat.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(Text.class);String[] otherArgs = new GenericOptionsParser(job.getConfiguration(), args).getRemainingArgs();// 
我们把第一个参数的地址作为要缓存的文件路径DistributedCache.addCacheFile(new Path(otherArgs[0]).toUri(), job.getConfiguration());FileInputFormat.addInputPath(job, new Path(otherArgs[1]));FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));return job.waitForCompletion(true) ? 0 : 1;}public static void main(String[] args) throws Exception{int res = ToolRunner.run(new Configuration(), new SemiJoin(), args);System.exit(res);}}
转发:https://blog.csdn.net/huashetianzu/article/details/7823326