c001.txt
------------------------------
filetype|commid|commname|addressid
comm|1|罗湖小区1|1
comm|2|罗湖小区2|1
comm|3|宝安小区1|4
comm|4|南山小区1|3
comm|5|南山小区2|3
comm|6|福田小区1|2
comm|7|福田小区2|2
comm|8|宝安2|4
comm|9|南山3|3
c002.txt
----------------------------
filetype|commid|commname|addressid
comm|10|罗湖小区7|1
comm|11|罗湖小区8|1
comm|12|宝安小区5|4
comm|13|南山小区6|3
comm|14|南山小区7|3
comm|15|福田小区6|2
comm|16|福田小区8|2
a001.txt
-------------------------
filetype|addressid|address
addr|1|罗湖
addr|2|福田
addr|3|南山
addr|4|宝安
输出结果:
-----------------------
commid commname addr
15 福田小区6 福田
16 福田小区8 福田
6 福田小区1 福田
7 福田小区2 福田
13 南山小区6 南山
14 南山小区7 南山
4 南山小区1 南山
5 南山小区2 南山
9 南山3 南山
3 宝安小区1 宝安
8 宝安2 宝安
12 宝安小区5 宝安
----------------------------
代码:
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.fs.Path;

/**
 * Reduce-side join of community records ("comm|commid|commname|addressid")
 * with address records ("addr|addressid|address"), keyed on addressid.
 * The mapper tags each record with its type; the shuffle groups both record
 * types of the same addressid into one reduce call, where they are joined.
 */
public class TestUnion {

    /**
     * Retained for source/binary compatibility with the original version; no
     * longer used. The old code used this static counter inside reduce() to
     * emit a header row, which silently dropped the first key group's records
     * (and is unreliable across multiple reducer JVMs anyway).
     */
    public static int count = 0;

    /** Tags each input line and re-keys it by addressid. */
    public static class TestUnionMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        public void map(Object key, Text values, Context context)
                throws IOException, InterruptedException {
            // Skip each file's header line ("filetype|...").
            if (values.toString().indexOf("filetype") >= 0) {
                return;
            }
            StringTokenizer itr = new StringTokenizer(values.toString(), "|");
            while (itr.hasMoreTokens()) {
                String fileType = itr.nextToken();
                if (fileType.compareToIgnoreCase("addr") == 0) {
                    // addr|addressid|address  ->  key: addressid, value: "2|address"
                    String addressId = itr.nextToken();
                    String addressName = itr.nextToken();
                    // "2" marks an address record
                    context.write(new Text(addressId), new Text("2|" + addressName));
                } else if (fileType.compareToIgnoreCase("comm") == 0) {
                    // comm|commid|commname|addressid  ->  key: addressid, value: "1|commid|commname"
                    String commId = itr.nextToken();
                    String commName = itr.nextToken();
                    String addressId = itr.nextToken();
                    // "1" marks a community record
                    context.write(new Text(addressId), new Text("1|" + commId + "|" + commName));
                }
            }
        }
    }

    /** Joins the comm and addr records that share one addressid key. */
    public static class TestUnionReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Emit the header once per reducer task. The original wrote it from
            // inside reduce() guarded by a static counter and then returned,
            // which discarded every record of the first key group (that is why
            // the sample output was missing all addressid=1 rows).
            context.write(new Text("commid"), new Text("commname addr"));
        }

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            List<String> addrs = new ArrayList<String>();
            List<String> comms = new ArrayList<String>();
            for (Text val : values) {
                // '|' is a regex metacharacter and must be escaped for split().
                String[] astr = val.toString().trim().split("\\|");
                String fileTypeId = astr[0];
                if (fileTypeId.compareToIgnoreCase("1") == 0) { // comm record
                    String commId = astr[1];
                    String commName = astr[2];
                    comms.add(commId + " " + commName);
                } else if (fileTypeId.compareToIgnoreCase("2") == 0) { // addr record
                    addrs.add(astr[1]);
                }
            }
            // Cross join; in this data set there is exactly one addr per key.
            if (comms.size() > 0 && addrs.size() > 0) {
                for (int m = 0; m < comms.size(); m++) {
                    for (int n = 0; n < addrs.size(); n++) {
                        context.write(new Text(comms.get(m)), new Text(addrs.get(n)));
                    }
                }
            }
        }
    }

    /**
     * Job driver.
     *
     * @param args args[0] = input directory, args[1] = output directory
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("please input two args: <in> <out>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        // Job.getInstance replaces the deprecated new Job(conf, name) constructor.
        Job job = Job.getInstance(conf, "union data");
        job.setJarByClass(TestUnion.class);
        job.setMapperClass(TestUnionMapper.class);
        job.setReducerClass(TestUnionReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
主要利用了reduce函数相同的KEY值聚合在一起的规则。