Hadoop Learning Notes: Inverted Index
Development tool: Eclipse
Goal: build an inverted index over the following file, phone_numbers (a plain-Java sketch of the intended grouping follows the data):
13599999999 10086
13899999999 120
13944444444 13800138000
13722222222 13800138000
18800000000 120
13722222222 10086
18944444444 10086
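
Each line holds two numbers, referred to as anum and bnum to match the MapReduce code below; the inverted index groups, for every bnum, all anums that reference it. As a minimal, Hadoop-free sketch of that grouping (the class name InvertedIndexSketch and the hard-coded input array are illustrative assumptions, not part of the original post):

import java.util.LinkedHashMap;
import java.util.Map;

public class InvertedIndexSketch
{
    public static void main(String[] args)
    {
        // the phone_numbers lines, hard-coded here for illustration only
        String[] lines = {
            "13599999999 10086",
            "13899999999 120",
            "13944444444 13800138000",
            "13722222222 13800138000",
            "18800000000 120",
            "13722222222 10086",
            "18944444444 10086"
        };

        // bnum -> "|"-separated list of the anums that point to it
        Map<String, StringBuilder> index = new LinkedHashMap<>();
        for (String line : lines)
        {
            String[] split = line.split(" ");
            if (split.length < 2) continue;   // skip malformed lines (the job counts these as LINESKIP)
            index.computeIfAbsent(split[1], k -> new StringBuilder())
                 .append(split[0]).append("|");
        }

        index.forEach((bnum, anums) -> System.out.println(bnum + "\t" + anums));
    }
}

The MapReduce job below produces the same key/value pairs, but the grouping is done by the shuffle phase instead of an in-memory map.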
Code:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Test_1 extends Configured implements Tool
{
    enum Counter
    {
        LINESKIP,   // counts malformed input lines that were skipped
    }

    public static class Map extends Mapper<LongWritable, Text, Text, Text>
    {
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
        {
            String line = value.toString();   // read one line of the original data

            try
            {
                // split "anum bnum" and emit (bnum, anum), i.e. invert the pair
                String[] lineSplit = line.split(" ");
                String anum = lineSplit[0];
                String bnum = lineSplit[1];

                context.write(new Text(bnum), new Text(anum));   // map output
            }
            catch (ArrayIndexOutOfBoundsException e)
            {
                context.getCounter(Counter.LINESKIP).increment(1);   // skip malformed line
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text>
    {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
        {
            String out = "";

            // concatenate all anums that map to this bnum, separated by "|"
            for (Text value : values)
            {
                out += value.toString() + "|";
            }

            context.write(key, new Text(out));   // reduce output
        }
    }

    public int run(String[] args) throws Exception
    {
        Configuration conf = getConf();

        Job job = new Job(conf, "Test_1");   // job name
        job.setJarByClass(Test_1.class);     // jar containing this class

        FileInputFormat.addInputPath(job, new Path(args[0]));     // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));   // output path

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.waitForCompletion(true);

        return job.isSuccessful() ? 0 : 1;
    }

    public static void main(String[] args) throws Exception
    {
        int res = ToolRunner.run(new Configuration(), new Test_1(), args);
        System.exit(res);
    }
}
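
Note that the Job(Configuration, String) constructor used in run() is deprecated on Hadoop 2.x and later. If you are on a newer release (an assumption; the original targets the older API), a sketch of run() using the Job.getInstance factory instead would be:

    public int run(String[] args) throws Exception
    {
        Configuration conf = getConf();

        Job job = Job.getInstance(conf, "Test_1");   // non-deprecated replacement for new Job(conf, "Test_1")
        job.setJarByClass(Test_1.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // waitForCompletion already returns the job's success status
        return job.waitForCompletion(true) ? 0 : 1;
    }

To try it, package the class into a jar and submit it with something like hadoop jar Test_1.jar Test_1 <input dir containing phone_numbers> <output dir>, where both paths are HDFS paths and are placeholders here.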
Result:
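Based on the input data and the reduce logic, the part file in the output directory should contain lines of the following form (key and value are tab-separated; the order of values within a key is not guaranteed):

10086	13599999999|13722222222|18944444444|
120	13899999999|18800000000|
13800138000	13944444444|13722222222|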