
Secondary Sort

How it works:

The following map and reduce classes are used (pay particular attention to the input and output types; IntPair is a custom type):

public static class MapClass extends Mapper<LongWritable, Text, IntPair, IntWritable>
public static class Reduce extends Reducer<IntPair, IntWritable, Text, IntWritable>
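
Concretely, for a (hypothetical) input line "3 17", MapClass emits the pair ((3, 17), 17): the whole (left, right) pair becomes the key so that the shuffle sorts on both numbers, while the value carries the second number through to the reducer's output.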

In the map phase, job.setInputFormatClass(TextInputFormat.class) sets the input format. Note that the mapper's output must match the types declared in the custom mapper, <IntPair, IntWritable>; the map phase ultimately produces a list of <IntPair, IntWritable> pairs. At the end of the map phase, this list is first partitioned by the partitioner set via job.setPartitionerClass, and each partition is sent to one reducer. Within each partition, the records are sorted by the key comparator class set via job.setSortComparatorClass; if no key comparator is set this way, the compareTo method implemented by the key is used. The example below relies on the ordering defined by IntPair: its registered raw Comparator compares the serialized bytes, which, thanks to the byte-comparable encoding, is equivalent to compareTo.
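
As a minimal sketch of the driver hooks just discussed (assuming the IntPair, FirstPartitioner, and FirstGroupingComparator classes from the full listing below; the setSortComparatorClass call is optional here, because IntPair already registers a raw Comparator):

    Job job = new Job(conf, "secondary sort");
    job.setInputFormatClass(TextInputFormat.class);                // how input files are split into records
    job.setPartitionerClass(FirstPartitioner.class);               // which reducer receives each key
    // job.setSortComparatorClass(IntPair.Comparator.class);       // optional: full (first, second) ordering
    job.setGroupingComparatorClass(FirstGroupingComparator.class); // one reduce call per first int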

In the reduce phase, once a reducer has received all the map outputs assigned to it, it again sorts all the key/value pairs using the key comparator set by job.setSortComparatorClass. It then builds a value iterator for each key, and this is where grouping comes in, via the grouping comparator class set by job.setGroupingComparatorClass. Whenever this comparator considers two keys equal, they belong to the same group: their values go into one value iterator, and that iterator's key is the first of the keys in the group. Finally, the Reducer's reduce method is invoked once for every (key, value iterator) pair. As before, the input and output types must match those declared in the custom Reducer.
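
For illustration (the three input lines here are hypothetical sample data), suppose the sorted map output arriving at one reducer is:

    (1, 2) -> 2
    (1, 5) -> 5
    (3, 4) -> 4

Because FirstGroupingComparator compares only the first int, (1, 2) and (1, 5) fall into the same group: reduce is called once with key (1, 2) (the first key of the group) and the value iterator [2, 5], and once more with key (3, 4) and the iterator [4].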

Code:

package org.apache.hadoop.examples;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * This is an example Hadoop Map/Reduce application.
 * It reads text input files that must contain two integers per line.
 * The output is sorted by the first and second number and grouped on the
 * first number.
 *
 * To run: bin/hadoop jar build/hadoop-examples.jar secondarysort
 *            <i>in-dir</i> <i>out-dir</i>
 */
public class SecondarySort {

  /**
   * Define a pair of integers that are writable.
   * They are serialized in a byte comparable format.
   */
  public static class IntPair
                      implements WritableComparable<IntPair> {
    private int first = 0;
    private int second = 0;

    /**
     * Set the left and right values.
     */
    public void set(int left, int right) {
      first = left;
      second = right;
    }
    public int getFirst() {
      return first;
    }
    public int getSecond() {
      return second;
    }
    /**
     * Read the two integers.
     * Encoded as: MIN_VALUE -> 0, 0 -> -MIN_VALUE, MAX_VALUE -> -1
     */
    @Override
    public void readFields(DataInput in) throws IOException {
      first = in.readInt() + Integer.MIN_VALUE;
      second = in.readInt() + Integer.MIN_VALUE;
    }
    @Override
    public void write(DataOutput out) throws IOException {
      out.writeInt(first - Integer.MIN_VALUE);
      out.writeInt(second - Integer.MIN_VALUE);
    }
    @Override
    public int hashCode() {
      return first * 157 + second;
    }
    @Override
    public boolean equals(Object right) {
      if (right instanceof IntPair) {
        IntPair r = (IntPair) right;
        return r.first == first && r.second == second;
      } else {
        return false;
      }
    }

    /** A Comparator that compares serialized IntPair. */
    public static class Comparator extends WritableComparator {
      public Comparator() {
        super(IntPair.class);
      }

      public int compare(byte[] b1, int s1, int l1,
                         byte[] b2, int s2, int l2) {
        return compareBytes(b1, s1, l1, b2, s2, l2);
      }
    }

    static {                                        // register this comparator
      WritableComparator.define(IntPair.class, new Comparator());
    }

    @Override
    public int compareTo(IntPair o) {
      if (first != o.first) {
        return first < o.first ? -1 : 1;
      } else if (second != o.second) {
        return second < o.second ? -1 : 1;
      } else {
        return 0;
      }
    }
  }

  /**
   * Partition based on the first part of the pair.
   */
  public static class FirstPartitioner extends Partitioner<IntPair, IntWritable> {
    @Override
    public int getPartition(IntPair key, IntWritable value,
                            int numPartitions) {
      return Math.abs(key.getFirst() * 127) % numPartitions;
    }
  }

  /**
   * Compare only the first part of the pair, so that reduce is called once
   * for each value of the first part.
   */
  public static class FirstGroupingComparator
                implements RawComparator<IntPair> {
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
      return WritableComparator.compareBytes(b1, s1, Integer.SIZE/8,
                                             b2, s2, Integer.SIZE/8);
    }

    @Override
    public int compare(IntPair o1, IntPair o2) {
      int l = o1.getFirst();
      int r = o2.getFirst();
      return l == r ? 0 : (l < r ? -1 : 1);
    }
  }

  /**
   * Read two integers from each line and generate a key, value pair
   * as ((left, right), right).
   */
  public static class MapClass
         extends Mapper<LongWritable, Text, IntPair, IntWritable> {

    private final IntPair key = new IntPair();
    private final IntWritable value = new IntWritable();

    @Override
    public void map(LongWritable inKey, Text inValue,
                    Context context) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(inValue.toString());
      int left = 0;
      int right = 0;
      if (itr.hasMoreTokens()) {
        left = Integer.parseInt(itr.nextToken());
        if (itr.hasMoreTokens()) {
          right = Integer.parseInt(itr.nextToken());
        }
        key.set(left, right);
        value.set(right);
        context.write(key, value);
      }
    }
  }

  /**
   * A reducer class that emits a separator line followed by the values
   * of each group.
   */
  public static class Reduce
         extends Reducer<IntPair, IntWritable, Text, IntWritable> {
    private static final Text SEPARATOR =
      new Text("------------------------------------------------");
    private final Text first = new Text();

    @Override
    public void reduce(IntPair key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      context.write(SEPARATOR, null);
      first.set(Integer.toString(key.getFirst()));
      for (IntWritable value : values) {
        context.write(first, value);
      }
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: secondarysort <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
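
For the hypothetical three-line input used above (1 2, 1 5, 3 4) and a single reducer, the job's output file would look like:

    ------------------------------------------------
    1	2
    1	5
    ------------------------------------------------
    3	4

Each separator line comes from the context.write(SEPARATOR, null) call at the start of every reduce invocation, and key and value are tab-separated by the default TextOutputFormat.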
---- Adapted from the article: http://blog.csdn.net/heyutao007/article/details/5890103
---- Code from the Hadoop 1.2.1 source