首页 > 代码库 > hadoop 不同URLTitle文件提取关联URL

hadoop 不同URLTitle文件提取关联URL

package com.sogou.web.selector.updana.wapPc;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.sogou.web.selector.wapcoverage.GBKOutputFormat;

/**
 * Driver for the "URL_Title_Analysis" MapReduce job.
 *
 * <p>The job reads text input, runs {@code WapPCMapper} (emitting
 * {@code TextPair} keys with {@code Text} values) and {@code WapPcReducer}
 * (emitting {@code Text}/{@code Text}), and writes the result through
 * {@code GBKOutputFormat}. Records are partitioned and grouped by the first
 * component of the {@code TextPair} key only, so every record sharing that
 * first component reaches a single {@code reduce} call.
 *
 * <p>Usage: {@code URLTitle [generic options] <input path>... <output path>}.
 * When no positional arguments are given, the input/output locations are
 * expected to be present in the job configuration already (for example via
 * {@code -D} generic options).
 */
public class URLTitle extends Configured implements Tool {

    /** Exit status returned when the command line is malformed. */
    private static final int EXIT_USAGE = 2;

    /**
     * Routes a record to a reducer using only the first component of the
     * composite key, so that keys differing only in their other component(s)
     * land in the same partition.
     */
    private static class KeyPartitioner extends Partitioner<TextPair, Text> {

        /**
         * @param key           composite map output key
         * @param value         map output value (not used for partitioning)
         * @param numPartitions number of reduce partitions
         * @return partition index in {@code [0, numPartitions)}
         */
        @Override
        public int getPartition(TextPair key, Text value, int numPartitions) {
            // Mask off the sign bit so a negative hash cannot yield a negative index.
            return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    /**
     * Grouping comparator (despite the name, this is not a partitioner):
     * two keys belong to the same reduce group when their first components
     * compare equal.
     */
    private static class GroupPartitioner extends WritableComparator {

        protected GroupPartitioner() {
            // 'true' asks WritableComparator to create key instances for deserialization.
            super(TextPair.class, true);
        }

        /**
         * Compares two {@code TextPair} keys by their first component only.
         *
         * @param a left key, must be a {@code TextPair}
         * @param b right key, must be a {@code TextPair}
         * @return result of comparing the first components
         */
        @Override
        @SuppressWarnings("rawtypes") // signature is dictated by WritableComparator
        public int compare(WritableComparable a, WritableComparable b) {
            TextPair left = (TextPair) a;
            TextPair right = (TextPair) b;
            return left.getFirst().compareTo(right.getFirst());
        }
    }

    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * @param args positional arguments left after generic option parsing:
     *             one or more input paths followed by exactly one output path,
     *             or empty when the paths are supplied through the configuration
     * @return {@code 0} on success, {@code 1} if the job failed,
     *         {@code 2} if the arguments are malformed
     * @throws Exception if the job cannot be created, submitted or monitored
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length == 1) {
            // A single path is ambiguous: both an input and an output are required.
            System.err.println("Usage: URLTitle [generic options] <input path>... <output path>");
            return EXIT_USAGE;
        }

        Job job = new Job(this.getConf(), "URL_Title_Analysis");
        // Jar that contains the job classes.
        job.setJarByClass(this.getClass());

        // Map side.
        job.setMapperClass(WapPCMapper.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);

        // Reduce side.
        job.setReducerClass(WapPcReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input and output formats.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(GBKOutputFormat.class);

        // Partition and group on the first key component only.
        job.setPartitionerClass(KeyPartitioner.class);
        job.setGroupingComparatorClass(GroupPartitioner.class);

        // Input/output locations: every argument but the last is an input path,
        // the last one is the output directory. With no arguments the paths are
        // left to whatever the configuration already provides.
        if (args.length >= 2) {
            int outputIndex = args.length - 1;
            for (int i = 0; i < outputIndex; i++) {
                FileInputFormat.addInputPath(job, new Path(args[i]));
            }
            FileOutputFormat.setOutputPath(job, new Path(args[outputIndex]));
        }

        // Report the outcome to the caller instead of terminating the JVM here,
        // so the tool stays usable from ToolRunner-based code and tests.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Command-line entry point; the process exit status is the value
     * returned by {@link #run(String[])}.
     *
     * @param args command-line arguments (generic options plus paths)
     * @throws Exception if the job cannot be run
     */
    public static void main(String[] args) throws Exception {
        Tool urlTitle = new URLTitle();
        System.exit(ToolRunner.run(urlTitle, args));
    }
}

该程序可以从 A、B 两个文件的 URL–Title 记录中提取 Title 相同的条目,并输出所需 Title 对应的关联 URL。