首页 > 代码库 > hadoop 不同URLTitle文件提取关联URL
hadoop 不同URLTitle文件提取关联URL
package com.sogou.web.selector.updana.wapPc; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import com.sogou.web.selector.wapcoverage.GBKOutputFormat; public class URLTitle extends Configured implements Tool { private static class KeyPartitioner extends Partitioner<TextPair, Text>{ @Override public int getPartition(TextPair key, Text value, int numPartitions) { // TODO Auto-generated method stub return (key.getFirst().hashCode()&Integer.MAX_VALUE)%numPartitions; } } private static class GroupPartitioner extends WritableComparator{ protected GroupPartitioner() { super(TextPair.class,true); } @Override public int compare(WritableComparable a, WritableComparable b) { // TODO Auto-generated method stub TextPair t1=(TextPair)a; TextPair t2=(TextPair)b; return t1.getFirst().compareTo(t2.getFirst()); } } public int run(String[] args) throws Exception { // TODO Auto-generated method stub Job job = new Job(this.getConf(), "URL_Title_Analysis"); //设置运行job job.setJarByClass(this.getClass()); //设置Map相关内容 job.setMapperClass(WapPCMapper.class); job.setMapOutputKeyClass(TextPair.class); job.setMapOutputValueClass(Text.class); //设子reduce job.setReducerClass(WapPcReducer.class); job.setOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); //设置输出入格式文件 job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(GBKOutputFormat.class); //设置分区和分组 job.setPartitionerClass(KeyPartitioner.class); job.setGroupingComparatorClass(GroupPartitioner.class); 
System.exit(job.waitForCompletion(true) ? 0 : 1); return 0; } public static void main(String[] args) throws Exception { Tool UrlTitle = new URLTitle(); ToolRunner.run(UrlTitle, args); } }
该作业读取 A、B 两个包含 URL 和 Title 的文件,找出两者中相同的 Title,并输出所需 Title 对应的关联 URL。
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。