首页 > 代码库 > 一个经典的MapReduce实例------webcount(网站分析访客信息)

一个经典的MapReduce实例------webcount(网站分析访客信息)

统计某一特定网站的某个时辰访客人数

所用版本:hadoop2.6.5

数据样式如下:

111.111.111.111 - - [16/Dec/2012:05:32:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:05:33:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:05:34:45 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:05:34:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:09:34:55 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:10:23:30 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:10:32:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"

辅助类

 1 package com.trendwise.software;
 2 
 3 import java.text.SimpleDateFormat; 
 4 import java.util.Date; 
 5 import java.io.DataInput; import java.io.DataOutput; 
 6 import java.io.IOException; 
 7 import org.apache.hadoop.io.WritableComparable; 
 8 
 9 public class DateWritable implements WritableComparable<DateWritable>{
10     private final static SimpleDateFormat formatter = new SimpleDateFormat( "yyyy-MM-dd‘ T ‘HH:mm:ss.SSS" ); 
11     private Date date; 
12     public Date getDate() { 
13         return date; 
14     } 
15     public void setDate( Date date ) { 
16         this.date = date; 
17     } 
18 
19     @Override
20     public void readFields(DataInput in) throws IOException {
21         date = new Date( in.readLong() );         
22     }
23 
24     @Override
25     public void write(DataOutput out) throws IOException {
26         out.writeLong( date.getTime() );         
27     }
28 
29     @Override
30     public int compareTo(DateWritable o) {
31         return date.compareTo( o.getDate() ); 
32     }
33     
34     public String toString() { 
35         return formatter.format( date); 
36     }     
37 }

mapper 映射特定年份中每月每天每个时辰的访客数

 1 package com.trendwise.software;
 2 
 3 import java.io.IOException;
 4 import java.util.Calendar;
 5 import org.apache.hadoop.io.IntWritable;
 6 import org.apache.hadoop.io.LongWritable;
 7 import org.apache.hadoop.io.Text;
 8 import org.apache.hadoop.mapreduce.Mapper;
 9 
10 public class LogMapper extends Mapper<LongWritable, Text, DateWritable, IntWritable> { 
11     public static DateWritable dates = new DateWritable(); 
12     public final static IntWritable two = new IntWritable(1); 
13     public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 
14         String text = value.toString(); 
15         // Get the date and time 
16         int openBracket = text.indexOf( ‘[‘ ); 
17         int closeBracket = text.indexOf( ‘]‘ ); 
18         if( openBracket != -1 && closeBracket != -1 ) { 
19             // Read the date 
20             String dateString = text.substring( text.indexOf( ‘[‘ ) + 1, text. indexOf( ‘]‘ ) ); 
21             // Build a date object from a string of the form: 16/Dec/2012:05:32:50 -0500 
22             int index = 0; 
23             int nextIndex = dateString.indexOf( ‘/‘ ); 
24             int day = Integer.parseInt( dateString.substring(index, nextIndex) );
25             
26             index = nextIndex; nextIndex = dateString.indexOf( ‘/‘, index+1 ); 
27             String month = dateString.substring( index+1, nextIndex ); 
28             index = nextIndex; 
29             nextIndex = dateString.indexOf( ‘:‘, index ); 
30             int year = Integer.parseInt(dateString.substring(index + 1, nextIndex)); 
31             index = nextIndex; nextIndex = dateString.indexOf( ‘:‘, index+1 ); 
32             int hour = Integer.parseInt(dateString.substring(index + 1, nextIndex)); 
33             // Build a calendar object for this date 
34             Calendar calendar = Calendar.getInstance(); 
35             calendar.set( Calendar.DATE, day );
36             calendar.set( Calendar.YEAR, year ); 
37             calendar.set( Calendar.HOUR, hour ); 
38             calendar.set( Calendar.MINUTE, 0 ); 
39             calendar.set( Calendar.SECOND, 0 ); 
40             calendar.set( Calendar.MILLISECOND, 0 ); 
41             if( month.equalsIgnoreCase( "dec" ) ) { 
42                 calendar.set( Calendar.MONTH, Calendar.DECEMBER ); 
43             } 
44             else if( month.equalsIgnoreCase( "nov" ) ) { 
45                 calendar.set( Calendar.MONTH, Calendar.NOVEMBER ); 
46             } 
47             else if( month.equalsIgnoreCase( "oct" ) ) { 
48                 calendar.set( Calendar.MONTH, Calendar.OCTOBER ); 
49             }
50             else if( month.equalsIgnoreCase( "sep" ) ) { 
51                 calendar.set( Calendar.MONTH, Calendar.SEPTEMBER ); 
52             } 
53             else if( month.equalsIgnoreCase( "aug" ) ) { 
54                 calendar.set( Calendar.MONTH, Calendar.AUGUST ); 
55             } 
56             else if( month.equalsIgnoreCase( "jul" ) ) { 
57                 calendar.set( Calendar.MONTH, Calendar.JULY ); 
58             } 
59             else if( month.equalsIgnoreCase( "jun" ) ) {
60                 calendar.set( Calendar.MONTH, Calendar.JUNE ); 
61             } 
62             else if( month.equalsIgnoreCase( "may" ) ) {
63                 calendar.set( Calendar.MONTH, Calendar.MAY ); 
64             } 
65             else if( month.equalsIgnoreCase( "apr" ) ) { 
66                 calendar.set( Calendar.MONTH, Calendar.APRIL ); 
67             } 
68             else if( month.equalsIgnoreCase( "mar" ) ) { 
69                 calendar.set( Calendar.MONTH, Calendar.MARCH ); 
70             } 
71             else if( month.equalsIgnoreCase( "feb" ) ) { 
72                 calendar.set( Calendar.MONTH, Calendar.FEBRUARY ); 
73             } 
74             else if( month.equalsIgnoreCase( "jan" ) ) { 
75                 calendar.set( Calendar.MONTH, Calendar.JANUARY ); 
76             } 
77             
78             dates.setDate( calendar.getTime() ); 
79             context.write(dates, two); 
80             
81         }
82     }
83 }

reducer 汇总一个时辰内访客人数

 1 package com.trendwise.software;
 2 
 3 import java.io.IOException;
 4 import org.apache.hadoop.io.IntWritable;
 5 import org.apache.hadoop.mapreduce.Reducer;
 6  
 7 public class  LogReducer extends Reducer<DateWritable, IntWritable, DateWritable, IntWritable> {
 8     @Override
 9     public void reduce( DateWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { 
10     
11         int countn = 0; 
12         for(IntWritable v :values){ 
13             countn += v.get(); 
14         }     
15         context.write(key, new IntWritable( countn) ); 
16     } 
17 }

driver 配置信息,程序入口

 1 package com.trendwise.software;
 2 
 3 import java.io.IOException;
 4 import org.apache.hadoop.conf.Configuration;
 5 import org.apache.hadoop.fs.Path;
 6 import org.apache.hadoop.io.IntWritable;
 7 import org.apache.hadoop.mapreduce.Job;
 8 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 9 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
10 
11 public class Driver {
12     
13     public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 
14                 
15         String in = args[0];
16         String out = args[1];
17         int unitmb =Integer.valueOf(args[2]);                
18         int nreducer = Integer.valueOf(args[3]);
19         
20         Configuration conf = new Configuration();                
21         conf.set("mapreduce.input.fileinputformat.split.maxsize", String.valueOf(unitmb * 1024 * 1024));
22         conf.set("mapred.min.split.size", String.valueOf(unitmb * 1024 * 1024));
23         conf.set("mapreduce.input.fileinputformat.split.minsize.per.node", String.valueOf(unitmb * 1024 * 1024));
24         conf.set("mapreduce.input.fileinputformat.split.minsize.per.rack", String.valueOf(unitmb * 1024 * 1024));
25                 
26         Job job = new Job(conf);        
27         FileInputFormat.addInputPath(job, new Path(in));
28         FileOutputFormat.setOutputPath(job, new Path(out));            
29         job.setMapperClass(LogMapper.class); 
30         job.setReducerClass(LogReducer.class); 
31         job.setCombinerClass(LogReducer.class); 
32         job.setNumReduceTasks(nreducer);
33         job.setMapOutputKeyClass(DateWritable.class);
34         job.setMapOutputValueClass(IntWritable.class);    
35         job.setOutputKeyClass(DateWritable.class); 
36         job.setOutputValueClass(IntWritable.class);
37         job.setJarByClass(Driver.class);
38         job.waitForCompletion(true);    
39                     
40     }     
41 }

command

技术分享

result

技术分享

技术分享

 

一个经典的MapReduce实例------webcount(网站分析访客信息)