首页 > 代码库 > 五、基于hadoop的nginx访问日志分析--userAgent和spider

五、基于hadoop的nginx访问日志分析--userAgent和spider

User-Agent:

代码(不包含蜘蛛):

# cat top_10_useragent.py 
#!/usr/bin/env python
# coding=utf-8

from mrjob.job import MRJob
from mrjob.step import MRStep
from nginx_accesslog_parser import NginxLineParser

import heapq

class UserAgent(MRJob):
    """Two-step mrjob job that lists the most frequent user agents.

    Step 1 counts how often each ``http_user_agent`` value reported by
    ``NginxLineParser`` occurs; step 2 keeps the ``TOP_N`` largest counts.
    """

    # Number of (count, user_agent) pairs emitted by the final step.
    TOP_N = 10

    # Class-level parser instance, reused for every line the mapper sees.
    nginx_line_parser = NginxLineParser()

    def mapper(self, _, line):
        """Yield ``(user_agent, 1)`` for each line that has a user agent.

        The input key is ignored. Lines for which the parser leaves
        ``http_user_agent`` as ``None`` produce no output.
        """
        self.nginx_line_parser.parse(line)
        field_item = self.nginx_line_parser.http_user_agent
        if field_item is not None:
            yield field_item, 1

    def reducer_sum(self, key, values):
        """Yield ``(None, (total, user_agent))`` for one user agent.

        Every total is emitted under the same ``None`` key so that the
        following step can rank all of them together.
        """
        yield None, (sum(values), key)

    def reducer_top100(self, _, values):
        """Yield the ``TOP_N`` largest ``(count, user_agent)`` pairs.

        NOTE: despite its name this reducer emits ``TOP_N`` (10) entries,
        not 100; the name is kept so existing references keep working.
        """
        for count, user_agent in heapq.nlargest(self.TOP_N, values):
            yield count, user_agent

    def steps(self):
        """Return the job's steps: count per user agent, then rank."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer_sum),
            MRStep(reducer=self.reducer_top100),
        ]

def main():
    """Launch the UserAgent job through its ``run()`` class method."""
    UserAgent.run()


# The module name must be compared with the *string* "__main__"; a bare
# __main__ is an undefined name and raises NameError as soon as the file loads.
if __name__ == "__main__":
    main()

结果:

# python3 top_10_useragent.py access_all.log-20161227 
No configs found; falling back on auto-configuration
Creating temp directory /tmp/top_10_useragent.root.20161228.090725.308144
Running step 1 of 2...
Running step 2 of 2...
Streaming final output from /tmp/top_10_useragent.root.20161228.090725.308144/output...
85262    "IE"
79611    "Chrome"
48560    "Other"
10662    "Firefox"
7927    "Mobile Safari UI/WKWebView"
7182    "Sogou Explorer"
6681    "QQ Browser"
1988    "Mobile Safari"
1781    "Maxthon"
1404    "Edge"
Removing temp directory /tmp/top_10_useragent.root.20161228.090725.308144...

蜘蛛(top_10_spider.py):

#!/usr/bin/env python
# coding=utf-8

from mrjob.job import MRJob
from mrjob.step import MRStep
from nginx_accesslog_parser import NginxLineParser

import heapq

class Spider(MRJob):
    """Two-step mrjob job that lists the most frequent spider types.

    Step 1 counts how often each ``user_agent_type`` value reported by
    ``NginxLineParser`` occurs; step 2 keeps the ``TOP_N`` largest counts.
    """

    # Number of (count, spider) pairs emitted by the final step.
    TOP_N = 10

    # Class-level parser instance, reused for every line the mapper sees.
    nginx_line_parser = NginxLineParser()

    def mapper(self, _, line):
        """Yield ``(spider_type, 1)`` for each line that has a spider type.

        The input key is ignored. Lines for which the parser leaves
        ``user_agent_type`` as ``None`` produce no output.
        """
        self.nginx_line_parser.parse(line)
        field_item = self.nginx_line_parser.user_agent_type
        if field_item is not None:
            yield field_item, 1

    def reducer_sum(self, key, values):
        """Yield ``(None, (total, spider_type))`` for one spider type.

        Every total is emitted under the same ``None`` key so that the
        following step can rank all of them together.
        """
        yield None, (sum(values), key)

    def reducer_top100(self, _, values):
        """Yield the ``TOP_N`` largest ``(count, spider_type)`` pairs.

        NOTE: despite its name this reducer emits ``TOP_N`` (10) entries,
        not 100; the name is kept so existing references keep working.
        """
        for count, spider_type in heapq.nlargest(self.TOP_N, values):
            yield count, spider_type

    def steps(self):
        """Return the job's steps: count per spider type, then rank."""
        return [
            MRStep(mapper=self.mapper, reducer=self.reducer_sum),
            MRStep(reducer=self.reducer_top100),
        ]

def main():
    """Launch the Spider job through its ``run()`` class method."""
    Spider.run()


# The module name must be compared with the *string* "__main__"; a bare
# __main__ is an undefined name and raises NameError as soon as the file loads.
if __name__ == "__main__":
    main()

执行结果:

# python3 top_10_spider.py access_all.log-20161227 
No configs found; falling back on auto-configuration
Creating temp directory /tmp/top_10_spider.root.20161228.091326.295972
Running step 1 of 2...
Running step 2 of 2...
Streaming final output from /tmp/top_10_spider.root.20161228.091326.295972/output...
33542    "magpie-crawler"
25880    "Other"
16578    "Sogou web spider"
6383    "bingbot"
3688    "Baiduspider"
1487    "Yahoo! Slurp"
1096    "JikeSpider"
731    "YisouSpider"
648    "Baiduspider-image"
470    "Googlebot"
Removing temp directory /tmp/top_10_spider.root.20161228.091326.295972...

 

五、基于hadoop的nginx访问日志分析--userAgent和spider