首页 > 代码库 > 新浪微博数据挖掘食谱之十: 元素篇 (提取转发微博的元素)

新浪微博数据挖掘食谱之十: 元素篇 (提取转发微博的元素)

#!/usr/bin/python 
# -*- coding: utf-8 -*-

'''
Created on 2015-1-6
@author: beyondzhou
@name: extract_repost_attributions.py
'''

# Extract repost attributions
def extract_repost_attributions():
    """Search weibo for 'iphone' and print the attributions of one repost.

    The function logs in, runs the search, decodes the result entities and
    picks the first status whose repost count is greater than zero.  It then
    fetches that status' repost timeline, extracts the attributions of the
    first repost in the timeline, and prints both the reposted status (as
    JSON) and the attributions.

    Nothing is returned; all results go to standard output.  If no status
    with reposts is found the function prints a notice and returns early
    instead of failing on an unbound variable.
    """
    # Project-local helpers are imported inside the function, as the
    # original script did.
    from search import weibo_search
    from entities import weibo_entities
    from login import weibo_login
    from statuses import fetch_repost_timeline, fetch_weibo_status, get_rt_attributions
    import json

    # Access to sina api
    weibo_api = weibo_login()

    # Do the search
    subject = weibo_search(topic='iphone')

    # Decode entities (only mids and reposts are used below)
    (mids, names, texts, dates, reposts, comments, likes) = weibo_entities(subject)

    # Find the first weibo id whose repost number is above 0
    weibo_id_reposted = None
    for mid, repost_count in zip(mids, reposts):
        if repost_count > 0:
            weibo_id_reposted = mid
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('reposts number: %s, weibo_id_reposted: %s' % (repost_count, weibo_id_reposted))
            break

    if weibo_id_reposted is None:
        # Without this guard the lookups below would raise NameError.
        print('No reposted weibo found, nothing to extract!\n')
        return

    # Find repost timeline
    repost_timeline = fetch_repost_timeline(weibo_api, count = 200, page = 1, weibo_id = weibo_id_reposted)

    # Extract repost attribution (use the first record of repost_timeline to
    # do the example); an empty timeline yields no attributions rather than
    # an IndexError.
    if repost_timeline:
        repost_attributions = get_rt_attributions(repost_timeline[0])
    else:
        repost_attributions = []

    # Output repost weibo
    repost_weibo = fetch_weibo_status(weibo_api, weibo_id = weibo_id_reposted)
    print(json.dumps(repost_weibo, indent=1))
    print('Output repost weibo done!\n')

    # Output repost attribution
    for repost_att in repost_attributions:
        print(repost_att)
    print('Output repost attribution done!\n')
        
# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    extract_repost_attributions()
# Get the repost timeline of a weibo
def fetch_repost_timeline(weibo_api, count = 200, page = 1, weibo_id = 1):
    """Return the 'reposts' entry of the repost timeline of one weibo.

    weibo_api -- API client; its statuses.repost_timeline.get endpoint is called
    count     -- forwarded to the endpoint as ``count``
    page      -- forwarded to the endpoint as ``page``
    weibo_id  -- forwarded to the endpoint as ``id``
    """
    endpoint = weibo_api.statuses.repost_timeline
    response = endpoint.get(count=count, page=page, id=weibo_id)
    return response['reposts']

# Get weibo status
def fetch_weibo_status(weibo_api, weibo_id = 1):
    """Return whatever weibo_api.statuses.show.get yields for ``id=weibo_id``."""
    return weibo_api.statuses.show.get(id=weibo_id)

# get repost attributions
def get_rt_attributions(repost):
    """Return the screen names a repost is attributed to.

    Two sources are combined:

    * the ``screen_name`` of ``repost['retweeted_status']['user']`` when the
      status carries that structure;
    * the @mentions that follow the first "RT", "via" or Chinese "repost"
      marker found in ``repost['text']`` (case-insensitive).

    repost -- dict-like status; must contain a 'text' entry.

    Returns a list of unique, lower-cased names without the leading "@".
    The order of the list is not defined.  Raises KeyError if 'text' is
    missing.
    """
    import re

    # Regex adapted from Stack Overflow (http://bit.ly/1821y0J)
    # The u"" part carries the \u8f6c\u53d1 marker; the r"" part keeps the
    # regex escapes raw.  This spelling is valid on Python 2 and 3, unlike
    # the ur"" prefix.  re.UNICODE is required on Python 2 so that \w and \b
    # treat Chinese characters as word characters; without it the Chinese
    # marker never matches before " @name" and Chinese screen names are not
    # captured.
    rt_patterns = re.compile(
        u"(RT|via|\u8f6c\u53d1)" r"((?:\b\W*@\w+)+)",
        re.IGNORECASE | re.UNICODE)
    rt_attributions = []

    # Inspect the tweet to see if it was produced with /statuses/retweet/:id.
    # See https://dev.twitter.com/docs/api/1.1/get/statuses/retweets/%3Aid.
    # A retweeted_status without a user entry is skipped instead of raising
    # KeyError (presumably the case for removed originals -- TODO confirm).
    original = repost.get('retweeted_status') or {}
    screen_name = (original.get('user') or {}).get('screen_name')
    if screen_name:
        rt_attributions.append(screen_name.lower())

    # Also, inspect the tweet for the presence of "legacy" retweet patterns
    # such as "RT" and "via", which are still widely used for various reasons
    # and potentially very useful. See https://dev.twitter.com/discussions/2847
    # and https://dev.twitter.com/discussions/1748 for some details on how/why.
    #
    # Example of what findall returns:
    #   rt_patterns.findall('RT @SocialWebMining')
    #       -> [('RT', ' @SocialWebMining')]
    #   rt_patterns.findall('RT @SocialWebMining')[0][1]
    #       -> ' @SocialWebMining'
    # A name shown as escapes such as '\u798f\u5229\u6d3e\u9001\u673a' can be
    # made readable on Python 2 with .decode("unicode_escape").
    matches = rt_patterns.findall(repost['text'])
    if matches:
        # Only the first marker's mention group is used; it is a
        # whitespace-separated run of "@name" tokens.
        rt_attributions += [mention.strip() for mention in matches[0][1].split()]

    # Filter out any duplicates
    return list(set(rta.strip("@").lower() for rta in rt_attributions))

Result:

callback_url: https://api.weibo.com/oauth2/authorize?redirect_uri=http%3A//apps.weibo.com/guaguastd&response_type=code&client_id=2925245021
return_redirect_uri: http://weibo.com/login.php?url=http%3A%2F%2Fapps.weibo.com%2Fguaguastd%3Fcode%3D9d0a0ecb4df4db1d8d1a6ef5460c5e82
code: ['9d0a0ecb4df4db1d8d1a6ef5460c5e82']
now_handle: ce2b7c50-9531-11e4-b8c2-7bd88716b5dd
http://passport.weibo.com/
all_handles: [u'ce2b7c50-9531-11e4-b8c2-7bd88716b5dd', u'd3ba1000-9531-11e4-b8c2-7bd88716b5dd']
search done!
mids entities done!
names entities done!
texts entities done!
dates entities done!
reposts entities done!
comments entities done!
likes entities done!
reposts number: 6964, weibo_id_reposted: 3795801400243898
{
 "reposts_count": 6975, 
 "truncated": false, 
 "text": "1 toy 1 day\uff0c\u7b2c178\u671f\uff1a\u7f8e\u56fdBluelounge\uff0diPhone 5/5s\u6700\u4f73\u89c2\u770b\u89d2\u5ea6\u5145\u7535\u57fa\u5ea7\u3002\u624b\u673a\u653e\u5728\u684c\u4e0a\u5145\u7535\uff0c\u60f3\u770b\u4e00\u4e9b\u4e1c\u897f\uff0c\u611f\u89c9\u603b\u662f\u4e0d\u8212\u670d\u3002\u6709\u4e86\u5b83\uff0c\u4e0d\u4ec5\u5916\u89c2\u9ad8\u5927\u4e0a\uff0c\u8fd8\u8ba9\u4f60\u6709\u4e2a\u66f4\u597d\u7684\u89c2\u770b\u89d2\u5ea6\uff0c\u5145\u7535\u65f6\u7528\u8d77\u6765\u4e5f\u662f\u90a3\u4e48\u987a\u7545\u81ea\u5982\uff08\u8fd9\u662f\u6211\u9001\u51fa\u7684\u7b2c2232\u4ef6\u793c\u7269\uff0c\u5173\u8f6c\uff0c1\u67086\u65e5\u62bd\uff0c\u4e0d\u52301\u5929\uff0c\u5bf9\uff0c\u53ea\u5728\u7231\u8d34\uff0c\u56e0\u4e3a\u6709\u4f60\uff01\uff09", 
 "visible": {
  "type": 0, 
  "list_id": 0
 }, 
 "in_reply_to_status_id": "", 
 "bmiddle_pic": "http://ww1.sinaimg.cn/bmiddle/005wRYdajw1enz2uspb4xj313y0pgmza.jpg", 
 "id": 3795801400243898, 
 "thumbnail_pic": "http://ww1.sinaimg.cn/thumbnail/005wRYdajw1enz2uspb4xj313y0pgmza.jpg", 
 "mid": "3795801400243898", 
 "source": "<a href=\"http://weibo.com/\" rel=\"nofollow\">\u5fae\u535a weibo.com</a>", 

新浪微博数据挖掘食谱之十: 元素篇 (提取转发微博的元素)