首页 > 代码库 > Python 提取新浪微博的博文中的元素(包含Text, Screen_name)

Python 提取新浪微博的博文中的元素(包含Text, Screen_name)

CODE:

#!/usr/bin/python 
# -*- coding: utf-8 -*-

'''
Created on 2014-7-8
@author: guaguastd
@name: extractWeiboEntities.py
'''

if __name__ == '__main__':
    # Demo driver: fetch statuses from the public timeline, split them into
    # entity lists with extractWeiboEntities, and print the first five items
    # of each list as indented JSON.
    #
    # The imports stay inside the guard and in their original order, so
    # importing this file as a module does not load the project helpers or
    # call sinaWeiboLogin().

    import json

    # Get the API handle used to access the Sina API.
    from sinaWeiboLogin import sinaWeiboLogin
    sinaWeiboApi = sinaWeiboLogin()

    # Entity extraction helper from the project's sinaWeibo module.
    from sinaWeibo import extractWeiboEntities

    # Public timeline helper from the project's sinaWeiboStatuses module.
    from sinaWeiboStatuses import publicTimeline

    # Get the newest 200 weibo statuses and unpack the three entity lists.
    statuses = publicTimeline(sinaWeiboApi, 200)
    status_texts, screen_names, words = extractWeiboEntities(statuses)

    # Explore the first 5 items of each list.  Single-argument print(...)
    # is valid on both Python 2 and Python 3; the bare print statement is a
    # SyntaxError on Python 3.
    for entities in (status_texts, screen_names, words):
        print(json.dumps(entities[0:5], indent=1))

RESULT:

[
 "[\u795e\u9a6c]2014\u590f\u5b63\u65b0\u6b3e\u5973\u88c5\u97e9\u56fd\u4e1c\u5927\u95e8\u4ee3\u8d2d \u65e0\u8896t\u6064\u5973\u4fee\u8eab\u5706\u9886\u663e\u7626\u96ea\u7eba\u4e0a\u8863  http://t.cn/RvCUVwB", 
 "\u52ff\u5fd8\u56fd\u803b\uff0c\u632f\u5174\u4e2d\u534e\uff01\u81f3\u4eca\u65e0\u6cd5\u5fd8\u8bb0\u65e5\u5bc7\u523a\u5200\u4e0a\u7684\u5a74\u513f\uff01\uff01\uff01\uff01\uff01\uff01\uff01\u75db\u5fc3\u75be\u9996 \u6211\u5206\u4eab\u4e86http://t.cn/Rvdm1cn", 
 "\u7626\u8138\u7684\u4ea7\u54c1\u7528\u8fc7\u597d\u591a\u597d\u591a\uff0c\u603b\u662f\u4ee5\u89c1\u4e0d\u5230\u6548\u679c\u7ed3\u5c40\uff01\u4f46\u662f\u4e00\u76f4\u8ffd\u6c42V\u8138\u7684\u5fc3\u6ca1\u6539\u53d8\u8fc7\uff01\u76f4\u5230\u6211\u627e\u5230\u4e86\u8fd9\u4e2a\u4f70\u8349\u4e16\u5bb6V\u8138\u795e\u5668\uff01[\u5fc3]\u6d82\u4e0a\u7acb\u523b\u5c31\u6709\u7d27\u81f4\u611f\uff0c\u7761\u524d\u6d82\u62b9\uff0c\u9192\u6765\u770b\u5230\u7684\u6548\u679c\u4f60\u771f\u7684\u4f1a\u5c16\u53eb\u7684\uff01[\u7231\u4f60]\u54ea\u91cc\u4e0d\u7626\u6d82\u54ea\u91cc\uff0c\u518d\u4e5f\u4e0d\u7528\u62c5\u5fc3\u6211\u7684\u5305\u5b50\u8138\u5566\uff01\u7f8e\u4e3d\u4fcf\u4f73\u4eba\u63a8\u8350\uff1a[\u4e2d\u7bad]http://t.cn/RvntLNh", 
 "\u5a01\u6b66MAERZ2014\u6625\u88c5\u65b0\u6b3e\u7537\u88c5\u957f\u8896\u886c\u886b \u97e9\u7248\u4fee\u8eab\u7537\u58eb\u7ecf\u5178\u7eaf\u68c9\u683c\u5b50\u886c\u8863\u6f6e  http://t.cn/RvCyu61", 
 "[\u563b\u563b]2014\u590f\u88c5\u65b0\u6b3e\u5973\u58eb\u788e\u82b1\u886c\u886b\u4fee\u8eab\u5927\u7801\u957f\u8896\u7eaf\u68c9\u5370\u82b1\u886c\u8863\u97e9\u7248\u4e0a\u8863\u6f6e  http://t.cn/RvCUIw5"
]
[
 "\u53e4\u6708\u79cb\u666f", 
 "Lcineferit", 
 "\u7efd\u653e\u9ec4\u8272\u7261\u4e39aa", 
 "\u4e8c\u9505\u9505\u4e8c\u59d0\u59d0", 
 "lang\u6d6a\u6f2b\u66f2"
]
[
 "[\u795e\u9a6c]2014\u590f\u5b63\u65b0\u6b3e\u5973\u88c5\u97e9\u56fd\u4e1c\u5927\u95e8\u4ee3\u8d2d", 
 "\u65e0\u8896t\u6064\u5973\u4fee\u8eab\u5706\u9886\u663e\u7626\u96ea\u7eba\u4e0a\u8863", 
 "http://t.cn/RvCUVwB", 
 "\u52ff\u5fd8\u56fd\u803b\uff0c\u632f\u5174\u4e2d\u534e\uff01\u81f3\u4eca\u65e0\u6cd5\u5fd8\u8bb0\u65e5\u5bc7\u523a\u5200\u4e0a\u7684\u5a74\u513f\uff01\uff01\uff01\uff01\uff01\uff01\uff01\u75db\u5fc3\u75be\u9996", 
 "\u6211\u5206\u4eab\u4e86http://t.cn/Rvdm1cn"
]