首页 > 代码库 > 统计中国,美国,世界排名前50的关键词并进行比较
统计中国,美国,世界排名前50的关键词并进行比较
1. 获取中国的所有关键词
import json

import pymysql


def output_to_json(data, filename):
    """Serialize ``data`` as JSON, write it to ``filename`` and return the text.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to create or overwrite.

    Returns:
        The JSON string that was written to the file.
    """
    serialized = json.dumps(data)
    # The ``with`` block closes the file on exit, so no explicit close().
    with open(filename, 'w') as out_file:
        out_file.write(serialized)
    return serialized


# Export the keyword string of every record whose author information
# mentions China and whose keyword string is not empty.
conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='',
    db='python',
)
try:
    cursor = conn.cursor()
    # ``AND`` is used instead of the non-standard, deprecated ``&&``.
    sql = ("SELECT union_kwd_str,pmc_id FROM alzheimer "
           "where authorinfor like '%china%' AND union_kwd_str != ''")
    # execute() reports how many rows the query matched.
    row_count = cursor.execute(sql)
    print(row_count)
    # Original author's note: the result held 7887 rows when the script was
    # written (the figure may be stale).
    rows = cursor.fetchall()
finally:
    # Release the connection even when the query fails.
    conn.close()

# Column 0 is the keyword string, column 1 the PMC id.
abstract_list = [row[0] for row in rows]
pmc_id_dict = {index: row[1] for index, row in enumerate(rows)}

output_data = {
    'country': "china",
    'count': row_count,
    'keyword': abstract_list,
}
output_to_json(output_data, '1203_china_kwd.json')
选出排名前50的关键词
import collections
import json


def input_from_json(filename):
    """Load and return the JSON document stored in ``filename``."""
    with open(filename, 'r') as in_file:
        return json.load(in_file)


def count_word(path):
    """Count how often each comma-separated keyword occurs.

    Args:
        path: JSON file whose top-level ``keyword`` entry is a list of
            comma-separated keyword strings.

    Returns:
        A dict mapping every keyword to its number of occurrences.
    """
    keyword_list = input_from_json(path)['keyword']
    counts = collections.Counter(
        word
        for all_the_text in keyword_list
        for word in all_the_text.split(',')
    )
    return dict(counts)


def sort_by_count(d):
    """Return an ``OrderedDict`` of ``d`` ordered by descending count.

    The sort is stable, so keywords with equal counts keep the relative
    order they had in ``d``.
    """
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    return collections.OrderedDict(ranked)


def top_keywords(sorted_counts, limit=50):
    """Return the first ``limit`` ``(name, count)`` pairs of ``sorted_counts``.

    Underscores in keyword names are replaced by spaces for display.
    """
    leading = list(sorted_counts.items())[:limit]
    return [(key.replace('_', ' '), value) for key, value in leading]


def main():
    """Write the 50 most frequent China keywords to two files."""
    file_name = "1203_china_kwd.json"
    dword = sort_by_count(count_word(file_name))
    top_entries = top_keywords(dword)

    # Plain list of keyword names under the 'china_kwd' key.
    top_china_kwd_list = [name for name, _ in top_entries]
    with open('1204_top50_china_kwd_list.json', 'w') as fobj1:
        fobj1.write(json.dumps({'china_kwd': top_china_kwd_list}))

    # Comma-separated {"name": ..., "value": ...} objects without enclosing
    # brackets (a fragment, not a complete JSON document).  join() avoids a
    # dangling comma when fewer than 50 keywords exist.
    # NOTE: the file name says "top15" but it receives up to 50 entries; the
    # name is kept so existing consumers still find the file.
    with open('1203_top15_china_kwd.json', 'w') as fobj2:
        fobj2.write(','.join(
            json.dumps({'name': name, 'value': value})
            for name, value in top_entries
        ))


if __name__ == '__main__':
    main()
2.获取美国的所有关键词,并做统计,与中国的统计代码相似,下一步工作是整合代码。
import json

import pymysql


def output_to_json(data, filename):
    """Serialize ``data`` as JSON, write it to ``filename`` and return the text.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to create or overwrite.

    Returns:
        The JSON string that was written to the file.
    """
    serialized = json.dumps(data)
    # The ``with`` block closes the file on exit, so no explicit close().
    with open(filename, 'w') as out_file:
        out_file.write(serialized)
    return serialized


# Export the keyword string of every record whose author information
# mentions the USA and whose keyword string is not empty.
conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='',
    db='python',
)
try:
    cursor = conn.cursor()
    # ``AND`` is used instead of the non-standard, deprecated ``&&``.
    sql = ("SELECT union_kwd_str,pmc_id FROM alzheimer "
           "where authorinfor like '%USA%' AND union_kwd_str != ''")
    # execute() reports how many rows the query matched.
    row_count = cursor.execute(sql)
    print(row_count)
    rows = cursor.fetchall()
finally:
    # Release the connection even when the query fails.
    conn.close()

# Column 0 is the keyword string, column 1 the PMC id.
abstract_list = [row[0] for row in rows]
pmc_id_dict = {index: row[1] for index, row in enumerate(rows)}

output_data = {
    'country': "USA",
    'count': row_count,
    'keyword': abstract_list,
}
output_to_json(output_data, '1204_USA_kwd.json')
美国前50的关键词
import collections
import json


def input_from_json(filename):
    """Load and return the JSON document stored in ``filename``."""
    with open(filename, 'r') as in_file:
        return json.load(in_file)


def count_word(path):
    """Count how often each comma-separated keyword occurs.

    Args:
        path: JSON file whose top-level ``keyword`` entry is a list of
            comma-separated keyword strings.

    Returns:
        A dict mapping every keyword to its number of occurrences.
    """
    keyword_list = input_from_json(path)['keyword']
    counts = collections.Counter(
        word
        for all_the_text in keyword_list
        for word in all_the_text.split(',')
    )
    return dict(counts)


def sort_by_count(d):
    """Return an ``OrderedDict`` of ``d`` ordered by descending count.

    The sort is stable, so keywords with equal counts keep the relative
    order they had in ``d``.
    """
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    return collections.OrderedDict(ranked)


def top_keywords(sorted_counts, limit=50):
    """Return the first ``limit`` ``(name, count)`` pairs of ``sorted_counts``.

    Underscores in keyword names are replaced by spaces for display.
    """
    leading = list(sorted_counts.items())[:limit]
    return [(key.replace('_', ' '), value) for key, value in leading]


def main():
    """Write the 50 most frequent USA keywords to two files."""
    file_name = "1204_USA_kwd.json"
    dword = sort_by_count(count_word(file_name))
    top_entries = top_keywords(dword)

    # Plain list of keyword names under the 'USA_kwd' key.
    top_USA_kwd_list = [name for name, _ in top_entries]
    with open('1204_top50_USA_kwd_list.json', 'w') as fobj1:
        fobj1.write(json.dumps({'USA_kwd': top_USA_kwd_list}))

    # Comma-separated {"name": ..., "value": ...} objects without enclosing
    # brackets (a fragment, not a complete JSON document).  join() avoids a
    # dangling comma when fewer than 50 keywords exist.
    with open('1204_top50_USA_kwd.json', 'w') as fobj2:
        fobj2.write(','.join(
            json.dumps({'name': name, 'value': value})
            for name, value in top_entries
        ))


if __name__ == '__main__':
    main()
3. 获取世界的所有关键词
import json

import pymysql


def output_to_json(data, filename):
    """Serialize ``data`` as JSON, write it to ``filename`` and return the text.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to create or overwrite.

    Returns:
        The JSON string that was written to the file.
    """
    serialized = json.dumps(data)
    # The ``with`` block closes the file on exit, so no explicit close().
    with open(filename, 'w') as out_file:
        out_file.write(serialized)
    return serialized


# Export the keyword string of every record, regardless of country, whose
# keyword string is not empty.
conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='',
    db='python',
)
try:
    cursor = conn.cursor()
    sql = ("SELECT union_kwd_str,pmc_id FROM alzheimer "
           "where union_kwd_str != ''")
    # execute() reports how many rows the query matched.
    row_count = cursor.execute(sql)
    print(row_count)
    rows = cursor.fetchall()
finally:
    # Release the connection even when the query fails.
    conn.close()

# Column 0 is the keyword string, column 1 the PMC id.
abstract_list = [row[0] for row in rows]
pmc_id_dict = {index: row[1] for index, row in enumerate(rows)}

output_data = {
    'country': "world",
    'count': row_count,
    'keyword': abstract_list,
}
output_to_json(output_data, '1203_world_kwd.json')
世界前50关键词
import collections
import json


def input_from_json(filename):
    """Load and return the JSON document stored in ``filename``."""
    with open(filename, 'r') as in_file:
        return json.load(in_file)


def count_word(path):
    """Count how often each comma-separated keyword occurs.

    Args:
        path: JSON file whose top-level ``keyword`` entry is a list of
            comma-separated keyword strings.

    Returns:
        A dict mapping every keyword to its number of occurrences.
    """
    keyword_list = input_from_json(path)['keyword']
    counts = collections.Counter(
        word
        for all_the_text in keyword_list
        for word in all_the_text.split(',')
    )
    return dict(counts)


def sort_by_count(d):
    """Return an ``OrderedDict`` of ``d`` ordered by descending count.

    The sort is stable, so keywords with equal counts keep the relative
    order they had in ``d``.
    """
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    return collections.OrderedDict(ranked)


def top_keywords(sorted_counts, limit=50):
    """Return the first ``limit`` ``(name, count)`` pairs of ``sorted_counts``.

    Underscores in keyword names are replaced by spaces for display.
    """
    leading = list(sorted_counts.items())[:limit]
    return [(key.replace('_', ' '), value) for key, value in leading]


def main():
    """Write the 50 most frequent worldwide keywords to two files."""
    file_name = "1203_world_kwd.json"
    dword = sort_by_count(count_word(file_name))
    top_entries = top_keywords(dword)

    # Plain list of keyword names under the 'world_kwd' key.
    top_world_kwd_list = [name for name, _ in top_entries]
    with open('1204_top50_world_kwd_list.json', 'w') as fobj1:
        fobj1.write(json.dumps({'world_kwd': top_world_kwd_list}))

    # Comma-separated {"name": ..., "value": ...} objects without enclosing
    # brackets (a fragment, not a complete JSON document).  join() avoids a
    # dangling comma when fewer than 50 keywords exist.
    # NOTE: the file name says "top15" but it receives up to 50 entries; the
    # name is kept so existing consumers still find the file.
    with open('1203_top15_world_kwd.json', 'w') as fobj2:
        fobj2.write(','.join(
            json.dumps({'name': name, 'value': value})
            for name, value in top_entries
        ))


if __name__ == '__main__':
    main()
4.比较中国与美国的关键词有哪些相似的,以及中国与世界的研究热点有哪些相似的
import json


def input_from_json(filename):
    """Load and return the JSON document stored in ``filename``."""
    with open(filename, 'r') as in_file:
        return json.load(in_file)


def shared_keywords(first, second):
    """Return the keywords present in both iterables, sorted alphabetically.

    Sorting makes the printed output reproducible between runs; a bare set
    intersection has no defined order.
    """
    return sorted(set(first) & set(second))


def report_shared(keywords):
    """Print every keyword on its own line, then the number of keywords."""
    for kwd in keywords:
        # On Python 2 the json module yields ``unicode`` objects, which are
        # encoded before printing; on Python 3 keywords are already ``str``.
        if not isinstance(kwd, str):
            kwd = kwd.encode('utf-8')
        print(kwd)
    print(len(keywords))


def main():
    """Compare China's top keywords with the world's and with the USA's."""
    china_path = '1204_top50_china_kwd_list.json'
    world_path = '1204_top50_world_kwd_list.json'
    USA_path = '1204_top50_USA_kwd_list.json'

    china_kwd_list = input_from_json(china_path)['china_kwd']
    world_kwd_list = input_from_json(world_path)['world_kwd']
    USA_kwd_list = input_from_json(USA_path)['USA_kwd']

    china_world_same_kwd = shared_keywords(china_kwd_list, world_kwd_list)
    report_shared(china_world_same_kwd)

    # Blank separator between the two reports.
    print('\n')

    # Each report prints the size of its own list.
    china_USA_same_kwd = shared_keywords(china_kwd_list, USA_kwd_list)
    report_shared(china_USA_same_kwd)


if __name__ == '__main__':
    main()
统计中国,美国,世界排名前50的关键词并进行比较
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。