首页 > 代码库 > 任一英文的纯文本文件,统计其中的单词出现个数

任一英文的纯文本文件,统计其中的单词出现个数

第一版: 效率低

def count_words_v1(path):
    """Count word occurrences in an English plain-text file (version 1).

    Scans the file character by character: consecutive alphanumeric
    characters build up a word, and any whitespace character ends it.
    Other characters (punctuation) are skipped, so "can't" counts as
    "cant" — same behavior as the original snippet.

    :param path: path of the text file to read
    :return: dict mapping lowercased word -> occurrence count
    """
    letters = []        # characters of the word currently being built
    words_dict = {}
    with open(path, encoding="utf-8", newline="") as f:
        for letter in f.read():
            if letter.isalnum():
                letters.append(letter)
            elif letter.isspace():  # whitespace (space, \t, \n) ends a word
                if letters:
                    word = "".join(letters).lower()  # normalize to lowercase
                    words_dict[word] = words_dict.get(word, 0) + 1
                    letters = []
    # Handle the final word when the file does not end with whitespace.
    if letters:
        word = "".join(letters).lower()
        words_dict[word] = words_dict.get(word, 0) + 1
    return words_dict


if __name__ == "__main__":
    for k, v in count_words_v1("test.txt").items():
        print(k, v)

第二版:

缺点:遇到大文件要一次读入内存,性能不好

import re
from collections import Counter


def count_words_v2(path):
    """Count word occurrences using a regex over the whole file (version 2).

    Drawback (as noted above): reads the entire file into memory at once,
    so it is not suitable for very large files.

    Words are matched with ``\\w+`` and lowercased before counting.
    Counter is a single O(n) pass; the original dict comprehension called
    ``word_list.count(word)`` per unique word, which is O(n^2).

    :param path: path of the text file to read
    :return: dict mapping lowercased word -> occurrence count
    """
    word_re = re.compile(r"\w+")
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()
    words = [w.lower() for w in word_re.findall(data)]
    return dict(Counter(words))


if __name__ == "__main__":
    for k, v in count_words_v2("test.txt").items():
        print(k, v)

第三版:

from collections import Counter


def count_words_v3(path):
    """Count word occurrences by streaming the file line by line (version 3).

    Avoids version 2's drawback of loading the whole file into memory:
    iterating the file object yields one line at a time. Each line is
    split on whitespace (simpler than the regex), so punctuation attached
    to a word is kept (e.g. "world!" is its own token). Tokens are
    lowercased for consistency with versions 1 and 2.

    :param path: path of the text file to read
    :return: dict mapping lowercased token -> occurrence count
    """
    counts = Counter()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Counter.update is O(len(line)); no need to keep a full
            # word list and re-count it afterwards.
            counts.update(token.lower() for token in line.split())
    return dict(counts)


if __name__ == "__main__":
    for k, v in count_words_v3("test.txt").items():
        print(k, v)

 

任一英文的纯文本文件,统计其中的单词出现个数