首页 > 代码库 > 音字转换实验、HMM+viterbi

音字转换实验、HMM+viterbi

RT,NLP实验二。音字转换,其中用到的思想比较基本、比较老。

1.首先统计unigram和bigram的频数

2.词作为状态集,音作为观测序列。

3.计算转移矩阵概率和发射矩阵概率,建立HMM模型

4.给定HMM模型和观测序列,采用viterbi算法动态规划解码。

viterbi.py

# -*- coding: cp936 -*-
"""
    viterbi.py
    author:messiandzcy
    date:2014.12.11
"""
import sys
#from numpy import *
#申请矩阵
def matrix(rows,cols):
    mat = [[0 for col in range(cols)]for row in range(rows)]
    return mat
#加载unigram
def LoadUnigram():
    print "Loading Unigram..."
    fp = open("unigramSorted.txt","r")
    uni={}
    iters=1 #控制读入行数
    for line in fp:
        w,freq = line.split()
        if len(w)==2 and int(freq)>=2700:#w in debug:   #状态集只包含频繁单字
            uni[w]=int(freq) 
            #iters += 1
        #uni[w]=int(freq)
        #if iters>=1300:break     #控制状态集的大小,有内存溢出问题
    fp.close()
    print "Loaded Unigram SUCCESS!"
    print len(uni)
    return uni
#加载bigram
def LoadBigram():
    print "Loading Bigram..."
    fp = open("bigramSorted.txt","r")
    big={}
    iters=1
    for line in fp:
        w,freq = line.split()
        big[w]=int(freq)
        iters += 1
    fp.close()
    print "Loaded Bigram SUCCESS!"
    return big
#输出测试
def printf(A,m,n):
    for i in range(m):
        for j in range(n):
            print "%.8lf"%(A[i][j]),
        print
#首先计算状态转移概率矩阵A
def computeA(uni,big):
    num1,num2=len(uni),len(big)
    #print uni
    sum_uni,sum_big = 0,0 #计算总频数
    for w in uni:
        sum_uni+=uni[w]
    for w in big:
        sum_big+=big[w]
    #print sum_uni,sum_big
    lst = uni.keys() #lst存储int和键的映射
    #print "num1=%d"%num1
    #print lst
    A = matrix(num1,num1) #申请转移矩阵,每个下标号对应字典中一个词
    #开始计算转移概率矩阵
    for i in range(num1):
        for j in range(num1):
            fenmu = (uni.get(lst[i],0)+1.0)/(sum_uni+num1) #Laplace smooth
            fenzi = (big.get(lst[i]+lst[j],0)+1.0)/(sum_big+num2)
            A[i][j]=fenzi/fenmu
        #sys.stdout.write('\n')
    #print "A="
    #printf(A,num1,num1)
    return (A,lst)
#加载lexicon.txt
def LoadLexicon():
    print "Loading 'lexicon.txt'..."
    fp = open("lexicon.txt","r")
    lex={} #字典:键是单词,值是音标的列表
    iters=1
    for line in fp:
        tmp = []
        word = line.split()[0]
        if word in lex: #如果单词已经在词典中
            for yin in line.split()[1:]:lex[word].append(yin[:-1])
        else:   #新开一个单词
            for yin in line.split()[1:]:tmp.append(yin[:-1])
            lex[word]=tmp
        #if iters>=12:break
        iters += 1
    #对词典的值进行列表去重
    for key in lex:
        lex[key]=list(set(lex[key]))
    fp.close()
    print "Loaded 'lexicon.txt' SUCCESS!"
    return lex
#计算发射概率矩阵B
def computeB(lst,lex):
    yin = []
    for key in lex:
        #print key
        #print lex[key]
        yin += lex[key]
    yin = list(set(yin))    #去重
    #print yin
    num1,num2=len(lst),len(yin)
    B = matrix(num1,num2) #申请发射矩阵,行是词,列是音
    #print B
    for i in range(num1):   #对于每个词
        yinlst=lex.get(lst[i])  #lst[i]全部是单字
        if yinlst is None:continue
        for yinj in yinlst:
            j = yin.index(yinj)
            B[i][j]=1.0/len(yinlst)
            #print B[i][j]
            #print yinj
        #print yinlst
    #print B
    #print len(yin)
    return (B,yin)
#给定HMM模型,开始解码
def viterbi(A,B,lst,yin,string):
    states =len(lst)    #总状态数
    iters = len(string) #迭代次数
    pi = [1.0/states for i in range(states)] #初始
    fai = matrix(iters,states)    #dp记录最优路径
    delta = matrix(iters,states)    #主要计算矩阵
    #初始化
    for i in range(states): #对每个状态
        delta[0][i]=pi[i]*B[i][yin.index(string[0])]
    #print delta[0]
    #迭代
    t = 1
    while t<iters:
        for i in range(states): #对每个状态
            left = []
            maxx,keep = 0,0
            for j in range(states):
                if delta[t-1][j]*A[j][i]>maxx:
                    maxx=delta[t-1][j]*A[j][i]
                    keep = j
            delta[t][i]=maxx*B[i][yin.index(string[t])]
            fai[t][i]=keep
        t += 1
    #print fai
    #print delta
    #终止
    P = 0
    for i in range(states):
        if delta[t-1][i]>=P:
            P=delta[t-1][i]
            end = i
    print P
    #end是终止状态
    res = []
    for k in range(t-1,-1,-1):
        res.append(lst[end])
        end = fai[k][end]
    print "".join(res[::-1])
    #print string

#主函数
uni=LoadUnigram() #一元词典
big=LoadBigram()    #二元词典
lex=LoadLexicon()   #音词转换表
A,lst=computeA(uni,big) #lst映射表必须跟着
B,yin=computeB(lst,lex) #yin映射表也必须返回
while True: #for test
    print "input a string(split by ' ')"
    string=raw_input().split()
    #string=['yi','zhi','mei','li','de','xiao','hua']
    viterbi(A,B,lst,yin,string) #decode
    #print B[lst.index(m)][yin.index(n)]
"""
#maxNum=7136 #申请43373的矩阵时会溢出
#matrix = arange(maxNum*maxNum)
#a = [0 for i in range(maxNum)]
#mat = a*maxNum
#matrix(maxNum,maxNum)
#print "hehe"
"""


音字转换实验、HMM+viterbi