首页 > 代码库 > 双向最大匹配分词算法

双向最大匹配分词算法

#!/usr/bin/python

#encoding=gbk

import sys

 

# Upper bound, in characters, on the candidate words tried by the matchers.
# Starts at 5 and is raised by initDct() when the dictionary holds longer words.
dictMaxLength = 5

# Dictionary filled by initDct(): encoded word -> its full dictionary line.
dctDict = {}

# Codec used to convert between encoded byte strings and unicode text.
encoding = 'gbk'

'''
Initialize the dictionary and the maximum word length.
'''

def initDct(dct):
    """Load the word dictionary and update the maximum word length.

    For every non-blank line, the text before the first tab is taken as the
    word. The word (still an encoded byte string) becomes a key of the global
    ``dctDict`` mapped to the whole stripped line, and the global
    ``dictMaxLength`` is raised whenever a word has more characters than the
    current value.

    Args:
        dct: path of the dictionary file, encoded in ``encoding``.
    """
    global dictMaxLength
    # Binary mode keeps the keys as raw encoded bytes, which is the form the
    # matchers use for lookups. The ``with`` block closes the file even if a
    # line fails to decode.
    with open(dct, "rb") as dctobj:
        for line in dctobj:
            line = line.strip()
            word = line.split(b"\t")[0].strip()
            if not word:
                # Blank line or empty first field: nothing to register.
                continue
            dctDict[word] = line
            # Word length is measured in characters, not bytes.
            wordLen = len(word.decode(encoding))
            if wordLen > dictMaxLength:
                dictMaxLength = wordLen

'''
Forward maximum matching algorithm.
'''

def maximunMathching(sent):
    """Segment ``sent`` with forward maximum matching.

    Scanning left to right, the longest word found in ``dctDict`` (at most
    ``dictMaxLength`` characters) that starts at the current position is
    taken. When no multi-character word matches, a single character is
    emitted instead; a single space character is skipped.

    Args:
        sent: sentence as a byte string encoded in ``encoding``.

    Returns:
        The tokens, as encoded byte strings, joined by single spaces.
    """
    text = sent.strip().decode(encoding)
    textLen = len(text)
    tokens = []
    index = 0
    while index < textLen:
        # Never build a candidate that runs past the end of the sentence.
        longest = min(dictMaxLength, textLen - index)
        for size in range(longest, 1, -1):
            candidate = text[index:index + size].encode(encoding)
            if candidate in dctDict:
                tokens.append(candidate)
                index += size
                break
        else:
            # No multi-character word starts here: fall back to one character.
            single = text[index:index + 1].encode(encoding)
            if single != b" ":
                tokens.append(single)
            index += 1
    return b" ".join(tokens).strip()

'''
Reverse (backward) maximum matching algorithm.
'''

def reverseMaximunMathching(sent):
    """Segment ``sent`` with reverse (right-to-left) maximum matching.

    Scanning from the end of the sentence, the longest word found in
    ``dctDict`` (at most ``dictMaxLength`` characters) that ends at the
    current position is taken. When no multi-character word matches, a single
    character is emitted instead; a single space character is skipped.

    Args:
        sent: sentence as a byte string encoded in ``encoding``.

    Returns:
        The tokens in sentence order, as encoded byte strings, joined by
        single spaces.
    """
    text = sent.strip().decode(encoding)
    tokens = []
    end = len(text)
    # Stop as soon as the whole sentence is consumed. Running one more pass
    # at position 0 would append an empty token and leave a stray leading
    # separator in the result.
    while end > 0:
        # Never build a candidate that starts before the sentence.
        longest = min(dictMaxLength, end)
        for size in range(longest, 1, -1):
            candidate = text[end - size:end].encode(encoding)
            if candidate in dctDict:
                tokens.append(candidate)
                end -= size
                break
        else:
            # No multi-character word ends here: fall back to one character.
            single = text[end - 1:end].encode(encoding)
            if single != b" ":
                tokens.append(single)
            end -= 1
    # Tokens were collected right to left; restore sentence order.
    tokens.reverse()
    return b" ".join(tokens)

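'''
Bidirectional maximum matching: when the forward and reverse results differ,
prefer the one with fewer non-dictionary words, fewer single-character
dictionary words and fewer words in total.
'''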

def segmenter(sent):
    """Segment ``sent`` using both matching directions.

    Runs the forward and the reverse matcher on the same sentence. If the two
    stripped results are equal that result is returned; otherwise the choice
    is delegated to ``bmmResult``.

    Args:
        sent: sentence to segment, passed unchanged to both matchers.

    Returns:
        The chosen segmentation.
    """
    forward = maximunMathching(sent).strip()
    backward = reverseMaximunMathching(sent).strip()
    if forward != backward:
        # The directions disagree: let the heuristic pick one.
        return bmmResult(forward, backward)
    return forward

 

'''
Selection criteria: the fewer non-dictionary words, single-character
dictionary words and total words, the better.
'''

def _segStats(words):
    """Return (non-dictionary count, single-character count, total) for tokens."""
    oovNum = 0
    sigNum = 0
    for word in words:
        if word not in dctDict:
            oovNum += 1
        if len(word.decode(encoding)) == 1:
            sigNum += 1
    return oovNum, sigNum, len(words)


def bmmResult(mm, rmm):
    """Choose between a forward and a reverse segmentation.

    Three counts are compared, each one being better when smaller: tokens
    missing from ``dctDict``, single-character tokens, and total tokens. The
    segmentation that is strictly better on a count scores one point for it.

    Args:
        mm: forward segmentation, space-separated encoded byte string.
        rmm: reverse segmentation, space-separated encoded byte string.

    Returns:
        ``mm`` when it scores more points than ``rmm``; otherwise ``rmm``
        (ties go to the reverse result).
    """
    mmStats = _segStats(mm.split(b" "))
    rmmStats = _segStats(rmm.split(b" "))

    mmScore = 0
    rmmScore = 0
    for mmCount, rmmCount in zip(mmStats, rmmStats):
        if mmCount < rmmCount:
            mmScore += 1
        elif mmCount > rmmCount:
            rmmScore += 1

    # The scores of the two candidates must be compared with each other;
    # comparing a score with itself can never select the forward result.
    if mmScore > rmmScore:
        return mm
    return rmm

            

 

def handleFile(input, output):
    """Segment every line of a file and write the results to another file.

    Each input line is stripped, has its ASCII letters lowercased, is passed
    to ``segmenter`` and the stripped result is written as one output line.
    The running line number is printed every 100000 lines.

    Args:
        input: path of the file to read, encoded in ``encoding``.
        output: path of the file to write (overwritten).
    """
    # Lowercasing is done on decoded text and limited to ASCII A-Z. Calling
    # lower() on the raw byte string is unsafe for GBK, whose trail bytes can
    # fall in 0x41-0x5A: the second byte of such characters would be
    # rewritten, turning them into different characters.
    asciiLower = dict((code, code + 32)
                      for code in range(ord("A"), ord("Z") + 1))
    # Nested ``with`` blocks close both files even when segmentation fails.
    with open(input) as inputobj:
        with open(output, "w") as outputobj:
            for index, line in enumerate(inputobj, 1):
                if index % 100000 == 0:
                    print(str(index) + "\r")
                text = line.strip().decode(encoding)
                line = text.translate(asciiLower).encode(encoding)
                seg = segmenter(line)
                outputobj.write(seg.strip() + "\n")

    

# Command-line entry point: <script> dict inFile outFile
if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage %s dict[in] inFile[in] outFile[out]." % sys.argv[0])
        sys.exit(-1)

    # Local names chosen so the builtin input() is not shadowed at module level.
    dctPath, inPath, outPath = sys.argv[1:4]

    # The dictionary must be loaded before any sentence is segmented.
    initDct(dctPath)
    # Quick manual check of a single sentence:
    #sent = "chien中华人民共和国在1949年成立了"
    #print segmenter(sent)
    handleFile(inPath, outPath)

 

双向最大匹配分词算法