双向最大匹配分词算法

首页 > 代码库 > 双向最大匹配分词算法

2024-07-28 21:41:12 220人阅读

#!/usr/bin/python

#encoding=gbk

import sys

# Widest match window, in characters, tried by the matchers.  initDct()
# raises it when the dictionary contains a longer word.
dictMaxLength = 5

# Dictionary word (still encoded, exactly as read from the file) -> the full
# dictionary line that word came from.
dctDict = {}

# Codec used to decode dictionary words and the sentences to segment.
encoding = 'gbk'

def initDct(dct):
    """Load the word dictionary and widen the maximum word length.

    The file is read line by line.  The first tab-separated field of each
    line is the word; it is stored in ``dctDict`` (as an encoded byte string)
    mapped to the whole stripped line.  ``dictMaxLength`` is raised to the
    character length of the longest word seen, and is never lowered.

    Args:
        dct: path of the dictionary file, encoded with ``encoding``.

    Raises:
        IOError/OSError: if the file cannot be opened.
        UnicodeDecodeError: if a word is not valid in ``encoding``.
    """
    global dictMaxLength

    # Binary mode keeps words as encoded byte strings on Python 2 and 3 alike;
    # the context manager closes the file even when a line fails to decode.
    with open(dct, "rb") as dctobj:
        for line in dctobj:
            line = line.strip()
            word = line.split(b"\t")[0].strip()
            if not word:
                # Blank line (or empty first column): nothing to register.
                continue
            dctDict[word] = line
            # Length is measured in characters, not bytes, hence the decode.
            dictMaxLength = max(dictMaxLength, len(word.decode(encoding)))

def maximunMathching(sent):
    """Segment a sentence with forward maximum matching.

    Scanning left to right, the longest window (at most ``dictMaxLength``
    characters) whose encoded form is a key of ``dctDict`` is taken as the
    next token.  When no multi-character window matches, the single
    character at the cursor becomes a token, except that a lone ASCII space
    is dropped.

    Args:
        sent: the sentence as a byte string encoded with ``encoding``.

    Returns:
        The tokens, each encoded with ``encoding``, joined by single spaces.
        An empty byte string for an empty or all-whitespace sentence.
    """
    text = sent.strip().decode(encoding)
    total = len(text)
    tokens = []
    start = 0
    while start < total:
        # Never try a window that would run past the end of the sentence.
        longest = min(dictMaxLength, total - start)
        # Fallback when no dictionary word starts here: a single character.
        piece = text[start:start + 1]
        for size in range(longest, 1, -1):
            candidate = text[start:start + size]
            if candidate.encode(encoding) in dctDict:
                piece = candidate
                break
        start += len(piece)
        if piece != u" ":
            tokens.append(piece.encode(encoding))
    return b" ".join(tokens)

def reverseMaximunMathching(sent):
    """Segment a sentence with reverse (right-to-left) maximum matching.

    Scanning from the end of the sentence, the longest window (at most
    ``dictMaxLength`` characters) ending at the cursor whose encoded form is
    a key of ``dctDict`` is taken as the next token.  When no multi-character
    window matches, the single character before the cursor becomes a token,
    except that a lone ASCII space is dropped.

    Args:
        sent: the sentence as a byte string encoded with ``encoding``.

    Returns:
        The tokens in reading order, each encoded with ``encoding``, joined
        by single spaces.  An empty byte string for an empty sentence.
    """
    text = sent.strip().decode(encoding)
    tokens = []
    end = len(text)
    # Stop as soon as everything is consumed: looping once more at end == 0
    # would emit a spurious empty token (and thus a leading space).
    while end > 0:
        # Never try a window that would start before the sentence.
        longest = min(dictMaxLength, end)
        # Fallback when no dictionary word ends here: a single character.
        piece = text[end - 1:end]
        for size in range(longest, 1, -1):
            candidate = text[end - size:end]
            if candidate.encode(encoding) in dctDict:
                piece = candidate
                break
        end -= len(piece)
        if piece != u" ":
            tokens.append(piece.encode(encoding))
    # Tokens were collected right to left; restore reading order.
    tokens.reverse()
    return b" ".join(tokens)

def segmenter(sent):
    """Segment a sentence with bidirectional maximum matching.

    Runs the forward and the reverse matcher on ``sent``.  When both give
    the same segmentation it is returned directly; otherwise ``bmmResult``
    chooses between the two.
    """
    forward = maximunMathching(sent).strip()
    backward = reverseMaximunMathching(sent).strip()
    # Arbitration is only needed when the two passes disagree.
    if forward != backward:
        return bmmResult(forward, backward)
    return forward

def _segStats(words):
    """Return (out-of-dictionary count, single-character count, total)."""
    oov = sum(1 for word in words if word not in dctDict)
    single = sum(1 for word in words if len(word.decode(encoding)) == 1)
    return oov, single, len(words)


def bmmResult(mm, rmm):
    """Choose between a forward and a reverse segmentation.

    Three counts are compared, each one "lower is better": tokens that are
    not keys of ``dctDict``, single-character tokens, and total tokens.  For
    every count, the segmentation with the strictly smaller value gets one
    vote.

    Args:
        mm: forward segmentation, tokens separated by single spaces.
        rmm: reverse segmentation, same format.

    Returns:
        ``mm`` when it collects strictly more votes than ``rmm``; otherwise
        ``rmm`` (so ties go to the reverse segmentation).
    """
    mm_stats = _segStats(mm.split(b" "))
    rmm_stats = _segStats(rmm.split(b" "))

    mm_votes = 0
    rmm_votes = 0
    for mm_count, rmm_count in zip(mm_stats, rmm_stats):
        if mm_count < rmm_count:
            mm_votes += 1
        elif mm_count > rmm_count:
            rmm_votes += 1

    # The original compared the forward score with itself, so the forward
    # result could never win; compare the two scores against each other.
    if mm_votes > rmm_votes:
        return mm
    return rmm

def handleFile(input, output):
    """Segment every line of a file and write the results to another file.

    Each input line is stripped, its ASCII letters are lower-cased, and it
    is passed to ``segmenter``; the stripped segmentation is written followed
    by a newline.  The running line number is printed every 100000 lines.

    Args:
        input: path of the text file to segment, encoded with ``encoding``.
        output: path of the file to write; it is overwritten.
    """
    # Binary mode hands the segmenter encoded byte strings on Python 2 and 3
    # alike; the context managers close both files even if a line fails.
    with open(input, "rb") as inputobj, open(output, "wb") as outputobj:
        for index, line in enumerate(inputobj, 1):
            if index % 100000 == 0:
                print(str(index) + "\r")
            text = line.strip().decode(encoding)
            # Lower-case on decoded characters: doing it on the raw bytes
            # would also rewrite trail bytes of multi-byte characters that
            # happen to fall in the 'A'-'Z' range.  Only ASCII is touched,
            # which is what byte-level lower() did for plain letters.
            text = u"".join(
                ch.lower() if ch < u"\x80" else ch for ch in text
            )
            seg = segmenter(text.encode(encoding))
            outputobj.write(seg.strip() + b"\n")

# Command-line entry point: load the dictionary, then segment inFile into
# outFile.
if __name__ == '__main__':
    if len(sys.argv) != 4:
        # Parenthesised single-argument print is valid on Python 2 and 3.
        print("Usage %s dict[in] inFile[in] outFile[out]." % sys.argv[0])
        sys.exit(-1)

    dct, inFile, outFile = sys.argv[1:]

    initDct(dct)
    handleFile(inFile, outFile)

双向最大匹配分词算法

声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任，若内容有误或涉及侵权可进行投诉：投诉/举报工作人员会在5个工作日内联系你，一经查实，本站将立刻删除涉嫌侵权内容。

联系
我们

首页 > 代码库 > 双向最大匹配分词算法

双向最大匹配分词算法

看完仍有疑问？有类似问题直接问程序猿