首页 > 代码库 > 【算法】基于树形结构分词
【算法】基于树形结构分词
1 #!/usr/bin/env python 2 #encoding=gbk 3 import os 4 import sys 5 6 G_ENCODING="gbk" 7 """ 8 =============================== 9 中文分词 10 1. 机械分词 11 2. 统计分词 12 3. 理解分词 13 =============================== 14 基于树形结构分词策略(结合机械分词,统计分词) 15 例:笔记本电脑 16 dict = {"笔":0.8,"记":0.8,"本":0.8,"电":0.8,"脑":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9} 17 ------------------------------- 18 | <s> | 19 ------------------------------- 20 / / \ 21 [笔] [笔记] [笔记本] [笔记本电脑] 22 / / / 23 [记] [本] [电] [电脑] 24 / / 25 [本] [电] [电脑] 26 / \ / 27 [电] [电脑] [脑] 28 / 29 [脑] 30 ------------------------------- 31 path: 笔 记 本 电 脑 -- score: [0.32768] 32 path: 笔 记 本 电脑 -- score: [0.4608] 33 path: 笔记 本 电 脑 -- score: [0.4608] 34 path: 笔记 本 电脑 -- score: [0.648] 35 path: 笔记本 电 脑 -- score: [0.576] 36 path: 笔记本 电脑 -- score: [0.81] 37 path: 笔记本电脑 -- score: [0.9] 38 39 best path: 笔记本电脑 -- score: [0.9] 40 41 ------------------------------- 42 1、路径加权(通过搜索引擎获取词语的词频,获得词语的权重) 43 2、最少切分、OOV、最少单字等策略 44 ==获取最佳分词路径 45 ------------------------------- 46 Q1、如果句子过长,树非常大,遍历费时(需优化) 47 Q2、字典加载(需优化) 48 以下给出该思想的简单实现[python]: 49 """ 50 51 class Stack(): 52 def __init__(self, volume = 0): 53 self.list = [] if volume == 0 else [0 for i in range(0,volume)] 54 self.top = 0 55 56 def push(self, element): 57 if self.list != None: 58 self.top += 1 59 self.list[self.top] = element 60 61 def pop(self): 62 if self.list != None and self.top >= 0: 63 ele = self.list[self.top] 64 self.list[self.top] = None 65 self.top -= 1 66 return ele 67 return None 68 def empty(self): 69 return self.top == 0 70 71 class Node(): 72 def __init__(self, data, next = None, prev = None, depth = 0, wlen = 0, weight = 0.0): 73 self.data =http://www.mamicode.com/ data 74 self.next = next if next != None else [] 75 self.prev = prev 76 self.depth = depth 77 self.wlen = wlen 78 self.weight = weight 79 80 def isLeaf(self): 81 return self.next == None or self.next == [] 82 83 class Tree(): 84 def __init__(self, root = None): 85 self.root = root 86 """append a child node to child""" 87 def append(self, node, cnode): 88 if node != None and cnode != None: 89 node.next.append(cnode) 90 cnode.prev = node 91 cnode.depth = node.depth + 1 92 return 0 93 return -1 94 95 """depth first search(binary preorder)""" 96 def depth_first_search(self, node): 97 list = [] 98 if node != None: 99 stack = Stack(30)100 stack.push(node)101 while not stack.empty():102 tmp = stack.pop()103 list.append(tmp)104 for i in range(len(tmp.next) - 1, -1, -1):105 stack.push(tmp.next[i])106 return list107 108 class Tokenizer():109 """init the tree"""110 def load(self, tree, pnode, cache, dict):111 clen = len(cache)112 for node in tree.depth_first_search(pnode):113 if node.isLeaf():114 i = node.wlen115 j = i116 while j < clen:117 j += 1118 tmp = cache[i:j].encode(G_ENCODING)119 if dict.has_key(tmp) or len(tmp) == 1:120 tnode = Node(tmp, wlen = j, weight = dict.get(tmp))121 tree.append(node, tnode)122 self.load(tree, tnode, cache, dict)123 return 0124 """backtrance"""125 def backtrance(self, node, list):126 if node.prev != None and node.prev.data != "<s>":127 list.append(node.prev)128 self.backtrance(node.prev, list)129 return 0130 131 def bestpath(self, tree):132 highestScore = 0133 bestpath = ""134 for node in tree.depth_first_search(tree.root):135 """find the leaf node and backtrance to find the bese path"""136 if node.isLeaf():137 list = [node]138 self.backtrance(node, list)139 list.reverse()140 """141 1、路径加权(通过搜索引擎获取词语的词频,获得词语的权重)142 2、最少切分、OOV、最少单字等策略143 这里只是简单给出路径权重的乘积得分144 145 """146 sc = 1.0147 tp = ""148 for xn in list:149 sc *= xn.weight if xn.weight > 0 else 1150 tp += xn.data + " "151 if sc > highestScore: 152 highestScore = sc153 bestpath = tp.strip()154 print "path: %s -- score: [%s]"%(tp.strip(), sc)155 print "\nbest path: %s -- score: [%s]"%(bestpath, highestScore)156 return bestpath157 def example():158 sent = "笔记本电脑"159 dict = {"笔":0.8,"记":0.8,"本":0.8,"电":0.8,"脑":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9}160 cache = unicode(sent, G_ENCODING)161 tokenizer = Tokenizer()162 tree = Tree(Node("<s>"))163 """init tree"""164 tokenizer.load(tree, tree.root, cache, dict)165 """backtrance and find the best path"""166 tokenizer.bestpath(tree)167 example()
【算法】基于树形结构分词
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。