首页 > 代码库 > 【算法】基于树形结构分词

【算法】基于树形结构分词

  1 #!/usr/bin/env python  2 #encoding=gbk  3 import os  4 import sys  5   6 G_ENCODING="gbk"  7 """  8 ===============================  9 中文分词 10 1. 机械分词 11 2. 统计分词 12 3. 理解分词 13 =============================== 14 基于树形结构分词策略(结合机械分词,统计分词) 15 例:笔记本电脑  16     dict = {"笔":0.8,"记":0.8,"本":0.8,"电":0.8,"脑":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9} 17          ------------------------------- 18         |              <s>              | 19          ------------------------------- 20         /         /         \            21       [笔]     [笔记]    [笔记本]    [笔记本电脑] 22        /          /        /    23      [记]       [本]     [电] [电脑] 24       /         /   25     [本]      [电] [电脑] 26     /  \       / 27  [电] [电脑] [脑] 28   / 29 [脑]  30 ------------------------------- 31 path: 笔 记 本 电 脑  -- score: [0.32768] 32 path: 笔 记 本 电脑   -- score: [0.4608] 33 path: 笔记 本 电 脑   -- score: [0.4608] 34 path: 笔记 本 电脑    -- score: [0.648] 35 path: 笔记本 电 脑    -- score: [0.576] 36 path: 笔记本 电脑     -- score: [0.81] 37 path: 笔记本电脑      -- score: [0.9] 38  39 best path: 笔记本电脑 -- score: [0.9] 40  41 ------------------------------- 42 1、路径加权(通过搜索引擎获取词语的词频,获得词语的权重) 43 2、最少切分、OOV、最少单字等策略 44 ==获取最佳分词路径 45 ------------------------------- 46 Q1、如果句子过长,树非常大,遍历费时(需优化) 47 Q2、字典加载(需优化) 48 以下给出该思想的简单实现[python]: 49 """ 50  51 class Stack(): 52     def __init__(self, volume = 0): 53         self.list = [] if volume == 0 else [0 for i in range(0,volume)] 54         self.top = 0 55  56     def push(self, element): 57         if self.list != None:  58             self.top += 1 59             self.list[self.top] = element 60  61     def pop(self): 62         if self.list != None and self.top >= 0: 63             ele = self.list[self.top] 64             self.list[self.top] = None 65             self.top -= 1 66             return ele 67         return None 68     def empty(self): 69         return self.top == 0 70      71 class Node(): 72     def __init__(self, data, next = None, prev = None, depth = 0, wlen = 0, weight = 0.0): 73         self.data =http://www.mamicode.com/ data 74         self.next = next if next != None else [] 75         self.prev = prev 76         self.depth = depth 77         self.wlen = wlen 78         self.weight = weight 79  80     def isLeaf(self): 81         return self.next == None or self.next == [] 82  83 class Tree(): 84     def __init__(self, root = None): 85         self.root = root 86     """append a child node to child""" 87     def append(self, node, cnode): 88         if node != None and cnode != None: 89             node.next.append(cnode) 90             cnode.prev = node 91             cnode.depth = node.depth + 1 92             return 0 93         return -1 94  95     """depth first search(binary preorder)"""   96     def depth_first_search(self, node): 97         list = [] 98         if node != None: 99             stack = Stack(30)100             stack.push(node)101             while not stack.empty():102                 tmp = stack.pop()103                 list.append(tmp)104                 for i in range(len(tmp.next) - 1, -1, -1):105                     stack.push(tmp.next[i])106         return list107 108 class Tokenizer():109     """init the tree"""110     def load(self, tree, pnode, cache, dict):111         clen = len(cache)112         for node in tree.depth_first_search(pnode):113             if node.isLeaf():114                 i = node.wlen115                 j = i116                 while j < clen:117                     j += 1118                     tmp = cache[i:j].encode(G_ENCODING)119                     if dict.has_key(tmp) or len(tmp) == 1:120                         tnode = Node(tmp, wlen = j, weight = dict.get(tmp))121                         tree.append(node, tnode)122                         self.load(tree, tnode, cache, dict)123         return 0124     """backtrance"""125     def backtrance(self, node, list):126         if node.prev != None and node.prev.data != "<s>":127             list.append(node.prev)128             self.backtrance(node.prev, list)129         return 0130 131     def bestpath(self, tree):132         highestScore = 0133         bestpath = ""134         for node in tree.depth_first_search(tree.root):135             """find the leaf node and backtrance to find the bese path"""136             if node.isLeaf():137                 list = [node]138                 self.backtrance(node, list)139                 list.reverse()140                 """141                 1、路径加权(通过搜索引擎获取词语的词频,获得词语的权重)142                 2、最少切分、OOV、最少单字等策略143                 这里只是简单给出路径权重的乘积得分144             145                 """146                 sc = 1.0147                 tp = ""148                 for xn in list:149                     sc *= xn.weight if xn.weight > 0 else 1150                     tp += xn.data + " "151                 if sc > highestScore: 152                     highestScore = sc153                     bestpath = tp.strip()154                 print "path: %s -- score: [%s]"%(tp.strip(), sc)155         print "\nbest path: %s -- score: [%s]"%(bestpath, highestScore)156         return bestpath157 def example():158     sent = "笔记本电脑"159     dict = {"":0.8,"":0.8,"":0.8,"":0.8,"":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9}160     cache = unicode(sent, G_ENCODING)161     tokenizer = Tokenizer()162     tree = Tree(Node("<s>"))163     """init tree"""164     tokenizer.load(tree, tree.root, cache, dict)165     """backtrance and find the best path"""166     tokenizer.bestpath(tree)167 example()

 

【算法】基于树形结构分词