首页 > 代码库 > 中文分词选取-依概率选取

中文分词选取-依概率选取

上一篇文章中介绍了一种中文分词的选取算法,本篇文章将介绍另外一种中文分词选取算法,依概率选取算法。

        中文分词分词完成之后,还是上篇文章中的原则,分词结果不唯一,然后我们算法的目的是从几种分词好的算法之后选取一个作为分词的最终结果。算法会统计每个词在所有文档中的概率,该算法的中心思想是计算一个字符串中所有分词的概率之积,选取概率最大的作为分词的最终结果。

        算法步骤:第一步,通过上几篇文章的的算法对字符串进行分词;第二步,扫描每一次分词结果;第三步,计算每一次分词结果的所有词的概率之积;第四步,选出乘积最大的那种分词。

        算法实现:下面是将以上过程的代码实现,分别采用正向最大匹配分词算法和逆向最大匹配算法,然后通过该算法选出最终结果。代码如下:


package com;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class Segmentation3 {
	private Map<String, Double> dictionary = new HashMap<String, Double>();
	private static String request = "有意见分歧";
	
	public void setDictionary() {
		dictionary.put("有", 0.0181);
		dictionary.put("有意", 0.0005);
		dictionary.put("意见", 0.0010);
		dictionary.put("见", 0.0002);
		dictionary.put("分歧", 0.0001);
	}
	
	public String leftMax() {
		String response = "";
		String s = "";
		for(int i=0; i<request.length(); i++) {
			s += request.charAt(i);
			if(dictionary.containsKey(s) && aheadCount(s, dictionary)==1) {
				response += (s + "/");
				s = "";
			} else if(aheadCount(s, dictionary) > 0) {
				
			} else {
				response += (s + "/");
				s = "";
			}
		}
		return response;
	}
	
	private int aheadCount(String s, Map<String, Double> map) {
		int count = 0;
		Iterator<String> it = map.keySet().iterator();
		while(it.hasNext()) {
			String key = it.next();
			if((s.length()<=key.length()) && (s.equals(key.substring(0, s.length())))) count ++;
		}
		return count;
	}
	
	public String rightMax() {
		String response = "";
		String s = "";
		for(int i=request.length()-1; i>=0; i--) {
			s = request.charAt(i) + s;
			if(dictionary.containsKey(s) && tailCount(s, dictionary)==1) {
				response = (s + "/") + response;
				s = "";
			} else if(tailCount(s, dictionary) > 0) {
				
			} else {
				response = (s + "/") + response;
				s = "";
			}
		}
		return response;
	}
	
	private int tailCount(String s, Map<String, Double> map) {
		int count = 0;
		Iterator<String> it = map.keySet().iterator();
		while(it.hasNext()) {
			String key = it.next();
			if((s.length()<=key.length()) && (s.equals(key.substring(key.length()-s.length(), key.length())))) count ++;
		}
		return count;
	}
	
	public double getScore(String s) {
		double score = 1.0;
		String[] words = s.split("/");
		for(String word : words) {
			if(dictionary.containsKey(word)) score *= dictionary.get(word);
		}
		System.out.println(score);
		return score;
	}
	
	public static void main(String[] args) {
		System.out.println(request);
		String response;
		Segmentation3 seg = new Segmentation3();
		seg.setDictionary();
		String response1 = seg.leftMax();
		System.out.println(response1);
		String response2 = seg.rightMax();
		System.out.println(response2);
		if(seg.getScore(response1)>seg.getScore(response2))
			response = response1;
		else response = response2;
		System.out.println(response);
	}
}

程序运行的结果:

有意见分歧
有意/见/分歧/
有/意见/分歧/
1.0000000000000001E-11
1.8100000000000004E-9
有/意见/分歧/