首页 > 代码库 > 给定字典做分词

给定字典做分词

最近需要用到分词,无聊写个算法。。。

算法:给定一个字典和一句话,做分词;

Target:输入词典,输出所有可能的分词结果

思路:dfs

加速:首先判断是不是这句话里所有的词在字典中都有(validate)


//
//  Wordsplit.cpp
//  
//  Target: Find all possible splitting of a sentence given a dictionary dict
//  Howto:  refer to main
//
//  Created by Rachel on 14-8-16.
//  Copyright (c) 2014年 ZJU. All rights reserved.
//

#include <iostream>
#include <stdio.h>
#include "vector"
#include <set>
#include<unordered_set>
using namespace std;

class Wordsplit {
private:
    vector<string> list;
    bool match(string s, string cur_ele){
        int l = cur_ele.length();
        if (s.substr(0,l)==cur_ele) {
            return true;
        }
        return false;
    }
    
    bool validate(string s, unordered_set<string> &dict){
        //1. calculate all alphabets in the query
        set<char> alpha;
        for (int i=0; i<s.length(); i++) {
            alpha.insert(s[i]);
            }
        //2. calculate all alphabets in the dictionary
        set<char> beta;
        unordered_set<string>::iterator dict_it;
        for (dict_it = dict.begin(); dict_it!=dict.end(); dict_it++) {
            for (int i=0; i<(*dict_it).length(); i++) {
                beta.insert((*dict_it)[i]);
            }
        }
        set<char>::iterator it;
        for (it = alpha.begin(); it!=alpha.end(); it++) {
            if (beta.find(*it)==beta.end()) {
                return false;
            }
        }
        return true;
    }
    
public:
    string split(string s, unordered_set<string> &dict, string cur_str){
        if (s.length()==0) {
            list.push_back(cur_str.substr(0,cur_str.length()-1));
            return s;
        }
        //cout<<s<<endl;
        unordered_set<string>::iterator it;
        for (it=dict.begin(); it!=dict.end(); it++) {
            if (match(s, (*it))) {
                string tmp_str = cur_str;
                string latter = s.substr(it->length(), s.length()-it->length());
                cur_str += (*it) + " "; // add current word to cur_str
                cur_str += split(latter, dict, cur_str); // split remaining words
                cur_str = tmp_str; //back to last status
            }
        }
        return "no result";
    }
    
    
    vector<string> main(string s, unordered_set<string> &dict) {
        if (!validate(s, dict)) {
            return list;
        }
        split(s, dict, "");
        return list;
    }
};

int main()
{
    Wordsplit s;
    unordered_set<string> L={"程序员","公务员","员","我","喜","做","程序","一","欢","喜欢","做一个","一个"};
    vector<string> V = s.main("我喜欢做一个程序员", L);
    vector<string>::iterator it;
    for (it=V.begin(); it!=V.end(); it++) {
        cout<<(*it)<<endl;
    }
}



输出:

喜欢 做一个 程序

喜欢 做一个 程序员

喜欢 一个 程序

喜欢 一个 程序员

做一个 程序

做一个 程序员

一个 程序

一个 程序员