首页 > 代码库 > 未登录词识别

未登录词识别

未登录词识别:识别不在词典中的词,主要包括新词(如"杀马特")和命名实体(如"奥克兰")。


主要解决方案:基于规则合词,然后通过百度验证。

Start Char Char    1-2-Combine    #[图 n][里 f][市场 n][站 n]
Start Char Char Char    1-3-Combine    #
Start Char Char Char Char    1-4-Combine    #
Start Char Char Char Char Char    1-5-Combine    #
Start Char Char Char Char Char Char    1-6-Combine    #
Start Direction Char    1-2-Combine    #东澳站 南势站
Start Char Word    1-2-Combine    #[台 j][中港 nz][站 n]
Word Char Keyword    0-1-Combine    #[梨园 nz][寮 g][站 v][白沙 nz][屯 ng][站 n]
Char Char Keyword    0-1-Combine    #[商水县 ns][黄 a][寨 ng][站 n]
NumPrefix Num    0-1-Seq    #地五医院
Num NumSuffix    0-1-Seq    #93/号/酒家
Num Num    0-1-Combine #
Num Num Num    0-2-Combine #
Num Num Num Num    0-3-Combine #
Num Num Num Num Num    0-4-Combine #
Num Num Num Num Num Num    0-5-Combine #
Num Num Num Num Num Num Num    0-6-Combine #
Num Num Num Num Num Num Num Num    0-7-Combine #
Num Num Num Num Num Num Num Num Num    0-8-Combine #
Num Num Num Num Num Num Num Num Num Num    0-9-Combine #
Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-10-Combine    #
Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-9-Combine    #
Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-8-Combine    #
Letter Letter Letter Letter Letter Letter Letter Letter    0-7-Combine    #
Letter Letter Letter Letter Letter Letter Letter    0-6-Combine    #
Letter Letter Letter Letter Letter Letter    0-5-Combine    #
Letter Letter Letter Letter Letter    0-4-Combine    #
Letter Letter Letter Letter    0-3-Combine    #
Letter Letter Letter    0-2-Combine    #
Letter Letter    0-1-Combine    #
Num NumSuffix Keyword    0-1-Seq    #海口1号场BLACKSTONE球场
Num Char Char Keyword    0-2-Combine    #八里岔中学
Char Num Char Keyword    0-2-Combine    #八里岔中学
Char Char Num Keyword    0-2-Combine    #八里岔中学
package cn.tianditu.mt.common;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Rule grammar used to merge adjacent tokens into larger words.
 *
 * <p>Rules are stored in a ternary search tree whose keys are sequences of
 * {@link SegMarkType}. Each grammar file line has tab-separated fields:
 * <ol>
 *   <li>a space-separated sequence of {@code SegMarkType} names (the pattern);</li>
 *   <li>one or more {@code start-end-type} triples separated by {@code #},
 *       where {@code start} and {@code end} are inclusive offsets into the
 *       pattern and {@code type} is the {@code SegMarkType} given to the
 *       merged token;</li>
 *   <li>any further fields are ignored by the loader.</li>
 * </ol>
 */
public class Grammar {

    protected static Log logger = LogFactory.getLog(Grammar.class);

    /**
     * Node of the ternary search tree. A node compares one key element
     * against {@link #splitchar} and branches to {@code loNode} (smaller),
     * {@code eqNode} (equal, next key element) or {@code hiNode} (greater).
     */
    public final class TSTNode {
        /** Rule attached to the key that ends at this node, or {@code null} if none. */
        public CombinRule data = null;
        protected TSTNode loNode;
        protected TSTNode eqNode;
        protected TSTNode hiNode;
        protected SegMarkType splitchar;

        public TSTNode(SegMarkType type) {
            this.splitchar = type;
        }
    }

    /** Root of the tree; {@code null} until the first key is added. */
    public TSTNode rootNode;

    /**
     * Inserts a key into the tree, creating any missing nodes.
     *
     * @param word the key; must be non-null and non-empty
     * @return the node at which the key ends (callers attach the rule to it)
     * @throws NullPointerException     if {@code word} is {@code null}
     * @throws IllegalArgumentException if {@code word} is empty
     */
    public TSTNode add(List<SegMarkType> word) {
        if (null == word) {
            throw new NullPointerException("空指针异常");
        }
        if (word.isEmpty()) {
            // Previously surfaced as an incidental IndexOutOfBoundsException.
            throw new IllegalArgumentException("word must contain at least one SegMarkType");
        }
        int charIndex = 0;
        if (null == rootNode) {
            rootNode = new TSTNode(word.get(0));
        }
        TSTNode currentNode = rootNode;
        while (true) {
            int charComp = word.get(charIndex).compareTo(currentNode.splitchar);
            if (charComp == 0) {
                charIndex++;
                if (charIndex == word.size()) {
                    return currentNode;
                }
                if (null == currentNode.eqNode) {
                    currentNode.eqNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                if (null == currentNode.loNode) {
                    currentNode.loNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.loNode;
            } else {
                if (null == currentNode.hiNode) {
                    currentNode.hiNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.hiNode;
            }
        }
    }

    /**
     * Looks up the node at which the given key ends.
     *
     * @param word the key to look up
     * @return the node for the last key element, or {@code null} when the key
     *         is {@code null}, empty, or not present in the tree
     */
    protected TSTNode getNode(List<SegMarkType> word) {
        if (null == word) {
            return null;
        }
        int len = word.size();
        if (len == 0) {
            return null;
        }
        TSTNode currentNode = rootNode; // current position in the tree while matching
        int charIndex = 0; // position in the key of the element being compared
        SegMarkType cmpChar = word.get(charIndex);
        int charComp;
        while (true) {
            if (currentNode == null) { // not found
                return null;
            }
            charComp = cmpChar.compareTo(currentNode.splitchar);
            if (charComp == 0) { // equal: descend
                charIndex++;
                if (charIndex == len) { // found
                    return currentNode;
                } else {
                    cmpChar = word.get(charIndex); // advance within the key
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) { // smaller: go left
                currentNode = currentNode.loNode;
            } else { // greater: go right
                currentNode = currentNode.hiNode;
            }
        }
    }

    /**
     * Finds the longest rule pattern that matches the token types starting at
     * {@code offset}.
     *
     * @param tokens the token sequence
     * @param offset index of the first token to match
     * @return the match (node carrying the rule, plus the index just past the
     *         last matched token), or {@code null} when nothing matches, the
     *         tree is empty, {@code tokens} is {@code null}, or {@code offset}
     *         lies outside the token list
     */
    public MatchRet matchLong(List<WordInfo> tokens, int offset) {
        if (tokens == null || rootNode == null) {
            return null;
        }
        if (offset < 0 || offset >= tokens.size()) {
            // Nothing to match; avoids an IndexOutOfBoundsException on tokens.get().
            return null;
        }
        MatchRet ret = null;
        TSTNode currentNode = rootNode;
        int index = offset;
        while (currentNode != null) {
            int charComp = tokens.get(index).getType().compareTo(
                    currentNode.splitchar);
            if (charComp == 0) {
                index++;
                if (currentNode.data != null) {
                    // Remember the latest (hence longest) complete pattern seen so far.
                    ret = new MatchRet(currentNode, index);
                }
                if (index == tokens.size()) {
                    return ret;
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                currentNode = currentNode.loNode;
            } else {
                currentNode = currentNode.hiNode;
            }
        }
        return ret;
    }

    /**
     * Merges tokens according to the given rules. Several merges may be
     * applied in one pass, and each merged token keeps its source tokens
     * (passed to the {@code WordInfo} constructor as a {@code subList} view of
     * {@code tokens}).
     *
     * <p>Tokens not covered by a rule are copied through unchanged. A rule is
     * applied only when the scan position equals its start and its end lies
     * within {@code [start, tokens.size())}; other rules are skipped.
     *
     * @param tokens the token sequence
     * @param rules  merge ranges in absolute token indices, or {@code null}
     * @return {@code tokens} itself when {@code rules} is {@code null},
     *         otherwise a new list with the merges applied
     */
    private List<WordInfo> combineByRules(List<WordInfo> tokens, List<Combin> rules) {
        if (rules == null) {
            return tokens;
        }
        List<WordInfo> list = new ArrayList<WordInfo>();
        int i = 0;
        while (i < tokens.size()) {
            Combin com = findRuleStartingAt(rules, i, tokens.size());
            if (com == null) {
                list.add(tokens.get(i));
                i++;
                continue;
            }
            int start = com.getStart();
            int end = com.getEnd();

            // subList is half-open, hence end + 1.
            List<WordInfo> sub = tokens.subList(start, end + 1);
            StringBuilder buff = new StringBuilder();
            for (WordInfo wordInfo : sub) {
                buff.append(wordInfo.getCn());
            }

            list.add(new WordInfo(buff.toString(), null, com.getType(), sub));

            // Resume after the merged span. The loop condition is re-checked, so a
            // merge ending on the last token no longer reads past the end of the list
            // (the original 'continue' only continued the inner rule loop).
            i = end + 1;
        }
        return list;
    }

    /**
     * Returns the first rule that starts at {@code index} and whose inclusive
     * end lies within {@code [index, tokenCount)}, or {@code null} if none.
     * Requiring {@code end >= index} guarantees the caller always advances.
     */
    private Combin findRuleStartingAt(List<Combin> rules, int index, int tokenCount) {
        for (Combin com : rules) {
            if (com.getStart() == index
                    && com.getEnd() >= index
                    && com.getEnd() < tokenCount) {
                return com;
            }
        }
        return null;
    }

    /**
     * Applies the rules in place. Only a single merge is supported reliably:
     * nested or repeated merges are not, i.e. this does not behave like a
     * finite state machine.
     *
     * @param tokens the token list, modified in place
     * @param rules  merge ranges in token indices
     */
    @SuppressWarnings("unused")
    private void CombineOnce(LinkedList<WordInfo> tokens,
            List<Combin> rules) {
        for (Combin com : rules) {
            int start = com.getStart();
            int end = com.getEnd();
            SegMarkType type = com.getType();

            StringBuilder buff = new StringBuilder();
            for (int i = start; i <= end; i++) {
                WordInfo word = tokens.get(i);
                buff.append(word.getCn());
            }

            int dis = end - start + 1;
            for (int i = 0; i < dis; i++) {
                tokens.remove(start);
            }

            String cn = buff.toString();
            WordInfo info = new WordInfo(cn, null, type);
            tokens.add(start, info);
        }
    }

    /**
     * Scans the tokens left to right, takes the longest matching rule pattern
     * at each position, and merges tokens as the matched rules prescribe.
     *
     * @param tokens the token sequence
     * @return the merged token list, or {@code null} when {@code tokens} is
     *         {@code null} or no grammar has been loaded
     */
    public List<WordInfo> tag(List<WordInfo> tokens) {
        if (tokens == null || rootNode == null) {
            return null;
        }
        List<Combin> rules = new ArrayList<Combin>();
        for (int i = 0; i < tokens.size();) {
            MatchRet ret = matchLong(tokens, i);
            if (null != ret) {
                CombinRule rule = ret.getNode().data; // rule stored on the matched tree node
                int indexCurrent = ret.getIndex() - 1;
                // Index of the first token of the matched pattern.
                int base = indexCurrent - rule.getLen() + 1;
                for (Combin com : rule.getPosition()) {
                    // Translate pattern-relative offsets into absolute token indices.
                    rules.add(new Combin(base + com.getStart(),
                            base + com.getEnd(), com.getType()));
                }
                i = ret.getIndex();
            } else {
                i++;
            }
        }
        return combineByRules(tokens, rules); // merge according to the collected rules
    }

    /**
     * Builds a grammar from the basic grammar file and the grammar file named
     * by {@code config}, loaded in that order.
     */
    public Grammar(Config config) {
        loadGrammar(config.getBasicGramFileName());
        loadGrammar(config.getGramFileName());
    }

    /**
     * Loads rules from a grammar file into the tree.
     *
     * <p>Blank and malformed lines are logged and skipped so that one bad
     * line does not prevent the remaining rules from loading. I/O failures
     * are logged; the reader is always closed.
     *
     * @param gramFileName path of the grammar file
     */
    public void loadGrammar(String gramFileName) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(gramFileName));
            String line;
            int lineNo = 0;
            while ((line = reader.readLine()) != null) {
                lineNo++;
                loadRuleLine(line, gramFileName, lineNo);
            }
        } catch (FileNotFoundException e) {
            logger.error("Grammar file not found: " + gramFileName, e);
        } catch (IOException e) {
            logger.error("Failed to read grammar file: " + gramFileName, e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    logger.warn("Failed to close grammar file: " + gramFileName, e);
                }
            }
        }
    }

    /** Parses one grammar line and registers its rule; bad lines are logged and skipped. */
    private void loadRuleLine(String line, String gramFileName, int lineNo) {
        if (line.trim().length() == 0) {
            return;
        }
        String[] arr = line.split("\t");
        if (arr.length < 2) {
            logger.warn("Skipping grammar line " + lineNo + " of " + gramFileName
                    + " (expected tab-separated pattern and rule): " + line);
            return;
        }
        try {
            List<SegMarkType> seq = FormSeq(arr[0]);
            CombinRule rule = FormRule(arr[1], seq.size());

            TSTNode node = this.add(seq);
            node.data = rule;
        } catch (IllegalArgumentException e) {
            // Covers unknown SegMarkType names and non-numeric offsets.
            logger.warn("Skipping grammar line " + lineNo + " of " + gramFileName
                    + ": " + line, e);
        } catch (NullPointerException e) {
            logger.warn("Skipping grammar line " + lineNo + " of " + gramFileName
                    + ": " + line, e);
        }
    }

    /**
     * Parses the rule field of a grammar line.
     *
     * @param line {@code #}-separated {@code start-end-type} triples
     * @param size number of elements in the pattern the rule belongs to
     * @return the rule holding all parsed merge ranges
     * @throws IllegalArgumentException if a triple has fewer than three
     *         fields, an offset is not an integer, or the type is not a
     *         {@code SegMarkType} constant
     */
    private CombinRule FormRule(String line, int size) {
        List<Combin> rec = new ArrayList<Combin>();
        String[] arr_1 = line.split("#");
        for (String str : arr_1) {
            String[] arr_2 = str.split("-");
            if (arr_2.length < 3) {
                throw new IllegalArgumentException(
                        "Expected start-end-type but got: " + str);
            }
            int start = Integer.parseInt(arr_2[0]);
            int end = Integer.parseInt(arr_2[1]);
            // trim(): the type may be followed by a space before a trailing '#'.
            SegMarkType type = Enum.valueOf(SegMarkType.class, arr_2[2].trim());
            rec.add(new Combin(start, end, type));
        }

        return new CombinRule(rec, size);
    }

    /**
     * Parses the pattern field of a grammar line.
     *
     * @param string space-separated {@code SegMarkType} constant names
     * @return the corresponding type sequence
     * @throws IllegalArgumentException if a name is not a {@code SegMarkType} constant
     */
    private List<SegMarkType> FormSeq(String string) {
        List<SegMarkType> list = new ArrayList<SegMarkType>();
        String[] arr = string.split(" ");
        for (String str : arr) {
            SegMarkType type = Enum.valueOf(SegMarkType.class, str);
            list.add(type);
        }
        return list;
    }
}

 

未登录词识别