首页 > 代码库 > English Morphology

English Morphology

最近参与了一个小项目(project),需要编写一个针对英文单词的词干提取(stemming)算法。

1. 最为常见的词干提取算法是 Porter2,即 The English (Porter2) stemming algorithm:http://snowball.tartarus.org/algorithms/english/stemmer.html

// This file was generated automatically by the Snowball to Java compilerpackage org.tartarus.snowball.ext;import org.tartarus.snowball.Among; /**  * This class was automatically generated by a Snowball to Java compiler   * It implements the stemming algorithm defined by a snowball script.  */public class englishStemmer extends org.tartarus.snowball.SnowballStemmer {private static final long serialVersionUID = 1L;        private final static englishStemmer methodObject = new englishStemmer ();                private final static Among a_0[] = {                    new Among ( "arsen", -1, -1, "", methodObject ),                    new Among ( "commun", -1, -1, "", methodObject ),                    new Among ( "gener", -1, -1, "", methodObject )                };                private final static Among a_1[] = {                    new Among ( "‘", -1, 1, "", methodObject ),                    new Among ( "‘s‘", 0, 1, "", methodObject ),                    new Among ( "‘s", -1, 1, "", methodObject )                };                private final static Among a_2[] = {                    new Among ( "ied", -1, 2, "", methodObject ),                    new Among ( "s", -1, 3, "", methodObject ),                    new Among ( "ies", 1, 2, "", methodObject ),                    new Among ( "sses", 1, 1, "", methodObject ),                    new Among ( "ss", 1, -1, "", methodObject ),                    new Among ( "us", 1, -1, "", methodObject )                };                private final static Among a_3[] = {                    new Among ( "", -1, 3, "", methodObject ),                    new Among ( "bb", 0, 2, "", methodObject ),                    new Among ( "dd", 0, 2, "", methodObject ),                    new Among ( "ff", 0, 2, "", methodObject ),                    new Among ( "gg", 0, 2, "", methodObject ),                    new Among ( "bl", 0, 1, "", methodObject ),                    new Among ( "mm", 0, 2, "", methodObject ),                    
new Among ( "nn", 0, 2, "", methodObject ),                    new Among ( "pp", 0, 2, "", methodObject ),                    new Among ( "rr", 0, 2, "", methodObject ),                    new Among ( "at", 0, 1, "", methodObject ),                    new Among ( "tt", 0, 2, "", methodObject ),                    new Among ( "iz", 0, 1, "", methodObject )                };                private final static Among a_4[] = {                    new Among ( "ed", -1, 2, "", methodObject ),                    new Among ( "eed", 0, 1, "", methodObject ),                    new Among ( "ing", -1, 2, "", methodObject ),                    new Among ( "edly", -1, 2, "", methodObject ),                    new Among ( "eedly", 3, 1, "", methodObject ),                    new Among ( "ingly", -1, 2, "", methodObject )                };                private final static Among a_5[] = {                    new Among ( "anci", -1, 3, "", methodObject ),                    new Among ( "enci", -1, 2, "", methodObject ),                    new Among ( "ogi", -1, 13, "", methodObject ),                    new Among ( "li", -1, 16, "", methodObject ),                    new Among ( "bli", 3, 12, "", methodObject ),                    new Among ( "abli", 4, 4, "", methodObject ),                    new Among ( "alli", 3, 8, "", methodObject ),                    new Among ( "fulli", 3, 14, "", methodObject ),                    new Among ( "lessli", 3, 15, "", methodObject ),                    new Among ( "ousli", 3, 10, "", methodObject ),                    new Among ( "entli", 3, 5, "", methodObject ),                    new Among ( "aliti", -1, 8, "", methodObject ),                    new Among ( "biliti", -1, 12, "", methodObject ),                    new Among ( "iviti", -1, 11, "", methodObject ),                    new Among ( "tional", -1, 1, "", methodObject ),                    new Among ( "ational", 14, 7, "", methodObject ),                    new Among ( "alism", -1, 
8, "", methodObject ),                    new Among ( "ation", -1, 7, "", methodObject ),                    new Among ( "ization", 17, 6, "", methodObject ),                    new Among ( "izer", -1, 6, "", methodObject ),                    new Among ( "ator", -1, 7, "", methodObject ),                    new Among ( "iveness", -1, 11, "", methodObject ),                    new Among ( "fulness", -1, 9, "", methodObject ),                    new Among ( "ousness", -1, 10, "", methodObject )                };                private final static Among a_6[] = {                    new Among ( "icate", -1, 4, "", methodObject ),                    new Among ( "ative", -1, 6, "", methodObject ),                    new Among ( "alize", -1, 3, "", methodObject ),                    new Among ( "iciti", -1, 4, "", methodObject ),                    new Among ( "ical", -1, 4, "", methodObject ),                    new Among ( "tional", -1, 1, "", methodObject ),                    new Among ( "ational", 5, 2, "", methodObject ),                    new Among ( "ful", -1, 5, "", methodObject ),                    new Among ( "ness", -1, 5, "", methodObject )                };                private final static Among a_7[] = {                    new Among ( "ic", -1, 1, "", methodObject ),                    new Among ( "ance", -1, 1, "", methodObject ),                    new Among ( "ence", -1, 1, "", methodObject ),                    new Among ( "able", -1, 1, "", methodObject ),                    new Among ( "ible", -1, 1, "", methodObject ),                    new Among ( "ate", -1, 1, "", methodObject ),                    new Among ( "ive", -1, 1, "", methodObject ),                    new Among ( "ize", -1, 1, "", methodObject ),                    new Among ( "iti", -1, 1, "", methodObject ),                    new Among ( "al", -1, 1, "", methodObject ),                    new Among ( "ism", -1, 1, "", methodObject ),                    new Among ( "ion", -1, 
2, "", methodObject ),                    new Among ( "er", -1, 1, "", methodObject ),                    new Among ( "ous", -1, 1, "", methodObject ),                    new Among ( "ant", -1, 1, "", methodObject ),                    new Among ( "ent", -1, 1, "", methodObject ),                    new Among ( "ment", 15, 1, "", methodObject ),                    new Among ( "ement", 16, 1, "", methodObject )                };                private final static Among a_8[] = {                    new Among ( "e", -1, 1, "", methodObject ),                    new Among ( "l", -1, 2, "", methodObject )                };                private final static Among a_9[] = {                    new Among ( "succeed", -1, -1, "", methodObject ),                    new Among ( "proceed", -1, -1, "", methodObject ),                    new Among ( "exceed", -1, -1, "", methodObject ),                    new Among ( "canning", -1, -1, "", methodObject ),                    new Among ( "inning", -1, -1, "", methodObject ),                    new Among ( "earring", -1, -1, "", methodObject ),                    new Among ( "herring", -1, -1, "", methodObject ),                    new Among ( "outing", -1, -1, "", methodObject )                };                private final static Among a_10[] = {                    new Among ( "andes", -1, -1, "", methodObject ),                    new Among ( "atlas", -1, -1, "", methodObject ),                    new Among ( "bias", -1, -1, "", methodObject ),                    new Among ( "cosmos", -1, -1, "", methodObject ),                    new Among ( "dying", -1, 3, "", methodObject ),                    new Among ( "early", -1, 9, "", methodObject ),                    new Among ( "gently", -1, 7, "", methodObject ),                    new Among ( "howe", -1, -1, "", methodObject ),                    new Among ( "idly", -1, 6, "", methodObject ),                    new Among ( "lying", -1, 4, "", methodObject ),                    
new Among ( "news", -1, -1, "", methodObject ),                    new Among ( "only", -1, 10, "", methodObject ),                    new Among ( "singly", -1, 11, "", methodObject ),                    new Among ( "skies", -1, 2, "", methodObject ),                    new Among ( "skis", -1, 1, "", methodObject ),                    new Among ( "sky", -1, -1, "", methodObject ),                    new Among ( "tying", -1, 5, "", methodObject ),                    new Among ( "ugly", -1, 8, "", methodObject )                };                private static final char g_v[] = {17, 65, 16, 1 };                private static final char g_v_WXY[] = {1, 17, 65, 208, 1 };                private static final char g_valid_LI[] = {55, 141, 2 };        private boolean B_Y_found;        private int I_p2;        private int I_p1;                private void copy_from(englishStemmer other) {                    B_Y_found = other.B_Y_found;                    I_p2 = other.I_p2;                    I_p1 = other.I_p1;                    super.copy_from(other);                }                private boolean r_prelude() {            int v_1;            int v_2;            int v_3;            int v_4;            int v_5;                    // (, line 25                    // unset Y_found, line 26                    B_Y_found = false;                    // do, line 27                    v_1 = cursor;                    lab0: do {                        // (, line 27                        // [, line 27                        bra = cursor;                        // literal, line 27                        if (!(eq_s(1, "‘")))                        {                            break lab0;                        }                        // ], line 27                        ket = cursor;                        // delete, line 27                        slice_del();                    } while (false);                    cursor = v_1;                    // do, line 28                    v_2 
= cursor;                    lab1: do {                        // (, line 28                        // [, line 28                        bra = cursor;                        // literal, line 28                        if (!(eq_s(1, "y")))                        {                            break lab1;                        }                        // ], line 28                        ket = cursor;                        // <-, line 28                        slice_from("Y");                        // set Y_found, line 28                        B_Y_found = true;                    } while (false);                    cursor = v_2;                    // do, line 29                    v_3 = cursor;                    lab2: do {                        // repeat, line 29                        replab3: while(true)                        {                            v_4 = cursor;                            lab4: do {                                // (, line 29                                // goto, line 29                                golab5: while(true)                                {                                    v_5 = cursor;                                    lab6: do {                                        // (, line 29                                        if (!(in_grouping(g_v, 97, 121)))                                        {                                            break lab6;                                        }                                        // [, line 29                                        bra = cursor;                                        // literal, line 29                                        if (!(eq_s(1, "y")))                                        {                                            break lab6;                                        }                                        // ], line 29                                        ket = cursor;                                        cursor = v_5;                      
                  break golab5;                                    } while (false);                                    cursor = v_5;                                    if (cursor >= limit)                                    {                                        break lab4;                                    }                                    cursor++;                                }                                // <-, line 29                                slice_from("Y");                                // set Y_found, line 29                                B_Y_found = true;                                continue replab3;                            } while (false);                            cursor = v_4;                            break replab3;                        }                    } while (false);                    cursor = v_3;                    return true;                }                private boolean r_mark_regions() {            int v_1;            int v_2;                    // (, line 32                    I_p1 = limit;                    I_p2 = limit;                    // do, line 35                    v_1 = cursor;                    lab0: do {                        // (, line 35                        // or, line 41                        lab1: do {                            v_2 = cursor;                            lab2: do {                                // among, line 36                                if (find_among(a_0, 3) == 0)                                {                                    break lab2;                                }                                break lab1;                            } while (false);                            cursor = v_2;                            // (, line 41                            // gopast, line 41                            golab3: while(true)                            {                                lab4: do {                                    if (!(in_grouping(g_v, 97, 
121)))                                    {                                        break lab4;                                    }                                    break golab3;                                } while (false);                                if (cursor >= limit)                                {                                    break lab0;                                }                                cursor++;                            }                            // gopast, line 41                            golab5: while(true)                            {                                lab6: do {                                    if (!(out_grouping(g_v, 97, 121)))                                    {                                        break lab6;                                    }                                    break golab5;                                } while (false);                                if (cursor >= limit)                                {                                    break lab0;                                }                                cursor++;                            }                        } while (false);                        // setmark p1, line 42                        I_p1 = cursor;                        // gopast, line 43                        golab7: while(true)                        {                            lab8: do {                                if (!(in_grouping(g_v, 97, 121)))                                {                                    break lab8;                                }                                break golab7;                            } while (false);                            if (cursor >= limit)                            {                                break lab0;                            }                            cursor++;                        }                        // gopast, line 43                        golab9: while(true)           
             {                            lab10: do {                                if (!(out_grouping(g_v, 97, 121)))                                {                                    break lab10;                                }                                break golab9;                            } while (false);                            if (cursor >= limit)                            {                                break lab0;                            }                            cursor++;                        }                        // setmark p2, line 43                        I_p2 = cursor;                    } while (false);                    cursor = v_1;                    return true;                }                private boolean r_shortv() {            int v_1;                    // (, line 49                    // or, line 51                    lab0: do {                        v_1 = limit - cursor;                        lab1: do {                            // (, line 50                            if (!(out_grouping_b(g_v_WXY, 89, 121)))                            {                                break lab1;                            }                            if (!(in_grouping_b(g_v, 97, 121)))                            {                                break lab1;                            }                            if (!(out_grouping_b(g_v, 97, 121)))                            {                                break lab1;                            }                            break lab0;                        } while (false);                        cursor = limit - v_1;                        // (, line 52                        if (!(out_grouping_b(g_v, 97, 121)))                        {                            return false;                        }                        if (!(in_grouping_b(g_v, 97, 121)))                        {                            return false;                        }                        
// atlimit, line 52                        if (cursor > limit_backward)                        {                            return false;                        }                    } while (false);                    return true;                }                private boolean r_R1() {                    if (!(I_p1 <= cursor))                    {                        return false;                    }                    return true;                }                private boolean r_R2() {                    if (!(I_p2 <= cursor))                    {                        return false;                    }                    return true;                }                private boolean r_Step_1a() {            int among_var;            int v_1;            int v_2;                    // (, line 58                    // try, line 59                    v_1 = limit - cursor;                    lab0: do {                        // (, line 59                        // [, line 60                        ket = cursor;                        // substring, line 60                        among_var = find_among_b(a_1, 3);                        if (among_var == 0)                        {                            cursor = limit - v_1;                            break lab0;                        }                        // ], line 60                        bra = cursor;                        switch(among_var) {                            case 0:                                cursor = limit - v_1;                                break lab0;                            case 1:                                // (, line 62                                // delete, line 62                                slice_del();                                break;                        }                    } while (false);                    // [, line 65                    ket = cursor;                    // substring, line 65                    among_var = find_among_b(a_2, 6);        
            if (among_var == 0)                    {                        return false;                    }                    // ], line 65                    bra = cursor;                    switch(among_var) {                        case 0:                            return false;                        case 1:                            // (, line 66                            // <-, line 66                            slice_from("ss");                            break;                        case 2:                            // (, line 68                            // or, line 68                            lab1: do {                                v_2 = limit - cursor;                                lab2: do {                                    // (, line 68                                    // hop, line 68                                    {                                        int c = cursor - 2;                                        if (limit_backward > c || c > limit)                                        {                                            break lab2;                                        }                                        cursor = c;                                    }                                    // <-, line 68                                    slice_from("i");                                    break lab1;                                } while (false);                                cursor = limit - v_2;                                // <-, line 68                                slice_from("ie");                            } while (false);                            break;                        case 3:                            // (, line 69                            // next, line 69                            if (cursor <= limit_backward)                            {                                return false;                            }                            cursor--;                            // gopast, 
line 69                            golab3: while(true)                            {                                lab4: do {                                    if (!(in_grouping_b(g_v, 97, 121)))                                    {                                        break lab4;                                    }                                    break golab3;                                } while (false);                                if (cursor <= limit_backward)                                {                                    return false;                                }                                cursor--;                            }                            // delete, line 69                            slice_del();                            break;                    }                    return true;                }                private boolean r_Step_1b() {            int among_var;            int v_1;            int v_3;            int v_4;                    // (, line 74                    // [, line 75                    ket = cursor;                    // substring, line 75                    among_var = find_among_b(a_4, 6);                    if (among_var == 0)                    {                        return false;                    }                    // ], line 75                    bra = cursor;                    switch(among_var) {                        case 0:                            return false;                        case 1:                            // (, line 77                            // call R1, line 77                            if (!r_R1())                            {                                return false;                            }                            // <-, line 77                            slice_from("ee");                            break;                        case 2:                            // (, line 79                            // test, line 80                            
v_1 = limit - cursor;                            // gopast, line 80                            golab0: while(true)                            {                                lab1: do {                                    if (!(in_grouping_b(g_v, 97, 121)))                                    {                                        break lab1;                                    }                                    break golab0;                                } while (false);                                if (cursor <= limit_backward)                                {                                    return false;                                }                                cursor--;                            }                            cursor = limit - v_1;                            // delete, line 80                            slice_del();                            // test, line 81                            v_3 = limit - cursor;                            // substring, line 81                            among_var = find_among_b(a_3, 13);                            if (among_var == 0)                            {                                return false;                            }                            cursor = limit - v_3;                            switch(among_var) {                                case 0:                                    return false;                                case 1:                                    // (, line 83                                    // <+, line 83                                    {                                        int c = cursor;                                        insert(cursor, cursor, "e");                                        cursor = c;                                    }                                    break;                                case 2:                                    // (, line 86                                    // [, line 86                                    
ket = cursor;                                    // next, line 86                                    if (cursor <= limit_backward)                                    {                                        return false;                                    }                                    cursor--;                                    // ], line 86                                    bra = cursor;                                    // delete, line 86                                    slice_del();                                    break;                                case 3:                                    // (, line 87                                    // atmark, line 87                                    if (cursor != I_p1)                                    {                                        return false;                                    }                                    // test, line 87                                    v_4 = limit - cursor;                                    // call shortv, line 87                                    if (!r_shortv())                                    {                                        return false;                                    }                                    cursor = limit - v_4;                                    // <+, line 87                                    {                                        int c = cursor;                                        insert(cursor, cursor, "e");                                        cursor = c;                                    }                                    break;                            }                            break;                    }                    return true;                }                private boolean r_Step_1c() {            int v_1;            int v_2;                    // (, line 93                    // [, line 94                    ket = cursor;                    // or, line 94                    lab0: do {       
                 v_1 = limit - cursor;                        lab1: do {                            // literal, line 94                            if (!(eq_s_b(1, "y")))                            {                                break lab1;                            }                            break lab0;                        } while (false);                        cursor = limit - v_1;                        // literal, line 94                        if (!(eq_s_b(1, "Y")))                        {                            return false;                        }                    } while (false);                    // ], line 94                    bra = cursor;                    if (!(out_grouping_b(g_v, 97, 121)))                    {                        return false;                    }                    // not, line 95                    {                        v_2 = limit - cursor;                        lab2: do {                            // atlimit, line 95                            if (cursor > limit_backward)                            {                                break lab2;                            }                            return false;                        } while (false);                        cursor = limit - v_2;                    }                    // <-, line 96                    slice_from("i");                    return true;                }                private boolean r_Step_2() {            int among_var;                    // (, line 99                    // [, line 100                    ket = cursor;                    // substring, line 100                    among_var = find_among_b(a_5, 24);                    if (among_var == 0)                    {                        return false;                    }                    // ], line 100                    bra = cursor;                    // call R1, line 100                    if (!r_R1())                    {                        return false;   
                 }                    switch(among_var) {                        case 0:                            return false;                        case 1:                            // (, line 101                            // <-, line 101                            slice_from("tion");                            break;                        case 2:                            // (, line 102                            // <-, line 102                            slice_from("ence");                            break;                        case 3:                            // (, line 103                            // <-, line 103                            slice_from("ance");                            break;                        case 4:                            // (, line 104                            // <-, line 104                            slice_from("able");                            break;                        case 5:                            // (, line 105                            // <-, line 105                            slice_from("ent");                            break;                        case 6:                            // (, line 107                            // <-, line 107                            slice_from("ize");                            break;                        case 7:                            // (, line 109                            // <-, line 109                            slice_from("ate");                            break;                        case 8:                            // (, line 111                            // <-, line 111                            slice_from("al");                            break;                        case 9:                            // (, line 112                            // <-, line 112                            slice_from("ful");                            break;                        case 10:                            // (, line 114                            
// <-, line 114                            slice_from("ous");                            break;                        case 11:                            // (, line 116                            // <-, line 116                            slice_from("ive");                            break;                        case 12:                            // (, line 118                            // <-, line 118                            slice_from("ble");                            break;                        case 13:                            // (, line 119                            // literal, line 119                            if (!(eq_s_b(1, "l")))                            {                                return false;                            }                            // <-, line 119                            slice_from("og");                            break;                        case 14:                            // (, line 120                            // <-, line 120                            slice_from("ful");                            break;                        case 15:                            // (, line 121                            // <-, line 121                            slice_from("less");                            break;                        case 16:                            // (, line 122                            if (!(in_grouping_b(g_valid_LI, 99, 116)))                            {                                return false;                            }                            // delete, line 122                            slice_del();                            break;                    }                    return true;                }                private boolean r_Step_3() {            int among_var;                    // (, line 126                    // [, line 127                    ket = cursor;                    // substring, line 127                    among_var = find_among_b(a_6, 9);               
     if (among_var == 0)                    {                        return false;                    }                    // ], line 127                    bra = cursor;                    // call R1, line 127                    if (!r_R1())                    {                        return false;                    }                    switch(among_var) {                        case 0:                            return false;                        case 1:                            // (, line 128                            // <-, line 128                            slice_from("tion");                            break;                        case 2:                            // (, line 129                            // <-, line 129                            slice_from("ate");                            break;                        case 3:                            // (, line 130                            // <-, line 130                            slice_from("al");                            break;                        case 4:                            // (, line 132                            // <-, line 132                            slice_from("ic");                            break;                        case 5:                            // (, line 134                            // delete, line 134                            slice_del();                            break;                        case 6:                            // (, line 136                            // call R2, line 136                            if (!r_R2())                            {                                return false;                            }                            // delete, line 136                            slice_del();                            break;                    }                    return true;                }                private boolean r_Step_4() {            int among_var;            int v_1;                    // (, line 140            
        // [, line 141                    ket = cursor;                    // substring, line 141                    among_var = find_among_b(a_7, 18);                    if (among_var == 0)                    {                        return false;                    }                    // ], line 141                    bra = cursor;                    // call R2, line 141                    if (!r_R2())                    {                        return false;                    }                    switch(among_var) {                        case 0:                            return false;                        case 1:                            // (, line 144                            // delete, line 144                            slice_del();                            break;                        case 2:                            // (, line 145                            // or, line 145                            lab0: do {                                v_1 = limit - cursor;                                lab1: do {                                    // literal, line 145                                    if (!(eq_s_b(1, "s")))                                    {                                        break lab1;                                    }                                    break lab0;                                } while (false);                                cursor = limit - v_1;                                // literal, line 145                                if (!(eq_s_b(1, "t")))                                {                                    return false;                                }                            } while (false);                            // delete, line 145                            slice_del();                            break;                    }                    return true;                }                private boolean r_Step_5() {            int among_var;            int v_1;            int v_2;      
              // (, line 149                    // [, line 150                    ket = cursor;                    // substring, line 150                    among_var = find_among_b(a_8, 2);                    if (among_var == 0)                    {                        return false;                    }                    // ], line 150                    bra = cursor;                    switch(among_var) {                        case 0:                            return false;                        case 1:                            // (, line 151                            // or, line 151                            lab0: do {                                v_1 = limit - cursor;                                lab1: do {                                    // call R2, line 151                                    if (!r_R2())                                    {                                        break lab1;                                    }                                    break lab0;                                } while (false);                                cursor = limit - v_1;                                // (, line 151                                // call R1, line 151                                if (!r_R1())                                {                                    return false;                                }                                // not, line 151                                {                                    v_2 = limit - cursor;                                    lab2: do {                                        // call shortv, line 151                                        if (!r_shortv())                                        {                                            break lab2;                                        }                                        return false;                                    } while (false);                                    cursor = limit - v_2;                            
    }                            } while (false);                            // delete, line 151                            slice_del();                            break;                        case 2:                            // (, line 152                            // call R2, line 152                            if (!r_R2())                            {                                return false;                            }                            // literal, line 152                            if (!(eq_s_b(1, "l")))                            {                                return false;                            }                            // delete, line 152                            slice_del();                            break;                    }                    return true;                }                private boolean r_exception2() {                    // (, line 156                    // [, line 158                    ket = cursor;                    // substring, line 158                    if (find_among_b(a_9, 8) == 0)                    {                        return false;                    }                    // ], line 158                    bra = cursor;                    // atlimit, line 158                    if (cursor > limit_backward)                    {                        return false;                    }                    return true;                }                private boolean r_exception1() {            int among_var;                    // (, line 168                    // [, line 170                    bra = cursor;                    // substring, line 170                    among_var = find_among(a_10, 18);                    if (among_var == 0)                    {                        return false;                    }                    // ], line 170                    ket = cursor;                    // atlimit, line 170                    if (cursor < limit)                    {                
        return false;                    }                    switch(among_var) {                        case 0:                            return false;                        case 1:                            // (, line 174                            // <-, line 174                            slice_from("ski");                            break;                        case 2:                            // (, line 175                            // <-, line 175                            slice_from("sky");                            break;                        case 3:                            // (, line 176                            // <-, line 176                            slice_from("die");                            break;                        case 4:                            // (, line 177                            // <-, line 177                            slice_from("lie");                            break;                        case 5:                            // (, line 178                            // <-, line 178                            slice_from("tie");                            break;                        case 6:                            // (, line 182                            // <-, line 182                            slice_from("idl");                            break;                        case 7:                            // (, line 183                            // <-, line 183                            slice_from("gentl");                            break;                        case 8:                            // (, line 184                            // <-, line 184                            slice_from("ugli");                            break;                        case 9:                            // (, line 185                            // <-, line 185                            slice_from("earli");                            break;                        case 10:                            // (, line 186    
                        // <-, line 186                            slice_from("onli");                            break;                        case 11:                            // (, line 187                            // <-, line 187                            slice_from("singl");                            break;                    }                    return true;                }                private boolean r_postlude() {            int v_1;            int v_2;                    // (, line 203                    // Boolean test Y_found, line 203                    if (!(B_Y_found))                    {                        return false;                    }                    // repeat, line 203                    replab0: while(true)                    {                        v_1 = cursor;                        lab1: do {                            // (, line 203                            // goto, line 203                            golab2: while(true)                            {                                v_2 = cursor;                                lab3: do {                                    // (, line 203                                    // [, line 203                                    bra = cursor;                                    // literal, line 203                                    if (!(eq_s(1, "Y")))                                    {                                        break lab3;                                    }                                    // ], line 203                                    ket = cursor;                                    cursor = v_2;                                    break golab2;                                } while (false);                                cursor = v_2;                                if (cursor >= limit)                                {                                    break lab1;                                }                                cursor++;                   
         }                            // <-, line 203                            slice_from("y");                            continue replab0;                        } while (false);                        cursor = v_1;                        break replab0;                    }                    return true;                }                public boolean stem() {            int v_1;            int v_2;            int v_3;            int v_4;            int v_5;            int v_6;            int v_7;            int v_8;            int v_9;            int v_10;            int v_11;            int v_12;            int v_13;                    // (, line 205                    // or, line 207                    lab0: do {                        v_1 = cursor;                        lab1: do {                            // call exception1, line 207                            if (!r_exception1())                            {                                break lab1;                            }                            break lab0;                        } while (false);                        cursor = v_1;                        lab2: do {                            // not, line 208                            {                                v_2 = cursor;                                lab3: do {                                    // hop, line 208                                    {                                        int c = cursor + 3;                                        if (0 > c || c > limit)                                        {                                            break lab3;                                        }                                        cursor = c;                                    }                                    break lab2;                                } while (false);                                cursor = v_2;                            }                            break lab0;                        } while (false); 
                       cursor = v_1;                        // (, line 208                        // do, line 209                        v_3 = cursor;                        lab4: do {                            // call prelude, line 209                            if (!r_prelude())                            {                                break lab4;                            }                        } while (false);                        cursor = v_3;                        // do, line 210                        v_4 = cursor;                        lab5: do {                            // call mark_regions, line 210                            if (!r_mark_regions())                            {                                break lab5;                            }                        } while (false);                        cursor = v_4;                        // backwards, line 211                        limit_backward = cursor; cursor = limit;                        // (, line 211                        // do, line 213                        v_5 = limit - cursor;                        lab6: do {                            // call Step_1a, line 213                            if (!r_Step_1a())                            {                                break lab6;                            }                        } while (false);                        cursor = limit - v_5;                        // or, line 215                        lab7: do {                            v_6 = limit - cursor;                            lab8: do {                                // call exception2, line 215                                if (!r_exception2())                                {                                    break lab8;                                }                                break lab7;                            } while (false);                            cursor = limit - v_6;                            // (, line 215                            // 
do, line 217                            v_7 = limit - cursor;                            lab9: do {                                // call Step_1b, line 217                                if (!r_Step_1b())                                {                                    break lab9;                                }                            } while (false);                            cursor = limit - v_7;                            // do, line 218                            v_8 = limit - cursor;                            lab10: do {                                // call Step_1c, line 218                                if (!r_Step_1c())                                {                                    break lab10;                                }                            } while (false);                            cursor = limit - v_8;                            // do, line 220                            v_9 = limit - cursor;                            lab11: do {                                // call Step_2, line 220                                if (!r_Step_2())                                {                                    break lab11;                                }                            } while (false);                            cursor = limit - v_9;                            // do, line 221                            v_10 = limit - cursor;                            lab12: do {                                // call Step_3, line 221                                if (!r_Step_3())                                {                                    break lab12;                                }                            } while (false);                            cursor = limit - v_10;                            // do, line 222                            v_11 = limit - cursor;                            lab13: do {                                // call Step_4, line 222                                if (!r_Step_4())                       
         {                                    break lab13;                                }                            } while (false);                            cursor = limit - v_11;                            // do, line 224                            v_12 = limit - cursor;                            lab14: do {                                // call Step_5, line 224                                if (!r_Step_5())                                {                                    break lab14;                                }                            } while (false);                            cursor = limit - v_12;                        } while (false);                        cursor = limit_backward;                        // do, line 227                        v_13 = cursor;                        lab15: do {                            // call postlude, line 227                            if (!r_postlude())                            {                                break lab15;                            }                        } while (false);                        cursor = v_13;                    } while (false);                    return true;                }        public boolean equals( Object o ) {            return o instanceof englishStemmer;        }        public int hashCode() {            return englishStemmer.class.getName().hashCode();        }}
porter2 stemming algorithm

 然而,Porter stemming 只是一种基于后缀的词干提取技术,它仅定义了一些基本的后缀规则,能识别出 "books"->"book" 这类规则变化;但对于 "bought"->"buy"、"brought"->"bring" 等不规则形式,它无法识别。

2. The dragon toolkit (http://dragon.ischool.drexel.edu/download.asp)

后来发现了上面这个 NLP 处理工具包,其中的 EngLemmatiser 类就是用来做 stem 的类,能提取出单词的词干。

它首先定义一些基本的后缀规则(只有十几条),然后定义一些独立于这些规则的异常词库(master slave 的形式),这样就能基本实现单词词干的正确提取,解决了 Porter stemming 存在的问题。

// Usage example: lemmatise one irregular verb form with the Dragon Toolkit
// and print the result to standard output.
// NOTE(review): the meaning of the two boolean constructor flags is not
// visible in this snippet — confirm against the Dragon Toolkit API docs.
final String lemmatiserDictionaryPath = "lemmatiser";
final EngLemmatiser engLemmatiser =
        new EngLemmatiser(lemmatiserDictionaryPath, false, true);

// "brought" is an irregular form that a purely suffix-based stemmer misses.
final String inputWord = "brought";
System.out.println(engLemmatiser.lemmatize(inputWord));
View Code

然而我还是觉得,在规则基础之上附加词典的技术过于死板,不够灵活。

3. Stanford CoreNLP

后来发现了斯坦福大学的一个 NLP 工具,其提取词干的做法是:针对大量语料库进行机器学习,利用有限自动机提炼并生成规则(不必附加词典)。它能完美解决词干的提取问题,准确率很高。虽然它对地名、人名等专有名词识别不出来,但已经达到了基本的需求。

    // Usage example: stem a single word with Stanford CoreNLP's Morphology
    // class and print the result to standard output.
    final String inputWord = "magnificus";
    final Morphology morphology = new Morphology();
    System.out.println(morphology.stem(inputWord));
View Code