首页 > 代码库 > 自然语言理解 之 统计词频

自然语言理解 之 统计词频

统计中文文本中单字、二字词、三字词的出现频率。输入文件的中文字符编码格式须为 GB2312。

  1 #include <iostream>  2 #include <fstream>  3 #include <algorithm>  4 #include <functional>  5 #include <string>  6 #include <vector>  7 #include <map>  8 #include <unordered_map>  9 #include <sstream> 10 #include <ctime> 11 using namespace std; 12  13 typedef long clock_t; 14 typedef pair<string, int> Pair_StrInt; 15 typedef string::iterator StrItr; 16 typedef vector<Pair_StrInt>::iterator Vec_Pair_StrInt_Itr; 17 #define ERROR0 cerr << "Open error !!!" << endl; exit(1); 18 #define ERROR1 cerr << "无法识别 !!!" << endl; exit(1); 19 #define Lim 100 20  21 string infile = "Ci.txt"; 22 string outfile1 = "out1.txt"; 23 string outfile2 = "out2.txt"; 24 string outfile3 = "out3.txt"; 25 string project_time = "project_time.txt"; 26 string One_strArr[100]; 27 string Two_strArr[100]; 28 string Three_strArr[100]; 29 ifstream fin; 30 ofstream fout; 31 string Text; 32  33 struct myNode { 34     string Chant; // 词牌名 35     string Rules; // 格式 36 }; 37  38 bool Pair_StrInt_Cmp(const Pair_StrInt& p0, const Pair_StrInt& p1) { return (p0.second > p1.second); } 39 unordered_map<string, int> StrInt_Hash; 40  41 void InitText(string _infile) { 42     fin.open(_infile); 43     if (!fin) { ERROR0; } 44  45     ////////////////////////////////////////////////////////////////////////// 46     // 将整个文件读入 string : 流迭代器 47     std::ostringstream tmp; 48     tmp << fin.rdbuf(); 49     string Text_tmp = tmp.str(); 50     ////////////////////////////////////////////////////////////////////////// 51  52     StrItr str_itr; 53     string strTmp; 54     unsigned char Judge; 55  56     for (str_itr = Text_tmp.begin(); str_itr != Text_tmp.end();) { 57         Judge = (*str_itr); 58         if (Judge >= 0xB0 && Judge <= 0xF7) { 59             strTmp = ""; 60             strTmp += (*str_itr); 61             strTmp += (*(str_itr + 1)); 62             str_itr += 2; 63             Text += strTmp; 64         } 65         else { ++str_itr; } 66  67     } 68  69     fin.close(); 70     fin.clear(); 71 } 72  
73 // 输出到文件 74 void myOutput(const vector<Pair_StrInt> &StrInt_Vec, string out) { 75     fout.open(out); 76     if (!fout) { ERROR0; } 77  78     vector<Pair_StrInt>::const_iterator pair_itr; 79     for (pair_itr = StrInt_Vec.begin(); pair_itr != StrInt_Vec.end(); ++pair_itr) { 80         fout << pair_itr->first << "\t" << pair_itr->second << endl; 81     } 82  83     fout.close(); 84     fout.clear(); 85 } 86  87 // 获取一个中文字的词频 88 void getOneWord(string out1) { 89     string strTmp; 90  91     int str_len = Text.size(); 92     for (int i = 0; i < str_len; i += 2) { 93         strTmp = Text.substr(i, 2); 94         StrInt_Hash[strTmp] += 1; 95     } 96      97     vector<Pair_StrInt> StrInt_Vec(StrInt_Hash.begin(), StrInt_Hash.end()); 98     StrInt_Hash.clear(); 99     std::sort(StrInt_Vec.begin(), StrInt_Vec.end(), Pair_StrInt_Cmp);100 101     myOutput(StrInt_Vec, out1);102 103     StrInt_Vec.clear();104 }105 106 // 获取两个中文字的词频107 void getTwoWord(string out2) {108     string strTmp;109 110     int str_len = Text.size();111     for (int i = 0; i < (str_len - 2); i += 2) {112         strTmp = Text.substr(i, 4);113         StrInt_Hash[strTmp] += 1;114     }115 116     vector<Pair_StrInt> StrInt_Vec(StrInt_Hash.begin(), StrInt_Hash.end());117     StrInt_Hash.clear();118     std::sort(StrInt_Vec.begin(), StrInt_Vec.end(), Pair_StrInt_Cmp);119 120     myOutput(StrInt_Vec, out2);121 122     StrInt_Vec.clear();123 }124 125 // 获取三个中文字的词频126 void getThreeWord(string out3) {127     string strTmp;128 129     int str_len = Text.size();130     for (int i = 0; i < (str_len - 4); i += 2) {131         strTmp = Text.substr(i, 6);132         StrInt_Hash[strTmp] += 1;133     }134 135     vector<Pair_StrInt> StrInt_Vec(StrInt_Hash.begin(), StrInt_Hash.end());136     StrInt_Hash.clear();137     std::sort(StrInt_Vec.begin(), StrInt_Vec.end(), Pair_StrInt_Cmp);138 139     myOutput(StrInt_Vec, out3);140 141     StrInt_Vec.clear();142 }143 144 // 自动生成词145 void Poetry(string _strTmp) {146     
int len = _strTmp.size();147     int myRandom;148     srand((unsigned)(time(NULL)));149     for (int i = 0; i < len; ++i) {150         switch (_strTmp[i])151         {152         case 2: {153             myRandom = rand() % Lim;154             cout << Two_strArr[myRandom];155             break;156         }157         case 1: {158             myRandom = rand() % Lim;159             cout << One_strArr[myRandom];160             break;161         }162         case 3: {163             myRandom = rand() % Lim;164             cout << Three_strArr[myRandom];165             break;166         }167         case 0: {168             cout << \n;169             break;170         }171         case -: {172             cout << "  ";173             break;174         }175         default: {176             cout << _strTmp.substr(i, 2);177             ++i;178             break;179         }180         }181     }182     cout << endl;183 }184 185 // 生成词前的预处理186 void makePoetry(string out1, string out2, string out3) {187     ifstream fin1, fin2, fin3;188     ofstream fout1, fout2, fout3;189     fin1.open(out1);190     if (!fin1) { ERROR0; }191     fin2.open(out2);192     if (!fin2) { ERROR0; }193     fin3.open(out3);194     if (!fin3) { ERROR0; }195     string strTmp;196     for (int i = 0; i < Lim; ++i) {197         getline(fin1, strTmp);198         One_strArr[i] = strTmp.substr(0, 2);199         getline(fin2, strTmp);200         Two_strArr[i] = strTmp.substr(0, 4);201         getline(fin3, strTmp);202         Three_strArr[i] = strTmp.substr(0, 6);203     }204 205     myNode node0;206     node0.Chant = "念奴娇";207     node0.Rules = "·220-22,12,222。22,21:222。22,22,23。22,222。0-222,23,22。22,3222。22,23,22。22,222。0";208 209     string strTmp0 = "---" + node0.Chant + node0.Rules;210     Poetry(strTmp0);211     system("pause");212 }213 214 void Solve() {215     216     InitText(infile);217 218     ofstream fout;219     fout.open(project_time);220     clock_t myStart, myFinish;221     double 
totaltime;222     //////////////////////////////////////////////////////////////////////////223     myStart = clock();224     //////////////////////////////////////////////////////////////////////////225     getOneWord(outfile1);226     //////////////////////////////////////////////////////////////////////////227     getTwoWord(outfile2);228     /////////////////////////////////////////////////////////////////////////229     getThreeWord(outfile3);230     //////////////////////////////////////////////////////////////////////////231 232     myFinish = clock();233     totaltime = (double)(myFinish - myStart) / CLOCKS_PER_SEC;234 235     fout << "运行时间为: " << totaltime << " 秒。" << endl;236     fout.close();237     fout.clear();238     239 240     makePoetry(outfile1, outfile2, outfile3);241 }242 243 int main() {244     Solve();245     return 0;246 }

 

自然语言理解 之 统计词频