首页 > 代码库 > TF-IDF算法确定阅读主题词解答英语阅读Title题目

TF-IDF算法确定阅读主题词解答英语阅读Title题目

     对文章best title的选项进行打分

#include <windows.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <iostream>

using namespace std;
#define N 5269        // number of corpus documents scanned for the per-word document count
#define textN 10        // number of question files processed by main
// Status codes; not referenced by the code visible in this file.
#define ERROR 1
#define OK 0
const int WORD_LENGTH = 30;// size in bytes of one stored word, terminator included
char temp[WORD_LENGTH];// shared scratch buffer holding the token currently being read
// One entry of the per-article word table (sumWord).
typedef struct Node {
    char word[WORD_LENGTH] = { '\0' };  // the word itself, NUL-terminated
    int time = 0;                       // occurrences of the word in the article
    int textnum = 0;                    // corpus count filled in by fileCount
    double weight = 0;                  // TF-IDF weight computed by calWeight
}wordNode, wordLink;
char Libword[900][WORD_LENGTH] = { 0 };            // stop-word table, up to 900 entries
int wordleng = 0;    // number of entries actually stored in Libword
wordNode sumWord[1000];// word table of the article being processed
int wordNum = 0;// number of distinct words stored in sumWord
int sumWordNum = 0;// total number of words counted in the article
double score[4] = { 0 };// accumulated score of each of the four answer options
int DoLibStop(char *name, char memory[][WORD_LENGTH])
{
    FILE *cp = fopen(name, "r");//词库位置
    char ch;
    while (!feof(cp))                         //读取词库
    {
        ch = fgetc(cp);
        for (int i = 0; ch != 13 && i<22 && ch != 10; i++)//回车区分词
        {
            Libword[wordleng][i] = ch;
            ch = fgetc(cp);
        }
        //     std::cout<<(word[wordleng]);         //屏幕输出。临时
        wordleng++;
    }
    fclose(cp);    //关闭停用词库
    return wordleng;
}
// Removes, in place, every character of `word` that belongs to the special
// character set below (punctuation, brackets and digits). The remaining
// characters keep their order and the string stays NUL-terminated.
//
// word - NUL-terminated, writable string; may become empty.
void wordDelSpe(char word[])
{
    // The comparison is byte-wise, exactly as before: for the non-ASCII
    // quote characters in the set, each of their encoded bytes is treated as
    // a special character on its own.
    static const char specialChar[] = ",.;:‘“”?!><+=|*&^%$#@\"[](){}0123456789";

    // Single pass: copy the characters to keep over the ones being dropped.
    char *out = word;
    for (const char *in = word; *in != '\0'; ++in)
    {
        if (strchr(specialChar, *in) == NULL)
            *out++ = *in;
    }
    *out = '\0';
}
// Lower-cases `word` in place (ASCII letters only) and reports whether the
// result is one of the first `wordleng` entries of the stop-word table.
//
// word - NUL-terminated, writable string. It is left lower-cased on return;
//        doArticle stores the word only after this call.
// Returns true when the lower-cased word is a stop word.
bool wordCmpStop(char *word)
{
    for (char *p = word; *p != '\0'; ++p)
    {
        if (*p >= 'A' && *p <= 'Z')
            *p += 'a' - 'A';
    }
    for (int i = 0; i < wordleng; i++)
    {
        if (strcmp(word, Libword[i]) == 0)
            return true;
    }
    return false;
}
void wordSearch(char *word, int &wordnum) {
    int i = 0;
    while (i < wordnum && (strcmp(sumWord[i].word, word) != 0))
    {
        i++;
    }
    if (i < wordnum)
        sumWord[i].time++;
    if (i == wordnum)
    {
        strcpy(sumWord[i].word, word);
        wordnum++;
        sumWord[i].time = 1;
    }
    sumWordNum += 1;
}
//void wordSearch(char *word, int &wordnum) {
//    int i = 0;
//    while (i < wordnum && (strcmp(sumWord[i].word, word) != 0) && (!strstr(sumWord[i].word, word) || !strstr(word, sumWord[i].word)))
//    {
//        i++;
//    }
//    if (i < wordnum)
//    {
//        if (!strcmp(sumWord[i].word, word) || strstr(sumWord[i].word, word))
//            sumWord[i].time++;
//        else
//        {
//            strcpy(sumWord[i].word, word);
//            sumWord[i].time++;
//        }
//    }
//
//    if (i == wordnum)
//    {
//        strcpy(sumWord[i].word, word);
//        wordnum++;
//        sumWord[i].time = 1;
//    }
//    sumWordNum += 1;
//}
// Reads the article part of the question file `file0` and builds the word
// table: every whitespace-separated token up to the first token that starts
// with '*' is stripped of special characters, lower-cased and, unless it is
// a stop word, recorded through wordSearch. Stop words still count toward
// the total word count sumWordNum.
//
// file0 - path of the question file. If it cannot be opened the program
//         prints a message, pauses and exits with status 1.
void doArticle(char *file0)
{
    FILE *file = fopen(file0, "r");
    if (file == NULL) {
        printf("%s文件读取失败!", file0);
        system("pause");
        exit(1);
    }
    // The field width keeps fscanf inside temp, which holds 30 bytes
    // including the terminator; longer tokens are read in pieces.
    while (fscanf(file, "%29s", temp) == 1)
    {
        if (temp[0] == '*')  // the question section starts here
            break;
        wordDelSpe(temp);
        if (wordCmpStop(temp))  // also lower-cases temp in place
        {
            sumWordNum += 1;
            continue;
        }
        wordSearch(temp, wordNum);
    }
    fclose(file);
}
// Copies every field of node2 into node1 (destination first, source second).
void copyNode(wordNode& node1, wordNode &node2)
{
    node1.weight = node2.weight;
    node1.textnum = node2.textnum;
    node1.time = node2.time;
    strcpy(node1.word, node2.word);
}
void sortWord()//直接插入排序
{
    wordNode t;
    int i, j;
    /*cout << wordNum << endl;*/
    for (i = 1; i < wordNum; i++)
    {
        copyNode(t, sumWord[i]);
        for (j = i - 1; j >= 0 && sumWord[j].weight<t.weight; j--)
        {
            copyNode(sumWord[j + 1], sumWord[j]);
        }
        copyNode(sumWord[j + 1], t);
    }
}
void fileCount(char file[N][50])
{
    int i, j;
    FILE *f;
    for (i = 0; i <N; i++)
    {
        f = fopen(file[i], "r");
        if (!f)
        {
            printf("%s文件读取失败!", file[i]);
            /*system("pause");
            exit(1);*/
            continue;
        }
        while ((fscanf(f, "%s", temp)) != EOF)
        {
            wordDelSpe(temp);
            j = 0;
            while (j < wordNum && (strcmp(sumWord[j].word, temp) != 0))
            {
                j++;
            }
            if (j < wordNum)
                sumWord[j].textnum++;//文章数++
        }
        fclose(f);//关闭文件
    }
}
// Computes the TF-IDF weight of the first `wordNum` entries of `sumWord`:
//   weight = (time / sumWordNum) * log(N / (textnum + 1))
// where sumWordNum is the global total word count and N the corpus size.
void calWeight(wordNode *sumWord, int wordNum)
{
    int idx = 0;
    while (idx < wordNum)
    {
        wordNode &entry = sumWord[idx];
        entry.weight = (entry.time * 1.0 / sumWordNum)*log((N*1.0) / (entry.textnum + 1));
        ++idx;
    }
}
// Returns the number of decimal digits of n (1 for 0; the sign of a negative
// number is not counted). Works for any int rather than stopping at 4 digits.
int numWei(int n)
{
    int digits = 1;
    while (n / 10 != 0)
    {
        n /= 10;
        ++digits;
    }
    return digits;
}
// Fills file[0..n-1] with the corpus file names: the fixed prefix below
// followed by the 1-based index and ".txt".
//
// file - destination rows of 50 bytes each.
// n    - number of names to generate.
// snprintf bounds the write to the row size and formats indices of any
// length, so no digit counting or fixed offsets are needed.
void fileNameMake(char file[][50], int n)
{
    for (int i = 0; i < n; i++)
        snprintf(file[i], sizeof(file[i]), "fileLib\\\\txt%d.txt", i + 1);
}
// Fills savefile[0..n-1] with the result file names: the fixed prefix below
// followed by the 1-based index and ".txt".
//
// savefile - destination rows of 50 bytes each.
// n        - number of names to generate.
// snprintf bounds the write to the row size and formats indices of any
// length, so no digit counting or fixed offsets are needed.
void saveNameMake(char savefile[][50], int n)
{
    for (int i = 0; i < n; i++)
        snprintf(savefile[i], sizeof(savefile[i]), "savefile\\\\save%d.txt", i + 1);
}
// Fills textfile[0..n-1] with the question file names: the fixed prefix
// below followed by the 1-based index and ".txt".
//
// textfile - destination rows of 50 bytes each.
// n        - number of names to generate.
// snprintf bounds the write to the row size and formats indices of any
// length, so no digit counting or fixed offsets are needed.
void textNameMake(char textfile[][50], int n)
{
    for (int i = 0; i < n; i++)
        snprintf(textfile[i], sizeof(textfile[i]), "textfile\\\\text%d.txt", i + 1);
}
// Scores the answer options of the question file `file0` and reads the
// expected answer.
//
// file0     - path of the question file. If it cannot be opened the program
//             prints a message, pauses and exits with status 1.
// k         - index in `answers` where the expected answer letter is stored.
// answers   - receives the second character of the token that follows the
//             closing '*' token, or '\0' when no such token exists.
// answerNum - incremented when an answer token was read.
//
// Layout read by this function: everything up to the first token starting
// with '*' is skipped; the tokens "A.", "B.", "C.", "D." each switch to the
// next option; every other token is cleaned, lower-cased and, if it is in
// the article word table, its weight is added to the current option's
// entry in the global score array. A second token starting with '*' ends
// the options.
void scoreArticle(char *file0,int k,char *answers, int &answerNum)
{
    FILE *file = fopen(file0, "r");
    if (file == NULL) {
        printf("%s文件读取失败!", file0);
        system("pause");
        exit(1);
    }

    // Start from zero for every question; score is a global accumulator and
    // would otherwise carry the previous question's totals.
    const int optionCount = static_cast<int>(sizeof(score) / sizeof(score[0]));
    for (int i = 0; i < optionCount; i++)
        score[i] = 0;

    // The field width keeps fscanf inside temp (30 bytes with terminator).
    while (fscanf(file, "%29s", temp) == 1 && temp[0] != '*') {}  // skip the article

    int count = -1;  // index of the option being read; -1 before "A."
    while (fscanf(file, "%29s", temp) == 1)
    {
        if (temp[0] == '*')
            break;
        if (!strcmp(temp, "A.") || !strcmp(temp, "B.") || !strcmp(temp, "C.") || !strcmp(temp, "D."))
        {
            count++;
            continue;
        }
        // Question text before "A." or anything after a fifth marker has no
        // slot in score; ignore it instead of indexing out of range.
        if (count < 0 || count >= optionCount)
            continue;
        wordDelSpe(temp);
        for (char *p = temp; *p != '\0'; ++p)
        {
            if (*p >= 'A' && *p <= 'Z')
                *p += 'a' - 'A';
        }
        for (int i = 0; i < wordNum; i++)
        {
            if (!strcmp(temp, sumWord[i].word))
                score[count] += sumWord[i].weight;
        }
    }

    if (fscanf(file, "%29s", temp) == 1)
    {
        answers[k] = temp[1];
        answerNum++;
    }
    else
    {
        answers[k] = '\0';  // no answer token in the file
    }
    fclose(file);
}
// Cleans and lower-cases `token` in place, then multiplies by 1.5 the weight
// of every article word-table entry equal to it.
static void boostMatchingWord(char *token)
{
    wordDelSpe(token);
    for (char *p = token; *p != '\0'; ++p)
    {
        if (*p >= 'A' && *p <= 'Z')
            *p += 'a' - 'A';
    }
    for (int i = 0; i < wordNum; i++)
    {
        if (!strcmp(token, sumWord[i].word))
            sumWord[i].weight *= 1.5;
    }
}

// Raises the weight of words by position in the question file `file0`.
//
// Two regions are boosted (weight * 1.5 per occurrence):
//   1. from the start of the file up to the first token that ends with '#';
//   2. from the next token that starts with '#' up to the first token that
//      starts with '*'.
// The marker tokens themselves and everything between the regions are
// skipped.
//
// file0 - path of the question file. If it cannot be opened the program
//         prints a message, pauses and exits with status 1.
void doArticleLocal(char *file0)
{
    FILE *file = fopen(file0, "r");
    if (file == NULL) {
        printf("%s文件读取失败!",file0);
        system("pause");
        exit(1);
    }

    // The field width keeps fscanf inside temp (30 bytes with terminator).
    while (fscanf(file, "%29s", temp) == 1)
    {
        if (temp[strlen(temp) - 1] == '#')  // end of the first region
            break;
        boostMatchingWord(temp);
    }

    while (fscanf(file, "%29s", temp) == 1 && temp[0] != '#') {}  // skip to the second marker

    while (fscanf(file, "%29s", temp) == 1)
    {
        if (temp[0] == '*')  // the question section starts here
            break;
        boostMatchingWord(temp);
    }
    fclose(file);
}
// Runs the whole pipeline for one question file and reports the result.
//
// file0      - path of the question file.
// file       - corpus file paths handed to fileCount.
// savefile   - path of the report file that receives the word table.
// id         - 0-based question index, used for messages and as the index
//              into `answers`.
// answers    - expected answers; entry `id` is filled in by scoreArticle.
// answerNum  - passed through to scoreArticle.
// correctNum - incremented when the chosen option equals answers[id].
//
// Steps: build the word table, count corpus documents, compute and sort the
// weights, boost by position, write the report, score the four options and
// pick the highest (the first one wins a tie).
void doArticleAll(char *file0,char file[][50],char *savefile,int id,char *answers,int &answerNum,int &correctNum)
{
    doArticle(file0);
    fileCount(file);
    calWeight(sumWord, wordNum);
    sortWord();
    doArticleLocal(file0);

    FILE *p = fopen(savefile, "w");
    if (p == NULL)
    {
        // Scoring does not depend on the report, so keep going without it.
        printf("%s文件写入失败!\n", savefile);
    }
    else
    {
        fprintf(p, "    word         \t词频\t文章数\t权重\n");
        fprintf(p, "本文共%d个词,%d个不重复词\n", sumWordNum, wordNum);
        for (int i = 0; i < wordNum; i++)
        {
            fprintf(p, "%-16s\t%d\t%d\t%f\n", sumWord[i].word, sumWord[i].time, sumWord[i].textnum, sumWord[i].weight);
        }
        fclose(p);
    }

    scoreArticle(file0,id,answers,answerNum);

    if (p != NULL)  // p is only tested here, never dereferenced after fclose
        std::cout << "第"<<id+1<<"题结果成功输出到文件:" << savefile << endl;
    std::cout << "成功得到结果:" << endl;
    for (int h = 0; h < 4; h++)
    {
        cout << score[h] << endl;
    }

    int best = 0;
    for (int k = 1; k < 4; k++)
    {
        if (score[k] > score[best])
            best = k;
    }
    const char ans = static_cast<char>('A' + best);

    cout << "答案是:" << ans ;
    if (answers[id] == ans)
    {
        cout <<" 正确"<<endl;
        correctNum++;
    }
    else
        cout << " 错误 (正确答案为:" << answers[id]<<")" << endl ;
    cout << endl << "-----------------------------" << endl;
}
void clearSumWord()//清空数组
{
    for (int i = 0; i < 1000; i++)
    {
        sumWord[i].weight = 0;
        sumWord[i].time = 0;
        sumWord[i].textnum = 0;
        strcpy(sumWord[i].word, "\0");
    }
}
// Entry point: generates the file names, loads the stop-word list, runs the
// pipeline for each of the textN question files, then prints the accuracy
// and the elapsed time. Command-line arguments are not used.
int main(int n, char *arg[])
{
    (void)n;
    (void)arg;

    char answers[textN] = { '\0' };        // expected answer per question
    int answerNum = 0, correctNum = 0;
    char textfile[textN][50] = { '\0' };   // question file names
    char savefile[textN][50] = { '\0' };   // report file names
    // About 257 KB: static keeps it off the stack and zero-initialized.
    static char file[N][50];               // corpus file names
    // A writable array, because DoLibStop takes a non-const char*.
    char LibStop[] = "stopLib\\stop.txt";  // stop-word list

    const clock_t start0 = clock();
    fileNameMake(file, N);
    textNameMake(textfile, textN);
    saveNameMake(savefile, textN);
    cout << endl;
    wordleng = DoLibStop(LibStop, Libword);

    for (int k = 0; k < textN; k++)
    {
        doArticleAll(textfile[k], file, savefile[k], k, answers, answerNum, correctNum);
        clearSumWord();
    }

    const double corretPersent = correctNum*100.0 / textN;  // accuracy in percent
    printf("共%d篇文章,正确率为%.2f%%\n", textN, corretPersent);

    const clock_t finish0 = clock();
    const double sftime0 = static_cast<double>(finish0 - start0) / CLOCKS_PER_SEC;
    std::cout << endl<< "共用时间:" << sftime0 << "秒." << endl;
    system("pause");
    return 0;
}


第1题结果成功输出到文件:savefile\\save1.txt
成功得到结果:
4.27272
4.31105
4.24789
4.24789
答案是:B 正确

-----------------------------
第2题结果成功输出到文件:savefile\\save2.txt
成功得到结果:
4.30785
4.31105
4.25257
4.32183
答案是:D 错误 (正确答案为:C)

-----------------------------
第3题结果成功输出到文件:savefile\\save3.txt
成功得到结果:
4.47317
4.4314
4.25882
4.34237
答案是:A 正确

-----------------------------
第4题结果成功输出到文件:savefile\\save4.txt
成功得到结果:
7.15344
4.4314
6.94828
7.02264
答案是:A 错误 (正确答案为:B)

-----------------------------
第5题结果成功输出到文件:savefile\\save5.txt
成功得到结果:
7.16518
4.43581
6.95683
7.02264
答案是:A 正确

-----------------------------
第6题结果成功输出到文件:savefile\\save6.txt
成功得到结果:
7.16882
4.43563
6.97361
7.05793
答案是:A 错误 (正确答案为:C)

-----------------------------
第7题结果成功输出到文件:savefile\\save7.txt
成功得到结果:
7.36186
4.62905
7.17293
7.17759
答案是:A 错误 (正确答案为:B)

-----------------------------
第8题结果成功输出到文件:savefile\\save8.txt
成功得到结果:
7.40113
4.63213
7.21154
7.23798
答案是:A 错误 (正确答案为:B)

-----------------------------
第9题结果成功输出到文件:savefile\\save9.txt
成功得到结果:
7.4557
4.67378
7.2737
7.28944
答案是:A 错误 (正确答案为:C)

-----------------------------
第10题结果成功输出到文件:savefile\\save10.txt
成功得到结果:
7.55512
4.67378
7.2737
7.28944
答案是:A 错误 (正确答案为:D)

-----------------------------
共10篇文章,正确率为30.00%

共用时间:111.989秒.
请按任意键继续. . .

技术分享

输入文件格式:在文章第一段的结尾和最后一段的开头各标记一个 #,在题目之前和答案之前各标记一个 *。

TF-IDF算法确定阅读主题词解答英语阅读Title题目