首页 > 代码库 > 2nd 词频统计更新
2nd 词频统计更新
词频统计更新
实现功能:从控制台输入文件路径,统计文件中的单词总数和不重复单词数,输出所有单词的词频,并按词频从高到低排序后输出词频最高的 10 个词。
头文件
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
定义宏
/* Size, in bytes, of the character buffers that hold a single word
 * (the per-node storage and the scratch buffer used while reading). */
#define WORD_LENGTH 250
定义结构体及全局变量
typedef struct Node{ char word[WORD_LENGTH]; int time; struct Node *next;}wordNode;typedef struct TopNode{ int sum; //全文单词个数 int num; //全文无重复单词个数 wordNode * next;}TopNode;TopNode t;TopNode * L = NULL;
声明文件中使用的函数
/* Forward declarations of the functions defined later in this file. */

/* Word handling and counting. */
wordNode *wordSearch(char *word);
void wordJob(char word[]);
void wordCount(char *word);

/* Reporting. */
void printCountList();
void PrintFirstTenTimes();

/* Sorting of the linked list. */
void mergeSort(wordNode **headnode);
void FrontBackSplit(wordNode *source, wordNode **pre, wordNode **next);
wordNode *SortedMerge(wordNode *pre, wordNode *next);

/* Cleanup. */
void release();
主函数
int main(int argc,char *argv[]){ char temp[WORD_LENGTH];//定义用以临时存放单词的数组 char file_path[100]; wordNode * h; FILE *file; printf("请输入文件路径:"); gets(file_path); if((file = fopen(file_path, "r")) == NULL) { printf("文件读取失败!"); exit(1); } L = &t; L->num = 0; L->sum = 0; L->next = NULL; while((fscanf(file,"%s",temp))!= EOF) { L->sum++; wordJob(temp); wordCount(temp); } fclose(file); printCountList(); printf("\n\n输出词频最高的10个词\n"); h = L->next; mergeSort(&h); //排序 PrintFirstTenTimes(); release(); return 0;}
查找单词所在节点并返回
wordNode *wordSearch(char *word){ char * t; wordNode *node; wordNode *nextNode = L->next; if(L->next == NULL) { node = (wordNode*)malloc(sizeof(wordNode)); strcpy(node->word,word); node->time = 0; node->next = NULL; //初试化,必须有,否则会发生错误。 L->num++; L->next = node; return node; } while(nextNode != NULL) //查找匹配单词 { t = nextNode->word; if(strcmp(t,word) == 0) { return nextNode; } nextNode = nextNode->next; } if(nextNode == NULL) //原链表中不存在该单词 { node = (wordNode*)malloc(sizeof(wordNode)); strcpy(node->word, word); node->time = 0; node->next = L->next; L->next = node; L->num++; return node; } else return nextNode; //返回查找到的节点}
词频统计
void wordCount(char *word){ wordNode *tmpNode; tmpNode = wordSearch(word); //word所在的节点 tmpNode->time++;}
输出所有词频
/*
 * Print the totals stored in the list header followed by every word and
 * its count, four entries per output line. Prints a notice instead when
 * the list is empty.
 */
void printCountList()
{
    wordNode *cur;
    int printed;

    if (L->next == NULL) {
        printf("该文件无内容!");
        return;
    }

    printf("\n这篇文章总计%d词\n\n不重复单词共%d个\n", L->sum, L->num);
    printf("\n输出所有单词的频数\n");

    for (cur = L->next, printed = 1; cur != NULL; cur = cur->next, printed++) {
        printf(" %s:%d次\t", cur->word, cur->time);
        /* break the line after every fourth entry */
        if (printed % 4 == 0) {
            printf("\n");
        }
    }
}
输出词频最高的10个词
/*
 * Print at most the first ten entries of the list, one per line, starting
 * at L->next. Prints a notice instead when the list is empty.
 */
void PrintFirstTenTimes()
{
    wordNode *cur = L->next;
    int remaining;

    if (cur == NULL) {
        printf("该文件无内容!");
        return;
    }

    for (remaining = 10; cur != NULL && remaining > 0; remaining--) {
        printf("\t%s:%d次\n", cur->word, cur->time);
        cur = cur->next;
    }
}
对词频统计结果进行归并排序(按词频从高到低)
/*
 * Merge sort on a linked list.
 *
 * headnode: address of the pointer to the first node; on return it points
 *           to the first node of the sorted list.
 *
 * The list is split in two with FrontBackSplit(), each half is sorted
 * recursively, and the halves are joined again with SortedMerge().
 */
void mergeSort(wordNode **headnode)
{
    wordNode *front;
    wordNode *back;

    /* A list of zero or one node is already sorted. */
    if (*headnode == NULL || (*headnode)->next == NULL) {
        return;
    }

    FrontBackSplit(*headnode, &front, &back);
    mergeSort(&front);
    mergeSort(&back);
    *headnode = SortedMerge(front, back);
}
将链表从中间拆分为前后两个子链表
/*
 * Cut the list starting at `source` into two halves.
 *
 * source: first node of the list to split (may be NULL).
 * pre:    receives the first node of the front half (always `source`).
 * next:   receives the first node of the back half, or NULL when the list
 *         has fewer than two nodes.
 *
 * When the number of nodes is odd, the extra node stays in the front half.
 */
void FrontBackSplit(wordNode *source, wordNode **pre, wordNode **next)
{
    wordNode *mid;
    wordNode *runner;

    *pre = source;
    if (source == NULL || source->next == NULL) {
        *next = NULL;
        return;
    }

    /* `runner` moves two nodes for every node `mid` moves, so `mid` stops
     * on the last node of the front half. */
    mid = source;
    runner = source->next;
    while (runner != NULL && runner->next != NULL) {
        mid = mid->next;
        runner = runner->next->next;
    }

    *next = mid->next;
    mid->next = NULL;   /* terminate the front half */
}
按词频从高到低合并两个已排序的子链表
/*
 * Merge two lists that are each ordered by descending count into a single
 * list ordered by descending count.
 *
 * pre, next: first nodes of the two input lists (either may be NULL).
 *
 * Returns the first node of the merged list. Nodes are relinked in place;
 * nothing is allocated or freed. On equal counts the node from `pre` comes
 * first.
 *
 * The merge is iterative: a recursive merge needs one stack frame per node
 * and can overflow the stack when the list is long.
 */
wordNode *SortedMerge(wordNode *pre, wordNode *next)
{
    wordNode *head = NULL;
    wordNode **tail = &head;   /* address of the link to fill in next */

    while (pre != NULL && next != NULL) {
        if (pre->time >= next->time) {
            *tail = pre;
            pre = pre->next;
        } else {
            *tail = next;
            next = next->next;
        }
        tail = &(*tail)->next;
    }

    /* At most one list still has nodes; append it unchanged. */
    *tail = (pre != NULL) ? pre : next;
    return head;
}
处理单词
/*
 * Normalise a word in place: ASCII upper-case letters are converted to
 * lower case and every character that is not an ASCII letter is removed.
 *
 * word: NUL-terminated, writable string. On return it holds only the
 *       characters 'a'..'z' and may be empty.
 *
 * Implemented as a single pass with separate read and write positions, so
 * a character that follows a removed one is never skipped.
 */
void wordJob(char word[])
{
    size_t src;
    size_t dst = 0;

    for (src = 0; word[src] != '\0'; src++) {
        char c = word[src];

        if (c >= 'A' && c <= 'Z') {
            c = (char)(c - 'A' + 'a');
        }
        if (c >= 'a' && c <= 'z') {
            word[dst++] = c;
        }
    }
    word[dst] = '\0';
}
释放所有结点内存
/*
 * Free every node of the word list. Afterwards L->next is NULL.
 */
void release()
{
    wordNode *victim;

    /* Unlink the first node, free it, repeat until the list is empty. */
    while ((victim = L->next) != NULL) {
        L->next = victim->next;
        free(victim);
    }
}
ssh://git@git.coding.net:amberpass/cptjgx.git
https://git.coding.net/amberpass/cptjgx.git
2nd 词频统计更新
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。