浅析敏感词过滤算法(C++)

首页 > 代码库 > 浅析敏感词过滤算法(C++)

浅析敏感词过滤算法(C++)

2024-07-15 06:05:09 227人阅读

为了提高查找效率，这里将敏感词用树形结构存储，每个节点有一个map成员，其映射关系为一个string对应一个TreeNode。

STL::map是按照operator<比较判断元素是否相同，以及比较元素的大小，然后选择合适的位置插入到树中。为了提高map的插入及查询效率，可以选用hash_map或unordered_map。关于他们的效率，可以参考http://blog.csdn.net/whizchen/article/details/9286557。

下面主要实现了TreeNode类，进行节点的插入以及查询。(这里命名用Trie比较专业)

 1 #include<map> 2 #include<string> 3 //#include<unordered_map>   4 using namespace std; 5  6  7 class Tree; 8 class TreeNode 9 {10     friend class Tree;11     typedef map<string,TreeNode> _TreeMap;12     typedef map<string,TreeNode>::iterator _TreeMapIterator;13 //     typedef unordered_map<string,TreeNode> _TreeMap;14 //     typedef unordered_map<string,TreeNode>::iterator _TreeMapIterator;15 private:16     string m_character;17     _TreeMap m_map;18     TreeNode* m_parent;19 public:20     TreeNode(string character);21     TreeNode(){22         m_character="";23     };24     string getCharacter() const;25     TreeNode* findChild(string& nextCharacter);26     TreeNode* insertChild(string& nextCharacter);27     28 };

TreeNode.h

#include <iostream>#include "TreeNode.h"using namespace std;string TreeNode::getCharacter() const {    return m_character;}TreeNode::TreeNode(string character){    if (character.size() == 2)        m_character.assign(character);    else         cout<<"error";}TreeNode* TreeNode::findChild(string& nextCharacter){    _TreeMapIterator TreeMapIt = m_map.find(nextCharacter);    //利用map/unordered_map进行查找    return (TreeMapIt == m_map.end()) ? NULL :&TreeMapIt->second;    return NULL;}TreeNode* TreeNode::insertChild(string& nextCharacter){    if(!findChild(nextCharacter))    //添加节点，并返回节点位置    {        m_map.insert(pair<string, TreeNode>(nextCharacter, TreeNode(nextCharacter)));        return &(m_map.find(nextCharacter)->second);    }     return NULL;}

TreeNode.cpp

接下来实现这个tree，在建立TreeNode树时，以parent为根节点建立，一开始parent为m_emptyRoot,然后把keyword按照规则添加到树中，假设一开始m_emptyRoot为空，keyword为"敏感词"，则会以"敏感词"为一条分支建立成为一颗树枝‘敏‘->‘感‘->‘词‘,此后，若想再添加"敏感度"，由于"敏感词"与"敏感度"的前两个字相同，则会在‘敏‘->‘感‘->‘词‘的基础上，从字‘感‘开始新生长出一颗分支，即‘敏‘->‘感‘->‘度‘,这两颗分支共用‘敏‘->‘感‘。

程序中暂时考虑中文的情况，如果需要考虑英文或中英文结合的情况，将PACE改为1，另外程序做出部分修改即可。

下面代码实现了Tree类，进行树的构成及查询。

 1 #include "TreeNode.h" 2 using namespace std; 3  4 class Tree 5 { 6 public: 7     int count;    //当前查找的一个敏感词的字数 8     TreeNode* insert(string& keyword); 9     TreeNode* insert(const char* keyword);10     TreeNode* find(string& keyword);11     Tree(){12         count = 0;13     };14 private:15     TreeNode m_emptyRoot;16     int m_pace;17     TreeNode* insert(TreeNode* parent, string& keyword);18     TreeNode* insertBranch(TreeNode* parent, string& keyword);19     TreeNode* find(TreeNode* parent,string& keyword);20 21 };

Tree.h

 1 #include "Tree.h" 2 #include<iostream> 3  4 #define PACE 2    //如果需要考虑英文或中英文结合的情况，将PACE改为1，另外程序还需要做部分修改 5  6 TreeNode* Tree::insert(string& keyword)  7 { 8     return insert(&m_emptyRoot, keyword); 9 }10 11 TreeNode* Tree::insert(const char* keyword)12 {13     string wordstr(keyword);14     return insert(wordstr);15 }16 17 18 TreeNode* Tree::insert(TreeNode* parent, string& keyword)19 {20     if(keyword.size()==0)21         return NULL;22     string firstChar=keyword.substr(0,PACE);23     TreeNode* firstNode = parent->findChild(firstChar);24     if(firstNode==NULL)25         return insertBranch(parent,keyword);26     string restChar=keyword.substr(PACE,keyword.size());27     return insert(firstNode,restChar);28 }29 30 TreeNode* Tree::insertBranch(TreeNode* parent,string& keyword)31 {32     string firstChar=keyword.substr(0,PACE);33     TreeNode* firstNode = parent->insertChild(firstChar);34     if(firstNode!=NULL)35     {36         string restChar=keyword.substr(PACE,keyword.size());37         if(!restChar.empty())38             return insertBranch(firstNode,restChar);39     }40     return NULL;41 }42 43 TreeNode* Tree::find(string& keyword)44 {45     return find(&m_emptyRoot,keyword);46 }47 48 49 TreeNode* Tree::find(TreeNode* parent,string& keyword)50 {51     string firstChar=keyword.substr(0,PACE);52     TreeNode* firstNode = parent->findChild(firstChar);53     if(firstNode==NULL)            //未找到54     {55         count=0;56         return NULL;        57     }58     string restChar=keyword.substr(PACE,keyword.size());59     if(firstNode->m_map.empty())        //对应词组结束，则判断该词为敏感词60     {61         //std::cout<<count+1<<endl;62         return firstNode;63     }64     if(keyword.size()==PACE)    //最后一个字65         return NULL;66     count++;67     return find(firstNode,restChar);68 }

Tree.cpp

最后就是利用上述的Tree来实现敏感词过滤，Filter::censor(string& source)函数用来进行敏感词过滤，source即输入的字符串，如果source包含敏感词，则用"**"代替掉。

Filter::load(const char* filePath)函数通过文件载入敏感词，并构建Tree。

为使实现简单，代码中过滤了英文数字及一些符号，让敏感词库的词能全部被识别。这里有2个问题遗留下来：

1.需要考虑英文，已经中英文结合的敏感词。程序还需要作出一定修改；

2.载入文件后，可对敏感词做出一定优化。

下面代码实现了Filter类，调用函数实现敏感词过滤。

 1 #include <string> 2 #include "Tree.h" 3  4 class Filter 5 { 6 private: 7     Tree m_tree; 8      9 public:10     void load(const char* fileName);11     bool m_initialized;12     void censor(string& source);13 };

Filter.h

 1 #include <iostream> 2 #include <fstream> 3 #include "Filter.h" 4  5 void Filter::load(const char* filePath) 6 { 7     ifstream keywordsFile(filePath, ios::in); 8     if (keywordsFile.is_open()) 9     {10         char buffer[256];11         int count = 0;12         int offset = 0;13         while((buffer[offset]=keywordsFile.get())!=EOF)14         {15             if((buffer[offset]>=‘a‘&&buffer[offset]<=‘z‘)||16                 (buffer[offset]>=‘A‘&&buffer[offset]<=‘Z‘)||17                 (buffer[offset]>=‘0‘&&buffer[offset]<=‘9‘)||18                 buffer[offset]==‘\‘‘)19                 continue;            20             string word1;21             word1.assign(buffer,offset);22             if(buffer[offset]==‘,‘&&(offset%2)==0)23             {24                 string word;25                 if(offset)26                 {27                     word.assign(buffer,offset);28                     m_tree.insert(word);29                 }30                 offset = 0;31             }32             else33                 offset++;34         }35     }36     keywordsFile.close();37     m_initialized = true;38 }39 40 41 void Filter::censor(string& source)42 {43     if (!m_initialized)44     {45         cout<<"没有载入关键词";46         return;47     }48     else49     {50         int length = source.size();51         for (int i = 0; i < length; i += 2)52         {53             string substring = source.substr(i, length - i);54             if (m_tree.find(substring) != NULL)    //发现敏感词55             {56                 cout<<substring.substr(0,(m_tree.count+1)*2)<<endl;57                 source.replace(i,(m_tree.count+1)*2,"**");58                 length = source.size();59             }60         }61     }62 }

Filter.cpp

最后就是调用Filter类，通过文件输入，并将过滤的结果输出到文件，并输出用时。

 1 #include<iostream> 2 #include<string> 3 #include<list> 4 #include <fstream> 5 #include "Filter.h" 6 #include <sys/timeb.h> 7 using namespace std; 8  9 void main()10 {11     Filter filter;12     string str;13     filter.load("keywords.txt");14     ifstream inputFile("input.txt",ios::in);15     inputFile>>str;16     inputFile.close();17     ofstream outputFile("output.txt",ios::out);18     struct timeb startTime,endTime;19     ftime(&startTime);20     for(int i=0;i<1000;i++)21     {22         filter.censor(str);23     }24     ftime(&endTime);25     cout<<str<<endl;26     cout<<"查询用时："<<(endTime.time-startTime.time)*1000 +27         (endTime.millitm - startTime.millitm)<<"ms"<<endl;28     outputFile<<str<<endl;29     outputFile<<"查询用时："<<(endTime.time-startTime.time)*1000 + 30         (endTime.millitm - startTime.millitm)<<"ms";31     outputFile.close();32 }

Process.cpp

声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任，若内容有误或涉及侵权可进行投诉：投诉/举报工作人员会在5个工作日内联系你，一经查实，本站将立刻删除涉嫌侵权内容。

联系
我们

首页 > 代码库 > 浅析敏感词过滤算法(C++)

浅析敏感词过滤算法(C++)

看完仍有疑问？有类似问题直接问程序猿