首页 > 代码库 > 一个在字符串中查找多个关键字的函数strstrs(三种不同算法实现及效率分析)

一个在字符串中查找多个关键字的函数strstrs(三种不同算法实现及效率分析)

平时项目中有时需要用到在字符串中搜索两个或更多的关键字的情景。例如:将字符串"ab|cd#ef|"按竖线或者井号做分隔

如果是大项目,一般会采用正则表达式做处理。但有时写个小程序,不想因此引进一个正则库,所以我自己写了一个支持多关键字版本的字符串查找函数strstrs

 

函数说明:

 1 #include <stdio.h>
 2 #include <windows.h>
 3 
 4 #ifndef IN
 5 #define IN 
 6 #endif
 7 
 8 //函数说明:在字符串中搜索指定的关键字,支持1-nCnt个关键字
 9 //strToFind 待查找字符串 不允许为空
10 //strKeywords 搜索关键字字符串数组 不允许为空 数组元素不允许为空(NULL),但可以是空串("")
11 //nCnt 关键字个数
12 //pFound 查找到的关键字在字符串数组的位置 不允许为空
13 //返回值:
14 //1 如果关键字存在空串,则返回strToFind 
15 //2 如果找不到关键字则返回NULL
16 //3 如果找到关键字,则返回关键字在strKeywords中的位置(位置从0开始)
17 
18 //使用哈希加二分查找实现
19 const char *strstrs(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
20 //使用哈希加链接实现 推荐使用
21 const char *strstrs_ext(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
22 //依次查找关键字的实现
23 const char *strstrs_normal(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
24 
25 //以下是为了使用方便而增加的一些重载,没多大意义
26 char *strstrs(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
27 char *strstrs_ext(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
28 char *strstrs_normal(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound);
29 
30 char *strstrs(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
31 char *strstrs_ext(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
32 char *strstrs_normal(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
33 
34 const char *strstrs(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
35 const char *strstrs_ext(const char *strToFind, const char *strKeywords[], size_t nCnt, int pFound);
36 const char *strstrs_normal(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound);
37 void tets_strstrs(int nStep); // 0 strstrs 1 strstrs_ext 2 strstrs_normal

 

 

函数实现及相应测试代码:

技术分享
  1 // stdafx.cpp : source file that includes just the standard includes
  2 // sqlite_test.pch will be the pre-compiled header
  3 // stdafx.obj will contain the pre-compiled type information
  4 
  5 #include "stdafx.h"
  6 #include <assert.h>
  7 #include <stdlib.h>
  8 #include <time.h>
  9 #include <stdio.h>
 10 
 11 
 12 // TODO: reference any additional headers you need in STDAFX.H
 13 // and not in this file
 14 
 15 
 16 const char *strstrs(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
 17 {
 18     return strstrs(const_cast<char *>(strToFind), strKeywords, nCnt, pFound);
 19 }
 20 
 21 const char *strstrs_ext(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
 22 {
 23     return strstrs_ext(const_cast<char *>(strToFind), strKeywords, nCnt, pFound);
 24 }
 25 
 26 const char *strstrs_normal(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
 27 {
 28     return strstrs_normal(const_cast<char *>(strToFind), strKeywords, nCnt, pFound);
 29 }
 30 
 31 const char *strstrs(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
 32 {
 33     return strstrs(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
 34 }
 35 
 36 const char *strstrs_ext(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
 37 {
 38     return strstrs_ext(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
 39 }
 40 
 41 const char *strstrs_normal(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
 42 {
 43     return strstrs_normal(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
 44 }
 45 
 46 
 47 char *strstrs(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
 48 {
 49     return strstrs(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
 50 }
 51 
 52 char *strstrs_ext(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
 53 {
 54     return strstrs_ext(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
 55 }
 56 
 57 char *strstrs_normal(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound)
 58 {
 59     return strstrs_normal(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound);
 60 }
 61 
 62 typedef struct tagKeyPos
 63 {
 64     const char *m_str;
 65     size_t m_nIdx;
 66     size_t m_strLen;
 67 }KeyPos;
 68 
 69 int __strstrs_cmp(const void *p1, const void *p2)
 70 {
 71     const KeyPos *pLeft = (KeyPos *)p1, *pRight = (KeyPos *)p2;
 72     int nCmp = strcmp(pLeft->m_str, pRight->m_str);
 73     if (nCmp == 0)
 74     {
 75         return pLeft->m_nIdx - pRight->m_nIdx;
 76     }
 77 
 78     return nCmp;
 79 }
 80 
 81 //lower_bound
 82 KeyPos *__strstrs_find_first(KeyPos *pRealBeg, KeyPos *pRealEnd, size_t *pKeyLenArr, KeyPos *pKey)
 83 {
 84     KeyPos *pBeg = pRealBeg;
 85     KeyPos *pEnd = pRealEnd;
 86 
 87     KeyPos *pEqal = NULL;
 88     while (pBeg != pEnd)
 89     {
 90         pEqal = pBeg +  (pEnd - pBeg) / 2;
 91         int nCmp = memcmp( pEqal->m_str, pKey->m_str, pEqal->m_strLen );
 92         if (nCmp == 0)
 93         {
 94             //若相等,则往前找,直至找到最后一个相等的元素
 95             while (pEqal != pBeg)
 96             {
 97                 pEqal--;
 98                 if (memcmp( pEqal->m_str, pKey->m_str, pEqal->m_strLen ))
 99                 {
100                     return pEqal + 1;
101                 }
102             }
103 
104             return pBeg;
105         }
106         else if (nCmp > 0)
107         {
108             //中值比目标值大
109             pEnd = pEqal;
110         }
111         else
112         {
113             //中值比目标值小
114             pBeg = pEqal + 1;
115         }
116 
117     }
118 
119     return pRealEnd;
120 }
121 
122 char *strstrs(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
123 {
124     //作者:皇家救星 创建于:2016-10-19 
125     //有bug请发送邮件至89475049@qq.com 邮件主题注明:strstrs问题反馈
126     //异常参数判断
127     assert(strToFind != NULL);
128     assert(strKeywords != NULL);
129     assert(pFound != NULL);
130     assert(nCnt > 0);
131 
132     //记录各个关键字首字符到集合中 后面判断用
133     bool mpFirstChar[256] = {0}; //这里如果用位图,可以节省不少空间
134     for (size_t i = 0; i < nCnt; i++)
135     {
136         //linux和win的char类型定义不一样 这里统一强制转换一下
137         assert(strKeywords[i] != NULL);
138         mpFirstChar[(unsigned)strKeywords[i][0]] = true;
139         if (strKeywords[i][0] == \0)
140         {
141             *pFound = i;
142             return strToFind;
143         }
144     }
145 
146     KeyPos *sortKeywords = new KeyPos[nCnt];
147     for (size_t i = 0; i < nCnt; i++)
148     {
149         sortKeywords[i].m_str = strKeywords[i];
150         sortKeywords[i].m_strLen = strlen(strKeywords[i]);
151         sortKeywords[i].m_nIdx = i;
152     }
153     qsort(sortKeywords, nCnt, sizeof(KeyPos), __strstrs_cmp);
154 
155     const char *p = strToFind;
156     KeyPos key;
157     KeyPos *pEnd = sortKeywords + nCnt;
158     KeyPos *pResult = NULL;
159     while (*p)
160     {
161         //判断当前字符是否在关键串首字符集中
162         if (mpFirstChar[(unsigned)*p])
163         {
164             key.m_str = p;
165             pResult = __strstrs_find_first(sortKeywords, pEnd, NULL, &key);
166             if (pResult != pEnd)
167             {
168                 *pFound = pResult->m_nIdx;
169                 delete []sortKeywords;
170                 return const_cast<char *>(p);
171             }
172         }
173 
174         p++;
175     }
176 
177     delete []sortKeywords;
178     return NULL;
179 }
180 
181 char *strstrs_ext(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
182 {
183     typedef struct tagKeyPos
184     {
185         size_t m_strLen;
186         size_t m_strIdx;
187         struct tagKeyPos *m_next;
188     }KeyPos;
189 
190     //作者:皇家救星 创建于:2016-10-19 
191     //有bug请发送邮件至89475049@qq.com 邮件主题注明:strstrs问题反馈
192     //异常参数判断
193     assert(strToFind != NULL);
194     assert(strKeywords != NULL);
195     assert(pFound != NULL);
196     assert(nCnt > 0);
197 
198     //仿内存池 减少new调用次数
199     KeyPos *memPool = new KeyPos[nCnt]; //注意:memPool分配失败会抛异常
200     memset(memPool, 0, nCnt * sizeof(KeyPos));
201     int nUsed = 0;
202 
203     //记录各个关键字首字符到集合中 后面判断用
204     KeyPos mpFirstChar[256];
205     memset(mpFirstChar, 0, sizeof(mpFirstChar));
206     for (size_t i = 0; i < nCnt; i++)
207     {
208         KeyPos *pPos = &memPool[nUsed++];
209         //如果同一个首字符对应多个关键字,则用链表连起来
210         assert(strKeywords[i] != NULL);
211         pPos->m_strIdx = i;
212         pPos->m_strLen = strlen(strKeywords[i]);
213         pPos->m_next = NULL;
214 
215         if (pPos->m_strLen == 0)
216         {
217             *pFound = i;
218             delete []memPool;
219             return strToFind;
220         }
221 
222         //找到链表最后一个非空节点,把新的节点插到最后面
223         //这里其实插到最前面效率会更高,这样做是为了保持行为跟其它几个函数一致
224         KeyPos *pLast = &mpFirstChar[(unsigned)strKeywords[i][0]];
225         while (pLast->m_next)
226         {
227             pLast = pLast->m_next;
228         }
229         pLast->m_next = pPos;
230     }
231 
232     const char *p = strToFind;
233     while (*p)
234     {
235         //判断当前字符是否在关键串首字符集中
236         for (KeyPos *pPos = mpFirstChar[(unsigned)*p].m_next; pPos != NULL; pPos = pPos->m_next)
237         {
238             if (memcmp(p, strKeywords[pPos->m_strIdx], pPos->m_strLen) == 0)
239             {
240                 *pFound = pPos->m_strIdx;
241                 delete []memPool;
242                 return const_cast<char *>(p);
243             }
244         }
245 
246         p++;
247     }
248 
249     delete []memPool;
250     return NULL;
251 }
252 
253 char *strstrs_normal(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound)
254 {
255     //作者:皇家救星 创建于:2016-10-19 
256     //有bug请发送邮件至89475049@qq.com 邮件主题注明:strstrs问题反馈
257     //异常参数判断
258     assert(strToFind != NULL);
259     assert(strKeywords != NULL);
260     assert(pFound != NULL);
261     assert(nCnt > 0);
262 
263     char *p = NULL;
264     for (size_t i = 0; i < nCnt; i++)
265     {
266         assert(strKeywords[i] != NULL);
267         if (strKeywords[i][0] == \0)
268         {
269             *pFound = i;
270             return strToFind;
271         }
272     }
273 
274     for (size_t i = 0; i < nCnt; i++)
275     {
276         assert(strKeywords[i] != NULL);
277         if ((p = strstr(strToFind, strKeywords[i])) != NULL)
278         {
279             *pFound = i;
280             return p;
281         }
282     }
283     return NULL;
284 }
285 
286 
287 
288 //效率比较及准确性测试函数
289 void tets_strstrs(int nStep)
290 {
291     const int max_length = 10000;
292     const int max_keyword = 1000;
293     char *strToFound = new char[max_length + 1]; //待查找的字符串
294     char *strBackup = new char[max_length + 1]; 
295     char *strKeywords[max_keyword]; //关键字数组
296     const char strBase64[65] = {"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"};
297 
298     //为避免结果全是找不到关键字,随机将一个关键字复制到strToFound中
299     //这样肯定会有找到关键字的情况,结果更有意义
300     bool arrayFoundFlags[max_keyword] = {0}; //标记是否把关键字复制到strToFound中
301     int arrayFoundIdxs[max_keyword] = {0}; //待替换的关键字(序号)
302     int arrayFoundBeg[max_keyword] = {0}; //在strToFound替换关键字的起始位置
303 
304     srand((int)time(NULL));
305     //初始化要查询的字符串
306     for (int i = 0; i < max_length; i++)
307     {
308         strToFound[i] = strBase64[rand() % 64];
309     }
310     strToFound[max_length] = \0;
311 
312     //初始化查询关键字
313     for (int i = 0; i < max_keyword; i++)
314     {
315         strKeywords[i] = new char[1024 + 1];
316 
317         int nLen = rand() % 1024;
318         for (int j = 0; j < nLen; j++)
319         {
320             strKeywords[i][j] = strBase64[rand() % 64];
321         }
322         strKeywords[i][nLen] = \0;
323 
324         //为避免随机结果都是查不到的情况,这里增加一些干预
325         if (0 != (rand() % 10))
326         {
327             arrayFoundFlags[i] = true;
328             arrayFoundIdxs[i] = rand() % (i + 1);
329             arrayFoundBeg[i] = rand() % (max_length - strlen(strKeywords[arrayFoundIdxs[i]]));
330         }
331     }
332 
333     for (int cmpType = 0; cmpType < 3; cmpType++)
334     {
335         int nSn = 0;
336         double total_start = GetTickCount();
337         for (size_t nCnt = 0; nCnt < max_keyword; nCnt++)
338         {
339             bool bSetFound = arrayFoundFlags[nCnt];
340             int nBeg = 0;
341             int nChange = 0;
342             if (bSetFound)
343             {
344                 int idxKeyword = arrayFoundIdxs[nCnt];
345                 nChange = strlen(strKeywords[idxKeyword]);
346                 nBeg = arrayFoundBeg[nCnt];
347                 memcpy(strBackup, strToFound + nBeg, nChange);
348                 memcpy(strToFound + nBeg, strKeywords[idxKeyword], nChange);
349             }
350 
351             double start = GetTickCount();
352             int nFoundCnt = 0;
353 
354             for (int nStrlen = 0; nStrlen < max_length; nStrlen += nStep)
355             {
356                 char cBak = strToFound[nStrlen];
357                 strToFound[nStrlen] = \0;
358                 int nFound = -1;
359                 const char *p;
360                 switch (cmpType)
361                 {
362                 case 0:
363                     p = strstrs(strToFound, strKeywords, nCnt + 1, &nFound);
364                     break;
365                 case 1:
366                     p = strstrs_ext(strToFound, strKeywords, nCnt + 1, &nFound);
367                     break;
368                 default:
369                     p = strstrs_normal(strToFound, strKeywords, nCnt + 1, &nFound);
370                     break;
371                 }
372 
373                 fprintf(stderr, "cmpType %d %d %d\n", cmpType, nSn, nFound);
374                 nSn++;
375                 if (p != NULL)
376                 {
377                     nFoundCnt++;
378                 }
379 
380                 if (bSetFound && (p == NULL) && ((nBeg + nChange) <= nStrlen) &&
381                     (strstr(strToFound, strKeywords[arrayFoundIdxs[nCnt]]) != NULL))
382                 {
383                     printf("cmpType = %d ###############################error!\n", cmpType);
384 
385                     switch (cmpType)
386                     {
387                     case 0:
388                         p = strstrs(strToFound, strKeywords, nCnt + 1, &nFound);
389                         break;
390                     case 1:
391                         p = strstrs_ext(strToFound, strKeywords, nCnt + 1, &nFound);
392                         break;
393                     default:
394                         p = strstrs_normal(strToFound, strKeywords, nCnt + 1, &nFound);
395                         break;
396                     }
397                 }
398 
399                 strToFound[nStrlen] = cBak;
400             }
401 
402             double end = GetTickCount();
403             printf("%d %d %f %d\n", 
404                 cmpType, nCnt + 1, end - start, nFoundCnt);
405 
406             if (bSetFound)
407             {
408                 memcpy(strToFound + nBeg, strBackup, nChange);
409             }
410         }
411 
412 
413         double total_end = GetTickCount();
414         fprintf(stderr, "总共耗时[%f]\n", total_end - total_start);
415     }
416 
417     //TODO: 此处应该要释放内存 
418     delete []strToFound;
419     delete []strBackup;
420     for (int i = 0; i < max_keyword; i++)
421     {
422         delete []strKeywords[i];
423     }
424 }
View Code

 

函数效率比较图:

0 代表strstrs

1 代表strstrs_ext

2 代表strstrs_normal

 

可以看出strstrs_ext比较稳定,而且效率也比较高

 

技术分享

 

一个在字符串中查找多个关键字的函数strstrs(三种不同算法实现及效率分析)