首页 > 代码库 > 一个在字符串中查找多个关键字的函数strstrs(三种不同算法实现及效率分析)
一个在字符串中查找多个关键字的函数strstrs(三种不同算法实现及效率分析)
平时项目中有时需要用到在字符串中搜索两个或更多的关键字的情景。例如:将字符串"ab|cd#ef|"按竖线或者井号做分隔
如果是大项目,一般会采用正则表达式做处理。但有时写个小程序,不想因此引进一个正则库,所以我自己写了一个支持多关键字版本的字符串查找函数strstrs
函数说明:
1 #include <stdio.h> 2 #include <windows.h> 3 4 #ifndef IN 5 #define IN 6 #endif 7 8 //函数说明:在字符串中搜索指定的关键字,支持1-nCnt个关键字 9 //strToFind 待查找字符串 不允许为空 10 //strKeywords 搜索关键字字符串数组 不允许为空 数组元素不允许为空(NULL),但可以是空串("") 11 //nCnt 关键字个数 12 //pFound 查找到的关键字在字符串数组的位置 不允许为空 13 //返回值: 14 //1 如果关键字存在空串,则返回strToFind 15 //2 如果找不到关键字则返回NULL 16 //3 如果找到关键字,则返回关键字在strKeywords中的位置(位置从0开始) 17 18 //使用哈希加二分查找实现 19 const char *strstrs(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound); 20 //使用哈希加链接实现 推荐使用 21 const char *strstrs_ext(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound); 22 //依次查找关键字的实现 23 const char *strstrs_normal(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound); 24 25 //以下是为了使用方便而增加的一些重载,没多大意义 26 char *strstrs(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound); 27 char *strstrs_ext(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound); 28 char *strstrs_normal(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound); 29 30 char *strstrs(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound); 31 char *strstrs_ext(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound); 32 char *strstrs_normal(IN char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound); 33 34 const char *strstrs(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound); 35 const char *strstrs_ext(const char *strToFind, const char *strKeywords[], size_t nCnt, int pFound); 36 const char *strstrs_normal(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound); 37 void tets_strstrs(int nStep); // 0 strstrs 1 strstrs_ext 2 strstrs_normal
函数实现及相应测试代码:
1 // stdafx.cpp : source file that includes just the standard includes 2 // sqlite_test.pch will be the pre-compiled header 3 // stdafx.obj will contain the pre-compiled type information 4 5 #include "stdafx.h" 6 #include <assert.h> 7 #include <stdlib.h> 8 #include <time.h> 9 #include <stdio.h> 10 11 12 // TODO: reference any additional headers you need in STDAFX.H 13 // and not in this file 14 15 16 const char *strstrs(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound) 17 { 18 return strstrs(const_cast<char *>(strToFind), strKeywords, nCnt, pFound); 19 } 20 21 const char *strstrs_ext(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound) 22 { 23 return strstrs_ext(const_cast<char *>(strToFind), strKeywords, nCnt, pFound); 24 } 25 26 const char *strstrs_normal(const char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound) 27 { 28 return strstrs_normal(const_cast<char *>(strToFind), strKeywords, nCnt, pFound); 29 } 30 31 const char *strstrs(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) 32 { 33 return strstrs(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound); 34 } 35 36 const char *strstrs_ext(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) 37 { 38 return strstrs_ext(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound); 39 } 40 41 const char *strstrs_normal(const char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) 42 { 43 return strstrs_normal(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound); 44 } 45 46 47 char *strstrs(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) 48 { 49 return strstrs(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound); 50 } 51 52 char *strstrs_ext(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) 53 { 54 return strstrs_ext(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound); 55 } 56 57 char *strstrs_normal(IN char *strToFind, IN char *strKeywords[], size_t nCnt, int *pFound) 58 { 59 return strstrs_normal(const_cast<char *>(strToFind), (const char **)strKeywords, nCnt, pFound); 60 } 61 62 typedef struct tagKeyPos 63 { 64 const char *m_str; 65 size_t m_nIdx; 66 size_t m_strLen; 67 }KeyPos; 68 69 int __strstrs_cmp(const void *p1, const void *p2) 70 { 71 const KeyPos *pLeft = (KeyPos *)p1, *pRight = (KeyPos *)p2; 72 int nCmp = strcmp(pLeft->m_str, pRight->m_str); 73 if (nCmp == 0) 74 { 75 return pLeft->m_nIdx - pRight->m_nIdx; 76 } 77 78 return nCmp; 79 } 80 81 //lower_bound 82 KeyPos *__strstrs_find_first(KeyPos *pRealBeg, KeyPos *pRealEnd, size_t *pKeyLenArr, KeyPos *pKey) 83 { 84 KeyPos *pBeg = pRealBeg; 85 KeyPos *pEnd = pRealEnd; 86 87 KeyPos *pEqal = NULL; 88 while (pBeg != pEnd) 89 { 90 pEqal = pBeg + (pEnd - pBeg) / 2; 91 int nCmp = memcmp( pEqal->m_str, pKey->m_str, pEqal->m_strLen ); 92 if (nCmp == 0) 93 { 94 //若相等,则往前找,直至找到最后一个相等的元素 95 while (pEqal != pBeg) 96 { 97 pEqal--; 98 if (memcmp( pEqal->m_str, pKey->m_str, pEqal->m_strLen )) 99 { 100 return pEqal + 1; 101 } 102 } 103 104 return pBeg; 105 } 106 else if (nCmp > 0) 107 { 108 //中值比目标值大 109 pEnd = pEqal; 110 } 111 else 112 { 113 //中值比目标值小 114 pBeg = pEqal + 1; 115 } 116 117 } 118 119 return pRealEnd; 120 } 121 122 char *strstrs(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound) 123 { 124 //作者:皇家救星 创建于:2016-10-19 125 //有bug请发送邮件至89475049@qq.com 邮件主题注明:strstrs问题反馈 126 //异常参数判断 127 assert(strToFind != NULL); 128 assert(strKeywords != NULL); 129 assert(pFound != NULL); 130 assert(nCnt > 0); 131 132 //记录各个关键字首字符到集合中 后面判断用 133 bool mpFirstChar[256] = {0}; //这里如果用位图,可以节省不少空间 134 for (size_t i = 0; i < nCnt; i++) 135 { 136 //linux和win的char类型定义不一样 这里统一强制转换一下 137 assert(strKeywords[i] != NULL); 138 mpFirstChar[(unsigned)strKeywords[i][0]] = true; 139 if (strKeywords[i][0] == ‘\0‘) 140 { 141 *pFound = i; 142 return strToFind; 143 } 144 } 145 146 KeyPos *sortKeywords = new KeyPos[nCnt]; 147 for (size_t i = 0; i < nCnt; i++) 148 { 149 sortKeywords[i].m_str = strKeywords[i]; 150 sortKeywords[i].m_strLen = strlen(strKeywords[i]); 151 sortKeywords[i].m_nIdx = i; 152 } 153 qsort(sortKeywords, nCnt, sizeof(KeyPos), __strstrs_cmp); 154 155 const char *p = strToFind; 156 KeyPos key; 157 KeyPos *pEnd = sortKeywords + nCnt; 158 KeyPos *pResult = NULL; 159 while (*p) 160 { 161 //判断当前字符是否在关键串首字符集中 162 if (mpFirstChar[(unsigned)*p]) 163 { 164 key.m_str = p; 165 pResult = __strstrs_find_first(sortKeywords, pEnd, NULL, &key); 166 if (pResult != pEnd) 167 { 168 *pFound = pResult->m_nIdx; 169 delete []sortKeywords; 170 return const_cast<char *>(p); 171 } 172 } 173 174 p++; 175 } 176 177 delete []sortKeywords; 178 return NULL; 179 } 180 181 char *strstrs_ext(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound) 182 { 183 typedef struct tagKeyPos 184 { 185 size_t m_strLen; 186 size_t m_strIdx; 187 struct tagKeyPos *m_next; 188 }KeyPos; 189 190 //作者:皇家救星 创建于:2016-10-19 191 //有bug请发送邮件至89475049@qq.com 邮件主题注明:strstrs问题反馈 192 //异常参数判断 193 assert(strToFind != NULL); 194 assert(strKeywords != NULL); 195 assert(pFound != NULL); 196 assert(nCnt > 0); 197 198 //仿内存池 减少new调用次数 199 KeyPos *memPool = new KeyPos[nCnt]; //注意:memPool分配失败会抛异常 200 memset(memPool, 0, nCnt * sizeof(KeyPos)); 201 int nUsed = 0; 202 203 //记录各个关键字首字符到集合中 后面判断用 204 KeyPos mpFirstChar[256]; 205 memset(mpFirstChar, 0, sizeof(mpFirstChar)); 206 for (size_t i = 0; i < nCnt; i++) 207 { 208 KeyPos *pPos = &memPool[nUsed++]; 209 //如果同一个首字符对应多个关键字,则用链表连起来 210 assert(strKeywords[i] != NULL); 211 pPos->m_strIdx = i; 212 pPos->m_strLen = strlen(strKeywords[i]); 213 pPos->m_next = NULL; 214 215 if (pPos->m_strLen == 0) 216 { 217 *pFound = i; 218 delete []memPool; 219 return strToFind; 220 } 221 222 //找到链表最后一个非空节点,把新的节点插到最后面 223 //这里其实插到最前面效率会更高,这样做是为了保持行为跟其它几个函数一致 224 KeyPos *pLast = &mpFirstChar[(unsigned)strKeywords[i][0]]; 225 while (pLast->m_next) 226 { 227 pLast = pLast->m_next; 228 } 229 pLast->m_next = pPos; 230 } 231 232 const char *p = strToFind; 233 while (*p) 234 { 235 //判断当前字符是否在关键串首字符集中 236 for (KeyPos *pPos = mpFirstChar[(unsigned)*p].m_next; pPos != NULL; pPos = pPos->m_next) 237 { 238 if (memcmp(p, strKeywords[pPos->m_strIdx], pPos->m_strLen) == 0) 239 { 240 *pFound = pPos->m_strIdx; 241 delete []memPool; 242 return const_cast<char *>(p); 243 } 244 } 245 246 p++; 247 } 248 249 delete []memPool; 250 return NULL; 251 } 252 253 char *strstrs_normal(char *strToFind, const char *strKeywords[], size_t nCnt, int *pFound) 254 { 255 //作者:皇家救星 创建于:2016-10-19 256 //有bug请发送邮件至89475049@qq.com 邮件主题注明:strstrs问题反馈 257 //异常参数判断 258 assert(strToFind != NULL); 259 assert(strKeywords != NULL); 260 assert(pFound != NULL); 261 assert(nCnt > 0); 262 263 char *p = NULL; 264 for (size_t i = 0; i < nCnt; i++) 265 { 266 assert(strKeywords[i] != NULL); 267 if (strKeywords[i][0] == ‘\0‘) 268 { 269 *pFound = i; 270 return strToFind; 271 } 272 } 273 274 for (size_t i = 0; i < nCnt; i++) 275 { 276 assert(strKeywords[i] != NULL); 277 if ((p = strstr(strToFind, strKeywords[i])) != NULL) 278 { 279 *pFound = i; 280 return p; 281 } 282 } 283 return NULL; 284 } 285 286 287 288 //效率比较及准确性测试函数 289 void tets_strstrs(int nStep) 290 { 291 const int max_length = 10000; 292 const int max_keyword = 1000; 293 char *strToFound = new char[max_length + 1]; //待查找的字符串 294 char *strBackup = new char[max_length + 1]; 295 char *strKeywords[max_keyword]; //关键字数组 296 const char strBase64[65] = {"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"}; 297 298 //为避免结果全是找不到关键字,随机将一个关键字复制到strToFound中 299 //这样肯定会有找到关键字的情况,结果更有意义 300 bool arrayFoundFlags[max_keyword] = {0}; //标记是否把关键字复制到strToFound中 301 int arrayFoundIdxs[max_keyword] = {0}; //待替换的关键字(序号) 302 int arrayFoundBeg[max_keyword] = {0}; //在strToFound替换关键字的起始位置 303 304 srand((int)time(NULL)); 305 //初始化要查询的字符串 306 for (int i = 0; i < max_length; i++) 307 { 308 strToFound[i] = strBase64[rand() % 64]; 309 } 310 strToFound[max_length] = ‘\0‘; 311 312 //初始化查询关键字 313 for (int i = 0; i < max_keyword; i++) 314 { 315 strKeywords[i] = new char[1024 + 1]; 316 317 int nLen = rand() % 1024; 318 for (int j = 0; j < nLen; j++) 319 { 320 strKeywords[i][j] = strBase64[rand() % 64]; 321 } 322 strKeywords[i][nLen] = ‘\0‘; 323 324 //为避免随机结果都是查不到的情况,这里增加一些干预 325 if (0 != (rand() % 10)) 326 { 327 arrayFoundFlags[i] = true; 328 arrayFoundIdxs[i] = rand() % (i + 1); 329 arrayFoundBeg[i] = rand() % (max_length - strlen(strKeywords[arrayFoundIdxs[i]])); 330 } 331 } 332 333 for (int cmpType = 0; cmpType < 3; cmpType++) 334 { 335 int nSn = 0; 336 double total_start = GetTickCount(); 337 for (size_t nCnt = 0; nCnt < max_keyword; nCnt++) 338 { 339 bool bSetFound = arrayFoundFlags[nCnt]; 340 int nBeg = 0; 341 int nChange = 0; 342 if (bSetFound) 343 { 344 int idxKeyword = arrayFoundIdxs[nCnt]; 345 nChange = strlen(strKeywords[idxKeyword]); 346 nBeg = arrayFoundBeg[nCnt]; 347 memcpy(strBackup, strToFound + nBeg, nChange); 348 memcpy(strToFound + nBeg, strKeywords[idxKeyword], nChange); 349 } 350 351 double start = GetTickCount(); 352 int nFoundCnt = 0; 353 354 for (int nStrlen = 0; nStrlen < max_length; nStrlen += nStep) 355 { 356 char cBak = strToFound[nStrlen]; 357 strToFound[nStrlen] = ‘\0‘; 358 int nFound = -1; 359 const char *p; 360 switch (cmpType) 361 { 362 case 0: 363 p = strstrs(strToFound, strKeywords, nCnt + 1, &nFound); 364 break; 365 case 1: 366 p = strstrs_ext(strToFound, strKeywords, nCnt + 1, &nFound); 367 break; 368 default: 369 p = strstrs_normal(strToFound, strKeywords, nCnt + 1, &nFound); 370 break; 371 } 372 373 fprintf(stderr, "cmpType %d %d %d\n", cmpType, nSn, nFound); 374 nSn++; 375 if (p != NULL) 376 { 377 nFoundCnt++; 378 } 379 380 if (bSetFound && (p == NULL) && ((nBeg + nChange) <= nStrlen) && 381 (strstr(strToFound, strKeywords[arrayFoundIdxs[nCnt]]) != NULL)) 382 { 383 printf("cmpType = %d ###############################error!\n", cmpType); 384 385 switch (cmpType) 386 { 387 case 0: 388 p = strstrs(strToFound, strKeywords, nCnt + 1, &nFound); 389 break; 390 case 1: 391 p = strstrs_ext(strToFound, strKeywords, nCnt + 1, &nFound); 392 break; 393 default: 394 p = strstrs_normal(strToFound, strKeywords, nCnt + 1, &nFound); 395 break; 396 } 397 } 398 399 strToFound[nStrlen] = cBak; 400 } 401 402 double end = GetTickCount(); 403 printf("%d %d %f %d\n", 404 cmpType, nCnt + 1, end - start, nFoundCnt); 405 406 if (bSetFound) 407 { 408 memcpy(strToFound + nBeg, strBackup, nChange); 409 } 410 } 411 412 413 double total_end = GetTickCount(); 414 fprintf(stderr, "总共耗时[%f]\n", total_end - total_start); 415 } 416 417 //TODO: 此处应该要释放内存 418 delete []strToFound; 419 delete []strBackup; 420 for (int i = 0; i < max_keyword; i++) 421 { 422 delete []strKeywords[i]; 423 } 424 }
函数效率比较图:
0 代表strstrs
1 代表strstrs_ext
2 代表strstrs_normal
可以看出strstrs_ext比较稳定,而且效率也比较高
一个在字符串中查找多个关键字的函数strstrs(三种不同算法实现及效率分析)
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。