首页 > 代码库 > faster strlen
faster strlen
From: Will DeWitt Jr. Subject: Fast strlen routine?NewsGroup: borland.public.delphi.language.basmDate Posted: 28-May-2003 at 13:50:4 PST Download from GoogleI‘ve been tinkering with re-writing some of the standard C run-timelibrary routines and haven‘t really played much with MMX instructions,or SSE for that matter. But I thought what I came up with wasinteresting and maybe worth sharing--function strlenmmx(s: PAnsiChar): longword; register;asm TEST EAX, EAX JZ @@Error PXOR MM1, MM1 MOV ECX, EAX // save original pointer@@1: MOVQ MM0, [EAX] // grab 8 chars PCMPEQB MM0, MM1 // check all 8 for null/0 (00 = null, FF = not null - for each char in MM0) PMOVMSKB EDX, MM0 // move 1-bit mask of each char to DL ADD EAX, 8 // move pointer forward 8 chars TEST EDX, EDX // check for any null/0 chars JNZ @@2 MOVQ MM0, [EAX] // unroll twice (#1) PCMPEQB MM0, MM1 PMOVMSKB EDX, MM0 ADD EAX, 8 TEST EDX, EDX JNZ @@2 MOVQ MM0, [EAX] // (#2) PCMPEQB MM0, MM1 PMOVMSKB EDX, MM0 ADD EAX, 8 TEST EDX, EDX JZ @@1@@2: EMMS BSF EDX, EDX SUB EAX, DWORD PTR [@@SubTable+EDX*4] SUB EAX, ECX RET@@SubTable: DD 8 DD 7 DD 6 DD 5 DD 4 DD 3 DD 2 DD 1 DD 0@@Error:end;
function _PCharLen(P: _PAnsiChr): Longint;{$IFNDEF LEGACY_PCHARLEN}begin Result := 0; if P <> nil then while P[Result] <> #0 do Inc(Result);end;{$ELSE !LEGACY_PCHARLEN}{$IFDEF CPUX86}asm TEST EAX,EAX JE @@5 PUSH EAX XOR ECX,ECX@@0: CMP CL,[EAX+0] JE @@4 CMP CL,[EAX+1] JE @@3 CMP CL,[EAX+2] JE @@2 CMP CL,[EAX+3] JE @@1 ADD EAX,4 JMP @@0@@1: INC EAX@@2: INC EAX@@3: INC EAX@@4: POP ECX SUB EAX,ECX@@5:end;{$ENDIF CPUX86}{$ENDIF !LEGACY_PCHARLEN}
int i; while (*str++ != ‘\0‘) ++i; return i;
http://www.strchr.com/optimized_strlen_function
http://www.strchr.com/sse2_optimised_strlen
size_t strlen(const char * str){ const char *s; for (s = str; *s; ++s) {} return(s - str);}
size_t strlen(const char *s) { const char *start = s; while(*s) s++; return s - start;}
// for x86 onlysize_t my_strlen(const char *s) { size_t len = 0; for(;;) { unsigned x = *(unsigned*)s; if((x & 0xFF) == 0) return len; if((x & 0xFF00) == 0) return len + 1; if((x & 0xFF0000) == 0) return len + 2; if((x & 0xFF000000) == 0) return len + 3; s += 4, len += 4; }}
#ifndef WORDS_BIGENDIAN #if 0 static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes { register int i = 0; if (!(x & (1 << 0))) i ++; else return i; if (!(x & (1 << 1))) i ++; else return i; if (!(x & (1 << 2))) i ++; else return i; if (!(x & (1 << 3))) i ++; else return i; if (!(x & (1 << 4))) i ++; else return i; if (!(x & (1 << 5))) i ++; else return i; if (!(x & (1 << 6))) i ++; else return i; if (!(x & (1 << 7))) i ++; else return i; if (!(x & (1 << 8))) i ++; else return i; if (!(x & (1 << 9))) i ++; else return i; if (!(x & (1 << 10))) i ++; else return i; if (!(x & (1 << 11))) i ++; else return i; if (!(x & (1 << 12))) i ++; else return i; if (!(x & (1 << 13))) i ++; else return i; if (!(x & (1 << 14))) i ++; else return i; if (!(x & (1 << 15))) i ++; return i; } #elif 0 static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes { // http://www.hackersdelight.org/: ntz3() shortened for 16-bit mask by Peter Kankowski register int n = 1; if ((x & 0x000000FFU) == 0) {n += 8; x >>= 8;} if ((x & 0x0000000FU) == 0) {n += 4; x >>= 4;} if ((x & 0x00000003U) == 0) {n += 2; x >>= 2;} return n - (x & 1); } #else static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes, by Nazo, post: 2009/07/20 03:40 { // this is current winner for speed static const unsigned char table[256] = { 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, }; if ((unsigned char)x) return table[(unsigned char)x]; return table[x >> 8] + 8; // t[x / 256] + 8 } #endif#else #if 0 static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes { register int i = 0; if (!(x & (1 << 15))) i ++; else return i; if (!(x & (1 << 14))) i ++; else return i; if (!(x & (1 << 13))) i ++; else return i; if (!(x & (1 << 12))) i ++; else return i; if (!(x & (1 << 11))) i ++; else return i; if (!(x & (1 << 10))) i ++; else return i; if (!(x & (1 << 9))) i ++; else return i; if (!(x & (1 << 8))) i ++; else return i; if (!(x & (1 << 7))) i ++; else return i; if (!(x & (1 << 6))) i ++; else return i; if (!(x & (1 << 5))) i ++; else return i; if (!(x & (1 << 4))) i ++; else return i; if (!(x & (1 << 3))) i ++; else return i; if (!(x & (1 << 2))) i ++; else return i; if (!(x & (1 << 1))) i ++; else return i; if (!(x & (1 << 0))) i ++; return i; } #else static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes { // http://www.hackersdelight.org/: nlz1() shortened for 16-bit mask register int n = 0; if (x <= 0x000000FFU) {n = n + 8; x = x << 8;} if (x <= 0x00000FFFU) {n = n + 4; x = x << 4;} if (x <= 0x00003FFFU) {n = n + 2; x = x << 2;} if (x <= 0x00007FFFU) {n = n + 1;} return n; } #endif#endifsize_t strlen(const char *str){ register size_t len = 0; // align to 16 bytes while ((((intptr_t)str) & (sizeof(__m128i)-1)) != 0) { if (*str++ == 0) return len; ++ len; } // search for 0 __m128i xmm0 = _mm_setzero_si128(); __m128i xmm1; int mask = 0; for (;;) { xmm1 = _mm_load_si128((__m128i *)str); xmm1 = _mm_cmpeq_epi8(xmm1, xmm0); if ((mask = _mm_movemask_epi8(xmm1)) != 0) { // got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask // find index of first set bit #ifndef _DISABLE_ASM_BSF // define it to disable ASM #if (_MSC_VER >= 1300) // make sure <intrin.h> is included unsigned long pos; _BitScanForward(&pos, mask); len += (size_t)pos; #elif defined(_MSC_VER) // earlier MSVC‘s do not have _BitScanForward, use inline asm __asm bsf edx, mask ; edx = bsf(mask) __asm add edx, len ; edx += len __asm mov len, edx ; len = edx #elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz len += __builtin_ctz(mask); #elif defined(__GNUC__) // older GCC shall use inline asm unsigned int pos; asm("bsf %1, %0" : "=r" (pos) : "rm" (mask)); len += (size_t)pos; #else // none of choices exist, use local BSF implementation len += count_bits_to_0(mask); #endif #else len += count_bits_to_0(mask); #endif break; } str += sizeof(__m128i); len += sizeof(__m128i); } return len;}
This implementation would win more performance boost if ‘count_bits_to_0‘ is optimised in less conditions.
We could use _mm_loadu_si128 to load unaligned data and thus skip own aligning loop but the performance
will still be worse due to additional CPU cycles if _mm_loadu_si128 is used.
SSE2 SIMD instructions are present on all modern CPUs and thus this implementation
may bring real benefits to intensive database/text processing applications.
License: Public Domain.
http://stackoverflow.com/questions/2372315/how-to-implement-strlen-as-fast-as-possible
also do two micro-optimizations:
Since most strings we use scan consist of ASCII chars in the range 0~127, the
high bit is (almost) never set, so only check for it in a second test.Increment an index rather than a pointer,
which is cheaper on some architectures (notably x86) and give you the length for ‘free‘...
uint32_t gatopeich_strlen32(const char* str){ uint32_t *u32 = (uint32_t*)str, u, abcd, i=0; while(1) { u = u32[i++]; abcd = (u-0x01010101) & 0x80808080; if (abcd && // If abcd is not 0, we have NUL or a non-ASCII char > 127... (abcd &= ~u)) // ... Discard non-ASCII chars { #if BYTE_ORDER == BIG_ENDIAN return 4*i - (abcd&0xffff0000 ? (abcd&0xff000000?4:3) : abcd&0xff00?2:1); #else return 4*i - (abcd&0xffff ? (abcd&0xff?4:3) : abcd&0xff0000?2:1); #endif } }}
http://www.opensource.apple.com/source/Libc/Libc-997.1.1/string/FreeBSD/strlen.c
strlen.c [plain text]/*- * Copyright (c) 2009 Xin LI <delphij@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS‘‘ AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */#include <sys/cdefs.h>__FBSDID("$FreeBSD: src/lib/libc/string/strlen.c,v 1.7 2009/01/26 07:31:28 delphij Exp $");#include <limits.h>#include <sys/types.h>#include <string.h>/* * Portable strlen() for 32-bit and 64-bit systems. * * Rationale: it is generally much more efficient to do word length * operations and avoid branches on modern computer systems, as * compared to byte-length operations with a lot of branches. * * The expression: * * ((x - 0x01....01) & ~x & 0x80....80) * * would evaluate to a non-zero value iff any of the bytes in the * original word is zero. However, we can further reduce ~1/3 of * time if we consider that strlen() usually operate on 7-bit ASCII * by employing the following expression, which allows false positive * when high bit of 1 and use the tail case to catch these case: * * ((x - 0x01....01) & 0x80....80) * * This is more than 5.2 times as fast as the raw implementation on * Intel T7300 under long mode for strings longer than word length. *//* Magic numbers for the algorithm */#if LONG_BIT == 32static const unsigned long mask01 = 0x01010101;static const unsigned long mask80 = 0x80808080;#elif LONG_BIT == 64static const unsigned long mask01 = 0x0101010101010101;static const unsigned long mask80 = 0x8080808080808080;#else#error Unsupported word size#endif#define LONGPTR_MASK (sizeof(long) - 1)/* * Helper macro to return string length if we caught the zero * byte. */#define testbyte(x) do { if (p[x] == ‘\0‘) return (p - str + x); } while (0)size_tstrlen(const char *str){ const char *p; const unsigned long *lp; /* Skip the first few bytes until we have an aligned p */ for (p = str; (uintptr_t)p & LONGPTR_MASK; p++) if (*p == ‘\0‘) return (p - str); /* Scan the rest of the string using word sized operation */ for (lp = (const unsigned long *)p; ; lp++) if ((*lp - mask01) & mask80) { p = (const char *)(lp); testbyte(0); testbyte(1); testbyte(2); testbyte(3);#if (LONG_BIT >= 64) testbyte(4); testbyte(5); testbyte(6); testbyte(7);#endif } /* NOTREACHED */ return (0);}
http://www.stdlib.net/~colmmacc/strlen.c.html
1 /* Copyright (C) 1991, 1993, 1997, 2000, 2003 Free Software Foundation, Inc. 2 This file is part of the GNU C Library. 3 Written by Torbjorn Granlund (tege@sics.se), 4 with help from Dan Sahlin (dan@sics.se); 5 commentary by Jim Blandy (jimb@ai.mit.edu). 6 7 The GNU C Library is free software; you can redistribute it and/or 8 modify it under the terms of the GNU Lesser General Public 9 License as published by the Free Software Foundation; either 10 version 2.1 of the License, or (at your option) any later version. 11 12 The GNU C Library is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 Lesser General Public License for more details. 16 17 You should have received a copy of the GNU Lesser General Public 18 License along with the GNU C Library; if not, write to the Free 19 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 20 02111-1307 USA. */ 21 22 #include <string.h> 23 #include <stdlib.h> 24 25 #undef strlen 26 27 /* Return the length of the null-terminated string STR. Scan for 28 the null terminator quickly by testing four bytes at a time. */ 29 size_t 30 strlen (str) 31 const char *str; 32 { 33 const char *char_ptr; 34 const unsigned long int *longword_ptr; 35 unsigned long int longword, magic_bits, himagic, lomagic; 36 37 /* Handle the first few characters by reading one character at a time. 38 Do this until CHAR_PTR is aligned on a longword boundary. */ 39 for (char_ptr = str; ((unsigned long int) char_ptr 40 & (sizeof (longword) - 1)) != 0; 41 ++char_ptr) 42 if (*char_ptr == ‘\0‘) 43 return char_ptr - str; 44 45 /* All these elucidatory comments refer to 4-byte longwords, 46 but the theory applies equally well to 8-byte longwords. */ 47 48 longword_ptr = (unsigned long int *) char_ptr; 49 50 /* Bits 31, 24, 16, and 8 of this number are zero. Call these bits 51 the "holes." Note that there is a hole just to the left of 52 each byte, with an extra at the end: 53 54 bits: 01111110 11111110 11111110 11111111 55 bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD 56 57 The 1-bits make sure that carries propagate to the next 0-bit. 58 The 0-bits provide holes for carries to fall into. */ 59 magic_bits = 0x7efefeffL; 60 himagic = 0x80808080L; 61 lomagic = 0x01010101L; 62 if (sizeof (longword) > 4) 63 { 64 /* 64-bit version of the magic. */ 65 /* Do the shift in two steps to avoid a warning if long has 32 bits. */ 66 magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; 67 himagic = ((himagic << 16) << 16) | himagic; 68 lomagic = ((lomagic << 16) << 16) | lomagic; 69 } 70 if (sizeof (longword) > 8) 71 abort (); 72 73 /* Instead of the traditional loop which tests each character, 74 we will test a longword at a time. The tricky part is testing 75 if *any of the four* bytes in the longword in question are zero. */ 76 for (;;) 77 { 78 /* We tentatively exit the loop if adding MAGIC_BITS to 79 LONGWORD fails to change any of the hole bits of LONGWORD. 80 81 1) Is this safe? Will it catch all the zero bytes? 82 Suppose there is a byte with all zeros. Any carry bits 83 propagating from its left will fall into the hole at its 84 least significant bit and stop. Since there will be no 85 carry from its most significant bit, the LSB of the 86 byte to the left will be unchanged, and the zero will be 87 detected. 88 89 2) Is this worthwhile? Will it ignore everything except 90 zero bytes? Suppose every byte of LONGWORD has a bit set 91 somewhere. There will be a carry into bit 8. If bit 8 92 is set, this will carry into bit 16. If bit 8 is clear, 93 one of bits 9-15 must be set, so there will be a carry 94 into bit 16. Similarly, there will be a carry into bit 95 24. If one of bits 24-30 is set, there will be a carry 96 into bit 31, so all of the hole bits will be changed. 97 98 The one misfire occurs when bits 24-30 are clear and bit 99 31 is set; in this case, the hole at bit 31 is not100 changed. If we had access to the processor carry flag,101 we could close this loophole by putting the fourth hole102 at bit 32!103 104 So it ignores everything except 128‘s, when they‘re aligned105 properly. */106 107 longword = *longword_ptr++;108 109 if (110 #if 0111 /* Add MAGIC_BITS to LONGWORD. */112 (((longword + magic_bits)113 114 /* Set those bits that were unchanged by the addition. */115 ^ ~longword)116 117 /* Look at only the hole bits. If any of the hole bits118 are unchanged, most likely one of the bytes was a119 zero. */120 & ~magic_bits)121 #else122 ((longword - lomagic) & himagic)123 #endif124 != 0)125 {126 /* Which of the bytes was the zero? If none of them were, it was127 a misfire; continue the search. */128 129 const char *cp = (const char *) (longword_ptr - 1);130 131 if (cp[0] == 0)132 return cp - str;133 if (cp[1] == 0)134 return cp - str + 1;135 if (cp[2] == 0)136 return cp - str + 2;137 if (cp[3] == 0)138 return cp - str + 3;139 if (sizeof (longword) > 4)140 {141 if (cp[4] == 0)142 return cp - str + 4;143 if (cp[5] == 0)144 return cp - str + 5;145 if (cp[6] == 0)146 return cp - str + 6;147 if (cp[7] == 0)148 return cp - str + 7;149 }150 }151 }152 }153 libc_hidden_builtin_def (strlen)
1 /* Copyright (C) 2011-2014 Free Software Foundation, Inc. 2 This file is part of the GNU C Library. 3 Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library. If not, see 17 <http://www.gnu.org/licenses/>. */ 18 19 #include <string.h> 20 #include <stdint.h> 21 #include "string-endian.h" 22 23 size_t 24 strlen (const char *s) 25 { 26 /* Get an aligned pointer. */ 27 const uintptr_t s_int = (uintptr_t) s; 28 const uint64_t *p = (const uint64_t *) (s_int & -8); 29 30 /* Read and MASK the first word. */ 31 uint64_t v = *p | MASK (s_int); 32 33 uint64_t bits; 34 while ((bits = __insn_v1cmpeqi (v, 0)) == 0) 35 v = *++p; 36 37 return ((const char *) p) + (CFZ (bits) >> 3) - s; 38 } 39 libc_hidden_builtin_def (strlen)
http://tonybai.com/2009/04/11/glibc-strlen-source-analysis/
直接操作C标准库提供的字符串操作函数是有一定风险的,稍有不慎就会导致内存问题。
这周用业余时间写了一个小型的安全字符串操作库,但是测试之后才发现自己的实现有很大的性能缺陷。
在Solaris上初步做了一个简单的性能比对,以下是得到的性能数据(以strlen的数据为例):
当传入的字符串长度为10时,执行100w次:
strlen 执行时间是:32762毫秒
my_strlen执行时间是:491836毫秒
当传入的字符串长度为20时,执行100w次:
strlen 执行时间是:35075毫秒
my_strlen执行时间是:770397毫秒
很显然,标准库中strlen的消耗仅是my_strlen的十分之一不到,且其性能消耗随着字符串长度的增加并未有近线性的增加,
而my_strlen则是变化明显。想必大家这时也能猜到my_strlen采用了传统的实现的方式,即采用逐个字节判断是否为‘‘方式,
这也与测试出的现象相符。本着刨根问底的精神,我在网上找到了GNU提供的C标准库中strlen实现的源码,
要看看GLIBC中strlen究竟采用何种技巧才达到了那么高的性能。
说实话在性能优化这方面自己一直还处于比较初级的位置,这也将是自己将来努力的一个方向。
下载了全部GLIBC的代码包,这个包还真不小。在string子目录下找到strlen.c,这就是大多数UNIX平台、
Linux平台以及绝大多数GNU软件使用的strlen的实现源码了。
这份代码由Torbjorn Granlund(还实现了memcpy)编写,Jim Blandy和Dan Sahlin提供了帮助和注释。
包括注释在内,GLIBC的strlen的代码足足有近130行,大致浏览一下, 没有怎么看懂,可耐下心来细致阅读,
还是有些心得的。下面是strlen源码摘要版,后面我将针对这段代码写一些我的理解:
1 /* Return the length of the null-terminated string STR. Scan for 2 the null terminator quickly by testing four bytes at a time. */ 3 size_t strlen (str) const char *str; 4 { 5 const char *char_ptr; 6 const unsigned long int *longword_ptr; 7 unsigned long int longword, magic_bits, himagic, lomagic; 8 9 /* Handle the first few characters by reading one character at a time. 10 Do this until CHAR_PTR is aligned on a longword boundary. */ 11 12 for (char_ptr = str; ((unsigned long int) char_ptr 13 & (sizeof (longword) – 1)) != 0; 14 ++char_ptr) 15 if (*char_ptr == ‘‘) 16 return char_ptr – str; 17 18 /* All these elucidatory comments refer to 4-byte longwords, 19 but the theory applies equally well to 8-byte longwords. */ 20 21 longword_ptr = (unsigned long int *) char_ptr; 22 23 himagic = 0x80808080L; 24 lomagic = 0x01010101L; 25 26 if (sizeof (longword) > 8) 27 abort (); 28 29 /* Instead of the traditional loop which tests each character, 30 we will test a longword at a time. The tricky part is testing 31 if *any of the four* bytes in the longword in question are zero. */ 32 33 for (;;) 34 { 35 longword = *longword_ptr++; 36 37 if ( ((longword – lomagic) & himagic) != 0) 38 { 39 /* Which of the bytes was the zero? If none of them were, it was 40 a misfire; continue the search. */ 41 42 const char *cp = (const char *) (longword_ptr – 1); 43 44 if (cp[0] == 0) 45 return cp – str; 46 if (cp[1] == 0) 47 return cp – str + 1; 48 if (cp[2] == 0) 49 return cp – str + 2; 50 if (cp[3] == 0) 51 return cp – str + 3; 52 if (sizeof (longword) > 4) 53 { 54 if (cp[4] == 0) 55 return cp – str + 4; 56 if (cp[5] == 0) 57 return cp – str + 5; 58 if (cp[6] == 0) 59 return cp – str + 6; 60 if (cp[7] == 0) 61 return cp – str + 7; 62 } 63 } 64 } 65 }
从这段代码开头作者的注释我们大致可以了解到该strlen实现的原理:
就是通过每次测试四个字节来代替传统实现中每次测试一个字节的方法。
知道这个原理了,那么还需要解决两个难题:
1) C标准库要求有很好的移植性,在绝大部分系统体系结构下都应该能正确运行。
那么每次拿出4个字节比较(unsigned long int),就需要考虑内存对齐问题,
传入的字符串的首字符地址可不一定在4对齐的地址上;
2) 如何对四个字节进行测试,找出其中某个字节为全0,这是个技巧问题。
12~21行的代码解决的就是第一个问题:
12 for (char_ptr = str; ((unsigned long int) char_ptr 13 & (sizeof (longword) – 1)) != 0; 14 ++char_ptr) 15 if (*char_ptr == ‘‘) 16 return char_ptr – str; 17 18 /* All these elucidatory comments refer to 4-byte longwords, 19 but the theory applies equally well to 8-byte longwords. */ 20 21 longword_ptr = (unsigned long int *) char_ptr;
作者通过一个for-loop找到传入字符串中第一个地址对齐到4的字符的地址,由于该地址已经对齐到4,
所以最后一行那个强制转型是安全的。虽然可以通过圆整算式直接得到该对齐地址,但是考虑到这个区间可能存在的‘‘,
一个字符一个字符比对也是不可避免的。在很多严格对齐的架构上(比如SUN的SPARC平台),
编译器一般会将字符串地址在编译器就放到对齐的地址上,这样一来,实际执行strlen时for-loop很少能执行一步。
第二个问题作者则是通过一个"带前提"的技巧来解决的。作者设定了两个掩码变量:
23 himagic = 0x80808080L; 24 lomagic = 0x01010101L;
并通过一个conditional expression完成了对四字节中全0字节的检测:
((longword – lomagic) & himagic) != 0
我们将himagic和lomagic按bit展开:
himagic 1000 0000 1000 0000 1000 0000 1000 0000
lomagic 0000 0001 0000 0001 0000 0001 0000 0001
对于这样的代码,似乎没有什么理论可以遵循,需要在实践中去理解。
起初我构造了一个不含全0字节的longword,比如:
longword 1000 0001 1000 0001 1000 0001 1000 0001,
然后按照那个条件表达式计算后,居然也满足!=0的条件,是不是作者的逻辑有问题呢?
后来转念一想,这种逻辑是有“前提条件”的。回顾一下strlen是做什么的,其输入参数是任意的么?
当然不是。输入的字符串中每个字符的值都在[0, 127]的ascii码范围内,
也就是说每个字节最高位的bit都是0,这样longword就应该是如下这个样子了:
longword 0xxx xxxx 0xxx xxxx 0xxx xxxx 0xxx xxxx
基于这样的前提我们考虑两种情况:
当longword中没有全0字节时,比如:
longword 0000 0001 0000 0001 0000 0001 0000 0001
这样在做完计算后,值为0,不满足条件。
当longword中有全零字节时,比如:
longword 0000 0000 0000 0001 0000 0001 0000 0001
这样在做完计算后,最高字节最高bit的值肯定为1,满足!=0条件,全0字节被检测出来。
也就是说一旦有全0字节,在减去lomagic时势必会产生借位,全0的那个字节在减去lomagic后最高位bit肯定由0变1,
这样与himagic一与,肯定不为0,就是这么检测出来的。
这一方法在64位平台依然适用,上面的代码摘要中省略了对64bit平台的特殊处理,为的是使代码逻辑更清晰,更易读。
faster strlen