Besides unrolling the loop to boost the performance of strlen, as explained in Optimized strlen function, it is possible to go a little further and optimize strlen with SSE2 SIMD instructions to make it even faster.
SSE2 instructions execute a single operation on multiple values simultaneously, and we can use this behaviour to speed up the search for the terminating 0 in the string.
Optimized strlen function already unrolls the loop, but its 4 comparisons are still processed by separate CPU instructions when compiled for a 32-bit platform (and 8 of them for a 64-bit one), so we can try to replace them with just 2 SSE2 instructions:
- PCMPEQB (_mm_cmpeq_epi8): compares 16 packed 8-bit integers for equality (we will use it to find the positions of 0 bytes among 16 bytes at once)
- PMOVMSKB (_mm_movemask_epi8): creates a 16-bit mask from the most significant bits of the 16 comparison results (we use it to get an integer mask in which set bits mark the 0 positions, ready for a simple comparison and the final 'count_bits_to_0' phase; see the short demo after this list)
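To illustrate what these two instructions produce, here is a small standalone demo (not part of the article's implementation; it uses an unaligned load purely for brevity):

#include <emmintrin.h> // SSE2 intrinsics
#include <stdio.h>

int main(void)
{
    const char buf[16] = "abc";                  // bytes 3..15 are zero-initialized
    __m128i zero  = _mm_setzero_si128();
    __m128i chunk = _mm_loadu_si128((const __m128i *)buf);
    __m128i eq    = _mm_cmpeq_epi8(chunk, zero); // 0xFF where the byte is 0
    int mask      = _mm_movemask_epi8(eq);       // one bit per byte
    printf("mask = 0x%04X\n", mask);             // prints 0xFFF8: first set bit at index 3 = strlen("abc")
    return 0;
}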
Implementation:
#include <emmintrin.h> // SSE2 intrinsics (_mm_*)
#include <stdint.h>    // intptr_t
#include <stddef.h>    // size_t
#if defined(_MSC_VER) && (_MSC_VER >= 1300)
#include <intrin.h>    // _BitScanForward
#endif

#ifndef WORDS_BIGENDIAN
#if 0
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
{
    register int i = 0;
    if (!(x & (1 << 0))) i++;
    else return i;
    if (!(x & (1 << 1))) i++;
    else return i;
    if (!(x & (1 << 2))) i++;
    else return i;
    if (!(x & (1 << 3))) i++;
    else return i;
    if (!(x & (1 << 4))) i++;
    else return i;
    if (!(x & (1 << 5))) i++;
    else return i;
    if (!(x & (1 << 6))) i++;
    else return i;
    if (!(x & (1 << 7))) i++;
    else return i;
    if (!(x & (1 << 8))) i++;
    else return i;
    if (!(x & (1 << 9))) i++;
    else return i;
    if (!(x & (1 << 10))) i++;
    else return i;
    if (!(x & (1 << 11))) i++;
    else return i;
    if (!(x & (1 << 12))) i++;
    else return i;
    if (!(x & (1 << 13))) i++;
    else return i;
    if (!(x & (1 << 14))) i++;
    else return i;
    if (!(x & (1 << 15))) i++;
    return i;
}
#elif 0
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
{
    // http://www.hackersdelight.org/: ntz3() shortened for 16-bit mask by Peter Kankowski
    register int n = 1;
    if ((x & 0x000000FFU) == 0) { n += 8; x >>= 8; }
    if ((x & 0x0000000FU) == 0) { n += 4; x >>= 4; }
    if ((x & 0x00000003U) == 0) { n += 2; x >>= 2; }
    return n - (x & 1);
}
#else
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes, by Nazo, post: 2009/07/20 03:40
{   // this is the current winner for speed
    // table[i] = index of the lowest set bit of i (e.g. table[0xF8] = 3);
    // entry 0 is a placeholder and is never consulted, because the byte
    // looked up below is always non-zero
    static const unsigned char table[256] =
    {
        7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
    };
    if ((unsigned char)x)
        return table[(unsigned char)x];
    return table[x >> 8] + 8; // t[x / 256] + 8
}
#endif
#else
#if 0
static inline int count_bits_to_0(unsigned int x) // counting leading zeroes
{
    register int i = 0;
    if (!(x & (1 << 15))) i++;
    else return i;
    if (!(x & (1 << 14))) i++;
    else return i;
    if (!(x & (1 << 13))) i++;
    else return i;
    if (!(x & (1 << 12))) i++;
    else return i;
    if (!(x & (1 << 11))) i++;
    else return i;
    if (!(x & (1 << 10))) i++;
    else return i;
    if (!(x & (1 << 9))) i++;
    else return i;
    if (!(x & (1 << 8))) i++;
    else return i;
    if (!(x & (1 << 7))) i++;
    else return i;
    if (!(x & (1 << 6))) i++;
    else return i;
    if (!(x & (1 << 5))) i++;
    else return i;
    if (!(x & (1 << 4))) i++;
    else return i;
    if (!(x & (1 << 3))) i++;
    else return i;
    if (!(x & (1 << 2))) i++;
    else return i;
    if (!(x & (1 << 1))) i++;
    else return i;
    if (!(x & (1 << 0))) i++;
    return i;
}
#else
static inline int count_bits_to_0(unsigned int x) // counting leading zeroes
{
    // http://www.hackersdelight.org/: nlz1() shortened for 16-bit mask
    register int n = 0;
    if (x <= 0x000000FFU) { n = n + 8; x = x << 8; }
    if (x <= 0x00000FFFU) { n = n + 4; x = x << 4; }
    if (x <= 0x00003FFFU) { n = n + 2; x = x << 2; }
    if (x <= 0x00007FFFU) { n = n + 1; }
    return n;
}
#endif
#endif
size_t strlen(const char *str)
{
    register size_t len = 0;
    // align to 16 bytes so that aligned SSE2 loads can be used below
    while ((((intptr_t)str) & (sizeof(__m128i) - 1)) != 0)
    {
        if (*str++ == 0)
            return len;
        ++len;
    }
    // search for 0
    __m128i xmm0 = _mm_setzero_si128();
    __m128i xmm1;
    int mask = 0;
    for (;;)
    {
        xmm1 = _mm_load_si128((const __m128i *)str);
        xmm1 = _mm_cmpeq_epi8(xmm1, xmm0);
        if ((mask = _mm_movemask_epi8(xmm1)) != 0)
        {
            // got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask
            // find index of first set bit
#ifndef _DISABLE_ASM_BSF // define it to disable ASM
#if (_MSC_VER >= 1300) // make sure <intrin.h> is included
            unsigned long pos;
            _BitScanForward(&pos, mask);
            len += (size_t)pos;
#elif defined(_MSC_VER) // earlier MSVC's do not have _BitScanForward, use inline asm
            __asm bsf edx, mask ; edx = bsf(mask)
            __asm add edx, len  ; edx += len
            __asm mov len, edx  ; len = edx
#elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz
            len += __builtin_ctz(mask);
#elif defined(__GNUC__) // older GCC shall use inline asm
            unsigned int pos;
            asm("bsf %1, %0" : "=r" (pos) : "rm" (mask));
            len += (size_t)pos;
#else // none of the above is available, fall back to count_bits_to_0
            len += count_bits_to_0(mask);
#endif
#else
            len += count_bits_to_0(mask);
#endif
            break;
        }
        str += sizeof(__m128i);
        len += sizeof(__m128i);
    }
    return len;
}
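A quick usage check (a hypothetical test, not from the article; since overriding the standard strlen is formally undefined behaviour, real code may prefer a name such as sse2_strlen):

#include <stdio.h>

int main(void)
{
    const char *s = "Hello, world!";
    printf("%u\n", (unsigned)strlen(s)); // prints 13
    return 0;
}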
This implementation would gain even more performance if 'count_bits_to_0' were optimized to use fewer conditions.
We could use _mm_loadu_si128 to load unaligned data and thus skip our own alignment loop, but the performance would still be worse due to the additional CPU cycles _mm_loadu_si128 costs (see the sketch below).
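For reference, a minimal sketch of such an unaligned variant (the name sse2_strlen_unaligned is hypothetical, and it reuses the article's count_bits_to_0; note that unaligned loads, unlike the aligned 16-byte loads above, may read past the end of the buffer across a page boundary, so this is shown only to weigh the trade-off):

static size_t sse2_strlen_unaligned(const char *str)
{
    const __m128i zero = _mm_setzero_si128();
    size_t len = 0;
    for (;;)
    {
        // no alignment prologue: _mm_loadu_si128 accepts any address
        __m128i chunk = _mm_loadu_si128((const __m128i *)(str + len));
        int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, zero));
        if (mask != 0)
            return len + count_bits_to_0(mask);
        len += sizeof(__m128i);
    }
}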
SSE2 SIMD instructions are present on virtually all modern x86 CPUs, so this implementation may bring real benefits to database- or text-processing-intensive applications.
License: Public Domain.
26 comments
First, the mask is limited to 16 bits, not 32. Second, you can use some methods from Henry Warren's book (Hacker's Delight) or the Bit Twiddling Hacks page, for example the ntz3() variant that now appears in the article above.
When using this function, your method is the fastest for the long string:
Thank you very much, Peter. You pointed me to excellent resources on bit tricks, and the proposed algorithm for counting trailing unset bits gives around a 9% speedup on my machine (when the set bit is somewhere around positions 13-14) compared to the original version, which is now hidden behind the #if 0 region.
The results on short strings can also be worse due to an alignment issue. I ran my tests with the following string, which was aligned to 16 bytes:
and the sse2 strlen outperformed your unrolled strlen by around 70%; but once I force, for example, 1-byte alignment, my sse2 strlen loses by almost 68% (this also matches the test results you posted)! Your unrolled strlen, by contrast, does not suffer that much from misalignment. It would be nice if you could repeat the test with 16-byte-aligned data, as I demonstrated, to see your results.
So I would conclude that, with properly aligned string data, the SSE2-optimized strlen is the clear winner for short strings. For long strings the alignment dependency matters much less, since the SSE2 zero-scanning core compensates with its speed and wins anyway.
One more drawback of the sse2 strlen is its dependency on SSE2 SIMD instructions; for those who want their own strlen, I would advise combining Optimized strlen function (the winner for universal application) on platforms lacking SSE2 with this sse2 strlen implementation on systems that have it.
I've updated the code accordingly and, thanks to your advice, added a leading-zero count for big-endian platforms, modified for a 16-bit mask.
The first test included very short strings (3-5 characters). The SSE loop needs some time to set up, so it was a little slower than the others on short strings, but very fast on the long one. IMHO, SSE is a useful technique for many programs. Thank you very much for this article.
Yes, for such short strings (3-5 chars) the SSE setup cost eliminates all the benefits, although the results are almost the same as for a plain strlen, which is excellent given the real 2-3x boost on longer strings. Thank you for the updated test results.
How about this? I'm too lazy to use VC (yes, I'm a Linux user), so this is untested, but it may be faster. (This is the table-based count_bits_to_0 that now appears in the article above.)
Thank you very much! With this method, Dmitry's function is the fastest one:
Thank you, Nazo, for your kind contribution. Yes! In my tests it is also faster, and I've just updated the source on the front page to make it the current little-endian zero-counting method of the SSE2 strlen implementation. At last, short strings are defeated with your help! :)
Piotr suddenly recalled that the 80x86 family has the BSF instruction, which does exactly what we are trying to implement, so I proposed the following addition to our strlen method with ASM inclusions (now merged into the code above as the _BitScanForward / __builtin_ctz / BSF section):
Also, performance measurements revealed that AMD K8 CPUs have a slow BSF implementation: on my AMD K8 it is 46% slower than Nazo's latest implementation.
Piotr has found in Agner Fog's publications that it is correct that the K8 is slow on BSF, although the K10 family is much improved and Intel CPUs have no problems either; so in the general case we should rely on the BSF instruction and fall back to Nazo's implementation when BSF is absent or the CPU's BSF is too slow.
From Agner Fog's table:
Results on my PC (Pentium M Dothan 1.5 GHz, using MSVC with the _BitScanForward intrinsic):
On x86, replacing
table[x >> 8] + 8
with
table[(unsigned short)x >> 8] + 8
will be a bit faster (shr → movzbl). Also, using a table with the 8 already added will be a bit faster in some cases.
Unfortunately, only with GCC. When using MSVC++ 2005,
table[(unsigned short)x >> 8]
is one instruction longer: the compiler does not use MOVZX from the high-byte register.

From the /~colmmacc/ blog:
So, strlen() optimization can be useful in practice!
Do you really think it's a valid argument? They claim "an app spent 10% CPU time in strlen, then less than 1/100th of 1%." That is a 1000x improvement, which is obviously not possible by optimizing the inside of the strlen function. The fastest way to do strlen is simply not to use it. :)
Even for constant strings the compiler can do more than most people care to remember:
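A minimal sketch of the kind of folding being discussed (pszStr and "SomeString" are taken from the comment below; on MSVC 2005 the static qualifier matters, as the follow-up comments explain):

#include <stdio.h>
#include <string.h>

int main(void)
{
    static const char *pszStr = "SomeString";
    // with optimization enabled, the compiler folds strlen(pszStr)
    // into the constant 10; no call to strlen is emitted
    printf("%u\n", (unsigned)strlen(pszStr));
    return 0;
}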
Yes, you are right. Most likely, they stored the length with each string instead of using strlen.
MSVC and most likely some other compilers optimize the code const char *pszStr = "SomeString"; SomeFunction(pszStr, strlen(pszStr)); by replacing the strlen call with a size_t constant (10 in our case). MSVC 2008 applies the same optimization to Unicode strings as well, which was not there in VC2005.
So don't be afraid of using strlen; there is no need to optimize it by hand using sizeof.
Thanks! I've checked MSVC++ 2005. const char *pszStr is not optimized, but if you add static, strlen will be replaced with a constant. So you should remember to always add the static keyword. Thank you very much; I did not know about this optimization.

I don't have these new MSVC compilers here, but I guess that they really don't use size_t at all; instead they recognize strlen as an intrinsic, so the code-optimization phase reduces the constant expression over the constant sequence of characters. I don't expect that it's so common among compilers. Has anybody tried anything else? Anyway, am I right that if the string is "ab\0cd" the result will be 2 in these new MSVC compilers, which proves that they don't use size_t?
We probably misunderstood each other: size_t is not some kind of optimization, but an unsigned integer type to store the size of an object.
For "ab\0cd",
push 2
is generated by MSVC. GCC doesn't support this optimization (as of GCC 4.4).Lapsus calami. I've meant "sizeof." sizeof( "ab\0cd" ) - 1 is of course 5. Applying strlen results in 2. Newer MSVC compilers obviously really use strlen to calculate the result. I'd also like to know if the optimization is wired to specific function or if they really can recognize some function without side effects? What happens if you make your own "int mystrlen( char* s )" and you call it the same way as before?
Below is the code generated with XCode 3.2 (Mac OS X 10.6 Snow Leopard, GCC).
And it works correctly in the case of return strlen("Some\0String"), moving 4 to eax.
Sorry for the badly formatted post. Please feel free to adjust it.
Thanks, that's interesting. It appears that for GCC:
Ace, it's wired to strlen. MSVC++ and GCC cannot recognize functions without side effects. BTW, I'm writing an article on a related topic now :)
One more question from somebody who doesn't have the latest MSVC at hand: can strlen be part of a constant expression that is then used for things that must be known at compile time, like the size of an array on the stack? On my MSVC 6, I can do:
No, using functions in constant expressions would violate C/C++ standard, but it would be a very useful feature.
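To make the distinction concrete, a small sketch (the array names are hypothetical): sizeof is a constant expression, strlen is not (in C89/C++03), so only the first declaration is valid at file scope:

char ok[sizeof("hello")];     /* fine: sizeof("hello") is the compile-time constant 6 */
/* char bad[strlen("hello")];    invalid: strlen is not a constant expression,
                                 even on compilers that fold strlen of literals */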
That's because the s pointer can be modified by other code. Just add a const qualifier to the pointer, and GCC should optimize it.

And about GCC not recognizing functions without side effects: you can use GCC's function attributes to mark them explicitly, __attribute__((pure)) for functions without side effects, and __attribute__((const)) for functions without side effects that also don't depend on global memory (strlen() is pure, but not const, since it has no side effects but has to access the string pointed to by its argument).
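A minimal sketch of these attributes (my_strlen and twice are hypothetical functions, not from the thread; marking the function pure lets GCC merge repeated calls):

#include <stddef.h>

/* no side effects, but reads memory through its argument: pure */
size_t my_strlen(const char *s) __attribute__((pure));

size_t twice(const char *s)
{
    /* because my_strlen is declared pure, GCC may emit a single call */
    return my_strlen(s) + my_strlen(s);
}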