Besides the technique of unrolling the loop and thus introducing performance boost for strlen function explained in Optimized strlen function it is also possible to go a little bit further and try to optimise strlen with SSE2 SIMD instructions to make it even more faster.
SSE2 instructions provide ability to execute single operation on multiple variables simultaneously and we can try to use such behaviour to optimise a stage of searching for 0 in the string.
Optimized strlen function did provide unrolling but 4 conditions are processed by separate CPU instructions if compiled for 32-bit software platform and 8! for 64-bit, so we can try to optimise it into just 2 SSE2 instructions:
- PCMPEQB (_mm_cmpeq_epi8): compares 8-bit integers for equality (we will use it to get position of 0 in 16 positions (8-bit numbers))
- PMOVMSKB (_mm_movemask_epi8): creates 16-bit mask from the most significant bits of 8-bit integers (we use it to get integral mask with 0 positions expressed in unset bits for simplified comparison and final phase 'count_bits_to_0')
Implementation:
#ifndef WORDS_BIGENDIAN
#if 0
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
{
register int i = 0;
if (!(x & (1 << 0))) i ++;
else return i;
if (!(x & (1 << 1))) i ++;
else return i;
if (!(x & (1 << 2))) i ++;
else return i;
if (!(x & (1 << 3))) i ++;
else return i;
if (!(x & (1 << 4))) i ++;
else return i;
if (!(x & (1 << 5))) i ++;
else return i;
if (!(x & (1 << 6))) i ++;
else return i;
if (!(x & (1 << 7))) i ++;
else return i;
if (!(x & (1 << 8))) i ++;
else return i;
if (!(x & (1 << 9))) i ++;
else return i;
if (!(x & (1 << 10))) i ++;
else return i;
if (!(x & (1 << 11))) i ++;
else return i;
if (!(x & (1 << 12))) i ++;
else return i;
if (!(x & (1 << 13))) i ++;
else return i;
if (!(x & (1 << 14))) i ++;
else return i;
if (!(x & (1 << 15))) i ++;
return i;
}
#elif 0
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
{
// http://www.hackersdelight.org/: ntz3() shortened for 16-bit mask by Peter Kankowski
register int n = 1;
if ((x & 0x000000FFU) == 0) {n += 8; x >>= 8;}
if ((x & 0x0000000FU) == 0) {n += 4; x >>= 4;}
if ((x & 0x00000003U) == 0) {n += 2; x >>= 2;}
return n - (x & 1);
}
#else
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes, by Nazo, post: 2009/07/20 03:40
{ // this is current winner for speed
static const unsigned char table[256] =
{
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
};
if ((unsigned char)x)
return table[(unsigned char)x];
return table[x >> 8] + 8; // t[x / 256] + 8
}
#endif
#else
#if 0
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
{
register int i = 0;
if (!(x & (1 << 15))) i ++;
else return i;
if (!(x & (1 << 14))) i ++;
else return i;
if (!(x & (1 << 13))) i ++;
else return i;
if (!(x & (1 << 12))) i ++;
else return i;
if (!(x & (1 << 11))) i ++;
else return i;
if (!(x & (1 << 10))) i ++;
else return i;
if (!(x & (1 << 9))) i ++;
else return i;
if (!(x & (1 << 8))) i ++;
else return i;
if (!(x & (1 << 7))) i ++;
else return i;
if (!(x & (1 << 6))) i ++;
else return i;
if (!(x & (1 << 5))) i ++;
else return i;
if (!(x & (1 << 4))) i ++;
else return i;
if (!(x & (1 << 3))) i ++;
else return i;
if (!(x & (1 << 2))) i ++;
else return i;
if (!(x & (1 << 1))) i ++;
else return i;
if (!(x & (1 << 0))) i ++;
return i;
}
#else
static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
{
// http://www.hackersdelight.org/: nlz1() shortened for 16-bit mask
register int n = 0;
if (x <= 0x000000FFU) {n = n + 8; x = x << 8;}
if (x <= 0x00000FFFU) {n = n + 4; x = x << 4;}
if (x <= 0x00003FFFU) {n = n + 2; x = x << 2;}
if (x <= 0x00007FFFU) {n = n + 1;}
return n;
}
#endif
#endif
size_t strlen(const char *str)
{
register size_t len = 0;
// align to 16 bytes
while ((((intptr_t)str) & (sizeof(__m128i)-1)) != 0)
{
if (*str++ == 0)
return len;
++ len;
}
// search for 0
__m128i xmm0 = _mm_setzero_si128();
__m128i xmm1;
int mask = 0;
for (;;)
{
xmm1 = _mm_load_si128((__m128i *)str);
xmm1 = _mm_cmpeq_epi8(xmm1, xmm0);
if ((mask = _mm_movemask_epi8(xmm1)) != 0)
{
// got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask
// find index of first set bit
#ifndef _DISABLE_ASM_BSF // define it to disable ASM
#if (_MSC_VER >= 1300) // make sure <intrin.h> is included
unsigned long pos;
_BitScanForward(&pos, mask);
len += (size_t)pos;
#elif defined(_MSC_VER) // earlier MSVC's do not have _BitScanForward, use inline asm
__asm bsf edx, mask ; edx = bsf(mask)
__asm add edx, len ; edx += len
__asm mov len, edx ; len = edx
#elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz
len += __builtin_ctz(mask);
#elif defined(__GNUC__) // older GCC shall use inline asm
unsigned int pos;
asm("bsf %1, %0" : "=r" (pos) : "rm" (mask));
len += (size_t)pos;
#else // none of choices exist, use local BSF implementation
len += count_bits_to_0(mask);
#endif
#else
len += count_bits_to_0(mask);
#endif
break;
}
str += sizeof(__m128i);
len += sizeof(__m128i);
}
return len;
}
This implementation would win more performance boost if 'count_bits_to_0' is optimised in less conditions.
We could use _mm_loadu_si128 to load unaligned data and thus skip own aligning loop but the performance will still be worse due to additional CPU cycles if _mm_loadu_si128 is used.
SSE2 SIMD instructions are present on all modern CPUs and thus this implementation may bring real benefits to intensive database/text processing applications.
License: Public Domain.
26 comments
Ten recent comments are shown below. Show all comments
I don't have these new MSVC compilers here, but I guess that they really don't use size_t at all, instead they recognize strlen as an intrinsic so the code optimization phase reduces the constant expression over the constant sequence of characters. I don't expect that it's so common among compilers. Anybody tried anything else? Anyway, am I right that if the string is "ab\0cd" the result will be 2 in these new MSVC compilers, which proves that they don't use size_t?
We probably misunderstood each other: size_t is not some kind of optimization, but an unsigned integer type to store the size of an object.
For "ab\0cd",
push 2
is generated by MSVC. GCC doesn't support this optimization (as of GCC 4.4).Lapsus calami. I've meant "sizeof." sizeof( "ab\0cd" ) - 1 is of course 5. Applying strlen results in 2. Newer MSVC compilers obviously really use strlen to calculate the result. I'd also like to know if the optimization is wired to specific function or if they really can recognize some function without side effects? What happens if you make your own "int mystrlen( char* s )" and you call it the same way as before?
Below is the code generated with XCode 3.2 (Mac OS X 10.6 Snow Leopard, GCC).
And it works correctly in case of return strlen("Some\0String"), moving 4 to eax.
Sorry for badly formatted post. Please feel free to adjust it.
Thanks, that's interesting. It appears that for GCC:
You can register if you want to edit your comments and post own articles.
Ace, it's wired to strlen. MSVC++ and GCC cannot recognize functions without side effects. BTW, I'm writing an article on a related topic now :)
One more question from somebody who doesn't have latest MSVC at hand, can strlen be a part of the constant expressions which are then used for the things that have to be known at the compile time, like the size of the array on the stack? On my MSVC 6, I can do
No, using functions in constant expressions would violate C/C++ standard, but it would be a very useful feature.
That's because the
s
pointer can be modified by other code. Just add aconst
qualifier to the pointer, and GCC should optimize it:And about GCC not recognizing functions without side effects, you can use GCC's function attributes to explicitly mark them
__attribute__ ((pure))
for functions without side effects, and__attribute__ ((const))
for functions without side effects that don't depend on global memory (strlen()
is pure, but not const, since it has no side effects but has to access the string pointed to by its argument).