Abstract:
Optimising strlen function using SSE2 SIMD instructions.

Created 9 months ago by Dmitry Kostjuchenko
Last changed 24 days ago
Contributors: Peter Kankowski and Nazo
Filed under Low-level code optimization

SSE2 optimised strlen

Besides the technique of unrolling the loop and thus introducing performance boost for strlen function explained in Optimized strlen function it is also possible to go a little bit further and try to optimise strlen with SSE2 SIMD instructions to make it even more faster.

SSE2 instructions provide ability to execute single operation on multiple variables simultaneously and we can try to use such behaviour to optimise a stage of searching for 0 in the string.

Optimized strlen function did provide unrolling but 4 conditions are processed by separate CPU instructions if compiled for 32-bit software platform and 8! for 64-bit, so we can try to optimise it into just 2 SSE2 instructions:

  1. PCMPEQB (_mm_cmpeq_epi8): compares 8-bit integers for equality (we will use it to get position of 0 in 16 positions (8-bit numbers))
  2. PMOVMSKB (_mm_movemask_epi8): creates 16-bit mask from the most significant bits of 8-bit integers (we use it to get integral mask with 0 positions expressed in unset bits for simplified comparison and final phase 'count_bits_to_0')

Implementation:

#ifndef WORDS_BIGENDIAN
    #if 0
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
        {
            register int i = 0;
            if (!(x & (1 << 0))) i ++;
            else return i;
            if (!(x & (1 << 1))) i ++;
            else return i;
            if (!(x & (1 << 2))) i ++;
            else return i;
            if (!(x & (1 << 3))) i ++;
            else return i;
            if (!(x & (1 << 4))) i ++;
            else return i;
            if (!(x & (1 << 5))) i ++;
            else return i;
            if (!(x & (1 << 6))) i ++;
            else return i;
            if (!(x & (1 << 7))) i ++;
            else return i;
            if (!(x & (1 << 8))) i ++;
            else return i;
            if (!(x & (1 << 9))) i ++;
            else return i;
            if (!(x & (1 << 10))) i ++;
            else return i;
            if (!(x & (1 << 11))) i ++;
            else return i;
            if (!(x & (1 << 12))) i ++;
            else return i;
            if (!(x & (1 << 13))) i ++;
            else return i;
            if (!(x & (1 << 14))) i ++;
            else return i;
            if (!(x & (1 << 15))) i ++;
            return i;
        }
    #elif 0
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
        {
            // http://www.hackersdelight.org/: ntz3() shortened for 16-bit mask by Peter Kankowski
            register int n = 1;
            if ((x & 0x000000FFU) == 0) {n += 8; x >>= 8;}
            if ((x & 0x0000000FU) == 0) {n += 4; x >>= 4;}
            if ((x & 0x00000003U) == 0) {n += 2; x >>= 2;}
            return n - (x & 1);
        }
    #else
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes, by Nazo, post: 2009/07/20 03:40
        {                                                 // this is current winner for speed
            static const unsigned char table[256] = 
            {
                7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
            };
            if ((unsigned char)x)
                return table[(unsigned char)x];
            return table[x >> 8] + 8; // t[x / 256] + 8
        }
    #endif
#else
    #if 0
        static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
        {
            register int i = 0;
            if (!(x & (1 << 15))) i ++;
            else return i;
            if (!(x & (1 << 14))) i ++;
            else return i;
            if (!(x & (1 << 13))) i ++;
            else return i;
            if (!(x & (1 << 12))) i ++;
            else return i;
            if (!(x & (1 << 11))) i ++;
            else return i;
            if (!(x & (1 << 10))) i ++;
            else return i;
            if (!(x & (1 << 9))) i ++;
            else return i;
            if (!(x & (1 << 8))) i ++;
            else return i;
            if (!(x & (1 << 7))) i ++;
            else return i;
            if (!(x & (1 << 6))) i ++;
            else return i;
            if (!(x & (1 << 5))) i ++;
            else return i;
            if (!(x & (1 << 4))) i ++;
            else return i;
            if (!(x & (1 << 3))) i ++;
            else return i;
            if (!(x & (1 << 2))) i ++;
            else return i;
            if (!(x & (1 << 1))) i ++;
            else return i;
            if (!(x & (1 << 0))) i ++;
            return i;
        }
    #else
        static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
        {
            // http://www.hackersdelight.org/: nlz1() shortened for 16-bit mask
            register int n = 0;
            if (x <= 0x000000FFU) {n = n + 8; x = x << 8;}
            if (x <= 0x00000FFFU) {n = n + 4; x = x << 4;}
            if (x <= 0x00003FFFU) {n = n + 2; x = x << 2;}
            if (x <= 0x00007FFFU) {n = n + 1;}
            return n;
        }
    #endif
#endif
size_t strlen(const char *str)
{
    register size_t len = 0;
    // align to 16 bytes
    while ((((intptr_t)str) & (sizeof(__m128i)-1)) != 0)
    {
        if (*str++ == 0)
            return len;
        ++ len;
    }
    // search for 0
    __m128i xmm0 = _mm_setzero_si128();
    __m128i xmm1;
    int mask = 0;
    for (;;)
    {
        xmm1 = _mm_load_si128((__m128i *)str);
        xmm1 = _mm_cmpeq_epi8(xmm1, xmm0);
        if ((mask = _mm_movemask_epi8(xmm1)) != 0)
        {
            // got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask
            // find index of first set bit

        #ifndef _DISABLE_ASM_BSF // define it to disable ASM
            #if (_MSC_VER >= 1300)   // make sure <intrin.h> is included
                unsigned long pos;
                _BitScanForward(&pos, mask);
                len += (size_t)pos;
            #elif defined(_MSC_VER)  // earlier MSVC's do not have _BitScanForward, use inline asm
                __asm bsf edx, mask ; edx = bsf(mask)
                __asm add edx, len  ; edx += len
                __asm mov len, edx  ; len = edx
            #elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz
                len += __builtin_ctz(mask);
            #elif defined(__GNUC__) // older GCC shall use inline asm
                unsigned int pos;
                asm("bsf %1, %0" : "=r" (pos) : "rm" (mask));
                len += (size_t)pos;
            #else                    // none of choices exist, use local BSF implementation
                len += count_bits_to_0(mask);
            #endif
        #else
            len += count_bits_to_0(mask);
        #endif

            break;
        }
        str += sizeof(__m128i);
        len += sizeof(__m128i);
    }
    return len;
}

This implementation would win more performance boost if 'count_bits_to_0' is optimised in less conditions.

We could use _mm_loadu_si128 to load unaligned data and thus skip own aligning loop but the performance will still be worse due to additional CPU cycles if _mm_loadu_si128 is used.

SSE2 SIMD instructions are present on all modern CPUs and thus this implementation may bring real benefits to intensive database/text processing applications.

License: Public Domain.

24 comments

Ten recent comments are shown below. Show all comments

PRR, 6 months ago

MSVC and most likely some other compilers optimize the code const char *pszStr="SomeString"; SomeFunction(pszStr, strlen(pszStr)); by replacing strlen call with size_t constant (10 in our case). MSVC 2008 uses the same optimization also for unicode strings, which was not there in VC2005.

So don't be afraid of using strlen, there is no need to optimize it by hand using sizeof.

Peter Kankowski, 6 months ago

Thanks! I've checked MSVC++ 2005. const char *pszStr is not optimized, but it you add static, strlen will be replaced with a constant:

static const char *s = "SomeString";
char buff[200];
wsprintf(buff, "%d", (int)strlen(s));
push   10
lea    eax, DWORD PTR _buff$[esp+204]
push   OFFSET $SG-6
push   eax
call   DWORD PTR __imp__wsprintfA

So you should remember to always add static keyword. Thank you very much; I did not know about this optimization.

ace, 5 months ago

I don't have these new MSVC compilers here, but I guess that they really don't use size_t at all, instead they recognize strlen as an intrinsic so the code optimization phase reduces the constant expression over the constant sequence of characters. I don't expect that it's so common among compilers. Anybody tried anything else? Anyway, am I right that if the string is "ab\0cd" the result will be 2 in these new MSVC compilers, which proves that they don't use size_t?

Peter Kankowski, 5 months ago

We probably misunderstood each other: size_t is not some kind of optimization, but an unsigned integer type to store the size of an object.

For "ab\0cd", push 2 is generated by MSVC. GCC doesn't support this optimization (as of GCC 4.4).

ace, 5 months ago
size_t is not some kind of optimization

Lapsus calami. I've meant "sizeof." sizeof( "ab\0cd" ) - 1 is of course 5. Applying strlen results in 2. Newer MSVC compilers obviously really use strlen to calculate the result. I'd also like to know if the optimization is wired to specific function or if they really can recognize some function without side effects? What happens if you make your own "int mystrlen( char* s )" and you call it the same way as before?

PRR, 5 months ago

Below is the code generated with XCode 3.2 (Mac OS X 10.6 Snow Leopard, GCC).

int main (int argc, const char * argv[]) {
    return strlen("SomeString");
}
0x00001f8a  <+0000>  push   %ebp
0x00001f8b  <+0001>  mov    %esp,%ebp
0x00001f8d  <+0003>  mov    $0xb,%eax
0x00001f92  <+0008>  leave  
0x00001f93  <+0009>  ret

And it works correctly in case of return strlen("Some\0String"), moving 4 to eax.

PRR, 5 months ago

Sorry for badly formatted post. Please feel free to adjust it.

Peter Kankowski, 5 months ago

Thanks, that's interesting. It appears that for GCC:

strlen("SomeString") // is optimized
 
char * s = "SomeString"; // is optimized
return strlen(s);
 
const char * s = "SomeString"; // is optimized
return strlen(s);
 
static const char * s = "SomeString"; // is not optimized
return strlen(s);

You can register if you want to edit your comments and post own articles.

Ace, it's wired to strlen. MSVC++ and GCC cannot recognize functions without side effects. BTW, I'm writing an article on a related topic now :)

ace, 5 months ago
don't be afraid of using strlen, there is no need to optimize it by hand using sizeof.

One more question from somebody who doesn't have latest MSVC at hand, can strlen be a part of the constant expressions which are then used for the things that have to be known at the compile time, like the size of the array on the stack? On my MSVC 6, I can do

char s[] = "something";
char a[ 2 * sizeof( s ) ];
Peter Kankowski, 5 months ago

No, using functions in constant expressions would violate C/C++ standard, but it would be a very useful feature.

Your name:
Comment: