Abstract:
Optimising strlen function using SSE2 SIMD instructions.

Created by Dmitry Kostjuchenko
Last changed
Contributors: Peter Kankowski and Nazo
Filed under Low-level code optimization

Share on social sitesReddit Digg Delicious Buzz Facebook Twitter

SSE2 optimised strlen

Besides the loop-unrolling technique, which already gives the strlen function a performance boost as explained in Optimized strlen function, it is possible to go a little further and optimise strlen with SSE2 SIMD instructions to make it even faster.

SSE2 instructions provide the ability to execute a single operation on multiple variables simultaneously, and we can use this behaviour to optimise the stage of searching for 0 in the string.

Optimized strlen function did provide unrolling but 4 conditions are processed by separate CPU instructions if compiled for 32-bit software platform and 8! for 64-bit, so we can try to optimise it into just 2 SSE2 instructions:

  1. PCMPEQB (_mm_cmpeq_epi8): compares 8-bit integers for equality (we will use it to get position of 0 in 16 positions (8-bit numbers))
  2. PMOVMSKB (_mm_movemask_epi8): creates 16-bit mask from the most significant bits of 8-bit integers (we use it to get integral mask with 0 positions expressed in unset bits for simplified comparison and final phase 'count_bits_to_0')

Implementation:

#ifndef WORDS_BIGENDIAN
    #if 0
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
        {
            // Scan the 16-bit mask from the lowest bit upwards and report the
            // index of the first set bit; yields 16 when no bit is set.
            int index;
            for (index = 0; index < 16; ++index)
            {
                if (x & (1U << index))
                    break;
            }
            return index;
        }
    #elif 0
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
        {
            // Binary-search ntz from Hacker's Delight (ntz3), trimmed for a
            // 16-bit mask by Peter Kankowski: discard empty low halves, then
            // subtract the final bit to land on the exact position.
            int shift = 1;
            if (!(x & 0x000000FFU)) { shift += 8; x >>= 8; }
            if (!(x & 0x0000000FU)) { shift += 4; x >>= 4; }
            if (!(x & 0x00000003U)) { shift += 2; x >>= 2; }
            return shift - (int)(x & 1U);
        }
    #else
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes, by Nazo, post: 2009/07/20 03:40
        {                                                 // this is current winner for speed
            // Trailing-zero count for every possible byte value; entry 0 is
            // never consulted on the low-byte path below.
            static const unsigned char lut[256] =
            {
                7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
            };
            const unsigned char low = (unsigned char)x;
            if (low)
                return lut[low];
            // Low byte is all zero bits: the answer lives in the high byte.
            return lut[x >> 8] + 8; // lut[x / 256] + 8
        }
    #endif
#else
    #if 0
        static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
        {
            // Big-endian flavour: scan the 16-bit mask from the most
            // significant bit downwards and report how many leading positions
            // are clear; yields 16 when no bit is set.
            int cleared;
            for (cleared = 0; cleared < 16; ++cleared)
            {
                if (x & (1U << (15 - cleared)))
                    break;
            }
            return cleared;
        }
    #else
        static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
        {
            // Branching nlz from Hacker's Delight (nlz1), reduced to a 16-bit
            // mask: shift the value left in halves until the top bit would be
            // reached, accumulating the count of leading zero positions.
            int zeros = 0;
            if (x <= 0x000000FFU) { zeros += 8; x <<= 8; }
            if (x <= 0x00000FFFU) { zeros += 4; x <<= 4; }
            if (x <= 0x00003FFFU) { zeros += 2; x <<= 2; }
            if (x <= 0x00007FFFU) { zeros += 1; }
            return zeros;
        }
    #endif
#endif
size_t strlen(const char *str)
{
    // SSE2 strlen: compare 16 bytes at a time against zero and use the
    // resulting byte mask to locate the terminating NUL.
    register size_t len = 0;
    // align to 16 bytes: the _mm_load_si128 below requires a 16-byte aligned
    // address, so consume any leading unaligned bytes one at a time first
    while ((((intptr_t)str) & (sizeof(__m128i)-1)) != 0)
    {
        if (*str++ == 0)
            return len;
        ++ len;
    }
    // search for 0
    __m128i xmm0 = _mm_setzero_si128();  // all-zero reference vector
    __m128i xmm1;
    int mask = 0;
    for (;;)
    {
        // Aligned 16-byte load; this may read past the terminator, but stays
        // inside the current aligned 16-byte chunk.  NOTE(review): standard
        // technique -- assumes an aligned 16-byte load never crosses into an
        // unmapped page; memory checkers (e.g. valgrind) may still flag the
        // over-read.
        xmm1 = _mm_load_si128((__m128i *)str);
        // Each byte of xmm1 becomes 0xFF where the input byte was 0.
        xmm1 = _mm_cmpeq_epi8(xmm1, xmm0);
        // mask holds one bit per byte (bit i set => byte i was 0).
        if ((mask = _mm_movemask_epi8(xmm1)) != 0)
        {
            // got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask
            // find index of first set bit

        #ifndef _DISABLE_ASM_BSF // define it to disable ASM
            #if (_MSC_VER >= 1300)   // make sure <intrin.h> is included
                unsigned long pos;
                _BitScanForward(&pos, mask);
                len += (size_t)pos;
            #elif defined(_MSC_VER)  // earlier MSVC's do not have _BitScanForward, use inline asm
                __asm bsf edx, mask ; edx = bsf(mask)
                __asm add edx, len  ; edx += len
                __asm mov len, edx  ; len = edx
            #elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz
                len += __builtin_ctz(mask);
            #elif defined(__GNUC__) // older GCC shall use inline asm
                unsigned int pos;
                asm("bsf %1, %0" : "=r" (pos) : "rm" (mask));
                len += (size_t)pos;
            #else                    // none of choices exist, use local BSF implementation
                len += count_bits_to_0(mask);
            #endif
        #else
            len += count_bits_to_0(mask);
        #endif

            break;
        }
        // No zero byte in this chunk: advance to the next 16-byte block.
        str += sizeof(__m128i);
        len += sizeof(__m128i);
    }
    return len;
}

This implementation would gain an even bigger performance boost if 'count_bits_to_0' were optimised to use fewer conditions.

We could use _mm_loadu_si128 to load unaligned data and thus skip our own alignment loop, but the performance would still be worse because of the additional CPU cycles that _mm_loadu_si128 requires.

SSE2 SIMD instructions are present on all modern CPUs and thus this implementation may bring real benefits to intensive database/text processing applications.

License: Public Domain.

About the author

Dmitry Kostjuchenko

Peter is the developer of Aba Search and Replace, a tool for replacing text in multiple files. He likes to program in C with a bit of C++, also in x86 assembly language, Python, and PHP.

Created by Dmitry Kostjuchenko
Last changed
Contributors: Peter Kankowski and Nazo

26 comments

Peter Kankowski,
This implementation would win more performance boost if 'count_bits_to_0' is optimised in less conditions.

First, the mask is limited to 16 bits, not 32. Second, you can use some methods from Henry Warren's book or Bit Twiddling Hacks page, for example:

static inline int count_bits_to_0(unsigned int x)
{
   // For little-endian machines
   int n = 1;
   if ((x & 0xFF) == 0) {n += 8; x >>= 8;}
   if ((x & 0x0F) == 0) {n += 4; x >>= 4;}
   if ((x & 0x03) == 0) {n += 2; x >>= 2;}
   return n - (x & 1);
}

When using this function, your method is the fastest for the long string:

Function Short strings The long string
strlen by Microsoft 1471 2104
strlen by Peter 1019 1471
strlen by Dmitry 1730 670
strlen by Agner Fog 1279 1056
Dmitry Kostjuchenko,

Thank you very much Peter, you advised excellent resources for bit tricks, and the proposed algorithm for counting unset bits also provides around a 9% speedup (if the bit was set somewhere in positions 14-13) on my machine compared to the original version, which is now hidden by the #if 0 region.

The results on short strings can be worse also due alignment issue. I made my tests with the following string which was aligned to 16 bytes:

__declspec(align(16)) const char tst[] = "My Short Strlen Test";

and sse2 strlen outperformed your unrolled strlen by around 70%, but once I force alignment to 1 byte for example, my sse2 strlen is losing by almost 68% (this is also true according to the test results you posted)!!! While your unrolled strlen does not suffer that much from alignment issues. It would be nice if you could repeat the test for 16-byte aligned data as I have demonstrated to see your results.

So I would conclude that properly aligned string data for short strings will make sse2 optimised strlen the total winner in its context. String data of long strings on the other hand will not introduce much of alignment dependency as sse2 zero looking core will compensate with its speed and will win anyway.

One more sse2 strlen drawback is its dependency on SSE2 SIMD instructions and for those who wants to have own strlen I would advise combining Optimized strlen function (as it is a winner for universal application) for platforms lacking SSE2 and this sse2 strlen implementations for systems having it.

I've updated code accordingly and added counting for leading zeroes for Big-Endian platforms modified for 16-bit mask thanks to your advise.

Peter Kankowski,
It would be nice if you could repeat test for 16 bit aligned data as I have demonstrated to see your results.
Function Short strings The long string
strlen by Microsoft 1360 1160
strlen by Peter 1400 1430
strlen by Dmitry 1430 550
strlen by Agner Fog 1276 1070

The first test included very short strings (3-5 characters). SSE loop need some time to set up, so it was a little slower than others on short strings, but very fast on the long one. Imho, SSE is a useful technique for many programs. Thank you very much for this article.

Dmitry Kostjuchenko,

Yes, for such short strings (3-5 chars) loading into SSE cache will eliminate all benefits, although results are almost as for normal strlen that is excellent keeping in mind real x2-x3 boost on longer strings. Thank you for updated test results.

Nazo,

How about this? I'm lazy to use VC (yes, i'm linux user) so this is untested but may be faster.

//Public Domain by Nazo
const unsigned char table[0x100] = {
  7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
  4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
};
 
static inline int count_bits_to_0(unsigned int x)
{
//    ASSERT(x < 0x10000);
    if ((unsigned char)x)
      return table[(unsigned char)x];
    return table[x>>8] + 8;
}
Peter Kankowski,

Thank you very much! With this method, Dmitry's function is the fastest one:

Function Short strings The long string
strlen by Microsoft 1360 1160
strlen by Peter 1400 1430
strlen by Dmitry 905 542
strlen by Agner Fog 1276 1070
Dmitry Kostjuchenko,

Thank you Nazo for your kind contribution. Yes!, in my tests it is also faster and I've just updated source of front page to make it a current effective little-endian zero counting method of strlen SSE2 implementation. At last short strings are defeated with your help!!! :)

Dmitry Kostjuchenko,

During a sudden recall by Piotr that 80x86 family has BSF instruction which is just what we are trying to improve I wish to propose the following addition to our strlen method with ASM inclusions:

if ((mask = _mm_movemask_epi8(xmm1)) != 0)
{
    // got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask
    // find index of first set bit
 
    #ifndef _DISABLE_ASM_BSF // define it to disable ASM
    #if (_MSC_VER >= 1300)   // make sure <intrin.h> is included
        unsigned long pos;
        _BitScanForward(&pos, mask);
        len += (size_t)pos;
    #elif defined(_MSC_VER)  // earlier MSVC's do not have _BitScanForward, use inline asm
        __asm bsf edx, mask ; edx = bsf(mask)
        __asm add edx, len  ; edx += len
        __asm mov len, edx  ; len = edx
    #elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz
        len += __builtin_ctz(mask);
    #elif defined(__GNUC__) // older GCC shall use inline asm
        unsigned int pos;
        asm("bsf %1, %0" : "=r" (pos) : "rm" (mask));
        len += (size_t)pos;
    #else                    // none of choices exist, use local BSF implementation
        len += count_bits_to_0(mask);
    #endif
    #else
        len += count_bits_to_0(mask);
    #endif
 
    break;
}

Also, during performance measurements it was revealed that AMD K8 CPUs have a slow BSF implementation, which is 46% slower on my AMD K8 CPU than the latest Nazo's implementation.

Piotr has found out from Agner Fog's publications that it is correct that K8 is slow on BSF, although the K10 family is much improved and Intel CPUs have no problems either, so in most general cases we shall rely on the BSF instruction and use Nazo's custom implementation when BSF is absent or the CPU's BSF implementation is too slow.

From Agner Fog's table:

  AMD K8
BSF r16/32,r      21 Ops    Latency 8     Reciprocal throughput 8

  AMD K10
BSF r16/32,r       7 Ops    Latency 7     Reciprocal throughput 3

  Pentium IV
BSF r,r            2 uops   Latency 4     Reciprocal throughput 2

  Intel Pentium M, Intel Core
BSF r,r            2 Ops    Latency 2     Reciprocal throughput 1
Peter Kankowski,

Results on my PC (Pentium M Dothan 1.5 GHz, using MSVC with the _BitScanForward intrinsic):

Function Short strings The long string
strlen by Microsoft 1360 1160
strlen by Peter 1400 1430
strlen by Dmitry (the latest version) 802 515
strlen by Agner Fog 1276 1070
Nazo,

on x86, replacing

    return table[x>>8] + 8;

with

    return table[(unsigned short)x>>8] + 8;

will be a bit faster. (shr → movzbl) also using +8ed table will be a bit faster on some cases.

Peter Kankowski,

Unfortunately, only with GCC. When using MSVC++ 2005, table[(unsigned short)x»8] is one instruction longer. The compiler does not use MOVZX from high-byte register.

Peter Kankowski,

From /~colmmacc/ blog:

A few years ago, at Joost, we came across exactly this problem. An application that was spending about 10% of its time inside strlen. Within an hour or so of rewriting some primitives, it was down to less than a hundredth of a percent.

So, strlen() optimization can be useful in practice!

ace,
as spending about 10% of its time inside strlen. Within an hour or so of rewriting some primitives, it was down to less than a hundredth of a percent.
So, strlen() optimization can be useful in practice!

Do you really think it's a valid argument? They claim "an app spent 10% CPU time in strlen, then less than 1/100th of 1%." That is 1000 times improvement which is obviously not possible by optimizing the inside of the strlen function. The fastest way to do strlen is simply not using it. :)

Even for constant strings the compiler can do more than most people care to remember:

    static char s[] = "abcde";
    const int L = sizeof( s ) - 1;
Peter Kankowski,
The fastest way to do strlen is simply not using it. :)

Yes, you are right. Most likely, they stored the length with each string instead of using strlen.

PRR,

MSVC and most likely some other compilers optimize the code const char *pszStr="SomeString"; SomeFunction(pszStr, strlen(pszStr)); by replacing strlen call with size_t constant (10 in our case). MSVC 2008 uses the same optimization also for unicode strings, which was not there in VC2005.

So don't be afraid of using strlen, there is no need to optimize it by hand using sizeof.

Peter Kankowski,

Thanks! I've checked MSVC++ 2005. const char *pszStr is not optimized, but it you add static, strlen will be replaced with a constant:

static const char *s = "SomeString";
char buff[200];
wsprintf(buff, "%d", (int)strlen(s));
push   10
lea    eax, DWORD PTR _buff$[esp+204]
push   OFFSET $SG-6
push   eax
call   DWORD PTR __imp__wsprintfA

So you should remember to always add static keyword. Thank you very much; I did not know about this optimization.

ace,

I don't have these new MSVC compilers here, but I guess that they really don't use size_t at all, instead they recognize strlen as an intrinsic so the code optimization phase reduces the constant expression over the constant sequence of characters. I don't expect that it's so common among compilers. Anybody tried anything else? Anyway, am I right that if the string is "ab\0cd" the result will be 2 in these new MSVC compilers, which proves that they don't use size_t?

Peter Kankowski,

We probably misunderstood each other: size_t is not some kind of optimization, but an unsigned integer type to store the size of an object.

For "ab\0cd", push 2 is generated by MSVC. GCC doesn't support this optimization (as of GCC 4.4).

ace,
size_t is not some kind of optimization

Lapsus calami. I've meant "sizeof." sizeof( "ab\0cd" ) - 1 is of course 5. Applying strlen results in 2. Newer MSVC compilers obviously really use strlen to calculate the result. I'd also like to know if the optimization is wired to specific function or if they really can recognize some function without side effects? What happens if you make your own "int mystrlen( char* s )" and you call it the same way as before?

PRR,

Below is the code generated with XCode 3.2 (Mac OS X 10.6 Snow Leopard, GCC).

int main (int argc, const char * argv[]) {
    return strlen("SomeString");
}
0x00001f8a  <+0000>  push   %ebp
0x00001f8b  <+0001>  mov    %esp,%ebp
0x00001f8d  <+0003>  mov    $0xb,%eax
0x00001f92  <+0008>  leave  
0x00001f93  <+0009>  ret

And it works correctly in case of return strlen("Some\0String"), moving 4 to eax.

PRR,

Sorry for badly formatted post. Please feel free to adjust it.

Peter Kankowski,

Thanks, that's interesting. It appears that for GCC:

strlen("SomeString") // is optimized
 
char * s = "SomeString"; // is optimized
return strlen(s);
 
const char * s = "SomeString"; // is optimized
return strlen(s);
 
static const char * s = "SomeString"; // is not optimized
return strlen(s);

You can register if you want to edit your comments and post own articles.

Ace, it's wired to strlen. MSVC++ and GCC cannot recognize functions without side effects. BTW, I'm writing an article on a related topic now :)

ace,
don't be afraid of using strlen, there is no need to optimize it by hand using sizeof.

One more question from somebody who doesn't have latest MSVC at hand, can strlen be a part of the constant expressions which are then used for the things that have to be known at the compile time, like the size of the array on the stack? On my MSVC 6, I can do

char s[] = "something";
char a[ 2 * sizeof( s ) ];
Peter Kankowski,

No, using functions in constant expressions would violate C/C++ standard, but it would be a very useful feature.

ninjalj,
static const char * s = "SomeString"; // is not optimized

That's because the s pointer can be modified by other code. Just add a const qualifier to the pointer, and GCC should optimize it:

static const char *const s = "SomeString"; // is optimized

And about GCC not recognizing functions without side effects, you can use GCC's function attributes to explicitly mark them __attribute__ ((pure)) for functions without side effects, and __attribute__ ((const)) for functions without side effects that don't depend on global memory (strlen() is pure, but not const, since it has no side effects but has to access the string pointed to by its argument).

Peter Kankowski,
Ninjalj, thank you very much for the information.

Your name:


Comment: