### Fast cube root, square root, and reciprocal for x86/SSE CPUs.

References : Posted by kaleja[AT]estarcion[DOT]com
Notes :
All of these methods use SSE instructions or bit twiddling tricks to get a rough approximation to cube root, square root, or reciprocal, which is then refined with one or more Newton-Raphson approximation steps.

Each is named to indicate its approximate level of accuracy and a comment describes its performance relative to the conventional versions.

Code :
// These functions approximate reciprocal, square root, and
// cube root to varying degrees of precision substantially
// faster than the straightforward methods 1/x, sqrtf(x),
// and powf( x, 1.0f/3.0f ). All require SSE-enabled CPU & OS.
// Unlike the powf() solution, the cube roots also correctly
// handle negative inputs.

#define REALLY_INLINE __forceinline

// ~34 clocks on Pentium M vs. ~360 for powf
REALLY_INLINE float cuberoot_sse_8bits( float x )
{
float z;
static const float three = 3.0f;
_asm
{
mov        eax, x                // x as bits
movss    xmm2, x                // x2: x
movss    xmm1, three            // x1: 3
// Magic floating point cube root done with integer math.
// The exponent is divided by three in such a way that
// remainder bits get shoved into the top of the normalized
// mantissa.
mov        ecx, eax            // copy of x
and        eax, 0x7FFFFFFF        // exponent & mantissa of x in biased-127
sub     eax, 0x3F800000        // exponent & mantissa of x in 2's comp
sar     eax, 10                //
imul    eax, 341            // 341/1024 ~= .333
add        eax, 0x3F800000        // back to biased-127
and        ecx, 0x80000000        // original sign and mantissa
or      eax, ecx            // masked new exponent, old sign and mantissa
mov        z, eax                //

// use SSE to refine the first approximation
movss    xmm0, z                ;// x0: z
movss    xmm3, xmm0            ;// x3: z
mulss    xmm3, xmm0            ;// x3: z*z
movss    xmm4, xmm3            ;// x4: z*z
mulss   xmm3, xmm1            ;// x3: 3*z*z
rcpss    xmm3, xmm3            ;// x3: ~ 1/(3*z*z)
mulss   xmm4, xmm0            ;// x4: z*z*z
subss    xmm4, xmm2            ;// x4: z*z*z-x
mulss    xmm4, xmm3            ;// x4: (z*z*z-x) / (3*z*z)
subss    xmm0, xmm4            ;// x0: z' accurate to within about 0.3%
movss    z, xmm0
}

return z;
}

// ~60 clocks on Pentium M vs. ~360 for powf
REALLY_INLINE float cuberoot_sse_16bits( float x )
{
float z;
static const float three = 3.0f;
_asm
{
mov        eax, x                // x as bits
movss    xmm2, x                // x2: x
movss    xmm1, three            // x1: 3
// Magic floating point cube root done with integer math.
// The exponent is divided by three in such a way that
// remainder bits get shoved into the top of the normalized
// mantissa.
mov        ecx, eax            // copy of x
and        eax, 0x7FFFFFFF        // exponent & mantissa of x in biased-127
sub     eax, 0x3F800000        // exponent & mantissa of x in 2's comp
sar     eax, 10                //
imul    eax, 341            // 341/1024 ~= .333
add        eax, 0x3F800000        // back to biased-127
and        ecx, 0x80000000        // original sign and mantissa
or      eax, ecx            // masked new exponent, old sign and mantissa
mov        z, eax                //

// use SSE to refine the first approximation
movss    xmm0, z                ;// x0: z
movss    xmm3, xmm0            ;// x3: z
mulss    xmm3, xmm0            ;// x3: z*z
movss    xmm4, xmm3            ;// x4: z*z
mulss   xmm3, xmm1            ;// x3: 3*z*z
rcpss    xmm3, xmm3            ;// x3: ~ 1/(3*z*z)
mulss   xmm4, xmm0            ;// x4: z*z*z
subss    xmm4, xmm2            ;// x4: z*z*z-x
mulss    xmm4, xmm3            ;// x4: (z*z*z-x) / (3*z*z)
subss    xmm0, xmm4            ;// x0: z' accurate to within about 0.3%

movss    xmm3, xmm0            ;// x3: z
mulss    xmm3, xmm0            ;// x3: z*z
movss    xmm4, xmm3            ;// x4: z*z
mulss   xmm3, xmm1            ;// x3: 3*z*z
rcpss    xmm3, xmm3            ;// x3: ~ 1/(3*z*z)
mulss   xmm4, xmm0            ;// x4: z*z*z
subss    xmm4, xmm2            ;// x4: z*z*z-x
mulss    xmm4, xmm3            ;// x4: (z*z*z-x) / (3*z*z)
subss    xmm0, xmm4            ;// x0: z'' accurate to within about 0.001%

movss    z, xmm0
}

return z;
}

// ~77 clocks on Pentium M vs. ~360 for powf
REALLY_INLINE float cuberoot_sse_22bits( float x )
{
float z;
static const float three = 3.0f;
_asm
{
mov        eax, x                // x as bits
movss    xmm2, x                // x2: x
movss    xmm1, three            // x1: 3
// Magic floating point cube root done with integer math.
// The exponent is divided by three in such a way that
// remainder bits get shoved into the top of the normalized
// mantissa.
mov        ecx, eax            // copy of x
and        eax, 0x7FFFFFFF        // exponent & mantissa of x in biased-127
sub     eax, 0x3F800000        // exponent & mantissa of x in 2's comp
sar     eax, 10                //
imul    eax, 341            // 341/1024 ~= .333
add        eax, 0x3F800000        // back to biased-127
and        ecx, 0x80000000        // original sign and mantissa
or      eax, ecx            // masked new exponent, old sign and mantissa
mov        z, eax                //

// use SSE to refine the first approximation
movss    xmm0, z                // x0: z
movss    xmm3, xmm0            // x3: z
mulss    xmm3, xmm0            // x3: z*z
movss    xmm4, xmm3            // x4: z*z
mulss   xmm3, xmm1            // x3: 3*z*z
rcpss    xmm3, xmm3            // x3: ~ 1/(3*z*z)
mulss   xmm4, xmm0            // x4: z*z*z
subss    xmm4, xmm2            // x4: z*z*z-x
mulss    xmm4, xmm3            // x4: (z*z*z-x) / (3*z*z)
subss    xmm0, xmm4            // x0: z' accurate to within about 0.3%

movss    xmm3, xmm0            // x3: z
mulss    xmm3, xmm0            // x3: z*z
movss    xmm4, xmm3            // x4: z*z
mulss   xmm3, xmm1            // x3: 3*z*z
rcpss    xmm3, xmm3            // x3: ~ 1/(3*z*z)
mulss   xmm4, xmm0            // x4: z*z*z
subss    xmm4, xmm2            // x4: z*z*z-x
mulss    xmm4, xmm3            // x4: (z*z*z-x) / (3*z*z)
subss    xmm0, xmm4            // x0: z'' accurate to within about 0.001%

movss    xmm3, xmm0            // x3: z
mulss    xmm3, xmm0            // x3: z*z
movss    xmm4, xmm3            // x4: z*z
mulss   xmm3, xmm1            // x3: 3*z*z
rcpss    xmm3, xmm3            // x3: ~ 1/(3*z*z)
mulss   xmm4, xmm0            // x4: z*z*z
subss    xmm4, xmm2            // x4: z*z*z-x
mulss    xmm4, xmm3            // x4: (z*z*z-x) / (3*z*z)
subss    xmm0, xmm4            // x0: z''' accurate to within about 0.000012%

movss    z, xmm0
}

return z;
}

// ~6 clocks on Pentium M vs. ~24 for single precision sqrtf
REALLY_INLINE float squareroot_sse_11bits( float x )
{
float z;
_asm
{
rsqrtss xmm0, x
rcpss    xmm0, xmm0
movss    z, xmm0            // z ~= sqrt(x) to 0.038%
}
return z;
}

// ~19 clocks on Pentium M vs. ~24 for single precision sqrtf
REALLY_INLINE float squareroot_sse_22bits( float x )
{
static float half = 0.5f;
float z;
_asm
{
movss    xmm1, x            // x1: x
rsqrtss xmm2, xmm1        // x2: ~1/sqrt(x) = 1/z
rcpss    xmm0, xmm2        // x0: z == ~sqrt(x) to 0.05%

movss    xmm4, xmm0        // x4: z
movss    xmm3, half
mulss    xmm4, xmm4        // x4: z*z
mulss    xmm2, xmm3        // x2: 1 / 2z
subss    xmm4, xmm1        // x4: z*z-x
mulss    xmm4, xmm2        // x4: (z*z-x)/2z
subss    xmm0, xmm4        // x0: z' to 0.000015%

movss    z, xmm0
}
return z;
}

// ~12 clocks on Pentium M vs. ~16 for single precision divide
REALLY_INLINE float reciprocal_sse_22bits( float x )
{
float z;
_asm
{
rcpss    xmm0, x            // x0: z ~= 1/x
movss    xmm2, x            // x2: x
movss    xmm1, xmm0        // x1: z ~= 1/x
addss    xmm0, xmm0        // x0: 2z
mulss    xmm1, xmm1        // x1: z^2
mulss    xmm1, xmm2        // x1: xz^2
subss    xmm0, xmm1        // x0: z' ~= 1/x to 0.000012%

movss    z, xmm0
}
return z;
}

from : Christian[AT]savioursofsoul[DOT]de
comment : Good job!

from : martini[AT]slightlyintoxicated[DOT]com
comment : Dude, what's the freakin' point of using SSE for scalar functions?

from : kaleja[AT]estarcion[DOT]com
comment : The freakin' point is that it's faster than x87, and sometimes you can't do 4 at once. Also, the function signatures (float func(float)) are well established for the scalar versions, but different people have different standards for the vector versions (float* vs __m128& or whatever). Converting from scalar to vector is trivial for the square root and reciprocal examples, and I'm not going to take the time to investigate a 2-way MMX solution to the first approximation of cube root, but you can feel free.

from : nobody[AT]nowhere[DOT]com
comment : These are terrific. Wish I had seen them sooner.

from :
comment : order cheap soma ,cheap soma cheap soma