# Fast cube root, square root, and reciprocal for x86/SSE CPUs.¶

notes
```All of these methods use SSE instructions or bit twiddling tricks to get a rough
approximation to cube root, square root, or reciprocal, which is then refined with one or
more Newton-Raphson approximation steps.

Each is named to indicate its approximate level of accuracy and a comment describes its
performance relative to the conventional versions.
```
code
 ``` 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222``` ```// These functions approximate reciprocal, square root, and // cube root to varying degrees of precision substantially // faster than the straightforward methods 1/x, sqrtf(x), // and powf( x, 1.0f/3.0f ). All require SSE-enabled CPU & OS. // Unlike the powf() solution, the cube roots also correctly // handle negative inputs. #define REALLY_INLINE __forceinline // ~34 clocks on Pentium M vs. ~360 for powf REALLY_INLINE float cuberoot_sse_8bits( float x ) { float z; static const float three = 3.0f; _asm { mov eax, x // x as bits movss xmm2, x // x2: x movss xmm1, three // x1: 3 // Magic floating point cube root done with integer math. // The exponent is divided by three in such a way that // remainder bits get shoved into the top of the normalized // mantissa. mov ecx, eax // copy of x and eax, 0x7FFFFFFF // exponent & mantissa of x in biased-127 sub eax, 0x3F800000 // exponent & mantissa of x in 2's comp sar eax, 10 // imul eax, 341 // 341/1024 ~= .333 add eax, 0x3F800000 // back to biased-127 and eax, 0x7FFFFFFF // remask and ecx, 0x80000000 // original sign and mantissa or eax, ecx // masked new exponent, old sign and mantissa mov z, eax // // use SSE to refine the first approximation movss xmm0, z ;// x0: z movss xmm3, xmm0 ;// x3: z mulss xmm3, xmm0 ;// x3: z*z movss xmm4, xmm3 ;// x4: z*z mulss xmm3, xmm1 ;// x3: 3*z*z rcpss xmm3, xmm3 ;// x3: ~ 1/(3*z*z) mulss xmm4, xmm0 ;// x4: z*z*z subss xmm4, xmm2 ;// x4: z*z*z-x mulss xmm4, xmm3 ;// x4: (z*z*z-x) / (3*z*z) subss xmm0, xmm4 ;// x0: z' accurate to within about 0.3% movss z, xmm0 } return z; } // ~60 clocks on Pentium M vs. ~360 for powf REALLY_INLINE float cuberoot_sse_16bits( float x ) { float z; static const float three = 3.0f; _asm { mov eax, x // x as bits movss xmm2, x // x2: x movss xmm1, three // x1: 3 // Magic floating point cube root done with integer math. // The exponent is divided by three in such a way that // remainder bits get shoved into the top of the normalized // mantissa. mov ecx, eax // copy of x and eax, 0x7FFFFFFF // exponent & mantissa of x in biased-127 sub eax, 0x3F800000 // exponent & mantissa of x in 2's comp sar eax, 10 // imul eax, 341 // 341/1024 ~= .333 add eax, 0x3F800000 // back to biased-127 and eax, 0x7FFFFFFF // remask and ecx, 0x80000000 // original sign and mantissa or eax, ecx // masked new exponent, old sign and mantissa mov z, eax // // use SSE to refine the first approximation movss xmm0, z ;// x0: z movss xmm3, xmm0 ;// x3: z mulss xmm3, xmm0 ;// x3: z*z movss xmm4, xmm3 ;// x4: z*z mulss xmm3, xmm1 ;// x3: 3*z*z rcpss xmm3, xmm3 ;// x3: ~ 1/(3*z*z) mulss xmm4, xmm0 ;// x4: z*z*z subss xmm4, xmm2 ;// x4: z*z*z-x mulss xmm4, xmm3 ;// x4: (z*z*z-x) / (3*z*z) subss xmm0, xmm4 ;// x0: z' accurate to within about 0.3% movss xmm3, xmm0 ;// x3: z mulss xmm3, xmm0 ;// x3: z*z movss xmm4, xmm3 ;// x4: z*z mulss xmm3, xmm1 ;// x3: 3*z*z rcpss xmm3, xmm3 ;// x3: ~ 1/(3*z*z) mulss xmm4, xmm0 ;// x4: z*z*z subss xmm4, xmm2 ;// x4: z*z*z-x mulss xmm4, xmm3 ;// x4: (z*z*z-x) / (3*z*z) subss xmm0, xmm4 ;// x0: z'' accurate to within about 0.001% movss z, xmm0 } return z; } // ~77 clocks on Pentium M vs. ~360 for powf REALLY_INLINE float cuberoot_sse_22bits( float x ) { float z; static const float three = 3.0f; _asm { mov eax, x // x as bits movss xmm2, x // x2: x movss xmm1, three // x1: 3 // Magic floating point cube root done with integer math. // The exponent is divided by three in such a way that // remainder bits get shoved into the top of the normalized // mantissa. mov ecx, eax // copy of x and eax, 0x7FFFFFFF // exponent & mantissa of x in biased-127 sub eax, 0x3F800000 // exponent & mantissa of x in 2's comp sar eax, 10 // imul eax, 341 // 341/1024 ~= .333 add eax, 0x3F800000 // back to biased-127 and eax, 0x7FFFFFFF // remask and ecx, 0x80000000 // original sign and mantissa or eax, ecx // masked new exponent, old sign and mantissa mov z, eax // // use SSE to refine the first approximation movss xmm0, z // x0: z movss xmm3, xmm0 // x3: z mulss xmm3, xmm0 // x3: z*z movss xmm4, xmm3 // x4: z*z mulss xmm3, xmm1 // x3: 3*z*z rcpss xmm3, xmm3 // x3: ~ 1/(3*z*z) mulss xmm4, xmm0 // x4: z*z*z subss xmm4, xmm2 // x4: z*z*z-x mulss xmm4, xmm3 // x4: (z*z*z-x) / (3*z*z) subss xmm0, xmm4 // x0: z' accurate to within about 0.3% movss xmm3, xmm0 // x3: z mulss xmm3, xmm0 // x3: z*z movss xmm4, xmm3 // x4: z*z mulss xmm3, xmm1 // x3: 3*z*z rcpss xmm3, xmm3 // x3: ~ 1/(3*z*z) mulss xmm4, xmm0 // x4: z*z*z subss xmm4, xmm2 // x4: z*z*z-x mulss xmm4, xmm3 // x4: (z*z*z-x) / (3*z*z) subss xmm0, xmm4 // x0: z'' accurate to within about 0.001% movss xmm3, xmm0 // x3: z mulss xmm3, xmm0 // x3: z*z movss xmm4, xmm3 // x4: z*z mulss xmm3, xmm1 // x3: 3*z*z rcpss xmm3, xmm3 // x3: ~ 1/(3*z*z) mulss xmm4, xmm0 // x4: z*z*z subss xmm4, xmm2 // x4: z*z*z-x mulss xmm4, xmm3 // x4: (z*z*z-x) / (3*z*z) subss xmm0, xmm4 // x0: z''' accurate to within about 0.000012% movss z, xmm0 } return z; } // ~6 clocks on Pentium M vs. ~24 for single precision sqrtf REALLY_INLINE float squareroot_sse_11bits( float x ) { float z; _asm { rsqrtss xmm0, x rcpss xmm0, xmm0 movss z, xmm0 // z ~= sqrt(x) to 0.038% } return z; } // ~19 clocks on Pentium M vs. ~24 for single precision sqrtf REALLY_INLINE float squareroot_sse_22bits( float x ) { static float half = 0.5f; float z; _asm { movss xmm1, x // x1: x rsqrtss xmm2, xmm1 // x2: ~1/sqrt(x) = 1/z rcpss xmm0, xmm2 // x0: z == ~sqrt(x) to 0.05% movss xmm4, xmm0 // x4: z movss xmm3, half mulss xmm4, xmm4 // x4: z*z mulss xmm2, xmm3 // x2: 1 / 2z subss xmm4, xmm1 // x4: z*z-x mulss xmm4, xmm2 // x4: (z*z-x)/2z subss xmm0, xmm4 // x0: z' to 0.000015% movss z, xmm0 } return z; } // ~12 clocks on Pentium M vs. ~16 for single precision divide REALLY_INLINE float reciprocal_sse_22bits( float x ) { float z; _asm { rcpss xmm0, x // x0: z ~= 1/x movss xmm2, x // x2: x movss xmm1, xmm0 // x1: z ~= 1/x addss xmm0, xmm0 // x0: 2z mulss xmm1, xmm1 // x1: z^2 mulss xmm1, xmm2 // x1: xz^2 subss xmm0, xmm1 // x0: z' ~= 1/x to 0.000012% movss z, xmm0 } return z; } ```