atan2 on SPU - CellPerformance

n 2006 March 03 on the IBM developerWorks Cell Broadband Engine Architecture forum [ibm.com] an interesting question was asked:

"I am trying to port an application from an older version of SDK to SDK 1.0. It uses atan2(.....) function, which is causing trouble... This code worked fine on SDK28, but now it looks like the new functions dont have this particular function defined..
I did change the makefile to include $(SDKLIB)/libmath.a

I searched in ./sysroot/usr/spu/include/* and src/include/spu/* but couldn't find a headerfile that has it defined.

Can anyone please suggest if I should just change the code to not use that function or is there a way to invoke it still?

Thanks!"

It turned out this function was not available in the SDK.

The following is a branch-free implementation of atan2 vector floats for the SPU. A scalar version which simply casts to vector and back is also provided. This implementation is fairly quick-and-dirty and no particular level of accuracy is gauranteed, but it should be usable for many purposes.

Or download the source files:
cp_fatan-cbe-spu.h
cp_fatan-cbe-spu.c

This code is C99 source. For gcc, use the following flags: -std=c99 -pedantic

0// ## cp_fatan-cbe-spu.h (C99) 1// ## Version 1.0 2// ## 3// ## Copyright (c) 2006 Mike Acton 4// ## 5// ## SIGNIFICANT REFERENCES: 6// ## 7// ## [1] Cephes Math Library Release 2.8: June, 2000 8// ## Copyright 1984, 1995, 2000, Stephen L. Moshier 9// ## [2] Numerical Computation Guide (PDF) 10// ## Copyright 2000, Sun Microsystems, Inc. 11// ## [3] IEEE 754 Support in C99 (PDF) 12// ## Copyright 2001, Jim Thomas 13// ## [4] Solaris 10 Reference Manual : atan2(3M) 14// ## Copyright 1994-2005, Sun Microsystems, Inc. 15// ## 16// ## Permission is hereby granted, free of charge, to any person obtaining 17// ## a copy of this software and associated documentation files 18// ## (the "Software"), to deal in the Software without restriction, including 19// ## without limitation the rights to use, copy, modify, merge, publish, 20// ## distribute, sublicense, and/or sell copies of the Software, and to permit 21// ## persons to whom the Software is furnished to do so, subject to the 22// ## following conditions: 23// ## 24// ## The above copyright notice and this permission notice shall be included 25// ## in all copies or substantial portions of the Software. 26// ## 27// ## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 28// ## OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 29// ## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 30// ## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 31// ## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 32// ## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 33// ## THE SOFTWARE. 34// ## 35 36#ifndef CP_FATAN_CBE_SPU_H 37#define CP_FATAN_CBE_SPU_H 38 39#include <stdint.h> 40#include <spu_intrinsics.h> 41 42// ## 43// ## Global Floating-point constants (32 bit) 44// ## 45// ## Constant is loaded in each element of 32 bit floating-point vector 46// ## from local store. 47// ## 48// ## cp_flpio4() +PI/+4 49// ## cp_flt3p8() tan( +3.0 * PI / +8.0 ) 50// ## cp_flnpio2() -PI/+2 51// ## cp_flpio2() +PI/+2 52// ## cp_flpt66() +0.66 53// ## cp_flpi() +PI 54// ## cp_flnpi() -PI 55 56extern const vector unsigned int _cp_f_pio4; 57extern const vector unsigned int _cp_f_t3p8; 58extern const vector unsigned int _cp_f_npio2; 59extern const vector unsigned int _cp_f_pio2; 60extern const vector unsigned int _cp_f_pt66; 61extern const vector unsigned int _cp_f_pi; 62extern const vector unsigned int _cp_f_npi; 63 64static inline qword 65cp_flpio4( void ) 66{ 67 return si_lqa( (intptr_t)&_cp_f_pio4 ); 68} 69 70static inline qword 71cp_flt3p8( void ) 72{ 73 return si_lqa( (intptr_t)&_cp_f_t3p8 ); 74} 75 76static inline qword 77cp_flnpio2( void ) 78{ 79 return si_lqa( (intptr_t)&_cp_f_npio2 ); 80} 81 82static inline qword 83cp_flpio2( void ) 84{ 85 return si_lqa( (intptr_t)&_cp_f_pio2 ); 86} 87 88static inline qword 89cp_flpt66( void ) 90{ 91 return si_lqa( (intptr_t)&_cp_f_pt66 ); 92} 93 94static inline qword 95cp_flpi( void ) 96{ 97 return si_lqa( (intptr_t)&_cp_f_pi ); 98} 99 100static inline qword 101cp_flnpi( void ) 102{ 103 return si_lqa( (intptr_t)&_cp_f_npi ); 104} 105 106// ## 107// ## Load-Immediate Floating-point constants (32 bit) 108// ## 109// ## Constant is loaded in each element of 32 bit floating-point vector 110// ## using immediate values. i.e. No loads 111// ## 112// ## cp_filzero() +0.0 +0x00000000 113// ## cp_filnzero() -0.0 +0x80000000 114// ## cp_filone() +1.0 +0x3f800000 115// ## cp_filtwo() +2.0 +0x40000000 116// ## cp_filinf() +INF +0x7f800000 117// ## cp_filninf() -INF +0xff800000 118// ## cp_filnan() NaN +0x7fc00000 119// ## 120 121static inline qword 122cp_filzero( void ) 123{ 124 return si_ilhu( (int16_t)0x0000 ); 125} 126 127static inline qword 128cp_filnzero( void ) 129{ 130 return si_ilhu( (int16_t)0x8000 ); 131} 132 133static inline qword 134cp_filone( void ) 135{ 136 return si_ilhu( (int16_t)0x3f80 ); 137} 138 139static inline qword 140cp_filtwo( void ) 141{ 142 return si_ilhu( (int16_t)0x4000 ); 143} 144 145static inline qword 146cp_filinf( void ) 147{ 148 return si_ilhu( (int16_t)0x7f80 ); 149} 150 151static inline qword 152cp_filninf( void ) 153{ 154 return si_ilhu( (int16_t)0xff80 ); 155} 156 157static inline qword 158cp_filnan( void ) 159{ 160 return si_ilhu( (int16_t)0x7fc0 ); 161} 162 163// ## 164// ## cp_fatan() Coefficients and other constants 165// ## 166 167extern const vector unsigned int _cp_f_atan_q4; 168extern const vector unsigned int _cp_f_atan_q3; 169extern const vector unsigned int _cp_f_atan_q2; 170extern const vector unsigned int _cp_f_atan_q1; 171extern const vector unsigned int _cp_f_atan_q0; 172extern const vector unsigned int _cp_f_atan_p4; 173extern const vector unsigned int _cp_f_atan_p3; 174extern const vector unsigned int _cp_f_atan_p2; 175extern const vector unsigned int _cp_f_atan_p1; 176extern const vector unsigned int _cp_f_atan_p0; 177extern const vector unsigned int _cp_f_hmorebits; 178extern const vector unsigned int _cp_f_morebits; 179 180// ## cp_fatan(x) 181// ## 182// ## 0 <= x <= 0.66 183// ## -PI/2 <= cp_fatan(x) <= +PI/2 184// ## 185// ## Each floating-point component of the result is a function of 186// ## the corresponding components of x: 187// ## 188// ## 0.0 { x == 0.0 189// ## 190// ## +PI { 191// ## --- { x == INF 192// ## 2.0 { 193// ## 194// ## -PI { 195// ## --- { x == -INF 196// ## 2.0 { 197// ## 198// ## 199// ## 2 4 6 8 { 200// ## P + P x + P x + P x + P x { 201// ## 2 0 1 2 3 4 { 202// ## x x ----------------------------------- + x { otherwise 203// ## 2 4 6 8 10 { 204// ## Q + Q x + Q x + Q x + Q x + x { 205// ## 0 1 2 3 4 { 206 207static inline qword 208_cp_fatan( const qword x ) 209{ 210 // ## 211 // ## Load constants 212 // ## 213 214 const qword f_one = cp_filone(); 215 const qword f_inf = cp_filinf(); 216 const qword f_ninf = cp_filninf(); 217 const qword f_msb = cp_filnzero(); 218 const qword f_zero = cp_filzero(); 219 220 const qword f_pt66 = si_lqa( (intptr_t)&_cp_f_pt66 ); 221 const qword f_pio2 = si_lqa( (intptr_t)&_cp_f_pio2 ); 222 const qword f_npio2 = si_lqa( (intptr_t)&_cp_f_npio2 ); 223 const qword f_pio4 = si_lqa( (intptr_t)&_cp_f_pio4 ); 224 const qword f_t3p8 = si_lqa( (intptr_t)&_cp_f_t3p8 ); 225 226 const qword f_atan_p0 = si_lqa( (intptr_t)&_cp_f_atan_p0 ); 227 const qword f_atan_p1 = si_lqa( (intptr_t)&_cp_f_atan_p1 ); 228 const qword f_atan_p2 = si_lqa( (intptr_t)&_cp_f_atan_p2 ); 229 const qword f_atan_p3 = si_lqa( (intptr_t)&_cp_f_atan_p3 ); 230 const qword f_atan_p4 = si_lqa( (intptr_t)&_cp_f_atan_p4 ); 231 const qword f_atan_q0 = si_lqa( (intptr_t)&_cp_f_atan_q0 ); 232 const qword f_atan_q1 = si_lqa( (intptr_t)&_cp_f_atan_q1 ); 233 const qword f_atan_q2 = si_lqa( (intptr_t)&_cp_f_atan_q2 ); 234 const qword f_atan_q3 = si_lqa( (intptr_t)&_cp_f_atan_q3 ); 235 const qword f_atan_q4 = si_lqa( (intptr_t)&_cp_f_atan_q4 ); 236 const qword f_morebits = si_lqa( (intptr_t)&_cp_f_morebits ); 237 const qword f_hmorebits = si_lqa( (intptr_t)&_cp_f_hmorebits ); 238 239 // ## 240 // ## pos_x = -x { x < 0 241 // ## x { otherwise 242 // ## 243 244 const qword neg_x = si_xor( x, f_msb ); 245 const qword sign_mask = si_fcgt( f_zero, x ); 246 const qword pos_x = si_selb( x, neg_x, sign_mask ); 247 248 // ## 249 // ## Range reduction 250 // ## 251 252 // ## 253 // ## range0_mask = ( pos_x > tan( 3.0 * PI / 8.0 ) ) 254 // ## range1_mask = ( pos_x <= 0.66 ) 255 // ## range2_mask = !( range0_mask || range1_mask ) 256 // ## 257 258 const qword range0_mask = si_fcgt( pos_x, f_t3p8 ); 259 const qword range1_gt_mask = si_fcgt( f_pt66, pos_x ); 260 const qword range1_eq_mask = si_fceq( f_pt66, pos_x ); 261 const qword range1_mask = si_or( range1_gt_mask, range1_eq_mask ); 262 const qword range2_mask = si_nor( range0_mask, range1_mask ); 263 264 // ## 265 // ## range0_x = -1.0 266 // ## ----- 267 // ## pos_x 268 // ## 269 // ## range0_y = PI 270 // ## --- 271 // ## 2.0 272 // ## 273 274 const qword range0_x0 = si_frest( pos_x ); 275 const qword range0_x1 = si_fi( pos_x, range0_x0 ); 276 const qword range0_x2 = si_fnms( range0_x1, pos_x, f_one ); 277 const qword range0_x3 = si_fma( range0_x2, range0_x1, range0_x1 ); 278 const qword range0_x = si_xor( range0_x3, f_msb ); 279 const qword range0_y = f_pio2; 280 281 // ## 282 // ## range1_x = pos_x 283 // ## range1_y = 0.0 284 // ## 285 286 const qword range1_x = pos_x; 287 const qword range1_y = f_zero; 288 289 290 // ## 291 // ## range2_x = (pos_x-1.0) 292 // ## ----------- 293 // ## (pos_x+1.0) 294 // ## 295 // ## range2_y = PI 296 // ## --- 297 // ## 4.0 298 // ## 299 300 const qword range2_y = f_pio4; 301 const qword range2_x0num = si_fs( pos_x, f_one ); 302 const qword range2_x0den = si_fa( pos_x, f_one ); 303 const qword range2_x0 = si_frest( range2_x0den ); 304 const qword range2_x1 = si_fnms( range2_x0, range2_x0den, f_one ); 305 const qword range2_x2 = si_fma( range2_x1, range2_x0, range2_x0 ); 306 const qword range2_x = si_fm( range2_x0num, range2_x2 ); 307 308 // ## 309 // ## range_x = range0_x { range0_mask 310 // ## range1_x { range1_mask 311 // ## range2_x { range2_mask 312 // ## 313 // ## range_y = range0_y { range0_mask 314 // ## range1_y { range1_mask 315 // ## range2_y { range2_mask 316 // ## 317 318 const qword range_x0 = si_selb( range2_x, range0_x, range0_mask ); 319 const qword range_x = si_selb( range_x0, range1_x, range1_mask ); 320 const qword range_y0 = si_selb( range2_y, range0_y, range0_mask ); 321 const qword range_y = si_selb( range_y0, range1_y, range1_mask ); 322 323 // ## 324 // ## 2 325 // ## xp2 = range_x 326 // ## 2 3 4 327 // ## P + P xp2 + P xp2 + P xp2 + P xp2 328 // ## 0 1 2 3 4 329 // ## zdiv = ------------------------------------------ 330 // ## 2 3 4 5 331 // ## Q + Q xp2 + Q xp2 + Q xp2 + Q xp2 + xp2 332 // ## 0 1 2 3 4 333 // ## 334 // ## z1 = range_x * ( xp2 * zdiv ) + range_x 335 // ## 336 337 const qword xp2 = si_fm( range_x, range_x ); 338 const qword znum0 = f_atan_p0; 339 const qword znum1 = si_fma( znum0, xp2, f_atan_p1 ); 340 const qword znum2 = si_fma( znum1, xp2, f_atan_p2 ); 341 const qword znum3 = si_fma( znum2, xp2, f_atan_p3 ); 342 const qword znum = si_fma( znum3, xp2, f_atan_p4 ); 343 const qword zden0 = si_fa( xp2, f_atan_q0 ); 344 const qword zden1 = si_fma( zden0, xp2, f_atan_q1 ); 345 const qword zden2 = si_fma( zden1, xp2, f_atan_q2 ); 346 const qword zden3 = si_fma( zden2, xp2, f_atan_q3 ); 347 const qword zden = si_fma( zden3, xp2, f_atan_q4 ); 348 const qword zden_r0 = si_frest( zden ); 349 const qword zden_r1 = si_fnms( zden_r0, zden, f_one ); 350 const qword zden_r = si_fma( zden_r1, zden_r0, zden_r0 ); 351 const qword zdiv = si_fm( znum, zden_r ); 352 const qword z0 = si_fm( xp2, zdiv ); 353 const qword z1 = si_fma( range_x, z0, range_x ); 354 355 // ## 356 // ## zadd = z1 + 0.5 * MOREBITS { range2_mask 357 // ## z1 + MOREBITS { range1_mask 358 // ## z1 { otherwise 359 // ## 360 // ## yaddz = range_y + zadd 361 // ## 362 // ## pos_yaddz = yaddz { yaddz >= 0 363 // ## -yaddz { yaddz < 0 364 // ## 365 366 const qword zadd0 = si_selb( f_zero, f_hmorebits, range2_mask ); 367 const qword zadd1 = si_selb( zadd0, f_morebits, range1_mask ); 368 const qword zadd = si_fa( z1, zadd1 ); 369 const qword yaddz = si_fa( range_y, zadd ); 370 const qword neg_yaddz = si_xor( yaddz, f_msb ); 371 const qword pos_yaddz = si_selb( yaddz, neg_yaddz, sign_mask ); 372 373 // ## 374 // ## result_y0 = 0.0 { x == 0.0 375 // ## pos_yaddz { otherwise 376 // ## 377 378 const qword x_eqz_mask = si_fceq( f_zero, x ); 379 const qword result_y0 = si_selb( pos_yaddz, x, x_eqz_mask ); 380 381 // ## 382 // ## result_y2 = +PI { 383 // ## --- { x == INF 384 // ## 2.0 { 385 // ## 386 // ## -PI { 387 // ## --- { x == -INF 388 // ## 2.0 { 389 // ## 390 // ## result_y0 { otherwise 391 // ## 392 393 const qword x_eqinf_mask = si_fceq( f_inf, x ); 394 const qword x_eqninf_mask = si_fceq( f_ninf, x ); 395 const qword result_y1 = si_selb( result_y0, f_pio2, x_eqinf_mask ); 396 const qword result = si_selb( result_y1, f_npio2, x_eqninf_mask ); 397 398 return (result); 399} 400 401static inline vector float 402cp_fatan( const vector float x ) 403{ 404 return (vector float)( _cp_fatan( (qword)x ) ); 405} 406 407static inline float 408cp_fatan_scalar( const float x ) 409{ 410 const qword vx = si_from_float( x ); 411 const qword vresult = _cp_fatan( vx ); 412 const float result = si_to_float( vresult ); 413 414 return (result); 415} 416 417// ## cp_fatan2(y,x) 418// ## 419// ## -INF <= x <= INF 420// ## -INF <= y <= INF 421// ## -PI <= cp_fatan2(y,x) <= +PI 422// ## 423// ## Each floating-point component of the result is a function of 424// ## the corresponding components of y and x: 425// ## 426// ## +PI { (y == +0.0) && (x < 0.0) 427// ## 428// ## -PI { (y == -0.0) && (x < 0.0) 429// ## 430// ## +0.0 { (y == +0.0) && (x > 0.0) 431// ## 432// ## -0.0 { (y == -0.0) && (x > 0.0) 433// ## 434// ## -PI { 435// ## ---- { (y < 0.0) && (x == 0.0) 436// ## +2.0 { 437// ## 438// ## +PI { 439// ## ---- { (y > 0.0) && (x == 0.0) 440// ## +2.0 { 441// ## 442// ## NaN { (y == NaN) || (x == NaN) 443// ## 444// ## +PI { (y == +0.0) && (x == -0.0) 445// ## 446// ## -PI { (y == -0.0) && (x == -0.0) 447// ## 448// ## +0.0 { (y == +0.0) && (x == +0.0) 449// ## 450// ## -0.0 { (y == -0.0) && (x == +0.0) 451// ## 452// ## +PI { 453// ## --- { (y == +INF) && (x == +INF) 454// ## 4.0 { 455// ## 456// ## -PI { 457// ## --- { (y == -INF) && (x == +INF) 458// ## 4.0 { 459// ## 460// ## +3.0 PI { 461// ## ------- { (y == +INF) && (x == -INF) 462// ## +4.0 { 463// ## 464// ## -3.0 PI { 465// ## ------- { (y == -INF) && (x == -INF) 466// ## +4.0 { 467// ## 468// ## +PI { isfinite(y) && (+y > 0) && (x == -INF) 469// ## 470// ## -PI { isfinite(y) && (-y > 0) && (x == -INF) 471// ## 472// ## +0.0 { isfinite(y) && (+y > 0) && (x == +INF) 473// ## 474// ## -0.0 { isfinite(y) && (-y > 0) && (x == +INF) 475// ## 476// ## +PI { 477// ## ---- { (isfinite(x) && (y == +INF) 478// ## +2.0 { 479// ## 480// ## -PI { 481// ## --- { (isfinite(x) && (y == -INF) 482// ## +2.0 { 483// ## 484// ## ( y ) { 485// ## +PI + cp_atan( - ) { ( x < 0.0 ) && ( y >= 0.0 ) 486// ## ( x ) { 487// ## 488// ## ( y ) { 489// ## -PI + cp_atan( - ) { ( x < 0.0 ) && ( y < 0.0 ) 490// ## ( x ) { 491// ## 492// ## ( y ) { 493// ## +0.0 + cp_atan( - ) { otherwise 494// ## ( x ) { 495// ## 496 497qword _cp_fatan2( qword y, qword x ) 498{ 499 const qword f_one = cp_filone(); 500 const qword f_zero = cp_filzero(); 501 const qword f_pi = si_lqa( (intptr_t)&_cp_f_pi ); 502 const qword f_npi = si_lqa( (intptr_t)&_cp_f_npi ); 503 504 // ## 505 // ## yox = y 506 // ## - 507 // ## x 508 // ## 509 // ## z = +PI + cp_atan( yox ) { ( x < 0.0 ) && ( y >= 0.0 ) 510 // ## -PI + cp_atan( yox ) { ( x < 0.0 ) && ( y < 0.0 ) 511 // ## 0.0 + cp_atan( yox ) { otherwise 512 513 const qword x_ltz_mask = si_fcgt( f_zero, x ); 514 const qword y_ltz_mask = si_fcgt( f_zero, y ); 515 const qword xy_ltz_mask = si_and( x_ltz_mask, y_ltz_mask ); 516 const qword zadd0 = si_selb( f_zero, f_pi, x_ltz_mask ); 517 const qword zadd = si_selb( zadd0, f_npi, xy_ltz_mask ); 518 const qword x_r0 = si_frest( x ); 519 const qword x_r1 = si_fnms( x_r0, x, f_one ); 520 const qword x_r = si_fma( x_r1, x_r0, x_r0 ); 521 const qword yox = si_fm( y, x_r ); 522 const qword atan_yox = _cp_fatan( yox ); 523 const qword result = si_fa( zadd, atan_yox ); 524 525 return (result); 526} 527 528vector float cp_fatan2( vector float arg0 /* y */, vector float arg1 /* x */ ) 529{ 530 const qword y = (qword)arg0; 531 const qword x = (qword)arg1; 532 const qword result = _cp_fatan2( y, x ); 533 534 return (vector float)(result); 535} 536 537float cp_fatan2_scalar( float arg0 /* y */, float arg1 /* x */ ) 538{ 539 const qword y = si_from_float( arg0 ); 540 const qword x = si_from_float( arg1 ); 541 const qword z = _cp_fatan2( y, x ); 542 const float result = si_to_float( z ); 543 544 return( result ); 545} 546 547#endif /* CP_FATAN_CBE_SPU_H */

0// ## cp_fatan-cbe-spu.c (C99) 1// ## Version 1.0 2// ## 3// ## Copyright (c) 2006 Mike Acton 4// ## 5// ## SIGNIFICANT REFERENCES: 6// ## 7// ## [1] Cephes Math Library Release 2.8: June, 2000 8// ## Copyright 1984, 1995, 2000, Stephen L. Moshier 9// ## [2] Numerical Computation Guide (PDF) 10// ## Copyright 2000, Sun Microsystems, Inc. 11// ## [3] IEEE 754 Support in C99 (PDF) 12// ## Copyright 2001, Jim Thomas 13// ## [4] Solaris 10 Reference Manual : atan2(3M) 14// ## Copyright 1994-2005, Sun Microsystems, Inc. 15// ## 16// ## Permission is hereby granted, free of charge, to any person obtaining 17// ## a copy of this software and associated documentation files 18// ## (the "Software"), to deal in the Software without restriction, including 19// ## without limitation the rights to use, copy, modify, merge, publish, 20// ## distribute, sublicense, and/or sell copies of the Software, and to permit 21// ## persons to whom the Software is furnished to do so, subject to the 22// ## following conditions: 23// ## 24// ## The above copyright notice and this permission notice shall be included 25// ## in all copies or substantial portions of the Software. 26// ## 27// ## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 28// ## OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 29// ## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 30// ## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 31// ## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 32// ## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 33// ## THE SOFTWARE. 34// ## 35 36// Loading these contants from (global) SPU local memory is going to be a win over building them 37// or storing them locally near the function. 38 39const vector unsigned int _cp_f_pio4 = {+0x3F490FDA,+0x3F490FDA,+0x3F490FDA,+0x3F490FDA}; 40const vector unsigned int _cp_f_t3p8 = {+0x401A8279,+0x401A8279,+0x401A8279,+0x401A8279}; 41const vector unsigned int _cp_f_npio2 = {-0x4036F026,-0x4036F026,-0x4036F026,-0x4036F026}; 42const vector unsigned int _cp_f_pio2 = {+0x3FC90FDA,+0x3FC90FDA,+0x3FC90FDA,+0x3FC90FDA}; 43const vector unsigned int _cp_f_pt66 = {+0x3F28F5C2,+0x3F28F5C2,+0x3F28F5C2,+0x3F28F5C2}; 44const vector unsigned int _cp_f_pi = {+0x40490fda,+0x40490fda,+0x40490fda,+0x40490fda}; 45const vector unsigned int _cp_f_npi = {-0x3fb6f026,-0x3fb6f026,-0x3fb6f026,-0x3fb6f026}; 46 47const vector unsigned int _cp_f_atan_q4 = {+0x43428CF7,+0x43428CF7,+0x43428CF7,+0x43428CF7}; 48const vector unsigned int _cp_f_atan_q3 = {+0x43F2B1F8,+0x43F2B1F8,+0x43F2B1F8,+0x43F2B1F8}; 49const vector unsigned int _cp_f_atan_q2 = {+0x43D870C6,+0x43D870C6,+0x43D870C6,+0x43D870C6}; 50const vector unsigned int _cp_f_atan_q1 = {+0x432506EA,+0x432506EA,+0x432506EA,+0x432506EA}; 51const vector unsigned int _cp_f_atan_q0 = {+0x41C6DE22,+0x41C6DE22,+0x41C6DE22,+0x41C6DE22}; 52const vector unsigned int _cp_f_atan_p4 = {-0x3D7E4CB1,-0x3D7E4CB1,-0x3D7E4CB1,-0x3D7E4CB1}; 53const vector unsigned int _cp_f_atan_p3 = {-0x3D0A3A07,-0x3D0A3A07,-0x3D0A3A07,-0x3D0A3A07}; 54const vector unsigned int _cp_f_atan_p2 = {-0x3D69FB9F,-0x3D69FB9F,-0x3D69FB9F,-0x3D69FB9F}; 55const vector unsigned int _cp_f_atan_p1 = {-0x3E7EBD5E,-0x3E7EBD5E,-0x3E7EBD5E,-0x3E7EBD5E}; 56const vector unsigned int _cp_f_atan_p0 = {-0x409FFC03,-0x409FFC03,-0x409FFC03,-0x409FFC03}; 57const vector unsigned int _cp_f_hmorebits = {+0x240D3131,+0x240D3131,+0x240D3131,+0x240D3131}; 58const vector unsigned int _cp_f_morebits = {+0x248D3131,+0x248D3131,+0x248D3131,+0x248D3131}; 59