template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet psincos_float(const Packet& _x) { #if 1 typedef typename unpacket_traits::integer_packet PacketI; const Packet cst_2oPI = pset1(0.636619746685028076171875f); // 2/PI const Packet cst_rounding_magic = pset1(12582912); // 2^23 for rounding const PacketI csti_1 = pset1(1); const Packet cst_sign_mask = pset1frombits(0x80000000u); const float cst_PIo4_bound = 0.78539816339744f*1.1f; // a bit more than PI/4 Packet x = pabs(_x); // Scale x by 2/Pi to find x's octant. Packet y = pmul(x, cst_2oPI); // Rounding trick: Packet y_round = padd(y, cst_rounding_magic); PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi // Compute the sign to apply to the polynomial. // sin: sign = second_bit(y_int) xor signbit(_x) // cos: sign = second_bit(y_int+1) Packet sign_bit = ComputeSine ? pxor(_x, preinterpret(pshiftleft<30>(y_int))) : preinterpret(pshiftleft<30>(padd(y_int,csti_1))); sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit // Get the polynomial selection mask from the second bit of y_int // We'll calculate both (sin and cos) polynomials and then select from the two. Packet poly_mask = preinterpret(pcmp_eq(pand(y_int, csti_1), pzero(y_int))); // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4. // The magic pass: "Extended precision modular arithmetic" #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) // This version requires true FMA for high accuracy // It provides a max error of 1ULP for x < 117435.992f, i.e.: // abs_err<5.9605e-08 && rel_err<1.1923e-7 // Without true FMA, 1ULP accuracy is maintained for x<15.7, // but accuracy is immediately lost for x>15.7 x = pmadd(y, pset1(-1.57079601287841796875f), x); x = pmadd(y, pset1(-3.1391647326017846353352069854736328125e-07f), x); x = pmadd(y, pset1(-5.390302529957764765544681040410068817436695098876953125e-15f), x); // For extremely large input, precision degrades. // TODO: branch on max input to fallback to more accurate modulo in the unlikely case that x>117435.992 // something like if(pallones(pcmp_lt(x,pset1(117435.992f)))) {...} // Meanwhile, let's just clamp: x = pmin(x,pset1( cst_PIo4_bound)); x = pmax(x,pset1(-cst_PIo4_bound)); #else // Without true FMA, using one more iteration permits to maintain 1ULP accuracy // for x<2pi, and 2ULP for x<51981. // In this range we have: abs_err<1.192092895507812e-07 && rel_err<1.730722734237313e-07 x = pmadd(y, pset1(-3.140625/2.), x); x = pmadd(y, pset1(-0.0009670257568359375/2.), x); x = pmadd(y, pset1(-6.2771141529083251953e-07/2.), x); x = pmadd(y, pset1(-1.2154201256553420762e-10/2.), x); // For extremely large input, precision degrades. // TODO: branch on max input to fallback to more accurate modulo in the unlikely case that x>117435.992 // something like if(pallones(pcmp_lt(x,pset1(51981f)))) {...} // Meanwhile, let's just clamp: x = pmin(x,pset1( cst_PIo4_bound)); x = pmax(x,pset1(-cst_PIo4_bound)); #endif Packet x2 = pmul(x,x); // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4) Packet y1 = pset1(2.4372266125283204019069671630859375e-05f); y1 = pmadd(y1, x2, pset1(-0.00138865201734006404876708984375f )); y1 = pmadd(y1, x2, pset1(0.041666619479656219482421875f )); y1 = pmadd(y1, x2, pset1(-0.5f)); y1 = pmadd(y1, x2, pset1(1.f)); // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4) // octave/matlab code to compute those coefficients: // x = (0:0.0001:pi/4)'; // A = [x.^3 x.^5 x.^7]; // w = ((1.-(x/(pi/4)).^2).^3)*1000+1; # weights trading relative accuracy // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1 // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1)) // Packet y2 = pset1(-0.0001958506847391916613678602976378329003637190908193588256835938f); y2 = pmadd(y2, x2, pset1( 0.0083326216955056514601452022361627314239740371704101562500000000f)); y2 = pmadd(y2, x2, pset1(-0.1666666101652799214516420533982454799115657806396484375000000000f)); y2 = pmul(y2, x2); y2 = pmadd(y2, x, x); // Select the correct result from the two polynomials. y = ComputeSine ? pselect(poly_mask,y2,y1) : pselect(poly_mask,y1,y2); // Update the sign return pxor(y, sign_bit); }