Bugzilla – Attachment 913 Details for
Bug 1652
Corner cases in SIMD sin/cos
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Requests
|
Help
|
Log In
[x]
|
Forgot Password
Login:
[x]
This bugzilla service is closed. All entries have been migrated to
https://gitlab.com/libeigen/eigen
Improved version of psincos
bug1652_improved_sincos.cpp (text/plain), 4.74 KB, created by
Gael Guennebaud
on 2019-01-07 15:27:58 UTC
(
hide
)
Description:
Improved version of psincos
Filename:
MIME Type:
Creator:
Gael Guennebaud
Created:
2019-01-07 15:27:58 UTC
Size:
4.74 KB
patch
obsolete
>template<bool ComputeSine,typename Packet> >EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS >EIGEN_UNUSED >Packet psincos_float(const Packet& _x) >{ > #if 1 > typedef typename unpacket_traits<Packet>::integer_packet PacketI; > > const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI > const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding > const PacketI csti_1 = pset1<PacketI>(1); > const Packet cst_sign_mask = pset1frombits<Packet>(0x80000000u); > const float cst_PIo4_bound = 0.78539816339744f*1.1f; // a bit more than PI/4 > > Packet x = pabs(_x); > > // Scale x by 2/Pi to find x's octant. > Packet y = pmul(x, cst_2oPI); > > // Rounding trick: > Packet y_round = padd(y, cst_rounding_magic); > PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24) > y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi > > // Compute the sign to apply to the polynomial. > // sin: sign = second_bit(y_int) xor signbit(_x) > // cos: sign = second_bit(y_int+1) > Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(pshiftleft<30>(y_int))) > : preinterpret<Packet>(pshiftleft<30>(padd(y_int,csti_1))); > sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit > > // Get the polynomial selection mask from the second bit of y_int > // We'll calculate both (sin and cos) polynomials and then select from the two. > Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int))); > > // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4. > // The magic pass: "Extended precision modular arithmetic" > #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) > // This version requires true FMA for high accuracy > // It provides a max error of 1ULP for x < 117435.992f, i.e.: > // abs_err<5.9605e-08 && rel_err<1.1923e-7 > // Without true FMA, 1ULP accuracy is maintained for x<15.7, > // but accuracy is immediately lost for x>15.7 > x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x); > x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x); > x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x); > // For extremely large input, precision degrades. > // TODO: branch on max input to fallback to more accurate modulo in the unlikely case that x>117435.992 > // something like if(pallones(pcmp_lt(x,pset1<Packet>(117435.992f)))) {...} > // Meanwhile, let's just clamp: > x = pmin(x,pset1<Packet>( cst_PIo4_bound)); > x = pmax(x,pset1<Packet>(-cst_PIo4_bound)); > #else > // Without true FMA, using one more iteration permits to maintain 1ULP accuracy > // for x<2pi, and 2ULP for x<51981. > // In this range we have: abs_err<1.192092895507812e-07 && rel_err<1.730722734237313e-07 > x = pmadd(y, pset1<Packet>(-3.140625/2.), x); > x = pmadd(y, pset1<Packet>(-0.0009670257568359375/2.), x); > x = pmadd(y, pset1<Packet>(-6.2771141529083251953e-07/2.), x); > x = pmadd(y, pset1<Packet>(-1.2154201256553420762e-10/2.), x); > > // For extremely large input, precision degrades. > // TODO: branch on max input to fallback to more accurate modulo in the unlikely case that x>117435.992 > // something like if(pallones(pcmp_lt(x,pset1<Packet>(51981f)))) {...} > // Meanwhile, let's just clamp: > x = pmin(x,pset1<Packet>( cst_PIo4_bound)); > x = pmax(x,pset1<Packet>(-cst_PIo4_bound)); > #endif > > Packet x2 = pmul(x,x); > > // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4) > Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f); > y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f )); > y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f )); > y1 = pmadd(y1, x2, pset1<Packet>(-0.5f)); > y1 = pmadd(y1, x2, pset1<Packet>(1.f)); > > // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4) > // octave/matlab code to compute those coefficients: > // x = (0:0.0001:pi/4)'; > // A = [x.^3 x.^5 x.^7]; > // w = ((1.-(x/(pi/4)).^2).^3)*1000+1; # weights trading relative accuracy > // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1 > // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1)) > // > Packet y2 = pset1<Packet>(-0.0001958506847391916613678602976378329003637190908193588256835938f); > y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326216955056514601452022361627314239740371704101562500000000f)); > y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666101652799214516420533982454799115657806396484375000000000f)); > y2 = pmul(y2, x2); > y2 = pmadd(y2, x, x); > > // Select the correct result from the two polynomials. > y = ComputeSine ? pselect(poly_mask,y2,y1) > : pselect(poly_mask,y1,y2); > > // Update the sign > return pxor(y, sign_bit); >}
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Raw
Actions:
View
Attachments on
bug 1652
:
911
| 913