--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -228,16 +228,20 @@ Packet pexp_float(const Packet _x) y = pmadd(y, r, cst_cephes_exp_p5); y = pmadd(y, r2, r); y = padd(y, cst_1); // Return 2^m * exp(r). return pmax(pldexp(y,m), _x); } +// make it the default path for scalar float +template<> +float pexp(const float& a) { return pexp_float(a); } + template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet pexp_double(const Packet _x) { Packet x = _x; const Packet cst_1 = pset1(1.0); @@ -296,16 +300,20 @@ Packet pexp_double(const Packet _x) x = pdiv(px, psub(qx, px)); x = pmadd(cst_2, x, cst_1); // Construct the result 2^n * exp(g) = e * x. The max is used to catch // non-finite values in the input. return pmax(pldexp(x,fx), _x); } +// make it the default path for scalar double +template<> +double pexp(const double& a) { return pexp_double(a); } + // The following code is inspired by the following stack-overflow answer: // https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751 // It has been largely optimized: // - By-pass calls to frexp. // - Aligned loads of required 96 bits of 2/pi. This is accomplished by // (1) balancing the mantissa and exponent to the required bits of 2/pi are // aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi. // - Avoid a branch in rounding and extraction of the remaining fractional part. --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ a/Eigen/src/Core/functors/UnaryFunctors.h @@ -889,18 +889,17 @@ struct functor_traits struct scalar_logistic_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { - const T one = T(1); - return one / (one + numext::exp(-x)); + return packetOp(x); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { const Packet one = pset1(T(1)); return pdiv(one, padd(one, pexp(pnegate(x)))); } }; @@ -914,19 +913,17 @@ struct scalar_logistic_op { * logistic is interpolated because it was easier to make the fit converge. * */ template <> struct scalar_logistic_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const { - if (x < -18.0f) return 0.0f; - else if (x > 18.0f) return 1.0f; - else return 1.0f / (1.0f + numext::exp(-x)); + return packetOp(x); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& _x) const { // Clamp the inputs to the range [-18, 18] since anything outside // this range is 0.0f or 1.0f in single-precision. const Packet x = pmax(pmin(_x, pset1(18.0)), pset1(-18.0)); --- a/test/packetmath.cpp +++ a/test/packetmath.cpp @@ -566,16 +566,31 @@ template::quiet_NaN(); packet_helper::HasTanh,Packet> h; h.store(data2, internal::ptanh(h.load(data1))); VERIFY((numext::isnan)(data2[0])); } + { + internal::scalar_logistic_op logistic; + for (int i=0; i(-20,20); + } + internal::pstore(data2, logistic.packetOp(internal::pload(data1))); + for (int i=0; i::quiet_NaN(); packet_helper::HasLGamma,Packet> h; h.store(data2, internal::plgamma(h.load(data1))); VERIFY((numext::isnan)(data2[0])); } if (internal::packet_traits::HasErf) { @@ -960,17 +975,16 @@ struct runner::run(); } }; EIGEN_DECLARE_TEST(packetmath) { g_first_pass = true; for(int i = 0; i < g_repeat; i++) { - CALL_SUBTEST_1( runner::run() ); CALL_SUBTEST_2( runner::run() ); CALL_SUBTEST_3( runner::run() ); CALL_SUBTEST_4( runner >::run() ); CALL_SUBTEST_5( runner >::run() ); CALL_SUBTEST_6(( packetmath::type>() )); g_first_pass = false; }