This bugzilla service is closed. All entries have been migrated to https://gitlab.com/libeigen/eigen
View | Details | Raw Unified | Return to bug 895
Collapse All | Expand All

(-)a/Eigen/src/Core/arch/AVX/MathFunctions.h (+241 lines)
Line 0 Link Here
1
// This file is part of Eigen, a lightweight C++ template library
2
// for linear algebra.
3
//
4
// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
5
//
6
// This Source Code Form is subject to the terms of the Mozilla
7
// Public License v. 2.0. If a copy of the MPL was not distributed
8
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10
#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
11
#define EIGEN_MATH_FUNCTIONS_AVX_H
12
13
// For some reason, this function didn't make it into the avxintirn.h
14
// used by the compiler, so we'll just wrap it.
15
#define _mm256_setr_m128(lo, hi) \
16
  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
17
18
19
namespace Eigen {
20
21
namespace internal {
22
23
// Natural logarithm
24
// Computes log(x) as log(2^e * m) = k*e + log(m), where m is in the range
25
// [sqrt(1/2),sqrt(2)). In this range, the logarithm can be easily
26
// approximated by a polynomial centered on m=1 for stability.
27
// TODO(gonnet): Further reduce the interval allowing for lower-degree
28
//               polynomial interpolants -> ... -> profit!
29
template <>
30
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
31
plog<Packet8f>(const Packet8f& _x) {
32
  Packet8f x = _x;
33
  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
34
  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
35
  _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
36
37
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);
38
39
  /* the smallest non denormalized float number */
40
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
41
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
42
43
  /* natural logarithm computed for 8 simultaneous float
44
    return NaN for x <= 0
45
  */
46
  _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
47
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
48
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
49
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
50
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
51
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
52
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
53
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
54
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
55
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
56
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
57
  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
58
59
  Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
60
  Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
61
62
  // Truncate input values to the minimum positive normal.
63
  x = pmax(x, p8f_min_norm_pos);
64
65
// Extract the shifted exponents (No bitwise shifting in regular AVX, so convert
66
// to
67
// SSE and do it there).
68
#ifdef EIGEN_VECTORIZE_AVX2
69
  Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_srli_epi32((__m256i)x, 23));
70
#else
71
  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256((__m256i)x, 0), 23);
72
  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256((__m256i)x, 1), 23);
73
  Packet8f emm0 = _mm256_cvtepi32_ps(_mm256_setr_m128(lo, hi));
74
#endif
75
  Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
76
77
  /* Set the exponents to -1, i.e. x are in the range [0.5,1). */
78
  x = _mm256_and_ps(x, p8f_inv_mant_mask);
79
  x = _mm256_or_ps(x, p8f_half);
80
81
  /* part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
82
   * and shift by -1. The values are then centered around 0, which improves
83
   * the stability of the polynomial evaluation.
84
     if( x < SQRTHF ) {
85
       e -= 1;
86
       x = x + x - 1.0;
87
     } else { x = x - 1.0; }
88
  */
89
  Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
90
  Packet8f tmp = _mm256_and_ps(x, mask);
91
  x = psub(x, p8f_1);
92
  e = psub(e, _mm256_and_ps(p8f_1, mask));
93
  x = padd(x, tmp);
94
95
  Packet8f x2 = pmul(x, x);
96
  Packet8f x3 = pmul(x2, x);
97
98
  // Evaluate the polynomial approximant of degree 8 in three parts, probably
99
  // to improve instruction-level parallelism.
100
  // TODO(gonnet): Split into odd/even polynomials, avoids computing x3 and
101
  //               some of the multiplications at the end.
102
  Packet8f y, y1, y2;
103
  y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
104
  y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
105
  y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
106
  y = pmadd(y, x, p8f_cephes_log_p2);
107
  y1 = pmadd(y1, x, p8f_cephes_log_p5);
108
  y2 = pmadd(y2, x, p8f_cephes_log_p8);
109
  y = pmadd(y, x3, y1);
110
  y = pmadd(y, x3, y2);
111
  y = pmul(y, x3);
112
113
  // Add the logarithm of the exponent back to the result of the interpolation.
114
  y1 = pmul(e, p8f_cephes_log_q1);
115
  tmp = pmul(x2, p8f_half);
116
  y = padd(y, y1);
117
  x = psub(x, tmp);
118
  y2 = pmul(e, p8f_cephes_log_q2);
119
  x = padd(x, y);
120
  x = padd(x, y2);
121
122
  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
123
  return _mm256_or_ps(
124
      _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
125
      _mm256_and_ps(iszero_mask, p8f_minus_inf));
126
}
127
128
// Exponential function. Works by writing "x = m*log(2) + r" where
129
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
130
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
131
template <>
132
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
133
pexp<Packet8f>(const Packet8f& _x) {
134
  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
135
  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
136
  _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
137
138
  _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
139
  _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
140
141
  _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
142
143
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
144
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
145
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
146
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
147
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
148
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
149
150
  // Clamp x.
151
  Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
152
153
  // Express exp(x) as exp(m*ln(2) + r), start by extracting
154
  // m = floor(x/ln(2) + 0.5).
155
  Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
156
157
// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
158
// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
159
// truncation errors. Note that we don't use the "pmadd" function here to
160
// ensure that a precision-preserving FMA instruction is used.
161
#ifdef EIGEN_VECTORIZE_FMA
162
  _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
163
  Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
164
#else
165
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
166
  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
167
  Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
168
  r = psub(r, pmul(m, p8f_cephes_exp_C2));
169
#endif
170
171
  Packet8f r2 = pmul(r, r);
172
173
  // TODO(gonnet): Split into odd/even polynomials and try to exploit
174
  //               instruction-level parallelism.
175
  Packet8f y = p8f_cephes_exp_p0;
176
  y = pmadd(y, r, p8f_cephes_exp_p1);
177
  y = pmadd(y, r, p8f_cephes_exp_p2);
178
  y = pmadd(y, r, p8f_cephes_exp_p3);
179
  y = pmadd(y, r, p8f_cephes_exp_p4);
180
  y = pmadd(y, r, p8f_cephes_exp_p5);
181
  y = pmadd(y, r2, r);
182
  y = padd(y, p8f_1);
183
184
  // Build emm0 = 2^m.
185
  Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
186
#ifdef EIGEN_VECTORIZE_AVX2
187
  emm0 = _mm256_slli_epi32(emm0, 23);
188
#else
189
  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 0), 23);
190
  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(emm0, 1), 23);
191
  emm0 = _mm256_setr_m128(lo, hi);
192
#endif
193
194
  // Return 2^m * exp(r).
195
  return pmul(y, _mm256_castsi256_ps(emm0));
196
}
197
198
// Functions for sqrt.
199
// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
200
// of Newton's method, at a cost of 1-2 bits of precision as opposed to the
201
// exact solution. The main advantage of this approach is not just speed, but
202
// also the fact that it can be inlined and pipelined with other computations,
203
// further reducing its effective latency.
204
#if EIGEN_FAST_MATH
205
template <>
206
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
207
psqrt<Packet8f>(const Packet8f& _x) {
208
  _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
209
  _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
210
  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
211
212
  Packet8f neg_half = pmul(_x, p8f_minus_half);
213
214
  // select only the inverse sqrt of positive normal inputs (denormals are
215
  // flushed to zero and cause infs as well).
216
  Packet8f non_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_GE_OQ);
217
  Packet8f x = _mm256_and_ps(non_zero_mask, _mm256_rsqrt_ps(_x));
218
219
  // Do a single step of Newton's iteration.
220
  x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
221
222
  // Multiply the original _x by it's reciprocal square root to extract the
223
  // square root.
224
  return pmul(_x, x);
225
}
226
#else
227
template <>
228
EIGEN_STRONG_INLINE Packet8f psqrt<Packet8f>(const Packet8f& x) {
229
  return _mm256_sqrt_ps(x);
230
}
231
#endif
232
template <>
233
EIGEN_STRONG_INLINE Packet4d psqrt<Packet4d>(const Packet4d& x) {
234
  return _mm256_sqrt_pd(x);
235
}
236
237
}  // end namespace internal
238
239
}  // end namespace Eigen
240
241
#endif  // EIGEN_MATH_FUNCTIONS_AVX_H
(-)a/Eigen/src/Core/arch/AVX/PacketMath.h (-4 / +6 lines)
Lines 51-83 template<> struct packet_traits<float> Link Here
51
    Vectorizable = 1,
51
    Vectorizable = 1,
52
    AlignedOnScalar = 1,
52
    AlignedOnScalar = 1,
53
    size=8,
53
    size=8,
54
    HasHalfPacket = 1,
54
    HasHalfPacket = 1,
55
55
56
    HasDiv  = 1,
56
    HasDiv  = 1,
57
    HasSin  = 0,
57
    HasSin  = 0,
58
    HasCos  = 0,
58
    HasCos  = 0,
59
    HasLog  = 0,
59
    HasLog  = 1,
60
    HasExp  = 0,
60
    HasExp  = 1,
61
    HasSqrt = 0
61
    HasSqrt = 1
62
  };
62
  };
63
 };
63
 };
64
template<> struct packet_traits<double> : default_packet_traits
64
template<> struct packet_traits<double> : default_packet_traits
65
{
65
{
66
  typedef Packet4d type;
66
  typedef Packet4d type;
67
  typedef Packet2d half;
67
  typedef Packet2d half;
68
  enum {
68
  enum {
69
    Vectorizable = 1,
69
    Vectorizable = 1,
70
    AlignedOnScalar = 1,
70
    AlignedOnScalar = 1,
71
    size=4,
71
    size=4,
72
    HasHalfPacket = 1,
72
    HasHalfPacket = 1,
73
73
74
    HasDiv  = 1,
74
    HasDiv  = 1,
75
    HasExp  = 0
75
    HasLog  = 0,
76
    HasExp  = 0,
77
    HasSqrt = 1
76
  };
78
  };
77
};
79
};
78
80
79
/* Proper support for integers is only provided by AVX2. In the meantime, we'll
81
/* Proper support for integers is only provided by AVX2. In the meantime, we'll
80
   use SSE instructions and packets to deal with integers.
82
   use SSE instructions and packets to deal with integers.
81
template<> struct packet_traits<int>    : default_packet_traits
83
template<> struct packet_traits<int>    : default_packet_traits
82
{
84
{
83
  typedef Packet8i type;
85
  typedef Packet8i type;

Return to bug 895