This bugzilla service is closed. All entries have been migrated to https://gitlab.com/libeigen/eigen
View | Details | Raw Unified | Return to bug 930 | Differences between
and this patch

Collapse All | Expand All

(-)a/unsupported/Eigen/CMakeLists.txt (+1 lines)
Lines 4-11 set(Eigen_HEADERS AdolcForward BVH Itera Link Here
 4    )
 5
 6 install(FILES
 7   ${Eigen_HEADERS}
 8   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
 9   )
10
11 add_subdirectory(src)
12 add_subdirectory(CXX11)   <-- the one line added by this patch
(-)a/unsupported/Eigen/CXX11/CMakeLists.txt (+9 lines)
Line 0 Link Here
1
# Public (unsupported) Eigen CXX11 module headers, shipped with the Devel
# install component.
set(Eigen_CXX11_HEADERS
  Tensor
  TensorSymmetry
  Core
  MeasureCacheSizes
)

install(FILES ${Eigen_CXX11_HEADERS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11
  COMPONENT Devel
)

add_subdirectory(src)
(-)a/unsupported/Eigen/CXX11/MeasureCacheSizes (+28 lines)
Line 0 Link Here
1
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_MEASURE_CACHE_SIZES_MODULE
#define EIGEN_MEASURE_CACHE_SIZES_MODULE

/** \defgroup MeasureCacheSizes module
  *
  * Offers a simple measure_cache_sizes benchmark measuring CPU cache sizes.
  *
  * This is useful as a fallback in situations where this information
  * can't be queried directly. This information can then be passed to Eigen
  * to tune its algorithms.
  *
  * \code
  * #include <unsupported/Eigen/CXX11/MeasureCacheSizes>
  * \endcode
  */

#include "src/MeasureCacheSizes/MeasureCacheSizes.h"

// Fixed: the closing comment previously said EIGEN_MEASURE_CACHE_LEVELS_MODULE,
// which does not match the guard defined above.
#endif // EIGEN_MEASURE_CACHE_SIZES_MODULE
(-)a/unsupported/Eigen/CXX11/src/CMakeLists.txt (+4 lines)
Line 0 Link Here
1
# Only MeasureCacheSizes currently has install rules of its own; the other
# CXX11 modules do not ship a src/ CMakeLists.txt yet.
#add_subdirectory(Tensor)
#add_subdirectory(TensorSymmetry)
#add_subdirectory(Core)
add_subdirectory(MeasureCacheSizes)
(-)a/unsupported/Eigen/CXX11/src/MeasureCacheSizes/CMakeLists.txt (+6 lines)
Line 0 Link Here
1
# Install every MeasureCacheSizes header, mirroring the source tree layout
# under the install include directory.
file(GLOB Eigen_MeasureCacheSizes_SRCS "*.h")

install(FILES ${Eigen_MeasureCacheSizes_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/MeasureCacheSizes
  COMPONENT Devel
)
(-)a/unsupported/Eigen/CXX11/src/MeasureCacheSizes/MeasureCacheSizes.h (+404 lines)
Line 0 Link Here
1
// This file is part of Eigen, a lightweight C++ template library
2
// for linear algebra.
3
//
4
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
5
//
6
// This Source Code Form is subject to the terms of the Mozilla
7
// Public License v. 2.0. If a copy of the MPL was not distributed
8
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10
#ifndef EIGEN_MEASURE_CACHE_SIZES_H
11
#define EIGEN_MEASURE_CACHE_SIZES_H
12
13
#include <Eigen/Core>
14
15
#include <ctime>
16
#include <cstdint>
17
#include <cmath>
18
#include <cstdlib>
19
#include <cstring>
20
#include <limits>
21
#include <algorithm>
22
#include <vector>
23
24
#ifndef __linux__
25
#include <sys/time.h>
26
#endif
27
28
namespace Eigen {
29
namespace internal {
30
31
using std::size_t;
using std::ptrdiff_t;
using std::uint8_t;
using std::uint64_t;

// Size units used throughout this file.
const size_t megabyte = 1 << 20;
const size_t kilobyte = 1 << 10;

// We must repeat memory accesses enough times to give
// the CPU a good chance to have our stuff in cache for
// most of the measurements, so that the effect of caches
// can be measured. Since our accesses are by large memcpy's,
// the number of times that each cache line is touched is
// roughly equal to the number of times that each byte is touched.
const int minimum_times_accessed_each_byte = 8;

// Even with clock_gettime, time measurement accuracy is often
// worse than 1e-5 s. We avoid measuring times shorter than 1e-2 s
// so as to make this a non-issue. Increasing this minimum further
// significantly slows down the measurements.
// (Exact integer literal instead of the double literal 1e7, avoiding an
// implicit double -> uint64_t conversion; same value.)
const uint64_t minimum_measurement_time_in_ns = 10000000;

// We repeat all measurements a few times and keep the best throughput
// for each workload. This removes random noise and warm-up effects.
const int measurement_passes = 3;

// Minimum ratio of throughputs of different cache levels.
// For example, 1.15 means that any difference smaller than 15%
// will be interpreted as measurement inaccuracy, not as
// indicating different cache levels.
// (The 'f' suffix avoids a double -> float narrowing conversion; the stored
// value is identical.)
const float never_separate_levels_closer_than = 1.15f;

// We need to be able to distinguish the highest level of cache (e.g. L3)
// from RAM. To do so, we need to know the throughput of RAM, so we need
// to know a working set size that cannot plausibly fit in cache. That is
// implausibly_large_cache_size. It should be set to a value larger than
// any CPU's cache on a given architecture.
//
// Conversely implausibly_small_cache_size should be set to a size
// smaller than any CPU's L1 cache, in order for us to correctly detect
// the L1 cache.
//
// Keep in mind that we require each cache level to be
// reflected in at least two consecutive power-of-two measurements,
// so for instance if the L1 cache size is 4k, implausibly_small_cache_size
// must be set to at most 2k in order for us to detect it.
#if defined __aarch64__ || defined __arm__
  const size_t implausibly_large_cache_size = 8 * megabyte;
  const size_t implausibly_small_cache_size = 1 * kilobyte;
#elif defined __x86_64__ || defined __i386__
  const size_t implausibly_large_cache_size = 64 * megabyte;
  const size_t implausibly_small_cache_size = 4 * kilobyte;
#else
#error "What's your architecture?"
#endif

// To avoid alignment-related noise, especially with small working set sizes,
// we keep our buffers on an alignment much higher than any plausible cache
// line size. It doesn't hurt, then, to make this the typical value of
// page size on most systems.
const size_t alignment = 0x1000;
92
93
class measurement_context_t
94
{
95
  size_t allocated_buffer_size;
96
  uint8_t* buffer1;
97
  uint8_t* buffer2;
98
99
  __attribute__((noinline))
100
  static void consume(uint8_t&)
101
  {}
102
103
  static uint64_t get_raw_time_in_ns()
104
  {
105
  #ifdef __linux__
106
    timespec t;
107
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t);
108
    return t.tv_nsec + 1e9 * t.tv_sec;
109
  #else
110
    timeval t;
111
    gettimeofday(&t, nullptr);
112
    return 1e3 * t.tv_usec + 1e9 * t.tv_sec;
113
  #endif
114
  }
115
116
public:
117
  measurement_context_t(size_t working_set_size)
118
  {
119
    allocated_buffer_size = working_set_size / 2;
120
    bool alloc_failure =
121
      posix_memalign(reinterpret_cast<void**>(&buffer1), alignment, allocated_buffer_size) ||
122
      posix_memalign(reinterpret_cast<void**>(&buffer2), alignment, allocated_buffer_size);
123
    eigen_assert(!alloc_failure);
124
    EIGEN_ONLY_USED_FOR_DEBUG(alloc_failure);
125
126
    // Initial warm up and ensure that the compiler can't optimize all
127
    // the workload away as depending on uninitialized values
128
    for (size_t i = 0; i < allocated_buffer_size; i++) {
129
      buffer1[i] = buffer2[i] = uint8_t(i);
130
    }
131
  }
132
133
  ~measurement_context_t()
134
  {
135
    free(buffer1);
136
    free(buffer2);
137
  }
138
139
  float measure_throughput(size_t working_set_size)
140
  {
141
    const size_t buffer_size = working_set_size / 2;
142
    eigen_assert(buffer_size <= allocated_buffer_size);
143
144
    size_t iters_at_a_time = 1;
145
    uint64_t bytes_copied = 0;
146
    const uint64_t minimum_bytes_copied = buffer_size * minimum_times_accessed_each_byte;
147
    uint64_t t1, t2;
148
    t1 = get_raw_time_in_ns();
149
    while (true) {
150
      t2 = get_raw_time_in_ns();
151
      uint64_t elapsed = t2 - t1;
152
      if (elapsed >= minimum_measurement_time_in_ns && bytes_copied >= minimum_bytes_copied) {
153
        consume(buffer1[buffer_size / 2]);
154
        consume(buffer2[buffer_size / 2]);
155
        return float(bytes_copied) / elapsed; // bytes per ns, equivalently GB/s
156
      }
157
      for (size_t i = 0; i < iters_at_a_time; i++) {
158
        // Inner benchmark loop. We use memcpy as it is probably well optimized
159
        // for optimal throughput.
160
161
        // We copy back and forth. This could be updated in the future
162
        // if we wanted to test more specific cache behavior.
163
        uint8_t* bufferA = (i & 1) ? buffer1 : buffer2;
164
        uint8_t* bufferB = (i & 1) ? buffer2 : buffer1;
165
166
        // We apply a variable offset to make it near-impossible for the compiler
167
        // to outsmart us and optimize away our memcpy calls.
168
        const size_t offset = 16 * (i & 15);
169
        const size_t copy_size = buffer_size - offset;
170
171
        memcpy(bufferB, bufferA + offset, copy_size);
172
        bytes_copied += copy_size;
173
      }
174
      eigen_assert(iters_at_a_time <= std::numeric_limits<size_t>::max() / 2);
175
      iters_at_a_time *= 2;
176
    }
177
  }
178
};
179
180
/*
181
 * A measurement_t is the result of a single experimental measurement of
182
 * sustained memory throughput for a buffer of a given size.
183
 */
184
/*
 * The outcome of one throughput experiment: a working-set size, the measured
 * sustained throughput, and (once the partitioning step has run) the id of
 * the cache level this size was assigned to.
 */
struct measurement_t
{
  size_t size;        // size of the memory buffer this measurement used
  float throughput;   // measured sustained memory throughput, in GB/s
  int cache_level_id; // 0 = not yet determined; otherwise an opaque group id.
                      // Ids are unordered and not numerically equal to cache
                      // levels; sizes too large for any cache also get an id,
                      // which corresponds to RAM.

  measurement_t(size_t s = 0) : size(s), throughput(0.0f), cache_level_id(0) {}
};
206
207
/* Measures memory throughput for various buffer sizes.
208
 * This gathers all the experimental data that will be needed to
209
 * determine the cache structure.
210
 */
211
inline void measure_throughput_for_various_sizes(
212
              std::vector<measurement_t>& measurements)
213
{
214
  eigen_assert(measurements.empty());
215
  measurement_context_t measurement_context(implausibly_large_cache_size);
216
217
  for (size_t size = implausibly_small_cache_size;
218
       size <= implausibly_large_cache_size;
219
       size *= 2)
220
  {
221
    measurements.push_back(measurement_t(size));
222
  }
223
224
  for (size_t pass = 1; pass <= measurement_passes; pass++) {
225
    for (ptrdiff_t i = measurements.size() - 1; i >= 0; i--) {
226
      measurements[i].throughput = std::max(
227
        measurements[i].throughput,
228
        measurement_context.measure_throughput(measurements[i].size));
229
      if (size_t(i) < measurements.size() - 1) {
230
        measurements[i].throughput = std::max(
231
          measurements[i].throughput,
232
          measurements[i+1].throughput);
233
      }
234
    }
235
  }
236
}
237
238
/* Partitions a vector of measurement levels into "cache levels", i.e.
 * groups having approximately the same throughput. The partitioning is
 * written into the cache_level_id fields of the measurement levels.
 *
 * NOTE(review): this assumes measurements.size() >= 2 — front()/back(),
 * measurements[1] and the [i +/- step] accesses below all rely on it.
 * The callers in this file satisfy this (the size range spans several
 * powers of two); confirm if called from elsewhere.
 */
inline void partition_measurements_into_cache_levels(
              std::vector<measurement_t>& measurements)
{
  eigen_assert(!measurements.empty());

  int current_cache_level_id = 0;
  int next_cache_level_id = 1;

  // Seed the two extremes: the smallest size is assumed to sit in the
  // innermost cache, the largest in RAM; each gets its own level id.
  measurements.front().cache_level_id = next_cache_level_id++;
  measurements.back().cache_level_id = next_cache_level_id++;

  // Comparisons are done on |log(ratio)| so that "x% faster" and "x% slower"
  // are treated symmetrically.
  float threshold = std::log(never_separate_levels_closer_than);

  ptrdiff_t step = 1;

  // Step 1: we traverse the measurements back and forth, in the direction
  // of 'step'. In each traversal, we try to identify groups of
  // at least two consecutive measurements with approximately the same
  // throughput, as determined by the current value of 'threshold'. Such groups
  // are called cache levels, and are recorded in the 'cache_level' field
  // of each measurement. We continue until no two consecutive measurements
  // are left without a known cache level.
  while(true) {
    // Forward passes cover indices [0, size-2]; backward passes cover
    // [size-1, 1] — the element at end_level itself is never visited.
    ptrdiff_t start_level = step == 1 ? 0 : measurements.size() - 1;
    ptrdiff_t end_level = step == -1 ? 0 : measurements.size() - 1;
    for (ptrdiff_t i = start_level; i != end_level; i += step) {
      if (measurements[i].cache_level_id) {
        // Already classified: this element's level becomes the "current"
        // level that following unclassified neighbors may join.
        current_cache_level_id = measurements[i].cache_level_id;
        continue;
      }
      if (current_cache_level_id) {
        // Test if the current measurement can join the current cache level
        float ratio = measurements[i].throughput / measurements[i - step].throughput;
        if (std::abs(std::log(ratio)) < threshold) {
          measurements[i].cache_level_id = current_cache_level_id;
        } else {
          current_cache_level_id = 0;
        }
      }
      if (!current_cache_level_id) {
        // Test if the current and next measurements are close enough to
        // create a new cache level for them
        if (!measurements[i + step].cache_level_id) {
          float ratio = measurements[i + step].throughput / measurements[i].throughput;
          if (std::abs(std::log(ratio)) < threshold)
          {
            current_cache_level_id = next_cache_level_id++;
            measurements[i].cache_level_id = current_cache_level_id;
          }
        }
      }
    }

    // Termination check: stop once no two ADJACENT measurements are both
    // unclassified (single isolated gaps are handled in step 2).
    bool finished = true;
    for (size_t i = 0; i < measurements.size() - 1; i++) {
      // Test if two consecutive measurements are still left without a cache_level
      if (!measurements[i].cache_level_id && !measurements[i+1].cache_level_id) {
        finished = false;
      }
    }

    if (finished) {
      break;
    }

    step = -step;
    threshold *= 1.2f; // The value of this multiplier doesn't matter much.
                       // We just increase the threshold little by little
  }                    // until only isolated 1-wide gaps remain.

  // Step 2: there may still remain isolated measurements either without a cache level,
  // or with a cache level different from either of their neighbors'.
  // We resolve those isolated measurements by letting them join the cache level
  // immediately above or below, whichever is closer.
  //
  // NOTE(review): might_have_isolated_level is cleared by ANY non-isolated
  // element late in a pass, so the outer while may exit after a pass that
  // still resolved earlier isolated elements; in practice those resolutions
  // were already applied via 'continue' before the flag was cleared — confirm
  // this is the intended convergence behavior.
  bool might_have_isolated_level = true;
  while (might_have_isolated_level) {
    for (size_t i = 0; i < measurements.size(); i++) {
      // "Isolated below/above" = differs from (or has no) neighbor on that
      // side; the boundary elements count as isolated on their outer side.
      bool isolated_below = i == 0 ||
        measurements[i].cache_level_id != measurements[i-1].cache_level_id;
      bool isolated_above = i == measurements.size() - 1 ||
        measurements[i].cache_level_id != measurements[i+1].cache_level_id;
      if (i == 0 && isolated_above) {
        // First element always joins its (only) neighbor's level.
        measurements[0].cache_level_id = measurements[1].cache_level_id;
        continue;
      }
      if (i == measurements.size() - 1 && isolated_below) {
        // Likewise for the last element.
        measurements[measurements.size() - 1].cache_level_id =
          measurements[measurements.size() - 2].cache_level_id;
        continue;
      }
      if (isolated_below && isolated_above) {
        // Interior isolated element: join whichever neighbor's throughput is
        // closer in log space.
        float distance_to_prev =
          std::abs(std::log(measurements[i-1].throughput / measurements[i].throughput));
        float distance_to_next =
          std::abs(std::log(measurements[i+1].throughput / measurements[i].throughput));
        if (distance_to_prev < distance_to_next) {
          measurements[i].cache_level_id = measurements[i-1].cache_level_id;
        } else {
          measurements[i].cache_level_id = measurements[i+1].cache_level_id;
        }
        continue;
      }
      might_have_isolated_level = false;
    }
  }
}
348
349
/* Takes a vector of measurement levels already partitioned into cache levels,
350
 * and outputs a plain vector of integers giving the size in bytes of each cache level.
351
 */
352
inline void get_cache_sizes_from_partitioned_measurements(
353
              std::vector<size_t>& cache_sizes,
354
              const std::vector<measurement_t>& measurements)
355
{
356
  eigen_assert(cache_sizes.empty());
357
358
  for (size_t i = 1; i < measurements.size() - 1; i++) {
359
    // should have generated cache_level_id's
360
    eigen_assert(measurements[i].cache_level_id);
361
362
    // pick biggest size in each cache level, and omit the last one, as it's
363
    // just RAM, not a real cache level.
364
    if (measurements[i].cache_level_id != measurements[i+1].cache_level_id &&
365
        measurements[i].cache_level_id != measurements.back().cache_level_id)
366
    {
367
      cache_sizes.push_back(measurements[i].size);
368
    }
369
  }
370
}
371
372
} // namespace internal
373
374
/**
375
 * Empirically determines the CPU's cache structure (number of levels and size of
376
 * each level) by running a benchmark. Takes roughly 1 second to complete.
377
 *
378
 * \param cache_sizes Output-parameter where to store the description of the
379
 *                     cache structure. This is a vector of size_t's, one
380
 *                     per cache level, giving its size in bytes.
381
 *
382
 * For example, on a Nexus 4 phone, the resulting cache_sizes vector is
383
 *   [16384, 1048576]  (size 2)
384
 * which means that there are 2 levels of cache: L1=16k, L2=1M.
385
 *
386
 * The sizes are approximate and may vary between runs. In particular,
387
 * only power-of-two sizes are ever returned.
388
 *
389
 * This function is only useful on systems where the cache structure cannot
390
 * be queried directly from the CPU. For example, userspace programs
391
 * on Android on ARM CPUs as of 2015.
392
 */
393
inline void measure_cache_sizes(std::vector<size_t>& cache_sizes)
394
{
395
  using namespace internal;
396
  std::vector<measurement_t> measurements;
397
  measure_throughput_for_various_sizes(measurements);
398
  partition_measurements_into_cache_levels(measurements);
399
  get_cache_sizes_from_partitioned_measurements(cache_sizes, measurements);
400
}
401
402
} // namespace Eigen
403
404
#endif // EIGEN_MEASURE_CACHE_SIZES_H

Return to bug 930