(-)a/unsupported/Eigen/CMakeLists.txt (+1 lines)
Lines 4-11: set(Eigen_HEADERS AdolcForward BVH Itera
     )
 
 install(FILES
   ${Eigen_HEADERS}
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
   )
 
 add_subdirectory(src)
+add_subdirectory(CXX11)
(-)a/unsupported/Eigen/CXX11/CMakeLists.txt (+9 lines)
set(Eigen_HEADERS Tensor TensorSymmetry Core MeasureCacheSizes
   )

install(FILES
  ${Eigen_HEADERS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
  )

add_subdirectory(src)
(-)a/unsupported/Eigen/CXX11/MeasureCacheSizes (+28 lines)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_MEASURE_CACHE_SIZES_MODULE
#define EIGEN_MEASURE_CACHE_SIZES_MODULE

/** \defgroup MeasureCacheSizes_Module MeasureCacheSizes module
  *
  * Offers a simple benchmark, measure_cache_sizes(), that empirically
  * measures the CPU's cache sizes.
  *
  * This is useful as a fallback in situations where this information
  * cannot be queried directly. The measured sizes can then be passed
  * to Eigen to tune its algorithms.
  *
  * \code
  * #include <unsupported/Eigen/CXX11/MeasureCacheSizes>
  * \endcode
  */

#include "src/MeasureCacheSizes/MeasureCacheSizes.h"

#endif // EIGEN_MEASURE_CACHE_SIZES_MODULE
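As a quick illustration for reviewers, here is a minimal, hypothetical driver (not part of this patch) showing how the new entry point is meant to be called; the file name and output format are illustrative only:

    // try_measure_cache_sizes.cpp - illustrative, not part of the patch
    #include <unsupported/Eigen/CXX11/MeasureCacheSizes>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main()
    {
      std::vector<std::size_t> cache_sizes;
      Eigen::measure_cache_sizes(cache_sizes); // runs roughly 1 s of benchmarks
      for (std::size_t i = 0; i < cache_sizes.size(); i++) {
        std::printf("L%d cache: %zu bytes\n", int(i + 1), cache_sizes[i]);
      }
      return 0;
    }

On the Nexus 4 example from the API documentation below, this would print two lines: 16384 and 1048576 bytes.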
(-)a/unsupported/Eigen/CXX11/src/CMakeLists.txt (+4 lines)
#ADD_SUBDIRECTORY(Tensor)
#ADD_SUBDIRECTORY(TensorSymmetry)
#ADD_SUBDIRECTORY(Core)
ADD_SUBDIRECTORY(MeasureCacheSizes)
(-)a/unsupported/Eigen/CXX11/src/MeasureCacheSizes/CMakeLists.txt (+6 lines)
FILE(GLOB Eigen_MeasureCacheSizes_SRCS "*.h")

INSTALL(FILES
  ${Eigen_MeasureCacheSizes_SRCS}
  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/MeasureCacheSizes COMPONENT Devel
  )
(-)a/unsupported/Eigen/CXX11/src/MeasureCacheSizes/MeasureCacheSizes.h (+405 lines)
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_MEASURE_CACHE_SIZES_H
#define EIGEN_MEASURE_CACHE_SIZES_H

#include <Eigen/Core>

#include <ctime>
#include <cassert>
#include <cstdint>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <algorithm>
#include <vector>

#ifndef __linux__
#include <sys/time.h>
#endif

namespace Eigen {
namespace internal {

using std::size_t;
using std::ptrdiff_t;
using std::uint8_t;
using std::uint64_t;

const size_t megabyte = 1 << 20;
const size_t kilobyte = 1 << 10;

// We must repeat memory accesses enough times to give
// the CPU a good chance to have our stuff in cache for
// most of the measurements, so that the effect of caches
// can be measured. Since our accesses are by large memcpy's,
// the number of times that each cache line is touched is
// roughly equal to the number of times that each byte is touched.
const int minimum_times_accessed_each_byte = 8;

// Even with clock_gettime, time measurement accuracy is often
// worse than 1e-5 s. We avoid measuring times shorter than 1e-2 s
// so as to make this a non-issue. Increasing this minimum further
// significantly slows down the measurements.
const uint64_t minimum_measurement_time_in_ns = 1e7;

// We repeat all measurements a few times and keep the best throughput
// for each workload. This removes random noise and warm-up effects.
const int measurement_passes = 3;

// Minimum ratio between the throughputs of distinct cache levels.
// For example, 1.15 means that any difference smaller than 15%
// will be interpreted as measurement inaccuracy, not as
// indicating different cache levels.
const float never_separate_levels_closer_than = 1.15f;
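
// Illustrative arithmetic (added note, not from the original patch):
// the partitioning below compares |log(t1/t2)| against log(1.15) ~= 0.14.
// E.g. throughputs of 20 GB/s and 22 GB/s give |log(22/20)| ~= 0.095 < 0.14,
// so they are merged into one level, while 20 GB/s vs 40 GB/s gives
// |log(2)| ~= 0.69, which clearly separates two levels.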

// We need to be able to distinguish the highest level of cache (e.g. L3)
// from RAM. To do so, we need to know the throughput of RAM, so we need
// to know a working set size that cannot plausibly fit in cache. That is
// implausibly_large_cache_size. It should be set to a value larger than
// any CPU's cache on a given architecture.
//
// Conversely, implausibly_small_cache_size should be set to a size
// smaller than any CPU's L1 cache, in order for us to correctly detect
// the L1 cache.
//
// Keep in mind that we require each cache level to be
// reflected in at least two consecutive power-of-two measurements,
// so for instance if the L1 cache size is 4k, implausibly_small_cache_size
// must be set to at most 2k in order for us to detect it.
#if defined __aarch64__ || defined __arm__
  const size_t implausibly_large_cache_size = 8 * megabyte;
  const size_t implausibly_small_cache_size = 1 * kilobyte;
#elif defined __x86_64__ || defined __i386__
  const size_t implausibly_large_cache_size = 64 * megabyte;
  const size_t implausibly_small_cache_size = 4 * kilobyte;
#else
#error "What's your architecture?"
#endif

// To avoid alignment-related noise, especially with small working set sizes,
// we keep our buffers on an alignment much higher than any plausible cache
// line size. It doesn't hurt, then, to make this the typical value of
// page size on most systems.
const size_t alignment = 0x1000;

class cache_measurement_context_t
{
  size_t allocated_buffer_size;
  uint8_t* buffer1;
  uint8_t* buffer2;

  // Opaque sink: prevents the compiler from optimizing away the copies
  // whose results we "consume" here.
  __attribute__((noinline))
  static void consume(uint8_t&)
  {}

  static uint64_t get_raw_time_in_ns()
  {
  #ifdef __linux__
    timespec t;
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t);
    return t.tv_nsec + 1e9 * t.tv_sec;
  #else
    timeval t;
    gettimeofday(&t, nullptr);
    return 1e3 * t.tv_usec + 1e9 * t.tv_sec;
  #endif
  }

public:
  cache_measurement_context_t(size_t working_set_size)
  {
    // The working set is split across two buffers that we copy between.
    allocated_buffer_size = working_set_size / 2;
    bool alloc_failure =
      posix_memalign(reinterpret_cast<void**>(&buffer1), alignment, allocated_buffer_size) ||
      posix_memalign(reinterpret_cast<void**>(&buffer2), alignment, allocated_buffer_size);
    eigen_assert(!alloc_failure);
    EIGEN_ONLY_USED_FOR_DEBUG(alloc_failure);

    // Initial warm-up; also ensures that the compiler can't optimize all
    // the workload away as depending on uninitialized values.
    for (size_t i = 0; i < allocated_buffer_size; i++) {
      buffer1[i] = buffer2[i] = uint8_t(i);
    }
  }

  ~cache_measurement_context_t()
  {
    free(buffer1);
    free(buffer2);
  }

  float measure_throughput(size_t working_set_size)
  {
    const size_t buffer_size = working_set_size / 2;
    eigen_assert(buffer_size <= allocated_buffer_size);

    size_t iters_at_a_time = 1;
    uint64_t bytes_copied = 0;
    const uint64_t minimum_bytes_copied = buffer_size * minimum_times_accessed_each_byte;
    uint64_t t1, t2;
    t1 = get_raw_time_in_ns();
    while (true) {
      t2 = get_raw_time_in_ns();
      uint64_t elapsed = t2 - t1;
      if (elapsed >= minimum_measurement_time_in_ns && bytes_copied >= minimum_bytes_copied) {
        consume(buffer1[buffer_size / 2]);
        consume(buffer2[buffer_size / 2]);
        return float(bytes_copied) / elapsed; // bytes per ns, equivalently GB/s
      }
      for (size_t i = 0; i < iters_at_a_time; i++) {
        // Inner benchmark loop. We use memcpy as it is typically well
        // optimized for maximal throughput.

        // We copy back and forth. This could be updated in the future
        // if we wanted to test more specific cache behavior.
        uint8_t* bufferA = (i & 1) ? buffer1 : buffer2;
        uint8_t* bufferB = (i & 1) ? buffer2 : buffer1;

        // We apply a variable offset to make it near-impossible for the compiler
        // to outsmart us and optimize away our memcpy calls.
        const size_t offset = 16 * (i & 15);
        const size_t copy_size = buffer_size - offset;

        memcpy(bufferB, bufferA + offset, copy_size);
        bytes_copied += copy_size;
      }
      eigen_assert(iters_at_a_time <= std::numeric_limits<size_t>::max() / 2);
      iters_at_a_time *= 2;
    }
  }
};
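
// Illustrative usage of the class above (added note, not from the original
// patch; the throughput numbers are made up, x86-64 sizes assumed):
//
//   cache_measurement_context_t ctx(implausibly_large_cache_size);
//   float gbps_small = ctx.measure_throughput(32 * kilobyte); // e.g. ~40 GB/s (fits in L1)
//   float gbps_large = ctx.measure_throughput(32 * megabyte); // e.g. ~5 GB/s (RAM-bound)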

/*
 * A measurement_t is the result of a single experimental measurement of
 * sustained memory throughput for a buffer of a given size.
 */
struct measurement_t
{
  // The size of the memory buffer that this measurement used
  size_t size;

  // The sustained memory throughput that we measured, in GB/s
  float throughput;

  // Identifier of the cache level that this buffer size fits in.
  // The initial value 0 means that this has not yet been determined.
  // Identifiers are not ordered, and in particular are not
  // numerically equal to cache levels. Also, buffers too large to
  // fit in any cache still get an identifier, which corresponds to
  // RAM.
  int cache_level_id;

  measurement_t(size_t s = 0)
    : size(s)
    , throughput(0.0f)
    , cache_level_id(0)
  {}
};

/* Measures memory throughput for various buffer sizes.
 * This gathers all the experimental data that will be needed to
 * determine the cache structure.
 */
inline void measure_throughput_for_various_sizes(
              std::vector<measurement_t>& measurements)
{
  eigen_assert(measurements.empty());
  cache_measurement_context_t cache_measurement_context(implausibly_large_cache_size);

  for (size_t size = implausibly_small_cache_size;
       size <= implausibly_large_cache_size;
       size *= 2)
  {
    measurements.push_back(measurement_t(size));
  }

  for (int pass = 1; pass <= measurement_passes; pass++) {
    for (ptrdiff_t i = measurements.size() - 1; i >= 0; i--) {
      measurements[i].throughput = std::max(
        measurements[i].throughput,
        cache_measurement_context.measure_throughput(measurements[i].size));
      // Enforce that throughput is non-increasing as the working set grows:
      // a smaller working set should never be recorded as slower than a
      // larger one, so propagate the next (larger) size's throughput up.
      if (size_t(i) < measurements.size() - 1) {
        measurements[i].throughput = std::max(
          measurements[i].throughput,
          measurements[i+1].throughput);
      }
    }
  }
}
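
// Worked example (added note, not from the original patch): on x86-64,
// the loop above produces one measurement per power-of-two size from
// 4 KB up to 64 MB, i.e. 4k, 8k, 16k, ..., 32M, 64M -- 15 measurements
// in total, each holding the best throughput seen over the 3 passes.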

/* Partitions a vector of measurements into "cache levels", i.e.
 * groups having approximately the same throughput. The partitioning is
 * written into the cache_level_id fields of the measurements.
 */
inline void partition_measurements_into_cache_levels(
              std::vector<measurement_t>& measurements)
{
  eigen_assert(!measurements.empty());

  int current_cache_level_id = 0;
  int next_cache_level_id = 1;

  measurements.front().cache_level_id = next_cache_level_id++;
  measurements.back().cache_level_id = next_cache_level_id++;

  float threshold = std::log(never_separate_levels_closer_than);

  ptrdiff_t step = 1;

  // Step 1: we traverse the measurements back and forth, in the direction
  // of 'step'. In each traversal, we try to identify groups of
  // at least two consecutive measurements with approximately the same
  // throughput, as determined by the current value of 'threshold'. Such groups
  // are called cache levels, and are recorded in the 'cache_level_id' field
  // of each measurement. We continue until no two consecutive measurements
  // are left without a cache level.
  while(true) {
    ptrdiff_t start_level = step == 1 ? 0 : measurements.size() - 1;
    ptrdiff_t end_level = step == -1 ? 0 : measurements.size() - 1;
    for (ptrdiff_t i = start_level; i != end_level; i += step) {
      if (measurements[i].cache_level_id) {
        current_cache_level_id = measurements[i].cache_level_id;
        continue;
      }
      if (current_cache_level_id) {
        // Test if the current measurement can join the current cache level
        float ratio = measurements[i].throughput / measurements[i - step].throughput;
        if (std::abs(std::log(ratio)) < threshold) {
          measurements[i].cache_level_id = current_cache_level_id;
        } else {
          current_cache_level_id = 0;
        }
      }
      if (!current_cache_level_id) {
        // Test if the current and next measurements are close enough to
        // create a new cache level for them
        if (!measurements[i + step].cache_level_id) {
          float ratio = measurements[i + step].throughput / measurements[i].throughput;
          if (std::abs(std::log(ratio)) < threshold)
          {
            current_cache_level_id = next_cache_level_id++;
            measurements[i].cache_level_id = current_cache_level_id;
          }
        }
      }
    }

    bool finished = true;
    for (size_t i = 0; i < measurements.size() - 1; i++) {
      // Test if two consecutive measurements are still left without a cache level
      if (!measurements[i].cache_level_id && !measurements[i+1].cache_level_id) {
        finished = false;
      }
    }

    if (finished) {
      break;
    }

    step = -step;
    threshold *= 1.2f; // The value of this multiplier doesn't matter much.
                       // We just increase the threshold little by little
  }                    // until only isolated 1-wide gaps remain.

  // Step 2: there may still remain isolated measurements either without a cache level,
  // or with a cache level different from either of their neighbors'.
  // We resolve those isolated measurements by letting them join the cache level
  // immediately above or below, whichever is closer in throughput.
  bool might_have_isolated_level = true;
  while (might_have_isolated_level) {
    for (size_t i = 0; i < measurements.size(); i++) {
      bool isolated_below = i == 0 ||
        measurements[i].cache_level_id != measurements[i-1].cache_level_id;
      bool isolated_above = i == measurements.size() - 1 ||
        measurements[i].cache_level_id != measurements[i+1].cache_level_id;
      if (i == 0 && isolated_above) {
        measurements[0].cache_level_id = measurements[1].cache_level_id;
        continue;
      }
      if (i == measurements.size() - 1 && isolated_below) {
        measurements[measurements.size() - 1].cache_level_id =
          measurements[measurements.size() - 2].cache_level_id;
        continue;
      }
      if (isolated_below && isolated_above) {
        float distance_to_prev =
          std::abs(std::log(measurements[i-1].throughput / measurements[i].throughput));
        float distance_to_next =
          std::abs(std::log(measurements[i+1].throughput / measurements[i].throughput));
        if (distance_to_prev < distance_to_next) {
          measurements[i].cache_level_id = measurements[i-1].cache_level_id;
        } else {
          measurements[i].cache_level_id = measurements[i+1].cache_level_id;
        }
        continue;
      }
      might_have_isolated_level = false;
    }
  }
}
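
// Worked example (added note, not from the original patch; throughputs are
// made up). Suppose eight measurements have throughputs, in GB/s:
//   [40, 40, 38, 20, 19, 19, 6, 6]
// Step 1 groups {40, 40, 38} (L1), {20, 19, 19} (L2) and {6, 6} (RAM):
// consecutive ratios within each group (e.g. 40/38 ~= 1.05) stay below
// 1.15, while the 38->20 and 19->6 jumps far exceed it.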

/* Takes a vector of measurements already partitioned into cache levels,
 * and outputs a plain vector of integers giving the size in bytes of each cache level.
 */
inline void get_cache_sizes_from_partitioned_measurements(
              std::vector<size_t>& cache_sizes,
              const std::vector<measurement_t>& measurements)
{
  eigen_assert(cache_sizes.empty());

  for (size_t i = 1; i < measurements.size() - 1; i++) {
    // the partitioning should have generated cache_level_id's
    eigen_assert(measurements[i].cache_level_id);

    // Pick the biggest size in each cache level, and omit the last level,
    // as it's just RAM, not a real cache level.
    if (measurements[i].cache_level_id != measurements[i+1].cache_level_id &&
        measurements[i].cache_level_id != measurements.back().cache_level_id)
    {
      cache_sizes.push_back(measurements[i].size);
    }
  }
}

} // namespace internal

/**
 * Empirically determines the CPU's cache structure (number of levels and size of
 * each level) by running a benchmark. Takes roughly 1 second to complete.
 *
 * \param cache_sizes Output-parameter where to store the description of the
 *                    cache structure. This is a vector of size_t's, one
 *                    per cache level, giving its size in bytes. It must be
 *                    empty on entry.
 *
 * For example, on a Nexus 4 phone, the resulting cache_sizes vector is
 *   [16384, 1048576]  (size 2)
 * which means that there are 2 levels of cache: L1=16k, L2=1M.
 *
 * The sizes are approximate and may vary between runs. In particular,
 * only power-of-two sizes are ever returned.
 *
 * This function is only useful on systems where the cache structure cannot
 * be queried directly from the CPU. For example, userspace programs
 * on Android on ARM CPUs as of 2015.
 */
inline void measure_cache_sizes(std::vector<size_t>& cache_sizes)
{
  using namespace internal;
  std::vector<measurement_t> measurements;
  measure_throughput_for_various_sizes(measurements);
  partition_measurements_into_cache_levels(measurements);
  get_cache_sizes_from_partitioned_measurements(cache_sizes, measurements);
}

} // namespace Eigen

#endif // EIGEN_MEASURE_CACHE_SIZES_H
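
The module's documentation says the measured sizes "can then be passed to Eigen to tune its algorithms". As a sketch of what that hookup could look like, assuming the Eigen::setCpuCacheSizes(l1, l2) entry point from Eigen 3.2-era headers (this API changed in later versions, so treat the call as illustrative, not as part of this patch):

    #include <unsupported/Eigen/CXX11/MeasureCacheSizes>
    #include <Eigen/Core>
    #include <cstddef>
    #include <vector>

    void tune_eigen_block_sizes()
    {
      std::vector<std::size_t> cache_sizes;
      Eigen::measure_cache_sizes(cache_sizes);
      // Only feed the results to Eigen if we found at least L1 and L2.
      if (cache_sizes.size() >= 2) {
        Eigen::setCpuCacheSizes(std::ptrdiff_t(cache_sizes[0]),
                                std::ptrdiff_t(cache_sizes[1]));
      }
    }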
