Line 0
Link Here
|
|
|
1 |
// This file is part of Eigen, a lightweight C++ template library |
2 |
// for linear algebra. |
3 |
// |
4 |
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com> |
5 |
// |
6 |
// This Source Code Form is subject to the terms of the Mozilla |
7 |
// Public License v. 2.0. If a copy of the MPL was not distributed |
8 |
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9 |
|
10 |
#ifndef EIGEN_MEASURE_CACHE_SIZES_H |
11 |
#define EIGEN_MEASURE_CACHE_SIZES_H |
12 |
|
13 |
#include <Eigen/Core> |
14 |
|
15 |
#include <ctime> |
16 |
#include <cassert> |
17 |
#include <cstdint> |
18 |
#include <cmath> |
19 |
#include <cstdlib> |
20 |
#include <cstring> |
21 |
#include <limits> |
22 |
#include <algorithm> |
23 |
#include <vector> |
24 |
|
25 |
#ifndef __linux__ |
26 |
#include <sys/time.h> |
27 |
#endif |
28 |
|
29 |
namespace Eigen { |
30 |
namespace internal { |
31 |
|
32 |
// Pull the size and fixed-width integer types into this namespace for brevity.
using std::size_t;
using std::ptrdiff_t;
using std::uint8_t;
using std::uint64_t;

// Byte-size units used throughout this file.
const size_t megabyte = 1 << 20;
const size_t kilobyte = 1 << 10;

// We must repeat memory accesses enough times to give
// the CPU a good chance to have our stuff in cache for
// most of the measurements, so that the effect of caches
// can be measured. Since our accesses are by large memcpy's,
// the number of times that each cache line is touched is
// roughly equal to the number of times that each byte is touched.
const int minimum_times_accessed_each_byte = 8;

// Even with clock_gettime, time measurement accuracy is often
// worse than 1e-5 s. We avoid measuring times shorter than 1e-2 s
// (i.e. 1e7 ns, the value below) so as to make this a non-issue.
// Increasing this minimum further significantly slows down the
// measurements.
const uint64_t minimum_measurement_time_in_ns = 1e7;

// We repeat all measurements a few times and keep the best throughput
// for each workload. This removes random noise and warm-up effects.
const int measurement_passes = 3;

// Minimum ratio of throughputs of different cache measurements.
// For example, 1.15 means that any difference smaller than 15%
// will be interpreted as measurement inaccuracy, not as
// indicating different cache levels.
const float never_separate_levels_closer_than = 1.15;

// We need to be able to distinguish the highest level of cache (e.g. L3)
// from RAM. To do so, we need to know the throughput of RAM, so we need
// to know a working set size that cannot plausibly fit in cache. That is
// implausibly_large_cache_size. It should be set to a value larger than
// any CPU's cache on a given architecture.
//
// Conversely implausibly_small_cache_size should be set to a size
// smaller than any CPU's L1 cache, in order for us to correctly detect
// the L1 cache.
//
// Keep in mind that we require each cache level to be
// reflected in at least two consecutive power-of-two measurements,
// so for instance if the L1 cache size is 4k, implausibly_small_cache_size
// must be set to at most 2k in order for us to detect it.
#if defined __aarch64__ || defined __arm__
const size_t implausibly_large_cache_size = 8 * megabyte;
const size_t implausibly_small_cache_size = 1 * kilobyte;
#elif defined __x86_64__ || defined __i386__
const size_t implausibly_large_cache_size = 64 * megabyte;
const size_t implausibly_small_cache_size = 4 * kilobyte;
#else
#error "What's your architecture?"
#endif

// To avoid alignment-related noise, especially with small working set sizes,
// we keep our buffers on an alignment much higher than any plausible cache
// line size. It doesn't hurt, then, to make this the typical value of
// page size on most systems.
const size_t alignment = 0x1000;
93 |
|
94 |
class cache_measurement_context_t |
95 |
{ |
96 |
size_t allocated_buffer_size; |
97 |
uint8_t* buffer1; |
98 |
uint8_t* buffer2; |
99 |
|
100 |
__attribute__((noinline)) |
101 |
static void consume(uint8_t&) |
102 |
{} |
103 |
|
104 |
static uint64_t get_raw_time_in_ns() |
105 |
{ |
106 |
#ifdef __linux__ |
107 |
timespec t; |
108 |
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t); |
109 |
return t.tv_nsec + 1e9 * t.tv_sec; |
110 |
#else |
111 |
timeval t; |
112 |
gettimeofday(&t, nullptr); |
113 |
return 1e3 * t.tv_usec + 1e9 * t.tv_sec; |
114 |
#endif |
115 |
} |
116 |
|
117 |
public: |
118 |
cache_measurement_context_t(size_t working_set_size) |
119 |
{ |
120 |
allocated_buffer_size = working_set_size / 2; |
121 |
bool alloc_failure = |
122 |
posix_memalign(reinterpret_cast<void**>(&buffer1), alignment, allocated_buffer_size) || |
123 |
posix_memalign(reinterpret_cast<void**>(&buffer2), alignment, allocated_buffer_size); |
124 |
eigen_assert(!alloc_failure); |
125 |
EIGEN_ONLY_USED_FOR_DEBUG(alloc_failure); |
126 |
|
127 |
// Initial warm up and ensure that the compiler can't optimize all |
128 |
// the workload away as depending on uninitialized values |
129 |
for (size_t i = 0; i < allocated_buffer_size; i++) { |
130 |
buffer1[i] = buffer2[i] = uint8_t(i); |
131 |
} |
132 |
} |
133 |
|
134 |
~cache_measurement_context_t() |
135 |
{ |
136 |
free(buffer1); |
137 |
free(buffer2); |
138 |
} |
139 |
|
140 |
float measure_throughput(size_t working_set_size) |
141 |
{ |
142 |
const size_t buffer_size = working_set_size / 2; |
143 |
eigen_assert(buffer_size <= allocated_buffer_size); |
144 |
|
145 |
size_t iters_at_a_time = 1; |
146 |
uint64_t bytes_copied = 0; |
147 |
const uint64_t minimum_bytes_copied = buffer_size * minimum_times_accessed_each_byte; |
148 |
uint64_t t1, t2; |
149 |
t1 = get_raw_time_in_ns(); |
150 |
while (true) { |
151 |
t2 = get_raw_time_in_ns(); |
152 |
uint64_t elapsed = t2 - t1; |
153 |
if (elapsed >= minimum_measurement_time_in_ns && bytes_copied >= minimum_bytes_copied) { |
154 |
consume(buffer1[buffer_size / 2]); |
155 |
consume(buffer2[buffer_size / 2]); |
156 |
return float(bytes_copied) / elapsed; // bytes per ns, equivalently GB/s |
157 |
} |
158 |
for (size_t i = 0; i < iters_at_a_time; i++) { |
159 |
// Inner benchmark loop. We use memcpy as it is probably well optimized |
160 |
// for optimal throughput. |
161 |
|
162 |
// We copy back and forth. This could be updated in the future |
163 |
// if we wanted to test more specific cache behavior. |
164 |
uint8_t* bufferA = (i & 1) ? buffer1 : buffer2; |
165 |
uint8_t* bufferB = (i & 1) ? buffer2 : buffer1; |
166 |
|
167 |
// We apply a variable offset to make it near-impossible for the compiler |
168 |
// to outsmart us and optimize away our memcpy calls. |
169 |
const size_t offset = 16 * (i & 15); |
170 |
const size_t copy_size = buffer_size - offset; |
171 |
|
172 |
memcpy(bufferB, bufferA + offset, copy_size); |
173 |
bytes_copied += copy_size; |
174 |
} |
175 |
eigen_assert(iters_at_a_time <= std::numeric_limits<size_t>::max() / 2); |
176 |
iters_at_a_time *= 2; |
177 |
} |
178 |
} |
179 |
}; |
180 |
|
181 |
/*
 * Holds the outcome of one throughput experiment: the sustained memory
 * throughput observed when copying within a working set of a given size.
 */
struct measurement_t
{
  // Working set size, in bytes, that this experiment used.
  size_t size = 0;

  // Best sustained memory throughput measured for this size, in GB/s.
  float throughput = 0.0f;

  // Label of the cache level this size was grouped into. Stays 0 until
  // the partitioning step runs. Labels are arbitrary identifiers: they
  // are not ordered and do not numerically match cache levels. Sizes
  // too large for any cache also receive a label, standing for RAM.
  int cache_level_id = 0;

  measurement_t(size_t s = 0) : size(s) {}
};
207 |
|
208 |
/* Measures memory throughput for various buffer sizes. |
209 |
* This gathers all the experimental data that will be needed to |
210 |
* determine the cache structure. |
211 |
*/ |
212 |
inline void measure_throughput_for_various_sizes( |
213 |
std::vector<measurement_t>& measurements) |
214 |
{ |
215 |
eigen_assert(measurements.empty()); |
216 |
cache_measurement_context_t cache_measurement_context(implausibly_large_cache_size); |
217 |
|
218 |
for (size_t size = implausibly_small_cache_size; |
219 |
size <= implausibly_large_cache_size; |
220 |
size *= 2) |
221 |
{ |
222 |
measurements.push_back(measurement_t(size)); |
223 |
} |
224 |
|
225 |
for (size_t pass = 1; pass <= measurement_passes; pass++) { |
226 |
for (ptrdiff_t i = measurements.size() - 1; i >= 0; i--) { |
227 |
measurements[i].throughput = std::max( |
228 |
measurements[i].throughput, |
229 |
cache_measurement_context.measure_throughput(measurements[i].size)); |
230 |
if (size_t(i) < measurements.size() - 1) { |
231 |
measurements[i].throughput = std::max( |
232 |
measurements[i].throughput, |
233 |
measurements[i+1].throughput); |
234 |
} |
235 |
} |
236 |
} |
237 |
} |
238 |
|
239 |
/* Partitions a vector of measurements into "cache levels", i.e.
 * groups of consecutive measurements having approximately the same
 * throughput. The partitioning is written into the cache_level_id
 * fields of the measurements; every measurement has a nonzero
 * cache_level_id on return.
 */
inline void partition_measurements_into_cache_levels(
  std::vector<measurement_t>& measurements)
{
  eigen_assert(!measurements.empty());

  // The level id being propagated along the current traversal;
  // 0 means "not inside any level".
  int current_cache_level_id = 0;
  // The next fresh id to hand out when a new level is discovered.
  int next_cache_level_id = 1;

  // Seed the two extremes: the smallest size is assumed to be inside the
  // L1 cache and the largest to be plain RAM, so each anchors its own level.
  measurements.front().cache_level_id = next_cache_level_id++;
  measurements.back().cache_level_id = next_cache_level_id++;

  // Throughputs are compared on a log scale: |log(a/b)| < threshold means
  // "a and b are close enough to belong to the same level".
  float threshold = std::log(never_separate_levels_closer_than);

  // Traversal direction: +1 = forward (small to large), -1 = backward.
  ptrdiff_t step = 1;

  // Step 1: we traverse the measurements back and forth, in the direction
  // of 'step'. In each traversal, we try to identify groups of
  // at least two consecutive measurements with approximately the same
  // throughput, as determined by the current value of 'threshold'. Such groups
  // are called cache levels, and are recorded in the 'cache_level_id' field
  // of each measurement. We continue until no two consecutive measurements
  // are left without a cache level id.
  while(true) {
    // Note: the last index in the traversal direction is never visited;
    // both extremes already carry an id from the seeding above.
    ptrdiff_t start_level = step == 1 ? 0 : measurements.size() - 1;
    ptrdiff_t end_level = step == -1 ? 0 : measurements.size() - 1;
    for (ptrdiff_t i = start_level; i != end_level; i += step) {
      if (measurements[i].cache_level_id) {
        // Already labeled: continue propagating this level.
        current_cache_level_id = measurements[i].cache_level_id;
        continue;
      }
      if (current_cache_level_id) {
        // Test if the current measurement can join the current cache level
        float ratio = measurements[i].throughput / measurements[i - step].throughput;
        if (std::abs(std::log(ratio)) < threshold) {
          measurements[i].cache_level_id = current_cache_level_id;
        } else {
          // Too different: the current level ends here.
          current_cache_level_id = 0;
        }
      }
      if (!current_cache_level_id) {
        // Test if the current and next measurements are close enough to
        // create a new cache level for them
        if (!measurements[i + step].cache_level_id) {
          float ratio = measurements[i + step].throughput / measurements[i].throughput;
          if (std::abs(std::log(ratio)) < threshold)
          {
            current_cache_level_id = next_cache_level_id++;
            measurements[i].cache_level_id = current_cache_level_id;
          }
        }
      }
    }

    bool finished = true;
    for (size_t i = 0; i < measurements.size() - 1; i++) {
      // Test if two consecutive measurements are still left without a cache_level
      if (!measurements[i].cache_level_id && !measurements[i+1].cache_level_id) {
        finished = false;
      }
    }

    if (finished) {
      break;
    }

    // Flip direction and loosen the closeness criterion for the next pass.
    step = -step;
    threshold *= 1.2f; // The value of this multiplier doesn't matter much.
                       // We just increase the threshold little by little
  }                    // until only isolated 1-wide gaps remain.

  // Step 2: there may still remain isolated measurements either without a cache level,
  // or with a cache level different from either of their neighbors'.
  // We resolve those isolated measurements by letting them join the cache level
  // immediately above or below, whichever is closer (on the log scale).
  bool might_have_isolated_level = true;
  while (might_have_isolated_level) {
    for (size_t i = 0; i < measurements.size(); i++) {
      // "Isolated" means not sharing a level with the neighbor on that side.
      bool isolated_below = i == 0 ||
        measurements[i].cache_level_id != measurements[i-1].cache_level_id;
      bool isolated_above = i == measurements.size() - 1 ||
        measurements[i].cache_level_id != measurements[i+1].cache_level_id;
      if (i == 0 && isolated_above) {
        // The first measurement has only one neighbor to join.
        measurements[0].cache_level_id = measurements[1].cache_level_id;
        continue;
      }
      if (i == measurements.size() - 1 && isolated_below) {
        // Likewise for the last measurement.
        measurements[measurements.size() - 1].cache_level_id =
          measurements[measurements.size() - 2].cache_level_id;
        continue;
      }
      if (isolated_below && isolated_above) {
        // Interior isolated point: join whichever neighbor's throughput
        // is closer on the log scale.
        float distance_to_prev =
          std::abs(std::log(measurements[i-1].throughput / measurements[i].throughput));
        float distance_to_next =
          std::abs(std::log(measurements[i+1].throughput / measurements[i].throughput));
        if (distance_to_prev < distance_to_next) {
          measurements[i].cache_level_id = measurements[i-1].cache_level_id;
        } else {
          measurements[i].cache_level_id = measurements[i+1].cache_level_id;
        }
        continue;
      }
      // Reached only if this index required no fix-up; a 'continue' above
      // keeps the flag set so another full pass re-checks everything.
      might_have_isolated_level = false;
    }
  }
}
349 |
|
350 |
/* Takes a vector of measurement levels already partitioned into cache levels, |
351 |
* and outputs a plain vector of integers giving the size in bytes of each cache level. |
352 |
*/ |
353 |
inline void get_cache_sizes_from_partitioned_measurements( |
354 |
std::vector<size_t>& cache_sizes, |
355 |
const std::vector<measurement_t>& measurements) |
356 |
{ |
357 |
eigen_assert(cache_sizes.empty()); |
358 |
|
359 |
for (size_t i = 1; i < measurements.size() - 1; i++) { |
360 |
// should have generated cache_level_id's |
361 |
eigen_assert(measurements[i].cache_level_id); |
362 |
|
363 |
// pick biggest size in each cache level, and omit the last one, as it's |
364 |
// just RAM, not a real cache level. |
365 |
if (measurements[i].cache_level_id != measurements[i+1].cache_level_id && |
366 |
measurements[i].cache_level_id != measurements.back().cache_level_id) |
367 |
{ |
368 |
cache_sizes.push_back(measurements[i].size); |
369 |
} |
370 |
} |
371 |
} |
372 |
|
373 |
} // namespace internal |
374 |
|
375 |
/** |
376 |
* Empirically determines the CPU's cache structure (number of levels and size of |
377 |
* each level) by running a benchmark. Takes roughly 1 second to complete. |
378 |
* |
379 |
* \param cache_sizes Output-parameter where to store the description of the |
380 |
* cache structure. This is a vector of size_t's, one |
381 |
* per cache level, giving its size in bytes. |
382 |
* |
383 |
* For example, on a Nexus 4 phone, the resulting cache_sizes vector is |
384 |
* [16384, 1048576] (size 2) |
385 |
* which means that there are 2 levels of cache: L1=16k, L2=1M. |
386 |
* |
387 |
* The sizes are approximate and may vary between runs. In particular, |
388 |
* only power-of-two sizes are ever returned. |
389 |
* |
390 |
* This function is only useful on systems where the cache structure cannot |
391 |
* be queried directly from the CPU. For example, userspace programs |
392 |
* on Android on ARM CPUs as of 2015. |
393 |
*/ |
394 |
inline void measure_cache_sizes(std::vector<size_t>& cache_sizes) |
395 |
{ |
396 |
using namespace internal; |
397 |
std::vector<measurement_t> measurements; |
398 |
measure_throughput_for_various_sizes(measurements); |
399 |
partition_measurements_into_cache_levels(measurements); |
400 |
get_cache_sizes_from_partitioned_measurements(cache_sizes, measurements); |
401 |
} |
402 |
|
403 |
} // namespace Eigen |
404 |
|
405 |
#endif // EIGEN_MEASURE_CACHE_SIZES_H |