Lines 1-23
Link Here
|
1 |
// This file is part of Eigen, a lightweight C++ template library |
1 |
// This file is part of Eigen, a lightweight C++ template library |
2 |
// for linear algebra. |
2 |
// for linear algebra. |
3 |
// |
3 |
// |
4 |
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> |
4 |
// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr> |
5 |
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com> |
5 |
// Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com> |
6 |
// |
6 |
// |
7 |
// This Source Code Form is subject to the terms of the Mozilla |
7 |
// This Source Code Form is subject to the terms of the Mozilla |
8 |
// Public License v. 2.0. If a copy of the MPL was not distributed |
8 |
// Public License v. 2.0. If a copy of the MPL was not distributed |
9 |
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9 |
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10 |
|
10 |
|
11 |
#ifndef EIGEN_CACHE_SIZES_H |
11 |
#ifndef EIGEN_CACHE_SIZES_H |
12 |
#define EIGEN_CACHE_SIZES_H |
12 |
#define EIGEN_CACHE_SIZES_H |
13 |
|
13 |
|
14 |
namespace Eigen { |
14 |
namespace Eigen { |
15 |
|
15 |
|
|
|
16 |
#if !defined(EIGEN_NO_CPUID)
#  if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
     // GNU-style inline assembly. EIGEN_CPUID(abcd,func,id) executes cpuid with
     // EAX=func and ECX=id and stores EAX, EBX, ECX, EDX into abcd[0..3].
#    if defined(__PIC__) && EIGEN_ARCH_i386
       // 32-bit x86 built as position-independent code: EBX is not named as an
       // output; it is exchanged with a scratch register around cpuid instead
       // (presumably because EBX is reserved in PIC builds).
#      define EIGEN_CPUID(abcd,func,id) \
         __asm__ __volatile__ ("xchgl %%ebx, %k1;cpuid; xchgl %%ebx,%k1" \
                               : "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) \
                               : "a" (func), "c" (id));
#    elif defined(__PIC__) && EIGEN_ARCH_x86_64
       // 64-bit x86 built as position-independent code. In theory this only matters
       // with recent gcc and the medium or large code model, not the default small
       // one. The code model cannot be detected, and the cost of the two xchg
       // instructions is negligible, so RBX is always preserved here.
#      define EIGEN_CPUID(abcd,func,id) \
         __asm__ __volatile__ ("xchg{q}\t{%%}rbx, %q1; cpuid; xchg{q}\t{%%}rbx, %q1" \
                               : "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) \
                               : "0" (func), "2" (id));
#    else
       // x86 or x86_64 without PIC: cpuid may write EBX directly.
#      define EIGEN_CPUID(abcd,func,id) \
         __asm__ __volatile__ ("cpuid" \
                               : "=a" (abcd[0]), "=b" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) \
                               : "0" (func), "2" (id) );
#    endif
#  elif EIGEN_COMP_MSVC
     // MSVC: rely on the __cpuidex intrinsic, only enabled for EIGEN_COMP_MSVC > 1500.
#    if (EIGEN_COMP_MSVC > 1500) && EIGEN_ARCH_i386_OR_x86_64
#      define EIGEN_CPUID(abcd,func,id) __cpuidex((int*)abcd,func,id)
#    endif
#  endif
#endif
38 |
|
39 |
namespace internal { |
40 |
|
41 |
#ifdef EIGEN_CPUID |
42 |
|
43 |
/** \internal
  * Tells whether a CPUID register dump carries a given vendor signature.
  *
  * \param abcd   the four registers {EAX, EBX, ECX, EDX} as filled in by EIGEN_CPUID;
  *               only read, never modified
  * \param vendor the vendor signature split into three 32-bit words
  * \returns true if EBX, EDX and ECX (in this order) equal vendor[0], vendor[1]
  *          and vendor[2] respectively
  */
inline bool cpuid_is_vendor(const int abcd[4], const int vendor[3])
{
  // The signature is compared in EBX, EDX, ECX order, hence the 1, 3, 2 indexing.
  return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2];
}
47 |
|
48 |
inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3) |
49 |
{ |
50 |
int abcd[4]; |
51 |
l1 = l2 = l3 = 0; |
52 |
int cache_id = 0; |
53 |
int cache_type = 0; |
54 |
do { |
55 |
abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; |
56 |
EIGEN_CPUID(abcd,0x4,cache_id); |
57 |
cache_type = (abcd[0] & 0x0F) >> 0; |
58 |
if(cache_type==1||cache_type==3) // data or unified cache |
59 |
{ |
60 |
int cache_level = (abcd[0] & 0xE0) >> 5; // A[7:5] |
61 |
int ways = (abcd[1] & 0xFFC00000) >> 22; // B[31:22] |
62 |
int partitions = (abcd[1] & 0x003FF000) >> 12; // B[21:12] |
63 |
int line_size = (abcd[1] & 0x00000FFF) >> 0; // B[11:0] |
64 |
int sets = (abcd[2]); // C[31:0] |
65 |
|
66 |
int cache_size = (ways+1) * (partitions+1) * (line_size+1) * (sets+1); |
67 |
|
68 |
switch(cache_level) |
69 |
{ |
70 |
case 1: l1 = cache_size; break; |
71 |
case 2: l2 = cache_size; break; |
72 |
case 3: l3 = cache_size; break; |
73 |
default: break; |
74 |
} |
75 |
} |
76 |
cache_id++; |
77 |
} while(cache_type>0 && cache_id<16); |
78 |
} |
79 |
|
80 |
inline void queryCacheSizes_intel_codes(int& l1, int& l2, int& l3) |
81 |
{ |
82 |
int abcd[4]; |
83 |
abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; |
84 |
l1 = l2 = l3 = 0; |
85 |
EIGEN_CPUID(abcd,0x00000002,0); |
86 |
unsigned char * bytes = reinterpret_cast<unsigned char *>(abcd)+2; |
87 |
bool check_for_p2_core2 = false; |
88 |
for(int i=0; i<14; ++i) |
89 |
{ |
90 |
switch(bytes[i]) |
91 |
{ |
92 |
case 0x0A: l1 = 8; break; // 0Ah data L1 cache, 8 KB, 2 ways, 32 byte lines |
93 |
case 0x0C: l1 = 16; break; // 0Ch data L1 cache, 16 KB, 4 ways, 32 byte lines |
94 |
case 0x0E: l1 = 24; break; // 0Eh data L1 cache, 24 KB, 6 ways, 64 byte lines |
95 |
case 0x10: l1 = 16; break; // 10h data L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64) |
96 |
case 0x15: l1 = 16; break; // 15h code L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64) |
97 |
case 0x2C: l1 = 32; break; // 2Ch data L1 cache, 32 KB, 8 ways, 64 byte lines |
98 |
case 0x30: l1 = 32; break; // 30h code L1 cache, 32 KB, 8 ways, 64 byte lines |
99 |
case 0x60: l1 = 16; break; // 60h data L1 cache, 16 KB, 8 ways, 64 byte lines, sectored |
100 |
case 0x66: l1 = 8; break; // 66h data L1 cache, 8 KB, 4 ways, 64 byte lines, sectored |
101 |
case 0x67: l1 = 16; break; // 67h data L1 cache, 16 KB, 4 ways, 64 byte lines, sectored |
102 |
case 0x68: l1 = 32; break; // 68h data L1 cache, 32 KB, 4 ways, 64 byte lines, sectored |
103 |
case 0x1A: l2 = 96; break; // code and data L2 cache, 96 KB, 6 ways, 64 byte lines (IA-64) |
104 |
case 0x22: l3 = 512; break; // code and data L3 cache, 512 KB, 4 ways (!), 64 byte lines, dual-sectored |
105 |
case 0x23: l3 = 1024; break; // code and data L3 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored |
106 |
case 0x25: l3 = 2048; break; // code and data L3 cache, 2048 KB, 8 ways, 64 byte lines, dual-sectored |
107 |
case 0x29: l3 = 4096; break; // code and data L3 cache, 4096 KB, 8 ways, 64 byte lines, dual-sectored |
108 |
case 0x39: l2 = 128; break; // code and data L2 cache, 128 KB, 4 ways, 64 byte lines, sectored |
109 |
case 0x3A: l2 = 192; break; // code and data L2 cache, 192 KB, 6 ways, 64 byte lines, sectored |
110 |
case 0x3B: l2 = 128; break; // code and data L2 cache, 128 KB, 2 ways, 64 byte lines, sectored |
111 |
case 0x3C: l2 = 256; break; // code and data L2 cache, 256 KB, 4 ways, 64 byte lines, sectored |
112 |
case 0x3D: l2 = 384; break; // code and data L2 cache, 384 KB, 6 ways, 64 byte lines, sectored |
113 |
case 0x3E: l2 = 512; break; // code and data L2 cache, 512 KB, 4 ways, 64 byte lines, sectored |
114 |
case 0x40: l2 = 0; break; // no integrated L2 cache (P6 core) or L3 cache (P4 core) |
115 |
case 0x41: l2 = 128; break; // code and data L2 cache, 128 KB, 4 ways, 32 byte lines |
116 |
case 0x42: l2 = 256; break; // code and data L2 cache, 256 KB, 4 ways, 32 byte lines |
117 |
case 0x43: l2 = 512; break; // code and data L2 cache, 512 KB, 4 ways, 32 byte lines |
118 |
case 0x44: l2 = 1024; break; // code and data L2 cache, 1024 KB, 4 ways, 32 byte lines |
119 |
case 0x45: l2 = 2048; break; // code and data L2 cache, 2048 KB, 4 ways, 32 byte lines |
120 |
case 0x46: l3 = 4096; break; // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines |
121 |
case 0x47: l3 = 8192; break; // code and data L3 cache, 8192 KB, 8 ways, 64 byte lines |
122 |
case 0x48: l2 = 3072; break; // code and data L2 cache, 3072 KB, 12 ways, 64 byte lines |
123 |
case 0x49: if(l2!=0) l3 = 4096; else {check_for_p2_core2=true; l3 = l2 = 4096;} break;// code and data L3 cache, 4096 KB, 16 ways, 64 byte lines (P4) or L2 for core2 |
124 |
case 0x4A: l3 = 6144; break; // code and data L3 cache, 6144 KB, 12 ways, 64 byte lines |
125 |
case 0x4B: l3 = 8192; break; // code and data L3 cache, 8192 KB, 16 ways, 64 byte lines |
126 |
case 0x4C: l3 = 12288; break; // code and data L3 cache, 12288 KB, 12 ways, 64 byte lines |
127 |
case 0x4D: l3 = 16384; break; // code and data L3 cache, 16384 KB, 16 ways, 64 byte lines |
128 |
case 0x4E: l2 = 6144; break; // code and data L2 cache, 6144 KB, 24 ways, 64 byte lines |
129 |
case 0x78: l2 = 1024; break; // code and data L2 cache, 1024 KB, 4 ways, 64 byte lines |
130 |
case 0x79: l2 = 128; break; // code and data L2 cache, 128 KB, 8 ways, 64 byte lines, dual-sectored |
131 |
case 0x7A: l2 = 256; break; // code and data L2 cache, 256 KB, 8 ways, 64 byte lines, dual-sectored |
132 |
case 0x7B: l2 = 512; break; // code and data L2 cache, 512 KB, 8 ways, 64 byte lines, dual-sectored |
133 |
case 0x7C: l2 = 1024; break; // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored |
134 |
case 0x7D: l2 = 2048; break; // code and data L2 cache, 2048 KB, 8 ways, 64 byte lines |
135 |
case 0x7E: l2 = 256; break; // code and data L2 cache, 256 KB, 8 ways, 128 byte lines, sect. (IA-64) |
136 |
case 0x7F: l2 = 512; break; // code and data L2 cache, 512 KB, 2 ways, 64 byte lines |
137 |
case 0x80: l2 = 512; break; // code and data L2 cache, 512 KB, 8 ways, 64 byte lines |
138 |
case 0x81: l2 = 128; break; // code and data L2 cache, 128 KB, 8 ways, 32 byte lines |
139 |
case 0x82: l2 = 256; break; // code and data L2 cache, 256 KB, 8 ways, 32 byte lines |
140 |
case 0x83: l2 = 512; break; // code and data L2 cache, 512 KB, 8 ways, 32 byte lines |
141 |
case 0x84: l2 = 1024; break; // code and data L2 cache, 1024 KB, 8 ways, 32 byte lines |
142 |
case 0x85: l2 = 2048; break; // code and data L2 cache, 2048 KB, 8 ways, 32 byte lines |
143 |
case 0x86: l2 = 512; break; // code and data L2 cache, 512 KB, 4 ways, 64 byte lines |
144 |
case 0x87: l2 = 1024; break; // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines |
145 |
case 0x88: l3 = 2048; break; // code and data L3 cache, 2048 KB, 4 ways, 64 byte lines (IA-64) |
146 |
case 0x89: l3 = 4096; break; // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines (IA-64) |
147 |
case 0x8A: l3 = 8192; break; // code and data L3 cache, 8192 KB, 4 ways, 64 byte lines (IA-64) |
148 |
case 0x8D: l3 = 3072; break; // code and data L3 cache, 3072 KB, 12 ways, 128 byte lines (IA-64) |
149 |
|
150 |
default: break; |
151 |
} |
152 |
} |
153 |
if(check_for_p2_core2 && l2 == l3) |
154 |
l3 = 0; |
155 |
l1 *= 1024; |
156 |
l2 *= 1024; |
157 |
l3 *= 1024; |
158 |
} |
159 |
|
160 |
inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs) |
161 |
{ |
162 |
if(max_std_funcs>=4) |
163 |
queryCacheSizes_intel_direct(l1,l2,l3); |
164 |
else |
165 |
queryCacheSizes_intel_codes(l1,l2,l3); |
166 |
} |
167 |
|
168 |
inline void queryCacheSizes_amd(int& l1, int& l2, int& l3) |
169 |
{ |
170 |
int abcd[4]; |
171 |
abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; |
172 |
EIGEN_CPUID(abcd,0x80000005,0); |
173 |
l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB |
174 |
abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0; |
175 |
EIGEN_CPUID(abcd,0x80000006,0); |
176 |
l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB |
177 |
l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB |
178 |
} |
179 |
#endif |
180 |
|
181 |
/** \internal
  * Queries and returns the cache sizes in Bytes of the L1, L2, and L3 data caches respectively.
  *
  * When EIGEN_CPUID is available, the CPU vendor is read from CPUID function 0
  * and the query is dispatched to the AMD or Intel specific routine; unknown
  * vendors are handled through the Intel routine. When EIGEN_CPUID is not
  * available, all three outputs are set to -1.
  *
  * \param l1 receives the L1 size in bytes, or -1 without CPUID support
  * \param l2 receives the L2 size in bytes, or -1 without CPUID support
  * \param l3 receives the L3 size in bytes, or -1 without CPUID support
  */
inline void queryCacheSizes(int& l1, int& l2, int& l3)
{
#ifdef EIGEN_CPUID
  int abcd[4];
  abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
  const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e};
  const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163};
  const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!"

  // identify the CPU vendor
  EIGEN_CPUID(abcd,0x0,0);
  // Function 0 returns the highest supported standard function in EAX (abcd[0]).
  // EBX (abcd[1]) is part of the vendor string and must not be used here,
  // otherwise the descriptor-table fallback for old CPUs is never selected.
  int max_std_funcs = abcd[0];
  if(cpuid_is_vendor(abcd,GenuineIntel))
    queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
  else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
    queryCacheSizes_amd(l1,l2,l3);
  else
    // by default let's use Intel's API
    queryCacheSizes_intel(l1,l2,l3,max_std_funcs);

  // here is the list of other vendors:
  //   ||cpuid_is_vendor(abcd,"VIA VIA VIA ")
  //   ||cpuid_is_vendor(abcd,"CyrixInstead")
  //   ||cpuid_is_vendor(abcd,"CentaurHauls")
  //   ||cpuid_is_vendor(abcd,"GenuineTMx86")
  //   ||cpuid_is_vendor(abcd,"TransmetaCPU")
  //   ||cpuid_is_vendor(abcd,"RiseRiseRise")
  //   ||cpuid_is_vendor(abcd,"Geode by NSC")
  //   ||cpuid_is_vendor(abcd,"SiS SiS SiS ")
  //   ||cpuid_is_vendor(abcd,"UMC UMC UMC ")
  //   ||cpuid_is_vendor(abcd,"NexGenDriven")
#else
  // No way to execute cpuid with this compiler/architecture: report "unknown".
  l1 = l2 = l3 = -1;
#endif
}
217 |
|
218 |
} // end namespace internal |
219 |
|
16 |
class CacheSizes |
220 |
class CacheSizes |
17 |
{ |
221 |
{ |
18 |
private: |
222 |
private: |
19 |
static const size_t MaxNumberOfLevels = 4; |
223 |
static const size_t MaxNumberOfLevels = 4; |
20 |
size_t m_numberOfLevels; |
224 |
size_t m_numberOfLevels; |
21 |
size_t m_levels[MaxNumberOfLevels]; |
225 |
size_t m_levels[MaxNumberOfLevels]; |
22 |
|
226 |
|
23 |
void clear() |
227 |
void clear() |