Lines 41-82
void computeProductBlockingSizes(SizeTyp
Link Here
|
41 |
// Let's recall that the product algorithms form kc x nc horizontal panels B' on the rhs and |
41 |
// Let's recall that the product algorithms form kc x nc horizontal panels B' on the rhs and |
42 |
// mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed |
42 |
// mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed |
43 |
// per kc x nr vertical small panels where nr is the blocking size along the n dimension |
43 |
// per kc x nr vertical small panels where nr is the blocking size along the n dimension |
44 |
// at the register level. For vectorization purposes, these small vertical panels are unpacked, |
44 |
// at the register level. For vectorization purposes, these small vertical panels are unpacked, |
45 |
// e.g., each coefficient is replicated to fit a packet. This small vertical panel has to |
45 |
// e.g., each coefficient is replicated to fit a packet. This small vertical panel has to |
46 |
// stay in L1 cache. |
46 |
// stay in L1 cache. |
47 |
|
47 |
|
48 |
typedef gebp_traits<LhsScalar,RhsScalar> Traits; |
48 |
typedef gebp_traits<LhsScalar,RhsScalar> Traits; |
49 |
enum { |
|
|
50 |
kdiv = KcFactor * 2 * Traits::nr |
51 |
* Traits::RhsProgress * sizeof(RhsScalar), |
52 |
mr = gebp_traits<LhsScalar,RhsScalar>::mr, |
53 |
mr_mask = (0xffffffff/mr)*mr |
54 |
}; |
55 |
|
49 |
|
56 |
// k = std::min<SizeType>(k, l1/kdiv); |
50 |
// First, we compute kc |
57 |
// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; |
51 |
SizeType kdiv = KcFactor * 2 * Traits::nr * Traits::RhsProgress * sizeof(RhsScalar); |
58 |
// if(_m<m) m = _m & mr_mask; |
52 |
SizeType kc = CacheSizes::L1() / kdiv; |
59 |
|
53 |
|
60 |
// In unit tests we do not want to use extra large matrices, |
54 |
// This optimization seems specific to x86/x86-64. It harms performance on a |
61 |
// so we reduce the block size to check the blocking strategy is not flawed |
55 |
// Nexus 4 (ARM) device where the optimal value of kc is around 512 or 1024. |
62 |
#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS |
56 |
#if EIGEN_ARCH_i386_OR_x86_64 |
63 |
// k = std::min<SizeType>(k,240); |
57 |
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS |
64 |
// n = std::min<SizeType>(n,3840/sizeof(RhsScalar)); |
58 |
// Limit block size in tests to cover blocking logic without using huge matrices |
65 |
// m = std::min<SizeType>(m,3840/sizeof(RhsScalar)); |
59 |
kc = std::min<SizeType>(kc,24); |
66 |
|
60 |
#else |
67 |
k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 360 : 240); |
61 |
kc = std::min<SizeType>(kc,sizeof(LhsScalar)<=4 ? 360 : 240); |
68 |
n = std::min<SizeType>(n,3840/sizeof(RhsScalar)); |
62 |
#endif |
69 |
m = std::min<SizeType>(m,3840/sizeof(RhsScalar)); |
63 |
#endif |
70 |
#else |
64 |
|
71 |
k = std::min<SizeType>(k,24); |
65 |
// Clamp k to kc |
72 |
n = std::min<SizeType>(n,384/sizeof(RhsScalar)); |
66 |
k = std::min<SizeType>(k, kc); |
73 |
m = std::min<SizeType>(m,384/sizeof(RhsScalar)); |
67 |
|
74 |
#endif |
68 |
// Next, compute mc and nc, using the clamped value of k, so that if k is very small, |
|
|
69 |
// we can go bigger in the n and m dimensions. |
70 |
SizeType mc = CacheSizes::TopLevel() / (4 * sizeof(LhsScalar) * k); |
71 |
mc -= mc % Traits::mr; |
72 |
m = std::min<SizeType>(m, mc); |
73 |
SizeType nc = CacheSizes::TopLevel() / (4 * sizeof(RhsScalar) * k); |
74 |
nc -= nc % Traits::nr; |
75 |
n = std::min<SizeType>(n, nc); |
76 |
|
77 |
// Similar optimization as above - seems specific to x86/x86-64. |
78 |
#if EIGEN_ARCH_i386_OR_x86_64 |
79 |
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS |
80 |
// Limit block size in tests to cover blocking logic without using huge matrices |
81 |
m = std::min<SizeType>(m,384/sizeof(RhsScalar)); |
82 |
n = std::min<SizeType>(n,384/sizeof(RhsScalar)); |
83 |
#else |
84 |
m = std::min<SizeType>(m,3840/sizeof(RhsScalar)); |
85 |
n = std::min<SizeType>(n,3840/sizeof(RhsScalar)); |
86 |
#endif |
87 |
#endif |
75 |
} |
88 |
} |
76 |
|
89 |
|
77 |
template<typename LhsScalar, typename RhsScalar, typename SizeType> |
90 |
template<typename LhsScalar, typename RhsScalar, typename SizeType> |
78 |
inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) |
91 |
inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) |
79 |
{ |
92 |
{ |
80 |
computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); |
93 |
computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); |
81 |
} |
94 |
} |
82 |
|
95 |
|