Bugzilla – Attachment 519 Details for
Bug 931
Fix tracking and use of cache sizes
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Requests
|
Help
|
Log In
[x]
|
Forgot Password
Login:
[x]
This Bugzilla service is closed. All entries have been migrated to
https://gitlab.com/libeigen/eigen
[patch]
Part 2: fix computeProductBlockingSizes, actually use the cache sizes
use-cachesizes (text/plain), 3.61 KB, created by
Benoit Jacob
on 2015-01-19 22:09:41 UTC
(
hide
)
Description:
Part 2: fix computeProductBlockingSizes, actually use the cache sizes
Filename:
MIME Type:
Creator:
Benoit Jacob
Created:
2015-01-19 22:09:41 UTC
Size:
3.61 KB
patch
obsolete
># HG changeset patch ># Parent 104b568f8e3815f0dfee78bdebd2e1f7916d3f4c > >diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h >--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h >+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h >@@ -41,42 +41,55 @@ void computeProductBlockingSizes(SizeTyp > // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and > // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed > // per kc x nr vertical small panels where nr is the blocking size along the n dimension > // at the register level. For vectorization purpose, these small vertical panels are unpacked, > // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to > // stay in L1 cache. > > typedef gebp_traits<LhsScalar,RhsScalar> Traits; >- enum { >- kdiv = KcFactor * 2 * Traits::nr >- * Traits::RhsProgress * sizeof(RhsScalar), >- mr = gebp_traits<LhsScalar,RhsScalar>::mr, >- mr_mask = (0xffffffff/mr)*mr >- }; > >-// k = std::min<SizeType>(k, l1/kdiv); >-// SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0; >-// if(_m<m) m = _m & mr_mask; >- >- // In unit tests we do not want to use extra large matrices, >- // so we reduce the block size to check the blocking strategy is not flawed >-#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS >-// k = std::min<SizeType>(k,240); >-// n = std::min<SizeType>(n,3840/sizeof(RhsScalar)); >-// m = std::min<SizeType>(m,3840/sizeof(RhsScalar)); >- >- k = std::min<SizeType>(k,sizeof(LhsScalar)<=4 ? 
360 : 240); >- n = std::min<SizeType>(n,3840/sizeof(RhsScalar)); >- m = std::min<SizeType>(m,3840/sizeof(RhsScalar)); >-#else >- k = std::min<SizeType>(k,24); >- n = std::min<SizeType>(n,384/sizeof(RhsScalar)); >- m = std::min<SizeType>(m,384/sizeof(RhsScalar)); >-#endif >+ // First, we compute kc >+ SizeType kdiv = KcFactor * 2 * Traits::nr * Traits::RhsProgress * sizeof(RhsScalar); >+ SizeType kc = CacheSizes::L1() / kdiv; >+ >+ // This optimization seems specific to x86/x86-64. It harms performance on a >+ // Nexus 4 (ARM) device where the optimal value of kc is around 512 or 1024. >+ #if EIGEN_ARCH_i386_OR_x86_64 >+ #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS >+ // Limit block size in tests to cover blocking logic without using huge matrices >+ kc = std::min<SizeType>(kc,24); >+ #else >+ kc = std::min<SizeType>(kc,sizeof(LhsScalar)<=4 ? 360 : 240); >+ #endif >+ #endif >+ >+ // Clamp k to kc >+ k = std::min<SizeType>(k, kc); >+ >+ // Next, compute mc and nc, using the clamped value of k, so that if k is very small, >+ // we can go bigger in the n and m dimensions. >+ SizeType mc = CacheSizes::TopLevel() / (4 * sizeof(LhsScalar) * k); >+ mc -= mc % Traits::mr; >+ m = std::min<SizeType>(m, mc); >+ SizeType nc = CacheSizes::TopLevel() / (4 * sizeof(RhsScalar) * k); >+ nc -= nc % Traits::nr; >+ n = std::min<SizeType>(n, nc); >+ >+ // Similar optimization as above - seems specific to x86/x86-64. 
>+ #if EIGEN_ARCH_i386_OR_x86_64 >+ #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS >+ // Limit block size in tests to cover blocking logic without using huge matrices >+ m = std::min<SizeType>(m,384/sizeof(RhsScalar)); >+ n = std::min<SizeType>(n,384/sizeof(RhsScalar)); >+ #else >+ m = std::min<SizeType>(m,3840/sizeof(RhsScalar)); >+ n = std::min<SizeType>(n,3840/sizeof(RhsScalar)); >+ #endif >+ #endif > } > > template<typename LhsScalar, typename RhsScalar, typename SizeType> > inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) > { > computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); > } >
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 931
:
517
|
518
|
519
|
520