Bugzilla – Attachment 413 Details for Bug 724
Use aligned loads/stores whenever possible to speed up the GEBP kernel
This Bugzilla service is closed. All entries have been migrated to https://gitlab.com/libeigen/eigen.
[patch] Patch against the latest version of the code
aligned_mem.patch (text/plain), 15.78 KB, created by Benoit Steiner on 2014-01-08 18:29:44 UTC

Description: Patch against the latest version of the code
Filename:    aligned_mem.patch
MIME Type:   text/plain
Creator:     Benoit Steiner
Created:     2014-01-08 18:29:44 UTC
Size:        15.78 KB
Flags:       patch, obsolete
# HG changeset patch
# User Benoit Steiner <benoit.steiner.goog@gmail.com>
# Date 1389202069 28800
# Node ID 81b9a6239532455502d6656a5afaba153e9b0997
# Parent  de929dec75c78d7da17f4b8915c6b36e72b27c7d
Use aligned loads and stores whenever possible to store the result of a matrix multiplication. This speeds up a few brain benchmarks by as much as 15%

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -152,16 +152,23 @@ pandnot(const Packet& a, const Packet& b
 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
 /** \internal \returns a packet version of \a *from, (un-aligned load) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
+/** \internal \returns a packet version of \a *from */
+template<typename Packet, bool aligned> inline Packet
+pload(const typename unpacket_traits<Packet>::type* from) {
+  if (aligned) return pload<Packet>(from);
+  else return ploadu<Packet>(from);
+}
+
 /** \internal \returns a packet with elements of \a *from duplicated.
  * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
  * Currently, this function is only used for scalar * complex products.
  */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
@@ -176,16 +183,23 @@ plset(const Scalar& a) { return a; }
 /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
 { (*to) = from; }
 
 /** \internal copy the packet \a from to \a *to, (un-aligned store) */
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
 { (*to) = from; }
 
+/** \internal copy the packet \a from to \a *to */
+template<typename Scalar, typename Packet, bool aligned> inline void
+pstore(Scalar* to, const Packet& from) {
+  if (aligned) pstore<Scalar, Packet>(to, from);
+  else pstoreu<Scalar, Packet>(to, from);
+}
+
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> inline void prefetch(const Scalar* addr)
 {
 #if !defined(_MSC_VER)
   __builtin_prefetch(addr);
 #endif
 }
 
@@ -348,9 +362,8 @@ template<> inline std::complex<double> p
 
 #endif
 
 } // end namespace internal
 
 } // end namespace Eigen
 
 #endif // EIGEN_GENERIC_PACKET_MATH_H
-
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -505,17 +505,17 @@ protected:
 
 /* optimized GEneral packed Block * packed Panel product kernel
  *
 * Mixing type logic: C += A * B
 *  | A | B | comments
 *  |real |cplx | no vectorization yet, would require to pack A with duplication
 *  |cplx |real | easy vectorization
 */
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs, bool alignedLoadStores>
 struct gebp_kernel
 {
   typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
   typedef typename Traits::ResScalar ResScalar;
   typedef typename Traits::LhsPacket LhsPacket;
   typedef typename Traits::RhsPacket RhsPacket;
   typedef typename Traits::ResPacket ResPacket;
   typedef typename Traits::AccPacket AccPacket;
@@ -527,19 +527,19 @@ struct gebp_kernel
     ResPacketSize = Traits::ResPacketSize
   };
 
   EIGEN_DONT_INLINE
   void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
 };
 
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs, bool alignedLoadStores>
 EIGEN_DONT_INLINE
-void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
+void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs,alignedLoadStores>
   ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
                Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
 {
   Traits traits;
 
   if(strideA==-1) strideA = depth;
   if(strideB==-1) strideB = depth;
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
@@ -752,60 +752,60 @@ EIGEN_ASM_COMMENT("mybegin4");
          blA += mr;
        }
 
        if(nr==4)
        {
          ResPacket R0, R1, R2, R3, R4, R5, R6;
          ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = ploadu<ResPacket>(r0);
-          R1 = ploadu<ResPacket>(r1);
-          R2 = ploadu<ResPacket>(r2);
-          R3 = ploadu<ResPacket>(r3);
-          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
-          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
-          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
+          R0 = pload<ResPacket, alignedLoadStores>(r0);
+          R1 = pload<ResPacket, alignedLoadStores>(r1);
+          R2 = pload<ResPacket, alignedLoadStores>(r2);
+          R3 = pload<ResPacket, alignedLoadStores>(r3);
+          R4 = pload<ResPacket, alignedLoadStores>(r0 + ResPacketSize);
+          R5 = pload<ResPacket, alignedLoadStores>(r1 + ResPacketSize);
+          R6 = pload<ResPacket, alignedLoadStores>(r2 + ResPacketSize);
          traits.acc(C0, alphav, R0);
-          pstoreu(r0, R0);
-          R0 = ploadu<ResPacket>(r3 + ResPacketSize);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r0, R0);
+          R0 = pload<ResPacket, alignedLoadStores>(r3 + ResPacketSize);
 
          traits.acc(C1, alphav, R1);
          traits.acc(C2, alphav, R2);
          traits.acc(C3, alphav, R3);
          traits.acc(C4, alphav, R4);
          traits.acc(C5, alphav, R5);
          traits.acc(C6, alphav, R6);
          traits.acc(C7, alphav, R0);
 
-          pstoreu(r1, R1);
-          pstoreu(r2, R2);
-          pstoreu(r3, R3);
-          pstoreu(r0 + ResPacketSize, R4);
-          pstoreu(r1 + ResPacketSize, R5);
-          pstoreu(r2 + ResPacketSize, R6);
-          pstoreu(r3 + ResPacketSize, R0);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r1, R1);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r2, R2);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r3, R3);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r0 + ResPacketSize, R4);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r1 + ResPacketSize, R5);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r2 + ResPacketSize, R6);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r3 + ResPacketSize, R0);
        }
        else
        {
          ResPacket R0, R1, R4;
          ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = ploadu<ResPacket>(r0);
-          R1 = ploadu<ResPacket>(r1);
-          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
+          R0 = pload<ResPacket, alignedLoadStores>(r0);
+          R1 = pload<ResPacket, alignedLoadStores>(r1);
+          R4 = pload<ResPacket, alignedLoadStores>(r0 + ResPacketSize);
          traits.acc(C0, alphav, R0);
-          pstoreu(r0, R0);
-          R0 = ploadu<ResPacket>(r1 + ResPacketSize);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r0, R0);
+          R0 = pload<ResPacket, alignedLoadStores>(r1 + ResPacketSize);
          traits.acc(C1, alphav, R1);
          traits.acc(C4, alphav, R4);
          traits.acc(C5, alphav, R0);
-          pstoreu(r1, R1);
-          pstoreu(r0 + ResPacketSize, R4);
-          pstoreu(r1 + ResPacketSize, R0);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r1, R1);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r0 + ResPacketSize, R4);
+          pstore<ResScalar, ResPacket, alignedLoadStores>(r1 + ResPacketSize, R0);
        }
 
      }
 
      if(rows-peeled_mc>=LhsProgress)
      {
        Index i = peeled_mc;
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
@@ -935,30 +935,30 @@ EIGEN_ASM_COMMENT("mybegin4");
        ResPacket R0, R1, R2, R3;
        ResPacket alphav = pset1<ResPacket>(alpha);
 
        ResScalar* r0 = &res[(j2+0)*resStride + i];
        ResScalar* r1 = r0 + resStride;
        ResScalar* r2 = r1 + resStride;
        ResScalar* r3 = r2 + resStride;
 
-        R0 = ploadu<ResPacket>(r0);
-        R1 = ploadu<ResPacket>(r1);
-        if(nr==4) R2 = ploadu<ResPacket>(r2);
-        if(nr==4) R3 = ploadu<ResPacket>(r3);
+        R0 = pload<ResPacket, alignedLoadStores>(r0);
+        R1 = pload<ResPacket, alignedLoadStores>(r1);
+        if(nr==4) R2 = pload<ResPacket, alignedLoadStores>(r2);
+        if(nr==4) R3 = pload<ResPacket, alignedLoadStores>(r3);
 
        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        if(nr==4) traits.acc(C2, alphav, R2);
        if(nr==4) traits.acc(C3, alphav, R3);
 
-        pstoreu(r0, R0);
-        pstoreu(r1, R1);
-        if(nr==4) pstoreu(r2, R2);
-        if(nr==4) pstoreu(r3, R3);
+        pstore<ResScalar, ResPacket, alignedLoadStores>(r0, R0);
+        pstore<ResScalar, ResPacket, alignedLoadStores>(r1, R1);
+        if(nr==4) pstore<ResScalar, ResPacket, alignedLoadStores>(r2, R2);
+        if(nr==4) pstore<ResScalar, ResPacket, alignedLoadStores>(r3, R3);
      }
      for(Index i=peeled_mc2; i<rows; i++)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);
 
        // gets a 1 x nr res block as registers
        ResScalar C0(0), C1(0), C2(0), C3(0);
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -39,21 +39,24 @@ struct general_matrix_matrix_product<Ind
      LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
      ColMajor>
    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
  }
 };
 
 /* Specialization for a col-major destination matrix
  * => Blocking algorithm following Goto's paper */
+/* Handling of col-major destination matrix
+ * => Blocking algorithm following Goto's paper */
 template<
   typename Index,
   typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
-struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
+  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+  bool ResMatrixAligned>
+struct general_matrix_matrix_product_internal
 {
 
 typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 static void run(Index rows, Index cols, Index depth,
   const LhsScalar* _lhs, Index lhsStride,
   const RhsScalar* _rhs, Index rhsStride,
   ResScalar* res, Index resStride,
   ResScalar alpha,
@@ -66,17 +69,17 @@ static void run(Index rows, Index cols,
   typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
   Index kc = blocking.kc();                  // cache block size along the K direction
   Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
   //Index nc = blocking.nc();                // cache block size along the N direction
 
   gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
   gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
-  gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+  gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResMatrixAligned> gebp;
 
 #ifdef EIGEN_HAS_OPENMP
   if(info)
   {
     // this is the parallel version!
     Index tid = omp_get_thread_num();
     Index threads = omp_get_num_threads();
 
@@ -185,16 +188,45 @@ static void run(Index rows, Index cols,
           gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
         }
       }
     }
   }
 
 };
 
+template<
+  typename Index,
+  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
+  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
+struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
+{
+typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+static void run(Index rows, Index cols, Index depth,
+  const LhsScalar* _lhs, Index lhsStride,
+  const RhsScalar* _rhs, Index rhsStride,
+  ResScalar* res, Index resStride,
+  ResScalar alpha,
+  level3_blocking<LhsScalar,RhsScalar>& blocking,
+  GemmParallelInfo<Index>* info = 0)
+{
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+  typedef packet_traits<ResScalar> ResTraits;
+
+  if (((Traits::mr % ResTraits::size) == 0) &&
+      (((size_t)res) % (ResTraits::size*sizeof(ResScalar)) == 0) &&
+      ((resStride % ResTraits::size) == 0)) {
+    // The kernel will use aligned loads and stores to update the result matrix for all the micro panels.
+    general_matrix_matrix_product_internal<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar, RhsStorageOrder, ConjugateRhs, true>::run(rows, cols, depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking, info);
+  } else {
+    general_matrix_matrix_product_internal<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar, RhsStorageOrder, ConjugateRhs, false>::run(rows, cols, depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking, info);
+  }
+}
+};
+
 /*********************************************************************************
 *  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
 *  implementation of the high level wrapper to general_matrix_matrix_product
 **********************************************************************************/
 
 template<typename Lhs, typename Rhs>
 struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
  : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
@@ -13,17 +13,17 @@
 // This file contains many lightweight helper classes used to
 // implement and control fast level 2 and level 3 BLAS-like routines.
 
 namespace Eigen {
 
 namespace internal {
 
 // forward declarations
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false, bool alignedLoadStores = false>
 struct gebp_kernel;
 
 template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
 struct gemm_pack_rhs;
 
 template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
 struct gemm_pack_lhs;
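Editorial note: the core of the GenericPacketMath.h change above is a compile-time dispatch. A boolean template parameter selects the aligned or unaligned instruction when the template is instantiated, so the kernel's inner loop carries no run-time branch. Below is a minimal standalone sketch of the same pattern using raw SSE intrinsics instead of Eigen's packet abstraction; the names load4f and store4f are hypothetical, not part of the patch.

#include <xmmintrin.h>  // SSE intrinsics

// Hypothetical stand-ins for the patch's pload<Packet, aligned> /
// pstore<Scalar, Packet, aligned> pair. The 'if' is resolved at compile
// time because Aligned is a template constant, so each instantiation
// compiles down to exactly one intrinsic.
template <bool Aligned>
inline __m128 load4f(const float* from) {
  if (Aligned) return _mm_load_ps(from);   // movaps: requires 16-byte alignment
  else         return _mm_loadu_ps(from);  // movups: tolerates any alignment
}

template <bool Aligned>
inline void store4f(float* to, __m128 from) {
  if (Aligned) _mm_store_ps(to, from);
  else         _mm_storeu_ps(to, from);
}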
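The GeneralMatrixMatrix.h hunk then decides once per product which instantiation to run: aligned stores are safe only if the micro-panel height divides evenly into packets, the result's base pointer is packet-aligned, and the stride keeps every column start aligned. The following sketch mirrors that run-time check under the same assumptions as the sketch above; scale4 is a hypothetical kernel standing in for gebp_kernel, and rows is assumed to be a multiple of 4.

#include <cstddef>
#include <cstdint>

// Hypothetical column-major kernel: res(:, j) *= alpha, 4 floats at a time.
template <bool Aligned>
void scale4(float* res, std::size_t resStride, std::size_t rows,
            std::size_t cols, float alpha) {
  const __m128 alphav = _mm_set1_ps(alpha);
  for (std::size_t j = 0; j < cols; ++j)
    for (std::size_t i = 0; i < rows; i += 4) {
      float* p = res + j * resStride + i;
      store4f<Aligned>(p, _mm_mul_ps(load4f<Aligned>(p), alphav));
    }
}

void scale(float* res, std::size_t resStride, std::size_t rows,
           std::size_t cols, float alpha) {
  // Mirrors the patch's condition: a 16-byte-aligned base pointer plus a
  // stride that is a multiple of the packet size (4 floats) guarantees
  // that every column starts on an aligned address.
  const bool aligned = (reinterpret_cast<std::uintptr_t>(res) % 16 == 0)
                    && (resStride % 4 == 0);
  if (aligned) scale4<true>(res, resStride, rows, cols, alpha);
  else         scale4<false>(res, resStride, rows, cols, alpha);
}

The check costs one branch per matrix product, while the payoff (aligned moves in the innermost loop) applies to every store of the result, which is where the patch's reported speedup of up to 15% comes from.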