Bugzilla – Attachment 902 Details for
Bug 1633
Improve float matrix multiplication performance on ARM NEON (take 2)
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Requests
|
Help
|
Log In
[x]
|
Forgot Password
Login:
[x]
This bugzilla service is closed. All entries have been migrated to
https://gitlab.com/libeigen/eigen
[patch]
faster kernel
fasterkernel.diff (text/plain), 22.08 KB, created by
Benoit Jacob
on 2018-11-28 18:12:46 UTC
(hide)
Description:
faster kernel
Filename:
MIME Type:
Creator:
Benoit Jacob
Created:
2018-11-28 18:12:46 UTC
Size:
22.08 KB
patch
obsolete
>diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h >--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h >+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h >@@ -339,10 +339,10 @@ inline void computeProductBlockingSizes( > > /* Vectorization logic > * real*real: unpack rhs to constant packets, ... >- * >+ * > * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i), > * storing each res packet into two packets (2x2), >- * at the end combine them: swap the second and addsub them >+ * at the end combine them: swap the second and addsub them > * cf*cf : same but with 2x4 blocks > * cplx*real : unpack rhs to constant packets, ... > * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual >@@ -362,7 +362,7 @@ public: > LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, > RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, > ResPacketSize = Vectorizable ? 
packet_traits<ResScalar>::size : 1, >- >+ > NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, > > // register block size along the N direction must be 1 or 4 >@@ -378,7 +378,7 @@ public: > #else > mr = default_mr, > #endif >- >+ > LhsProgress = LhsPacketSize, > RhsProgress = 1 > }; >@@ -393,28 +393,28 @@ public: > typedef LhsPacket LhsPacket4Packing; > > typedef ResPacket AccPacket; >- >+ > EIGEN_STRONG_INLINE void initAcc(AccPacket& p) > { > p = pset1<ResPacket>(ResScalar(0)); > } >- >+ > EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) > { > pbroadcast4(b, b0, b1, b2, b3); > } >- >+ > // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) > // { > // pbroadcast2(b, b0, b1); > // } >- >+ > template<typename RhsPacketType> > EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const > { > dest = pset1<RhsPacketType>(*b); > } >- >+ > EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const > { > dest = ploadquad<RhsPacket>(b); >@@ -452,7 +452,7 @@ public: > { > r = pmadd(c,alpha,r); > } >- >+ > template<typename ResPacketHalf> > EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const > { >@@ -476,7 +476,7 @@ public: > LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, > RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, > ResPacketSize = Vectorizable ? 
packet_traits<ResScalar>::size : 1, >- >+ > NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, > nr = 4, > #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) >@@ -510,7 +510,7 @@ public: > { > dest = pset1<RhsPacket>(*b); > } >- >+ > EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const > { > dest = pset1<RhsPacket>(*b); >@@ -530,7 +530,7 @@ public: > { > pbroadcast4(b, b0, b1, b2, b3); > } >- >+ > // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) > // { > // pbroadcast2(b, b0, b1); >@@ -605,7 +605,7 @@ public: > typedef std::complex<RealScalar> LhsScalar; > typedef std::complex<RealScalar> RhsScalar; > typedef std::complex<RealScalar> ResScalar; >- >+ > enum { > ConjLhs = _ConjLhs, > ConjRhs = _ConjRhs, >@@ -623,7 +623,7 @@ public: > LhsProgress = ResPacketSize, > RhsProgress = 1 > }; >- >+ > typedef typename packet_traits<RealScalar>::type RealPacket; > typedef typename packet_traits<Scalar>::type ScalarPacket; > typedef DoublePacket<RealPacket> DoublePacketType; >@@ -633,7 +633,7 @@ public: > typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket; > typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket; > typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket; >- >+ > EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } > > EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) >@@ -654,7 +654,7 @@ public: > dest.first = pset1<RealPacket>(real(*b)); > dest.second = pset1<RealPacket>(imag(*b)); > } >- >+ > EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const > { > loadRhs(b,dest); >@@ -664,7 +664,7 @@ public: > eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4); > loadRhs(b,dest); > } >- >+ > EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, 
RhsPacket& b3) > { > // FIXME not sure that's the best way to implement it! >@@ -673,7 +673,7 @@ public: > loadRhs(b+2, b2); > loadRhs(b+3, b3); > } >- >+ > // Vectorized path > EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) > { >@@ -681,7 +681,7 @@ public: > loadRhs(b+0, b0); > loadRhs(b+1, b1); > } >- >+ > // Scalar path > EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) > { >@@ -711,9 +711,9 @@ public: > { > c = cj.pmadd(a,b,c); > } >- >+ > EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } >- >+ > EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const > { > // assemble c >@@ -738,7 +738,7 @@ public: > tmp = pcplxflip(ResPacket(c.second)); > tmp = psub(pconj(ResPacket(c.first)),tmp); > } >- >+ > r = pmadd(tmp,alpha,r); > } > >@@ -763,7 +763,7 @@ public: > LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, > RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, > ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, >- >+ > NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, > // FIXME: should depend on NumberOfRegisters > nr = 4, >@@ -793,12 +793,12 @@ public: > { > dest = pset1<RhsPacket>(*b); > } >- >+ > void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) > { > pbroadcast4(b, b0, b1, b2, b3); > } >- >+ > // EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) > // { > // // FIXME not sure that's the best way to implement it! 
>@@ -810,7 +810,7 @@ public: > { > dest = ploaddup<LhsPacket>(a); > } >- >+ > EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const > { > eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4); >@@ -835,7 +835,7 @@ public: > #else > tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); > #endif >- >+ > } > > EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const >@@ -859,8 +859,6 @@ template<> > struct gebp_traits <float, float, false, false,Architecture::NEON> > : gebp_traits<float,float,false,false,Architecture::Generic> > { >- typedef float RhsPacket; >- > EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) > { > loadRhs(b+0, b0); >@@ -871,7 +869,7 @@ struct gebp_traits <float, float, false, > > EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const > { >- dest = *b; >+ dest = vdupq_n_f32(*b); > } > > EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const >@@ -881,7 +879,27 @@ struct gebp_traits <float, float, false, > > EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/) const > { >- c = vfmaq_n_f32(c, a, b); >+ c = vfmaq_f32(c, a, b); >+ } >+ >+ EIGEN_STRONG_INLINE void loadRhsForUseByLane(const RhsScalar* b, RhsPacket& dest) const >+ { >+ dest = vld1q_f32(b); >+ } >+ >+ template <int Lane> >+ EIGEN_STRONG_INLINE >+ void maddByLane(const LhsPacket& a, const RhsPacket& b, AccPacket& c) const >+ { >+ if (Lane == 0) { >+ c = vfmaq_lane_f32(c, a, vget_low_f32(b), 0); >+ } else if (Lane == 1) { >+ c = vfmaq_lane_f32(c, a, vget_low_f32(b), 1); >+ } else if (Lane == 2) { >+ c = vfmaq_lane_f32(c, a, vget_high_f32(b), 0); >+ } else if (Lane == 3) { >+ c = vfmaq_lane_f32(c, a, vget_high_f32(b), 1); >+ } > } > }; > >@@ -1015,7 +1033,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > { > Traits traits; > 
SwappedTraits straits; >- >+ > if(strideA==-1) strideA = depth; > if(strideB==-1) strideB = depth; > conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; >@@ -1025,7 +1043,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; > enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) > const Index peeled_kc = depth & ~(pk-1); >- const Index prefetch_res_offset = 32/sizeof(ResScalar); >+ const Index prefetch_res_offset = 32/sizeof(ResScalar); > // const Index depth2 = depth & ~1; > > //---------- Process 3 * LhsProgress rows at once ---------- >@@ -1050,10 +1068,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > { > for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress) > { >- >+ > // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely > // stored into 3 x nr registers. >- >+ > const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)]; > prefetch(&blA[0]); > >@@ -1083,9 +1101,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > for(Index k=0; k<peeled_kc; k+=pk) > { > EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4"); >- RhsPacket B_0, T0; >+ RhsPacket B_0; > LhsPacket A2; > >+#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON >+ static_assert(RhsProgress == 1, ""); >+ static_assert(Traits::RhsProgress == 1, ""); > #define EIGEN_GEBP_ONESTEP(K) \ > do { \ > EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ >@@ -1095,6 +1116,32 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ > traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ > traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ >+ traits.loadRhsForUseByLane(blB + (0+4*K)*Traits::RhsProgress, B_0); \ >+ traits.template maddByLane<0>(A0, B_0, C0); \ >+ traits.template maddByLane<0>(A1, B_0, C4); \ >+ traits.template maddByLane<0>(A2, B_0, C8); \ >+ traits.template 
maddByLane<1>(A0, B_0, C1); \ >+ traits.template maddByLane<1>(A1, B_0, C5); \ >+ traits.template maddByLane<1>(A2, B_0, C9); \ >+ traits.template maddByLane<2>(A0, B_0, C2); \ >+ traits.template maddByLane<2>(A1, B_0, C6); \ >+ traits.template maddByLane<2>(A2, B_0, C10); \ >+ traits.template maddByLane<3>(A0, B_0, C3); \ >+ traits.template maddByLane<3>(A1, B_0, C7); \ >+ traits.template maddByLane<3>(A2, B_0, C11); \ >+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ >+ } while(false) >+#else >+#define EIGEN_GEBP_ONESTEP(K) \ >+ do { \ >+ RhsPacket T0; \ >+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ >+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ >+ internal::prefetch(blA+(3*K+16)*LhsProgress); \ >+ if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \ >+ traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ >+ traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ >+ traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ > traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \ > traits.madd(A0, B_0, C0, T0); \ > traits.madd(A1, B_0, C4, T0); \ >@@ -1113,7 +1160,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > traits.madd(A2, B_0, C11, B_0); \ > EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ > } while(false) >- >+#endif > internal::prefetch(blB); > EIGEN_GEBP_ONESTEP(0); > EIGEN_GEBP_ONESTEP(1); >@@ -1132,7 +1179,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > // process remaining peeled loop > for(Index k=peeled_kc; k<depth; k++) > { >- RhsPacket B_0, T0; >+ RhsPacket B_0; > LhsPacket A2; > EIGEN_GEBP_ONESTEP(0); > blB += 4*RhsProgress; >@@ -1182,7 +1229,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > traits.acc(C11, alphav, R2); > r3.storePacket(0 * Traits::ResPacketSize, R0); > r3.storePacket(1 * Traits::ResPacketSize, R1); >- r3.storePacket(2 * Traits::ResPacketSize, R2); >+ r3.storePacket(2 * Traits::ResPacketSize, R2); > } > } > >@@ -1207,7 +1254,7 @@ void 
gebp_kernel<LhsScalar,RhsScalar,Ind > // performs "inner" products > const RhsScalar* blB = &blockB[j2*strideB+offsetB]; > LhsPacket A0, A1, A2; >- >+ > for(Index k=0; k<peeled_kc; k+=pk) > { > EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1"); >@@ -1225,7 +1272,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > traits.madd(A2, B_0, C8, B_0); \ > EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ > } while(false) >- >+ > EIGEN_GEBGP_ONESTEP(0); > EIGEN_GEBGP_ONESTEP(1); > EIGEN_GEBGP_ONESTEP(2); >@@ -1261,7 +1308,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > traits.acc(C8, alphav, R2); > r0.storePacket(0 * Traits::ResPacketSize, R0); > r0.storePacket(1 * Traits::ResPacketSize, R1); >- r0.storePacket(2 * Traits::ResPacketSize, R2); >+ r0.storePacket(2 * Traits::ResPacketSize, R2); > } > } > } >@@ -1283,10 +1330,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > { > for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress) > { >- >+ > // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely > // stored into 2 x nr registers. 
>- >+ > const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)]; > prefetch(&blA[0]); > >@@ -1314,10 +1361,32 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > for(Index k=0; k<peeled_kc; k+=pk) > { > EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4"); >- RhsPacket B_0, B1, B2, B3, T0; > >- #define EIGEN_GEBGP_ONESTEP(K) \ >+#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON >+ static_assert(RhsProgress == 1, ""); >+ static_assert(Traits::RhsProgress == 1, ""); >+#define EIGEN_GEBGP_ONESTEP(K) \ >+ do { \ >+ RhsPacket B_0; \ >+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ >+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ >+ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ >+ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ >+ traits.loadRhsForUseByLane(blB + (0+4*K)*Traits::RhsProgress, B_0); \ >+ traits.template maddByLane<0>(A0, B_0, C0); \ >+ traits.template maddByLane<0>(A1, B_0, C4); \ >+ traits.template maddByLane<1>(A0, B_0, C1); \ >+ traits.template maddByLane<1>(A1, B_0, C5); \ >+ traits.template maddByLane<2>(A0, B_0, C2); \ >+ traits.template maddByLane<2>(A1, B_0, C6); \ >+ traits.template maddByLane<3>(A0, B_0, C3); \ >+ traits.template maddByLane<3>(A1, B_0, C7); \ >+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ >+ } while(false) >+#else >+#define EIGEN_GEBGP_ONESTEP(K) \ > do { \ >+ RhsPacket B_0, B1, B2, B3, T0; \ > EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ > EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ > traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ >@@ -1333,7 +1402,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > traits.madd(A1, B3, C7, B3); \ > EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ > } while(false) >- >+#endif >+ > internal::prefetch(blB+(48+0)); > EIGEN_GEBGP_ONESTEP(0); > EIGEN_GEBGP_ONESTEP(1); >@@ -1353,7 +1423,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > // process remaining peeled loop > for(Index 
k=peeled_kc; k<depth; k++) > { >- RhsPacket B_0, B1, B2, B3, T0; > EIGEN_GEBGP_ONESTEP(0); > blB += 4*RhsProgress; > blA += 2*Traits::LhsProgress; >@@ -1390,7 +1459,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > r3.storePacket(1 * Traits::ResPacketSize, R3); > } > } >- >+ > // Deal with remaining columns of the rhs > for(Index j2=packet_cols4; j2<cols; j2++) > { >@@ -1416,7 +1485,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > { > EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1"); > RhsPacket B_0, B1; >- >+ > #define EIGEN_GEBGP_ONESTEP(K) \ > do { \ > EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \ >@@ -1428,7 +1497,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > traits.madd(A1, B_0, C4, B_0); \ > EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ > } while(false) >- >+ > EIGEN_GEBGP_ONESTEP(0); > EIGEN_GEBGP_ONESTEP(1); > EIGEN_GEBGP_ONESTEP(2); >@@ -1466,6 +1535,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > } > } > } >+ > //---------- Process 1 * LhsProgress rows at once ---------- > if(mr>=1*Traits::LhsProgress) > { >@@ -1477,7 +1547,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > { > // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely > // stored into 1 x nr registers. 
>- >+ > const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; > prefetch(&blA[0]); > >@@ -1507,9 +1577,27 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > { > EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4"); > RhsPacket B_0, B1, B2, B3; >- >+ >+#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON >+ static_assert(RhsProgress == 1, ""); >+ static_assert(Traits::RhsProgress == 1, ""); > #define EIGEN_GEBGP_ONESTEP(K) \ >- do { \ >+ do { \ >+ RhsPacket B_0; \ >+ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ >+ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ >+ traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ >+ traits.loadRhsForUseByLane(blB + (0+4*K)*Traits::RhsProgress, B_0); \ >+ traits.template maddByLane<0>(A0, B_0, C0); \ >+ traits.template maddByLane<1>(A0, B_0, C1); \ >+ traits.template maddByLane<2>(A0, B_0, C2); \ >+ traits.template maddByLane<3>(A0, B_0, C3); \ >+ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ >+ } while(false) >+#else >+#define EIGEN_GEBGP_ONESTEP(K) \ >+ do { \ >+ RhsPacket B_0, B1, B2, B3; \ > EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \ > EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ > traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ >@@ -1520,7 +1608,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > traits.madd(A0, B3, C3, B3); \ > EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \ > } while(false) >- >+#endif >+ > internal::prefetch(blB+(48+0)); > EIGEN_GEBGP_ONESTEP(0); > EIGEN_GEBGP_ONESTEP(1); >@@ -1540,7 +1629,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > // process remaining peeled loop > for(Index k=peeled_kc; k<depth; k++) > { >- RhsPacket B_0, B1, B2, B3; > EIGEN_GEBGP_ONESTEP(0); > blB += 4*RhsProgress; > blA += 1*LhsProgress; >@@ -1586,7 +1674,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > { > EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1"); > RhsPacket B_0; >- >+ > #define EIGEN_GEBGP_ONESTEP(K) \ > do { \ 
> EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \ >@@ -1629,6 +1717,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > } > } > } >+ > //---------- Process remaining rows, 1 at once ---------- > if(peeled_mc1<rows) > { >@@ -1760,12 +1849,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Ind > B_1 = blB[1]; > CJMADD(cj,A0,B_0,C0, B_0); > CJMADD(cj,A0,B_1,C1, B_1); >- >+ > B_0 = blB[2]; > B_1 = blB[3]; > CJMADD(cj,A0,B_0,C2, B_0); > CJMADD(cj,A0,B_1,C3, B_1); >- >+ > blB += 4; > } > res(i, j2 + 0) += alpha * C0; >@@ -2253,7 +2342,7 @@ inline std::ptrdiff_t l2CacheSize() > } > > /** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\ >-rs. >+rs. > * \sa setCpuCacheSize */ > inline std::ptrdiff_t l3CacheSize() > {
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 1633
:
901
|
902
|
910
|
914
|
917
|
918
|
922
|
923