Lines 1252-1267
EIGEN_DONT_INLINE void gemm_pack_rhs<Sca
Link Here
|
1252 |
if(PanelMode) count += (stride-offset-depth); |
1252 |
if(PanelMode) count += (stride-offset-depth); |
1253 |
} |
1253 |
} |
1254 |
} |
1254 |
} |
1255 |
|
1255 |
|
1256 |
// this version is optimized for row major matrices |
1256 |
// this version is optimized for row major matrices |
1257 |
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> |
1257 |
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> |
1258 |
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> |
1258 |
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> |
1259 |
{ |
1259 |
{ |
|
|
1260 |
typedef typename packet_traits<Scalar>::type Packet; |
1260 |
enum { PacketSize = packet_traits<Scalar>::size }; |
1261 |
enum { PacketSize = packet_traits<Scalar>::size }; |
1261 |
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); |
1262 |
EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); |
1262 |
}; |
1263 |
}; |
1263 |
|
1264 |
|
1264 |
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> |
1265 |
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> |
1265 |
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> |
1266 |
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> |
1266 |
::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) |
1267 |
::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) |
1267 |
{ |
1268 |
{ |
Lines 1271-1292
EIGEN_DONT_INLINE void gemm_pack_rhs<Sca
Link Here
|
1271 |
Index packet_cols = (cols/nr) * nr; |
1272 |
Index packet_cols = (cols/nr) * nr; |
1272 |
Index count = 0; |
1273 |
Index count = 0; |
1273 |
for(Index j2=0; j2<packet_cols; j2+=nr) |
1274 |
for(Index j2=0; j2<packet_cols; j2+=nr) |
1274 |
{ |
1275 |
{ |
1275 |
// skip what we have before |
1276 |
// skip what we have before |
1276 |
if(PanelMode) count += nr * offset; |
1277 |
if(PanelMode) count += nr * offset; |
1277 |
for(Index k=0; k<depth; k++) |
1278 |
for(Index k=0; k<depth; k++) |
1278 |
{ |
1279 |
{ |
1279 |
const Scalar* b0 = &rhs[k*rhsStride + j2]; |
1280 |
if (nr == PacketSize) { |
1280 |
blockB[count+0] = cj(b0[0]); |
1281 |
Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); |
1281 |
blockB[count+1] = cj(b0[1]); |
1282 |
pstoreu(blockB+count, cj.pconj(A)); |
1282 |
if(nr==4) blockB[count+2] = cj(b0[2]); |
1283 |
count += PacketSize; |
1283 |
if(nr==4) blockB[count+3] = cj(b0[3]); |
1284 |
} else { |
1284 |
count += nr; |
1285 |
const Scalar* b0 = &rhs[k*rhsStride + j2]; |
|
|
1286 |
blockB[count+0] = cj(b0[0]); |
1287 |
blockB[count+1] = cj(b0[1]); |
1288 |
if(nr==4) blockB[count+2] = cj(b0[2]); |
1289 |
if(nr==4) blockB[count+3] = cj(b0[3]); |
1290 |
count += nr; |
1291 |
} |
1285 |
} |
1292 |
} |
1286 |
// skip what we have after |
1293 |
// skip what we have after |
1287 |
if(PanelMode) count += nr * (stride-offset-depth); |
1294 |
if(PanelMode) count += nr * (stride-offset-depth); |
1288 |
} |
1295 |
} |
1289 |
// copy the remaining columns one at a time (nr==1) |
1296 |
// copy the remaining columns one at a time (nr==1) |
1290 |
for(Index j2=packet_cols; j2<cols; ++j2) |
1297 |
for(Index j2=packet_cols; j2<cols; ++j2) |
1291 |
{ |
1298 |
{ |
1292 |
if(PanelMode) count += offset; |
1299 |
if(PanelMode) count += offset; |