Lines 1740-1763
EIGEN_DONT_INLINE void gemm_pack_rhs<Sca
Link Here
|
1740 |
const Scalar* b3 = &rhs[(j2+3)*rhsStride]; |
1740 |
const Scalar* b3 = &rhs[(j2+3)*rhsStride]; |
1741 |
|
1741 |
|
1742 |
Index k=0; |
1742 |
Index k=0; |
1743 |
if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
1743 |
if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
1744 |
{ |
1744 |
{ |
1745 |
for(; k<peeled_k; k+=PacketSize) { |
1745 |
for(; k<peeled_k; k+=PacketSize) { |
1746 |
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel; |
1746 |
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel; |
1747 |
kernel.packet[0] = ploadu<Packet>(&b0[k]); |
1747 |
kernel.packet[0] = ploadu<Packet>(&b0[k]); |
1748 |
kernel.packet[1] = ploadu<Packet>(&b1[k]); |
1748 |
kernel.packet[1%PacketSize] = ploadu<Packet>(&b1[k]); |
1749 |
kernel.packet[2] = ploadu<Packet>(&b2[k]); |
1749 |
kernel.packet[2%PacketSize] = ploadu<Packet>(&b2[k]); |
1750 |
kernel.packet[3] = ploadu<Packet>(&b3[k]); |
1750 |
kernel.packet[3%PacketSize] = ploadu<Packet>(&b3[k]); |
1751 |
ptranspose(kernel); |
1751 |
ptranspose(kernel); |
1752 |
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); |
1752 |
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); |
1753 |
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1])); |
1753 |
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); |
1754 |
pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2])); |
1754 |
pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize])); |
1755 |
pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3])); |
1755 |
pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize])); |
1756 |
count+=4*PacketSize; |
1756 |
count+=4*PacketSize; |
1757 |
} |
1757 |
} |
1758 |
} |
1758 |
} |
1759 |
for(; k<depth; k++) |
1759 |
for(; k<depth; k++) |
1760 |
{ |
1760 |
{ |
1761 |
blockB[count+0] = cj(b0[k]); |
1761 |
blockB[count+0] = cj(b0[k]); |
1762 |
blockB[count+1] = cj(b1[k]); |
1762 |
blockB[count+1] = cj(b1[k]); |
1763 |
blockB[count+2] = cj(b2[k]); |
1763 |
blockB[count+2] = cj(b2[k]); |