This bugzilla service is closed. All entries have been migrated to https://gitlab.com/libeigen/eigen
View | Details | Raw Unified | Return to bug 945
Collapse All | Expand All

(-)a/Eigen/src/Core/products/GeneralBlockPanelKernel.h (-6 / +6 lines)
Lines 1740-1763 EIGEN_DONT_INLINE void gemm_pack_rhs<Sca Link Here
1740
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
1740
      const Scalar* b3 = &rhs[(j2+3)*rhsStride];
1741
      
1741
      
1742
      Index k=0;
1742
      Index k=0;
1743
      if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
1743
      if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
1744
      {
1744
      {
1745
        for(; k<peeled_k; k+=PacketSize) {
1745
        for(; k<peeled_k; k+=PacketSize) {
1746
          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
1746
          PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
1747
          kernel.packet[0] = ploadu<Packet>(&b0[k]);
1747
          kernel.packet[0] = ploadu<Packet>(&b0[k]);
1748
          kernel.packet[1] = ploadu<Packet>(&b1[k]);
1748
          kernel.packet[1%PacketSize] = ploadu<Packet>(&b1[k]);
1749
          kernel.packet[2] = ploadu<Packet>(&b2[k]);
1749
          kernel.packet[2%PacketSize] = ploadu<Packet>(&b2[k]);
1750
          kernel.packet[3] = ploadu<Packet>(&b3[k]);
1750
          kernel.packet[3%PacketSize] = ploadu<Packet>(&b3[k]);
1751
          ptranspose(kernel);
1751
          ptranspose(kernel);
1752
          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
1752
          pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
1753
          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1]));
1753
          pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
1754
          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2]));
1754
          pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
1755
          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3]));
1755
          pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
1756
          count+=4*PacketSize;
1756
          count+=4*PacketSize;
1757
        }
1757
        }
1758
      }
1758
      }
1759
      for(; k<depth; k++)
1759
      for(; k<depth; k++)
1760
      {
1760
      {
1761
        blockB[count+0] = cj(b0[k]);
1761
        blockB[count+0] = cj(b0[k]);
1762
        blockB[count+1] = cj(b1[k]);
1762
        blockB[count+1] = cj(b1[k]);
1763
        blockB[count+2] = cj(b2[k]);
1763
        blockB[count+2] = cj(b2[k]);

Return to bug 945