This bugzilla service is closed. All entries have been migrated to https://gitlab.com/libeigen/eigen
View | Details | Raw Unified | Return to bug 724
Collapse All | Expand All

(-)a/Eigen/src/Core/GenericPacketMath.h (-1 / +14 lines)
Lines 152-167 pandnot(const Packet& a, const Packet& b Link Here
152
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
152
/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
153
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
153
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
154
pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
154
pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
155
155
156
/** \internal \returns a packet version of \a *from, (un-aligned load) */
156
/** \internal \returns a packet version of \a *from, (un-aligned load) */
157
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
157
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
158
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
158
ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
159
159
160
/** \internal \returns a packet version of \a *from */
161
template<typename Packet, bool aligned> inline Packet
162
pload(const typename unpacket_traits<Packet>::type* from) {
163
  if (aligned) return pload<Packet>(from);
164
  else return ploadu<Packet>(from);
165
}
166
160
/** \internal \returns a packet with elements of \a *from duplicated.
167
/** \internal \returns a packet with elements of \a *from duplicated.
161
  * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
168
  * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
162
  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
169
  * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
163
  * Currently, this function is only used for scalar * complex products.
170
  * Currently, this function is only used for scalar * complex products.
164
 */
171
 */
165
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
172
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
166
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
173
ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
167
174
Lines 176-191 plset(const Scalar& a) { return a; } Link Here
176
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
183
/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
177
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
184
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from)
178
{ (*to) = from; }
185
{ (*to) = from; }
179
186
180
/** \internal copy the packet \a from to \a *to, (un-aligned store) */
187
/** \internal copy the packet \a from to \a *to, (un-aligned store) */
181
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
188
template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
182
{ (*to) = from; }
189
{ (*to) = from; }
183
190
191
/** \internal copy the packet \a from to \a *to */
192
template<typename Scalar, typename Packet, bool aligned> inline void
193
pstore(Scalar* to, const Packet& from) {
194
  if (aligned) pstore<Scalar, Packet>(to, from);
195
  else pstoreu<Scalar, Packet>(to, from);
196
}
197
184
/** \internal tries to do cache prefetching of \a addr */
198
/** \internal tries to do cache prefetching of \a addr */
185
template<typename Scalar> inline void prefetch(const Scalar* addr)
199
template<typename Scalar> inline void prefetch(const Scalar* addr)
186
{
200
{
187
#if !defined(_MSC_VER)
201
#if !defined(_MSC_VER)
188
__builtin_prefetch(addr);
202
__builtin_prefetch(addr);
189
#endif
203
#endif
190
}
204
}
191
205
Lines 348-356 template<> inline std::complex<double> p Link Here
348
362
349
#endif
363
#endif
350
364
351
} // end namespace internal
365
} // end namespace internal
352
366
353
} // end namespace Eigen
367
} // end namespace Eigen
354
368
355
#endif // EIGEN_GENERIC_PACKET_MATH_H
369
#endif // EIGEN_GENERIC_PACKET_MATH_H
356
(-)a/Eigen/src/Core/products/GeneralBlockPanelKernel.h (-35 / +35 lines)
Lines 505-521 protected: Link Here
505
505
506
/* optimized GEneral packed Block * packed Panel product kernel
506
/* optimized GEneral packed Block * packed Panel product kernel
507
 *
507
 *
508
 * Mixing type logic: C += A * B
508
 * Mixing type logic: C += A * B
509
 *  |  A  |  B  | comments
509
 *  |  A  |  B  | comments
510
 *  |real |cplx | no vectorization yet, would require to pack A with duplication
510
 *  |real |cplx | no vectorization yet, would require to pack A with duplication
511
 *  |cplx |real | easy vectorization
511
 *  |cplx |real | easy vectorization
512
 */
512
 */
513
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
513
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs, bool alignedLoadStores>
514
struct gebp_kernel
514
struct gebp_kernel
515
{
515
{
516
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
516
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
517
  typedef typename Traits::ResScalar ResScalar;
517
  typedef typename Traits::ResScalar ResScalar;
518
  typedef typename Traits::LhsPacket LhsPacket;
518
  typedef typename Traits::LhsPacket LhsPacket;
519
  typedef typename Traits::RhsPacket RhsPacket;
519
  typedef typename Traits::RhsPacket RhsPacket;
520
  typedef typename Traits::ResPacket ResPacket;
520
  typedef typename Traits::ResPacket ResPacket;
521
  typedef typename Traits::AccPacket AccPacket;
521
  typedef typename Traits::AccPacket AccPacket;
Lines 527-545 struct gebp_kernel Link Here
527
    ResPacketSize = Traits::ResPacketSize
527
    ResPacketSize = Traits::ResPacketSize
528
  };
528
  };
529
529
530
  EIGEN_DONT_INLINE
530
  EIGEN_DONT_INLINE
531
  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
531
  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
532
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
532
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
533
};
533
};
534
534
535
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
535
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs, bool alignedLoadStores>
536
EIGEN_DONT_INLINE
536
EIGEN_DONT_INLINE
537
void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
537
void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs,alignedLoadStores>
538
  ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
538
  ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
539
               Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
539
               Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
540
  {
540
  {
541
    Traits traits;
541
    Traits traits;
542
    
542
    
543
    if(strideA==-1) strideA = depth;
543
    if(strideA==-1) strideA = depth;
544
    if(strideB==-1) strideB = depth;
544
    if(strideB==-1) strideB = depth;
545
    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
545
    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
Lines 752-811 EIGEN_ASM_COMMENT("mybegin4"); Link Here
752
          blA += mr;
752
          blA += mr;
753
        }
753
        }
754
754
755
        if(nr==4)
755
        if(nr==4)
756
        {
756
        {
757
          ResPacket R0, R1, R2, R3, R4, R5, R6;
757
          ResPacket R0, R1, R2, R3, R4, R5, R6;
758
          ResPacket alphav = pset1<ResPacket>(alpha);
758
          ResPacket alphav = pset1<ResPacket>(alpha);
759
759
760
          R0 = ploadu<ResPacket>(r0);
760
          R0 = pload<ResPacket, alignedLoadStores>(r0);
761
          R1 = ploadu<ResPacket>(r1);
761
          R1 = pload<ResPacket, alignedLoadStores>(r1);
762
          R2 = ploadu<ResPacket>(r2);
762
          R2 = pload<ResPacket, alignedLoadStores>(r2);
763
          R3 = ploadu<ResPacket>(r3);
763
          R3 = pload<ResPacket, alignedLoadStores>(r3);
764
          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
764
          R4 = pload<ResPacket, alignedLoadStores>(r0 + ResPacketSize);
765
          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
765
          R5 = pload<ResPacket, alignedLoadStores>(r1 + ResPacketSize);
766
          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
766
          R6 = pload<ResPacket, alignedLoadStores>(r2 + ResPacketSize);
767
          traits.acc(C0, alphav, R0);
767
          traits.acc(C0, alphav, R0);
768
          pstoreu(r0, R0);
768
          pstore<ResScalar, ResPacket, alignedLoadStores>(r0, R0);
769
          R0 = ploadu<ResPacket>(r3 + ResPacketSize);
769
          R0 = pload<ResPacket, alignedLoadStores>(r3 + ResPacketSize);
770
770
771
          traits.acc(C1, alphav, R1);
771
          traits.acc(C1, alphav, R1);
772
          traits.acc(C2, alphav, R2);
772
          traits.acc(C2, alphav, R2);
773
          traits.acc(C3, alphav, R3);
773
          traits.acc(C3, alphav, R3);
774
          traits.acc(C4, alphav, R4);
774
          traits.acc(C4, alphav, R4);
775
          traits.acc(C5, alphav, R5);
775
          traits.acc(C5, alphav, R5);
776
          traits.acc(C6, alphav, R6);
776
          traits.acc(C6, alphav, R6);
777
          traits.acc(C7, alphav, R0);
777
          traits.acc(C7, alphav, R0);
778
          
778
          
779
          pstoreu(r1, R1);
779
          pstore<ResScalar, ResPacket, alignedLoadStores>(r1, R1);
780
          pstoreu(r2, R2);
780
          pstore<ResScalar, ResPacket, alignedLoadStores>(r2, R2);
781
          pstoreu(r3, R3);
781
          pstore<ResScalar, ResPacket, alignedLoadStores>(r3, R3);
782
          pstoreu(r0 + ResPacketSize, R4);
782
          pstore<ResScalar, ResPacket, alignedLoadStores>(r0 + ResPacketSize, R4);
783
          pstoreu(r1 + ResPacketSize, R5);
783
          pstore<ResScalar, ResPacket, alignedLoadStores>(r1 + ResPacketSize, R5);
784
          pstoreu(r2 + ResPacketSize, R6);
784
          pstore<ResScalar, ResPacket, alignedLoadStores>(r2 + ResPacketSize, R6);
785
          pstoreu(r3 + ResPacketSize, R0);
785
          pstore<ResScalar, ResPacket, alignedLoadStores>(r3 + ResPacketSize, R0);
786
        }
786
        }
787
        else
787
        else
788
        {
788
        {
789
          ResPacket R0, R1, R4;
789
          ResPacket R0, R1, R4;
790
          ResPacket alphav = pset1<ResPacket>(alpha);
790
          ResPacket alphav = pset1<ResPacket>(alpha);
791
791
792
          R0 = ploadu<ResPacket>(r0);
792
          R0 = pload<ResPacket, alignedLoadStores>(r0);
793
          R1 = ploadu<ResPacket>(r1);
793
          R1 = pload<ResPacket, alignedLoadStores>(r1);
794
          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
794
          R4 = pload<ResPacket, alignedLoadStores>(r0 + ResPacketSize);
795
          traits.acc(C0, alphav, R0);
795
          traits.acc(C0, alphav, R0);
796
          pstoreu(r0, R0);
796
          pstore<ResScalar, ResPacket, alignedLoadStores>(r0, R0);
797
          R0 = ploadu<ResPacket>(r1 + ResPacketSize);
797
          R0 = pload<ResPacket, alignedLoadStores>(r1 + ResPacketSize);
798
          traits.acc(C1, alphav, R1);
798
          traits.acc(C1, alphav, R1);
799
          traits.acc(C4, alphav, R4);
799
          traits.acc(C4, alphav, R4);
800
          traits.acc(C5, alphav, R0);
800
          traits.acc(C5, alphav, R0);
801
          pstoreu(r1, R1);
801
          pstore<ResScalar, ResPacket, alignedLoadStores>(r1, R1);
802
          pstoreu(r0 + ResPacketSize, R4);
802
          pstore<ResScalar, ResPacket, alignedLoadStores>(r0 + ResPacketSize, R4);
803
          pstoreu(r1 + ResPacketSize, R0);
803
          pstore<ResScalar, ResPacket, alignedLoadStores>(r1 + ResPacketSize, R0);
804
        }
804
        }
805
        
805
        
806
      }
806
      }
807
      
807
      
808
      if(rows-peeled_mc>=LhsProgress)
808
      if(rows-peeled_mc>=LhsProgress)
809
      {
809
      {
810
        Index i = peeled_mc;
810
        Index i = peeled_mc;
811
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
811
        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
Lines 935-964 EIGEN_ASM_COMMENT("mybegin4"); Link Here
935
        ResPacket R0, R1, R2, R3;
935
        ResPacket R0, R1, R2, R3;
936
        ResPacket alphav = pset1<ResPacket>(alpha);
936
        ResPacket alphav = pset1<ResPacket>(alpha);
937
937
938
        ResScalar* r0 = &res[(j2+0)*resStride + i];
938
        ResScalar* r0 = &res[(j2+0)*resStride + i];
939
        ResScalar* r1 = r0 + resStride;
939
        ResScalar* r1 = r0 + resStride;
940
        ResScalar* r2 = r1 + resStride;
940
        ResScalar* r2 = r1 + resStride;
941
        ResScalar* r3 = r2 + resStride;
941
        ResScalar* r3 = r2 + resStride;
942
942
943
                  R0 = ploadu<ResPacket>(r0);
943
                  R0 = pload<ResPacket, alignedLoadStores>(r0);
944
                  R1 = ploadu<ResPacket>(r1);
944
                  R1 = pload<ResPacket, alignedLoadStores>(r1);
945
        if(nr==4) R2 = ploadu<ResPacket>(r2);
945
        if(nr==4) R2 = pload<ResPacket, alignedLoadStores>(r2);
946
        if(nr==4) R3 = ploadu<ResPacket>(r3);
946
        if(nr==4) R3 = pload<ResPacket, alignedLoadStores>(r3);
947
947
948
                  traits.acc(C0, alphav, R0);
948
                  traits.acc(C0, alphav, R0);
949
                  traits.acc(C1, alphav, R1);
949
                  traits.acc(C1, alphav, R1);
950
        if(nr==4) traits.acc(C2, alphav, R2);
950
        if(nr==4) traits.acc(C2, alphav, R2);
951
        if(nr==4) traits.acc(C3, alphav, R3);
951
        if(nr==4) traits.acc(C3, alphav, R3);
952
952
953
                  pstoreu(r0, R0);
953
                  pstore<ResScalar, ResPacket, alignedLoadStores>(r0, R0);
954
                  pstoreu(r1, R1);
954
                  pstore<ResScalar, ResPacket, alignedLoadStores>(r1, R1);
955
        if(nr==4) pstoreu(r2, R2);
955
        if(nr==4) pstore<ResScalar, ResPacket, alignedLoadStores>(r2, R2);
956
        if(nr==4) pstoreu(r3, R3);
956
        if(nr==4) pstore<ResScalar, ResPacket, alignedLoadStores>(r3, R3);
957
      }
957
      }
958
      for(Index i=peeled_mc2; i<rows; i++)
958
      for(Index i=peeled_mc2; i<rows; i++)
959
      {
959
      {
960
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
960
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
961
        prefetch(&blA[0]);
961
        prefetch(&blA[0]);
962
962
963
        // gets a 1 x nr res block as registers
963
        // gets a 1 x nr res block as registers
964
        ResScalar C0(0), C1(0), C2(0), C3(0);
964
        ResScalar C0(0), C1(0), C2(0), C3(0);
(-)a/Eigen/src/Core/products/GeneralMatrixMatrix.h (-3 / +35 lines)
Lines 39-59 struct general_matrix_matrix_product<Ind Link Here
39
      LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
39
      LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
40
      ColMajor>
40
      ColMajor>
41
    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
41
    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
42
  }
42
  }
43
};
43
};
44
44
45
/*  Specialization for a col-major destination matrix
45
/*  Specialization for a col-major destination matrix
46
 *    => Blocking algorithm following Goto's paper */
46
 *    => Blocking algorithm following Goto's paper */
47
/*  Handling of col-major destination matrix
48
 *    => Blocking algorithm following Goto's paper */
47
template<
49
template<
48
  typename Index,
50
  typename Index,
49
  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
51
  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
50
  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
52
  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
51
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
53
  bool ResMatrixAligned>
54
struct general_matrix_matrix_product_internal
52
{
55
{
53
56
54
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
57
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
55
static void run(Index rows, Index cols, Index depth,
58
static void run(Index rows, Index cols, Index depth,
56
  const LhsScalar* _lhs, Index lhsStride,
59
  const LhsScalar* _lhs, Index lhsStride,
57
  const RhsScalar* _rhs, Index rhsStride,
60
  const RhsScalar* _rhs, Index rhsStride,
58
  ResScalar* res, Index resStride,
61
  ResScalar* res, Index resStride,
59
  ResScalar alpha,
62
  ResScalar alpha,
Lines 66-82 static void run(Index rows, Index cols, Link Here
66
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
69
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
67
70
68
  Index kc = blocking.kc();                   // cache block size along the K direction
71
  Index kc = blocking.kc();                   // cache block size along the K direction
69
  Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
72
  Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
70
  //Index nc = blocking.nc(); // cache block size along the N direction
73
  //Index nc = blocking.nc(); // cache block size along the N direction
71
74
72
  gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
75
  gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
73
  gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
76
  gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
74
  gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
77
  gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResMatrixAligned> gebp;
75
78
76
#ifdef EIGEN_HAS_OPENMP
79
#ifdef EIGEN_HAS_OPENMP
77
  if(info)
80
  if(info)
78
  {
81
  {
79
    // this is the parallel version!
82
    // this is the parallel version!
80
    Index tid = omp_get_thread_num();
83
    Index tid = omp_get_thread_num();
81
    Index threads = omp_get_num_threads();
84
    Index threads = omp_get_num_threads();
82
    
85
    
Lines 185-200 static void run(Index rows, Index cols, Link Here
185
        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
188
        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
186
      }
189
      }
187
    }
190
    }
188
  }
191
  }
189
}
192
}
190
193
191
};
194
};
192
195
196
template<
197
  typename Index,
198
  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
199
  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
200
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
201
{
202
typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
203
static void run(Index rows, Index cols, Index depth,
204
  const LhsScalar* _lhs, Index lhsStride,
205
  const RhsScalar* _rhs, Index rhsStride,
206
  ResScalar* res, Index resStride,
207
  ResScalar alpha,
208
  level3_blocking<LhsScalar,RhsScalar>& blocking,
209
  GemmParallelInfo<Index>* info = 0)
210
{
211
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
212
  typedef packet_traits<ResScalar> ResTraits;
213
214
  if (((Traits::mr % ResTraits::size) == 0) &&
215
      (((size_t)res) % (ResTraits::size*sizeof(ResScalar)) == 0) &&
216
      ((resStride % ResTraits::size) == 0)) {
217
    // The kernel will use aligned loads and stores to update the result matrix for all the micro panels.
218
    general_matrix_matrix_product_internal<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar, RhsStorageOrder, ConjugateRhs, true>::run(rows, cols, depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking, info);
219
  } else {
220
    general_matrix_matrix_product_internal<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar, RhsStorageOrder, ConjugateRhs, false>::run(rows, cols, depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking, info);
221
  }
222
}
223
};
224
193
/*********************************************************************************
225
/*********************************************************************************
194
*  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
226
*  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
195
*  implementation of the high level wrapper to general_matrix_matrix_product
227
*  implementation of the high level wrapper to general_matrix_matrix_product
196
**********************************************************************************/
228
**********************************************************************************/
197
229
198
template<typename Lhs, typename Rhs>
230
template<typename Lhs, typename Rhs>
199
struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
231
struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
200
 : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
232
 : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
(-)a/Eigen/src/Core/util/BlasUtil.h (-1 / +1 lines)
Lines 13-29 Link Here
13
// This file contains many lightweight helper classes used to
13
// This file contains many lightweight helper classes used to
14
// implement and control fast level 2 and level 3 BLAS-like routines.
14
// implement and control fast level 2 and level 3 BLAS-like routines.
15
15
16
namespace Eigen {
16
namespace Eigen {
17
17
18
namespace internal {
18
namespace internal {
19
19
20
// forward declarations
20
// forward declarations
21
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
21
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false, bool alignedLoadStores = false>
22
struct gebp_kernel;
22
struct gebp_kernel;
23
23
24
template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
24
template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
25
struct gemm_pack_rhs;
25
struct gemm_pack_rhs;
26
26
27
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
27
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
28
struct gemm_pack_lhs;
28
struct gemm_pack_lhs;
29
29

Return to bug 724