diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -76,20 +76,20 @@ private: SrcIsRowMajor = SrcFlags&RowMajorBit, StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)), MightVectorize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit) && bool(functor_traits::PacketAccess), MayInnerVectorize = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0 && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 - && int(JointAlignment)>=int(InnerRequiredAlignment), + && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)), MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess - && ((int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), + && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. */ MaySliceVectorize = bool(MightVectorize) && bool(DstHasDirectAccess) && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*InnerPacketSize) /* slice vectorization can be slow, so we only want it if the slices are big, which is indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block in a fixed-size matrix */ }; @@ -125,18 +125,19 @@ public: enum { Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal)) ? ( int(MayUnrollCompletely) ? int(CompleteUnrolling) : int(MayUnrollInner) ? int(InnerUnrolling) : int(NoUnrolling) ) : int(Traversal) == int(LinearVectorizedTraversal) - ? ( bool(MayUnrollCompletely) && (int(DstAlignment)>=int(LinearRequiredAlignment)) ? int(CompleteUnrolling) - : int(NoUnrolling) ) + ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment))) + ? int(CompleteUnrolling) + : int(NoUnrolling) ) : int(Traversal) == int(LinearTraversal) ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) ) : int(NoUnrolling) }; #ifdef EIGEN_DEBUG_ASSIGN static void debug() @@ -151,16 +152,17 @@ public: EIGEN_DEBUG_VAR(SrcAlignment) EIGEN_DEBUG_VAR(LinearRequiredAlignment) EIGEN_DEBUG_VAR(InnerRequiredAlignment) EIGEN_DEBUG_VAR(JointAlignment) EIGEN_DEBUG_VAR(InnerSize) EIGEN_DEBUG_VAR(InnerMaxSize) EIGEN_DEBUG_VAR(LinearPacketSize) EIGEN_DEBUG_VAR(InnerPacketSize) + EIGEN_DEBUG_VAR(ActualPacketSize) EIGEN_DEBUG_VAR(StorageOrdersAgree) EIGEN_DEBUG_VAR(MightVectorize) EIGEN_DEBUG_VAR(MayLinearize) EIGEN_DEBUG_VAR(MayInnerVectorize) EIGEN_DEBUG_VAR(MayLinearVectorize) EIGEN_DEBUG_VAR(MaySliceVectorize) std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; EIGEN_DEBUG_VAR(UnrollingLimit) @@ -251,44 +253,45 @@ struct copy_using_evaluator_innervec_Com // FIXME: this is not very clean, perhaps this information should be provided by the kernel? typedef typename Kernel::DstEvaluatorType DstEvaluatorType; typedef typename DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; enum { outer = Index / DstXprType::InnerSizeAtCompileTime, inner = Index % DstXprType::InnerSizeAtCompileTime, - JointAlignment = Kernel::AssignmentTraits::JointAlignment, - DefaultAlignment = unpacket_traits::alignment + SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, + DstAlignment = Kernel::AssignmentTraits::DstAlignment }; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { - kernel.template assignPacketByOuterInner(outer, inner); + kernel.template assignPacketByOuterInner(outer, inner); enum { NextIndex = Index + unpacket_traits::size }; copy_using_evaluator_innervec_CompleteUnrolling::run(kernel); } }; template struct copy_using_evaluator_innervec_CompleteUnrolling { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { } }; template struct copy_using_evaluator_innervec_InnerUnrolling { typedef typename Kernel::PacketType PacketType; enum { - DefaultAlignment = unpacket_traits::alignment + SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, + DstAlignment = Kernel::AssignmentTraits::DstAlignment }; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer) { - kernel.template assignPacketByOuterInner(outer, Index_); + kernel.template assignPacketByOuterInner(outer, Index_); enum { NextIndex = Index_ + unpacket_traits::size }; copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); } }; template struct copy_using_evaluator_innervec_InnerUnrolling { @@ -433,26 +436,27 @@ struct dense_assignment_loop struct dense_assignment_loop { typedef typename Kernel::PacketType PacketType; enum { - DefaultAlignment = unpacket_traits::alignment + SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, + DstAlignment = Kernel::AssignmentTraits::DstAlignment }; EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { const Index innerSize = kernel.innerSize(); const Index outerSize = kernel.outerSize(); const Index packetSize = unpacket_traits::size; for(Index outer = 0; outer < outerSize; ++outer) for(Index inner = 0; inner < innerSize; inner+=packetSize) - kernel.template assignPacketByOuterInner(outer, inner); + kernel.template assignPacketByOuterInner(outer, inner); } }; template struct dense_assignment_loop { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -22,17 +22,17 @@ private: typedef typename find_best_packet<_Scalar,size>::type PacketScalar; enum { row_major_bit = _Options&RowMajor ? RowMajorBit : 0, is_dynamic_size_storage = _MaxRows==Dynamic || _MaxCols==Dynamic, max_size = is_dynamic_size_storage ? Dynamic : _MaxRows*_MaxCols, default_alignment = compute_default_alignment<_Scalar,max_size>::value, actual_alignment = ((_Options&DontAlign)==0) ? default_alignment : 0, required_alignment = unpacket_traits::alignment, - packet_access_bit = packet_traits<_Scalar>::Vectorizable && (actual_alignment>=required_alignment) ? PacketAccessBit : 0 + packet_access_bit = (packet_traits<_Scalar>::Vectorizable && (EIGEN_UNALIGNED_VECTORIZE || (actual_alignment>=required_alignment))) ? PacketAccessBit : 0 }; public: typedef _Scalar Scalar; typedef Dense StorageKind; typedef Eigen::Index StorageIndex; typedef MatrixXpr XprKind; enum { diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -751,16 +751,21 @@ namespace Eigen { #endif #if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES #define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES #else #define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES #endif + +#ifndef EIGEN_UNALIGNED_VECTORIZE +#define EIGEN_UNALIGNED_VECTORIZE 1 +#endif + //---------------------------------------------------------------------- #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD #define EIGEN_RESTRICT #endif #ifndef EIGEN_RESTRICT #define EIGEN_RESTRICT __restrict diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -2,16 +2,24 @@ // for linear algebra. // // Copyright (C) 2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifdef EIGEN_TEST_PART_1 +#define EIGEN_UNALIGNED_VECTORIZE 1 +#endif + +#ifdef EIGEN_TEST_PART_2 +#define EIGEN_UNALIGNED_VECTORIZE 0 +#endif + #ifdef EIGEN_DEFAULT_TO_ROW_MAJOR #undef EIGEN_DEFAULT_TO_ROW_MAJOR #endif #define EIGEN_DEBUG_ASSIGN #include "main.h" #include using internal::demangle_flags; @@ -139,20 +147,25 @@ struct vectorization_logic InnerVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()), InnerVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix44(),Matrix44()+Matrix44(), InnerVectorizedTraversal,InnerUnrolling)); VERIFY(test_assign(Matrix44u(),Matrix44()+Matrix44(), - LinearTraversal,NoUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearTraversal, + EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling)); + + VERIFY(test_assign(Matrix1(),Matrix1()+Matrix1(), + (Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal, + CompleteUnrolling)); VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(), - LinearTraversal,CompleteUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal, CompleteUnrolling)); VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3), InnerVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix44r().row(2),Matrix44r().row(1)+Matrix44r().row(1), InnerVectorizedTraversal,CompleteUnrolling)); if(PacketSize>1) @@ -162,20 +175,27 @@ struct vectorization_logic LinearTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), LinearTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()), LinearVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix(),Matrix()+Matrix(), - HalfPacketSize==1 ? InnerVectorizedTraversal : LinearTraversal,NoUnrolling)); + HalfPacketSize==1 ? InnerVectorizedTraversal : + EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : + LinearTraversal, + NoUnrolling)); + + VERIFY(test_assign(Matrix11(), Matrix11()+Matrix11(),InnerVectorizedTraversal,CompleteUnrolling)); + VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), - DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)); + (EIGEN_UNALIGNED_VECTORIZE) ? InnerVectorizedTraversal : DefaultTraversal, + (EIGEN_UNALIGNED_VECTORIZE || PacketSize<=4) ? CompleteUnrolling : InnerUnrolling )); VERIFY(test_assign(Vector1(),Matrix11()*Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()), InnerVectorizedTraversal,InnerUnrolling+CompleteUnrolling)); } @@ -282,34 +302,36 @@ struct vectorization_logic_half InnerVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()), InnerVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix57(),Matrix57()+Matrix57(), InnerVectorizedTraversal,InnerUnrolling)); VERIFY(test_assign(Matrix57u(),Matrix57()+Matrix57(), - LinearTraversal,NoUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearTraversal, + EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling)); VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(), - LinearTraversal,CompleteUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling)); if(PacketSize>1) { typedef Matrix Matrix33c; VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1), LinearTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), LinearTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()), PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix(),Matrix()+Matrix(), - LinearTraversal,NoUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? (PacketSize==1 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal, + NoUnrolling)); VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)); VERIFY(test_assign(Vector1(),Matrix11()*Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Matrix11(),Matrix11().lazyProduct(Matrix11()), @@ -362,25 +384,25 @@ void test_vectorization_logic() CALL_SUBTEST( vectorization_logic_half::run() ); CALL_SUBTEST( vectorization_logic_half::run() ); CALL_SUBTEST( vectorization_logic_half >::run() ); CALL_SUBTEST( vectorization_logic_half >::run() ); if(internal::packet_traits::Vectorizable) { VERIFY(test_assign(Matrix(),Matrix()+Matrix(), - LinearTraversal,CompleteUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling)); VERIFY(test_redux(Matrix(), - DefaultTraversal,CompleteUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : DefaultTraversal,CompleteUnrolling)); } if(internal::packet_traits::Vectorizable) { VERIFY(test_assign(Matrix(),Matrix()+Matrix(), - LinearTraversal,CompleteUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling)); VERIFY(test_redux(Matrix(), - DefaultTraversal,CompleteUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : DefaultTraversal,CompleteUnrolling)); } #endif // EIGEN_VECTORIZE }