diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -642,21 +642,25 @@ struct evaluator::size) * sizeof(Scalar), + KeepsPacketAccess = bool(HasNoInnerStride) && ( bool(IsDynamicSize) || HasNoOuterStride || ( OuterStrideAtCompileTime!=Dynamic - && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime)%EIGEN_ALIGN_BYTES)==0 ) ), + && ((static_cast(sizeof(Scalar))*OuterStrideAtCompileTime) % AlignBytes)==0 ) ), Flags0 = evaluator::Flags, Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit), Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime)) ? int(Flags1) : int(Flags1 & ~LinearAccessBit), Flags = KeepsPacketAccess ? int(Flags2) : (int(Flags2) & ~PacketAccessBit) }; EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map) @@ -712,17 +716,20 @@ struct evaluator::ret), OuterStrideAtCompileTime = HasSameStorageOrderAsArgType ? int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret), MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits::size) == 0) && (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, - MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) ? AlignedBit : 0, + // TODO: should check for smaller packet types once we can handle multi-sized packet types + AlignBytes = int(packet_traits::size) * sizeof(Scalar), + + MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % AlignBytes) == 0)) ? AlignedBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? 
LinearAccessBit : 0, FlagsRowMajorBit = XprType::Flags&RowMajorBit, Flags0 = evaluator::Flags & ( (HereditaryBits & ~RowMajorBit) | DirectAccessBit | MaskPacketAccessBit | MaskAlignedBit), Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit }; @@ -820,22 +827,25 @@ protected: // all action is via the data() as returned by the Block expression. template struct block_evaluator : mapbase_evaluator, typename Block::PlainObject> { typedef Block XprType; + typedef typename XprType::Scalar Scalar; EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) : mapbase_evaluator(block) { + // TODO: should check for smaller packet types once we can handle multi-sized packet types + const int AlignBytes = int(packet_traits::size) * sizeof(Scalar); // FIXME this should be an internal assertion - eigen_assert(EIGEN_IMPLIES(evaluator::Flags&AlignedBit, (size_t(block.data()) % EIGEN_ALIGN_BYTES) == 0) && "data is not aligned"); + eigen_assert(EIGEN_IMPLIES(evaluator::Flags&AlignedBit, (size_t(block.data()) % AlignBytes) == 0) && "data is not aligned"); } }; // -------------------- Select -------------------- // TODO shall we introduce a ternary_evaluator? 
// TODO enable vectorization for Select diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -29,24 +29,46 @@ EIGEN_DEVICE_FUNC void check_static_allocation_size() { // if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit #if EIGEN_STACK_ALLOCATION_LIMIT EIGEN_STATIC_ASSERT(Size * sizeof(T) <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG); #endif } +template::type, + bool Match = bool((Size%unpacket_traits::size)==0), + bool TryHalf = bool(unpacket_traits::size > Size) + && bool(unpacket_traits::size > unpacket_traits::half>::size) > +struct compute_default_alignment +{ + enum { value = 0 }; +}; + +template +struct compute_default_alignment // Match +{ + enum { value = sizeof(T) * unpacket_traits::size }; +}; + +template +struct compute_default_alignment +{ + // current packet too large, try with a half-packet + enum { value = compute_default_alignment::half>::value }; +}; + /** \internal * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned: * to 16 bytes boundary if the total size is a multiple of 16 bytes. */ template + : compute_default_alignment::value > +// : (((Size*sizeof(T))%EIGEN_ALIGN_BYTES)==0) ? EIGEN_ALIGN_BYTES : 0 > struct plain_array { T array[Size]; EIGEN_DEVICE_FUNC plain_array() { check_static_allocation_size(); @@ -76,24 +98,81 @@ struct plain_array #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \ eigen_assert((reinterpret_cast(array) & (sizemask)) == 0 \ && "this assertion is explained here: " \ "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \ " **** READ THIS WEB PAGE !!! 
****"); #endif template -struct plain_array +struct plain_array { - EIGEN_USER_ALIGN_DEFAULT T array[Size]; + EIGEN_ALIGN_TO_BOUNDARY(8) T array[Size]; EIGEN_DEVICE_FUNC plain_array() { - EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(EIGEN_ALIGN_BYTES-1); + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(7); + check_static_allocation_size(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size(); + } +}; + +template +struct plain_array +{ + EIGEN_ALIGN_TO_BOUNDARY(16) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(15); + check_static_allocation_size(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size(); + } +}; + +template +struct plain_array +{ + EIGEN_ALIGN_TO_BOUNDARY(32) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(31); + check_static_allocation_size(); + } + + EIGEN_DEVICE_FUNC + plain_array(constructor_without_unaligned_array_assert) + { + check_static_allocation_size(); + } +}; + +template +struct plain_array +{ + EIGEN_ALIGN_TO_BOUNDARY(64) T array[Size]; + + EIGEN_DEVICE_FUNC + plain_array() + { + EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(63); check_static_allocation_size(); } EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) { check_static_allocation_size(); } diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -313,16 +313,19 @@ #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 #else #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 #endif // Defined the boundary (in bytes) on which the data needs to be aligned. Note // that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be // aligned at all regardless of the value of this #define. 
+// TODO should be renamed EIGEN_MAXIMAL_ALIGN_BYTES, +// for instance with AVX 1 EIGEN_MAXIMAL_ALIGN_BYTES=32 while for 'int' 16 bytes alignment is always enough, +// and 16 bytes alignment is also enough for Vector4f. #define EIGEN_ALIGN_BYTES 16 #ifdef EIGEN_DONT_ALIGN #ifndef EIGEN_DONT_ALIGN_STATICALLY #define EIGEN_DONT_ALIGN_STATICALLY #endif #define EIGEN_ALIGN 0 #elif !defined(EIGEN_DONT_VECTORIZE) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -154,23 +154,26 @@ class compute_matrix_flags }; template class compute_matrix_evaluator_flags { enum { row_major_bit = Options&RowMajor ? RowMajorBit : 0, is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic, + + // TODO: should check for smaller packet types once we can handle multi-sized packet types + align_bytes = int(packet_traits::size) * sizeof(Scalar), aligned_bit = ( ((Options&DontAlign)==0) && ( #if EIGEN_ALIGN_STATICALLY - ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % EIGEN_ALIGN_BYTES) == 0)) + ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % align_bytes) == 0)) #else 0 #endif || #if EIGEN_ALIGN is_dynamic_size_storage diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp --- a/test/unalignedassert.cpp +++ b/test/unalignedassert.cpp @@ -76,17 +76,17 @@ void construct_at_boundary(int boundary) T *x = ::new(reinterpret_cast(_buf)) T; x[0].setZero(); // just in order to silence warnings x->~T(); } #endif void unalignedassert() { - #if EIGEN_ALIGN_STATICALLY +#if EIGEN_ALIGN_STATICALLY construct_at_boundary(4); construct_at_boundary(4); construct_at_boundary(16); construct_at_boundary(16); construct_at_boundary(4); construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(16); @@ -95,33 +95,34 @@ void unalignedassert() construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(4); 
construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(16); construct_at_boundary(4); construct_at_boundary(EIGEN_ALIGN_BYTES); construct_at_boundary(16); - #endif +#endif check_unalignedassert_good(); check_unalignedassert_good(); check_unalignedassert_good(); check_unalignedassert_good(); check_unalignedassert_good(); check_unalignedassert_good(); check_unalignedassert_good >(); #if EIGEN_ALIGN_STATICALLY - if(EIGEN_ALIGN_BYTES==16) + if(EIGEN_ALIGN_BYTES>=16) { VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); VERIFY_RAISES_ASSERT(construct_at_boundary(8)); + VERIFY_RAISES_ASSERT(construct_at_boundary(8)); } for(int b=8; b(b)); VERIFY_RAISES_ASSERT(construct_at_boundary(b)); VERIFY_RAISES_ASSERT(construct_at_boundary(b)); VERIFY_RAISES_ASSERT(construct_at_boundary(b)); VERIFY_RAISES_ASSERT(construct_at_boundary(b)); diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -209,17 +209,17 @@ template(InnerVectorizedTraversal,CompleteUnrolling))); VERIFY((test_assign< Map, Aligned, InnerStride<3*PacketSize> >, Matrix >(DefaultTraversal,CompleteUnrolling))); VERIFY((test_assign(Matrix11(), Matrix()*Matrix(), - PacketSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD?DefaultTraversal:InnerVectorizedTraversal, CompleteUnrolling))); + InnerVectorizedTraversal, CompleteUnrolling))); #endif VERIFY(test_assign(MatrixXX(10,10),MatrixXX(20,20).block(10,10,2,3), SliceVectorizedTraversal,NoUnrolling)); VERIFY(test_redux(VectorX(10), LinearVectorizedTraversal,NoUnrolling));