Bugzilla – Attachment 729 Details for
Bug 1294
RowMajor Matrix-Vector mul x10 slower than ColMajor
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Requests
|
Help
|
Log In
[x]
|
Forgot Password
Login:
[x]
This bugzilla service is closed. All entries have been migrated to
https://gitlab.com/libeigen/eigen
Code for benchmark
ConsoleApplication4.cpp (text/plain), 7.86 KB, created by
Nikolai
on 2016-09-12 13:31:50 UTC
(
hide
)
Description:
Code for benchmark
Filename:
MIME Type:
Creator:
Nikolai
Created:
2016-09-12 13:31:50 UTC
Size:
7.86 KB
patch
obsolete
>#include "stdafx.h" > >#include <vector> >#include <iostream> >#include <chrono> > >#include <Eigen/Eigen> >#include <Eigen/StdVector> > >using namespace std; >using namespace chrono; >using namespace Eigen; > >using HRC = high_resolution_clock; > >static const __m128 MM_ONES = _mm_set1_ps(1.0f); > >void multhom(Matrix<float, 4, 4, RowMajor> const& m, Vector3f* src, size_t num) { > const float* matrix = m.data(); > const __m128 r0 = _mm_loadu_ps(matrix + 0); > const __m128 r1 = _mm_loadu_ps(matrix + 4); > const __m128 r2 = _mm_loadu_ps(matrix + 8); > > // unrolled loop > size_t i = 0; > size_t N = (num / 2) * 2; > for (; i < N; i += 2) { > // Load [X0,Y0,Z0,1] into xyz1 > // Load [X1,Y1,Z1,1] into uvw1 > __m128 xyzu = _mm_loadu_ps(&src[i].x()); // X0 Y0 Z0 X1 > __m128 vw11 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)&src[i + 1].y()); // Y1 Z1 1 1 > vw11 = _mm_loadh_pi(vw11, (const __m64*)&MM_ONES); > // convert xyzuvw11 to xyz1 and uvw1 > __m128 zu1v = _mm_shuffle_ps(xyzu, vw11, _MM_SHUFFLE(0, 2, 3, 2)); // Z0 X1 1 Y1 > __m128 xyz1 = _mm_shuffle_ps(xyzu, zu1v, _MM_SHUFFLE(2, 0, 1, 0)); // X0 Y0 Z0 1 > __m128 uvw1 = _mm_shuffle_ps(zu1v, vw11, _MM_SHUFFLE(3, 1, 3, 1)); // X1 Y1 Z1 1 > > // Perform matrix multiplication > __m128 x = _mm_mul_ps(xyz1, r0); > __m128 y = _mm_mul_ps(xyz1, r1); > __m128 z = _mm_mul_ps(xyz1, r2); > > __m128 u = _mm_mul_ps(uvw1, r0); > __m128 v = _mm_mul_ps(uvw1, r1); > __m128 w = _mm_mul_ps(uvw1, r2); > > __m128 xy = _mm_hadd_ps(x, y); > __m128 uv = _mm_hadd_ps(u, v); > __m128 zw = _mm_hadd_ps(z, w); > > __m128 xyzw = _mm_hadd_ps(xy, zw); > __m128 uvzw = _mm_hadd_ps(uv, zw); > > // Prepare the 6 output values (x'y'z'u'v'w') > __m128 out_zuvw = _mm_shuffle_ps(uvzw, uvzw, _MM_SHUFFLE(3, 1, 0, 2)); > __m128 out_xyzu = _mm_movelh_ps(xyzw, out_zuvw); > > _mm_storeu_ps(&src[i].x(), out_xyzu); // write X,Y,Z,U > _mm_storeh_pi((__m64*)&src[i + 1].y(), out_zuvw); // write V,W > } > // tail for loop unrolling > if (i < num) { > __m128 v = _mm_loadl_pi(MM_ONES, (__m64*)&src[i].x()); > v = _mm_insert_ps(v, _mm_load_ss(&src[i].z()), 0x20); > __m128 x = _mm_mul_ps(r0, v); > __m128 y = _mm_mul_ps(r1, v); > __m128 z = _mm_mul_ps(r2, v); > __m128 xy = _mm_hadd_ps(x, y); > __m128 zw = _mm_hadd_ps(z, y); > __m128 out = _mm_hadd_ps(xy, zw); // x,y,z,? > _mm_storel_pi((__m64*)&src[i].x(), out); > _mm_store_ss(&src[i].z(), _mm_movehl_ps(out, out)); > } >} > >using VT = vector<Vector3f, aligned_allocator<Vector3f>>; > >template <int Opt> >void mul1(Matrix<float, 4, 4, Opt> const& __restrict m, Vector3f& __restrict p) { > p = (m * p.homogeneous()).template head<3>(); >} > >template <int Opt> >void mul2(Matrix<float, 4, 4, Opt> const& __restrict m, Vector3f& __restrict p) { > p = (m * p.homogeneous().eval()).template head<3>(); >} > >template <int Opt> >void mul3(Matrix<float, 4, 4, Opt> const& __restrict m, Vector3f& __restrict p) { > Vector4f h = p.homogeneous(); > p = (m * h).template head<3>(); >} > >template <int Opt> >void mul4(Matrix<float, 4, 4, Opt> const& __restrict m, Vector3f& __restrict p) { > Vector4f p4; > p4.head<3>() = p; > p4[3] = 1; > p = (m * p4).template head<3>(); >} > >template <int Opt> >void mulFast(Matrix<float, 4, 4, Opt> const& __restrict m, Vector3f& __restrict p) { > float res[3] = {m(0, 3), m(1, 3), m(2, 3)}; > for (size_t i = 0; i < 3; ++i) { > for (size_t j = 0; j < 3; ++j) { > res[i] += m(i, j) * p[j]; > } > } > for (size_t i = 0; i < 3; ++i) p[i] = res[i]; >} > >using VT = vector<Vector3f, aligned_allocator<Vector3f>>; > >template <int Opt, typename F> >__declspec(noinline) void mulAll(VT& __restrict data, Matrix<float, 4, 4, Opt> const& __restrict m, F mul) { > auto* __restrict d = data.data(); > const size_t sz = data.size(); > for (size_t i = 0; i < sz; ++i) { > mul(m, d[i]); > } >} > >template <int Opt> >__declspec(noinline) void mulAllTr(VT& __restrict data, Transform<float, 3, Affine, Opt> const& __restrict m) { > auto* __restrict d = data.data(); > const size_t sz = data.size(); > for (size_t j = 0; j < sz; ++j) { > d[j] = m * d[j]; > } >} > > >template <typename T1, typename T2> >void test(string cap, T1 init, T2 f) { > > size_t N = 2000000; > size_t R = 10; > > auto m = init(); > VT data(N); > for (size_t i = 0; i < N; ++i) data[i].setRandom(); > > cout << cap << ": "; > > int64_t avg = 0, avgmin = numeric_limits<int64_t>::max(); > for (size_t i = 0; i < R; ++i) { > auto t1 = HRC::now(); > f(data, m); > auto t2 = HRC::now(); > auto dur = duration_cast<microseconds>(t2 - t1).count(); > avg += dur; > avgmin = min(avgmin, dur); > } > avg /= R; > cout << "Avg: " << avg / 1000.0 << ", min: " << avgmin / 1000.0 << endl; >} > > >int main() { > test("MatFloat ColMajor 1", > [=] { return Matrix<float, 4, 4, ColMajor>::Random().eval(); }, > [=](auto& data, auto const& mm) { mulAll(data, mm, [](auto& m, auto& p) { mul1(m, p); }); } > ); > test("MatFloat ColMajor 2", > [=] { return Matrix<float, 4, 4, ColMajor>::Random().eval(); }, > [=](auto& data, auto const& mm) { mulAll(data, mm, [](auto& m, auto& p) { mul2(m, p); }); } > ); > test("MatFloat ColMajor 3", > [=] { return Matrix<float, 4, 4, ColMajor>::Random().eval(); }, > [=](auto& data, auto const& mm) { mulAll(data, mm, [](auto& m, auto& p) { mul3(m, p); }); } > ); > test("MatFloat ColMajor 4", > [=] { return Matrix<float, 4, 4, ColMajor>::Random().eval(); }, > [=](auto& data, auto const& mm) { mulAll(data, mm, [](auto& m, auto& p) { mul4(m, p); }); } > ); > test("MatFloat ColMajor Fast", > [=] { return Matrix<float, 4, 4, ColMajor>::Random().eval(); }, > [=](auto& data, auto const& mm) { mulAll(data, mm, [](auto& m, auto& p) { mulFast(m, p); }); } > ); > > test("MatFloat RowMajor 1", > [=] { return Matrix<float, 4, 4, RowMajor>::Random().eval(); }, > [=](auto& data, auto const& mm) { mulAll(data, mm, [](auto& m, auto& p) { mul1(m, p); }); } > ); > test("MatFloat RowMajor 2", > [=] { return Matrix<float, 4, 4, RowMajor>::Random().eval(); }, > [=](auto& data, auto const& mm) { mulAll(data, mm, [](auto& m, auto& p) { mul2(m, p); }); } > ); > test("MatFloat RowMajor 3", > [=] { return Matrix<float, 4, 4, RowMajor>::Random().eval(); }, > [=](auto& data, auto const& mm) { mulAll(data, mm, [](auto& m, auto& p) { mul3(m, p); }); } > ); > test("MatFloat RowMajor 4", > [=] { return Matrix<float, 4, 4, RowMajor>::Random().eval(); }, > [=](auto& data, auto const& mm) { mulAll(data, mm, [](auto& m, auto& p) { mul4(m, p); }); } > ); > test("MatFloat RowMajor Fast", > [=] { return Matrix<float, 4, 4, RowMajor>::Random().eval(); }, > [=](auto& data, auto const& mm) { mulAll(data, mm, [](auto& m, auto& p) { mulFast(m, p); }); } > ); > > test("MatFloat RowMajor SSE", > [=] { return Matrix<float, 4, 4, RowMajor>::Random().eval(); }, > [=](auto& data, auto const& m) { multhom(m, data.data(), data.size()); } > ); > > test("TrFloat ColMajor", > [=] { > Transform<float, 3, Affine, ColMajor> m{}; > m.matrix().setRandom(); > return m; > }, > [=](auto& data, auto const& m) { mulAllTr(data, m); } > ); > > test("TrFloat RowMajor", > [=] { > Transform<float, 3, Affine, RowMajor> m{}; > m.matrix().setRandom(); > return m; > }, > [=](auto& data, auto const& m) { mulAllTr(data, m); } > ); > > return 0; >}
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Raw
Actions:
View
Attachments on
bug 1294
:
728
| 729