Bugzilla – Attachment 907 Details for Bug 1642: Implement faster GEMM kernel for AVX512
This Bugzilla service is closed. All entries have been migrated to https://gitlab.com/libeigen/eigen
Attachment 907: bench_matrix_vs_tensor.cpp (text/x-c++src), 7.78 KB, created by william.tambellini on 2018-12-10 20:45:44 UTC
// Authors: William Tambellini <william.tambellini@gmail.com> with the help of Christoph Hertzberg

// Little bench to compare the speed of Eigen Matrix vs Tensor

#include <cmath>    // fabs
#include <cstdlib>  // EXIT_SUCCESS
#include <cstring>  // memcpy
#include <string>   // std::stoi
#include <iostream>
#include <iomanip>
#include <bench/BenchTimer.h>
#include <unsupported/Eigen/CXX11/Tensor>

using namespace Eigen;
using namespace std;

typedef Matrix<float, 1, Eigen::Dynamic, Eigen::RowMajor> EigenRowVector;
typedef Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor> EigenMatrix;
typedef Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor> EigenArray;

// colwise log softmax
template<typename T>
void EMatrixLogSoftmax(const EigenMatrix& input, EigenMatrix& output) {
  // using arrays allows calling the native vectorizable exp and log
  EigenArray wMinusMax = input.rowwise() - input.colwise().maxCoeff();
  output = wMinusMax.rowwise() - wMinusMax.exp().colwise().sum().log();
}

static Eigen::IndexList<Eigen::type2index<0>> alongClass; // 1st dimension
Eigen::IndexList<int, Eigen::type2index<1>> bcast;
template<typename T>
void ETensorLogSoftmax(const Tensor<T, 2>& input, Tensor<T, 2>& output) {
  bcast.set(0, input.dimension(0));
  Eigen::IndexList<Eigen::type2index<1>, Eigen::Index> dims2d;
  dims2d.set(1, input.dimension(1));
  // creating a real tensor is faster than auto, which would resolve to some TensorExpr
  Eigen::Tensor<T,2> wMinusMax = input - input.maximum(alongClass).eval().reshape(dims2d).broadcast(bcast);
  output = wMinusMax - wMinusMax.exp().sum(alongClass).log().eval().reshape(dims2d).broadcast(bcast);
}

void EMatrixTranspose(const EigenMatrix& input, EigenMatrix& output) {
  output = input.transpose();
}

const Eigen::array<ptrdiff_t, 2> shuffles({1,0});
template<typename T>
void ETensorTranspose(const Tensor<T, 2>& i, Tensor<T,2>& o) {
  o = i.shuffle(shuffles);
}
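// For reference, both log-softmax helpers above evaluate, column by column,
// the numerically stable identity
//   logsoftmax(x)_r = (x_r - m) - log( sum_k exp(x_k - m) ),  with m = max_k x_k,
// subtracting the column maximum first so that exp() cannot overflow.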
// Compare all elements of the matrix against the tensor.
// Returns false if the difference between any pair of elements is too high.
bool compare(const EigenMatrix& m, const Eigen::Tensor<float, 2>& t) {
  // Sanity check
  if (m.rows() != t.dimension(0) || m.cols() != t.dimension(1)) {
    std::cerr << "inputs must have the same sizes: " << m.rows() << "x" << m.cols()
              << " vs " << t.dimension(0) << "x" << t.dimension(1) << std::endl;
    return false;
  }
  for (Eigen::Index r = 0; r < m.rows(); ++r)
    for (Eigen::Index c = 0; c < m.cols(); ++c)
      // some diffs appear on an x86-64 CPU at lower precision
      if (fabs(m(r,c) - t.coeff({r,c})) > 0.00001) {
        std::cerr << "Diff between Matrix and Tensor at " << r << "," << c << " : "
                  << m(r,c) << " vs " << t.coeff({r,c}) << std::endl;
        return false;
      }
  return true;
}

const unsigned coutw = 16;
#define COUT(M) std::cout << std::setw(coutw) << M;

void transpose(const unsigned repeat = 100) {
  BenchTimer t;
  std::cout << "\nTranspose:\n";
  std::cout << "Repeat: " << repeat << std::endl;
  COUT("NRows"); COUT("NCols"); COUT("EMatrix"); COUT("ETensor"); COUT(std::endl);

  for (unsigned s = 64; s <= 2048; s = s*2) {
    COUT(s); COUT(s);

    EigenMatrix inputMatrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>::Random(s, s);
    EigenMatrix outputMatrix(s, s);
    t.start();
    for (unsigned r = 0; r < repeat; ++r) {
      EMatrixTranspose(inputMatrix, outputMatrix);
    }
    t.stop();
    COUT(t.value());

    Tensor<float, 2> inputTensor = Eigen::Tensor<float, 2>(Eigen::array<Index,2>({s, s}));
    memcpy(inputTensor.data(), inputMatrix.data(), sizeof(float) * s * s);
    Tensor<float, 2> outputTensor = Eigen::Tensor<float, 2>(Eigen::array<Index,2>({s, s}));

    t.start();
    for (unsigned r = 0; r < repeat; ++r) {
      ETensorTranspose(inputTensor, outputTensor);
    }
    t.stop();
    COUT(t.value() << "\n");
    if (!compare(outputMatrix, outputTensor))
      break;
  }
}

void logSoftmax(const unsigned repeat = 200) {
  BenchTimer t;
  std::cout << "\nLogSoftmax:\n";
  std::cout << "Repeat: " << repeat << std::endl;
  COUT("NRows"); COUT("NCols"); COUT("EMatrix"); COUT("ETensor"); COUT(std::endl);
  for (unsigned nc = 1; nc < 3; ++nc)
    for (unsigned s = 2048; s < 50024; s = s*2) {
      COUT(s); COUT(nc);

      // EigenMatrix
      EigenMatrix inputMatrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>::Random(s, nc);
      EigenMatrix outputMatrix(s, nc);
      t.start();
      for (unsigned r = 0; r < repeat; ++r)
        EMatrixLogSoftmax<float>(inputMatrix, outputMatrix);
      t.stop();
      COUT(t.value());

      // EigenTensor
      Tensor<float, 2> inputTensor = Eigen::Tensor<float, 2>(Eigen::array<Index,2>({s, nc}));
      memcpy(inputTensor.data(), inputMatrix.data(), sizeof(float) * s * nc);
      Tensor<float, 2> outputTensor = Eigen::Tensor<float, 2>(Eigen::array<Index,2>({s, nc}));
      t.start();
      for (unsigned r = 0; r < repeat; ++r)
        ETensorLogSoftmax<float>(inputTensor, outputTensor);
      t.stop();
      COUT(t.value() << "\n");

      if (!compare(outputMatrix, outputTensor))
        break;
    }
}

void maximum(const unsigned repeat = 1000) {
  BenchTimer t;
  std::cout << "\nMaximum:\n";
  std::cout << "Repeat: " << repeat << std::endl;

  COUT("NRows"); COUT("NCols"); COUT("EMatrix"); COUT("ETensor"); COUT(std::endl);

  for (unsigned nc = 1; nc < 4; ++nc)
    for (unsigned nr = 1024; nr < 50000; nr = nr*2) {
      COUT(nr); COUT(nc);

      EigenMatrix inputMatrix = Matrix<float, Eigen::Dynamic, Eigen::Dynamic>::Random(nr, nc);
      EigenMatrix outputMatrix(nr, nc);
      t.start();
      for (unsigned r = 0; r < repeat; ++r)
        outputMatrix = inputMatrix.colwise().maxCoeff();
      t.stop();
      COUT(t.value());

      Tensor<float, 2> inputTensor =
          Tensor<float, 2>(Eigen::array<Index,2>({nr, nc}));
      memcpy(inputTensor.data(), inputMatrix.data(), sizeof(float) * nr * nc);
      Tensor<float, 2> outputTensor = Eigen::Tensor<float, 2>(Eigen::array<Index,2>({nr, nc}));
      t.start();
      for (unsigned r = 0; r < repeat; ++r)
        outputTensor = inputTensor.maximum(Eigen::array<int,1>({0}));
      t.stop();
      COUT(t.value() << std::endl);

      if (!compare(outputMatrix, outputTensor.shuffle(Eigen::array<ptrdiff_t, 2>({1,0}))))
        break;
    }
}

void matmul(const unsigned repeat = 10) {
  BenchTimer t;
  std::cout << "\nMatmul: M=N=K\n";
  std::cout << "Repeat: " << repeat << std::endl;
  COUT("MNK"); COUT("EMatrix"); COUT("ETensor"); COUT(std::endl);

  for (unsigned s = 256; s <= 2048; s*=2) {
    COUT(s);

    EigenMatrix inputMatrix = Matrix<float, Eigen::Dynamic, Eigen::Dynamic>::Random(s, s);
    EigenMatrix outputMatrix(s, s);
    t.start();
    for (unsigned r = 0; r < repeat; ++r)
      outputMatrix = inputMatrix * inputMatrix;
    t.stop();
    COUT(t.value());

    Tensor<float, 2> inputTensor = Tensor<float, 2>(Eigen::array<Index,2>({s, s}));
    memcpy(inputTensor.data(), inputMatrix.data(), sizeof(float) * s * s);
    Tensor<float, 2> outputTensor = Eigen::Tensor<float, 2>(Eigen::array<Index,2>({s, s}));
    // contract over the inner dimension: equivalent to the matrix product above
    const Eigen::array<Eigen::IndexPair<int>, 1> pd({{Eigen::IndexPair<int>(1, 0)}});
    t.start();
    for (unsigned r = 0; r < repeat; ++r)
      outputTensor = inputTensor.contract(inputTensor, pd);
    t.stop();
    COUT(t.value() << std::endl);

    if (!compare(outputMatrix, outputTensor))
      break;
  }
}

int main(int argc, char* argv[])
{
  std::cout << "Bench Eigen Matrix vs Tensor" << std::endl;
  std::cout << "Usage: program numberOfEigenThreads (defaults to 1)" << std::endl;
  std::cout << "GCC: " << __VERSION__ << std::endl;
  std::cout << "Eigen version: " << EIGEN_WORLD_VERSION << "." << EIGEN_MAJOR_VERSION << "." << EIGEN_MINOR_VERSION << std::endl;
  std::cout << "Simd: " << Eigen::SimdInstructionSetsInUse() << std::endl;
  Eigen::setNbThreads(argc > 1 ? std::stoi(argv[1]) : 1);
  std::cout << "Eigen::nbThreads: " << Eigen::nbThreads() << std::endl;
#if defined(EIGEN_NO_DEBUG)
  std::cout << "EIGEN_NO_DEBUG" << std::endl;
#endif
#if defined(EIGEN_VECTORIZE)
  std::cout << "EIGEN_VECTORIZE\n";
#endif

#if defined(EIGEN_HAS_OPENMP)
  std::cout << "EIGEN_HAS_OPENMP: " << _OPENMP << std::endl;
  std::cout << "omp_get_num_threads: " << omp_get_num_threads() << std::endl;
#endif

  matmul();

  maximum();

  transpose();

  logSoftmax();

  return EXIT_SUCCESS;
}
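A typical way to build and run the bench, assuming a checkout of the Eigen source tree at /path/to/eigen (a placeholder; the tree is needed for bench/BenchTimer.h and the unsupported Tensor module), might be:

  g++ -std=c++11 -O3 -march=native -DEIGEN_NO_DEBUG -I /path/to/eigen bench_matrix_vs_tensor.cpp -o bench_matrix_vs_tensor
  ./bench_matrix_vs_tensor 1

Adding -fopenmp lets the Matrix-side products run multi-threaded; the optional program argument is passed to Eigen::setNbThreads.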