Eigen-unsupported  3.4.90 (git rev a4098ac676528a83cfb73d4d26ce1b42ec05f47c)
TensorConvolutionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>

//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

#include "./InternalHeaderCheck.h"

namespace Eigen {

enum class convolution_type { CONV1D, CONV2D, CONV3D };
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV1D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
  const size_t kernelSize;
  const cl::sycl::range<2> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernelSize(kernelSize_),
        input_range(input_range_) {}

  template <typename BooleanDim2>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
    return (boolean_check[0] && boolean_check[1]);
  }
  void operator()(cl::sycl::nd_item<2> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // the number of input elements each plane requires in shared memory
    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
    // fill the shared memory
    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
      const size_t local_index = i + plane_kernel_offset;
      const size_t tensor_index =
          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);

      local_acc[local_index] =
          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
              ? device_evaluator.coeff(tensor_index)
              : CoeffReturnType(0);
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // calculate the convolution; first_output_start is the x coordinate of this group's first output
    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
      for (size_t k = 0; k < kernelSize; ++k) {
        result += (local_acc[k + index] * kernel_ptr[k]);
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
      buffer_ptr[tensor_index] = result;
    }
  }
};
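
// Illustrative note (not part of the upstream source): with a local range of
// {64, 1} and kernelSize = 5, each work-group above stages 64 + 5 - 1 = 68
// input elements per plane into local memory, and each of the 64 work-items
// then accumulates a 5-tap dot product from that shared tile. A minimal
// host-side sketch of the same "valid" 1D convolution (assuming plain
// std::vector data with in.size() >= w.size()):
//
//   #include <vector>
//   std::vector<float> conv1d_valid(const std::vector<float> &in, const std::vector<float> &w) {
//     std::vector<float> out(in.size() - w.size() + 1, 0.0f);  // valid convolution: n - k + 1 outputs
//     for (size_t x = 0; x < out.size(); ++x)
//       for (size_t k = 0; k < w.size(); ++k) out[x] += in[x + k] * w[k];
//     return out;
//   }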

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV2D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
  const cl::sycl::range<2> kernel_size;
  const cl::sycl::range<3> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }

  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // the number of input elements each plane requires in shared memory
    const auto num_input = cl::sycl::range<2>{
        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                 itemID.get_group(1) * itemID.get_local_range()[1]};

    // fill the local memory
    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
        const size_t local_index = i + local_input_offset;
        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                             i + input_offset[0], j + input_offset[1]);
        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
                                  in_range_dim1 && in_range_dim2)
                                     ? device_evaluator.coeff(tensor_index)
                                     : CoeffReturnType(0);
      }
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // output offset start for each thread
    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                  itemID.get_group(1) * itemID.get_local_range()[1]};

    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);

      for (size_t j = 0; j < kernel_size[1]; j++) {
        size_t kernel_offset = kernel_size[0] * j;
        const size_t index =
            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
        for (size_t i = 0; i < kernel_size[0]; i++) {
          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
        }
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
                                                             itemID.get_local_id(1) + output_offset[1]);

      buffer_ptr[tensor_index] = result;
    }
  }
};
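
// Illustrative note (not part of the upstream source): the 2D tile staged per
// work-group covers (local_range[0] + kernel_size[0] - 1) *
// (local_range[1] + kernel_size[1] - 1) elements for each plane the group
// handles. For a 16x16 local range and a 3x3 kernel that is 18 * 18 = 324
// elements per plane; the extra kernel_size - 1 rows and columns form the halo
// that lets all 256 work-items read their full 3x3 windows from local memory
// instead of re-fetching overlapping neighbours from global memory.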

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV3D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
  const cl::sycl::range<3> kernel_size;
  const cl::sycl::range<3> input_range;
  const size_t numP;

  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
                         const size_t numP_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_),
        numP(numP_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

    const auto output_offset =
        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

    for (size_t p = 0; p < numP; p++) {
      // fill the shared memory
      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
            const size_t local_index = local_index_dim1 + i;
            const size_t tensor_index =
                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
          }
        }
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);

      // calculate the convolution

      if (boundary_check(itemID.get_global_id() < input_range)) {
        CoeffReturnType result = static_cast<CoeffReturnType>(0);
        for (size_t k = 0; k < kernel_size[2]; k++) {
          for (size_t j = 0; j < kernel_size[1]; j++) {
            for (size_t i = 0; i < kernel_size[0]; i++) {
              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
              const size_t local_index =
                  ((i + itemID.get_local_id(0)) +
                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));

              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
            }
          }
        }
        const size_t tensor_index =
            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
        buffer_ptr[tensor_index] = result;
      }

      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
  }
};
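
// Illustrative note (not part of the upstream source): unlike the 1D/2D
// kernels, which map the plane dimension onto the nd-range, this 3D kernel
// walks the numP planes sequentially inside each work-item. The first barrier
// makes the freshly staged tile visible to the whole group before anyone
// reads it; the second keeps a fast work-item from overwriting the tile for
// plane p + 1 while slower work-items are still convolving plane p.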

template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
  typedef const Eigen::SyclDevice Device;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;

  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//
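
  // Illustrative usage (a sketch, not part of the upstream source): this
  // evaluator runs when a convolution expression is assigned on a SYCL device.
  // Assuming sycl_device is an Eigen::SyclDevice and d_in/d_ker/d_out are
  // hypothetical device allocations of matching sizes:
  //
  //   Eigen::array<Eigen::Index, 2> dims{{0, 1}};  // convolve dims 0 and 1
  //   Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in(d_in, 64, 64, 32);
  //   Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_ker(d_ker, 3, 3);
  //   Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(d_out, 62, 62, 32);
  //   gpu_out.device(sycl_device) = gpu_in.convolve(gpu_ker, dims);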

  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
        m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

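  // Illustrative note (not part of the upstream source): the constructor above
  // sizes the result as a "valid" convolution, result_dim = input_dim -
  // kernel_dim + 1 along each convolved dimension; e.g. a {64, 64, 32} input
  // convolved over dims {0, 1} with a {3, 3} kernel yields {62, 62, 32}.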
  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (EvaluatorPointerType)m_device.get(
          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
      executeEval(m_buf);
      return true;
    }
  }
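  // Illustrative note (not part of the upstream source): the return value
  // follows the TensorEvaluator contract; false means the result was written
  // straight into the caller-provided buffer, true means a temporary m_buf
  // was allocated and the result should be read back via coeff()/packet().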

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate_temp(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }
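  // Illustrative note (not part of the upstream source): a kernel backed by
  // concrete device storage (e.g. a TensorMap) reports a non-null
  // m_kernelImpl.data() and is used in place; an expression such as the
  // hypothetical gpu_ker * 2.0f has no storage, so it is first materialized
  // into a temporary buffer via TensorEvalToOp before the launch.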

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{};
        auto local_range = cl::sycl::range<2>{};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;

        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
        break;
      }

      case 2: {
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);

        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
        break;
      }

      case 3: {
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};

        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};

        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);

        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};

        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];

        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }
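  // Illustrative note (not part of the upstream source): every case derives
  // the global/local nd-ranges via m_device.parallel_for_setup and then bounds
  // the staged tile by m_device.sharedMemPerBlock(). For case 2 with a
  // {16, 16, 1} local range and a 3x3 kernel, local_memory_size =
  // (16 + 2) * (16 + 2) * 1 = 324 elements. The kernel_index shuffle reverses
  // the kernel dimension order for RowMajor layouts so that index 0 always
  // names the fastest-varying convolved dimension.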

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_kernelImpl.bind(cgh);
    m_inputImpl.bind(cgh);
    m_buf.bind(cgh);
    m_kernel.bind(cgh);
  }
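  // Illustrative note (not part of the upstream source): bind() re-attaches
  // the placeholder accessors held by the nested evaluators and by m_buf /
  // m_kernel to the command group about to be submitted, which is why the
  // launcher invokes it from inside the queue's command group scope.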

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator &operator=(const TensorEvaluator &);
  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
  KernelArgType m_kernelArg;
  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
  Indices m_indices;
  Dimensions m_dimensions;
  EvaluatorPointerType m_buf;
  typename KernelStorage::Type m_kernel;
  bool m_local_kernel;
  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H