47 #endif // HAVE_VIENNACL
55 namespace implementation
/**
 * Primary template, parameterised on a Backend enumerator and a Matrix type.
 * Only a declaration of the static compute() method is visible here; the
 * specializations further down supply the bodies.
 *
 * NOTE(review): the line carrying the struct name is absent from this listing
 * (presumably `struct convolve`, going by the specialization below) - confirm
 * against the original header.
 */
61 template <enum Backend,
class Matrix>
// Element type, taken from the Matrix type's nested Scalar typedef.
65 typedef typename Matrix::Scalar
T;
// X, W and Y are passed by value. In the specializations below X is the
// input, W the filter and Y the output; flip selects the order in which W is
// traversed, overwrite selects replacing vs. accumulating into Y, and
// stride_x / stride_y are the step sizes along each axis.
83 static void compute(Matrix X, Matrix W, Matrix Y,
bool flip ,
84 bool overwrite, int32_t stride_x, int32_t stride_y);
/**
 * CPU specialization of the 2D convolution (presumably the Eigen3 backend,
 * judging by the HAVE_EIGEN3 guard that closes right after it).
 *
 * NOTE(review): this listing omits the struct header, the first line of the
 * compute() signature, the definitions of width, height, kx and ky, every
 * brace, the selector between the two accumulation statements, and the final
 * store to Y. The comments below only describe what is visible.
 */
90 template <>
template <
class Matrix>
// Element type, taken from the Matrix type's nested Scalar typedef.
93 typedef typename Matrix::Scalar
T;
111 bool overwrite, int32_t stride_x, int32_t stride_y)
// Filter half-extents along each axis (integer division).
119 int32_t rx = (kx-1)/2;
120 int32_t ry = (ky-1)/2;
// Visit every stride_x-th input column ...
122 for (int32_t x=0; x<width; x+=stride_x)
// ... and map it to its output column index.
124 int32_t xout = x/stride_x;
// Same for rows, stepping by stride_y.
126 for (int32_t y=0; y<height; y+=stride_y)
128 int32_t yout = y/stride_y;
// Start from zero when overwriting, otherwise from the value already in Y.
130 T sum = overwrite ? 0 : Y(yout,xout);
// Walk the window of input columns centred on x.
131 for (int32_t x1=x-rx; x1<=x+rx; x1++)
// Column of W paired with input column x1; the two forms index W in
// opposite directions.
133 int32_t wx = flip ? x1-x+rx : rx-x1+x;
// Walk the window of input rows centred on y.
134 for (int32_t y1=y-ry; y1<=y+ry; y1++)
// Taps that fall outside the input are skipped, i.e. contribute nothing.
136 if (x1>=0 && y1>=0 && x1<width && y1<height)
// Two alternative accumulations that index the rows of W in opposite
// directions. The condition choosing between them is not visible here -
// presumably `flip`, matching the choice made for wx above; confirm.
139 sum += W(y1-y+ry,wx)*X(y1,x1);
141 sum += W(ry-y1+y,wx)*X(y1,x1);
150 #endif // HAVE_EIGEN3
/**
 * Specialization of convolve for Backend::VIENNACL. Its members build OpenCL
 * kernels on demand and enqueue them through ViennaCL.
 */
155 template <>
template <
class Matrix>
156 struct convolve<Backend::VIENNACL, Matrix>
// Element type, taken from the Matrix type's nested Scalar typedef.
158 typedef typename Matrix::Scalar
T;
/**
 * Looks up, or builds and compiles, the OpenCL kernel used when both strides
 * are 1.
 *
 * @param radius_x emitted as RADIUS_X; the kernel treats the filter as
 *        2*RADIUS_X+1 columns wide
 * @param radius_y emitted as RADIUS_Y; the kernel treats the filter as
 *        2*RADIUS_Y+1 rows high
 * @param flip when true, `#define FLIP` is added to the kernel source
 * @param overwrite when true, `#define OVERWRITE` is added to the kernel source
 *
 * If a kernel with the generated name already exists it is returned directly.
 * Otherwise the source is assembled, compiled, and given a local work size of
 * OCL_WORK_GROUP_SIZE_2D in both dimensions.
 *
 * What the visible kernel text does:
 *  - readX() reads X[y + x*X_height + X_offset] when (x, y) lies inside the
 *    input. What it yields otherwise is not visible in this listing.
 *  - Each work-item stores one element of X into the __local array X_local,
 *    shifted by (-RADIUS_X, -RADIUS_Y). Work-items whose local id is
 *    WORK_GROUP_SIZE_2D-1 in x and/or y loop over 0..2*RADIUS_X and/or
 *    0..2*RADIUS_Y to fill the extra border entries.
 *  - After a local-memory barrier, the filter is applied from X_local and the
 *    result is written to Y[y + X_height*x + Y_offset], either assigned or
 *    added.
 *
 * NOTE(review): this listing omits the braces, the wrapper that puts the
 * kernel text into `source`, the preprocessor conditionals that pick between
 * the paired wx / sum / Y statements (presumably FLIP and OVERWRITE), the
 * definitions of sum, inx and iny, the statement guarded by the bounds check
 * after the barrier, and the final return. The calls in compute() pass an
 * explicit <T>, so a template header is presumably missing above the
 * signature as well. Confirm all of this against the original header. No
 * comments are inserted inside the kernel text because it is presumably part
 * of a runtime string.
 */
162 static viennacl::ocl::kernel& generate_kernel_unity_stride(
163 int32_t radius_x, int32_t radius_y,
bool flip,
bool overwrite)
// The name encodes the element type, both radii and both flags, so each
// distinct combination gets its own kernel.
165 std::string kernel_name =
166 "convolve_unity_stride_" + ocl::get_type_string<T>() +
"_" +
167 std::to_string(radius_x) +
"_" + std::to_string(radius_y);
169 if (flip) kernel_name.append(
"_flip");
170 if (overwrite) kernel_name.append(
"_overwrite");
// Reuse a kernel registered under this name instead of recompiling.
172 if (ocl::kernel_exists(kernel_name))
173 return ocl::get_kernel(kernel_name);
// Start from the preamble generated for this element type and kernel name.
175 std::string source = ocl::generate_kernel_preamble<T>(kernel_name);
// The flags and radii are handed to the kernel as preprocessor definitions.
177 if (flip) source.append(
"#define FLIP\n");
178 if (overwrite) source.append(
"#define OVERWRITE\n");
180 source.append(
"#define RADIUS_X " + std::to_string(radius_x) +
"\n");
181 source.append(
"#define RADIUS_Y " + std::to_string(radius_y) +
"\n");
185 #define W_WIDTH (2*RADIUS_X+1)
186 #define W_HEIGHT (2*RADIUS_Y+1)
188 #define X_LOCAL_WIDTH (WORK_GROUP_SIZE_2D+2*RADIUS_X)
189 #define X_LOCAL_HEIGHT (WORK_GROUP_SIZE_2D+2*RADIUS_Y)
191 inline DATATYPE readX(read_only __global DATATYPE* X, int x, int y,
192 int X_width, int X_height, int X_offset)
194 if (x>=0 && y>=0 && x<X_width && y<X_height)
195 return X[y + x*X_height + X_offset];
200 __kernel void KERNEL_NAME(
201 read_only __global DATATYPE* X, int X_width, int X_height, int X_offset,
202 __constant DATATYPE* W, int W_offset,
203 __global DATATYPE* Y, int Y_offset)
205 __local DATATYPE X_local[X_LOCAL_WIDTH][X_LOCAL_HEIGHT];
207 int x = get_global_id(0);
208 int y = get_global_id(1);
210 int xl = get_local_id(0);
211 int yl = get_local_id(1);
213 if (xl==WORK_GROUP_SIZE_2D-1 && yl == WORK_GROUP_SIZE_2D-1)
215 for (int rx=0; rx<=2*RADIUS_X; rx++)
216 for (int ry=0; ry<=2*RADIUS_Y; ry++)
217 X_local[xl+rx][yl+ry] = readX(X, x-RADIUS_X+rx, y-RADIUS_Y+ry, X_width, X_height, X_offset);
219 else if (xl==WORK_GROUP_SIZE_2D-1)
221 for (int rx=0; rx<=2*RADIUS_X; rx++)
222 X_local[xl+rx][yl] = readX(X, x-RADIUS_X+rx, y-RADIUS_Y, X_width, X_height, X_offset);
224 else if (yl == WORK_GROUP_SIZE_2D-1)
226 for (int ry=0; ry<=2*RADIUS_Y; ry++)
227 X_local[xl][yl+ry] = readX(X, x-RADIUS_X, y-RADIUS_Y+ry, X_width, X_height, X_offset);
230 X_local[xl][yl] = readX(X, x-RADIUS_X, y-RADIUS_Y, X_width, X_height, X_offset);
232 barrier(CLK_LOCAL_MEM_FENCE);
234 if (x>=X_width || y>=X_height)
238 for (int x1=0; x1<W_WIDTH; x1++)
241 int wx = x1*W_HEIGHT+W_offset;
243 int wx = (2*RADIUS_X-x1)*W_HEIGHT+W_offset;
246 for (int y1=0; y1<W_HEIGHT; y1++)
250 sum += W[y1+wx]*X_local[inx][iny];
252 sum += W[2*RADIUS_Y-y1+wx]*X_local[inx][iny];
257 Y[y+X_height*x + Y_offset] = sum;
259 Y[y+X_height*x + Y_offset] += sum;
// Compile under the generated name and fix the work-group shape in both
// dimensions.
265 viennacl::ocl::kernel& kernel = ocl::compile_kernel(kernel_name, source);
267 kernel.local_work_size(0, OCL_WORK_GROUP_SIZE_2D);
268 kernel.local_work_size(1, OCL_WORK_GROUP_SIZE_2D);
/**
 * Looks up, or builds and compiles, the OpenCL kernel that takes stride_x and
 * stride_y as kernel arguments.
 *
 * @param radius_x emitted as RADIUS_X; the kernel treats the filter as
 *        2*RADIUS_X+1 columns wide
 * @param radius_y emitted as RADIUS_Y; the kernel treats the filter as
 *        2*RADIUS_Y+1 rows high
 * @param flip when true, `#define FLIP` is added to the kernel source
 * @param overwrite when true, `#define OVERWRITE` is added to the kernel source
 *
 * If a kernel with the generated name already exists it is returned directly.
 * Otherwise the source is assembled, compiled, and given a local work size of
 * OCL_WORK_GROUP_SIZE_2D in both dimensions.
 *
 * What the visible kernel text does:
 *  - Each work-item handles one output element. Its input position is its
 *    global id multiplied by the stride.
 *  - Y_width and Y_height are X_width/stride_x and X_height/stride_y using
 *    integer division. Work-items at or beyond those extents hit the guard
 *    (the guarded statement is not visible).
 *  - Filter taps are read directly from X at iny + inx*X_height + X_offset,
 *    and only when (inx, iny) lies inside the input.
 *  - The result goes to Y[id1 + Y_height*id0 + Y_offset], either assigned or
 *    added.
 *
 * NOTE(review): X_local is declared in the kernel but nothing visible here
 * reads or writes it - check whether the declaration is needed.
 * NOTE(review): the integer division for Y_width / Y_height truncates, whereas
 * the CPU loop `for (x=0; x<width; x+=stride_x)` visits the rounded-up count.
 * Check the two agree when a dimension is not a multiple of its stride.
 * NOTE(review): this listing omits the braces, the wrapper that puts the
 * kernel text into `source`, the preprocessor conditionals that pick between
 * the paired wx / sum / Y statements (presumably FLIP and OVERWRITE), the
 * definition of sum, the statement guarded by the extent check, and the final
 * return. The calls in compute() pass an explicit <T>, so a template header
 * is presumably missing above the signature as well. No comments are inserted
 * inside the kernel text because it is presumably part of a runtime string.
 */
275 static viennacl::ocl::kernel& generate_kernel_arbitrary_stride(
276 int32_t radius_x, int32_t radius_y,
bool flip,
bool overwrite)
// The name encodes the element type, both radii and both flags, so each
// distinct combination gets its own kernel.
278 std::string kernel_name =
279 "convolve_arbitrary_stride_" + ocl::get_type_string<T>() +
"_" +
280 std::to_string(radius_x) +
"_" + std::to_string(radius_y);
282 if (flip) kernel_name.append(
"_flip");
283 if (overwrite) kernel_name.append(
"_overwrite");
// Reuse a kernel registered under this name instead of recompiling.
285 if (ocl::kernel_exists(kernel_name))
286 return ocl::get_kernel(kernel_name);
// Start from the preamble generated for this element type and kernel name.
288 std::string source = ocl::generate_kernel_preamble<T>(kernel_name);
// The flags and radii are handed to the kernel as preprocessor definitions.
290 if (flip) source.append(
"#define FLIP\n");
291 if (overwrite) source.append(
"#define OVERWRITE\n");
293 source.append(
"#define RADIUS_X " + std::to_string(radius_x) +
"\n");
294 source.append(
"#define RADIUS_Y " + std::to_string(radius_y) +
"\n");
298 #define W_WIDTH (2*RADIUS_X+1)
299 #define W_HEIGHT (2*RADIUS_Y+1)
301 #define X_LOCAL_WIDTH (WORK_GROUP_SIZE_2D+2*RADIUS_X)
302 #define X_LOCAL_HEIGHT (WORK_GROUP_SIZE_2D+2*RADIUS_Y)
304 __kernel void KERNEL_NAME(
305 read_only __global DATATYPE* X, int X_width, int X_height, int X_offset,
306 __constant DATATYPE* W, int W_offset,
307 __global DATATYPE* Y, int Y_offset,
308 int stride_x, int stride_y)
310 __local DATATYPE X_local[WORK_GROUP_SIZE_2D][WORK_GROUP_SIZE_2D];
312 int x = get_global_id(0)*stride_x;
313 int y = get_global_id(1)*stride_y;
315 int Y_width = X_width/stride_x;
316 int Y_height = X_height/stride_y;
318 if (get_global_id(0)>=Y_width || get_global_id(1)>=Y_height)
322 for (int x1=0; x1<W_WIDTH; x1++)
325 int wx = x1*W_HEIGHT+W_offset;
327 int wx = (2*RADIUS_X-x1)*W_HEIGHT+W_offset;
329 int inx = x1+x-RADIUS_X;
330 for (int y1=0; y1<W_HEIGHT; y1++)
332 int iny = y1+y-RADIUS_Y;
333 if (inx>=0 && iny>=0 && inx<X_width && iny<X_height)
336 sum += W[y1+wx]*X[iny+inx*X_height+X_offset];
338 sum += W[2*RADIUS_Y-y1+wx]*X[iny+inx*X_height+X_offset];
344 Y[get_global_id(1)+Y_height*get_global_id(0) + Y_offset] = sum;
346 Y[get_global_id(1)+Y_height*get_global_id(0) + Y_offset] += sum;
// Compile under the generated name and fix the work-group shape in both
// dimensions.
352 viennacl::ocl::kernel& kernel = ocl::compile_kernel(kernel_name, source);
354 kernel.local_work_size(0, OCL_WORK_GROUP_SIZE_2D);
355 kernel.local_work_size(1, OCL_WORK_GROUP_SIZE_2D);
/**
 * Runs the convolution on the GPU by enqueuing one of the two generated
 * kernels.
 *
 * @param X input matrix; its column count, row count and offset are passed to
 *        the kernel
 * @param W filter matrix; the kernel radii are (num_cols-1)/2 and
 *        (num_rows-1)/2 using integer division
 * @param Y output matrix; its dimensions determine the global work size
 * @param flip forwarded to the kernel generator
 * @param overwrite forwarded to the kernel generator
 * @param stride_x step along columns; passed as a kernel argument only on the
 *        arbitrary-stride path
 * @param stride_y step along rows; passed as a kernel argument only on the
 *        arbitrary-stride path
 *
 * NOTE(review): the braces and the `else` separating the two paths are not
 * visible in this listing. Both generator calls pass an explicit <T>, while
 * the generator signatures as listed show no template header - confirm
 * against the original header.
 */
376 static void compute(CGPUMatrix<T> X, CGPUMatrix<T> W, CGPUMatrix<T> Y,
bool flip ,
377 bool overwrite, int32_t stride_x, int32_t stride_y)
// Both strides equal to 1: use the unity-stride kernel, which takes no
// stride arguments.
379 if (stride_x==1 && stride_y==1)
381 viennacl::ocl::kernel& kernel = generate_kernel_unity_stride<T>(
382 (W.num_cols-1)/2, (W.num_rows-1)/2, flip, overwrite);
// Global size is derived from the dimensions of Y via
// ocl::align_to_multiple_2d.
384 kernel.global_work_size(0, ocl::align_to_multiple_2d(Y.num_cols));
385 kernel.global_work_size(1, ocl::align_to_multiple_2d(Y.num_rows));
// Argument order follows the kernel signature: X with its width, height and
// offset, then W with its offset, then Y with its offset.
387 viennacl::ocl::enqueue(kernel(
388 X.vcl_matrix(), cl_int(X.num_cols), cl_int(X.num_rows), cl_int(X.offset),
389 W.vcl_matrix(), cl_int(W.offset),
390 Y.vcl_matrix(), cl_int(Y.offset)));
// Arbitrary-stride path (presumably the else branch): same setup, with the
// two strides appended to the kernel arguments.
394 viennacl::ocl::kernel& kernel = generate_kernel_arbitrary_stride<T>(
395 (W.num_cols-1)/2, (W.num_rows-1)/2, flip, overwrite);
397 kernel.global_work_size(0, ocl::align_to_multiple_2d(Y.num_cols));
398 kernel.global_work_size(1, ocl::align_to_multiple_2d(Y.num_rows));
400 viennacl::ocl::enqueue(kernel(
401 X.vcl_matrix(), cl_int(X.num_cols), cl_int(X.num_rows), cl_int(X.offset),
402 W.vcl_matrix(), cl_int(W.offset),
403 Y.vcl_matrix(), cl_int(Y.offset),
404 cl_int(stride_x), cl_int(stride_y)));
409 #endif // HAVE_VIENNACL
416 #endif // CONVOLVE_H_
Eigen::Matrix< T, Eigen::Dynamic, Eigen::Dynamic > MatrixXt
static void compute(SGMatrix< T > X, SGMatrix< T > W, SGMatrix< T > Y, bool flip, bool overwrite, int32_t stride_x, int32_t stride_y)
Generic class sum which provides a static compute method. This class is specialized for different typ...
Eigen::Matrix< T, Eigen::Dynamic, 1 > VectorXt
static void compute(Matrix X, Matrix W, Matrix Y, bool flip, bool overwrite, int32_t stride_x, int32_t stride_y)