// checkThreadIndex.cu
#include "cuda_runtime.h"
#include <cstdio>
#include <cstdlib>
// CHECK(call): evaluates the CUDA runtime expression `call` exactly once. If the
// result is not cudaSuccess, prints the source file and line, the numeric error
// code and the cudaGetErrorString() text, then terminates the process with exit
// status -10 * code.
//
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` expands to a
// single statement and stays correct inside an unbraced if/else. The argument is
// parenthesized, and the local is given an unusual name so it cannot capture an
// identifier that appears inside `call`.
#define CHECK(call)                                                            \
    do {                                                                       \
        const cudaError_t checkErr_ = (call);                                  \
        if (checkErr_ != cudaSuccess) {                                        \
            printf("Error: %s:%d, ", __FILE__, __LINE__);                      \
            printf("code: %d, reason: %s\n", static_cast<int>(checkErr_),      \
                   cudaGetErrorString(checkErr_));                             \
            exit(-10 * static_cast<int>(checkErr_));                           \
        }                                                                      \
    } while (0)
// Writes the sequence 0, 1, ..., size - 1 into arr, so every element holds its
// own index. Nothing is written when size is zero or negative.
//   arr  - destination buffer with room for at least `size` ints
//   size - number of elements to fill
void initialInt(int *arr, int size){
    int value = 0;
    for (int *cursor = arr; value < size; ++cursor, ++value) {
        *cursor = value;
    }
}
// Prints the ints at C to stdout as a matrix: a header line with the two
// dimensions, then ny lines of nx values each (every value formatted with
// "%3d"), followed by one blank line. Consecutive rows are nx elements apart.
//   C  - first element of the buffer; at least nx * ny ints are read
//   nx - values per printed row
//   ny - number of printed rows
void printMatrix(int *C, const int nx, const int ny){
    printf("\nMatrix: (%d * %d) \n", nx, ny);

    const int *rowStart = C;
    for (int row = 0; row < ny; ++row, rowStart += nx) {
        for (int col = 0; col < nx; ++col) {
            printf("%3d", rowStart[col]);
        }
        printf("\n");
    }

    printf("\n");
}
// Debug kernel: each in-range thread prints its thread index, its block index,
// its 2D coordinate (ix, iy), the flattened index nx * iy + ix, and the value
// stored in A at that index.
//   A  - buffer read on the device; at least nx * ny ints, rows nx elements apart
//   nx - extent of the matrix along x (row length)
//   ny - extent of the matrix along y (row count)
// Launch with a 2D grid of 2D blocks. Threads whose coordinate lies outside
// nx x ny return without printing or reading A, so the grid may be rounded up
// past the matrix size.
__global__ void printThreadIndex(int *A, const int nx, const int ny){
    const unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard the tail of the grid (and degenerate sizes) before touching A.
    if (nx <= 0 || ny <= 0) return;
    if (ix >= static_cast<unsigned int>(nx) || iy >= static_cast<unsigned int>(ny)) return;

    const unsigned int idx = static_cast<unsigned int>(nx) * iy + ix;

    // The built-in index components and the values derived from them are
    // unsigned, so they are printed with %u; only the matrix element is an int.
    printf("thread_id (%u, %u) block_id (%u, %u) coordinate (%u, %u) global index %2u ival %2d\n",
           threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, ix, iy, idx, A[idx]);
}
// Host driver: selects device 0, builds an 8 x 6 int matrix whose elements equal
// their linear index, prints it on the host, copies it to the device and launches
// printThreadIndex with 4 x 2 blocks on a grid rounded up to cover the matrix.
// Every CUDA runtime call goes through CHECK, so a failure stops the program
// with a message. Returns 0 on success.
int main(int argc, char **argv){
    // argv[0] is only guaranteed to be usable when argc > 0.
    printf("%s Starting...\n", argc > 0 ? argv[0] : "checkThreadIndex");

    // get device info
    const int dev = 0;
    cudaDeviceProp deviceProp {};
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set Matrix dimension
    const int nx = 8;
    const int ny = 6;
    const int nxy = nx * ny;
    // The buffers hold int elements, so size them with sizeof(int).
    const auto nBytes = nxy * sizeof(int);

    // malloc host memory
    auto *h_A = static_cast<int *>(malloc(nBytes));
    if (h_A == nullptr) {
        fprintf(stderr, "Error: host allocation of %zu bytes failed\n", nBytes);
        return EXIT_FAILURE;
    }
    initialInt(h_A, nxy);
    printMatrix(h_A, nx, ny);

    // malloc device memory
    int *d_A {nullptr};
    CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), nBytes));

    // transfer data
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // set up execution configuration
    dim3 block_size {4, 2};
    dim3 grid_size {(nx + block_size.x - 1) / block_size.x, (ny + block_size.y - 1) / block_size.y};

    // invoke kernel function
    printThreadIndex<<<grid_size, block_size>>>(d_A, nx, ny);
    // A launch returns no status itself: query the launch error first, then
    // synchronize so that errors raised while the kernel runs are reported too.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    // free memory
    CHECK(cudaFree(d_A));
    free(h_A);

    // reset device
    CHECK(cudaDeviceReset());
    return 0;
}