| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 
 | #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
 
 #include <stdio.h>
 
 
 // Forward declaration of the host-side helper defined later in this file.
 // It copies `a` and `b` (each `size` ints) to the device, launches addKernel,
 // and copies the element-wise sums back into `c`. Returns the CUDA status.
 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);
 
 
 /*
  * Element-wise integer addition: each thread writes c[t] = a[t] + b[t],
  * where t is the thread's x index within its block.
  *
  * Launch layout: only threadIdx.x is used (blockIdx is ignored), so the
  * kernel is meant to be launched as a single 1-D block with one thread per
  * element. There is no bounds check: the number of threads launched must not
  * exceed the length of any of the three arrays.
  *
  * c    - output array, written by the kernel
  * a, b - input arrays, read-only
  */
 __global__ void addKernel(int *c, const int *a, const int *b)
 {
     const unsigned int t = threadIdx.x;
     const int lhs = a[t];
     const int rhs = b[t];
     c[t] = lhs + rhs;
 }
 
 /*
  * Entry point: adds two fixed 5-element vectors on the GPU via addWithCuda
  * and prints the result.
  *
  * Returns 0 on success, 1 if the addition or the final device reset fails
  * (a message is written to stderr in either case).
  */
 int main()
 {
     const int arraySize = 5;
     const int a[arraySize] = { 1, 2, 3, 4, 5 };
     const int b[arraySize] = { 10, 20, 30, 40, 50 };
     int c[arraySize] = { 0 };

     // Add the vectors in parallel on the device.
     cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addWithCuda failed!");
         return 1;
     }

     printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
            c[0], c[1], c[2], c[3], c[4]);

     // Release all resources held on the current device before exiting.
     // cudaDeviceReset() is the supported replacement for the deprecated
     // cudaThreadExit().
     cudaStatus = cudaDeviceReset();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceReset failed!");
         return 1;
     }

     return 0;
 }
 
 
 /*
  * Adds two host integer vectors on the GPU: c[i] = a[i] + b[i], 0 <= i < size.
  *
  * Steps: select device 0, allocate three device buffers, copy `a` and `b`
  * to the device, launch addKernel as ONE block of `size` threads, wait for
  * it to finish, and copy the result back into `c`.
  *
  * Parameters:
  *   c    - host output buffer with room for `size` ints
  *   a, b - host input buffers, each holding `size` ints
  *   size - element count. Because the kernel runs in a single block, this
  *          must not exceed the device's maximum threads per block;
  *          otherwise cudaErrorInvalidConfiguration is returned.
  *          size == 0 is a no-op that returns cudaSuccess.
  *
  * Returns cudaSuccess, or the first CUDA error encountered (a message is
  * written to stderr). Device buffers are released on every path.
  */
 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
 {
     // All locals are declared up front so that no `goto Error` jumps over
     // an initialization.
     int *dev_a = 0;
     int *dev_b = 0;
     int *dev_c = 0;
     int maxThreadsPerBlock = 0;
     cudaError_t cudaStatus;

     // Choose which GPU to run on.
     cudaStatus = cudaSetDevice(0);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
         goto Error;
     }

     // Nothing to add: a zero-thread launch is an invalid configuration, so
     // skip the work entirely. cudaStatus is cudaSuccess at this point.
     if (size == 0) {
         goto Error;
     }

     // The kernel indexes by threadIdx.x only, so every element must fit in
     // a single block. Reject sizes the device cannot launch instead of
     // silently truncating size_t into the launch dimension.
     cudaStatus = cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, 0);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceGetAttribute failed!");
         goto Error;
     }
     if (maxThreadsPerBlock < 0 || size > (size_t)maxThreadsPerBlock) {
         fprintf(stderr, "addWithCuda: %llu elements exceed the per-block thread limit of %d\n",
                 (unsigned long long)size, maxThreadsPerBlock);
         cudaStatus = cudaErrorInvalidConfiguration;
         goto Error;
     }

     // Allocate device buffers for the output and the two inputs.
     cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMalloc failed!");
         goto Error;
     }

     // Copy the input vectors from host memory to the device buffers.
     cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

     // Launch one block with one thread per element.
     addKernel<<<1, (unsigned int)size>>>(dev_c, dev_a, dev_b);

     // A kernel launch returns nothing; launch-time failures (for example a
     // bad configuration) are only visible through cudaGetLastError().
     cudaStatus = cudaGetLastError();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
         goto Error;
     }

     // Wait for the kernel to finish; errors raised during execution are
     // reported here. cudaDeviceSynchronize() replaces the deprecated
     // cudaThreadSynchronize().
     cudaStatus = cudaDeviceSynchronize();
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
         goto Error;
     }

     // Copy the result vector from the device back to host memory.
     cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
     if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "cudaMemcpy failed!");
         goto Error;
     }

 Error:
     // Shared exit path for success and failure. Pointers that were never
     // allocated are still null here.
     cudaFree(dev_c);
     cudaFree(dev_a);
     cudaFree(dev_b);

     return cudaStatus;
 }
 
 |