#include <stdio.h>
#include <stdlib.h>
/**
* @brief This function sums two array elements and stores the result into a
* third array.
* @details This function has the __global__ function identifier, meaning that
* it will be launched from the host and executed on the device. Each CUDA
* thread calculates its index and operates on the corresponding index in the
* arrays given. It can be visualised as follows:
*
* my_index
* |
* (my_index - 1) --+ | +-- (my_index + 1)
* | | |
* V V V
* _ _ _
* array a: ... |_|_|_| ...
* + _ _ _
* array b: ... |_|_|_| ...
* = _ _ _
* array c: ... |_|_|_| ...
*
**/
__global__ void sum(int* a, int* b, int* c, int size)
{
int my_index = blockIdx.x * blockDim.x + threadIdx.x;
if(my_index < size)
{
c[my_index] = a[my_index] + b[my_index];
}
}
/**
* @brief Illustrates how to declare a __global__ function.
* @details This application consists of 3 arrays: a, b, c. It is a classic
* "c[i] = a[i] + b[i]" computation. Arrays 'a' and 'b' contain the input data
* and will be passed from the host to the device. The sum function is then
* executed on the device, thanks to the __global__ function specifier. The
* array 'c', which contains the output,will be taken back from the device to
* the host once the computation is complete.
**/
int main(int argc, char* argv[])
{
const int ARRAY_SIZE = 10;
const int ARRAY_SIZE_BYTES = ARRAY_SIZE * sizeof(int);
// Declare pointers that will point to the memory allocated on the host.
int a_host[ARRAY_SIZE];
int b_host[ARRAY_SIZE];
int c_host[ARRAY_SIZE];
for(int i = 0; i < ARRAY_SIZE; i++)
{
a_host[i] = i;
b_host[i] = ARRAY_SIZE + i;
c_host[i] = 0;
}
// Declare pointers that will point to the memory allocated on the device.
int* a_device;
int* b_device;
int* c_device;
// Allocate memory on the device
cudaMalloc(&a_device, ARRAY_SIZE_BYTES);
cudaMalloc(&b_device, ARRAY_SIZE_BYTES);
cudaMalloc(&c_device, ARRAY_SIZE_BYTES);
// Copy the input arrays to the device
cudaMemcpy(a_device, a_host, ARRAY_SIZE_BYTES, cudaMemcpyHostToDevice);
cudaMemcpy(b_device, b_host, ARRAY_SIZE_BYTES, cudaMemcpyHostToDevice);
// Launch the kernel on the device
sum<<<1, ARRAY_SIZE>>>(a_device, b_device, c_device, ARRAY_SIZE);
// Copy the output array back from the device to the host and print its values
cudaMemcpy(c_host, c_device, ARRAY_SIZE_BYTES, cudaMemcpyDeviceToHost);
printf("a + b = c\n");
for(int i = 0; i < ARRAY_SIZE; i++)
{
printf("%d + %2d = %2d\n", a_host[i], b_host[i], c_host[i]);
}
printf("\n");
// Free resources
cudaFree(a_device);
cudaFree(b_device);
cudaFree(c_device);
return EXIT_SUCCESS;
}