Execution configuration

<<<...>>>

Definition

<<<...>>> is the special CUDA syntax called the execution configuration. It must be specified every time a function declared with the __global__ function specifier is invoked.

<<<dim3 grid_size,
   dim3 block_size,
   size_t memory_size,
   cudaStream_t stream>>>

Parameters

grid_size

Specifies the size of the grid, that is, the number of blocks in each of the three dimensions grid_size.x, grid_size.y and grid_size.z.

block_size

Specifies the size of each block, that is, the number of threads in each of the three dimensions block_size.x, block_size.y and block_size.z.
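
Both grid_size and block_size are of type dim3. A dim3 can be constructed from one, two or three integers, and any dimension left unspecified defaults to 1. As a minimal sketch:

dim3 grid_size(3, 2);      // a 3 x 2 x 1 grid; z defaults to 1
dim3 block_size(5, 4, 1);  // 5 x 4 x 1 threads per block

// A plain integer is also accepted where a dim3 is expected:
// <<<8, 128>>> launches 8 one-dimensional blocks of 128 threads each.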

memory_size [Optional]

Specifies the number of bytes of shared memory to allocate dynamically for each block; this is the memory used by __shared__ variables declared as external arrays. The dynamically allocated memory comes in addition to the statically allocated shared memory; if their sum exceeds the amount of shared memory available per block, the kernel launch fails. Since memory_size is an optional argument, it has a default value; if it is not passed, it defaults to 0.
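
As an illustrative sketch (the kernel below is hypothetical, not part of the syntax), memory_size pairs with an extern __shared__ array whose size is only known at launch time:

__global__ void reverse_in_block(int* data)
{
    // The size of this array is not fixed at compile time; it is set by
    // the memory_size argument of the execution configuration.
    extern __shared__ int buffer[];

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    buffer[threadIdx.x] = data[i];
    __syncthreads();
    data[i] = buffer[blockDim.x - 1 - threadIdx.x];
}

// Launched with one int of dynamic shared memory per thread, assuming a
// one-dimensional configuration whose total thread count exactly matches
// the number of elements:
// reverse_in_block<<<grid_size, block_size, block_size.x * sizeof(int)>>>(data_device);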

stream [Optional]

Specifies the stream in which to enqueue the kernel launch. Since stream is an optional argument, it has a default value; if it is not passed, it defaults to 0, the default stream.
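
A sketch of a launch passing all four arguments, with my_kernel standing in for any __global__ function and no dynamic shared memory (hence a memory_size of 0):

cudaStream_t stream;
cudaStreamCreate(&stream);

// Enqueue the kernel in the created stream instead of the default stream.
my_kernel<<<grid_size, block_size, 0, stream>>>(/* kernel arguments */);

// Wait for the stream to finish, then release it.
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);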

Example

#include <stdio.h>
#include <stdlib.h>

/**
 * @brief This kernel sets each array element to the linear index of the
 * block that processed it.
 **/
__global__ void set_to_block_id(int* a, int width, int height)
{
    int my_index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int my_index_y = blockIdx.y * blockDim.y + threadIdx.y;
    if(my_index_x < width && my_index_y < height)
    {
        a[my_index_y * width + my_index_x] = gridDim.x * blockIdx.y + blockIdx.x;
    }
}

/**
 * @brief Illustrates how to use the execution configuration when launching a
 * kernel.
 * @details This application consists of a kernel that sets each array element
 * it processes to the index of the block by which it has been processed. This
 * makes the distribution of the workload visible and shows how the array
 * elements are dispatched to CUDA threads. The application uses the simplest
 * form of the execution configuration; it does not pass the optional
 * arguments.
 **/
int main(int argc, char* argv[])
{
    const int ARRAY_SIZE_X = 15;
    const int ARRAY_SIZE_Y = 8;
    const int ARRAY_SIZE_BYTES = ARRAY_SIZE_X * ARRAY_SIZE_Y * sizeof(int);

    // Declare and initialise the array on the host.
    int a_host[ARRAY_SIZE_X * ARRAY_SIZE_Y];
    for(int i = 0; i < ARRAY_SIZE_X * ARRAY_SIZE_Y; i++)
    {
        a_host[i] = 0;
    }

    // Declare the pointer that will point to the memory allocated on the device.
    int* a_device;

    // Allocate memory on the device
    cudaMalloc(&a_device, ARRAY_SIZE_BYTES);

    // Copy the input array to the device
    cudaMemcpy(a_device, a_host, ARRAY_SIZE_BYTES, cudaMemcpyHostToDevice);

    // Set up the execution configuration and launch the kernel on the device
    dim3 grid_size;
    grid_size.x = 3;
    grid_size.y = 2;
    grid_size.z = 1;

    dim3 block_size;
    block_size.x = 5;
    block_size.y = 4;
    block_size.z = 1;
    set_to_block_id<<<grid_size, block_size>>>(a_device, ARRAY_SIZE_X, ARRAY_SIZE_Y);

    // Copy the output array back from the device to the host and print its values
    cudaMemcpy(a_host, a_device, ARRAY_SIZE_BYTES, cudaMemcpyDeviceToHost);
    for(int i = 0; i < ARRAY_SIZE_Y; i++)
    {
        for(int j = 0; j < ARRAY_SIZE_X; j++)
        {
            printf("%d ", a_host[i * ARRAY_SIZE_X + j]);
        }
        printf("\n");
    }
    printf("\n");
    fflush(stdout);

    // Free resources
    cudaFree(a_device);

    return EXIT_SUCCESS;
}
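
Since every element receives gridDim.x * blockIdx.y + blockIdx.x, with a 3 x 2 grid of 5 x 4 blocks over the 15 x 8 array the program prints the block layout:

0 0 0 0 0 1 1 1 1 1 2 2 2 2 2
0 0 0 0 0 1 1 1 1 1 2 2 2 2 2
0 0 0 0 0 1 1 1 1 1 2 2 2 2 2
0 0 0 0 0 1 1 1 1 1 2 2 2 2 2
3 3 3 3 3 4 4 4 4 4 5 5 5 5 5
3 3 3 3 3 4 4 4 4 4 5 5 5 5 5
3 3 3 3 3 4 4 4 4 4 5 5 5 5 5
3 3 3 3 3 4 4 4 4 4 5 5 5 5 5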