Execution space qualifiers




__host__ is an execution space qualifier indicating that the corresponding function is to be called from the host only and executed on the host. __host__ is not compatible with __global__ but it is compatible with __device__; combining the two makes the corresponding function executable both on the host and on a device, but callable only from the host. Such a function therefore contains code for both the device and the host, and the __CUDA_ARCH__ macro makes it possible to differentiate between the two at compile time. Specifying no execution space qualifier is equivalent to specifying __host__; in both cases the function is compiled for the host only. The other execution space qualifiers are __global__ and __device__.




#include <stdio.h>
#include <stdlib.h>

 * @brief This function sets the element value to 1 if the element is processed
 * on the device, and 2 if it is processed on the host.
__host__ __device__ void set_to_1_or_2(int* a, int number_on_gpu, int number_on_cpu)
    #ifdef __CUDA_ARCH__
        // We are on the device
        int my_index = blockIdx.x * blockDim.x + threadIdx.x;
         if(my_index < number_on_gpu)
             a[my_index] = 1;
         // We are on the host
         for(int i = number_on_gpu; i < number_on_gpu + number_on_cpu; i++)
             a[i] = 2;

 * @brief This function acts as an interface between the host and the __host__
 * __device__ function set_to_1_or_2.
__global__ void launch_set_to_1_or_2(int* a, int number_on_gpu, int number_on_cpu)
    set_to_1_or_2(a, number_on_gpu, number_on_cpu);

 * @brief Illustrates how to use a function both on the host and device.
int main(int argc, char* argv[])
    const int ARRAY_SIZE_GPU = 10;
    const int ARRAY_SIZE_CPU = 10;
    const int ARRAY_SIZE_BYTES = (ARRAY_SIZE_GPU + ARRAY_SIZE_CPU) * sizeof(int);

    // Declare pointers that will point to the memory allocated on the host.
    int a_host[ARRAY_SIZE_GPU + ARRAY_SIZE_CPU];
    for(int i = 0; i < ARRAY_SIZE_GPU + ARRAY_SIZE_CPU; i++)
        a_host[i] = 0;

    // Launch the host version
    set_to_1_or_2(a_host, ARRAY_SIZE_GPU, ARRAY_SIZE_CPU);

    // Declare pointers that will point to the memory allocated on the device.
    int* a_device;

    // Allocate memory on the device
    cudaMalloc(&a_device, ARRAY_SIZE_BYTES);

    // Copy the input arrays to the device
    cudaMemcpy(a_device, a_host, ARRAY_SIZE_BYTES, cudaMemcpyHostToDevice);

    // Launch the kernel on the device
    dim3 grid_size;
    grid_size.x = 1;
    grid_size.y = 1;
    grid_size.z = 1;

    dim3 block_size;
    block_size.x = ARRAY_SIZE_GPU;
    block_size.y = 1;
    block_size.z = 1;
    launch_set_to_1_or_2<<<grid_size, block_size>>>(a_device, ARRAY_SIZE_GPU, ARRAY_SIZE_CPU);

    // Copy the output array back from the device to the host and print its values
    cudaMemcpy(a_host, a_device, ARRAY_SIZE_BYTES, cudaMemcpyDeviceToHost);
    for(int i = 0; i < ARRAY_SIZE_GPU + ARRAY_SIZE_CPU; i++)
        printf("%d ", a_host[i]);

    // Free resources

    return EXIT_SUCCESS;