Memory management




cudaMalloc is a function that can be called from the host or the device to allocate memory on the device, much like malloc for the host. The memory allocated with cudaMalloc must be freed with cudaFree. Other variants of cudaMalloc are cudaMallocPitch, cudaMallocArray, cudaMalloc3D, cudaMalloc3DArray, cudaMallocHost and cuMemAlloc.



__host__ __device__ cudaError_t cudaMalloc(void** buffer, size_t size);



buffer: The variable in which to store the address of the memory allocated on the device.


size: The size of the memory to allocate on the device, in bytes.

Return value

The error code returned from the memory allocation:

  • cudaSuccess: the memory was successfully allocated.
  • cudaErrorMemoryAllocation: an error occurred and the memory could not be allocated.




#include <stdio.h>
#include <stdlib.h>

/**
 * @brief This function sets the value of each element to the index of the
 * thread that processed it.
 */
__global__ void set_to_index(int* a, int size)
{
    // a:    array the kernel writes to; one element is written per thread.
    // size: number of elements in a; threads whose index is not below this
    //       value write nothing.

    // Flatten the block and thread coordinates along x into a single index.
    const int my_index = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against launches that contain more threads than array elements.
    if(my_index < size)
    {
        a[my_index] = my_index;
    }
}

/**
 * @brief Illustrates how to allocate memory on the device.
 * @details This application consists of a simple kernel launched to set the
 * elements of an array to their index in that array. Part of the process is to
 * allocate the memory on the device, before data can be sent to it.
 */
int main(int argc, char* argv[])
{
    // Allocates an array on the device, copies a zeroed host array to it, runs
    // set_to_index on it, copies the result back and prints it. Returns
    // EXIT_SUCCESS if every CUDA call succeeded, EXIT_FAILURE otherwise.
    const int ARRAY_SIZE = 10;
    const size_t ARRAY_SIZE_BYTES = ARRAY_SIZE * sizeof(int);

    // Declare the array living on the host and initialise it to zero.
    int a_host[ARRAY_SIZE];
    for(int i = 0; i < ARRAY_SIZE; i++)
    {
        a_host[i] = 0;
    }

    // Declare the pointer that will point to the memory allocated on the device.
    int* a_device = NULL;

    // Allocate memory on the device; nothing needs releasing if this fails.
    cudaError_t status = cudaMalloc(&a_device, ARRAY_SIZE_BYTES);
    if(status != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(status));
        return EXIT_FAILURE;
    }

    // Copy the input array to the device
    status = cudaMemcpy(a_device, a_host, ARRAY_SIZE_BYTES, cudaMemcpyHostToDevice);

    // Launch the kernel on the device
    if(status == cudaSuccess)
    {
        set_to_index<<<1, ARRAY_SIZE>>>(a_device, ARRAY_SIZE);
        // A kernel launch returns nothing, so its status is queried explicitly.
        status = cudaGetLastError();
    }

    // Copy the output array back from the device to the host
    if(status == cudaSuccess)
    {
        status = cudaMemcpy(a_host, a_device, ARRAY_SIZE_BYTES, cudaMemcpyDeviceToHost);
    }

    // Print the values only if every step above succeeded
    if(status == cudaSuccess)
    {
        for(int i = 0; i < ARRAY_SIZE; i++)
        {
            printf("%d ", a_host[i]);
        }
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
    }

    // Free resources: memory obtained from cudaMalloc must be released with
    // cudaFree, whether or not the steps above succeeded.
    const cudaError_t free_status = cudaFree(a_device);
    if(free_status != cudaSuccess)
    {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(free_status));
        status = free_status;
    }

    return (status == cudaSuccess) ? EXIT_SUCCESS : EXIT_FAILURE;
}