@nathanletwory
I created a CUDA project using the CUDA 12.0 Runtime template. Using this template creates a project that already contains working CUDA code in Kernel.cu.

The included CUDA C++ example code in Kernel.cu is simple; it adds two arrays element by element. Most of the code is shown below.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
int main()
{
const int arraySize = 5;
const int a[arraySize] = { 1, 2, 3, 4, 5 };
const int b[arraySize] = { 10, 20, 30, 40, 50 };
int c[arraySize] = { 0 };
// Add vectors in parallel.
cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
c[0], c[1], c[2], c[3], c[4]);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors a and b from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
After a build of this CUDA project, the compiled outputs end up in the Release directory. Kernel.cu happens to include a main() section, so it can be run by itself. But the function of interest for calling from a C++ DLL is:
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
This is what I want to call from my C++ DLL code. Once I get this working, I should be able to expand this solution, adding many other procedures to Kernel.cu and calling them from my C++ DLL. Of course, I will need to carefully consult the CUDA C++ Programming Guide (nvidia.com) in order to properly pass values and set up the number of Threads per Block, Blocks per Grid, and possibly Thread Block Clusters.
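For example, once an array has more elements than fit in a single block, I expect the launch needs a computed grid size and the kernel needs a bounds check. A minimal sketch of the pattern I have in mind (the 256-thread block size and the addKernelN name are my own assumptions, not from the template):

// Sketch only: the template's kernel reworked with the usual bounds check,
// so extra threads in the last block do no harm.
__global__ void addKernelN(int *c, const int *a, const int *b, unsigned int size)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size)
        c[i] = a[i] + b[i];
}

// Inside addWithCuda: round the block count up so every element gets a thread.
const unsigned int threadsPerBlock = 256;  // assumed block size
unsigned int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
addKernelN<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_a, dev_b, size);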
Currently the header in my C++ DLL code, which supports Rhino operations, looks like this:
#include "StdAfx.h"
#include "rhinoSdkPlugInDeclare.h"
//#include "SampleImportGeomviewPlugIn.h"
#include "Resource.h"
#include <string>
#include <cstdlib>
#include "stdafx.h"
#include <iostream>
#include <fstream>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <windows.h>
#include <sstream> // std::istringstream
//#include <winsock2.h>
#include <ppl.h>
//#include <concurrent_vector.h>
#include <array>
#include "stdafx.h"
#include <chrono>
#include <math.h>
#include <list>
#include <queue>
#include <unordered_set>
#include <thread> // sleep_for
//#include <map>
//#include <unordered_map>
#include <filesystem>
#include <sys/stat.h>
#include <wchar.h>
//#include <random>
#include <cstdlib>
#include <tuple>
#include <set>
using namespace std::this_thread; // sleep_for, sleep_until
using namespace std::chrono_literals; // ns, us, ms, s, h, etc.
#define COMPILER_SUPPORTS_128_BIT_INTEGERS 1
//#include <vector>
//#include <tuple>
//#include <algorithm>
//#include <iostream>
#include <locale.h>
// The plug-in object must be constructed before any plug-in classes derived
// from CRhinoCommand. The #pragma init_seg(lib) ensures that this happens.
#pragma warning( push )
#pragma warning( disable : 4073 )
#pragma init_seg( lib )
#pragma warning( pop )
using namespace concurrency;
using namespace std;
#define DLLEXPORT extern "C" __declspec(dllexport)
#define UCLASS()
#define GENERATED_BODY()
#define UFUNCTION()
What do I need to add to this header in order to call the addWithCuda procedure in the CUDA code compiled into the Kernel.cu OBJ file? Or is there something else that needs to be changed so that my C++ DLL code can call the CUDA procedure?
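My guess is that the DLL only needs to see a matching prototype, plus cuda_runtime.h for the cudaError_t type, and that the Kernel.cu OBJ file and cudart.lib then have to be handed to the DLL project's linker. A sketch of what I mean (Kernel.cuh is a name I made up; it would be included by both Kernel.cu and my DLL source):

// Kernel.cuh -- hypothetical shared header for the nvcc-compiled helper.
#pragma once
#include "cuda_runtime.h"  // for cudaError_t

// Prototype of the helper that nvcc compiles into Kernel.cu.obj.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

I assume that as long as both the DLL and the host-side code in Kernel.cu are compiled as C++ with the same MSVC toolset, the mangled names will match; otherwise I would wrap the prototype in extern "C" on both sides.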
Lastly, in my C++ DLL, would I call addWithCuda using:
const int arraySize = 5;
const int a[arraySize] = { 1, 2, 3, 4, 5 };
const int b[arraySize] = { 10, 20, 30, 40, 50 };
int c[arraySize] = { 0 };

// Add vectors in parallel.
cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addWithCuda failed!");
    return 1;
}
just as shown in the main() section of the Kernel.cu code?
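For context, here is how I picture the call sitting in the DLL, reusing the DLLEXPORT macro already defined in my header above (AddArraysOnGpu is a made-up name, purely for illustration):

// Hypothetical exported wrapper that forwards to the CUDA helper in Kernel.cu.obj.
DLLEXPORT int AddArraysOnGpu(int *c, const int *a, const int *b, unsigned int size)
{
    cudaError_t cudaStatus = addWithCuda(c, a, b, size);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;  // nonzero signals failure to the caller
    }
    return 0;
}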
Regards,
Terry.