    float *dev_dets;
    cudaError_t err = cudaSuccess;
    err = cudaMalloc((void **)&dev_dets, sizeof(dets));
    if (err != cudaSuccess) {
        printf("cudaMalloc failed!");
        return 1;
    }
    cudaMemcpy(dev_dets, dets, sizeof(dets), cudaMemcpyHostToDevice);
    std::cout << "Copied data to GPU.\n";

    // copy the data back from the GPU to the host
    float host_dets[sizeof(dets) / sizeof(float)];
    cudaMemcpy(host_dets, dev_dets, sizeof(dets), cudaMemcpyDeviceToHost);
    std::cout << "Copied from cuda back to host.\n";
    std::cout << "host_dets size: " << sizeof(host_dets) << std::endl;
    for (int i = 0; i < sizeof(dets) / sizeof(float); i++) {
        std::cout << host_dets[i] << " ";
    }
    std::cout << std::endl;
    cudaFree(dev_dets);

    std::cout << "done.\n";
    return 0;
}
// Output:
// 96
// Copied data to GPU.
// Copied from cuda back to host.
// host_dets size: 96
// 2334567611234545122247479455665203755750000
// done.
// Output: thread_id (3,1) block_id (0,1) coordinate (3, 3), global index 27
Note that std::cout cannot be used for printing inside device code; the std namespace is not available on the GPU.
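Kernels print with the device-side printf instead. The following is a minimal sketch, not taken from the original post: the kernel name and the grid/block shapes (dim3 grid(2, 2), dim3 block(4, 2)) are assumptions chosen so that thread (3,1) of block (0,1) reproduces the sample output line above.

#include <cstdio>

// Each thread prints its indices with device-side printf.
__global__ void print_index_kernel() {
    int ix = blockIdx.x * blockDim.x + threadIdx.x;   // global x coordinate
    int iy = blockIdx.y * blockDim.y + threadIdx.y;   // global y coordinate
    int width = gridDim.x * blockDim.x;               // threads per row of the grid
    int idx = iy * width + ix;                        // linear global index
    printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d, %d), global index %d\n",
           threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, ix, iy, idx);
}

int main() {
    dim3 block(4, 2);   // assumed block shape
    dim3 grid(2, 2);    // assumed grid shape
    print_index_kernel<<<grid, block>>>();
    cudaDeviceSynchronize();   // flush device printf output before exiting
    return 0;
}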
CUDA's Thrust Library
Thrust is a C++ template library for CUDA modeled on the Standard Template Library (STL). Used together with CUDA C, it saves a great deal of time otherwise spent hand-optimizing algorithms while keeping both performance and development efficiency. Thrust ships with the CUDA Toolkit, so no extra installation is needed: just include the corresponding headers and qualify calls with the thrust namespace. Avoid using namespace std; because Thrust and the STL share a large number of identical names.
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>

int main() {
    // H has storage for 4 integers
    thrust::host_vector<int> H(4);

    // initialize individual elements
    H[0] = 14;
    H[1] = 20;
    H[2] = 38;
    H[3] = 46;
    H.push_back(52);

    // H.size() returns the size of vector H
    std::cout << "H has size " << H.size() << std::endl;

    // print contents of H
    for (auto i : H)
        std::cout << i << std::endl;

    // resize H
    H.resize(2);
    std::cout << "H now has size " << H.size() << std::endl;

    // copy host_vector H to device_vector D
    thrust::device_vector<int> D = H;

    // elements of D can be modified
    D[0] = 99;
    D[1] = 88;

    // print contents of D
    for (auto i : D)
        std::cout << i << std::endl;

    // H and D are automatically deleted when the function returns
    return 0;
}
// Output:
// H has size 5
// 14
// 20
// 38
// 46
// 52
// H now has size 2
// 99
// 88
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <iostream>

int main() {
    // initialize all ten integers of a device_vector to 1
    thrust::device_vector<int> D(10, 1);

    // set the first seven elements of the vector to 9
    thrust::fill(D.begin(), D.begin() + 7, 9);

    // initialize a host_vector with the first five elements of D
    thrust::host_vector<int> H(D.begin(), D.begin() + 5);

    // set the elements of H to 0, 1, 2, 3, ...
    thrust::sequence(H.begin(), H.end());

    // copy all of H back to the beginning of D
    thrust::copy(H.begin(), H.end(), D.begin());

    // print D, which is now 0 1 2 3 4 9 9 1 1 1
    for (auto i : D)
        std::cout << i << " ";
    std::cout << std::endl;

    return 0;
}
// raw pointer to device memory
int *raw_ptr;
cudaMalloc((void **)&raw_ptr, N * sizeof(int));

// wrap the raw pointer with a device_ptr
thrust::device_ptr<int> dev_ptr(raw_ptr);

// use device_ptr in thrust algorithms
thrust::fill(dev_ptr, dev_ptr + N, (int)0);
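The opposite direction also comes up when Thrust-managed memory has to be handed to a hand-written kernel or a CUDA API call. The sketch below is not from the original post; it uses thrust::raw_pointer_cast to extract the underlying device pointer from a device_vector, and my_kernel is a hypothetical kernel name.

// extract the raw device pointer from a device_vector
thrust::device_vector<int> d_vec(N);
int *raw = thrust::raw_pointer_cast(d_vec.data());

// the raw pointer can now be passed to a plain CUDA kernel
// (my_kernel is a hypothetical kernel taking an int* and a length)
my_kernel<<<(N + 255) / 256, 256>>>(raw, N);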
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <iostream>
#include <iterator>

int main() {
    // allocate three device_vectors with 10 elements
    thrust::device_vector<int> X(10);
    thrust::device_vector<int> Y(10);
    thrust::device_vector<int> Z(10);

    // initialize X to 0, 1, 2, 3, ...
    thrust::sequence(X.begin(), X.end());

    // compute Y = -X
    thrust::transform(X.begin(), X.end(), Y.begin(), thrust::negate<int>());

    // fill Z with twos
    thrust::fill(Z.begin(), Z.end(), 2);

    // compute Y = X mod 2
    thrust::transform(X.begin(), X.end(), Z.begin(), Y.begin(), thrust::modulus<int>());

    // replace all the ones in Y with tens
    thrust::replace(Y.begin(), Y.end(), 1, 10);

    // print Y
    thrust::copy(Y.begin(), Y.end(), std::ostream_iterator<int>(std::cout, " "));

    return 0;
}
// Output: 0 10 0 10 0 10 0 10 0 10
SAXPY
SAXPY (Scalar Alpha X Plus Y) is a function provided by the BLAS (Basic Linear Algebra Subprograms) library and a common operation on vector processors. It combines scalar multiplication with vector addition, $y = a*x + y$, where $x$ and $y$ are vectors and $a$ is a scalar constant. The program below defines a functor that implements SAXPY.
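The definition of the saxpy_functor used in the next snippet does not appear in this excerpt; the version below is a reconstruction in the style of the Thrust quick-start guide, offered as a reference sketch rather than the original post's exact code.

struct saxpy_functor {
    const float a;

    saxpy_functor(float _a) : a(_a) {}

    __host__ __device__
    float operator()(const float &x, const float &y) const {
        // element-wise a * x + y
        return a * x + y;
    }
};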
void saxpy(float A, thrust::device_vector<float> &X, thrust::device_vector<float> &Y) {
    // Y = A * X + Y
    thrust::transform(X.begin(), X.end(), Y.begin(), Y.begin(), saxpy_functor(A));
}
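A brief usage sketch (the vector contents and the scalar 2.0f are illustrative assumptions, not from the original post):

thrust::device_vector<float> X(4, 1.0f);   // X = {1, 1, 1, 1}
thrust::device_vector<float> Y(4, 3.0f);   // Y = {3, 3, 3, 3}
saxpy(2.0f, X, Y);                         // Y becomes {5, 5, 5, 5}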
// square<T> computes the square of a number f(x) -> x*x
template <typename T>
struct square {
    __host__ __device__
    T operator()(const T &x) const {
        return x * x;
    }
};
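One common use of such a functor is computing a vector's Euclidean norm with thrust::transform_reduce: each element is squared on the fly and the squares are summed in a single pass. The sketch below assumes a thrust::device_vector<float> named d_x already holds the data.

#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <cmath>

// assumes d_x is an existing thrust::device_vector<float>
float sum_of_squares = thrust::transform_reduce(d_x.begin(), d_x.end(),
                                                square<float>(),        // unary op applied to each element
                                                0.0f,                   // initial value of the reduction
                                                thrust::plus<float>()); // binary reduction op
float norm = std::sqrt(sum_of_squares);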
#include <thrust/sort.h>
#include <thrust/functional.h>
...
const int N = 6;
int A[N] = {1, 4, 2, 8, 5, 7};
thrust::sort(A, A + N);
// A is now {1, 2, 4, 5, 7, 8}
...
const int N = 6;
int  keys[N]   = { 1,   4,   2,   8,   5,   7 };
char values[N] = {'a', 'b', 'c', 'd', 'e', 'f'};
thrust::sort_by_key(keys, keys + N, values);
// keys is now   { 1,   2,   4,   5,   7,   8 }
// values is now {'a', 'c', 'b', 'e', 'f', 'd'}
...
const int N = 6;
int A[N] = {1, 4, 2, 8, 5, 7};
thrust::stable_sort(A, A + N, thrust::greater<int>());
// A is now {8, 7, 5, 4, 2, 1}
In the example above, thrust::stable_sort accepts a user-defined comparison operator.
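The comparator does not have to come from thrust/functional.h; a user-defined functor works as well. The sketch below is an illustration, not from the original post, and reuses the headers included above: it sorts by absolute value, and the __host__ __device__ markers let the same comparator be used with either host arrays or device_vectors.

struct abs_less {
    __host__ __device__
    bool operator()(int a, int b) const {
        // order by absolute value
        return (a < 0 ? -a : a) < (b < 0 ? -b : b);
    }
};

const int N = 6;
int A[N] = {-8, 1, -4, 7, -2, 5};
thrust::stable_sort(A, A + N, abs_less());
// A is now {1, -2, -4, 5, 7, -8}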
max_element(min_element)
Finds the maximum (minimum) element.
#include <thrust/extrema.h>
...
thrust::device_vector<type>::iterator iter = thrust::max_element(dvec.begin(), dvec.end());
int position = iter - dvec.begin();
type max_val = *iter;
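thrust::min_element is used the same way, and thrust/extrema.h also provides thrust::minmax_element, which returns both iterators in one pass. A small self-contained sketch (the vector contents are an illustrative assumption, not from the original post):

#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <iostream>

int main() {
    // illustrative data
    int data[] = {3, 7, 1, 9, 4};
    thrust::device_vector<int> dvec(data, data + 5);

    // returns a pair: first points to the minimum, second to the maximum
    auto result = thrust::minmax_element(dvec.begin(), dvec.end());

    std::cout << "min " << *result.first
              << " at position " << (result.first - dvec.begin()) << std::endl;
    std::cout << "max " << *result.second
              << " at position " << (result.second - dvec.begin()) << std::endl;
    return 0;
}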
======== Warning: nvprof is not supported on devices with compute capability 8.0 and higher. Use NVIDIA Nsight Systems for GPU tracing and CPU sampling and NVIDIA Nsight Compute for GPU profiling. Refer https://developer.nvidia.com/tools-overview for more details.
Current mainstream CUDA drivers no longer support the nvprof command (on devices with compute capability 8.0 and higher), but it can still be used through NVIDIA Nsight Systems: running nsys nvprof ./*.o in a terminal shows the details of how the CUDA program executes.