CVB++ 14.1
Cvb/CppCudaMemory
// ----------------------------------------------------------------------------
/// \brief Demonstrates how to pass user-allocated memory as the target
/// buffer(s) for image acquisition from a camera. In this particular
/// example, the allocated memory is either CUDA managed memory or
/// host-mapped memory, enabling automatic page migration or "zero-copy"
/// access to the acquired images, respectively, and thus reducing manual
/// copy overhead.
// ----------------------------------------------------------------------------

#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <cstdlib>
#include <cstdint>
#include <chrono>
#include <limits>
#include <algorithm>
#include <iterator>
#include <tuple>
#include <iomanip>
#include <sstream>

#include <cvb/device_factory.hpp>
#include <cvb/global.hpp>
#include <cvb/driver/driver.hpp>
#include <cvb/image.hpp>
#include <cvb/composite.hpp>
#include <cvb/utilities/utilities.hpp>
#include <cvb/driver/image_stream.hpp>

#include <cuda_runtime.h>

/*
 * Controls whether managed memory ("automatic page migration") or
 * "zero-copy" mapped host memory is used.
 *
 * In the \b true case cudaMallocManaged is used to allocate the memory.
 * The pointer is written to by the acquisition engine and the same pointer
 * can be used by CUDA device kernels.
 * In the \b false case, cudaHostAlloc with cudaHostAllocWriteCombined | cudaHostAllocMapped
 * is used for the acquisition engine buffers.
 * This is effectively "zero-copy" memory, as the GPU can work on the host memory
 * directly through the device pointer obtained from cudaHostGetDevicePointer.
 * Note: cudaHostAllocWriteCombined disables host caching of the data, which in turn
 * makes reads on the CPU slow, so the acquired buffers should mainly be consumed on the GPU.
 * In the latter case the output image is placed in manually managed device memory.
 */
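/*
 * For orientation only: a minimal sketch of the two allocation calls used further below in
 * main() (the buffer size is a placeholder, error handling is omitted):
 *
 *   void *ptr = nullptr;
 *   // USE_MANAGED_MEMORY == true: pageable between host and device
 *   cudaMallocManaged(&ptr, bufferSize, cudaMemAttachHost);
 *   // USE_MANAGED_MEMORY == false: pinned, mapped host memory ("zero-copy")
 *   cudaHostAlloc(&ptr, bufferSize, cudaHostAllocWriteCombined | cudaHostAllocMapped);
 */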
static const constexpr bool USE_MANAGED_MEMORY = true;

static const constexpr auto TIMEOUT = std::chrono::milliseconds(3000);
static const constexpr int NUM_ELEMENTS_TO_ACQUIRE = 4;
static const constexpr int NUM_BUFFERS = 4;
static const constexpr int SOBEL_RADIUS = 1;
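// 3x3 Sobel coefficients (x-gradient) in GPU constant memory; the y-gradient pass in gpuSobel reads the same array column-wise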
__constant__ const int sobelKernel[9] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};

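// evaluates a CUDA runtime call and terminates with a diagnostic message if it did not succeed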
#define CUDA_CHECK(ans) \
  { \
    CudaAssert((ans), __FILE__, __LINE__); \
  }
inline void CudaAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
  if (code != cudaSuccess)
  {
    std::cerr << "CUDA Error: " << cudaGetErrorString(code) << " " << file << ":" << line << std::endl;
    if (abort)
      exit(code);
  }
}

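// checks that a CUDA device is present and that it supports the features required by the selected memory mode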
void SetupCudaDevice()
{
  int gpuCount = 0;
  CUDA_CHECK(cudaGetDeviceCount(&gpuCount));
  if (gpuCount == 0)
    throw std::runtime_error{"No CUDA device present."};

  cudaDeviceProp deviceProps;
  CUDA_CHECK(cudaGetDeviceProperties(&deviceProps, 0));
  std::cout << "Using CUDA device: " << deviceProps.name << std::endl;

  if (USE_MANAGED_MEMORY)
  {
    if (!deviceProps.managedMemory)
      throw std::runtime_error{"CUDA device doesn't support managed memory"};
  }
  else
  {
    if (!deviceProps.canMapHostMemory)
      throw std::runtime_error{"CUDA device doesn't support mapping host memory"};
    CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));
  }
}

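// Sobel edge filter: each block stages a tile of the input image plus a one-pixel halo in shared
// memory, then each thread computes one output pixel from the x- and y-gradients of that tile.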
__global__ void gpuSobel(const std::uint8_t *inputImage, int width, int height, std::uint8_t *targetImage)
{
  extern __shared__ std::uint8_t inputShMem[];

  // calculate the global index of the filtered output image
  const int block_start_x = blockDim.x * blockIdx.x;
  const int block_start_y = blockDim.y * blockIdx.y;
  const int o_x = blockDim.x * blockIdx.x + threadIdx.x;
  const int o_y = blockDim.y * blockIdx.y + threadIdx.y;
  const int output_index = o_y * (width - 2 * SOBEL_RADIUS) + o_x;
  const int s_x = threadIdx.x + SOBEL_RADIUS;
  const int s_y = threadIdx.y + SOBEL_RADIUS;
  const int shmem_width = blockDim.x + 2 * SOBEL_RADIUS + 1;
  const int shmem_height = blockDim.y + 2 * SOBEL_RADIUS;
  const int shmem_index = s_y * (shmem_width) + s_x;
  const int numberOfPix = (width - 2 * SOBEL_RADIUS) * (height - 2 * SOBEL_RADIUS);

  // calculate the global index of the input image
  const int i_x = o_x + SOBEL_RADIUS; // o_x = output_index % aoi_width
  const int i_y = o_y + SOBEL_RADIUS; // o_y = output_index / aoi_width
  const int input_index = i_y * width + i_x;
  inputShMem[shmem_index] = (input_index < width * height) ? inputImage[input_index] : 0;
  if (threadIdx.x < 2)
  { // l & r
    const int idx = i_y * width + block_start_x + threadIdx.x * (shmem_width - 2);
    inputShMem[s_y * shmem_width + threadIdx.x * (shmem_width - 2)] = (idx < width * height) ? inputImage[idx] : 0;
  }

  if (threadIdx.y < 2)
  { // t & b
    const int idx = (block_start_y + threadIdx.y * (shmem_height - 1)) * width + i_x;
    inputShMem[threadIdx.y * (shmem_height - 1) * shmem_width + s_x] = (idx < width * height) ? inputImage[idx] : 0;
  }

  if (threadIdx.y == 0 && threadIdx.x == 0)
  { // corners
    const int idxTopLeft = block_start_y * width + block_start_x;
    const int idxTopRight = block_start_y * width + block_start_x + shmem_width - 2;
    const int idxBotLeft = (block_start_y + shmem_height - 1) * width + block_start_x;
    const int idxBotRight = (block_start_y + shmem_height - 1) * width + block_start_x + shmem_width - 2;

    inputShMem[0] = (idxTopLeft < width * height) ? inputImage[idxTopLeft] : 0;
    inputShMem[shmem_width - 2] = (idxTopRight < width * height) ? inputImage[idxTopRight] : 0;
    inputShMem[(shmem_height - 1) * shmem_width] = (idxBotLeft < width * height) ? inputImage[idxBotLeft] : 0;
    inputShMem[(shmem_height - 1) * shmem_width + shmem_width - 2] =
      (idxBotRight < width * height) ? inputImage[idxBotRight] : 0;
  }

  __syncthreads(); // make sure every thread sees inputShMem.

  // test if the index is still in range
  if (output_index < numberOfPix)
  {
    // variables for Sobel filtering
    std::uint16_t sumX = 0, sumY = 0;

    // filter in x-direction
    sumX = inputShMem[shmem_index - shmem_width - 1] * sobelKernel[0]
           + inputShMem[shmem_index - shmem_width + 1] * sobelKernel[2] + inputShMem[shmem_index - 1] * sobelKernel[3]
           + inputShMem[shmem_index + 1] * sobelKernel[5] + inputShMem[shmem_index + shmem_width - 1] * sobelKernel[6]
           + inputShMem[shmem_index + shmem_width + 1] * sobelKernel[8];

    // filter in y-direction
    sumY = inputShMem[shmem_index - shmem_width - 1] * sobelKernel[0]
           + inputShMem[shmem_index - shmem_width] * sobelKernel[3]
           + inputShMem[shmem_index - shmem_width + 1] * sobelKernel[6]
           + inputShMem[shmem_index + shmem_width - 1] * sobelKernel[2]
           + inputShMem[shmem_index + shmem_width] * sobelKernel[5]
           + inputShMem[shmem_index + shmem_width + 1] * sobelKernel[8];

    targetImage[output_index] = min((sumX + sumY) / 2, std::numeric_limits<std::uint8_t>::max());
  }
}

namespace Tutorial
{

  /* FlowSetPool does not store the flow sets for further
   * use; therefore we create a subclass that releases the
   * buffers later.
   */
  class UserFlowSetPool final : public Cvb::Driver::FlowSetPool
  {

    using UserFlowSetPoolPtr = std::shared_ptr<UserFlowSetPool>;

  public:
    UserFlowSetPool(const std::vector<Cvb::FlowInfo> &flowInfo) noexcept
      : Cvb::FlowSetPool(flowInfo, Cvb::FlowSetPool::ProtectedTag{})
    {
    }

    virtual ~UserFlowSetPool()
    {
      for (auto &flowSet : *this)
        for (auto &flow : flowSet)
          if (USE_MANAGED_MEMORY)
          {
            CUDA_CHECK(cudaFree(flow.Buffer));
          }
          else
          {
            CUDA_CHECK(cudaFreeHost(flow.Buffer));
          }
    }

    static UserFlowSetPoolPtr Create(const std::vector<Cvb::FlowInfo> &flowInfos)
    {
      return std::make_shared<UserFlowSetPool>(flowInfos);
    }
  };

} // namespace Tutorial

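// waits for NUM_ELEMENTS_TO_ACQUIRE composites on the given stream, saves each acquired image to a
// file and passes it to processFunc together with its linear access data and an output file name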
template <class Func>
void AcquireData(const Cvb::CompositeStreamPtr &stream, Func &&processFunc)
{
  for (auto i = 0; i < NUM_ELEMENTS_TO_ACQUIRE; i++)
  {
    Cvb::CompositePtr composite;
    Cvb::WaitStatus waitStatus;
    Cvb::NodeMapEnumerator enumerator;
    std::tie(composite, waitStatus, enumerator) = stream->WaitFor(TIMEOUT);
    switch (waitStatus)
    {
      default:
        std::cout << "wait status unknown.\n";
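        // intentional fall-through: an unknown wait status is handled like an abort or timeout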
      case Cvb::WaitStatus::Abort:
      case Cvb::WaitStatus::Timeout:
      {
        std::cout << "wait status not ok\n";
        continue;
      }
      case Cvb::WaitStatus::Ok:
      {
        break;
      }
    }

    // assume the composite's first element is an image
    auto firstElement = composite->ItemAt(0);

    if (!Cvb::holds_alternative<Cvb::ImagePtr>(firstElement))
    {
      std::cout << "composite does not contain an image at the first element\n";
      continue;
    }

    auto image = Cvb::get<Cvb::ImagePtr>(firstElement);
    auto linearAccess = image->Plane(0).LinearAccess();
    std::cout << "acquired image: " << i << " at memory location: " << reinterpret_cast<intptr_t>(linearAccess.BasePtr())
              << "\n";

    // generate the output file names
    std::stringstream acqFileName;
    acqFileName << "acquired" << std::setw(2) << std::setfill('0') << i << ".png";
    image->Save(acqFileName.str());
    std::stringstream procFileName;
    procFileName << "processed" << std::setw(2) << std::setfill('0') << i << ".png";

    // call the processing lambda
    processFunc(image, linearAccess, procFileName.str());
  }
}

int main()
{
  try
  {
    // check for present CUDA device, print name and check for required device features
    SetupCudaDevice();

    // discover transport layers
    auto infoList = Cvb::DeviceFactory::Discover(Cvb::DiscoverFlags::IgnoreVins);

    // can't continue the demo if there's no available device
    if (infoList.empty())
      throw std::runtime_error("There is no available device for this demonstration.");

    // instantiate the first device in the discovered list
    auto device = Cvb::DeviceFactory::Open<Cvb::GenICamDevice>(infoList[0].AccessToken(), Cvb::AcquisitionStack::GenTL);

    if (device->StreamCount() == 0)
      throw std::runtime_error("There is no available stream for this demonstration.");

    // get the first stream
    auto stream = device->Stream<Cvb::CompositeStream>(0);

    // get the flow set information that is needed for the current stream
    auto flowSetInfo = stream->FlowSetInfo();

    // create a subclass of FlowSetPool to store the created buffer
    auto flowSetPoolPtr = Tutorial::UserFlowSetPool::Create(flowSetInfo);

    std::generate_n(std::back_inserter(*flowSetPoolPtr), NUM_BUFFERS, [&flowSetInfo]() {
      auto flows = std::vector<void *>(flowSetInfo.size());
      std::transform(flowSetInfo.begin(), flowSetInfo.end(), flows.begin(), [](Cvb::Driver::FlowInfo info) {
        void *ptr = nullptr;
        if (USE_MANAGED_MEMORY)
        {
          // allocate managed memory, but attach to host
          CUDA_CHECK(cudaMallocManaged(&ptr, info.Size, cudaMemAttachHost));
        }
        else
        {
          // allocate host memory, which is readable from the GPU
          CUDA_CHECK(cudaHostAlloc(&ptr, info.Size, cudaHostAllocWriteCombined | cudaHostAllocMapped));
        }
        return ptr;
      });
      return flows;
    });

    // register the user flow set pool
    stream->RegisterExternalFlowSetPool(std::move(flowSetPoolPtr));

    std::uint8_t *dTarget = nullptr;
    std::uint8_t *target = nullptr;
    if (USE_MANAGED_MEMORY)
    {
      // allocate managed memory, but attach to host
      CUDA_CHECK(cudaMallocManaged(&dTarget, flowSetInfo.front().Size, cudaMemAttachHost));
    }
    else
    {
      // allocate manually managed memory on GPU
      CUDA_CHECK(cudaMalloc(&dTarget, flowSetInfo.front().Size));
      // allocate pinned memory on host
      CUDA_CHECK(cudaMallocHost(&target, flowSetInfo.front().Size));
    }

    // start the data acquisition for that stream
    stream->EngineStart();
    stream->DeviceStart();

    // create a GPU processing stream
    cudaStream_t gpuStream;
    CUDA_CHECK(cudaStreamCreate(&gpuStream));
    if (USE_MANAGED_MEMORY)
    {
      AcquireData(stream, [&](Cvb::ImagePtr &image, Cvb::LinearAccessData &linearAccess, std::string outputName) {
        const dim3 blocks{(static_cast<unsigned>(image->Width() - 2 * SOBEL_RADIUS) + 15) / 16,
                          (static_cast<unsigned>(image->Height() - 2 * SOBEL_RADIUS) + 15) / 16};
        const dim3 threads{16, 16};
        std::uint8_t *dInput = reinterpret_cast<std::uint8_t *>(linearAccess.BasePtr());
        // attach memory to stream -> make writable from GPU
        CUDA_CHECK(cudaStreamAttachMemAsync(gpuStream, dTarget));
        CUDA_CHECK(cudaStreamAttachMemAsync(gpuStream, dInput));

        // enqueue CUDA kernel
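        // dynamic shared memory size in bytes: one byte per tile pixel, i.e. shmem_width * shmem_height for a 16x16 block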
        gpuSobel<<<blocks, threads, (16 + 2 * SOBEL_RADIUS + 1) * (16 + 2 * SOBEL_RADIUS), gpuStream>>>(
          dInput, image->Width(), image->Height(), dTarget);

        // re-attach the target buffer to the host so the result can be read by the CPU after synchronization
        CUDA_CHECK(cudaStreamAttachMemAsync(gpuStream, dTarget, 0, cudaMemAttachHost));

        // detach the input from the stream -> make it writable from the acquisition engine again
        CUDA_CHECK(cudaStreamAttachMemAsync(gpuStream, dInput, 0, cudaMemAttachHost));
        CUDA_CHECK(cudaStreamSynchronize(gpuStream));

        // map the managed memory to a CVB image
        auto wrapped = Cvb::WrappedImage::FromGrey8Pixels(dTarget, image->Width() - 2 * SOBEL_RADIUS,
                                                          image->Height() - 2 * SOBEL_RADIUS);
        wrapped->Save(outputName);
      });
    }
    else
    {
      AcquireData(stream, [&](Cvb::ImagePtr &image, Cvb::LinearAccessData &linearAccess, std::string outputName) {
        const dim3 blocks{(static_cast<unsigned>(image->Width()) + 15) / 16,
                          (static_cast<unsigned>(image->Height()) + 15) / 16};
        const dim3 threads{16, 16};

        // get GPU pointer for mapped host pointer
        std::uint8_t *dImage;
        CUDA_CHECK(cudaHostGetDevicePointer(&dImage, reinterpret_cast<void *>(linearAccess.BasePtr()), 0));

        // enqueue CUDA kernel
        gpuSobel<<<blocks, threads, (16 + 2 * SOBEL_RADIUS + 1) * (16 + 2 * SOBEL_RADIUS), gpuStream>>>(
          dImage, image->Width(), image->Height(), dTarget);

        // copy output to host memory
        CUDA_CHECK(cudaMemcpyAsync(target, dTarget,
                                   (image->Width() - 2 * SOBEL_RADIUS) * (image->Height() - 2 * SOBEL_RADIUS),
                                   cudaMemcpyDeviceToHost, gpuStream));
        CUDA_CHECK(cudaStreamSynchronize(gpuStream));

        // map host memory to Cvb image
        auto wrapped = Cvb::WrappedImage::FromGrey8Pixels(target, image->Width() - 2 * SOBEL_RADIUS,
                                                          image->Height() - 2 * SOBEL_RADIUS);
        wrapped->Save(outputName);
      });
    }
    CUDA_CHECK(cudaStreamDestroy(gpuStream));

    stream->DeviceAbort();
    stream->EngineAbort();

    // deregister the user flow set pool so that its buffers can be released (releaseCallback)
    stream->DeregisterFlowSetPool();
  }
  catch (const std::exception &e)
  {
    std::cout << e.what() << std::endl;
  }

  return 0;
}