// Class template parameterised on the floating-point type (real_t) and the
// integer index type (index_t) used by the members below. The visible members
// drive the CUDA driver API (cuInit, cuMemAlloc, cuLaunchKernel, ...).
56 template<
typename real_t,
typename index_t>
class CUDATools
// Start-up sequence for the CUDA driver API. The enclosing function signature
// and the statements that follow each error message are not visible in this
// listing. Every checked call compares its result against CUDA_SUCCESS and
// prints a message to std::cout on failure.

// Initialise the driver API (flags argument is 0).
74 if(cuInit(0) != CUDA_SUCCESS)
76 std::cout <<
"Error initializing CUDA driver" << std::endl;;
// Query how many devices exist; the return code of this call is not checked.
81 cuDeviceGetCount(&deviceCount);
// NOTE(review): the condition guarding this message is not visible here;
// presumably it tests deviceCount == 0 -- confirm.
84 std::cout <<
"No CUDA-enabled devices found" << std::endl;
// Always selects the device with ordinal 0.
88 if(cuDeviceGet(&cuDevice, 0) != CUDA_SUCCESS)
90 std::cout <<
"Cannot get CUDA device" << std::endl;
// Create a context on that device with flags 0.
94 if(cuCtxCreate(&cuContext, 0, cuDevice) != CUDA_SUCCESS)
96 std::cout <<
"Error creating CUDA context" << std::endl;
// Load the compiled smoothing module; the path is relative, so it is resolved
// against the process's current working directory.
100 if(cuModuleLoad(&smoothModule,
"CUDA/Smooth.cubin") != CUDA_SUCCESS)
102 std::cout <<
"Error loading CUDA module \"Smooth\"" << std::endl;
// Uploads mesh and surface data to the device. The function name and its
// leading parameters are not visible in this listing; the visible tail of the
// parameter list takes the colour sets, the element quality vector, the
// orientation and the number of dimensions.
112 std::map<
int, std::deque<index_t> > & colour_sets, std::vector<real_t> & quality,
113 int orientation,
size_t dimensions)
// Element counts are read from the mesh and the surface objects.
118 NElements = mesh->_NElements;
119 NSElements = surface->get_number_facets();
// Flatten the host-side containers into plain arrays (see the *ToArray
// helpers) so they can be copied with a single memcpy each.
122 NNListToArray(mesh->NNList);
123 colourSetsToArray(colour_sets);
124 NEListToArray(mesh->NEList);
125 SNEListToArray(surface->SNEList);
126 surfaceNodesToArray(surface->surface_nodes);
// Each copyArrayToDevice call allocates a device buffer of the given element
// count and copies the host array into it.
129 copyArrayToDevice<real_t>(mesh->
get_coords(0), CUDA_coords, NNodes * ndims);
130 copyArrayToDevice<real_t>(mesh->
get_metric(0), CUDA_metric, NNodes * ndims * ndims);
131 copyArrayToDevice<real_t>(surface->get_normal(0), CUDA_normals, NSElements * ndims);
132 copyArrayToDevice<real_t>(&quality[0], CUDA_quality, NElements);
133 copyArrayToDevice<index_t>(&mesh->_ENList[0], CUDA_ENList, NElements * nloc);
134 copyArrayToDevice<index_t>(surface->get_coplanar_ids(), CUDA_coplanar_ids, NSElements);
135 copyArrayToDevice<index_t>(&surface->SENList[0], CUDA_SENList, NSElements * ndims);
136 copyArrayToDevice<index_t>(NNListArray, CUDA_NNListArray, NNListArray_size);
137 copyArrayToDevice<index_t>(NNListIndex, CUDA_NNListIndex, NNodes+1);
138 copyArrayToDevice<index_t>(colourArray, CUDA_colourArray, NNodes);
139 copyArrayToDevice<index_t>(NEListArray, CUDA_NEListArray, NEListArray_size);
140 copyArrayToDevice<index_t>(NEListIndex, CUDA_NEListIndex, NNodes+1);
141 copyArrayToDevice<index_t>(SNEListArray, CUDA_SNEListArray, NSElements * ndims);
142 copyArrayToDevice<index_t>(SNEListIndex, CUDA_SNEListIndex, NNodes);
143 copyArrayToDevice<uint32_t>(surfaceNodesArray, CUDA_surfaceNodesArray, surfaceNodesArray_size);
// SET_CONSTANT(X): look up the module-level symbol named X in smoothModule and
// copy the host variable CUDA_X into it. It relies on locals `address` and
// `symbol_size` being declared at the expansion site; neither call's return
// code is checked.
149 #define SET_CONSTANT(SYMBOL_NAME) \
150 cuModuleGetGlobal(&address, &symbol_size, smoothModule, #SYMBOL_NAME); \
151 cuMemcpyHtoD(address, &CUDA_ ## SYMBOL_NAME, symbol_size);
// "orientation" is handled separately from the macro: the device address of
// the symbol is kept in CUDA_orientation and the host value of `orientation`
// is copied into it.
170 cuModuleGetGlobal(&CUDA_orientation, &symbol_size, smoothModule,
"orientation");
171 cuMemcpyHtoD(CUDA_orientation, &orientation, symbol_size);
// Bodies of four small transfer functions (their signatures are not visible in
// this listing). They reuse the device buffers CUDA_coords / CUDA_metric
// without allocating.

// Host -> device: vertex coordinates (NNodes * ndims values).
176 copyArrayToDeviceNoAlloc<real_t>((real_t *) &mesh->_coords[0], CUDA_coords, NNodes * ndims);
// Host -> device: metric data (NNodes * ndims * ndims values).
181 copyArrayToDeviceNoAlloc<real_t>((real_t *) &mesh->metric[0], CUDA_metric, NNodes * ndims * ndims);
// Device -> host: vertex coordinates.
186 copyArrayFromDevice<real_t>((real_t *) &mesh->_coords[0], CUDA_coords, NNodes * ndims);
// Device -> host: metric data.
191 copyArrayFromDevice<real_t>((real_t *) &mesh->metric[0], CUDA_metric, NNodes * ndims * ndims);
// Bodies of the smoothStatus / activeVertices helpers (their signatures are
// not visible in this listing).

// Allocate one unsigned char per node for the smoothing status buffer.
196 if(cuMemAlloc(&CUDA_smoothStatus, NNodes *
sizeof(
unsigned char)) != CUDA_SUCCESS)
198 std::cout <<
"Error allocating CUDA memory" << std::endl;
// Store the buffer's device pointer in the module symbol "smoothStatus".
// Return codes of these two calls are not checked.
205 cuModuleGetGlobal(&address, &symbol_size, smoothModule,
"smoothStatus");
206 cuMemcpyHtoD(address, &CUDA_smoothStatus, symbol_size);
// Same pattern for the active-vertices buffer: one unsigned char per node.
211 if(cuMemAlloc(&CUDA_activeVertices, NNodes *
sizeof(
unsigned char)) != CUDA_SUCCESS)
213 std::cout <<
"Error allocating CUDA memory" << std::endl;
// Store the buffer's device pointer in the module symbol "activeVertices".
220 cuModuleGetGlobal(&address, &symbol_size, smoothModule,
"activeVertices");
221 cuMemcpyHtoD(address, &CUDA_activeVertices, symbol_size);
// Copy NNodes status bytes from the device back into `status`.
226 copyArrayFromDevice<unsigned char>( (
unsigned char *) &status[0], CUDA_smoothStatus, NNodes);
// Releases device buffers, host staging arrays and finally the CUDA context.
// The enclosing function signature is not visible in this listing. Return
// codes of cuMemFree / cuCtxDestroy are not checked.
// NOTE(review): CUDA_activeVertices is allocated with cuMemAlloc elsewhere in
// this class but is not freed in the lines visible here -- confirm whether it
// is released somewhere else.
231 cuMemFree(CUDA_coords);
232 cuMemFree(CUDA_metric);
233 cuMemFree(CUDA_normals);
234 cuMemFree(CUDA_quality);
235 cuMemFree(CUDA_ENList);
236 cuMemFree(CUDA_coplanar_ids);
237 cuMemFree(CUDA_SENList);
238 cuMemFree(CUDA_NNListArray);
239 cuMemFree(CUDA_NNListIndex);
240 cuMemFree(CUDA_colourArray);
241 cuMemFree(CUDA_SNEListArray);
242 cuMemFree(CUDA_SNEListIndex);
243 cuMemFree(CUDA_NEListArray);
244 cuMemFree(CUDA_NEListIndex);
245 cuMemFree(CUDA_surfaceNodesArray);
246 cuMemFree(CUDA_smoothStatus);
// Host-side arrays created with new[] by the *ToArray helpers.
248 delete[] NNListArray;
249 delete[] NNListIndex;
250 delete[] colourArray;
251 delete[] colourIndex;
252 delete[] surfaceNodesArray;
253 delete[] SNEListArray;
254 delete[] SNEListIndex;
255 delete[] NEListArray;
256 delete[] NEListIndex;
// The context is destroyed last, after all buffers have been freed.
258 cuCtxDestroy(cuContext);
// Resolve the kernel named by `method` inside smoothModule and keep its handle
// in smoothKernel; a failed lookup is reported on std::cout. (The enclosing
// function signature is not visible in this listing.)
263 if(cuModuleGetFunction(&smoothKernel, smoothModule, method.c_str()) != CUDA_SUCCESS)
265 std::cout <<
"Error loading CUDA kernel " << method << std::endl;
// Launches smoothKernel over the vertices of one colour set. The enclosing
// function signature is not visible in this listing.

// `colour` is decremented before use as an index into colourIndex, which
// matches colourSetsToArray storing set k at slot k-1. The device address is
// advanced in bytes, hence the multiplication by sizeof(index_t).
272 CUdeviceptr CUDA_ColourSetAddr = CUDA_colourArray + colourIndex[--colour] *
sizeof(
index_t);
273 index_t NNodesInSet = colourIndex[colour+1] - colourIndex[colour];
// Fixed block size of 32 threads; the block count is the ceiling of
// NNodesInSet / threadsPerBlock.
274 threadsPerBlock = 32;
275 blocksPerGrid = (NNodesInSet + threadsPerBlock - 1) / threadsPerBlock;
// Kernel parameters are passed as an array of pointers to the argument values.
277 void * args[] = {&CUDA_ColourSetAddr, &NNodesInSet};
// 1-D grid and 1-D block, 0 bytes of dynamic shared memory, stream 0.
279 CUresult result = cuLaunchKernel(smoothKernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, 0, args, NULL);
280 if(result != CUDA_SUCCESS)
// NOTE(review): `colour` was already decremented above, so the value printed
// here is one less than the value the caller passed in.
282 std::cout <<
"Error launching CUDA kernel for colour " << colour << std::endl;
// Block until the context has finished its outstanding work, and report a
// non-success result code.
286 result = cuCtxSynchronize();
287 if(result != CUDA_SUCCESS)
288 std::cout <<
"Sync result " << result << std::endl;
// Flattens the node-node adjacency list into two host arrays:
//   NNListArray - all neighbour ids stored back to back;
//   NNListIndex - for each node, the position in NNListArray where its
//                 neighbours start, plus one trailing entry holding the end
//                 position.
// Both arrays are allocated with new[] here.
292 void NNListToArray(
const std::vector< std::deque<index_t> > & NNList)
294 typename std::vector< std::deque<index_t> >::const_iterator vec_it;
295 typename std::deque<index_t>::const_iterator deque_it;
// Sum the sizes of all adjacency lists to get the flat array length.
// (The declarations/initialisation of `offset` and `index` are not visible in
// this listing; presumably both start at 0 -- confirm.)
299 for(vec_it = NNList.begin(); vec_it != NNList.end(); vec_it++)
300 offset += vec_it->size();
302 NNListArray_size = offset;
// The index array is sized from NNodes, while the loops below iterate over
// NNList; this assumes NNList.size() == NNodes -- TODO confirm.
304 NNListIndex =
new index_t[NNodes+1];
305 NNListArray =
new index_t[NNListArray_size];
// Record where each node's neighbours begin, then append them.
// (Presumably `offset` is reset between the two loops on lines not shown.)
309 for(vec_it = NNList.begin(); vec_it != NNList.end(); vec_it++)
311 NNListIndex[index++] = offset;
313 for(deque_it = vec_it->begin(); deque_it != vec_it->end(); deque_it++)
314 NNListArray[offset++] = *deque_it;
// Trailing entry: one past the last neighbour of the last node.
317 assert(index == NNList.size());
318 NNListIndex[index] = offset;
// Flattens the colour sets into two host arrays:
//   colourArray - the vertex ids of every set stored back to back;
//   colourIndex - start position of each set in colourArray, plus one trailing
//                 entry holding the end position.
// Also records the number of colours in NColours.
321 void colourSetsToArray(
const std::map<
int, std::deque<index_t> > & colour_sets)
323 typename std::map< int, std::deque<index_t> >::const_iterator map_it;
324 typename std::deque<index_t>::const_iterator deque_it;
326 NColours = colour_sets.size();
// colourArray holds at most NNodes entries; assumes the sets together contain
// no more than NNodes vertices -- TODO confirm.
328 colourIndex =
new index_t[NColours+1];
329 colourArray =
new index_t[NNodes];
// The map key minus one is used as the slot in colourIndex.
// NOTE(review): this stays in bounds only if the keys are exactly 1..NColours;
// verify against the code that builds colour_sets.
// (The declaration of `offset` is not visible in this listing.)
333 for(map_it = colour_sets.begin(); map_it != colour_sets.end(); map_it++)
335 colourIndex[map_it->first - 1] = offset;
337 for(deque_it = map_it->second.begin(); deque_it != map_it->second.end(); deque_it++)
338 colourArray[offset++] = *deque_it;
// Trailing entry: total number of vertices written.
341 colourIndex[colour_sets.size()] = offset;
344 void surfaceNodesToArray(
const std::vector<bool> & surface_nodes)
346 const size_t nbits =
sizeof(uint32_t) * 8;
348 surfaceNodesArray_size = NNodes / nbits + (NNodes % nbits ? 1 : 0);
349 surfaceNodesArray =
new uint32_t[surfaceNodesArray_size];
350 memset(surfaceNodesArray, 0, surfaceNodesArray_size *
sizeof(uint32_t));
352 for(
index_t i = 0; i < (int) surface_nodes.size(); i++)
353 if(surface_nodes[i] ==
true)
354 surfaceNodesArray[i / nbits] |= 1 << i % nbits;
// Flattens the surface node-element map into two host arrays:
//   SNEListArray - the element ids of every mapped node stored back to back
//                  (capacity NSElements * ndims);
//   SNEListIndex - for each node id present in the map, the start position of
//                  its elements in SNEListArray.
// Both arrays are allocated with new[] here.
357 void SNEListToArray(
const std::map<
int, std::set<index_t> > & SNEList)
359 typename std::map< int, std::set<index_t> >::const_iterator map_it;
360 typename std::set<index_t>::const_iterator set_it;
362 SNEListArray =
new index_t[NSElements*ndims];
363 SNEListIndex =
new index_t[NNodes];
// Nodes that do not appear in the map keep index 0 from this zero-fill.
364 memset(SNEListIndex, 0, NNodes *
sizeof(
index_t));
// The map key is used directly as the slot in SNEListIndex, so keys must be
// valid node ids below NNodes.
// (The declaration of `offset` is not visible in this listing.)
367 for(map_it = SNEList.begin(); map_it != SNEList.end(); map_it++)
369 SNEListIndex[map_it->first] = offset;
370 for(set_it = map_it->second.begin(); set_it != map_it->second.end(); set_it++)
371 SNEListArray[offset++] = *set_it;
// Flattens the node-element adjacency list into two host arrays:
//   NEListArray - all element ids stored back to back;
//   NEListIndex - for each node, the position in NEListArray where its
//                 elements start, plus one trailing entry holding the end
//                 position.
// Both arrays are allocated with new[] here.
375 void NEListToArray(
const std::vector< std::set<index_t> > & NEList)
377 typename std::vector< std::set<index_t> >::const_iterator vec_it;
378 typename std::set<index_t>::const_iterator set_it;
// Sum the sizes of all per-node sets to get the flat array length.
// (The declarations/initialisation of `offset` and `index` are not visible in
// this listing; presumably both start at 0 -- confirm.)
382 for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++)
383 offset += vec_it->size();
385 NEListArray_size = offset;
// The index array is sized from NNodes, while the loops below iterate over
// NEList; this assumes NEList.size() == NNodes -- TODO confirm.
387 NEListIndex =
new index_t[NNodes+1];
388 NEListArray =
new index_t[NEListArray_size];
// Record where each node's elements begin, then append them.
// (Presumably `offset` is reset between the two loops on lines not shown.)
392 for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++)
394 NEListIndex[index++] = offset;
396 for(set_it = vec_it->begin(); set_it != vec_it->end(); set_it++)
397 NEListArray[offset++] = *set_it;
// Trailing entry: one past the last element of the last node.
400 assert(index == NEList.size());
401 NEListIndex[index] = offset;
// Allocates a device buffer large enough for array_size elements of `type`,
// stores its address in CUDA_array, and copies the host array into it.
//   array      - host source data.
//   CUDA_array - receives the newly allocated device pointer.
//   array_size - number of elements (not bytes).
404 template<
typename type>
405 inline void copyArrayToDevice(
const type * array, CUdeviceptr & CUDA_array,
index_t array_size)
// An allocation failure is reported on std::cout. (The statement that follows
// the message is not visible in this listing.)
407 if(cuMemAlloc(&CUDA_array, array_size *
sizeof(type)) != CUDA_SUCCESS)
409 std::cout <<
"Error allocating CUDA memory" << std::endl;
// The return code of the copy itself is not checked.
413 cuMemcpyHtoD(CUDA_array, array, array_size *
sizeof(type));
416 template<
typename type>
417 inline void copyArrayToDeviceNoAlloc(
const type * array, CUdeviceptr & CUDA_array,
index_t array_size)
419 cuMemcpyHtoD(CUDA_array, array, array_size *
sizeof(type));
422 template<
typename type>
423 inline void copyArrayFromDevice(type * array, CUdeviceptr & CUDA_array,
index_t array_size)
425 cuMemcpyDtoH(array, CUDA_array, array_size *
sizeof(type));
// Data members. Some declarations (for example the host-side NNListArray /
// NEListArray / colourArray pointers) fall on lines not visible in this listing.

// Module handles. Only smoothModule is loaded in the visible code.
433 CUmodule smoothModule;
434 CUmodule coarsenModule;
435 CUmodule refineModule;
// Kernel handles. Only smoothKernel is resolved and launched in the visible code.
437 CUfunction smoothKernel;
438 CUfunction coarsenKernel;
439 CUfunction refineKernel;
// Launch configuration computed before each kernel launch.
441 unsigned int threadsPerBlock, blocksPerGrid;
// Problem sizes used to dimension the host and device arrays.
443 index_t NNodes, NElements, NSElements, ndims, nloc;
// Device buffers for mesh and surface data.
445 CUdeviceptr CUDA_coords;
446 CUdeviceptr CUDA_metric;
447 CUdeviceptr CUDA_coplanar_ids;
448 CUdeviceptr CUDA_normals;
449 CUdeviceptr CUDA_ENList;
450 CUdeviceptr CUDA_SENList;
451 CUdeviceptr CUDA_quality;
// Per-node byte buffers (one unsigned char per node).
452 CUdeviceptr CUDA_smoothStatus;
453 CUdeviceptr CUDA_activeVertices;
// Device copies of the flattened node-node adjacency list.
457 CUdeviceptr CUDA_NNListArray;
458 CUdeviceptr CUDA_NNListIndex;
// Device copies of the flattened node-element adjacency list.
463 CUdeviceptr CUDA_NEListArray;
464 CUdeviceptr CUDA_NEListIndex;
// Device copy of the flattened colour sets.
469 CUdeviceptr CUDA_colourArray;
// Surface-node bitmap: host array, its length in 32-bit words, and device copy.
472 uint32_t * surfaceNodesArray;
473 index_t surfaceNodesArray_size;
474 CUdeviceptr CUDA_surfaceNodesArray;
// Device copies of the flattened surface node-element list.
478 CUdeviceptr CUDA_SNEListArray;
479 CUdeviceptr CUDA_SNEListIndex;
// Device address of the module symbol "orientation".
481 CUdeviceptr CUDA_orientation;
const real_t * get_coords(index_t nid) const
Return positions vector.
const double * get_metric(index_t nid) const
Return metric at that vertex.
size_t get_number_nodes() const
Return the number of nodes in the mesh.