PRAgMaTIc  master
CUDATools.h
Go to the documentation of this file.
1 /* Copyright (C) 2010 Imperial College London and others.
2  *
3  * Please see the AUTHORS file in the main source directory for a
4  * full list of copyright holders.
5  *
6  * Georgios Rokos
7  * Software Performance Optimisation Group
8  * Department of Computing
9  * Imperial College London
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  * notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above
17  * copyright notice, this list of conditions and the following
18  * disclaimer in the documentation and/or other materials provided
19  * with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
22  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
23  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
26  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
27  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
28  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
30  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
31  * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
32  * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #ifndef CUDATOOLS_H
37 #define CUDATOOLS_H
38 
39 #include <algorithm>
40 #include <cassert>
41 #include <cmath>
42 #include <deque>
43 #include <string>
44 #include <map>
45 #include <set>
46 #include <vector>
47 
48 #include <iostream>
49 #include <stdint.h>
50 
51 #include <cuda.h>
52 
53 #include "Mesh.h"
54 #include "Surface.h"
55 
56 template<typename real_t, typename index_t> class CUDATools
57 {
58 public:
59 
61  {
62  enabled = false;
63  }
64 
65  bool isEnabled()
66  {
67  return enabled;
68  }
69 
70  void initialize()
71  {
72  enabled = false;
73 
74  if(cuInit(0) != CUDA_SUCCESS)
75  {
76  std::cout << "Error initializing CUDA driver" << std::endl;;
77  return;
78  }
79 
80  int deviceCount = 0;
81  cuDeviceGetCount(&deviceCount);
82  if(deviceCount == 0)
83  {
84  std::cout << "No CUDA-enabled devices found" << std::endl;
85  return;
86  }
87 
88  if(cuDeviceGet(&cuDevice, 0) != CUDA_SUCCESS)
89  {
90  std::cout << "Cannot get CUDA device" << std::endl;
91  return;
92  }
93 
94  if(cuCtxCreate(&cuContext, 0, cuDevice) != CUDA_SUCCESS)
95  {
96  std::cout << "Error creating CUDA context" << std::endl;
97  return;
98  }
99 
100  if(cuModuleLoad(&smoothModule, "CUDA/Smooth.cubin") != CUDA_SUCCESS)
101  {
102  std::cout << "Error loading CUDA module \"Smooth\"" << std::endl;
103  return;
104  }
105  //cuModuleLoad(&coarsenModule, "CUDA/Coarsen.ptx");
106  //cuModuleLoad(&refineModule, "CUDA/Refine.ptx");
107 
108  enabled = true;
109  }
110 
111  void copyMeshDataToDevice(Mesh<real_t, index_t> * mesh, Surface<real_t, index_t> * surface,
112  std::map<int, std::deque<index_t> > & colour_sets, std::vector<real_t> & quality,
113  int orientation, size_t dimensions)
114  {
115  ndims = dimensions;
116  nloc = ndims+1;
117  NNodes = mesh->get_number_nodes();
118  NElements = mesh->_NElements;
119  NSElements = surface->get_number_facets();
120 
121  // convert pragmatic data-structures to C-style arrays
122  NNListToArray(mesh->NNList);
123  colourSetsToArray(colour_sets);
124  NEListToArray(mesh->NEList);
125  SNEListToArray(surface->SNEList);
126  surfaceNodesToArray(surface->surface_nodes);
127 
128  // and copy everything to the device
129  copyArrayToDevice<real_t>(mesh->get_coords(0), CUDA_coords, NNodes * ndims);
130  copyArrayToDevice<real_t>(mesh->get_metric(0), CUDA_metric, NNodes * ndims * ndims);
131  copyArrayToDevice<real_t>(surface->get_normal(0), CUDA_normals, NSElements * ndims);
132  copyArrayToDevice<real_t>(&quality[0], CUDA_quality, NElements);
133  copyArrayToDevice<index_t>(&mesh->_ENList[0], CUDA_ENList, NElements * nloc);
134  copyArrayToDevice<index_t>(surface->get_coplanar_ids(), CUDA_coplanar_ids, NSElements);
135  copyArrayToDevice<index_t>(&surface->SENList[0], CUDA_SENList, NSElements * ndims);
136  copyArrayToDevice<index_t>(NNListArray, CUDA_NNListArray, NNListArray_size);
137  copyArrayToDevice<index_t>(NNListIndex, CUDA_NNListIndex, NNodes+1);
138  copyArrayToDevice<index_t>(colourArray, CUDA_colourArray, NNodes);
139  copyArrayToDevice<index_t>(NEListArray, CUDA_NEListArray, NEListArray_size);
140  copyArrayToDevice<index_t>(NEListIndex, CUDA_NEListIndex, NNodes+1);
141  copyArrayToDevice<index_t>(SNEListArray, CUDA_SNEListArray, NSElements * ndims);
142  copyArrayToDevice<index_t>(SNEListIndex, CUDA_SNEListIndex, NNodes);
143  copyArrayToDevice<uint32_t>(surfaceNodesArray, CUDA_surfaceNodesArray, surfaceNodesArray_size);
144 
145  //set the constant symbols of the smoothing-kernel, i.e. the addresses of all arrays copied above
146  CUdeviceptr address;
147  size_t symbol_size;
148 
149 #define SET_CONSTANT(SYMBOL_NAME) \
150  cuModuleGetGlobal(&address, &symbol_size, smoothModule, #SYMBOL_NAME); \
151  cuMemcpyHtoD(address, &CUDA_ ## SYMBOL_NAME, symbol_size);
152 
153  SET_CONSTANT(coords)
154  SET_CONSTANT(metric)
155  SET_CONSTANT(normals)
156  SET_CONSTANT(quality)
157  SET_CONSTANT(ENList)
158  SET_CONSTANT(SENList)
159  SET_CONSTANT(NNListArray)
160  SET_CONSTANT(NNListIndex)
161  SET_CONSTANT(NEListArray)
162  SET_CONSTANT(NEListIndex)
163  SET_CONSTANT(SNEListArray)
164  SET_CONSTANT(SNEListIndex)
165  SET_CONSTANT(coplanar_ids)
166  SET_CONSTANT(surfaceNodesArray)
167  SET_CONSTANT(smoothStatus)
168 
169  // set element orientation in CUDA smoothing kernel
170  cuModuleGetGlobal(&CUDA_orientation, &symbol_size, smoothModule, "orientation");
171  cuMemcpyHtoD(CUDA_orientation, &orientation, symbol_size);
172  }
173 
175  {
176  copyArrayToDeviceNoAlloc<real_t>((real_t *) &mesh->_coords[0], CUDA_coords, NNodes * ndims);
177  }
178 
180  {
181  copyArrayToDeviceNoAlloc<real_t>((real_t *) &mesh->metric[0], CUDA_metric, NNodes * ndims * ndims);
182  }
183 
185  {
186  copyArrayFromDevice<real_t>((real_t *) &mesh->_coords[0], CUDA_coords, NNodes * ndims);
187  }
188 
190  {
191  copyArrayFromDevice<real_t>((real_t *) &mesh->metric[0], CUDA_metric, NNodes * ndims * ndims);
192  }
193 
195  {
196  if(cuMemAlloc(&CUDA_smoothStatus, NNodes * sizeof(unsigned char)) != CUDA_SUCCESS)
197  {
198  std::cout << "Error allocating CUDA memory" << std::endl;
199  exit(1);
200  }
201 
202  // set the constant symbol in CUDA smoothing kernel
203  CUdeviceptr address;
204  size_t symbol_size;
205  cuModuleGetGlobal(&address, &symbol_size, smoothModule, "smoothStatus");
206  cuMemcpyHtoD(address, &CUDA_smoothStatus, symbol_size);
207  }
208 
210  {
211  if(cuMemAlloc(&CUDA_activeVertices, NNodes * sizeof(unsigned char)) != CUDA_SUCCESS)
212  {
213  std::cout << "Error allocating CUDA memory" << std::endl;
214  exit(1);
215  }
216 
217  // set the constant symbol in CUDA smoothing kernel
218  CUdeviceptr address;
219  size_t symbol_size;
220  cuModuleGetGlobal(&address, &symbol_size, smoothModule, "activeVertices");
221  cuMemcpyHtoD(address, &CUDA_activeVertices, symbol_size);
222  }
223 
224  void retrieveSmoothStatus(std::vector<unsigned char> & status)
225  {
226  copyArrayFromDevice<unsigned char>( (unsigned char *) &status[0], CUDA_smoothStatus, NNodes);
227  }
228 
230  {
231  cuMemFree(CUDA_coords);
232  cuMemFree(CUDA_metric);
233  cuMemFree(CUDA_normals);
234  cuMemFree(CUDA_quality);
235  cuMemFree(CUDA_ENList);
236  cuMemFree(CUDA_coplanar_ids);
237  cuMemFree(CUDA_SENList);
238  cuMemFree(CUDA_NNListArray);
239  cuMemFree(CUDA_NNListIndex);
240  cuMemFree(CUDA_colourArray);
241  cuMemFree(CUDA_SNEListArray);
242  cuMemFree(CUDA_SNEListIndex);
243  cuMemFree(CUDA_NEListArray);
244  cuMemFree(CUDA_NEListIndex);
245  cuMemFree(CUDA_surfaceNodesArray);
246  cuMemFree(CUDA_smoothStatus);
247 
248  delete[] NNListArray;
249  delete[] NNListIndex;
250  delete[] colourArray;
251  delete[] colourIndex;
252  delete[] surfaceNodesArray;
253  delete[] SNEListArray;
254  delete[] SNEListIndex;
255  delete[] NEListArray;
256  delete[] NEListIndex;
257 
258  cuCtxDestroy(cuContext);
259  }
260 
261  void setSmoothingKernel(std::string method, std::vector<unsigned char> & status)
262  {
263  if(cuModuleGetFunction(&smoothKernel, smoothModule, method.c_str()) != CUDA_SUCCESS)
264  {
265  std::cout << "Error loading CUDA kernel " << method << std::endl;
266  enabled = false;
267  }
268  }
269 
270  void launchSmoothingKernel(int colour)
271  {
272  CUdeviceptr CUDA_ColourSetAddr = CUDA_colourArray + colourIndex[--colour] * sizeof(index_t);
273  index_t NNodesInSet = colourIndex[colour+1] - colourIndex[colour];
274  threadsPerBlock = 32;
275  blocksPerGrid = (NNodesInSet + threadsPerBlock - 1) / threadsPerBlock;
276 
277  void * args[] = {&CUDA_ColourSetAddr, &NNodesInSet};
278 
279  CUresult result = cuLaunchKernel(smoothKernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, 0, args, NULL);
280  if(result != CUDA_SUCCESS)
281  {
282  std::cout << "Error launching CUDA kernel for colour " << colour << std::endl;
283  return;
284  }
285 
286  result = cuCtxSynchronize();
287  if(result != CUDA_SUCCESS)
288  std::cout << "Sync result " << result << std::endl;
289  }
290 
291 private:
292  void NNListToArray(const std::vector< std::deque<index_t> > & NNList)
293  {
294  typename std::vector< std::deque<index_t> >::const_iterator vec_it;
295  typename std::deque<index_t>::const_iterator deque_it;
296  index_t offset = 0;
297  index_t index = 0;
298 
299  for(vec_it = NNList.begin(); vec_it != NNList.end(); vec_it++)
300  offset += vec_it->size();
301 
302  NNListArray_size = offset;
303 
304  NNListIndex = new index_t[NNodes+1];
305  NNListArray = new index_t[NNListArray_size];
306 
307  offset = 0;
308 
309  for(vec_it = NNList.begin(); vec_it != NNList.end(); vec_it++)
310  {
311  NNListIndex[index++] = offset;
312 
313  for(deque_it = vec_it->begin(); deque_it != vec_it->end(); deque_it++)
314  NNListArray[offset++] = *deque_it;
315  }
316 
317  assert(index == NNList.size());
318  NNListIndex[index] = offset;
319  }
320 
321  void colourSetsToArray(const std::map< int, std::deque<index_t> > & colour_sets)
322  {
323  typename std::map< int, std::deque<index_t> >::const_iterator map_it;
324  typename std::deque<index_t>::const_iterator deque_it;
325 
326  NColours = colour_sets.size();
327 
328  colourIndex = new index_t[NColours+1];
329  colourArray = new index_t[NNodes];
330 
331  index_t offset = 0;
332 
333  for(map_it = colour_sets.begin(); map_it != colour_sets.end(); map_it++)
334  {
335  colourIndex[map_it->first - 1] = offset;
336 
337  for(deque_it = map_it->second.begin(); deque_it != map_it->second.end(); deque_it++)
338  colourArray[offset++] = *deque_it;
339  }
340 
341  colourIndex[colour_sets.size()] = offset;
342  }
343 
344  void surfaceNodesToArray(const std::vector<bool> & surface_nodes)
345  {
346  const size_t nbits = sizeof(uint32_t) * 8;
347 
348  surfaceNodesArray_size = NNodes / nbits + (NNodes % nbits ? 1 : 0);
349  surfaceNodesArray = new uint32_t[surfaceNodesArray_size];
350  memset(surfaceNodesArray, 0, surfaceNodesArray_size * sizeof(uint32_t));
351 
352  for(index_t i = 0; i < (int) surface_nodes.size(); i++)
353  if(surface_nodes[i] == true)
354  surfaceNodesArray[i / nbits] |= 1 << i % nbits;
355  }
356 
357  void SNEListToArray(const std::map<int, std::set<index_t> > & SNEList)
358  {
359  typename std::map< int, std::set<index_t> >::const_iterator map_it;
360  typename std::set<index_t>::const_iterator set_it;
361 
362  SNEListArray = new index_t[NSElements*ndims];
363  SNEListIndex = new index_t[NNodes];
364  memset(SNEListIndex, 0, NNodes * sizeof(index_t));
365 
366  index_t offset = 0;
367  for(map_it = SNEList.begin(); map_it != SNEList.end(); map_it++)
368  {
369  SNEListIndex[map_it->first] = offset;
370  for(set_it = map_it->second.begin(); set_it != map_it->second.end(); set_it++)
371  SNEListArray[offset++] = *set_it;
372  }
373  }
374 
375  void NEListToArray(const std::vector< std::set<index_t> > & NEList)
376  {
377  typename std::vector< std::set<index_t> >::const_iterator vec_it;
378  typename std::set<index_t>::const_iterator set_it;
379  index_t offset = 0;
380  index_t index = 0;
381 
382  for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++)
383  offset += vec_it->size();
384 
385  NEListArray_size = offset;
386 
387  NEListIndex = new index_t[NNodes+1];
388  NEListArray = new index_t[NEListArray_size];
389 
390  offset = 0;
391 
392  for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++)
393  {
394  NEListIndex[index++] = offset;
395 
396  for(set_it = vec_it->begin(); set_it != vec_it->end(); set_it++)
397  NEListArray[offset++] = *set_it;
398  }
399 
400  assert(index == NEList.size());
401  NEListIndex[index] = offset;
402  }
403 
404  template<typename type>
405  inline void copyArrayToDevice(const type * array, CUdeviceptr & CUDA_array, index_t array_size)
406  {
407  if(cuMemAlloc(&CUDA_array, array_size * sizeof(type)) != CUDA_SUCCESS)
408  {
409  std::cout << "Error allocating CUDA memory" << std::endl;
410  exit(1);
411  }
412 
413  cuMemcpyHtoD(CUDA_array, array, array_size * sizeof(type));
414  }
415 
416  template<typename type>
417  inline void copyArrayToDeviceNoAlloc(const type * array, CUdeviceptr & CUDA_array, index_t array_size)
418  {
419  cuMemcpyHtoD(CUDA_array, array, array_size * sizeof(type));
420  }
421 
422  template<typename type>
423  inline void copyArrayFromDevice(type * array, CUdeviceptr & CUDA_array, index_t array_size)
424  {
425  cuMemcpyDtoH(array, CUDA_array, array_size * sizeof(type));
426  }
427 
428  bool enabled;
429 
430  CUdevice cuDevice;
431  CUcontext cuContext;
432 
433  CUmodule smoothModule;
434  CUmodule coarsenModule;
435  CUmodule refineModule;
436 
437  CUfunction smoothKernel;
438  CUfunction coarsenKernel;
439  CUfunction refineKernel;
440 
441  unsigned int threadsPerBlock, blocksPerGrid;
442 
443  index_t NNodes, NElements, NSElements, ndims, nloc;
444 
445  CUdeviceptr CUDA_coords;
446  CUdeviceptr CUDA_metric;
447  CUdeviceptr CUDA_coplanar_ids;
448  CUdeviceptr CUDA_normals;
449  CUdeviceptr CUDA_ENList;
450  CUdeviceptr CUDA_SENList;
451  CUdeviceptr CUDA_quality;
452  CUdeviceptr CUDA_smoothStatus;
453  CUdeviceptr CUDA_activeVertices;
454 
455  index_t * NNListArray;
456  index_t * NNListIndex;
457  CUdeviceptr CUDA_NNListArray;
458  CUdeviceptr CUDA_NNListIndex;
459  index_t NNListArray_size;
460 
461  index_t * NEListArray;
462  index_t * NEListIndex;
463  CUdeviceptr CUDA_NEListArray;
464  CUdeviceptr CUDA_NEListIndex;
465  index_t NEListArray_size;
466 
467  index_t * colourArray;
468  index_t* colourIndex;
469  CUdeviceptr CUDA_colourArray;
470  index_t NColours;
471 
472  uint32_t * surfaceNodesArray;
473  index_t surfaceNodesArray_size;
474  CUdeviceptr CUDA_surfaceNodesArray;
475 
476  index_t * SNEListArray;
477  index_t * SNEListIndex;
478  CUdeviceptr CUDA_SNEListArray;
479  CUdeviceptr CUDA_SNEListIndex;
480 
481  CUdeviceptr CUDA_orientation;
482 };
483 
484 #endif
void freeResources()
Definition: CUDATools.h:229
void launchSmoothingKernel(int colour)
Definition: CUDATools.h:270
void retrieveSmoothStatus(std::vector< unsigned char > &status)
Definition: CUDATools.h:224
void copyCoordinatesToDevice(Mesh< real_t, index_t > *mesh)
Definition: CUDATools.h:174
void copyCoordinatesFromDevice(Mesh< real_t, index_t > *mesh)
Definition: CUDATools.h:184
void copyMeshDataToDevice(Mesh< real_t, index_t > *mesh, Surface< real_t, index_t > *surface, std::map< int, std::deque< index_t > > &colour_sets, std::vector< real_t > &quality, int orientation, size_t dimensions)
Definition: CUDATools.h:111
const real_t * get_coords(index_t nid) const
Return positions vector.
Definition: Mesh.h:384
int index_t
void copyMetricFromDevice(Mesh< real_t, index_t > *mesh)
Definition: CUDATools.h:189
Manages mesh data.
Definition: Mesh.h:70
#define SET_CONSTANT(SYMBOL_NAME)
void copyMetricToDevice(Mesh< real_t, index_t > *mesh)
Definition: CUDATools.h:179
void reserveSmoothStatusMemory()
Definition: CUDATools.h:194
void setSmoothingKernel(std::string method, std::vector< unsigned char > &status)
Definition: CUDATools.h:261
bool isEnabled()
Definition: CUDATools.h:65
const double * get_metric(index_t nid) const
Return metric at that vertex.
Definition: Mesh.h:396
void reserveActiveVerticesMemory()
Definition: CUDATools.h:209
size_t get_number_nodes() const
Return the number of nodes in the mesh.
Definition: Mesh.h:369
void initialize()
Definition: CUDATools.h:70
CUDATools()
Definition: CUDATools.h:60