AmpTools
GPUManager.cc
1 //******************************************************************************
2 // This file is part of AmpTools, a package for performing Amplitude Analysis
3 //
4 // Copyright Trustees of Indiana University 2010, all rights reserved
5 //
6 // This software written by Matthew Shepherd, Ryan Mitchell, and
7 // Hrayr Matevosyan at Indiana University, Bloomington
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions
11 // are met:
12 // 1. Redistributions of source code must retain the above copyright
13 // notice and author attribution, this list of conditions and the
14 // following disclaimer.
15 // 2. Redistributions in binary form must reproduce the above copyright
16 // notice and author attribution, this list of conditions and the
17 // following disclaimer in the documentation and/or other materials
18 // provided with the distribution.
19 // 3. Neither the name of the University nor the names of its contributors
20 // may be used to endorse or promote products derived from this software
21 // without specific prior written permission.
22 //
23 // Creation of derivative forms of this software for commercial
24 // utilization may be subject to restriction; written permission may be
25 // obtained from the Trustees of Indiana University.
26 //
27 // INDIANA UNIVERSITY AND THE AUTHORS MAKE NO REPRESENTATIONS OR WARRANTIES,
28 // EXPRESS OR IMPLIED. By way of example, but not limitation, INDIANA
29 // UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR
30 // FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THIS SOFTWARE OR
31 // DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS,
32 // OR OTHER RIGHTS. Neither Indiana University nor the authors shall be
33 // held liable for any liability with respect to any claim by the user or
34 // any other party arising from use of the program.
35 //******************************************************************************
36 
37 #ifdef USE_MPI
38 #include <mpi.h>
39 #endif
40 
41 #include <iostream>
42 #include <math.h>
43 #include <string.h>
44 #include <sys/time.h>
45 #include "cuda_runtime.h"
46 
48 #include "IUAmpTools/AmpVecs.h"
49 
50 #include "GPUManager/GPUKernel.h"
51 #include "GPUManager/GPUManager.h"
52 
53 #ifdef VTRACE
54 #include "vt_user.h"
55 #endif
56 
57 bool GPUManager::m_cudaDisplay = false;
58 
59 template <class T>
60 void reduce(int size, int threads, int blocks, T *d_idata, T *d_odata);
61 
62 GPUManager::GPUManager()
63 {
64  m_iNEvents=0;
65  m_iNTrueEvents=0;
66 
67  m_iNAmps=0;
68 
69  m_iEventArrSize=0;
70  m_iAmpArrSize=0;
71  m_iVArrSize=0;
72 
73  //Host Arrays
74  m_pfVVStar=0;
75  m_pfRes=0;
76 
77  //Device Arrays
78  m_pfDevData=0;
79  m_pcDevCalcAmp=0;
80  m_piDevPerm=0;
81  m_pfDevAmps=0;
82  m_pfDevWeights=0;
83  m_pfDevVVStar=0;
84  m_pfDevResRe=0;
85  m_pfDevResIm=0;
86  m_pfDevREDUCE=0;
87 
88  //CUDA Thread and Grid sizes
89  m_iDimGridX=0;
90  m_iDimGridY=0;
91  m_iDimThreadX=0;
92  m_iDimThreadY=0;
93 
94  int thisDevice = 0;
95 
96  if( !m_cudaDisplay )
97  cout<<"\n################### CUDA DEVICE ##################\n";
98 
99 #ifdef USE_MPI
100 
101  // Note that a better algorithm would be to use the job's "local rank" on
102  // the machine instead of its global rank -- this needs development (an
103  // illustrative sketch follows this constructor). The obvious problem with
104  // the technique below is that, e.g., on a cluster with two GPUs per node,
105  // all even-rank jobs will use device zero. If two even-rank jobs land on
106  // the same node, device zero will be overloaded and device one unused.
107 
108  int rank, devs;
109  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
110  cudaGetDeviceCount(&devs);
111  thisDevice = rank % devs;
112 
113  if( !m_cudaDisplay ) {
114  cout << "\nParallel GPU configuration requested." << endl;
115  cout << "\nNumber of CUDA devices available on this node: " << devs << endl;
116  cout << "\nMPI process " << rank << " is using device " << thisDevice << endl;
117  }
118 #endif
119 
121  gpuErrChk( cudaSetDevice( thisDevice ) );
122  cudaDeviceProp devProp;
123  gpuErrChk( cudaGetDeviceProperties( &devProp, thisDevice ) );
124 
125  m_devProp_major = devProp.major;
126 
127  if( ! m_cudaDisplay ){
128 
129  cout<<"Current GPU Properties:\n";
130  cout<<"\t Name: "<<devProp.name<<endl;
131  cout<<"\t Total global memory: "<<devProp.totalGlobalMem/((float)1024*1024)<<" MB"<<endl;
132  cout<<"\t Rev.: "<<devProp.major<<"."<<devProp.minor<<endl;
133  cout<<"\t Precision (size of GDouble): " << sizeof(GDouble) << " bytes" << endl;
134  cout<<"##################################################\n\n";
136  m_cudaDisplay = true;
137  }
138 
139  if( m_devProp_major == 1 && devProp.minor < 3 ){
140 
141  // double precision operations require compute capability 1.3 or higher
142  assert( sizeof( GDouble ) <= 4 );
143  }
144 }
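A minimal sketch (not from this file) of the node-local-rank device assignment suggested in the comment above, using MPI-3's MPI_Comm_split_type; an MPI-3 implementation is assumed, and the names nodeComm and localRank are hypothetical:

    int rank, devs, localRank;
    MPI_Comm nodeComm;
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    // group the ranks that share this node, then take the rank within that group
    MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank,
                         MPI_INFO_NULL, &nodeComm );
    MPI_Comm_rank( nodeComm, &localRank );
    cudaGetDeviceCount( &devs );
    thisDevice = localRank % devs;   // spread a node's processes over its GPUs
    MPI_Comm_free( &nodeComm );

This avoids the pile-up described above because processes on the same node receive consecutive local ranks regardless of their global ranks.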
145 
146 GPUManager::GPUManager( const AmpVecs& a ) :
147  GPUManager()  // delegate to the default constructor (C++11) so the members are initialized
148 {
149  init( a );
150 }
151 
152 GPUManager::~GPUManager()
153 {
154  clearAll();
155 }
156 
157 
158 // Initialization routines:
159 
160 void
161 GPUManager::init( const AmpVecs& a )
162 {
163  clearAll();
164 
165  // copy over some info from the AmpVecs object for array dimensions
166  m_iNTrueEvents = a.m_iNTrueEvents;
167  m_iNEvents = a.m_iNEvents;
168  m_iNParticles = a.m_iNParticles;
169  m_iNAmps = a.m_iNTerms;
170 
171  // the rest of the data are derived:
172  m_iEventArrSize = sizeof(GDouble) * m_iNEvents;
173  m_iTrueEventArrSize = sizeof(GDouble) * m_iNTrueEvents;
174 
175  // size needed to store amplitudes for each event
176  m_iAmpArrSize = 2 * sizeof(GDouble) * m_iNEvents * m_iNAmps;
177 
178  // size of upper half of ViVj* matrix
179  m_iVArrSize = 2 * sizeof(GDouble) * m_iNAmps * ( m_iNAmps + 1 ) / 2;
180 
181  // host memory needed for intensity or integral calculation
182  cudaMallocHost( (void**)&m_pfVVStar , m_iVArrSize );
183  cudaMallocHost( (void**)&m_pfRes , m_iEventArrSize );
184 
185  double totalMemory = 0;
186 
187  totalMemory += m_iVArrSize;
188  totalMemory += 4 * m_iEventArrSize;
189  totalMemory += 4 * m_iNParticles * m_iEventArrSize;
190  totalMemory += m_iNParticles * sizeof( int );
191  totalMemory += a.m_maxFactPerEvent * m_iEventArrSize;
192  totalMemory += m_iAmpArrSize;
193 
194  totalMemory /= (1024*1024);
195 
196  cout << "Attempting to allocate " << totalMemory << " MB of global GPU memory." << endl;
197 
198  // device memory needed for intensity or integral calculation and sum
199  gpuErrChk( cudaMalloc( (void**)&m_pfDevVVStar , m_iVArrSize ) ) ;
200  gpuErrChk( cudaMalloc( (void**)&m_pfDevWeights , m_iEventArrSize ) ) ;
201  gpuErrChk( cudaMalloc( (void**)&m_pfDevResRe , m_iEventArrSize ) ) ;
202  gpuErrChk( cudaMalloc( (void**)&m_pfDevResIm , m_iEventArrSize ) ) ;
203  gpuErrChk( cudaMalloc( (void**)&m_pfDevREDUCE , m_iEventArrSize ) ) ;
204 
205  // allocate device memory needed for amplitude calculations
206  gpuErrChk( cudaMalloc( (void**)&m_pfDevData , 4 * m_iNParticles * m_iEventArrSize ) ) ;
207  gpuErrChk( cudaMalloc( (void**)&m_piDevPerm , m_iNParticles * sizeof( int ) ) ) ;
208  gpuErrChk( cudaMalloc( (void**)&m_pcDevCalcAmp , a.m_maxFactPerEvent * m_iEventArrSize ) ) ;
209  gpuErrChk( cudaMalloc( (void**)&m_pfDevAmps , m_iAmpArrSize ) ) ;
210 
211  cout << "GPU memory allocated for " << m_iNAmps << " amplitudes and "
212  << m_iNEvents << " events (" << m_iNTrueEvents << " actual events)."
213  << endl;
214 
215  //CUDA Dims
216  calcCUDADims();
217 }
218 
219 
220 void
221 GPUManager::copyDataToGPU( const AmpVecs& a )
222 {
223 
224 #ifdef VTRACE
225  VT_TRACER( "GPUManager::copyDataToGPU" );
226 #endif
227 
228  // make sure AmpVecs has been loaded with data
229  assert( a.m_pdData );
230 
231  // copy the data into the device
232  gpuErrChk( cudaMemcpy( m_pfDevData, a.m_pdData,
233  4 * m_iNParticles * m_iEventArrSize,
234  cudaMemcpyHostToDevice ) );
235 
236  // copy the weights to the GPU
237  gpuErrChk( cudaMemcpy( m_pfDevWeights, a.m_pdWeights,
238  m_iEventArrSize, cudaMemcpyHostToDevice ) );
239 }
240 
241 void
242 GPUManager::copyAmpsFromGPU( AmpVecs& a )
243 {
244 
245 #ifdef VTRACE
246  VT_TRACER( "GPUManager::copyAmpsToGPU" );
247 #endif
248 
249  // these arrays are not allocated by default in GPU-enabled code
250  // (to save CPU memory) -- the user must allocate them explicitly
251  assert( a.m_pdAmpFactors != NULL && a.m_pdAmps != NULL );
252 
253  gpuErrChk( cudaMemcpy( a.m_pdAmps, m_pfDevAmps,
254  m_iAmpArrSize,
255  cudaMemcpyDeviceToHost ) );
256 
257  gpuErrChk( cudaMemcpy( a.m_pdAmpFactors, m_pcDevCalcAmp,
258  a.m_maxFactPerEvent * m_iEventArrSize,
259  cudaMemcpyDeviceToHost ) );
260 }
261 
262 void
263 GPUManager::calcAmplitudeAll( const Amplitude* amp, unsigned long long offset,
264  const vector< vector< int > >* pvPermutations )
265 {
266 
267 #ifdef VTRACE
268  VT_TRACER( "GPUManager::calcAmplitudeAll" );
269 #endif
270 
271  dim3 dimBlock( m_iDimThreadX, m_iDimThreadY );
272  dim3 dimGrid( m_iDimGridX, m_iDimGridY );
273 
274  // do the computation for all events for each permutation in the
275  // vector of permutations
276  vector< vector< int > >::const_iterator permItr = pvPermutations->begin();
277 
278  // if this is not true, the AmplitudeManager hasn't been set up properly
279  assert( permItr->size() == m_iNParticles );
280 
281  unsigned long long permOffset = 0;
282  for( ; permItr != pvPermutations->end(); ++permItr ){
283 
284  // copy the permutation to global memory
285  gpuErrChk( cudaMemcpy( m_piDevPerm, &((*permItr)[0]),
286  m_iNParticles * sizeof( int ),
287  cudaMemcpyHostToDevice ) );
288 
289  // calculate the amplitude factor for all events --
290  // casting the amp array to WCUComplex allows an 8- or 16-byte write
291  // of both the real and imaginary parts at once
292 
293  amp->calcAmplitudeGPU( dimGrid, dimBlock, m_pfDevData,
294  (WCUComplex*)&m_pcDevCalcAmp[offset+permOffset],
295  m_piDevPerm, m_iNParticles, m_iNEvents,
296  *permItr );
297 
298  // check to be sure kernel execution was OK
299  cudaError_t cerrKernel=cudaGetLastError();
300  if( cerrKernel!= cudaSuccess ){
301 
302  cout << "\nKERNEL LAUNCH ERROR [" << amp->name() << "]: "
303  << cudaGetErrorString( cerrKernel ) << endl;
304  assert( false );
305  }
306 
307  // increment the offset so that the result for the next permutation
308  // is placed after the previous one in m_pcDevCalcAmp
309  permOffset += 2 * m_iNEvents;
310  }
311 }
312 
313 void
314 GPUManager::assembleTerms( int iAmpInd, int nFact, int nPerm ){
315 
316 #ifdef VTRACE
317  VT_TRACER( "GPUManager::assembleTerms" );
318 #endif
319 
320  dim3 dimBlock( m_iDimThreadX, m_iDimThreadY );
321  dim3 dimGrid( m_iDimGridX, m_iDimGridY );
322 
323  gpuErrChk( cudaMemset( &(m_pfDevAmps[2*m_iNEvents*iAmpInd]), 0,
324  2*sizeof(GDouble)*m_iNEvents ) );
325 
326  GPU_ExecFactPermKernel( dimGrid, dimBlock, &(m_pfDevAmps[2*m_iNEvents*iAmpInd]),
327  m_pcDevCalcAmp, nFact, nPerm, m_iNEvents );
328 }
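For orientation, what GPU_ExecFactPermKernel presumably produces for term iAmpInd is the per-event sum over permutations of the product of that term's factors. A rough CPU-side sketch under that assumption is shown below, where calcAmp stands for a host copy of m_pcDevCalcAmp (already offset to this term's factors, each factor occupying nPerm contiguous blocks of 2*nEvents values as written by calcAmplitudeAll) and amps for the host-side slice of m_pfDevAmps belonging to this term; both names are hypothetical:

    // term(event) = sum over permutations of the product of the factors
    for( unsigned long long evt = 0; evt < nEvents; ++evt ){
      complex< double > term( 0, 0 );
      for( int p = 0; p < nPerm; ++p ){
        complex< double > prod( 1, 0 );
        for( int f = 0; f < nFact; ++f ){
          unsigned long long idx = 2ULL * nEvents * ( nPerm * f + p ) + 2 * evt;
          prod *= complex< double >( calcAmp[idx], calcAmp[idx+1] );
        }
        term += prod;
      }
      amps[2*evt]   = term.real();
      amps[2*evt+1] = term.imag();
    }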
329 
330 double
331 GPUManager::calcSumLogIntensity( const vector< complex< double > >& prodCoef,
332  const vector< vector< bool > >& cohMtx )
333 {
334 
335 #ifdef VTRACE
336  VT_TRACER( "GPUManager::calcSumLogIntensity" );
337 #endif
338 
339  unsigned int i,j;
340 
341  // precompute the real and imaginary parts of ViVj* and copy to
342  // GPU global memory
343  complex< double > cdFij;
344  for( i = 0; i< m_iNAmps; i++) {
345  for( j = 0; j <= i; j++ ) {
346 
347  cdFij = prodCoef[i] * conj( prodCoef[j] );
348 
349  // here is the transition from double -> GDouble
350  m_pfVVStar[2*(i*(i+1)/2+j)] =
351  ( cohMtx[i][j] ? static_cast< GDouble >( cdFij.real() ) : 0 );
352  m_pfVVStar[2*(i*(i+1)/2+j)+1] =
353  ( cohMtx[i][j] ? static_cast< GDouble >( cdFij.imag() ) : 0 );
354  }
355  }
356 
357  // copy the production factors to GPU memory
358  gpuErrChk( cudaMemcpy( m_pfDevVVStar, m_pfVVStar,
359  m_iVArrSize, cudaMemcpyHostToDevice ) );
360 
361  // compute the logs of the intensities
362  dim3 dimBlock( m_iDimThreadX, m_iDimThreadY );
363  dim3 dimGrid( m_iDimGridX, m_iDimGridY );
364  GPU_ExecAmpKernel( dimGrid, dimBlock, m_pfDevAmps, m_pfDevVVStar, m_pfDevWeights,
365  m_iNAmps, m_iNEvents, m_pfDevResRe );
366 
367  // Now the summation of the results -- do this on the CPU for small
368  // numbers of events or cases where double precision GPU is not enabled
369 
370  double dGPUResult = 0;
371 
372  if( m_iNTrueEvents <= m_iNBlocks || sizeof( GDouble ) <= 4 )
373  {
374  gpuErrChk( cudaMemcpy( m_pfRes,m_pfDevResRe,
375  m_iTrueEventArrSize,cudaMemcpyDeviceToHost ) );
376  for(i=0; i<m_iNTrueEvents; i++)
377  dGPUResult += m_pfRes[i];
378  }
379  else
380  {
381 
382  // zero out the padding so as not to alter the results of the reduction
383  gpuErrChk( cudaMemset( m_pfDevResRe+m_iNTrueEvents,0,
384  sizeof(GDouble)*(m_iNEvents-m_iNTrueEvents)) );
385  // run the reduction kernel on the GPU; the per-block partial sums are added on the CPU below
386  reduce<GDouble>(m_iNEvents, m_iNThreads, m_iNBlocks, m_pfDevResRe, m_pfDevREDUCE);
387 
388  // copy result from device to host
389  gpuErrChk( cudaMemcpy( m_pfRes, m_pfDevREDUCE, m_iNBlocks*sizeof(GDouble),
390  cudaMemcpyDeviceToHost) );
391  for(i=0; i<m_iNBlocks; i++)
392  dGPUResult += m_pfRes[i];
393  }
394  return dGPUResult;
395 }
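For reference, the quantity returned here is (up to floating-point ordering) the weighted sum of the logs of the per-event intensities built from the stored upper triangle of ViVj* and the per-event amplitudes. A rough CPU-side sketch is shown below; amps, weights, and vvStar stand for hypothetical host copies of m_pfDevAmps, m_pfDevWeights, and m_pfVVStar, and the doubling of off-diagonal contributions is presumed to be applied by GPU_ExecAmpKernel, since only the upper half of the matrix is stored:

    double sum = 0;
    for( unsigned long long evt = 0; evt < nTrueEvents; ++evt ){
      double intensity = 0;
      for( unsigned int i = 0; i < nAmps; ++i ){
        complex< double > ai( amps[2*(i*nEvents+evt)], amps[2*(i*nEvents+evt)+1] );
        for( unsigned int j = 0; j <= i; ++j ){
          complex< double > aj( amps[2*(j*nEvents+evt)], amps[2*(j*nEvents+evt)+1] );
          complex< double > vv( vvStar[2*(i*(i+1)/2+j)], vvStar[2*(i*(i+1)/2+j)+1] );
          double re = real( vv * ai * conj( aj ) );
          intensity += ( i == j ? re : 2 * re );  // each off-diagonal pair counted twice
        }
      }
      sum += weights[evt] * log( intensity );
    }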
396 
397 void
398 GPUManager::calcIntegral( GDouble* result, int iAmp, int jAmp, int iNGenEvents ){
399 
400 #ifdef VTRACE
401  VT_TRACER( "GPUManager::calcIntegral" );
402 #endif
403 
404  dim3 dimBlock( m_iDimThreadX, m_iDimThreadY );
405  dim3 dimGrid( m_iDimGridX, m_iDimGridY );
406 
407  GPU_ExecIntElementKernel( dimGrid, dimBlock, iAmp, jAmp, m_pfDevAmps,
408  m_pfDevWeights, m_pfDevResRe, m_pfDevResIm,
409  m_iNEvents );
410 
411  // add up the real parts
412  result[0] = 0;
413 
414  if( m_iNTrueEvents <= m_iNBlocks || sizeof( GDouble ) <= 4 )
415  {
416  gpuErrChk( cudaMemcpy( m_pfRes, m_pfDevResRe,
417  m_iTrueEventArrSize, cudaMemcpyDeviceToHost ) );
418 
419  for(int i=0; i<m_iNTrueEvents; i++){
420 
421  result[0] += m_pfRes[i];
422  }
423 
424  result[0] /= static_cast< GDouble >( iNGenEvents );
425  }
426  else
427  {
428  gpuErrChk( cudaMemset( m_pfDevResRe + m_iNTrueEvents, 0,
429  sizeof(GDouble)*( m_iNEvents - m_iNTrueEvents ) ) );
430 
431  // run the reduction kernel on the GPU to produce one partial sum per block
432  reduce<GDouble>(m_iNEvents, m_iNThreads, m_iNBlocks, m_pfDevResRe, m_pfDevREDUCE);
433 
434  // copy result from device to host
435  gpuErrChk( cudaMemcpy( m_pfRes, m_pfDevREDUCE, m_iNBlocks*sizeof(GDouble),
436  cudaMemcpyDeviceToHost) );
437 
438  for(int i = 0; i < m_iNBlocks; i++){
439 
440  result[0] += m_pfRes[i];
441  }
442 
443  result[0] /= static_cast< GDouble >( iNGenEvents );
444  }
445 
446  // repeat for imaginary parts
447  result[1] = 0;
448 
449  if( m_iNTrueEvents <= m_iNBlocks || sizeof( GDouble ) <= 4 ) {
450 
451  gpuErrChk( cudaMemcpy( m_pfRes, m_pfDevResIm,
452  m_iTrueEventArrSize, cudaMemcpyDeviceToHost ) );
453 
454  for(int i=0; i<m_iNTrueEvents; i++){
455 
456  result[1] += m_pfRes[i];
457  }
458 
459  result[1] /= static_cast< GDouble >( iNGenEvents );
460  }
461  else {
462 
463  gpuErrChk( cudaMemset( m_pfDevResIm + m_iNTrueEvents, 0,
464  sizeof(GDouble)*( m_iNEvents - m_iNTrueEvents ) ) );
465 
466  // run the reduction kernel on the GPU to produce one partial sum per block
467  reduce<GDouble>(m_iNEvents, m_iNThreads, m_iNBlocks, m_pfDevResIm, m_pfDevREDUCE);
468 
469  // copy result from device to host
470  gpuErrChk( cudaMemcpy( m_pfRes, m_pfDevREDUCE, m_iNBlocks*sizeof(GDouble),
471  cudaMemcpyDeviceToHost) );
472  for(int i = 0; i < m_iNBlocks; i++){
473 
474  result[1] += m_pfRes[i];
475  }
476 
477  result[1] /= static_cast< GDouble >( iNGenEvents );
478  }
479 }
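The element computed here is essentially the normalization-integral entry N(iAmp,jAmp) = (1/iNGenEvents) * sum over accepted events of weight * A_i * conj(A_j), with the real part returned in result[0] and the imaginary part in result[1]. A rough CPU-side sketch, with amps and weights standing for hypothetical host copies of m_pfDevAmps and m_pfDevWeights:

    complex< double > nij( 0, 0 );
    for( unsigned long long evt = 0; evt < nTrueEvents; ++evt ){
      complex< double > ai( amps[2*(iAmp*nEvents+evt)], amps[2*(iAmp*nEvents+evt)+1] );
      complex< double > aj( amps[2*(jAmp*nEvents+evt)], amps[2*(jAmp*nEvents+evt)+1] );
      nij += weights[evt] * ai * conj( aj );   // accumulate weighted A_i A_j*
    }
    result[0] = nij.real() / iNGenEvents;
    result[1] = nij.imag() / iNGenEvents;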
480 
481 
482 // Methods to clear memory:
483 
484 void GPUManager::clearAll()
485 {
486 
487  m_iNParticles=0;
488  m_iNEvents=0;
489  m_iNTrueEvents=0;
490  m_iEventArrSize=0;
491  m_iTrueEventArrSize=0;
492 
493  m_iNEvents=0;
494  m_iNTrueEvents=0;
495  m_iNAmps=0;
496 
497  m_iEventArrSize=0;
498  m_iTrueEventArrSize=0;
499  m_iAmpArrSize=0;
500  m_iVArrSize=0;
501 
502  //Host Memory
503 
504  //Allocated pointers
505 
506  if(m_pfVVStar)
507  cudaFreeHost(m_pfVVStar);
508  m_pfVVStar=0;
509 
510  if(m_pfRes)
511  cudaFreeHost(m_pfRes);
512  m_pfRes=0;
513 
514  //Device Memory
515 
516  if(m_pfDevData)
517  cudaFree(m_pfDevData);
518  m_pfDevData=0;
519 
520  if(m_pcDevCalcAmp)
521  cudaFree(m_pcDevCalcAmp);
522  m_pcDevCalcAmp=0;
523 
524  if(m_piDevPerm)
525  cudaFree(m_piDevPerm);
526  m_piDevPerm=0;
527 
528  if(m_pfDevAmps)
529  cudaFree(m_pfDevAmps);
530  m_pfDevAmps=0;
531 
532  if(m_pfDevVVStar)
533  cudaFree(m_pfDevVVStar);
534  m_pfDevVVStar=0;
535 
536  if(m_pfDevWeights)
537  cudaFree(m_pfDevWeights);
538  m_pfDevWeights=0;
539 
540  if(m_pfDevResRe)
541  cudaFree(m_pfDevResRe);
542  m_pfDevResRe=0;
543 
544  if(m_pfDevResIm)
545  cudaFree(m_pfDevResIm);
546  m_pfDevResIm=0;
547 
548  if(m_pfDevREDUCE)
549  cudaFree(m_pfDevREDUCE);
550  m_pfDevREDUCE=0;
551 
552  //CUDA Thread and Grid sizes
553  m_iDimGridX=0;
554  m_iDimGridY=0;
555  m_iDimThreadX=0;
556  m_iDimThreadY=0;
557 }
558 
559 // Internal utilities:
560 void GPUManager::calcCUDADims()
561 {
562  if(m_iNEvents<1)
563  return;
564 
565  m_iDimThreadX=GPU_BLOCK_SIZE_X;
566  m_iDimThreadY=GPU_BLOCK_SIZE_Y;
567 
568  unsigned int iBlockSizeSq=GPU_BLOCK_SIZE_SQ;
569  unsigned int iNBlocks=m_iNEvents/iBlockSizeSq;
570  if(iNBlocks<=1)
571  {
572  m_iDimGridX=1;
573  m_iDimGridY=1;
574  }
575  else
576  {
577  unsigned int iDivLo=1,iDivHi=iNBlocks;
578  for(iDivLo=static_cast<int>(sqrt(iNBlocks));iDivLo>=1;iDivLo--)
579  {
580  iDivHi=iNBlocks/iDivLo;
581  if(iDivLo*iDivHi==iNBlocks)
582  break;
583  }
584  m_iDimGridX=iDivLo;
585  m_iDimGridY=iDivHi;
586  }
587 
588  // cout<<"\tThread dimensions: ("<<m_iDimThreadX<<","<<m_iDimThreadY<<")\n";
589  // cout<<"\tGrid dimensions: ("<<m_iDimGridX<<","<<m_iDimGridY<<")\n";
590 
591  //Reduction Parameters
592  unsigned int maxThreads = ( m_devProp_major >= 2 ? 1024 : 512 ); // number of threads per block
593  unsigned int maxBlocks = 1024;
594 
595  if (m_iNEvents == 1)
596  m_iNThreads = 1;
597  else
598  m_iNThreads = (m_iNEvents < maxThreads*2) ? m_iNEvents / 2 : maxThreads;
599 
600  m_iNBlocks = m_iNEvents / (m_iNThreads * 2);
601  m_iNBlocks = min(maxBlocks, m_iNBlocks);
602 
603  // cout<<"Reduction:\n";
604  // cout<<"\tNumber of threads: "<<m_iNThreads<<"\n";
605  // cout<<"\tNumber of blocks: "<<m_iNBlocks<<"\n\n\n"<<flush;
606 }
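As a concrete (hypothetical) example of the sizing above: if GPU_BLOCK_SIZE_SQ were 256 and m_iNEvents were 131072, then iNBlocks = 512; the loop starts iDivLo at floor(sqrt(512)) = 22 and walks down to 16, the largest divisor of 512 not exceeding its square root, giving a 16 x 32 grid. For the reduction on a compute-capability 2.0 or newer device, m_iNThreads = 1024 (since 131072 is not less than 2*1024) and m_iNBlocks = min(1024, 131072/2048) = 64. The actual block-size constants are set in the GPU kernel headers and may differ.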
607 