#include "cuda_runtime.h"

bool GPUManager::m_cudaDisplay = false;
template < class T >
void reduce( int size, int threads, int blocks, T* d_idata, T* d_odata );
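// reduce() above is the templated parallel-reduction driver used below to
// collapse per-event results into per-block partial sums (d_idata in,
// d_odata out).  Judging from the thread/block bookkeeping in
// calcCUDADims(), where each thread consumes two elements, it appears to
// be adapted from the CUDA SDK "reduction" sample.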
  cout << "\n################### CUDA DEVICE ##################\n";
  int rank, devs, thisDevice;
  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
  cudaGetDeviceCount( &devs );
  thisDevice = rank % devs;

  if( !m_cudaDisplay ) {
    cout << "\nParallel GPU configuration requested." << endl;
    cout << "\nNumber of CUDA devices available on this node: " << devs << endl;
    cout << "\nMPI process " << rank << " is using device " << thisDevice << endl;
  }
  gpuErrChk( cudaSetDevice( thisDevice ) );
  cudaDeviceProp devProp;
  gpuErrChk( cudaGetDeviceProperties( &devProp, thisDevice ) );

  m_devProp_major = devProp.major;
  if( !m_cudaDisplay ){
    cout << "Current GPU Properties:\n";
    cout << "\t Name: " << devProp.name << endl;
    cout << "\t Total global memory: "
         << devProp.totalGlobalMem / ( (float)1024 * 1024 ) << " MB" << endl;
    cout << "\t Rev.: " << devProp.major << "." << devProp.minor << endl;
    cout << "\t Precision (size of GDouble): " << sizeof( GDouble )
         << " bytes" << endl;
    cout << "##################################################\n\n";

    m_cudaDisplay = true;
  }
  if( m_devProp_major == 1 && devProp.minor < 3 ){
    assert( sizeof( GDouble ) <= 4 );
  }
  m_iEventArrSize     = sizeof( GDouble ) * m_iNEvents;
  m_iTrueEventArrSize = sizeof( GDouble ) * m_iNTrueEvents;

  m_iAmpArrSize = 2 * sizeof( GDouble ) * m_iNEvents * m_iNAmps;

  m_iVArrSize = 2 * sizeof( GDouble ) * m_iNAmps * ( m_iNAmps + 1 ) / 2;
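  // the factors of two provide interleaved real/imaginary storage: the
  // amplitude array holds one complex value per event per amplitude, and
  // the V arrays hold the m_iNAmps * ( m_iNAmps + 1 ) / 2 complex entries
  // of the lower triangle of the Hermitian V V* matrix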
  cudaMallocHost( (void**)&m_pfVVStar, m_iVArrSize );
  cudaMallocHost( (void**)&m_pfRes,    m_iEventArrSize );
  double totalMemory = 0;
  totalMemory += m_iVArrSize;
  totalMemory += 4 * m_iEventArrSize;
  totalMemory += 4 * m_iNParticles * m_iEventArrSize;
  totalMemory += m_iNParticles * sizeof( int );
  totalMemory += m_iAmpArrSize;
  totalMemory /= ( 1024 * 1024 );

  cout << "Attempting to allocate " << totalMemory
       << " MB of global GPU memory." << endl;
  gpuErrChk( cudaMalloc( (void**)&m_pfDevVVStar,  m_iVArrSize ) );
  gpuErrChk( cudaMalloc( (void**)&m_pfDevWeights, m_iEventArrSize ) );
  gpuErrChk( cudaMalloc( (void**)&m_pfDevResRe,   m_iEventArrSize ) );
  gpuErrChk( cudaMalloc( (void**)&m_pfDevResIm,   m_iEventArrSize ) );
  gpuErrChk( cudaMalloc( (void**)&m_pfDevREDUCE,  m_iEventArrSize ) );

  gpuErrChk( cudaMalloc( (void**)&m_pfDevData, 4 * m_iNParticles * m_iEventArrSize ) );
  gpuErrChk( cudaMalloc( (void**)&m_piDevPerm, m_iNParticles * sizeof( int ) ) );

  gpuErrChk( cudaMalloc( (void**)&m_pfDevAmps, m_iAmpArrSize ) );
  cout << "GPU memory allocated for " << m_iNAmps << " amplitudes and "
       << m_iNEvents << " events (" << m_iNTrueEvents
       << " actual events)." << endl;
}

void
GPUManager::copyDataToGPU( const AmpVecs& a )
{
  VT_TRACER( "GPUManager::copyDataToGPU" );
  // assumed host-side sources: a.m_pdData (four-vectors for each particle)
  // and a.m_pdWeights (per-event weights)
  gpuErrChk( cudaMemcpy( m_pfDevData, a.m_pdData,
                         4 * m_iNParticles * m_iEventArrSize,
                         cudaMemcpyHostToDevice ) );

  gpuErrChk( cudaMemcpy( m_pfDevWeights, a.m_pdWeights,
                         m_iEventArrSize, cudaMemcpyHostToDevice ) );
}
void
GPUManager::copyAmpsFromGPU( AmpVecs& a )
{
  VT_TRACER( "GPUManager::copyAmpsFromGPU" );

  // assumed: the computed amplitudes and per-factor terms land in the
  // AmpVecs host arrays (a.m_pdAmps, a.m_pdAmpFactors); only the
  // device-to-host direction of these copies is certain here
  gpuErrChk( cudaMemcpy( a.m_pdAmps, m_pfDevAmps, m_iAmpArrSize,
                         cudaMemcpyDeviceToHost ) );

  gpuErrChk( cudaMemcpy( a.m_pdAmpFactors, m_pcDevCalcAmp,
                         m_maxFactPerEvent * m_iEventArrSize * 2,
                         cudaMemcpyDeviceToHost ) );
}
void
GPUManager::calcAmplitudeAll( const Amplitude* amp, unsigned long long offset,
                              const vector< vector< int > >* pvPermutations )
{
  VT_TRACER( "GPUManager::calcAmplitudeAll" );

  dim3 dimBlock( m_iDimThreadX, m_iDimThreadY );
  dim3 dimGrid( m_iDimGridX, m_iDimGridY );

  vector< vector< int > >::const_iterator permItr = pvPermutations->begin();

  // every permutation must assign a position to every particle
  assert( permItr->size() == m_iNParticles );

  unsigned long long permOffset = 0;
  for( ; permItr != pvPermutations->end(); ++permItr ){

    // copy the current particle permutation to the device
    gpuErrChk( cudaMemcpy( m_piDevPerm, &((*permItr)[0]),
                           m_iNParticles * sizeof( int ),
                           cudaMemcpyHostToDevice ) );
    // launch this amplitude's kernel for the current permutation
    // (assumed leading argument: the device data array m_pfDevData)
    amp->calcAmplitudeGPU( dimGrid, dimBlock, m_pfDevData,
                           (WCUComplex*)&m_pcDevCalcAmp[offset+permOffset],
                           m_piDevPerm, m_iNParticles, m_iNEvents,
                           *permItr );

    cudaError_t cerrKernel = cudaGetLastError();
    if( cerrKernel != cudaSuccess ){

      cout << "\nKERNEL LAUNCH ERROR [" << amp->name() << "]: "
           << cudaGetErrorString( cerrKernel ) << endl;
    }

    permOffset += 2 * m_iNEvents;
  }
}
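// Note: calcAmplitudeGPU() is declared virtual in Amplitude, so each
// amplitude class supplies its own CUDA kernel.  The loop above runs it
// once per particle permutation, and the stride of 2 * m_iNEvents on
// permOffset gives every permutation its own slice of interleaved
// (re, im) pairs, one per event, in m_pcDevCalcAmp.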
void
GPUManager::assembleTerms( int iAmpInd, int nFact, int nPerm ){

  VT_TRACER( "GPUManager::assembleTerms" );

  dim3 dimBlock( m_iDimThreadX, m_iDimThreadY );
  dim3 dimGrid( m_iDimGridX, m_iDimGridY );

  // zero this amplitude's slot before summing factors and permutations
  gpuErrChk( cudaMemset( &(m_pfDevAmps[2*m_iNEvents*iAmpInd]), 0,
                         2 * sizeof( GDouble ) * m_iNEvents ) );

  // assumed first pointer argument: this amplitude's slot in m_pfDevAmps
  GPU_ExecFactPermKernel( dimGrid, dimBlock, &(m_pfDevAmps[2*m_iNEvents*iAmpInd]),
                          m_pcDevCalcAmp, nFact, nPerm, m_iNEvents );
}
double
GPUManager::calcSumLogIntensity( const vector< complex< double > >& prodCoef,
                                 const vector< vector< bool > >& cohMtx )
{
  VT_TRACER( "GPUManager::calcSumLogIntensity" );

  unsigned int i, j;

  // pack the products V_i V_j* into the flat triangular array, zeroing
  // pairs that the coherence matrix marks incoherent
  complex< double > cdFij;
  for( i = 0; i < m_iNAmps; i++ ){
    for( j = 0; j <= i; j++ ){

      cdFij = prodCoef[i] * conj( prodCoef[j] );

      m_pfVVStar[2*(i*(i+1)/2+j)] =
        ( cohMtx[i][j] ? static_cast< GDouble >( cdFij.real() ) : 0 );

      m_pfVVStar[2*(i*(i+1)/2+j)+1] =
        ( cohMtx[i][j] ? static_cast< GDouble >( cdFij.imag() ) : 0 );
    }
  }
  gpuErrChk( cudaMemcpy( m_pfDevVVStar, m_pfVVStar,
                         m_iVArrSize, cudaMemcpyHostToDevice ) );

  dim3 dimBlock( m_iDimThreadX, m_iDimThreadY );
  dim3 dimGrid( m_iDimGridX, m_iDimGridY );
  GPU_ExecAmpKernel( dimGrid, dimBlock, m_pfDevAmps, m_pfDevVVStar, m_pfDevWeights,
                     m_iNAmps, m_iNEvents, m_pfDevResRe );
  double dGPUResult = 0;

  if( m_iNTrueEvents <= m_iNBlocks || sizeof( GDouble ) <= 4 )
  {
    gpuErrChk( cudaMemcpy( m_pfRes, m_pfDevResRe,
                           m_iTrueEventArrSize, cudaMemcpyDeviceToHost ) );
    for( i = 0; i < m_iNTrueEvents; i++ )
      dGPUResult += m_pfRes[i];
  }
  else
  {
    // zero the padding events before reducing over the full array
    gpuErrChk( cudaMemset( m_pfDevResRe + m_iNTrueEvents, 0,
                           sizeof( GDouble ) * ( m_iNEvents - m_iNTrueEvents ) ) );

    reduce< GDouble >( m_iNEvents, m_iNThreads, m_iNBlocks, m_pfDevResRe, m_pfDevREDUCE );

    // assumed source and size: the m_iNBlocks partial sums in m_pfDevREDUCE
    gpuErrChk( cudaMemcpy( m_pfRes, m_pfDevREDUCE, m_iNBlocks * sizeof( GDouble ),
                           cudaMemcpyDeviceToHost ) );
    for( i = 0; i < m_iNBlocks; i++ )
      dGPUResult += m_pfRes[i];
  }

  return dGPUResult;
}
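// Two accumulation strategies appear above: with few true events, or with
// a single-precision GDouble, the per-event terms are copied back and
// summed on the host in a double accumulator, presumably to keep
// round-off in check; otherwise the padded array is collapsed on the
// device by reduce< GDouble >() and only m_iNBlocks partial sums cross
// the bus.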
void
GPUManager::calcIntegral( GDouble* result, int iAmp, int jAmp, int iNGenEvents )
{
  VT_TRACER( "GPUManager::calcIntegral" );

  dim3 dimBlock( m_iDimThreadX, m_iDimThreadY );
  dim3 dimGrid( m_iDimGridX, m_iDimGridY );

  GPU_ExecIntElementKernel( dimGrid, dimBlock, iAmp, jAmp, m_pfDevAmps,
                            m_pfDevWeights, m_pfDevResRe, m_pfDevResIm,
                            m_iNEvents );

  result[0] = 0;

  // real part of the integral element
  if( m_iNTrueEvents <= m_iNBlocks || sizeof( GDouble ) <= 4 )
  {
    gpuErrChk( cudaMemcpy( m_pfRes, m_pfDevResRe,
                           m_iTrueEventArrSize, cudaMemcpyDeviceToHost ) );

    for( int i = 0; i < m_iNTrueEvents; i++ ){
      result[0] += m_pfRes[i];
    }

    result[0] /= static_cast< GDouble >( iNGenEvents );
  }
  else
  {
    gpuErrChk( cudaMemset( m_pfDevResRe + m_iNTrueEvents, 0,
                           sizeof( GDouble ) * ( m_iNEvents - m_iNTrueEvents ) ) );

    reduce< GDouble >( m_iNEvents, m_iNThreads, m_iNBlocks, m_pfDevResRe, m_pfDevREDUCE );

    gpuErrChk( cudaMemcpy( m_pfRes, m_pfDevREDUCE, m_iNBlocks * sizeof( GDouble ),
                           cudaMemcpyDeviceToHost ) );

    for( int i = 0; i < m_iNBlocks; i++ ){
      result[0] += m_pfRes[i];
    }

    result[0] /= static_cast< GDouble >( iNGenEvents );
  }
  result[1] = 0;

  // imaginary part of the integral element
  if( m_iNTrueEvents <= m_iNBlocks || sizeof( GDouble ) <= 4 ) {

    gpuErrChk( cudaMemcpy( m_pfRes, m_pfDevResIm,
                           m_iTrueEventArrSize, cudaMemcpyDeviceToHost ) );

    for( int i = 0; i < m_iNTrueEvents; i++ ){
      result[1] += m_pfRes[i];
    }

    result[1] /= static_cast< GDouble >( iNGenEvents );
  }
  else
  {
    gpuErrChk( cudaMemset( m_pfDevResIm + m_iNTrueEvents, 0,
                           sizeof( GDouble ) * ( m_iNEvents - m_iNTrueEvents ) ) );

    reduce< GDouble >( m_iNEvents, m_iNThreads, m_iNBlocks, m_pfDevResIm, m_pfDevREDUCE );

    gpuErrChk( cudaMemcpy( m_pfRes, m_pfDevREDUCE, m_iNBlocks * sizeof( GDouble ),
                           cudaMemcpyDeviceToHost ) );

    for( int i = 0; i < m_iNBlocks; i++ ){
      result[1] += m_pfRes[i];
    }

    result[1] /= static_cast< GDouble >( iNGenEvents );
  }
}
  // cleanup: reset the size bookkeeping and release all buffers
  m_iTrueEventArrSize = 0;

  // pinned host memory
  cudaFreeHost( m_pfVVStar );
  cudaFreeHost( m_pfRes );

  // device memory
  cudaFree( m_pfDevData );
  cudaFree( m_pcDevCalcAmp );
  cudaFree( m_piDevPerm );
  cudaFree( m_pfDevAmps );
  cudaFree( m_pfDevVVStar );
  cudaFree( m_pfDevWeights );
  cudaFree( m_pfDevResRe );
  cudaFree( m_pfDevResIm );
  cudaFree( m_pfDevREDUCE );
void GPUManager::calcCUDADims()
{
  unsigned int iNBlocks = m_iNEvents / iBlockSizeSq;

  // factor iNBlocks into the most nearly square grid possible
  unsigned int iDivLo = 1, iDivHi = iNBlocks;
  for( iDivLo = static_cast< int >( sqrt( iNBlocks ) ); iDivLo >= 1; iDivLo-- )
  {
    iDivHi = iNBlocks / iDivLo;
    if( iDivLo * iDivHi == iNBlocks )
      break;
  }

  // dimensions for the reduction kernel
  unsigned int maxThreads = ( m_devProp_major >= 2 ? 1024 : 512 );
  unsigned int maxBlocks  = 1024;

  m_iNThreads = ( m_iNEvents < maxThreads * 2 ) ? m_iNEvents / 2 : maxThreads;

  m_iNBlocks = m_iNEvents / ( m_iNThreads * 2 );
  m_iNBlocks = min( maxBlocks, m_iNBlocks );
}
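// A worked example of the reduction dimensions, assuming a compute
// capability >= 2 device and m_iNEvents = 1 << 20: maxThreads = 1024, so
// m_iNThreads = 1024 and m_iNBlocks = min( 1024, 2^20 / 2048 ) = 512.
// The factor of two appears because each reduction thread loads and adds
// two input elements before the in-block tree reduction, the convention
// of the CUDA SDK reduction kernel that reduce<T>() appears to follow.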