#include <cuda.h>
-#include "types/simple.h"
+#include "types/simple.h"
#include "types/nbnxn_pairlist.h"
#include "types/nb_verlet.h"
#include "types/ishift.h"
* order of corresponding enumerated types defined in nbnxn_cuda_types.h.
*/
static const nbnxn_cu_kfunc_ptr_t
-nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
+ nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
{
{ { k_nbnxn_cutoff, k_nbnxn_cutoff_prune },
{ k_nbnxn_cutoff_ener, k_nbnxn_cutoff_ener_prune } },
return shmem;
}
-/*! As we execute nonbonded workload in separate streams, before launching
+/*! As we execute nonbonded workload in separate streams, before launching
the kernel we need to make sure that he following operations have completed:
- atomdata allocation and related H2D transfers (every nstlist step);
- pair list H2D transfer (every nstlist step);
However, for the sake of having a future-proof implementation, we use the
misc_ops_done event to record the point in time when the above operations
are finished and synchronize with this event in the non-local stream.
-*/
-void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
+ */
+void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
const nbnxn_atomdata_t *nbatom,
- int flags,
- int iloc)
+ int flags,
+ int iloc)
{
- cudaError_t stat;
- int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+ cudaError_t stat;
+ int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
/* CUDA kernel launch-related stuff */
- int shmem, nblock;
- dim3 dim_block, dim_grid;
+ int shmem, nblock;
+ dim3 dim_block, dim_grid;
nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_nbparam_t *nbp = cu_nb->nbparam;
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *t = cu_nb->timers;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = cu_nb->atdat;
+ cu_nbparam_t *nbp = cu_nb->nbparam;
+ cu_plist_t *plist = cu_nb->plist[iloc];
+ cu_timers_t *t = cu_nb->timers;
+ cudaStream_t stream = cu_nb->stream[iloc];
- bool bCalcEner = flags & GMX_FORCE_VIRIAL;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
- bool bDoTime = cu_nb->bDoTime;
+ bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bDoTime = cu_nb->bDoTime;
/* turn energy calculation always on/off (for debugging/testing only) */
bCalcEner = (bCalcEner || always_ener) && !never_ener;
/* HtoD x, q */
cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
- adat_len * sizeof(*adat->xq), stream);
+ adat_len * sizeof(*adat->xq), stream);
if (bDoTime)
{
NCL_PER_SUPERCL, plist->na_c);
}
- nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
+ nb_kernel<<< dim_grid, dim_block, shmem, stream>>> (*adat, *nbp, *plist, bCalcFshift);
CU_LAUNCH_ERR("k_calc_nb");
if (bDoTime)
}
}
-void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
const nbnxn_atomdata_t *nbatom,
- int flags,
- int aloc)
+ int flags,
+ int aloc)
{
cudaError_t stat;
- int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
- int iloc = -1;
+ int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
+ int iloc = -1;
/* determine interaction locality from atom locality */
if (LOCAL_A(aloc))
gmx_incons(stmp);
}
- cu_atomdata_t *adat = cu_nb->atdat;
- cu_timers_t *t = cu_nb->timers;
- bool bDoTime = cu_nb->bDoTime;
- cudaStream_t stream = cu_nb->stream[iloc];
+ cu_atomdata_t *adat = cu_nb->atdat;
+ cu_timers_t *t = cu_nb->timers;
+ bool bDoTime = cu_nb->bDoTime;
+ cudaStream_t stream = cu_nb->stream[iloc];
- bool bCalcEner = flags & GMX_FORCE_VIRIAL;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* don't launch copy-back if there was no work to do */
if (cu_nb->plist[iloc]->nsci == 0)
*(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
}
- /* With DD the local D2H transfer can only start after the non-local
+ /* With DD the local D2H transfer can only start after the non-local
has been launched. */
if (iloc == eintLocal && cu_nb->bUseTwoStreams)
{
}
/* DtoH f */
- cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
+ cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
(adat_len)*sizeof(*adat->f), stream);
/* After the non-local D2H is launched the nonlocal_done event can be
* polling wait for the GPU.
*/
static inline bool atomic_cas(volatile unsigned int *ptr,
- unsigned int oldval,
- unsigned int newval)
+ unsigned int oldval,
+ unsigned int newval)
{
assert(ptr);
real *e_lj, real *e_el, rvec *fshift)
{
/* NOTE: only implemented for single-precision at this time */
- cudaError_t stat;
- int i, adat_end, iloc = -1;
+ cudaError_t stat;
+ int i, adat_end, iloc = -1;
volatile unsigned int *poll_word;
/* determine interaction locality from atom locality */
gmx_incons(stmp);
}
- cu_plist_t *plist = cu_nb->plist[iloc];
- cu_timers_t *timers = cu_nb->timers;
- wallclock_gpu_t *timings = cu_nb->timings;
- nb_staging nbst = cu_nb->nbst;
+ cu_plist_t *plist = cu_nb->plist[iloc];
+ cu_timers_t *timers = cu_nb->timers;
+ wallclock_gpu_t *timings = cu_nb->timings;
+ nb_staging nbst = cu_nb->nbst;
- bool bCalcEner = flags & GMX_FORCE_VIRIAL;
- bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
+ bool bCalcEner = flags & GMX_FORCE_VIRIAL;
+ bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
/* turn energy calculation always on/off (for debugging/testing only) */
- bCalcEner = (bCalcEner || always_ener) && !never_ener;
+ bCalcEner = (bCalcEner || always_ener) && !never_ener;
/* don't launch wait/update timers & counters if there was no work to do
stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
}
- else
+ else
{
/* Busy-wait until we get the signal pattern set in last byte
* of the l/nl float vector. This pattern corresponds to a floating
* The polling uses atomic compare-exchange.
*/
poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
- while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
+ while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern))
+ {
+ }
}
/* timing data accumulation */
/* X/q H2D and F D2H timings */
timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
- timers->stop_nb_h2d[iloc]);
+ timers->stop_nb_h2d[iloc]);
timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
- timers->stop_nb_d2h[iloc]);
+ timers->stop_nb_d2h[iloc]);
/* only count atdat and pair-list H2D at pair-search step */
if (plist->bDoPrune)
{
timings->pl_h2d_c++;
timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
- timers->stop_atdat);
+ timers->stop_atdat);
}
timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
- timers->stop_pl_h2d[iloc]);
+ timers->stop_pl_h2d[iloc]);
}
}
}
/*! Return the reference to the nbfp texture. */
-const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
+const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
{
return nbfp_texref;
}
/*! Return the reference to the coulomb_tab. */
-const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
+const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
{
return coulomb_tab_texref;
}