Uncrustify all files
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_cuda / nbnxn_cuda_data_mgmt.cu
index 5de2e80988a77087e62a1697bb998104481567a4..fd75a3d1f0cc08c6bc4cf2015e65bfa2583cea47 100644 (file)
@@ -70,8 +70,8 @@ static unsigned int gpu_min_ci_balanced_factor = 40;
 
 /* Functions from nbnxn_cuda.cu */
 extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
-extern const struct texture<float, 1, cudaReadModeElementType>nbnxn_cuda_get_nbfp_texref();
-extern const struct texture<float, 1, cudaReadModeElementType>nbnxn_cuda_get_coulomb_tab_texref();
+extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
+extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
 
 /* We should actually be using md_print_warn in md_logging.c,
  * but we can't include mpi.h in CUDA code.
@@ -113,9 +113,9 @@ static void init_ewald_coulomb_force_table(cu_nbparam_t          *nbp,
                                            const cuda_dev_info_t *dev_info)
 {
     float       *ftmp, *coul_tab;
-    int         tabsize;
-    double      tabscale;
-    cudaError_t stat;
+    int          tabsize;
+    double       tabscale;
+    cudaError_t  stat;
 
     tabsize     = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
     /* Subtract 2 iso 1 to avoid access out of range due to rounding */
@@ -182,7 +182,7 @@ static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
     cudaError_t stat;
 
     ad->ntypes  = ntypes;
-    stat = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
+    stat        = cudaMalloc((void**)&ad->shift_vec, SHIFTS*sizeof(*ad->shift_vec));
     CU_RET_ERR(stat, "cudaMalloc failed on ad->shift_vec");
     ad->bShiftVecUploaded = false;
 
@@ -259,25 +259,25 @@ static int pick_ewald_kernel_type(bool                   bTwinCut,
 
 
 /*! Initializes the nonbonded parameter data structure. */
-static void init_nbparam(cu_nbparam_t *nbp,
+static void init_nbparam(cu_nbparam_t              *nbp,
                          const interaction_const_t *ic,
-                         const nbnxn_atomdata_t *nbat,
-                         const cuda_dev_info_t *dev_info)
+                         const nbnxn_atomdata_t    *nbat,
+                         const cuda_dev_info_t     *dev_info)
 {
     cudaError_t stat;
     int         ntypes, nnbfp;
 
     ntypes  = nbat->ntype;
 
-    nbp->ewald_beta = ic->ewaldcoeff_q;
-    nbp->sh_ewald   = ic->sh_ewald;
-    nbp->epsfac     = ic->epsfac;
-    nbp->two_k_rf   = 2.0 * ic->k_rf;
-    nbp->c_rf       = ic->c_rf;
-    nbp->rvdw_sq    = ic->rvdw * ic->rvdw;
-    nbp->rcoulomb_sq= ic->rcoulomb * ic->rcoulomb;
-    nbp->rlist_sq   = ic->rlist * ic->rlist;
-    nbp->sh_invrc6  = ic->sh_invrc6;
+    nbp->ewald_beta  = ic->ewaldcoeff_q;
+    nbp->sh_ewald    = ic->sh_ewald;
+    nbp->epsfac      = ic->epsfac;
+    nbp->two_k_rf    = 2.0 * ic->k_rf;
+    nbp->c_rf        = ic->c_rf;
+    nbp->rvdw_sq     = ic->rvdw * ic->rvdw;
+    nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
+    nbp->rlist_sq    = ic->rlist * ic->rlist;
+    nbp->sh_invrc6   = ic->sh_invrc6;
 
     if (ic->eeltype == eelCUT)
     {
@@ -287,7 +287,7 @@ static void init_nbparam(cu_nbparam_t *nbp,
     {
         nbp->eeltype = eelCuRF;
     }
-    else if ((EEL_PME(ic->eeltype) || ic->eeltype==eelEWALD))
+    else if ((EEL_PME(ic->eeltype) || ic->eeltype == eelEWALD))
     {
         /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
         nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
@@ -306,41 +306,41 @@ static void init_nbparam(cu_nbparam_t *nbp,
     }
 
     nnbfp = 2*ntypes*ntypes;
-    stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
+    stat  = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
     CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
     cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
 
 #ifdef TEXOBJ_SUPPORTED
-        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (dev_info->prop.major >= 3)
-        {
-            cudaResourceDesc rd;
-            memset(&rd, 0, sizeof(rd));
-            rd.resType                  = cudaResourceTypeLinear;
-            rd.res.linear.devPtr        = nbp->nbfp;
-            rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
-            rd.res.linear.desc.x        = 32;
-            rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
-
-            cudaTextureDesc td;
-            memset(&td, 0, sizeof(td));
-            td.readMode                 = cudaReadModeElementType;
-            stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
-            CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
-        }
-        else
+    /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+    if (dev_info->prop.major >= 3)
+    {
+        cudaResourceDesc rd;
+        memset(&rd, 0, sizeof(rd));
+        rd.resType                  = cudaResourceTypeLinear;
+        rd.res.linear.devPtr        = nbp->nbfp;
+        rd.res.linear.desc.f        = cudaChannelFormatKindFloat;
+        rd.res.linear.desc.x        = 32;
+        rd.res.linear.sizeInBytes   = nnbfp*sizeof(*nbp->nbfp);
+
+        cudaTextureDesc td;
+        memset(&td, 0, sizeof(td));
+        td.readMode                 = cudaReadModeElementType;
+        stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
+        CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
+    }
+    else
 #endif
-        {
-            cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
-            stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
-                                   nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
-            CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
-        }
+    {
+        cudaChannelFormatDesc cd = cudaCreateChannelDesc<float>();
+        stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
+                               nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
+        CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
+    }
 }
 
 /*! Re-generate the GPU Ewald force table, resets rlist, and update the
  *  electrostatic type switching to twin cut-off (or back) if needed. */
-void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t           cu_nb,
                                          const interaction_const_t *ic)
 {
     cu_nbparam_t *nbp = cu_nb->nbparam;
@@ -379,7 +379,7 @@ static void init_plist(cu_plist_t *pl)
 static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
 {
     cudaError_t stat;
-    int eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync: cudaEventDefault );
+    int         eventflags = ( bUseCudaEventBlockingSync ? cudaEventBlockingSync : cudaEventDefault );
 
     stat = cudaEventCreateWithFlags(&(t->start_atdat), eventflags);
     CU_RET_ERR(stat, "cudaEventCreate on start_atdat failed");
@@ -419,12 +419,12 @@ static void init_timings(wallclock_gpu_t *t)
 
     t->nb_h2d_t = 0.0;
     t->nb_d2h_t = 0.0;
-    t->nb_c    = 0;
+    t->nb_c     = 0;
     t->pl_h2d_t = 0.0;
     t->pl_h2d_c = 0;
     for (i = 0; i < 2; i++)
     {
-        for(j = 0; j < 2; j++)
+        for (j = 0; j < 2; j++)
         {
             t->ktime[i][j].t = 0.0;
             t->ktime[i][j].c = 0;
@@ -432,22 +432,25 @@ static void init_timings(wallclock_gpu_t *t)
     }
 }
 
-void nbnxn_cuda_init(FILE *fplog,
-                     nbnxn_cuda_ptr_t *p_cu_nb,
+void nbnxn_cuda_init(FILE                 *fplog,
+                     nbnxn_cuda_ptr_t     *p_cu_nb,
                      const gmx_gpu_info_t *gpu_info,
-                     const gmx_gpu_opt_t *gpu_opt,
-                     int my_gpu_index,
-                     gmx_bool bLocalAndNonlocal)
+                     const gmx_gpu_opt_t  *gpu_opt,
+                     int                   my_gpu_index,
+                     gmx_bool              bLocalAndNonlocal)
 {
-    cudaError_t stat;
+    cudaError_t       stat;
     nbnxn_cuda_ptr_t  nb;
-    char sbuf[STRLEN];
-    bool bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
-    int cuda_drv_ver;
+    char              sbuf[STRLEN];
+    bool              bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
+    int               cuda_drv_ver;
 
     assert(gpu_info);
 
-    if (p_cu_nb == NULL) return;
+    if (p_cu_nb == NULL)
+    {
+        return;
+    }
 
     snew(nb, 1);
     snew(nb->atdat, 1);
@@ -516,7 +519,7 @@ void nbnxn_cuda_init(FILE *fplog,
      * With polling wait event-timing also needs to be disabled.
      *
      * The overhead is greatly reduced in API v5.0 drivers and the improvement
-     $ is independent of runtime version. Hence, with API v5.0 drivers and later
+     * is independent of runtime version. Hence, with API v5.0 drivers and later
      * we won't switch to polling.
      *
      * NOTE: Unfortunately, this is known to fail when GPUs are shared by (t)MPI,
@@ -592,8 +595,8 @@ void nbnxn_cuda_init(FILE *fplog,
                         "      However, the polling wait workaround can not be used because\n%s\n"
                         "      Consider updating the driver or turning ECC off.",
                         (bX86 && bTMPIAtomics) ?
-                            "      GPU(s) are being oversubscribed." :
-                            "      atomic operations are not supported by the platform/CPU+compiler.");
+                        "      GPU(s) are being oversubscribed." :
+                        "      atomic operations are not supported by the platform/CPU+compiler.");
                 md_print_warn(fplog, sbuf);
             }
         }
@@ -651,15 +654,15 @@ void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t                cu_nb,
     nbnxn_cuda_clear_e_fshift(cu_nb);
 }
 
-void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t        cu_nb,
                               const nbnxn_pairlist_t *h_plist,
-                              int iloc)
+                              int                     iloc)
 {
-    char         sbuf[STRLEN];
-    cudaError_t  stat;
-    bool         bDoTime    = cu_nb->bDoTime;
-    cudaStream_t stream     = cu_nb->stream[iloc];
-    cu_plist_t   *d_plist   = cu_nb->plist[iloc];
+    char          sbuf[STRLEN];
+    cudaError_t   stat;
+    bool          bDoTime    = cu_nb->bDoTime;
+    cudaStream_t  stream     = cu_nb->stream[iloc];
+    cu_plist_t   *d_plist    = cu_nb->plist[iloc];
 
     if (d_plist->na_c < 0)
     {
@@ -682,19 +685,19 @@ void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
     }
 
     cu_realloc_buffered((void **)&d_plist->sci, h_plist->sci, sizeof(*d_plist->sci),
-                         &d_plist->nsci, &d_plist->sci_nalloc,
-                         h_plist->nsci,
-                         stream, true);
+                        &d_plist->nsci, &d_plist->sci_nalloc,
+                        h_plist->nsci,
+                        stream, true);
 
     cu_realloc_buffered((void **)&d_plist->cj4, h_plist->cj4, sizeof(*d_plist->cj4),
-                         &d_plist->ncj4, &d_plist->cj4_nalloc,
-                         h_plist->ncj4,
-                         stream, true);
+                        &d_plist->ncj4, &d_plist->cj4_nalloc,
+                        h_plist->ncj4,
+                        stream, true);
 
     cu_realloc_buffered((void **)&d_plist->excl, h_plist->excl, sizeof(*d_plist->excl),
-                         &d_plist->nexcl, &d_plist->excl_nalloc,
-                         h_plist->nexcl,
-                         stream, true);
+                        &d_plist->nexcl, &d_plist->excl_nalloc,
+                        h_plist->nexcl,
+                        stream, true);
 
     if (bDoTime)
     {
@@ -706,16 +709,16 @@ void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t cu_nb,
     d_plist->bDoPrune = true;
 }
 
-void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t        cu_nb,
                                 const nbnxn_atomdata_t *nbatom)
 {
-    cu_atomdata_t *adat = cu_nb->atdat;
-    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+    cu_atomdata_t *adat  = cu_nb->atdat;
+    cudaStream_t   ls    = cu_nb->stream[eintLocal];
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
     {
-        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec, 
+        cu_copy_H2D_async(adat->shift_vec, nbatom->shift_vec,
                           SHIFTS * sizeof(*adat->shift_vec), ls);
         adat->bShiftVecUploaded = true;
     }
@@ -724,9 +727,9 @@ void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t cu_nb,
 /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
 static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
 {
-    cudaError_t   stat;
-    cu_atomdata_t *adat = cu_nb->atdat;
-    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+    cudaError_t    stat;
+    cu_atomdata_t *adat  = cu_nb->atdat;
+    cudaStream_t   ls    = cu_nb->stream[eintLocal];
 
     stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
@@ -735,9 +738,9 @@ static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
 /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
 static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
 {
-    cudaError_t   stat;
-    cu_atomdata_t *adat = cu_nb->atdat;
-    cudaStream_t  ls    = cu_nb->stream[eintLocal];
+    cudaError_t    stat;
+    cu_atomdata_t *adat  = cu_nb->atdat;
+    cudaStream_t   ls    = cu_nb->stream[eintLocal];
 
     stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
@@ -750,7 +753,7 @@ static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
 void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
 {
     nbnxn_cuda_clear_f(cu_nb, cu_nb->atdat->natoms);
-    /* clear shift force array and energies if the outputs were 
+    /* clear shift force array and energies if the outputs were
        used in the current step */
     if (flags & GMX_FORCE_VIRIAL)
     {
@@ -758,18 +761,18 @@ void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
     }
 }
 
-void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
+void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t        cu_nb,
                               const nbnxn_atomdata_t *nbat)
 {
-    cudaError_t   stat;
-    int           nalloc, natoms;
-    bool          realloced;
-    bool          bDoTime   = cu_nb->bDoTime;
-    cu_timers_t   *timers   = cu_nb->timers;
-    cu_atomdata_t *d_atdat  = cu_nb->atdat;
-    cudaStream_t  ls        = cu_nb->stream[eintLocal];
-
-    natoms = nbat->natoms;
+    cudaError_t    stat;
+    int            nalloc, natoms;
+    bool           realloced;
+    bool           bDoTime   = cu_nb->bDoTime;
+    cu_timers_t   *timers    = cu_nb->timers;
+    cu_atomdata_t *d_atdat   = cu_nb->atdat;
+    cudaStream_t   ls        = cu_nb->stream[eintLocal];
+
+    natoms    = nbat->natoms;
     realloced = false;
 
     if (bDoTime)
@@ -802,10 +805,10 @@ void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
         CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
 
         d_atdat->nalloc = nalloc;
-        realloced = true;
+        realloced       = true;
     }
 
-    d_atdat->natoms = natoms;
+    d_atdat->natoms       = natoms;
     d_atdat->natoms_local = nbat->natoms_local;
 
     /* need to clear GPU f output if realloc happened */
@@ -826,13 +829,16 @@ void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t cu_nb,
 
 void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
 {
-    cudaError_t     stat;
+    cudaError_t      stat;
     cu_atomdata_t   *atdat;
     cu_nbparam_t    *nbparam;
     cu_plist_t      *plist, *plist_nl;
     cu_timers_t     *timers;
 
-    if (cu_nb == NULL) return;
+    if (cu_nb == NULL)
+    {
+        return;
+    }
 
     atdat       = cu_nb->atdat;
     nbparam     = cu_nb->nbparam;
@@ -957,7 +963,7 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
 
 void cu_synchstream_atdat(nbnxn_cuda_ptr_t cu_nb, int iloc)
 {
-    cudaError_t stat;
+    cudaError_t  stat;
     cudaStream_t stream = cu_nb->stream[iloc];
 
     stat = cudaStreamWaitEvent(stream, cu_nb->timers->stop_atdat, 0);
@@ -980,7 +986,7 @@ void nbnxn_cuda_reset_timings(nbnxn_cuda_ptr_t cu_nb)
 int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
 {
     return cu_nb != NULL ?
-        gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
+           gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
 
 }