Apply clang-format to source tree
[alexxy/gromacs.git] / src / gromacs / nbnxm / kernels_reference / kernel_gpu_ref.cpp
index d9489ec746205252b2fd72167f24defa4bf8fdc5..3820a719fa95c20495a51d845b59b31b42f5f4cb 100644 (file)
 static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
 static const int c_clSize          = c_nbnxnGpuClusterSize;
 
-void
-nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu     *nbl,
-                     const nbnxn_atomdata_t     *nbat,
-                     const interaction_const_t  *iconst,
-                     rvec                       *shift_vec,
-                     const gmx::StepWorkload    &stepWork,
-                     int                         clearF,
-                     gmx::ArrayRef<real>         f,
-                     real  *                     fshift,
-                     real  *                     Vc,
-                     real  *                     Vvdw)
+void nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu*    nbl,
+                          const nbnxn_atomdata_t*    nbat,
+                          const interaction_const_t* iconst,
+                          rvec*                      shift_vec,
+                          const gmx::StepWorkload&   stepWork,
+                          int                        clearF,
+                          gmx::ArrayRef<real>        f,
+                          real*                      fshift,
+                          real*                      Vc,
+                          real*                      Vvdw)
 {
     gmx_bool            bEwald;
-    const real         *Ftab = nullptr;
+    const real*         Ftab = nullptr;
     real                rcut2, rvdw2, rlist2;
     int                 ntype;
     real                facel;
@@ -95,19 +94,22 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu     *nbl,
     real                int_bit;
     real                fexcl;
     real                c6, c12;
-    const nbnxn_excl_t *excl[2];
+    const nbnxn_excl_texcl[2];
 
-    int                 npair_tot, npair;
-    int                 nhwu, nhwu_pruned;
+    int npair_tot, npair;
+    int nhwu, nhwu_pruned;
 
     if (nbl->na_ci != c_clSize)
     {
-        gmx_fatal(FARGS, "The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d", nbl->na_ci, c_clSize);
+        gmx_fatal(FARGS,
+                  "The neighborlist cluster size in the GPU reference kernel is %d, expected it to "
+                  "be %d",
+                  nbl->na_ci, c_clSize);
     }
 
     if (clearF == enbvClearFYes)
     {
-        for (real &elem : f)
+        for (realelem : f)
         {
             elem = 0;
         }
@@ -119,123 +121,124 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu     *nbl,
         Ftab = iconst->coulombEwaldTables->tableF.data();
     }
 
-    rcut2                = iconst->rcoulomb*iconst->rcoulomb;
-    rvdw2                = iconst->rvdw*iconst->rvdw;
+    rcut2 = iconst->rcoulomb * iconst->rcoulomb;
+    rvdw2 = iconst->rvdw * iconst->rvdw;
 
-    rlist2               = nbl->rlist*nbl->rlist;
+    rlist2 = nbl->rlist * nbl->rlist;
 
-    const int  *type     = nbat->params().type.data();
+    const int* type      = nbat->params().type.data();
     facel                = iconst->epsfac;
-    const real *shiftvec = shift_vec[0];
-    const real *vdwparam = nbat->params().nbfp.data();
+    const realshiftvec = shift_vec[0];
+    const realvdwparam = nbat->params().nbfp.data();
     ntype                = nbat->params().numTypes;
 
-    const real *x        = nbat->x().data();
+    const real* x = nbat->x().data();
 
     npair_tot   = 0;
     nhwu        = 0;
     nhwu_pruned = 0;
 
-    for (const nbnxn_sci_t &nbln : nbl->sci)
+    for (const nbnxn_sci_tnbln : nbl->sci)
     {
-        ish3             = 3*nbln.shift;
-        shX              = shiftvec[ish3];
-        shY              = shiftvec[ish3+1];
-        shZ              = shiftvec[ish3+2];
-        cj4_ind0         = nbln.cj4_ind_start;
-        cj4_ind1         = nbln.cj4_ind_end;
-        sci              = nbln.sci;
-        vctot            = 0;
-        Vvdwtot          = 0;
-
-        if (nbln.shift == CENTRAL &&
-            nbl->cj4[cj4_ind0].cj[0] == sci*c_numClPerSupercl)
+        ish3     = 3 * nbln.shift;
+        shX      = shiftvec[ish3];
+        shY      = shiftvec[ish3 + 1];
+        shZ      = shiftvec[ish3 + 2];
+        cj4_ind0 = nbln.cj4_ind_start;
+        cj4_ind1 = nbln.cj4_ind_end;
+        sci      = nbln.sci;
+        vctot    = 0;
+        Vvdwtot  = 0;
+
+        if (nbln.shift == CENTRAL && nbl->cj4[cj4_ind0].cj[0] == sci * c_numClPerSupercl)
         {
             /* we have the diagonal:
              * add the charge self interaction energy term
              */
             for (im = 0; im < c_numClPerSupercl; im++)
             {
-                ci = sci*c_numClPerSupercl + im;
+                ci = sci * c_numClPerSupercl + im;
                 for (ic = 0; ic < c_clSize; ic++)
                 {
-                    ia     = ci*c_clSize + ic;
-                    iq     = x[ia*nbat->xstride+3];
-                    vctot += iq*iq;
+                    ia = ci * c_clSize + ic;
+                    iq = x[ia * nbat->xstride + 3];
+                    vctot += iq * iq;
                 }
             }
             if (!bEwald)
             {
-                vctot *= -facel*0.5*iconst->c_rf;
+                vctot *= -facel * 0.5 * iconst->c_rf;
             }
             else
             {
                 /* last factor 1/sqrt(pi) */
-                vctot *= -facel*iconst->ewaldcoeff_q*M_1_SQRTPI;
+                vctot *= -facel * iconst->ewaldcoeff_q * M_1_SQRTPI;
             }
         }
 
         for (cj4_ind = cj4_ind0; (cj4_ind < cj4_ind1); cj4_ind++)
         {
-            excl[0]           = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
-            excl[1]           = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
+            excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
+            excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
 
             for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
             {
-                cj               = nbl->cj4[cj4_ind].cj[jm];
+                cj = nbl->cj4[cj4_ind].cj[jm];
 
                 for (im = 0; im < c_numClPerSupercl; im++)
                 {
                     /* We're only using the first imask,
                      * but here imei[1].imask is identical.
                      */
-                    if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*c_numClPerSupercl + im)) & 1)
+                    if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm * c_numClPerSupercl + im)) & 1)
                     {
                         gmx_bool within_rlist;
 
-                        ci               = sci*c_numClPerSupercl + im;
+                        ci = sci * c_numClPerSupercl + im;
 
-                        within_rlist     = FALSE;
-                        npair            = 0;
+                        within_rlist = FALSE;
+                        npair        = 0;
                         for (ic = 0; ic < c_clSize; ic++)
                         {
-                            ia               = ci*c_clSize + ic;
+                            ia = ci * c_clSize + ic;
 
-                            is               = ia*nbat->xstride;
-                            ifs              = ia*nbat->fstride;
-                            ix               = shX + x[is+0];
-                            iy               = shY + x[is+1];
-                            iz               = shZ + x[is+2];
-                            iq               = facel*x[is+3];
-                            nti              = ntype*2*type[ia];
+                            is  = ia * nbat->xstride;
+                            ifs = ia * nbat->fstride;
+                            ix  = shX + x[is + 0];
+                            iy  = shY + x[is + 1];
+                            iz  = shZ + x[is + 2];
+                            iq  = facel * x[is + 3];
+                            nti = ntype * 2 * type[ia];
 
-                            fix              = 0;
-                            fiy              = 0;
-                            fiz              = 0;
+                            fix = 0;
+                            fiy = 0;
+                            fiz = 0;
 
                             for (jc = 0; jc < c_clSize; jc++)
                             {
-                                ja               = cj*c_clSize + jc;
+                                ja = cj * c_clSize + jc;
 
-                                if (nbln.shift == CENTRAL &&
-                                    ci == cj && ja <= ia)
+                                if (nbln.shift == CENTRAL && ci == cj && ja <= ia)
                                 {
                                     continue;
                                 }
 
-                                constexpr int clusterPerSplit = c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
-                                int_bit = static_cast<real>((excl[jc/clusterPerSplit]->pair[(jc & (clusterPerSplit - 1))*c_clSize + ic]
-                                                             >> (jm*c_numClPerSupercl + im)) & 1);
-
-                                js               = ja*nbat->xstride;
-                                jfs              = ja*nbat->fstride;
-                                jx               = x[js+0];
-                                jy               = x[js+1];
-                                jz               = x[js+2];
-                                dx               = ix - jx;
-                                dy               = iy - jy;
-                                dz               = iz - jz;
-                                rsq              = dx*dx + dy*dy + dz*dz;
+                                constexpr int clusterPerSplit =
+                                        c_nbnxnGpuClusterSize / c_nbnxnGpuClusterpairSplit;
+                                int_bit = static_cast<real>(
+                                        (excl[jc / clusterPerSplit]->pair[(jc & (clusterPerSplit - 1)) * c_clSize + ic]
+                                         >> (jm * c_numClPerSupercl + im))
+                                        & 1);
+
+                                js  = ja * nbat->xstride;
+                                jfs = ja * nbat->fstride;
+                                jx  = x[js + 0];
+                                jy  = x[js + 1];
+                                jz  = x[js + 2];
+                                dx  = ix - jx;
+                                dy  = iy - jy;
+                                dz  = iz - jz;
+                                rsq = dx * dx + dy * dy + dz * dz;
                                 if (rsq < rlist2)
                                 {
                                     within_rlist = TRUE;
@@ -245,90 +248,94 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu     *nbl,
                                     continue;
                                 }
 
-                                if (type[ia] != ntype-1 && type[ja] != ntype-1)
+                                if (type[ia] != ntype - 1 && type[ja] != ntype - 1)
                                 {
                                     npair++;
                                 }
 
                                 // Ensure distance do not become so small that r^-12 overflows
-                                rsq              = std::max(rsq, NBNXN_MIN_RSQ);
+                                rsq = std::max(rsq, NBNXN_MIN_RSQ);
 
-                                rinv             = gmx::invsqrt(rsq);
-                                rinvsq           = rinv*rinv;
+                                rinv   = gmx::invsqrt(rsq);
+                                rinvsq = rinv * rinv;
 
-                                qq               = iq*x[js+3];
+                                qq = iq * x[js + 3];
                                 if (!bEwald)
                                 {
                                     /* Reaction-field */
-                                    krsq  = iconst->k_rf*rsq;
-                                    fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq;
+                                    krsq  = iconst->k_rf * rsq;
+                                    fscal = qq * (int_bit * rinv - 2 * krsq) * rinvsq;
                                     if (stepWork.computeEnergy)
                                     {
-                                        vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf);
+                                        vcoul = qq * (int_bit * rinv + krsq - iconst->c_rf);
                                     }
                                 }
                                 else
                                 {
-                                    r     = rsq*rinv;
-                                    rt    = r*iconst->coulombEwaldTables->scale;
-                                    n0    = static_cast<int>(rt);
-                                    eps   = rt - static_cast<real>(n0);
+                                    r   = rsq * rinv;
+                                    rt  = r * iconst->coulombEwaldTables->scale;
+                                    n0  = static_cast<int>(rt);
+                                    eps = rt - static_cast<real>(n0);
 
-                                    fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1];
+                                    fexcl = (1 - eps) * Ftab[n0] + eps * Ftab[n0 + 1];
 
-                                    fscal = qq*(int_bit*rinvsq - fexcl)*rinv;
+                                    fscal = qq * (int_bit * rinvsq - fexcl) * rinv;
 
                                     if (stepWork.computeEnergy)
                                     {
-                                        vcoul = qq*((int_bit - std::erf(iconst->ewaldcoeff_q*r))*rinv - int_bit*iconst->sh_ewald);
+                                        vcoul = qq
+                                                * ((int_bit - std::erf(iconst->ewaldcoeff_q * r)) * rinv
+                                                   - int_bit * iconst->sh_ewald);
                                     }
                                 }
 
                                 if (rsq < rvdw2)
                                 {
-                                    tj        = nti + 2*type[ja];
+                                    tj = nti + 2 * type[ja];
 
                                     /* Vanilla Lennard-Jones cutoff */
-                                    c6        = vdwparam[tj];
-                                    c12       = vdwparam[tj+1];
+                                    c6  = vdwparam[tj];
+                                    c12 = vdwparam[tj + 1];
 
-                                    rinvsix   = int_bit*rinvsq*rinvsq*rinvsq;
-                                    Vvdw_disp = c6*rinvsix;
-                                    Vvdw_rep  = c12*rinvsix*rinvsix;
-                                    fscal    += (Vvdw_rep - Vvdw_disp)*rinvsq;
+                                    rinvsix   = int_bit * rinvsq * rinvsq * rinvsq;
+                                    Vvdw_disp = c6 * rinvsix;
+                                    Vvdw_rep  = c12 * rinvsix * rinvsix;
+                                    fscal += (Vvdw_rep - Vvdw_disp) * rinvsq;
 
                                     if (stepWork.computeEnergy)
                                     {
-                                        vctot   += vcoul;
+                                        vctot += vcoul;
 
                                         Vvdwtot +=
-                                            (Vvdw_rep + int_bit*c12*iconst->repulsion_shift.cpot)/12 -
-                                            (Vvdw_disp + int_bit*c6*iconst->dispersion_shift.cpot)/6;
+                                                (Vvdw_rep + int_bit * c12 * iconst->repulsion_shift.cpot) / 12
+                                                - (Vvdw_disp
+                                                   + int_bit * c6 * iconst->dispersion_shift.cpot)
+                                                          / 6;
                                     }
                                 }
 
-                                tx        = fscal*dx;
-                                ty        = fscal*dy;
-                                tz        = fscal*dz;
-                                fix       = fix + tx;
-                                fiy       = fiy + ty;
-                                fiz       = fiz + tz;
-                                f[jfs+0] -= tx;
-                                f[jfs+1] -= ty;
-                                f[jfs+2] -= tz;
+                                tx  = fscal * dx;
+                                ty  = fscal * dy;
+                                tz  = fscal * dz;
+                                fix = fix + tx;
+                                fiy = fiy + ty;
+                                fiz = fiz + tz;
+                                f[jfs + 0] -= tx;
+                                f[jfs + 1] -= ty;
+                                f[jfs + 2] -= tz;
                             }
 
-                            f[ifs+0]        += fix;
-                            f[ifs+1]        += fiy;
-                            f[ifs+2]        += fiz;
-                            fshift[ish3]     = fshift[ish3]   + fix;
-                            fshift[ish3+1]   = fshift[ish3+1] + fiy;
-                            fshift[ish3+2]   = fshift[ish3+2] + fiz;
+                            f[ifs + 0] += fix;
+                            f[ifs + 1] += fiy;
+                            f[ifs + 2] += fiz;
+                            fshift[ish3]     = fshift[ish3] + fix;
+                            fshift[ish3 + 1] = fshift[ish3 + 1] + fiy;
+                            fshift[ish3 + 2] = fshift[ish3 + 2] + fiz;
 
                             /* Count in half work-units.
                              * In CUDA one work-unit is 2 warps.
                              */
-                            if ((ic+1) % (c_clSize/c_nbnxnGpuClusterpairSplit) == 0)
+                            if ((ic + 1) % (c_clSize / c_nbnxnGpuClusterpairSplit) == 0)
                             {
                                 npair_tot += npair;
 
@@ -349,24 +356,22 @@ nbnxn_kernel_gpu_ref(const NbnxnPairlistGpu     *nbl,
 
         if (stepWork.computeEnergy)
         {
-            ggid             = 0;
-            Vc[ggid]         = Vc[ggid]   + vctot;
-            Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
+            ggid       = 0;
+            Vc[ggid]   = Vc[ggid] + vctot;
+            Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
         }
     }
 
     if (debug)
     {
         fprintf(debug, "number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
-                nbl->na_ci, nbl->na_ci,
-                nhwu, nhwu_pruned, nhwu_pruned/static_cast<double>(nhwu));
+                nbl->na_ci, nbl->na_ci, nhwu, nhwu_pruned, nhwu_pruned / static_cast<double>(nhwu));
         fprintf(debug, "generic kernel pair interactions:            %d\n",
-                nhwu*nbl->na_ci/2*nbl->na_ci);
+                nhwu * nbl->na_ci / 2 * nbl->na_ci);
         fprintf(debug, "generic kernel post-prune pair interactions: %d\n",
-                nhwu_pruned*nbl->na_ci/2*nbl->na_ci);
-        fprintf(debug, "generic kernel non-zero pair interactions:   %d\n",
-                npair_tot);
+                nhwu_pruned * nbl->na_ci / 2 * nbl->na_ci);
+        fprintf(debug, "generic kernel non-zero pair interactions:   %d\n", npair_tot);
         fprintf(debug, "ratio non-zero/post-prune pair interactions: %4.2f\n",
-                npair_tot/static_cast<double>(nhwu_pruned*gmx::exactDiv(nbl->na_ci, 2)*nbl->na_ci));
+                npair_tot / static_cast<double>(nhwu_pruned * gmx::exactDiv(nbl->na_ci, 2) * nbl->na_ci));
     }
 }