Added CUDA LJ-PME nbnxn kernels

[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_atomdata.c
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c

index ba0b9a5bbcb3aa426ce67924d8431fcc7a68a6a5..d780c21331237dec7333243a5b3bbec39f4a7b9e 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_atomdata.c
+++ b/src/gromacs/mdlib/nbnxn_atomdata.c
@@ -360,32 +360,39 @@ void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
      }
  }
  
-/* Stores the LJ parameter data in a format convenient for the SIMD kernels */
-static void set_ljparam_simd_data(nbnxn_atomdata_t *nbat)
+/* Stores the LJ parameter data in a format convenient for different kernels */
+static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD)
  {
      int  nt, i, j;
      real c6, c12;
  
      nt = nbat->ntype;
  
-    /* nbfp_s4 stores two parameters using a stride of 4,
-     * because this would suit x86 SIMD single-precision
-     * quad-load intrinsics. There's a slight inefficiency in
-     * allocating and initializing nbfp_s4 when it might not
-     * be used, but introducing the conditional code is not
-     * really worth it. */
-    nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4));
-    for (i = 0; i < nt; i++)
+    if (bSIMD)
      {
-        for (j = 0; j < nt; j++)
+        /* nbfp_s4 stores two parameters using a stride of 4,
+         * because this would suit x86 SIMD single-precision
+         * quad-load intrinsics. The two trailing entries of each
+         * group of 4 are padding and are set to zero. This table
+         * is only allocated and filled when SIMD kernels are
+         * used (bSIMD is set). */
+        nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4));
+        for (i = 0; i < nt; i++)
          {
-            nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0];
-            nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1];
-            nbat->nbfp_s4[(i*nt+j)*4+2] = 0;
-            nbat->nbfp_s4[(i*nt+j)*4+3] = 0;
+            for (j = 0; j < nt; j++)
+            {
+                nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0];
+                nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1];
+                nbat->nbfp_s4[(i*nt+j)*4+2] = 0;
+                nbat->nbfp_s4[(i*nt+j)*4+3] = 0;
+            }
          }
      }
  
+    /* We use combination rule data for SIMD combination rule kernels
+     * and with LJ-PME kernels. We then only need parameters per atom type,
+     * not per pair of atom types.
+     */
      switch (nbat->comb_rule)
      {
          case ljcrGEOM:
@@ -393,7 +400,7 @@ static void set_ljparam_simd_data(nbnxn_atomdata_t *nbat)
  
              for (i = 0; i < nt; i++)
              {
-                /* Copy the diagonal from the nbfp matrix */
+                /* Store the sqrt of the diagonal from the nbfp matrix */
                  nbat->nbfp_comb[i*2  ] = sqrt(nbat->nbfp[(i*nt+i)*2  ]);
                  nbat->nbfp_comb[i*2+1] = sqrt(nbat->nbfp[(i*nt+i)*2+1]);
              }
@@ -527,7 +534,7 @@ void nbnxn_atomdata_init(FILE *fp,
      int      i, j, nth;
      real     c6, c12, tol;
      char    *ptr;
-    gmx_bool simple, bCombGeom, bCombLB;
+    gmx_bool simple, bCombGeom, bCombLB, bSIMD;
  
      if (alloc == NULL)
      {
@@ -688,10 +695,10 @@ void nbnxn_atomdata_init(FILE *fp,
              gmx_incons("Unknown enbnxninitcombrule");
      }
  
-    if (simple)
-    {
-        set_ljparam_simd_data(nbat);
-    }
+    bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
+             nb_kernel_type == nbnxnk4xN_SIMD_2xNN);
+
+    set_lj_parameter_data(nbat, bSIMD);
  
      nbat->natoms  = 0;
      nbat->type    = NULL;
@@ -700,27 +707,25 @@ void nbnxn_atomdata_init(FILE *fp,
      {
          int pack_x;
  
-        switch (nb_kernel_type)
+        if (bSIMD)
          {
-            case nbnxnk4xN_SIMD_4xN:
-            case nbnxnk4xN_SIMD_2xNN:
-                pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
-                             nbnxn_kernel_to_cj_size(nb_kernel_type));
-                switch (pack_x)
-                {
-                    case 4:
-                        nbat->XFormat = nbatX4;
-                        break;
-                    case 8:
-                        nbat->XFormat = nbatX8;
-                        break;
-                    default:
-                        gmx_incons("Unsupported packing width");
-                }
-                break;
-            default:
-                nbat->XFormat = nbatXYZ;
-                break;
+            pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
+                         nbnxn_kernel_to_cj_size(nb_kernel_type));
+            switch (pack_x)
+            {
+                case 4:
+                    nbat->XFormat = nbatX4;
+                    break;
+                case 8:
+                    nbat->XFormat = nbatX8;
+                    break;
+                default:
+                    gmx_incons("Unsupported packing width");
+            }
+        }
+        else
+        {
+            nbat->XFormat = nbatXYZ;
          }
  
          nbat->FFormat = nbat->XFormat;