LJ combination rule kernels for CUDA

author Berk Hess <hess@kth.se>

Tue, 23 Feb 2016 18:51:48 +0000 (19:51 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Tue, 22 Mar 2016 00:54:33 +0000 (01:54 +0100)
author Berk Hess <hess@kth.se>
Tue, 23 Feb 2016 18:51:48 +0000 (19:51 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Tue, 22 Mar 2016 00:54:33 +0000 (01:54 +0100)
diff --git a/docs/user-guide/environment-variables.rst b/docs/user-guide/environment-variables.rst

index 1489f55e1d9bc070af7f29f8db0300417ea11d67..f1bf98fe306e43d62338320a6b1c4a313322be5b 100644 (file)
--- a/docs/user-guide/environment-variables.rst
+++ b/docs/user-guide/environment-variables.rst
@@ -257,6 +257,10 @@ Performance and Run Control
          used in initializing domain decomposition communicators. Rank reordering
          is default, but can be switched off with this environment variable.
  
+``GMX_NO_LJ_COMB_RULE``
+        force the use of LJ parameter lookup instead of using combination rules
+        in the non-bonded kernels.
+
  ``GMX_NO_CUDA_STREAMSYNC``
          the opposite of ``GMX_CUDA_STREAMSYNC``. Disables the use of the
          standard cudaStreamSynchronize-based GPU waiting to improve performance when using CUDA driver API
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index c7aa7c68624ec1e34f0b1bbd83cae2cd80665e55..56ae3bf98d8a082aa3573f5429053c5d3ce65665 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -2145,7 +2145,10 @@ static void init_nb_verlet(FILE                *fp,
  
              bSimpleList = nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type);
  
-            if (bSimpleList && (fr->vdwtype == evdwCUT && (fr->vdw_modifier == eintmodNONE || fr->vdw_modifier == eintmodPOTSHIFT)))
+            if (fr->vdwtype == evdwCUT &&
+                (fr->vdw_modifier == eintmodNONE ||
+                 fr->vdw_modifier == eintmodPOTSHIFT) &&
+                getenv("GMX_NO_LJ_COMB_RULE") == NULL)
              {
                  /* Plain LJ cut-off: we can optimize with combination rules */
                  enbnxninitcombrule = enbnxninitcombruleDETECT;
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.cpp b/src/gromacs/mdlib/nbnxn_atomdata.cpp

index c8061f0259fc8f7c0620ed150e9ccfcf31d2de70..562acc4233b2db40a90769360e56717c35998b87 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_atomdata.cpp
+++ b/src/gromacs/mdlib/nbnxn_atomdata.cpp
@@ -903,6 +903,13 @@ static void nbnxn_atomdata_set_ljcombparams(nbnxn_atomdata_t    *nbat,
                                                        ncz*grid->na_sc,
                                                        nbat->lj_comb + ash*2);
                  }
+                else if (nbat->XFormat == nbatXYZQ)
+                {
+                    copy_lj_to_nbat_lj_comb<1>(nbat->nbfp_comb,
+                                               nbat->type + ash,
+                                               ncz*grid->na_sc,
+                                               nbat->lj_comb + ash*2);
+                }
              }
          }
      }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index 5b01a74ec8c1151511ebdc29a1e0bba203117be3..eb3094f75aace28d5f67035bb3d6fac3fca50ad3 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -175,45 +175,45 @@ static inline int calc_nb_kernel_nblock(int nwork_units, gmx_device_info_t *dinf
  /*! Force-only kernel function pointers. */
  static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] =
  {
-    { nbnxn_kernel_ElecCut_VdwLJ_F_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda            },
-    { nbnxn_kernel_ElecRF_VdwLJ_F_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda             },
-    { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda        },
-    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda },
-    { nbnxn_kernel_ElecEw_VdwLJ_F_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda             },
-    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda      }
+    { nbnxn_kernel_ElecCut_VdwLJ_F_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_F_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_F_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda            },
+    { nbnxn_kernel_ElecRF_VdwLJ_F_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_F_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_F_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda             },
+    { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda        },
+    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda },
+    { nbnxn_kernel_ElecEw_VdwLJ_F_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_F_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_F_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda             },
+    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda      }
  };
  
  /*! Force + energy kernel function pointers. */
  static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] =
  {
-    { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda              },
-    { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda               },
-    { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda          },
-    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda     },
-    { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda               },
-    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda        }
+    { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda            },
+    { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda             },
+    { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda        },
+    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda },
+    { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda             },
+    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda      }
  };
  
  /*! Force + pruning kernel function pointers. */
  static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] =
  {
-    { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda            },
-    { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda,              nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda             },
-    { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda,         nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda        },
-    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda,  nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda },
-    { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda,              nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda             },
-    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda,       nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda      }
+    { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda             },
+    { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda              },
+    { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda         },
+    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda  },
+    { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda              },
+    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda       }
  };
  
  /*! Force + energy + pruning kernel function pointers. */
  static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] =
  {
-    { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda            },
-    { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda             },
-    { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda        },
-    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda },
-    { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda             },
-    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda      }
+    { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda,            nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda            },
+    { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda             },
+    { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda,        nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda        },
+    { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda },
+    { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda,             nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda             },
+    { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda,      nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda      }
  };
  
  /*! Return a pointer to the kernel version to be executed at the current step. */
@@ -261,7 +261,7 @@ static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int
  }
  
  /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
-static inline int calc_shmem_required(const int num_threads_z, gmx_device_info_t gmx_unused *dinfo)
+static inline int calc_shmem_required(const int num_threads_z, gmx_device_info_t gmx_unused *dinfo, const cu_nbparam_t *nbp)
  {
      int shmem;
  
@@ -275,8 +275,17 @@ static inline int calc_shmem_required(const int num_threads_z, gmx_device_info_t
      shmem += num_threads_z * 2 * c_nbnxnGpuJgroupSize * sizeof(int);
      if (dinfo->prop.major >= 3)
      {
-        /* i-atom types in shared memory */
-        shmem += c_numClPerSupercl * c_clSize * sizeof(int);
+        if (nbp->vdwtype == evdwCuCUTCOMBGEOM ||
+            nbp->vdwtype == evdwCuCUTCOMBLB)
+        {
+            /* i-atom LJ combination parameters in shared memory */
+            shmem += c_numClPerSupercl * c_clSize * sizeof(float2);
+        }
+        else
+        {
+            /* i-atom types in shared memory */
+            shmem += c_numClPerSupercl * c_clSize * sizeof(int);
+        }
      }
      if (dinfo->prop.major < 3)
      {
@@ -421,7 +430,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
      nblock    = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
      dim_block = dim3(c_clSize, c_clSize, num_threads_z);
      dim_grid  = dim3(nblock, 1, 1);
-    shmem     = calc_shmem_required(num_threads_z, nb->dev_info);
+    shmem     = calc_shmem_required(num_threads_z, nb->dev_info, nbp);
  
      if (debug)
      {
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index aa1d33c70aa19d0430d85671b52de70402b691ea..3f94bd35ff96c270e61305b42eedc08682a9c9a7 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -276,13 +276,36 @@ static void init_nbparam(cu_nbparam_t              *nbp,
  
      set_cutoff_parameters(nbp, ic);
  
+    /* The kernel code supports LJ combination rules (geometric and LB) for
+     * all kernel types, but we only generate useful combination rule kernels.
+     * We currently only use LJ combination rule (geometric and LB) kernels
+     * for plain cut-off LJ. On Maxwell the force only kernels speed up 15%
+     * with PME and 20% with RF, the other kernels speed up about half as much.
+     * For LJ force-switch the geometric rule would give 7% speed-up, but this
+     * combination is rarely used. LJ force-switch with LB rule is more common,
+     * but gives only 1% speed-up.
+     */
      if (ic->vdwtype == evdwCUT)
      {
          switch (ic->vdw_modifier)
          {
              case eintmodNONE:
              case eintmodPOTSHIFT:
-                nbp->vdwtype = evdwCuCUT;
+                switch (nbat->comb_rule)
+                {
+                    case ljcrNONE:
+                        nbp->vdwtype = evdwCuCUT;
+                        break;
+                    case ljcrGEOM:
+                        nbp->vdwtype = evdwCuCUTCOMBGEOM;
+                        break;
+                    case ljcrLB:
+                        nbp->vdwtype = evdwCuCUTCOMBLB;
+                        break;
+                    default:
+                        gmx_incons("The requested LJ combination rule is not implemented in the CUDA GPU accelerated kernels!");
+                        break;
+                }
                  break;
              case eintmodFORCESWITCH:
                  nbp->vdwtype = evdwCuFSWITCH;
@@ -735,6 +758,9 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
      cu_atomdata_t *d_atdat   = nb->atdat;
      cudaStream_t   ls        = nb->stream[eintLocal];
  
+    bool           bUseLjCombination = (nb->nbparam->vdwtype == evdwCuCUTCOMBGEOM ||
+                                        nb->nbparam->vdwtype == evdwCuCUTCOMBLB);
+
      natoms    = nbat->natoms;
      realloced = false;
  
@@ -757,15 +783,23 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
              cu_free_buffered(d_atdat->f, &d_atdat->natoms, &d_atdat->nalloc);
              cu_free_buffered(d_atdat->xq);
              cu_free_buffered(d_atdat->atom_types);
+            cu_free_buffered(d_atdat->lj_comb);
          }
  
          stat = cudaMalloc((void **)&d_atdat->f, nalloc*sizeof(*d_atdat->f));
          CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->f");
          stat = cudaMalloc((void **)&d_atdat->xq, nalloc*sizeof(*d_atdat->xq));
          CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->xq");
-
-        stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
-        CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
+        if (bUseLjCombination)
+        {
+            stat = cudaMalloc((void **)&d_atdat->lj_comb, nalloc*sizeof(*d_atdat->lj_comb));
+            CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->lj_comb");
+        }
+        else
+        {
+            stat = cudaMalloc((void **)&d_atdat->atom_types, nalloc*sizeof(*d_atdat->atom_types));
+            CU_RET_ERR(stat, "cudaMalloc failed on d_atdat->atom_types");
+        }
  
          d_atdat->nalloc = nalloc;
          realloced       = true;
@@ -780,8 +814,16 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
          nbnxn_cuda_clear_f(nb, nalloc);
      }
  
-    cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
-                      natoms*sizeof(*d_atdat->atom_types), ls);
+    if (bUseLjCombination)
+    {
+        cu_copy_H2D_async(d_atdat->lj_comb, nbat->lj_comb,
+                          natoms*sizeof(*d_atdat->lj_comb), ls);
+    }
+    else
+    {
+        cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
+                          natoms*sizeof(*d_atdat->atom_types), ls);
+    }
  
      if (bDoTime)
      {
@@ -924,6 +966,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
      cu_free_buffered(atdat->f, &atdat->natoms, &atdat->nalloc);
      cu_free_buffered(atdat->xq);
      cu_free_buffered(atdat->atom_types, &atdat->ntypes);
+    cu_free_buffered(atdat->lj_comb);
  
      cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
      cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index 3d3183cc784fa617dc2222420c631e8fd73bceea..42bf5b15b64806b893c30b895c2d4b802842aaf7 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -77,6 +77,9 @@
  #define LJ_EWALD
  #endif
  
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+#define LJ_COMB
+#endif
  
  /*
     Kernel launch parameters:
@@ -158,8 +161,13 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  #endif
      nbnxn_cj4_t        *pl_cj4      = plist.cj4;
      const nbnxn_excl_t *excl        = plist.excl;
+#ifndef LJ_COMB
      const int          *atom_types  = atdat.atom_types;
      int                 ntypes      = atdat.ntypes;
+#else
+    const float2       *lj_comb     = atdat.lj_comb;
+    float2              ljcp_i, ljcp_j;
+#endif
      const float4       *xq          = atdat.xq;
      float3             *f           = atdat.f;
      const float3       *shift_vec   = atdat.shift_vec;
@@ -210,13 +218,20 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  
      int          sci, ci, cj,
                   ai, aj,
-                 cij4_start, cij4_end,
-                 typei, typej,
-                 i, jm, j4, wexcl_idx;
+                 cij4_start, cij4_end;
+#ifndef LJ_COMB
+    int          typei, typej;
+#endif
+    int          i, jm, j4, wexcl_idx;
      float        qi, qj_f,
-                 r2, inv_r, inv_r2, inv_r6,
-                 c6, c12,
-                 int_bit,
+                 r2, inv_r, inv_r2;
+#if !defined LJ_COMB_LB || defined CALC_ENERGIES
+    float        inv_r6, c6, c12;
+#endif
+#ifdef LJ_COMB_LB
+    float        sigma, epsilon;
+#endif
+    float        int_bit,
                   F_invr;
  #ifdef CALC_ENERGIES
      float        E_lj, E_el;
@@ -235,10 +250,16 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  
      /* shmem buffer for i x+q pre-loading */
      extern __shared__  float4 xqib[];
+
      /* shmem buffer for cj, for each warp separately */
-    int *cjs    = ((int *)(xqib + c_numClPerSupercl * c_clSize)) + tidxz * 2 * c_nbnxnGpuJgroupSize;
+    int *cjs       = ((int *)(xqib + c_numClPerSupercl * c_clSize)) + tidxz * 2 * c_nbnxnGpuJgroupSize;
+#ifndef LJ_COMB
      /* shmem buffer for i atom-type pre-loading */
-    int *atib   = ((int *)(xqib + c_numClPerSupercl * c_clSize)) + NTHREAD_Z * 2 * c_nbnxnGpuJgroupSize;
+    int *atib      = ((int *)(xqib + c_numClPerSupercl * c_clSize)) + NTHREAD_Z * 2 * c_nbnxnGpuJgroupSize;
+#else
+    /* shmem buffer for i-atom LJ combination rule parameters */
+    float2 *ljcpib = ((float2 *)(xqib + c_numClPerSupercl * c_clSize)) + NTHREAD_Z * 2 * c_nbnxnGpuJgroupSize;
+#endif
  
      nb_sci      = pl_sci[bidx];         /* my i super-cluster's index = current bidx */
      sci         = nb_sci.sci;           /* super-cluster */
@@ -255,8 +276,13 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
          xqbuf.w *= nbparam.epsfac;
          xqib[tidxj * c_clSize + tidxi] = xqbuf;
  
+#ifndef LJ_COMB
          /* Pre-load the i-atom types into shared memory */
          atib[tidxj * c_clSize + tidxi] = atom_types[ai];
+#else
+        /* Pre-load the LJ combination parameters into shared memory */
+        ljcpib[tidxj * c_clSize + tidxi] = lj_comb[ai];
+#endif
      }
      __syncthreads();
  
@@ -346,7 +372,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                      xqbuf   = xq[aj];
                      xj      = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
                      qj_f    = xqbuf.w;
+#ifndef LJ_COMB
                      typej   = atom_types[aj];
+#else
+                    ljcp_j  = lj_comb[aj];
+#endif
  
                      fcj_buf = make_float3(0.0f);
  
@@ -389,17 +419,33 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                              {
                                  /* load the rest of the i-atom parameters */
                                  qi      = xqbuf.w;
-                                typei   = atib[i * c_clSize + tidxi];
  
+#ifndef LJ_COMB
                                  /* LJ 6*C6 and 12*C12 */
+                                typei   = atib[i * c_clSize + tidxi];
                                  c6      = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej));
                                  c12     = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej) + 1);
+#else
+                                ljcp_i  = ljcpib[i * c_clSize + tidxi];
+#ifdef LJ_COMB_GEOM
+                                c6      = ljcp_i.x * ljcp_j.x;
+                                c12     = ljcp_i.y * ljcp_j.y;
+#else
+                                /* LJ 2^(1/6)*sigma and 12*epsilon */
+                                sigma   = ljcp_i.x + ljcp_j.x;
+                                epsilon = ljcp_i.y * ljcp_j.y;
+#if defined CALC_ENERGIES || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+                                convert_sigma_epsilon_to_c6_c12(sigma, epsilon, &c6, &c12);
+#endif
+#endif                          /* LJ_COMB_GEOM */
+#endif                          /* LJ_COMB */
  
                                  /* avoid NaN for excluded pairs at r=0 */
                                  r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
  
                                  inv_r   = rsqrt(r2);
                                  inv_r2  = inv_r * inv_r;
+#if !defined LJ_COMB_LB || defined CALC_ENERGIES
                                  inv_r6  = inv_r2 * inv_r2 * inv_r2;
  #ifdef EXCLUSION_FORCES
                                  /* We could mask inv_r2, but with Ewald
@@ -412,6 +458,16 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                                  E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam.repulsion_shift.cpot)*c_oneTwelveth -
                                                       c6 * (inv_r6 + nbparam.dispersion_shift.cpot)*c_oneSixth);
  #endif
+#else                           /* !LJ_COMB_LB || CALC_ENERGIES */
+                                float sig_r  = sigma*inv_r;
+                                float sig_r2 = sig_r*sig_r;
+                                float sig_r6 = sig_r2*sig_r2*sig_r2;
+#ifdef EXCLUSION_FORCES
+                                sig_r6 *= int_bit;
+#endif                          /* EXCLUSION_FORCES */
+
+                                F_invr  = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
+#endif                          /* !LJ_COMB_LB || CALC_ENERGIES */
  
  #ifdef LJ_FORCE_SWITCH
  #ifdef CALC_ENERGIES
@@ -556,3 +612,5 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  #undef EL_EWALD_ANY
  #undef EXCLUSION_FORCES
  #undef LJ_EWALD
+
+#undef LJ_COMB
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh

index 04213482c381467272c3e9213576fc80eae46d71..4078d100f3fd4e59d6802d2b7662a9efae0ffcee 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh
@@ -77,6 +77,9 @@
  #define LJ_EWALD
  #endif
  
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+#define LJ_COMB
+#endif
  
  /*
     Kernel launch parameters:
@@ -123,8 +126,13 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  #endif
      nbnxn_cj4_t        *pl_cj4      = plist.cj4;
      const nbnxn_excl_t *excl        = plist.excl;
+#ifndef LJ_COMB
      const int          *atom_types  = atdat.atom_types;
      int                 ntypes      = atdat.ntypes;
+#else
+    const float2       *lj_comb     = atdat.lj_comb;
+    float2              ljcp_i, ljcp_j;
+#endif
      const float4       *xq          = atdat.xq;
      float3             *f           = atdat.f;
      const float3       *shift_vec   = atdat.shift_vec;
@@ -170,13 +178,20 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  
      int          sci, ci, cj,
                   ai, aj,
-                 cij4_start, cij4_end,
-                 typei, typej,
-                 i, jm, j4, wexcl_idx;
+                 cij4_start, cij4_end;
+#ifndef LJ_COMB
+    int          typei, typej;
+#endif
+    int          i, jm, j4, wexcl_idx;
      float        qi, qj_f,
-                 r2, inv_r, inv_r2, inv_r6,
-                 c6, c12,
-                 int_bit,
+                 r2, inv_r, inv_r2;
+#if !defined LJ_COMB_LB || defined CALC_ENERGIES
+    float        inv_r6, c6, c12;
+#endif
+#ifdef LJ_COMB_LB
+    float        sigma, epsilon;
+#endif
+    float        int_bit,
                   F_invr;
  #ifdef CALC_ENERGIES
      float        E_lj, E_el;
@@ -304,7 +319,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                      xqbuf   = xq[aj];
                      xj      = make_float3(xqbuf.x, xqbuf.y, xqbuf.z);
                      qj_f    = xqbuf.w;
+#ifndef LJ_COMB
                      typej   = atom_types[aj];
+#else
+                    ljcp_j  = lj_comb[aj];
+#endif
  
                      fcj_buf = make_float3(0.0f);
  
@@ -348,17 +367,33 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                              {
                                  /* load the rest of the i-atom parameters */
                                  qi      = xqbuf.w;
-                                typei   = atom_types[ai];
  
+#ifndef LJ_COMB
                                  /* LJ 6*C6 and 12*C12 */
+                                typei   = atom_types[ai];
                                  c6      = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej));
                                  c12     = tex1Dfetch(nbfp_texref, 2 * (ntypes * typei + typej) + 1);
+#else
+                                ljcp_i  = lj_comb[ai];
+#ifdef LJ_COMB_GEOM
+                                c6      = ljcp_i.x * ljcp_j.x;
+                                c12     = ljcp_i.y * ljcp_j.y;
+#else
+                                /* LJ 2^(1/6)*sigma and 12*epsilon */
+                                sigma   = ljcp_i.x + ljcp_j.x;
+                                epsilon = ljcp_i.y * ljcp_j.y;
+#if defined CALC_ENERGIES || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+                                convert_sigma_epsilon_to_c6_c12(sigma, epsilon, &c6, &c12);
+#endif
+#endif                          /* LJ_COMB_GEOM */
+#endif                          /* LJ_COMB */
  
                                  /* avoid NaN for excluded pairs at r=0 */
                                  r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
  
                                  inv_r   = rsqrt(r2);
                                  inv_r2  = inv_r * inv_r;
+#if !defined LJ_COMB_LB || defined CALC_ENERGIES
                                  inv_r6  = inv_r2 * inv_r2 * inv_r2;
  #ifdef EXCLUSION_FORCES
                                  /* We could mask inv_r2, but with Ewald
@@ -371,6 +406,16 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                                  E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam.repulsion_shift.cpot)*c_oneTwelveth -
                                                       c6 * (inv_r6 + nbparam.dispersion_shift.cpot)*c_oneSixth);
  #endif
+#else                           /* !LJ_COMB_LB || CALC_ENERGIES */
+                                float sig_r  = sigma*inv_r;
+                                float sig_r2 = sig_r*sig_r;
+                                float sig_r6 = sig_r2*sig_r2*sig_r2;
+#ifdef EXCLUSION_FORCES
+                                sig_r6 *= int_bit;
+#endif                          /* EXCLUSION_FORCES */
+
+                                F_invr  = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
+#endif                          /* !LJ_COMB_LB || CALC_ENERGIES */
  
  #ifdef LJ_FORCE_SWITCH
  #ifdef CALC_ENERGIES
@@ -525,3 +570,5 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  #undef EL_EWALD_ANY
  #undef EXCLUSION_FORCES
  #undef LJ_EWALD
+
+#undef LJ_COMB
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

index c130845fcb2d8cfcf222d9a61fb0543eba42f508..07979c1248f56e7bac19ac2a134741703394faf1 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -91,6 +91,21 @@ extern texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
  extern texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
  
+/*! Convert LJ sigma,epsilon parameters to C6,C12: writes *c6 = epsilon*sigma^6 and *c12 = epsilon*sigma^12 through the output pointers. */
+static __forceinline__ __device__
+void convert_sigma_epsilon_to_c6_c12(const float  sigma,
+                                     const float  epsilon,
+                                     float       *c6,
+                                     float       *c12)
+{
+    float sigma2, sigma6;
+
+    sigma2 = sigma * sigma;
+    sigma6 = sigma2 *sigma2 * sigma2; /* sigma^6 built from sigma^2 */
+    *c6    = epsilon * sigma6;
+    *c12   = *c6 * sigma6;            /* = epsilon * sigma^12, reusing the c6 product */
+}
+
  /*! Apply force switch,  force + energy version. */
  static __forceinline__ __device__
  void calculate_force_switch_F(const  cu_nbparam_t nbparam,
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh

index ab179e65c3c2e5e9e99228101338b62b011a0e61..e56ce654115f66ce9325ae877c78d9737865b8e3 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@ -66,6 +66,18 @@
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJ ## __VA_ARGS__
  #include FLAVOR_LEVEL_GENERATOR
  #undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w geometric combination rules */
+#define LJ_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJCombGeom ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w LB combination rules */
+#define LJ_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJCombLB ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombGeom ## __VA_ARGS__
@@ -102,6 +114,18 @@
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJ ## __VA_ARGS__
  #include FLAVOR_LEVEL_GENERATOR
  #undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w geometric combination rules */
+#define LJ_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJCombGeom ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w LB combination rules */
+#define LJ_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJCombLB ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombGeom ## __VA_ARGS__
@@ -138,6 +162,18 @@
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJ ## __VA_ARGS__
  #include FLAVOR_LEVEL_GENERATOR
  #undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w geometric combination rules */
+#define LJ_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJCombGeom ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w LB combination rules */
+#define LJ_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJCombLB ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombGeom ## __VA_ARGS__
@@ -175,6 +211,18 @@
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__
  #include FLAVOR_LEVEL_GENERATOR
  #undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w geometric combination rules */
+#define LJ_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJCombGeom ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w LB combination rules */
+#define LJ_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJCombLB ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
@@ -211,6 +259,18 @@
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJ ## __VA_ARGS__
  #include FLAVOR_LEVEL_GENERATOR
  #undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w geometric combination rules */
+#define LJ_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJCombGeom ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w LB combination rules */
+#define LJ_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJCombLB ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombGeom ## __VA_ARGS__
@@ -247,6 +307,18 @@
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__
  #include FLAVOR_LEVEL_GENERATOR
  #undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w geometric combination rules */
+#define LJ_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJCombGeom ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* cut-off + V shift LJ w LB combination rules */
+#define LJ_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJCombLB ## __VA_ARGS__
+#include FLAVOR_LEVEL_GENERATOR
+#undef LJ_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
  /* LJ-Ewald w geometric combination rules */
  #define LJ_EWALD_COMB_GEOM
  #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h

index 8de9305aa690aa60d29727b3bf2bda2d6da16934..8519b5598ea6cb39b35b45580e8004e288f1cc96 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -92,7 +92,7 @@ enum eelCu {
   * should match the order of enumerated types below.
   */
  enum evdwCu {
-    evdwCuCUT, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuEWALDGEOM, evdwCuEWALDLB, evdwCuNR
+    evdwCuCUT, evdwCuCUTCOMBGEOM, evdwCuCUTCOMBLB, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuEWALDGEOM, evdwCuEWALDLB, evdwCuNR
  };
  
  /* All structs prefixed with "cu_" hold data used in GPU calculations and
@@ -138,6 +138,7 @@ struct cu_atomdata
  
      int      ntypes;            /**< number of atom types                         */
      int     *atom_types;        /**< atom type indices, size natoms               */
+    float2  *lj_comb;           /**< per-atom LJ comb. rule params, size natoms: sqrt(c6),sqrt(c12) (geometric) or sigma,epsilon contributions (LB) */
  
      float3  *shift_vec;         /**< shifts                                       */
      bool     bShiftVecUploaded; /**< true if the shift vector has been uploaded   */
author	Berk Hess <hess@kth.se>
	Tue, 23 Feb 2016 18:51:48 +0000 (19:51 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Tue, 22 Mar 2016 00:54:33 +0000 (01:54 +0100)
docs/user-guide/environment-variables.rst		patch \| blob \| history
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_atomdata.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h		patch \| blob \| history