Tweak the CUDA GPU shared mem/L1 config
author: Szilard Pall <pall.szilard@gmail.com>
Tue, 30 Jun 2015 22:55:55 +0000 (00:55 +0200)
committer: Gerrit Code Review <gerrit@gerrit.gromacs.org>
Sun, 27 Sep 2015 10:08:21 +0000 (12:08 +0200)
We use 20.5 KB of shared memory on CC >= 3.0, so the current
shared-memory-preferred setup is unnecessary. This change is on its own
cosmetic, but it allows making use of the larger amount of L1 available for
global load caching on hardware that supports it (K40, K80, Tegra K1,
& CC 5.2) by passing the appropriate command line option ("-dlcm=ca").

Change-Id: If7e6b7ecb64d4864ef34d525fc032ec13cb26f03

src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index cedf77200731edac3cc513c2441ef4e2106a0e44..e305e9a3a84cfd3f536378eb1b99b5d77dd003dc 100644 (file)
@@ -788,11 +788,11 @@ void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo)
         {
             if (devinfo->prop.major >= 3)
             {
-                /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
-                cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
-                cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
-                cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
-                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+                /* Default kernel on sm 3.x and later 32/32 kB Shared/L1 */
+                cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferEqual);
+                cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
+                cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferEqual);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferEqual);
             }
             else
             {