src/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 #include <stdlib.h>
  40 #include <assert.h>
  41
  42 #if defined(_MSVC)
  43 #include <limits>
  44 #endif
  45
  46 #include <cuda.h>
  47
  48 #include "types/simple.h"
  49 #include "types/nbnxn_pairlist.h"
  50 #include "types/nb_verlet.h"
  51 #include "types/ishift.h"
  52 #include "types/force_flags.h"
  53 #include "../nbnxn_consts.h"
  54
  55 #ifdef TMPI_ATOMICS
  56 #include "thread_mpi/atomic.h"
  57 #endif
  58
  59 #include "nbnxn_cuda_types.h"
  60 #include "../../gmxlib/cuda_tools/cudautils.cuh"
  61 #include "nbnxn_cuda.h"
  62 #include "nbnxn_cuda_data_mgmt.h"
  63
  64 #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  65 #define USE_TEXOBJ
  66 #endif
  67
  68 /*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
  69 texture<float, 1, cudaReadModeElementType> nbfp_texref;
  70
  71 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  72 texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  73
  74 /* Convenience defines */
  75 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  76 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  77
  78 /***** The kernels come here *****/
  79 #include "nbnxn_cuda_kernel_utils.cuh"
  80
  81 /* Top-level kernel generation: will generate through multiple inclusion the
  82  * following flavors for all kernels:
  83  * - force-only output;
  84  * - force and energy output;
  85  * - force-only with pair list pruning;
  86  * - force and energy output with pair list pruning.
  87  */
  88 /** Force only **/
  89 #include "nbnxn_cuda_kernels.cuh"
  90 /** Force & energy **/
  91 #define CALC_ENERGIES
  92 #include "nbnxn_cuda_kernels.cuh"
  93 #undef CALC_ENERGIES
  94
  95 /*** Pair-list pruning kernels ***/
  96 /** Force only **/
  97 #define PRUNE_NBL
  98 #include "nbnxn_cuda_kernels.cuh"
  99 /** Force & energy **/
 100 #define CALC_ENERGIES
 101 #include "nbnxn_cuda_kernels.cuh"
 102 #undef CALC_ENERGIES
 103 #undef PRUNE_NBL
 104
 105 /*! Nonbonded kernel function pointer type */
 106 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 107                                      const cu_nbparam_t,
 108                                      const cu_plist_t,
 109                                      bool);
 110
 111 /*********************************/
 112
 113 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 114 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 115 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 116 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 117
 118
 119 /* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 120  * and corresponds to having the exponent set to the maximum (127 -- single
 121  * precision) and the mantissa to 0.
 122  */
 123 static unsigned int poll_wait_pattern = (0x7FU << 23);
 124
 125 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 126 static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 127 {
 128     int max_grid_x_size;
 129
 130     assert(dinfo);
 131
 132     max_grid_x_size = dinfo->prop.maxGridSize[0];
 133
 134     /* do we exceed the grid x dimension limit? */
 135     if (nwork_units > max_grid_x_size)
 136     {
 137         gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 138                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 139                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 140     }
 141
 142     return nwork_units;
 143 }
 144
 145
 146 /* Constant arrays listing all kernel function pointers and enabling selection
 147    of a kernel in an elegant manner. */
 148
 149 static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 150 static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 151
 152 /*! Pointers to the default kernels organized in a 3 dim array by:
 153  *  electrostatics type, energy calculation on/off, and pruning on/off.
 154  *
 155  *  Note that the order of electrostatics (1st dimension) has to match the
 156  *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
 157  */
 158 static const nbnxn_cu_kfunc_ptr_t
 159 nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 160 {
 161     { { k_nbnxn_cutoff,                     k_nbnxn_cutoff_prune },
 162       { k_nbnxn_cutoff_ener,                k_nbnxn_cutoff_ener_prune } },
 163     { { k_nbnxn_rf,                         k_nbnxn_rf_prune },
 164       { k_nbnxn_rf_ener,                    k_nbnxn_rf_ener_prune } },
 165     { { k_nbnxn_ewald_tab,                  k_nbnxn_ewald_tab_prune },
 166       { k_nbnxn_ewald_tab_ener,             k_nbnxn_ewald_tab_ener_prune } },
 167     { { k_nbnxn_ewald_tab_twin,             k_nbnxn_ewald_tab_twin_prune },
 168       { k_nbnxn_ewald_tab_twin_ener,        k_nbnxn_ewald_twin_ener_prune } },
 169     { { k_nbnxn_ewald,                      k_nbnxn_ewald_prune },
 170       { k_nbnxn_ewald_ener,                 k_nbnxn_ewald_ener_prune } },
 171     { { k_nbnxn_ewald_twin,                 k_nbnxn_ewald_twin_prune },
 172       { k_nbnxn_ewald_twin_ener,            k_nbnxn_ewald_twin_ener_prune } },
 173 };
 174
 175 /*! Pointers to the legacy kernels organized in a 3 dim array by:
 176  *  electrostatics type, energy calculation on/off, and pruning on/off.
 177  *
 178  *  Note that the order of electrostatics (1st dimension) has to match the
 179  *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
 180  */
 181 static const nbnxn_cu_kfunc_ptr_t
 182 nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 183 {
 184     { { k_nbnxn_cutoff_legacy,              k_nbnxn_cutoff_prune_legacy },
 185       { k_nbnxn_cutoff_ener_legacy,         k_nbnxn_cutoff_ener_prune_legacy } },
 186     { { k_nbnxn_rf_legacy,                  k_nbnxn_rf_prune_legacy },
 187       { k_nbnxn_rf_ener_legacy,             k_nbnxn_rf_ener_prune_legacy } },
 188     { { k_nbnxn_ewald_tab_legacy,           k_nbnxn_ewald_tab_prune_legacy },
 189       { k_nbnxn_ewald_tab_ener_legacy,      k_nbnxn_ewald_tab_ener_prune_legacy } },
 190     { { k_nbnxn_ewald_tab_twin_legacy,      k_nbnxn_ewald_tab_twin_prune_legacy },
 191       { k_nbnxn_ewald_tab_twin_ener_legacy, k_nbnxn_ewald_tab_twin_ener_prune_legacy } },
 192 };
 193
 194 /*! Return a pointer to the kernel version to be executed at the current step. */
 195 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 196                                                        bool bDoEne, bool bDoPrune)
 197 {
 198     assert(kver < eNbnxnCuKNR);
 199     assert(eeltype < eelCuNR);
 200
 201     if (NBNXN_KVER_LEGACY(kver))
 202     {
 203         /* no analytical Ewald with legacy kernels */
 204         assert(eeltype <= eelCuEWALD_TAB_TWIN);
 205
 206         return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 207     }
 208     else
 209     {
 210         return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 211     }
 212 }
 213
 214 /*! Calculates the amount of shared memory required for kernel version in use. */
 215 static inline int calc_shmem_required(int kver)
 216 {
 217     int shmem;
 218
 219     /* size of shmem (force-buffers/xq/atom type preloading) */
 220     if (NBNXN_KVER_LEGACY(kver))
 221     {
 222         /* i-atom x+q in shared memory */
 223         shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 224         /* force reduction buffers in shared memory */
 225         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 226     }
 227     else
 228     {
 229         /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 230         /* i-atom x+q in shared memory */
 231         shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 232         /* cj in shared memory, for both warps separately */
 233         shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 234 #ifdef IATYPE_SHMEM
 235         /* i-atom types in shared memory */
 236         shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 237 #endif
 238 #if __CUDA_ARCH__ < 300
 239         /* force reduction buffers in shared memory */
 240         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 241 #endif
 242     }
 243
 244     return shmem;
 245 }
 246
 247 /*! As we execute nonbonded workload in separate streams, before launching
 248    the kernel we need to make sure that he following operations have completed:
 249    - atomdata allocation and related H2D transfers (every nstlist step);
 250    - pair list H2D transfer (every nstlist step);
 251    - shift vector H2D transfer (every nstlist step);
 252    - force (+shift force and energy) output clearing (every step).
 253
 254    These operations are issued in the local stream at the beginning of the step
 255    and therefore always complete before the local kernel launch. The non-local
 256    kernel is launched after the local on the same device/context, so this is
 257    inherently scheduled after the operations in the local stream (including the
 258    above "misc_ops").
 259    However, for the sake of having a future-proof implementation, we use the
 260    misc_ops_done event to record the point in time when the above  operations
 261    are finished and synchronize with this event in the non-local stream.
 262 */
 263 void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 264                               const nbnxn_atomdata_t *nbatom,
 265                               int flags,
 266                               int iloc)
 267 {
 268     cudaError_t stat;
 269     int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
 270     /* CUDA kernel launch-related stuff */
 271     int  shmem, nblock;
 272     dim3 dim_block, dim_grid;
 273     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 274
 275     cu_atomdata_t   *adat   = cu_nb->atdat;
 276     cu_nbparam_t    *nbp    = cu_nb->nbparam;
 277     cu_plist_t      *plist  = cu_nb->plist[iloc];
 278     cu_timers_t     *t      = cu_nb->timers;
 279     cudaStream_t    stream  = cu_nb->stream[iloc];
 280
 281     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 282     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 283     bool bDoTime     = cu_nb->bDoTime;
 284
 285     /* turn energy calculation always on/off (for debugging/testing only) */
 286     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 287
 288     /* Don't launch the non-local kernel if there is no work to do.
 289        Doing the same for the local kernel is more complicated, since the
 290        local part of the force array also depends on the non-local kernel.
 291        So to avoid complicating the code and to reduce the risk of bugs,
 292        we always call the local kernel, the local x+q copy and later (not in
 293        this function) the stream wait, local f copyback and the f buffer
 294        clearing. All these operations, except for the local interaction kernel,
 295        are needed for the non-local interactions. The skip of the local kernel
 296        call is taken care of later in this function. */
 297     if (iloc == eintNonlocal && plist->nsci == 0)
 298     {
 299         return;
 300     }
 301
 302     /* calculate the atom data index range based on locality */
 303     if (LOCAL_I(iloc))
 304     {
 305         adat_begin  = 0;
 306         adat_len    = adat->natoms_local;
 307     }
 308     else
 309     {
 310         adat_begin  = adat->natoms_local;
 311         adat_len    = adat->natoms - adat->natoms_local;
 312     }
 313
 314     /* When we get here all misc operations issues in the local stream are done,
 315        so we record that in the local stream and wait for it in the nonlocal one. */
 316     if (cu_nb->bUseTwoStreams)
 317     {
 318         if (iloc == eintLocal)
 319         {
 320             stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 321             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 322         }
 323         else
 324         {
 325             stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 326             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 327         }
 328     }
 329
 330     /* beginning of timed HtoD section */
 331     if (bDoTime)
 332     {
 333         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 334         CU_RET_ERR(stat, "cudaEventRecord failed");
 335     }
 336
 337     /* HtoD x, q */
 338     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 339                       adat_len * sizeof(*adat->xq), stream);
 340
 341     if (bDoTime)
 342     {
 343         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 344         CU_RET_ERR(stat, "cudaEventRecord failed");
 345     }
 346
 347     if (plist->nsci == 0)
 348     {
 349         /* Don't launch an empty local kernel (not allowed with CUDA) */
 350         return;
 351     }
 352
 353     /* beginning of timed nonbonded calculation section */
 354     if (bDoTime)
 355     {
 356         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 357         CU_RET_ERR(stat, "cudaEventRecord failed");
 358     }
 359
 360     /* get the pointer to the kernel flavor we need to use */
 361     nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
 362                                     plist->bDoPrune || always_prune);
 363
 364     /* kernel launch config */
 365     nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 366     dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 367     dim_grid  = dim3(nblock, 1, 1);
 368     shmem     = calc_shmem_required(cu_nb->kernel_ver);
 369
 370     if (debug)
 371     {
 372         fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 373                 "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 374                 dim_block.x, dim_block.y, dim_block.z,
 375                 dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 376                 NCL_PER_SUPERCL, plist->na_c);
 377     }
 378
 379     nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 380     CU_LAUNCH_ERR("k_calc_nb");
 381
 382     if (bDoTime)
 383     {
 384         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 385         CU_RET_ERR(stat, "cudaEventRecord failed");
 386     }
 387 }
 388
 389 void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 390                                const nbnxn_atomdata_t *nbatom,
 391                                int flags,
 392                                int aloc)
 393 {
 394     cudaError_t stat;
 395     int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
 396     int iloc = -1;
 397
 398     /* determine interaction locality from atom locality */
 399     if (LOCAL_A(aloc))
 400     {
 401         iloc = eintLocal;
 402     }
 403     else if (NONLOCAL_A(aloc))
 404     {
 405         iloc = eintNonlocal;
 406     }
 407     else
 408     {
 409         char stmp[STRLEN];
 410         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 411                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 412         gmx_incons(stmp);
 413     }
 414
 415     cu_atomdata_t   *adat   = cu_nb->atdat;
 416     cu_timers_t     *t      = cu_nb->timers;
 417     bool            bDoTime = cu_nb->bDoTime;
 418     cudaStream_t    stream  = cu_nb->stream[iloc];
 419
 420     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 421     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 422
 423     /* don't launch non-local copy-back if there was no non-local work to do */
 424     if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
 425     {
 426         return;
 427     }
 428
 429     /* calculate the atom data index range based on locality */
 430     if (LOCAL_A(aloc))
 431     {
 432         adat_begin  = 0;
 433         adat_len    = adat->natoms_local;
 434         adat_end    = cu_nb->atdat->natoms_local;
 435     }
 436     else
 437     {
 438         adat_begin  = adat->natoms_local;
 439         adat_len    = adat->natoms - adat->natoms_local;
 440         adat_end    = cu_nb->atdat->natoms;
 441     }
 442
 443     /* beginning of timed D2H section */
 444     if (bDoTime)
 445     {
 446         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 447         CU_RET_ERR(stat, "cudaEventRecord failed");
 448     }
 449
 450     if (!cu_nb->bUseStreamSync)
 451     {
 452         /* For safety reasons set a few (5%) forces to NaN. This way even if the
 453            polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 454         for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 455         {
 456 #ifdef NAN
 457             nbatom->out[0].f[i] = NAN;
 458 #else
 459 #  ifdef _MSVC
 460             if (numeric_limits<float>::has_quiet_NaN)
 461             {
 462                 nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 463             }
 464             else
 465 #  endif
 466             {
 467                 nbatom->out[0].f[i] = GMX_REAL_MAX;
 468             }
 469 #endif
 470         }
 471
 472         /* Set the last four bytes of the force array to a bit pattern
 473            which can't be the result of the force calculation:
 474            max exponent (127) and zero mantissa. */
 475         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 476     }
 477
 478     /* With DD the local D2H transfer can only start after the non-local
 479        kernel has finished. */
 480     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 481     {
 482         stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 483         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 484     }
 485
 486     /* DtoH f */
 487     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 488                       (adat_len)*sizeof(*adat->f), stream);
 489
 490     /* After the non-local D2H is launched the nonlocal_done event can be
 491        recorded which signals that the local D2H can proceed. This event is not
 492        placed after the non-local kernel because we want the non-local data
 493        back first. */
 494     if (iloc == eintNonlocal)
 495     {
 496         stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 497         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 498     }
 499
 500     /* only transfer energies in the local stream */
 501     if (LOCAL_I(iloc))
 502     {
 503         /* DtoH fshift */
 504         if (bCalcFshift)
 505         {
 506             cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 507                               SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 508         }
 509
 510         /* DtoH energies */
 511         if (bCalcEner)
 512         {
 513             cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 514                               sizeof(*cu_nb->nbst.e_lj), stream);
 515             cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 516                               sizeof(*cu_nb->nbst.e_el), stream);
 517         }
 518     }
 519
 520     if (bDoTime)
 521     {
 522         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 523         CU_RET_ERR(stat, "cudaEventRecord failed");
 524     }
 525 }
 526
 527 /* Atomic compare-exchange operation on unsigned values. It is used in
 528  * polling wait for the GPU.
 529  */
 530 static inline bool atomic_cas(volatile unsigned int *ptr,
 531                               unsigned int oldval,
 532                               unsigned int newval)
 533 {
 534     assert(ptr);
 535
 536 #ifdef TMPI_ATOMICS
 537     return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 538 #else
 539     gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 540     return true;
 541 #endif
 542 }
 543
 544 void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 545                          const nbnxn_atomdata_t *nbatom,
 546                          int flags, int aloc,
 547                          real *e_lj, real *e_el, rvec *fshift)
 548 {
 549     /* NOTE:  only implemented for single-precision at this time */
 550     cudaError_t stat;
 551     int i, adat_end, iloc = -1;
 552     volatile unsigned int *poll_word;
 553
 554     /* determine interaction locality from atom locality */
 555     if (LOCAL_A(aloc))
 556     {
 557         iloc = eintLocal;
 558     }
 559     else if (NONLOCAL_A(aloc))
 560     {
 561         iloc = eintNonlocal;
 562     }
 563     else
 564     {
 565         char stmp[STRLEN];
 566         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 567                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 568         gmx_incons(stmp);
 569     }
 570
 571     cu_plist_t      *plist   = cu_nb->plist[iloc];
 572     cu_timers_t     *timers  = cu_nb->timers;
 573     wallclock_gpu_t *timings = cu_nb->timings;
 574     nb_staging      nbst     = cu_nb->nbst;
 575
 576     bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
 577     bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 578
 579     /* turn energy calculation always on/off (for debugging/testing only) */
 580     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 581
 582     /* Launch wait/update timers & counters, unless doing the non-local phase
 583        when there is not actually work to do. This is consistent with
 584        nbnxn_cuda_launch_kernel.
 585
 586        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 587        counters could end up being inconsistent due to not being incremented
 588        on some of the nodes! */
 589     if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
 590     {
 591         return;
 592     }
 593
 594     /* calculate the atom data index range based on locality */
 595     if (LOCAL_A(aloc))
 596     {
 597         adat_end = cu_nb->atdat->natoms_local;
 598     }
 599     else
 600     {
 601         adat_end = cu_nb->atdat->natoms;
 602     }
 603
 604     if (cu_nb->bUseStreamSync)
 605     {
 606         stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 607         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 608     }
 609     else
 610     {
 611         /* Busy-wait until we get the signal pattern set in last byte
 612          * of the l/nl float vector. This pattern corresponds to a floating
 613          * point number which can't be the result of the force calculation
 614          * (maximum, 127 exponent and 0 mantissa).
 615          * The polling uses atomic compare-exchange.
 616          */
 617         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 618         while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 619     }
 620
 621     /* timing data accumulation */
 622     if (cu_nb->bDoTime)
 623     {
 624         /* only increase counter once (at local F wait) */
 625         if (LOCAL_I(iloc))
 626         {
 627             timings->nb_c++;
 628             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 629         }
 630
 631         /* kernel timings */
 632         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 633             cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 634
 635         /* X/q H2D and F D2H timings */
 636         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 637                                                  timers->stop_nb_h2d[iloc]);
 638         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 639                                                  timers->stop_nb_d2h[iloc]);
 640
 641         /* only count atdat and pair-list H2D at pair-search step */
 642         if (plist->bDoPrune)
 643         {
 644             /* atdat transfer timing (add only once, at local F wait) */
 645             if (LOCAL_A(aloc))
 646             {
 647                 timings->pl_h2d_c++;
 648                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 649                                                          timers->stop_atdat);
 650             }
 651
 652             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 653                                                      timers->stop_pl_h2d[iloc]);
 654         }
 655     }
 656
 657     /* add up energies and shift forces (only once at local F wait) */
 658     if (LOCAL_I(iloc))
 659     {
 660         if (bCalcEner)
 661         {
 662             *e_lj += *nbst.e_lj;
 663             *e_el += *nbst.e_el;
 664         }
 665
 666         if (bCalcFshift)
 667         {
 668             for (i = 0; i < SHIFTS; i++)
 669             {
 670                 fshift[i][0] += nbst.fshift[i].x;
 671                 fshift[i][1] += nbst.fshift[i].y;
 672                 fshift[i][2] += nbst.fshift[i].z;
 673             }
 674         }
 675     }
 676
 677     /* turn off pruning (doesn't matter if this is pair-search step or not) */
 678     plist->bDoPrune = false;
 679 }
 680
 681 /*! Return the reference to the nbfp texture. */
 682 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 683 {
 684     return nbfp_texref;
 685 }
 686
 687 /*! Return the reference to the coulomb_tab. */
 688 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 689 {
 690     return coulomb_tab_texref;
 691 }
 692
 693 /*! Set up the cache configuration for the non-bonded kernels,
 694  */
 695 void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 696 {
 697     cudaError_t stat;
 698
 699     for (int i = 0; i < eelCuNR; i++)
 700     {
 701         for (int j = 0; j < nEnergyKernelTypes; j++)
 702         {
 703             for (int k = 0; k < nPruneKernelTypes; k++)
 704             {
 705                 /* Legacy kernel 16/48 kB Shared/L1
 706                  * No analytical Ewald!
 707                  */
 708                 if (i != eelCuEWALD_ANA && i != eelCuEWALD_ANA_TWIN)
 709                 {
 710                     stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 711                     CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 712                 }
 713
 714                 if (devinfo->prop.major >= 3)
 715                 {
 716                     /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 717                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
 718                 }
 719                 else
 720                 {
 721                     /* On Fermi prefer L1 gives 2% higher performance */
 722                     /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 723                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 724                 }
 725                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 726             }
 727         }
 728     }
 729 }