src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2012, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14  *
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  34  */
  35
  36 #include <stdlib.h>
  37 #include <assert.h>
  38
  39 #if defined(_MSVC)
  40 #include <limits>
  41 #endif
  42
  43 #include <cuda.h>
  44
  45 #include "types/simple.h"
  46 #include "types/nbnxn_pairlist.h"
  47 #include "types/nb_verlet.h"
  48 #include "types/ishift.h"
  49 #include "types/force_flags.h"
  50 #include "../nbnxn_consts.h"
  51
  52 #ifdef TMPI_ATOMICS
  53 #include "thread_mpi/atomic.h"
  54 #endif
  55
  56 #include "nbnxn_cuda_types.h"
  57 #include "../../gmxlib/cuda_tools/cudautils.cuh"
  58 #include "nbnxn_cuda.h"
  59 #include "nbnxn_cuda_data_mgmt.h"
  60
  61
  62 /*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
  63 texture<float, 1, cudaReadModeElementType> tex_nbfp;
  64
  65 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  66 texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
  67
  68 /* Convenience defines */
  69 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  70 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  71
  72 /***** The kernels come here *****/
  73 #include "nbnxn_cuda_kernel_utils.cuh"
  74
  75 /* Top-level kernel generation: will generate through multiple inclusion the
  76  * following flavors for all kernels:
  77  * - force-only output;
  78  * - force and energy output;
  79  * - force-only with pair list pruning;
  80  * - force and energy output with pair list pruning.
  81  */
  82 /** Force only **/
  83 #include "nbnxn_cuda_kernels.cuh"
  84 /** Force & energy **/
  85 #define CALC_ENERGIES
  86 #include "nbnxn_cuda_kernels.cuh"
  87 #undef CALC_ENERGIES
  88
  89 /*** Pair-list pruning kernels ***/
  90 /** Force only **/
  91 #define PRUNE_NBL
  92 #include "nbnxn_cuda_kernels.cuh"
  93 /** Force & energy **/
  94 #define CALC_ENERGIES
  95 #include "nbnxn_cuda_kernels.cuh"
  96 #undef CALC_ENERGIES
  97 #undef PRUNE_NBL
  98
  99 /*! Nonbonded kernel function pointer type */
 100 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 101                                      const cu_nbparam_t,
 102                                      const cu_plist_t,
 103                                      bool);
 104
 105 /*********************************/
 106
 107 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 108 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 109 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 110 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 111
 112
 113 /* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 114  * and corresponds to having the exponent set to the maximum (127 -- single
 115  * precision) and the mantissa to 0.
 116  */
 117 static unsigned int poll_wait_pattern = (0x7FU << 23);
 118
 119 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 120 static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 121 {
 122     int max_grid_x_size;
 123
 124     assert(dinfo);
 125
 126     max_grid_x_size = dinfo->prop.maxGridSize[0];
 127
 128     /* do we exceed the grid x dimension limit? */
 129     if (nwork_units > max_grid_x_size)
 130     {
 131         gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 132                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 133                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 134     }
 135
 136     return nwork_units;
 137 }
 138
 139
 140 /* Constant arrays listing all kernel function pointers and enabling selection
 141    of a kernel in an elegant manner. */
 142
 143 static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 144 static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 145
 146 /*! Pointers to the default kernels organized in a 3 dim array by:
 147  *  electrostatics type, energy calculation on/off, and pruning on/off.
 148  *
 149  *  Note that the order of electrostatics (1st dimension) has to match the
 150  *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
 151  */
 152 static const nbnxn_cu_kfunc_ptr_t
 153 nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 154 {
 155     { { k_nbnxn_cutoff,                     k_nbnxn_cutoff_prune },
 156       { k_nbnxn_cutoff_ener,                k_nbnxn_cutoff_ener_prune } },
 157     { { k_nbnxn_rf,                         k_nbnxn_rf_prune },
 158       { k_nbnxn_rf_ener,                    k_nbnxn_rf_ener_prune } },
 159     { { k_nbnxn_ewald_tab,                  k_nbnxn_ewald_tab_prune },
 160       { k_nbnxn_ewald_tab_ener,             k_nbnxn_ewald_tab_ener_prune } },
 161     { { k_nbnxn_ewald_tab_twin,             k_nbnxn_ewald_tab_twin_prune },
 162       { k_nbnxn_ewald_tab_twin_ener,        k_nbnxn_ewald_twin_ener_prune } },
 163     { { k_nbnxn_ewald,                      k_nbnxn_ewald_prune },
 164       { k_nbnxn_ewald_ener,                 k_nbnxn_ewald_ener_prune } },
 165     { { k_nbnxn_ewald_twin,                 k_nbnxn_ewald_twin_prune },
 166       { k_nbnxn_ewald_twin_ener,            k_nbnxn_ewald_twin_ener_prune } },
 167 };
 168
 169 /*! Pointers to the legacy kernels organized in a 3 dim array by:
 170  *  electrostatics type, energy calculation on/off, and pruning on/off.
 171  *
 172  *  Note that the order of electrostatics (1st dimension) has to match the
 173  *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
 174  */
 175 static const nbnxn_cu_kfunc_ptr_t
 176 nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 177 {
 178     { { k_nbnxn_cutoff_legacy,              k_nbnxn_cutoff_prune_legacy },
 179       { k_nbnxn_cutoff_ener_legacy,         k_nbnxn_cutoff_ener_prune_legacy } },
 180     { { k_nbnxn_rf_legacy,                  k_nbnxn_rf_prune_legacy },
 181       { k_nbnxn_rf_ener_legacy,             k_nbnxn_rf_ener_prune_legacy } },
 182     { { k_nbnxn_ewald_tab_legacy,           k_nbnxn_ewald_tab_prune_legacy },
 183       { k_nbnxn_ewald_tab_ener_legacy,      k_nbnxn_ewald_tab_ener_prune_legacy } },
 184     { { k_nbnxn_ewald_tab_twin_legacy,      k_nbnxn_ewald_tab_twin_prune_legacy },
 185       { k_nbnxn_ewald_tab_twin_ener_legacy, k_nbnxn_ewald_tab_twin_ener_prune_legacy } },
 186 };
 187
 188 /*! Return a pointer to the kernel version to be executed at the current step. */
 189 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 190                                                        bool bDoEne, bool bDoPrune)
 191 {
 192     assert(kver < eNbnxnCuKNR);
 193     assert(eeltype < eelCuNR);
 194
 195     if (NBNXN_KVER_LEGACY(kver))
 196     {
 197         /* no analytical Ewald with legacy kernels */
 198         assert(eeltype <= eelCuEWALD_TAB_TWIN);
 199
 200         return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 201     }
 202     else
 203     {
 204         return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 205     }
 206 }
 207
 208 /*! Calculates the amount of shared memory required for kernel version in use. */
 209 static inline int calc_shmem_required(int kver)
 210 {
 211     int shmem;
 212
 213     /* size of shmem (force-buffers/xq/atom type preloading) */
 214     if (NBNXN_KVER_LEGACY(kver))
 215     {
 216         /* i-atom x+q in shared memory */
 217         shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 218         /* force reduction buffers in shared memory */
 219         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 220     }
 221     else
 222     {
 223         /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 224         /* i-atom x+q in shared memory */
 225         shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 226         /* cj in shared memory, for both warps separately */
 227         shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 228 #ifdef IATYPE_SHMEM
 229         /* i-atom types in shared memory */
 230         shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 231 #endif
 232 #if __CUDA_ARCH__ < 300
 233         /* force reduction buffers in shared memory */
 234         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 235 #endif
 236     }
 237
 238     return shmem;
 239 }
 240
 241 /*! As we execute nonbonded workload in separate streams, before launching
 242    the kernel we need to make sure that he following operations have completed:
 243    - atomdata allocation and related H2D transfers (every nstlist step);
 244    - pair list H2D transfer (every nstlist step);
 245    - shift vector H2D transfer (every nstlist step);
 246    - force (+shift force and energy) output clearing (every step).
 247
 248    These operations are issued in the local stream at the beginning of the step
 249    and therefore always complete before the local kernel launch. The non-local
 250    kernel is launched after the local on the same device/context, so this is
 251    inherently scheduled after the operations in the local stream (including the
 252    above "misc_ops").
 253    However, for the sake of having a future-proof implementation, we use the
 254    misc_ops_done event to record the point in time when the above  operations
 255    are finished and synchronize with this event in the non-local stream.
 256 */
 257 void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 258                               const nbnxn_atomdata_t *nbatom,
 259                               int flags,
 260                               int iloc)
 261 {
 262     cudaError_t stat;
 263     int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
 264     /* CUDA kernel launch-related stuff */
 265     int  shmem, nblock;
 266     dim3 dim_block, dim_grid;
 267     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 268
 269     cu_atomdata_t   *adat   = cu_nb->atdat;
 270     cu_nbparam_t    *nbp    = cu_nb->nbparam;
 271     cu_plist_t      *plist  = cu_nb->plist[iloc];
 272     cu_timers_t     *t      = cu_nb->timers;
 273     cudaStream_t    stream  = cu_nb->stream[iloc];
 274
 275     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 276     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 277     bool bDoTime     = cu_nb->bDoTime;
 278
 279     /* turn energy calculation always on/off (for debugging/testing only) */
 280     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 281
 282     /* don't launch the kernel if there is no work to do */
 283     if (plist->nsci == 0)
 284     {
 285         return;
 286     }
 287
 288     /* calculate the atom data index range based on locality */
 289     if (LOCAL_I(iloc))
 290     {
 291         adat_begin  = 0;
 292         adat_len    = adat->natoms_local;
 293     }
 294     else
 295     {
 296         adat_begin  = adat->natoms_local;
 297         adat_len    = adat->natoms - adat->natoms_local;
 298     }
 299
 300     /* When we get here all misc operations issues in the local stream are done,
 301        so we record that in the local stream and wait for it in the nonlocal one. */
 302     if (cu_nb->bUseTwoStreams)
 303     {
 304         if (iloc == eintLocal)
 305         {
 306             stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 307             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 308         }
 309         else
 310         {
 311             stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 312             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 313         }
 314     }
 315
 316     /* beginning of timed HtoD section */
 317     if (bDoTime)
 318     {
 319         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 320         CU_RET_ERR(stat, "cudaEventRecord failed");
 321     }
 322
 323     /* HtoD x, q */
 324     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 325                       adat_len * sizeof(*adat->xq), stream);
 326
 327     if (bDoTime)
 328     {
 329         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 330         CU_RET_ERR(stat, "cudaEventRecord failed");
 331     }
 332
 333     /* beginning of timed nonbonded calculation section */
 334     if (bDoTime)
 335     {
 336         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 337         CU_RET_ERR(stat, "cudaEventRecord failed");
 338     }
 339
 340     /* get the pointer to the kernel flavor we need to use */
 341     nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
 342                                     plist->bDoPrune || always_prune);
 343
 344     /* kernel launch config */
 345     nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 346     dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 347     dim_grid  = dim3(nblock, 1, 1);
 348     shmem     = calc_shmem_required(cu_nb->kernel_ver);
 349
 350     if (debug)
 351     {
 352         fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 353                 "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 354                 dim_block.x, dim_block.y, dim_block.z,
 355                 dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 356                 NCL_PER_SUPERCL, plist->na_c);
 357     }
 358
 359     nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 360     CU_LAUNCH_ERR("k_calc_nb");
 361
 362     if (bDoTime)
 363     {
 364         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 365         CU_RET_ERR(stat, "cudaEventRecord failed");
 366     }
 367 }
 368
 369 void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 370                                const nbnxn_atomdata_t *nbatom,
 371                                int flags,
 372                                int aloc)
 373 {
 374     cudaError_t stat;
 375     int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
 376     int iloc = -1;
 377
 378     /* determine interaction locality from atom locality */
 379     if (LOCAL_A(aloc))
 380     {
 381         iloc = eintLocal;
 382     }
 383     else if (NONLOCAL_A(aloc))
 384     {
 385         iloc = eintNonlocal;
 386     }
 387     else
 388     {
 389         char stmp[STRLEN];
 390         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 391                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 392         gmx_incons(stmp);
 393     }
 394
 395     cu_atomdata_t   *adat   = cu_nb->atdat;
 396     cu_timers_t     *t      = cu_nb->timers;
 397     bool            bDoTime = cu_nb->bDoTime;
 398     cudaStream_t    stream  = cu_nb->stream[iloc];
 399
 400     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 401     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 402
 403     /* don't launch copy-back if there was no work to do */
 404     if (cu_nb->plist[iloc]->nsci == 0)
 405     {
 406         return;
 407     }
 408
 409     /* calculate the atom data index range based on locality */
 410     if (LOCAL_A(aloc))
 411     {
 412         adat_begin  = 0;
 413         adat_len    = adat->natoms_local;
 414         adat_end    = cu_nb->atdat->natoms_local;
 415     }
 416     else
 417     {
 418         adat_begin  = adat->natoms_local;
 419         adat_len    = adat->natoms - adat->natoms_local;
 420         adat_end    = cu_nb->atdat->natoms;
 421     }
 422
 423     /* beginning of timed D2H section */
 424     if (bDoTime)
 425     {
 426         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 427         CU_RET_ERR(stat, "cudaEventRecord failed");
 428     }
 429
 430     if (!cu_nb->bUseStreamSync)
 431     {
 432         /* For safety reasons set a few (5%) forces to NaN. This way even if the
 433            polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 434         for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 435         {
 436 #ifdef NAN
 437             nbatom->out[0].f[i] = NAN;
 438 #else
 439 #  ifdef _MSVC
 440             if (numeric_limits<float>::has_quiet_NaN)
 441             {
 442                 nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 443             }
 444             else
 445 #  endif
 446             {
 447                 nbatom->out[0].f[i] = GMX_REAL_MAX;
 448             }
 449 #endif
 450         }
 451
 452         /* Set the last four bytes of the force array to a bit pattern
 453            which can't be the result of the force calculation:
 454            max exponent (127) and zero mantissa. */
 455         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 456     }
 457
 458     /* With DD the local D2H transfer can only start after the non-local
 459        has been launched. */
 460     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 461     {
 462         stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 463         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 464     }
 465
 466     /* DtoH f */
 467     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 468                       (adat_len)*sizeof(*adat->f), stream);
 469
 470     /* After the non-local D2H is launched the nonlocal_done event can be
 471        recorded which signals that the local D2H can proceed. This event is not
 472        placed after the non-local kernel because we first need the non-local
 473        data back first. */
 474     if (iloc == eintNonlocal)
 475     {
 476         stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 477         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 478     }
 479
 480     /* only transfer energies in the local stream */
 481     if (LOCAL_I(iloc))
 482     {
 483         /* DtoH fshift */
 484         if (bCalcFshift)
 485         {
 486             cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 487                               SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 488         }
 489
 490         /* DtoH energies */
 491         if (bCalcEner)
 492         {
 493             cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 494                               sizeof(*cu_nb->nbst.e_lj), stream);
 495             cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 496                               sizeof(*cu_nb->nbst.e_el), stream);
 497         }
 498     }
 499
 500     if (bDoTime)
 501     {
 502         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 503         CU_RET_ERR(stat, "cudaEventRecord failed");
 504     }
 505 }
 506
 507 /* Atomic compare-exchange operation on unsigned values. It is used in
 508  * polling wait for the GPU.
 509  */
 510 static inline bool atomic_cas(volatile unsigned int *ptr,
 511                               unsigned int oldval,
 512                               unsigned int newval)
 513 {
 514     assert(ptr);
 515
 516 #ifdef TMPI_ATOMICS
 517     return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 518 #else
 519     gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 520     return true;
 521 #endif
 522 }
 523
 524 void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 525                          const nbnxn_atomdata_t *nbatom,
 526                          int flags, int aloc,
 527                          real *e_lj, real *e_el, rvec *fshift)
 528 {
 529     /* NOTE:  only implemented for single-precision at this time */
 530     cudaError_t stat;
 531     int i, adat_end, iloc = -1;
 532     volatile unsigned int *poll_word;
 533
 534     /* determine interaction locality from atom locality */
 535     if (LOCAL_A(aloc))
 536     {
 537         iloc = eintLocal;
 538     }
 539     else if (NONLOCAL_A(aloc))
 540     {
 541         iloc = eintNonlocal;
 542     }
 543     else
 544     {
 545         char stmp[STRLEN];
 546         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 547                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 548         gmx_incons(stmp);
 549     }
 550
 551     cu_plist_t      *plist   = cu_nb->plist[iloc];
 552     cu_timers_t     *timers  = cu_nb->timers;
 553     wallclock_gpu_t *timings = cu_nb->timings;
 554     nb_staging      nbst     = cu_nb->nbst;
 555
 556     bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
 557     bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 558
 559     /* turn energy calculation always on/off (for debugging/testing only) */
 560     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 561
 562     /* don't launch wait/update timers & counters if there was no work to do
 563
 564        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 565        counters could end up being inconsistent due to not being incremented
 566        on some of the nodes! */
 567     if (cu_nb->plist[iloc]->nsci == 0)
 568     {
 569         return;
 570     }
 571
 572     /* calculate the atom data index range based on locality */
 573     if (LOCAL_A(aloc))
 574     {
 575         adat_end = cu_nb->atdat->natoms_local;
 576     }
 577     else
 578     {
 579         adat_end = cu_nb->atdat->natoms;
 580     }
 581
 582     if (cu_nb->bUseStreamSync)
 583     {
 584         stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 585         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 586     }
 587     else
 588     {
 589         /* Busy-wait until we get the signal pattern set in last byte
 590          * of the l/nl float vector. This pattern corresponds to a floating
 591          * point number which can't be the result of the force calculation
 592          * (maximum, 127 exponent and 0 mantissa).
 593          * The polling uses atomic compare-exchange.
 594          */
 595         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 596         while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 597     }
 598
 599     /* timing data accumulation */
 600     if (cu_nb->bDoTime)
 601     {
 602         /* only increase counter once (at local F wait) */
 603         if (LOCAL_I(iloc))
 604         {
 605             timings->nb_c++;
 606             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 607         }
 608
 609         /* kernel timings */
 610         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 611             cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 612
 613         /* X/q H2D and F D2H timings */
 614         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 615                                                  timers->stop_nb_h2d[iloc]);
 616         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 617                                                  timers->stop_nb_d2h[iloc]);
 618
 619         /* only count atdat and pair-list H2D at pair-search step */
 620         if (plist->bDoPrune)
 621         {
 622             /* atdat transfer timing (add only once, at local F wait) */
 623             if (LOCAL_A(aloc))
 624             {
 625                 timings->pl_h2d_c++;
 626                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 627                                                          timers->stop_atdat);
 628             }
 629
 630             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 631                                                      timers->stop_pl_h2d[iloc]);
 632         }
 633     }
 634
 635     /* add up energies and shift forces (only once at local F wait) */
 636     if (LOCAL_I(iloc))
 637     {
 638         if (bCalcEner)
 639         {
 640             *e_lj += *nbst.e_lj;
 641             *e_el += *nbst.e_el;
 642         }
 643
 644         if (bCalcFshift)
 645         {
 646             for (i = 0; i < SHIFTS; i++)
 647             {
 648                 fshift[i][0] += nbst.fshift[i].x;
 649                 fshift[i][1] += nbst.fshift[i].y;
 650                 fshift[i][2] += nbst.fshift[i].z;
 651             }
 652         }
 653     }
 654
 655     /* turn off pruning (doesn't matter if this is pair-search step or not) */
 656     plist->bDoPrune = false;
 657 }
 658
 659 /*! Return the reference to the nbfp texture. */
 660 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 661 {
 662     return tex_nbfp;
 663 }
 664
 665 /*! Return the reference to the coulomb_tab. */
 666 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 667 {
 668     return tex_coulomb_tab;
 669 }
 670
 671 /*! Set up the cache configuration for the non-bonded kernels,
 672  */
 673 void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 674 {
 675     cudaError_t stat;
 676
 677     for (int i = 0; i < eelCuNR; i++)
 678     {
 679         for (int j = 0; j < nEnergyKernelTypes; j++)
 680         {
 681             for (int k = 0; k < nPruneKernelTypes; k++)
 682             {
 683                 /* Legacy kernel 16/48 kB Shared/L1
 684                  * No analytical Ewald!
 685                  */
 686                 if (i != eelCuEWALD_ANA && i != eelCuEWALD_ANA_TWIN)
 687                 {
 688                     stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 689                     CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 690                 }
 691
 692                 if (devinfo->prop.major >= 3)
 693                 {
 694                     /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 695                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
 696                 }
 697                 else
 698                 {
 699                     /* On Fermi prefer L1 gives 2% higher performance */
 700                     /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 701                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 702                 }
 703                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 704             }
 705         }
 706     }
 707 }