src/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 #include <stdlib.h>
  40 #include <assert.h>
  41
  42 #if defined(_MSVC)
  43 #include <limits>
  44 #endif
  45
  46 #include "types/simple.h"
  47 #include "types/nbnxn_pairlist.h"
  48 #include "types/nb_verlet.h"
  49 #include "types/ishift.h"
  50 #include "types/force_flags.h"
  51 #include "../nbnxn_consts.h"
  52
  53 #ifdef TMPI_ATOMICS
  54 #include "thread_mpi/atomic.h"
  55 #endif
  56
  57 #include "nbnxn_cuda_types.h"
  58 #include "../../gmxlib/cuda_tools/cudautils.cuh"
  59 #include "nbnxn_cuda.h"
  60 #include "nbnxn_cuda_data_mgmt.h"
  61
  62
  63 /*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
  64 texture<float, 1, cudaReadModeElementType> tex_nbfp;
  65
  66 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  67 texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
  68
  69 /* Convenience defines */
  70 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  71 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  72
  73 /***** The kernels come here *****/
  74 #include "nbnxn_cuda_kernel_utils.cuh"
  75
  76 /* Generate all combinations of kernels through multiple inclusion:
  77    F, F + E, F + prune, F + E + prune. */
  78 /** Force only **/
  79 #include "nbnxn_cuda_kernels.cuh"
  80 /** Force & energy **/
  81 #define CALC_ENERGIES
  82 #include "nbnxn_cuda_kernels.cuh"
  83 #undef CALC_ENERGIES
  84
  85 /*** Pair-list pruning kernels ***/
  86 /** Force only **/
  87 #define PRUNE_NBL
  88 #include "nbnxn_cuda_kernels.cuh"
  89 /** Force & energy **/
  90 #define CALC_ENERGIES
  91 #include "nbnxn_cuda_kernels.cuh"
  92 #undef CALC_ENERGIES
  93 #undef PRUNE_NBL
  94
  95 /*! Nonbonded kernel function pointer type */
  96 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
  97                                      const cu_nbparam_t,
  98                                      const cu_plist_t,
  99                                      bool);
 100
 101 /*********************************/
 102
 103 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 104 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 105 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 106 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 107
 108
 109 /* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 110  * and corresponds to having the exponent set to the maximum (127 -- single
 111  * precision) and the mantissa to 0.
 112  */
 113 static unsigned int poll_wait_pattern = (0x7FU << 23);
 114
 115 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 116 static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 117 {
 118     int max_grid_x_size;
 119
 120     assert(dinfo);
 121
 122     max_grid_x_size = dinfo->prop.maxGridSize[0];
 123
 124     /* do we exceed the grid x dimension limit? */
 125     if (nwork_units > max_grid_x_size)
 126     {
 127         gmx_fatal(FARGS, "Watch out system too large to simulate!\n"
 128                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 129                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 130     }
 131
 132     return nwork_units;
 133 }
 134
 135
 136 /* Constant arrays listing all kernel function pointers and enabling selection
 137    of a kernel in an elegant manner. */
 138
 139 static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 140 static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 141
 142 /* Default kernels */
 143 static const nbnxn_cu_kfunc_ptr_t
 144 nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 145 {
 146     { { k_nbnxn_ewald,              k_nbnxn_ewald_prune },
 147       { k_nbnxn_ewald_ener,         k_nbnxn_ewald_ener_prune } },
 148     { { k_nbnxn_ewald_twin,         k_nbnxn_ewald_twin_prune },
 149       { k_nbnxn_ewald_twin_ener,    k_nbnxn_ewald_twin_ener_prune } },
 150     { { k_nbnxn_rf,                 k_nbnxn_rf_prune },
 151       { k_nbnxn_rf_ener,            k_nbnxn_rf_ener_prune } },
 152     { { k_nbnxn_cutoff,             k_nbnxn_cutoff_prune },
 153       { k_nbnxn_cutoff_ener,        k_nbnxn_cutoff_ener_prune } },
 154 };
 155
 156 /* Legacy kernels */
 157 static const nbnxn_cu_kfunc_ptr_t
 158 nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 159 {
 160     { { k_nbnxn_ewald_legacy,           k_nbnxn_ewald_prune_legacy },
 161       { k_nbnxn_ewald_ener_legacy,      k_nbnxn_ewald_ener_prune_legacy } },
 162     { { k_nbnxn_ewald_twin_legacy,      k_nbnxn_ewald_twin_prune_legacy },
 163       { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
 164     { { k_nbnxn_rf_legacy,              k_nbnxn_rf_prune_legacy },
 165       { k_nbnxn_rf_ener_legacy,         k_nbnxn_rf_ener_prune_legacy } },
 166     { { k_nbnxn_cutoff_legacy,          k_nbnxn_cutoff_prune_legacy },
 167       { k_nbnxn_cutoff_ener_legacy,     k_nbnxn_cutoff_ener_prune_legacy } },
 168 };
 169
 170 /*! Return a pointer to the kernel version to be executed at the current step. */
 171 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 172                                                        bool bDoEne, bool bDoPrune)
 173 {
 174     assert(kver < eNbnxnCuKNR);
 175     assert(eeltype < eelCuNR);
 176
 177     if (NBNXN_KVER_LEGACY(kver))
 178     {
 179         return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 180     }
 181     else
 182     {
 183         return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 184     }
 185 }
 186
 187 /*! Calculates the amount of shared memory required for kernel version in use. */
 188 static inline int calc_shmem_required(int kver)
 189 {
 190     int shmem;
 191
 192     /* size of shmem (force-buffers/xq/atom type preloading) */
 193     if (NBNXN_KVER_LEGACY(kver))
 194     {
 195         /* i-atom x+q in shared memory */
 196         shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 197         /* force reduction buffers in shared memory */
 198         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 199     }
 200     else
 201     {
 202         /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 203         /* i-atom x+q in shared memory */
 204         shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 205         /* cj in shared memory, for both warps separately */
 206         shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 207 #ifdef IATYPE_SHMEM
 208         /* i-atom types in shared memory */
 209         shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 210 #endif
 211 #if __CUDA_ARCH__ < 300
 212         /* force reduction buffers in shared memory */
 213         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 214 #endif
 215     }
 216
 217     return shmem;
 218 }
 219
 220 /*! As we execute nonbonded workload in separate streams, before launching
 221    the kernel we need to make sure that he following operations have completed:
 222    - atomdata allocation and related H2D transfers (every nstlist step);
 223    - pair list H2D transfer (every nstlist step);
 224    - shift vector H2D transfer (every nstlist step);
 225    - force (+shift force and energy) output clearing (every step).
 226
 227    These operations are issued in the local stream at the beginning of the step
 228    and therefore always complete before the local kernel launch. The non-local
 229    kernel is launched after the local on the same device/context, so this is
 230    inherently scheduled after the operations in the local stream (including the
 231    above "misc_ops").
 232    However, for the sake of having a future-proof implementation, we use the
 233    misc_ops_done event to record the point in time when the above  operations
 234    are finished and synchronize with this event in the non-local stream.
 235 */
 236 void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 237                               const nbnxn_atomdata_t *nbatom,
 238                               int flags,
 239                               int iloc)
 240 {
 241     cudaError_t stat;
 242     int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
 243     /* CUDA kernel launch-related stuff */
 244     int  shmem, nblock;
 245     dim3 dim_block, dim_grid;
 246     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 247
 248     cu_atomdata_t   *adat   = cu_nb->atdat;
 249     cu_nbparam_t    *nbp    = cu_nb->nbparam;
 250     cu_plist_t      *plist  = cu_nb->plist[iloc];
 251     cu_timers_t     *t      = cu_nb->timers;
 252     cudaStream_t    stream  = cu_nb->stream[iloc];
 253
 254     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 255     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 256     bool bDoTime     = cu_nb->bDoTime;
 257
 258     /* turn energy calculation always on/off (for debugging/testing only) */
 259     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 260
 261     /* don't launch the kernel if there is no work to do */
 262     if (plist->nsci == 0)
 263     {
 264         return;
 265     }
 266
 267     /* calculate the atom data index range based on locality */
 268     if (LOCAL_I(iloc))
 269     {
 270         adat_begin  = 0;
 271         adat_len    = adat->natoms_local;
 272     }
 273     else
 274     {
 275         adat_begin  = adat->natoms_local;
 276         adat_len    = adat->natoms - adat->natoms_local;
 277     }
 278
 279     /* When we get here all misc operations issues in the local stream are done,
 280        so we record that in the local stream and wait for it in the nonlocal one. */
 281     if (cu_nb->bUseTwoStreams)
 282     {
 283         if (iloc == eintLocal)
 284         {
 285             stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 286             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 287         }
 288         else
 289         {
 290             stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 291             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 292         }
 293     }
 294
 295     /* beginning of timed HtoD section */
 296     if (bDoTime)
 297     {
 298         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 299         CU_RET_ERR(stat, "cudaEventRecord failed");
 300     }
 301
 302     /* HtoD x, q */
 303     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 304                       adat_len * sizeof(*adat->xq), stream);
 305
 306     if (bDoTime)
 307     {
 308         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 309         CU_RET_ERR(stat, "cudaEventRecord failed");
 310     }
 311
 312     /* beginning of timed nonbonded calculation section */
 313     if (bDoTime)
 314     {
 315         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 316         CU_RET_ERR(stat, "cudaEventRecord failed");
 317     }
 318
 319     /* get the pointer to the kernel flavor we need to use */
 320     nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
 321                                     plist->bDoPrune || always_prune);
 322
 323     /* kernel launch config */
 324     nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 325     dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 326     dim_grid  = dim3(nblock, 1, 1);
 327     shmem     = calc_shmem_required(cu_nb->kernel_ver);
 328
 329     if (debug)
 330     {
 331         fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 332                 "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 333                 dim_block.x, dim_block.y, dim_block.z,
 334                 dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 335                 NCL_PER_SUPERCL, plist->na_c);
 336     }
 337
 338     nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 339     CU_LAUNCH_ERR("k_calc_nb");
 340
 341     if (bDoTime)
 342     {
 343         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 344         CU_RET_ERR(stat, "cudaEventRecord failed");
 345     }
 346 }
 347
 348 void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 349                                const nbnxn_atomdata_t *nbatom,
 350                                int flags,
 351                                int aloc)
 352 {
 353     cudaError_t stat;
 354     int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
 355     int iloc = -1;
 356
 357     /* determine interaction locality from atom locality */
 358     if (LOCAL_A(aloc))
 359     {
 360         iloc = eintLocal;
 361     }
 362     else if (NONLOCAL_A(aloc))
 363     {
 364         iloc = eintNonlocal;
 365     }
 366     else
 367     {
 368         char stmp[STRLEN];
 369         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 370                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 371         gmx_incons(stmp);
 372     }
 373
 374     cu_atomdata_t   *adat   = cu_nb->atdat;
 375     cu_timers_t     *t      = cu_nb->timers;
 376     bool            bDoTime = cu_nb->bDoTime;
 377     cudaStream_t    stream  = cu_nb->stream[iloc];
 378
 379     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 380     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 381
 382     /* don't launch copy-back if there was no work to do */
 383     if (cu_nb->plist[iloc]->nsci == 0)
 384     {
 385         return;
 386     }
 387
 388     /* calculate the atom data index range based on locality */
 389     if (LOCAL_A(aloc))
 390     {
 391         adat_begin  = 0;
 392         adat_len    = adat->natoms_local;
 393         adat_end    = cu_nb->atdat->natoms_local;
 394     }
 395     else
 396     {
 397         adat_begin  = adat->natoms_local;
 398         adat_len    = adat->natoms - adat->natoms_local;
 399         adat_end    = cu_nb->atdat->natoms;
 400     }
 401
 402     /* beginning of timed D2H section */
 403     if (bDoTime)
 404     {
 405         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 406         CU_RET_ERR(stat, "cudaEventRecord failed");
 407     }
 408
 409     if (!cu_nb->bUseStreamSync)
 410     {
 411         /* For safety reasons set a few (5%) forces to NaN. This way even if the
 412            polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 413         for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 414         {
 415 #ifdef NAN
 416             nbatom->out[0].f[i] = NAN;
 417 #else
 418 #  ifdef _MSVC
 419             if (numeric_limits<float>::has_quiet_NaN)
 420             {
 421                 nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 422             }
 423             else
 424 #  endif
 425             {
 426                 nbatom->out[0].f[i] = GMX_REAL_MAX;
 427             }
 428 #endif
 429         }
 430
 431         /* Set the last four bytes of the force array to a bit pattern
 432            which can't be the result of the force calculation:
 433            max exponent (127) and zero mantissa. */
 434         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 435     }
 436
 437     /* With DD the local D2H transfer can only start after the non-local
 438        has been launched. */
 439     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 440     {
 441         stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 442         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 443     }
 444
 445     /* DtoH f */
 446     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 447                       (adat_len)*sizeof(*adat->f), stream);
 448
 449     /* After the non-local D2H is launched the nonlocal_done event can be
 450        recorded which signals that the local D2H can proceed. This event is not
 451        placed after the non-local kernel because we first need the non-local
 452        data back first. */
 453     if (iloc == eintNonlocal)
 454     {
 455         stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 456         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 457     }
 458
 459     /* only transfer energies in the local stream */
 460     if (LOCAL_I(iloc))
 461     {
 462         /* DtoH fshift */
 463         if (bCalcFshift)
 464         {
 465             cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 466                               SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 467         }
 468
 469         /* DtoH energies */
 470         if (bCalcEner)
 471         {
 472             cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 473                               sizeof(*cu_nb->nbst.e_lj), stream);
 474             cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 475                               sizeof(*cu_nb->nbst.e_el), stream);
 476         }
 477     }
 478
 479     if (bDoTime)
 480     {
 481         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 482         CU_RET_ERR(stat, "cudaEventRecord failed");
 483     }
 484 }
 485
 486 /* Atomic compare-exchange operation on unsigned values. It is used in
 487  * polling wait for the GPU.
 488  */
 489 static inline bool atomic_cas(volatile unsigned int *ptr,
 490                               unsigned int oldval,
 491                               unsigned int newval)
 492 {
 493     assert(ptr);
 494
 495 #ifdef TMPI_ATOMICS
 496     return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 497 #else
 498     gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 499     return true;
 500 #endif
 501 }
 502
 503 void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 504                          const nbnxn_atomdata_t *nbatom,
 505                          int flags, int aloc,
 506                          float *e_lj, float *e_el, rvec *fshift)
 507 {
 508     cudaError_t stat;
 509     int i, adat_end, iloc = -1;
 510     volatile unsigned int *poll_word;
 511
 512     /* determine interaction locality from atom locality */
 513     if (LOCAL_A(aloc))
 514     {
 515         iloc = eintLocal;
 516     }
 517     else if (NONLOCAL_A(aloc))
 518     {
 519         iloc = eintNonlocal;
 520     }
 521     else
 522     {
 523         char stmp[STRLEN];
 524         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 525                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 526         gmx_incons(stmp);
 527     }
 528
 529     cu_plist_t      *plist   = cu_nb->plist[iloc];
 530     cu_timers_t     *timers  = cu_nb->timers;
 531     wallclock_gpu_t *timings = cu_nb->timings;
 532     nb_staging      nbst     = cu_nb->nbst;
 533
 534     bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
 535     bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 536
 537     /* turn energy calculation always on/off (for debugging/testing only) */
 538     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 539
 540     /* don't launch wait/update timers & counters if there was no work to do
 541
 542        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 543        counters could end up being inconsistent due to not being incremented
 544        on some of the nodes! */
 545     if (cu_nb->plist[iloc]->nsci == 0)
 546     {
 547         return;
 548     }
 549
 550     /* calculate the atom data index range based on locality */
 551     if (LOCAL_A(aloc))
 552     {
 553         adat_end = cu_nb->atdat->natoms_local;
 554     }
 555     else
 556     {
 557         adat_end = cu_nb->atdat->natoms;
 558     }
 559
 560     if (cu_nb->bUseStreamSync)
 561     {
 562         stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 563         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 564     }
 565     else
 566     {
 567         /* Busy-wait until we get the signal pattern set in last byte
 568          * of the l/nl float vector. This pattern corresponds to a floating
 569          * point number which can't be the result of the force calculation
 570          * (maximum, 127 exponent and 0 mantissa).
 571          * The polling uses atomic compare-exchange.
 572          */
 573         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 574         while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 575     }
 576
 577     /* timing data accumulation */
 578     if (cu_nb->bDoTime)
 579     {
 580         /* only increase counter once (at local F wait) */
 581         if (LOCAL_I(iloc))
 582         {
 583             timings->nb_c++;
 584             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 585         }
 586
 587         /* kernel timings */
 588         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 589             cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 590
 591         /* X/q H2D and F D2H timings */
 592         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 593                                                  timers->stop_nb_h2d[iloc]);
 594         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 595                                                  timers->stop_nb_d2h[iloc]);
 596
 597         /* only count atdat and pair-list H2D at pair-search step */
 598         if (plist->bDoPrune)
 599         {
 600             /* atdat transfer timing (add only once, at local F wait) */
 601             if (LOCAL_A(aloc))
 602             {
 603                 timings->pl_h2d_c++;
 604                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 605                                                          timers->stop_atdat);
 606             }
 607
 608             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 609                                                      timers->stop_pl_h2d[iloc]);
 610         }
 611     }
 612
 613     /* add up energies and shift forces (only once at local F wait) */
 614     if (LOCAL_I(iloc))
 615     {
 616         if (bCalcEner)
 617         {
 618             *e_lj += *nbst.e_lj;
 619             *e_el += *nbst.e_el;
 620         }
 621
 622         if (bCalcFshift)
 623         {
 624             for (i = 0; i < SHIFTS; i++)
 625             {
 626                 fshift[i][0] += nbst.fshift[i].x;
 627                 fshift[i][1] += nbst.fshift[i].y;
 628                 fshift[i][2] += nbst.fshift[i].z;
 629             }
 630         }
 631     }
 632
 633     /* turn off pruning (doesn't matter if this is pair-search step or not) */
 634     plist->bDoPrune = false;
 635 }
 636
 637 /*! Return the reference to the nbfp texture. */
 638 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 639 {
 640     return tex_nbfp;
 641 }
 642
 643 /*! Return the reference to the coulomb_tab. */
 644 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 645 {
 646     return tex_coulomb_tab;
 647 }
 648
 649 /*! Set up the cache configuration for the non-bonded kernels,
 650  */
 651 void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 652 {
 653     cudaError_t stat;
 654
 655     for (int i = 0; i < eelCuNR; i++)
 656         for (int j = 0; j < nEnergyKernelTypes; j++)
 657             for (int k = 0; k < nPruneKernelTypes; k++)
 658             {
 659                 /* Legacy kernel 16/48 kB Shared/L1 */
 660                 stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 661                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 662
 663                 if (devinfo->prop.major >= 3)
 664                 {
 665                     /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 666                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
 667                 }
 668                 else
 669                 {
 670                     /* On Fermi prefer L1 gives 2% higher performance */
 671                     /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 672                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 673                 }
 674                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 675             }
 676 }