/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This source code is part of
 *
 * GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 */
43 #include "types/simple.h"
44 #include "types/nbnxn_pairlist.h"
45 #include "types/nb_verlet.h"
46 #include "types/ishift.h"
47 #include "types/force_flags.h"
48 #include "../nbnxn_consts.h"
51 #include "thread_mpi/atomic.h"
54 #include "nbnxn_cuda_types.h"
55 #include "../../gmxlib/cuda_tools/cudautils.cuh"
56 #include "nbnxn_cuda.h"
57 #include "nbnxn_cuda_data_mgmt.h"
/*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp */
texture<float, 1, cudaReadModeElementType> tex_nbfp;

/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
/* Convenience defines */
#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
#define CL_SIZE         (NBNXN_GPU_CLUSTER_SIZE)

/***** The kernels come here *****/
#include "nbnxn_cuda_kernel_utils.cuh"
/* Generate all combinations of kernels through multiple inclusion:
   F, F + E, F + prune, F + E + prune. */
/** Force only **/
#include "nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
#include "nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES

/*** Pair-list pruning kernels ***/
/** Force only **/
#define PRUNE_NBL
#include "nbnxn_cuda_kernels.cuh"
/** Force & energy **/
#define CALC_ENERGIES
#include "nbnxn_cuda_kernels.cuh"
#undef CALC_ENERGIES
#undef PRUNE_NBL
/*! Nonbonded kernel function pointer type */
typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
                                     const cu_nbparam_t,
                                     const cu_plist_t,
                                     bool);
/*********************************/

/* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
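/* Illustrative usage of the toggles above (a sketch, not part of this file):
   forcing the force+energy kernel flavor on every step, e.g. for
   benchmarking; the -deffnm argument is just an example invocation:

       GMX_GPU_ALWAYS_ENER=1 mdrun -deffnm bench
 */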
/* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 * and corresponds to having the exponent set to the maximum (127 -- single
 * precision) and the mantissa to 0.
 */
static unsigned int poll_wait_pattern = (0x7FU << 23);
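/* Worked example (an illustrative sketch, not part of the original code): the
   IEEE-754 single-precision breakdown of the pattern above. The variable
   names below are hypothetical.

       unsigned int p        = (0x7FU << 23);    // 0x3F800000
       unsigned int sign     = p >> 31;          // 0
       unsigned int exponent = (p >> 23) & 0xFF; // 0x7F == 127
       unsigned int mantissa = p & 0x7FFFFF;     // 0

   The polling wait below compares the last force element through this integer
   view, so only the bit pattern matters, never the float interpretation. */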
/*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
{
    int max_grid_x_size;

    assert(dinfo);

    max_grid_x_size = dinfo->prop.maxGridSize[0];

    /* do we exceed the grid x dimension limit? */
    if (nwork_units > max_grid_x_size)
    {
        gmx_fatal(FARGS, "Watch out, the system is too large to simulate!\n"
                  "The number of nonbonded work units (=number of super-clusters) exceeds the "
                  "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
    }

    return nwork_units;
}
/* Constant arrays listing all kernel function pointers and enabling selection
   of a kernel in an elegant manner. */

static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */

/* Default kernels */
static const nbnxn_cu_kfunc_ptr_t
nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
{
    { { k_nbnxn_ewald,           k_nbnxn_ewald_prune },
      { k_nbnxn_ewald_ener,      k_nbnxn_ewald_ener_prune } },
    { { k_nbnxn_ewald_twin,      k_nbnxn_ewald_twin_prune },
      { k_nbnxn_ewald_twin_ener, k_nbnxn_ewald_twin_ener_prune } },
    { { k_nbnxn_rf,              k_nbnxn_rf_prune },
      { k_nbnxn_rf_ener,         k_nbnxn_rf_ener_prune } },
    { { k_nbnxn_cutoff,          k_nbnxn_cutoff_prune },
      { k_nbnxn_cutoff_ener,     k_nbnxn_cutoff_ener_prune } },
};

/* Legacy kernels */
static const nbnxn_cu_kfunc_ptr_t
nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
{
    { { k_nbnxn_ewald_legacy,           k_nbnxn_ewald_prune_legacy },
      { k_nbnxn_ewald_ener_legacy,      k_nbnxn_ewald_ener_prune_legacy } },
    { { k_nbnxn_ewald_twin_legacy,      k_nbnxn_ewald_twin_prune_legacy },
      { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
    { { k_nbnxn_rf_legacy,              k_nbnxn_rf_prune_legacy },
      { k_nbnxn_rf_ener_legacy,         k_nbnxn_rf_ener_prune_legacy } },
    { { k_nbnxn_cutoff_legacy,          k_nbnxn_cutoff_prune_legacy },
      { k_nbnxn_cutoff_ener_legacy,     k_nbnxn_cutoff_ener_prune_legacy } },
};
/*! Return a pointer to the kernel version to be executed at the current step. */
static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
                                                       bool bDoEne, bool bDoPrune)
{
    assert(kver < eNbnxnCuKNR);
    assert(eeltype < eelCuNR);

    if (NBNXN_KVER_LEGACY(kver))
    {
        return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
    }
    else
    {
        return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
    }
}
/*! Calculates the amount of shared memory required for the kernel version in use. */
static inline int calc_shmem_required(int kver)
{
    int shmem;

    /* size of shmem (force-buffers/xq/atom type preloading) */
    if (NBNXN_KVER_LEGACY(kver))
    {
        /* i-atom x+q in shared memory */
        shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
        /* force reduction buffers in shared memory */
        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
    }
    else
    {
        /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
        /* i-atom x+q in shared memory */
        shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
        /* cj in shared memory, for both warps separately */
        shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
        /* i-atom types in shared memory */
        shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
#if __CUDA_ARCH__ < 300
        /* force reduction buffers in shared memory */
        shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
#endif
    }

    return shmem;
}
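/* Worked example (illustrative only, assuming the constants shipped in
   nbnxn_consts.h: NCL_PER_SUPERCL == 8, CL_SIZE == 8,
   NBNXN_GPU_JGROUP_SIZE == 4):

       legacy kernel:      8*8*16 + 8*8*3*4                = 1792 bytes
       default, sm_2.x:    8*8*16 + 2*4*4 + 8*8*4 + 8*8*3*4 = 2080 bytes
       default, sm >= 3.0: 8*8*16 + 2*4*4 + 8*8*4           = 1312 bytes

   These figures are estimates for illustration; the authoritative values are
   whatever calc_shmem_required() computes for the compiled constants. */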
/*! As we execute nonbonded workload in separate streams, before launching
   the kernel we need to make sure that the following operations have completed:
   - atomdata allocation and related H2D transfers (every nstlist step);
   - pair list H2D transfer (every nstlist step);
   - shift vector H2D transfer (every nstlist step);
   - force (+shift force and energy) output clearing (every step).

   These operations are issued in the local stream at the beginning of the step
   and therefore always complete before the local kernel launch. The non-local
   kernel is launched after the local one on the same device/context, so it is
   inherently scheduled after the operations in the local stream (including the
   above "misc_ops").
   However, for the sake of having a future-proof implementation, we use the
   misc_ops_done event to record the point in time when the above operations
   are finished and synchronize with this event in the non-local stream.
 */
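/* The idiom described above boils down to two calls (a sketch only; the
   stream names are placeholders, while the event is the misc_ops_done member
   used below):

       cudaEventRecord(misc_ops_done, local_stream);            // in the local stream
       cudaStreamWaitEvent(nonlocal_stream, misc_ops_done, 0);  // in the nonlocal one
 */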
void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
                              const nbnxn_atomdata_t *nbatom,
                              int flags, int iloc)
{
    cudaError_t stat;
    int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
    /* CUDA kernel launch-related stuff */
    int nblock;
    int shmem;
    dim3 dim_block, dim_grid;
    nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */

    cu_atomdata_t *adat   = cu_nb->atdat;
    cu_nbparam_t  *nbp    = cu_nb->nbparam;
    cu_plist_t    *plist  = cu_nb->plist[iloc];
    cu_timers_t   *t      = cu_nb->timers;
    cudaStream_t   stream = cu_nb->stream[iloc];

    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
    bool bDoTime     = cu_nb->bDoTime;

    /* turn energy calculation always on/off (for debugging/testing only) */
    bCalcEner = (bCalcEner || always_ener) && !never_ener;

    /* don't launch the kernel if there is no work to do */
    if (plist->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_I(iloc))
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
    }

    /* When we get here all misc operations issued in the local stream are done,
       so we record that in the local stream and wait for it in the nonlocal one. */
    if (cu_nb->bUseTwoStreams)
    {
        if (iloc == eintLocal)
        {
            stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
            CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
        }
        else
        {
            stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
            CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
        }
    }

    /* beginning of timed HtoD section */
    if (bDoTime)
    {
        stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    /* HtoD x, q */
    cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
                      adat_len * sizeof(*adat->xq), stream);
    if (bDoTime)
    {
        stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    /* beginning of timed nonbonded calculation section */
    if (bDoTime)
    {
        stat = cudaEventRecord(t->start_nb_k[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    /* get the pointer to the kernel flavor we need to use */
    nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
                                    plist->bDoPrune || always_prune);

    /* kernel launch config */
    nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
    dim_block = dim3(CL_SIZE, CL_SIZE, 1);
    dim_grid  = dim3(nblock, 1, 1);
    shmem     = calc_shmem_required(cu_nb->kernel_ver);

    if (debug)
    {
        fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
                "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
                dim_block.x, dim_block.y, dim_block.z,
                dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
                NCL_PER_SUPERCL, plist->na_c);
    }

    nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
    CU_LAUNCH_ERR("k_calc_nb");

    if (bDoTime)
    {
        stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }
}
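/* Illustrative call order with domain decomposition (a sketch; the driver
   code invoking these launches lives outside this file):

       nbnxn_cuda_launch_kernel(cu_nb, nbatom, flags, eintLocal);
       nbnxn_cuda_launch_kernel(cu_nb, nbatom, flags, eintNonlocal);

   Both launches are asynchronous; ordering across the two streams is enforced
   only through the misc_ops_done event recorded above and the nonlocal_done
   event used in the copy-back path below. */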
void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
                               const nbnxn_atomdata_t *nbatom,
                               int flags, int aloc)
{
    cudaError_t stat;
    int adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
    int iloc = -1;

    /* determine interaction locality from atom locality */
    if (LOCAL_A(aloc))
    {
        iloc = eintLocal;
    }
    else if (NONLOCAL_A(aloc))
    {
        iloc = eintNonlocal;
    }
    else
    {
        char stmp[STRLEN];
        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
        gmx_incons(stmp);
    }

    cu_atomdata_t *adat   = cu_nb->atdat;
    cu_timers_t   *t      = cu_nb->timers;
    bool           bDoTime = cu_nb->bDoTime;
    cudaStream_t   stream  = cu_nb->stream[iloc];

    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;

    /* don't launch copy-back if there was no work to do */
    if (cu_nb->plist[iloc]->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_A(aloc))
    {
        adat_begin = 0;
        adat_len   = adat->natoms_local;
        adat_end   = cu_nb->atdat->natoms_local;
    }
    else
    {
        adat_begin = adat->natoms_local;
        adat_len   = adat->natoms - adat->natoms_local;
        adat_end   = cu_nb->atdat->natoms;
    }

    /* beginning of timed D2H section */
    if (bDoTime)
    {
        stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }

    if (!cu_nb->bUseStreamSync)
    {
        /* For safety reasons set a few (5%) forces to NaN. This way even if the
           polling "hack" fails with some future NVIDIA driver we'll get a crash. */
        for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
        {
#ifdef NAN
            nbatom->out[0].f[i] = NAN;
#else
#ifdef _MSVC
            if (numeric_limits<float>::has_quiet_NaN)
            {
                nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
            }
            else
#endif
            {
                nbatom->out[0].f[i] = GMX_REAL_MAX;
            }
#endif
        }

        /* Set the last four bytes of the force array to a bit pattern
           which can't be the result of the force calculation:
           max exponent (127) and zero mantissa. */
        *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
    }

    /* With DD the local D2H transfer can only start after the non-local
       has been launched. */
    if (iloc == eintLocal && cu_nb->bUseTwoStreams)
    {
        stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
        CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
    }

    /* DtoH f */
    cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
                      (adat_len)*sizeof(*adat->f), stream);

    /* After the non-local D2H is launched the nonlocal_done event can be
       recorded which signals that the local D2H can proceed. This event is not
       placed after the non-local kernel because we first need the non-local
       data back. */
    if (iloc == eintNonlocal)
    {
        stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
        CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
    }

    /* only transfer energies in the local stream */
    if (LOCAL_I(iloc))
    {
        /* DtoH fshift */
        if (bCalcFshift)
        {
            cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
                              SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
        }

        /* DtoH energies */
        if (bCalcEner)
        {
            cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
                              sizeof(*cu_nb->nbst.e_lj), stream);
            cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
                              sizeof(*cu_nb->nbst.e_el), stream);
        }
    }

    if (bDoTime)
    {
        stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
        CU_RET_ERR(stat, "cudaEventRecord failed");
    }
}
/* Atomic compare-exchange operation on unsigned values. It is used in
 * polling wait for the GPU.
 */
static inline bool atomic_cas(volatile unsigned int *ptr,
                              unsigned int oldval,
                              unsigned int newval)
{
    assert(ptr);

#ifdef TMPI_ATOMICS
    return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
#else
    gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
    return true;
#endif
}
void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
                         const nbnxn_atomdata_t *nbatom,
                         int flags, int aloc,
                         float *e_lj, float *e_el, rvec *fshift)
{
    cudaError_t stat;
    int i, adat_end, iloc = -1;
    volatile unsigned int *poll_word;

    /* determine interaction locality from atom locality */
    if (LOCAL_A(aloc))
    {
        iloc = eintLocal;
    }
    else if (NONLOCAL_A(aloc))
    {
        iloc = eintNonlocal;
    }
    else
    {
        char stmp[STRLEN];
        sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
                "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
        gmx_incons(stmp);
    }

    cu_plist_t      *plist   = cu_nb->plist[iloc];
    cu_timers_t     *timers  = cu_nb->timers;
    wallclock_gpu_t *timings = cu_nb->timings;
    nb_staging       nbst    = cu_nb->nbst;

    bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
    bool bCalcFshift = flags & GMX_FORCE_VIRIAL;

    /* turn energy calculation always on/off (for debugging/testing only) */
    bCalcEner = (bCalcEner || always_ener) && !never_ener;

    /* don't launch wait/update timers & counters if there was no work to do

       NOTE: if timing with multiple GPUs (streams) becomes possible, the
       counters could end up being inconsistent due to not being incremented
       on some of the nodes! */
    if (cu_nb->plist[iloc]->nsci == 0)
    {
        return;
    }

    /* calculate the atom data index range based on locality */
    if (LOCAL_A(aloc))
    {
        adat_end = cu_nb->atdat->natoms_local;
    }
    else
    {
        adat_end = cu_nb->atdat->natoms;
    }

    if (cu_nb->bUseStreamSync)
    {
        stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
        CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
    }
    else
    {
        /* Busy-wait until we get the signal pattern set in the last 4 bytes
         * of the l/nl float vector. This pattern corresponds to a floating
         * point number which can't be the result of the force calculation
         * (maximum (127) exponent and 0 mantissa).
         * The polling uses atomic compare-exchange.
         */
        poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
        while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
    }

    /* timing data accumulation */
    if (cu_nb->bDoTime)
    {
        /* only increase counter once (at local F wait) */
        if (LOCAL_I(iloc))
        {
            timings->nb_c++;
            timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
        }

        /* kernel timings */
        timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
            cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);

        /* X/q H2D and F D2H timings */
        timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
                                              timers->stop_nb_h2d[iloc]);
        timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
                                              timers->stop_nb_d2h[iloc]);

        /* only count atdat and pair-list H2D at pair-search step */
        if (plist->bDoPrune)
        {
            /* atdat transfer timing (add only once, at local F wait) */
            if (LOCAL_A(aloc))
            {
                timings->pl_h2d_c++;
                timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
                                                      timers->stop_atdat);
            }

            timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
                                                  timers->stop_pl_h2d[iloc]);
        }
    }

    /* add up energies and shift forces (only once at local F wait) */
    if (LOCAL_I(iloc))
    {
        if (bCalcEner)
        {
            *e_lj += *nbst.e_lj;
            *e_el += *nbst.e_el;
        }

        if (bCalcFshift)
        {
            for (i = 0; i < SHIFTS; i++)
            {
                fshift[i][0] += nbst.fshift[i].x;
                fshift[i][1] += nbst.fshift[i].y;
                fshift[i][2] += nbst.fshift[i].z;
            }
        }
    }

    /* turn off pruning (doesn't matter if this is pair-search step or not) */
    plist->bDoPrune = false;
}
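/* Putting the three entry points together for a local-only step (an
   illustrative sketch; cu_nb, nbatom, flags, and the output variables are
   assumed to be set up by the caller, which is not part of this file):

       nbnxn_cuda_launch_kernel (cu_nb, nbatom, flags, eintLocal);
       nbnxn_cuda_launch_cpyback(cu_nb, nbatom, flags, eatLocal);
       ... (independent CPU work can overlap here) ...
       nbnxn_cuda_wait_gpu(cu_nb, nbatom, flags, eatLocal, &e_lj, &e_el, fshift);
 */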
/*! Return the reference to the nbfp texture. */
const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
{
    return tex_nbfp;
}

/*! Return the reference to the coulomb_tab texture. */
const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
{
    return tex_coulomb_tab;
}
/*! Set up the cache configuration for the non-bonded kernels.
 */
void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
{
    cudaError_t stat;

    for (int i = 0; i < eelCuNR; i++)
    {
        for (int j = 0; j < nEnergyKernelTypes; j++)
        {
            for (int k = 0; k < nPruneKernelTypes; k++)
            {
                /* Legacy kernel 16/48 kB Shared/L1 */
                stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");

                if (devinfo->prop.major >= 3)
                {
                    /* Default kernel on sm_3.x 48/16 kB Shared/L1 */
                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
                }
                else
                {
                    /* On Fermi prefer L1 gives 2% higher performance */
                    /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
                }
                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
            }
        }
    }
}