src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2012, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14  *
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  34  */
  35
  36 #include <stdlib.h>
  37 #include <assert.h>
  38
  39 #if defined(_MSVC)
  40 #include <limits>
  41 #endif
  42
  43 #include <cuda.h>
  44
  45 #include "types/simple.h"
  46 #include "types/nbnxn_pairlist.h"
  47 #include "types/nb_verlet.h"
  48 #include "types/ishift.h"
  49 #include "types/force_flags.h"
  50 #include "../nbnxn_consts.h"
  51
  52 #ifdef TMPI_ATOMICS
  53 #include "thread_mpi/atomic.h"
  54 #endif
  55
  56 #include "nbnxn_cuda_types.h"
  57 #include "../../gmxlib/cuda_tools/cudautils.cuh"
  58 #include "nbnxn_cuda.h"
  59 #include "nbnxn_cuda_data_mgmt.h"
  60
  61
  62 /*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
  63 texture<float, 1, cudaReadModeElementType> tex_nbfp;
  64
  65 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  66 texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
  67
  68 /* Convenience defines */
  69 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  70 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  71
  72 /***** The kernels come here *****/
  73 #include "nbnxn_cuda_kernel_utils.cuh"
  74
  75 /* Generate all combinations of kernels through multiple inclusion:
  76    F, F + E, F + prune, F + E + prune. */
  77 /** Force only **/
  78 #include "nbnxn_cuda_kernels.cuh"
  79 /** Force & energy **/
  80 #define CALC_ENERGIES
  81 #include "nbnxn_cuda_kernels.cuh"
  82 #undef CALC_ENERGIES
  83
  84 /*** Pair-list pruning kernels ***/
  85 /** Force only **/
  86 #define PRUNE_NBL
  87 #include "nbnxn_cuda_kernels.cuh"
  88 /** Force & energy **/
  89 #define CALC_ENERGIES
  90 #include "nbnxn_cuda_kernels.cuh"
  91 #undef CALC_ENERGIES
  92 #undef PRUNE_NBL
  93
  94 /*! Nonbonded kernel function pointer type */
  95 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
  96                                      const cu_nbparam_t,
  97                                      const cu_plist_t,
  98                                      bool);
  99
 100 /*********************************/
 101
 102 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 103 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 104 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 105 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 106
 107
 108 /* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 109  * and corresponds to having the exponent set to the maximum (127 -- single
 110  * precision) and the mantissa to 0.
 111  */
 112 static unsigned int poll_wait_pattern = (0x7FU << 23);
 113
 114 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 115 static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 116 {
 117     int max_grid_x_size;
 118
 119     assert(dinfo);
 120
 121     max_grid_x_size = dinfo->prop.maxGridSize[0];
 122
 123     /* do we exceed the grid x dimension limit? */
 124     if (nwork_units > max_grid_x_size)
 125     {
 126         gmx_fatal(FARGS, "Watch out system too large to simulate!\n"
 127                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 128                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 129     }
 130
 131     return nwork_units;
 132 }
 133
 134
 135 /* Constant arrays listing all kernel function pointers and enabling selection
 136    of a kernel in an elegant manner. */
 137
 138 static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 139 static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 140
 141 /* Default kernels */
 142 static const nbnxn_cu_kfunc_ptr_t
 143 nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 144 {
 145     { { k_nbnxn_ewald,              k_nbnxn_ewald_prune },
 146       { k_nbnxn_ewald_ener,         k_nbnxn_ewald_ener_prune } },
 147     { { k_nbnxn_ewald_twin,         k_nbnxn_ewald_twin_prune },
 148       { k_nbnxn_ewald_twin_ener,    k_nbnxn_ewald_twin_ener_prune } },
 149     { { k_nbnxn_rf,                 k_nbnxn_rf_prune },
 150       { k_nbnxn_rf_ener,            k_nbnxn_rf_ener_prune } },
 151     { { k_nbnxn_cutoff,             k_nbnxn_cutoff_prune },
 152       { k_nbnxn_cutoff_ener,        k_nbnxn_cutoff_ener_prune } },
 153 };
 154
 155 /* Legacy kernels */
 156 static const nbnxn_cu_kfunc_ptr_t
 157 nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 158 {
 159     { { k_nbnxn_ewald_legacy,           k_nbnxn_ewald_prune_legacy },
 160       { k_nbnxn_ewald_ener_legacy,      k_nbnxn_ewald_ener_prune_legacy } },
 161     { { k_nbnxn_ewald_twin_legacy,      k_nbnxn_ewald_twin_prune_legacy },
 162       { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
 163     { { k_nbnxn_rf_legacy,              k_nbnxn_rf_prune_legacy },
 164       { k_nbnxn_rf_ener_legacy,         k_nbnxn_rf_ener_prune_legacy } },
 165     { { k_nbnxn_cutoff_legacy,          k_nbnxn_cutoff_prune_legacy },
 166       { k_nbnxn_cutoff_ener_legacy,     k_nbnxn_cutoff_ener_prune_legacy } },
 167 };
 168
 169 /*! Return a pointer to the kernel version to be executed at the current step. */
 170 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 171                                                        bool bDoEne, bool bDoPrune)
 172 {
 173     assert(kver < eNbnxnCuKNR);
 174     assert(eeltype < eelCuNR);
 175
 176     if (NBNXN_KVER_LEGACY(kver))
 177     {
 178         return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 179     }
 180     else
 181     {
 182         return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 183     }
 184 }
 185
 186 /*! Calculates the amount of shared memory required for kernel version in use. */
 187 static inline int calc_shmem_required(int kver)
 188 {
 189     int shmem;
 190
 191     /* size of shmem (force-buffers/xq/atom type preloading) */
 192     if (NBNXN_KVER_LEGACY(kver))
 193     {
 194         /* i-atom x+q in shared memory */
 195         shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 196         /* force reduction buffers in shared memory */
 197         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 198     }
 199     else
 200     {
 201         /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 202         /* i-atom x+q in shared memory */
 203         shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 204         /* cj in shared memory, for both warps separately */
 205         shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 206 #ifdef IATYPE_SHMEM
 207         /* i-atom types in shared memory */
 208         shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 209 #endif
 210 #if __CUDA_ARCH__ < 300
 211         /* force reduction buffers in shared memory */
 212         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 213 #endif
 214     }
 215
 216     return shmem;
 217 }
 218
 219 /*! As we execute nonbonded workload in separate streams, before launching
 220    the kernel we need to make sure that he following operations have completed:
 221    - atomdata allocation and related H2D transfers (every nstlist step);
 222    - pair list H2D transfer (every nstlist step);
 223    - shift vector H2D transfer (every nstlist step);
 224    - force (+shift force and energy) output clearing (every step).
 225
 226    These operations are issued in the local stream at the beginning of the step
 227    and therefore always complete before the local kernel launch. The non-local
 228    kernel is launched after the local on the same device/context, so this is
 229    inherently scheduled after the operations in the local stream (including the
 230    above "misc_ops").
 231    However, for the sake of having a future-proof implementation, we use the
 232    misc_ops_done event to record the point in time when the above  operations
 233    are finished and synchronize with this event in the non-local stream.
 234 */
 235 void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 236                               const nbnxn_atomdata_t *nbatom,
 237                               int flags,
 238                               int iloc)
 239 {
 240     cudaError_t stat;
 241     int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
 242     /* CUDA kernel launch-related stuff */
 243     int  shmem, nblock;
 244     dim3 dim_block, dim_grid;
 245     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 246
 247     cu_atomdata_t   *adat   = cu_nb->atdat;
 248     cu_nbparam_t    *nbp    = cu_nb->nbparam;
 249     cu_plist_t      *plist  = cu_nb->plist[iloc];
 250     cu_timers_t     *t      = cu_nb->timers;
 251     cudaStream_t    stream  = cu_nb->stream[iloc];
 252
 253     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 254     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 255     bool bDoTime     = cu_nb->bDoTime;
 256
 257     /* turn energy calculation always on/off (for debugging/testing only) */
 258     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 259
 260     /* don't launch the kernel if there is no work to do */
 261     if (plist->nsci == 0)
 262     {
 263         return;
 264     }
 265
 266     /* calculate the atom data index range based on locality */
 267     if (LOCAL_I(iloc))
 268     {
 269         adat_begin  = 0;
 270         adat_len    = adat->natoms_local;
 271     }
 272     else
 273     {
 274         adat_begin  = adat->natoms_local;
 275         adat_len    = adat->natoms - adat->natoms_local;
 276     }
 277
 278     /* When we get here all misc operations issues in the local stream are done,
 279        so we record that in the local stream and wait for it in the nonlocal one. */
 280     if (cu_nb->bUseTwoStreams)
 281     {
 282         if (iloc == eintLocal)
 283         {
 284             stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 285             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 286         }
 287         else
 288         {
 289             stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 290             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 291         }
 292     }
 293
 294     /* beginning of timed HtoD section */
 295     if (bDoTime)
 296     {
 297         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 298         CU_RET_ERR(stat, "cudaEventRecord failed");
 299     }
 300
 301     /* HtoD x, q */
 302     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 303                       adat_len * sizeof(*adat->xq), stream);
 304
 305     if (bDoTime)
 306     {
 307         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 308         CU_RET_ERR(stat, "cudaEventRecord failed");
 309     }
 310
 311     /* beginning of timed nonbonded calculation section */
 312     if (bDoTime)
 313     {
 314         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 315         CU_RET_ERR(stat, "cudaEventRecord failed");
 316     }
 317
 318     /* get the pointer to the kernel flavor we need to use */
 319     nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
 320                                     plist->bDoPrune || always_prune);
 321
 322     /* kernel launch config */
 323     nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 324     dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 325     dim_grid  = dim3(nblock, 1, 1);
 326     shmem     = calc_shmem_required(cu_nb->kernel_ver);
 327
 328     if (debug)
 329     {
 330         fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 331                 "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 332                 dim_block.x, dim_block.y, dim_block.z,
 333                 dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 334                 NCL_PER_SUPERCL, plist->na_c);
 335     }
 336
 337     nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 338     CU_LAUNCH_ERR("k_calc_nb");
 339
 340     if (bDoTime)
 341     {
 342         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 343         CU_RET_ERR(stat, "cudaEventRecord failed");
 344     }
 345 }
 346
 347 void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 348                                const nbnxn_atomdata_t *nbatom,
 349                                int flags,
 350                                int aloc)
 351 {
 352     cudaError_t stat;
 353     int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
 354     int iloc = -1;
 355
 356     /* determine interaction locality from atom locality */
 357     if (LOCAL_A(aloc))
 358     {
 359         iloc = eintLocal;
 360     }
 361     else if (NONLOCAL_A(aloc))
 362     {
 363         iloc = eintNonlocal;
 364     }
 365     else
 366     {
 367         char stmp[STRLEN];
 368         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 369                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 370         gmx_incons(stmp);
 371     }
 372
 373     cu_atomdata_t   *adat   = cu_nb->atdat;
 374     cu_timers_t     *t      = cu_nb->timers;
 375     bool            bDoTime = cu_nb->bDoTime;
 376     cudaStream_t    stream  = cu_nb->stream[iloc];
 377
 378     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 379     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 380
 381     /* don't launch copy-back if there was no work to do */
 382     if (cu_nb->plist[iloc]->nsci == 0)
 383     {
 384         return;
 385     }
 386
 387     /* calculate the atom data index range based on locality */
 388     if (LOCAL_A(aloc))
 389     {
 390         adat_begin  = 0;
 391         adat_len    = adat->natoms_local;
 392         adat_end    = cu_nb->atdat->natoms_local;
 393     }
 394     else
 395     {
 396         adat_begin  = adat->natoms_local;
 397         adat_len    = adat->natoms - adat->natoms_local;
 398         adat_end    = cu_nb->atdat->natoms;
 399     }
 400
 401     /* beginning of timed D2H section */
 402     if (bDoTime)
 403     {
 404         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 405         CU_RET_ERR(stat, "cudaEventRecord failed");
 406     }
 407
 408     if (!cu_nb->bUseStreamSync)
 409     {
 410         /* For safety reasons set a few (5%) forces to NaN. This way even if the
 411            polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 412         for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 413         {
 414 #ifdef NAN
 415             nbatom->out[0].f[i] = NAN;
 416 #else
 417 #  ifdef _MSVC
 418             if (numeric_limits<float>::has_quiet_NaN)
 419             {
 420                 nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 421             }
 422             else
 423 #  endif
 424             {
 425                 nbatom->out[0].f[i] = GMX_REAL_MAX;
 426             }
 427 #endif
 428         }
 429
 430         /* Set the last four bytes of the force array to a bit pattern
 431            which can't be the result of the force calculation:
 432            max exponent (127) and zero mantissa. */
 433         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 434     }
 435
 436     /* With DD the local D2H transfer can only start after the non-local
 437        has been launched. */
 438     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 439     {
 440         stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 441         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 442     }
 443
 444     /* DtoH f */
 445     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 446                       (adat_len)*sizeof(*adat->f), stream);
 447
 448     /* After the non-local D2H is launched the nonlocal_done event can be
 449        recorded which signals that the local D2H can proceed. This event is not
 450        placed after the non-local kernel because we first need the non-local
 451        data back first. */
 452     if (iloc == eintNonlocal)
 453     {
 454         stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 455         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 456     }
 457
 458     /* only transfer energies in the local stream */
 459     if (LOCAL_I(iloc))
 460     {
 461         /* DtoH fshift */
 462         if (bCalcFshift)
 463         {
 464             cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 465                               SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 466         }
 467
 468         /* DtoH energies */
 469         if (bCalcEner)
 470         {
 471             cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 472                               sizeof(*cu_nb->nbst.e_lj), stream);
 473             cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 474                               sizeof(*cu_nb->nbst.e_el), stream);
 475         }
 476     }
 477
 478     if (bDoTime)
 479     {
 480         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 481         CU_RET_ERR(stat, "cudaEventRecord failed");
 482     }
 483 }
 484
 485 /* Atomic compare-exchange operation on unsigned values. It is used in
 486  * polling wait for the GPU.
 487  */
 488 static inline bool atomic_cas(volatile unsigned int *ptr,
 489                               unsigned int oldval,
 490                               unsigned int newval)
 491 {
 492     assert(ptr);
 493
 494 #ifdef TMPI_ATOMICS
 495     return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 496 #else
 497     gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 498     return true;
 499 #endif
 500 }
 501
 502 void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 503                          const nbnxn_atomdata_t *nbatom,
 504                          int flags, int aloc,
 505                          real *e_lj, real *e_el, rvec *fshift)
 506 {
 507     /* NOTE:  only implemented for single-precision at this time */
 508     cudaError_t stat;
 509     int i, adat_end, iloc = -1;
 510     volatile unsigned int *poll_word;
 511
 512     /* determine interaction locality from atom locality */
 513     if (LOCAL_A(aloc))
 514     {
 515         iloc = eintLocal;
 516     }
 517     else if (NONLOCAL_A(aloc))
 518     {
 519         iloc = eintNonlocal;
 520     }
 521     else
 522     {
 523         char stmp[STRLEN];
 524         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 525                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 526         gmx_incons(stmp);
 527     }
 528
 529     cu_plist_t      *plist   = cu_nb->plist[iloc];
 530     cu_timers_t     *timers  = cu_nb->timers;
 531     wallclock_gpu_t *timings = cu_nb->timings;
 532     nb_staging      nbst     = cu_nb->nbst;
 533
 534     bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
 535     bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 536
 537     /* turn energy calculation always on/off (for debugging/testing only) */
 538     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 539
 540     /* don't launch wait/update timers & counters if there was no work to do
 541
 542        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 543        counters could end up being inconsistent due to not being incremented
 544        on some of the nodes! */
 545     if (cu_nb->plist[iloc]->nsci == 0)
 546     {
 547         return;
 548     }
 549
 550     /* calculate the atom data index range based on locality */
 551     if (LOCAL_A(aloc))
 552     {
 553         adat_end = cu_nb->atdat->natoms_local;
 554     }
 555     else
 556     {
 557         adat_end = cu_nb->atdat->natoms;
 558     }
 559
 560     if (cu_nb->bUseStreamSync)
 561     {
 562         stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 563         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 564     }
 565     else
 566     {
 567         /* Busy-wait until we get the signal pattern set in last byte
 568          * of the l/nl float vector. This pattern corresponds to a floating
 569          * point number which can't be the result of the force calculation
 570          * (maximum, 127 exponent and 0 mantissa).
 571          * The polling uses atomic compare-exchange.
 572          */
 573         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 574         while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 575     }
 576
 577     /* timing data accumulation */
 578     if (cu_nb->bDoTime)
 579     {
 580         /* only increase counter once (at local F wait) */
 581         if (LOCAL_I(iloc))
 582         {
 583             timings->nb_c++;
 584             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 585         }
 586
 587         /* kernel timings */
 588         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 589             cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 590
 591         /* X/q H2D and F D2H timings */
 592         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 593                                                  timers->stop_nb_h2d[iloc]);
 594         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 595                                                  timers->stop_nb_d2h[iloc]);
 596
 597         /* only count atdat and pair-list H2D at pair-search step */
 598         if (plist->bDoPrune)
 599         {
 600             /* atdat transfer timing (add only once, at local F wait) */
 601             if (LOCAL_A(aloc))
 602             {
 603                 timings->pl_h2d_c++;
 604                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 605                                                          timers->stop_atdat);
 606             }
 607
 608             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 609                                                      timers->stop_pl_h2d[iloc]);
 610         }
 611     }
 612
 613     /* add up energies and shift forces (only once at local F wait) */
 614     if (LOCAL_I(iloc))
 615     {
 616         if (bCalcEner)
 617         {
 618             *e_lj += *nbst.e_lj;
 619             *e_el += *nbst.e_el;
 620         }
 621
 622         if (bCalcFshift)
 623         {
 624             for (i = 0; i < SHIFTS; i++)
 625             {
 626                 fshift[i][0] += nbst.fshift[i].x;
 627                 fshift[i][1] += nbst.fshift[i].y;
 628                 fshift[i][2] += nbst.fshift[i].z;
 629             }
 630         }
 631     }
 632
 633     /* turn off pruning (doesn't matter if this is pair-search step or not) */
 634     plist->bDoPrune = false;
 635 }
 636
 637 /*! Return the reference to the nbfp texture. */
 638 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 639 {
 640     return tex_nbfp;
 641 }
 642
 643 /*! Return the reference to the coulomb_tab. */
 644 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 645 {
 646     return tex_coulomb_tab;
 647 }
 648
 649 /*! Set up the cache configuration for the non-bonded kernels,
 650  */
 651 void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 652 {
 653     cudaError_t stat;
 654
 655     for (int i = 0; i < eelCuNR; i++)
 656         for (int j = 0; j < nEnergyKernelTypes; j++)
 657             for (int k = 0; k < nPruneKernelTypes; k++)
 658             {
 659                 /* Legacy kernel 16/48 kB Shared/L1 */
 660                 stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 661                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 662
 663                 if (devinfo->prop.major >= 3)
 664                 {
 665                     /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 666                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
 667                 }
 668                 else
 669                 {
 670                     /* On Fermi prefer L1 gives 2% higher performance */
 671                     /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 672                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 673                 }
 674                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 675             }
 676 }