src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2012, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14  *
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  34  */
  35
  36 #include <stdlib.h>
  37 #include <assert.h>
  38
  39 #if defined(_MSVC)
  40 #include <limits>
  41 #endif
  42
  43 #include <cuda.h>
  44
  45 #include "types/simple.h"
  46 #include "types/nbnxn_pairlist.h"
  47 #include "types/nb_verlet.h"
  48 #include "types/ishift.h"
  49 #include "types/force_flags.h"
  50 #include "../nbnxn_consts.h"
  51
  52 #ifdef TMPI_ATOMICS
  53 #include "thread_mpi/atomic.h"
  54 #endif
  55
  56 #include "nbnxn_cuda_types.h"
  57 #include "../../gmxlib/cuda_tools/cudautils.cuh"
  58 #include "nbnxn_cuda.h"
  59 #include "nbnxn_cuda_data_mgmt.h"
  60
  61
  62 /*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
  63 texture<float, 1, cudaReadModeElementType> tex_nbfp;
  64
  65 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  66 texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
  67
  68 /* Convenience defines */
  69 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  70 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  71
  72 /***** The kernels come here *****/
  73 #include "nbnxn_cuda_kernel_utils.cuh"
  74
  75 /* Generate all combinations of kernels through multiple inclusion:
  76    F, F + E, F + prune, F + E + prune. */
  77 /** Force only **/
  78 #include "nbnxn_cuda_kernels.cuh"
  79 /** Force & energy **/
  80 #define CALC_ENERGIES
  81 #include "nbnxn_cuda_kernels.cuh"
  82 #undef CALC_ENERGIES
  83
  84 /*** Pair-list pruning kernels ***/
  85 /** Force only **/
  86 #define PRUNE_NBL
  87 #include "nbnxn_cuda_kernels.cuh"
  88 /** Force & energy **/
  89 #define CALC_ENERGIES
  90 #include "nbnxn_cuda_kernels.cuh"
  91 #undef CALC_ENERGIES
  92 #undef PRUNE_NBL
  93
  94 /*! Nonbonded kernel function pointer type */
  95 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
  96                                      const cu_nbparam_t,
  97                                      const cu_plist_t,
  98                                      bool);
  99
 100 /*********************************/
 101
 102 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 103 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 104 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 105 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 106
 107
 108 /* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 109  * and corresponds to having the exponent set to the maximum (127 -- single
 110  * precision) and the mantissa to 0.
 111  */
 112 static unsigned int poll_wait_pattern = (0x7FU << 23);
 113
 114 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 115 static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 116 {
 117     int max_grid_x_size;
 118
 119     assert(dinfo);
 120
 121     max_grid_x_size = dinfo->prop.maxGridSize[0];
 122
 123     /* do we exceed the grid x dimension limit? */
 124     if (nwork_units > max_grid_x_size)
 125     {
 126         gmx_fatal(FARGS, "Watch out system too large to simulate!\n"
 127                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 128                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 129     }
 130
 131     return nwork_units;
 132 }
 133
 134
 135 /* Constant arrays listing all kernel function pointers and enabling selection
 136    of a kernel in an elegant manner. */
 137
 138 static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 139 static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 140
 141 /* Default kernels */
 142 static const nbnxn_cu_kfunc_ptr_t
 143 nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 144 {
 145     { { k_nbnxn_ewald,              k_nbnxn_ewald_prune },
 146       { k_nbnxn_ewald_ener,         k_nbnxn_ewald_ener_prune } },
 147     { { k_nbnxn_ewald_twin,         k_nbnxn_ewald_twin_prune },
 148       { k_nbnxn_ewald_twin_ener,    k_nbnxn_ewald_twin_ener_prune } },
 149     { { k_nbnxn_rf,                 k_nbnxn_rf_prune },
 150       { k_nbnxn_rf_ener,            k_nbnxn_rf_ener_prune } },
 151     { { k_nbnxn_cutoff,             k_nbnxn_cutoff_prune },
 152       { k_nbnxn_cutoff_ener,        k_nbnxn_cutoff_ener_prune } },
 153 };
 154
 155 /* Legacy kernels */
 156 static const nbnxn_cu_kfunc_ptr_t
 157 nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 158 {
 159     { { k_nbnxn_ewald_legacy,           k_nbnxn_ewald_prune_legacy },
 160       { k_nbnxn_ewald_ener_legacy,      k_nbnxn_ewald_ener_prune_legacy } },
 161     { { k_nbnxn_ewald_twin_legacy,      k_nbnxn_ewald_twin_prune_legacy },
 162       { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
 163     { { k_nbnxn_rf_legacy,              k_nbnxn_rf_prune_legacy },
 164       { k_nbnxn_rf_ener_legacy,         k_nbnxn_rf_ener_prune_legacy } },
 165     { { k_nbnxn_cutoff_legacy,          k_nbnxn_cutoff_prune_legacy },
 166       { k_nbnxn_cutoff_ener_legacy,     k_nbnxn_cutoff_ener_prune_legacy } },
 167 };
 168
 169 /*! Return a pointer to the kernel version to be executed at the current step. */
 170 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 171                                                        bool bDoEne, bool bDoPrune)
 172 {
 173     assert(kver < eNbnxnCuKNR);
 174     assert(eeltype < eelCuNR);
 175
 176     if (NBNXN_KVER_LEGACY(kver))
 177     {
 178         return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 179     }
 180     else
 181     {
 182         return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 183     }
 184 }
 185
 186 /*! Calculates the amount of shared memory required for kernel version in use. */
 187 static inline int calc_shmem_required(int kver)
 188 {
 189     int shmem;
 190
 191     /* size of shmem (force-buffers/xq/atom type preloading) */
 192     if (NBNXN_KVER_LEGACY(kver))
 193     {
 194         /* i-atom x+q in shared memory */
 195         shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 196         /* force reduction buffers in shared memory */
 197         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 198     }
 199     else
 200     {
 201         /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 202         /* i-atom x+q in shared memory */
 203         shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 204         /* cj in shared memory, for both warps separately */
 205         shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 206 #ifdef IATYPE_SHMEM
 207         /* i-atom types in shared memory */
 208         shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 209 #endif
 210 #if __CUDA_ARCH__ < 300
 211         /* force reduction buffers in shared memory */
 212         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 213 #endif
 214     }
 215
 216     return shmem;
 217 }
 218
 219 /*! As we execute nonbonded workload in separate streams, before launching
 220    the kernel we need to make sure that he following operations have completed:
 221    - atomdata allocation and related H2D transfers (every nstlist step);
 222    - pair list H2D transfer (every nstlist step);
 223    - shift vector H2D transfer (every nstlist step);
 224    - force (+shift force and energy) output clearing (every step).
 225
 226    These operations are issued in the local stream at the beginning of the step
 227    and therefore always complete before the local kernel launch. The non-local
 228    kernel is launched after the local on the same device/context, so this is
 229    inherently scheduled after the operations in the local stream (including the
 230    above "misc_ops").
 231    However, for the sake of having a future-proof implementation, we use the
 232    misc_ops_done event to record the point in time when the above  operations
 233    are finished and synchronize with this event in the non-local stream.
 234 */
 235 void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 236                               const nbnxn_atomdata_t *nbatom,
 237                               int flags,
 238                               int iloc)
 239 {
 240     cudaError_t stat;
 241     int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
 242     /* CUDA kernel launch-related stuff */
 243     int  shmem, nblock;
 244     dim3 dim_block, dim_grid;
 245     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 246
 247     cu_atomdata_t   *adat   = cu_nb->atdat;
 248     cu_nbparam_t    *nbp    = cu_nb->nbparam;
 249     cu_plist_t      *plist  = cu_nb->plist[iloc];
 250     cu_timers_t     *t      = cu_nb->timers;
 251     cudaStream_t    stream  = cu_nb->stream[iloc];
 252
 253     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 254     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 255     bool bDoTime     = cu_nb->bDoTime;
 256
 257     /* turn energy calculation always on/off (for debugging/testing only) */
 258     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 259
 260     /* don't launch the kernel if there is no work to do */
 261     if (plist->nsci == 0)
 262     {
 263         return;
 264     }
 265
 266     /* calculate the atom data index range based on locality */
 267     if (LOCAL_I(iloc))
 268     {
 269         adat_begin  = 0;
 270         adat_len    = adat->natoms_local;
 271     }
 272     else
 273     {
 274         adat_begin  = adat->natoms_local;
 275         adat_len    = adat->natoms - adat->natoms_local;
 276     }
 277
 278     /* When we get here all misc operations issues in the local stream are done,
 279        so we record that in the local stream and wait for it in the nonlocal one. */
 280     if (cu_nb->bUseTwoStreams)
 281     {
 282         if (iloc == eintLocal)
 283         {
 284             stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 285             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 286         }
 287         else
 288         {
 289             stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 290             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 291         }
 292     }
 293
 294     /* beginning of timed HtoD section */
 295     if (bDoTime)
 296     {
 297         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 298         CU_RET_ERR(stat, "cudaEventRecord failed");
 299     }
 300
 301     /* HtoD x, q */
 302     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 303                       adat_len * sizeof(*adat->xq), stream);
 304
 305     if (bDoTime)
 306     {
 307         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 308         CU_RET_ERR(stat, "cudaEventRecord failed");
 309     }
 310
 311     /* beginning of timed nonbonded calculation section */
 312     if (bDoTime)
 313     {
 314         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 315         CU_RET_ERR(stat, "cudaEventRecord failed");
 316     }
 317
 318     /* get the pointer to the kernel flavor we need to use */
 319     nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
 320                                     plist->bDoPrune || always_prune);
 321
 322     /* kernel launch config */
 323     nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 324     dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 325     dim_grid  = dim3(nblock, 1, 1);
 326     shmem     = calc_shmem_required(cu_nb->kernel_ver);
 327
 328     if (debug)
 329     {
 330         fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 331                 "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 332                 dim_block.x, dim_block.y, dim_block.z,
 333                 dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 334                 NCL_PER_SUPERCL, plist->na_c);
 335     }
 336
 337     nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 338     CU_LAUNCH_ERR("k_calc_nb");
 339
 340     if (bDoTime)
 341     {
 342         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 343         CU_RET_ERR(stat, "cudaEventRecord failed");
 344     }
 345 }
 346
 347 void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 348                                const nbnxn_atomdata_t *nbatom,
 349                                int flags,
 350                                int aloc)
 351 {
 352     cudaError_t stat;
 353     int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
 354     int iloc = -1;
 355
 356     /* determine interaction locality from atom locality */
 357     if (LOCAL_A(aloc))
 358     {
 359         iloc = eintLocal;
 360     }
 361     else if (NONLOCAL_A(aloc))
 362     {
 363         iloc = eintNonlocal;
 364     }
 365     else
 366     {
 367         char stmp[STRLEN];
 368         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 369                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 370         gmx_incons(stmp);
 371     }
 372
 373     cu_atomdata_t   *adat   = cu_nb->atdat;
 374     cu_timers_t     *t      = cu_nb->timers;
 375     bool            bDoTime = cu_nb->bDoTime;
 376     cudaStream_t    stream  = cu_nb->stream[iloc];
 377
 378     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 379     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 380
 381     /* don't launch copy-back if there was no work to do */
 382     if (cu_nb->plist[iloc]->nsci == 0)
 383     {
 384         return;
 385     }
 386
 387     /* calculate the atom data index range based on locality */
 388     if (LOCAL_A(aloc))
 389     {
 390         adat_begin  = 0;
 391         adat_len    = adat->natoms_local;
 392         adat_end    = cu_nb->atdat->natoms_local;
 393     }
 394     else
 395     {
 396         adat_begin  = adat->natoms_local;
 397         adat_len    = adat->natoms - adat->natoms_local;
 398         adat_end    = cu_nb->atdat->natoms;
 399     }
 400
 401     /* beginning of timed D2H section */
 402     if (bDoTime)
 403     {
 404         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 405         CU_RET_ERR(stat, "cudaEventRecord failed");
 406     }
 407
 408     if (!cu_nb->bUseStreamSync)
 409     {
 410         /* For safety reasons set a few (5%) forces to NaN. This way even if the
 411            polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 412         for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 413         {
 414 #ifdef NAN
 415             nbatom->out[0].f[i] = NAN;
 416 #else
 417 #  ifdef _MSVC
 418             if (numeric_limits<float>::has_quiet_NaN)
 419             {
 420                 nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 421             }
 422             else
 423 #  endif
 424             {
 425                 nbatom->out[0].f[i] = GMX_REAL_MAX;
 426             }
 427 #endif
 428         }
 429
 430         /* Set the last four bytes of the force array to a bit pattern
 431            which can't be the result of the force calculation:
 432            max exponent (127) and zero mantissa. */
 433         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 434     }
 435
 436     /* With DD the local D2H transfer can only start after the non-local
 437        has been launched. */
 438     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 439     {
 440         stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 441         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 442     }
 443
 444     /* DtoH f */
 445     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 446                       (adat_len)*sizeof(*adat->f), stream);
 447
 448     /* After the non-local D2H is launched the nonlocal_done event can be
 449        recorded which signals that the local D2H can proceed. This event is not
 450        placed after the non-local kernel because we first need the non-local
 451        data back first. */
 452     if (iloc == eintNonlocal)
 453     {
 454         stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 455         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 456     }
 457
 458     /* only transfer energies in the local stream */
 459     if (LOCAL_I(iloc))
 460     {
 461         /* DtoH fshift */
 462         if (bCalcFshift)
 463         {
 464             cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 465                               SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 466         }
 467
 468         /* DtoH energies */
 469         if (bCalcEner)
 470         {
 471             cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 472                               sizeof(*cu_nb->nbst.e_lj), stream);
 473             cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 474                               sizeof(*cu_nb->nbst.e_el), stream);
 475         }
 476     }
 477
 478     if (bDoTime)
 479     {
 480         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 481         CU_RET_ERR(stat, "cudaEventRecord failed");
 482     }
 483 }
 484
 485 /* Atomic compare-exchange operation on unsigned values. It is used in
 486  * polling wait for the GPU.
 487  */
 488 static inline bool atomic_cas(volatile unsigned int *ptr,
 489                               unsigned int oldval,
 490                               unsigned int newval)
 491 {
 492     assert(ptr);
 493
 494 #ifdef TMPI_ATOMICS
 495     return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 496 #else
 497     gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 498     return true;
 499 #endif
 500 }
 501
 502 void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 503                          const nbnxn_atomdata_t *nbatom,
 504                          int flags, int aloc,
 505                          float *e_lj, float *e_el, rvec *fshift)
 506 {
 507     cudaError_t stat;
 508     int i, adat_end, iloc = -1;
 509     volatile unsigned int *poll_word;
 510
 511     /* determine interaction locality from atom locality */
 512     if (LOCAL_A(aloc))
 513     {
 514         iloc = eintLocal;
 515     }
 516     else if (NONLOCAL_A(aloc))
 517     {
 518         iloc = eintNonlocal;
 519     }
 520     else
 521     {
 522         char stmp[STRLEN];
 523         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 524                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 525         gmx_incons(stmp);
 526     }
 527
 528     cu_plist_t      *plist   = cu_nb->plist[iloc];
 529     cu_timers_t     *timers  = cu_nb->timers;
 530     wallclock_gpu_t *timings = cu_nb->timings;
 531     nb_staging      nbst     = cu_nb->nbst;
 532
 533     bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
 534     bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 535
 536     /* turn energy calculation always on/off (for debugging/testing only) */
 537     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 538
 539     /* don't launch wait/update timers & counters if there was no work to do
 540
 541        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 542        counters could end up being inconsistent due to not being incremented
 543        on some of the nodes! */
 544     if (cu_nb->plist[iloc]->nsci == 0)
 545     {
 546         return;
 547     }
 548
 549     /* calculate the atom data index range based on locality */
 550     if (LOCAL_A(aloc))
 551     {
 552         adat_end = cu_nb->atdat->natoms_local;
 553     }
 554     else
 555     {
 556         adat_end = cu_nb->atdat->natoms;
 557     }
 558
 559     if (cu_nb->bUseStreamSync)
 560     {
 561         stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 562         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 563     }
 564     else
 565     {
 566         /* Busy-wait until we get the signal pattern set in last byte
 567          * of the l/nl float vector. This pattern corresponds to a floating
 568          * point number which can't be the result of the force calculation
 569          * (maximum, 127 exponent and 0 mantissa).
 570          * The polling uses atomic compare-exchange.
 571          */
 572         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 573         while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 574     }
 575
 576     /* timing data accumulation */
 577     if (cu_nb->bDoTime)
 578     {
 579         /* only increase counter once (at local F wait) */
 580         if (LOCAL_I(iloc))
 581         {
 582             timings->nb_c++;
 583             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 584         }
 585
 586         /* kernel timings */
 587         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 588             cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 589
 590         /* X/q H2D and F D2H timings */
 591         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 592                                                  timers->stop_nb_h2d[iloc]);
 593         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 594                                                  timers->stop_nb_d2h[iloc]);
 595
 596         /* only count atdat and pair-list H2D at pair-search step */
 597         if (plist->bDoPrune)
 598         {
 599             /* atdat transfer timing (add only once, at local F wait) */
 600             if (LOCAL_A(aloc))
 601             {
 602                 timings->pl_h2d_c++;
 603                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 604                                                          timers->stop_atdat);
 605             }
 606
 607             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 608                                                      timers->stop_pl_h2d[iloc]);
 609         }
 610     }
 611
 612     /* add up energies and shift forces (only once at local F wait) */
 613     if (LOCAL_I(iloc))
 614     {
 615         if (bCalcEner)
 616         {
 617             *e_lj += *nbst.e_lj;
 618             *e_el += *nbst.e_el;
 619         }
 620
 621         if (bCalcFshift)
 622         {
 623             for (i = 0; i < SHIFTS; i++)
 624             {
 625                 fshift[i][0] += nbst.fshift[i].x;
 626                 fshift[i][1] += nbst.fshift[i].y;
 627                 fshift[i][2] += nbst.fshift[i].z;
 628             }
 629         }
 630     }
 631
 632     /* turn off pruning (doesn't matter if this is pair-search step or not) */
 633     plist->bDoPrune = false;
 634 }
 635
 636 /*! Return the reference to the nbfp texture. */
 637 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 638 {
 639     return tex_nbfp;
 640 }
 641
 642 /*! Return the reference to the coulomb_tab. */
 643 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 644 {
 645     return tex_coulomb_tab;
 646 }
 647
 648 /*! Set up the cache configuration for the non-bonded kernels,
 649  */
 650 void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 651 {
 652     cudaError_t stat;
 653
 654     for (int i = 0; i < eelCuNR; i++)
 655         for (int j = 0; j < nEnergyKernelTypes; j++)
 656             for (int k = 0; k < nPruneKernelTypes; k++)
 657             {
 658                 /* Legacy kernel 16/48 kB Shared/L1 */
 659                 stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 660                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 661
 662                 if (devinfo->prop.major >= 3)
 663                 {
 664                     /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 665                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
 666                 }
 667                 else
 668                 {
 669                     /* On Fermi prefer L1 gives 2% higher performance */
 670                     /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 671                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 672                 }
 673                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 674             }
 675 }