src/mdlib/nbnxn_cuda/nbnxn_cuda.cu

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 #include <stdlib.h>
  40 #include <assert.h>
  41
  42 #if defined(_MSVC)
  43 #include <limits>
  44 #endif
  45
  46 #include <cuda.h>
  47
  48 #include "types/simple.h"
  49 #include "types/nbnxn_pairlist.h"
  50 #include "types/nb_verlet.h"
  51 #include "types/ishift.h"
  52 #include "types/force_flags.h"
  53 #include "../nbnxn_consts.h"
  54
  55 #ifdef TMPI_ATOMICS
  56 #include "thread_mpi/atomic.h"
  57 #endif
  58
  59 #include "nbnxn_cuda_types.h"
  60 #include "../../gmxlib/cuda_tools/cudautils.cuh"
  61 #include "nbnxn_cuda.h"
  62 #include "nbnxn_cuda_data_mgmt.h"
  63
  64 #if defined TEXOBJ_SUPPORTED && __CUDA_ARCH__ >= 300
  65 #define USE_TEXOBJ
  66 #endif
  67
  68 /*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
  69 texture<float, 1, cudaReadModeElementType> nbfp_texref;
  70
  71 /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
  72 texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  73
  74 /* Convenience defines */
  75 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  76 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  77
  78 /***** The kernels come here *****/
  79 #include "nbnxn_cuda_kernel_utils.cuh"
  80
  81 /* Top-level kernel generation: will generate through multiple inclusion the
  82  * following flavors for all kernels:
  83  * - force-only output;
  84  * - force and energy output;
  85  * - force-only with pair list pruning;
  86  * - force and energy output with pair list pruning.
  87  */
  88 /** Force only **/
  89 #include "nbnxn_cuda_kernels.cuh"
  90 /** Force & energy **/
  91 #define CALC_ENERGIES
  92 #include "nbnxn_cuda_kernels.cuh"
  93 #undef CALC_ENERGIES
  94
  95 /*** Pair-list pruning kernels ***/
  96 /** Force only **/
  97 #define PRUNE_NBL
  98 #include "nbnxn_cuda_kernels.cuh"
  99 /** Force & energy **/
 100 #define CALC_ENERGIES
 101 #include "nbnxn_cuda_kernels.cuh"
 102 #undef CALC_ENERGIES
 103 #undef PRUNE_NBL
 104
 105 /*! Nonbonded kernel function pointer type */
 106 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
 107                                      const cu_nbparam_t,
 108                                      const cu_plist_t,
 109                                      bool);
 110
 111 /*********************************/
 112
 113 /* XXX always/never run the energy/pruning kernels -- only for benchmarking purposes */
 114 static bool always_ener  = (getenv("GMX_GPU_ALWAYS_ENER") != NULL);
 115 static bool never_ener   = (getenv("GMX_GPU_NEVER_ENER") != NULL);
 116 static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
 117
 118
 119 /* Bit-pattern used for polling-based GPU synchronization. It is used as a float
 120  * and corresponds to having the exponent set to the maximum (127 -- single
 121  * precision) and the mantissa to 0.
 122  */
 123 static unsigned int poll_wait_pattern = (0x7FU << 23);
 124
 125 /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
 126 static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 127 {
 128     int max_grid_x_size;
 129
 130     assert(dinfo);
 131
 132     max_grid_x_size = dinfo->prop.maxGridSize[0];
 133
 134     /* do we exceed the grid x dimension limit? */
 135     if (nwork_units > max_grid_x_size)
 136     {
 137         gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
 138                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
 139                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
 140     }
 141
 142     return nwork_units;
 143 }
 144
 145
 146 /* Constant arrays listing all kernel function pointers and enabling selection
 147    of a kernel in an elegant manner. */
 148
 149 static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 150 static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 151
 152 /*! Pointers to the default kernels organized in a 3 dim array by:
 153  *  electrostatics type, energy calculation on/off, and pruning on/off.
 154  *
 155  *  Note that the order of electrostatics (1st dimension) has to match the
 156  *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
 157  */
 158 static const nbnxn_cu_kfunc_ptr_t
 159 nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 160 {
 161     { { k_nbnxn_cutoff,                     k_nbnxn_cutoff_prune },
 162       { k_nbnxn_cutoff_ener,                k_nbnxn_cutoff_ener_prune } },
 163     { { k_nbnxn_rf,                         k_nbnxn_rf_prune },
 164       { k_nbnxn_rf_ener,                    k_nbnxn_rf_ener_prune } },
 165     { { k_nbnxn_ewald_tab,                  k_nbnxn_ewald_tab_prune },
 166       { k_nbnxn_ewald_tab_ener,             k_nbnxn_ewald_tab_ener_prune } },
 167     { { k_nbnxn_ewald_tab_twin,             k_nbnxn_ewald_tab_twin_prune },
 168       { k_nbnxn_ewald_tab_twin_ener,        k_nbnxn_ewald_twin_ener_prune } },
 169     { { k_nbnxn_ewald,                      k_nbnxn_ewald_prune },
 170       { k_nbnxn_ewald_ener,                 k_nbnxn_ewald_ener_prune } },
 171     { { k_nbnxn_ewald_twin,                 k_nbnxn_ewald_twin_prune },
 172       { k_nbnxn_ewald_twin_ener,            k_nbnxn_ewald_twin_ener_prune } },
 173 };
 174
 175 /*! Pointers to the legacy kernels organized in a 3 dim array by:
 176  *  electrostatics type, energy calculation on/off, and pruning on/off.
 177  *
 178  *  Note that the order of electrostatics (1st dimension) has to match the
 179  *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
 180  */
 181 static const nbnxn_cu_kfunc_ptr_t
 182 nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 183 {
 184     { { k_nbnxn_cutoff_legacy,              k_nbnxn_cutoff_prune_legacy },
 185       { k_nbnxn_cutoff_ener_legacy,         k_nbnxn_cutoff_ener_prune_legacy } },
 186     { { k_nbnxn_rf_legacy,                  k_nbnxn_rf_prune_legacy },
 187       { k_nbnxn_rf_ener_legacy,             k_nbnxn_rf_ener_prune_legacy } },
 188     { { k_nbnxn_ewald_tab_legacy,           k_nbnxn_ewald_tab_prune_legacy },
 189       { k_nbnxn_ewald_tab_ener_legacy,      k_nbnxn_ewald_tab_ener_prune_legacy } },
 190     { { k_nbnxn_ewald_tab_twin_legacy,      k_nbnxn_ewald_tab_twin_prune_legacy },
 191       { k_nbnxn_ewald_tab_twin_ener_legacy, k_nbnxn_ewald_tab_twin_ener_prune_legacy } },
 192 };
 193
 194 /*! Return a pointer to the kernel version to be executed at the current step. */
 195 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 196                                                        bool bDoEne, bool bDoPrune)
 197 {
 198     assert(kver < eNbnxnCuKNR);
 199     assert(eeltype < eelCuNR);
 200
 201     if (NBNXN_KVER_LEGACY(kver))
 202     {
 203         /* no analytical Ewald with legacy kernels */
 204         assert(eeltype <= eelCuEWALD_TAB_TWIN);
 205
 206         return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 207     }
 208     else
 209     {
 210         return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
 211     }
 212 }
 213
 214 /*! Calculates the amount of shared memory required for kernel version in use. */
 215 static inline int calc_shmem_required(int kver)
 216 {
 217     int shmem;
 218
 219     /* size of shmem (force-buffers/xq/atom type preloading) */
 220     if (NBNXN_KVER_LEGACY(kver))
 221     {
 222         /* i-atom x+q in shared memory */
 223         shmem =  NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 224         /* force reduction buffers in shared memory */
 225         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 226     }
 227     else
 228     {
 229         /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
 230         /* i-atom x+q in shared memory */
 231         shmem  = NCL_PER_SUPERCL * CL_SIZE * sizeof(float4);
 232         /* cj in shared memory, for both warps separately */
 233         shmem += 2 * NBNXN_GPU_JGROUP_SIZE * sizeof(int);
 234 #ifdef IATYPE_SHMEM
 235         /* i-atom types in shared memory */
 236         shmem += NCL_PER_SUPERCL * CL_SIZE * sizeof(int);
 237 #endif
 238 #if __CUDA_ARCH__ < 300
 239         /* force reduction buffers in shared memory */
 240         shmem += CL_SIZE * CL_SIZE * 3 * sizeof(float);
 241 #endif
 242     }
 243
 244     return shmem;
 245 }
 246
 247 /*! As we execute nonbonded workload in separate streams, before launching
 248    the kernel we need to make sure that he following operations have completed:
 249    - atomdata allocation and related H2D transfers (every nstlist step);
 250    - pair list H2D transfer (every nstlist step);
 251    - shift vector H2D transfer (every nstlist step);
 252    - force (+shift force and energy) output clearing (every step).
 253
 254    These operations are issued in the local stream at the beginning of the step
 255    and therefore always complete before the local kernel launch. The non-local
 256    kernel is launched after the local on the same device/context, so this is
 257    inherently scheduled after the operations in the local stream (including the
 258    above "misc_ops").
 259    However, for the sake of having a future-proof implementation, we use the
 260    misc_ops_done event to record the point in time when the above  operations
 261    are finished and synchronize with this event in the non-local stream.
 262 */
 263 void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
 264                               const nbnxn_atomdata_t *nbatom,
 265                               int flags,
 266                               int iloc)
 267 {
 268     cudaError_t stat;
 269     int adat_begin, adat_len;  /* local/nonlocal offset and length used for xq and f */
 270     /* CUDA kernel launch-related stuff */
 271     int  shmem, nblock;
 272     dim3 dim_block, dim_grid;
 273     nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
 274
 275     cu_atomdata_t   *adat   = cu_nb->atdat;
 276     cu_nbparam_t    *nbp    = cu_nb->nbparam;
 277     cu_plist_t      *plist  = cu_nb->plist[iloc];
 278     cu_timers_t     *t      = cu_nb->timers;
 279     cudaStream_t    stream  = cu_nb->stream[iloc];
 280
 281     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 282     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 283     bool bDoTime     = cu_nb->bDoTime;
 284
 285     /* turn energy calculation always on/off (for debugging/testing only) */
 286     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 287
 288     /* Don't launch the non-local kernel if there is no work to do.
 289        Doing the same for the local kernel is more complicated, since the
 290        local part of the force array also depends on the non-local kernel.
 291        So to avoid complicating the code and to reduce the risk of bugs,
 292        we always call the local kernel, the local x+q copy and later (not in
 293        this function) the stream wait, local f copyback and the f buffer
 294        clearing. All these operations, except for the local interaction kernel,
 295        are needed for the non-local interactions. */
 296     if (iloc == eintNonlocal && plist->nsci == 0)
 297     {
 298         return;
 299     }
 300
 301     /* calculate the atom data index range based on locality */
 302     if (LOCAL_I(iloc))
 303     {
 304         adat_begin  = 0;
 305         adat_len    = adat->natoms_local;
 306     }
 307     else
 308     {
 309         adat_begin  = adat->natoms_local;
 310         adat_len    = adat->natoms - adat->natoms_local;
 311     }
 312
 313     /* When we get here all misc operations issues in the local stream are done,
 314        so we record that in the local stream and wait for it in the nonlocal one. */
 315     if (cu_nb->bUseTwoStreams)
 316     {
 317         if (iloc == eintLocal)
 318         {
 319             stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
 320             CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
 321         }
 322         else
 323         {
 324             stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
 325             CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
 326         }
 327     }
 328
 329     /* beginning of timed HtoD section */
 330     if (bDoTime)
 331     {
 332         stat = cudaEventRecord(t->start_nb_h2d[iloc], stream);
 333         CU_RET_ERR(stat, "cudaEventRecord failed");
 334     }
 335
 336     /* HtoD x, q */
 337     cu_copy_H2D_async(adat->xq + adat_begin, nbatom->x + adat_begin * 4,
 338                       adat_len * sizeof(*adat->xq), stream);
 339
 340     if (bDoTime)
 341     {
 342         stat = cudaEventRecord(t->stop_nb_h2d[iloc], stream);
 343         CU_RET_ERR(stat, "cudaEventRecord failed");
 344     }
 345
 346     /* beginning of timed nonbonded calculation section */
 347     if (bDoTime)
 348     {
 349         stat = cudaEventRecord(t->start_nb_k[iloc], stream);
 350         CU_RET_ERR(stat, "cudaEventRecord failed");
 351     }
 352
 353     /* get the pointer to the kernel flavor we need to use */
 354     nb_kernel = select_nbnxn_kernel(cu_nb->kernel_ver, nbp->eeltype, bCalcEner,
 355                                     plist->bDoPrune || always_prune);
 356
 357     /* kernel launch config */
 358     nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
 359     dim_block = dim3(CL_SIZE, CL_SIZE, 1);
 360     dim_grid  = dim3(nblock, 1, 1);
 361     shmem     = calc_shmem_required(cu_nb->kernel_ver);
 362
 363     if (debug)
 364     {
 365         fprintf(debug, "GPU launch configuration:\n\tThread block: %dx%dx%d\n\t"
 366                 "Grid: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
 367                 dim_block.x, dim_block.y, dim_block.z,
 368                 dim_grid.x, dim_grid.y, plist->nsci*NCL_PER_SUPERCL,
 369                 NCL_PER_SUPERCL, plist->na_c);
 370     }
 371
 372     nb_kernel<<<dim_grid, dim_block, shmem, stream>>>(*adat, *nbp, *plist, bCalcFshift);
 373     CU_LAUNCH_ERR("k_calc_nb");
 374
 375     if (bDoTime)
 376     {
 377         stat = cudaEventRecord(t->stop_nb_k[iloc], stream);
 378         CU_RET_ERR(stat, "cudaEventRecord failed");
 379     }
 380 }
 381
 382 void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t cu_nb,
 383                                const nbnxn_atomdata_t *nbatom,
 384                                int flags,
 385                                int aloc)
 386 {
 387     cudaError_t stat;
 388     int adat_begin, adat_len, adat_end;  /* local/nonlocal offset and length used for xq and f */
 389     int iloc = -1;
 390
 391     /* determine interaction locality from atom locality */
 392     if (LOCAL_A(aloc))
 393     {
 394         iloc = eintLocal;
 395     }
 396     else if (NONLOCAL_A(aloc))
 397     {
 398         iloc = eintNonlocal;
 399     }
 400     else
 401     {
 402         char stmp[STRLEN];
 403         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 404                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 405         gmx_incons(stmp);
 406     }
 407
 408     cu_atomdata_t   *adat   = cu_nb->atdat;
 409     cu_timers_t     *t      = cu_nb->timers;
 410     bool            bDoTime = cu_nb->bDoTime;
 411     cudaStream_t    stream  = cu_nb->stream[iloc];
 412
 413     bool bCalcEner   = flags & GMX_FORCE_VIRIAL;
 414     bool bCalcFshift = flags & GMX_FORCE_VIRIAL;
 415
 416     /* don't launch non-local copy-back if there was no non-local work to do */
 417     if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
 418     {
 419         return;
 420     }
 421
 422     /* calculate the atom data index range based on locality */
 423     if (LOCAL_A(aloc))
 424     {
 425         adat_begin  = 0;
 426         adat_len    = adat->natoms_local;
 427         adat_end    = cu_nb->atdat->natoms_local;
 428     }
 429     else
 430     {
 431         adat_begin  = adat->natoms_local;
 432         adat_len    = adat->natoms - adat->natoms_local;
 433         adat_end    = cu_nb->atdat->natoms;
 434     }
 435
 436     /* beginning of timed D2H section */
 437     if (bDoTime)
 438     {
 439         stat = cudaEventRecord(t->start_nb_d2h[iloc], stream);
 440         CU_RET_ERR(stat, "cudaEventRecord failed");
 441     }
 442
 443     if (!cu_nb->bUseStreamSync)
 444     {
 445         /* For safety reasons set a few (5%) forces to NaN. This way even if the
 446            polling "hack" fails with some future NVIDIA driver we'll get a crash. */
 447         for (int i = adat_begin; i < 3*adat_end + 2; i += adat_len/20)
 448         {
 449 #ifdef NAN
 450             nbatom->out[0].f[i] = NAN;
 451 #else
 452 #  ifdef _MSVC
 453             if (numeric_limits<float>::has_quiet_NaN)
 454             {
 455                 nbatom->out[0].f[i] = numeric_limits<float>::quiet_NaN();
 456             }
 457             else
 458 #  endif
 459             {
 460                 nbatom->out[0].f[i] = GMX_REAL_MAX;
 461             }
 462 #endif
 463         }
 464
 465         /* Set the last four bytes of the force array to a bit pattern
 466            which can't be the result of the force calculation:
 467            max exponent (127) and zero mantissa. */
 468         *(unsigned int*)&nbatom->out[0].f[adat_end*3 - 1] = poll_wait_pattern;
 469     }
 470
 471     /* With DD the local D2H transfer can only start after the non-local
 472        kernel has finished. */
 473     if (iloc == eintLocal && cu_nb->bUseTwoStreams)
 474     {
 475         stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
 476         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
 477     }
 478
 479     /* DtoH f */
 480     cu_copy_D2H_async(nbatom->out[0].f + adat_begin * 3, adat->f + adat_begin,
 481                       (adat_len)*sizeof(*adat->f), stream);
 482
 483     /* After the non-local D2H is launched the nonlocal_done event can be
 484        recorded which signals that the local D2H can proceed. This event is not
 485        placed after the non-local kernel because we want the non-local data
 486        back first. */
 487     if (iloc == eintNonlocal)
 488     {
 489         stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
 490         CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
 491     }
 492
 493     /* only transfer energies in the local stream */
 494     if (LOCAL_I(iloc))
 495     {
 496         /* DtoH fshift */
 497         if (bCalcFshift)
 498         {
 499             cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
 500                               SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
 501         }
 502
 503         /* DtoH energies */
 504         if (bCalcEner)
 505         {
 506             cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
 507                               sizeof(*cu_nb->nbst.e_lj), stream);
 508             cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
 509                               sizeof(*cu_nb->nbst.e_el), stream);
 510         }
 511     }
 512
 513     if (bDoTime)
 514     {
 515         stat = cudaEventRecord(t->stop_nb_d2h[iloc], stream);
 516         CU_RET_ERR(stat, "cudaEventRecord failed");
 517     }
 518 }
 519
 520 /* Atomic compare-exchange operation on unsigned values. It is used in
 521  * polling wait for the GPU.
 522  */
 523 static inline bool atomic_cas(volatile unsigned int *ptr,
 524                               unsigned int oldval,
 525                               unsigned int newval)
 526 {
 527     assert(ptr);
 528
 529 #ifdef TMPI_ATOMICS
 530     return tMPI_Atomic_cas((tMPI_Atomic_t *)ptr, oldval, newval);
 531 #else
 532     gmx_incons("Atomic operations not available, atomic_cas() should not have been called!");
 533     return true;
 534 #endif
 535 }
 536
 537 void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
 538                          const nbnxn_atomdata_t *nbatom,
 539                          int flags, int aloc,
 540                          real *e_lj, real *e_el, rvec *fshift)
 541 {
 542     /* NOTE:  only implemented for single-precision at this time */
 543     cudaError_t stat;
 544     int i, adat_end, iloc = -1;
 545     volatile unsigned int *poll_word;
 546
 547     /* determine interaction locality from atom locality */
 548     if (LOCAL_A(aloc))
 549     {
 550         iloc = eintLocal;
 551     }
 552     else if (NONLOCAL_A(aloc))
 553     {
 554         iloc = eintNonlocal;
 555     }
 556     else
 557     {
 558         char stmp[STRLEN];
 559         sprintf(stmp, "Invalid atom locality passed (%d); valid here is only "
 560                 "local (%d) or nonlocal (%d)", aloc, eatLocal, eatNonlocal);
 561         gmx_incons(stmp);
 562     }
 563
 564     cu_plist_t      *plist   = cu_nb->plist[iloc];
 565     cu_timers_t     *timers  = cu_nb->timers;
 566     wallclock_gpu_t *timings = cu_nb->timings;
 567     nb_staging      nbst     = cu_nb->nbst;
 568
 569     bool    bCalcEner   = flags & GMX_FORCE_VIRIAL;
 570     bool    bCalcFshift = flags & GMX_FORCE_VIRIAL;
 571
 572     /* turn energy calculation always on/off (for debugging/testing only) */
 573     bCalcEner = (bCalcEner || always_ener) && !never_ener;
 574
 575     /* Launch wait/update timers & counters, unless doing the non-local phase
 576        when there is not actually work to do. This is consistent with
 577        nbnxn_cuda_launch_kernel.
 578
 579        NOTE: if timing with multiple GPUs (streams) becomes possible, the
 580        counters could end up being inconsistent due to not being incremented
 581        on some of the nodes! */
 582     if (iloc == eintNonlocal && cu_nb->plist[iloc]->nsci == 0)
 583     {
 584         return;
 585     }
 586
 587     /* calculate the atom data index range based on locality */
 588     if (LOCAL_A(aloc))
 589     {
 590         adat_end = cu_nb->atdat->natoms_local;
 591     }
 592     else
 593     {
 594         adat_end = cu_nb->atdat->natoms;
 595     }
 596
 597     if (cu_nb->bUseStreamSync)
 598     {
 599         stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
 600         CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
 601     }
 602     else
 603     {
 604         /* Busy-wait until we get the signal pattern set in last byte
 605          * of the l/nl float vector. This pattern corresponds to a floating
 606          * point number which can't be the result of the force calculation
 607          * (maximum, 127 exponent and 0 mantissa).
 608          * The polling uses atomic compare-exchange.
 609          */
 610         poll_word = (volatile unsigned int*)&nbatom->out[0].f[adat_end*3 - 1];
 611         while (atomic_cas(poll_word, poll_wait_pattern, poll_wait_pattern)) {}
 612     }
 613
 614     /* timing data accumulation */
 615     if (cu_nb->bDoTime)
 616     {
 617         /* only increase counter once (at local F wait) */
 618         if (LOCAL_I(iloc))
 619         {
 620             timings->nb_c++;
 621             timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].c += 1;
 622         }
 623
 624         /* kernel timings */
 625         timings->ktime[plist->bDoPrune ? 1 : 0][bCalcEner ? 1 : 0].t +=
 626             cu_event_elapsed(timers->start_nb_k[iloc], timers->stop_nb_k[iloc]);
 627
 628         /* X/q H2D and F D2H timings */
 629         timings->nb_h2d_t += cu_event_elapsed(timers->start_nb_h2d[iloc],
 630                                                  timers->stop_nb_h2d[iloc]);
 631         timings->nb_d2h_t += cu_event_elapsed(timers->start_nb_d2h[iloc],
 632                                                  timers->stop_nb_d2h[iloc]);
 633
 634         /* only count atdat and pair-list H2D at pair-search step */
 635         if (plist->bDoPrune)
 636         {
 637             /* atdat transfer timing (add only once, at local F wait) */
 638             if (LOCAL_A(aloc))
 639             {
 640                 timings->pl_h2d_c++;
 641                 timings->pl_h2d_t += cu_event_elapsed(timers->start_atdat,
 642                                                          timers->stop_atdat);
 643             }
 644
 645             timings->pl_h2d_t += cu_event_elapsed(timers->start_pl_h2d[iloc],
 646                                                      timers->stop_pl_h2d[iloc]);
 647         }
 648     }
 649
 650     /* add up energies and shift forces (only once at local F wait) */
 651     if (LOCAL_I(iloc))
 652     {
 653         if (bCalcEner)
 654         {
 655             *e_lj += *nbst.e_lj;
 656             *e_el += *nbst.e_el;
 657         }
 658
 659         if (bCalcFshift)
 660         {
 661             for (i = 0; i < SHIFTS; i++)
 662             {
 663                 fshift[i][0] += nbst.fshift[i].x;
 664                 fshift[i][1] += nbst.fshift[i].y;
 665                 fshift[i][2] += nbst.fshift[i].z;
 666             }
 667         }
 668     }
 669
 670     /* turn off pruning (doesn't matter if this is pair-search step or not) */
 671     plist->bDoPrune = false;
 672 }
 673
 674 /*! Return the reference to the nbfp texture. */
 675 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_nbfp_texref()
 676 {
 677     return nbfp_texref;
 678 }
 679
 680 /*! Return the reference to the coulomb_tab. */
 681 const struct texture<float, 1, cudaReadModeElementType>& nbnxn_cuda_get_coulomb_tab_texref()
 682 {
 683     return coulomb_tab_texref;
 684 }
 685
 686 /*! Set up the cache configuration for the non-bonded kernels,
 687  */
 688 void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 689 {
 690     cudaError_t stat;
 691
 692     for (int i = 0; i < eelCuNR; i++)
 693     {
 694         for (int j = 0; j < nEnergyKernelTypes; j++)
 695         {
 696             for (int k = 0; k < nPruneKernelTypes; k++)
 697             {
 698                 /* Legacy kernel 16/48 kB Shared/L1
 699                  * No analytical Ewald!
 700                  */
 701                 if (i != eelCuEWALD_ANA && i != eelCuEWALD_ANA_TWIN)
 702                 {
 703                     stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 704                     CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 705                 }
 706
 707                 if (devinfo->prop.major >= 3)
 708                 {
 709                     /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
 710                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
 711                 }
 712                 else
 713                 {
 714                     /* On Fermi prefer L1 gives 2% higher performance */
 715                     /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
 716                     stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
 717                 }
 718                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
 719             }
 720         }
 721     }
 722 }