src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2012, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14  *
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  34  */
  35
  36 /* Note that floating-point constants in CUDA code should be suffixed
  37  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  38  * code that is in double precision.
  39  */
  40
  41 #include "../../gmxlib/cuda_tools/vectype_ops.cuh"
  42
  43 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  44 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  45
  46 #define WARP_SIZE_POW2_EXPONENT     (5)
  47 #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  48 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  49 #define FBUF_STRIDE                 (CL_SIZE_SQ)
  50
  51 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
  52 const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
  53
  54 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
  55  *  Original idea: OpenMM
  56  */
  57 static inline __device__
  58 float interpolate_coulomb_force_r(float r, float scale)
  59 {
  60     float   normalized = scale * r;
  61     int     index = (int) normalized;
  62     float   fract2 = normalized - index;
  63     float   fract1 = 1.0f - fract2;
  64
  65     return  fract1 * tex1Dfetch(tex_coulomb_tab, index)
  66             + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
  67 }
  68
  69 /*! Final j-force reduction; this generic implementation works with
  70  *  arbitrary array sizes.
  71  */
  72 static inline __device__
  73 void reduce_force_j_generic(float *f_buf, float3 *fout,
  74                             int tidxi, int tidxj, int aidx)
  75 {
  76     if (tidxi == 0)
  77     {
  78         float3 f = make_float3(0.0f);
  79         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
  80         {
  81             f.x += f_buf[                  j];
  82             f.y += f_buf[    FBUF_STRIDE + j];
  83             f.z += f_buf[2 * FBUF_STRIDE + j];
  84         }
  85
  86         atomicAdd(&fout[aidx], f);
  87     }
  88 }
  89
  90 /*! Final j-force reduction; this implementation only with power of two
  91  *  array sizes and with sm >= 3.0
  92  */
  93 #if __CUDA_ARCH__ >= 300
  94 static inline __device__
  95 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
  96                               int tidxi, int aidx)
  97 {
  98     int i;
  99
 100 #pragma unroll 3
 101     for (i = 0; i < 3; i++)
 102     {
 103         f.x += __shfl_down(f.x, 1<<i);
 104         f.y += __shfl_down(f.y, 1<<i);
 105         f.z += __shfl_down(f.z, 1<<i);
 106     }
 107
 108     /* Write the reduced j-force on one thread for each j */
 109     if (tidxi == 0)
 110     {
 111         atomicAdd(&fout[aidx], f);
 112     }
 113 }
 114 #endif
 115
 116 /*! Final i-force reduction; this generic implementation works with
 117  *  arbitrary array sizes.
 118  */
 119 static inline __device__
 120 void reduce_force_i_generic(float *f_buf, float3 *fout,
 121                             float3 *fshift_buf, bool bCalcFshift,
 122                             int tidxi, int tidxj, int aidx)
 123 {
 124     if (tidxj == 0)
 125     {
 126         float3 f = make_float3(0.0f);
 127         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 128         {
 129             f.x += f_buf[                  j];
 130             f.y += f_buf[    FBUF_STRIDE + j];
 131             f.z += f_buf[2 * FBUF_STRIDE + j];
 132         }
 133
 134         atomicAdd(&fout[aidx], f);
 135
 136         if (bCalcFshift)
 137         {
 138             *fshift_buf += f;
 139         }
 140     }
 141 }
 142
 143 /*! Final i-force reduction; this implementation works only with power of two
 144  *  array sizes.
 145  */
 146 static inline __device__
 147 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 148                          float3 *fshift_buf, bool bCalcFshift,
 149                          int tidxi, int tidxj, int aidx)
 150 {
 151     int     i, j;
 152     float3  f = make_float3(0.0f);
 153
 154     /* Reduce the initial CL_SIZE values for each i atom to half
 155      * every step by using CL_SIZE * i threads.
 156      * Can't just use i as loop variable because than nvcc refuses to unroll.
 157      */
 158     i = CL_SIZE/2;
 159     # pragma unroll 5
 160     for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 161     {
 162         if (tidxj < i)
 163         {
 164
 165             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 166             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 167             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 168         }
 169         i >>= 1;
 170     }
 171
 172     /* i == 1, last reduction step, writing to global mem */
 173     if (tidxj == 0)
 174     {
 175         f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 176         f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 177         f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 178
 179         atomicAdd(&fout[aidx], f);
 180
 181         if (bCalcFshift)
 182         {
 183             *fshift_buf += f;
 184         }
 185     }
 186 }
 187
 188 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 189  *  on whether the size of the array to be reduced is power of two or not.
 190  */
 191 static inline __device__
 192 void reduce_force_i(float *f_buf, float3 *f,
 193                     float3 *fshift_buf, bool bCalcFshift,
 194                     int tidxi, int tidxj, int ai)
 195 {
 196     if ((CL_SIZE & (CL_SIZE - 1)))
 197     {
 198         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 199     }
 200     else
 201     {
 202         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 203     }
 204 }
 205
 206 /*! Final i-force reduction; this implementation works only with power of two
 207  *  array sizes and with sm >= 3.0
 208  */
 209 #if __CUDA_ARCH__ >= 300
 210 static inline __device__
 211 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 212                               float3 *fshift_buf, bool bCalcFshift,
 213                               int tidxj, int aidx)
 214 {
 215     int j;
 216
 217 #pragma unroll 2
 218     for (j = 0; j < 2; j++)
 219     {
 220         fin.x += __shfl_down(fin.x,  CL_SIZE<<j);
 221         fin.y += __shfl_down(fin.y,  CL_SIZE<<j);
 222         fin.z += __shfl_down(fin.z,  CL_SIZE<<j);
 223     }
 224
 225     /* The first thread in the warp writes the reduced force */
 226     if (tidxj == 0 || tidxj == 4)
 227     {
 228         atomicAdd(&fout[aidx], fin);
 229
 230         if (bCalcFshift)
 231         {
 232             fshift_buf->x += fin.x;
 233             fshift_buf->y += fin.y;
 234             fshift_buf->z += fin.z;
 235         }
 236     }
 237 }
 238 #endif
 239
 240 /*! Energy reduction; this implementation works only with power of two
 241  *  array sizes.
 242  */
 243 static inline __device__
 244 void reduce_energy_pow2(volatile float *buf,
 245                         float *e_lj, float *e_el,
 246                         unsigned int tidx)
 247 {
 248     int     i, j;
 249     float   e1, e2;
 250
 251     i = WARP_SIZE/2;
 252
 253     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 254 # pragma unroll 10
 255     for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 256     {
 257         if (tidx < i)
 258         {
 259             buf[              tidx] += buf[              tidx + i];
 260             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 261         }
 262         i >>= 1;
 263     }
 264
 265     /* last reduction step, writing to global mem */
 266     if (tidx == 0)
 267     {
 268         e1 = buf[              tidx] + buf[              tidx + i];
 269         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 270
 271         atomicAdd(e_lj, e1);
 272         atomicAdd(e_el, e2);
 273     }
 274 }
 275
 276 /*! Energy reduction; this implementation works only with power of two
 277  *  array sizes and with sm >= 3.0
 278  */
 279 #if __CUDA_ARCH__ >= 300
 280 static inline __device__
 281 void reduce_energy_warp_shfl(float E_lj, float E_el,
 282                              float *e_lj, float *e_el,
 283                              int tidx)
 284 {
 285     int i, sh;
 286
 287     sh = 1;
 288 #pragma unroll 5
 289     for (i = 0; i < 5; i++)
 290     {
 291         E_lj += __shfl_down(E_lj,sh);
 292         E_el += __shfl_down(E_el,sh);
 293         sh += sh;
 294     }
 295
 296     /* The first thread in the warp writes the reduced energies */
 297     if (tidx == 0 || tidx == WARP_SIZE)
 298     {
 299         atomicAdd(e_lj,E_lj);
 300         atomicAdd(e_el,E_el);
 301     }
 302 }
 303 #endif /* __CUDA_ARCH__ */
 304
 305 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */