src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 /* Note that floating-point constants in CUDA code should be suffixed
  40  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  41  * code that is in double precision.
  42  */
  43
  44 #include "../../gmxlib/cuda_tools/vectype_ops.cuh"
  45
  46 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  47 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  48
  49 #define WARP_SIZE_POW2_EXPONENT     (5)
  50 #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  51 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  52 #define FBUF_STRIDE                 (CL_SIZE_SQ)
  53
  54 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
  55 const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
  56
  57 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
  58  *  Original idea: OpenMM
  59  */
  60 static inline __device__
  61 float interpolate_coulomb_force_r(float r, float scale)
  62 {
  63     float   normalized = scale * r;
  64     int     index = (int) normalized;
  65     float   fract2 = normalized - index;
  66     float   fract1 = 1.0f - fract2;
  67
  68     return  fract1 * tex1Dfetch(tex_coulomb_tab, index)
  69             + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
  70 }
  71
  72 /*! Calculate analytical Ewald correction term. */
  73 static inline __device__
  74 float pmecorrF(float z2)
  75 {
  76     const float FN6 = -1.7357322914161492954e-8f;
  77     const float FN5 = 1.4703624142580877519e-6f;
  78     const float FN4 = -0.000053401640219807709149f;
  79     const float FN3 = 0.0010054721316683106153f;
  80     const float FN2 = -0.019278317264888380590f;
  81     const float FN1 = 0.069670166153766424023f;
  82     const float FN0 = -0.75225204789749321333f;
  83
  84     const float FD4 = 0.0011193462567257629232f;
  85     const float FD3 = 0.014866955030185295499f;
  86     const float FD2 = 0.11583842382862377919f;
  87     const float FD1 = 0.50736591960530292870f;
  88     const float FD0 = 1.0f;
  89
  90     float       z4;
  91     float       polyFN0,polyFN1,polyFD0,polyFD1;
  92
  93     z4          = z2*z2;
  94
  95     polyFD0     = FD4*z4 + FD2;
  96     polyFD1     = FD3*z4 + FD1;
  97     polyFD0     = polyFD0*z4 + FD0;
  98     polyFD0     = polyFD1*z2 + polyFD0;
  99
 100     polyFD0     = 1.0f/polyFD0;
 101
 102     polyFN0     = FN6*z4 + FN4;
 103     polyFN1     = FN5*z4 + FN3;
 104     polyFN0     = polyFN0*z4 + FN2;
 105     polyFN1     = polyFN1*z4 + FN1;
 106     polyFN0     = polyFN0*z4 + FN0;
 107     polyFN0     = polyFN1*z2 + polyFN0;
 108
 109     return polyFN0*polyFD0;
 110 }
 111
 112 /*! Final j-force reduction; this generic implementation works with
 113  *  arbitrary array sizes.
 114  */
 115 static inline __device__
 116 void reduce_force_j_generic(float *f_buf, float3 *fout,
 117                             int tidxi, int tidxj, int aidx)
 118 {
 119     if (tidxi == 0)
 120     {
 121         float3 f = make_float3(0.0f);
 122         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 123         {
 124             f.x += f_buf[                  j];
 125             f.y += f_buf[    FBUF_STRIDE + j];
 126             f.z += f_buf[2 * FBUF_STRIDE + j];
 127         }
 128
 129         atomicAdd(&fout[aidx], f);
 130     }
 131 }
 132
 133 /*! Final j-force reduction; this implementation only with power of two
 134  *  array sizes and with sm >= 3.0
 135  */
 136 #if __CUDA_ARCH__ >= 300
 137 static inline __device__
 138 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 139                               int tidxi, int aidx)
 140 {
 141     int i;
 142
 143 #pragma unroll 3
 144     for (i = 0; i < 3; i++)
 145     {
 146         f.x += __shfl_down(f.x, 1<<i);
 147         f.y += __shfl_down(f.y, 1<<i);
 148         f.z += __shfl_down(f.z, 1<<i);
 149     }
 150
 151     /* Write the reduced j-force on one thread for each j */
 152     if (tidxi == 0)
 153     {
 154         atomicAdd(&fout[aidx], f);
 155     }
 156 }
 157 #endif
 158
 159 /*! Final i-force reduction; this generic implementation works with
 160  *  arbitrary array sizes.
 161  */
 162 static inline __device__
 163 void reduce_force_i_generic(float *f_buf, float3 *fout,
 164                             float3 *fshift_buf, bool bCalcFshift,
 165                             int tidxi, int tidxj, int aidx)
 166 {
 167     if (tidxj == 0)
 168     {
 169         float3 f = make_float3(0.0f);
 170         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 171         {
 172             f.x += f_buf[                  j];
 173             f.y += f_buf[    FBUF_STRIDE + j];
 174             f.z += f_buf[2 * FBUF_STRIDE + j];
 175         }
 176
 177         atomicAdd(&fout[aidx], f);
 178
 179         if (bCalcFshift)
 180         {
 181             *fshift_buf += f;
 182         }
 183     }
 184 }
 185
 186 /*! Final i-force reduction; this implementation works only with power of two
 187  *  array sizes.
 188  */
 189 static inline __device__
 190 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 191                          float3 *fshift_buf, bool bCalcFshift,
 192                          int tidxi, int tidxj, int aidx)
 193 {
 194     int     i, j;
 195     float3  f = make_float3(0.0f);
 196
 197     /* Reduce the initial CL_SIZE values for each i atom to half
 198      * every step by using CL_SIZE * i threads.
 199      * Can't just use i as loop variable because than nvcc refuses to unroll.
 200      */
 201     i = CL_SIZE/2;
 202     # pragma unroll 5
 203     for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 204     {
 205         if (tidxj < i)
 206         {
 207
 208             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 209             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 210             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 211         }
 212         i >>= 1;
 213     }
 214
 215     /* i == 1, last reduction step, writing to global mem */
 216     if (tidxj == 0)
 217     {
 218         f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 219         f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 220         f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 221
 222         atomicAdd(&fout[aidx], f);
 223
 224         if (bCalcFshift)
 225         {
 226             *fshift_buf += f;
 227         }
 228     }
 229 }
 230
 231 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 232  *  on whether the size of the array to be reduced is power of two or not.
 233  */
 234 static inline __device__
 235 void reduce_force_i(float *f_buf, float3 *f,
 236                     float3 *fshift_buf, bool bCalcFshift,
 237                     int tidxi, int tidxj, int ai)
 238 {
 239     if ((CL_SIZE & (CL_SIZE - 1)))
 240     {
 241         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 242     }
 243     else
 244     {
 245         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 246     }
 247 }
 248
 249 /*! Final i-force reduction; this implementation works only with power of two
 250  *  array sizes and with sm >= 3.0
 251  */
 252 #if __CUDA_ARCH__ >= 300
 253 static inline __device__
 254 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 255                               float3 *fshift_buf, bool bCalcFshift,
 256                               int tidxj, int aidx)
 257 {
 258     int j;
 259
 260 #pragma unroll 2
 261     for (j = 0; j < 2; j++)
 262     {
 263         fin.x += __shfl_down(fin.x,  CL_SIZE<<j);
 264         fin.y += __shfl_down(fin.y,  CL_SIZE<<j);
 265         fin.z += __shfl_down(fin.z,  CL_SIZE<<j);
 266     }
 267
 268     /* The first thread in the warp writes the reduced force */
 269     if (tidxj == 0 || tidxj == 4)
 270     {
 271         atomicAdd(&fout[aidx], fin);
 272
 273         if (bCalcFshift)
 274         {
 275             fshift_buf->x += fin.x;
 276             fshift_buf->y += fin.y;
 277             fshift_buf->z += fin.z;
 278         }
 279     }
 280 }
 281 #endif
 282
 283 /*! Energy reduction; this implementation works only with power of two
 284  *  array sizes.
 285  */
 286 static inline __device__
 287 void reduce_energy_pow2(volatile float *buf,
 288                         float *e_lj, float *e_el,
 289                         unsigned int tidx)
 290 {
 291     int     i, j;
 292     float   e1, e2;
 293
 294     i = WARP_SIZE/2;
 295
 296     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 297 # pragma unroll 10
 298     for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 299     {
 300         if (tidx < i)
 301         {
 302             buf[              tidx] += buf[              tidx + i];
 303             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 304         }
 305         i >>= 1;
 306     }
 307
 308     /* last reduction step, writing to global mem */
 309     if (tidx == 0)
 310     {
 311         e1 = buf[              tidx] + buf[              tidx + i];
 312         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 313
 314         atomicAdd(e_lj, e1);
 315         atomicAdd(e_el, e2);
 316     }
 317 }
 318
 319 /*! Energy reduction; this implementation works only with power of two
 320  *  array sizes and with sm >= 3.0
 321  */
 322 #if __CUDA_ARCH__ >= 300
 323 static inline __device__
 324 void reduce_energy_warp_shfl(float E_lj, float E_el,
 325                              float *e_lj, float *e_el,
 326                              int tidx)
 327 {
 328     int i, sh;
 329
 330     sh = 1;
 331 #pragma unroll 5
 332     for (i = 0; i < 5; i++)
 333     {
 334         E_lj += __shfl_down(E_lj,sh);
 335         E_el += __shfl_down(E_el,sh);
 336         sh += sh;
 337     }
 338
 339     /* The first thread in the warp writes the reduced energies */
 340     if (tidx == 0 || tidx == WARP_SIZE)
 341     {
 342         atomicAdd(e_lj,E_lj);
 343         atomicAdd(e_el,E_el);
 344     }
 345 }
 346 #endif /* __CUDA_ARCH__ */
 347
 348 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */