src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /* Note that floating-point constants in CUDA code should be suffixed
  37  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  38  * code that is in double precision.
  39  */
  40
  41 #include "../../gmxlib/cuda_tools/vectype_ops.cuh"
  42
  43 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  44 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  45
  46 #define WARP_SIZE_POW2_EXPONENT     (5)
  47 #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  48 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  49 #define FBUF_STRIDE                 (CL_SIZE_SQ)
  50
  51 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
  52 const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
  53
  54 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
  55  *  Original idea: from the OpenMM project
  56  */
  57 static inline __device__
  58 float interpolate_coulomb_force_r(float r, float scale)
  59 {
  60     float   normalized = scale * r;
  61     int     index      = (int) normalized;
  62     float   fract2     = normalized - index;
  63     float   fract1     = 1.0f - fract2;
  64
  65     return fract1 * tex1Dfetch(coulomb_tab_texref, index)
  66            + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
  67 }
  68
  69 #ifdef TEXOBJ_SUPPORTED
  70 static inline __device__
  71 float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
  72                                   float r, float scale)
  73 {
  74     float   normalized = scale * r;
  75     int     index      = (int) normalized;
  76     float   fract2     = normalized - index;
  77     float   fract1     = 1.0f - fract2;
  78
  79     return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
  80            fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
  81 }
  82 #endif
  83
  84
  85 /*! Calculate analytical Ewald correction term. */
  86 static inline __device__
  87 float pmecorrF(float z2)
  88 {
  89     const float FN6 = -1.7357322914161492954e-8f;
  90     const float FN5 = 1.4703624142580877519e-6f;
  91     const float FN4 = -0.000053401640219807709149f;
  92     const float FN3 = 0.0010054721316683106153f;
  93     const float FN2 = -0.019278317264888380590f;
  94     const float FN1 = 0.069670166153766424023f;
  95     const float FN0 = -0.75225204789749321333f;
  96
  97     const float FD4 = 0.0011193462567257629232f;
  98     const float FD3 = 0.014866955030185295499f;
  99     const float FD2 = 0.11583842382862377919f;
 100     const float FD1 = 0.50736591960530292870f;
 101     const float FD0 = 1.0f;
 102
 103     float       z4;
 104     float       polyFN0, polyFN1, polyFD0, polyFD1;
 105
 106     z4          = z2*z2;
 107
 108     polyFD0     = FD4*z4 + FD2;
 109     polyFD1     = FD3*z4 + FD1;
 110     polyFD0     = polyFD0*z4 + FD0;
 111     polyFD0     = polyFD1*z2 + polyFD0;
 112
 113     polyFD0     = 1.0f/polyFD0;
 114
 115     polyFN0     = FN6*z4 + FN4;
 116     polyFN1     = FN5*z4 + FN3;
 117     polyFN0     = polyFN0*z4 + FN2;
 118     polyFN1     = polyFN1*z4 + FN1;
 119     polyFN0     = polyFN0*z4 + FN0;
 120     polyFN0     = polyFN1*z2 + polyFN0;
 121
 122     return polyFN0*polyFD0;
 123 }
 124
 125 /*! Final j-force reduction; this generic implementation works with
 126  *  arbitrary array sizes.
 127  */
 128 static inline __device__
 129 void reduce_force_j_generic(float *f_buf, float3 *fout,
 130                             int tidxi, int tidxj, int aidx)
 131 {
 132     if (tidxi == 0)
 133     {
 134         float3 f = make_float3(0.0f);
 135         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 136         {
 137             f.x += f_buf[                  j];
 138             f.y += f_buf[    FBUF_STRIDE + j];
 139             f.z += f_buf[2 * FBUF_STRIDE + j];
 140         }
 141
 142         atomicAdd(&fout[aidx], f);
 143     }
 144 }
 145
 146 /*! Final j-force reduction; this implementation only with power of two
 147  *  array sizes and with sm >= 3.0
 148  */
 149 #if __CUDA_ARCH__ >= 300
 150 static inline __device__
 151 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 152                               int tidxi, int aidx)
 153 {
 154     int i;
 155
 156 #pragma unroll 3
 157     for (i = 0; i < 3; i++)
 158     {
 159         f.x += __shfl_down(f.x, 1<<i);
 160         f.y += __shfl_down(f.y, 1<<i);
 161         f.z += __shfl_down(f.z, 1<<i);
 162     }
 163
 164     /* Write the reduced j-force on one thread for each j */
 165     if (tidxi == 0)
 166     {
 167         atomicAdd(&fout[aidx], f);
 168     }
 169 }
 170 #endif
 171
 172 /*! Final i-force reduction; this generic implementation works with
 173  *  arbitrary array sizes.
 174  */
 175 static inline __device__
 176 void reduce_force_i_generic(float *f_buf, float3 *fout,
 177                             float3 *fshift_buf, bool bCalcFshift,
 178                             int tidxi, int tidxj, int aidx)
 179 {
 180     if (tidxj == 0)
 181     {
 182         float3 f = make_float3(0.0f);
 183         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 184         {
 185             f.x += f_buf[                  j];
 186             f.y += f_buf[    FBUF_STRIDE + j];
 187             f.z += f_buf[2 * FBUF_STRIDE + j];
 188         }
 189
 190         atomicAdd(&fout[aidx], f);
 191
 192         if (bCalcFshift)
 193         {
 194             *fshift_buf += f;
 195         }
 196     }
 197 }
 198
 199 /*! Final i-force reduction; this implementation works only with power of two
 200  *  array sizes.
 201  */
 202 static inline __device__
 203 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 204                          float3 *fshift_buf, bool bCalcFshift,
 205                          int tidxi, int tidxj, int aidx)
 206 {
 207     int     i, j;
 208     float3  f = make_float3(0.0f);
 209
 210     /* Reduce the initial CL_SIZE values for each i atom to half
 211      * every step by using CL_SIZE * i threads.
 212      * Can't just use i as loop variable because than nvcc refuses to unroll.
 213      */
 214     i = CL_SIZE/2;
 215     # pragma unroll 5
 216     for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 217     {
 218         if (tidxj < i)
 219         {
 220
 221             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 222             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 223             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 224         }
 225         i >>= 1;
 226     }
 227
 228     /* i == 1, last reduction step, writing to global mem */
 229     if (tidxj == 0)
 230     {
 231         f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 232         f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 233         f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 234
 235         atomicAdd(&fout[aidx], f);
 236
 237         if (bCalcFshift)
 238         {
 239             *fshift_buf += f;
 240         }
 241     }
 242 }
 243
 244 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 245  *  on whether the size of the array to be reduced is power of two or not.
 246  */
 247 static inline __device__
 248 void reduce_force_i(float *f_buf, float3 *f,
 249                     float3 *fshift_buf, bool bCalcFshift,
 250                     int tidxi, int tidxj, int ai)
 251 {
 252     if ((CL_SIZE & (CL_SIZE - 1)))
 253     {
 254         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 255     }
 256     else
 257     {
 258         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 259     }
 260 }
 261
 262 /*! Final i-force reduction; this implementation works only with power of two
 263  *  array sizes and with sm >= 3.0
 264  */
 265 #if __CUDA_ARCH__ >= 300
 266 static inline __device__
 267 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 268                               float3 *fshift_buf, bool bCalcFshift,
 269                               int tidxj, int aidx)
 270 {
 271     int j;
 272
 273 #pragma unroll 2
 274     for (j = 0; j < 2; j++)
 275     {
 276         fin.x += __shfl_down(fin.x,  CL_SIZE<<j);
 277         fin.y += __shfl_down(fin.y,  CL_SIZE<<j);
 278         fin.z += __shfl_down(fin.z,  CL_SIZE<<j);
 279     }
 280
 281     /* The first thread in the warp writes the reduced force */
 282     if (tidxj == 0 || tidxj == 4)
 283     {
 284         atomicAdd(&fout[aidx], fin);
 285
 286         if (bCalcFshift)
 287         {
 288             fshift_buf->x += fin.x;
 289             fshift_buf->y += fin.y;
 290             fshift_buf->z += fin.z;
 291         }
 292     }
 293 }
 294 #endif
 295
 296 /*! Energy reduction; this implementation works only with power of two
 297  *  array sizes.
 298  */
 299 static inline __device__
 300 void reduce_energy_pow2(volatile float *buf,
 301                         float *e_lj, float *e_el,
 302                         unsigned int tidx)
 303 {
 304     int     i, j;
 305     float   e1, e2;
 306
 307     i = WARP_SIZE/2;
 308
 309     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 310 # pragma unroll 10
 311     for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 312     {
 313         if (tidx < i)
 314         {
 315             buf[              tidx] += buf[              tidx + i];
 316             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 317         }
 318         i >>= 1;
 319     }
 320
 321     /* last reduction step, writing to global mem */
 322     if (tidx == 0)
 323     {
 324         e1 = buf[              tidx] + buf[              tidx + i];
 325         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 326
 327         atomicAdd(e_lj, e1);
 328         atomicAdd(e_el, e2);
 329     }
 330 }
 331
 332 /*! Energy reduction; this implementation works only with power of two
 333  *  array sizes and with sm >= 3.0
 334  */
 335 #if __CUDA_ARCH__ >= 300
 336 static inline __device__
 337 void reduce_energy_warp_shfl(float E_lj, float E_el,
 338                              float *e_lj, float *e_el,
 339                              int tidx)
 340 {
 341     int i, sh;
 342
 343     sh = 1;
 344 #pragma unroll 5
 345     for (i = 0; i < 5; i++)
 346     {
 347         E_lj += __shfl_down(E_lj, sh);
 348         E_el += __shfl_down(E_el, sh);
 349         sh   += sh;
 350     }
 351
 352     /* The first thread in the warp writes the reduced energies */
 353     if (tidx == 0 || tidx == WARP_SIZE)
 354     {
 355         atomicAdd(e_lj, E_lj);
 356         atomicAdd(e_el, E_el);
 357     }
 358 }
 359 #endif /* __CUDA_ARCH__ */
 360
 361 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */