src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 /* Note that floating-point constants in CUDA code should be suffixed
  40  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  41  * code that is in double precision.
  42  */
  43
  44 #include "../../gmxlib/cuda_tools/vectype_ops.cuh"
  45
  46 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  47 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  48
  49 #define WARP_SIZE_POW2_EXPONENT     (5)
  50 #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  51 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  52 #define FBUF_STRIDE                 (CL_SIZE_SQ)
  53
  54 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
  55 const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
  56
  57 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
  58  *  Original idea: OpenMM
  59  */
  60 static inline __device__
  61 float interpolate_coulomb_force_r(float r, float scale)
  62 {
  63     float   normalized = scale * r;
  64     int     index = (int) normalized;
  65     float   fract2 = normalized - index;
  66     float   fract1 = 1.0f - fract2;
  67
  68     return  fract1 * tex1Dfetch(coulomb_tab_texref, index)
  69             + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
  70 }
  71
  72 #ifdef TEXOBJ_SUPPORTED
  73 static inline __device__
  74 float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
  75                                   float r, float scale)
  76 {
  77     float   normalized = scale * r;
  78     int     index = (int) normalized;
  79     float   fract2 = normalized - index;
  80     float   fract1 = 1.0f - fract2;
  81
  82     return  fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
  83             fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
  84 }
  85 #endif
  86
  87
  88 /*! Calculate analytical Ewald correction term. */
  89 static inline __device__
  90 float pmecorrF(float z2)
  91 {
  92     const float FN6 = -1.7357322914161492954e-8f;
  93     const float FN5 = 1.4703624142580877519e-6f;
  94     const float FN4 = -0.000053401640219807709149f;
  95     const float FN3 = 0.0010054721316683106153f;
  96     const float FN2 = -0.019278317264888380590f;
  97     const float FN1 = 0.069670166153766424023f;
  98     const float FN0 = -0.75225204789749321333f;
  99
 100     const float FD4 = 0.0011193462567257629232f;
 101     const float FD3 = 0.014866955030185295499f;
 102     const float FD2 = 0.11583842382862377919f;
 103     const float FD1 = 0.50736591960530292870f;
 104     const float FD0 = 1.0f;
 105
 106     float       z4;
 107     float       polyFN0,polyFN1,polyFD0,polyFD1;
 108
 109     z4          = z2*z2;
 110
 111     polyFD0     = FD4*z4 + FD2;
 112     polyFD1     = FD3*z4 + FD1;
 113     polyFD0     = polyFD0*z4 + FD0;
 114     polyFD0     = polyFD1*z2 + polyFD0;
 115
 116     polyFD0     = 1.0f/polyFD0;
 117
 118     polyFN0     = FN6*z4 + FN4;
 119     polyFN1     = FN5*z4 + FN3;
 120     polyFN0     = polyFN0*z4 + FN2;
 121     polyFN1     = polyFN1*z4 + FN1;
 122     polyFN0     = polyFN0*z4 + FN0;
 123     polyFN0     = polyFN1*z2 + polyFN0;
 124
 125     return polyFN0*polyFD0;
 126 }
 127
 128 /*! Final j-force reduction; this generic implementation works with
 129  *  arbitrary array sizes.
 130  */
 131 static inline __device__
 132 void reduce_force_j_generic(float *f_buf, float3 *fout,
 133                             int tidxi, int tidxj, int aidx)
 134 {
 135     if (tidxi == 0)
 136     {
 137         float3 f = make_float3(0.0f);
 138         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
 139         {
 140             f.x += f_buf[                  j];
 141             f.y += f_buf[    FBUF_STRIDE + j];
 142             f.z += f_buf[2 * FBUF_STRIDE + j];
 143         }
 144
 145         atomicAdd(&fout[aidx], f);
 146     }
 147 }
 148
 149 /*! Final j-force reduction; this implementation only with power of two
 150  *  array sizes and with sm >= 3.0
 151  */
 152 #if __CUDA_ARCH__ >= 300
 153 static inline __device__
 154 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
 155                               int tidxi, int aidx)
 156 {
 157     int i;
 158
 159 #pragma unroll 3
 160     for (i = 0; i < 3; i++)
 161     {
 162         f.x += __shfl_down(f.x, 1<<i);
 163         f.y += __shfl_down(f.y, 1<<i);
 164         f.z += __shfl_down(f.z, 1<<i);
 165     }
 166
 167     /* Write the reduced j-force on one thread for each j */
 168     if (tidxi == 0)
 169     {
 170         atomicAdd(&fout[aidx], f);
 171     }
 172 }
 173 #endif
 174
 175 /*! Final i-force reduction; this generic implementation works with
 176  *  arbitrary array sizes.
 177  */
 178 static inline __device__
 179 void reduce_force_i_generic(float *f_buf, float3 *fout,
 180                             float3 *fshift_buf, bool bCalcFshift,
 181                             int tidxi, int tidxj, int aidx)
 182 {
 183     if (tidxj == 0)
 184     {
 185         float3 f = make_float3(0.0f);
 186         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 187         {
 188             f.x += f_buf[                  j];
 189             f.y += f_buf[    FBUF_STRIDE + j];
 190             f.z += f_buf[2 * FBUF_STRIDE + j];
 191         }
 192
 193         atomicAdd(&fout[aidx], f);
 194
 195         if (bCalcFshift)
 196         {
 197             *fshift_buf += f;
 198         }
 199     }
 200 }
 201
 202 /*! Final i-force reduction; this implementation works only with power of two
 203  *  array sizes.
 204  */
 205 static inline __device__
 206 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 207                          float3 *fshift_buf, bool bCalcFshift,
 208                          int tidxi, int tidxj, int aidx)
 209 {
 210     int     i, j;
 211     float3  f = make_float3(0.0f);
 212
 213     /* Reduce the initial CL_SIZE values for each i atom to half
 214      * every step by using CL_SIZE * i threads.
 215      * Can't just use i as loop variable because than nvcc refuses to unroll.
 216      */
 217     i = CL_SIZE/2;
 218     # pragma unroll 5
 219     for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 220     {
 221         if (tidxj < i)
 222         {
 223
 224             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 225             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 226             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 227         }
 228         i >>= 1;
 229     }
 230
 231     /* i == 1, last reduction step, writing to global mem */
 232     if (tidxj == 0)
 233     {
 234         f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 235         f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 236         f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 237
 238         atomicAdd(&fout[aidx], f);
 239
 240         if (bCalcFshift)
 241         {
 242             *fshift_buf += f;
 243         }
 244     }
 245 }
 246
 247 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 248  *  on whether the size of the array to be reduced is power of two or not.
 249  */
 250 static inline __device__
 251 void reduce_force_i(float *f_buf, float3 *f,
 252                     float3 *fshift_buf, bool bCalcFshift,
 253                     int tidxi, int tidxj, int ai)
 254 {
 255     if ((CL_SIZE & (CL_SIZE - 1)))
 256     {
 257         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 258     }
 259     else
 260     {
 261         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 262     }
 263 }
 264
 265 /*! Final i-force reduction; this implementation works only with power of two
 266  *  array sizes and with sm >= 3.0
 267  */
 268 #if __CUDA_ARCH__ >= 300
 269 static inline __device__
 270 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 271                               float3 *fshift_buf, bool bCalcFshift,
 272                               int tidxj, int aidx)
 273 {
 274     int j;
 275
 276 #pragma unroll 2
 277     for (j = 0; j < 2; j++)
 278     {
 279         fin.x += __shfl_down(fin.x,  CL_SIZE<<j);
 280         fin.y += __shfl_down(fin.y,  CL_SIZE<<j);
 281         fin.z += __shfl_down(fin.z,  CL_SIZE<<j);
 282     }
 283
 284     /* The first thread in the warp writes the reduced force */
 285     if (tidxj == 0 || tidxj == 4)
 286     {
 287         atomicAdd(&fout[aidx], fin);
 288
 289         if (bCalcFshift)
 290         {
 291             fshift_buf->x += fin.x;
 292             fshift_buf->y += fin.y;
 293             fshift_buf->z += fin.z;
 294         }
 295     }
 296 }
 297 #endif
 298
 299 /*! Energy reduction; this implementation works only with power of two
 300  *  array sizes.
 301  */
 302 static inline __device__
 303 void reduce_energy_pow2(volatile float *buf,
 304                         float *e_lj, float *e_el,
 305                         unsigned int tidx)
 306 {
 307     int     i, j;
 308     float   e1, e2;
 309
 310     i = WARP_SIZE/2;
 311
 312     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 313 # pragma unroll 10
 314     for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 315     {
 316         if (tidx < i)
 317         {
 318             buf[              tidx] += buf[              tidx + i];
 319             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 320         }
 321         i >>= 1;
 322     }
 323
 324     /* last reduction step, writing to global mem */
 325     if (tidx == 0)
 326     {
 327         e1 = buf[              tidx] + buf[              tidx + i];
 328         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 329
 330         atomicAdd(e_lj, e1);
 331         atomicAdd(e_el, e2);
 332     }
 333 }
 334
 335 /*! Energy reduction; this implementation works only with power of two
 336  *  array sizes and with sm >= 3.0
 337  */
 338 #if __CUDA_ARCH__ >= 300
 339 static inline __device__
 340 void reduce_energy_warp_shfl(float E_lj, float E_el,
 341                              float *e_lj, float *e_el,
 342                              int tidx)
 343 {
 344     int i, sh;
 345
 346     sh = 1;
 347 #pragma unroll 5
 348     for (i = 0; i < 5; i++)
 349     {
 350         E_lj += __shfl_down(E_lj,sh);
 351         E_el += __shfl_down(E_el,sh);
 352         sh += sh;
 353     }
 354
 355     /* The first thread in the warp writes the reduced energies */
 356     if (tidx == 0 || tidx == WARP_SIZE)
 357     {
 358         atomicAdd(e_lj,E_lj);
 359         atomicAdd(e_el,E_el);
 360     }
 361 }
 362 #endif /* __CUDA_ARCH__ */
 363
 364 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */