/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/* Note that floating-point constants in CUDA code should be suffixed
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */

#include "../../gmxlib/cuda_tools/vectype_ops.cuh"

#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
#define NBNXN_CUDA_KERNEL_UTILS_CUH

#define WARP_SIZE_POW2_EXPONENT     (5)
#define CL_SIZE_POW2_EXPONENT       (3) /* change this together with GPU_NS_CLUSTER_SIZE! */
#define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE                 (CL_SIZE_SQ)

/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);

/*! Interpolate the Ewald coulomb force using the table read through the
 *  tex_coulomb_tab texture.
 *  Original idea: OpenMM
 */
static inline __device__
float interpolate_coulomb_force_r(float r, float scale)
{
    float   normalized = scale * r;
    int     index      = (int) normalized;
    float   fract2     = normalized - index;
    float   fract1     = 1.0f - fract2;

    return  fract1 * tex1Dfetch(tex_coulomb_tab, index)
            + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
}
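
/* A minimal reference sketch (illustrative only, not used by the kernels):
 * the same linear interpolation on a plain array, with coulomb_tab standing
 * in as a hypothetical pointer to the data behind tex_coulomb_tab. With
 * t = scale * r and i = (int) t, the result is
 * (1 - (t - i)) * tab[i] + (t - i) * tab[i + 1].
 */
static inline __device__
float interpolate_coulomb_force_r_ref(const float *coulomb_tab,
                                      float r, float scale)
{
    float t = scale * r;
    int   i = (int) t;   /* table index */
    float w = t - i;     /* fractional weight of the upper table entry */

    return (1.0f - w) * coulomb_tab[i] + w * coulomb_tab[i + 1];
}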

/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_j_generic(float *f_buf, float3 *fout,
                            int tidxi, int tidxj, int aidx)
{
    /* One thread per j-atom accumulates the CL_SIZE partial forces. */
    if (tidxi == 0)
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);
    }
}
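
/* Layout assumed for the shared reduction buffer f_buf (a sketch inferred
 * from the indexing above): three planes of FBUF_STRIDE = CL_SIZE_SQ floats,
 * one per force component, with thread (tidxi, tidxj) storing its partial
 * force at offset tidxj * CL_SIZE + tidxi within each plane:
 *
 *   f_buf[               0 ..     FBUF_STRIDE)   x components
 *   f_buf[     FBUF_STRIDE .. 2 * FBUF_STRIDE)   y components
 *   f_buf[ 2 * FBUF_STRIDE .. 3 * FBUF_STRIDE)   z components
 */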

/*! Final j-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                              int tidxi, int aidx)
{
    int i;

#pragma unroll 3
    for (i = 0; i < 3; i++)
    {
        f.x += __shfl_down(f.x, 1 << i);
        f.y += __shfl_down(f.y, 1 << i);
        f.z += __shfl_down(f.z, 1 << i);
    }

    /* Write the reduced j-force on one thread for each j */
    if (tidxi == 0)
    {
        atomicAdd(&fout[aidx], f);
    }
}
#endif
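
/* Sketch of the shuffle tree above, assuming CL_SIZE == 8: the offsets
 * 1, 2 and 4 fold each run of 8 consecutive lanes onto its first lane,
 * so after three steps the lane with tidxi == 0 holds the full j-force
 * without any shared memory traffic or synchronization. */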

/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_i_generic(float *f_buf, float3 *fout,
                            float3 *fshift_buf, bool bCalcFshift,
                            int tidxi, int tidxj, int aidx)
{
    /* One thread per i-atom accumulates the CL_SIZE partial forces. */
    if (tidxj == 0)
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }

        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
                         float3 *fshift_buf, bool bCalcFshift,
                         int tidxi, int tidxj, int aidx)
{
    int     i, j;
    float3  f = make_float3(0.0f);

    /* Reduce the initial CL_SIZE values for each i atom to half
     * every step by using CL_SIZE * i threads.
     * Can't just use i as loop variable because then nvcc refuses to unroll.
     */
    i = CL_SIZE/2;
#pragma unroll 5
    for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidxj < i)
        {
            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        }
        i >>= 1;
    }

    /* i == 1, last reduction step, writing to global mem */
    if (tidxj == 0)
    {
        f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
        f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];

        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}
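
/* Step-by-step sketch of the tree above, assuming CL_SIZE == 8 (so
 * CL_SIZE_POW2_EXPONENT == 3): the loop runs with i = 4 and then i = 2,
 * each step adding row tidxj + i onto row tidxj; the final step outside
 * the loop pairs rows 0 and 1 with i == 1 and writes the result directly,
 * saving one loop iteration on the last level. */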

/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is a power of two or not.
 */
static inline __device__
void reduce_force_i(float *f_buf, float3 *f,
                    float3 *fshift_buf, bool bCalcFshift,
                    int tidxi, int tidxj, int ai)
{
    /* CL_SIZE & (CL_SIZE - 1) is non-zero iff CL_SIZE is not a power of two. */
    if ((CL_SIZE & (CL_SIZE - 1)))
    {
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
    else
    {
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
}

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                              float3 *fshift_buf, bool bCalcFshift,
                              int tidxj, int aidx)
{
    int j;

#pragma unroll 2
    for (j = 0; j < 2; j++)
    {
        fin.x += __shfl_down(fin.x, CL_SIZE << j);
        fin.y += __shfl_down(fin.y, CL_SIZE << j);
        fin.z += __shfl_down(fin.z, CL_SIZE << j);
    }

    /* The first thread in the warp writes the reduced force */
    if (tidxj == 0 || tidxj == 4)
    {
        atomicAdd(&fout[aidx], fin);

        if (bCalcFshift)
        {
            fshift_buf->x += fin.x;
            fshift_buf->y += fin.y;
            fshift_buf->z += fin.z;
        }
    }
}
#endif
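
/* A note on the tidxj == 0 || tidxj == 4 test above, assuming an 8x8
 * thread block (CL_SIZE == 8): the 64 threads form two warps, covering
 * rows tidxj = 0..3 and tidxj = 4..7 respectively, so the shuffles with
 * offsets CL_SIZE and 2 * CL_SIZE fold four rows onto the first row of
 * each warp, and exactly those two rows hold complete sums. */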

/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_energy_pow2(volatile float *buf,
                        float *e_lj, float *e_el,
                        unsigned int tidx)
{
    int     i, j;
    float   e1, e2;

    i = WARP_SIZE/2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
#pragma unroll 10
    for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[              tidx] += buf[              tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        e1 = buf[              tidx] + buf[              tidx + i];
        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];

        atomicAdd(e_lj, e1);
        atomicAdd(e_el, e2);
    }
}
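
/* Buffer layout assumed here (inferred from the indexing above): one strip
 * of per-thread Lennard-Jones energies at buf[0 .. FBUF_STRIDE) and one
 * strip of electrostatic energies at buf[FBUF_STRIDE .. 2 * FBUF_STRIDE),
 * reduced pairwise in lockstep so both totals finish together. */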

/*! Energy reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
                             float *e_lj, float *e_el,
                             int tidx)
{
    int i, sh;

    sh = 1;
#pragma unroll 5
    for (i = 0; i < 5; i++)
    {
        E_lj += __shfl_down(E_lj, sh);
        E_el += __shfl_down(E_el, sh);
        sh += sh;
    }

    /* The first thread in the warp writes the reduced energies */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd(e_lj, E_lj);
        atomicAdd(e_el, E_el);
    }
}
#endif /* __CUDA_ARCH__ */
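
/* A minimal usage sketch (hypothetical, not a kernel from this package):
 * a kernel epilogue would fold each thread's partial energies across the
 * warp and let the leading lanes commit the totals, e.g.
 *
 *   reduce_energy_warp_shfl(E_lj_thread, E_el_thread,
 *                           e_lj_global, e_el_global, tidx);
 *
 * where E_lj_thread/E_el_thread are thread-local accumulators and
 * e_lj_global/e_el_global point to single global-memory totals. */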

#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */