/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This source code is part of
 *
 * GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2012, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 */

/* Note that floating-point constants in CUDA code should be suffixed
 * with f (e.g. 0.5f), to stop the compiler producing intermediate
 * code that is in double precision.
 */

#include "../../gmxlib/cuda_tools/vectype_ops.cuh"

#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
#define NBNXN_CUDA_KERNEL_UTILS_CUH

#define WARP_SIZE_POW2_EXPONENT (5)
#define CL_SIZE_POW2_EXPONENT   (3) /* change this together with GPU_NS_CLUSTER_SIZE! */
#define CL_SIZE_SQ              (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE             (CL_SIZE_SQ)

/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);

/*! Interpolate Ewald coulomb force using the table stored in the tex_coulomb_tab texture.
 *  Original idea: OpenMM
 */
static inline __device__
float interpolate_coulomb_force_r(float r, float scale)
{
    float normalized = scale * r;
    int   index      = (int) normalized;
    float fract2     = normalized - index;
    float fract1     = 1.0f - fract2;

    return fract1 * tex1Dfetch(tex_coulomb_tab, index)
           + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
}
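
/* Illustrative sketch, not part of the original header: one plausible way the
 * tabulated force above can enter a short-range Coulomb force evaluation. The
 * variable names (qq, r2, inv_r, coulomb_tab_scale) are placeholders for this
 * example, not identifiers defined in this file; the table is assumed to hold
 * the long-range part that is subtracted from the bare 1/r^2 force.
 */
static inline __device__
float tabulated_coulomb_fscal_sketch(float qq, float r2, float inv_r,
                                     float coulomb_tab_scale)
{
    /* scalar force divided by r: F(r)/r = qq*(1/r^2 - F_tab(r))/r */
    return qq*(inv_r*inv_r - interpolate_coulomb_force_r(r2*inv_r, coulomb_tab_scale))*inv_r;
}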

/*! Calculate analytical Ewald correction term. */
static inline __device__
float pmecorrF(float z2)
{
    const float FN6 = -1.7357322914161492954e-8f;
    const float FN5 = 1.4703624142580877519e-6f;
    const float FN4 = -0.000053401640219807709149f;
    const float FN3 = 0.0010054721316683106153f;
    const float FN2 = -0.019278317264888380590f;
    const float FN1 = 0.069670166153766424023f;
    const float FN0 = -0.75225204789749321333f;

    const float FD4 = 0.0011193462567257629232f;
    const float FD3 = 0.014866955030185295499f;
    const float FD2 = 0.11583842382862377919f;
    const float FD1 = 0.50736591960530292870f;
    const float FD0 = 1.0f;

    float z4;
    float polyFN0, polyFN1, polyFD0, polyFD1;

    z4      = z2*z2;

    /* Evaluate the denominator polynomial in z2 */
    polyFD0 = FD4*z4 + FD2;
    polyFD1 = FD3*z4 + FD1;
    polyFD0 = polyFD0*z4 + FD0;
    polyFD0 = polyFD1*z2 + polyFD0;

    polyFD0 = 1.0f/polyFD0;

    /* Evaluate the numerator polynomial in z2 */
    polyFN0 = FN6*z4 + FN4;
    polyFN1 = FN5*z4 + FN3;
    polyFN0 = polyFN0*z4 + FN2;
    polyFN1 = polyFN1*z4 + FN1;
    polyFN0 = polyFN0*z4 + FN0;
    polyFN0 = polyFN1*z2 + polyFN0;

    return polyFN0*polyFD0;
}
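
/* Illustrative sketch, not part of the original header: the correction term
 * above is typically scaled by beta^3 (beta being the Ewald splitting
 * parameter) and added to the bare 1/r^3 term to obtain the short-range
 * Ewald scalar force. The variable names (qq, r2, inv_r, beta) are
 * placeholders for this example, not identifiers defined in this file.
 */
static inline __device__
float analytical_ewald_fscal_sketch(float qq, float r2, float inv_r, float beta)
{
    float beta2 = beta*beta;

    /* scalar force divided by r: F(r)/r = qq*(1/r^3 + beta^3*pmecorrF(beta^2*r^2)) */
    return qq*(inv_r*inv_r*inv_r + pmecorrF(beta2*r2)*beta2*beta);
}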

/*! Final j-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_j_generic(float *f_buf, float3 *fout,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxi == 0)
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }
        atomicAdd(&fout[aidx], f);
    }
}

/*! Final j-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                              int tidxi, int aidx)
{
#pragma unroll
    for (int i = 0; i < 3; i++)
    {
        f.x += __shfl_down(f.x, 1<<i);
        f.y += __shfl_down(f.y, 1<<i);
        f.z += __shfl_down(f.z, 1<<i);
    }

    /* Write the reduced j-force on one thread for each j */
    if (tidxi == 0)
    {
        atomicAdd(&fout[aidx], f);
    }
}
#endif /* __CUDA_ARCH__ >= 300 */
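
/* Illustrative sketch, not part of the original header: the __shfl_down()
 * reductions in this file all use the same binary-tree pattern with
 * power-of-two offsets, shown here for a single float summed over a full
 * 32-lane warp (offsets 16, 8, 4, 2, 1; lane 0 ends up with the total).
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
float warp_sum_float_sketch(float x)
{
    for (int offset = (1 << WARP_SIZE_POW2_EXPONENT) >> 1; offset > 0; offset >>= 1)
    {
        x += __shfl_down(x, offset);
    }
    return x;
}
#endif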

/*! Final i-force reduction; this generic implementation works with
 *  arbitrary array sizes.
 */
static inline __device__
void reduce_force_i_generic(float *f_buf, float3 *fout,
                            float3 *fshift_buf, bool bCalcFshift,
                            int tidxi, int tidxj, int aidx)
{
    if (tidxj == 0)
    {
        float3 f = make_float3(0.0f);
        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
        {
            f.x += f_buf[                  j];
            f.y += f_buf[    FBUF_STRIDE + j];
            f.z += f_buf[2 * FBUF_STRIDE + j];
        }
        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
                         float3 *fshift_buf, bool bCalcFshift,
                         int tidxi, int tidxj, int aidx)
{
    int    i, j;
    float3 f = make_float3(0.0f);

    /* Reduce the initial CL_SIZE values for each i atom to half
     * every step by using CL_SIZE * i threads.
     * Can't just use i as loop variable because then nvcc refuses to unroll.
     */
    i = CL_SIZE/2;
#pragma unroll
    for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidxj < i)
        {
            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        }
        i >>= 1;
    }

    /* i == 1, last reduction step, writing to global mem */
    if (tidxj == 0)
    {
        f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
        f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
        f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];

        atomicAdd(&fout[aidx], f);

        if (bCalcFshift)
        {
            fshift_buf->x += f.x;
            fshift_buf->y += f.y;
            fshift_buf->z += f.z;
        }
    }
}

/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 *  on whether the size of the array to be reduced is power of two or not.
 */
static inline __device__
void reduce_force_i(float *f_buf, float3 *f,
                    float3 *fshift_buf, bool bCalcFshift,
                    int tidxi, int tidxj, int ai)
{
    if ((CL_SIZE & (CL_SIZE - 1)))
    {
        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
    else
    {
        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
    }
}

/*! Final i-force reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                              float3 *fshift_buf, bool bCalcFshift,
                              int tidxj, int aidx)
{
#pragma unroll
    for (int j = 0; j < 2; j++)
    {
        fin.x += __shfl_down(fin.x, CL_SIZE<<j);
        fin.y += __shfl_down(fin.y, CL_SIZE<<j);
        fin.z += __shfl_down(fin.z, CL_SIZE<<j);
    }

    /* The first thread in the warp writes the reduced force */
    if (tidxj == 0 || tidxj == 4)
    {
        atomicAdd(&fout[aidx], fin);

        if (bCalcFshift)
        {
            fshift_buf->x += fin.x;
            fshift_buf->y += fin.y;
            fshift_buf->z += fin.z;
        }
    }
}
#endif /* __CUDA_ARCH__ >= 300 */

/*! Energy reduction; this implementation works only with power of two
 *  array sizes.
 */
static inline __device__
void reduce_energy_pow2(volatile float *buf,
                        float *e_lj, float *e_el,
                        unsigned int tidx)
{
    int   i, j;
    float e1, e2;

    i = WARP_SIZE/2;

    /* Can't just use i as loop variable because then nvcc refuses to unroll. */
#pragma unroll
    for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
    {
        if (tidx < i)
        {
            buf[              tidx] += buf[              tidx + i];
            buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
        }
        i >>= 1;
    }

    /* last reduction step, writing to global mem */
    if (tidx == 0)
    {
        e1 = buf[              tidx] + buf[              tidx + i];
        e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];

        atomicAdd(e_lj, e1);
        atomicAdd(e_el, e2);
    }
}

/*! Energy reduction; this implementation works only with power of two
 *  array sizes and with sm >= 3.0
 */
#if __CUDA_ARCH__ >= 300
static inline __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
                             float *e_lj, float *e_el,
                             int tidx)
{
    int sh = 1;

#pragma unroll
    for (int i = 0; i < 5; i++)
    {
        E_lj += __shfl_down(E_lj, sh);
        E_el += __shfl_down(E_el, sh);
        sh   += sh;
    }

    /* The first thread in the warp writes the reduced energies */
    if (tidx == 0 || tidx == WARP_SIZE)
    {
        atomicAdd(e_lj, E_lj);
        atomicAdd(e_el, E_el);
    }
}
#endif /* __CUDA_ARCH__ >= 300 */

#endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */