src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  12  * Copyright (c) 2001-2012, The GROMACS development team,
  13  * check out http://www.gromacs.org for more information.
  14  *
  15  * This program is free software; you can redistribute it and/or
  16  * modify it under the terms of the GNU General Public License
  17  * as published by the Free Software Foundation; either version 2
  18  * of the License, or (at your option) any later version.
  19  *
  20  * If you want to redistribute modifications, please consider that
  21  * scientific software is very special. Version control is crucial -
  22  * bugs must be traceable. We will be happy to consider code for
  23  * inclusion in the official distribution, but derived work must not
  24  * be called official GROMACS. Details are found in the README & COPYING
  25  * files - if they are missing, get the official version at www.gromacs.org.
  26  *
  27  * To help us fund GROMACS development, we humbly ask that you cite
  28  * the papers on the package - you can find them in the top README file.
  29  *
  30  * For more info, check our website at http://www.gromacs.org
  31  *
  32  * And Hey:
  33  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  34  */
  35
  36 /* Note that floating-point constants in CUDA code should be suffixed
  37  * with f (e.g. 0.5f), to stop the compiler producing intermediate
  38  * code that is in double precision.
  39  */
  40
  41 #include "../../gmxlib/cuda_tools/vectype_ops.cuh"
  42
  43 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
  44 #define NBNXN_CUDA_KERNEL_UTILS_CUH
  45
  46 #define WARP_SIZE_POW2_EXPONENT     (5)
  47 #define CL_SIZE_POW2_EXPONENT       (3)  /* change this together with GPU_NS_CLUSTER_SIZE !*/
  48 #define CL_SIZE_SQ                  (CL_SIZE * CL_SIZE)
  49 #define FBUF_STRIDE                 (CL_SIZE_SQ)
  50
  51 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
  52  *  Original idea: OpenMM
  53  */
  54 static inline __device__
  55 float interpolate_coulomb_force_r(float r, float scale)
  56 {
  57     float   normalized = scale * r;
  58     int     index = (int) normalized;
  59     float   fract2 = normalized - index;
  60     float   fract1 = 1.0f - fract2;
  61
  62     return  fract1 * tex1Dfetch(tex_coulomb_tab, index)
  63             + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
  64 }
  65
  66 /*! Final j-force reduction; this generic implementation works with
  67  *  arbitrary array sizes.
  68  */
  69 static inline __device__
  70 void reduce_force_j_generic(float *f_buf, float3 *fout,
  71                             int tidxi, int tidxj, int aidx)
  72 {
  73     if (tidxi == 0)
  74     {
  75         float3 f = make_float3(0.0f);
  76         for (int j = tidxj * CL_SIZE; j < (tidxj + 1) * CL_SIZE; j++)
  77         {
  78             f.x += f_buf[                  j];
  79             f.y += f_buf[    FBUF_STRIDE + j];
  80             f.z += f_buf[2 * FBUF_STRIDE + j];
  81         }
  82
  83         atomicAdd(&fout[aidx], f);
  84     }
  85 }
  86
  87 /*! Final j-force reduction; this implementation only with power of two
  88  *  array sizes and with sm >= 3.0
  89  */
  90 #if __CUDA_ARCH__ >= 300
  91 static inline __device__
  92 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
  93                               int tidxi, int aidx)
  94 {
  95     int i;
  96
  97 #pragma unroll 3
  98     for (i = 0; i < 3; i++)
  99     {
 100         f.x += __shfl_down(f.x, 1<<i);
 101         f.y += __shfl_down(f.y, 1<<i);
 102         f.z += __shfl_down(f.z, 1<<i);
 103     }
 104
 105     /* Write the reduced j-force on one thread for each j */
 106     if (tidxi == 0)
 107     {
 108         atomicAdd(&fout[aidx], f);
 109     }
 110 }
 111 #endif
 112
 113 /*! Final i-force reduction; this generic implementation works with
 114  *  arbitrary array sizes.
 115  */
 116 static inline __device__
 117 void reduce_force_i_generic(float *f_buf, float3 *fout,
 118                             float3 *fshift_buf, bool bCalcFshift,
 119                             int tidxi, int tidxj, int aidx)
 120 {
 121     if (tidxj == 0)
 122     {
 123         float3 f = make_float3(0.0f);
 124         for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
 125         {
 126             f.x += f_buf[                  j];
 127             f.y += f_buf[    FBUF_STRIDE + j];
 128             f.z += f_buf[2 * FBUF_STRIDE + j];
 129         }
 130
 131         atomicAdd(&fout[aidx], f);
 132
 133         if (bCalcFshift)
 134         {
 135             *fshift_buf += f;
 136         }
 137     }
 138 }
 139
 140 /*! Final i-force reduction; this implementation works only with power of two
 141  *  array sizes.
 142  */
 143 static inline __device__
 144 void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
 145                          float3 *fshift_buf, bool bCalcFshift,
 146                          int tidxi, int tidxj, int aidx)
 147 {
 148     int     i, j;
 149     float3  f = make_float3(0.0f);
 150
 151     /* Reduce the initial CL_SIZE values for each i atom to half
 152      * every step by using CL_SIZE * i threads.
 153      * Can't just use i as loop variable because than nvcc refuses to unroll.
 154      */
 155     i = CL_SIZE/2;
 156     # pragma unroll 5
 157     for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 158     {
 159         if (tidxj < i)
 160         {
 161
 162             f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 163             f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 164             f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 165         }
 166         i >>= 1;
 167     }
 168
 169     /* i == 1, last reduction step, writing to global mem */
 170     if (tidxj == 0)
 171     {
 172         f.x = f_buf[                  tidxj * CL_SIZE + tidxi] + f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
 173         f.y = f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 174         f.z = f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] + f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
 175
 176         atomicAdd(&fout[aidx], f);
 177
 178         if (bCalcFshift)
 179         {
 180             *fshift_buf += f;
 181         }
 182     }
 183 }
 184
 185 /*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
 186  *  on whether the size of the array to be reduced is power of two or not.
 187  */
 188 static inline __device__
 189 void reduce_force_i(float *f_buf, float3 *f,
 190                     float3 *fshift_buf, bool bCalcFshift,
 191                     int tidxi, int tidxj, int ai)
 192 {
 193     if ((CL_SIZE & (CL_SIZE - 1)))
 194     {
 195         reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 196     }
 197     else
 198     {
 199         reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
 200     }
 201 }
 202
 203 /*! Final i-force reduction; this implementation works only with power of two
 204  *  array sizes and with sm >= 3.0
 205  */
 206 #if __CUDA_ARCH__ >= 300
 207 static inline __device__
 208 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
 209                               float3 *fshift_buf, bool bCalcFshift,
 210                               int tidxj, int aidx)
 211 {
 212     int j;
 213
 214 #pragma unroll 2
 215     for (j = 0; j < 2; j++)
 216     {
 217         fin.x += __shfl_down(fin.x,  CL_SIZE<<j);
 218         fin.y += __shfl_down(fin.y,  CL_SIZE<<j);
 219         fin.z += __shfl_down(fin.z,  CL_SIZE<<j);
 220     }
 221
 222     /* The first thread in the warp writes the reduced force */
 223     if (tidxj == 0 || tidxj == 4)
 224     {
 225         atomicAdd(&fout[aidx], fin);
 226
 227         if (bCalcFshift)
 228         {
 229             fshift_buf->x += fin.x;
 230             fshift_buf->y += fin.y;
 231             fshift_buf->z += fin.z;
 232         }
 233     }
 234 }
 235 #endif
 236
 237 /*! Energy reduction; this implementation works only with power of two
 238  *  array sizes.
 239  */
 240 static inline __device__
 241 void reduce_energy_pow2(volatile float *buf,
 242                         float *e_lj, float *e_el,
 243                         unsigned int tidx)
 244 {
 245     int     i, j;
 246     float   e1, e2;
 247
 248     i = WARP_SIZE/2;
 249
 250     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 251 # pragma unroll 10
 252     for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
 253     {
 254         if (tidx < i)
 255         {
 256             buf[              tidx] += buf[              tidx + i];
 257             buf[FBUF_STRIDE + tidx] += buf[FBUF_STRIDE + tidx + i];
 258         }
 259         i >>= 1;
 260     }
 261
 262     /* last reduction step, writing to global mem */
 263     if (tidx == 0)
 264     {
 265         e1 = buf[              tidx] + buf[              tidx + i];
 266         e2 = buf[FBUF_STRIDE + tidx] + buf[FBUF_STRIDE + tidx + i];
 267
 268         atomicAdd(e_lj, e1);
 269         atomicAdd(e_el, e2);
 270     }
 271 }
 272
 273 /*! Energy reduction; this implementation works only with power of two
 274  *  array sizes and with sm >= 3.0
 275  */
 276 #if __CUDA_ARCH__ >= 300
 277 static inline __device__
 278 void reduce_energy_warp_shfl(float E_lj, float E_el,
 279                              float *e_lj, float *e_el,
 280                              int tidx)
 281 {
 282     int i, sh;
 283
 284     sh = 1;
 285 #pragma unroll 5
 286     for (i = 0; i < 5; i++)
 287     {
 288         E_lj += __shfl_down(E_lj,sh);
 289         E_el += __shfl_down(E_el,sh);
 290         sh += sh;
 291     }
 292
 293     /* The first thread in the warp writes the reduced energies */
 294     if (tidx == 0 || tidx == WARP_SIZE)
 295     {
 296         atomicAdd(e_lj,E_lj);
 297         atomicAdd(e_el,E_el);
 298     }
 299 }
 300 #endif /* __CUDA_ARCH__ */
 301
 302 #endif /* NBNXN_CUDA_KERNEL_UTILS_CUH */