src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  *                        VERSION 3.2.0
  11  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  12  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  13  * Copyright (c) 2001-2004, The GROMACS development team,
  14  * check out http://www.gromacs.org for more information.
  15
  16  * This program is free software; you can redistribute it and/or
  17  * modify it under the terms of the GNU General Public License
  18  * as published by the Free Software Foundation; either version 2
  19  * of the License, or (at your option) any later version.
  20  *
  21  * If you want to redistribute modifications, please consider that
  22  * scientific software is very special. Version control is crucial -
  23  * bugs must be traceable. We will be happy to consider code for
  24  * inclusion in the official distribution, but derived work must not
  25  * be called official GROMACS. Details are found in the README & COPYING
  26  * files - if they are missing, get the official version at www.gromacs.org.
  27  *
  28  * To help us fund GROMACS development, we humbly ask that you cite
  29  * the papers on the package - you can find them in the top README file.
  30  *
  31  * For more info, check our website at http://www.gromacs.org
  32  *
  33  * And Hey:
  34  * GROningen Mixture of Alchemy and Childrens' Stories
  35  */
  36 #ifdef HAVE_CONFIG_H
  37 #include <config.h>
  38 #endif
  39
  40 #include <math.h>
  41
  42 #include "types/simple.h"
  43 #include "maths.h"
  44 #include "vec.h"
  45 #include "typedefs.h"
  46 #include "force.h"
  47 #include "nbnxn_kernel_gpu_ref.h"
  48 #include "../nbnxn_consts.h"
  49 #include "nbnxn_kernel_common.h"
  50
  51 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  52 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  53
  54 void
  55 nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t     *nbl,
  56                      const nbnxn_atomdata_t     *nbat,
  57                      const interaction_const_t  *iconst,
  58                      rvec                       *shift_vec,
  59                      int                        force_flags,
  60                      int                        clearF,
  61                      real *                     f,
  62                      real *                     fshift,
  63                      real *                     Vc,
  64                      real *                     Vvdw)
  65 {
  66     const nbnxn_sci_t *nbln;
  67     const real    *x;
  68     gmx_bool      bEner;
  69     gmx_bool      bEwald;
  70     const real    *Ftab=NULL;
  71     real          rcut2,rvdw2,rlist2;
  72     int           ntype;
  73     real          facel;
  74     int           n;
  75     int           ish3;
  76     int           sci;
  77     int           cj4_ind0,cj4_ind1,cj4_ind;
  78     int           ci,cj;
  79     int           ic,jc,ia,ja,is,ifs,js,jfs,im,jm;
  80     int           n0;
  81     int           ggid;
  82     real          shX,shY,shZ;
  83     real          fscal,tx,ty,tz;
  84     real          rinvsq;
  85     real          iq;
  86     real          qq,vcoul=0,krsq,vctot;
  87     int           nti;
  88     int           tj;
  89     real          rt,r,eps;
  90     real          rinvsix;
  91     real          Vvdwtot;
  92     real          Vvdw_rep,Vvdw_disp;
  93     real          ix,iy,iz,fix,fiy,fiz;
  94     real          jx,jy,jz;
  95     real          dx,dy,dz,rsq,rinv;
  96     int           int_bit;
  97     real          fexcl;
  98     real          c6,c12,cexp1,cexp2,br;
  99     const real *  shiftvec;
 100     real *        vdwparam;
 101     int *         shift;
 102     int *         type;
 103     const nbnxn_excl_t *excl[2];
 104
 105     int           npair_tot,npair;
 106     int           nhwu,nhwu_pruned;
 107
 108     if (nbl->na_ci != CL_SIZE)
 109     {
 110         gmx_fatal(FARGS,"The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d",nbl->na_ci,CL_SIZE);
 111     }
 112
 113     if (clearF == enbvClearFYes)
 114     {
 115         clear_f(nbat, 0, f);
 116     }
 117
 118     bEner = (force_flags & GMX_FORCE_ENERGY);
 119
 120     bEwald = EEL_FULL(iconst->eeltype);
 121     if (bEwald)
 122     {
 123         Ftab = iconst->tabq_coul_F;
 124     }
 125
 126     rcut2               = iconst->rcoulomb*iconst->rcoulomb;
 127     rvdw2               = iconst->rvdw*iconst->rvdw;
 128
 129     rlist2              = nbl->rlist*nbl->rlist;
 130
 131     type                = nbat->type;
 132     facel               = iconst->epsfac;
 133     shiftvec            = shift_vec[0];
 134     vdwparam            = nbat->nbfp;
 135     ntype               = nbat->ntype;
 136
 137     x = nbat->x;
 138
 139     npair_tot   = 0;
 140     nhwu        = 0;
 141     nhwu_pruned = 0;
 142
 143     for(n=0; n<nbl->nsci; n++)
 144     {
 145         nbln = &nbl->sci[n];
 146
 147         ish3             = 3*nbln->shift;
 148         shX              = shiftvec[ish3];
 149         shY              = shiftvec[ish3+1];
 150         shZ              = shiftvec[ish3+2];
 151         cj4_ind0         = nbln->cj4_ind_start;
 152         cj4_ind1         = nbln->cj4_ind_end;
 153         sci              = nbln->sci;
 154         vctot            = 0;
 155         Vvdwtot          = 0;
 156
 157         if (nbln->shift == CENTRAL &&
 158             nbl->cj4[cj4_ind0].cj[0] == sci*NCL_PER_SUPERCL)
 159         {
 160             /* we have the diagonal:
 161              * add the charge self interaction energy term
 162              */
 163             for(im=0; im<NCL_PER_SUPERCL; im++)
 164             {
 165                 ci = sci*NCL_PER_SUPERCL + im;
 166                 for (ic=0; ic<CL_SIZE; ic++)
 167                 {
 168                     ia     = ci*CL_SIZE + ic;
 169                     iq     = x[ia*nbat->xstride+3];
 170                     vctot += iq*iq;
 171                 }
 172             }
 173             if (!bEwald)
 174             {
 175                 vctot *= -facel*0.5*iconst->c_rf;
 176             }
 177             else
 178             {
 179                 /* last factor 1/sqrt(pi) */
 180                 vctot *= -facel*iconst->ewaldcoeff*M_1_SQRTPI;
 181             }
 182         }
 183
 184         for(cj4_ind=cj4_ind0; (cj4_ind<cj4_ind1); cj4_ind++)
 185         {
 186             excl[0]           = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
 187             excl[1]           = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
 188
 189             for(jm=0; jm<NBNXN_GPU_JGROUP_SIZE; jm++)
 190             {
 191                 cj               = nbl->cj4[cj4_ind].cj[jm];
 192
 193                 for(im=0; im<NCL_PER_SUPERCL; im++)
 194                 {
 195                     /* We're only using the first imask,
 196                      * but here imei[1].imask is identical.
 197                      */
 198                     if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*NCL_PER_SUPERCL+im)) & 1)
 199                     {
 200                         gmx_bool within_rlist;
 201
 202                         ci               = sci*NCL_PER_SUPERCL + im;
 203
 204                         within_rlist     = FALSE;
 205                         npair            = 0;
 206                         for(ic=0; ic<CL_SIZE; ic++)
 207                         {
 208                             ia               = ci*CL_SIZE + ic;
 209
 210                             is               = ia*nbat->xstride;
 211                             ifs              = ia*nbat->fstride;
 212                             ix               = shX + x[is+0];
 213                             iy               = shY + x[is+1];
 214                             iz               = shZ + x[is+2];
 215                             iq               = facel*x[is+3];
 216                             nti              = ntype*2*type[ia];
 217
 218                             fix              = 0;
 219                             fiy              = 0;
 220                             fiz              = 0;
 221
 222                             for(jc=0; jc<CL_SIZE; jc++)
 223                             {
 224                                 ja               = cj*CL_SIZE + jc;
 225
 226                                 if (nbln->shift == CENTRAL &&
 227                                     ci == cj && ja <= ia)
 228                                 {
 229                                     continue;
 230                                 }
 231
 232                                 int_bit = ((excl[jc>>2]->pair[(jc & 3)*CL_SIZE+ic] >> (jm*NCL_PER_SUPERCL+im)) & 1);
 233
 234                                 js               = ja*nbat->xstride;
 235                                 jfs              = ja*nbat->fstride;
 236                                 jx               = x[js+0];
 237                                 jy               = x[js+1];
 238                                 jz               = x[js+2];
 239                                 dx               = ix - jx;
 240                                 dy               = iy - jy;
 241                                 dz               = iz - jz;
 242                                 rsq              = dx*dx + dy*dy + dz*dz;
 243                                 if (rsq < rlist2)
 244                                 {
 245                                     within_rlist = TRUE;
 246                                 }
 247                                 if (rsq >= rcut2)
 248                                 {
 249                                     continue;
 250                                 }
 251
 252                                 if (type[ia] != ntype-1 && type[ja] != ntype-1)
 253                                 {
 254                                     npair++;
 255                                 }
 256
 257                                 /* avoid NaN for excluded pairs at r=0 */
 258                                 rsq             += (1.0 - int_bit)*NBNXN_AVOID_SING_R2_INC;
 259
 260                                 rinv             = gmx_invsqrt(rsq);
 261                                 rinvsq           = rinv*rinv;
 262                                 fscal            = 0;
 263
 264                                 qq               = iq*x[js+3];
 265                                 if (!bEwald)
 266                                 {
 267                                     /* Reaction-field */
 268                                     krsq  = iconst->k_rf*rsq;
 269                                     fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq;
 270                                     if (bEner)
 271                                     {
 272                                         vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf);
 273                                     }
 274                                 }
 275                                 else
 276                                 {
 277                                     r     = rsq*rinv;
 278                                     rt    = r*iconst->tabq_scale;
 279                                     n0    = rt;
 280                                     eps   = rt - n0;
 281
 282                                     fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1];
 283
 284                                     fscal = qq*(int_bit*rinvsq - fexcl)*rinv;
 285
 286                                     if (bEner)
 287                                     {
 288                                         vcoul = qq*((int_bit - gmx_erf(iconst->ewaldcoeff*r))*rinv - int_bit*iconst->sh_ewald);
 289                                     }
 290                                 }
 291
 292                                 if (rsq < rvdw2)
 293                                 {
 294                                     tj        = nti + 2*type[ja];
 295
 296                                     /* Vanilla Lennard-Jones cutoff */
 297                                     c6        = vdwparam[tj];
 298                                     c12       = vdwparam[tj+1];
 299
 300                                     rinvsix   = int_bit*rinvsq*rinvsq*rinvsq;
 301                                     Vvdw_disp = c6*rinvsix;
 302                                     Vvdw_rep  = c12*rinvsix*rinvsix;
 303                                     fscal    += (Vvdw_rep - Vvdw_disp)*rinvsq;
 304
 305                                     if (bEner)
 306                                     {
 307                                         vctot   += vcoul;
 308
 309                                         Vvdwtot +=
 310                                             (Vvdw_rep - int_bit*c12*iconst->sh_invrc6*iconst->sh_invrc6)/12 -
 311                                             (Vvdw_disp - int_bit*c6*iconst->sh_invrc6)/6;
 312                                     }
 313                                 }
 314
 315                                 tx        = fscal*dx;
 316                                 ty        = fscal*dy;
 317                                 tz        = fscal*dz;
 318                                 fix       = fix + tx;
 319                                 fiy       = fiy + ty;
 320                                 fiz       = fiz + tz;
 321                                 f[jfs+0] -= tx;
 322                                 f[jfs+1] -= ty;
 323                                 f[jfs+2] -= tz;
 324                             }
 325
 326                             f[ifs+0]        += fix;
 327                             f[ifs+1]        += fiy;
 328                             f[ifs+2]        += fiz;
 329                             fshift[ish3]     = fshift[ish3]   + fix;
 330                             fshift[ish3+1]   = fshift[ish3+1] + fiy;
 331                             fshift[ish3+2]   = fshift[ish3+2] + fiz;
 332
 333                             /* Count in half work-units.
 334                              * In CUDA one work-unit is 2 warps.
 335                              */
 336                             if ((ic+1) % (CL_SIZE/2) == 0)
 337                             {
 338                                 npair_tot += npair;
 339
 340                                 nhwu++;
 341                                 if (within_rlist)
 342                                 {
 343                                     nhwu_pruned++;
 344                                 }
 345
 346                                 within_rlist = FALSE;
 347                                 npair        = 0;
 348                             }
 349                         }
 350                     }
 351                 }
 352             }
 353         }
 354
 355         if (bEner)
 356         {
 357             ggid = 0;
 358             Vc[ggid]         = Vc[ggid]   + vctot;
 359             Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
 360         }
 361     }
 362
 363     if (debug)
 364     {
 365         fprintf(debug,"number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
 366                 nbl->na_ci,nbl->na_ci,
 367                 nhwu,nhwu_pruned,nhwu_pruned/(double)nhwu);
 368         fprintf(debug,"generic kernel pair interactions:            %d\n",
 369                 nhwu*nbl->na_ci/2*nbl->na_ci);
 370         fprintf(debug,"generic kernel post-prune pair interactions: %d\n",
 371                 nhwu_pruned*nbl->na_ci/2*nbl->na_ci);
 372         fprintf(debug,"generic kernel non-zero pair interactions:   %d\n",
 373                 npair_tot);
 374         fprintf(debug,"ratio non-zero/post-prune pair interactions: %4.2f\n",
 375                 npair_tot/(double)(nhwu_pruned*nbl->na_ci/2*nbl->na_ci));
 376     }
 377 }