src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_allvsall.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2009, The GROMACS Development Team.
   6  * Copyright (c) 2013,2014, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "config.h"
  40
  41 #include <math.h>
  42
  43 #include "gromacs/legacyheaders/types/simple.h"
  44
  45 #include "gromacs/math/vec.h"
  46 #include "gromacs/utility/smalloc.h"
  47
  48 #include "nb_kernel_allvsall.h"
  49 #include "gromacs/legacyheaders/nrnb.h"
  50
  51 typedef struct
  52 {
  53     real **    pvdwparam;
  54     int *      jindex;
  55     int **     exclusion_mask;
  56 }
  57 gmx_allvsall_data_t;
  58
  59 static int
  60 calc_maxoffset(int i, int natoms)
  61 {
  62     int maxoffset;
  63
  64     if ((natoms % 2) == 1)
  65     {
  66         /* Odd number of atoms, easy */
  67         maxoffset = natoms/2;
  68     }
  69     else if ((natoms % 4) == 0)
  70     {
  71         /* Multiple of four is hard */
  72         if (i < natoms/2)
  73         {
  74             if ((i % 2) == 0)
  75             {
  76                 maxoffset = natoms/2;
  77             }
  78             else
  79             {
  80                 maxoffset = natoms/2-1;
  81             }
  82         }
  83         else
  84         {
  85             if ((i % 2) == 1)
  86             {
  87                 maxoffset = natoms/2;
  88             }
  89             else
  90             {
  91                 maxoffset = natoms/2-1;
  92             }
  93         }
  94     }
  95     else
  96     {
  97         /* natoms/2 = odd */
  98         if ((i % 2) == 0)
  99         {
 100             maxoffset = natoms/2;
 101         }
 102         else
 103         {
 104             maxoffset = natoms/2-1;
 105         }
 106     }
 107
 108     return maxoffset;
 109 }
 110
 111
 112 static void
 113 setup_exclusions_and_indices(gmx_allvsall_data_t *   aadata,
 114                              t_blocka *              excl,
 115                              int                     natoms)
 116 {
 117     int i, j, k, iexcl;
 118     int nj0, nj1;
 119     int max_offset;
 120     int max_excl_offset;
 121     int nj;
 122
 123     /* This routine can appear to be a bit complex, but it is mostly book-keeping.
 124      * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
 125      * whether they should interact or not.
 126      *
 127      * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
 128      * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
 129      * we create a jindex array with three elements per i atom: the starting point, the point to
 130      * which we need to check exclusions, and the end point.
 131      * This way we only have to allocate a short exclusion mask per i atom.
 132      */
 133
 134     /* Allocate memory for our modified jindex array */
 135     snew(aadata->jindex, 3*natoms);
 136
 137     /* Pointer to lists with exclusion masks */
 138     snew(aadata->exclusion_mask, natoms);
 139
 140     for (i = 0; i < natoms; i++)
 141     {
 142         /* Start */
 143         aadata->jindex[3*i]   = i+1;
 144         max_offset            = calc_maxoffset(i, natoms);
 145
 146         /* Exclusions */
 147         nj0   = excl->index[i];
 148         nj1   = excl->index[i+1];
 149
 150         /* first check the max range */
 151         max_excl_offset = -1;
 152
 153         for (j = nj0; j < nj1; j++)
 154         {
 155             iexcl = excl->a[j];
 156
 157             k = iexcl - i;
 158
 159             if (k+natoms <= max_offset)
 160             {
 161                 k += natoms;
 162             }
 163
 164             max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
 165         }
 166
 167         max_excl_offset = (max_offset < max_excl_offset) ? max_offset : max_excl_offset;
 168
 169         aadata->jindex[3*i+1] = i+1+max_excl_offset;
 170
 171
 172         snew(aadata->exclusion_mask[i], max_excl_offset);
 173         /* Include everything by default */
 174         for (j = 0; j < max_excl_offset; j++)
 175         {
 176             /* Use all-ones to mark interactions that should be present, compatible with SSE */
 177             aadata->exclusion_mask[i][j] = 0xFFFFFFFF;
 178         }
 179
 180         /* Go through exclusions again */
 181         for (j = nj0; j < nj1; j++)
 182         {
 183             iexcl = excl->a[j];
 184
 185             k = iexcl - i;
 186
 187             if (k+natoms <= max_offset)
 188             {
 189                 k += natoms;
 190             }
 191
 192             if (k > 0 && k <= max_excl_offset)
 193             {
 194                 /* Excluded, kill it! */
 195                 aadata->exclusion_mask[i][k-1] = 0;
 196             }
 197         }
 198
 199         /* End */
 200         aadata->jindex[3*i+2] = i+1+max_offset;
 201     }
 202 }
 203
 204 static void
 205 setup_aadata(gmx_allvsall_data_t **  p_aadata,
 206              t_blocka *              excl,
 207              int                     natoms,
 208              int *                   type,
 209              int                     ntype,
 210              real *                  pvdwparam)
 211 {
 212     int                  i, j, idx;
 213     gmx_allvsall_data_t *aadata;
 214     real                *p;
 215
 216     snew(aadata, 1);
 217     *p_aadata = aadata;
 218
 219     /* Generate vdw params */
 220     snew(aadata->pvdwparam, ntype);
 221
 222     for (i = 0; i < ntype; i++)
 223     {
 224         snew(aadata->pvdwparam[i], 2*natoms);
 225         p = aadata->pvdwparam[i];
 226
 227         /* Lets keep it simple and use multiple steps - first create temp. c6/c12 arrays */
 228         for (j = 0; j < natoms; j++)
 229         {
 230             idx             = i*ntype+type[j];
 231             p[2*j]          = pvdwparam[2*idx];
 232             p[2*j+1]        = pvdwparam[2*idx+1];
 233         }
 234     }
 235
 236     setup_exclusions_and_indices(aadata, excl, natoms);
 237 }
 238
 239
 240
 241 void
 242 nb_kernel_allvsall(t_nblist gmx_unused *     nlist,
 243                    rvec *                    xx,
 244                    rvec *                    ff,
 245                    t_forcerec *              fr,
 246                    t_mdatoms *               mdatoms,
 247                    nb_kernel_data_t *        kernel_data,
 248                    t_nrnb *                  nrnb)
 249 {
 250     gmx_allvsall_data_t *aadata;
 251     int                  natoms;
 252     int                  ni0, ni1;
 253     int                  nj0, nj1, nj2;
 254     int                  i, j, k;
 255     real           *     charge;
 256     int           *      type;
 257     real                 facel;
 258     real           *     pvdw;
 259     int                  ggid;
 260     int           *      mask;
 261
 262     real                 ix, iy, iz, iq;
 263     real                 fix, fiy, fiz;
 264     real                 jx, jy, jz, qq;
 265     real                 dx, dy, dz;
 266     real                 tx, ty, tz;
 267     real                 rsq, rinv, rinvsq, rinvsix;
 268     real                 vcoul, vctot;
 269     real                 c6, c12, Vvdw6, Vvdw12, Vvdwtot;
 270     real                 fscal;
 271     t_blocka           * excl;
 272     real           *     f;
 273     real           *     x;
 274     real           *     Vvdw;
 275     real           *     Vc;
 276
 277     x                   = xx[0];
 278     f                   = ff[0];
 279     charge              = mdatoms->chargeA;
 280     type                = mdatoms->typeA;
 281     facel               = fr->epsfac;
 282     natoms              = mdatoms->nr;
 283     ni0                 = 0;
 284     ni1                 = mdatoms->homenr;
 285     aadata              = fr->AllvsAll_work;
 286     excl                = kernel_data->exclusions;
 287
 288     Vc                  = kernel_data->energygrp_elec;
 289     Vvdw                = kernel_data->energygrp_vdw;
 290
 291     if (aadata == NULL)
 292     {
 293         setup_aadata(&aadata, excl, natoms, type, fr->ntype, fr->nbfp);
 294         fr->AllvsAll_work  = aadata;
 295     }
 296
 297     for (i = ni0; i < ni1; i++)
 298     {
 299         /* We assume shifts are NOT used for all-vs-all interactions */
 300
 301         /* Load i atom data */
 302         ix                = x[3*i];
 303         iy                = x[3*i+1];
 304         iz                = x[3*i+2];
 305         iq                = facel*charge[i];
 306
 307         pvdw              = aadata->pvdwparam[type[i]];
 308
 309         /* Zero the potential energy for this list */
 310         Vvdwtot           = 0.0;
 311         vctot             = 0.0;
 312
 313         /* Clear i atom forces */
 314         fix               = 0.0;
 315         fiy               = 0.0;
 316         fiz               = 0.0;
 317
 318         /* Load limits for loop over neighbors */
 319         nj0              = aadata->jindex[3*i];
 320         nj1              = aadata->jindex[3*i+1];
 321         nj2              = aadata->jindex[3*i+2];
 322
 323         mask             = aadata->exclusion_mask[i];
 324
 325         /* Prologue part, including exclusion mask */
 326         for (j = nj0; j < nj1; j++, mask++)
 327         {
 328             if (*mask != 0)
 329             {
 330                 k = j%natoms;
 331
 332                 /* load j atom coordinates */
 333                 jx                = x[3*k];
 334                 jy                = x[3*k+1];
 335                 jz                = x[3*k+2];
 336
 337                 /* Calculate distance */
 338                 dx                = ix - jx;
 339                 dy                = iy - jy;
 340                 dz                = iz - jz;
 341                 rsq               = dx*dx+dy*dy+dz*dz;
 342
 343                 /* Calculate 1/r and 1/r2 */
 344                 rinv              = gmx_invsqrt(rsq);
 345                 rinvsq            = rinv*rinv;
 346
 347                 /* Load parameters for j atom */
 348                 qq                = iq*charge[k];
 349                 c6                = pvdw[2*k];
 350                 c12               = pvdw[2*k+1];
 351
 352                 /* Coulomb interaction */
 353                 vcoul             = qq*rinv;
 354                 vctot             = vctot+vcoul;
 355
 356                 /* Lennard-Jones interaction */
 357                 rinvsix           = rinvsq*rinvsq*rinvsq;
 358                 Vvdw6             = c6*rinvsix;
 359                 Vvdw12            = c12*rinvsix*rinvsix;
 360                 Vvdwtot           = Vvdwtot+Vvdw12-Vvdw6;
 361                 fscal             = (vcoul+12.0*Vvdw12-6.0*Vvdw6)*rinvsq;
 362
 363                 /* Calculate temporary vectorial force */
 364                 tx                = fscal*dx;
 365                 ty                = fscal*dy;
 366                 tz                = fscal*dz;
 367
 368                 /* Increment i atom force */
 369                 fix               = fix + tx;
 370                 fiy               = fiy + ty;
 371                 fiz               = fiz + tz;
 372
 373                 /* Decrement j atom force */
 374                 f[3*k]            = f[3*k]   - tx;
 375                 f[3*k+1]          = f[3*k+1] - ty;
 376                 f[3*k+2]          = f[3*k+2] - tz;
 377             }
 378             /* Inner loop uses 38 flops/iteration */
 379         }
 380
 381         /* Main part, no exclusions */
 382         for (j = nj1; j < nj2; j++)
 383         {
 384             k = j%natoms;
 385
 386             /* load j atom coordinates */
 387             jx                = x[3*k];
 388             jy                = x[3*k+1];
 389             jz                = x[3*k+2];
 390
 391             /* Calculate distance */
 392             dx                = ix - jx;
 393             dy                = iy - jy;
 394             dz                = iz - jz;
 395             rsq               = dx*dx+dy*dy+dz*dz;
 396
 397             /* Calculate 1/r and 1/r2 */
 398             rinv              = gmx_invsqrt(rsq);
 399             rinvsq            = rinv*rinv;
 400
 401             /* Load parameters for j atom */
 402             qq                = iq*charge[k];
 403             c6                = pvdw[2*k];
 404             c12               = pvdw[2*k+1];
 405
 406             /* Coulomb interaction */
 407             vcoul             = qq*rinv;
 408             vctot             = vctot+vcoul;
 409
 410             /* Lennard-Jones interaction */
 411             rinvsix           = rinvsq*rinvsq*rinvsq;
 412             Vvdw6             = c6*rinvsix;
 413             Vvdw12            = c12*rinvsix*rinvsix;
 414             Vvdwtot           = Vvdwtot+Vvdw12-Vvdw6;
 415             fscal             = (vcoul+12.0*Vvdw12-6.0*Vvdw6)*rinvsq;
 416
 417             /* Calculate temporary vectorial force */
 418             tx                = fscal*dx;
 419             ty                = fscal*dy;
 420             tz                = fscal*dz;
 421
 422             /* Increment i atom force */
 423             fix               = fix + tx;
 424             fiy               = fiy + ty;
 425             fiz               = fiz + tz;
 426
 427             /* Decrement j atom force */
 428             f[3*k]            = f[3*k]   - tx;
 429             f[3*k+1]          = f[3*k+1] - ty;
 430             f[3*k+2]          = f[3*k+2] - tz;
 431
 432             /* Inner loop uses 38 flops/iteration */
 433         }
 434
 435         f[3*i]   += fix;
 436         f[3*i+1] += fiy;
 437         f[3*i+2] += fiz;
 438
 439         /* Add potential energies to the group for this list */
 440         ggid             = 0;
 441
 442         Vc[ggid]         = Vc[ggid] + vctot;
 443         Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
 444
 445         /* Outer loop uses 6 flops/iteration */
 446     }
 447
 448     /* 12 flops per outer iteration
 449      * 19 flops per inner iteration
 450      */
 451     inc_nrnb(nrnb, eNR_NBKERNEL_ELEC_VDW_VF, (ni1-ni0)*12 + ((ni1-ni0)*natoms/2)*19);
 452 }