src/gromacs/gmxlib/nonbonded/nb_kernel_c/nb_kernel_allvsall.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2009, The GROMACS Development Team.
   6  * Copyright (c) 2013,2014, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "config.h"
  38
  39 #include <math.h>
  40
  41 #include "types/simple.h"
  42
  43 #include "gromacs/math/vec.h"
  44 #include "gromacs/utility/smalloc.h"
  45
  46 #include "nb_kernel_allvsall.h"
  47 #include "nrnb.h"
  48
  49 typedef struct
  50 {
  51     real **    pvdwparam;
  52     int *      jindex;
  53     int **     exclusion_mask;
  54 }
  55 gmx_allvsall_data_t;
  56
  57 static int
  58 calc_maxoffset(int i, int natoms)
  59 {
  60     int maxoffset;
  61
  62     if ((natoms % 2) == 1)
  63     {
  64         /* Odd number of atoms, easy */
  65         maxoffset = natoms/2;
  66     }
  67     else if ((natoms % 4) == 0)
  68     {
  69         /* Multiple of four is hard */
  70         if (i < natoms/2)
  71         {
  72             if ((i % 2) == 0)
  73             {
  74                 maxoffset = natoms/2;
  75             }
  76             else
  77             {
  78                 maxoffset = natoms/2-1;
  79             }
  80         }
  81         else
  82         {
  83             if ((i % 2) == 1)
  84             {
  85                 maxoffset = natoms/2;
  86             }
  87             else
  88             {
  89                 maxoffset = natoms/2-1;
  90             }
  91         }
  92     }
  93     else
  94     {
  95         /* natoms/2 = odd */
  96         if ((i % 2) == 0)
  97         {
  98             maxoffset = natoms/2;
  99         }
 100         else
 101         {
 102             maxoffset = natoms/2-1;
 103         }
 104     }
 105
 106     return maxoffset;
 107 }
 108
 109
 110 static void
 111 setup_exclusions_and_indices(gmx_allvsall_data_t *   aadata,
 112                              t_blocka *              excl,
 113                              int                     natoms)
 114 {
 115     int i, j, k, iexcl;
 116     int nj0, nj1;
 117     int max_offset;
 118     int max_excl_offset;
 119     int nj;
 120
 121     /* This routine can appear to be a bit complex, but it is mostly book-keeping.
 122      * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
 123      * whether they should interact or not.
 124      *
 125      * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
 126      * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
 127      * we create a jindex array with three elements per i atom: the starting point, the point to
 128      * which we need to check exclusions, and the end point.
 129      * This way we only have to allocate a short exclusion mask per i atom.
 130      */
 131
 132     /* Allocate memory for our modified jindex array */
 133     snew(aadata->jindex, 3*natoms);
 134
 135     /* Pointer to lists with exclusion masks */
 136     snew(aadata->exclusion_mask, natoms);
 137
 138     for (i = 0; i < natoms; i++)
 139     {
 140         /* Start */
 141         aadata->jindex[3*i]   = i+1;
 142         max_offset            = calc_maxoffset(i, natoms);
 143
 144         /* Exclusions */
 145         nj0   = excl->index[i];
 146         nj1   = excl->index[i+1];
 147
 148         /* first check the max range */
 149         max_excl_offset = -1;
 150
 151         for (j = nj0; j < nj1; j++)
 152         {
 153             iexcl = excl->a[j];
 154
 155             k = iexcl - i;
 156
 157             if (k+natoms <= max_offset)
 158             {
 159                 k += natoms;
 160             }
 161
 162             max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
 163         }
 164
 165         max_excl_offset = (max_offset < max_excl_offset) ? max_offset : max_excl_offset;
 166
 167         aadata->jindex[3*i+1] = i+1+max_excl_offset;
 168
 169
 170         snew(aadata->exclusion_mask[i], max_excl_offset);
 171         /* Include everything by default */
 172         for (j = 0; j < max_excl_offset; j++)
 173         {
 174             /* Use all-ones to mark interactions that should be present, compatible with SSE */
 175             aadata->exclusion_mask[i][j] = 0xFFFFFFFF;
 176         }
 177
 178         /* Go through exclusions again */
 179         for (j = nj0; j < nj1; j++)
 180         {
 181             iexcl = excl->a[j];
 182
 183             k = iexcl - i;
 184
 185             if (k+natoms <= max_offset)
 186             {
 187                 k += natoms;
 188             }
 189
 190             if (k > 0 && k <= max_excl_offset)
 191             {
 192                 /* Excluded, kill it! */
 193                 aadata->exclusion_mask[i][k-1] = 0;
 194             }
 195         }
 196
 197         /* End */
 198         aadata->jindex[3*i+2] = i+1+max_offset;
 199     }
 200 }
 201
 202 static void
 203 setup_aadata(gmx_allvsall_data_t **  p_aadata,
 204              t_blocka *              excl,
 205              int                     natoms,
 206              int *                   type,
 207              int                     ntype,
 208              real *                  pvdwparam)
 209 {
 210     int                  i, j, idx;
 211     gmx_allvsall_data_t *aadata;
 212     real                *p;
 213
 214     snew(aadata, 1);
 215     *p_aadata = aadata;
 216
 217     /* Generate vdw params */
 218     snew(aadata->pvdwparam, ntype);
 219
 220     for (i = 0; i < ntype; i++)
 221     {
 222         snew(aadata->pvdwparam[i], 2*natoms);
 223         p = aadata->pvdwparam[i];
 224
 225         /* Lets keep it simple and use multiple steps - first create temp. c6/c12 arrays */
 226         for (j = 0; j < natoms; j++)
 227         {
 228             idx             = i*ntype+type[j];
 229             p[2*j]          = pvdwparam[2*idx];
 230             p[2*j+1]        = pvdwparam[2*idx+1];
 231         }
 232     }
 233
 234     setup_exclusions_and_indices(aadata, excl, natoms);
 235 }
 236
 237
 238
 239 void
 240 nb_kernel_allvsall(t_nblist gmx_unused *     nlist,
 241                    rvec *                    xx,
 242                    rvec *                    ff,
 243                    t_forcerec *              fr,
 244                    t_mdatoms *               mdatoms,
 245                    nb_kernel_data_t *        kernel_data,
 246                    t_nrnb *                  nrnb)
 247 {
 248     gmx_allvsall_data_t *aadata;
 249     int                  natoms;
 250     int                  ni0, ni1;
 251     int                  nj0, nj1, nj2;
 252     int                  i, j, k;
 253     real           *     charge;
 254     int           *      type;
 255     real                 facel;
 256     real           *     pvdw;
 257     int                  ggid;
 258     int           *      mask;
 259
 260     real                 ix, iy, iz, iq;
 261     real                 fix, fiy, fiz;
 262     real                 jx, jy, jz, qq;
 263     real                 dx, dy, dz;
 264     real                 tx, ty, tz;
 265     real                 rsq, rinv, rinvsq, rinvsix;
 266     real                 vcoul, vctot;
 267     real                 c6, c12, Vvdw6, Vvdw12, Vvdwtot;
 268     real                 fscal;
 269     t_blocka           * excl;
 270     real           *     f;
 271     real           *     x;
 272     real           *     Vvdw;
 273     real           *     Vc;
 274
 275     x                   = xx[0];
 276     f                   = ff[0];
 277     charge              = mdatoms->chargeA;
 278     type                = mdatoms->typeA;
 279     facel               = fr->epsfac;
 280     natoms              = mdatoms->nr;
 281     ni0                 = 0;
 282     ni1                 = mdatoms->homenr;
 283     aadata              = fr->AllvsAll_work;
 284     excl                = kernel_data->exclusions;
 285
 286     Vc                  = kernel_data->energygrp_elec;
 287     Vvdw                = kernel_data->energygrp_vdw;
 288
 289     if (aadata == NULL)
 290     {
 291         setup_aadata(&aadata, excl, natoms, type, fr->ntype, fr->nbfp);
 292         fr->AllvsAll_work  = aadata;
 293     }
 294
 295     for (i = ni0; i < ni1; i++)
 296     {
 297         /* We assume shifts are NOT used for all-vs-all interactions */
 298
 299         /* Load i atom data */
 300         ix                = x[3*i];
 301         iy                = x[3*i+1];
 302         iz                = x[3*i+2];
 303         iq                = facel*charge[i];
 304
 305         pvdw              = aadata->pvdwparam[type[i]];
 306
 307         /* Zero the potential energy for this list */
 308         Vvdwtot           = 0.0;
 309         vctot             = 0.0;
 310
 311         /* Clear i atom forces */
 312         fix               = 0.0;
 313         fiy               = 0.0;
 314         fiz               = 0.0;
 315
 316         /* Load limits for loop over neighbors */
 317         nj0              = aadata->jindex[3*i];
 318         nj1              = aadata->jindex[3*i+1];
 319         nj2              = aadata->jindex[3*i+2];
 320
 321         mask             = aadata->exclusion_mask[i];
 322
 323         /* Prologue part, including exclusion mask */
 324         for (j = nj0; j < nj1; j++, mask++)
 325         {
 326             if (*mask != 0)
 327             {
 328                 k = j%natoms;
 329
 330                 /* load j atom coordinates */
 331                 jx                = x[3*k];
 332                 jy                = x[3*k+1];
 333                 jz                = x[3*k+2];
 334
 335                 /* Calculate distance */
 336                 dx                = ix - jx;
 337                 dy                = iy - jy;
 338                 dz                = iz - jz;
 339                 rsq               = dx*dx+dy*dy+dz*dz;
 340
 341                 /* Calculate 1/r and 1/r2 */
 342                 rinv              = gmx_invsqrt(rsq);
 343                 rinvsq            = rinv*rinv;
 344
 345                 /* Load parameters for j atom */
 346                 qq                = iq*charge[k];
 347                 c6                = pvdw[2*k];
 348                 c12               = pvdw[2*k+1];
 349
 350                 /* Coulomb interaction */
 351                 vcoul             = qq*rinv;
 352                 vctot             = vctot+vcoul;
 353
 354                 /* Lennard-Jones interaction */
 355                 rinvsix           = rinvsq*rinvsq*rinvsq;
 356                 Vvdw6             = c6*rinvsix;
 357                 Vvdw12            = c12*rinvsix*rinvsix;
 358                 Vvdwtot           = Vvdwtot+Vvdw12-Vvdw6;
 359                 fscal             = (vcoul+12.0*Vvdw12-6.0*Vvdw6)*rinvsq;
 360
 361                 /* Calculate temporary vectorial force */
 362                 tx                = fscal*dx;
 363                 ty                = fscal*dy;
 364                 tz                = fscal*dz;
 365
 366                 /* Increment i atom force */
 367                 fix               = fix + tx;
 368                 fiy               = fiy + ty;
 369                 fiz               = fiz + tz;
 370
 371                 /* Decrement j atom force */
 372                 f[3*k]            = f[3*k]   - tx;
 373                 f[3*k+1]          = f[3*k+1] - ty;
 374                 f[3*k+2]          = f[3*k+2] - tz;
 375             }
 376             /* Inner loop uses 38 flops/iteration */
 377         }
 378
 379         /* Main part, no exclusions */
 380         for (j = nj1; j < nj2; j++)
 381         {
 382             k = j%natoms;
 383
 384             /* load j atom coordinates */
 385             jx                = x[3*k];
 386             jy                = x[3*k+1];
 387             jz                = x[3*k+2];
 388
 389             /* Calculate distance */
 390             dx                = ix - jx;
 391             dy                = iy - jy;
 392             dz                = iz - jz;
 393             rsq               = dx*dx+dy*dy+dz*dz;
 394
 395             /* Calculate 1/r and 1/r2 */
 396             rinv              = gmx_invsqrt(rsq);
 397             rinvsq            = rinv*rinv;
 398
 399             /* Load parameters for j atom */
 400             qq                = iq*charge[k];
 401             c6                = pvdw[2*k];
 402             c12               = pvdw[2*k+1];
 403
 404             /* Coulomb interaction */
 405             vcoul             = qq*rinv;
 406             vctot             = vctot+vcoul;
 407
 408             /* Lennard-Jones interaction */
 409             rinvsix           = rinvsq*rinvsq*rinvsq;
 410             Vvdw6             = c6*rinvsix;
 411             Vvdw12            = c12*rinvsix*rinvsix;
 412             Vvdwtot           = Vvdwtot+Vvdw12-Vvdw6;
 413             fscal             = (vcoul+12.0*Vvdw12-6.0*Vvdw6)*rinvsq;
 414
 415             /* Calculate temporary vectorial force */
 416             tx                = fscal*dx;
 417             ty                = fscal*dy;
 418             tz                = fscal*dz;
 419
 420             /* Increment i atom force */
 421             fix               = fix + tx;
 422             fiy               = fiy + ty;
 423             fiz               = fiz + tz;
 424
 425             /* Decrement j atom force */
 426             f[3*k]            = f[3*k]   - tx;
 427             f[3*k+1]          = f[3*k+1] - ty;
 428             f[3*k+2]          = f[3*k+2] - tz;
 429
 430             /* Inner loop uses 38 flops/iteration */
 431         }
 432
 433         f[3*i]   += fix;
 434         f[3*i+1] += fiy;
 435         f[3*i+2] += fiz;
 436
 437         /* Add potential energies to the group for this list */
 438         ggid             = 0;
 439
 440         Vc[ggid]         = Vc[ggid] + vctot;
 441         Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
 442
 443         /* Outer loop uses 6 flops/iteration */
 444     }
 445
 446     /* 12 flops per outer iteration
 447      * 19 flops per inner iteration
 448      */
 449     inc_nrnb(nrnb, eNR_NBKERNEL_ELEC_VDW_VF, (ni1-ni0)*12 + ((ni1-ni0)*natoms/2)*19);
 450 }