2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2009, The GROMACS Development Team.
6 * Copyright (c) 2013,2014,2015,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
39 #include "nb_kernel_allvsallgb.h"
45 #include "gromacs/gmxlib/nrnb.h"
46 #include "gromacs/utility/real.h"
47 #include "gromacs/utility/smalloc.h"
53 int ** exclusion_mask;
58 calc_maxoffset(int i, int natoms)
62 if ((natoms % 2) == 1)
64 /* Odd number of atoms, easy */
67 else if ((natoms % 4) == 0)
69 /* Multiple of four is hard */
78 maxoffset = natoms/2-1;
89 maxoffset = natoms/2-1;
102 maxoffset = natoms/2-1;
111 setup_exclusions_and_indices(gmx_allvsall_data_t * aadata,
121 /* This routine can appear to be a bit complex, but it is mostly book-keeping.
122 * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
123 * whether they should interact or not.
125 * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
126 * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
127 * we create a jindex array with three elements per i atom: the starting point, the point to
128 * which we need to check exclusions, and the end point.
129 * This way we only have to allocate a short exclusion mask per i atom.
132 /* Allocate memory for our modified jindex array */
133 snew(aadata->jindex, 3*natoms);
135 /* Pointer to lists with exclusion masks */
136 snew(aadata->exclusion_mask, natoms);
138 for (i = 0; i < natoms; i++)
141 aadata->jindex[3*i] = i+1;
142 max_offset = calc_maxoffset(i, natoms);
145 nj0 = excl->index[i];
146 nj1 = excl->index[i+1];
148 /* first check the max range */
149 max_excl_offset = -1;
151 for (j = nj0; j < nj1; j++)
157 if (k+natoms <= max_offset)
162 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
165 max_excl_offset = (max_offset < max_excl_offset) ? max_offset : max_excl_offset;
167 aadata->jindex[3*i+1] = i+1+max_excl_offset;
169 snew(aadata->exclusion_mask[i], max_excl_offset);
170 /* Include everything by default */
171 for (j = 0; j < max_excl_offset; j++)
173 /* Use all-ones to mark interactions that should be present, compatible with SSE */
174 aadata->exclusion_mask[i][j] = 0xFFFFFFFF;
177 /* Go through exclusions again */
178 for (j = nj0; j < nj1; j++)
184 if (k+natoms <= max_offset)
189 if (k > 0 && k <= max_excl_offset)
191 /* Excluded, kill it! */
192 aadata->exclusion_mask[i][k-1] = 0;
197 aadata->jindex[3*i+2] = i+1+max_offset;
203 setup_aadata(gmx_allvsall_data_t ** p_aadata,
211 gmx_allvsall_data_t *aadata;
217 /* Generate vdw params */
218 snew(aadata->pvdwparam, ntype);
220 for (i = 0; i < ntype; i++)
222 snew(aadata->pvdwparam[i], 2*natoms);
223 p = aadata->pvdwparam[i];
225 /* Lets keep it simple and use multiple steps - first create temp. c6/c12 arrays */
226 for (j = 0; j < natoms; j++)
228 idx = i*ntype+type[j];
229 p[2*j] = pvdwparam[2*idx];
230 p[2*j+1] = pvdwparam[2*idx+1];
234 setup_exclusions_and_indices(aadata, excl, natoms);
240 nb_kernel_allvsallgb(t_nblist gmx_unused * nlist,
243 struct t_forcerec * fr,
245 nb_kernel_data_t * kernel_data,
248 gmx_allvsall_data_t *aadata;
263 real vgbtot, dvdasum;
271 real rsq, rinv, rinvsq, rinvsix;
273 real c6, c12, Vvdw6, Vvdw12, Vvdwtot;
274 real fscal, dvdatmp, fijC, vgb;
275 real Y, F, Fp, Geps, Heps2, VV, FF, eps, eps2, r, rt;
276 real dvdaj, gbscale, isaprod, isai, isaj, gbtabscale;
286 charge = mdatoms->chargeA;
287 type = mdatoms->typeA;
288 gbfactor = ((1.0/fr->ic->epsilon_r) - (1.0/fr->gb_epsilon_solvent));
289 facel = fr->ic->epsfac;
290 GBtab = fr->gbtab->data;
291 gbtabscale = fr->gbtab->scale;
292 invsqrta = fr->invsqrta;
294 vpol = kernel_data->energygrp_polarization;
296 natoms = mdatoms->nr;
298 ni1 = mdatoms->homenr;
300 aadata = reinterpret_cast<gmx_allvsall_data_t *>(fr->AllvsAll_work);
301 excl = kernel_data->exclusions;
303 Vc = kernel_data->energygrp_elec;
304 Vvdw = kernel_data->energygrp_vdw;
308 setup_aadata(&aadata, excl, natoms, type, fr->ntype, fr->nbfp);
309 fr->AllvsAll_work = aadata;
312 for (i = ni0; i < ni1; i++)
314 /* We assume shifts are NOT used for all-vs-all interactions */
316 /* Load i atom data */
320 iq = facel*charge[i];
324 pvdw = aadata->pvdwparam[type[i]];
326 /* Zero the potential energy for this list */
332 /* Clear i atom forces */
337 /* Load limits for loop over neighbors */
338 nj0 = aadata->jindex[3*i];
339 nj1 = aadata->jindex[3*i+1];
340 nj2 = aadata->jindex[3*i+2];
342 mask = aadata->exclusion_mask[i];
344 /* Prologue part, including exclusion mask */
345 for (j = nj0; j < nj1; j++, mask++)
351 /* load j atom coordinates */
356 /* Calculate distance */
360 rsq = dx*dx+dy*dy+dz*dz;
362 /* Calculate 1/r and 1/r2 */
363 rinv = 1.0/sqrt(rsq);
365 /* Load parameters for j atom */
371 qq = isaprod*(-qq)*gbfactor;
372 gbscale = isaprod*gbtabscale;
377 /* Tabulated Generalized-Born interaction */
381 /* Calculate table index */
389 Geps = eps*GBtab[nnn+2];
390 Heps2 = eps2*GBtab[nnn+3];
393 FF = Fp+Geps+2.0*Heps2;
395 fijC = qq*FF*gbscale;
396 dvdatmp = -0.5*(vgb+fijC*r);
397 dvdasum = dvdasum + dvdatmp;
398 dvda[k] = dvdaj+dvdatmp*isaj*isaj;
399 vctot = vctot + vcoul;
400 vgbtot = vgbtot + vgb;
402 /* Lennard-Jones interaction */
403 rinvsix = rinvsq*rinvsq*rinvsq;
405 Vvdw12 = c12*rinvsix*rinvsix;
406 Vvdwtot = Vvdwtot+Vvdw12-Vvdw6;
407 fscal = (12.0*Vvdw12-6.0*Vvdw6)*rinvsq-(fijC-fscal)*rinv;
409 /* Calculate temporary vectorial force */
414 /* Increment i atom force */
419 /* Decrement j atom force */
420 f[3*k] = f[3*k] - tx;
421 f[3*k+1] = f[3*k+1] - ty;
422 f[3*k+2] = f[3*k+2] - tz;
424 /* Inner loop uses 38 flops/iteration */
427 /* Main part, no exclusions */
428 for (j = nj1; j < nj2; j++)
432 /* load j atom coordinates */
437 /* Calculate distance */
441 rsq = dx*dx+dy*dy+dz*dz;
443 /* Calculate 1/r and 1/r2 */
444 rinv = 1.0/sqrt(rsq);
446 /* Load parameters for j atom */
452 qq = isaprod*(-qq)*gbfactor;
453 gbscale = isaprod*gbtabscale;
458 /* Tabulated Generalized-Born interaction */
462 /* Calculate table index */
470 Geps = eps*GBtab[nnn+2];
471 Heps2 = eps2*GBtab[nnn+3];
474 FF = Fp+Geps+2.0*Heps2;
476 fijC = qq*FF*gbscale;
477 dvdatmp = -0.5*(vgb+fijC*r);
478 dvdasum = dvdasum + dvdatmp;
479 dvda[k] = dvdaj+dvdatmp*isaj*isaj;
480 vctot = vctot + vcoul;
481 vgbtot = vgbtot + vgb;
483 /* Lennard-Jones interaction */
484 rinvsix = rinvsq*rinvsq*rinvsq;
486 Vvdw12 = c12*rinvsix*rinvsix;
487 Vvdwtot = Vvdwtot+Vvdw12-Vvdw6;
488 fscal = (12.0*Vvdw12-6.0*Vvdw6)*rinvsq-(fijC-fscal)*rinv;
490 /* Calculate temporary vectorial force */
495 /* Increment i atom force */
500 /* Decrement j atom force */
501 f[3*k] = f[3*k] - tx;
502 f[3*k+1] = f[3*k+1] - ty;
503 f[3*k+2] = f[3*k+2] - tz;
505 /* Inner loop uses 38 flops/iteration */
512 /* Add potential energies to the group for this list */
515 Vc[ggid] = Vc[ggid] + vctot;
516 Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
517 vpol[ggid] = vpol[ggid] + vgbtot;
518 dvda[i] = dvda[i] + dvdasum*isai*isai;
520 /* Outer loop uses 6 flops/iteration */
523 /* 12 flops per outer iteration
524 * 19 flops per inner iteration
526 inc_nrnb(nrnb, eNR_NBKERNEL_ELEC_VDW_VF, (ni1-ni0)*12 + ((ni1-ni0)*natoms/2)*19);