2 * This source code is part of
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
36 #include "types/simple.h"
41 #include "nb_kernel_allvsall.h"
48 int ** exclusion_mask;
53 calc_maxoffset(int i,int natoms)
57 if ((natoms % 2) == 1)
59 /* Odd number of atoms, easy */
62 else if ((natoms % 4) == 0)
64 /* Multiple of four is hard */
106 setup_exclusions_and_indices(gmx_allvsall_data_t * aadata,
116 /* This routine can appear to be a bit complex, but it is mostly book-keeping.
117 * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
118 * whether they should interact or not.
120 * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
121 * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
122 * we create a jindex array with three elements per i atom: the starting point, the point to
123 * which we need to check exclusions, and the end point.
124 * This way we only have to allocate a short exclusion mask per i atom.
127 /* Allocate memory for our modified jindex array */
128 snew(aadata->jindex,3*natoms);
130 /* Pointer to lists with exclusion masks */
131 snew(aadata->exclusion_mask,natoms);
133 for(i=0;i<natoms;i++)
136 aadata->jindex[3*i] = i+1;
137 max_offset = calc_maxoffset(i,natoms);
140 nj0 = excl->index[i];
141 nj1 = excl->index[i+1];
143 /* first check the max range */
144 max_excl_offset = -1;
146 for(j=nj0; j<nj1; j++)
152 if( k+natoms <= max_offset )
157 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
160 max_excl_offset = (max_offset < max_excl_offset) ? max_offset : max_excl_offset;
162 aadata->jindex[3*i+1] = i+1+max_excl_offset;
165 snew(aadata->exclusion_mask[i],max_excl_offset);
166 /* Include everything by default */
167 for(j=0;j<max_excl_offset;j++)
169 /* Use all-ones to mark interactions that should be present, compatible with SSE */
170 aadata->exclusion_mask[i][j] = 0xFFFFFFFF;
173 /* Go through exclusions again */
174 for(j=nj0; j<nj1; j++)
180 if( k+natoms <= max_offset )
185 if(k>0 && k<=max_excl_offset)
187 /* Excluded, kill it! */
188 aadata->exclusion_mask[i][k-1] = 0;
193 aadata->jindex[3*i+2] = i+1+max_offset;
198 setup_aadata(gmx_allvsall_data_t ** p_aadata,
206 gmx_allvsall_data_t *aadata;
212 /* Generate vdw params */
213 snew(aadata->pvdwparam,ntype);
217 snew(aadata->pvdwparam[i],2*natoms);
218 p = aadata->pvdwparam[i];
220 /* Lets keep it simple and use multiple steps - first create temp. c6/c12 arrays */
221 for(j=0;j<natoms;j++)
223 idx = i*ntype+type[j];
224 p[2*j] = pvdwparam[2*idx];
225 p[2*j+1] = pvdwparam[2*idx+1];
229 setup_exclusions_and_indices(aadata,excl,natoms);
235 nb_kernel_allvsall(t_nblist * nlist,
240 nb_kernel_data_t * kernel_data,
243 gmx_allvsall_data_t *aadata;
260 real rsq,rinv,rinvsq,rinvsix;
262 real c6,c12,Vvdw6,Vvdw12,Vvdwtot;
272 charge = mdatoms->chargeA;
273 type = mdatoms->typeA;
275 natoms = mdatoms->nr;
276 ni0 = mdatoms->start;
277 ni1 = mdatoms->start+mdatoms->homenr;
278 aadata = fr->AllvsAll_work;
279 excl = kernel_data->exclusions;
281 Vc = kernel_data->energygrp_elec;
282 Vvdw = kernel_data->energygrp_vdw;
286 setup_aadata(&aadata,excl,natoms,type,fr->ntype,fr->nbfp);
287 fr->AllvsAll_work = aadata;
290 for(i=ni0; i<ni1; i++)
292 /* We assume shifts are NOT used for all-vs-all interactions */
294 /* Load i atom data */
298 iq = facel*charge[i];
300 pvdw = aadata->pvdwparam[type[i]];
302 /* Zero the potential energy for this list */
306 /* Clear i atom forces */
311 /* Load limits for loop over neighbors */
312 nj0 = aadata->jindex[3*i];
313 nj1 = aadata->jindex[3*i+1];
314 nj2 = aadata->jindex[3*i+2];
316 mask = aadata->exclusion_mask[i];
318 /* Prologue part, including exclusion mask */
319 for(j=nj0; j<nj1; j++,mask++)
325 /* load j atom coordinates */
330 /* Calculate distance */
334 rsq = dx*dx+dy*dy+dz*dz;
336 /* Calculate 1/r and 1/r2 */
337 rinv = gmx_invsqrt(rsq);
340 /* Load parameters for j atom */
345 /* Coulomb interaction */
349 /* Lennard-Jones interaction */
350 rinvsix = rinvsq*rinvsq*rinvsq;
352 Vvdw12 = c12*rinvsix*rinvsix;
353 Vvdwtot = Vvdwtot+Vvdw12-Vvdw6;
354 fscal = (vcoul+12.0*Vvdw12-6.0*Vvdw6)*rinvsq;
356 /* Calculate temporary vectorial force */
361 /* Increment i atom force */
366 /* Decrement j atom force */
367 f[3*k] = f[3*k] - tx;
368 f[3*k+1] = f[3*k+1] - ty;
369 f[3*k+2] = f[3*k+2] - tz;
371 /* Inner loop uses 38 flops/iteration */
374 /* Main part, no exclusions */
375 for(j=nj1; j<nj2; j++)
379 /* load j atom coordinates */
384 /* Calculate distance */
388 rsq = dx*dx+dy*dy+dz*dz;
390 /* Calculate 1/r and 1/r2 */
391 rinv = gmx_invsqrt(rsq);
394 /* Load parameters for j atom */
399 /* Coulomb interaction */
403 /* Lennard-Jones interaction */
404 rinvsix = rinvsq*rinvsq*rinvsq;
406 Vvdw12 = c12*rinvsix*rinvsix;
407 Vvdwtot = Vvdwtot+Vvdw12-Vvdw6;
408 fscal = (vcoul+12.0*Vvdw12-6.0*Vvdw6)*rinvsq;
410 /* Calculate temporary vectorial force */
415 /* Increment i atom force */
420 /* Decrement j atom force */
421 f[3*k] = f[3*k] - tx;
422 f[3*k+1] = f[3*k+1] - ty;
423 f[3*k+2] = f[3*k+2] - tz;
425 /* Inner loop uses 38 flops/iteration */
432 /* Add potential energies to the group for this list */
435 Vc[ggid] = Vc[ggid] + vctot;
436 Vvdw[ggid] = Vvdw[ggid] + Vvdwtot;
438 /* Outer loop uses 6 flops/iteration */
441 /* 12 flops per outer iteration
442 * 19 flops per inner iteration
444 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,(ni1-ni0)*12 + ((ni1-ni0)*natoms/2)*19);