/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>

#include "typedefs.h"
#include "gromacs/utility/smalloc.h"
#include "genborn.h"
#include "vec.h"
#include "gromacs/fileio/pdbio.h"
#include "names.h"
#include "physics.h"
#include "domdec.h"
#include "network.h"
#include "gmx_fatal.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"

/* Only compile this file if SSE2 intrinsics are available; note that the
 * "0 &&" below currently disables these kernels unconditionally. */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"

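/* Compute Born radii for the Still model. For each i atom the neighbor
 * list is processed two j atoms at a time, one per double lane of an
 * SSE2 register; the pairwise polarization terms are accumulated in gpi
 * and work[], and the per-pair derivatives are streamed into fr->dadx
 * for the chain-rule force pass in calc_gb_chainrule_sse2_double().
 */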
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;
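    /* factor converts the accumulated polarization term into a Born radius
     * below: bRad_i = factor/|gpi_ai| (computed as factor*invsqrt(gpi2)).
     */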

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

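            /* ccf is the Still close-contact function: for
             * ratio <= STILL_P5INV it is (0.5*(1 - cos(theta)))^2 with
             * theta = ratio*STILL_PIP5, and dccf the corresponding
             * derivative factor; otherwise ccf = 1 and dccf = 0.
             */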
            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

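        /* Epilogue: if the neighbor count is odd, process the last j atom
         * in the lower SSE lane only, using the _sd scalar intrinsics.
         */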
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
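    /* Still: the Born radius follows from the accumulated polarization
     * term, bRad_i = factor/|gpol_i + work_i|; invsqrta caches
     * 1/sqrt(bRad_i) for later use by the GB kernels.
     */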
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}

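/* Compute Born radii for the HCT and OBC models. The pairwise
 * descreening contributions are accumulated into work[] two j atoms at
 * a time (one per SSE2 double lane), with per-pair derivatives streamed
 * into fr->dadx; the final radii are then formed per atom, directly for
 * HCT or via the tanh rescaling for OBC, depending on gb_algorithm.
 */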
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int        *  jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for the two j atoms */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);
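            /* With r the pair distance and sk_aj the descreening parameter
             * of atom j:
             *   obc_mask1 (rai < r+sk_aj): j descreens i at all;
             *   obc_mask2 (rai < r-sk_aj): i lies fully outside j's sphere,
             *     so the lower integration limit is 1/(r-sk_aj), else 1/rai;
             *   obc_mask3 (rai < sk_aj-r): i is engulfed, adding the
             *     2*(1/rai - lij) correction term below.
             */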

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);
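            /* dadx1/dadx2 hold the masked d(sum)/dr contributions for the
             * aj->ai and ai->aj directions; they are stored to fr->dadx in
             * exactly the order calc_gb_chainrule_sse2_double() reads them.
             */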

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for the single j atom */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
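        /* HCT: with rr = rho_i - doffset, the radius is
         * bRad_i = 1/(1/rr - work_i), clamped from below by the unreduced
         * radius rho_i (= min_rad).
         */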
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
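        /* OBC (Onufriev-Bashford-Case): with rr = rho_i - doffset and
         * psi = rr*work_i,
         *   1/bRad_i = 1/rr - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho_i,
         * and drobc[i] stores the tanh chain-rule factor used when the
         * radii are differentiated in the force pass.
         */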
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}

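/* Apply the chain rule to turn the stored radius derivatives (dadx) and
 * the derivatives of the potential with respect to the Born radii (dvda)
 * into pairwise forces. The neighbor list is walked in the same order as
 * in the radius calculation, so dadx is consumed sequentially.
 */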
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int        *  jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, sse style */
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

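    /* In each case rb[i] is dvda[i] scaled by d(bRad_i)/d(sum_i) for that
     * algorithm's radius definition (bRad = 0.5*ONE_4PI_EPS0/sum for Still,
     * 1/bRad = 1/rr - sum for HCT, and the tanh form via drobc for OBC),
     * so it can multiply the raw dadx entries directly.
     */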
    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for the two j atoms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* Deal with the odd element, if present */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* The two lanes of fix/fiy/fiz hold partial force terms that are now
         * reduced and added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */