/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>

#include "typedefs.h"
#include "smalloc.h"
#include "genborn.h"
#include "vec.h"
#include "gromacs/fileio/pdbio.h"
#include "names.h"
#include "physics.h"
#include "domdec.h"
#include "partdec.h"
#include "network.h"
#include "gmx_fatal.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"

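/* SSE2 double-precision kernels for the Generalized Born (implicit solvent)
 * code: Born-radius calculation for the Still and HCT/OBC models, plus the
 * chain-rule force evaluation. Each __m128d holds two doubles, so the inner
 * loops below process two j atoms per iteration and handle an odd last
 * neighbor with scalar (_sd) intrinsics that operate on the low lane only.
 */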
/* Only compile this file if SSE2 intrinsics are available; note that the
 * "0 &&" in the guard below currently disables these kernels unconditionally.
 */
#if 0 && defined (GMX_X86_SSE2)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"

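/* Compute Born radii with the Still model (Qiu et al.): for each atom i we
 * accumulate the pairwise polarization term P4*V_j*ccf/r^4 into gpi, and the
 * symmetric i -> j contribution into work[]. The STILL_P4/P5INV/PIP5
 * constants come from the Still parameterization as defined elsewhere in
 * the GB code.
 */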
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

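    /* Outer loop over neighbor-list i entries; the inner j loop is unrolled
     * two-wide for SSE2, with a scalar epilogue for an odd last neighbor.
     */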
    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);


        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
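                /* Close contact: scale the 1/r^4 term by Still's smooth
                 * switching function ccf = (0.5*(1-cos(theta)))^2, with
                 * theta proportional to rsq/rvdw^2; dccf is the matching
                 * derivative factor used for the dadx force terms below.
                 */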
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

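            /* Store the chain-rule terms d(sum)/dr for this pair: first the
             * two values for atom i's sum, then the two for the j atoms,
             * in the order consumed by calc_gb_chainrule_sse2_double().
             */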
            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
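    /* Still Born radius: b_i = (ONE_4PI_EPS0/2)/gpi_ai, with gpi_ai the
     * self term plus the accumulated pair sum (the invsqrt of the square
     * gives the magnitude); invsqrta caches 1/sqrt(b_i) for the GB kernels.
     */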
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add the pair sum to the self polarization term */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}


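/* Compute Born radii for the HCT and OBC models. For each pair, the
 * Hawkins-Cramer-Truhlar descreening integral is evaluated analytically
 * between the limits lij and uij = 1/(r+sk), in both directions:
 * aj -> ai is accumulated into sum_ai, ai -> aj into work[]. The
 * derivatives needed for the forces are stored in fr->dadx, and the final
 * radii are formed after the loop (HCT directly, OBC via the tanh formula).
 */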
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int        *  jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for aj1-aj2 */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);
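            /* Mask logic: obc_mask1 selects pairs that contribute at all
             * (rai < r+sk_aj); obc_mask2 selects rai < r-sk_aj, where the
             * lower integration limit is 1/(r-sk_aj) rather than 1/rai;
             * obc_mask3 selects rai < sk_aj-r, i.e. atom i inside the
             * descreening sphere of j, adding the 2*(1/rai-lij) term below.
             */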

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for the single remaining j atom */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
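        /* HCT: 1/b_i = 1/(rho_i - doffset) - sum_i, with the resulting
         * radius clamped from below by the full vdW radius rho_i.
         */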
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
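        /* OBC: 1/b_i = 1/(rho_i - doffset)
         *            - tanh(alpha*Psi - beta*Psi^2 + gamma*Psi^3)/rho_i,
         * with Psi = (rho_i - doffset)*sum_i; drobc stores the corresponding
         * chain-rule derivative factor used in the force calculation.
         */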
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}


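/* Convert dV/d(born radius), passed in dvda, into Cartesian forces using
 * the d(sum)/dr terms stored in dadx by the radius kernels above. rb[]
 * holds dvda premultiplied by the algorithm-specific derivative of the
 * Born radius with respect to the accumulated sum, so the inner loop
 * reduces to a single multiply per pair.
 */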
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int        *  jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, sse style */
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

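    /* The loop below forms f = (dadx_i*rb_i + dadx_j*rb_j)*dr per pair and
     * applies it with opposite signs to atoms i and j (Newton's third law),
     * accumulating the i contributions into the shift forces as well.
     */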
    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();


        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for j1-j2 */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with the odd element */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms each, which all
         * should be added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */