/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "config.h"

#include <math.h>
#include <string.h>

#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/math/vec.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/math/units.h"
#include "gromacs/legacyheaders/domdec.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/utility/fatalerror.h"

#include "gromacs/utility/gmxmpi.h"

/* Only compile this file if SSE2 intrinsics are available. Note that the
 * 0 && below deliberately disables these kernels even when SSE2 is present.
 */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"

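/* Compute Born radii for all atoms using the Still method.
 *
 * For each i atom in the neighbor list, the pairwise polarization
 * contributions are accumulated two j atoms at a time in the two SSE2
 * double-precision lanes, with a single-width epilogue when the neighbor
 * count is odd. The per-pair derivative factors are streamed to fr->dadx
 * for the later chain-rule force pass in calc_gb_chainrule_sse2_double().
 */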
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

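        /* Main SIMD loop: two neighbor-list entries per iteration; a
         * trailing odd entry is handled in the k < nj1 block below.
         */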
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

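        /* Epilogue for an odd neighbor count: the same math as above, done
         * with scalar (_sd) operations in the low lane. The dadx stores
         * still advance in strides of two doubles, so the chain-rule pass
         * can rely on a fixed layout.
         */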
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add the accumulated pair sum to the self polarization term */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}


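/* Compute Born radii for all atoms using the HCT or OBC method.
 *
 * The inner loops accumulate the HCT pairwise descreening integral in both
 * directions for each pair (aj onto ai into sum_ai, ai onto aj into work[]),
 * again two neighbors per iteration with a scalar epilogue. The final
 * transform from the accumulated sum to a Born radius depends on
 * gb_algorithm and is applied after the (optional) domain decomposition
 * summation.
 */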
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int          *jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for aj1-2 */
            raj_inv     = gmx_mm_inv_pd(raj);

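            /* lij and uij below are the reciprocals of the lower and upper
             * limits of the HCT pairwise descreening integral; the t1-t4
             * terms assemble its closed form, and the following t1-t3 group
             * assembles its derivative with respect to r, which is stored
             * in dadx for the chain-rule pass.
             */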
            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for the single remaining aj */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);
    }

    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

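    /* Transform the accumulated descreening sums into Born radii.
     * HCT inverts the sum directly, clamping the radius from below at the
     * full van der Waals radius; OBC applies the tanh rescaling and also
     * stores the derivative term drobc needed later by the chain rule.
     */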
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}


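/* Apply the chain rule to convert dvda (dV/d(Born radius)) into Cartesian
 * forces, using the per-pair derivatives that the radius kernels streamed
 * to dadx. Forces on j atoms are decremented directly in f; the i-atom
 * force and the shift force are updated once per neighbor-list entry.
 */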
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int          *jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Convert dvda into the algorithm-specific chain-rule prefactor rb[i],
     * in an SSE-friendly form
     */
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

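    /* dadx is consumed in exactly the order the radius kernels wrote it:
     * for every pair, two doubles with the j-side derivative followed by
     * two doubles with the i-side derivative.
     */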
    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for j1-2 */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms that both need to
         * be added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */