src/gromacs/mdlib/genborn_sse2_double.c
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>

#include "typedefs.h"
#include "smalloc.h"
#include "genborn.h"
#include "vec.h"
#include "grompp.h"
#include "gromacs/fileio/pdbio.h"
#include "names.h"
#include "physics.h"
#include "domdec.h"
#include "partdec.h"
#include "network.h"
#include "gmx_fatal.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"

/* Only compile this file if SSE2 intrinsics are available. Note that the
 * "0 &&" below currently keeps these kernels disabled, so the dummy symbol
 * in the #else branch at the end of the file is built instead.
 */
#if 0 && defined (GMX_X86_SSE2)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"

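/* Compute Born radii for all atoms in the neighbour list nl with the Still
 * (GBSA) method, using SSE2 double-precision intrinsics that process two
 * j atoms per iteration. The per-pair derivative factors are streamed into
 * fr->dadx for later use in calc_gb_chainrule_sse2_double(). Returns 0.
 */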
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

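    /* factor is 0.5 * 1/(4*pi*eps0); it converts the accumulated Still
     * polarization sum into a Born radius further down.
     */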
    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);


        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

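        /* Inner loop over pairs of j atoms, two per iteration in the SSE2
         * double lanes. Each pair adds P4*vsolv*ccf/r^4 to the polarization
         * sums of both i and j; ccf is a cosine switching term that ramps
         * from 0 at close contact up to 1 once r^2/(rai+raj)^2 exceeds 1/P5.
         * The matching r^-6 derivative factors are streamed into dadx
         * (two __m128d values per pair: the i->j and j->i contributions).
         */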
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

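        /* Epilogue: if the number of j atoms is odd, handle the last one
         * with single-lane (_sd) operations.
         */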
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
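    /* Still radius: R_i = 0.5*ONE_4PI_EPS0 / |gpol_i + sum_i|, i.e. the
     * factor computed above divided by the magnitude of the total
     * polarization term; invsqrta caches 1/sqrt(R_i) for the force kernels.
     */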
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}

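/* Compute Born radii with the HCT or OBC method (selected by gb_algorithm),
 * using SSE2 double-precision intrinsics that process two j atoms per
 * iteration. The pairwise descreening derivatives are streamed into fr->dadx
 * for calc_gb_chainrule_sse2_double(). Returns 0.
 */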
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int        *  jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

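        /* Inner loop over pairs of j atoms, two per iteration. Each pair
         * evaluates the analytic HCT/OBC descreening integral twice (j on i
         * and i on j) between the lower and upper bounds lij and uij; the
         * i contribution is accumulated in sum_ai, the j contribution in
         * work[], and the radial derivatives are streamed into dadx.
         */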
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for the two j atoms */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);

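            /* obc_mask1: j descreens i at all (rai < r+sk_aj);
             * obc_mask2: i lies fully outside j's descreening sphere
             *            (rai < r-sk_aj), so the lower bound is 1/(r-sk_aj),
             *            otherwise 1/rai is used;
             * obc_mask3: i is engulfed by j's sphere (rai < sk_aj-r), which
             *            adds the 2*(1/rai - lij) correction term below.
             */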
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

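        /* Epilogue: handle a remaining odd j atom with single-lane (_sd)
         * operations, mirroring the paired loop above.
         */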
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for the single remaining j atom */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
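        /* HCT radius: 1/R_i = 1/(rho_i - doffset) - I_i, where I_i is the
         * accumulated descreening sum in work[]; the result is clamped so
         * that R_i never drops below the input radius rho_i.
         */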
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
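        /* OBC radius: 1/R_i = 1/(rho_i - doffset)
         *                     - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho_i,
         * with psi = I_i*(rho_i - doffset). drobc[] stores the derivative
         * factor of the tanh term used later in the chain rule.
         */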
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}

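/* Apply the chain-rule contribution of the Born radii to the forces: combine
 * the dadx derivative factors stored by the radius kernels above with dvda
 * (the derivative of the GB energy with respect to the Born radius) and
 * accumulate the resulting pair forces into f and fshift, using SSE2
 * double-precision intrinsics. Returns 0.
 */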
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int        *  jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, sse style */
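    /* rb[i] collects R_i^2 * dvda[i], with the algorithm-specific extra
     * factors (2/ONE_4PI_EPS0 for Still, drobc for OBC), so that the pair
     * force below is simply dadx * rb.
     */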
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();

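        /* Inner loop over pairs of j atoms: read back the two dadx terms
         * stored per pair, form the scalar force dadx_ij*rb_i + dadx_ji*rb_j,
         * project it onto the distance vector, and accumulate it on atom i
         * while subtracting it from the j atoms.
         */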
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for the two j atoms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms that should both be
         * added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */