/* src/gromacs/mdlib/genborn_sse2_double.c */
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include <math.h>
#include <string.h>

#include "gromacs/fileio/pdbio.h"
#include "gromacs/legacyheaders/domdec.h"
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxmpi.h"
#include "gromacs/utility/smalloc.h"

/* Only compile this file if SSE2 intrinsics are available; note that the
 * "0 &&" below currently disables it unconditionally.
 */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
#include "genborn_sse2_double.h"

#include <emmintrin.h>
#include <gmx_sse2_double.h>

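/* Reading aid (not compiled): a scalar sketch of the per-pair Still term
 * that the SIMD loop in calc_gb_rad_still_sse2_double() accumulates,
 * assuming r2 is the squared pair distance and rai/raj and vsolv hold the
 * per-atom radii and volumes used below:
 *
 *     ratio = r2 / ((rai + raj) * (rai + raj));
 *     if (ratio > STILL_P5INV)
 *     {
 *         ccf = 1.0;                            no close-contact switch
 *     }
 *     else
 *     {
 *         theta = ratio * STILL_PIP5;
 *         term  = 0.5 * (1.0 - cos(theta));
 *         ccf   = term * term;                  smooth overlap correction
 *     }
 *     gpol[ai] += STILL_P4 * vsolv[aj] * ccf / (r2 * r2);
 *
 * The symmetric aj contribution goes through work[], and the final radius
 * is bRad[i] = 0.5 * ONE_4PI_EPS0 / |gpol[i] + work[i]|.
 */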
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);


        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
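            /* _mm_movemask_pd() collects the sign bit of each lane, so a zero
             * result means the comparison above was false in both lanes and
             * the sincos evaluation can be skipped entirely.
             */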
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

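        /* Epilogue: handle the last j particle when the list length is odd;
         * only the low lane of the _sd operations below carries data.
         */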
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}


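/* Reading aid (not compiled): a scalar sketch of the Hawkins-Cramer-Truhlar
 * descreening integral that both inner loops below accumulate, with L and U
 * the integration limits and sk the scaled radius of the descreening atom:
 *
 *     U = r + sk;
 *     L = (rai < r - sk) ? r - sk : rai;
 *     I = 1.0/L - 1.0/U
 *         + (1.0/(U*U) - 1.0/(L*L)) * (r/4.0 - sk*sk/(4.0*r))
 *         + log(L/U) / (2.0*r);
 *     if (rai < sk - r)            (ai lies entirely inside aj)
 *     {
 *         I += 2.0 * (1.0/rai - 1.0/L);
 *     }
 *     work[ai] += 0.5 * I;         (applied only when rai < r + sk)
 *
 * The dadx values stored alongside are the corresponding dI/dr terms, which
 * calc_gb_chainrule_sse2_double() consumes later.
 */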
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int        *  jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for aj1-2 */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

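        /* Epilogue: handle the last j particle when the list length is odd;
         * only the low lane of the _sd operations below carries data.
         */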
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for aj1 */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
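        /* Reading aid: with psi = (r_i - doffset) * work[i], the loop below
         * computes 1/R_i = 1/(r_i - doffset)
         *                  - tanh(alpha*psi - beta*psi^2 + gamma*psi^3) / r_i,
         * and drobc[i] stores the chain-rule factor of the tanh term needed
         * later for the forces.
         */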
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }



    return 0;
}


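/* Reading aid (not compiled): a scalar sketch of the pair force evaluated
 * below, with rb holding the algorithm-specific dV/dR scaling computed at
 * the top of the function and dadx the terms stored by the radius kernels:
 *
 *     fscal  = dadx_ij * rb[ai] + dadx_ji * rb[aj];
 *     f[ai] += fscal * dx;    f[aj] -= fscal * dx;    (same for y and z)
 */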
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int        *  jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, sse style */
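    /* Reading aid: for Still, R = (0.5*ONE_4PI_EPS0)/gpol, so dR/dgpol
     * contributes 2*R^2/ONE_4PI_EPS0; for HCT, R = 1/sum contributes R^2;
     * OBC carries the extra drobc factor from its tanh rescaling. Sign
     * conventions are absorbed into the stored dadx terms.
     */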
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();


        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for j1-2 */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms each, which all
         * need to be added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* Keep the compiler happy: ISO C forbids an empty translation unit */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */