/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include <math.h>
#include <string.h>

#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/math/vec.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/math/units.h"
#include "gromacs/legacyheaders/domdec.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/utility/fatalerror.h"

#include "gromacs/utility/gmxmpi.h"

/* This file is explicitly disabled by the "0 &&" in the condition below;
 * even where SSE2 intrinsics are available it is kept for reference only.
 */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"

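/* SSE2 double-precision kernels for the generalized-Born (implicit solvent)
 * path: Born radii for the Still, HCT and OBC models, plus the chain-rule
 * force contribution from the radii. Each kernel processes two j particles
 * per iteration in the two lanes of an __m128d, with a scalar epilogue when
 * the neighbor count is odd.
 */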
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);


        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

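        /* Each pair contributes STILL_P4*V_j*CCF/r^4 to the polarization sum
         * of atom i, and the corresponding V_i term to atom j (via work[]).
         * The r^-6 derivative factors are stored in dadx for the chain-rule
         * pass.
         */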
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }
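            /* Note that the min() clamp above makes the lanes with
             * ratio > STILL_P5INV come out as ccf = 1 and dccf = 0 (theta = pi,
             * assuming STILL_PIP5 == M_PI*STILL_P5 and STILL_P5INV == 1/STILL_P5),
             * matching the cheap branch. A scalar sketch of the same logic:
             *   if (ratio > STILL_P5INV) { ccf = 1.0; dccf = 0.0; }
             *   else { theta = ratio*STILL_PIP5; term = 0.5*(1.0 - cos(theta));
             *          ccf = term*term; dccf = 2.0*term*sin(theta)*theta; }
             */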

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

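        /* Epilogue: handle the last j particle when the neighbor count is odd */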
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for the remaining element */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
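    /* Still model: bRad_i = (ONE_4PI_EPS0/2)/gpi_ai, where gpi_ai is the
     * accumulated polarization sum; factor*gmx_invsqrt(gpi2) evaluates this
     * as factor/|gpi_ai|.
     */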
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}


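/* Born radii for the HCT and OBC models. The inner loop accumulates the HCT
 * pairwise descreening integral for each pair in both directions (aj -> ai
 * into sum_ai, ai -> aj into work[]) and stores the derivative of the
 * integrand with respect to r in dadx for the chain-rule pass; the sums are
 * turned into radii after the loop, differently for HCT and OBC.
 */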
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int        *  jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for the two aj particles */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);

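            /* SSE2 has no blend instruction, so selection is done with
             * and/andnot/or on the comparison masks:
             *   obc_mask1: rai < r+sk_aj, i.e. atom j descreens atom i at all;
             *   obc_mask2: rai < r-sk_aj, i.e. atom i lies fully outside the
             *              descreening sphere, so lij = 1/(r-sk_aj); otherwise
             *              lij = 1/rai;
             *   obc_mask3: rai < sk_aj-r, i.e. atom i is engulfed, adding the
             *              2*(1/rai - lij) correction term below.
             */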
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

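            /* Store both derivative terms for this pair: first d(sum)/dr for
             * aj -> ai, then for ai -> aj, two doubles each; the chain-rule
             * kernel below reads them back in exactly this order.
             */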
            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

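        /* Epilogue: one j particle remains when the neighbor count is odd */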
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for the single remaining aj */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

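    /* Turn the accumulated descreening sums into Born radii. HCT inverts the
     * sum directly, clamped from below at the vdW radius; OBC first rescales
     * the sum through a tanh polynomial, which keeps the radius finite and
     * smooth for deeply buried atoms.
     */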
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

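                /* OBC (Onufriev-Bashford-Case) rescaling, with psi = rr*work[i]:
                 *   1/bRad = 1/(rho-doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho
                 * where rho is the vdW radius. Note that rr_inv2 holds 1/rho
                 * (it is not a square), and drobc stores the derivative of the
                 * tanh rescaling with respect to work[i], for the chain-rule pass.
                 */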
                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}


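/* Convert the stored d(sum)/dr terms (dadx) and the derivative of the
 * potential with respect to the Born radii (dvda) into Cartesian forces.
 */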
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int        *  jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Fold dvda (dV/d(bRad)) together with d(bRad)/d(sum), which differs per
     * GB model, into the per-atom prefactor rb[] used by the SSE loop below.
     */
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();


        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for the two j particles */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with an odd element */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms each, which must be
         * summed and added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

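/* Rough calling sequence (see genborn.c for the actual driver): one of the
 * calc_gb_rad_*() kernels above runs before the nonbonded kernels to produce
 * the Born radii, the GB nonbonded kernels then accumulate dvda, and
 * calc_gb_chainrule_sse2_double() finally adds the force contribution from
 * the coordinate dependence of the radii.
 */
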
#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */