/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 *                        VERSION 4.5
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Groningen Machine for Chemical Simulation
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>

#include "typedefs.h"
#include "smalloc.h"
#include "genborn.h"
#include "vec.h"
#include "grompp.h"
#include "pdbio.h"
#include "names.h"
#include "physics.h"
#include "domdec.h"
#include "partdec.h"
#include "network.h"
#include "gmx_fatal.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"
/* Only compile this file if SSE2 intrinsics are available; note that the
 * leading "0 &&" below currently disables these kernels unconditionally. */
#if 0 && defined (GMX_X86_SSE2)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);


        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

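        /* Inner loop over j atoms, two at a time (one per double lane of the
         * SSE2 register). Each iteration accumulates the Still pairwise gpol
         * term P4*vsolv_j*ccf/r^4 into gpi for atom i, adds the symmetric
         * P4*vsolv_i*ccf/r^4 term to work[j], and stores the r^-6 derivative
         * factors in dadx for the chain-rule pass later on.
         */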
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

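            /* ccf is the close-contact correction factor: 1 when the pair is
             * well separated (ratio > 1/P5), otherwise the cosine switch
             * ccf = ((1 - cos(pi*P5*ratio))/2)^2, with dccf holding its
             * derivative factor, as in the Still et al. gpol expression.
             */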
            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

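    /* Since the total polarization energy gpi_ai is negative in the Still
     * model, factor*gmx_invsqrt(gpi_ai^2) = factor/|gpi_ai| below is the
     * Still radius bRad_i = -(ONE_4PI_EPS0/2)/gpi_ai written branch-free.
     */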
    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}


int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int          *jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

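        /* Inner loop over j atoms, two per iteration. Each pass evaluates the
         * HCT pairwise descreening integral in both directions: j onto i
         * (accumulated in sum_ai) and i onto j (accumulated in work[j]).
         * The integration limits are uij = 1/(r + sk) and lij = 1/(r - sk)
         * when the atom lies outside the descreening sphere, 1/rai (or 1/raj)
         * otherwise; the obc_mask* comparisons select the piecewise cases,
         * and the r-derivatives of the integral are stored in dadx.
         */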
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for aj1-2 */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for aj1 */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);
    }

    /* Parallel summations */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
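        /* HCT: the Born radius follows from the descreening sum as
         * 1/bRad_i = 1/(rho_i - doffset) - work[i], clamped from below
         * at the un-offset radius rho_i.
         */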
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
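        /* OBC: rescale the HCT sum with the tanh fit of Onufriev, Bashford
         * and Case: 1/bRad_i = 1/(rho_i - doffset) - tanh(alpha*psi -
         * beta*psi^2 + gamma*psi^3)/rho_i, with psi = (rho_i - doffset)*work[i].
         * drobc stores the derivative factor of the tanh term, needed by the
         * chain-rule force evaluation.
         */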
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}


int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int          *jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, SSE style */
    n0 = 0;
    n1 = natoms;

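    /* rb[i] converts dvda[i] = dV/dbRad_i into a derivative with respect to
     * the pairwise sums computed above, via the chain rule for each radius
     * expression: plain bRad^2 for HCT, an extra drobc factor for OBC, and
     * 2*bRad^2/ONE_4PI_EPS0 for Still, where bRad = (ONE_4PI_EPS0/2)/|gpol|.
     * The pair force below is then (dadx_ij*rb_i + dadx_ji*rb_j) * dx.
     */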
    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();


        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for j1-2 */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms that should all be
         * added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */