Code beautification with uncrustify
[alexxy/gromacs.git] / src / gromacs / mdlib / genborn_sse2_double.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  *                This source code is part of
4  *
5  *                 G   R   O   M   A   C   S
6  *
7  *          GROningen MAchine for Chemical Simulations
8  *
9  *                        VERSION 4.5
10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12  * Copyright (c) 2001-2008, The GROMACS development team,
13  * check out http://www.gromacs.org for more information.
14
15  * This program is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU General Public License
17  * as published by the Free Software Foundation; either version 2
18  * of the License, or (at your option) any later version.
19  *
20  * If you want to redistribute modifications, please consider that
21  * scientific software is very special. Version control is crucial -
22  * bugs must be traceable. We will be happy to consider code for
23  * inclusion in the official distribution, but derived work must not
24  * be called official GROMACS. Details are found in the README & COPYING
25  * files - if they are missing, get the official version at www.gromacs.org.
26  *
27  * To help us fund GROMACS development, we humbly ask that you cite
28  * the papers on the package - you can find them in the top README file.
29  *
30  * For more info, check our website at http://www.gromacs.org
31  *
32  * And Hey:
33  * Groningen Machine for Chemical Simulation
34  */
35 #ifdef HAVE_CONFIG_H
36 #include <config.h>
37 #endif
38
39 #include <math.h>
40 #include <string.h>
41
42 #include "typedefs.h"
43 #include "smalloc.h"
44 #include "genborn.h"
45 #include "vec.h"
46 #include "grompp.h"
47 #include "pdbio.h"
48 #include "names.h"
49 #include "physics.h"
50 #include "domdec.h"
51 #include "partdec.h"
52 #include "network.h"
53 #include "gmx_fatal.h"
54 #include "mtop_util.h"
55 #include "genborn.h"
56
57 #ifdef GMX_LIB_MPI
58 #include <mpi.h>
59 #endif
60 #ifdef GMX_THREAD_MPI
61 #include "tmpi.h"
62 #endif
63
64 /* Only compile this file if SSE2 intrinsics are available */
65 #if 0 && defined (GMX_X86_SSE2)
66 #include <gmx_sse2_double.h>
67 #include <emmintrin.h>
68
69 #include "genborn_sse2_double.h"
70
71 int
72 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
73                               int natoms, gmx_localtop_t *top,
74                               const t_atomtypes *atype, double *x, t_nblist *nl,
75                               gmx_genborn_t *born)
76 {
77     int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
78     int           jnrA, jnrB, j3A, j3B;
79     int          *mdtype;
80     double        shX, shY, shZ;
81     int          *jjnr;
82     double       *shiftvec;
83
84     double        gpi_ai, gpi2;
85     double        factor;
86     double       *gb_radius;
87     double       *vsolv;
88     double       *work;
89     double       *dadx;
90
91     __m128d       ix, iy, iz;
92     __m128d       jx, jy, jz;
93     __m128d       dx, dy, dz;
94     __m128d       tx, ty, tz;
95     __m128d       rsq, rinv, rinv2, rinv4, rinv6;
96     __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
97     __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
98     __m128d       mask, icf4, icf6, mask_cmp;
99
100     const __m128d half   = _mm_set1_pd(0.5);
101     const __m128d three  = _mm_set1_pd(3.0);
102     const __m128d one    = _mm_set1_pd(1.0);
103     const __m128d two    = _mm_set1_pd(2.0);
104     const __m128d zero   = _mm_set1_pd(0.0);
105     const __m128d four   = _mm_set1_pd(4.0);
106
107     const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
108     const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
109     const __m128d still_p4     = _mm_set1_pd(STILL_P4);
110
111     factor  = 0.5 * ONE_4PI_EPS0;
112
113     gb_radius = born->gb_radius;
114     vsolv     = born->vsolv;
115     work      = born->gpol_still_work;
116     jjnr      = nl->jjnr;
117     shiftvec  = fr->shift_vec[0];
118     dadx      = fr->dadx;
119
120     jnrA = jnrB = 0;
121     jx   = _mm_setzero_pd();
122     jy   = _mm_setzero_pd();
123     jz   = _mm_setzero_pd();
124
125     n = 0;
126
127     for (i = 0; i < natoms; i++)
128     {
129         work[i] = 0;
130     }
131
132     for (i = 0; i < nl->nri; i++)
133     {
134         ii     = nl->iinr[i];
135         ii3    = ii*3;
136         is3    = 3*nl->shift[i];
137         shX    = shiftvec[is3];
138         shY    = shiftvec[is3+1];
139         shZ    = shiftvec[is3+2];
140         nj0    = nl->jindex[i];
141         nj1    = nl->jindex[i+1];
142
143         ix     = _mm_set1_pd(shX+x[ii3+0]);
144         iy     = _mm_set1_pd(shY+x[ii3+1]);
145         iz     = _mm_set1_pd(shZ+x[ii3+2]);
146
147
148         /* Polarization energy for atom ai */
149         gpi    = _mm_setzero_pd();
150
151         rai     = _mm_load1_pd(gb_radius+ii);
152         prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
153
154         for (k = nj0; k < nj1-1; k += 2)
155         {
156             jnrA        = jjnr[k];
157             jnrB        = jjnr[k+1];
158
159             j3A         = 3*jnrA;
160             j3B         = 3*jnrB;
161
162             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
163
164             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
165             GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);
166
167             dx          = _mm_sub_pd(ix, jx);
168             dy          = _mm_sub_pd(iy, jy);
169             dz          = _mm_sub_pd(iz, jz);
170
171             rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
172             rinv        = gmx_mm_invsqrt_pd(rsq);
173             rinv2       = _mm_mul_pd(rinv, rinv);
174             rinv4       = _mm_mul_pd(rinv2, rinv2);
175             rinv6       = _mm_mul_pd(rinv4, rinv2);
176
177             rvdw        = _mm_add_pd(rai, raj);
178             ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
179
180             mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);
181
182             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
183             if (0 == _mm_movemask_pd(mask_cmp) )
184             {
185                 /* if ratio>still_p5inv for ALL elements */
186                 ccf         = one;
187                 dccf        = _mm_setzero_pd();
188             }
189             else
190             {
191                 ratio       = _mm_min_pd(ratio, still_p5inv);
192                 theta       = _mm_mul_pd(ratio, still_pip5);
193                 gmx_mm_sincos_pd(theta, &sinq, &cosq);
194                 term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
195                 ccf         = _mm_mul_pd(term, term);
196                 dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
197                                          _mm_mul_pd(sinq, theta));
198             }
199
200             prod        = _mm_mul_pd(still_p4, vaj);
201             icf4        = _mm_mul_pd(ccf, rinv4);
202             icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);
203
204             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));
205
206             gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );
207
208             _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
209             dadx += 2;
210             _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
211             dadx += 2;
212         }
213
214         if (k < nj1)
215         {
216             jnrA        = jjnr[k];
217
218             j3A         = 3*jnrA;
219
220             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
221
222             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
223             GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);
224
225             dx          = _mm_sub_sd(ix, jx);
226             dy          = _mm_sub_sd(iy, jy);
227             dz          = _mm_sub_sd(iz, jz);
228
229             rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
230             rinv        = gmx_mm_invsqrt_pd(rsq);
231             rinv2       = _mm_mul_sd(rinv, rinv);
232             rinv4       = _mm_mul_sd(rinv2, rinv2);
233             rinv6       = _mm_mul_sd(rinv4, rinv2);
234
235             rvdw        = _mm_add_sd(rai, raj);
236             ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));
237
238             mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);
239
240             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
241             if (0 == _mm_movemask_pd(mask_cmp) )
242             {
243                 /* if ratio>still_p5inv for ALL elements */
244                 ccf         = one;
245                 dccf        = _mm_setzero_pd();
246             }
247             else
248             {
249                 ratio       = _mm_min_sd(ratio, still_p5inv);
250                 theta       = _mm_mul_sd(ratio, still_pip5);
251                 gmx_mm_sincos_pd(theta, &sinq, &cosq);
252                 term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
253                 ccf         = _mm_mul_sd(term, term);
254                 dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
255                                          _mm_mul_sd(sinq, theta));
256             }
257
258             prod        = _mm_mul_sd(still_p4, vaj);
259             icf4        = _mm_mul_sd(ccf, rinv4);
260             icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);
261
262             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));
263
264             gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );
265
266             _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
267             dadx += 2;
268             _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
269             dadx += 2;
270         }
271         gmx_mm_update_1pot_pd(gpi, work+ii);
272     }
273
274     /* Sum up the polarization energy from other nodes */
275     if (PARTDECOMP(cr))
276     {
277         gmx_sum(natoms, work, cr);
278     }
279     else if (DOMAINDECOMP(cr))
280     {
281         dd_atom_sum_real(cr->dd, work);
282     }
283
284     /* Compute the radii */
285     for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
286     {
287         if (born->use[i] != 0)
288         {
289             gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
290             gpi2             = gpi_ai * gpi_ai;
291             born->bRad[i]    = factor*gmx_invsqrt(gpi2);
292             fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
293         }
294     }
295
296     /* Extra (local) communication required for DD */
297     if (DOMAINDECOMP(cr))
298     {
299         dd_atom_spread_real(cr->dd, born->bRad);
300         dd_atom_spread_real(cr->dd, fr->invsqrta);
301     }
302
303     return 0;
304 }
305
306
307 int
308 calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
309                                 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
310 {
311     int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
312     int           jnrA, jnrB;
313     int           j3A, j3B;
314     double        shX, shY, shZ;
315     double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
316     double        sum_ai2, sum_ai3, tsum, tchain, doffset;
317     double       *obc_param;
318     double       *gb_radius;
319     double       *work;
320     int        *  jjnr;
321     double       *dadx;
322     double       *shiftvec;
323     double        min_rad, rad;
324
325     __m128d       ix, iy, iz, jx, jy, jz;
326     __m128d       dx, dy, dz, t1, t2, t3, t4;
327     __m128d       rsq, rinv, r;
328     __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
329     __m128d       uij, lij2, uij2, lij3, uij3, diff2;
330     __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
331     __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
332     __m128d       dadx1, dadx2;
333     __m128d       logterm;
334     __m128d       mask;
335     __m128d       obc_mask1, obc_mask2, obc_mask3;
336
337     __m128d       oneeighth   = _mm_set1_pd(0.125);
338     __m128d       onefourth   = _mm_set1_pd(0.25);
339
340     const __m128d half  = _mm_set1_pd(0.5);
341     const __m128d three = _mm_set1_pd(3.0);
342     const __m128d one   = _mm_set1_pd(1.0);
343     const __m128d two   = _mm_set1_pd(2.0);
344     const __m128d zero  = _mm_set1_pd(0.0);
345     const __m128d neg   = _mm_set1_pd(-1.0);
346
347     /* Set the dielectric offset */
348     doffset   = born->gb_doffset;
349     gb_radius = born->gb_radius;
350     obc_param = born->param;
351     work      = born->gpol_hct_work;
352     jjnr      = nl->jjnr;
353     dadx      = fr->dadx;
354     shiftvec  = fr->shift_vec[0];
355
356     jx        = _mm_setzero_pd();
357     jy        = _mm_setzero_pd();
358     jz        = _mm_setzero_pd();
359
360     jnrA = jnrB = 0;
361
362     for (i = 0; i < born->nr; i++)
363     {
364         work[i] = 0;
365     }
366
367     for (i = 0; i < nl->nri; i++)
368     {
369         ii     = nl->iinr[i];
370         ii3    = ii*3;
371         is3    = 3*nl->shift[i];
372         shX    = shiftvec[is3];
373         shY    = shiftvec[is3+1];
374         shZ    = shiftvec[is3+2];
375         nj0    = nl->jindex[i];
376         nj1    = nl->jindex[i+1];
377
378         ix     = _mm_set1_pd(shX+x[ii3+0]);
379         iy     = _mm_set1_pd(shY+x[ii3+1]);
380         iz     = _mm_set1_pd(shZ+x[ii3+2]);
381
382         rai     = _mm_load1_pd(gb_radius+ii);
383         rai_inv = gmx_mm_inv_pd(rai);
384
385         sum_ai = _mm_setzero_pd();
386
387         sk_ai  = _mm_load1_pd(born->param+ii);
388         sk2_ai = _mm_mul_pd(sk_ai, sk_ai);
389
390         for (k = nj0; k < nj1-1; k += 2)
391         {
392             jnrA        = jjnr[k];
393             jnrB        = jjnr[k+1];
394
395             j3A         = 3*jnrA;
396             j3B         = 3*jnrB;
397
398             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
399             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
400             GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);
401
402             dx    = _mm_sub_pd(ix, jx);
403             dy    = _mm_sub_pd(iy, jy);
404             dz    = _mm_sub_pd(iz, jz);
405
406             rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
407
408             rinv        = gmx_mm_invsqrt_pd(rsq);
409             r           = _mm_mul_pd(rsq, rinv);
410
411             /* Compute raj_inv aj1-4 */
412             raj_inv     = gmx_mm_inv_pd(raj);
413
414             /* Evaluate influence of atom aj -> ai */
415             t1            = _mm_add_pd(r, sk_aj);
416             t2            = _mm_sub_pd(r, sk_aj);
417             t3            = _mm_sub_pd(sk_aj, r);
418             obc_mask1     = _mm_cmplt_pd(rai, t1);
419             obc_mask2     = _mm_cmplt_pd(rai, t2);
420             obc_mask3     = _mm_cmplt_pd(rai, t3);
421
422             uij           = gmx_mm_inv_pd(t1);
423             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
424                                          _mm_andnot_pd(obc_mask2, rai_inv));
425             dlij          = _mm_and_pd(one, obc_mask2);
426             uij2          = _mm_mul_pd(uij, uij);
427             uij3          = _mm_mul_pd(uij2, uij);
428             lij2          = _mm_mul_pd(lij, lij);
429             lij3          = _mm_mul_pd(lij2, lij);
430
431             diff2         = _mm_sub_pd(uij2, lij2);
432             lij_inv       = gmx_mm_invsqrt_pd(lij2);
433             sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
434             sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
435             prod          = _mm_mul_pd(onefourth, sk2_rinv);
436
437             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
438
439             t1            = _mm_sub_pd(lij, uij);
440             t2            = _mm_mul_pd(diff2,
441                                        _mm_sub_pd(_mm_mul_pd(onefourth, r),
442                                                   prod));
443             t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
444             t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
445             t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
446             t4            = _mm_and_pd(t4, obc_mask3);
447             t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));
448
449             sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );
450
451             t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
452                                        _mm_mul_pd(prod, lij3));
453             t1            = _mm_sub_pd(t1,
454                                        _mm_mul_pd(onefourth,
455                                                   _mm_add_pd(_mm_mul_pd(lij, rinv),
456                                                              _mm_mul_pd(lij3, r))));
457             t2            = _mm_mul_pd(onefourth,
458                                        _mm_add_pd(_mm_mul_pd(uij, rinv),
459                                                   _mm_mul_pd(uij3, r)));
460             t2            = _mm_sub_pd(t2,
461                                        _mm_add_pd(_mm_mul_pd(half, uij2),
462                                                   _mm_mul_pd(prod, uij3)));
463             t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
464                                        _mm_mul_pd(rinv, rinv));
465             t3            = _mm_sub_pd(t3,
466                                        _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
467                                                   _mm_add_pd(one,
468                                                              _mm_mul_pd(sk2_rinv, rinv))));
469             t1            = _mm_mul_pd(rinv,
470                                        _mm_add_pd(_mm_mul_pd(dlij, t1),
471                                                   _mm_add_pd(t2, t3)));
472
473             dadx1         = _mm_and_pd(t1, obc_mask1);
474
475             /* Evaluate influence of atom ai -> aj */
476             t1            = _mm_add_pd(r, sk_ai);
477             t2            = _mm_sub_pd(r, sk_ai);
478             t3            = _mm_sub_pd(sk_ai, r);
479             obc_mask1     = _mm_cmplt_pd(raj, t1);
480             obc_mask2     = _mm_cmplt_pd(raj, t2);
481             obc_mask3     = _mm_cmplt_pd(raj, t3);
482
483             uij           = gmx_mm_inv_pd(t1);
484             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
485                                          _mm_andnot_pd(obc_mask2, raj_inv));
486             dlij          = _mm_and_pd(one, obc_mask2);
487             uij2          = _mm_mul_pd(uij, uij);
488             uij3          = _mm_mul_pd(uij2, uij);
489             lij2          = _mm_mul_pd(lij, lij);
490             lij3          = _mm_mul_pd(lij2, lij);
491
492             diff2         = _mm_sub_pd(uij2, lij2);
493             lij_inv       = gmx_mm_invsqrt_pd(lij2);
494             sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
495             prod          = _mm_mul_pd(onefourth, sk2_rinv);
496
497             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));
498
499             t1            = _mm_sub_pd(lij, uij);
500             t2            = _mm_mul_pd(diff2,
501                                        _mm_sub_pd(_mm_mul_pd(onefourth, r),
502                                                   prod));
503             t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
504             t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
505             t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
506             t4            = _mm_and_pd(t4, obc_mask3);
507             t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));
508
509             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));
510
511             t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
512                                        _mm_mul_pd(prod, lij3));
513             t1            = _mm_sub_pd(t1,
514                                        _mm_mul_pd(onefourth,
515                                                   _mm_add_pd(_mm_mul_pd(lij, rinv),
516                                                              _mm_mul_pd(lij3, r))));
517             t2            = _mm_mul_pd(onefourth,
518                                        _mm_add_pd(_mm_mul_pd(uij, rinv),
519                                                   _mm_mul_pd(uij3, r)));
520             t2            = _mm_sub_pd(t2,
521                                        _mm_add_pd(_mm_mul_pd(half, uij2),
522                                                   _mm_mul_pd(prod, uij3)));
523             t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
524                                        _mm_mul_pd(rinv, rinv));
525             t3            = _mm_sub_pd(t3,
526                                        _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
527                                                   _mm_add_pd(one,
528                                                              _mm_mul_pd(sk2_rinv, rinv))));
529             t1            = _mm_mul_pd(rinv,
530                                        _mm_add_pd(_mm_mul_pd(dlij, t1),
531                                                   _mm_add_pd(t2, t3)));
532
533             dadx2         = _mm_and_pd(t1, obc_mask1);
534
535             _mm_store_pd(dadx, dadx1);
536             dadx += 2;
537             _mm_store_pd(dadx, dadx2);
538             dadx += 2;
539         } /* end normal inner loop */
540
541         if (k < nj1)
542         {
543             jnrA        = jjnr[k];
544
545             j3A         = 3*jnrA;
546
547             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
548             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
549             GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);
550
551             dx    = _mm_sub_sd(ix, jx);
552             dy    = _mm_sub_sd(iy, jy);
553             dz    = _mm_sub_sd(iz, jz);
554
555             rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
556
557             rinv        = gmx_mm_invsqrt_pd(rsq);
558             r           = _mm_mul_sd(rsq, rinv);
559
560             /* Compute raj_inv aj1-4 */
561             raj_inv     = gmx_mm_inv_pd(raj);
562
563             /* Evaluate influence of atom aj -> ai */
564             t1            = _mm_add_sd(r, sk_aj);
565             t2            = _mm_sub_sd(r, sk_aj);
566             t3            = _mm_sub_sd(sk_aj, r);
567             obc_mask1     = _mm_cmplt_sd(rai, t1);
568             obc_mask2     = _mm_cmplt_sd(rai, t2);
569             obc_mask3     = _mm_cmplt_sd(rai, t3);
570
571             uij           = gmx_mm_inv_pd(t1);
572             lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
573                                       _mm_andnot_pd(obc_mask2, rai_inv));
574             dlij          = _mm_and_pd(one, obc_mask2);
575             uij2          = _mm_mul_sd(uij, uij);
576             uij3          = _mm_mul_sd(uij2, uij);
577             lij2          = _mm_mul_sd(lij, lij);
578             lij3          = _mm_mul_sd(lij2, lij);
579
580             diff2         = _mm_sub_sd(uij2, lij2);
581             lij_inv       = gmx_mm_invsqrt_pd(lij2);
582             sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
583             sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
584             prod          = _mm_mul_sd(onefourth, sk2_rinv);
585
586             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
587
588             t1            = _mm_sub_sd(lij, uij);
589             t2            = _mm_mul_sd(diff2,
590                                        _mm_sub_sd(_mm_mul_pd(onefourth, r),
591                                                   prod));
592             t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
593             t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
594             t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
595             t4            = _mm_and_pd(t4, obc_mask3);
596             t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));
597
598             sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );
599
600             t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
601                                        _mm_mul_sd(prod, lij3));
602             t1            = _mm_sub_sd(t1,
603                                        _mm_mul_sd(onefourth,
604                                                   _mm_add_sd(_mm_mul_sd(lij, rinv),
605                                                              _mm_mul_sd(lij3, r))));
606             t2            = _mm_mul_sd(onefourth,
607                                        _mm_add_sd(_mm_mul_sd(uij, rinv),
608                                                   _mm_mul_sd(uij3, r)));
609             t2            = _mm_sub_sd(t2,
610                                        _mm_add_sd(_mm_mul_sd(half, uij2),
611                                                   _mm_mul_sd(prod, uij3)));
612             t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
613                                        _mm_mul_sd(rinv, rinv));
614             t3            = _mm_sub_sd(t3,
615                                        _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
616                                                   _mm_add_sd(one,
617                                                              _mm_mul_sd(sk2_rinv, rinv))));
618             t1            = _mm_mul_sd(rinv,
619                                        _mm_add_sd(_mm_mul_sd(dlij, t1),
620                                                   _mm_add_pd(t2, t3)));
621
622             dadx1         = _mm_and_pd(t1, obc_mask1);
623
624             /* Evaluate influence of atom ai -> aj */
625             t1            = _mm_add_sd(r, sk_ai);
626             t2            = _mm_sub_sd(r, sk_ai);
627             t3            = _mm_sub_sd(sk_ai, r);
628             obc_mask1     = _mm_cmplt_sd(raj, t1);
629             obc_mask2     = _mm_cmplt_sd(raj, t2);
630             obc_mask3     = _mm_cmplt_sd(raj, t3);
631
632             uij           = gmx_mm_inv_pd(t1);
633             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
634                                          _mm_andnot_pd(obc_mask2, raj_inv));
635             dlij          = _mm_and_pd(one, obc_mask2);
636             uij2          = _mm_mul_sd(uij, uij);
637             uij3          = _mm_mul_sd(uij2, uij);
638             lij2          = _mm_mul_sd(lij, lij);
639             lij3          = _mm_mul_sd(lij2, lij);
640
641             diff2         = _mm_sub_sd(uij2, lij2);
642             lij_inv       = gmx_mm_invsqrt_pd(lij2);
643             sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
644             prod          = _mm_mul_sd(onefourth, sk2_rinv);
645
646             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));
647
648             t1            = _mm_sub_sd(lij, uij);
649             t2            = _mm_mul_sd(diff2,
650                                        _mm_sub_sd(_mm_mul_sd(onefourth, r),
651                                                   prod));
652             t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
653             t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
654             t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
655             t4            = _mm_and_pd(t4, obc_mask3);
656             t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));
657
658             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));
659
660             t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
661                                        _mm_mul_sd(prod, lij3));
662             t1            = _mm_sub_sd(t1,
663                                        _mm_mul_sd(onefourth,
664                                                   _mm_add_sd(_mm_mul_sd(lij, rinv),
665                                                              _mm_mul_sd(lij3, r))));
666             t2            = _mm_mul_sd(onefourth,
667                                        _mm_add_sd(_mm_mul_sd(uij, rinv),
668                                                   _mm_mul_sd(uij3, r)));
669             t2            = _mm_sub_sd(t2,
670                                        _mm_add_sd(_mm_mul_sd(half, uij2),
671                                                   _mm_mul_sd(prod, uij3)));
672             t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
673                                        _mm_mul_sd(rinv, rinv));
674             t3            = _mm_sub_sd(t3,
675                                        _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
676                                                   _mm_add_sd(one,
677                                                              _mm_mul_sd(sk2_rinv, rinv))));
678             t1            = _mm_mul_sd(rinv,
679                                        _mm_add_sd(_mm_mul_sd(dlij, t1),
680                                                   _mm_add_sd(t2, t3)));
681
682             dadx2         = _mm_and_pd(t1, obc_mask1);
683
684             _mm_store_pd(dadx, dadx1);
685             dadx += 2;
686             _mm_store_pd(dadx, dadx2);
687             dadx += 2;
688         }
689         gmx_mm_update_1pot_pd(sum_ai, work+ii);
690
691     }
692
693     /* Parallel summations */
694     if (PARTDECOMP(cr))
695     {
696         gmx_sum(natoms, work, cr);
697     }
698     else if (DOMAINDECOMP(cr))
699     {
700         dd_atom_sum_real(cr->dd, work);
701     }
702
703     if (gb_algorithm == egbHCT)
704     {
705         /* HCT */
706         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
707         {
708             if (born->use[i] != 0)
709             {
710                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
711                 sum     = 1.0/rr - work[i];
712                 min_rad = rr + doffset;
713                 rad     = 1.0/sum;
714
715                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
716                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
717             }
718         }
719
720         /* Extra communication required for DD */
721         if (DOMAINDECOMP(cr))
722         {
723             dd_atom_spread_real(cr->dd, born->bRad);
724             dd_atom_spread_real(cr->dd, fr->invsqrta);
725         }
726     }
727     else
728     {
729         /* OBC */
730         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
731         {
732             if (born->use[i] != 0)
733             {
734                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
735                 rr_inv2 = 1.0/rr;
736                 rr      = rr-doffset;
737                 rr_inv  = 1.0/rr;
738                 sum     = rr * work[i];
739                 sum2    = sum  * sum;
740                 sum3    = sum2 * sum;
741
742                 tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
743                 born->bRad[i] = rr_inv - tsum*rr_inv2;
744                 born->bRad[i] = 1.0 / born->bRad[i];
745
746                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
747
748                 tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
749                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
750             }
751         }
752         /* Extra (local) communication required for DD */
753         if (DOMAINDECOMP(cr))
754         {
755             dd_atom_spread_real(cr->dd, born->bRad);
756             dd_atom_spread_real(cr->dd, fr->invsqrta);
757             dd_atom_spread_real(cr->dd, born->drobc);
758         }
759     }
760
761
762
763     return 0;
764 }
765
766
767 int
768 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
769                               double *x, double *f, double *fshift, double *shiftvec,
770                               int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
771 {
772     int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
773     int           jnrA, jnrB;
774     int           j3A, j3B;
775     int        *  jjnr;
776
777     double        rbi, shX, shY, shZ;
778     double       *rb;
779
780     __m128d       ix, iy, iz;
781     __m128d       jx, jy, jz;
782     __m128d       fix, fiy, fiz;
783     __m128d       dx, dy, dz;
784     __m128d       tx, ty, tz;
785
786     __m128d       rbai, rbaj, f_gb, f_gb_ai;
787     __m128d       xmm1, xmm2, xmm3;
788
789     const __m128d two = _mm_set1_pd(2.0);
790
791     rb     = born->work;
792
793     jjnr   = nl->jjnr;
794
795     /* Loop to get the proper form for the Born radius term, sse style */
796     n0 = 0;
797     n1 = natoms;
798
799     if (gb_algorithm == egbSTILL)
800     {
801         for (i = n0; i < n1; i++)
802         {
803             rbi   = born->bRad[i];
804             rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
805         }
806     }
807     else if (gb_algorithm == egbHCT)
808     {
809         for (i = n0; i < n1; i++)
810         {
811             rbi   = born->bRad[i];
812             rb[i] = rbi * rbi * dvda[i];
813         }
814     }
815     else if (gb_algorithm == egbOBC)
816     {
817         for (i = n0; i < n1; i++)
818         {
819             rbi   = born->bRad[i];
820             rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
821         }
822     }
823
824     jz = _mm_setzero_pd();
825
826     n = j3A = j3B = 0;
827
828     for (i = 0; i < nl->nri; i++)
829     {
830         ii     = nl->iinr[i];
831         ii3    = ii*3;
832         is3    = 3*nl->shift[i];
833         shX    = shiftvec[is3];
834         shY    = shiftvec[is3+1];
835         shZ    = shiftvec[is3+2];
836         nj0    = nl->jindex[i];
837         nj1    = nl->jindex[i+1];
838
839         ix     = _mm_set1_pd(shX+x[ii3+0]);
840         iy     = _mm_set1_pd(shY+x[ii3+1]);
841         iz     = _mm_set1_pd(shZ+x[ii3+2]);
842
843         rbai   = _mm_load1_pd(rb+ii);
844         fix    = _mm_setzero_pd();
845         fiy    = _mm_setzero_pd();
846         fiz    = _mm_setzero_pd();
847
848
849         for (k = nj0; k < nj1-1; k += 2)
850         {
851             jnrA        = jjnr[k];
852             jnrB        = jjnr[k+1];
853
854             j3A         = 3*jnrA;
855             j3B         = 3*jnrB;
856
857             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
858
859             dx          = _mm_sub_pd(ix, jx);
860             dy          = _mm_sub_pd(iy, jy);
861             dz          = _mm_sub_pd(iz, jz);
862
863             GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);
864
865             /* load chain rule terms for j1-4 */
866             f_gb        = _mm_load_pd(dadx);
867             dadx       += 2;
868             f_gb_ai     = _mm_load_pd(dadx);
869             dadx       += 2;
870
871             /* calculate scalar force */
872             f_gb    = _mm_mul_pd(f_gb, rbai);
873             f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
874             f_gb    = _mm_add_pd(f_gb, f_gb_ai);
875
876             tx     = _mm_mul_pd(f_gb, dx);
877             ty     = _mm_mul_pd(f_gb, dy);
878             tz     = _mm_mul_pd(f_gb, dz);
879
880             fix    = _mm_add_pd(fix, tx);
881             fiy    = _mm_add_pd(fiy, ty);
882             fiz    = _mm_add_pd(fiz, tz);
883
884             GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
885         }
886
887         /*deal with odd elements */
888         if (k < nj1)
889         {
890             jnrA        = jjnr[k];
891             j3A         = 3*jnrA;
892
893             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
894
895             dx          = _mm_sub_sd(ix, jx);
896             dy          = _mm_sub_sd(iy, jy);
897             dz          = _mm_sub_sd(iz, jz);
898
899             GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);
900
901             /* load chain rule terms */
902             f_gb        = _mm_load_pd(dadx);
903             dadx       += 2;
904             f_gb_ai     = _mm_load_pd(dadx);
905             dadx       += 2;
906
907             /* calculate scalar force */
908             f_gb    = _mm_mul_sd(f_gb, rbai);
909             f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
910             f_gb    = _mm_add_sd(f_gb, f_gb_ai);
911
912             tx     = _mm_mul_sd(f_gb, dx);
913             ty     = _mm_mul_sd(f_gb, dy);
914             tz     = _mm_mul_sd(f_gb, dz);
915
916             fix    = _mm_add_sd(fix, tx);
917             fiy    = _mm_add_sd(fiy, ty);
918             fiz    = _mm_add_sd(fiz, tz);
919
920             GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
921         }
922
923         /* fix/fiy/fiz now contain four partial force terms, that all should be
924          * added to the i particle forces and shift forces.
925          */
926         gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
927     }
928
929     return 0;
930 }
931
932 #else
933 /* keep compiler happy */
934 int genborn_sse2_dummy;
935
936 #endif /* SSE2 intrinsics available */