Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / mdlib / genborn_sse2_double.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  * 
3  *                This source code is part of
4  * 
5  *                 G   R   O   M   A   C   S
6  * 
7  *          GROningen MAchine for Chemical Simulations
8  * 
9  *                        VERSION 4.5
10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12  * Copyright (c) 2001-2008, The GROMACS development team,
13  * check out http://www.gromacs.org for more information.
14  
15  * This program is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU General Public License
17  * as published by the Free Software Foundation; either version 2
18  * of the License, or (at your option) any later version.
19  * 
20  * If you want to redistribute modifications, please consider that
21  * scientific software is very special. Version control is crucial -
22  * bugs must be traceable. We will be happy to consider code for
23  * inclusion in the official distribution, but derived work must not
24  * be called official GROMACS. Details are found in the README & COPYING
25  * files - if they are missing, get the official version at www.gromacs.org.
26  * 
27  * To help us fund GROMACS development, we humbly ask that you cite
28  * the papers on the package - you can find them in the top README file.
29  * 
30  * For more info, check our website at http://www.gromacs.org
31  * 
32  * And Hey:
33  * Groningen Machine for Chemical Simulation
34  */
35 #ifdef HAVE_CONFIG_H
36 #include <config.h>
37 #endif
38
39 #include <math.h>
40 #include <string.h>
41
42 #include "typedefs.h"
43 #include "smalloc.h"
44 #include "genborn.h"
45 #include "vec.h"
46 #include "grompp.h"
47 #include "pdbio.h"
48 #include "names.h"
49 #include "physics.h"
50 #include "domdec.h"
51 #include "partdec.h"
52 #include "network.h"
53 #include "gmx_fatal.h"
54 #include "mtop_util.h"
55 #include "genborn.h"
56
57 #ifdef GMX_LIB_MPI
58 #include <mpi.h>
59 #endif
60 #ifdef GMX_THREAD_MPI
61 #include "tmpi.h"
62 #endif
63
64 /* Only compile this file if SSE2 intrinsics are available */
65 #if 0 && defined (GMX_X86_SSE2)
66 #include <gmx_sse2_double.h>
67 #include <emmintrin.h>
68
69 #include "genborn_sse2_double.h"
70
71 int 
72 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
73                               int natoms, gmx_localtop_t *top,
74                               const t_atomtypes *atype, double *x, t_nblist *nl,
75                               gmx_genborn_t *born)
76 {
77         int i,k,n,ii,is3,ii3,nj0,nj1,offset;
78         int jnrA,jnrB,j3A,j3B;
79     int *mdtype;
80         double shX,shY,shZ;
81     int *jjnr;
82     double *shiftvec;
83     
84         double gpi_ai,gpi2;
85         double factor;
86         double *gb_radius;
87     double *vsolv;
88     double *work;
89     double *dadx;
90     
91         __m128d ix,iy,iz;
92         __m128d jx,jy,jz;
93         __m128d dx,dy,dz;
94         __m128d tx,ty,tz;
95         __m128d rsq,rinv,rinv2,rinv4,rinv6;
96         __m128d ratio,gpi,rai,raj,vai,vaj,rvdw;
97         __m128d ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
98         __m128d mask,icf4,icf6,mask_cmp;
99             
100         const __m128d half   = _mm_set1_pd(0.5);
101         const __m128d three  = _mm_set1_pd(3.0);
102         const __m128d one    = _mm_set1_pd(1.0);
103         const __m128d two    = _mm_set1_pd(2.0);
104         const __m128d zero   = _mm_set1_pd(0.0);
105         const __m128d four   = _mm_set1_pd(4.0);
106         
107         const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
108         const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
109         const __m128d still_p4     = _mm_set1_pd(STILL_P4);
110     
111         factor  = 0.5 * ONE_4PI_EPS0;
112     
113     gb_radius = born->gb_radius;
114     vsolv     = born->vsolv;
115     work      = born->gpol_still_work;
116         jjnr      = nl->jjnr;
117     shiftvec  = fr->shift_vec[0];
118     dadx      = fr->dadx;
119     
120         jnrA = jnrB = 0;
121     jx = _mm_setzero_pd();
122     jy = _mm_setzero_pd();
123     jz = _mm_setzero_pd();
124     
125         n = 0;
126     
127         for(i=0;i<natoms;i++)
128         {
129                 work[i]=0;
130         }
131     
132         for(i=0;i<nl->nri;i++)
133         {
134         ii     = nl->iinr[i];
135                 ii3        = ii*3;
136         is3    = 3*nl->shift[i];     
137         shX    = shiftvec[is3];  
138         shY    = shiftvec[is3+1];
139         shZ    = shiftvec[is3+2];
140         nj0    = nl->jindex[i];      
141         nj1    = nl->jindex[i+1];    
142         
143         ix     = _mm_set1_pd(shX+x[ii3+0]);
144                 iy     = _mm_set1_pd(shY+x[ii3+1]);
145                 iz     = _mm_set1_pd(shZ+x[ii3+2]);
146                 
147
148                 /* Polarization energy for atom ai */
149                 gpi    = _mm_setzero_pd();
150                 
151         rai     = _mm_load1_pd(gb_radius+ii);
152         prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
153
154                 for(k=nj0;k<nj1-1;k+=2)
155                 {
156                         jnrA        = jjnr[k];   
157                         jnrB        = jjnr[k+1];
158             
159             j3A         = 3*jnrA;  
160                         j3B         = 3*jnrB;
161             
162             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
163             
164             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
165                         GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA,vsolv+jnrB,vaj);
166             
167                         dx          = _mm_sub_pd(ix,jx);
168                         dy          = _mm_sub_pd(iy,jy);
169                         dz          = _mm_sub_pd(iz,jz);
170             
171             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
172             rinv        = gmx_mm_invsqrt_pd(rsq);
173             rinv2       = _mm_mul_pd(rinv,rinv);
174             rinv4       = _mm_mul_pd(rinv2,rinv2);
175             rinv6       = _mm_mul_pd(rinv4,rinv2);
176             
177             rvdw        = _mm_add_pd(rai,raj);
178             ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
179             
180             mask_cmp    = _mm_cmple_pd(ratio,still_p5inv);
181
182             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
183             if( 0 == _mm_movemask_pd(mask_cmp) )
184             {
185                 /* if ratio>still_p5inv for ALL elements */
186                 ccf         = one;
187                 dccf        = _mm_setzero_pd();
188             }
189             else 
190             {
191                 ratio       = _mm_min_pd(ratio,still_p5inv);
192                 theta       = _mm_mul_pd(ratio,still_pip5);
193                 gmx_mm_sincos_pd(theta,&sinq,&cosq);
194                 term        = _mm_mul_pd(half,_mm_sub_pd(one,cosq));
195                 ccf         = _mm_mul_pd(term,term);
196                 dccf        = _mm_mul_pd(_mm_mul_pd(two,term),
197                                          _mm_mul_pd(sinq,theta));
198             }
199
200             prod        = _mm_mul_pd(still_p4,vaj);
201             icf4        = _mm_mul_pd(ccf,rinv4);
202             icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four,ccf),dccf), rinv6);
203                         
204             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_mul_pd(prod_ai,icf4));
205             
206             gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod,icf4) );
207             
208             _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
209             dadx+=2;
210             _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
211             dadx+=2;
212                 } 
213         
214         if(k<nj1)
215                 {
216                         jnrA        = jjnr[k];   
217             
218             j3A         = 3*jnrA;  
219             
220             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
221             
222             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
223                         GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA,vaj);
224             
225                         dx          = _mm_sub_sd(ix,jx);
226                         dy          = _mm_sub_sd(iy,jy);
227                         dz          = _mm_sub_sd(iz,jz);
228             
229             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
230             rinv        = gmx_mm_invsqrt_pd(rsq);
231             rinv2       = _mm_mul_sd(rinv,rinv);
232             rinv4       = _mm_mul_sd(rinv2,rinv2);
233             rinv6       = _mm_mul_sd(rinv4,rinv2);
234             
235             rvdw        = _mm_add_sd(rai,raj);
236             ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
237             
238             mask_cmp    = _mm_cmple_sd(ratio,still_p5inv);
239             
240             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
241             if( 0 == _mm_movemask_pd(mask_cmp) )
242             {
243                 /* if ratio>still_p5inv for ALL elements */
244                 ccf         = one;
245                 dccf        = _mm_setzero_pd();
246             }
247             else 
248             {
249                 ratio       = _mm_min_sd(ratio,still_p5inv);
250                 theta       = _mm_mul_sd(ratio,still_pip5);
251                 gmx_mm_sincos_pd(theta,&sinq,&cosq);
252                 term        = _mm_mul_sd(half,_mm_sub_sd(one,cosq));
253                 ccf         = _mm_mul_sd(term,term);
254                 dccf        = _mm_mul_sd(_mm_mul_sd(two,term),
255                                          _mm_mul_sd(sinq,theta));
256             }
257             
258             prod        = _mm_mul_sd(still_p4,vaj);
259             icf4        = _mm_mul_sd(ccf,rinv4);
260             icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four,ccf),dccf), rinv6);
261
262             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_mul_sd(prod_ai,icf4));
263             
264             gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod,icf4) );
265             
266             _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
267             dadx+=2;
268             _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
269             dadx+=2;
270                 } 
271         gmx_mm_update_1pot_pd(gpi,work+ii);
272         }
273     
274         /* Sum up the polarization energy from other nodes */
275         if(PARTDECOMP(cr))
276         {
277                 gmx_sum(natoms, work, cr);
278         }
279         else if(DOMAINDECOMP(cr))
280         {
281                 dd_atom_sum_real(cr->dd, work);
282         }
283         
284         /* Compute the radii */
285         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
286         {               
287                 if(born->use[i] != 0)
288                 {
289                         gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
290                         gpi2             = gpi_ai * gpi_ai;
291                         born->bRad[i]   = factor*gmx_invsqrt(gpi2);
292                         fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
293                 }
294         }
295     
296         /* Extra (local) communication required for DD */
297         if(DOMAINDECOMP(cr))
298         {
299                 dd_atom_spread_real(cr->dd, born->bRad);
300                 dd_atom_spread_real(cr->dd, fr->invsqrta);
301         }
302     
303         return 0;       
304 }
305
306
307 int 
308 calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
309                                 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
310 {
311         int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
312     int jnrA,jnrB;
313     int j3A,j3B;
314         double shX,shY,shZ;
315         double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
316         double sum_ai2, sum_ai3,tsum,tchain,doffset;
317         double *obc_param;
318     double *gb_radius;
319     double *work;
320     int *  jjnr;
321     double *dadx;
322     double *shiftvec;
323     double min_rad,rad;
324     
325         __m128d ix,iy,iz,jx,jy,jz;
326         __m128d dx,dy,dz,t1,t2,t3,t4;
327         __m128d rsq,rinv,r;
328         __m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
329         __m128d uij,lij2,uij2,lij3,uij3,diff2;
330         __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
331         __m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
332         __m128d dadx1,dadx2;
333     __m128d logterm;
334         __m128d mask;
335         __m128d obc_mask1,obc_mask2,obc_mask3;    
336     
337     __m128d oneeighth   = _mm_set1_pd(0.125);
338     __m128d onefourth   = _mm_set1_pd(0.25);
339     
340         const __m128d half  = _mm_set1_pd(0.5);
341         const __m128d three = _mm_set1_pd(3.0);
342         const __m128d one   = _mm_set1_pd(1.0);
343         const __m128d two   = _mm_set1_pd(2.0);
344         const __m128d zero  = _mm_set1_pd(0.0);
345         const __m128d neg   = _mm_set1_pd(-1.0);
346         
347         /* Set the dielectric offset */
348         doffset   = born->gb_doffset;
349         gb_radius = born->gb_radius;
350     obc_param = born->param;
351     work      = born->gpol_hct_work;
352     jjnr      = nl->jjnr;
353     dadx      = fr->dadx;
354     shiftvec  = fr->shift_vec[0];
355     
356     jx        = _mm_setzero_pd();
357     jy        = _mm_setzero_pd();
358     jz        = _mm_setzero_pd();
359     
360     jnrA = jnrB = 0;
361     
362         for(i=0;i<born->nr;i++)
363         {
364                 work[i] = 0;
365         }
366         
367         for(i=0;i<nl->nri;i++)
368         {
369         ii     = nl->iinr[i];
370                 ii3        = ii*3;
371         is3    = 3*nl->shift[i];     
372         shX    = shiftvec[is3];  
373         shY    = shiftvec[is3+1];
374         shZ    = shiftvec[is3+2];
375         nj0    = nl->jindex[i];      
376         nj1    = nl->jindex[i+1];    
377         
378         ix     = _mm_set1_pd(shX+x[ii3+0]);
379                 iy     = _mm_set1_pd(shY+x[ii3+1]);
380                 iz     = _mm_set1_pd(shZ+x[ii3+2]);
381                         
382                 rai    = _mm_load1_pd(gb_radius+ii);
383                 rai_inv= gmx_mm_inv_pd(rai);
384         
385                 sum_ai = _mm_setzero_pd();
386                 
387                 sk_ai  = _mm_load1_pd(born->param+ii);
388                 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
389         
390                 for(k=nj0;k<nj1-1;k+=2)
391                 {
392                         jnrA        = jjnr[k];   
393                         jnrB        = jjnr[k+1];
394                         
395             j3A         = 3*jnrA;  
396                         j3B         = 3*jnrB;
397             
398             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
399             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
400             GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA,obc_param+jnrB,sk_aj);
401                         
402             dx    = _mm_sub_pd(ix, jx);
403                         dy    = _mm_sub_pd(iy, jy);
404                         dz    = _mm_sub_pd(iz, jz);
405                         
406             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
407             
408             rinv        = gmx_mm_invsqrt_pd(rsq);
409             r           = _mm_mul_pd(rsq,rinv);
410             
411                         /* Compute raj_inv aj1-4 */
412             raj_inv     = gmx_mm_inv_pd(raj);
413             
414             /* Evaluate influence of atom aj -> ai */
415             t1            = _mm_add_pd(r,sk_aj);
416             t2            = _mm_sub_pd(r,sk_aj);
417             t3            = _mm_sub_pd(sk_aj,r);
418             obc_mask1     = _mm_cmplt_pd(rai, t1);
419             obc_mask2     = _mm_cmplt_pd(rai, t2);
420             obc_mask3     = _mm_cmplt_pd(rai, t3);
421             
422             uij           = gmx_mm_inv_pd(t1);
423             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
424                                       _mm_andnot_pd(obc_mask2,rai_inv));
425             dlij          = _mm_and_pd(one,obc_mask2);
426             uij2          = _mm_mul_pd(uij, uij);
427             uij3          = _mm_mul_pd(uij2,uij);
428             lij2          = _mm_mul_pd(lij, lij);
429             lij3          = _mm_mul_pd(lij2,lij);
430                         
431             diff2         = _mm_sub_pd(uij2,lij2);
432             lij_inv       = gmx_mm_invsqrt_pd(lij2);
433             sk2_aj        = _mm_mul_pd(sk_aj,sk_aj);
434             sk2_rinv      = _mm_mul_pd(sk2_aj,rinv);
435             prod          = _mm_mul_pd(onefourth,sk2_rinv);
436                         
437             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
438             
439             t1            = _mm_sub_pd(lij,uij);
440             t2            = _mm_mul_pd(diff2,
441                                        _mm_sub_pd(_mm_mul_pd(onefourth,r),
442                                                   prod));
443             t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
444             t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
445             t4            = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij));
446             t4            = _mm_and_pd(t4,obc_mask3);
447             t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
448                         
449             sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) );
450             
451             t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
452                                        _mm_mul_pd(prod,lij3));
453             t1            = _mm_sub_pd(t1,
454                                        _mm_mul_pd(onefourth,
455                                                   _mm_add_pd(_mm_mul_pd(lij,rinv),
456                                                              _mm_mul_pd(lij3,r))));
457             t2            = _mm_mul_pd(onefourth,
458                                        _mm_add_pd(_mm_mul_pd(uij,rinv),
459                                                   _mm_mul_pd(uij3,r)));
460             t2            = _mm_sub_pd(t2,
461                                        _mm_add_pd(_mm_mul_pd(half,uij2),
462                                                   _mm_mul_pd(prod,uij3)));
463             t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
464                                        _mm_mul_pd(rinv,rinv));
465             t3            = _mm_sub_pd(t3,
466                                        _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
467                                                   _mm_add_pd(one,
468                                                              _mm_mul_pd(sk2_rinv,rinv))));
469             t1            = _mm_mul_pd(rinv,
470                                        _mm_add_pd(_mm_mul_pd(dlij,t1),
471                                                   _mm_add_pd(t2,t3)));
472             
473             dadx1         = _mm_and_pd(t1,obc_mask1);
474             
475             /* Evaluate influence of atom ai -> aj */
476             t1            = _mm_add_pd(r,sk_ai);
477             t2            = _mm_sub_pd(r,sk_ai);
478             t3            = _mm_sub_pd(sk_ai,r);
479             obc_mask1     = _mm_cmplt_pd(raj, t1);
480             obc_mask2     = _mm_cmplt_pd(raj, t2);
481             obc_mask3     = _mm_cmplt_pd(raj, t3);
482             
483             uij           = gmx_mm_inv_pd(t1);
484             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
485                                       _mm_andnot_pd(obc_mask2,raj_inv));
486             dlij          = _mm_and_pd(one,obc_mask2);
487             uij2          = _mm_mul_pd(uij, uij);
488             uij3          = _mm_mul_pd(uij2,uij);
489             lij2          = _mm_mul_pd(lij, lij);
490             lij3          = _mm_mul_pd(lij2,lij);
491                         
492             diff2         = _mm_sub_pd(uij2,lij2);
493             lij_inv       = gmx_mm_invsqrt_pd(lij2);
494             sk2_rinv      = _mm_mul_pd(sk2_ai,rinv);
495             prod          = _mm_mul_pd(onefourth,sk2_rinv);
496                         
497             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
498             
499             t1            = _mm_sub_pd(lij,uij);
500             t2            = _mm_mul_pd(diff2,
501                                        _mm_sub_pd(_mm_mul_pd(onefourth,r),
502                                                   prod));
503             t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
504             t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
505             t4            = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij));
506             t4            = _mm_and_pd(t4,obc_mask3);
507             t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
508                         
509             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_and_pd(t1,obc_mask1));
510             
511             t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
512                                        _mm_mul_pd(prod,lij3));
513             t1            = _mm_sub_pd(t1,
514                                        _mm_mul_pd(onefourth,
515                                                   _mm_add_pd(_mm_mul_pd(lij,rinv),
516                                                              _mm_mul_pd(lij3,r))));
517             t2            = _mm_mul_pd(onefourth,
518                                        _mm_add_pd(_mm_mul_pd(uij,rinv),
519                                                   _mm_mul_pd(uij3,r)));
520             t2            = _mm_sub_pd(t2,
521                                        _mm_add_pd(_mm_mul_pd(half,uij2),
522                                                   _mm_mul_pd(prod,uij3)));
523             t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
524                                        _mm_mul_pd(rinv,rinv));
525             t3            = _mm_sub_pd(t3,
526                                        _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
527                                                   _mm_add_pd(one,
528                                                              _mm_mul_pd(sk2_rinv,rinv))));
529             t1            = _mm_mul_pd(rinv,
530                                        _mm_add_pd(_mm_mul_pd(dlij,t1),
531                                                   _mm_add_pd(t2,t3)));
532             
533             dadx2         = _mm_and_pd(t1,obc_mask1);
534             
535             _mm_store_pd(dadx,dadx1);
536             dadx += 2;
537             _mm_store_pd(dadx,dadx2);
538             dadx += 2;
539         } /* end normal inner loop */
540         
541                 if(k<nj1)
542                 {
543                         jnrA        = jjnr[k];   
544                         
545             j3A         = 3*jnrA;  
546             
547             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
548             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
549             GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA,sk_aj);
550                         
551             dx    = _mm_sub_sd(ix, jx);
552                         dy    = _mm_sub_sd(iy, jy);
553                         dz    = _mm_sub_sd(iz, jz);
554                         
555             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
556             
557             rinv        = gmx_mm_invsqrt_pd(rsq);
558             r           = _mm_mul_sd(rsq,rinv);
559             
560                         /* Compute raj_inv aj1-4 */
561             raj_inv     = gmx_mm_inv_pd(raj);
562             
563             /* Evaluate influence of atom aj -> ai */
564             t1            = _mm_add_sd(r,sk_aj);
565             t2            = _mm_sub_sd(r,sk_aj);
566             t3            = _mm_sub_sd(sk_aj,r);
567             obc_mask1     = _mm_cmplt_sd(rai, t1);
568             obc_mask2     = _mm_cmplt_sd(rai, t2);
569             obc_mask3     = _mm_cmplt_sd(rai, t3);
570             
571             uij           = gmx_mm_inv_pd(t1);
572             lij           = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
573                                       _mm_andnot_pd(obc_mask2,rai_inv));
574             dlij          = _mm_and_pd(one,obc_mask2);
575             uij2          = _mm_mul_sd(uij, uij);
576             uij3          = _mm_mul_sd(uij2,uij);
577             lij2          = _mm_mul_sd(lij, lij);
578             lij3          = _mm_mul_sd(lij2,lij);
579             
580             diff2         = _mm_sub_sd(uij2,lij2);
581             lij_inv       = gmx_mm_invsqrt_pd(lij2);
582             sk2_aj        = _mm_mul_sd(sk_aj,sk_aj);
583             sk2_rinv      = _mm_mul_sd(sk2_aj,rinv);
584             prod          = _mm_mul_sd(onefourth,sk2_rinv);
585             
586             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
587             
588             t1            = _mm_sub_sd(lij,uij);
589             t2            = _mm_mul_sd(diff2,
590                                        _mm_sub_sd(_mm_mul_pd(onefourth,r),
591                                                   prod));
592             t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
593             t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
594             t4            = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij));
595             t4            = _mm_and_pd(t4,obc_mask3);
596             t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
597             
598             sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) );
599             
600             t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
601                                        _mm_mul_sd(prod,lij3));
602             t1            = _mm_sub_sd(t1,
603                                        _mm_mul_sd(onefourth,
604                                                   _mm_add_sd(_mm_mul_sd(lij,rinv),
605                                                              _mm_mul_sd(lij3,r))));
606             t2            = _mm_mul_sd(onefourth,
607                                        _mm_add_sd(_mm_mul_sd(uij,rinv),
608                                                   _mm_mul_sd(uij3,r)));
609             t2            = _mm_sub_sd(t2,
610                                        _mm_add_sd(_mm_mul_sd(half,uij2),
611                                                   _mm_mul_sd(prod,uij3)));
612             t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
613                                        _mm_mul_sd(rinv,rinv));
614             t3            = _mm_sub_sd(t3,
615                                        _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
616                                                   _mm_add_sd(one,
617                                                              _mm_mul_sd(sk2_rinv,rinv))));
618             t1            = _mm_mul_sd(rinv,
619                                        _mm_add_sd(_mm_mul_sd(dlij,t1),
620                                                   _mm_add_pd(t2,t3)));
621             
622             dadx1         = _mm_and_pd(t1,obc_mask1);
623             
624             /* Evaluate influence of atom ai -> aj */
625             t1            = _mm_add_sd(r,sk_ai);
626             t2            = _mm_sub_sd(r,sk_ai);
627             t3            = _mm_sub_sd(sk_ai,r);
628             obc_mask1     = _mm_cmplt_sd(raj, t1);
629             obc_mask2     = _mm_cmplt_sd(raj, t2);
630             obc_mask3     = _mm_cmplt_sd(raj, t3);
631             
632             uij           = gmx_mm_inv_pd(t1);
633             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
634                                       _mm_andnot_pd(obc_mask2,raj_inv));
635             dlij          = _mm_and_pd(one,obc_mask2);
636             uij2          = _mm_mul_sd(uij, uij);
637             uij3          = _mm_mul_sd(uij2,uij);
638             lij2          = _mm_mul_sd(lij, lij);
639             lij3          = _mm_mul_sd(lij2,lij);
640             
641             diff2         = _mm_sub_sd(uij2,lij2);
642             lij_inv       = gmx_mm_invsqrt_pd(lij2);
643             sk2_rinv      = _mm_mul_sd(sk2_ai,rinv);
644             prod          = _mm_mul_sd(onefourth,sk2_rinv);
645             
646             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
647             
648             t1            = _mm_sub_sd(lij,uij);
649             t2            = _mm_mul_sd(diff2,
650                                        _mm_sub_sd(_mm_mul_sd(onefourth,r),
651                                                   prod));
652             t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
653             t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
654             t4            = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij));
655             t4            = _mm_and_pd(t4,obc_mask3);
656             t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
657             
658             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_and_pd(t1,obc_mask1));
659             
660             t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
661                                        _mm_mul_sd(prod,lij3));
662             t1            = _mm_sub_sd(t1,
663                                        _mm_mul_sd(onefourth,
664                                                   _mm_add_sd(_mm_mul_sd(lij,rinv),
665                                                              _mm_mul_sd(lij3,r))));
666             t2            = _mm_mul_sd(onefourth,
667                                        _mm_add_sd(_mm_mul_sd(uij,rinv),
668                                                   _mm_mul_sd(uij3,r)));
669             t2            = _mm_sub_sd(t2,
670                                        _mm_add_sd(_mm_mul_sd(half,uij2),
671                                                   _mm_mul_sd(prod,uij3)));
672             t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
673                                        _mm_mul_sd(rinv,rinv));
674             t3            = _mm_sub_sd(t3,
675                                        _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
676                                                   _mm_add_sd(one,
677                                                              _mm_mul_sd(sk2_rinv,rinv))));
678             t1            = _mm_mul_sd(rinv,
679                                        _mm_add_sd(_mm_mul_sd(dlij,t1),
680                                                   _mm_add_sd(t2,t3)));
681             
682             dadx2         = _mm_and_pd(t1,obc_mask1);
683             
684             _mm_store_pd(dadx,dadx1);
685             dadx += 2;
686             _mm_store_pd(dadx,dadx2);
687             dadx += 2;
688         } 
689         gmx_mm_update_1pot_pd(sum_ai,work+ii);
690         
691         }
692         
693         /* Parallel summations */
694         if(PARTDECOMP(cr))
695         {
696                 gmx_sum(natoms, work, cr);
697         }
698         else if(DOMAINDECOMP(cr))
699         {
700                 dd_atom_sum_real(cr->dd, work);
701         }
702         
703     if(gb_algorithm==egbHCT)
704     {
705         /* HCT */
706         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
707         {
708                         if(born->use[i] != 0)
709             {
710                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset; 
711                 sum     = 1.0/rr - work[i];
712                 min_rad = rr + doffset;
713                 rad     = 1.0/sum; 
714                 
715                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
716                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
717             }
718         }
719         
720         /* Extra communication required for DD */
721         if(DOMAINDECOMP(cr))
722         {
723             dd_atom_spread_real(cr->dd, born->bRad);
724             dd_atom_spread_real(cr->dd, fr->invsqrta);
725         }
726     }
727     else
728     {
729         /* OBC */
730         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
731         {
732                         if(born->use[i] != 0)
733             {
734                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
735                 rr_inv2 = 1.0/rr;
736                 rr      = rr-doffset; 
737                 rr_inv  = 1.0/rr;
738                 sum     = rr * work[i];
739                 sum2    = sum  * sum;
740                 sum3    = sum2 * sum;
741                 
742                 tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
743                 born->bRad[i] = rr_inv - tsum*rr_inv2;
744                 born->bRad[i] = 1.0 / born->bRad[i];
745                 
746                 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
747                 
748                 tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
749                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
750             }
751         }
752         /* Extra (local) communication required for DD */
753         if(DOMAINDECOMP(cr))
754         {
755             dd_atom_spread_real(cr->dd, born->bRad);
756             dd_atom_spread_real(cr->dd, fr->invsqrta);
757             dd_atom_spread_real(cr->dd, born->drobc);
758         }
759     }
760     
761         
762         
763         return 0;
764 }
765
766
767 int
768 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, 
769                               double *x, double *f, double *fshift, double *shiftvec,
770                               int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)                                             
771 {
772         int    i,k,n,ii,jnr,ii3,is3,nj0,nj1,n0,n1;
773         int        jnrA,jnrB;
774     int    j3A,j3B;
775         int *  jjnr;
776     
777         double   rbi,shX,shY,shZ;
778         double   *rb;
779     
780         __m128d ix,iy,iz;
781         __m128d jx,jy,jz;
782         __m128d fix,fiy,fiz;
783         __m128d dx,dy,dz;
784     __m128d tx,ty,tz;
785     
786         __m128d rbai,rbaj,f_gb, f_gb_ai;
787         __m128d xmm1,xmm2,xmm3;
788         
789         const __m128d two = _mm_set1_pd(2.0);
790     
791         rb     = born->work; 
792     
793   jjnr   = nl->jjnr;
794   
795         /* Loop to get the proper form for the Born radius term, sse style */   
796   n0 = 0;
797   n1 = natoms;
798     
799         if(gb_algorithm==egbSTILL) 
800         {
801                 for(i=n0;i<n1;i++)
802                 {
803       rbi   = born->bRad[i];
804                         rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
805                 }
806         }
807         else if(gb_algorithm==egbHCT) 
808         {
809                 for(i=n0;i<n1;i++)
810                 {
811       rbi   = born->bRad[i];
812                         rb[i] = rbi * rbi * dvda[i];
813                 }
814         }
815         else if(gb_algorithm==egbOBC) 
816         {
817                 for(i=n0;i<n1;i++)
818                 {
819       rbi   = born->bRad[i];
820                         rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
821                 }
822         }
823     
824   jz = _mm_setzero_pd();
825   
826   n = j3A = j3B = 0;
827   
828         for(i=0;i<nl->nri;i++)
829         {
830     ii     = nl->iinr[i];
831                 ii3        = ii*3;
832     is3    = 3*nl->shift[i];     
833     shX    = shiftvec[is3];  
834     shY    = shiftvec[is3+1];
835     shZ    = shiftvec[is3+2];
836     nj0    = nl->jindex[i];      
837     nj1    = nl->jindex[i+1];    
838     
839     ix     = _mm_set1_pd(shX+x[ii3+0]);
840                 iy     = _mm_set1_pd(shY+x[ii3+1]);
841                 iz     = _mm_set1_pd(shZ+x[ii3+2]);
842     
843                 rbai   = _mm_load1_pd(rb+ii);                   
844                 fix    = _mm_setzero_pd();
845                 fiy    = _mm_setzero_pd();
846                 fiz    = _mm_setzero_pd();      
847     
848         
849     for(k=nj0;k<nj1-1;k+=2)
850                 {
851                         jnrA        = jjnr[k];   
852                         jnrB        = jjnr[k+1];
853       
854       j3A         = 3*jnrA;  
855                         j3B         = 3*jnrB;
856             
857       GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
858       
859                         dx          = _mm_sub_pd(ix,jx);
860                         dy          = _mm_sub_pd(iy,jy);
861                         dz          = _mm_sub_pd(iz,jz);
862       
863       GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);
864       
865                         /* load chain rule terms for j1-4 */
866                         f_gb        = _mm_load_pd(dadx);
867                         dadx += 2;
868                         f_gb_ai     = _mm_load_pd(dadx);
869                         dadx += 2;
870                         
871       /* calculate scalar force */
872       f_gb    = _mm_mul_pd(f_gb,rbai); 
873       f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
874       f_gb    = _mm_add_pd(f_gb,f_gb_ai);
875       
876       tx     = _mm_mul_pd(f_gb,dx);
877       ty     = _mm_mul_pd(f_gb,dy);
878       tz     = _mm_mul_pd(f_gb,dz);
879       
880       fix    = _mm_add_pd(fix,tx);
881       fiy    = _mm_add_pd(fiy,ty);
882       fiz    = _mm_add_pd(fiz,tz);
883       
884       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
885                 }
886     
887                 /*deal with odd elements */
888                 if(k<nj1) 
889         {
890           jnrA        = jjnr[k];   
891           j3A         = 3*jnrA;  
892           
893           GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
894           
895           dx          = _mm_sub_sd(ix,jx);
896           dy          = _mm_sub_sd(iy,jy);
897           dz          = _mm_sub_sd(iz,jz);
898           
899           GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);
900           
901           /* load chain rule terms */
902           f_gb        = _mm_load_pd(dadx);
903           dadx += 2;
904           f_gb_ai     = _mm_load_pd(dadx);
905           dadx += 2;
906           
907           /* calculate scalar force */
908           f_gb    = _mm_mul_sd(f_gb,rbai); 
909           f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
910           f_gb    = _mm_add_sd(f_gb,f_gb_ai);
911           
912           tx     = _mm_mul_sd(f_gb,dx);
913           ty     = _mm_mul_sd(f_gb,dy);
914           tz     = _mm_mul_sd(f_gb,dz);
915           
916           fix    = _mm_add_sd(fix,tx);
917           fiy    = _mm_add_sd(fiy,ty);
918           fiz    = _mm_add_sd(fiz,tz);
919           
920           GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
921         } 
922     
923                 /* fix/fiy/fiz now contain four partial force terms, that all should be
924      * added to the i particle forces and shift forces. 
925      */
926                 gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,f+ii3,fshift+is3);
927         }       
928   
929         return 0;       
930 }
931
932 #else
933 /* keep compiler happy */
934 int genborn_sse2_dummy;
935
936 #endif /* SSE2 intrinsics available */
937