Merge release-4-6 (commit 'Ic142a690')
[alexxy/gromacs.git] / src / gromacs / mdlib / genborn_sse2_double.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  * 
3  *                This source code is part of
4  * 
5  *                 G   R   O   M   A   C   S
6  * 
7  *          GROningen MAchine for Chemical Simulations
8  * 
9  *                        VERSION 4.5
10  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
11  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
12  * Copyright (c) 2001-2008, The GROMACS development team,
13  * check out http://www.gromacs.org for more information.
14  
15  * This program is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU General Public License
17  * as published by the Free Software Foundation; either version 2
18  * of the License, or (at your option) any later version.
19  * 
20  * If you want to redistribute modifications, please consider that
21  * scientific software is very special. Version control is crucial -
22  * bugs must be traceable. We will be happy to consider code for
23  * inclusion in the official distribution, but derived work must not
24  * be called official GROMACS. Details are found in the README & COPYING
25  * files - if they are missing, get the official version at www.gromacs.org.
26  * 
27  * To help us fund GROMACS development, we humbly ask that you cite
28  * the papers on the package - you can find them in the top README file.
29  * 
30  * For more info, check our website at http://www.gromacs.org
31  * 
32  * And Hey:
33  * Groningen Machine for Chemical Simulation
34  */
35 #ifdef HAVE_CONFIG_H
36 #include <config.h>
37 #endif
38
39 #include <math.h>
40 #include <string.h>
41
42 #include "typedefs.h"
43 #include "smalloc.h"
44 #include "genborn.h"
45 #include "vec.h"
46 #include "grompp.h"
47 #include "pdbio.h"
48 #include "names.h"
49 #include "physics.h"
50 #include "domdec.h"
51 #include "partdec.h"
52 #include "network.h"
53 #include "gmx_fatal.h"
54 #include "mtop_util.h"
55 #include "genborn.h"
56
57 #ifdef GMX_LIB_MPI
58 #include <mpi.h>
59 #endif
60 #ifdef GMX_THREAD_MPI
61 #include "tmpi.h"
62 #endif
63
64 /* Only compile this file if SSE2 intrinsics are available */
65 #if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) ) 
66 #include <gmx_sse2_double.h>
67 #include <xmmintrin.h>
68 #include <emmintrin.h>
69
70 #include "genborn_sse2_double.h"
71
72 int 
73 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
74                               int natoms, gmx_localtop_t *top,
75                               const t_atomtypes *atype, double *x, t_nblist *nl,
76                               gmx_genborn_t *born)
77 {
78         int i,k,n,ii,is3,ii3,nj0,nj1,offset;
79         int jnrA,jnrB,j3A,j3B;
80     int *mdtype;
81         double shX,shY,shZ;
82     int *jjnr;
83     double *shiftvec;
84     
85         double gpi_ai,gpi2;
86         double factor;
87         double *gb_radius;
88     double *vsolv;
89     double *work;
90     double *dadx;
91     
92         __m128d ix,iy,iz;
93         __m128d jx,jy,jz;
94         __m128d dx,dy,dz;
95         __m128d tx,ty,tz;
96         __m128d rsq,rinv,rinv2,rinv4,rinv6;
97         __m128d ratio,gpi,rai,raj,vai,vaj,rvdw;
98         __m128d ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
99         __m128d mask,icf4,icf6,mask_cmp;
100             
101         const __m128d half   = _mm_set1_pd(0.5);
102         const __m128d three  = _mm_set1_pd(3.0);
103         const __m128d one    = _mm_set1_pd(1.0);
104         const __m128d two    = _mm_set1_pd(2.0);
105         const __m128d zero   = _mm_set1_pd(0.0);
106         const __m128d four   = _mm_set1_pd(4.0);
107         
108         const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
109         const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
110         const __m128d still_p4     = _mm_set1_pd(STILL_P4);
111     
112         factor  = 0.5 * ONE_4PI_EPS0;
113     
114     gb_radius = born->gb_radius;
115     vsolv     = born->vsolv;
116     work      = born->gpol_still_work;
117         jjnr      = nl->jjnr;
118     shiftvec  = fr->shift_vec[0];
119     dadx      = fr->dadx;
120     
121         jnrA = jnrB = 0;
122     jx = _mm_setzero_pd();
123     jy = _mm_setzero_pd();
124     jz = _mm_setzero_pd();
125     
126         n = 0;
127     
128         for(i=0;i<natoms;i++)
129         {
130                 work[i]=0;
131         }
132     
133         for(i=0;i<nl->nri;i++)
134         {
135         ii     = nl->iinr[i];
136                 ii3        = ii*3;
137         is3    = 3*nl->shift[i];     
138         shX    = shiftvec[is3];  
139         shY    = shiftvec[is3+1];
140         shZ    = shiftvec[is3+2];
141         nj0    = nl->jindex[i];      
142         nj1    = nl->jindex[i+1];    
143         
144         ix     = _mm_set1_pd(shX+x[ii3+0]);
145                 iy     = _mm_set1_pd(shY+x[ii3+1]);
146                 iz     = _mm_set1_pd(shZ+x[ii3+2]);
147                 
148
149                 /* Polarization energy for atom ai */
150                 gpi    = _mm_setzero_pd();
151                 
152         rai     = _mm_load1_pd(gb_radius+ii);
153         prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
154
155                 for(k=nj0;k<nj1-1;k+=2)
156                 {
157                         jnrA        = jjnr[k];   
158                         jnrB        = jjnr[k+1];
159             
160             j3A         = 3*jnrA;  
161                         j3B         = 3*jnrB;
162             
163             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
164             
165             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
166                         GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA,vsolv+jnrB,vaj);
167             
168                         dx          = _mm_sub_pd(ix,jx);
169                         dy          = _mm_sub_pd(iy,jy);
170                         dz          = _mm_sub_pd(iz,jz);
171             
172             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
173             rinv        = gmx_mm_invsqrt_pd(rsq);
174             rinv2       = _mm_mul_pd(rinv,rinv);
175             rinv4       = _mm_mul_pd(rinv2,rinv2);
176             rinv6       = _mm_mul_pd(rinv4,rinv2);
177             
178             rvdw        = _mm_add_pd(rai,raj);
179             ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
180             
181             mask_cmp    = _mm_cmple_pd(ratio,still_p5inv);
182
183             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
184             if( 0 == _mm_movemask_pd(mask_cmp) )
185             {
186                 /* if ratio>still_p5inv for ALL elements */
187                 ccf         = one;
188                 dccf        = _mm_setzero_pd();
189             }
190             else 
191             {
192                 ratio       = _mm_min_pd(ratio,still_p5inv);
193                 theta       = _mm_mul_pd(ratio,still_pip5);
194                 gmx_mm_sincos_pd(theta,&sinq,&cosq);
195                 term        = _mm_mul_pd(half,_mm_sub_pd(one,cosq));
196                 ccf         = _mm_mul_pd(term,term);
197                 dccf        = _mm_mul_pd(_mm_mul_pd(two,term),
198                                          _mm_mul_pd(sinq,theta));
199             }
200
201             prod        = _mm_mul_pd(still_p4,vaj);
202             icf4        = _mm_mul_pd(ccf,rinv4);
203             icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four,ccf),dccf), rinv6);
204                         
205             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_mul_pd(prod_ai,icf4));
206             
207             gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod,icf4) );
208             
209             _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
210             dadx+=2;
211             _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
212             dadx+=2;
213                 } 
214         
215         if(k<nj1)
216                 {
217                         jnrA        = jjnr[k];   
218             
219             j3A         = 3*jnrA;  
220             
221             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
222             
223             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
224                         GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA,vaj);
225             
226                         dx          = _mm_sub_sd(ix,jx);
227                         dy          = _mm_sub_sd(iy,jy);
228                         dz          = _mm_sub_sd(iz,jz);
229             
230             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
231             rinv        = gmx_mm_invsqrt_pd(rsq);
232             rinv2       = _mm_mul_sd(rinv,rinv);
233             rinv4       = _mm_mul_sd(rinv2,rinv2);
234             rinv6       = _mm_mul_sd(rinv4,rinv2);
235             
236             rvdw        = _mm_add_sd(rai,raj);
237             ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
238             
239             mask_cmp    = _mm_cmple_sd(ratio,still_p5inv);
240             
241             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
242             if( 0 == _mm_movemask_pd(mask_cmp) )
243             {
244                 /* if ratio>still_p5inv for ALL elements */
245                 ccf         = one;
246                 dccf        = _mm_setzero_pd();
247             }
248             else 
249             {
250                 ratio       = _mm_min_sd(ratio,still_p5inv);
251                 theta       = _mm_mul_sd(ratio,still_pip5);
252                 gmx_mm_sincos_pd(theta,&sinq,&cosq);
253                 term        = _mm_mul_sd(half,_mm_sub_sd(one,cosq));
254                 ccf         = _mm_mul_sd(term,term);
255                 dccf        = _mm_mul_sd(_mm_mul_sd(two,term),
256                                          _mm_mul_sd(sinq,theta));
257             }
258             
259             prod        = _mm_mul_sd(still_p4,vaj);
260             icf4        = _mm_mul_sd(ccf,rinv4);
261             icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four,ccf),dccf), rinv6);
262
263             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_mul_sd(prod_ai,icf4));
264             
265             gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod,icf4) );
266             
267             _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
268             dadx+=2;
269             _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
270             dadx+=2;
271                 } 
272         gmx_mm_update_1pot_pd(gpi,work+ii);
273         }
274     
275         /* Sum up the polarization energy from other nodes */
276         if(PARTDECOMP(cr))
277         {
278                 gmx_sum(natoms, work, cr);
279         }
280         else if(DOMAINDECOMP(cr))
281         {
282                 dd_atom_sum_real(cr->dd, work);
283         }
284         
285         /* Compute the radii */
286         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
287         {               
288                 if(born->use[i] != 0)
289                 {
290                         gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
291                         gpi2             = gpi_ai * gpi_ai;
292                         born->bRad[i]   = factor*gmx_invsqrt(gpi2);
293                         fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
294                 }
295         }
296     
297         /* Extra (local) communication required for DD */
298         if(DOMAINDECOMP(cr))
299         {
300                 dd_atom_spread_real(cr->dd, born->bRad);
301                 dd_atom_spread_real(cr->dd, fr->invsqrta);
302         }
303     
304         return 0;       
305 }
306
307
308 int 
309 calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
310                                 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
311 {
312         int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
313     int jnrA,jnrB;
314     int j3A,j3B;
315         double shX,shY,shZ;
316         double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
317         double sum_ai2, sum_ai3,tsum,tchain,doffset;
318         double *obc_param;
319     double *gb_radius;
320     double *work;
321     int *  jjnr;
322     double *dadx;
323     double *shiftvec;
324     double min_rad,rad;
325     
326         __m128d ix,iy,iz,jx,jy,jz;
327         __m128d dx,dy,dz,t1,t2,t3,t4;
328         __m128d rsq,rinv,r;
329         __m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
330         __m128d uij,lij2,uij2,lij3,uij3,diff2;
331         __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
332         __m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
333         __m128d dadx1,dadx2;
334     __m128d logterm;
335         __m128d mask;
336         __m128d obc_mask1,obc_mask2,obc_mask3;    
337     
338     __m128d oneeighth   = _mm_set1_pd(0.125);
339     __m128d onefourth   = _mm_set1_pd(0.25);
340     
341         const __m128d half  = _mm_set1_pd(0.5);
342         const __m128d three = _mm_set1_pd(3.0);
343         const __m128d one   = _mm_set1_pd(1.0);
344         const __m128d two   = _mm_set1_pd(2.0);
345         const __m128d zero  = _mm_set1_pd(0.0);
346         const __m128d neg   = _mm_set1_pd(-1.0);
347         
348         /* Set the dielectric offset */
349         doffset   = born->gb_doffset;
350         gb_radius = born->gb_radius;
351     obc_param = born->param;
352     work      = born->gpol_hct_work;
353     jjnr      = nl->jjnr;
354     dadx      = fr->dadx;
355     shiftvec  = fr->shift_vec[0];
356     
357     jx        = _mm_setzero_pd();
358     jy        = _mm_setzero_pd();
359     jz        = _mm_setzero_pd();
360     
361     jnrA = jnrB = 0;
362     
363         for(i=0;i<born->nr;i++)
364         {
365                 work[i] = 0;
366         }
367         
368         for(i=0;i<nl->nri;i++)
369         {
370         ii     = nl->iinr[i];
371                 ii3        = ii*3;
372         is3    = 3*nl->shift[i];     
373         shX    = shiftvec[is3];  
374         shY    = shiftvec[is3+1];
375         shZ    = shiftvec[is3+2];
376         nj0    = nl->jindex[i];      
377         nj1    = nl->jindex[i+1];    
378         
379         ix     = _mm_set1_pd(shX+x[ii3+0]);
380                 iy     = _mm_set1_pd(shY+x[ii3+1]);
381                 iz     = _mm_set1_pd(shZ+x[ii3+2]);
382                         
383                 rai    = _mm_load1_pd(gb_radius+ii);
384                 rai_inv= gmx_mm_inv_pd(rai);
385         
386                 sum_ai = _mm_setzero_pd();
387                 
388                 sk_ai  = _mm_load1_pd(born->param+ii);
389                 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
390         
391                 for(k=nj0;k<nj1-1;k+=2)
392                 {
393                         jnrA        = jjnr[k];   
394                         jnrB        = jjnr[k+1];
395                         
396             j3A         = 3*jnrA;  
397                         j3B         = 3*jnrB;
398             
399             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
400             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
401             GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA,obc_param+jnrB,sk_aj);
402                         
403             dx    = _mm_sub_pd(ix, jx);
404                         dy    = _mm_sub_pd(iy, jy);
405                         dz    = _mm_sub_pd(iz, jz);
406                         
407             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
408             
409             rinv        = gmx_mm_invsqrt_pd(rsq);
410             r           = _mm_mul_pd(rsq,rinv);
411             
412                         /* Compute raj_inv aj1-4 */
413             raj_inv     = gmx_mm_inv_pd(raj);
414             
415             /* Evaluate influence of atom aj -> ai */
416             t1            = _mm_add_pd(r,sk_aj);
417             t2            = _mm_sub_pd(r,sk_aj);
418             t3            = _mm_sub_pd(sk_aj,r);
419             obc_mask1     = _mm_cmplt_pd(rai, t1);
420             obc_mask2     = _mm_cmplt_pd(rai, t2);
421             obc_mask3     = _mm_cmplt_pd(rai, t3);
422             
423             uij           = gmx_mm_inv_pd(t1);
424             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
425                                       _mm_andnot_pd(obc_mask2,rai_inv));
426             dlij          = _mm_and_pd(one,obc_mask2);
427             uij2          = _mm_mul_pd(uij, uij);
428             uij3          = _mm_mul_pd(uij2,uij);
429             lij2          = _mm_mul_pd(lij, lij);
430             lij3          = _mm_mul_pd(lij2,lij);
431                         
432             diff2         = _mm_sub_pd(uij2,lij2);
433             lij_inv       = gmx_mm_invsqrt_pd(lij2);
434             sk2_aj        = _mm_mul_pd(sk_aj,sk_aj);
435             sk2_rinv      = _mm_mul_pd(sk2_aj,rinv);
436             prod          = _mm_mul_pd(onefourth,sk2_rinv);
437                         
438             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
439             
440             t1            = _mm_sub_pd(lij,uij);
441             t2            = _mm_mul_pd(diff2,
442                                        _mm_sub_pd(_mm_mul_pd(onefourth,r),
443                                                   prod));
444             t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
445             t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
446             t4            = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij));
447             t4            = _mm_and_pd(t4,obc_mask3);
448             t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
449                         
450             sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) );
451             
452             t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
453                                        _mm_mul_pd(prod,lij3));
454             t1            = _mm_sub_pd(t1,
455                                        _mm_mul_pd(onefourth,
456                                                   _mm_add_pd(_mm_mul_pd(lij,rinv),
457                                                              _mm_mul_pd(lij3,r))));
458             t2            = _mm_mul_pd(onefourth,
459                                        _mm_add_pd(_mm_mul_pd(uij,rinv),
460                                                   _mm_mul_pd(uij3,r)));
461             t2            = _mm_sub_pd(t2,
462                                        _mm_add_pd(_mm_mul_pd(half,uij2),
463                                                   _mm_mul_pd(prod,uij3)));
464             t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
465                                        _mm_mul_pd(rinv,rinv));
466             t3            = _mm_sub_pd(t3,
467                                        _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
468                                                   _mm_add_pd(one,
469                                                              _mm_mul_pd(sk2_rinv,rinv))));
470             t1            = _mm_mul_pd(rinv,
471                                        _mm_add_pd(_mm_mul_pd(dlij,t1),
472                                                   _mm_add_pd(t2,t3)));
473             
474             dadx1         = _mm_and_pd(t1,obc_mask1);
475             
476             /* Evaluate influence of atom ai -> aj */
477             t1            = _mm_add_pd(r,sk_ai);
478             t2            = _mm_sub_pd(r,sk_ai);
479             t3            = _mm_sub_pd(sk_ai,r);
480             obc_mask1     = _mm_cmplt_pd(raj, t1);
481             obc_mask2     = _mm_cmplt_pd(raj, t2);
482             obc_mask3     = _mm_cmplt_pd(raj, t3);
483             
484             uij           = gmx_mm_inv_pd(t1);
485             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
486                                       _mm_andnot_pd(obc_mask2,raj_inv));
487             dlij          = _mm_and_pd(one,obc_mask2);
488             uij2          = _mm_mul_pd(uij, uij);
489             uij3          = _mm_mul_pd(uij2,uij);
490             lij2          = _mm_mul_pd(lij, lij);
491             lij3          = _mm_mul_pd(lij2,lij);
492                         
493             diff2         = _mm_sub_pd(uij2,lij2);
494             lij_inv       = gmx_mm_invsqrt_pd(lij2);
495             sk2_rinv      = _mm_mul_pd(sk2_ai,rinv);
496             prod          = _mm_mul_pd(onefourth,sk2_rinv);
497                         
498             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
499             
500             t1            = _mm_sub_pd(lij,uij);
501             t2            = _mm_mul_pd(diff2,
502                                        _mm_sub_pd(_mm_mul_pd(onefourth,r),
503                                                   prod));
504             t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
505             t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
506             t4            = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij));
507             t4            = _mm_and_pd(t4,obc_mask3);
508             t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
509                         
510             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_and_pd(t1,obc_mask1));
511             
512             t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
513                                        _mm_mul_pd(prod,lij3));
514             t1            = _mm_sub_pd(t1,
515                                        _mm_mul_pd(onefourth,
516                                                   _mm_add_pd(_mm_mul_pd(lij,rinv),
517                                                              _mm_mul_pd(lij3,r))));
518             t2            = _mm_mul_pd(onefourth,
519                                        _mm_add_pd(_mm_mul_pd(uij,rinv),
520                                                   _mm_mul_pd(uij3,r)));
521             t2            = _mm_sub_pd(t2,
522                                        _mm_add_pd(_mm_mul_pd(half,uij2),
523                                                   _mm_mul_pd(prod,uij3)));
524             t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
525                                        _mm_mul_pd(rinv,rinv));
526             t3            = _mm_sub_pd(t3,
527                                        _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
528                                                   _mm_add_pd(one,
529                                                              _mm_mul_pd(sk2_rinv,rinv))));
530             t1            = _mm_mul_pd(rinv,
531                                        _mm_add_pd(_mm_mul_pd(dlij,t1),
532                                                   _mm_add_pd(t2,t3)));
533             
534             dadx2         = _mm_and_pd(t1,obc_mask1);
535             
536             _mm_store_pd(dadx,dadx1);
537             dadx += 2;
538             _mm_store_pd(dadx,dadx2);
539             dadx += 2;
540         } /* end normal inner loop */
541         
542                 if(k<nj1)
543                 {
544                         jnrA        = jjnr[k];   
545                         
546             j3A         = 3*jnrA;  
547             
548             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
549             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
550             GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA,sk_aj);
551                         
552             dx    = _mm_sub_sd(ix, jx);
553                         dy    = _mm_sub_sd(iy, jy);
554                         dz    = _mm_sub_sd(iz, jz);
555                         
556             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
557             
558             rinv        = gmx_mm_invsqrt_pd(rsq);
559             r           = _mm_mul_sd(rsq,rinv);
560             
561                         /* Compute raj_inv aj1-4 */
562             raj_inv     = gmx_mm_inv_pd(raj);
563             
564             /* Evaluate influence of atom aj -> ai */
565             t1            = _mm_add_sd(r,sk_aj);
566             t2            = _mm_sub_sd(r,sk_aj);
567             t3            = _mm_sub_sd(sk_aj,r);
568             obc_mask1     = _mm_cmplt_sd(rai, t1);
569             obc_mask2     = _mm_cmplt_sd(rai, t2);
570             obc_mask3     = _mm_cmplt_sd(rai, t3);
571             
572             uij           = gmx_mm_inv_pd(t1);
573             lij           = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
574                                       _mm_andnot_pd(obc_mask2,rai_inv));
575             dlij          = _mm_and_pd(one,obc_mask2);
576             uij2          = _mm_mul_sd(uij, uij);
577             uij3          = _mm_mul_sd(uij2,uij);
578             lij2          = _mm_mul_sd(lij, lij);
579             lij3          = _mm_mul_sd(lij2,lij);
580             
581             diff2         = _mm_sub_sd(uij2,lij2);
582             lij_inv       = gmx_mm_invsqrt_pd(lij2);
583             sk2_aj        = _mm_mul_sd(sk_aj,sk_aj);
584             sk2_rinv      = _mm_mul_sd(sk2_aj,rinv);
585             prod          = _mm_mul_sd(onefourth,sk2_rinv);
586             
587             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
588             
589             t1            = _mm_sub_sd(lij,uij);
590             t2            = _mm_mul_sd(diff2,
591                                        _mm_sub_sd(_mm_mul_pd(onefourth,r),
592                                                   prod));
593             t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
594             t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
595             t4            = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij));
596             t4            = _mm_and_pd(t4,obc_mask3);
597             t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
598             
599             sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) );
600             
601             t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
602                                        _mm_mul_sd(prod,lij3));
603             t1            = _mm_sub_sd(t1,
604                                        _mm_mul_sd(onefourth,
605                                                   _mm_add_sd(_mm_mul_sd(lij,rinv),
606                                                              _mm_mul_sd(lij3,r))));
607             t2            = _mm_mul_sd(onefourth,
608                                        _mm_add_sd(_mm_mul_sd(uij,rinv),
609                                                   _mm_mul_sd(uij3,r)));
610             t2            = _mm_sub_sd(t2,
611                                        _mm_add_sd(_mm_mul_sd(half,uij2),
612                                                   _mm_mul_sd(prod,uij3)));
613             t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
614                                        _mm_mul_sd(rinv,rinv));
615             t3            = _mm_sub_sd(t3,
616                                        _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
617                                                   _mm_add_sd(one,
618                                                              _mm_mul_sd(sk2_rinv,rinv))));
619             t1            = _mm_mul_sd(rinv,
620                                        _mm_add_sd(_mm_mul_sd(dlij,t1),
621                                                   _mm_add_pd(t2,t3)));
622             
623             dadx1         = _mm_and_pd(t1,obc_mask1);
624             
625             /* Evaluate influence of atom ai -> aj */
626             t1            = _mm_add_sd(r,sk_ai);
627             t2            = _mm_sub_sd(r,sk_ai);
628             t3            = _mm_sub_sd(sk_ai,r);
629             obc_mask1     = _mm_cmplt_sd(raj, t1);
630             obc_mask2     = _mm_cmplt_sd(raj, t2);
631             obc_mask3     = _mm_cmplt_sd(raj, t3);
632             
633             uij           = gmx_mm_inv_pd(t1);
634             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
635                                       _mm_andnot_pd(obc_mask2,raj_inv));
636             dlij          = _mm_and_pd(one,obc_mask2);
637             uij2          = _mm_mul_sd(uij, uij);
638             uij3          = _mm_mul_sd(uij2,uij);
639             lij2          = _mm_mul_sd(lij, lij);
640             lij3          = _mm_mul_sd(lij2,lij);
641             
642             diff2         = _mm_sub_sd(uij2,lij2);
643             lij_inv       = gmx_mm_invsqrt_pd(lij2);
644             sk2_rinv      = _mm_mul_sd(sk2_ai,rinv);
645             prod          = _mm_mul_sd(onefourth,sk2_rinv);
646             
647             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
648             
649             t1            = _mm_sub_sd(lij,uij);
650             t2            = _mm_mul_sd(diff2,
651                                        _mm_sub_sd(_mm_mul_sd(onefourth,r),
652                                                   prod));
653             t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
654             t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
655             t4            = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij));
656             t4            = _mm_and_pd(t4,obc_mask3);
657             t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
658             
659             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_and_pd(t1,obc_mask1));
660             
661             t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
662                                        _mm_mul_sd(prod,lij3));
663             t1            = _mm_sub_sd(t1,
664                                        _mm_mul_sd(onefourth,
665                                                   _mm_add_sd(_mm_mul_sd(lij,rinv),
666                                                              _mm_mul_sd(lij3,r))));
667             t2            = _mm_mul_sd(onefourth,
668                                        _mm_add_sd(_mm_mul_sd(uij,rinv),
669                                                   _mm_mul_sd(uij3,r)));
670             t2            = _mm_sub_sd(t2,
671                                        _mm_add_sd(_mm_mul_sd(half,uij2),
672                                                   _mm_mul_sd(prod,uij3)));
673             t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
674                                        _mm_mul_sd(rinv,rinv));
675             t3            = _mm_sub_sd(t3,
676                                        _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
677                                                   _mm_add_sd(one,
678                                                              _mm_mul_sd(sk2_rinv,rinv))));
679             t1            = _mm_mul_sd(rinv,
680                                        _mm_add_sd(_mm_mul_sd(dlij,t1),
681                                                   _mm_add_sd(t2,t3)));
682             
683             dadx2         = _mm_and_pd(t1,obc_mask1);
684             
685             _mm_store_pd(dadx,dadx1);
686             dadx += 2;
687             _mm_store_pd(dadx,dadx2);
688             dadx += 2;
689         } 
690         gmx_mm_update_1pot_pd(sum_ai,work+ii);
691         
692         }
693         
694         /* Parallel summations */
695         if(PARTDECOMP(cr))
696         {
697                 gmx_sum(natoms, work, cr);
698         }
699         else if(DOMAINDECOMP(cr))
700         {
701                 dd_atom_sum_real(cr->dd, work);
702         }
703         
704     if(gb_algorithm==egbHCT)
705     {
706         /* HCT */
707         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
708         {
709                         if(born->use[i] != 0)
710             {
711                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset; 
712                 sum     = 1.0/rr - work[i];
713                 min_rad = rr + doffset;
714                 rad     = 1.0/sum; 
715                 
716                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
717                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
718             }
719         }
720         
721         /* Extra communication required for DD */
722         if(DOMAINDECOMP(cr))
723         {
724             dd_atom_spread_real(cr->dd, born->bRad);
725             dd_atom_spread_real(cr->dd, fr->invsqrta);
726         }
727     }
728     else
729     {
730         /* OBC */
731         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
732         {
733                         if(born->use[i] != 0)
734             {
735                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
736                 rr_inv2 = 1.0/rr;
737                 rr      = rr-doffset; 
738                 rr_inv  = 1.0/rr;
739                 sum     = rr * work[i];
740                 sum2    = sum  * sum;
741                 sum3    = sum2 * sum;
742                 
743                 tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
744                 born->bRad[i] = rr_inv - tsum*rr_inv2;
745                 born->bRad[i] = 1.0 / born->bRad[i];
746                 
747                 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
748                 
749                 tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
750                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
751             }
752         }
753         /* Extra (local) communication required for DD */
754         if(DOMAINDECOMP(cr))
755         {
756             dd_atom_spread_real(cr->dd, born->bRad);
757             dd_atom_spread_real(cr->dd, fr->invsqrta);
758             dd_atom_spread_real(cr->dd, born->drobc);
759         }
760     }
761     
762         
763         
764         return 0;
765 }
766
767
/* Apply the chain rule to convert the stored Born-radius pair derivatives
 * (dadx) and the derivative of the GB energy with respect to the Born radii
 * (dvda) into Cartesian forces, using SSE2 double-precision intrinsics
 * (two j particles per vector iteration).
 *
 * natoms       number of atoms; range of the dvda -> rb conversion loop
 * nl           neighborlist the pair derivatives were accumulated over
 * dadx         stream of dBornRadius/dx pair terms; consumed strictly
 *              sequentially, two doubles for the j-side term followed by
 *              two doubles for the ai-side term per pair of j particles.
 *              The order must match how the Born-radius routine stored them.
 * dvda         dV/d(Born radius) per atom
 * x            coordinates; f: force array (decremented for j atoms,
 *              incremented for i atoms); fshift: shift forces;
 *              shiftvec: PBC shift vectors
 * gb_algorithm egbSTILL / egbHCT / egbOBC; selects the rb[] prefactor
 * born         generalized-Born data (bRad, drobc, work scratch buffer)
 * md           atom data; unused here, kept for a uniform kernel signature
 *
 * Always returns 0 (no error reporting).
 */
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, 
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)                                             
{
	int    i,k,n,ii,jnr,ii3,is3,nj0,nj1,n0,n1;
	int	   jnrA,jnrB;
    int    j3A,j3B;
	int *  jjnr;
    
	double   rbi,shX,shY,shZ;
	double   *rb;
    
	__m128d ix,iy,iz;
	__m128d jx,jy,jz;
	__m128d fix,fiy,fiz;
	__m128d dx,dy,dz;
    __m128d tx,ty,tz;
    
	__m128d rbai,rbaj,f_gb, f_gb_ai;
	__m128d xmm1,xmm2,xmm3;
	
	const __m128d two = _mm_set1_pd(2.0);
    
    /* NOTE(review): jnr, n, xmm1-xmm3 and 'two' are declared (and 'n'
     * initialized below) but never used in this function - likely leftovers
     * from the single-precision version. */
    
    /* rb[] is scratch: the per-atom chain-rule prefactor derived from dvda */
	rb     = born->work; 
    
  jjnr   = nl->jjnr;
  
	/* Loop to get the proper form for the Born radius term, sse style */	
  n0 = 0;
  n1 = natoms;
    
	if(gb_algorithm==egbSTILL) 
	{
        /* Still: rb = 2 * b^2 * dvda / (4*pi*eps0) */
		for(i=n0;i<n1;i++)
		{
      rbi   = born->bRad[i];
			rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
		}
	}
	else if(gb_algorithm==egbHCT) 
	{
        /* HCT: rb = b^2 * dvda */
		for(i=n0;i<n1;i++)
		{
      rbi   = born->bRad[i];
			rb[i] = rbi * rbi * dvda[i];
		}
	}
	else if(gb_algorithm==egbOBC) 
	{
        /* OBC: rb = b^2 * drobc * dvda, where drobc carries the extra
         * derivative of the tanh rescaling of the Born radius */
		for(i=n0;i<n1;i++)
		{
      rbi   = born->bRad[i];
			rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
		}
	}
    
  /* presumably cleared to avoid reading uninitialized lanes before the
   * first load macro fills jz - TODO confirm against the macro definition */
  jz = _mm_setzero_pd();
  
  n = j3A = j3B = 0;
  
    /* Outer loop over neighborlist i entries: accumulate the force on atom i
     * and apply the opposite force to each j partner. */
	for(i=0;i<nl->nri;i++)
	{
    ii     = nl->iinr[i];
		ii3	   = ii*3;
    is3    = 3*nl->shift[i];     
    shX    = shiftvec[is3];  
    shY    = shiftvec[is3+1];
    shZ    = shiftvec[is3+2];
    nj0    = nl->jindex[i];      
    nj1    = nl->jindex[i+1];    
    
    /* i coordinates, PBC-shifted, broadcast into both SSE lanes */
    ix     = _mm_set1_pd(shX+x[ii3+0]);
		iy     = _mm_set1_pd(shY+x[ii3+1]);
		iz     = _mm_set1_pd(shZ+x[ii3+2]);
    
		rbai   = _mm_load1_pd(rb+ii);			
		fix    = _mm_setzero_pd();
		fiy    = _mm_setzero_pd();
		fiz    = _mm_setzero_pd();	
    
	
    /* Main vector loop: two j particles per iteration */
    for(k=nj0;k<nj1-1;k+=2)
		{
			jnrA        = jjnr[k];   
			jnrB        = jjnr[k+1];
      
      j3A         = 3*jnrA;  
			j3B         = 3*jnrB;
            
      GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
      
			dx          = _mm_sub_pd(ix,jx);
			dy          = _mm_sub_pd(iy,jy);
			dz          = _mm_sub_pd(iz,jz);
      
      GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);
      
			/* load chain rule terms for the two j particles; the dadx stream
			 * is consumed in the exact order it was stored: first the
			 * j-side term, then the ai-side term */
			f_gb        = _mm_load_pd(dadx);
			dadx += 2;
			f_gb_ai     = _mm_load_pd(dadx);
			dadx += 2;
			
      /* calculate scalar force: f = dadx_j*rb[i] + dadx_ai*rb[j] */
      f_gb    = _mm_mul_pd(f_gb,rbai); 
      f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
      f_gb    = _mm_add_pd(f_gb,f_gb_ai);
      
      /* project onto the i-j distance vector */
      tx     = _mm_mul_pd(f_gb,dx);
      ty     = _mm_mul_pd(f_gb,dy);
      tz     = _mm_mul_pd(f_gb,dz);
      
      fix    = _mm_add_pd(fix,tx);
      fiy    = _mm_add_pd(fiy,ty);
      fiz    = _mm_add_pd(fiz,tz);
      
      /* Newton's third law: subtract the pair force from the j atoms */
      GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
		}
    
		/* deal with odd elements: scalar (_sd) ops touch only the low lane */
		if(k<nj1) 
        {
          jnrA        = jjnr[k];   
          j3A         = 3*jnrA;  
          
          GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
          
          dx          = _mm_sub_sd(ix,jx);
          dy          = _mm_sub_sd(iy,jy);
          dz          = _mm_sub_sd(iz,jz);
          
          GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);
          
          /* load chain rule terms; dadx still advances by 2 per term because
           * the producer stores full 2-double SSE slots even for a single
           * remaining pair */
          f_gb        = _mm_load_pd(dadx);
          dadx += 2;
          f_gb_ai     = _mm_load_pd(dadx);
          dadx += 2;
          
          /* calculate scalar force (low lane only) */
          f_gb    = _mm_mul_sd(f_gb,rbai); 
          f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
          f_gb    = _mm_add_sd(f_gb,f_gb_ai);
          
          tx     = _mm_mul_sd(f_gb,dx);
          ty     = _mm_mul_sd(f_gb,dy);
          tz     = _mm_mul_sd(f_gb,dz);
          
          fix    = _mm_add_sd(fix,tx);
          fiy    = _mm_add_sd(fiy,ty);
          fiz    = _mm_add_sd(fiz,tz);
          
          GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
        } 
    
		/* fix/fiy/fiz now contain two partial force terms per component (one
     * per SSE lane), that all should be added to the i particle forces and
     * shift forces. (Stale "four" wording from the single-precision kernel
     * corrected.)
     */
		gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,f+ii3,fshift+is3);
	}	
  
	return 0;	
}
932
933 #else
934 /* keep compiler happy */
935 int genborn_sse2_dummy;
936
937 #endif /* SSE2 intrinsics available */
938