Fixing copyright issues and code contributors
[alexxy/gromacs.git] / src / mdlib / genborn_sse2_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2008, The GROMACS development team,
6  * check out http://www.gromacs.org for more information.
7  * Copyright (c) 2012,2013, by the GROMACS development team, led by
8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9  * others, as listed in the AUTHORS file in the top-level source
10  * directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43 #include <string.h>
44
45 #include "typedefs.h"
46 #include "smalloc.h"
47 #include "genborn.h"
48 #include "vec.h"
49 #include "grompp.h"
50 #include "pdbio.h"
51 #include "names.h"
52 #include "physics.h"
53 #include "domdec.h"
54 #include "partdec.h"
55 #include "network.h"
56 #include "gmx_fatal.h"
57 #include "mtop_util.h"
58 #include "genborn.h"
59
60 #ifdef GMX_LIB_MPI
61 #include <mpi.h>
62 #endif
63 #ifdef GMX_THREAD_MPI
64 #include "tmpi.h"
65 #endif
66
67 /* Only compile this file if SSE2 intrinsics are available */
68 #if 0 && defined (GMX_X86_SSE2)
69 #include <gmx_sse2_double.h>
70 #include <emmintrin.h>
71
72 #include "genborn_sse2_double.h"
73
74 int 
75 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
76                               int natoms, gmx_localtop_t *top,
77                               const t_atomtypes *atype, double *x, t_nblist *nl,
78                               gmx_genborn_t *born)
79 {
80         int i,k,n,ii,is3,ii3,nj0,nj1,offset;
81         int jnrA,jnrB,j3A,j3B;
82     int *mdtype;
83         double shX,shY,shZ;
84     int *jjnr;
85     double *shiftvec;
86     
87         double gpi_ai,gpi2;
88         double factor;
89         double *gb_radius;
90     double *vsolv;
91     double *work;
92     double *dadx;
93     
94         __m128d ix,iy,iz;
95         __m128d jx,jy,jz;
96         __m128d dx,dy,dz;
97         __m128d tx,ty,tz;
98         __m128d rsq,rinv,rinv2,rinv4,rinv6;
99         __m128d ratio,gpi,rai,raj,vai,vaj,rvdw;
100         __m128d ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
101         __m128d mask,icf4,icf6,mask_cmp;
102             
103         const __m128d half   = _mm_set1_pd(0.5);
104         const __m128d three  = _mm_set1_pd(3.0);
105         const __m128d one    = _mm_set1_pd(1.0);
106         const __m128d two    = _mm_set1_pd(2.0);
107         const __m128d zero   = _mm_set1_pd(0.0);
108         const __m128d four   = _mm_set1_pd(4.0);
109         
110         const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
111         const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
112         const __m128d still_p4     = _mm_set1_pd(STILL_P4);
113     
114         factor  = 0.5 * ONE_4PI_EPS0;
115     
116     gb_radius = born->gb_radius;
117     vsolv     = born->vsolv;
118     work      = born->gpol_still_work;
119         jjnr      = nl->jjnr;
120     shiftvec  = fr->shift_vec[0];
121     dadx      = fr->dadx;
122     
123         jnrA = jnrB = 0;
124     jx = _mm_setzero_pd();
125     jy = _mm_setzero_pd();
126     jz = _mm_setzero_pd();
127     
128         n = 0;
129     
130         for(i=0;i<natoms;i++)
131         {
132                 work[i]=0;
133         }
134     
135         for(i=0;i<nl->nri;i++)
136         {
137         ii     = nl->iinr[i];
138                 ii3        = ii*3;
139         is3    = 3*nl->shift[i];     
140         shX    = shiftvec[is3];  
141         shY    = shiftvec[is3+1];
142         shZ    = shiftvec[is3+2];
143         nj0    = nl->jindex[i];      
144         nj1    = nl->jindex[i+1];    
145         
146         ix     = _mm_set1_pd(shX+x[ii3+0]);
147                 iy     = _mm_set1_pd(shY+x[ii3+1]);
148                 iz     = _mm_set1_pd(shZ+x[ii3+2]);
149                 
150
151                 /* Polarization energy for atom ai */
152                 gpi    = _mm_setzero_pd();
153                 
154         rai     = _mm_load1_pd(gb_radius+ii);
155         prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
156
157                 for(k=nj0;k<nj1-1;k+=2)
158                 {
159                         jnrA        = jjnr[k];   
160                         jnrB        = jjnr[k+1];
161             
162             j3A         = 3*jnrA;  
163                         j3B         = 3*jnrB;
164             
165             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
166             
167             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
168                         GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA,vsolv+jnrB,vaj);
169             
170                         dx          = _mm_sub_pd(ix,jx);
171                         dy          = _mm_sub_pd(iy,jy);
172                         dz          = _mm_sub_pd(iz,jz);
173             
174             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
175             rinv        = gmx_mm_invsqrt_pd(rsq);
176             rinv2       = _mm_mul_pd(rinv,rinv);
177             rinv4       = _mm_mul_pd(rinv2,rinv2);
178             rinv6       = _mm_mul_pd(rinv4,rinv2);
179             
180             rvdw        = _mm_add_pd(rai,raj);
181             ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
182             
183             mask_cmp    = _mm_cmple_pd(ratio,still_p5inv);
184
185             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
186             if( 0 == _mm_movemask_pd(mask_cmp) )
187             {
188                 /* if ratio>still_p5inv for ALL elements */
189                 ccf         = one;
190                 dccf        = _mm_setzero_pd();
191             }
192             else 
193             {
194                 ratio       = _mm_min_pd(ratio,still_p5inv);
195                 theta       = _mm_mul_pd(ratio,still_pip5);
196                 gmx_mm_sincos_pd(theta,&sinq,&cosq);
197                 term        = _mm_mul_pd(half,_mm_sub_pd(one,cosq));
198                 ccf         = _mm_mul_pd(term,term);
199                 dccf        = _mm_mul_pd(_mm_mul_pd(two,term),
200                                          _mm_mul_pd(sinq,theta));
201             }
202
203             prod        = _mm_mul_pd(still_p4,vaj);
204             icf4        = _mm_mul_pd(ccf,rinv4);
205             icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four,ccf),dccf), rinv6);
206                         
207             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_mul_pd(prod_ai,icf4));
208             
209             gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod,icf4) );
210             
211             _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
212             dadx+=2;
213             _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
214             dadx+=2;
215                 } 
216         
217         if(k<nj1)
218                 {
219                         jnrA        = jjnr[k];   
220             
221             j3A         = 3*jnrA;  
222             
223             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
224             
225             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
226                         GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA,vaj);
227             
228                         dx          = _mm_sub_sd(ix,jx);
229                         dy          = _mm_sub_sd(iy,jy);
230                         dz          = _mm_sub_sd(iz,jz);
231             
232             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
233             rinv        = gmx_mm_invsqrt_pd(rsq);
234             rinv2       = _mm_mul_sd(rinv,rinv);
235             rinv4       = _mm_mul_sd(rinv2,rinv2);
236             rinv6       = _mm_mul_sd(rinv4,rinv2);
237             
238             rvdw        = _mm_add_sd(rai,raj);
239             ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
240             
241             mask_cmp    = _mm_cmple_sd(ratio,still_p5inv);
242             
243             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
244             if( 0 == _mm_movemask_pd(mask_cmp) )
245             {
246                 /* if ratio>still_p5inv for ALL elements */
247                 ccf         = one;
248                 dccf        = _mm_setzero_pd();
249             }
250             else 
251             {
252                 ratio       = _mm_min_sd(ratio,still_p5inv);
253                 theta       = _mm_mul_sd(ratio,still_pip5);
254                 gmx_mm_sincos_pd(theta,&sinq,&cosq);
255                 term        = _mm_mul_sd(half,_mm_sub_sd(one,cosq));
256                 ccf         = _mm_mul_sd(term,term);
257                 dccf        = _mm_mul_sd(_mm_mul_sd(two,term),
258                                          _mm_mul_sd(sinq,theta));
259             }
260             
261             prod        = _mm_mul_sd(still_p4,vaj);
262             icf4        = _mm_mul_sd(ccf,rinv4);
263             icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four,ccf),dccf), rinv6);
264
265             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_mul_sd(prod_ai,icf4));
266             
267             gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod,icf4) );
268             
269             _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
270             dadx+=2;
271             _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
272             dadx+=2;
273                 } 
274         gmx_mm_update_1pot_pd(gpi,work+ii);
275         }
276     
277         /* Sum up the polarization energy from other nodes */
278         if(PARTDECOMP(cr))
279         {
280                 gmx_sum(natoms, work, cr);
281         }
282         else if(DOMAINDECOMP(cr))
283         {
284                 dd_atom_sum_real(cr->dd, work);
285         }
286         
287         /* Compute the radii */
288         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
289         {               
290                 if(born->use[i] != 0)
291                 {
292                         gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
293                         gpi2             = gpi_ai * gpi_ai;
294                         born->bRad[i]   = factor*gmx_invsqrt(gpi2);
295                         fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
296                 }
297         }
298     
299         /* Extra (local) communication required for DD */
300         if(DOMAINDECOMP(cr))
301         {
302                 dd_atom_spread_real(cr->dd, born->bRad);
303                 dd_atom_spread_real(cr->dd, fr->invsqrta);
304         }
305     
306         return 0;       
307 }
308
309
310 int 
311 calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
312                                 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
313 {
314         int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
315     int jnrA,jnrB;
316     int j3A,j3B;
317         double shX,shY,shZ;
318         double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
319         double sum_ai2, sum_ai3,tsum,tchain,doffset;
320         double *obc_param;
321     double *gb_radius;
322     double *work;
323     int *  jjnr;
324     double *dadx;
325     double *shiftvec;
326     double min_rad,rad;
327     
328         __m128d ix,iy,iz,jx,jy,jz;
329         __m128d dx,dy,dz,t1,t2,t3,t4;
330         __m128d rsq,rinv,r;
331         __m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
332         __m128d uij,lij2,uij2,lij3,uij3,diff2;
333         __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
334         __m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
335         __m128d dadx1,dadx2;
336     __m128d logterm;
337         __m128d mask;
338         __m128d obc_mask1,obc_mask2,obc_mask3;    
339     
340     __m128d oneeighth   = _mm_set1_pd(0.125);
341     __m128d onefourth   = _mm_set1_pd(0.25);
342     
343         const __m128d half  = _mm_set1_pd(0.5);
344         const __m128d three = _mm_set1_pd(3.0);
345         const __m128d one   = _mm_set1_pd(1.0);
346         const __m128d two   = _mm_set1_pd(2.0);
347         const __m128d zero  = _mm_set1_pd(0.0);
348         const __m128d neg   = _mm_set1_pd(-1.0);
349         
350         /* Set the dielectric offset */
351         doffset   = born->gb_doffset;
352         gb_radius = born->gb_radius;
353     obc_param = born->param;
354     work      = born->gpol_hct_work;
355     jjnr      = nl->jjnr;
356     dadx      = fr->dadx;
357     shiftvec  = fr->shift_vec[0];
358     
359     jx        = _mm_setzero_pd();
360     jy        = _mm_setzero_pd();
361     jz        = _mm_setzero_pd();
362     
363     jnrA = jnrB = 0;
364     
365         for(i=0;i<born->nr;i++)
366         {
367                 work[i] = 0;
368         }
369         
370         for(i=0;i<nl->nri;i++)
371         {
372         ii     = nl->iinr[i];
373                 ii3        = ii*3;
374         is3    = 3*nl->shift[i];     
375         shX    = shiftvec[is3];  
376         shY    = shiftvec[is3+1];
377         shZ    = shiftvec[is3+2];
378         nj0    = nl->jindex[i];      
379         nj1    = nl->jindex[i+1];    
380         
381         ix     = _mm_set1_pd(shX+x[ii3+0]);
382                 iy     = _mm_set1_pd(shY+x[ii3+1]);
383                 iz     = _mm_set1_pd(shZ+x[ii3+2]);
384                         
385                 rai    = _mm_load1_pd(gb_radius+ii);
386                 rai_inv= gmx_mm_inv_pd(rai);
387         
388                 sum_ai = _mm_setzero_pd();
389                 
390                 sk_ai  = _mm_load1_pd(born->param+ii);
391                 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
392         
393                 for(k=nj0;k<nj1-1;k+=2)
394                 {
395                         jnrA        = jjnr[k];   
396                         jnrB        = jjnr[k+1];
397                         
398             j3A         = 3*jnrA;  
399                         j3B         = 3*jnrB;
400             
401             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
402             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
403             GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA,obc_param+jnrB,sk_aj);
404                         
405             dx    = _mm_sub_pd(ix, jx);
406                         dy    = _mm_sub_pd(iy, jy);
407                         dz    = _mm_sub_pd(iz, jz);
408                         
409             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
410             
411             rinv        = gmx_mm_invsqrt_pd(rsq);
412             r           = _mm_mul_pd(rsq,rinv);
413             
414                         /* Compute raj_inv aj1-4 */
415             raj_inv     = gmx_mm_inv_pd(raj);
416             
417             /* Evaluate influence of atom aj -> ai */
418             t1            = _mm_add_pd(r,sk_aj);
419             t2            = _mm_sub_pd(r,sk_aj);
420             t3            = _mm_sub_pd(sk_aj,r);
421             obc_mask1     = _mm_cmplt_pd(rai, t1);
422             obc_mask2     = _mm_cmplt_pd(rai, t2);
423             obc_mask3     = _mm_cmplt_pd(rai, t3);
424             
425             uij           = gmx_mm_inv_pd(t1);
426             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
427                                       _mm_andnot_pd(obc_mask2,rai_inv));
428             dlij          = _mm_and_pd(one,obc_mask2);
429             uij2          = _mm_mul_pd(uij, uij);
430             uij3          = _mm_mul_pd(uij2,uij);
431             lij2          = _mm_mul_pd(lij, lij);
432             lij3          = _mm_mul_pd(lij2,lij);
433                         
434             diff2         = _mm_sub_pd(uij2,lij2);
435             lij_inv       = gmx_mm_invsqrt_pd(lij2);
436             sk2_aj        = _mm_mul_pd(sk_aj,sk_aj);
437             sk2_rinv      = _mm_mul_pd(sk2_aj,rinv);
438             prod          = _mm_mul_pd(onefourth,sk2_rinv);
439                         
440             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
441             
442             t1            = _mm_sub_pd(lij,uij);
443             t2            = _mm_mul_pd(diff2,
444                                        _mm_sub_pd(_mm_mul_pd(onefourth,r),
445                                                   prod));
446             t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
447             t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
448             t4            = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij));
449             t4            = _mm_and_pd(t4,obc_mask3);
450             t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
451                         
452             sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) );
453             
454             t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
455                                        _mm_mul_pd(prod,lij3));
456             t1            = _mm_sub_pd(t1,
457                                        _mm_mul_pd(onefourth,
458                                                   _mm_add_pd(_mm_mul_pd(lij,rinv),
459                                                              _mm_mul_pd(lij3,r))));
460             t2            = _mm_mul_pd(onefourth,
461                                        _mm_add_pd(_mm_mul_pd(uij,rinv),
462                                                   _mm_mul_pd(uij3,r)));
463             t2            = _mm_sub_pd(t2,
464                                        _mm_add_pd(_mm_mul_pd(half,uij2),
465                                                   _mm_mul_pd(prod,uij3)));
466             t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
467                                        _mm_mul_pd(rinv,rinv));
468             t3            = _mm_sub_pd(t3,
469                                        _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
470                                                   _mm_add_pd(one,
471                                                              _mm_mul_pd(sk2_rinv,rinv))));
472             t1            = _mm_mul_pd(rinv,
473                                        _mm_add_pd(_mm_mul_pd(dlij,t1),
474                                                   _mm_add_pd(t2,t3)));
475             
476             dadx1         = _mm_and_pd(t1,obc_mask1);
477             
478             /* Evaluate influence of atom ai -> aj */
479             t1            = _mm_add_pd(r,sk_ai);
480             t2            = _mm_sub_pd(r,sk_ai);
481             t3            = _mm_sub_pd(sk_ai,r);
482             obc_mask1     = _mm_cmplt_pd(raj, t1);
483             obc_mask2     = _mm_cmplt_pd(raj, t2);
484             obc_mask3     = _mm_cmplt_pd(raj, t3);
485             
486             uij           = gmx_mm_inv_pd(t1);
487             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
488                                       _mm_andnot_pd(obc_mask2,raj_inv));
489             dlij          = _mm_and_pd(one,obc_mask2);
490             uij2          = _mm_mul_pd(uij, uij);
491             uij3          = _mm_mul_pd(uij2,uij);
492             lij2          = _mm_mul_pd(lij, lij);
493             lij3          = _mm_mul_pd(lij2,lij);
494                         
495             diff2         = _mm_sub_pd(uij2,lij2);
496             lij_inv       = gmx_mm_invsqrt_pd(lij2);
497             sk2_rinv      = _mm_mul_pd(sk2_ai,rinv);
498             prod          = _mm_mul_pd(onefourth,sk2_rinv);
499                         
500             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
501             
502             t1            = _mm_sub_pd(lij,uij);
503             t2            = _mm_mul_pd(diff2,
504                                        _mm_sub_pd(_mm_mul_pd(onefourth,r),
505                                                   prod));
506             t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
507             t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
508             t4            = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij));
509             t4            = _mm_and_pd(t4,obc_mask3);
510             t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
511                         
512             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_and_pd(t1,obc_mask1));
513             
514             t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
515                                        _mm_mul_pd(prod,lij3));
516             t1            = _mm_sub_pd(t1,
517                                        _mm_mul_pd(onefourth,
518                                                   _mm_add_pd(_mm_mul_pd(lij,rinv),
519                                                              _mm_mul_pd(lij3,r))));
520             t2            = _mm_mul_pd(onefourth,
521                                        _mm_add_pd(_mm_mul_pd(uij,rinv),
522                                                   _mm_mul_pd(uij3,r)));
523             t2            = _mm_sub_pd(t2,
524                                        _mm_add_pd(_mm_mul_pd(half,uij2),
525                                                   _mm_mul_pd(prod,uij3)));
526             t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
527                                        _mm_mul_pd(rinv,rinv));
528             t3            = _mm_sub_pd(t3,
529                                        _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
530                                                   _mm_add_pd(one,
531                                                              _mm_mul_pd(sk2_rinv,rinv))));
532             t1            = _mm_mul_pd(rinv,
533                                        _mm_add_pd(_mm_mul_pd(dlij,t1),
534                                                   _mm_add_pd(t2,t3)));
535             
536             dadx2         = _mm_and_pd(t1,obc_mask1);
537             
538             _mm_store_pd(dadx,dadx1);
539             dadx += 2;
540             _mm_store_pd(dadx,dadx2);
541             dadx += 2;
542         } /* end normal inner loop */
543         
544                 if(k<nj1)
545                 {
546                         jnrA        = jjnr[k];   
547                         
548             j3A         = 3*jnrA;  
549             
550             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
551             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
552             GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA,sk_aj);
553                         
554             dx    = _mm_sub_sd(ix, jx);
555                         dy    = _mm_sub_sd(iy, jy);
556                         dz    = _mm_sub_sd(iz, jz);
557                         
558             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
559             
560             rinv        = gmx_mm_invsqrt_pd(rsq);
561             r           = _mm_mul_sd(rsq,rinv);
562             
563                         /* Compute raj_inv aj1-4 */
564             raj_inv     = gmx_mm_inv_pd(raj);
565             
566             /* Evaluate influence of atom aj -> ai */
567             t1            = _mm_add_sd(r,sk_aj);
568             t2            = _mm_sub_sd(r,sk_aj);
569             t3            = _mm_sub_sd(sk_aj,r);
570             obc_mask1     = _mm_cmplt_sd(rai, t1);
571             obc_mask2     = _mm_cmplt_sd(rai, t2);
572             obc_mask3     = _mm_cmplt_sd(rai, t3);
573             
574             uij           = gmx_mm_inv_pd(t1);
575             lij           = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
576                                       _mm_andnot_pd(obc_mask2,rai_inv));
577             dlij          = _mm_and_pd(one,obc_mask2);
578             uij2          = _mm_mul_sd(uij, uij);
579             uij3          = _mm_mul_sd(uij2,uij);
580             lij2          = _mm_mul_sd(lij, lij);
581             lij3          = _mm_mul_sd(lij2,lij);
582             
583             diff2         = _mm_sub_sd(uij2,lij2);
584             lij_inv       = gmx_mm_invsqrt_pd(lij2);
585             sk2_aj        = _mm_mul_sd(sk_aj,sk_aj);
586             sk2_rinv      = _mm_mul_sd(sk2_aj,rinv);
587             prod          = _mm_mul_sd(onefourth,sk2_rinv);
588             
589             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
590             
591             t1            = _mm_sub_sd(lij,uij);
592             t2            = _mm_mul_sd(diff2,
593                                        _mm_sub_sd(_mm_mul_pd(onefourth,r),
594                                                   prod));
595             t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
596             t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
597             t4            = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij));
598             t4            = _mm_and_pd(t4,obc_mask3);
599             t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
600             
601             sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) );
602             
603             t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
604                                        _mm_mul_sd(prod,lij3));
605             t1            = _mm_sub_sd(t1,
606                                        _mm_mul_sd(onefourth,
607                                                   _mm_add_sd(_mm_mul_sd(lij,rinv),
608                                                              _mm_mul_sd(lij3,r))));
609             t2            = _mm_mul_sd(onefourth,
610                                        _mm_add_sd(_mm_mul_sd(uij,rinv),
611                                                   _mm_mul_sd(uij3,r)));
612             t2            = _mm_sub_sd(t2,
613                                        _mm_add_sd(_mm_mul_sd(half,uij2),
614                                                   _mm_mul_sd(prod,uij3)));
615             t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
616                                        _mm_mul_sd(rinv,rinv));
617             t3            = _mm_sub_sd(t3,
618                                        _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
619                                                   _mm_add_sd(one,
620                                                              _mm_mul_sd(sk2_rinv,rinv))));
621             t1            = _mm_mul_sd(rinv,
622                                        _mm_add_sd(_mm_mul_sd(dlij,t1),
623                                                   _mm_add_pd(t2,t3)));
624             
625             dadx1         = _mm_and_pd(t1,obc_mask1);
626             
627             /* Evaluate influence of atom ai -> aj */
628             t1            = _mm_add_sd(r,sk_ai);
629             t2            = _mm_sub_sd(r,sk_ai);
630             t3            = _mm_sub_sd(sk_ai,r);
631             obc_mask1     = _mm_cmplt_sd(raj, t1);
632             obc_mask2     = _mm_cmplt_sd(raj, t2);
633             obc_mask3     = _mm_cmplt_sd(raj, t3);
634             
635             uij           = gmx_mm_inv_pd(t1);
636             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
637                                       _mm_andnot_pd(obc_mask2,raj_inv));
638             dlij          = _mm_and_pd(one,obc_mask2);
639             uij2          = _mm_mul_sd(uij, uij);
640             uij3          = _mm_mul_sd(uij2,uij);
641             lij2          = _mm_mul_sd(lij, lij);
642             lij3          = _mm_mul_sd(lij2,lij);
643             
644             diff2         = _mm_sub_sd(uij2,lij2);
645             lij_inv       = gmx_mm_invsqrt_pd(lij2);
646             sk2_rinv      = _mm_mul_sd(sk2_ai,rinv);
647             prod          = _mm_mul_sd(onefourth,sk2_rinv);
648             
649             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
650             
651             t1            = _mm_sub_sd(lij,uij);
652             t2            = _mm_mul_sd(diff2,
653                                        _mm_sub_sd(_mm_mul_sd(onefourth,r),
654                                                   prod));
655             t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
656             t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
657             t4            = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij));
658             t4            = _mm_and_pd(t4,obc_mask3);
659             t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
660             
661             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_and_pd(t1,obc_mask1));
662             
663             t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
664                                        _mm_mul_sd(prod,lij3));
665             t1            = _mm_sub_sd(t1,
666                                        _mm_mul_sd(onefourth,
667                                                   _mm_add_sd(_mm_mul_sd(lij,rinv),
668                                                              _mm_mul_sd(lij3,r))));
669             t2            = _mm_mul_sd(onefourth,
670                                        _mm_add_sd(_mm_mul_sd(uij,rinv),
671                                                   _mm_mul_sd(uij3,r)));
672             t2            = _mm_sub_sd(t2,
673                                        _mm_add_sd(_mm_mul_sd(half,uij2),
674                                                   _mm_mul_sd(prod,uij3)));
675             t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
676                                        _mm_mul_sd(rinv,rinv));
677             t3            = _mm_sub_sd(t3,
678                                        _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
679                                                   _mm_add_sd(one,
680                                                              _mm_mul_sd(sk2_rinv,rinv))));
681             t1            = _mm_mul_sd(rinv,
682                                        _mm_add_sd(_mm_mul_sd(dlij,t1),
683                                                   _mm_add_sd(t2,t3)));
684             
685             dadx2         = _mm_and_pd(t1,obc_mask1);
686             
687             _mm_store_pd(dadx,dadx1);
688             dadx += 2;
689             _mm_store_pd(dadx,dadx2);
690             dadx += 2;
691         } 
692         gmx_mm_update_1pot_pd(sum_ai,work+ii);
693         
694         }
695         
696         /* Parallel summations */
697         if(PARTDECOMP(cr))
698         {
699                 gmx_sum(natoms, work, cr);
700         }
701         else if(DOMAINDECOMP(cr))
702         {
703                 dd_atom_sum_real(cr->dd, work);
704         }
705         
706     if(gb_algorithm==egbHCT)
707     {
708         /* HCT */
709         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
710         {
711                         if(born->use[i] != 0)
712             {
713                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset; 
714                 sum     = 1.0/rr - work[i];
715                 min_rad = rr + doffset;
716                 rad     = 1.0/sum; 
717                 
718                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
719                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
720             }
721         }
722         
723         /* Extra communication required for DD */
724         if(DOMAINDECOMP(cr))
725         {
726             dd_atom_spread_real(cr->dd, born->bRad);
727             dd_atom_spread_real(cr->dd, fr->invsqrta);
728         }
729     }
730     else
731     {
732         /* OBC */
733         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
734         {
735                         if(born->use[i] != 0)
736             {
737                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
738                 rr_inv2 = 1.0/rr;
739                 rr      = rr-doffset; 
740                 rr_inv  = 1.0/rr;
741                 sum     = rr * work[i];
742                 sum2    = sum  * sum;
743                 sum3    = sum2 * sum;
744                 
745                 tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
746                 born->bRad[i] = rr_inv - tsum*rr_inv2;
747                 born->bRad[i] = 1.0 / born->bRad[i];
748                 
749                 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
750                 
751                 tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
752                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
753             }
754         }
755         /* Extra (local) communication required for DD */
756         if(DOMAINDECOMP(cr))
757         {
758             dd_atom_spread_real(cr->dd, born->bRad);
759             dd_atom_spread_real(cr->dd, fr->invsqrta);
760             dd_atom_spread_real(cr->dd, born->drobc);
761         }
762     }
763     
764         
765         
766         return 0;
767 }
768
769
770 int
771 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, 
772                               double *x, double *f, double *fshift, double *shiftvec,
773                               int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)                                             
774 {
775         int    i,k,n,ii,jnr,ii3,is3,nj0,nj1,n0,n1;
776         int        jnrA,jnrB;
777     int    j3A,j3B;
778         int *  jjnr;
779     
780         double   rbi,shX,shY,shZ;
781         double   *rb;
782     
783         __m128d ix,iy,iz;
784         __m128d jx,jy,jz;
785         __m128d fix,fiy,fiz;
786         __m128d dx,dy,dz;
787     __m128d tx,ty,tz;
788     
789         __m128d rbai,rbaj,f_gb, f_gb_ai;
790         __m128d xmm1,xmm2,xmm3;
791         
792         const __m128d two = _mm_set1_pd(2.0);
793     
794         rb     = born->work; 
795     
796   jjnr   = nl->jjnr;
797   
798         /* Loop to get the proper form for the Born radius term, sse style */   
799   n0 = 0;
800   n1 = natoms;
801     
802         if(gb_algorithm==egbSTILL) 
803         {
804                 for(i=n0;i<n1;i++)
805                 {
806       rbi   = born->bRad[i];
807                         rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
808                 }
809         }
810         else if(gb_algorithm==egbHCT) 
811         {
812                 for(i=n0;i<n1;i++)
813                 {
814       rbi   = born->bRad[i];
815                         rb[i] = rbi * rbi * dvda[i];
816                 }
817         }
818         else if(gb_algorithm==egbOBC) 
819         {
820                 for(i=n0;i<n1;i++)
821                 {
822       rbi   = born->bRad[i];
823                         rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
824                 }
825         }
826     
827   jz = _mm_setzero_pd();
828   
829   n = j3A = j3B = 0;
830   
831         for(i=0;i<nl->nri;i++)
832         {
833     ii     = nl->iinr[i];
834                 ii3        = ii*3;
835     is3    = 3*nl->shift[i];     
836     shX    = shiftvec[is3];  
837     shY    = shiftvec[is3+1];
838     shZ    = shiftvec[is3+2];
839     nj0    = nl->jindex[i];      
840     nj1    = nl->jindex[i+1];    
841     
842     ix     = _mm_set1_pd(shX+x[ii3+0]);
843                 iy     = _mm_set1_pd(shY+x[ii3+1]);
844                 iz     = _mm_set1_pd(shZ+x[ii3+2]);
845     
846                 rbai   = _mm_load1_pd(rb+ii);                   
847                 fix    = _mm_setzero_pd();
848                 fiy    = _mm_setzero_pd();
849                 fiz    = _mm_setzero_pd();      
850     
851         
852     for(k=nj0;k<nj1-1;k+=2)
853                 {
854                         jnrA        = jjnr[k];   
855                         jnrB        = jjnr[k+1];
856       
857       j3A         = 3*jnrA;  
858                         j3B         = 3*jnrB;
859             
860       GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
861       
862                         dx          = _mm_sub_pd(ix,jx);
863                         dy          = _mm_sub_pd(iy,jy);
864                         dz          = _mm_sub_pd(iz,jz);
865       
866       GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);
867       
868                         /* load chain rule terms for j1-4 */
869                         f_gb        = _mm_load_pd(dadx);
870                         dadx += 2;
871                         f_gb_ai     = _mm_load_pd(dadx);
872                         dadx += 2;
873                         
874       /* calculate scalar force */
875       f_gb    = _mm_mul_pd(f_gb,rbai); 
876       f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
877       f_gb    = _mm_add_pd(f_gb,f_gb_ai);
878       
879       tx     = _mm_mul_pd(f_gb,dx);
880       ty     = _mm_mul_pd(f_gb,dy);
881       tz     = _mm_mul_pd(f_gb,dz);
882       
883       fix    = _mm_add_pd(fix,tx);
884       fiy    = _mm_add_pd(fiy,ty);
885       fiz    = _mm_add_pd(fiz,tz);
886       
887       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
888                 }
889     
890                 /*deal with odd elements */
891                 if(k<nj1) 
892         {
893           jnrA        = jjnr[k];   
894           j3A         = 3*jnrA;  
895           
896           GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
897           
898           dx          = _mm_sub_sd(ix,jx);
899           dy          = _mm_sub_sd(iy,jy);
900           dz          = _mm_sub_sd(iz,jz);
901           
902           GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);
903           
904           /* load chain rule terms */
905           f_gb        = _mm_load_pd(dadx);
906           dadx += 2;
907           f_gb_ai     = _mm_load_pd(dadx);
908           dadx += 2;
909           
910           /* calculate scalar force */
911           f_gb    = _mm_mul_sd(f_gb,rbai); 
912           f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
913           f_gb    = _mm_add_sd(f_gb,f_gb_ai);
914           
915           tx     = _mm_mul_sd(f_gb,dx);
916           ty     = _mm_mul_sd(f_gb,dy);
917           tz     = _mm_mul_sd(f_gb,dz);
918           
919           fix    = _mm_add_sd(fix,tx);
920           fiy    = _mm_add_sd(fiy,ty);
921           fiz    = _mm_add_sd(fiz,tz);
922           
923           GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
924         } 
925     
926                 /* fix/fiy/fiz now contain four partial force terms, that all should be
927      * added to the i particle forces and shift forces. 
928      */
929                 gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,f+ii3,fshift+is3);
930         }       
931   
932         return 0;       
933 }
934
935 #else
936 /* keep compiler happy */
937 int genborn_sse2_dummy;
938
939 #endif /* SSE2 intrinsics available */
940