src/mdlib/genborn_sse2_double.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #ifdef HAVE_CONFIG_H
  39 #include <config.h>
  40 #endif
  41
  42 #include <math.h>
  43 #include <string.h>
  44
  45 #include "typedefs.h"
  46 #include "smalloc.h"
  47 #include "genborn.h"
  48 #include "vec.h"
  49 #include "grompp.h"
  50 #include "pdbio.h"
  51 #include "names.h"
  52 #include "physics.h"
  53 #include "domdec.h"
  54 #include "partdec.h"
  55 #include "network.h"
  56 #include "gmx_fatal.h"
  57 #include "mtop_util.h"
  58 #include "genborn.h"
  59
  60 #ifdef GMX_LIB_MPI
  61 #include <mpi.h>
  62 #endif
  63 #ifdef GMX_THREAD_MPI
  64 #include "tmpi.h"
  65 #endif
  66
  67 /* Only compile this file if SSE2 intrinsics are available */
  68 #if 0 && defined (GMX_X86_SSE2)
  69 #include <gmx_sse2_double.h>
  70 #include <emmintrin.h>
  71
  72 #include "genborn_sse2_double.h"
  73
  74 int
  75 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
  76                               int natoms, gmx_localtop_t *top,
  77                               const t_atomtypes *atype, double *x, t_nblist *nl,
  78                               gmx_genborn_t *born)
  79 {
  80         int i,k,n,ii,is3,ii3,nj0,nj1,offset;
  81         int jnrA,jnrB,j3A,j3B;
  82     int *mdtype;
  83         double shX,shY,shZ;
  84     int *jjnr;
  85     double *shiftvec;
  86
  87         double gpi_ai,gpi2;
  88         double factor;
  89         double *gb_radius;
  90     double *vsolv;
  91     double *work;
  92     double *dadx;
  93
  94         __m128d ix,iy,iz;
  95         __m128d jx,jy,jz;
  96         __m128d dx,dy,dz;
  97         __m128d tx,ty,tz;
  98         __m128d rsq,rinv,rinv2,rinv4,rinv6;
  99         __m128d ratio,gpi,rai,raj,vai,vaj,rvdw;
 100         __m128d ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
 101         __m128d mask,icf4,icf6,mask_cmp;
 102
 103         const __m128d half   = _mm_set1_pd(0.5);
 104         const __m128d three  = _mm_set1_pd(3.0);
 105         const __m128d one    = _mm_set1_pd(1.0);
 106         const __m128d two    = _mm_set1_pd(2.0);
 107         const __m128d zero   = _mm_set1_pd(0.0);
 108         const __m128d four   = _mm_set1_pd(4.0);
 109
 110         const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
 111         const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
 112         const __m128d still_p4     = _mm_set1_pd(STILL_P4);
 113
 114         factor  = 0.5 * ONE_4PI_EPS0;
 115
 116     gb_radius = born->gb_radius;
 117     vsolv     = born->vsolv;
 118     work      = born->gpol_still_work;
 119         jjnr      = nl->jjnr;
 120     shiftvec  = fr->shift_vec[0];
 121     dadx      = fr->dadx;
 122
 123         jnrA = jnrB = 0;
 124     jx = _mm_setzero_pd();
 125     jy = _mm_setzero_pd();
 126     jz = _mm_setzero_pd();
 127
 128         n = 0;
 129
 130         for(i=0;i<natoms;i++)
 131         {
 132                 work[i]=0;
 133         }
 134
 135         for(i=0;i<nl->nri;i++)
 136         {
 137         ii     = nl->iinr[i];
 138                 ii3        = ii*3;
 139         is3    = 3*nl->shift[i];
 140         shX    = shiftvec[is3];
 141         shY    = shiftvec[is3+1];
 142         shZ    = shiftvec[is3+2];
 143         nj0    = nl->jindex[i];
 144         nj1    = nl->jindex[i+1];
 145
 146         ix     = _mm_set1_pd(shX+x[ii3+0]);
 147                 iy     = _mm_set1_pd(shY+x[ii3+1]);
 148                 iz     = _mm_set1_pd(shZ+x[ii3+2]);
 149
 150
 151                 /* Polarization energy for atom ai */
 152                 gpi    = _mm_setzero_pd();
 153
 154         rai     = _mm_load1_pd(gb_radius+ii);
 155         prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);
 156
 157                 for(k=nj0;k<nj1-1;k+=2)
 158                 {
 159                         jnrA        = jjnr[k];
 160                         jnrB        = jjnr[k+1];
 161
 162             j3A         = 3*jnrA;
 163                         j3B         = 3*jnrB;
 164
 165             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
 166
 167             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
 168                         GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA,vsolv+jnrB,vaj);
 169
 170                         dx          = _mm_sub_pd(ix,jx);
 171                         dy          = _mm_sub_pd(iy,jy);
 172                         dz          = _mm_sub_pd(iz,jz);
 173
 174             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
 175             rinv        = gmx_mm_invsqrt_pd(rsq);
 176             rinv2       = _mm_mul_pd(rinv,rinv);
 177             rinv4       = _mm_mul_pd(rinv2,rinv2);
 178             rinv6       = _mm_mul_pd(rinv4,rinv2);
 179
 180             rvdw        = _mm_add_pd(rai,raj);
 181             ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
 182
 183             mask_cmp    = _mm_cmple_pd(ratio,still_p5inv);
 184
 185             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
 186             if( 0 == _mm_movemask_pd(mask_cmp) )
 187             {
 188                 /* if ratio>still_p5inv for ALL elements */
 189                 ccf         = one;
 190                 dccf        = _mm_setzero_pd();
 191             }
 192             else
 193             {
 194                 ratio       = _mm_min_pd(ratio,still_p5inv);
 195                 theta       = _mm_mul_pd(ratio,still_pip5);
 196                 gmx_mm_sincos_pd(theta,&sinq,&cosq);
 197                 term        = _mm_mul_pd(half,_mm_sub_pd(one,cosq));
 198                 ccf         = _mm_mul_pd(term,term);
 199                 dccf        = _mm_mul_pd(_mm_mul_pd(two,term),
 200                                          _mm_mul_pd(sinq,theta));
 201             }
 202
 203             prod        = _mm_mul_pd(still_p4,vaj);
 204             icf4        = _mm_mul_pd(ccf,rinv4);
 205             icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four,ccf),dccf), rinv6);
 206
 207             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_mul_pd(prod_ai,icf4));
 208
 209             gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod,icf4) );
 210
 211             _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
 212             dadx+=2;
 213             _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
 214             dadx+=2;
 215                 }
 216
 217         if(k<nj1)
 218                 {
 219                         jnrA        = jjnr[k];
 220
 221             j3A         = 3*jnrA;
 222
 223             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
 224
 225             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
 226                         GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA,vaj);
 227
 228                         dx          = _mm_sub_sd(ix,jx);
 229                         dy          = _mm_sub_sd(iy,jy);
 230                         dz          = _mm_sub_sd(iz,jz);
 231
 232             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
 233             rinv        = gmx_mm_invsqrt_pd(rsq);
 234             rinv2       = _mm_mul_sd(rinv,rinv);
 235             rinv4       = _mm_mul_sd(rinv2,rinv2);
 236             rinv6       = _mm_mul_sd(rinv4,rinv2);
 237
 238             rvdw        = _mm_add_sd(rai,raj);
 239             ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw)));
 240
 241             mask_cmp    = _mm_cmple_sd(ratio,still_p5inv);
 242
 243             /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
 244             if( 0 == _mm_movemask_pd(mask_cmp) )
 245             {
 246                 /* if ratio>still_p5inv for ALL elements */
 247                 ccf         = one;
 248                 dccf        = _mm_setzero_pd();
 249             }
 250             else
 251             {
 252                 ratio       = _mm_min_sd(ratio,still_p5inv);
 253                 theta       = _mm_mul_sd(ratio,still_pip5);
 254                 gmx_mm_sincos_pd(theta,&sinq,&cosq);
 255                 term        = _mm_mul_sd(half,_mm_sub_sd(one,cosq));
 256                 ccf         = _mm_mul_sd(term,term);
 257                 dccf        = _mm_mul_sd(_mm_mul_sd(two,term),
 258                                          _mm_mul_sd(sinq,theta));
 259             }
 260
 261             prod        = _mm_mul_sd(still_p4,vaj);
 262             icf4        = _mm_mul_sd(ccf,rinv4);
 263             icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four,ccf),dccf), rinv6);
 264
 265             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_mul_sd(prod_ai,icf4));
 266
 267             gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod,icf4) );
 268
 269             _mm_store_pd(dadx,_mm_mul_pd(prod,icf6));
 270             dadx+=2;
 271             _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6));
 272             dadx+=2;
 273                 }
 274         gmx_mm_update_1pot_pd(gpi,work+ii);
 275         }
 276
 277         /* Sum up the polarization energy from other nodes */
 278         if(PARTDECOMP(cr))
 279         {
 280                 gmx_sum(natoms, work, cr);
 281         }
 282         else if(DOMAINDECOMP(cr))
 283         {
 284                 dd_atom_sum_real(cr->dd, work);
 285         }
 286
 287         /* Compute the radii */
 288         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
 289         {
 290                 if(born->use[i] != 0)
 291                 {
 292                         gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
 293                         gpi2             = gpi_ai * gpi_ai;
 294                         born->bRad[i]   = factor*gmx_invsqrt(gpi2);
 295                         fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
 296                 }
 297         }
 298
 299         /* Extra (local) communication required for DD */
 300         if(DOMAINDECOMP(cr))
 301         {
 302                 dd_atom_spread_real(cr->dd, born->bRad);
 303                 dd_atom_spread_real(cr->dd, fr->invsqrta);
 304         }
 305
 306         return 0;
 307 }
 308
 309
 310 int
 311 calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
 312                                 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
 313 {
 314         int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
 315     int jnrA,jnrB;
 316     int j3A,j3B;
 317         double shX,shY,shZ;
 318         double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
 319         double sum_ai2, sum_ai3,tsum,tchain,doffset;
 320         double *obc_param;
 321     double *gb_radius;
 322     double *work;
 323     int *  jjnr;
 324     double *dadx;
 325     double *shiftvec;
 326     double min_rad,rad;
 327
 328         __m128d ix,iy,iz,jx,jy,jz;
 329         __m128d dx,dy,dz,t1,t2,t3,t4;
 330         __m128d rsq,rinv,r;
 331         __m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
 332         __m128d uij,lij2,uij2,lij3,uij3,diff2;
 333         __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
 334         __m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
 335         __m128d dadx1,dadx2;
 336     __m128d logterm;
 337         __m128d mask;
 338         __m128d obc_mask1,obc_mask2,obc_mask3;
 339
 340     __m128d oneeighth   = _mm_set1_pd(0.125);
 341     __m128d onefourth   = _mm_set1_pd(0.25);
 342
 343         const __m128d half  = _mm_set1_pd(0.5);
 344         const __m128d three = _mm_set1_pd(3.0);
 345         const __m128d one   = _mm_set1_pd(1.0);
 346         const __m128d two   = _mm_set1_pd(2.0);
 347         const __m128d zero  = _mm_set1_pd(0.0);
 348         const __m128d neg   = _mm_set1_pd(-1.0);
 349
 350         /* Set the dielectric offset */
 351         doffset   = born->gb_doffset;
 352         gb_radius = born->gb_radius;
 353     obc_param = born->param;
 354     work      = born->gpol_hct_work;
 355     jjnr      = nl->jjnr;
 356     dadx      = fr->dadx;
 357     shiftvec  = fr->shift_vec[0];
 358
 359     jx        = _mm_setzero_pd();
 360     jy        = _mm_setzero_pd();
 361     jz        = _mm_setzero_pd();
 362
 363     jnrA = jnrB = 0;
 364
 365         for(i=0;i<born->nr;i++)
 366         {
 367                 work[i] = 0;
 368         }
 369
 370         for(i=0;i<nl->nri;i++)
 371         {
 372         ii     = nl->iinr[i];
 373                 ii3        = ii*3;
 374         is3    = 3*nl->shift[i];
 375         shX    = shiftvec[is3];
 376         shY    = shiftvec[is3+1];
 377         shZ    = shiftvec[is3+2];
 378         nj0    = nl->jindex[i];
 379         nj1    = nl->jindex[i+1];
 380
 381         ix     = _mm_set1_pd(shX+x[ii3+0]);
 382                 iy     = _mm_set1_pd(shY+x[ii3+1]);
 383                 iz     = _mm_set1_pd(shZ+x[ii3+2]);
 384
 385                 rai    = _mm_load1_pd(gb_radius+ii);
 386                 rai_inv= gmx_mm_inv_pd(rai);
 387
 388                 sum_ai = _mm_setzero_pd();
 389
 390                 sk_ai  = _mm_load1_pd(born->param+ii);
 391                 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
 392
 393                 for(k=nj0;k<nj1-1;k+=2)
 394                 {
 395                         jnrA        = jjnr[k];
 396                         jnrB        = jjnr[k+1];
 397
 398             j3A         = 3*jnrA;
 399                         j3B         = 3*jnrB;
 400
 401             GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
 402             GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
 403             GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA,obc_param+jnrB,sk_aj);
 404
 405             dx    = _mm_sub_pd(ix, jx);
 406                         dy    = _mm_sub_pd(iy, jy);
 407                         dz    = _mm_sub_pd(iz, jz);
 408
 409             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
 410
 411             rinv        = gmx_mm_invsqrt_pd(rsq);
 412             r           = _mm_mul_pd(rsq,rinv);
 413
 414                         /* Compute raj_inv aj1-4 */
 415             raj_inv     = gmx_mm_inv_pd(raj);
 416
 417             /* Evaluate influence of atom aj -> ai */
 418             t1            = _mm_add_pd(r,sk_aj);
 419             t2            = _mm_sub_pd(r,sk_aj);
 420             t3            = _mm_sub_pd(sk_aj,r);
 421             obc_mask1     = _mm_cmplt_pd(rai, t1);
 422             obc_mask2     = _mm_cmplt_pd(rai, t2);
 423             obc_mask3     = _mm_cmplt_pd(rai, t3);
 424
 425             uij           = gmx_mm_inv_pd(t1);
 426             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
 427                                       _mm_andnot_pd(obc_mask2,rai_inv));
 428             dlij          = _mm_and_pd(one,obc_mask2);
 429             uij2          = _mm_mul_pd(uij, uij);
 430             uij3          = _mm_mul_pd(uij2,uij);
 431             lij2          = _mm_mul_pd(lij, lij);
 432             lij3          = _mm_mul_pd(lij2,lij);
 433
 434             diff2         = _mm_sub_pd(uij2,lij2);
 435             lij_inv       = gmx_mm_invsqrt_pd(lij2);
 436             sk2_aj        = _mm_mul_pd(sk_aj,sk_aj);
 437             sk2_rinv      = _mm_mul_pd(sk2_aj,rinv);
 438             prod          = _mm_mul_pd(onefourth,sk2_rinv);
 439
 440             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
 441
 442             t1            = _mm_sub_pd(lij,uij);
 443             t2            = _mm_mul_pd(diff2,
 444                                        _mm_sub_pd(_mm_mul_pd(onefourth,r),
 445                                                   prod));
 446             t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
 447             t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
 448             t4            = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij));
 449             t4            = _mm_and_pd(t4,obc_mask3);
 450             t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
 451
 452             sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) );
 453
 454             t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
 455                                        _mm_mul_pd(prod,lij3));
 456             t1            = _mm_sub_pd(t1,
 457                                        _mm_mul_pd(onefourth,
 458                                                   _mm_add_pd(_mm_mul_pd(lij,rinv),
 459                                                              _mm_mul_pd(lij3,r))));
 460             t2            = _mm_mul_pd(onefourth,
 461                                        _mm_add_pd(_mm_mul_pd(uij,rinv),
 462                                                   _mm_mul_pd(uij3,r)));
 463             t2            = _mm_sub_pd(t2,
 464                                        _mm_add_pd(_mm_mul_pd(half,uij2),
 465                                                   _mm_mul_pd(prod,uij3)));
 466             t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
 467                                        _mm_mul_pd(rinv,rinv));
 468             t3            = _mm_sub_pd(t3,
 469                                        _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
 470                                                   _mm_add_pd(one,
 471                                                              _mm_mul_pd(sk2_rinv,rinv))));
 472             t1            = _mm_mul_pd(rinv,
 473                                        _mm_add_pd(_mm_mul_pd(dlij,t1),
 474                                                   _mm_add_pd(t2,t3)));
 475
 476             dadx1         = _mm_and_pd(t1,obc_mask1);
 477
 478             /* Evaluate influence of atom ai -> aj */
 479             t1            = _mm_add_pd(r,sk_ai);
 480             t2            = _mm_sub_pd(r,sk_ai);
 481             t3            = _mm_sub_pd(sk_ai,r);
 482             obc_mask1     = _mm_cmplt_pd(raj, t1);
 483             obc_mask2     = _mm_cmplt_pd(raj, t2);
 484             obc_mask3     = _mm_cmplt_pd(raj, t3);
 485
 486             uij           = gmx_mm_inv_pd(t1);
 487             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
 488                                       _mm_andnot_pd(obc_mask2,raj_inv));
 489             dlij          = _mm_and_pd(one,obc_mask2);
 490             uij2          = _mm_mul_pd(uij, uij);
 491             uij3          = _mm_mul_pd(uij2,uij);
 492             lij2          = _mm_mul_pd(lij, lij);
 493             lij3          = _mm_mul_pd(lij2,lij);
 494
 495             diff2         = _mm_sub_pd(uij2,lij2);
 496             lij_inv       = gmx_mm_invsqrt_pd(lij2);
 497             sk2_rinv      = _mm_mul_pd(sk2_ai,rinv);
 498             prod          = _mm_mul_pd(onefourth,sk2_rinv);
 499
 500             logterm       = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));
 501
 502             t1            = _mm_sub_pd(lij,uij);
 503             t2            = _mm_mul_pd(diff2,
 504                                        _mm_sub_pd(_mm_mul_pd(onefourth,r),
 505                                                   prod));
 506             t3            = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
 507             t1            = _mm_add_pd(t1,_mm_add_pd(t2,t3));
 508             t4            = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij));
 509             t4            = _mm_and_pd(t4,obc_mask3);
 510             t1            = _mm_mul_pd(half,_mm_add_pd(t1,t4));
 511
 512             GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_and_pd(t1,obc_mask1));
 513
 514             t1            = _mm_add_pd(_mm_mul_pd(half,lij2),
 515                                        _mm_mul_pd(prod,lij3));
 516             t1            = _mm_sub_pd(t1,
 517                                        _mm_mul_pd(onefourth,
 518                                                   _mm_add_pd(_mm_mul_pd(lij,rinv),
 519                                                              _mm_mul_pd(lij3,r))));
 520             t2            = _mm_mul_pd(onefourth,
 521                                        _mm_add_pd(_mm_mul_pd(uij,rinv),
 522                                                   _mm_mul_pd(uij3,r)));
 523             t2            = _mm_sub_pd(t2,
 524                                        _mm_add_pd(_mm_mul_pd(half,uij2),
 525                                                   _mm_mul_pd(prod,uij3)));
 526             t3            = _mm_mul_pd(_mm_mul_pd(onefourth,logterm),
 527                                        _mm_mul_pd(rinv,rinv));
 528             t3            = _mm_sub_pd(t3,
 529                                        _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
 530                                                   _mm_add_pd(one,
 531                                                              _mm_mul_pd(sk2_rinv,rinv))));
 532             t1            = _mm_mul_pd(rinv,
 533                                        _mm_add_pd(_mm_mul_pd(dlij,t1),
 534                                                   _mm_add_pd(t2,t3)));
 535
 536             dadx2         = _mm_and_pd(t1,obc_mask1);
 537
 538             _mm_store_pd(dadx,dadx1);
 539             dadx += 2;
 540             _mm_store_pd(dadx,dadx2);
 541             dadx += 2;
 542         } /* end normal inner loop */
 543
 544                 if(k<nj1)
 545                 {
 546                         jnrA        = jjnr[k];
 547
 548             j3A         = 3*jnrA;
 549
 550             GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
 551             GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
 552             GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA,sk_aj);
 553
 554             dx    = _mm_sub_sd(ix, jx);
 555                         dy    = _mm_sub_sd(iy, jy);
 556                         dz    = _mm_sub_sd(iz, jz);
 557
 558             rsq         = gmx_mm_calc_rsq_pd(dx,dy,dz);
 559
 560             rinv        = gmx_mm_invsqrt_pd(rsq);
 561             r           = _mm_mul_sd(rsq,rinv);
 562
 563                         /* Compute raj_inv aj1-4 */
 564             raj_inv     = gmx_mm_inv_pd(raj);
 565
 566             /* Evaluate influence of atom aj -> ai */
 567             t1            = _mm_add_sd(r,sk_aj);
 568             t2            = _mm_sub_sd(r,sk_aj);
 569             t3            = _mm_sub_sd(sk_aj,r);
 570             obc_mask1     = _mm_cmplt_sd(rai, t1);
 571             obc_mask2     = _mm_cmplt_sd(rai, t2);
 572             obc_mask3     = _mm_cmplt_sd(rai, t3);
 573
 574             uij           = gmx_mm_inv_pd(t1);
 575             lij           = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
 576                                       _mm_andnot_pd(obc_mask2,rai_inv));
 577             dlij          = _mm_and_pd(one,obc_mask2);
 578             uij2          = _mm_mul_sd(uij, uij);
 579             uij3          = _mm_mul_sd(uij2,uij);
 580             lij2          = _mm_mul_sd(lij, lij);
 581             lij3          = _mm_mul_sd(lij2,lij);
 582
 583             diff2         = _mm_sub_sd(uij2,lij2);
 584             lij_inv       = gmx_mm_invsqrt_pd(lij2);
 585             sk2_aj        = _mm_mul_sd(sk_aj,sk_aj);
 586             sk2_rinv      = _mm_mul_sd(sk2_aj,rinv);
 587             prod          = _mm_mul_sd(onefourth,sk2_rinv);
 588
 589             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
 590
 591             t1            = _mm_sub_sd(lij,uij);
 592             t2            = _mm_mul_sd(diff2,
 593                                        _mm_sub_sd(_mm_mul_pd(onefourth,r),
 594                                                   prod));
 595             t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
 596             t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
 597             t4            = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij));
 598             t4            = _mm_and_pd(t4,obc_mask3);
 599             t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
 600
 601             sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) );
 602
 603             t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
 604                                        _mm_mul_sd(prod,lij3));
 605             t1            = _mm_sub_sd(t1,
 606                                        _mm_mul_sd(onefourth,
 607                                                   _mm_add_sd(_mm_mul_sd(lij,rinv),
 608                                                              _mm_mul_sd(lij3,r))));
 609             t2            = _mm_mul_sd(onefourth,
 610                                        _mm_add_sd(_mm_mul_sd(uij,rinv),
 611                                                   _mm_mul_sd(uij3,r)));
 612             t2            = _mm_sub_sd(t2,
 613                                        _mm_add_sd(_mm_mul_sd(half,uij2),
 614                                                   _mm_mul_sd(prod,uij3)));
 615             t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
 616                                        _mm_mul_sd(rinv,rinv));
 617             t3            = _mm_sub_sd(t3,
 618                                        _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
 619                                                   _mm_add_sd(one,
 620                                                              _mm_mul_sd(sk2_rinv,rinv))));
 621             t1            = _mm_mul_sd(rinv,
 622                                        _mm_add_sd(_mm_mul_sd(dlij,t1),
 623                                                   _mm_add_pd(t2,t3)));
 624
 625             dadx1         = _mm_and_pd(t1,obc_mask1);
 626
 627             /* Evaluate influence of atom ai -> aj */
 628             t1            = _mm_add_sd(r,sk_ai);
 629             t2            = _mm_sub_sd(r,sk_ai);
 630             t3            = _mm_sub_sd(sk_ai,r);
 631             obc_mask1     = _mm_cmplt_sd(raj, t1);
 632             obc_mask2     = _mm_cmplt_sd(raj, t2);
 633             obc_mask3     = _mm_cmplt_sd(raj, t3);
 634
 635             uij           = gmx_mm_inv_pd(t1);
 636             lij           = _mm_or_pd(   _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
 637                                       _mm_andnot_pd(obc_mask2,raj_inv));
 638             dlij          = _mm_and_pd(one,obc_mask2);
 639             uij2          = _mm_mul_sd(uij, uij);
 640             uij3          = _mm_mul_sd(uij2,uij);
 641             lij2          = _mm_mul_sd(lij, lij);
 642             lij3          = _mm_mul_sd(lij2,lij);
 643
 644             diff2         = _mm_sub_sd(uij2,lij2);
 645             lij_inv       = gmx_mm_invsqrt_pd(lij2);
 646             sk2_rinv      = _mm_mul_sd(sk2_ai,rinv);
 647             prod          = _mm_mul_sd(onefourth,sk2_rinv);
 648
 649             logterm       = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));
 650
 651             t1            = _mm_sub_sd(lij,uij);
 652             t2            = _mm_mul_sd(diff2,
 653                                        _mm_sub_sd(_mm_mul_sd(onefourth,r),
 654                                                   prod));
 655             t3            = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
 656             t1            = _mm_add_sd(t1,_mm_add_sd(t2,t3));
 657             t4            = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij));
 658             t4            = _mm_and_pd(t4,obc_mask3);
 659             t1            = _mm_mul_sd(half,_mm_add_sd(t1,t4));
 660
 661             GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_and_pd(t1,obc_mask1));
 662
 663             t1            = _mm_add_sd(_mm_mul_sd(half,lij2),
 664                                        _mm_mul_sd(prod,lij3));
 665             t1            = _mm_sub_sd(t1,
 666                                        _mm_mul_sd(onefourth,
 667                                                   _mm_add_sd(_mm_mul_sd(lij,rinv),
 668                                                              _mm_mul_sd(lij3,r))));
 669             t2            = _mm_mul_sd(onefourth,
 670                                        _mm_add_sd(_mm_mul_sd(uij,rinv),
 671                                                   _mm_mul_sd(uij3,r)));
 672             t2            = _mm_sub_sd(t2,
 673                                        _mm_add_sd(_mm_mul_sd(half,uij2),
 674                                                   _mm_mul_sd(prod,uij3)));
 675             t3            = _mm_mul_sd(_mm_mul_sd(onefourth,logterm),
 676                                        _mm_mul_sd(rinv,rinv));
 677             t3            = _mm_sub_sd(t3,
 678                                        _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
 679                                                   _mm_add_sd(one,
 680                                                              _mm_mul_sd(sk2_rinv,rinv))));
 681             t1            = _mm_mul_sd(rinv,
 682                                        _mm_add_sd(_mm_mul_sd(dlij,t1),
 683                                                   _mm_add_sd(t2,t3)));
 684
 685             dadx2         = _mm_and_pd(t1,obc_mask1);
 686
 687             _mm_store_pd(dadx,dadx1);
 688             dadx += 2;
 689             _mm_store_pd(dadx,dadx2);
 690             dadx += 2;
 691         }
 692         gmx_mm_update_1pot_pd(sum_ai,work+ii);
 693
 694         }
 695
 696         /* Parallel summations */
 697         if(PARTDECOMP(cr))
 698         {
 699                 gmx_sum(natoms, work, cr);
 700         }
 701         else if(DOMAINDECOMP(cr))
 702         {
 703                 dd_atom_sum_real(cr->dd, work);
 704         }
 705
 706     if(gb_algorithm==egbHCT)
 707     {
 708         /* HCT */
 709         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
 710         {
 711                         if(born->use[i] != 0)
 712             {
 713                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
 714                 sum     = 1.0/rr - work[i];
 715                 min_rad = rr + doffset;
 716                 rad     = 1.0/sum;
 717
 718                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
 719                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
 720             }
 721         }
 722
 723         /* Extra communication required for DD */
 724         if(DOMAINDECOMP(cr))
 725         {
 726             dd_atom_spread_real(cr->dd, born->bRad);
 727             dd_atom_spread_real(cr->dd, fr->invsqrta);
 728         }
 729     }
 730     else
 731     {
 732         /* OBC */
 733         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
 734         {
 735                         if(born->use[i] != 0)
 736             {
 737                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
 738                 rr_inv2 = 1.0/rr;
 739                 rr      = rr-doffset;
 740                 rr_inv  = 1.0/rr;
 741                 sum     = rr * work[i];
 742                 sum2    = sum  * sum;
 743                 sum3    = sum2 * sum;
 744
 745                 tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
 746                 born->bRad[i] = rr_inv - tsum*rr_inv2;
 747                 born->bRad[i] = 1.0 / born->bRad[i];
 748
 749                 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
 750
 751                 tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
 752                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
 753             }
 754         }
 755         /* Extra (local) communication required for DD */
 756         if(DOMAINDECOMP(cr))
 757         {
 758             dd_atom_spread_real(cr->dd, born->bRad);
 759             dd_atom_spread_real(cr->dd, fr->invsqrta);
 760             dd_atom_spread_real(cr->dd, born->drobc);
 761         }
 762     }
 763
 764
 765
 766         return 0;
 767 }
 768
 769
 770 int
 771 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
 772                               double *x, double *f, double *fshift, double *shiftvec,
 773                               int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
 774 {
 775         int    i,k,n,ii,jnr,ii3,is3,nj0,nj1,n0,n1;
 776         int        jnrA,jnrB;
 777     int    j3A,j3B;
 778         int *  jjnr;
 779
 780         double   rbi,shX,shY,shZ;
 781         double   *rb;
 782
 783         __m128d ix,iy,iz;
 784         __m128d jx,jy,jz;
 785         __m128d fix,fiy,fiz;
 786         __m128d dx,dy,dz;
 787     __m128d tx,ty,tz;
 788
 789         __m128d rbai,rbaj,f_gb, f_gb_ai;
 790         __m128d xmm1,xmm2,xmm3;
 791
 792         const __m128d two = _mm_set1_pd(2.0);
 793
 794         rb     = born->work;
 795
 796   jjnr   = nl->jjnr;
 797
 798         /* Loop to get the proper form for the Born radius term, sse style */
 799   n0 = 0;
 800   n1 = natoms;
 801
 802         if(gb_algorithm==egbSTILL)
 803         {
 804                 for(i=n0;i<n1;i++)
 805                 {
 806       rbi   = born->bRad[i];
 807                         rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
 808                 }
 809         }
 810         else if(gb_algorithm==egbHCT)
 811         {
 812                 for(i=n0;i<n1;i++)
 813                 {
 814       rbi   = born->bRad[i];
 815                         rb[i] = rbi * rbi * dvda[i];
 816                 }
 817         }
 818         else if(gb_algorithm==egbOBC)
 819         {
 820                 for(i=n0;i<n1;i++)
 821                 {
 822       rbi   = born->bRad[i];
 823                         rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
 824                 }
 825         }
 826
 827   jz = _mm_setzero_pd();
 828
 829   n = j3A = j3B = 0;
 830
 831         for(i=0;i<nl->nri;i++)
 832         {
 833     ii     = nl->iinr[i];
 834                 ii3        = ii*3;
 835     is3    = 3*nl->shift[i];
 836     shX    = shiftvec[is3];
 837     shY    = shiftvec[is3+1];
 838     shZ    = shiftvec[is3+2];
 839     nj0    = nl->jindex[i];
 840     nj1    = nl->jindex[i+1];
 841
 842     ix     = _mm_set1_pd(shX+x[ii3+0]);
 843                 iy     = _mm_set1_pd(shY+x[ii3+1]);
 844                 iz     = _mm_set1_pd(shZ+x[ii3+2]);
 845
 846                 rbai   = _mm_load1_pd(rb+ii);
 847                 fix    = _mm_setzero_pd();
 848                 fiy    = _mm_setzero_pd();
 849                 fiz    = _mm_setzero_pd();
 850
 851
 852     for(k=nj0;k<nj1-1;k+=2)
 853                 {
 854                         jnrA        = jjnr[k];
 855                         jnrB        = jjnr[k+1];
 856
 857       j3A         = 3*jnrA;
 858                         j3B         = 3*jnrB;
 859
 860       GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
 861
 862                         dx          = _mm_sub_pd(ix,jx);
 863                         dy          = _mm_sub_pd(iy,jy);
 864                         dz          = _mm_sub_pd(iz,jz);
 865
 866       GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);
 867
 868                         /* load chain rule terms for j1-4 */
 869                         f_gb        = _mm_load_pd(dadx);
 870                         dadx += 2;
 871                         f_gb_ai     = _mm_load_pd(dadx);
 872                         dadx += 2;
 873
 874       /* calculate scalar force */
 875       f_gb    = _mm_mul_pd(f_gb,rbai);
 876       f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
 877       f_gb    = _mm_add_pd(f_gb,f_gb_ai);
 878
 879       tx     = _mm_mul_pd(f_gb,dx);
 880       ty     = _mm_mul_pd(f_gb,dy);
 881       tz     = _mm_mul_pd(f_gb,dz);
 882
 883       fix    = _mm_add_pd(fix,tx);
 884       fiy    = _mm_add_pd(fiy,ty);
 885       fiz    = _mm_add_pd(fiz,tz);
 886
 887       GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
 888                 }
 889
 890                 /*deal with odd elements */
 891                 if(k<nj1)
 892         {
 893           jnrA        = jjnr[k];
 894           j3A         = 3*jnrA;
 895
 896           GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
 897
 898           dx          = _mm_sub_sd(ix,jx);
 899           dy          = _mm_sub_sd(iy,jy);
 900           dz          = _mm_sub_sd(iz,jz);
 901
 902           GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);
 903
 904           /* load chain rule terms */
 905           f_gb        = _mm_load_pd(dadx);
 906           dadx += 2;
 907           f_gb_ai     = _mm_load_pd(dadx);
 908           dadx += 2;
 909
 910           /* calculate scalar force */
 911           f_gb    = _mm_mul_sd(f_gb,rbai);
 912           f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
 913           f_gb    = _mm_add_sd(f_gb,f_gb_ai);
 914
 915           tx     = _mm_mul_sd(f_gb,dx);
 916           ty     = _mm_mul_sd(f_gb,dy);
 917           tz     = _mm_mul_sd(f_gb,dz);
 918
 919           fix    = _mm_add_sd(fix,tx);
 920           fiy    = _mm_add_sd(fiy,ty);
 921           fiz    = _mm_add_sd(fiz,tz);
 922
 923           GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
 924         }
 925
 926                 /* fix/fiy/fiz now contain four partial force terms, that all should be
 927      * added to the i particle forces and shift forces.
 928      */
 929                 gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,f+ii3,fshift+is3);
 930         }
 931
 932         return 0;
 933 }
 934
 935 #else
 936 /* keep compiler happy */
 937 int genborn_sse2_dummy;
 938
 939 #endif /* SSE2 intrinsics available */
 940