src/gromacs/mdlib/genborn_sse2_single.c

   1 /*
   2  *
   3  *                This source code is part of
   4  *
   5  *                 G   R   O   M   A   C   S
   6  *
   7  *          GROningen MAchine for Chemical Simulations
   8  *
   9  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  10  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  11  * Copyright (c) 2001-2008, The GROMACS development team,
  12  * check out http://www.gromacs.org for more information.
  13
  14  * This program is free software; you can redistribute it and/or
  15  * modify it under the terms of the GNU General Public License
  16  * as published by the Free Software Foundation; either version 2
  17  * of the License, or (at your option) any later version.
  18  *
  19  * If you want to redistribute modifications, please consider that
  20  * scientific software is very special. Version control is crucial -
  21  * bugs must be traceable. We will be happy to consider code for
  22  * inclusion in the official distribution, but derived work must not
  23  * be called official GROMACS. Details are found in the README & COPYING
  24  * files - if they are missing, get the official version at www.gromacs.org.
  25  *
  26  * To help us fund GROMACS development, we humbly ask that you cite
  27  * the papers on the package - you can find them in the top README file.
  28  *
  29  * For more info, check our website at http://www.gromacs.org
  30  *
  31  * And Hey:
  32  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  33  */
  34 #ifdef HAVE_CONFIG_H
  35 #include <config.h>
  36 #endif
  37
  38 #include <math.h>
  39 #include <string.h>
  40
  41 #include "typedefs.h"
  42 #include "smalloc.h"
  43 #include "genborn.h"
  44 #include "vec.h"
  45 #include "grompp.h"
  46 #include "pdbio.h"
  47 #include "names.h"
  48 #include "physics.h"
  49 #include "partdec.h"
  50 #include "domdec.h"
  51 #include "network.h"
  52 #include "gmx_fatal.h"
  53 #include "mtop_util.h"
  54 #include "genborn.h"
  55
  56 #ifdef GMX_LIB_MPI
  57 #include <mpi.h>
  58 #endif
  59 #ifdef GMX_THREAD_MPI
  60 #include "tmpi.h"
  61 #endif
  62
  63
  64 /* Only compile this file if SSE intrinsics are available */
  65 #if ( (defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_SSE2)) && !defined(GMX_DOUBLE) )
  66
  67 #include <gmx_sse2_single.h>
  68 #include <xmmintrin.h>
  69 #include <emmintrin.h>
  70
  71 #include "genborn_sse2_single.h"
  72
  73
  74 int
  75 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
  76                               int natoms, gmx_localtop_t *top,
  77                               const t_atomtypes *atype, float *x, t_nblist *nl,
  78                               gmx_genborn_t *born)
  79 {
  80         int i,k,n,ii,is3,ii3,nj0,nj1,offset;
  81         int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
  82         int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
  83         int shift;
  84     int *mdtype;
  85         real shX,shY,shZ;
  86     int *jjnr;
  87     real *shiftvec;
  88
  89         float gpi_ai,gpi2;
  90         float factor;
  91         float *gb_radius;
  92     float *vsolv;
  93     float *work;
  94     float *dadx;
  95
  96         __m128 ix,iy,iz;
  97         __m128 jx,jy,jz;
  98         __m128 dx,dy,dz;
  99         __m128 tx,ty,tz;
 100         __m128 jxB,jyB,jzB;
 101         __m128 dxB,dyB,dzB;
 102         __m128 txB,tyB,tzB;
 103         __m128 rsq,rinv,rinv2,rinv4,rinv6;
 104         __m128 rsqB,rinvB,rinv2B,rinv4B,rinv6B;
 105         __m128 ratio,gpi,rai,raj,vai,vaj,rvdw;
 106         __m128 ratioB,rajB,vajB,rvdwB;
 107         __m128 ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
 108         __m128 ccfB,dccfB,thetaB,cosqB,termB,sinqB,resB,prodB;
 109         __m128 mask,icf4,icf6,mask_cmp;
 110         __m128 icf4B,icf6B,mask_cmpB;
 111
 112     __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
 113         __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
 114         __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
 115
 116         const __m128 half   = _mm_set1_ps(0.5f);
 117         const __m128 three  = _mm_set1_ps(3.0f);
 118         const __m128 one    = _mm_set1_ps(1.0f);
 119         const __m128 two    = _mm_set1_ps(2.0f);
 120         const __m128 zero   = _mm_set1_ps(0.0f);
 121         const __m128 four   = _mm_set1_ps(4.0f);
 122
 123         const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
 124         const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
 125         const __m128 still_p4     = _mm_set1_ps(STILL_P4);
 126
 127         factor  = 0.5 * ONE_4PI_EPS0;
 128
 129     gb_radius = born->gb_radius;
 130     vsolv     = born->vsolv;
 131     work      = born->gpol_still_work;
 132         jjnr      = nl->jjnr;
 133     shiftvec  = fr->shift_vec[0];
 134     dadx      = fr->dadx;
 135
 136         jnrA = jnrB = jnrC = jnrD = 0;
 137     jx = _mm_setzero_ps();
 138     jy = _mm_setzero_ps();
 139     jz = _mm_setzero_ps();
 140
 141         n = 0;
 142
 143         for(i=0;i<natoms;i++)
 144         {
 145                 work[i]=0;
 146         }
 147
 148         for(i=0;i<nl->nri;i++)
 149         {
 150         ii     = nl->iinr[i];
 151                 ii3        = ii*3;
 152         is3    = 3*nl->shift[i];
 153         shX    = shiftvec[is3];
 154         shY    = shiftvec[is3+1];
 155         shZ    = shiftvec[is3+2];
 156         nj0    = nl->jindex[i];
 157         nj1    = nl->jindex[i+1];
 158
 159         ix     = _mm_set1_ps(shX+x[ii3+0]);
 160                 iy     = _mm_set1_ps(shY+x[ii3+1]);
 161                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
 162
 163                 offset = (nj1-nj0)%4;
 164
 165                 /* Polarization energy for atom ai */
 166                 gpi    = _mm_setzero_ps();
 167
 168         rai     = _mm_load1_ps(gb_radius+ii);
 169         prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
 170
 171                 for(k=nj0;k<nj1-4-offset;k+=8)
 172                 {
 173                         jnrA        = jjnr[k];
 174                         jnrB        = jjnr[k+1];
 175                         jnrC        = jjnr[k+2];
 176                         jnrD        = jjnr[k+3];
 177                         jnrE        = jjnr[k+4];
 178                         jnrF        = jjnr[k+5];
 179                         jnrG        = jjnr[k+6];
 180                         jnrH        = jjnr[k+7];
 181
 182             j3A         = 3*jnrA;
 183                         j3B         = 3*jnrB;
 184                         j3C         = 3*jnrC;
 185                         j3D         = 3*jnrD;
 186             j3E         = 3*jnrE;
 187                         j3F         = 3*jnrF;
 188                         j3G         = 3*jnrG;
 189                         j3H         = 3*jnrH;
 190
 191             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
 192             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
 193
 194             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
 195             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
 196                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
 197                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE,vsolv+jnrF,vsolv+jnrG,vsolv+jnrH,vajB);
 198
 199                         dx          = _mm_sub_ps(ix,jx);
 200                         dy          = _mm_sub_ps(iy,jy);
 201                         dz          = _mm_sub_ps(iz,jz);
 202                         dxB         = _mm_sub_ps(ix,jxB);
 203                         dyB         = _mm_sub_ps(iy,jyB);
 204                         dzB         = _mm_sub_ps(iz,jzB);
 205
 206             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
 207             rsqB        = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
 208             rinv        = gmx_mm_invsqrt_ps(rsq);
 209             rinvB       = gmx_mm_invsqrt_ps(rsqB);
 210             rinv2       = _mm_mul_ps(rinv,rinv);
 211             rinv2B      = _mm_mul_ps(rinvB,rinvB);
 212             rinv4       = _mm_mul_ps(rinv2,rinv2);
 213             rinv4B      = _mm_mul_ps(rinv2B,rinv2B);
 214             rinv6       = _mm_mul_ps(rinv4,rinv2);
 215             rinv6B      = _mm_mul_ps(rinv4B,rinv2B);
 216
 217             rvdw        = _mm_add_ps(rai,raj);
 218             rvdwB       = _mm_add_ps(rai,rajB);
 219             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
 220             ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB,rvdwB)));
 221
 222             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
 223             mask_cmpB   = _mm_cmple_ps(ratioB,still_p5inv);
 224
 225             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
 226             if( 0 == _mm_movemask_ps(mask_cmp) )
 227             {
 228                 /* if ratio>still_p5inv for ALL elements */
 229                 ccf         = one;
 230                 dccf        = _mm_setzero_ps();
 231             }
 232             else
 233             {
 234                 ratio       = _mm_min_ps(ratio,still_p5inv);
 235                 theta       = _mm_mul_ps(ratio,still_pip5);
 236                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
 237                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
 238                 ccf         = _mm_mul_ps(term,term);
 239                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
 240                                          _mm_mul_ps(sinq,theta));
 241             }
 242             if( 0 == _mm_movemask_ps(mask_cmpB) )
 243             {
 244                 /* if ratio>still_p5inv for ALL elements */
 245                 ccfB        = one;
 246                 dccfB       = _mm_setzero_ps();
 247             }
 248             else
 249             {
 250                 ratioB      = _mm_min_ps(ratioB,still_p5inv);
 251                 thetaB      = _mm_mul_ps(ratioB,still_pip5);
 252                 gmx_mm_sincos_ps(thetaB,&sinqB,&cosqB);
 253                 termB       = _mm_mul_ps(half,_mm_sub_ps(one,cosqB));
 254                 ccfB        = _mm_mul_ps(termB,termB);
 255                 dccfB       = _mm_mul_ps(_mm_mul_ps(two,termB),
 256                                          _mm_mul_ps(sinqB,thetaB));
 257             }
 258
 259             prod        = _mm_mul_ps(still_p4,vaj);
 260             prodB       = _mm_mul_ps(still_p4,vajB);
 261             icf4        = _mm_mul_ps(ccf,rinv4);
 262             icf4B       = _mm_mul_ps(ccfB,rinv4B);
 263             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
 264             icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccfB),dccfB), rinv6B);
 265
 266             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
 267             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_mul_ps(prod_ai,icf4B));
 268
 269             gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod,icf4) , _mm_mul_ps(prodB,icf4B) ) );
 270
 271             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
 272             dadx+=4;
 273             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
 274             dadx+=4;
 275             _mm_store_ps(dadx,_mm_mul_ps(prodB,icf6B));
 276             dadx+=4;
 277             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6B));
 278             dadx+=4;
 279                 }
 280
 281         for(;k<nj1-offset;k+=4)
 282                 {
 283                         jnrA        = jjnr[k];
 284                         jnrB        = jjnr[k+1];
 285                         jnrC        = jjnr[k+2];
 286                         jnrD        = jjnr[k+3];
 287
 288             j3A         = 3*jnrA;
 289                         j3B         = 3*jnrB;
 290                         j3C         = 3*jnrC;
 291                         j3D         = 3*jnrD;
 292
 293             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
 294
 295             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
 296                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
 297
 298                         dx          = _mm_sub_ps(ix,jx);
 299                         dy          = _mm_sub_ps(iy,jy);
 300                         dz          = _mm_sub_ps(iz,jz);
 301
 302             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
 303             rinv        = gmx_mm_invsqrt_ps(rsq);
 304             rinv2       = _mm_mul_ps(rinv,rinv);
 305             rinv4       = _mm_mul_ps(rinv2,rinv2);
 306             rinv6       = _mm_mul_ps(rinv4,rinv2);
 307
 308             rvdw        = _mm_add_ps(rai,raj);
 309             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
 310
 311             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
 312
 313             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
 314             if(0 == _mm_movemask_ps(mask_cmp))
 315             {
 316                 /* if ratio>still_p5inv for ALL elements */
 317                 ccf         = one;
 318                 dccf        = _mm_setzero_ps();
 319             }
 320             else
 321             {
 322                 ratio       = _mm_min_ps(ratio,still_p5inv);
 323                 theta       = _mm_mul_ps(ratio,still_pip5);
 324                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
 325                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
 326                 ccf         = _mm_mul_ps(term,term);
 327                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
 328                                          _mm_mul_ps(sinq,theta));
 329             }
 330
 331             prod        = _mm_mul_ps(still_p4,vaj);
 332             icf4        = _mm_mul_ps(ccf,rinv4);
 333             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
 334
 335             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
 336
 337             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
 338
 339             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
 340             dadx+=4;
 341             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
 342             dadx+=4;
 343                 }
 344
 345         if(offset!=0)
 346         {
 347             if(offset==1)
 348             {
 349                 jnrA        = jjnr[k];
 350                 j3A         = 3*jnrA;
 351                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
 352                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
 353                 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA,vaj);
 354                 mask        = mask1;
 355             }
 356             else if(offset==2)
 357             {
 358                 jnrA        = jjnr[k];
 359                 jnrB        = jjnr[k+1];
 360                 j3A         = 3*jnrA;
 361                 j3B         = 3*jnrB;
 362                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
 363                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
 364                 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA,vsolv+jnrB,vaj);
 365                 mask        = mask2;
 366             }
 367             else
 368             {
 369                 /* offset must be 3 */
 370                 jnrA        = jjnr[k];
 371                 jnrB        = jjnr[k+1];
 372                 jnrC        = jjnr[k+2];
 373                 j3A         = 3*jnrA;
 374                 j3B         = 3*jnrB;
 375                 j3C         = 3*jnrC;
 376                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
 377                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
 378                 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vaj);
 379                 mask        = mask3;
 380             }
 381
 382                         dx          = _mm_sub_ps(ix,jx);
 383                         dy          = _mm_sub_ps(iy,jy);
 384                         dz          = _mm_sub_ps(iz,jz);
 385
 386             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
 387             rinv        = gmx_mm_invsqrt_ps(rsq);
 388             rinv2       = _mm_mul_ps(rinv,rinv);
 389             rinv4       = _mm_mul_ps(rinv2,rinv2);
 390             rinv6       = _mm_mul_ps(rinv4,rinv2);
 391
 392             rvdw        = _mm_add_ps(rai,raj);
 393             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
 394
 395             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
 396
 397             if(0 == _mm_movemask_ps(mask_cmp))
 398             {
 399                 /* if ratio>still_p5inv for ALL elements */
 400                 ccf         = one;
 401                 dccf        = _mm_setzero_ps();
 402             }
 403             else
 404             {
 405                 ratio       = _mm_min_ps(ratio,still_p5inv);
 406                 theta       = _mm_mul_ps(ratio,still_pip5);
 407                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
 408                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
 409                 ccf         = _mm_mul_ps(term,term);
 410                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
 411                                          _mm_mul_ps(sinq,theta));
 412             }
 413
 414             prod        = _mm_mul_ps(still_p4,vaj);
 415             icf4        = _mm_mul_ps(ccf,rinv4);
 416             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
 417
 418             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
 419
 420             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
 421             dadx+=4;
 422             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
 423             dadx+=4;
 424
 425             tmp = _mm_mul_ps(prod_ai,icf4);
 426
 427             if(offset==1)
 428             {
 429                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
 430             }
 431             else if(offset==2)
 432             {
 433                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
 434             }
 435             else
 436             {
 437                 /* offset must be 3 */
 438                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
 439             }
 440         }
 441         GMX_MM_UPDATE_1POT_PS(gpi,work+ii);
 442         }
 443
 444         /* Sum up the polarization energy from other nodes */
 445         if(PARTDECOMP(cr))
 446         {
 447                 gmx_sum(natoms, work, cr);
 448         }
 449         else if(DOMAINDECOMP(cr))
 450         {
 451                 dd_atom_sum_real(cr->dd, work);
 452         }
 453
 454         /* Compute the radii */
 455         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
 456         {
 457                 if(born->use[i] != 0)
 458                 {
 459                         gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
 460                         gpi2             = gpi_ai * gpi_ai;
 461                         born->bRad[i]   = factor*gmx_invsqrt(gpi2);
 462                         fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
 463                 }
 464         }
 465
 466         /* Extra (local) communication required for DD */
 467         if(DOMAINDECOMP(cr))
 468         {
 469                 dd_atom_spread_real(cr->dd, born->bRad);
 470                 dd_atom_spread_real(cr->dd, fr->invsqrta);
 471         }
 472
 473         return 0;
 474 }
 475
 476
 477 int
 478 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
 479                                 const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
 480 {
 481         int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
 482     int jnrA,jnrB,jnrC,jnrD;
 483     int j3A,j3B,j3C,j3D;
 484     int jnrE,jnrF,jnrG,jnrH;
 485     int j3E,j3F,j3G,j3H;
 486         float shX,shY,shZ;
 487         float rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
 488         float sum_ai2, sum_ai3,tsum,tchain,doffset;
 489         float *obc_param;
 490     float *gb_radius;
 491     float *work;
 492     int *  jjnr;
 493     float *dadx;
 494     float *shiftvec;
 495     float min_rad,rad;
 496
 497         __m128 ix,iy,iz,jx,jy,jz;
 498         __m128 dx,dy,dz,t1,t2,t3,t4;
 499         __m128 rsq,rinv,r;
 500         __m128 rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
 501         __m128 uij,lij2,uij2,lij3,uij3,diff2;
 502         __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
 503         __m128 sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
 504         __m128 dadx1,dadx2;
 505     __m128 logterm;
 506         __m128 mask;
 507         __m128 obc_mask1,obc_mask2,obc_mask3;
 508     __m128 jxB,jyB,jzB,t1B,t2B,t3B,t4B;
 509     __m128 dxB,dyB,dzB,rsqB,rinvB,rB;
 510         __m128 rajB, raj_invB,rai_inv2B,sk2B,lijB,dlijB,duijB;
 511         __m128 uijB,lij2B,uij2B,lij3B,uij3B,diff2B;
 512         __m128 lij_invB,sk2_invB,prodB;
 513         __m128 sk_ajB,sk2_ajB,sk2_rinvB;
 514         __m128 dadx1B,dadx2B;
 515     __m128 logtermB;
 516     __m128 obc_mask1B,obc_mask2B,obc_mask3B;
 517
 518     __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
 519         __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
 520         __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
 521
 522     __m128 oneeighth   = _mm_set1_ps(0.125);
 523     __m128 onefourth   = _mm_set1_ps(0.25);
 524
 525         const __m128 half  = _mm_set1_ps(0.5f);
 526         const __m128 three = _mm_set1_ps(3.0f);
 527         const __m128 one   = _mm_set1_ps(1.0f);
 528         const __m128 two   = _mm_set1_ps(2.0f);
 529         const __m128 zero  = _mm_set1_ps(0.0f);
 530         const __m128 neg   = _mm_set1_ps(-1.0f);
 531
 532         /* Set the dielectric offset */
 533         doffset   = born->gb_doffset;
 534         gb_radius = born->gb_radius;
 535     obc_param = born->param;
 536     work      = born->gpol_hct_work;
 537     jjnr      = nl->jjnr;
 538     dadx      = fr->dadx;
 539     shiftvec  = fr->shift_vec[0];
 540
 541     jx        = _mm_setzero_ps();
 542     jy        = _mm_setzero_ps();
 543     jz        = _mm_setzero_ps();
 544
 545     jnrA = jnrB = jnrC = jnrD = 0;
 546
 547         for(i=0;i<born->nr;i++)
 548         {
 549                 work[i] = 0;
 550         }
 551
 552         for(i=0;i<nl->nri;i++)
 553         {
 554         ii     = nl->iinr[i];
 555                 ii3        = ii*3;
 556         is3    = 3*nl->shift[i];
 557         shX    = shiftvec[is3];
 558         shY    = shiftvec[is3+1];
 559         shZ    = shiftvec[is3+2];
 560         nj0    = nl->jindex[i];
 561         nj1    = nl->jindex[i+1];
 562
 563         ix     = _mm_set1_ps(shX+x[ii3+0]);
 564                 iy     = _mm_set1_ps(shY+x[ii3+1]);
 565                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
 566
 567                 offset = (nj1-nj0)%4;
 568
 569                 rai    = _mm_load1_ps(gb_radius+ii);
 570                 rai_inv= gmx_mm_inv_ps(rai);
 571
 572                 sum_ai = _mm_setzero_ps();
 573
 574                 sk_ai  = _mm_load1_ps(born->param+ii);
 575                 sk2_ai = _mm_mul_ps(sk_ai,sk_ai);
 576
 577                 for(k=nj0;k<nj1-4-offset;k+=8)
 578                 {
 579                         jnrA        = jjnr[k];
 580                         jnrB        = jjnr[k+1];
 581                         jnrC        = jjnr[k+2];
 582                         jnrD        = jjnr[k+3];
 583                         jnrE        = jjnr[k+4];
 584                         jnrF        = jjnr[k+5];
 585                         jnrG        = jjnr[k+6];
 586                         jnrH        = jjnr[k+7];
 587
 588             j3A         = 3*jnrA;
 589                         j3B         = 3*jnrB;
 590                         j3C         = 3*jnrC;
 591                         j3D         = 3*jnrD;
 592             j3E         = 3*jnrE;
 593                         j3F         = 3*jnrF;
 594                         j3G         = 3*jnrG;
 595                         j3H         = 3*jnrH;
 596
 597             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
 598             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
 599             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
 600             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
 601             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
 602             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE,obc_param+jnrF,obc_param+jnrG,obc_param+jnrH,sk_ajB);
 603
 604             dx    = _mm_sub_ps(ix, jx);
 605                         dy    = _mm_sub_ps(iy, jy);
 606                         dz    = _mm_sub_ps(iz, jz);
 607             dxB   = _mm_sub_ps(ix, jxB);
 608                         dyB   = _mm_sub_ps(iy, jyB);
 609                         dzB   = _mm_sub_ps(iz, jzB);
 610
 611             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
 612             rsqB        = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
 613
 614             rinv        = gmx_mm_invsqrt_ps(rsq);
 615             r           = _mm_mul_ps(rsq,rinv);
 616             rinvB       = gmx_mm_invsqrt_ps(rsqB);
 617             rB          = _mm_mul_ps(rsqB,rinvB);
 618
 619                         /* Compute raj_inv aj1-4 */
 620             raj_inv     = gmx_mm_inv_ps(raj);
 621             raj_invB    = gmx_mm_inv_ps(rajB);
 622
 623             /* Evaluate influence of atom aj -> ai */
 624             t1            = _mm_add_ps(r,sk_aj);
 625             t2            = _mm_sub_ps(r,sk_aj);
 626             t3            = _mm_sub_ps(sk_aj,r);
 627             t1B           = _mm_add_ps(rB,sk_ajB);
 628             t2B           = _mm_sub_ps(rB,sk_ajB);
 629             t3B           = _mm_sub_ps(sk_ajB,rB);
 630             obc_mask1     = _mm_cmplt_ps(rai, t1);
 631             obc_mask2     = _mm_cmplt_ps(rai, t2);
 632             obc_mask3     = _mm_cmplt_ps(rai, t3);
 633             obc_mask1B    = _mm_cmplt_ps(rai, t1B);
 634             obc_mask2B    = _mm_cmplt_ps(rai, t2B);
 635             obc_mask3B    = _mm_cmplt_ps(rai, t3B);
 636
 637             uij           = gmx_mm_inv_ps(t1);
 638             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
 639                                       _mm_andnot_ps(obc_mask2,rai_inv));
 640             dlij          = _mm_and_ps(one,obc_mask2);
 641             uij2          = _mm_mul_ps(uij, uij);
 642             uij3          = _mm_mul_ps(uij2,uij);
 643             lij2          = _mm_mul_ps(lij, lij);
 644             lij3          = _mm_mul_ps(lij2,lij);
 645
 646             uijB          = gmx_mm_inv_ps(t1B);
 647             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
 648                                       _mm_andnot_ps(obc_mask2B,rai_inv));
 649             dlijB         = _mm_and_ps(one,obc_mask2B);
 650             uij2B         = _mm_mul_ps(uijB, uijB);
 651             uij3B         = _mm_mul_ps(uij2B,uijB);
 652             lij2B         = _mm_mul_ps(lijB, lijB);
 653             lij3B         = _mm_mul_ps(lij2B,lijB);
 654
 655             diff2         = _mm_sub_ps(uij2,lij2);
 656             lij_inv       = gmx_mm_invsqrt_ps(lij2);
 657             sk2_aj        = _mm_mul_ps(sk_aj,sk_aj);
 658             sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
 659             prod          = _mm_mul_ps(onefourth,sk2_rinv);
 660
 661             diff2B        = _mm_sub_ps(uij2B,lij2B);
 662             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
 663             sk2_ajB       = _mm_mul_ps(sk_ajB,sk_ajB);
 664             sk2_rinvB     = _mm_mul_ps(sk2_ajB,rinvB);
 665             prodB         = _mm_mul_ps(onefourth,sk2_rinvB);
 666
 667             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
 668             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
 669
 670             t1            = _mm_sub_ps(lij,uij);
 671             t2            = _mm_mul_ps(diff2,
 672                                        _mm_sub_ps(_mm_mul_ps(onefourth,r),
 673                                                   prod));
 674             t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
 675             t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
 676             t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
 677             t4            = _mm_and_ps(t4,obc_mask3);
 678             t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
 679
 680             t1B           = _mm_sub_ps(lijB,uijB);
 681             t2B           = _mm_mul_ps(diff2B,
 682                                        _mm_sub_ps(_mm_mul_ps(onefourth,rB),
 683                                                   prodB));
 684             t3B           = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
 685             t1B           = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
 686             t4B           = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lijB));
 687             t4B           = _mm_and_ps(t4B,obc_mask3B);
 688             t1B           = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
 689
 690             sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1,obc_mask1), _mm_and_ps(t1B,obc_mask1B) ));
 691
 692             t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
 693                                        _mm_mul_ps(prod,lij3));
 694             t1            = _mm_sub_ps(t1,
 695                                        _mm_mul_ps(onefourth,
 696                                                   _mm_add_ps(_mm_mul_ps(lij,rinv),
 697                                                              _mm_mul_ps(lij3,r))));
 698             t2            = _mm_mul_ps(onefourth,
 699                                        _mm_add_ps(_mm_mul_ps(uij,rinv),
 700                                                   _mm_mul_ps(uij3,r)));
 701             t2            = _mm_sub_ps(t2,
 702                                        _mm_add_ps(_mm_mul_ps(half,uij2),
 703                                                       _mm_mul_ps(prod,uij3)));
 704             t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
 705                                        _mm_mul_ps(rinv,rinv));
 706             t3            = _mm_sub_ps(t3,
 707                                        _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
 708                                                   _mm_add_ps(one,
 709                                                              _mm_mul_ps(sk2_rinv,rinv))));
 710             t1            = _mm_mul_ps(rinv,
 711                                        _mm_add_ps(_mm_mul_ps(dlij,t1),
 712                                                   _mm_add_ps(t2,t3)));
 713
 714
 715
 716             t1B           = _mm_add_ps(_mm_mul_ps(half,lij2B),
 717                                        _mm_mul_ps(prodB,lij3B));
 718             t1B           = _mm_sub_ps(t1B,
 719                                        _mm_mul_ps(onefourth,
 720                                                   _mm_add_ps(_mm_mul_ps(lijB,rinvB),
 721                                                              _mm_mul_ps(lij3B,rB))));
 722             t2B           = _mm_mul_ps(onefourth,
 723                                        _mm_add_ps(_mm_mul_ps(uijB,rinvB),
 724                                                   _mm_mul_ps(uij3B,rB)));
 725             t2B           = _mm_sub_ps(t2B,
 726                                        _mm_add_ps(_mm_mul_ps(half,uij2B),
 727                                                   _mm_mul_ps(prodB,uij3B)));
 728             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
 729                                        _mm_mul_ps(rinvB,rinvB));
 730             t3B           = _mm_sub_ps(t3B,
 731                                        _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
 732                                                   _mm_add_ps(one,
 733                                                              _mm_mul_ps(sk2_rinvB,rinvB))));
 734             t1B           = _mm_mul_ps(rinvB,
 735                                        _mm_add_ps(_mm_mul_ps(dlijB,t1B),
 736                                                   _mm_add_ps(t2B,t3B)));
 737
 738             dadx1         = _mm_and_ps(t1,obc_mask1);
 739             dadx1B        = _mm_and_ps(t1B,obc_mask1B);
 740
 741
 742             /* Evaluate influence of atom ai -> aj */
 743             t1            = _mm_add_ps(r,sk_ai);
 744             t2            = _mm_sub_ps(r,sk_ai);
 745             t3            = _mm_sub_ps(sk_ai,r);
 746             t1B           = _mm_add_ps(rB,sk_ai);
 747             t2B           = _mm_sub_ps(rB,sk_ai);
 748             t3B           = _mm_sub_ps(sk_ai,rB);
 749             obc_mask1     = _mm_cmplt_ps(raj, t1);
 750             obc_mask2     = _mm_cmplt_ps(raj, t2);
 751             obc_mask3     = _mm_cmplt_ps(raj, t3);
 752             obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
 753             obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
 754             obc_mask3B    = _mm_cmplt_ps(rajB, t3B);
 755
 756             uij           = gmx_mm_inv_ps(t1);
 757             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
 758                                       _mm_andnot_ps(obc_mask2,raj_inv));
 759             dlij          = _mm_and_ps(one,obc_mask2);
 760             uij2          = _mm_mul_ps(uij, uij);
 761             uij3          = _mm_mul_ps(uij2,uij);
 762             lij2          = _mm_mul_ps(lij, lij);
 763             lij3          = _mm_mul_ps(lij2,lij);
 764
 765             uijB          = gmx_mm_inv_ps(t1B);
 766             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
 767                                       _mm_andnot_ps(obc_mask2B,raj_invB));
 768             dlijB         = _mm_and_ps(one,obc_mask2B);
 769             uij2B         = _mm_mul_ps(uijB, uijB);
 770             uij3B         = _mm_mul_ps(uij2B,uijB);
 771             lij2B         = _mm_mul_ps(lijB, lijB);
 772             lij3B         = _mm_mul_ps(lij2B,lijB);
 773
 774             diff2         = _mm_sub_ps(uij2,lij2);
 775             lij_inv       = gmx_mm_invsqrt_ps(lij2);
 776             sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
 777             prod          = _mm_mul_ps(onefourth,sk2_rinv);
 778
 779             diff2B        = _mm_sub_ps(uij2B,lij2B);
 780             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
 781             sk2_rinvB     = _mm_mul_ps(sk2_ai,rinvB);
 782             prodB         = _mm_mul_ps(onefourth,sk2_rinvB);
 783
 784             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
 785             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
 786
 787             t1            = _mm_sub_ps(lij,uij);
 788             t2            = _mm_mul_ps(diff2,
 789                                        _mm_sub_ps(_mm_mul_ps(onefourth,r),
 790                                                   prod));
 791             t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
 792             t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
 793             t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
 794             t4            = _mm_and_ps(t4,obc_mask3);
 795             t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
 796
 797             t1B           = _mm_sub_ps(lijB,uijB);
 798             t2B           = _mm_mul_ps(diff2B,
 799                                        _mm_sub_ps(_mm_mul_ps(onefourth,rB),
 800                                                   prodB));
 801             t3B           = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
 802             t1B           = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
 803             t4B           = _mm_mul_ps(two,_mm_sub_ps(raj_invB,lijB));
 804             t4B           = _mm_and_ps(t4B,obc_mask3B);
 805             t1B           = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
 806
 807             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
 808             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_and_ps(t1B,obc_mask1B));
 809
 810             t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
 811                                        _mm_mul_ps(prod,lij3));
 812             t1            = _mm_sub_ps(t1,
 813                                        _mm_mul_ps(onefourth,
 814                                                   _mm_add_ps(_mm_mul_ps(lij,rinv),
 815                                                              _mm_mul_ps(lij3,r))));
 816             t2            = _mm_mul_ps(onefourth,
 817                                        _mm_add_ps(_mm_mul_ps(uij,rinv),
 818                                                   _mm_mul_ps(uij3,r)));
 819             t2            = _mm_sub_ps(t2,
 820                                        _mm_add_ps(_mm_mul_ps(half,uij2),
 821                                                   _mm_mul_ps(prod,uij3)));
 822             t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
 823                                        _mm_mul_ps(rinv,rinv));
 824             t3            = _mm_sub_ps(t3,
 825                                        _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
 826                                                   _mm_add_ps(one,
 827                                                              _mm_mul_ps(sk2_rinv,rinv))));
 828             t1            = _mm_mul_ps(rinv,
 829                                        _mm_add_ps(_mm_mul_ps(dlij,t1),
 830                                                   _mm_add_ps(t2,t3)));
 831
 832
 833             t1B           = _mm_add_ps(_mm_mul_ps(half,lij2B),
 834                                        _mm_mul_ps(prodB,lij3B));
 835             t1B           = _mm_sub_ps(t1B,
 836                                        _mm_mul_ps(onefourth,
 837                                                   _mm_add_ps(_mm_mul_ps(lijB,rinvB),
 838                                                              _mm_mul_ps(lij3B,rB))));
 839             t2B           = _mm_mul_ps(onefourth,
 840                                        _mm_add_ps(_mm_mul_ps(uijB,rinvB),
 841                                                   _mm_mul_ps(uij3B,rB)));
 842             t2B           = _mm_sub_ps(t2B,
 843                                        _mm_add_ps(_mm_mul_ps(half,uij2B),
 844                                                   _mm_mul_ps(prodB,uij3B)));
 845             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
 846                                        _mm_mul_ps(rinvB,rinvB));
 847             t3B           = _mm_sub_ps(t3B,
 848                                        _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
 849                                                   _mm_add_ps(one,
 850                                                              _mm_mul_ps(sk2_rinvB,rinvB))));
 851             t1B           = _mm_mul_ps(rinvB,
 852                                        _mm_add_ps(_mm_mul_ps(dlijB,t1B),
 853                                                   _mm_add_ps(t2B,t3B)));
 854
 855
 856             dadx2         = _mm_and_ps(t1,obc_mask1);
 857             dadx2B        = _mm_and_ps(t1B,obc_mask1B);
 858
 859             _mm_store_ps(dadx,dadx1);
 860             dadx += 4;
 861             _mm_store_ps(dadx,dadx2);
 862             dadx += 4;
 863             _mm_store_ps(dadx,dadx1B);
 864             dadx += 4;
 865             _mm_store_ps(dadx,dadx2B);
 866             dadx += 4;
 867
 868         } /* end normal inner loop */
 869
 870                 for(;k<nj1-offset;k+=4)
 871                 {
 872                         jnrA        = jjnr[k];
 873                         jnrB        = jjnr[k+1];
 874                         jnrC        = jjnr[k+2];
 875                         jnrD        = jjnr[k+3];
 876
 877             j3A         = 3*jnrA;
 878                         j3B         = 3*jnrB;
 879                         j3C         = 3*jnrC;
 880                         j3D         = 3*jnrD;
 881
 882             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
 883             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
 884             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
 885
 886             dx    = _mm_sub_ps(ix, jx);
 887                         dy    = _mm_sub_ps(iy, jy);
 888                         dz    = _mm_sub_ps(iz, jz);
 889
 890             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
 891
 892             rinv        = gmx_mm_invsqrt_ps(rsq);
 893             r           = _mm_mul_ps(rsq,rinv);
 894
 895                         /* Compute raj_inv aj1-4 */
 896             raj_inv     = gmx_mm_inv_ps(raj);
 897
 898             /* Evaluate influence of atom aj -> ai */
 899             t1            = _mm_add_ps(r,sk_aj);
 900             obc_mask1     = _mm_cmplt_ps(rai, t1);
 901
 902             if(_mm_movemask_ps(obc_mask1))
 903             {
 904                 /* If any of the elements has rai<dr+sk, this is executed */
 905                 t2            = _mm_sub_ps(r,sk_aj);
 906                 t3            = _mm_sub_ps(sk_aj,r);
 907
 908                 obc_mask2     = _mm_cmplt_ps(rai, t2);
 909                 obc_mask3     = _mm_cmplt_ps(rai, t3);
 910
 911                 uij           = gmx_mm_inv_ps(t1);
 912                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
 913                                           _mm_andnot_ps(obc_mask2,rai_inv));
 914                 dlij          = _mm_and_ps(one,obc_mask2);
 915                 uij2          = _mm_mul_ps(uij, uij);
 916                 uij3          = _mm_mul_ps(uij2,uij);
 917                 lij2          = _mm_mul_ps(lij, lij);
 918                 lij3          = _mm_mul_ps(lij2,lij);
 919                 diff2         = _mm_sub_ps(uij2,lij2);
 920                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
 921                 sk2_aj        = _mm_mul_ps(sk_aj,sk_aj);
 922                 sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
 923                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
 924                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
 925                 t1            = _mm_sub_ps(lij,uij);
 926                 t2            = _mm_mul_ps(diff2,
 927                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
 928                                                       prod));
 929                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
 930                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
 931                 t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
 932                 t4            = _mm_and_ps(t4,obc_mask3);
 933                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
 934                 sum_ai        = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
 935                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
 936                                            _mm_mul_ps(prod,lij3));
 937                 t1            = _mm_sub_ps(t1,
 938                                            _mm_mul_ps(onefourth,
 939                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
 940                                                                  _mm_mul_ps(lij3,r))));
 941                 t2            = _mm_mul_ps(onefourth,
 942                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
 943                                                       _mm_mul_ps(uij3,r)));
 944                 t2            = _mm_sub_ps(t2,
 945                                            _mm_add_ps(_mm_mul_ps(half,uij2),
 946                                                       _mm_mul_ps(prod,uij3)));
 947                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
 948                                            _mm_mul_ps(rinv,rinv));
 949                 t3            = _mm_sub_ps(t3,
 950                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
 951                                                       _mm_add_ps(one,
 952                                                                  _mm_mul_ps(sk2_rinv,rinv))));
 953                 t1            = _mm_mul_ps(rinv,
 954                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
 955                                                       _mm_add_ps(t2,t3)));
 956
 957                 dadx1         = _mm_and_ps(t1,obc_mask1);
 958             }
 959             else
 960             {
 961                 dadx1         = _mm_setzero_ps();
 962             }
 963
 964             /* Evaluate influence of atom ai -> aj */
 965             t1            = _mm_add_ps(r,sk_ai);
 966             obc_mask1     = _mm_cmplt_ps(raj, t1);
 967
 968             if(_mm_movemask_ps(obc_mask1))
 969             {
 970                 t2            = _mm_sub_ps(r,sk_ai);
 971                 t3            = _mm_sub_ps(sk_ai,r);
 972                 obc_mask2     = _mm_cmplt_ps(raj, t2);
 973                 obc_mask3     = _mm_cmplt_ps(raj, t3);
 974
 975                 uij           = gmx_mm_inv_ps(t1);
 976                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
 977                                           _mm_andnot_ps(obc_mask2,raj_inv));
 978                 dlij          = _mm_and_ps(one,obc_mask2);
 979                 uij2          = _mm_mul_ps(uij, uij);
 980                 uij3          = _mm_mul_ps(uij2,uij);
 981                 lij2          = _mm_mul_ps(lij, lij);
 982                 lij3          = _mm_mul_ps(lij2,lij);
 983                 diff2         = _mm_sub_ps(uij2,lij2);
 984                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
 985                 sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
 986                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
 987                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
 988                 t1            = _mm_sub_ps(lij,uij);
 989                 t2            = _mm_mul_ps(diff2,
 990                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
 991                                                       prod));
 992                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
 993                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
 994                 t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
 995                 t4            = _mm_and_ps(t4,obc_mask3);
 996                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
 997
 998                 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
 999
1000                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1001                                            _mm_mul_ps(prod,lij3));
1002                 t1            = _mm_sub_ps(t1,
1003                                            _mm_mul_ps(onefourth,
1004                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1005                                                                  _mm_mul_ps(lij3,r))));
1006                 t2            = _mm_mul_ps(onefourth,
1007                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1008                                                       _mm_mul_ps(uij3,r)));
1009                 t2            = _mm_sub_ps(t2,
1010                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1011                                                       _mm_mul_ps(prod,uij3)));
1012                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1013                                            _mm_mul_ps(rinv,rinv));
1014                 t3            = _mm_sub_ps(t3,
1015                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1016                                                       _mm_add_ps(one,
1017                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1018                 t1            = _mm_mul_ps(rinv,
1019                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1020                                                       _mm_add_ps(t2,t3)));
1021                 dadx2         = _mm_and_ps(t1,obc_mask1);
1022             }
1023             else
1024             {
1025                 dadx2         = _mm_setzero_ps();
1026             }
1027
1028             _mm_store_ps(dadx,dadx1);
1029             dadx += 4;
1030             _mm_store_ps(dadx,dadx2);
1031             dadx += 4;
1032         } /* end normal inner loop */
1033
1034         if(offset!=0)
1035         {
1036             if(offset==1)
1037             {
1038                 jnrA        = jjnr[k];
1039                 j3A         = 3*jnrA;
1040                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1041                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
1042                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA,sk_aj);
1043                 mask        = mask1;
1044             }
1045             else if(offset==2)
1046             {
1047                 jnrA        = jjnr[k];
1048                 jnrB        = jjnr[k+1];
1049                 j3A         = 3*jnrA;
1050                 j3B         = 3*jnrB;
1051                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1052                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
1053                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA,obc_param+jnrB,sk_aj);
1054                 mask        = mask2;
1055             }
1056             else
1057             {
1058                 /* offset must be 3 */
1059                 jnrA        = jjnr[k];
1060                 jnrB        = jjnr[k+1];
1061                 jnrC        = jjnr[k+2];
1062                 j3A         = 3*jnrA;
1063                 j3B         = 3*jnrB;
1064                 j3C         = 3*jnrC;
1065                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1066                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
1067                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,sk_aj);
1068                 mask        = mask3;
1069             }
1070
1071             dx    = _mm_sub_ps(ix, jx);
1072                         dy    = _mm_sub_ps(iy, jy);
1073                         dz    = _mm_sub_ps(iz, jz);
1074
1075             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
1076
1077             rinv        = gmx_mm_invsqrt_ps(rsq);
1078             r           = _mm_mul_ps(rsq,rinv);
1079
1080                         /* Compute raj_inv aj1-4 */
1081             raj_inv     = gmx_mm_inv_ps(raj);
1082
1083             /* Evaluate influence of atom aj -> ai */
1084             t1            = _mm_add_ps(r,sk_aj);
1085             obc_mask1     = _mm_cmplt_ps(rai, t1);
1086             obc_mask1     = _mm_and_ps(obc_mask1,mask);
1087
1088             if(_mm_movemask_ps(obc_mask1))
1089             {
1090                 t2            = _mm_sub_ps(r,sk_aj);
1091                 t3            = _mm_sub_ps(sk_aj,r);
1092                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1093                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1094
1095                 uij           = gmx_mm_inv_ps(t1);
1096                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1097                                           _mm_andnot_ps(obc_mask2,rai_inv));
1098                 dlij          = _mm_and_ps(one,obc_mask2);
1099                 uij2          = _mm_mul_ps(uij, uij);
1100                 uij3          = _mm_mul_ps(uij2,uij);
1101                 lij2          = _mm_mul_ps(lij, lij);
1102                 lij3          = _mm_mul_ps(lij2,lij);
1103                 diff2         = _mm_sub_ps(uij2,lij2);
1104                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1105                 sk2_aj         = _mm_mul_ps(sk_aj,sk_aj);
1106                 sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
1107                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
1108                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1109                 t1            = _mm_sub_ps(lij,uij);
1110                 t2            = _mm_mul_ps(diff2,
1111                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
1112                                                       prod));
1113                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1114                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1115                 t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
1116                 t4            = _mm_and_ps(t4,obc_mask3);
1117                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1118                 sum_ai        = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
1119                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1120                                            _mm_mul_ps(prod,lij3));
1121                 t1            = _mm_sub_ps(t1,
1122                                            _mm_mul_ps(onefourth,
1123                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1124                                                                  _mm_mul_ps(lij3,r))));
1125                 t2            = _mm_mul_ps(onefourth,
1126                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1127                                                       _mm_mul_ps(uij3,r)));
1128                 t2            = _mm_sub_ps(t2,
1129                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1130                                                       _mm_mul_ps(prod,uij3)));
1131                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1132                                            _mm_mul_ps(rinv,rinv));
1133                 t3            = _mm_sub_ps(t3,
1134                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1135                                                       _mm_add_ps(one,
1136                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1137                 t1            = _mm_mul_ps(rinv,
1138                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1139                                                       _mm_add_ps(t2,t3)));
1140                 dadx1         = _mm_and_ps(t1,obc_mask1);
1141             }
1142             else
1143             {
1144                 dadx1         = _mm_setzero_ps();
1145             }
1146
1147                 /* Evaluate influence of atom ai -> aj */
1148             t1            = _mm_add_ps(r,sk_ai);
1149             obc_mask1     = _mm_cmplt_ps(raj, t1);
1150             obc_mask1     = _mm_and_ps(obc_mask1,mask);
1151
1152             if(_mm_movemask_ps(obc_mask1))
1153             {
1154                 t2            = _mm_sub_ps(r,sk_ai);
1155                 t3            = _mm_sub_ps(sk_ai,r);
1156                 obc_mask2     = _mm_cmplt_ps(raj, t2);
1157                 obc_mask3     = _mm_cmplt_ps(raj, t3);
1158
1159                 uij           = gmx_mm_inv_ps(t1);
1160                 lij           = _mm_or_ps(_mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1161                                           _mm_andnot_ps(obc_mask2,raj_inv));
1162                 dlij          = _mm_and_ps(one,obc_mask2);
1163                 uij2          = _mm_mul_ps(uij, uij);
1164                 uij3          = _mm_mul_ps(uij2,uij);
1165                 lij2          = _mm_mul_ps(lij, lij);
1166                 lij3          = _mm_mul_ps(lij2,lij);
1167                 diff2         = _mm_sub_ps(uij2,lij2);
1168                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1169                 sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
1170                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
1171                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1172                 t1            = _mm_sub_ps(lij,uij);
1173                 t2            = _mm_mul_ps(diff2,
1174                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
1175                                                       prod));
1176                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1177                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1178                 t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
1179                 t4            = _mm_and_ps(t4,obc_mask3);
1180                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1181
1182                 tmp           = _mm_and_ps(t1,obc_mask1);
1183
1184                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1185                                            _mm_mul_ps(prod,lij3));
1186                 t1            = _mm_sub_ps(t1,
1187                                            _mm_mul_ps(onefourth,
1188                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1189                                                                  _mm_mul_ps(lij3,r))));
1190                 t2            = _mm_mul_ps(onefourth,
1191                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1192                                                       _mm_mul_ps(uij3,r)));
1193                 t2            = _mm_sub_ps(t2,
1194                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1195                                                       _mm_mul_ps(prod,uij3)));
1196                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1197                                            _mm_mul_ps(rinv,rinv));
1198                 t3            = _mm_sub_ps(t3,
1199                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1200                                                       _mm_add_ps(one,
1201                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1202                 t1            = _mm_mul_ps(rinv,
1203                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1204                                                       _mm_add_ps(t2,t3)));
1205                 dadx2         = _mm_and_ps(t1,obc_mask1);
1206             }
1207             else
1208             {
1209                 dadx2         = _mm_setzero_ps();
1210                 tmp           = _mm_setzero_ps();
1211             }
1212
1213             _mm_store_ps(dadx,dadx1);
1214             dadx += 4;
1215             _mm_store_ps(dadx,dadx2);
1216             dadx += 4;
1217
1218             if(offset==1)
1219             {
1220                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
1221             }
1222             else if(offset==2)
1223             {
1224                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
1225             }
1226             else
1227             {
1228                 /* offset must be 3 */
1229                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
1230             }
1231
1232         }
1233         GMX_MM_UPDATE_1POT_PS(sum_ai,work+ii);
1234
1235         }
1236
1237         /* Parallel summations */
1238         if(PARTDECOMP(cr))
1239         {
1240                 gmx_sum(natoms, work, cr);
1241         }
1242         else if(DOMAINDECOMP(cr))
1243         {
1244                 dd_atom_sum_real(cr->dd, work);
1245         }
1246
1247     if(gb_algorithm==egbHCT)
1248     {
1249         /* HCT */
1250         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1251         {
1252                         if(born->use[i] != 0)
1253             {
1254                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1255                 sum     = 1.0/rr - work[i];
1256                 min_rad = rr + doffset;
1257                 rad     = 1.0/sum;
1258
1259                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
1260                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1261             }
1262         }
1263
1264         /* Extra communication required for DD */
1265         if(DOMAINDECOMP(cr))
1266         {
1267             dd_atom_spread_real(cr->dd, born->bRad);
1268             dd_atom_spread_real(cr->dd, fr->invsqrta);
1269         }
1270     }
1271     else
1272     {
1273         /* OBC */
1274         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1275         {
1276                         if(born->use[i] != 0)
1277             {
1278                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
1279                 rr_inv2 = 1.0/rr;
1280                 rr      = rr-doffset;
1281                 rr_inv  = 1.0/rr;
1282                 sum     = rr * work[i];
1283                 sum2    = sum  * sum;
1284                 sum3    = sum2 * sum;
1285
1286                 tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1287                 born->bRad[i] = rr_inv - tsum*rr_inv2;
1288                 born->bRad[i] = 1.0 / born->bRad[i];
1289
1290                 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1291
1292                 tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1293                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1294             }
1295         }
1296         /* Extra (local) communication required for DD */
1297         if(DOMAINDECOMP(cr))
1298         {
1299             dd_atom_spread_real(cr->dd, born->bRad);
1300             dd_atom_spread_real(cr->dd, fr->invsqrta);
1301             dd_atom_spread_real(cr->dd, born->drobc);
1302         }
1303     }
1304
1305
1306
1307         return 0;
1308 }
1309
1310
1311
1312 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1313                                     float *x, float *f, float *fshift, float *shiftvec,
1314                                     int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1315 {
1316         int    i,k,n,ii,jnr,ii3,is3,nj0,nj1,offset,n0,n1;
1317         int        jnrA,jnrB,jnrC,jnrD;
1318     int    j3A,j3B,j3C,j3D;
1319         int        jnrE,jnrF,jnrG,jnrH;
1320     int    j3E,j3F,j3G,j3H;
1321         int *  jjnr;
1322
1323         float   rbi,shX,shY,shZ;
1324         float   *rb;
1325
1326         __m128 ix,iy,iz;
1327         __m128 jx,jy,jz;
1328         __m128 jxB,jyB,jzB;
1329         __m128 fix,fiy,fiz;
1330         __m128 dx,dy,dz;
1331     __m128 tx,ty,tz;
1332         __m128 dxB,dyB,dzB;
1333     __m128 txB,tyB,tzB;
1334
1335         __m128 rbai,rbaj,rbajB, f_gb, f_gb_ai,f_gbB,f_gb_aiB;
1336         __m128 xmm1,xmm2,xmm3;
1337
1338         const __m128 two = _mm_set1_ps(2.0f);
1339
1340         rb     = born->work;
1341
1342     jjnr   = nl->jjnr;
1343
1344         /* Loop to get the proper form for the Born radius term, sse style */
1345         offset=natoms%4;
1346
1347   n0 = 0;
1348   n1 = natoms;
1349
1350         if(gb_algorithm==egbSTILL)
1351         {
1352                 for(i=n0;i<n1;i++)
1353                 {
1354       rbi   = born->bRad[i];
1355                         rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1356                 }
1357         }
1358         else if(gb_algorithm==egbHCT)
1359         {
1360                 for(i=n0;i<n1;i++)
1361                 {
1362       rbi   = born->bRad[i];
1363                         rb[i] = rbi * rbi * dvda[i];
1364                 }
1365         }
1366         else if(gb_algorithm==egbOBC)
1367         {
1368                 for(i=n0;i<n1;i++)
1369                 {
1370       rbi   = born->bRad[i];
1371                         rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1372                 }
1373         }
1374
1375     jz = _mm_setzero_ps();
1376
1377     n = j3A = j3B = j3C = j3D = 0;
1378
1379         for(i=0;i<nl->nri;i++)
1380         {
1381         ii     = nl->iinr[i];
1382                 ii3        = ii*3;
1383         is3    = 3*nl->shift[i];
1384         shX    = shiftvec[is3];
1385         shY    = shiftvec[is3+1];
1386         shZ    = shiftvec[is3+2];
1387         nj0    = nl->jindex[i];
1388         nj1    = nl->jindex[i+1];
1389
1390         ix     = _mm_set1_ps(shX+x[ii3+0]);
1391                 iy     = _mm_set1_ps(shY+x[ii3+1]);
1392                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
1393
1394                 offset = (nj1-nj0)%4;
1395
1396                 rbai   = _mm_load1_ps(rb+ii);
1397                 fix    = _mm_setzero_ps();
1398                 fiy    = _mm_setzero_ps();
1399                 fiz    = _mm_setzero_ps();
1400
1401
1402         for(k=nj0;k<nj1-offset;k+=4)
1403                 {
1404                         jnrA        = jjnr[k];
1405                         jnrB        = jjnr[k+1];
1406                         jnrC        = jjnr[k+2];
1407                         jnrD        = jjnr[k+3];
1408
1409             j3A         = 3*jnrA;
1410                         j3B         = 3*jnrB;
1411                         j3C         = 3*jnrC;
1412                         j3D         = 3*jnrD;
1413
1414             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
1415
1416                         dx          = _mm_sub_ps(ix,jx);
1417                         dy          = _mm_sub_ps(iy,jy);
1418                         dz          = _mm_sub_ps(iz,jz);
1419
1420             GMX_MM_LOAD_4VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rb+jnrD,rbaj);
1421
1422                         /* load chain rule terms for j1-4 */
1423                         f_gb        = _mm_load_ps(dadx);
1424                         dadx += 4;
1425                         f_gb_ai     = _mm_load_ps(dadx);
1426                         dadx += 4;
1427
1428             /* calculate scalar force */
1429             f_gb    = _mm_mul_ps(f_gb,rbai);
1430             f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1431             f_gb    = _mm_add_ps(f_gb,f_gb_ai);
1432
1433             tx     = _mm_mul_ps(f_gb,dx);
1434             ty     = _mm_mul_ps(f_gb,dy);
1435             tz     = _mm_mul_ps(f_gb,dz);
1436
1437             fix    = _mm_add_ps(fix,tx);
1438             fiy    = _mm_add_ps(fiy,ty);
1439             fiz    = _mm_add_ps(fiz,tz);
1440
1441             GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A,f+j3B,f+j3C,f+j3D,tx,ty,tz);
1442                 }
1443
1444                 /*deal with odd elements */
1445                 if(offset!=0)
1446         {
1447             if(offset==1)
1448             {
1449                 jnrA        = jjnr[k];
1450                 j3A         = 3*jnrA;
1451                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1452                 GMX_MM_LOAD_1VALUE_PS(rb+jnrA,rbaj);
1453             }
1454             else if(offset==2)
1455             {
1456                 jnrA        = jjnr[k];
1457                 jnrB        = jjnr[k+1];
1458                 j3A         = 3*jnrA;
1459                 j3B         = 3*jnrB;
1460                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1461                 GMX_MM_LOAD_2VALUES_PS(rb+jnrA,rb+jnrB,rbaj);
1462             }
1463             else
1464             {
1465                 /* offset must be 3 */
1466                 jnrA        = jjnr[k];
1467                 jnrB        = jjnr[k+1];
1468                 jnrC        = jjnr[k+2];
1469                 j3A         = 3*jnrA;
1470                 j3B         = 3*jnrB;
1471                 j3C         = 3*jnrC;
1472                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1473                 GMX_MM_LOAD_3VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rbaj);
1474             }
1475
1476             dx          = _mm_sub_ps(ix,jx);
1477             dy          = _mm_sub_ps(iy,jy);
1478             dz          = _mm_sub_ps(iz,jz);
1479
1480             /* load chain rule terms for j1-4 */
1481             f_gb        = _mm_load_ps(dadx);
1482             dadx += 4;
1483             f_gb_ai     = _mm_load_ps(dadx);
1484             dadx += 4;
1485
1486             /* calculate scalar force */
1487             f_gb    = _mm_mul_ps(f_gb,rbai);
1488             f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1489             f_gb    = _mm_add_ps(f_gb,f_gb_ai);
1490
1491             tx     = _mm_mul_ps(f_gb,dx);
1492             ty     = _mm_mul_ps(f_gb,dy);
1493             tz     = _mm_mul_ps(f_gb,dz);
1494
1495             fix    = _mm_add_ps(fix,tx);
1496             fiy    = _mm_add_ps(fiy,ty);
1497             fiz    = _mm_add_ps(fiz,tz);
1498
1499             if(offset==1)
1500             {
1501                 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A,tx,ty,tz);
1502             }
1503             else if(offset==2)
1504             {
1505                 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A,f+j3B,tx,ty,tz);
1506             }
1507             else
1508             {
1509                 /* offset must be 3 */
1510                 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A,f+j3B,f+j3C,tx,ty,tz);
1511             }
1512         }
1513
1514                 /* fix/fiy/fiz now contain four partial force terms, that all should be
1515          * added to the i particle forces and shift forces.
1516          */
1517                 gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,f+ii3,fshift+is3);
1518         }
1519
1520         return 0;
1521 }
1522
1523
1524 #else
1525 /* keep compiler happy */
1526 int genborn_sse_dummy;
1527
1528 #endif /* SSE intrinsics available */