src/mdlib/genborn_sse2_single.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #ifdef HAVE_CONFIG_H
  39 #include <config.h>
  40 #endif
  41
  42 #include <math.h>
  43 #include <string.h>
  44
  45 #include "typedefs.h"
  46 #include "smalloc.h"
  47 #include "genborn.h"
  48 #include "vec.h"
  49 #include "grompp.h"
  50 #include "pdbio.h"
  51 #include "names.h"
  52 #include "physics.h"
  53 #include "partdec.h"
  54 #include "domdec.h"
  55 #include "network.h"
  56 #include "gmx_fatal.h"
  57 #include "mtop_util.h"
  58 #include "genborn.h"
  59
  60 #ifdef GMX_LIB_MPI
  61 #include <mpi.h>
  62 #endif
  63 #ifdef GMX_THREAD_MPI
  64 #include "tmpi.h"
  65 #endif
  66
  67
  68 /* Only compile this file if SSE intrinsics are available */
  69 #if 0 && defined (GMX_X86_SSE2)
  70
  71 #include <gmx_sse2_single.h>
  72 #include <emmintrin.h>
  73
  74 #include "genborn_sse2_single.h"
  75
  76
  77 int
  78 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
  79                               int natoms, gmx_localtop_t *top,
  80                               const t_atomtypes *atype, float *x, t_nblist *nl,
  81                               gmx_genborn_t *born)
  82 {
  83         int i,k,n,ii,is3,ii3,nj0,nj1,offset;
  84         int jnrA,jnrB,jnrC,jnrD,j3A,j3B,j3C,j3D;
  85         int jnrE,jnrF,jnrG,jnrH,j3E,j3F,j3G,j3H;
  86         int shift;
  87     int *mdtype;
  88         real shX,shY,shZ;
  89     int *jjnr;
  90     real *shiftvec;
  91
  92         float gpi_ai,gpi2;
  93         float factor;
  94         float *gb_radius;
  95     float *vsolv;
  96     float *work;
  97     float *dadx;
  98
  99         __m128 ix,iy,iz;
 100         __m128 jx,jy,jz;
 101         __m128 dx,dy,dz;
 102         __m128 tx,ty,tz;
 103         __m128 jxB,jyB,jzB;
 104         __m128 dxB,dyB,dzB;
 105         __m128 txB,tyB,tzB;
 106         __m128 rsq,rinv,rinv2,rinv4,rinv6;
 107         __m128 rsqB,rinvB,rinv2B,rinv4B,rinv6B;
 108         __m128 ratio,gpi,rai,raj,vai,vaj,rvdw;
 109         __m128 ratioB,rajB,vajB,rvdwB;
 110         __m128 ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp;
 111         __m128 ccfB,dccfB,thetaB,cosqB,termB,sinqB,resB,prodB;
 112         __m128 mask,icf4,icf6,mask_cmp;
 113         __m128 icf4B,icf6B,mask_cmpB;
 114
 115     __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
 116         __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
 117         __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
 118
 119         const __m128 half   = _mm_set1_ps(0.5f);
 120         const __m128 three  = _mm_set1_ps(3.0f);
 121         const __m128 one    = _mm_set1_ps(1.0f);
 122         const __m128 two    = _mm_set1_ps(2.0f);
 123         const __m128 zero   = _mm_set1_ps(0.0f);
 124         const __m128 four   = _mm_set1_ps(4.0f);
 125
 126         const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
 127         const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
 128         const __m128 still_p4     = _mm_set1_ps(STILL_P4);
 129
 130         factor  = 0.5 * ONE_4PI_EPS0;
 131
 132     gb_radius = born->gb_radius;
 133     vsolv     = born->vsolv;
 134     work      = born->gpol_still_work;
 135         jjnr      = nl->jjnr;
 136     shiftvec  = fr->shift_vec[0];
 137     dadx      = fr->dadx;
 138
 139         jnrA = jnrB = jnrC = jnrD = 0;
 140     jx = _mm_setzero_ps();
 141     jy = _mm_setzero_ps();
 142     jz = _mm_setzero_ps();
 143
 144         n = 0;
 145
 146         for(i=0;i<natoms;i++)
 147         {
 148                 work[i]=0;
 149         }
 150
 151         for(i=0;i<nl->nri;i++)
 152         {
 153         ii     = nl->iinr[i];
 154                 ii3        = ii*3;
 155         is3    = 3*nl->shift[i];
 156         shX    = shiftvec[is3];
 157         shY    = shiftvec[is3+1];
 158         shZ    = shiftvec[is3+2];
 159         nj0    = nl->jindex[i];
 160         nj1    = nl->jindex[i+1];
 161
 162         ix     = _mm_set1_ps(shX+x[ii3+0]);
 163                 iy     = _mm_set1_ps(shY+x[ii3+1]);
 164                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
 165
 166                 offset = (nj1-nj0)%4;
 167
 168                 /* Polarization energy for atom ai */
 169                 gpi    = _mm_setzero_ps();
 170
 171         rai     = _mm_load1_ps(gb_radius+ii);
 172         prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
 173
 174                 for(k=nj0;k<nj1-4-offset;k+=8)
 175                 {
 176                         jnrA        = jjnr[k];
 177                         jnrB        = jjnr[k+1];
 178                         jnrC        = jjnr[k+2];
 179                         jnrD        = jjnr[k+3];
 180                         jnrE        = jjnr[k+4];
 181                         jnrF        = jjnr[k+5];
 182                         jnrG        = jjnr[k+6];
 183                         jnrH        = jjnr[k+7];
 184
 185             j3A         = 3*jnrA;
 186                         j3B         = 3*jnrB;
 187                         j3C         = 3*jnrC;
 188                         j3D         = 3*jnrD;
 189             j3E         = 3*jnrE;
 190                         j3F         = 3*jnrF;
 191                         j3G         = 3*jnrG;
 192                         j3H         = 3*jnrH;
 193
 194             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
 195             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
 196
 197             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
 198             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
 199                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
 200                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE,vsolv+jnrF,vsolv+jnrG,vsolv+jnrH,vajB);
 201
 202                         dx          = _mm_sub_ps(ix,jx);
 203                         dy          = _mm_sub_ps(iy,jy);
 204                         dz          = _mm_sub_ps(iz,jz);
 205                         dxB         = _mm_sub_ps(ix,jxB);
 206                         dyB         = _mm_sub_ps(iy,jyB);
 207                         dzB         = _mm_sub_ps(iz,jzB);
 208
 209             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
 210             rsqB        = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
 211             rinv        = gmx_mm_invsqrt_ps(rsq);
 212             rinvB       = gmx_mm_invsqrt_ps(rsqB);
 213             rinv2       = _mm_mul_ps(rinv,rinv);
 214             rinv2B      = _mm_mul_ps(rinvB,rinvB);
 215             rinv4       = _mm_mul_ps(rinv2,rinv2);
 216             rinv4B      = _mm_mul_ps(rinv2B,rinv2B);
 217             rinv6       = _mm_mul_ps(rinv4,rinv2);
 218             rinv6B      = _mm_mul_ps(rinv4B,rinv2B);
 219
 220             rvdw        = _mm_add_ps(rai,raj);
 221             rvdwB       = _mm_add_ps(rai,rajB);
 222             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
 223             ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB,rvdwB)));
 224
 225             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
 226             mask_cmpB   = _mm_cmple_ps(ratioB,still_p5inv);
 227
 228             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
 229             if( 0 == _mm_movemask_ps(mask_cmp) )
 230             {
 231                 /* if ratio>still_p5inv for ALL elements */
 232                 ccf         = one;
 233                 dccf        = _mm_setzero_ps();
 234             }
 235             else
 236             {
 237                 ratio       = _mm_min_ps(ratio,still_p5inv);
 238                 theta       = _mm_mul_ps(ratio,still_pip5);
 239                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
 240                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
 241                 ccf         = _mm_mul_ps(term,term);
 242                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
 243                                          _mm_mul_ps(sinq,theta));
 244             }
 245             if( 0 == _mm_movemask_ps(mask_cmpB) )
 246             {
 247                 /* if ratio>still_p5inv for ALL elements */
 248                 ccfB        = one;
 249                 dccfB       = _mm_setzero_ps();
 250             }
 251             else
 252             {
 253                 ratioB      = _mm_min_ps(ratioB,still_p5inv);
 254                 thetaB      = _mm_mul_ps(ratioB,still_pip5);
 255                 gmx_mm_sincos_ps(thetaB,&sinqB,&cosqB);
 256                 termB       = _mm_mul_ps(half,_mm_sub_ps(one,cosqB));
 257                 ccfB        = _mm_mul_ps(termB,termB);
 258                 dccfB       = _mm_mul_ps(_mm_mul_ps(two,termB),
 259                                          _mm_mul_ps(sinqB,thetaB));
 260             }
 261
 262             prod        = _mm_mul_ps(still_p4,vaj);
 263             prodB       = _mm_mul_ps(still_p4,vajB);
 264             icf4        = _mm_mul_ps(ccf,rinv4);
 265             icf4B       = _mm_mul_ps(ccfB,rinv4B);
 266             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
 267             icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccfB),dccfB), rinv6B);
 268
 269             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
 270             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_mul_ps(prod_ai,icf4B));
 271
 272             gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod,icf4) , _mm_mul_ps(prodB,icf4B) ) );
 273
 274             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
 275             dadx+=4;
 276             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
 277             dadx+=4;
 278             _mm_store_ps(dadx,_mm_mul_ps(prodB,icf6B));
 279             dadx+=4;
 280             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6B));
 281             dadx+=4;
 282                 }
 283
 284         for(;k<nj1-offset;k+=4)
 285                 {
 286                         jnrA        = jjnr[k];
 287                         jnrB        = jjnr[k+1];
 288                         jnrC        = jjnr[k+2];
 289                         jnrD        = jjnr[k+3];
 290
 291             j3A         = 3*jnrA;
 292                         j3B         = 3*jnrB;
 293                         j3C         = 3*jnrC;
 294                         j3D         = 3*jnrD;
 295
 296             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
 297
 298             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
 299                         GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vsolv+jnrD,vaj);
 300
 301                         dx          = _mm_sub_ps(ix,jx);
 302                         dy          = _mm_sub_ps(iy,jy);
 303                         dz          = _mm_sub_ps(iz,jz);
 304
 305             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
 306             rinv        = gmx_mm_invsqrt_ps(rsq);
 307             rinv2       = _mm_mul_ps(rinv,rinv);
 308             rinv4       = _mm_mul_ps(rinv2,rinv2);
 309             rinv6       = _mm_mul_ps(rinv4,rinv2);
 310
 311             rvdw        = _mm_add_ps(rai,raj);
 312             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
 313
 314             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
 315
 316             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
 317             if(0 == _mm_movemask_ps(mask_cmp))
 318             {
 319                 /* if ratio>still_p5inv for ALL elements */
 320                 ccf         = one;
 321                 dccf        = _mm_setzero_ps();
 322             }
 323             else
 324             {
 325                 ratio       = _mm_min_ps(ratio,still_p5inv);
 326                 theta       = _mm_mul_ps(ratio,still_pip5);
 327                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
 328                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
 329                 ccf         = _mm_mul_ps(term,term);
 330                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
 331                                          _mm_mul_ps(sinq,theta));
 332             }
 333
 334             prod        = _mm_mul_ps(still_p4,vaj);
 335             icf4        = _mm_mul_ps(ccf,rinv4);
 336             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
 337
 338             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_mul_ps(prod_ai,icf4));
 339
 340             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
 341
 342             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
 343             dadx+=4;
 344             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
 345             dadx+=4;
 346                 }
 347
 348         if(offset!=0)
 349         {
 350             if(offset==1)
 351             {
 352                 jnrA        = jjnr[k];
 353                 j3A         = 3*jnrA;
 354                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
 355                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
 356                 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA,vaj);
 357                 mask        = mask1;
 358             }
 359             else if(offset==2)
 360             {
 361                 jnrA        = jjnr[k];
 362                 jnrB        = jjnr[k+1];
 363                 j3A         = 3*jnrA;
 364                 j3B         = 3*jnrB;
 365                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
 366                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
 367                 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA,vsolv+jnrB,vaj);
 368                 mask        = mask2;
 369             }
 370             else
 371             {
 372                 /* offset must be 3 */
 373                 jnrA        = jjnr[k];
 374                 jnrB        = jjnr[k+1];
 375                 jnrC        = jjnr[k+2];
 376                 j3A         = 3*jnrA;
 377                 j3B         = 3*jnrB;
 378                 j3C         = 3*jnrC;
 379                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
 380                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
 381                 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA,vsolv+jnrB,vsolv+jnrC,vaj);
 382                 mask        = mask3;
 383             }
 384
 385                         dx          = _mm_sub_ps(ix,jx);
 386                         dy          = _mm_sub_ps(iy,jy);
 387                         dz          = _mm_sub_ps(iz,jz);
 388
 389             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
 390             rinv        = gmx_mm_invsqrt_ps(rsq);
 391             rinv2       = _mm_mul_ps(rinv,rinv);
 392             rinv4       = _mm_mul_ps(rinv2,rinv2);
 393             rinv6       = _mm_mul_ps(rinv4,rinv2);
 394
 395             rvdw        = _mm_add_ps(rai,raj);
 396             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw,rvdw)));
 397
 398             mask_cmp    = _mm_cmple_ps(ratio,still_p5inv);
 399
 400             if(0 == _mm_movemask_ps(mask_cmp))
 401             {
 402                 /* if ratio>still_p5inv for ALL elements */
 403                 ccf         = one;
 404                 dccf        = _mm_setzero_ps();
 405             }
 406             else
 407             {
 408                 ratio       = _mm_min_ps(ratio,still_p5inv);
 409                 theta       = _mm_mul_ps(ratio,still_pip5);
 410                 gmx_mm_sincos_ps(theta,&sinq,&cosq);
 411                 term        = _mm_mul_ps(half,_mm_sub_ps(one,cosq));
 412                 ccf         = _mm_mul_ps(term,term);
 413                 dccf        = _mm_mul_ps(_mm_mul_ps(two,term),
 414                                          _mm_mul_ps(sinq,theta));
 415             }
 416
 417             prod        = _mm_mul_ps(still_p4,vaj);
 418             icf4        = _mm_mul_ps(ccf,rinv4);
 419             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four,ccf),dccf), rinv6);
 420
 421             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod,icf4));
 422
 423             _mm_store_ps(dadx,_mm_mul_ps(prod,icf6));
 424             dadx+=4;
 425             _mm_store_ps(dadx,_mm_mul_ps(prod_ai,icf6));
 426             dadx+=4;
 427
 428             tmp = _mm_mul_ps(prod_ai,icf4);
 429
 430             if(offset==1)
 431             {
 432                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
 433             }
 434             else if(offset==2)
 435             {
 436                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
 437             }
 438             else
 439             {
 440                 /* offset must be 3 */
 441                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
 442             }
 443         }
 444         GMX_MM_UPDATE_1POT_PS(gpi,work+ii);
 445         }
 446
 447         /* Sum up the polarization energy from other nodes */
 448         if(PARTDECOMP(cr))
 449         {
 450                 gmx_sum(natoms, work, cr);
 451         }
 452         else if(DOMAINDECOMP(cr))
 453         {
 454                 dd_atom_sum_real(cr->dd, work);
 455         }
 456
 457         /* Compute the radii */
 458         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
 459         {
 460                 if(born->use[i] != 0)
 461                 {
 462                         gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
 463                         gpi2             = gpi_ai * gpi_ai;
 464                         born->bRad[i]   = factor*gmx_invsqrt(gpi2);
 465                         fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
 466                 }
 467         }
 468
 469         /* Extra (local) communication required for DD */
 470         if(DOMAINDECOMP(cr))
 471         {
 472                 dd_atom_spread_real(cr->dd, born->bRad);
 473                 dd_atom_spread_real(cr->dd, fr->invsqrta);
 474         }
 475
 476         return 0;
 477 }
 478
 479
 480 int
 481 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
 482                                 const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md,int gb_algorithm)
 483 {
 484         int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
 485     int jnrA,jnrB,jnrC,jnrD;
 486     int j3A,j3B,j3C,j3D;
 487     int jnrE,jnrF,jnrG,jnrH;
 488     int j3E,j3F,j3G,j3H;
 489         float shX,shY,shZ;
 490         float rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
 491         float sum_ai2, sum_ai3,tsum,tchain,doffset;
 492         float *obc_param;
 493     float *gb_radius;
 494     float *work;
 495     int *  jjnr;
 496     float *dadx;
 497     float *shiftvec;
 498     float min_rad,rad;
 499
 500         __m128 ix,iy,iz,jx,jy,jz;
 501         __m128 dx,dy,dz,t1,t2,t3,t4;
 502         __m128 rsq,rinv,r;
 503         __m128 rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
 504         __m128 uij,lij2,uij2,lij3,uij3,diff2;
 505         __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
 506         __m128 sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
 507         __m128 dadx1,dadx2;
 508     __m128 logterm;
 509         __m128 mask;
 510         __m128 obc_mask1,obc_mask2,obc_mask3;
 511     __m128 jxB,jyB,jzB,t1B,t2B,t3B,t4B;
 512     __m128 dxB,dyB,dzB,rsqB,rinvB,rB;
 513         __m128 rajB, raj_invB,rai_inv2B,sk2B,lijB,dlijB,duijB;
 514         __m128 uijB,lij2B,uij2B,lij3B,uij3B,diff2B;
 515         __m128 lij_invB,sk2_invB,prodB;
 516         __m128 sk_ajB,sk2_ajB,sk2_rinvB;
 517         __m128 dadx1B,dadx2B;
 518     __m128 logtermB;
 519     __m128 obc_mask1B,obc_mask2B,obc_mask3B;
 520
 521     __m128   mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
 522         __m128   mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
 523         __m128   mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
 524
 525     __m128 oneeighth   = _mm_set1_ps(0.125);
 526     __m128 onefourth   = _mm_set1_ps(0.25);
 527
 528         const __m128 half  = _mm_set1_ps(0.5f);
 529         const __m128 three = _mm_set1_ps(3.0f);
 530         const __m128 one   = _mm_set1_ps(1.0f);
 531         const __m128 two   = _mm_set1_ps(2.0f);
 532         const __m128 zero  = _mm_set1_ps(0.0f);
 533         const __m128 neg   = _mm_set1_ps(-1.0f);
 534
 535         /* Set the dielectric offset */
 536         doffset   = born->gb_doffset;
 537         gb_radius = born->gb_radius;
 538     obc_param = born->param;
 539     work      = born->gpol_hct_work;
 540     jjnr      = nl->jjnr;
 541     dadx      = fr->dadx;
 542     shiftvec  = fr->shift_vec[0];
 543
 544     jx        = _mm_setzero_ps();
 545     jy        = _mm_setzero_ps();
 546     jz        = _mm_setzero_ps();
 547
 548     jnrA = jnrB = jnrC = jnrD = 0;
 549
 550         for(i=0;i<born->nr;i++)
 551         {
 552                 work[i] = 0;
 553         }
 554
 555         for(i=0;i<nl->nri;i++)
 556         {
 557         ii     = nl->iinr[i];
 558                 ii3        = ii*3;
 559         is3    = 3*nl->shift[i];
 560         shX    = shiftvec[is3];
 561         shY    = shiftvec[is3+1];
 562         shZ    = shiftvec[is3+2];
 563         nj0    = nl->jindex[i];
 564         nj1    = nl->jindex[i+1];
 565
 566         ix     = _mm_set1_ps(shX+x[ii3+0]);
 567                 iy     = _mm_set1_ps(shY+x[ii3+1]);
 568                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
 569
 570                 offset = (nj1-nj0)%4;
 571
 572                 rai    = _mm_load1_ps(gb_radius+ii);
 573                 rai_inv= gmx_mm_inv_ps(rai);
 574
 575                 sum_ai = _mm_setzero_ps();
 576
 577                 sk_ai  = _mm_load1_ps(born->param+ii);
 578                 sk2_ai = _mm_mul_ps(sk_ai,sk_ai);
 579
 580                 for(k=nj0;k<nj1-4-offset;k+=8)
 581                 {
 582                         jnrA        = jjnr[k];
 583                         jnrB        = jjnr[k+1];
 584                         jnrC        = jjnr[k+2];
 585                         jnrD        = jjnr[k+3];
 586                         jnrE        = jjnr[k+4];
 587                         jnrF        = jjnr[k+5];
 588                         jnrG        = jjnr[k+6];
 589                         jnrH        = jjnr[k+7];
 590
 591             j3A         = 3*jnrA;
 592                         j3B         = 3*jnrB;
 593                         j3C         = 3*jnrC;
 594                         j3D         = 3*jnrD;
 595             j3E         = 3*jnrE;
 596                         j3F         = 3*jnrF;
 597                         j3G         = 3*jnrG;
 598                         j3H         = 3*jnrH;
 599
 600             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
 601             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E,x+j3F,x+j3G,x+j3H,jxB,jyB,jzB);
 602             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
 603             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE,gb_radius+jnrF,gb_radius+jnrG,gb_radius+jnrH,rajB);
 604             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
 605             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE,obc_param+jnrF,obc_param+jnrG,obc_param+jnrH,sk_ajB);
 606
 607             dx    = _mm_sub_ps(ix, jx);
 608                         dy    = _mm_sub_ps(iy, jy);
 609                         dz    = _mm_sub_ps(iz, jz);
 610             dxB   = _mm_sub_ps(ix, jxB);
 611                         dyB   = _mm_sub_ps(iy, jyB);
 612                         dzB   = _mm_sub_ps(iz, jzB);
 613
 614             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
 615             rsqB        = gmx_mm_calc_rsq_ps(dxB,dyB,dzB);
 616
 617             rinv        = gmx_mm_invsqrt_ps(rsq);
 618             r           = _mm_mul_ps(rsq,rinv);
 619             rinvB       = gmx_mm_invsqrt_ps(rsqB);
 620             rB          = _mm_mul_ps(rsqB,rinvB);
 621
 622                         /* Compute raj_inv aj1-4 */
 623             raj_inv     = gmx_mm_inv_ps(raj);
 624             raj_invB    = gmx_mm_inv_ps(rajB);
 625
 626             /* Evaluate influence of atom aj -> ai */
 627             t1            = _mm_add_ps(r,sk_aj);
 628             t2            = _mm_sub_ps(r,sk_aj);
 629             t3            = _mm_sub_ps(sk_aj,r);
 630             t1B           = _mm_add_ps(rB,sk_ajB);
 631             t2B           = _mm_sub_ps(rB,sk_ajB);
 632             t3B           = _mm_sub_ps(sk_ajB,rB);
 633             obc_mask1     = _mm_cmplt_ps(rai, t1);
 634             obc_mask2     = _mm_cmplt_ps(rai, t2);
 635             obc_mask3     = _mm_cmplt_ps(rai, t3);
 636             obc_mask1B    = _mm_cmplt_ps(rai, t1B);
 637             obc_mask2B    = _mm_cmplt_ps(rai, t2B);
 638             obc_mask3B    = _mm_cmplt_ps(rai, t3B);
 639
 640             uij           = gmx_mm_inv_ps(t1);
 641             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
 642                                       _mm_andnot_ps(obc_mask2,rai_inv));
 643             dlij          = _mm_and_ps(one,obc_mask2);
 644             uij2          = _mm_mul_ps(uij, uij);
 645             uij3          = _mm_mul_ps(uij2,uij);
 646             lij2          = _mm_mul_ps(lij, lij);
 647             lij3          = _mm_mul_ps(lij2,lij);
 648
 649             uijB          = gmx_mm_inv_ps(t1B);
 650             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
 651                                       _mm_andnot_ps(obc_mask2B,rai_inv));
 652             dlijB         = _mm_and_ps(one,obc_mask2B);
 653             uij2B         = _mm_mul_ps(uijB, uijB);
 654             uij3B         = _mm_mul_ps(uij2B,uijB);
 655             lij2B         = _mm_mul_ps(lijB, lijB);
 656             lij3B         = _mm_mul_ps(lij2B,lijB);
 657
 658             diff2         = _mm_sub_ps(uij2,lij2);
 659             lij_inv       = gmx_mm_invsqrt_ps(lij2);
 660             sk2_aj        = _mm_mul_ps(sk_aj,sk_aj);
 661             sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
 662             prod          = _mm_mul_ps(onefourth,sk2_rinv);
 663
 664             diff2B        = _mm_sub_ps(uij2B,lij2B);
 665             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
 666             sk2_ajB       = _mm_mul_ps(sk_ajB,sk_ajB);
 667             sk2_rinvB     = _mm_mul_ps(sk2_ajB,rinvB);
 668             prodB         = _mm_mul_ps(onefourth,sk2_rinvB);
 669
 670             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
 671             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
 672
 673             t1            = _mm_sub_ps(lij,uij);
 674             t2            = _mm_mul_ps(diff2,
 675                                        _mm_sub_ps(_mm_mul_ps(onefourth,r),
 676                                                   prod));
 677             t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
 678             t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
 679             t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
 680             t4            = _mm_and_ps(t4,obc_mask3);
 681             t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
 682
 683             t1B           = _mm_sub_ps(lijB,uijB);
 684             t2B           = _mm_mul_ps(diff2B,
 685                                        _mm_sub_ps(_mm_mul_ps(onefourth,rB),
 686                                                   prodB));
 687             t3B           = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
 688             t1B           = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
 689             t4B           = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lijB));
 690             t4B           = _mm_and_ps(t4B,obc_mask3B);
 691             t1B           = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
 692
 693             sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1,obc_mask1), _mm_and_ps(t1B,obc_mask1B) ));
 694
 695             t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
 696                                        _mm_mul_ps(prod,lij3));
 697             t1            = _mm_sub_ps(t1,
 698                                        _mm_mul_ps(onefourth,
 699                                                   _mm_add_ps(_mm_mul_ps(lij,rinv),
 700                                                              _mm_mul_ps(lij3,r))));
 701             t2            = _mm_mul_ps(onefourth,
 702                                        _mm_add_ps(_mm_mul_ps(uij,rinv),
 703                                                   _mm_mul_ps(uij3,r)));
 704             t2            = _mm_sub_ps(t2,
 705                                        _mm_add_ps(_mm_mul_ps(half,uij2),
 706                                                       _mm_mul_ps(prod,uij3)));
 707             t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
 708                                        _mm_mul_ps(rinv,rinv));
 709             t3            = _mm_sub_ps(t3,
 710                                        _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
 711                                                   _mm_add_ps(one,
 712                                                              _mm_mul_ps(sk2_rinv,rinv))));
 713             t1            = _mm_mul_ps(rinv,
 714                                        _mm_add_ps(_mm_mul_ps(dlij,t1),
 715                                                   _mm_add_ps(t2,t3)));
 716
 717
 718
 719             t1B           = _mm_add_ps(_mm_mul_ps(half,lij2B),
 720                                        _mm_mul_ps(prodB,lij3B));
 721             t1B           = _mm_sub_ps(t1B,
 722                                        _mm_mul_ps(onefourth,
 723                                                   _mm_add_ps(_mm_mul_ps(lijB,rinvB),
 724                                                              _mm_mul_ps(lij3B,rB))));
 725             t2B           = _mm_mul_ps(onefourth,
 726                                        _mm_add_ps(_mm_mul_ps(uijB,rinvB),
 727                                                   _mm_mul_ps(uij3B,rB)));
 728             t2B           = _mm_sub_ps(t2B,
 729                                        _mm_add_ps(_mm_mul_ps(half,uij2B),
 730                                                   _mm_mul_ps(prodB,uij3B)));
 731             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
 732                                        _mm_mul_ps(rinvB,rinvB));
 733             t3B           = _mm_sub_ps(t3B,
 734                                        _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
 735                                                   _mm_add_ps(one,
 736                                                              _mm_mul_ps(sk2_rinvB,rinvB))));
 737             t1B           = _mm_mul_ps(rinvB,
 738                                        _mm_add_ps(_mm_mul_ps(dlijB,t1B),
 739                                                   _mm_add_ps(t2B,t3B)));
 740
 741             dadx1         = _mm_and_ps(t1,obc_mask1);
 742             dadx1B        = _mm_and_ps(t1B,obc_mask1B);
 743
 744
 745             /* Evaluate influence of atom ai -> aj */
 746             t1            = _mm_add_ps(r,sk_ai);
 747             t2            = _mm_sub_ps(r,sk_ai);
 748             t3            = _mm_sub_ps(sk_ai,r);
 749             t1B           = _mm_add_ps(rB,sk_ai);
 750             t2B           = _mm_sub_ps(rB,sk_ai);
 751             t3B           = _mm_sub_ps(sk_ai,rB);
 752             obc_mask1     = _mm_cmplt_ps(raj, t1);
 753             obc_mask2     = _mm_cmplt_ps(raj, t2);
 754             obc_mask3     = _mm_cmplt_ps(raj, t3);
 755             obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
 756             obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
 757             obc_mask3B    = _mm_cmplt_ps(rajB, t3B);
 758
 759             uij           = gmx_mm_inv_ps(t1);
 760             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
 761                                       _mm_andnot_ps(obc_mask2,raj_inv));
 762             dlij          = _mm_and_ps(one,obc_mask2);
 763             uij2          = _mm_mul_ps(uij, uij);
 764             uij3          = _mm_mul_ps(uij2,uij);
 765             lij2          = _mm_mul_ps(lij, lij);
 766             lij3          = _mm_mul_ps(lij2,lij);
 767
 768             uijB          = gmx_mm_inv_ps(t1B);
 769             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B,gmx_mm_inv_ps(t2B)),
 770                                       _mm_andnot_ps(obc_mask2B,raj_invB));
 771             dlijB         = _mm_and_ps(one,obc_mask2B);
 772             uij2B         = _mm_mul_ps(uijB, uijB);
 773             uij3B         = _mm_mul_ps(uij2B,uijB);
 774             lij2B         = _mm_mul_ps(lijB, lijB);
 775             lij3B         = _mm_mul_ps(lij2B,lijB);
 776
 777             diff2         = _mm_sub_ps(uij2,lij2);
 778             lij_inv       = gmx_mm_invsqrt_ps(lij2);
 779             sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
 780             prod          = _mm_mul_ps(onefourth,sk2_rinv);
 781
 782             diff2B        = _mm_sub_ps(uij2B,lij2B);
 783             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
 784             sk2_rinvB     = _mm_mul_ps(sk2_ai,rinvB);
 785             prodB         = _mm_mul_ps(onefourth,sk2_rinvB);
 786
 787             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
 788             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB,lij_invB));
 789
 790             t1            = _mm_sub_ps(lij,uij);
 791             t2            = _mm_mul_ps(diff2,
 792                                        _mm_sub_ps(_mm_mul_ps(onefourth,r),
 793                                                   prod));
 794             t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
 795             t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
 796             t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
 797             t4            = _mm_and_ps(t4,obc_mask3);
 798             t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
 799
 800             t1B           = _mm_sub_ps(lijB,uijB);
 801             t2B           = _mm_mul_ps(diff2B,
 802                                        _mm_sub_ps(_mm_mul_ps(onefourth,rB),
 803                                                   prodB));
 804             t3B           = _mm_mul_ps(half,_mm_mul_ps(rinvB,logtermB));
 805             t1B           = _mm_add_ps(t1B,_mm_add_ps(t2B,t3B));
 806             t4B           = _mm_mul_ps(two,_mm_sub_ps(raj_invB,lijB));
 807             t4B           = _mm_and_ps(t4B,obc_mask3B);
 808             t1B           = _mm_mul_ps(half,_mm_add_ps(t1B,t4B));
 809
 810             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
 811             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE,work+jnrF,work+jnrG,work+jnrH,_mm_and_ps(t1B,obc_mask1B));
 812
 813             t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
 814                                        _mm_mul_ps(prod,lij3));
 815             t1            = _mm_sub_ps(t1,
 816                                        _mm_mul_ps(onefourth,
 817                                                   _mm_add_ps(_mm_mul_ps(lij,rinv),
 818                                                              _mm_mul_ps(lij3,r))));
 819             t2            = _mm_mul_ps(onefourth,
 820                                        _mm_add_ps(_mm_mul_ps(uij,rinv),
 821                                                   _mm_mul_ps(uij3,r)));
 822             t2            = _mm_sub_ps(t2,
 823                                        _mm_add_ps(_mm_mul_ps(half,uij2),
 824                                                   _mm_mul_ps(prod,uij3)));
 825             t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
 826                                        _mm_mul_ps(rinv,rinv));
 827             t3            = _mm_sub_ps(t3,
 828                                        _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
 829                                                   _mm_add_ps(one,
 830                                                              _mm_mul_ps(sk2_rinv,rinv))));
 831             t1            = _mm_mul_ps(rinv,
 832                                        _mm_add_ps(_mm_mul_ps(dlij,t1),
 833                                                   _mm_add_ps(t2,t3)));
 834
 835
 836             t1B           = _mm_add_ps(_mm_mul_ps(half,lij2B),
 837                                        _mm_mul_ps(prodB,lij3B));
 838             t1B           = _mm_sub_ps(t1B,
 839                                        _mm_mul_ps(onefourth,
 840                                                   _mm_add_ps(_mm_mul_ps(lijB,rinvB),
 841                                                              _mm_mul_ps(lij3B,rB))));
 842             t2B           = _mm_mul_ps(onefourth,
 843                                        _mm_add_ps(_mm_mul_ps(uijB,rinvB),
 844                                                   _mm_mul_ps(uij3B,rB)));
 845             t2B           = _mm_sub_ps(t2B,
 846                                        _mm_add_ps(_mm_mul_ps(half,uij2B),
 847                                                   _mm_mul_ps(prodB,uij3B)));
 848             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth,logtermB),
 849                                        _mm_mul_ps(rinvB,rinvB));
 850             t3B           = _mm_sub_ps(t3B,
 851                                        _mm_mul_ps(_mm_mul_ps(diff2B,oneeighth),
 852                                                   _mm_add_ps(one,
 853                                                              _mm_mul_ps(sk2_rinvB,rinvB))));
 854             t1B           = _mm_mul_ps(rinvB,
 855                                        _mm_add_ps(_mm_mul_ps(dlijB,t1B),
 856                                                   _mm_add_ps(t2B,t3B)));
 857
 858
 859             dadx2         = _mm_and_ps(t1,obc_mask1);
 860             dadx2B        = _mm_and_ps(t1B,obc_mask1B);
 861
 862             _mm_store_ps(dadx,dadx1);
 863             dadx += 4;
 864             _mm_store_ps(dadx,dadx2);
 865             dadx += 4;
 866             _mm_store_ps(dadx,dadx1B);
 867             dadx += 4;
 868             _mm_store_ps(dadx,dadx2B);
 869             dadx += 4;
 870
 871         } /* end normal inner loop */
 872
 873                 for(;k<nj1-offset;k+=4)
 874                 {
 875                         jnrA        = jjnr[k];
 876                         jnrB        = jjnr[k+1];
 877                         jnrC        = jjnr[k+2];
 878                         jnrD        = jjnr[k+3];
 879
 880             j3A         = 3*jnrA;
 881                         j3B         = 3*jnrB;
 882                         j3C         = 3*jnrC;
 883                         j3D         = 3*jnrD;
 884
 885             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
 886             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,gb_radius+jnrD,raj);
 887             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,obc_param+jnrD,sk_aj);
 888
 889             dx    = _mm_sub_ps(ix, jx);
 890                         dy    = _mm_sub_ps(iy, jy);
 891                         dz    = _mm_sub_ps(iz, jz);
 892
 893             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
 894
 895             rinv        = gmx_mm_invsqrt_ps(rsq);
 896             r           = _mm_mul_ps(rsq,rinv);
 897
 898                         /* Compute raj_inv aj1-4 */
 899             raj_inv     = gmx_mm_inv_ps(raj);
 900
 901             /* Evaluate influence of atom aj -> ai */
 902             t1            = _mm_add_ps(r,sk_aj);
 903             obc_mask1     = _mm_cmplt_ps(rai, t1);
 904
 905             if(_mm_movemask_ps(obc_mask1))
 906             {
 907                 /* If any of the elements has rai<dr+sk, this is executed */
 908                 t2            = _mm_sub_ps(r,sk_aj);
 909                 t3            = _mm_sub_ps(sk_aj,r);
 910
 911                 obc_mask2     = _mm_cmplt_ps(rai, t2);
 912                 obc_mask3     = _mm_cmplt_ps(rai, t3);
 913
 914                 uij           = gmx_mm_inv_ps(t1);
 915                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
 916                                           _mm_andnot_ps(obc_mask2,rai_inv));
 917                 dlij          = _mm_and_ps(one,obc_mask2);
 918                 uij2          = _mm_mul_ps(uij, uij);
 919                 uij3          = _mm_mul_ps(uij2,uij);
 920                 lij2          = _mm_mul_ps(lij, lij);
 921                 lij3          = _mm_mul_ps(lij2,lij);
 922                 diff2         = _mm_sub_ps(uij2,lij2);
 923                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
 924                 sk2_aj        = _mm_mul_ps(sk_aj,sk_aj);
 925                 sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
 926                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
 927                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
 928                 t1            = _mm_sub_ps(lij,uij);
 929                 t2            = _mm_mul_ps(diff2,
 930                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
 931                                                       prod));
 932                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
 933                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
 934                 t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
 935                 t4            = _mm_and_ps(t4,obc_mask3);
 936                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
 937                 sum_ai        = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
 938                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
 939                                            _mm_mul_ps(prod,lij3));
 940                 t1            = _mm_sub_ps(t1,
 941                                            _mm_mul_ps(onefourth,
 942                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
 943                                                                  _mm_mul_ps(lij3,r))));
 944                 t2            = _mm_mul_ps(onefourth,
 945                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
 946                                                       _mm_mul_ps(uij3,r)));
 947                 t2            = _mm_sub_ps(t2,
 948                                            _mm_add_ps(_mm_mul_ps(half,uij2),
 949                                                       _mm_mul_ps(prod,uij3)));
 950                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
 951                                            _mm_mul_ps(rinv,rinv));
 952                 t3            = _mm_sub_ps(t3,
 953                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
 954                                                       _mm_add_ps(one,
 955                                                                  _mm_mul_ps(sk2_rinv,rinv))));
 956                 t1            = _mm_mul_ps(rinv,
 957                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
 958                                                       _mm_add_ps(t2,t3)));
 959
 960                 dadx1         = _mm_and_ps(t1,obc_mask1);
 961             }
 962             else
 963             {
 964                 dadx1         = _mm_setzero_ps();
 965             }
 966
 967             /* Evaluate influence of atom ai -> aj */
 968             t1            = _mm_add_ps(r,sk_ai);
 969             obc_mask1     = _mm_cmplt_ps(raj, t1);
 970
 971             if(_mm_movemask_ps(obc_mask1))
 972             {
 973                 t2            = _mm_sub_ps(r,sk_ai);
 974                 t3            = _mm_sub_ps(sk_ai,r);
 975                 obc_mask2     = _mm_cmplt_ps(raj, t2);
 976                 obc_mask3     = _mm_cmplt_ps(raj, t3);
 977
 978                 uij           = gmx_mm_inv_ps(t1);
 979                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
 980                                           _mm_andnot_ps(obc_mask2,raj_inv));
 981                 dlij          = _mm_and_ps(one,obc_mask2);
 982                 uij2          = _mm_mul_ps(uij, uij);
 983                 uij3          = _mm_mul_ps(uij2,uij);
 984                 lij2          = _mm_mul_ps(lij, lij);
 985                 lij3          = _mm_mul_ps(lij2,lij);
 986                 diff2         = _mm_sub_ps(uij2,lij2);
 987                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
 988                 sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
 989                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
 990                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
 991                 t1            = _mm_sub_ps(lij,uij);
 992                 t2            = _mm_mul_ps(diff2,
 993                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
 994                                                       prod));
 995                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
 996                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
 997                 t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
 998                 t4            = _mm_and_ps(t4,obc_mask3);
 999                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1000
1001                 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA,work+jnrB,work+jnrC,work+jnrD,_mm_and_ps(t1,obc_mask1));
1002
1003                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1004                                            _mm_mul_ps(prod,lij3));
1005                 t1            = _mm_sub_ps(t1,
1006                                            _mm_mul_ps(onefourth,
1007                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1008                                                                  _mm_mul_ps(lij3,r))));
1009                 t2            = _mm_mul_ps(onefourth,
1010                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1011                                                       _mm_mul_ps(uij3,r)));
1012                 t2            = _mm_sub_ps(t2,
1013                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1014                                                       _mm_mul_ps(prod,uij3)));
1015                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1016                                            _mm_mul_ps(rinv,rinv));
1017                 t3            = _mm_sub_ps(t3,
1018                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1019                                                       _mm_add_ps(one,
1020                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1021                 t1            = _mm_mul_ps(rinv,
1022                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1023                                                       _mm_add_ps(t2,t3)));
1024                 dadx2         = _mm_and_ps(t1,obc_mask1);
1025             }
1026             else
1027             {
1028                 dadx2         = _mm_setzero_ps();
1029             }
1030
1031             _mm_store_ps(dadx,dadx1);
1032             dadx += 4;
1033             _mm_store_ps(dadx,dadx2);
1034             dadx += 4;
1035         } /* end normal inner loop */
1036
1037         if(offset!=0)
1038         {
1039             if(offset==1)
1040             {
1041                 jnrA        = jjnr[k];
1042                 j3A         = 3*jnrA;
1043                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1044                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA,raj);
1045                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA,sk_aj);
1046                 mask        = mask1;
1047             }
1048             else if(offset==2)
1049             {
1050                 jnrA        = jjnr[k];
1051                 jnrB        = jjnr[k+1];
1052                 j3A         = 3*jnrA;
1053                 j3B         = 3*jnrB;
1054                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1055                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,raj);
1056                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA,obc_param+jnrB,sk_aj);
1057                 mask        = mask2;
1058             }
1059             else
1060             {
1061                 /* offset must be 3 */
1062                 jnrA        = jjnr[k];
1063                 jnrB        = jjnr[k+1];
1064                 jnrC        = jjnr[k+2];
1065                 j3A         = 3*jnrA;
1066                 j3B         = 3*jnrB;
1067                 j3C         = 3*jnrC;
1068                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1069                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA,gb_radius+jnrB,gb_radius+jnrC,raj);
1070                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA,obc_param+jnrB,obc_param+jnrC,sk_aj);
1071                 mask        = mask3;
1072             }
1073
1074             dx    = _mm_sub_ps(ix, jx);
1075                         dy    = _mm_sub_ps(iy, jy);
1076                         dz    = _mm_sub_ps(iz, jz);
1077
1078             rsq         = gmx_mm_calc_rsq_ps(dx,dy,dz);
1079
1080             rinv        = gmx_mm_invsqrt_ps(rsq);
1081             r           = _mm_mul_ps(rsq,rinv);
1082
1083                         /* Compute raj_inv aj1-4 */
1084             raj_inv     = gmx_mm_inv_ps(raj);
1085
1086             /* Evaluate influence of atom aj -> ai */
1087             t1            = _mm_add_ps(r,sk_aj);
1088             obc_mask1     = _mm_cmplt_ps(rai, t1);
1089             obc_mask1     = _mm_and_ps(obc_mask1,mask);
1090
1091             if(_mm_movemask_ps(obc_mask1))
1092             {
1093                 t2            = _mm_sub_ps(r,sk_aj);
1094                 t3            = _mm_sub_ps(sk_aj,r);
1095                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1096                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1097
1098                 uij           = gmx_mm_inv_ps(t1);
1099                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1100                                           _mm_andnot_ps(obc_mask2,rai_inv));
1101                 dlij          = _mm_and_ps(one,obc_mask2);
1102                 uij2          = _mm_mul_ps(uij, uij);
1103                 uij3          = _mm_mul_ps(uij2,uij);
1104                 lij2          = _mm_mul_ps(lij, lij);
1105                 lij3          = _mm_mul_ps(lij2,lij);
1106                 diff2         = _mm_sub_ps(uij2,lij2);
1107                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1108                 sk2_aj         = _mm_mul_ps(sk_aj,sk_aj);
1109                 sk2_rinv      = _mm_mul_ps(sk2_aj,rinv);
1110                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
1111                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1112                 t1            = _mm_sub_ps(lij,uij);
1113                 t2            = _mm_mul_ps(diff2,
1114                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
1115                                                       prod));
1116                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1117                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1118                 t4            = _mm_mul_ps(two,_mm_sub_ps(rai_inv,lij));
1119                 t4            = _mm_and_ps(t4,obc_mask3);
1120                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1121                 sum_ai        = _mm_add_ps(sum_ai,_mm_and_ps(t1,obc_mask1));
1122                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1123                                            _mm_mul_ps(prod,lij3));
1124                 t1            = _mm_sub_ps(t1,
1125                                            _mm_mul_ps(onefourth,
1126                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1127                                                                  _mm_mul_ps(lij3,r))));
1128                 t2            = _mm_mul_ps(onefourth,
1129                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1130                                                       _mm_mul_ps(uij3,r)));
1131                 t2            = _mm_sub_ps(t2,
1132                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1133                                                       _mm_mul_ps(prod,uij3)));
1134                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1135                                            _mm_mul_ps(rinv,rinv));
1136                 t3            = _mm_sub_ps(t3,
1137                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1138                                                       _mm_add_ps(one,
1139                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1140                 t1            = _mm_mul_ps(rinv,
1141                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1142                                                       _mm_add_ps(t2,t3)));
1143                 dadx1         = _mm_and_ps(t1,obc_mask1);
1144             }
1145             else
1146             {
1147                 dadx1         = _mm_setzero_ps();
1148             }
1149
1150                 /* Evaluate influence of atom ai -> aj */
1151             t1            = _mm_add_ps(r,sk_ai);
1152             obc_mask1     = _mm_cmplt_ps(raj, t1);
1153             obc_mask1     = _mm_and_ps(obc_mask1,mask);
1154
1155             if(_mm_movemask_ps(obc_mask1))
1156             {
1157                 t2            = _mm_sub_ps(r,sk_ai);
1158                 t3            = _mm_sub_ps(sk_ai,r);
1159                 obc_mask2     = _mm_cmplt_ps(raj, t2);
1160                 obc_mask3     = _mm_cmplt_ps(raj, t3);
1161
1162                 uij           = gmx_mm_inv_ps(t1);
1163                 lij           = _mm_or_ps(_mm_and_ps(obc_mask2,gmx_mm_inv_ps(t2)),
1164                                           _mm_andnot_ps(obc_mask2,raj_inv));
1165                 dlij          = _mm_and_ps(one,obc_mask2);
1166                 uij2          = _mm_mul_ps(uij, uij);
1167                 uij3          = _mm_mul_ps(uij2,uij);
1168                 lij2          = _mm_mul_ps(lij, lij);
1169                 lij3          = _mm_mul_ps(lij2,lij);
1170                 diff2         = _mm_sub_ps(uij2,lij2);
1171                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1172                 sk2_rinv      = _mm_mul_ps(sk2_ai,rinv);
1173                 prod          = _mm_mul_ps(onefourth,sk2_rinv);
1174                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij,lij_inv));
1175                 t1            = _mm_sub_ps(lij,uij);
1176                 t2            = _mm_mul_ps(diff2,
1177                                            _mm_sub_ps(_mm_mul_ps(onefourth,r),
1178                                                       prod));
1179                 t3            = _mm_mul_ps(half,_mm_mul_ps(rinv,logterm));
1180                 t1            = _mm_add_ps(t1,_mm_add_ps(t2,t3));
1181                 t4            = _mm_mul_ps(two,_mm_sub_ps(raj_inv,lij));
1182                 t4            = _mm_and_ps(t4,obc_mask3);
1183                 t1            = _mm_mul_ps(half,_mm_add_ps(t1,t4));
1184
1185                 tmp           = _mm_and_ps(t1,obc_mask1);
1186
1187                 t1            = _mm_add_ps(_mm_mul_ps(half,lij2),
1188                                            _mm_mul_ps(prod,lij3));
1189                 t1            = _mm_sub_ps(t1,
1190                                            _mm_mul_ps(onefourth,
1191                                                       _mm_add_ps(_mm_mul_ps(lij,rinv),
1192                                                                  _mm_mul_ps(lij3,r))));
1193                 t2            = _mm_mul_ps(onefourth,
1194                                            _mm_add_ps(_mm_mul_ps(uij,rinv),
1195                                                       _mm_mul_ps(uij3,r)));
1196                 t2            = _mm_sub_ps(t2,
1197                                            _mm_add_ps(_mm_mul_ps(half,uij2),
1198                                                       _mm_mul_ps(prod,uij3)));
1199                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth,logterm),
1200                                            _mm_mul_ps(rinv,rinv));
1201                 t3            = _mm_sub_ps(t3,
1202                                            _mm_mul_ps(_mm_mul_ps(diff2,oneeighth),
1203                                                       _mm_add_ps(one,
1204                                                                  _mm_mul_ps(sk2_rinv,rinv))));
1205                 t1            = _mm_mul_ps(rinv,
1206                                            _mm_add_ps(_mm_mul_ps(dlij,t1),
1207                                                       _mm_add_ps(t2,t3)));
1208                 dadx2         = _mm_and_ps(t1,obc_mask1);
1209             }
1210             else
1211             {
1212                 dadx2         = _mm_setzero_ps();
1213                 tmp           = _mm_setzero_ps();
1214             }
1215
1216             _mm_store_ps(dadx,dadx1);
1217             dadx += 4;
1218             _mm_store_ps(dadx,dadx2);
1219             dadx += 4;
1220
1221             if(offset==1)
1222             {
1223                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA,tmp);
1224             }
1225             else if(offset==2)
1226             {
1227                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA,work+jnrB,tmp);
1228             }
1229             else
1230             {
1231                 /* offset must be 3 */
1232                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA,work+jnrB,work+jnrC,tmp);
1233             }
1234
1235         }
1236         GMX_MM_UPDATE_1POT_PS(sum_ai,work+ii);
1237
1238         }
1239
1240         /* Parallel summations */
1241         if(PARTDECOMP(cr))
1242         {
1243                 gmx_sum(natoms, work, cr);
1244         }
1245         else if(DOMAINDECOMP(cr))
1246         {
1247                 dd_atom_sum_real(cr->dd, work);
1248         }
1249
1250     if(gb_algorithm==egbHCT)
1251     {
1252         /* HCT */
1253         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1254         {
1255                         if(born->use[i] != 0)
1256             {
1257                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1258                 sum     = 1.0/rr - work[i];
1259                 min_rad = rr + doffset;
1260                 rad     = 1.0/sum;
1261
1262                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
1263                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1264             }
1265         }
1266
1267         /* Extra communication required for DD */
1268         if(DOMAINDECOMP(cr))
1269         {
1270             dd_atom_spread_real(cr->dd, born->bRad);
1271             dd_atom_spread_real(cr->dd, fr->invsqrta);
1272         }
1273     }
1274     else
1275     {
1276         /* OBC */
1277         for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1278         {
1279                         if(born->use[i] != 0)
1280             {
1281                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
1282                 rr_inv2 = 1.0/rr;
1283                 rr      = rr-doffset;
1284                 rr_inv  = 1.0/rr;
1285                 sum     = rr * work[i];
1286                 sum2    = sum  * sum;
1287                 sum3    = sum2 * sum;
1288
1289                 tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1290                 born->bRad[i] = rr_inv - tsum*rr_inv2;
1291                 born->bRad[i] = 1.0 / born->bRad[i];
1292
1293                 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1294
1295                 tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1296                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1297             }
1298         }
1299         /* Extra (local) communication required for DD */
1300         if(DOMAINDECOMP(cr))
1301         {
1302             dd_atom_spread_real(cr->dd, born->bRad);
1303             dd_atom_spread_real(cr->dd, fr->invsqrta);
1304             dd_atom_spread_real(cr->dd, born->drobc);
1305         }
1306     }
1307
1308
1309
1310         return 0;
1311 }
1312
1313
1314
1315 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1316                                     float *x, float *f, float *fshift, float *shiftvec,
1317                                     int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1318 {
1319         int    i,k,n,ii,jnr,ii3,is3,nj0,nj1,offset,n0,n1;
1320         int        jnrA,jnrB,jnrC,jnrD;
1321     int    j3A,j3B,j3C,j3D;
1322         int        jnrE,jnrF,jnrG,jnrH;
1323     int    j3E,j3F,j3G,j3H;
1324         int *  jjnr;
1325
1326         float   rbi,shX,shY,shZ;
1327         float   *rb;
1328
1329         __m128 ix,iy,iz;
1330         __m128 jx,jy,jz;
1331         __m128 jxB,jyB,jzB;
1332         __m128 fix,fiy,fiz;
1333         __m128 dx,dy,dz;
1334     __m128 tx,ty,tz;
1335         __m128 dxB,dyB,dzB;
1336     __m128 txB,tyB,tzB;
1337
1338         __m128 rbai,rbaj,rbajB, f_gb, f_gb_ai,f_gbB,f_gb_aiB;
1339         __m128 xmm1,xmm2,xmm3;
1340
1341         const __m128 two = _mm_set1_ps(2.0f);
1342
1343         rb     = born->work;
1344
1345     jjnr   = nl->jjnr;
1346
1347         /* Loop to get the proper form for the Born radius term, sse style */
1348         offset=natoms%4;
1349
1350   n0 = 0;
1351   n1 = natoms;
1352
1353         if(gb_algorithm==egbSTILL)
1354         {
1355                 for(i=n0;i<n1;i++)
1356                 {
1357       rbi   = born->bRad[i];
1358                         rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1359                 }
1360         }
1361         else if(gb_algorithm==egbHCT)
1362         {
1363                 for(i=n0;i<n1;i++)
1364                 {
1365       rbi   = born->bRad[i];
1366                         rb[i] = rbi * rbi * dvda[i];
1367                 }
1368         }
1369         else if(gb_algorithm==egbOBC)
1370         {
1371                 for(i=n0;i<n1;i++)
1372                 {
1373       rbi   = born->bRad[i];
1374                         rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1375                 }
1376         }
1377
1378     jz = _mm_setzero_ps();
1379
1380     n = j3A = j3B = j3C = j3D = 0;
1381
1382         for(i=0;i<nl->nri;i++)
1383         {
1384         ii     = nl->iinr[i];
1385                 ii3        = ii*3;
1386         is3    = 3*nl->shift[i];
1387         shX    = shiftvec[is3];
1388         shY    = shiftvec[is3+1];
1389         shZ    = shiftvec[is3+2];
1390         nj0    = nl->jindex[i];
1391         nj1    = nl->jindex[i+1];
1392
1393         ix     = _mm_set1_ps(shX+x[ii3+0]);
1394                 iy     = _mm_set1_ps(shY+x[ii3+1]);
1395                 iz     = _mm_set1_ps(shZ+x[ii3+2]);
1396
1397                 offset = (nj1-nj0)%4;
1398
1399                 rbai   = _mm_load1_ps(rb+ii);
1400                 fix    = _mm_setzero_ps();
1401                 fiy    = _mm_setzero_ps();
1402                 fiz    = _mm_setzero_ps();
1403
1404
1405         for(k=nj0;k<nj1-offset;k+=4)
1406                 {
1407                         jnrA        = jjnr[k];
1408                         jnrB        = jjnr[k+1];
1409                         jnrC        = jjnr[k+2];
1410                         jnrD        = jjnr[k+3];
1411
1412             j3A         = 3*jnrA;
1413                         j3B         = 3*jnrB;
1414                         j3C         = 3*jnrC;
1415                         j3D         = 3*jnrD;
1416
1417             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A,x+j3B,x+j3C,x+j3D,jx,jy,jz);
1418
1419                         dx          = _mm_sub_ps(ix,jx);
1420                         dy          = _mm_sub_ps(iy,jy);
1421                         dz          = _mm_sub_ps(iz,jz);
1422
1423             GMX_MM_LOAD_4VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rb+jnrD,rbaj);
1424
1425                         /* load chain rule terms for j1-4 */
1426                         f_gb        = _mm_load_ps(dadx);
1427                         dadx += 4;
1428                         f_gb_ai     = _mm_load_ps(dadx);
1429                         dadx += 4;
1430
1431             /* calculate scalar force */
1432             f_gb    = _mm_mul_ps(f_gb,rbai);
1433             f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1434             f_gb    = _mm_add_ps(f_gb,f_gb_ai);
1435
1436             tx     = _mm_mul_ps(f_gb,dx);
1437             ty     = _mm_mul_ps(f_gb,dy);
1438             tz     = _mm_mul_ps(f_gb,dz);
1439
1440             fix    = _mm_add_ps(fix,tx);
1441             fiy    = _mm_add_ps(fiy,ty);
1442             fiz    = _mm_add_ps(fiz,tz);
1443
1444             GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A,f+j3B,f+j3C,f+j3D,tx,ty,tz);
1445                 }
1446
1447                 /*deal with odd elements */
1448                 if(offset!=0)
1449         {
1450             if(offset==1)
1451             {
1452                 jnrA        = jjnr[k];
1453                 j3A         = 3*jnrA;
1454                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A,jx,jy,jz);
1455                 GMX_MM_LOAD_1VALUE_PS(rb+jnrA,rbaj);
1456             }
1457             else if(offset==2)
1458             {
1459                 jnrA        = jjnr[k];
1460                 jnrB        = jjnr[k+1];
1461                 j3A         = 3*jnrA;
1462                 j3B         = 3*jnrB;
1463                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A,x+j3B,jx,jy,jz);
1464                 GMX_MM_LOAD_2VALUES_PS(rb+jnrA,rb+jnrB,rbaj);
1465             }
1466             else
1467             {
1468                 /* offset must be 3 */
1469                 jnrA        = jjnr[k];
1470                 jnrB        = jjnr[k+1];
1471                 jnrC        = jjnr[k+2];
1472                 j3A         = 3*jnrA;
1473                 j3B         = 3*jnrB;
1474                 j3C         = 3*jnrC;
1475                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A,x+j3B,x+j3C,jx,jy,jz);
1476                 GMX_MM_LOAD_3VALUES_PS(rb+jnrA,rb+jnrB,rb+jnrC,rbaj);
1477             }
1478
1479             dx          = _mm_sub_ps(ix,jx);
1480             dy          = _mm_sub_ps(iy,jy);
1481             dz          = _mm_sub_ps(iz,jz);
1482
1483             /* load chain rule terms for j1-4 */
1484             f_gb        = _mm_load_ps(dadx);
1485             dadx += 4;
1486             f_gb_ai     = _mm_load_ps(dadx);
1487             dadx += 4;
1488
1489             /* calculate scalar force */
1490             f_gb    = _mm_mul_ps(f_gb,rbai);
1491             f_gb_ai = _mm_mul_ps(f_gb_ai,rbaj);
1492             f_gb    = _mm_add_ps(f_gb,f_gb_ai);
1493
1494             tx     = _mm_mul_ps(f_gb,dx);
1495             ty     = _mm_mul_ps(f_gb,dy);
1496             tz     = _mm_mul_ps(f_gb,dz);
1497
1498             fix    = _mm_add_ps(fix,tx);
1499             fiy    = _mm_add_ps(fiy,ty);
1500             fiz    = _mm_add_ps(fiz,tz);
1501
1502             if(offset==1)
1503             {
1504                 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A,tx,ty,tz);
1505             }
1506             else if(offset==2)
1507             {
1508                 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A,f+j3B,tx,ty,tz);
1509             }
1510             else
1511             {
1512                 /* offset must be 3 */
1513                 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A,f+j3B,f+j3C,tx,ty,tz);
1514             }
1515         }
1516
1517                 /* fix/fiy/fiz now contain four partial force terms, that all should be
1518          * added to the i particle forces and shift forces.
1519          */
1520                 gmx_mm_update_iforce_1atom_ps(&fix,&fiy,&fiz,f+ii3,fshift+is3);
1521         }
1522
1523         return 0;
1524 }
1525
1526
1527 #else
1528 /* keep compiler happy */
1529 int genborn_sse_dummy;
1530
1531 #endif /* SSE intrinsics available */