src/gromacs/mdlib/genborn_sse2_single.c  (alexxy/gromacs.git, merge of release-4-6 into master)
/*
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>

#include "typedefs.h"
#include "smalloc.h"
#include "genborn.h"
#include "vec.h"
#include "grompp.h"
#include "pdbio.h"
#include "names.h"
#include "physics.h"
#include "partdec.h"
#include "domdec.h"
#include "network.h"
#include "gmx_fatal.h"
#include "mtop_util.h"

#ifdef GMX_LIB_MPI
#include <mpi.h>
#endif
#ifdef GMX_THREAD_MPI
#include "tmpi.h"
#endif

/* Only compile this file if SSE intrinsics are available; note that the
 * leading "0 &&" below currently disables these kernels unconditionally. */
#if 0 && defined (GMX_X86_SSE2)

#include <gmx_sse2_single.h>
#include <emmintrin.h>

#include "genborn_sse2_single.h"

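/* Compute Born radii for the Still GB model. J-atoms are processed in SSE
 * batches of four (unrolled to eight); the polarization sum for atom i
 * accumulates in gpi while the symmetric j contributions go to work[], and
 * the per-pair derivative factors are streamed into fr->dadx for the
 * chain-rule force evaluation performed elsewhere.
 */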
int
calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              const t_atomtypes *atype, float *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int          i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int          jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
    int          shift;
    int         *mdtype;
    real         shX, shY, shZ;
    int         *jjnr;
    real        *shiftvec;

    float        gpi_ai, gpi2;
    float        factor;
    float       *gb_radius;
    float       *vsolv;
    float       *work;
    float       *dadx;

    __m128       ix, iy, iz;
    __m128       jx, jy, jz;
    __m128       dx, dy, dz;
    __m128       tx, ty, tz;
    __m128       jxB, jyB, jzB;
    __m128       dxB, dyB, dzB;
    __m128       txB, tyB, tzB;
    __m128       rsq, rinv, rinv2, rinv4, rinv6;
    __m128       rsqB, rinvB, rinv2B, rinv4B, rinv6B;
    __m128       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128       ratioB, rajB, vajB, rvdwB;
    __m128       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128       ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
    __m128       mask, icf4, icf6, mask_cmp;
    __m128       icf4B, icf6B, mask_cmpB;

    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
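    /* mask1..mask3 enable only the lowest 1-3 lanes; the epilogue below ANDs
     * them onto vector results so the unused lanes of a partial 4-atom batch
     * contribute nothing. A minimal sketch of the idiom:
     *
     *   __m128 v      = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
     *   __m128 lowtwo = _mm_and_ps(v, mask2);  // -> (0, 0, 2.0f, 1.0f)
     */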

    const __m128 half   = _mm_set1_ps(0.5f);
    const __m128 three  = _mm_set1_ps(3.0f);
    const __m128 one    = _mm_set1_ps(1.0f);
    const __m128 two    = _mm_set1_ps(2.0f);
    const __m128 zero   = _mm_set1_ps(0.0f);
    const __m128 four   = _mm_set1_ps(4.0f);

    const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
    const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
    const __m128 still_p4     = _mm_set1_ps(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = jnrC = jnrD = 0;
    jx   = _mm_setzero_ps();
    jy   = _mm_setzero_ps();
    jz   = _mm_setzero_ps();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;
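        /*
         * The neighbor list is consumed in three stages below: an 8-wide
         * loop (two SSE vectors per iteration), a 4-wide loop, and a final
         * masked pass over the offset = 0..3 leftover j-atoms, so jjnr[] is
         * never read past its end.
         */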

        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_ps();

        rai     = _mm_load1_ps(gb_radius+ii);
        prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);

        for (k = nj0; k < nj1-4-offset; k += 8)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];
            jnrE        = jjnr[k+4];
            jnrF        = jjnr[k+5];
            jnrG        = jjnr[k+6];
            jnrH        = jjnr[k+7];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;
            j3E         = 3*jnrE;
            j3F         = 3*jnrF;
            j3G         = 3*jnrG;
            j3H         = 3*jnrH;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);
            dxB         = _mm_sub_ps(ix, jxB);
            dyB         = _mm_sub_ps(iy, jyB);
            dzB         = _mm_sub_ps(iz, jzB);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinvB       = gmx_mm_invsqrt_ps(rsqB);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv2B      = _mm_mul_ps(rinvB, rinvB);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv4B      = _mm_mul_ps(rinv2B, rinv2B);
            rinv6       = _mm_mul_ps(rinv4, rinv2);
            rinv6B      = _mm_mul_ps(rinv4B, rinv2B);

            rvdw        = _mm_add_ps(rai, raj);
            rvdwB       = _mm_add_ps(rai, rajB);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
            ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
            mask_cmpB   = _mm_cmple_ps(ratioB, still_p5inv);

            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
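            /* _mm_movemask_ps() collects the sign bit of each lane, so a zero
             * result means ratio exceeded STILL_P5INV in all four lanes and
             * the sin/cos evaluation can be skipped outright. */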
            if (0 == _mm_movemask_ps(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }
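            /* Per-lane scalar equivalent of the block above (a sketch only,
             * not part of the kernel):
             *
             *   if (ratio > STILL_P5INV) { ccf = 1; dccf = 0; }
             *   else {
             *       theta = ratio*STILL_PIP5;
             *       term  = 0.5*(1.0 - cos(theta));
             *       ccf   = term*term;
             *       dccf  = 2.0*term*sin(theta)*theta;
             *   }
             *
             * The vector code instead clamps ratio to STILL_P5INV, which
             * yields exactly ccf = 1, dccf = 0 at the boundary, so no blend
             * with mask_cmp is needed afterwards.
             */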
            if (0 == _mm_movemask_ps(mask_cmpB) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccfB        = one;
                dccfB       = _mm_setzero_ps();
            }
            else
            {
                ratioB      = _mm_min_ps(ratioB, still_p5inv);
                thetaB      = _mm_mul_ps(ratioB, still_pip5);
                gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
                termB       = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
                ccfB        = _mm_mul_ps(termB, termB);
                dccfB       = _mm_mul_ps(_mm_mul_ps(two, termB),
                                         _mm_mul_ps(sinqB, thetaB));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            prodB       = _mm_mul_ps(still_p4, vajB);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf4B       = _mm_mul_ps(ccfB, rinv4B);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
            icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));

            gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
            dadx += 4;
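            /* Four derivative vectors are streamed out per 8-atom batch:
             * j-side then i-side factors for lanes A-D, followed by the same
             * pair for lanes E-H. Whatever later reads fr->dadx back
             * (presumably the chain-rule force kernel, not shown here) must
             * walk the array in exactly this order. */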
        }

        for (; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv6       = _mm_mul_ps(rinv4, rinv2);

            rvdw        = _mm_add_ps(rai, raj);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);

            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));

            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
        }

        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA        = jjnr[k];
                j3A         = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
                GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
                mask        = mask1;
            }
            else if (offset == 2)
            {
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
                GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
                mask        = mask2;
            }
            else
            {
                /* offset must be 3 */
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                jnrC        = jjnr[k+2];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                j3C         = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
                GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
                mask        = mask3;
            }

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv6       = _mm_mul_ps(rinv4, rinv2);

            rvdw        = _mm_add_ps(rai, raj);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);

            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;

            tmp = _mm_mul_ps(prod_ai, icf4);

            if (offset == 1)
            {
                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
            }
            else if (offset == 2)
            {
                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
            }
        }
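        /* Horizontally add the four lanes of gpi and accumulate the scalar
         * sum into work[ii]. */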
        GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
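    /* Still model: b_i = factor/|gpol_i + work_i| with
     * factor = ONE_4PI_EPS0/2; gmx_invsqrt(gpi2) implements 1/|gpi_ai|. */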
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add the summed contributions to the initial polarization energy */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}

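/* Accumulate the HCT/OBC pairwise descreening integrals into work[], using
 * the same 8-wide/4-wide/masked loop structure as the Still kernel above;
 * how the accumulated sums are turned into Born radii at the end depends on
 * gb_algorithm (HCT or OBC).
 */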
int
calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                const t_atomtypes *atype, float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int          i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int          jnrA, jnrB, jnrC, jnrD;
    int          j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH;
    int          j3E, j3F, j3G, j3H;
    float        shX, shY, shZ;
    float        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    float        sum_ai2, sum_ai3, tsum, tchain, doffset;
    float       *obc_param;
    float       *gb_radius;
    float       *work;
    int         *jjnr;
    float       *dadx;
    float       *shiftvec;
    float        min_rad, rad;

    __m128       ix, iy, iz, jx, jy, jz;
    __m128       dx, dy, dz, t1, t2, t3, t4;
    __m128       rsq, rinv, r;
    __m128       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128       uij, lij2, uij2, lij3, uij3, diff2;
    __m128       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128       dadx1, dadx2;
    __m128       logterm;
    __m128       mask;
    __m128       obc_mask1, obc_mask2, obc_mask3;
    __m128       jxB, jyB, jzB, t1B, t2B, t3B, t4B;
    __m128       dxB, dyB, dzB, rsqB, rinvB, rB;
    __m128       rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
    __m128       uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
    __m128       lij_invB, sk2_invB, prodB;
    __m128       sk_ajB, sk2_ajB, sk2_rinvB;
    __m128       dadx1B, dadx2B;
    __m128       logtermB;
    __m128       obc_mask1B, obc_mask2B, obc_mask3B;

    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );

    __m128       oneeighth   = _mm_set1_ps(0.125f);
    __m128       onefourth   = _mm_set1_ps(0.25f);

    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);
    const __m128 one   = _mm_set1_ps(1.0f);
    const __m128 two   = _mm_set1_ps(2.0f);
    const __m128 zero  = _mm_set1_ps(0.0f);
    const __m128 neg   = _mm_set1_ps(-1.0f);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_ps();
    jy        = _mm_setzero_ps();
    jz        = _mm_setzero_ps();

    jnrA = jnrB = jnrC = jnrD = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        rai     = _mm_load1_ps(gb_radius+ii);
        rai_inv = gmx_mm_inv_ps(rai);

        sum_ai = _mm_setzero_ps();

        sk_ai  = _mm_load1_ps(born->param+ii);
        sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
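        /* born->param holds the HCT/OBC overlap (screening) factors; sk_ai
         * enters the pair integral as sk^2/(4 r), hence the precomputed
         * square. */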

        for (k = nj0; k < nj1-4-offset; k += 8)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];
            jnrE        = jjnr[k+4];
            jnrF        = jjnr[k+5];
            jnrG        = jjnr[k+6];
            jnrH        = jjnr[k+7];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;
            j3E         = 3*jnrE;
            j3F         = 3*jnrF;
            j3G         = 3*jnrG;
            j3H         = 3*jnrH;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);

            dx    = _mm_sub_ps(ix, jx);
            dy    = _mm_sub_ps(iy, jy);
            dz    = _mm_sub_ps(iz, jz);
            dxB   = _mm_sub_ps(ix, jxB);
            dyB   = _mm_sub_ps(iy, jyB);
            dzB   = _mm_sub_ps(iz, jzB);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);

            rinv        = gmx_mm_invsqrt_ps(rsq);
            r           = _mm_mul_ps(rsq, rinv);
            rinvB       = gmx_mm_invsqrt_ps(rsqB);
            rB          = _mm_mul_ps(rsqB, rinvB);

            /* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_ps(raj);
            raj_invB    = gmx_mm_inv_ps(rajB);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_ps(r, sk_aj);
            t2            = _mm_sub_ps(r, sk_aj);
            t3            = _mm_sub_ps(sk_aj, r);
            t1B           = _mm_add_ps(rB, sk_ajB);
            t2B           = _mm_sub_ps(rB, sk_ajB);
            t3B           = _mm_sub_ps(sk_ajB, rB);
            obc_mask1     = _mm_cmplt_ps(rai, t1);
            obc_mask2     = _mm_cmplt_ps(rai, t2);
            obc_mask3     = _mm_cmplt_ps(rai, t3);
            obc_mask1B    = _mm_cmplt_ps(rai, t1B);
            obc_mask2B    = _mm_cmplt_ps(rai, t2B);
            obc_mask3B    = _mm_cmplt_ps(rai, t3B);

            uij           = gmx_mm_inv_ps(t1);
            lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                         _mm_andnot_ps(obc_mask2, rai_inv));
            dlij          = _mm_and_ps(one, obc_mask2);
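            /* ANDing 1.0f with a comparison mask yields 1.0 or 0.0 per lane,
             * turning the mask into the multiplicative switch dlij used in
             * the derivative below. */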
            uij2          = _mm_mul_ps(uij, uij);
            uij3          = _mm_mul_ps(uij2, uij);
            lij2          = _mm_mul_ps(lij, lij);
            lij3          = _mm_mul_ps(lij2, lij);

            uijB          = gmx_mm_inv_ps(t1B);
            lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
                                         _mm_andnot_ps(obc_mask2B, rai_inv));
            dlijB         = _mm_and_ps(one, obc_mask2B);
            uij2B         = _mm_mul_ps(uijB, uijB);
            uij3B         = _mm_mul_ps(uij2B, uijB);
            lij2B         = _mm_mul_ps(lijB, lijB);
            lij3B         = _mm_mul_ps(lij2B, lijB);

            diff2         = _mm_sub_ps(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_ps(lij2);
            sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
            prod          = _mm_mul_ps(onefourth, sk2_rinv);

            diff2B        = _mm_sub_ps(uij2B, lij2B);
            lij_invB      = gmx_mm_invsqrt_ps(lij2B);
            sk2_ajB       = _mm_mul_ps(sk_ajB, sk_ajB);
            sk2_rinvB     = _mm_mul_ps(sk2_ajB, rinvB);
            prodB         = _mm_mul_ps(onefourth, sk2_rinvB);

            logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
            logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));

            t1            = _mm_sub_ps(lij, uij);
            t2            = _mm_mul_ps(diff2,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                  prod));
            t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
            t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
            t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
            t4            = _mm_and_ps(t4, obc_mask3);
            t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
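            /* t1 now holds the closed-form HCT descreening integral for this
             * pair (per lane):
             *
             *   I = 1/2 [ 1/L - 1/U + (r/4 - sk^2/(4r)) (1/U^2 - 1/L^2)
             *             + 1/(2r) ln(L/U) ]  (+ (1/rai - 1/L) when rai < sk - r)
             *
             * with lij = 1/L and uij = 1/U as computed above. */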

            t1B           = _mm_sub_ps(lijB, uijB);
            t2B           = _mm_mul_ps(diff2B,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, rB),
                                                  prodB));
            t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
            t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
            t4B           = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
            t4B           = _mm_and_ps(t4B, obc_mask3B);
            t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));

            sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
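            /* The AND with obc_mask1 zeroes the contribution from lanes where
             * rai >= r + sk_aj, i.e. pairs where atom j does not descreen
             * atom i at all. */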

            t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                       _mm_mul_ps(prod, lij3));
            t1            = _mm_sub_ps(t1,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                             _mm_mul_ps(lij3, r))));
            t2            = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                  _mm_mul_ps(uij3, r)));
            t2            = _mm_sub_ps(t2,
                                       _mm_add_ps(_mm_mul_ps(half, uij2),
                                                  _mm_mul_ps(prod, uij3)));
            t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                       _mm_mul_ps(rinv, rinv));
            t3            = _mm_sub_ps(t3,
                                       _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinv, rinv))));
            t1            = _mm_mul_ps(rinv,
                                       _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                  _mm_add_ps(t2, t3)));

            t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
                                       _mm_mul_ps(prodB, lij3B));
            t1B           = _mm_sub_ps(t1B,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lijB, rinvB),
                                                             _mm_mul_ps(lij3B, rB))));
            t2B           = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uijB, rinvB),
                                                  _mm_mul_ps(uij3B, rB)));
            t2B           = _mm_sub_ps(t2B,
                                       _mm_add_ps(_mm_mul_ps(half, uij2B),
                                                  _mm_mul_ps(prodB, uij3B)));
            t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
                                       _mm_mul_ps(rinvB, rinvB));
            t3B           = _mm_sub_ps(t3B,
                                       _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinvB, rinvB))));
            t1B           = _mm_mul_ps(rinvB,
                                       _mm_add_ps(_mm_mul_ps(dlijB, t1B),
                                                  _mm_add_ps(t2B, t3B)));

            dadx1         = _mm_and_ps(t1, obc_mask1);
            dadx1B        = _mm_and_ps(t1B, obc_mask1B);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_ps(r, sk_ai);
            t2            = _mm_sub_ps(r, sk_ai);
            t3            = _mm_sub_ps(sk_ai, r);
            t1B           = _mm_add_ps(rB, sk_ai);
            t2B           = _mm_sub_ps(rB, sk_ai);
            t3B           = _mm_sub_ps(sk_ai, rB);
            obc_mask1     = _mm_cmplt_ps(raj, t1);
            obc_mask2     = _mm_cmplt_ps(raj, t2);
            obc_mask3     = _mm_cmplt_ps(raj, t3);
            obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
            obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
            obc_mask3B    = _mm_cmplt_ps(rajB, t3B);

            uij           = gmx_mm_inv_ps(t1);
            lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                         _mm_andnot_ps(obc_mask2, raj_inv));
            dlij          = _mm_and_ps(one, obc_mask2);
            uij2          = _mm_mul_ps(uij, uij);
            uij3          = _mm_mul_ps(uij2, uij);
            lij2          = _mm_mul_ps(lij, lij);
            lij3          = _mm_mul_ps(lij2, lij);

            uijB          = gmx_mm_inv_ps(t1B);
            lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
                                         _mm_andnot_ps(obc_mask2B, raj_invB));
            dlijB         = _mm_and_ps(one, obc_mask2B);
            uij2B         = _mm_mul_ps(uijB, uijB);
            uij3B         = _mm_mul_ps(uij2B, uijB);
            lij2B         = _mm_mul_ps(lijB, lijB);
            lij3B         = _mm_mul_ps(lij2B, lijB);

            diff2         = _mm_sub_ps(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_ps(lij2);
            sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
            prod          = _mm_mul_ps(onefourth, sk2_rinv);

            diff2B        = _mm_sub_ps(uij2B, lij2B);
            lij_invB      = gmx_mm_invsqrt_ps(lij2B);
            sk2_rinvB     = _mm_mul_ps(sk2_ai, rinvB);
            prodB         = _mm_mul_ps(onefourth, sk2_rinvB);

            logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
            logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));

            t1            = _mm_sub_ps(lij, uij);
            t2            = _mm_mul_ps(diff2,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                  prod));
            t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
            t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
            t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
            t4            = _mm_and_ps(t4, obc_mask3);
            t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

            t1B           = _mm_sub_ps(lijB, uijB);
            t2B           = _mm_mul_ps(diff2B,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, rB),
                                                  prodB));
            t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
            t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
            t4B           = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
            t4B           = _mm_and_ps(t4B, obc_mask3B);
            t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));

            t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                       _mm_mul_ps(prod, lij3));
            t1            = _mm_sub_ps(t1,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                             _mm_mul_ps(lij3, r))));
            t2            = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                  _mm_mul_ps(uij3, r)));
            t2            = _mm_sub_ps(t2,
                                       _mm_add_ps(_mm_mul_ps(half, uij2),
                                                  _mm_mul_ps(prod, uij3)));
            t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                       _mm_mul_ps(rinv, rinv));
            t3            = _mm_sub_ps(t3,
                                       _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinv, rinv))));
            t1            = _mm_mul_ps(rinv,
                                       _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                  _mm_add_ps(t2, t3)));

            t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
                                       _mm_mul_ps(prodB, lij3B));
            t1B           = _mm_sub_ps(t1B,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lijB, rinvB),
                                                             _mm_mul_ps(lij3B, rB))));
            t2B           = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uijB, rinvB),
                                                  _mm_mul_ps(uij3B, rB)));
            t2B           = _mm_sub_ps(t2B,
                                       _mm_add_ps(_mm_mul_ps(half, uij2B),
                                                  _mm_mul_ps(prodB, uij3B)));
            t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
                                       _mm_mul_ps(rinvB, rinvB));
            t3B           = _mm_sub_ps(t3B,
                                       _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinvB, rinvB))));
            t1B           = _mm_mul_ps(rinvB,
                                       _mm_add_ps(_mm_mul_ps(dlijB, t1B),
                                                  _mm_add_ps(t2B, t3B)));

            dadx2         = _mm_and_ps(t1, obc_mask1);
            dadx2B        = _mm_and_ps(t1B, obc_mask1B);

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;
            _mm_store_ps(dadx, dadx1B);
            dadx += 4;
            _mm_store_ps(dadx, dadx2B);
            dadx += 4;

        } /* end normal inner loop */

        for (; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);

            dx    = _mm_sub_ps(ix, jx);
            dy    = _mm_sub_ps(iy, jy);
            dz    = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_ps(rsq);
            r           = _mm_mul_ps(rsq, rinv);

            /* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_ps(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_ps(r, sk_aj);
            obc_mask1     = _mm_cmplt_ps(rai, t1);

            if (_mm_movemask_ps(obc_mask1))
            {
                /* If any of the elements has rai<dr+sk, this is executed */
                t2            = _mm_sub_ps(r, sk_aj);
                t3            = _mm_sub_ps(sk_aj, r);

                obc_mask2     = _mm_cmplt_ps(rai, t2);
                obc_mask3     = _mm_cmplt_ps(rai, t3);

                uij           = gmx_mm_inv_ps(t1);
                lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                             _mm_andnot_ps(obc_mask2, rai_inv));
                dlij          = _mm_and_ps(one, obc_mask2);
                uij2          = _mm_mul_ps(uij, uij);
                uij3          = _mm_mul_ps(uij2, uij);
                lij2          = _mm_mul_ps(lij, lij);
                lij3          = _mm_mul_ps(lij2, lij);
                diff2         = _mm_sub_ps(uij2, lij2);
                lij_inv       = gmx_mm_invsqrt_ps(lij2);
                sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
                sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
                prod          = _mm_mul_ps(onefourth, sk2_rinv);
                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1            = _mm_sub_ps(lij, uij);
                t2            = _mm_mul_ps(diff2,
                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                      prod));
                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
                t4            = _mm_and_ps(t4, obc_mask3);
                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
                sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));

                dadx1         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx1         = _mm_setzero_ps();
            }

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_ps(r, sk_ai);
            obc_mask1     = _mm_cmplt_ps(raj, t1);

            if (_mm_movemask_ps(obc_mask1))
            {
                t2            = _mm_sub_ps(r, sk_ai);
                t3            = _mm_sub_ps(sk_ai, r);
                obc_mask2     = _mm_cmplt_ps(raj, t2);
                obc_mask3     = _mm_cmplt_ps(raj, t3);

                uij           = gmx_mm_inv_ps(t1);
                lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                             _mm_andnot_ps(obc_mask2, raj_inv));
                dlij          = _mm_and_ps(one, obc_mask2);
                uij2          = _mm_mul_ps(uij, uij);
                uij3          = _mm_mul_ps(uij2, uij);
                lij2          = _mm_mul_ps(lij, lij);
                lij3          = _mm_mul_ps(lij2, lij);
                diff2         = _mm_sub_ps(uij2, lij2);
                lij_inv       = gmx_mm_invsqrt_ps(lij2);
                sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
                prod          = _mm_mul_ps(onefourth, sk2_rinv);
                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1            = _mm_sub_ps(lij, uij);
                t2            = _mm_mul_ps(diff2,
                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                      prod));
                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
                t4            = _mm_and_ps(t4, obc_mask3);
                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

                GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));

                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));
                dadx2         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx2         = _mm_setzero_ps();
            }

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;
        } /* end normal inner loop */
1032
1033         if (offset != 0)
1034         {
1035             if (offset == 1)
1036             {
1037                 jnrA        = jjnr[k];
1038                 j3A         = 3*jnrA;
1039                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1040                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1041                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1042                 mask        = mask1;
1043             }
1044             else if (offset == 2)
1045             {
1046                 jnrA        = jjnr[k];
1047                 jnrB        = jjnr[k+1];
1048                 j3A         = 3*jnrA;
1049                 j3B         = 3*jnrB;
1050                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1051                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1052                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1053                 mask        = mask2;
1054             }
1055             else
1056             {
1057                 /* offset must be 3 */
1058                 jnrA        = jjnr[k];
1059                 jnrB        = jjnr[k+1];
1060                 jnrC        = jjnr[k+2];
1061                 j3A         = 3*jnrA;
1062                 j3B         = 3*jnrB;
1063                 j3C         = 3*jnrC;
1064                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1065                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1066                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1067                 mask        = mask3;
1068             }
1069
1070             dx    = _mm_sub_ps(ix, jx);
1071             dy    = _mm_sub_ps(iy, jy);
1072             dz    = _mm_sub_ps(iz, jz);
1073
1074             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
1075
1076             rinv        = gmx_mm_invsqrt_ps(rsq);
1077             r           = _mm_mul_ps(rsq, rinv);
1078
1079             /* Compute raj_inv aj1-4 */
1080             raj_inv     = gmx_mm_inv_ps(raj);
1081
1082             /* Evaluate influence of atom aj -> ai */
1083             t1            = _mm_add_ps(r, sk_aj);
1084             obc_mask1     = _mm_cmplt_ps(rai, t1);
1085             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1086
1087             if (_mm_movemask_ps(obc_mask1))
1088             {
1089                 t2            = _mm_sub_ps(r, sk_aj);
1090                 t3            = _mm_sub_ps(sk_aj, r);
1091                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1092                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1093
1094                 uij           = gmx_mm_inv_ps(t1);
1095                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1096                                              _mm_andnot_ps(obc_mask2, rai_inv));
1097                 dlij           = _mm_and_ps(one, obc_mask2);
1098                 uij2           = _mm_mul_ps(uij, uij);
1099                 uij3           = _mm_mul_ps(uij2, uij);
1100                 lij2           = _mm_mul_ps(lij, lij);
1101                 lij3           = _mm_mul_ps(lij2, lij);
1102                 diff2          = _mm_sub_ps(uij2, lij2);
1103                 lij_inv        = gmx_mm_invsqrt_ps(lij2);
1104                 sk2_aj         = _mm_mul_ps(sk_aj, sk_aj);
1105                 sk2_rinv       = _mm_mul_ps(sk2_aj, rinv);
1106                 prod           = _mm_mul_ps(onefourth, sk2_rinv);
1107                 logterm        = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1108                 t1             = _mm_sub_ps(lij, uij);
1109                 t2             = _mm_mul_ps(diff2,
1110                                             _mm_sub_ps(_mm_mul_ps(onefourth, r),
1111                                                        prod));
1112                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1113                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1114                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1115                 t4            = _mm_and_ps(t4, obc_mask3);
1116                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1117                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1118                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1119                                            _mm_mul_ps(prod, lij3));
1120                 t1            = _mm_sub_ps(t1,
1121                                            _mm_mul_ps(onefourth,
1122                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1123                                                                  _mm_mul_ps(lij3, r))));
1124                 t2            = _mm_mul_ps(onefourth,
1125                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1126                                                       _mm_mul_ps(uij3, r)));
1127                 t2            = _mm_sub_ps(t2,
1128                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1129                                                       _mm_mul_ps(prod, uij3)));
1130                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1131                                            _mm_mul_ps(rinv, rinv));
1132                 t3            = _mm_sub_ps(t3,
1133                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1134                                                       _mm_add_ps(one,
1135                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1136                 t1            = _mm_mul_ps(rinv,
1137                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1138                                                       _mm_add_ps(t2, t3)));
1139                 dadx1         = _mm_and_ps(t1, obc_mask1);
1140             }
1141             else
1142             {
1143                 dadx1         = _mm_setzero_ps();
1144             }
1145
1146             /* Evaluate influence of atom ai -> aj */
1147             t1            = _mm_add_ps(r, sk_ai);
1148             obc_mask1     = _mm_cmplt_ps(raj, t1);
1149             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1150
1151             if (_mm_movemask_ps(obc_mask1))
1152             {
1153                 t2            = _mm_sub_ps(r, sk_ai);
1154                 t3            = _mm_sub_ps(sk_ai, r);
1155                 obc_mask2     = _mm_cmplt_ps(raj, t2);
1156                 obc_mask3     = _mm_cmplt_ps(raj, t3);
1157
1158                 uij           = gmx_mm_inv_ps(t1);
1159                 lij           = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1160                                           _mm_andnot_ps(obc_mask2, raj_inv));
1161                 dlij          = _mm_and_ps(one, obc_mask2);
1162                 uij2          = _mm_mul_ps(uij, uij);
1163                 uij3          = _mm_mul_ps(uij2, uij);
1164                 lij2          = _mm_mul_ps(lij, lij);
1165                 lij3          = _mm_mul_ps(lij2, lij);
1166                 diff2         = _mm_sub_ps(uij2, lij2);
1167                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1168                 sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
1169                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
1170                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1171                 t1            = _mm_sub_ps(lij, uij);
1172                 t2            = _mm_mul_ps(diff2,
1173                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
1174                                                       prod));
1175                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1176                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1177                 t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
1178                 t4            = _mm_and_ps(t4, obc_mask3);
1179                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1180
                tmp           = _mm_and_ps(t1, obc_mask1);

                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));
                dadx2         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx2         = _mm_setzero_ps();
                tmp           = _mm_setzero_ps();
            }

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;

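            /* Added note: each iteration appends eight floats to dadx --
             * first the four d(sum_i)/dr derivatives (dadx1), then the four
             * d(sum_j)/dr derivatives (dadx2). calc_gb_chainrule_sse2_single()
             * below consumes them in exactly this order.
             */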
            if (offset == 1)
            {
                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
            }
            else if (offset == 2)
            {
                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
            }

        }
        GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

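    /* Added note: for HCT the Born radius follows directly from the
     * accumulated descreening sum, 1/R_i = 1/(rho_i - doffset) - work[i],
     * clamped from below by the full van der Waals radius rho_i (min_rad).
     */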
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
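        /* Added note: this implements the OBC rescaling
         *   1/R_i = 1/rho~_i - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho_i
         * with rho~_i = rho_i - doffset and psi = rho~_i * work[i].
         * drobc[i] stores the derivative of the tanh term with respect to
         * work[i], which the chain-rule kernel needs later.
         */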
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}

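/* Added sketch (illustrative only, not part of the original code and not
 * called anywhere): a plain scalar version of the OBC radius update above,
 * useful as a reference when checking the SSE path. The function and
 * parameter names are hypothetical; psi_sum corresponds to work[i] after
 * the summation loops, and tanh() comes from math.h.
 */
static float
gb_obc_radius_reference(float rho, float doffset, float psi_sum,
                        float alpha, float beta, float gamma)
{
    float rho_t = rho - doffset;   /* reduced radius rho~ */
    float psi   = rho_t*psi_sum;
    float tsum  = (float)tanh(alpha*psi - beta*psi*psi + gamma*psi*psi*psi);

    /* 1/R = 1/rho~ - tanh(...)/rho, then invert to get the Born radius */
    return 1.0f/(1.0f/rho_t - tsum/rho);
}
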
float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
                                    float *x, float *f, float *fshift, float *shiftvec,
                                    int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int          i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
    int          jnrA, jnrB, jnrC, jnrD;
    int          j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH;
    int          j3E, j3F, j3G, j3H;
    int       *  jjnr;

    float        rbi, shX, shY, shZ;
    float       *rb;

    __m128       ix, iy, iz;
    __m128       jx, jy, jz;
    __m128       jxB, jyB, jzB;
    __m128       fix, fiy, fiz;
    __m128       dx, dy, dz;
    __m128       tx, ty, tz;
    __m128       dxB, dyB, dzB;
    __m128       txB, tyB, tzB;

    __m128       rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
    __m128       xmm1, xmm2, xmm3;

    const __m128 two = _mm_set1_ps(2.0f);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Loop to put the Born-radius derivative term dvda into the proper
     * form for each GB algorithm (SSE version) */
    offset = natoms%4;

    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }
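
    /* Added note (interpretation): rb[i] gathers the per-atom chain-rule
     * prefactor dR_i/d(sum) * dvda[i] for the selected GB flavor; the
     * Still branch additionally folds in the 2/ONE_4PI_EPS0 scaling from
     * its radius expression. These prefactors multiply the raw dadx
     * derivatives in the pair loop below.
     */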

    jz = _mm_setzero_ps();

    n = j3A = j3B = j3C = j3D = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        rbai   = _mm_load1_ps(rb+ii);
        fix    = _mm_setzero_ps();
        fiy    = _mm_setzero_ps();
        fiz    = _mm_setzero_ps();


        for (k = nj0; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);

            /* load chain rule terms for j1-4 */
            f_gb        = _mm_load_ps(dadx);
            dadx       += 4;
            f_gb_ai     = _mm_load_ps(dadx);
            dadx       += 4;

            /* calculate scalar force */
            f_gb    = _mm_mul_ps(f_gb, rbai);
            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
            f_gb    = _mm_add_ps(f_gb, f_gb_ai);

            tx     = _mm_mul_ps(f_gb, dx);
            ty     = _mm_mul_ps(f_gb, dy);
            tz     = _mm_mul_ps(f_gb, dz);

            fix    = _mm_add_ps(fix, tx);
            fiy    = _mm_add_ps(fiy, ty);
            fiz    = _mm_add_ps(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
        }
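
        /* Added note: f_gb combines the two one-sided radius derivatives,
         *   f_gb = dadx_ij*rb[i] + dadx_ji*rb[j],
         * and the resulting pair force is added to the i accumulators
         * (fix/fiy/fiz) and subtracted from the j forces above, so Newton's
         * third law is satisfied pairwise.
         */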

        /* Deal with the remaining 1-3 j particles */
        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA        = jjnr[k];
                j3A         = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
            }
            else if (offset == 2)
            {
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
            }
            else
            {
                /* offset must be 3 */
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                jnrC        = jjnr[k+2];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                j3C         = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
            }

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            /* load chain rule terms for the remaining j particles */
            f_gb        = _mm_load_ps(dadx);
            dadx       += 4;
            f_gb_ai     = _mm_load_ps(dadx);
            dadx       += 4;

            /* calculate scalar force */
            f_gb    = _mm_mul_ps(f_gb, rbai);
            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
            f_gb    = _mm_add_ps(f_gb, f_gb_ai);

            tx     = _mm_mul_ps(f_gb, dx);
            ty     = _mm_mul_ps(f_gb, dy);
            tz     = _mm_mul_ps(f_gb, dz);

            fix    = _mm_add_ps(fix, tx);
            fiy    = _mm_add_ps(fiy, ty);
            fiz    = _mm_add_ps(fiz, tz);

            if (offset == 1)
            {
                GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
            }
            else if (offset == 2)
            {
                GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
            }
        }

        /* fix/fiy/fiz now contain four partial force terms, which should
         * all be added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}
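
/* Added sketch (illustrative only, not called anywhere): the per-pair
 * scalar arithmetic that the SIMD loop in calc_gb_chainrule_sse2_single()
 * performs four pairs at a time. The function name is hypothetical;
 * dadx_ij/dadx_ji are the two stored radius derivatives for the pair and
 * rb_i/rb_j the prefactors set up at the top of that function.
 */
static void
gb_chainrule_pair_reference(const float xi[3], const float xj[3],
                            float dadx_ij, float dadx_ji,
                            float rb_i,    float rb_j,
                            float fi[3],   float fj[3])
{
    int   d;
    float fgb = dadx_ij*rb_i + dadx_ji*rb_j;   /* scalar force prefactor */

    for (d = 0; d < 3; d++)
    {
        float t = fgb*(xi[d] - xj[d]);
        fi[d]  += t;   /* accumulate on i ... */
        fj[d]  -= t;   /* ... and apply the opposite force to j */
    }
}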


#else
/* keep compiler happy */
int genborn_sse_dummy;

#endif /* SSE intrinsics available */