src/gromacs/mdlib/genborn_sse2_single.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include <math.h>
  40 #include <string.h>
  41
  42 #include "gromacs/fileio/pdbio.h"
  43 #include "gromacs/legacyheaders/domdec.h"
  44 #include "gromacs/legacyheaders/genborn.h"
  45 #include "gromacs/legacyheaders/names.h"
  46 #include "gromacs/legacyheaders/network.h"
  47 #include "gromacs/legacyheaders/typedefs.h"
  48 #include "gromacs/math/units.h"
  49 #include "gromacs/math/vec.h"
  50 #include "gromacs/utility/fatalerror.h"
  51 #include "gromacs/utility/gmxmpi.h"
  52 #include "gromacs/utility/smalloc.h"
  53
  54
  55 /* Only compile this file if SSE intrinsics are available */
  56 #if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
  57
  58 #include "genborn_sse2_single.h"
  59
  60 #include <emmintrin.h>
  61 #include <gmx_sse2_single.h>
  62
  63
  64 int
  65 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
  66                               int natoms, gmx_localtop_t *top,
  67                               float *x, t_nblist *nl,
  68                               gmx_genborn_t *born)
  69 {
  70     int          i, k, n, ii, is3, ii3, nj0, nj1, offset;
  71     int          jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
  72     int          jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
  73     int          shift;
  74     int         *mdtype;
  75     real         shX, shY, shZ;
  76     int         *jjnr;
  77     real        *shiftvec;
  78
  79     float        gpi_ai, gpi2;
  80     float        factor;
  81     float       *gb_radius;
  82     float       *vsolv;
  83     float       *work;
  84     float       *dadx;
  85
  86     __m128       ix, iy, iz;
  87     __m128       jx, jy, jz;
  88     __m128       dx, dy, dz;
  89     __m128       tx, ty, tz;
  90     __m128       jxB, jyB, jzB;
  91     __m128       dxB, dyB, dzB;
  92     __m128       txB, tyB, tzB;
  93     __m128       rsq, rinv, rinv2, rinv4, rinv6;
  94     __m128       rsqB, rinvB, rinv2B, rinv4B, rinv6B;
  95     __m128       ratio, gpi, rai, raj, vai, vaj, rvdw;
  96     __m128       ratioB, rajB, vajB, rvdwB;
  97     __m128       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
  98     __m128       ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
  99     __m128       mask, icf4, icf6, mask_cmp;
 100     __m128       icf4B, icf6B, mask_cmpB;
 101
 102     __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
 103     __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
 104     __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
 105
 106     const __m128 half   = _mm_set1_ps(0.5f);
 107     const __m128 three  = _mm_set1_ps(3.0f);
 108     const __m128 one    = _mm_set1_ps(1.0f);
 109     const __m128 two    = _mm_set1_ps(2.0f);
 110     const __m128 zero   = _mm_set1_ps(0.0f);
 111     const __m128 four   = _mm_set1_ps(4.0f);
 112
 113     const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
 114     const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
 115     const __m128 still_p4     = _mm_set1_ps(STILL_P4);
 116
 117     factor  = 0.5 * ONE_4PI_EPS0;
 118
 119     gb_radius = born->gb_radius;
 120     vsolv     = born->vsolv;
 121     work      = born->gpol_still_work;
 122     jjnr      = nl->jjnr;
 123     shiftvec  = fr->shift_vec[0];
 124     dadx      = fr->dadx;
 125
 126     jnrA = jnrB = jnrC = jnrD = 0;
 127     jx   = _mm_setzero_ps();
 128     jy   = _mm_setzero_ps();
 129     jz   = _mm_setzero_ps();
 130
 131     n = 0;
 132
 133     for (i = 0; i < natoms; i++)
 134     {
 135         work[i] = 0;
 136     }
 137
 138     for (i = 0; i < nl->nri; i++)
 139     {
 140         ii     = nl->iinr[i];
 141         ii3    = ii*3;
 142         is3    = 3*nl->shift[i];
 143         shX    = shiftvec[is3];
 144         shY    = shiftvec[is3+1];
 145         shZ    = shiftvec[is3+2];
 146         nj0    = nl->jindex[i];
 147         nj1    = nl->jindex[i+1];
 148
 149         ix     = _mm_set1_ps(shX+x[ii3+0]);
 150         iy     = _mm_set1_ps(shY+x[ii3+1]);
 151         iz     = _mm_set1_ps(shZ+x[ii3+2]);
 152
 153         offset = (nj1-nj0)%4;
 154
 155         /* Polarization energy for atom ai */
 156         gpi    = _mm_setzero_ps();
 157
 158         rai     = _mm_load1_ps(gb_radius+ii);
 159         prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
 160
 161         for (k = nj0; k < nj1-4-offset; k += 8)
 162         {
 163             jnrA        = jjnr[k];
 164             jnrB        = jjnr[k+1];
 165             jnrC        = jjnr[k+2];
 166             jnrD        = jjnr[k+3];
 167             jnrE        = jjnr[k+4];
 168             jnrF        = jjnr[k+5];
 169             jnrG        = jjnr[k+6];
 170             jnrH        = jjnr[k+7];
 171
 172             j3A         = 3*jnrA;
 173             j3B         = 3*jnrB;
 174             j3C         = 3*jnrC;
 175             j3D         = 3*jnrD;
 176             j3E         = 3*jnrE;
 177             j3F         = 3*jnrF;
 178             j3G         = 3*jnrG;
 179             j3H         = 3*jnrH;
 180
 181             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 182             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
 183
 184             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 185             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
 186             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
 187             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);
 188
 189             dx          = _mm_sub_ps(ix, jx);
 190             dy          = _mm_sub_ps(iy, jy);
 191             dz          = _mm_sub_ps(iz, jz);
 192             dxB         = _mm_sub_ps(ix, jxB);
 193             dyB         = _mm_sub_ps(iy, jyB);
 194             dzB         = _mm_sub_ps(iz, jzB);
 195
 196             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 197             rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
 198             rinv        = gmx_mm_invsqrt_ps(rsq);
 199             rinvB       = gmx_mm_invsqrt_ps(rsqB);
 200             rinv2       = _mm_mul_ps(rinv, rinv);
 201             rinv2B      = _mm_mul_ps(rinvB, rinvB);
 202             rinv4       = _mm_mul_ps(rinv2, rinv2);
 203             rinv4B      = _mm_mul_ps(rinv2B, rinv2B);
 204             rinv6       = _mm_mul_ps(rinv4, rinv2);
 205             rinv6B      = _mm_mul_ps(rinv4B, rinv2B);
 206
 207             rvdw        = _mm_add_ps(rai, raj);
 208             rvdwB       = _mm_add_ps(rai, rajB);
 209             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
 210             ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));
 211
 212             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
 213             mask_cmpB   = _mm_cmple_ps(ratioB, still_p5inv);
 214
 215             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
 216             if (0 == _mm_movemask_ps(mask_cmp) )
 217             {
 218                 /* if ratio>still_p5inv for ALL elements */
 219                 ccf         = one;
 220                 dccf        = _mm_setzero_ps();
 221             }
 222             else
 223             {
 224                 ratio       = _mm_min_ps(ratio, still_p5inv);
 225                 theta       = _mm_mul_ps(ratio, still_pip5);
 226                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
 227                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
 228                 ccf         = _mm_mul_ps(term, term);
 229                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
 230                                          _mm_mul_ps(sinq, theta));
 231             }
 232             if (0 == _mm_movemask_ps(mask_cmpB) )
 233             {
 234                 /* if ratio>still_p5inv for ALL elements */
 235                 ccfB        = one;
 236                 dccfB       = _mm_setzero_ps();
 237             }
 238             else
 239             {
 240                 ratioB      = _mm_min_ps(ratioB, still_p5inv);
 241                 thetaB      = _mm_mul_ps(ratioB, still_pip5);
 242                 gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
 243                 termB       = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
 244                 ccfB        = _mm_mul_ps(termB, termB);
 245                 dccfB       = _mm_mul_ps(_mm_mul_ps(two, termB),
 246                                          _mm_mul_ps(sinqB, thetaB));
 247             }
 248
 249             prod        = _mm_mul_ps(still_p4, vaj);
 250             prodB       = _mm_mul_ps(still_p4, vajB);
 251             icf4        = _mm_mul_ps(ccf, rinv4);
 252             icf4B       = _mm_mul_ps(ccfB, rinv4B);
 253             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
 254             icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);
 255
 256             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
 257             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));
 258
 259             gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );
 260
 261             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
 262             dadx += 4;
 263             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
 264             dadx += 4;
 265             _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
 266             dadx += 4;
 267             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
 268             dadx += 4;
 269         }
 270
 271         for (; k < nj1-offset; k += 4)
 272         {
 273             jnrA        = jjnr[k];
 274             jnrB        = jjnr[k+1];
 275             jnrC        = jjnr[k+2];
 276             jnrD        = jjnr[k+3];
 277
 278             j3A         = 3*jnrA;
 279             j3B         = 3*jnrB;
 280             j3C         = 3*jnrC;
 281             j3D         = 3*jnrD;
 282
 283             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 284
 285             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 286             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
 287
 288             dx          = _mm_sub_ps(ix, jx);
 289             dy          = _mm_sub_ps(iy, jy);
 290             dz          = _mm_sub_ps(iz, jz);
 291
 292             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 293             rinv        = gmx_mm_invsqrt_ps(rsq);
 294             rinv2       = _mm_mul_ps(rinv, rinv);
 295             rinv4       = _mm_mul_ps(rinv2, rinv2);
 296             rinv6       = _mm_mul_ps(rinv4, rinv2);
 297
 298             rvdw        = _mm_add_ps(rai, raj);
 299             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
 300
 301             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
 302
 303             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
 304             if (0 == _mm_movemask_ps(mask_cmp))
 305             {
 306                 /* if ratio>still_p5inv for ALL elements */
 307                 ccf         = one;
 308                 dccf        = _mm_setzero_ps();
 309             }
 310             else
 311             {
 312                 ratio       = _mm_min_ps(ratio, still_p5inv);
 313                 theta       = _mm_mul_ps(ratio, still_pip5);
 314                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
 315                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
 316                 ccf         = _mm_mul_ps(term, term);
 317                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
 318                                          _mm_mul_ps(sinq, theta));
 319             }
 320
 321             prod        = _mm_mul_ps(still_p4, vaj);
 322             icf4        = _mm_mul_ps(ccf, rinv4);
 323             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
 324
 325             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
 326
 327             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
 328
 329             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
 330             dadx += 4;
 331             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
 332             dadx += 4;
 333         }
 334
 335         if (offset != 0)
 336         {
 337             if (offset == 1)
 338             {
 339                 jnrA        = jjnr[k];
 340                 j3A         = 3*jnrA;
 341                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
 342                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
 343                 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
 344                 mask        = mask1;
 345             }
 346             else if (offset == 2)
 347             {
 348                 jnrA        = jjnr[k];
 349                 jnrB        = jjnr[k+1];
 350                 j3A         = 3*jnrA;
 351                 j3B         = 3*jnrB;
 352                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
 353                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
 354                 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
 355                 mask        = mask2;
 356             }
 357             else
 358             {
 359                 /* offset must be 3 */
 360                 jnrA        = jjnr[k];
 361                 jnrB        = jjnr[k+1];
 362                 jnrC        = jjnr[k+2];
 363                 j3A         = 3*jnrA;
 364                 j3B         = 3*jnrB;
 365                 j3C         = 3*jnrC;
 366                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
 367                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
 368                 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
 369                 mask        = mask3;
 370             }
 371
 372             dx          = _mm_sub_ps(ix, jx);
 373             dy          = _mm_sub_ps(iy, jy);
 374             dz          = _mm_sub_ps(iz, jz);
 375
 376             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 377             rinv        = gmx_mm_invsqrt_ps(rsq);
 378             rinv2       = _mm_mul_ps(rinv, rinv);
 379             rinv4       = _mm_mul_ps(rinv2, rinv2);
 380             rinv6       = _mm_mul_ps(rinv4, rinv2);
 381
 382             rvdw        = _mm_add_ps(rai, raj);
 383             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
 384
 385             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
 386
 387             if (0 == _mm_movemask_ps(mask_cmp))
 388             {
 389                 /* if ratio>still_p5inv for ALL elements */
 390                 ccf         = one;
 391                 dccf        = _mm_setzero_ps();
 392             }
 393             else
 394             {
 395                 ratio       = _mm_min_ps(ratio, still_p5inv);
 396                 theta       = _mm_mul_ps(ratio, still_pip5);
 397                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
 398                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
 399                 ccf         = _mm_mul_ps(term, term);
 400                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
 401                                          _mm_mul_ps(sinq, theta));
 402             }
 403
 404             prod        = _mm_mul_ps(still_p4, vaj);
 405             icf4        = _mm_mul_ps(ccf, rinv4);
 406             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
 407
 408             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
 409
 410             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
 411             dadx += 4;
 412             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
 413             dadx += 4;
 414
 415             tmp = _mm_mul_ps(prod_ai, icf4);
 416
 417             if (offset == 1)
 418             {
 419                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
 420             }
 421             else if (offset == 2)
 422             {
 423                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
 424             }
 425             else
 426             {
 427                 /* offset must be 3 */
 428                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
 429             }
 430         }
 431         GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
 432     }
 433
 434     /* Sum up the polarization energy from other nodes */
 435     if (DOMAINDECOMP(cr))
 436     {
 437         dd_atom_sum_real(cr->dd, work);
 438     }
 439
 440     /* Compute the radii */
 441     for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
 442     {
 443         if (born->use[i] != 0)
 444         {
 445             gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
 446             gpi2             = gpi_ai * gpi_ai;
 447             born->bRad[i]    = factor*gmx_invsqrt(gpi2);
 448             fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
 449         }
 450     }
 451
 452     /* Extra (local) communication required for DD */
 453     if (DOMAINDECOMP(cr))
 454     {
 455         dd_atom_spread_real(cr->dd, born->bRad);
 456         dd_atom_spread_real(cr->dd, fr->invsqrta);
 457     }
 458
 459     return 0;
 460 }
 461
 462
 463 int
 464 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
 465                                 float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
 466 {
 467     int          i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
 468     int          jnrA, jnrB, jnrC, jnrD;
 469     int          j3A, j3B, j3C, j3D;
 470     int          jnrE, jnrF, jnrG, jnrH;
 471     int          j3E, j3F, j3G, j3H;
 472     float        shX, shY, shZ;
 473     float        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
 474     float        sum_ai2, sum_ai3, tsum, tchain, doffset;
 475     float       *obc_param;
 476     float       *gb_radius;
 477     float       *work;
 478     int       *  jjnr;
 479     float       *dadx;
 480     float       *shiftvec;
 481     float        min_rad, rad;
 482
 483     __m128       ix, iy, iz, jx, jy, jz;
 484     __m128       dx, dy, dz, t1, t2, t3, t4;
 485     __m128       rsq, rinv, r;
 486     __m128       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
 487     __m128       uij, lij2, uij2, lij3, uij3, diff2;
 488     __m128       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
 489     __m128       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
 490     __m128       dadx1, dadx2;
 491     __m128       logterm;
 492     __m128       mask;
 493     __m128       obc_mask1, obc_mask2, obc_mask3;
 494     __m128       jxB, jyB, jzB, t1B, t2B, t3B, t4B;
 495     __m128       dxB, dyB, dzB, rsqB, rinvB, rB;
 496     __m128       rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
 497     __m128       uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
 498     __m128       lij_invB, sk2_invB, prodB;
 499     __m128       sk_ajB, sk2_ajB, sk2_rinvB;
 500     __m128       dadx1B, dadx2B;
 501     __m128       logtermB;
 502     __m128       obc_mask1B, obc_mask2B, obc_mask3B;
 503
 504     __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
 505     __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
 506     __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
 507
 508     __m128       oneeighth   = _mm_set1_ps(0.125);
 509     __m128       onefourth   = _mm_set1_ps(0.25);
 510
 511     const __m128 half  = _mm_set1_ps(0.5f);
 512     const __m128 three = _mm_set1_ps(3.0f);
 513     const __m128 one   = _mm_set1_ps(1.0f);
 514     const __m128 two   = _mm_set1_ps(2.0f);
 515     const __m128 zero  = _mm_set1_ps(0.0f);
 516     const __m128 neg   = _mm_set1_ps(-1.0f);
 517
 518     /* Set the dielectric offset */
 519     doffset   = born->gb_doffset;
 520     gb_radius = born->gb_radius;
 521     obc_param = born->param;
 522     work      = born->gpol_hct_work;
 523     jjnr      = nl->jjnr;
 524     dadx      = fr->dadx;
 525     shiftvec  = fr->shift_vec[0];
 526
 527     jx        = _mm_setzero_ps();
 528     jy        = _mm_setzero_ps();
 529     jz        = _mm_setzero_ps();
 530
 531     jnrA = jnrB = jnrC = jnrD = 0;
 532
 533     for (i = 0; i < born->nr; i++)
 534     {
 535         work[i] = 0;
 536     }
 537
 538     for (i = 0; i < nl->nri; i++)
 539     {
 540         ii     = nl->iinr[i];
 541         ii3    = ii*3;
 542         is3    = 3*nl->shift[i];
 543         shX    = shiftvec[is3];
 544         shY    = shiftvec[is3+1];
 545         shZ    = shiftvec[is3+2];
 546         nj0    = nl->jindex[i];
 547         nj1    = nl->jindex[i+1];
 548
 549         ix     = _mm_set1_ps(shX+x[ii3+0]);
 550         iy     = _mm_set1_ps(shY+x[ii3+1]);
 551         iz     = _mm_set1_ps(shZ+x[ii3+2]);
 552
 553         offset = (nj1-nj0)%4;
 554
 555         rai     = _mm_load1_ps(gb_radius+ii);
 556         rai_inv = gmx_mm_inv_ps(rai);
 557
 558         sum_ai = _mm_setzero_ps();
 559
 560         sk_ai  = _mm_load1_ps(born->param+ii);
 561         sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
 562
 563         for (k = nj0; k < nj1-4-offset; k += 8)
 564         {
 565             jnrA        = jjnr[k];
 566             jnrB        = jjnr[k+1];
 567             jnrC        = jjnr[k+2];
 568             jnrD        = jjnr[k+3];
 569             jnrE        = jjnr[k+4];
 570             jnrF        = jjnr[k+5];
 571             jnrG        = jjnr[k+6];
 572             jnrH        = jjnr[k+7];
 573
 574             j3A         = 3*jnrA;
 575             j3B         = 3*jnrB;
 576             j3C         = 3*jnrC;
 577             j3D         = 3*jnrD;
 578             j3E         = 3*jnrE;
 579             j3F         = 3*jnrF;
 580             j3G         = 3*jnrG;
 581             j3H         = 3*jnrH;
 582
 583             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 584             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
 585             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 586             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
 587             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
 588             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
 589
 590             dx    = _mm_sub_ps(ix, jx);
 591             dy    = _mm_sub_ps(iy, jy);
 592             dz    = _mm_sub_ps(iz, jz);
 593             dxB   = _mm_sub_ps(ix, jxB);
 594             dyB   = _mm_sub_ps(iy, jyB);
 595             dzB   = _mm_sub_ps(iz, jzB);
 596
 597             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 598             rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
 599
 600             rinv        = gmx_mm_invsqrt_ps(rsq);
 601             r           = _mm_mul_ps(rsq, rinv);
 602             rinvB       = gmx_mm_invsqrt_ps(rsqB);
 603             rB          = _mm_mul_ps(rsqB, rinvB);
 604
 605             /* Compute raj_inv aj1-4 */
 606             raj_inv     = gmx_mm_inv_ps(raj);
 607             raj_invB    = gmx_mm_inv_ps(rajB);
 608
 609             /* Evaluate influence of atom aj -> ai */
 610             t1            = _mm_add_ps(r, sk_aj);
 611             t2            = _mm_sub_ps(r, sk_aj);
 612             t3            = _mm_sub_ps(sk_aj, r);
 613             t1B           = _mm_add_ps(rB, sk_ajB);
 614             t2B           = _mm_sub_ps(rB, sk_ajB);
 615             t3B           = _mm_sub_ps(sk_ajB, rB);
 616             obc_mask1     = _mm_cmplt_ps(rai, t1);
 617             obc_mask2     = _mm_cmplt_ps(rai, t2);
 618             obc_mask3     = _mm_cmplt_ps(rai, t3);
 619             obc_mask1B    = _mm_cmplt_ps(rai, t1B);
 620             obc_mask2B    = _mm_cmplt_ps(rai, t2B);
 621             obc_mask3B    = _mm_cmplt_ps(rai, t3B);
 622
 623             uij           = gmx_mm_inv_ps(t1);
 624             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 625                                          _mm_andnot_ps(obc_mask2, rai_inv));
 626             dlij          = _mm_and_ps(one, obc_mask2);
 627             uij2          = _mm_mul_ps(uij, uij);
 628             uij3          = _mm_mul_ps(uij2, uij);
 629             lij2          = _mm_mul_ps(lij, lij);
 630             lij3          = _mm_mul_ps(lij2, lij);
 631
 632             uijB          = gmx_mm_inv_ps(t1B);
 633             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
 634                                          _mm_andnot_ps(obc_mask2B, rai_inv));
 635             dlijB         = _mm_and_ps(one, obc_mask2B);
 636             uij2B         = _mm_mul_ps(uijB, uijB);
 637             uij3B         = _mm_mul_ps(uij2B, uijB);
 638             lij2B         = _mm_mul_ps(lijB, lijB);
 639             lij3B         = _mm_mul_ps(lij2B, lijB);
 640
 641             diff2         = _mm_sub_ps(uij2, lij2);
 642             lij_inv       = gmx_mm_invsqrt_ps(lij2);
 643             sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
 644             sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
 645             prod          = _mm_mul_ps(onefourth, sk2_rinv);
 646
 647             diff2B        = _mm_sub_ps(uij2B, lij2B);
 648             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
 649             sk2_ajB       = _mm_mul_ps(sk_ajB, sk_ajB);
 650             sk2_rinvB     = _mm_mul_ps(sk2_ajB, rinvB);
 651             prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
 652
 653             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 654             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
 655
 656             t1            = _mm_sub_ps(lij, uij);
 657             t2            = _mm_mul_ps(diff2,
 658                                        _mm_sub_ps(_mm_mul_ps(onefourth, r),
 659                                                   prod));
 660             t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 661             t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 662             t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
 663             t4            = _mm_and_ps(t4, obc_mask3);
 664             t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 665
 666             t1B           = _mm_sub_ps(lijB, uijB);
 667             t2B           = _mm_mul_ps(diff2B,
 668                                        _mm_sub_ps(_mm_mul_ps(onefourth, rB),
 669                                                   prodB));
 670             t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
 671             t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
 672             t4B           = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
 673             t4B           = _mm_and_ps(t4B, obc_mask3B);
 674             t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
 675
 676             sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
 677
 678             t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 679                                        _mm_mul_ps(prod, lij3));
 680             t1            = _mm_sub_ps(t1,
 681                                        _mm_mul_ps(onefourth,
 682                                                   _mm_add_ps(_mm_mul_ps(lij, rinv),
 683                                                              _mm_mul_ps(lij3, r))));
 684             t2            = _mm_mul_ps(onefourth,
 685                                        _mm_add_ps(_mm_mul_ps(uij, rinv),
 686                                                   _mm_mul_ps(uij3, r)));
 687             t2            = _mm_sub_ps(t2,
 688                                        _mm_add_ps(_mm_mul_ps(half, uij2),
 689                                                   _mm_mul_ps(prod, uij3)));
 690             t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
 691                                        _mm_mul_ps(rinv, rinv));
 692             t3            = _mm_sub_ps(t3,
 693                                        _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
 694                                                   _mm_add_ps(one,
 695                                                              _mm_mul_ps(sk2_rinv, rinv))));
 696             t1            = _mm_mul_ps(rinv,
 697                                        _mm_add_ps(_mm_mul_ps(dlij, t1),
 698                                                   _mm_add_ps(t2, t3)));
 699
 700
 701
 702             t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
 703                                        _mm_mul_ps(prodB, lij3B));
 704             t1B           = _mm_sub_ps(t1B,
 705                                        _mm_mul_ps(onefourth,
 706                                                   _mm_add_ps(_mm_mul_ps(lijB, rinvB),
 707                                                              _mm_mul_ps(lij3B, rB))));
 708             t2B           = _mm_mul_ps(onefourth,
 709                                        _mm_add_ps(_mm_mul_ps(uijB, rinvB),
 710                                                   _mm_mul_ps(uij3B, rB)));
 711             t2B           = _mm_sub_ps(t2B,
 712                                        _mm_add_ps(_mm_mul_ps(half, uij2B),
 713                                                   _mm_mul_ps(prodB, uij3B)));
 714             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
 715                                        _mm_mul_ps(rinvB, rinvB));
 716             t3B           = _mm_sub_ps(t3B,
 717                                        _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
 718                                                   _mm_add_ps(one,
 719                                                              _mm_mul_ps(sk2_rinvB, rinvB))));
 720             t1B           = _mm_mul_ps(rinvB,
 721                                        _mm_add_ps(_mm_mul_ps(dlijB, t1B),
 722                                                   _mm_add_ps(t2B, t3B)));
 723
 724             dadx1         = _mm_and_ps(t1, obc_mask1);
 725             dadx1B        = _mm_and_ps(t1B, obc_mask1B);
 726
 727
 728             /* Evaluate influence of atom ai -> aj */
 729             t1            = _mm_add_ps(r, sk_ai);
 730             t2            = _mm_sub_ps(r, sk_ai);
 731             t3            = _mm_sub_ps(sk_ai, r);
 732             t1B           = _mm_add_ps(rB, sk_ai);
 733             t2B           = _mm_sub_ps(rB, sk_ai);
 734             t3B           = _mm_sub_ps(sk_ai, rB);
 735             obc_mask1     = _mm_cmplt_ps(raj, t1);
 736             obc_mask2     = _mm_cmplt_ps(raj, t2);
 737             obc_mask3     = _mm_cmplt_ps(raj, t3);
 738             obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
 739             obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
 740             obc_mask3B    = _mm_cmplt_ps(rajB, t3B);
 741
 742             uij           = gmx_mm_inv_ps(t1);
 743             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 744                                          _mm_andnot_ps(obc_mask2, raj_inv));
 745             dlij          = _mm_and_ps(one, obc_mask2);
 746             uij2          = _mm_mul_ps(uij, uij);
 747             uij3          = _mm_mul_ps(uij2, uij);
 748             lij2          = _mm_mul_ps(lij, lij);
 749             lij3          = _mm_mul_ps(lij2, lij);
 750
 751             uijB          = gmx_mm_inv_ps(t1B);
 752             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
 753                                          _mm_andnot_ps(obc_mask2B, raj_invB));
 754             dlijB         = _mm_and_ps(one, obc_mask2B);
 755             uij2B         = _mm_mul_ps(uijB, uijB);
 756             uij3B         = _mm_mul_ps(uij2B, uijB);
 757             lij2B         = _mm_mul_ps(lijB, lijB);
 758             lij3B         = _mm_mul_ps(lij2B, lijB);
 759
 760             diff2         = _mm_sub_ps(uij2, lij2);
 761             lij_inv       = gmx_mm_invsqrt_ps(lij2);
 762             sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
 763             prod          = _mm_mul_ps(onefourth, sk2_rinv);
 764
 765             diff2B        = _mm_sub_ps(uij2B, lij2B);
 766             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
 767             sk2_rinvB     = _mm_mul_ps(sk2_ai, rinvB);
 768             prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
 769
 770             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 771             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
 772
 773             t1            = _mm_sub_ps(lij, uij);
 774             t2            = _mm_mul_ps(diff2,
 775                                        _mm_sub_ps(_mm_mul_ps(onefourth, r),
 776                                                   prod));
 777             t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 778             t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 779             t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
 780             t4            = _mm_and_ps(t4, obc_mask3);
 781             t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 782
 783             t1B           = _mm_sub_ps(lijB, uijB);
 784             t2B           = _mm_mul_ps(diff2B,
 785                                        _mm_sub_ps(_mm_mul_ps(onefourth, rB),
 786                                                   prodB));
 787             t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
 788             t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
 789             t4B           = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
 790             t4B           = _mm_and_ps(t4B, obc_mask3B);
 791             t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
 792
 793             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
 794             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
 795
 796             t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 797                                        _mm_mul_ps(prod, lij3));
 798             t1            = _mm_sub_ps(t1,
 799                                        _mm_mul_ps(onefourth,
 800                                                   _mm_add_ps(_mm_mul_ps(lij, rinv),
 801                                                              _mm_mul_ps(lij3, r))));
 802             t2            = _mm_mul_ps(onefourth,
 803                                        _mm_add_ps(_mm_mul_ps(uij, rinv),
 804                                                   _mm_mul_ps(uij3, r)));
 805             t2            = _mm_sub_ps(t2,
 806                                        _mm_add_ps(_mm_mul_ps(half, uij2),
 807                                                   _mm_mul_ps(prod, uij3)));
 808             t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
 809                                        _mm_mul_ps(rinv, rinv));
 810             t3            = _mm_sub_ps(t3,
 811                                        _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
 812                                                   _mm_add_ps(one,
 813                                                              _mm_mul_ps(sk2_rinv, rinv))));
 814             t1            = _mm_mul_ps(rinv,
 815                                        _mm_add_ps(_mm_mul_ps(dlij, t1),
 816                                                   _mm_add_ps(t2, t3)));
 817
 818
 819             t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
 820                                        _mm_mul_ps(prodB, lij3B));
 821             t1B           = _mm_sub_ps(t1B,
 822                                        _mm_mul_ps(onefourth,
 823                                                   _mm_add_ps(_mm_mul_ps(lijB, rinvB),
 824                                                              _mm_mul_ps(lij3B, rB))));
 825             t2B           = _mm_mul_ps(onefourth,
 826                                        _mm_add_ps(_mm_mul_ps(uijB, rinvB),
 827                                                   _mm_mul_ps(uij3B, rB)));
 828             t2B           = _mm_sub_ps(t2B,
 829                                        _mm_add_ps(_mm_mul_ps(half, uij2B),
 830                                                   _mm_mul_ps(prodB, uij3B)));
 831             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
 832                                        _mm_mul_ps(rinvB, rinvB));
 833             t3B           = _mm_sub_ps(t3B,
 834                                        _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
 835                                                   _mm_add_ps(one,
 836                                                              _mm_mul_ps(sk2_rinvB, rinvB))));
 837             t1B           = _mm_mul_ps(rinvB,
 838                                        _mm_add_ps(_mm_mul_ps(dlijB, t1B),
 839                                                   _mm_add_ps(t2B, t3B)));
 840
 841
 842             dadx2         = _mm_and_ps(t1, obc_mask1);
 843             dadx2B        = _mm_and_ps(t1B, obc_mask1B);
 844
 845             _mm_store_ps(dadx, dadx1);
 846             dadx += 4;
 847             _mm_store_ps(dadx, dadx2);
 848             dadx += 4;
 849             _mm_store_ps(dadx, dadx1B);
 850             dadx += 4;
 851             _mm_store_ps(dadx, dadx2B);
 852             dadx += 4;
 853
 854         } /* end normal inner loop */
 855
 856         for (; k < nj1-offset; k += 4)
 857         {
 858             jnrA        = jjnr[k];
 859             jnrB        = jjnr[k+1];
 860             jnrC        = jjnr[k+2];
 861             jnrD        = jjnr[k+3];
 862
 863             j3A         = 3*jnrA;
 864             j3B         = 3*jnrB;
 865             j3C         = 3*jnrC;
 866             j3D         = 3*jnrD;
 867
 868             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 869             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 870             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
 871
 872             dx    = _mm_sub_ps(ix, jx);
 873             dy    = _mm_sub_ps(iy, jy);
 874             dz    = _mm_sub_ps(iz, jz);
 875
 876             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 877
 878             rinv        = gmx_mm_invsqrt_ps(rsq);
 879             r           = _mm_mul_ps(rsq, rinv);
 880
 881             /* Compute raj_inv aj1-4 */
 882             raj_inv     = gmx_mm_inv_ps(raj);
 883
 884             /* Evaluate influence of atom aj -> ai */
 885             t1            = _mm_add_ps(r, sk_aj);
 886             obc_mask1     = _mm_cmplt_ps(rai, t1);
 887
 888             if (_mm_movemask_ps(obc_mask1))
 889             {
 890                 /* If any of the elements has rai<dr+sk, this is executed */
 891                 t2            = _mm_sub_ps(r, sk_aj);
 892                 t3            = _mm_sub_ps(sk_aj, r);
 893
 894                 obc_mask2     = _mm_cmplt_ps(rai, t2);
 895                 obc_mask3     = _mm_cmplt_ps(rai, t3);
 896
 897                 uij           = gmx_mm_inv_ps(t1);
 898                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 899                                              _mm_andnot_ps(obc_mask2, rai_inv));
 900                 dlij          = _mm_and_ps(one, obc_mask2);
 901                 uij2          = _mm_mul_ps(uij, uij);
 902                 uij3          = _mm_mul_ps(uij2, uij);
 903                 lij2          = _mm_mul_ps(lij, lij);
 904                 lij3          = _mm_mul_ps(lij2, lij);
 905                 diff2         = _mm_sub_ps(uij2, lij2);
 906                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
 907                 sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
 908                 sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
 909                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
 910                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 911                 t1            = _mm_sub_ps(lij, uij);
 912                 t2            = _mm_mul_ps(diff2,
 913                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
 914                                                       prod));
 915                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 916                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 917                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
 918                 t4            = _mm_and_ps(t4, obc_mask3);
 919                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 920                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
 921                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 922                                            _mm_mul_ps(prod, lij3));
 923                 t1            = _mm_sub_ps(t1,
 924                                            _mm_mul_ps(onefourth,
 925                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
 926                                                                  _mm_mul_ps(lij3, r))));
 927                 t2            = _mm_mul_ps(onefourth,
 928                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
 929                                                       _mm_mul_ps(uij3, r)));
 930                 t2            = _mm_sub_ps(t2,
 931                                            _mm_add_ps(_mm_mul_ps(half, uij2),
 932                                                       _mm_mul_ps(prod, uij3)));
 933                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
 934                                            _mm_mul_ps(rinv, rinv));
 935                 t3            = _mm_sub_ps(t3,
 936                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
 937                                                       _mm_add_ps(one,
 938                                                                  _mm_mul_ps(sk2_rinv, rinv))));
 939                 t1            = _mm_mul_ps(rinv,
 940                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
 941                                                       _mm_add_ps(t2, t3)));
 942
 943                 dadx1         = _mm_and_ps(t1, obc_mask1);
 944             }
 945             else
 946             {
 947                 dadx1         = _mm_setzero_ps();
 948             }
 949
 950             /* Evaluate influence of atom ai -> aj */
 951             t1            = _mm_add_ps(r, sk_ai);
 952             obc_mask1     = _mm_cmplt_ps(raj, t1);
 953
 954             if (_mm_movemask_ps(obc_mask1))
 955             {
 956                 t2            = _mm_sub_ps(r, sk_ai);
 957                 t3            = _mm_sub_ps(sk_ai, r);
 958                 obc_mask2     = _mm_cmplt_ps(raj, t2);
 959                 obc_mask3     = _mm_cmplt_ps(raj, t3);
 960
 961                 uij           = gmx_mm_inv_ps(t1);
 962                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 963                                              _mm_andnot_ps(obc_mask2, raj_inv));
 964                 dlij          = _mm_and_ps(one, obc_mask2);
 965                 uij2          = _mm_mul_ps(uij, uij);
 966                 uij3          = _mm_mul_ps(uij2, uij);
 967                 lij2          = _mm_mul_ps(lij, lij);
 968                 lij3          = _mm_mul_ps(lij2, lij);
 969                 diff2         = _mm_sub_ps(uij2, lij2);
 970                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
 971                 sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
 972                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
 973                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 974                 t1            = _mm_sub_ps(lij, uij);
 975                 t2            = _mm_mul_ps(diff2,
 976                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
 977                                                       prod));
 978                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 979                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 980                 t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
 981                 t4            = _mm_and_ps(t4, obc_mask3);
 982                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 983
 984                 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
 985
 986                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 987                                            _mm_mul_ps(prod, lij3));
 988                 t1            = _mm_sub_ps(t1,
 989                                            _mm_mul_ps(onefourth,
 990                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
 991                                                                  _mm_mul_ps(lij3, r))));
 992                 t2            = _mm_mul_ps(onefourth,
 993                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
 994                                                       _mm_mul_ps(uij3, r)));
 995                 t2            = _mm_sub_ps(t2,
 996                                            _mm_add_ps(_mm_mul_ps(half, uij2),
 997                                                       _mm_mul_ps(prod, uij3)));
 998                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
 999                                            _mm_mul_ps(rinv, rinv));
1000                 t3            = _mm_sub_ps(t3,
1001                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1002                                                       _mm_add_ps(one,
1003                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1004                 t1            = _mm_mul_ps(rinv,
1005                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1006                                                       _mm_add_ps(t2, t3)));
1007                 dadx2         = _mm_and_ps(t1, obc_mask1);
1008             }
1009             else
1010             {
1011                 dadx2         = _mm_setzero_ps();
1012             }
1013
1014             _mm_store_ps(dadx, dadx1);
1015             dadx += 4;
1016             _mm_store_ps(dadx, dadx2);
1017             dadx += 4;
1018         } /* end normal inner loop */
1019
1020         if (offset != 0)
1021         {
1022             if (offset == 1)
1023             {
1024                 jnrA        = jjnr[k];
1025                 j3A         = 3*jnrA;
1026                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1027                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1028                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1029                 mask        = mask1;
1030             }
1031             else if (offset == 2)
1032             {
1033                 jnrA        = jjnr[k];
1034                 jnrB        = jjnr[k+1];
1035                 j3A         = 3*jnrA;
1036                 j3B         = 3*jnrB;
1037                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1038                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1039                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1040                 mask        = mask2;
1041             }
1042             else
1043             {
1044                 /* offset must be 3 */
1045                 jnrA        = jjnr[k];
1046                 jnrB        = jjnr[k+1];
1047                 jnrC        = jjnr[k+2];
1048                 j3A         = 3*jnrA;
1049                 j3B         = 3*jnrB;
1050                 j3C         = 3*jnrC;
1051                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1052                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1053                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1054                 mask        = mask3;
1055             }
1056
1057             dx    = _mm_sub_ps(ix, jx);
1058             dy    = _mm_sub_ps(iy, jy);
1059             dz    = _mm_sub_ps(iz, jz);
1060
1061             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
1062
1063             rinv        = gmx_mm_invsqrt_ps(rsq);
1064             r           = _mm_mul_ps(rsq, rinv);
1065
1066             /* Compute raj_inv aj1-4 */
1067             raj_inv     = gmx_mm_inv_ps(raj);
1068
1069             /* Evaluate influence of atom aj -> ai */
1070             t1            = _mm_add_ps(r, sk_aj);
1071             obc_mask1     = _mm_cmplt_ps(rai, t1);
1072             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1073
1074             if (_mm_movemask_ps(obc_mask1))
1075             {
1076                 t2            = _mm_sub_ps(r, sk_aj);
1077                 t3            = _mm_sub_ps(sk_aj, r);
1078                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1079                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1080
1081                 uij           = gmx_mm_inv_ps(t1);
1082                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1083                                              _mm_andnot_ps(obc_mask2, rai_inv));
1084                 dlij           = _mm_and_ps(one, obc_mask2);
1085                 uij2           = _mm_mul_ps(uij, uij);
1086                 uij3           = _mm_mul_ps(uij2, uij);
1087                 lij2           = _mm_mul_ps(lij, lij);
1088                 lij3           = _mm_mul_ps(lij2, lij);
1089                 diff2          = _mm_sub_ps(uij2, lij2);
1090                 lij_inv        = gmx_mm_invsqrt_ps(lij2);
1091                 sk2_aj         = _mm_mul_ps(sk_aj, sk_aj);
1092                 sk2_rinv       = _mm_mul_ps(sk2_aj, rinv);
1093                 prod           = _mm_mul_ps(onefourth, sk2_rinv);
1094                 logterm        = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1095                 t1             = _mm_sub_ps(lij, uij);
1096                 t2             = _mm_mul_ps(diff2,
1097                                             _mm_sub_ps(_mm_mul_ps(onefourth, r),
1098                                                        prod));
1099                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1100                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1101                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1102                 t4            = _mm_and_ps(t4, obc_mask3);
1103                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1104                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1105                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1106                                            _mm_mul_ps(prod, lij3));
1107                 t1            = _mm_sub_ps(t1,
1108                                            _mm_mul_ps(onefourth,
1109                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1110                                                                  _mm_mul_ps(lij3, r))));
1111                 t2            = _mm_mul_ps(onefourth,
1112                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1113                                                       _mm_mul_ps(uij3, r)));
1114                 t2            = _mm_sub_ps(t2,
1115                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1116                                                       _mm_mul_ps(prod, uij3)));
1117                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1118                                            _mm_mul_ps(rinv, rinv));
1119                 t3            = _mm_sub_ps(t3,
1120                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1121                                                       _mm_add_ps(one,
1122                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1123                 t1            = _mm_mul_ps(rinv,
1124                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1125                                                       _mm_add_ps(t2, t3)));
1126                 dadx1         = _mm_and_ps(t1, obc_mask1);
1127             }
1128             else
1129             {
1130                 dadx1         = _mm_setzero_ps();
1131             }
1132
1133             /* Evaluate influence of atom ai -> aj */
1134             t1            = _mm_add_ps(r, sk_ai);
1135             obc_mask1     = _mm_cmplt_ps(raj, t1);
1136             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1137
1138             if (_mm_movemask_ps(obc_mask1))
1139             {
1140                 t2            = _mm_sub_ps(r, sk_ai);
1141                 t3            = _mm_sub_ps(sk_ai, r);
1142                 obc_mask2     = _mm_cmplt_ps(raj, t2);
1143                 obc_mask3     = _mm_cmplt_ps(raj, t3);
1144
1145                 uij           = gmx_mm_inv_ps(t1);
1146                 lij           = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1147                                           _mm_andnot_ps(obc_mask2, raj_inv));
1148                 dlij          = _mm_and_ps(one, obc_mask2);
1149                 uij2          = _mm_mul_ps(uij, uij);
1150                 uij3          = _mm_mul_ps(uij2, uij);
1151                 lij2          = _mm_mul_ps(lij, lij);
1152                 lij3          = _mm_mul_ps(lij2, lij);
1153                 diff2         = _mm_sub_ps(uij2, lij2);
1154                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1155                 sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
1156                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
1157                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1158                 t1            = _mm_sub_ps(lij, uij);
1159                 t2            = _mm_mul_ps(diff2,
1160                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
1161                                                       prod));
1162                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1163                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1164                 t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
1165                 t4            = _mm_and_ps(t4, obc_mask3);
1166                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1167
1168                 tmp           = _mm_and_ps(t1, obc_mask1);
1169
1170                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1171                                            _mm_mul_ps(prod, lij3));
1172                 t1            = _mm_sub_ps(t1,
1173                                            _mm_mul_ps(onefourth,
1174                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1175                                                                  _mm_mul_ps(lij3, r))));
1176                 t2            = _mm_mul_ps(onefourth,
1177                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1178                                                       _mm_mul_ps(uij3, r)));
1179                 t2            = _mm_sub_ps(t2,
1180                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1181                                                       _mm_mul_ps(prod, uij3)));
1182                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1183                                            _mm_mul_ps(rinv, rinv));
1184                 t3            = _mm_sub_ps(t3,
1185                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1186                                                       _mm_add_ps(one,
1187                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1188                 t1            = _mm_mul_ps(rinv,
1189                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1190                                                       _mm_add_ps(t2, t3)));
1191                 dadx2         = _mm_and_ps(t1, obc_mask1);
1192             }
1193             else
1194             {
1195                 dadx2         = _mm_setzero_ps();
1196                 tmp           = _mm_setzero_ps();
1197             }
1198
1199             _mm_store_ps(dadx, dadx1);
1200             dadx += 4;
1201             _mm_store_ps(dadx, dadx2);
1202             dadx += 4;
1203
1204             if (offset == 1)
1205             {
1206                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
1207             }
1208             else if (offset == 2)
1209             {
1210                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
1211             }
1212             else
1213             {
1214                 /* offset must be 3 */
1215                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
1216             }
1217
1218         }
1219         GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
1220
1221     }
1222
1223     /* Parallel summations */
1224     if (DOMAINDECOMP(cr))
1225     {
1226         dd_atom_sum_real(cr->dd, work);
1227     }
1228
1229     if (gb_algorithm == egbHCT)
1230     {
1231         /* HCT */
1232         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1233         {
1234             if (born->use[i] != 0)
1235             {
1236                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1237                 sum     = 1.0/rr - work[i];
1238                 min_rad = rr + doffset;
1239                 rad     = 1.0/sum;
1240
1241                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
1242                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1243             }
1244         }
1245
1246         /* Extra communication required for DD */
1247         if (DOMAINDECOMP(cr))
1248         {
1249             dd_atom_spread_real(cr->dd, born->bRad);
1250             dd_atom_spread_real(cr->dd, fr->invsqrta);
1251         }
1252     }
1253     else
1254     {
1255         /* OBC */
1256         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1257         {
1258             if (born->use[i] != 0)
1259             {
1260                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
1261                 rr_inv2 = 1.0/rr;
1262                 rr      = rr-doffset;
1263                 rr_inv  = 1.0/rr;
1264                 sum     = rr * work[i];
1265                 sum2    = sum  * sum;
1266                 sum3    = sum2 * sum;
1267
1268                 tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1269                 born->bRad[i] = rr_inv - tsum*rr_inv2;
1270                 born->bRad[i] = 1.0 / born->bRad[i];
1271
1272                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1273
1274                 tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1275                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1276             }
1277         }
1278         /* Extra (local) communication required for DD */
1279         if (DOMAINDECOMP(cr))
1280         {
1281             dd_atom_spread_real(cr->dd, born->bRad);
1282             dd_atom_spread_real(cr->dd, fr->invsqrta);
1283             dd_atom_spread_real(cr->dd, born->drobc);
1284         }
1285     }
1286
1287
1288
1289     return 0;
1290 }
1291
1292
1293
1294 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1295                                     float *x, float *f, float *fshift, float *shiftvec,
1296                                     int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1297 {
1298     int          i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
1299     int          jnrA, jnrB, jnrC, jnrD;
1300     int          j3A, j3B, j3C, j3D;
1301     int          jnrE, jnrF, jnrG, jnrH;
1302     int          j3E, j3F, j3G, j3H;
1303     int       *  jjnr;
1304
1305     float        rbi, shX, shY, shZ;
1306     float       *rb;
1307
1308     __m128       ix, iy, iz;
1309     __m128       jx, jy, jz;
1310     __m128       jxB, jyB, jzB;
1311     __m128       fix, fiy, fiz;
1312     __m128       dx, dy, dz;
1313     __m128       tx, ty, tz;
1314     __m128       dxB, dyB, dzB;
1315     __m128       txB, tyB, tzB;
1316
1317     __m128       rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
1318     __m128       xmm1, xmm2, xmm3;
1319
1320     const __m128 two = _mm_set1_ps(2.0f);
1321
1322     rb     = born->work;
1323
1324     jjnr   = nl->jjnr;
1325
1326     /* Loop to get the proper form for the Born radius term, sse style */
1327     offset = natoms%4;
1328
1329     n0 = 0;
1330     n1 = natoms;
1331
1332     if (gb_algorithm == egbSTILL)
1333     {
1334         for (i = n0; i < n1; i++)
1335         {
1336             rbi   = born->bRad[i];
1337             rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1338         }
1339     }
1340     else if (gb_algorithm == egbHCT)
1341     {
1342         for (i = n0; i < n1; i++)
1343         {
1344             rbi   = born->bRad[i];
1345             rb[i] = rbi * rbi * dvda[i];
1346         }
1347     }
1348     else if (gb_algorithm == egbOBC)
1349     {
1350         for (i = n0; i < n1; i++)
1351         {
1352             rbi   = born->bRad[i];
1353             rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1354         }
1355     }
1356
1357     jz = _mm_setzero_ps();
1358
1359     n = j3A = j3B = j3C = j3D = 0;
1360
1361     for (i = 0; i < nl->nri; i++)
1362     {
1363         ii     = nl->iinr[i];
1364         ii3    = ii*3;
1365         is3    = 3*nl->shift[i];
1366         shX    = shiftvec[is3];
1367         shY    = shiftvec[is3+1];
1368         shZ    = shiftvec[is3+2];
1369         nj0    = nl->jindex[i];
1370         nj1    = nl->jindex[i+1];
1371
1372         ix     = _mm_set1_ps(shX+x[ii3+0]);
1373         iy     = _mm_set1_ps(shY+x[ii3+1]);
1374         iz     = _mm_set1_ps(shZ+x[ii3+2]);
1375
1376         offset = (nj1-nj0)%4;
1377
1378         rbai   = _mm_load1_ps(rb+ii);
1379         fix    = _mm_setzero_ps();
1380         fiy    = _mm_setzero_ps();
1381         fiz    = _mm_setzero_ps();
1382
1383
1384         for (k = nj0; k < nj1-offset; k += 4)
1385         {
1386             jnrA        = jjnr[k];
1387             jnrB        = jjnr[k+1];
1388             jnrC        = jjnr[k+2];
1389             jnrD        = jjnr[k+3];
1390
1391             j3A         = 3*jnrA;
1392             j3B         = 3*jnrB;
1393             j3C         = 3*jnrC;
1394             j3D         = 3*jnrD;
1395
1396             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
1397
1398             dx          = _mm_sub_ps(ix, jx);
1399             dy          = _mm_sub_ps(iy, jy);
1400             dz          = _mm_sub_ps(iz, jz);
1401
1402             GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
1403
1404             /* load chain rule terms for j1-4 */
1405             f_gb        = _mm_load_ps(dadx);
1406             dadx       += 4;
1407             f_gb_ai     = _mm_load_ps(dadx);
1408             dadx       += 4;
1409
1410             /* calculate scalar force */
1411             f_gb    = _mm_mul_ps(f_gb, rbai);
1412             f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1413             f_gb    = _mm_add_ps(f_gb, f_gb_ai);
1414
1415             tx     = _mm_mul_ps(f_gb, dx);
1416             ty     = _mm_mul_ps(f_gb, dy);
1417             tz     = _mm_mul_ps(f_gb, dz);
1418
1419             fix    = _mm_add_ps(fix, tx);
1420             fiy    = _mm_add_ps(fiy, ty);
1421             fiz    = _mm_add_ps(fiz, tz);
1422
1423             GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
1424         }
1425
1426         /*deal with odd elements */
1427         if (offset != 0)
1428         {
1429             if (offset == 1)
1430             {
1431                 jnrA        = jjnr[k];
1432                 j3A         = 3*jnrA;
1433                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1434                 GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
1435             }
1436             else if (offset == 2)
1437             {
1438                 jnrA        = jjnr[k];
1439                 jnrB        = jjnr[k+1];
1440                 j3A         = 3*jnrA;
1441                 j3B         = 3*jnrB;
1442                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1443                 GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
1444             }
1445             else
1446             {
1447                 /* offset must be 3 */
1448                 jnrA        = jjnr[k];
1449                 jnrB        = jjnr[k+1];
1450                 jnrC        = jjnr[k+2];
1451                 j3A         = 3*jnrA;
1452                 j3B         = 3*jnrB;
1453                 j3C         = 3*jnrC;
1454                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1455                 GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
1456             }
1457
1458             dx          = _mm_sub_ps(ix, jx);
1459             dy          = _mm_sub_ps(iy, jy);
1460             dz          = _mm_sub_ps(iz, jz);
1461
1462             /* load chain rule terms for j1-4 */
1463             f_gb        = _mm_load_ps(dadx);
1464             dadx       += 4;
1465             f_gb_ai     = _mm_load_ps(dadx);
1466             dadx       += 4;
1467
1468             /* calculate scalar force */
1469             f_gb    = _mm_mul_ps(f_gb, rbai);
1470             f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1471             f_gb    = _mm_add_ps(f_gb, f_gb_ai);
1472
1473             tx     = _mm_mul_ps(f_gb, dx);
1474             ty     = _mm_mul_ps(f_gb, dy);
1475             tz     = _mm_mul_ps(f_gb, dz);
1476
1477             fix    = _mm_add_ps(fix, tx);
1478             fiy    = _mm_add_ps(fiy, ty);
1479             fiz    = _mm_add_ps(fiz, tz);
1480
1481             if (offset == 1)
1482             {
1483                 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
1484             }
1485             else if (offset == 2)
1486             {
1487                 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
1488             }
1489             else
1490             {
1491                 /* offset must be 3 */
1492                 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
1493             }
1494         }
1495
1496         /* fix/fiy/fiz now contain four partial force terms, that all should be
1497          * added to the i particle forces and shift forces.
1498          */
1499         gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
1500     }
1501
1502     return 0;
1503 }
1504
1505
1506 #else
1507 /* keep compiler happy */
1508 int genborn_sse_dummy;
1509
1510 #endif /* SSE intrinsics available */