Move gmx_fatal.* to utility/fatalerror.*
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>

#include "typedefs.h"
#include "gromacs/utility/smalloc.h"
#include "genborn.h"
#include "vec.h"
#include "gromacs/fileio/pdbio.h"
#include "names.h"
#include "physics.h"
#include "domdec.h"
#include "network.h"
#include "gromacs/utility/fatalerror.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"

/* Only compile this file if SSE intrinsics are available; note that the
 * "0 &&" below currently keeps these kernels disabled unconditionally.
 */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)

#include <gmx_sse2_single.h>
#include <emmintrin.h>

#include "genborn_sse2_single.h"

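/*
 * Overview: calc_gb_rad_still_sse2_single() computes generalized-Born radii
 * for the Still model, and calc_gb_rad_hct_obc_sse2_single() for the HCT and
 * OBC models, both in single-precision SSE2. The kernels walk the GB
 * neighborlist four j-particles at a time (the main loops are unrolled to
 * eight) and stream per-pair chain-rule terms into fr->dadx for the
 * subsequent force calculation.
 */
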
int
calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              float *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int          i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int          jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
    int          shift;
    int         *mdtype;
    real         shX, shY, shZ;
    int         *jjnr;
    real        *shiftvec;

    float        gpi_ai, gpi2;
    float        factor;
    float       *gb_radius;
    float       *vsolv;
    float       *work;
    float       *dadx;

    __m128       ix, iy, iz;
    __m128       jx, jy, jz;
    __m128       dx, dy, dz;
    __m128       tx, ty, tz;
    __m128       jxB, jyB, jzB;
    __m128       dxB, dyB, dzB;
    __m128       txB, tyB, tzB;
    __m128       rsq, rinv, rinv2, rinv4, rinv6;
    __m128       rsqB, rinvB, rinv2B, rinv4B, rinv6B;
    __m128       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128       ratioB, rajB, vajB, rvdwB;
    __m128       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128       ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
    __m128       mask, icf4, icf6, mask_cmp;
    __m128       icf4B, icf6B, mask_cmpB;

    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
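    /* mask1..mask3 keep only the lowest 1, 2 or 3 lanes; the epilogue below
     * uses them to zero the unused lanes when 1-3 j-particles remain after
     * the 4-wide loops.
     */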

    const __m128 half   = _mm_set1_ps(0.5f);
    const __m128 three  = _mm_set1_ps(3.0f);
    const __m128 one    = _mm_set1_ps(1.0f);
    const __m128 two    = _mm_set1_ps(2.0f);
    const __m128 zero   = _mm_set1_ps(0.0f);
    const __m128 four   = _mm_set1_ps(4.0f);

    const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
    const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
    const __m128 still_p4     = _mm_set1_ps(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

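    /* fr->dadx is written as a flat stream, four floats at a time, in
     * exactly the order the pair loops below execute; the GB force kernel
     * later replays it in the same order.
     */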
    jnrA = jnrB = jnrC = jnrD = 0;
    jx   = _mm_setzero_ps();
    jy   = _mm_setzero_ps();
    jz   = _mm_setzero_ps();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_ps();

        rai     = _mm_load1_ps(gb_radius+ii);
        prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);

        for (k = nj0; k < nj1-4-offset; k += 8)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];
            jnrE        = jjnr[k+4];
            jnrF        = jjnr[k+5];
            jnrG        = jjnr[k+6];
            jnrH        = jjnr[k+7];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;
            j3E         = 3*jnrE;
            j3F         = 3*jnrF;
            j3G         = 3*jnrG;
            j3H         = 3*jnrH;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);
            dxB         = _mm_sub_ps(ix, jxB);
            dyB         = _mm_sub_ps(iy, jyB);
            dzB         = _mm_sub_ps(iz, jzB);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinvB       = gmx_mm_invsqrt_ps(rsqB);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv2B      = _mm_mul_ps(rinvB, rinvB);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv4B      = _mm_mul_ps(rinv2B, rinv2B);
            rinv6       = _mm_mul_ps(rinv4, rinv2);
            rinv6B      = _mm_mul_ps(rinv4B, rinv2B);

            rvdw        = _mm_add_ps(rai, raj);
            rvdwB       = _mm_add_ps(rai, rajB);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
            ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
            mask_cmpB   = _mm_cmple_ps(ratioB, still_p5inv);

            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_ps(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }
            if (0 == _mm_movemask_ps(mask_cmpB) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccfB        = one;
                dccfB       = _mm_setzero_ps();
            }
            else
            {
                ratioB      = _mm_min_ps(ratioB, still_p5inv);
                thetaB      = _mm_mul_ps(ratioB, still_pip5);
                gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
                termB       = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
                ccfB        = _mm_mul_ps(termB, termB);
                dccfB       = _mm_mul_ps(_mm_mul_ps(two, termB),
                                         _mm_mul_ps(sinqB, thetaB));
            }

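            /* icf4 = ccf/r^4 feeds the Born sum; icf6 = (4*ccf - dccf)/r^6
             * is the matching chain-rule factor that goes into dadx.
             */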
            prod        = _mm_mul_ps(still_p4, vaj);
            prodB       = _mm_mul_ps(still_p4, vajB);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf4B       = _mm_mul_ps(ccfB, rinv4B);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
            icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));

            gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
            dadx += 4;
        }

        for (; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv6       = _mm_mul_ps(rinv4, rinv2);

            rvdw        = _mm_add_ps(rai, raj);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);

            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));

            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
        }

        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA        = jjnr[k];
                j3A         = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
                GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
                mask        = mask1;
            }
            else if (offset == 2)
            {
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
                GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
                mask        = mask2;
            }
            else
            {
                /* offset must be 3 */
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                jnrC        = jjnr[k+2];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                j3C         = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
                GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
                mask        = mask3;
            }

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv6       = _mm_mul_ps(rinv4, rinv2);

            rvdw        = _mm_add_ps(rai, raj);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);

            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;

            tmp = _mm_mul_ps(prod_ai, icf4);

            if (offset == 1)
            {
                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
            }
            else if (offset == 2)
            {
                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
            }
        }
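        /* Reduce the four lanes of gpi and fold the result into work[ii] */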
        GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add the accumulated pair terms to the self term gpol[i] */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2); /* i.e. factor/|gpi_ai| */
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}
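
/*
 * For readability, a scalar sketch of the per-pair Still term that the SIMD
 * loops above evaluate. It is illustrative only (hence the #if 0), and the
 * helper name is ours, not a GROMACS API; it assumes the same STILL_*
 * constants used above.
 */
#if 0
static float
still_pair_sketch(float rai, float raj, float vsolv_j, float rsq, float *icf6)
{
    float rvdw  = rai + raj;
    float ratio = rsq/(rvdw*rvdw);
    float rinv2 = 1.0f/rsq;
    float rinv4 = rinv2*rinv2;
    float rinv6 = rinv4*rinv2;
    float ccf, dccf;

    if (ratio <= STILL_P5INV)
    {
        /* Close contact: smooth switching function of theta */
        float theta = ratio*STILL_PIP5;
        float term  = 0.5f*(1.0f - cosf(theta));

        ccf  = term*term;
        dccf = 2.0f*term*sinf(theta)*theta;
    }
    else
    {
        ccf  = 1.0f;
        dccf = 0.0f;
    }

    /* Chain-rule factor; the SIMD code stores prod*icf6 and prod_ai*icf6 in dadx */
    *icf6 = (4.0f*ccf - dccf)*rinv6;

    /* Contribution of j to the polarization sum of i, i.e. prod*icf4 */
    return (STILL_P4*vsolv_j)*ccf*rinv4;
}
#endif
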
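/*
 * Likewise, a scalar sketch (ours, illustrative only) of the aj -> ai HCT
 * descreening term that the function below accumulates into sum_ai; the
 * derivative written to dadx is omitted here. sk is the scaled descreening
 * radius of j taken from obc_param.
 */
#if 0
static float
hct_pair_sketch(float rai, float r, float sk)
{
    float rai_inv = 1.0f/rai;
    float uij, lij, diff2, logterm, sum, t4;

    if (rai >= r + sk)
    {
        return 0.0f;            /* j does not descreen i at all */
    }

    uij = 1.0f/(r + sk);                            /* upper bound */
    lij = (rai < r - sk) ? 1.0f/(r - sk) : rai_inv; /* lower bound */

    diff2   = uij*uij - lij*lij;
    logterm = logf(uij/lij);

    sum = lij - uij
        + diff2*(0.25f*r - 0.25f*sk*sk/r)
        + 0.5f*logterm/r;

    /* Correction when i sits inside the descreening sphere of j */
    t4 = (rai < sk - r) ? 2.0f*(rai_inv - lij) : 0.0f;

    return 0.5f*(sum + t4);
}
#endif
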
int
calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                                float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int          i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int          jnrA, jnrB, jnrC, jnrD;
    int          j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH;
    int          j3E, j3F, j3G, j3H;
    float        shX, shY, shZ;
    float        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    float        sum_ai2, sum_ai3, tsum, tchain, doffset;
    float       *obc_param;
    float       *gb_radius;
    float       *work;
    int         *jjnr;
    float       *dadx;
    float       *shiftvec;
    float        min_rad, rad;

    __m128       ix, iy, iz, jx, jy, jz;
    __m128       dx, dy, dz, t1, t2, t3, t4;
    __m128       rsq, rinv, r;
    __m128       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128       uij, lij2, uij2, lij3, uij3, diff2;
    __m128       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128       dadx1, dadx2;
    __m128       logterm;
    __m128       mask;
    __m128       obc_mask1, obc_mask2, obc_mask3;
    __m128       jxB, jyB, jzB, t1B, t2B, t3B, t4B;
    __m128       dxB, dyB, dzB, rsqB, rinvB, rB;
    __m128       rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
    __m128       uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
    __m128       lij_invB, sk2_invB, prodB;
    __m128       sk_ajB, sk2_ajB, sk2_rinvB;
    __m128       dadx1B, dadx2B;
    __m128       logtermB;
    __m128       obc_mask1B, obc_mask2B, obc_mask3B;

    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );

    __m128       oneeighth   = _mm_set1_ps(0.125);
    __m128       onefourth   = _mm_set1_ps(0.25);

    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);
    const __m128 one   = _mm_set1_ps(1.0f);
    const __m128 two   = _mm_set1_ps(2.0f);
    const __m128 zero  = _mm_set1_ps(0.0f);
    const __m128 neg   = _mm_set1_ps(-1.0f);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_ps();
    jy        = _mm_setzero_ps();
    jz        = _mm_setzero_ps();

    jnrA = jnrB = jnrC = jnrD = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        rai     = _mm_load1_ps(gb_radius+ii);
        rai_inv = gmx_mm_inv_ps(rai);

        sum_ai = _mm_setzero_ps();

        sk_ai  = _mm_load1_ps(born->param+ii);
        sk2_ai = _mm_mul_ps(sk_ai, sk_ai);

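        /* The inner loops accumulate the HCT descreening sum for atom ii.
         * Three compare masks classify each pair: rai < r+sk (j descreens i
         * at all), rai < r-sk (the descreening sphere of j lies entirely
         * outside radius rai) and rai < sk-r (i sits inside the descreening
         * sphere of j); see the scalar sketch above this function.
         */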
        for (k = nj0; k < nj1-4-offset; k += 8)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];
            jnrE        = jjnr[k+4];
            jnrF        = jjnr[k+5];
            jnrG        = jjnr[k+6];
            jnrH        = jjnr[k+7];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;
            j3E         = 3*jnrE;
            j3F         = 3*jnrF;
            j3G         = 3*jnrG;
            j3H         = 3*jnrH;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);

            dx    = _mm_sub_ps(ix, jx);
            dy    = _mm_sub_ps(iy, jy);
            dz    = _mm_sub_ps(iz, jz);
            dxB   = _mm_sub_ps(ix, jxB);
            dyB   = _mm_sub_ps(iy, jyB);
            dzB   = _mm_sub_ps(iz, jzB);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);

            rinv        = gmx_mm_invsqrt_ps(rsq);
            r           = _mm_mul_ps(rsq, rinv);
            rinvB       = gmx_mm_invsqrt_ps(rsqB);
            rB          = _mm_mul_ps(rsqB, rinvB);

            /* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_ps(raj);
            raj_invB    = gmx_mm_inv_ps(rajB);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_ps(r, sk_aj);
            t2            = _mm_sub_ps(r, sk_aj);
            t3            = _mm_sub_ps(sk_aj, r);
            t1B           = _mm_add_ps(rB, sk_ajB);
            t2B           = _mm_sub_ps(rB, sk_ajB);
            t3B           = _mm_sub_ps(sk_ajB, rB);
            obc_mask1     = _mm_cmplt_ps(rai, t1);
            obc_mask2     = _mm_cmplt_ps(rai, t2);
            obc_mask3     = _mm_cmplt_ps(rai, t3);
            obc_mask1B    = _mm_cmplt_ps(rai, t1B);
            obc_mask2B    = _mm_cmplt_ps(rai, t2B);
            obc_mask3B    = _mm_cmplt_ps(rai, t3B);

            uij           = gmx_mm_inv_ps(t1);
            lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                         _mm_andnot_ps(obc_mask2, rai_inv));
            dlij          = _mm_and_ps(one, obc_mask2);
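            /* dlij is 1.0 in lanes where lij depends on r (rai < r-sk) and
             * 0.0 where lij = 1/rai is constant; it switches the lij part
             * of the derivative on or off per lane.
             */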
            uij2          = _mm_mul_ps(uij, uij);
            uij3          = _mm_mul_ps(uij2, uij);
            lij2          = _mm_mul_ps(lij, lij);
            lij3          = _mm_mul_ps(lij2, lij);

            uijB          = gmx_mm_inv_ps(t1B);
            lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
                                         _mm_andnot_ps(obc_mask2B, rai_inv));
            dlijB         = _mm_and_ps(one, obc_mask2B);
            uij2B         = _mm_mul_ps(uijB, uijB);
            uij3B         = _mm_mul_ps(uij2B, uijB);
            lij2B         = _mm_mul_ps(lijB, lijB);
            lij3B         = _mm_mul_ps(lij2B, lijB);

            diff2         = _mm_sub_ps(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_ps(lij2);
            sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
            prod          = _mm_mul_ps(onefourth, sk2_rinv);

            diff2B        = _mm_sub_ps(uij2B, lij2B);
            lij_invB      = gmx_mm_invsqrt_ps(lij2B);
            sk2_ajB       = _mm_mul_ps(sk_ajB, sk_ajB);
            sk2_rinvB     = _mm_mul_ps(sk2_ajB, rinvB);
            prodB         = _mm_mul_ps(onefourth, sk2_rinvB);

            logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
            logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));

            t1            = _mm_sub_ps(lij, uij);
            t2            = _mm_mul_ps(diff2,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                  prod));
            t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
            t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
            t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
            t4            = _mm_and_ps(t4, obc_mask3);
            t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

            t1B           = _mm_sub_ps(lijB, uijB);
            t2B           = _mm_mul_ps(diff2B,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, rB),
                                                  prodB));
            t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
            t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
            t4B           = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
            t4B           = _mm_and_ps(t4B, obc_mask3B);
            t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));

            sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));

            t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                       _mm_mul_ps(prod, lij3));
            t1            = _mm_sub_ps(t1,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                             _mm_mul_ps(lij3, r))));
            t2            = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                  _mm_mul_ps(uij3, r)));
            t2            = _mm_sub_ps(t2,
                                       _mm_add_ps(_mm_mul_ps(half, uij2),
                                                  _mm_mul_ps(prod, uij3)));
            t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                       _mm_mul_ps(rinv, rinv));
            t3            = _mm_sub_ps(t3,
                                       _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinv, rinv))));
            t1            = _mm_mul_ps(rinv,
                                       _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                  _mm_add_ps(t2, t3)));

            t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
                                       _mm_mul_ps(prodB, lij3B));
            t1B           = _mm_sub_ps(t1B,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lijB, rinvB),
                                                             _mm_mul_ps(lij3B, rB))));
            t2B           = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uijB, rinvB),
                                                  _mm_mul_ps(uij3B, rB)));
            t2B           = _mm_sub_ps(t2B,
                                       _mm_add_ps(_mm_mul_ps(half, uij2B),
                                                  _mm_mul_ps(prodB, uij3B)));
            t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
                                       _mm_mul_ps(rinvB, rinvB));
            t3B           = _mm_sub_ps(t3B,
                                       _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinvB, rinvB))));
            t1B           = _mm_mul_ps(rinvB,
                                       _mm_add_ps(_mm_mul_ps(dlijB, t1B),
                                                  _mm_add_ps(t2B, t3B)));

            dadx1         = _mm_and_ps(t1, obc_mask1);
            dadx1B        = _mm_and_ps(t1B, obc_mask1B);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_ps(r, sk_ai);
            t2            = _mm_sub_ps(r, sk_ai);
            t3            = _mm_sub_ps(sk_ai, r);
            t1B           = _mm_add_ps(rB, sk_ai);
            t2B           = _mm_sub_ps(rB, sk_ai);
            t3B           = _mm_sub_ps(sk_ai, rB);
            obc_mask1     = _mm_cmplt_ps(raj, t1);
            obc_mask2     = _mm_cmplt_ps(raj, t2);
            obc_mask3     = _mm_cmplt_ps(raj, t3);
            obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
            obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
            obc_mask3B    = _mm_cmplt_ps(rajB, t3B);

            uij           = gmx_mm_inv_ps(t1);
            lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                         _mm_andnot_ps(obc_mask2, raj_inv));
            dlij          = _mm_and_ps(one, obc_mask2);
            uij2          = _mm_mul_ps(uij, uij);
            uij3          = _mm_mul_ps(uij2, uij);
            lij2          = _mm_mul_ps(lij, lij);
            lij3          = _mm_mul_ps(lij2, lij);

            uijB          = gmx_mm_inv_ps(t1B);
            lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
                                         _mm_andnot_ps(obc_mask2B, raj_invB));
            dlijB         = _mm_and_ps(one, obc_mask2B);
            uij2B         = _mm_mul_ps(uijB, uijB);
            uij3B         = _mm_mul_ps(uij2B, uijB);
            lij2B         = _mm_mul_ps(lijB, lijB);
            lij3B         = _mm_mul_ps(lij2B, lijB);

            diff2         = _mm_sub_ps(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_ps(lij2);
            sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
            prod          = _mm_mul_ps(onefourth, sk2_rinv);

            diff2B        = _mm_sub_ps(uij2B, lij2B);
            lij_invB      = gmx_mm_invsqrt_ps(lij2B);
            sk2_rinvB     = _mm_mul_ps(sk2_ai, rinvB);
            prodB         = _mm_mul_ps(onefourth, sk2_rinvB);

            logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
            logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));

            t1            = _mm_sub_ps(lij, uij);
            t2            = _mm_mul_ps(diff2,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                  prod));
            t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
            t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
            t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
            t4            = _mm_and_ps(t4, obc_mask3);
            t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

            t1B           = _mm_sub_ps(lijB, uijB);
            t2B           = _mm_mul_ps(diff2B,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, rB),
                                                  prodB));
            t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
            t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
            t4B           = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
            t4B           = _mm_and_ps(t4B, obc_mask3B);
            t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));

            t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                       _mm_mul_ps(prod, lij3));
            t1            = _mm_sub_ps(t1,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                             _mm_mul_ps(lij3, r))));
            t2            = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                  _mm_mul_ps(uij3, r)));
            t2            = _mm_sub_ps(t2,
                                       _mm_add_ps(_mm_mul_ps(half, uij2),
                                                  _mm_mul_ps(prod, uij3)));
            t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                       _mm_mul_ps(rinv, rinv));
            t3            = _mm_sub_ps(t3,
                                       _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinv, rinv))));
            t1            = _mm_mul_ps(rinv,
                                       _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                  _mm_add_ps(t2, t3)));

            t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
                                       _mm_mul_ps(prodB, lij3B));
            t1B           = _mm_sub_ps(t1B,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lijB, rinvB),
                                                             _mm_mul_ps(lij3B, rB))));
            t2B           = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uijB, rinvB),
                                                  _mm_mul_ps(uij3B, rB)));
            t2B           = _mm_sub_ps(t2B,
                                       _mm_add_ps(_mm_mul_ps(half, uij2B),
                                                  _mm_mul_ps(prodB, uij3B)));
            t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
                                       _mm_mul_ps(rinvB, rinvB));
            t3B           = _mm_sub_ps(t3B,
                                       _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinvB, rinvB))));
            t1B           = _mm_mul_ps(rinvB,
                                       _mm_add_ps(_mm_mul_ps(dlijB, t1B),
                                                  _mm_add_ps(t2B, t3B)));

            dadx2         = _mm_and_ps(t1, obc_mask1);
            dadx2B        = _mm_and_ps(t1B, obc_mask1B);

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;
            _mm_store_ps(dadx, dadx1B);
            dadx += 4;
            _mm_store_ps(dadx, dadx2B);
            dadx += 4;

        } /* end normal inner loop */

        for (; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);

            dx    = _mm_sub_ps(ix, jx);
            dy    = _mm_sub_ps(iy, jy);
            dz    = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_ps(rsq);
            r           = _mm_mul_ps(rsq, rinv);

            /* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_ps(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_ps(r, sk_aj);
            obc_mask1     = _mm_cmplt_ps(rai, t1);

            if (_mm_movemask_ps(obc_mask1))
            {
                /* If any of the elements has rai<dr+sk, this is executed */
                t2            = _mm_sub_ps(r, sk_aj);
                t3            = _mm_sub_ps(sk_aj, r);

                obc_mask2     = _mm_cmplt_ps(rai, t2);
                obc_mask3     = _mm_cmplt_ps(rai, t3);

                uij           = gmx_mm_inv_ps(t1);
                lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                             _mm_andnot_ps(obc_mask2, rai_inv));
                dlij          = _mm_and_ps(one, obc_mask2);
                uij2          = _mm_mul_ps(uij, uij);
                uij3          = _mm_mul_ps(uij2, uij);
                lij2          = _mm_mul_ps(lij, lij);
                lij3          = _mm_mul_ps(lij2, lij);
                diff2         = _mm_sub_ps(uij2, lij2);
                lij_inv       = gmx_mm_invsqrt_ps(lij2);
                sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
                sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
                prod          = _mm_mul_ps(onefourth, sk2_rinv);
                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1            = _mm_sub_ps(lij, uij);
                t2            = _mm_mul_ps(diff2,
                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                      prod));
                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
                t4            = _mm_and_ps(t4, obc_mask3);
                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
                sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));

                dadx1         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx1         = _mm_setzero_ps();
            }

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_ps(r, sk_ai);
            obc_mask1     = _mm_cmplt_ps(raj, t1);

            if (_mm_movemask_ps(obc_mask1))
            {
                t2            = _mm_sub_ps(r, sk_ai);
                t3            = _mm_sub_ps(sk_ai, r);
                obc_mask2     = _mm_cmplt_ps(raj, t2);
                obc_mask3     = _mm_cmplt_ps(raj, t3);

                uij           = gmx_mm_inv_ps(t1);
                lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                             _mm_andnot_ps(obc_mask2, raj_inv));
                dlij          = _mm_and_ps(one, obc_mask2);
                uij2          = _mm_mul_ps(uij, uij);
                uij3          = _mm_mul_ps(uij2, uij);
                lij2          = _mm_mul_ps(lij, lij);
                lij3          = _mm_mul_ps(lij2, lij);
                diff2         = _mm_sub_ps(uij2, lij2);
                lij_inv       = gmx_mm_invsqrt_ps(lij2);
                sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
                prod          = _mm_mul_ps(onefourth, sk2_rinv);
                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1            = _mm_sub_ps(lij, uij);
                t2            = _mm_mul_ps(diff2,
                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                      prod));
                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
                t4            = _mm_and_ps(t4, obc_mask3);
                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

                GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));

                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));
                dadx2         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx2         = _mm_setzero_ps();
            }

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;
        } /* end normal inner loop */

        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA        = jjnr[k];
                j3A         = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
                GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
                mask        = mask1;
            }
            else if (offset == 2)
            {
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
                GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
                mask        = mask2;
            }
            else
            {
                /* offset must be 3 */
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                jnrC        = jjnr[k+2];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                j3C         = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
                GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
                mask        = mask3;
            }

            dx    = _mm_sub_ps(ix, jx);
            dy    = _mm_sub_ps(iy, jy);
            dz    = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_ps(rsq);
            r           = _mm_mul_ps(rsq, rinv);

            /* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_ps(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_ps(r, sk_aj);
            obc_mask1     = _mm_cmplt_ps(rai, t1);
            obc_mask1     = _mm_and_ps(obc_mask1, mask);
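            /* Fold the remainder mask into obc_mask1 so the 1-3 unused
             * lanes can contribute neither to sum_ai nor to dadx.
             */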
1078
1079             if (_mm_movemask_ps(obc_mask1))
1080             {
1081                 t2            = _mm_sub_ps(r, sk_aj);
1082                 t3            = _mm_sub_ps(sk_aj, r);
1083                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1084                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1085
1086                 uij           = gmx_mm_inv_ps(t1);
1087                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1088                                              _mm_andnot_ps(obc_mask2, rai_inv));
1089                 dlij           = _mm_and_ps(one, obc_mask2);
1090                 uij2           = _mm_mul_ps(uij, uij);
1091                 uij3           = _mm_mul_ps(uij2, uij);
1092                 lij2           = _mm_mul_ps(lij, lij);
1093                 lij3           = _mm_mul_ps(lij2, lij);
1094                 diff2          = _mm_sub_ps(uij2, lij2);
1095                 lij_inv        = gmx_mm_invsqrt_ps(lij2);
1096                 sk2_aj         = _mm_mul_ps(sk_aj, sk_aj);
1097                 sk2_rinv       = _mm_mul_ps(sk2_aj, rinv);
1098                 prod           = _mm_mul_ps(onefourth, sk2_rinv);
1099                 logterm        = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1100                 t1             = _mm_sub_ps(lij, uij);
1101                 t2             = _mm_mul_ps(diff2,
1102                                             _mm_sub_ps(_mm_mul_ps(onefourth, r),
1103                                                        prod));
1104                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1105                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1106                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1107                 t4            = _mm_and_ps(t4, obc_mask3);
1108                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1109                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1110                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1111                                            _mm_mul_ps(prod, lij3));
1112                 t1            = _mm_sub_ps(t1,
1113                                            _mm_mul_ps(onefourth,
1114                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1115                                                                  _mm_mul_ps(lij3, r))));
1116                 t2            = _mm_mul_ps(onefourth,
1117                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1118                                                       _mm_mul_ps(uij3, r)));
1119                 t2            = _mm_sub_ps(t2,
1120                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1121                                                       _mm_mul_ps(prod, uij3)));
1122                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1123                                            _mm_mul_ps(rinv, rinv));
1124                 t3            = _mm_sub_ps(t3,
1125                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1126                                                       _mm_add_ps(one,
1127                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1128                 t1            = _mm_mul_ps(rinv,
1129                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1130                                                       _mm_add_ps(t2, t3)));
1131                 dadx1         = _mm_and_ps(t1, obc_mask1);
1132             }
1133             else
1134             {
1135                 dadx1         = _mm_setzero_ps();
1136             }
1137
            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_ps(r, sk_ai);
            obc_mask1     = _mm_cmplt_ps(raj, t1);
            obc_mask1     = _mm_and_ps(obc_mask1, mask);

            if (_mm_movemask_ps(obc_mask1))
            {
                t2            = _mm_sub_ps(r, sk_ai);
                t3            = _mm_sub_ps(sk_ai, r);
                obc_mask2     = _mm_cmplt_ps(raj, t2);
                obc_mask3     = _mm_cmplt_ps(raj, t3);

                uij           = gmx_mm_inv_ps(t1);
                lij           = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                          _mm_andnot_ps(obc_mask2, raj_inv));
                dlij          = _mm_and_ps(one, obc_mask2);
                uij2          = _mm_mul_ps(uij, uij);
                uij3          = _mm_mul_ps(uij2, uij);
                lij2          = _mm_mul_ps(lij, lij);
                lij3          = _mm_mul_ps(lij2, lij);
                diff2         = _mm_sub_ps(uij2, lij2);
                lij_inv       = gmx_mm_invsqrt_ps(lij2);
                sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
                prod          = _mm_mul_ps(onefourth, sk2_rinv);
                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1            = _mm_sub_ps(lij, uij);
                t2            = _mm_mul_ps(diff2,
                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                      prod));
                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
                t4            = _mm_and_ps(t4, obc_mask3);
                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

                tmp           = _mm_and_ps(t1, obc_mask1);

                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));
                dadx2         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx2         = _mm_setzero_ps();
                tmp           = _mm_setzero_ps();
            }

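            /* Store the two 4-wide derivative sets back to back; the chain
             * rule kernel (calc_gb_chainrule_sse2_single below) reads them in
             * exactly this order, four floats at a time.
             */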
            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;

            if (offset == 1)
            {
                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
            }
            else if (offset == 2)
            {
                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
            }

        }
        GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

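    /* Convert the accumulated descreening sums in work[] into Born radii.
     * HCT uses the sum directly; OBC first passes it through the tanh
     * rescaling function.
     */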
    if (gb_algorithm == egbHCT)
    {
        /* HCT: 1/R_i = 1/(r_i - doffset) - work_i, with R_i clamped from
         * below at the undiminished radius r_i.
         */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC: 1/R_i = 1/(r_i - doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/r_i,
         * with psi = (r_i - doffset)*work_i; drobc caches the derivative of
         * the tanh rescaling needed for the chain-rule forces.
         */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}
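
/* For reference, a scalar sketch of the per-atom OBC radius update that the
 * SIMD loops above carry out four atoms at a time. This helper is purely
 * illustrative (it is not called anywhere); psi_sum plays the role of
 * work[i] and alpha/beta/gamma of born->obc_alpha/obc_beta/obc_gamma.
 */
static float
gb_obc_radius_sketch(float gb_radius, float doffset, float psi_sum,
                     float alpha, float beta, float gamma)
{
    float rr   = gb_radius - doffset;   /* offset-reduced vdW radius      */
    float psi  = rr*psi_sum;            /* dimensionless descreening term */
    float tsum = tanh(alpha*psi - beta*psi*psi + gamma*psi*psi*psi);

    /* 1/R_i = 1/(r_i - doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/r_i */
    return 1.0f/(1.0f/rr - tsum/gb_radius);
}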


float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
                                    float *x, float *f, float *fshift, float *shiftvec,
                                    int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int          i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
    int          jnrA, jnrB, jnrC, jnrD;
    int          j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH;
    int          j3E, j3F, j3G, j3H;
    int         *jjnr;

    float        rbi, shX, shY, shZ;
    float       *rb;

    __m128       ix, iy, iz;
    __m128       jx, jy, jz;
    __m128       jxB, jyB, jzB;
    __m128       fix, fiy, fiz;
    __m128       dx, dy, dz;
    __m128       tx, ty, tz;
    __m128       dxB, dyB, dzB;
    __m128       txB, tyB, tzB;

    __m128       rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
    __m128       xmm1, xmm2, xmm3;

    const __m128 two = _mm_set1_ps(2.0f);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, SSE style */
    offset = natoms%4;

    n0 = 0;
    n1 = natoms;

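    /* Combine dvda (dV/d(Born radius)) with the model-dependent derivative of
     * the Born radius with respect to the descreening sum, so that the pair
     * loop below only needs the stored dPsi/dr terms in dadx.
     */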
    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jz = _mm_setzero_ps();

    n = j3A = j3B = j3C = j3D = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        rbai   = _mm_load1_ps(rb+ii);
        fix    = _mm_setzero_ps();
        fiy    = _mm_setzero_ps();
        fiz    = _mm_setzero_ps();

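        /* Main loop, unrolled over four j particles; dadx is consumed
         * sequentially, in the same order the radius kernels stored it.
         */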
        for (k = nj0; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);

            /* load chain rule terms for j1-4 */
            f_gb        = _mm_load_ps(dadx);
            dadx       += 4;
            f_gb_ai     = _mm_load_ps(dadx);
            dadx       += 4;

            /* calculate scalar force */
            f_gb    = _mm_mul_ps(f_gb, rbai);
            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
            f_gb    = _mm_add_ps(f_gb, f_gb_ai);

            tx     = _mm_mul_ps(f_gb, dx);
            ty     = _mm_mul_ps(f_gb, dy);
            tz     = _mm_mul_ps(f_gb, dz);

            fix    = _mm_add_ps(fix, tx);
            fiy    = _mm_add_ps(fiy, ty);
            fiz    = _mm_add_ps(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
        }

        /* deal with the 1-3 remaining j particles */
        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA        = jjnr[k];
                j3A         = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
            }
            else if (offset == 2)
            {
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
            }
            else
            {
                /* offset must be 3 */
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                jnrC        = jjnr[k+2];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                j3C         = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
            }

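            /* The arithmetic below is identical to the unrolled loop; the
             * unused SIMD lanes were masked to zero when dadx was stored, so
             * they contribute nothing to the accumulated forces.
             */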
            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            /* load chain rule terms for the 1-3 remaining j particles */
            f_gb        = _mm_load_ps(dadx);
            dadx       += 4;
            f_gb_ai     = _mm_load_ps(dadx);
            dadx       += 4;

            /* calculate scalar force */
            f_gb    = _mm_mul_ps(f_gb, rbai);
            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
            f_gb    = _mm_add_ps(f_gb, f_gb_ai);

            tx     = _mm_mul_ps(f_gb, dx);
            ty     = _mm_mul_ps(f_gb, dy);
            tz     = _mm_mul_ps(f_gb, dz);

            fix    = _mm_add_ps(fix, tx);
            fiy    = _mm_add_ps(fiy, ty);
            fiz    = _mm_add_ps(fiz, tz);

            if (offset == 1)
            {
                GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
            }
            else if (offset == 2)
            {
                GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
            }
        }

        /* fix/fiy/fiz now contain four partial force terms that must all be
         * added to the i particle forces and the shift forces.
         */
        gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}
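
/* Scalar sketch of the per-pair force assembled in the loops above, again for
 * illustration only: each pair contributes the stored Born-radius derivatives
 * (dadx) weighted by the rb[] prefactors of the two atoms, directed along the
 * connecting vector dx.
 */
static void
gb_chainrule_pair_sketch(const float dx[3], float dadx_ij, float dadx_ji,
                         float rb_i, float rb_j, float f_i[3], float f_j[3])
{
    float fscal = dadx_ij*rb_i + dadx_ji*rb_j;
    int   d;

    for (d = 0; d < 3; d++)
    {
        f_i[d] += fscal*dx[d]; /* force on atom i                    */
        f_j[d] -= fscal*dx[d]; /* equal and opposite force on atom j */
    }
}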


#else
/* Keep the compiler happy: ISO C does not allow an empty translation unit */
int genborn_sse_dummy;

#endif /* SSE intrinsics available */