src/gromacs/mdlib/genborn_sse2_single.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #include "gmxpre.h"
  38
  39 #include "config.h"
  40
  41 #include <math.h>
  42 #include <string.h>
  43
  44 #include "gromacs/legacyheaders/typedefs.h"
  45 #include "gromacs/utility/smalloc.h"
  46 #include "gromacs/legacyheaders/genborn.h"
  47 #include "gromacs/math/vec.h"
  48 #include "gromacs/fileio/pdbio.h"
  49 #include "gromacs/legacyheaders/names.h"
  50 #include "gromacs/math/units.h"
  51 #include "gromacs/legacyheaders/domdec.h"
  52 #include "gromacs/legacyheaders/network.h"
  53 #include "gromacs/utility/fatalerror.h"
  54 #include "gromacs/legacyheaders/genborn.h"
  55
  56 #include "gromacs/utility/gmxmpi.h"
  57
  58
  59 /* Only compile this file if SSE intrinsics are available */
  60 #if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
  61
  62 #include <gmx_sse2_single.h>
  63 #include <emmintrin.h>
  64
  65 #include "genborn_sse2_single.h"
  66
  67
  68 int
  69 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
  70                               int natoms, gmx_localtop_t *top,
  71                               float *x, t_nblist *nl,
  72                               gmx_genborn_t *born)
  73 {
  74     int          i, k, n, ii, is3, ii3, nj0, nj1, offset;
  75     int          jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
  76     int          jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
  77     int          shift;
  78     int         *mdtype;
  79     real         shX, shY, shZ;
  80     int         *jjnr;
  81     real        *shiftvec;
  82
  83     float        gpi_ai, gpi2;
  84     float        factor;
  85     float       *gb_radius;
  86     float       *vsolv;
  87     float       *work;
  88     float       *dadx;
  89
  90     __m128       ix, iy, iz;
  91     __m128       jx, jy, jz;
  92     __m128       dx, dy, dz;
  93     __m128       tx, ty, tz;
  94     __m128       jxB, jyB, jzB;
  95     __m128       dxB, dyB, dzB;
  96     __m128       txB, tyB, tzB;
  97     __m128       rsq, rinv, rinv2, rinv4, rinv6;
  98     __m128       rsqB, rinvB, rinv2B, rinv4B, rinv6B;
  99     __m128       ratio, gpi, rai, raj, vai, vaj, rvdw;
 100     __m128       ratioB, rajB, vajB, rvdwB;
 101     __m128       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
 102     __m128       ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
 103     __m128       mask, icf4, icf6, mask_cmp;
 104     __m128       icf4B, icf6B, mask_cmpB;
 105
 106     __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
 107     __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
 108     __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
 109
 110     const __m128 half   = _mm_set1_ps(0.5f);
 111     const __m128 three  = _mm_set1_ps(3.0f);
 112     const __m128 one    = _mm_set1_ps(1.0f);
 113     const __m128 two    = _mm_set1_ps(2.0f);
 114     const __m128 zero   = _mm_set1_ps(0.0f);
 115     const __m128 four   = _mm_set1_ps(4.0f);
 116
 117     const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
 118     const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
 119     const __m128 still_p4     = _mm_set1_ps(STILL_P4);
 120
 121     factor  = 0.5 * ONE_4PI_EPS0;
 122
 123     gb_radius = born->gb_radius;
 124     vsolv     = born->vsolv;
 125     work      = born->gpol_still_work;
 126     jjnr      = nl->jjnr;
 127     shiftvec  = fr->shift_vec[0];
 128     dadx      = fr->dadx;
 129
 130     jnrA = jnrB = jnrC = jnrD = 0;
 131     jx   = _mm_setzero_ps();
 132     jy   = _mm_setzero_ps();
 133     jz   = _mm_setzero_ps();
 134
 135     n = 0;
 136
 137     for (i = 0; i < natoms; i++)
 138     {
 139         work[i] = 0;
 140     }
 141
 142     for (i = 0; i < nl->nri; i++)
 143     {
 144         ii     = nl->iinr[i];
 145         ii3    = ii*3;
 146         is3    = 3*nl->shift[i];
 147         shX    = shiftvec[is3];
 148         shY    = shiftvec[is3+1];
 149         shZ    = shiftvec[is3+2];
 150         nj0    = nl->jindex[i];
 151         nj1    = nl->jindex[i+1];
 152
 153         ix     = _mm_set1_ps(shX+x[ii3+0]);
 154         iy     = _mm_set1_ps(shY+x[ii3+1]);
 155         iz     = _mm_set1_ps(shZ+x[ii3+2]);
 156
 157         offset = (nj1-nj0)%4;
 158
 159         /* Polarization energy for atom ai */
 160         gpi    = _mm_setzero_ps();
 161
 162         rai     = _mm_load1_ps(gb_radius+ii);
 163         prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
 164
 165         for (k = nj0; k < nj1-4-offset; k += 8)
 166         {
 167             jnrA        = jjnr[k];
 168             jnrB        = jjnr[k+1];
 169             jnrC        = jjnr[k+2];
 170             jnrD        = jjnr[k+3];
 171             jnrE        = jjnr[k+4];
 172             jnrF        = jjnr[k+5];
 173             jnrG        = jjnr[k+6];
 174             jnrH        = jjnr[k+7];
 175
 176             j3A         = 3*jnrA;
 177             j3B         = 3*jnrB;
 178             j3C         = 3*jnrC;
 179             j3D         = 3*jnrD;
 180             j3E         = 3*jnrE;
 181             j3F         = 3*jnrF;
 182             j3G         = 3*jnrG;
 183             j3H         = 3*jnrH;
 184
 185             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 186             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
 187
 188             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 189             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
 190             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
 191             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);
 192
 193             dx          = _mm_sub_ps(ix, jx);
 194             dy          = _mm_sub_ps(iy, jy);
 195             dz          = _mm_sub_ps(iz, jz);
 196             dxB         = _mm_sub_ps(ix, jxB);
 197             dyB         = _mm_sub_ps(iy, jyB);
 198             dzB         = _mm_sub_ps(iz, jzB);
 199
 200             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 201             rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
 202             rinv        = gmx_mm_invsqrt_ps(rsq);
 203             rinvB       = gmx_mm_invsqrt_ps(rsqB);
 204             rinv2       = _mm_mul_ps(rinv, rinv);
 205             rinv2B      = _mm_mul_ps(rinvB, rinvB);
 206             rinv4       = _mm_mul_ps(rinv2, rinv2);
 207             rinv4B      = _mm_mul_ps(rinv2B, rinv2B);
 208             rinv6       = _mm_mul_ps(rinv4, rinv2);
 209             rinv6B      = _mm_mul_ps(rinv4B, rinv2B);
 210
 211             rvdw        = _mm_add_ps(rai, raj);
 212             rvdwB       = _mm_add_ps(rai, rajB);
 213             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
 214             ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));
 215
 216             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
 217             mask_cmpB   = _mm_cmple_ps(ratioB, still_p5inv);
 218
 219             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
 220             if (0 == _mm_movemask_ps(mask_cmp) )
 221             {
 222                 /* if ratio>still_p5inv for ALL elements */
 223                 ccf         = one;
 224                 dccf        = _mm_setzero_ps();
 225             }
 226             else
 227             {
 228                 ratio       = _mm_min_ps(ratio, still_p5inv);
 229                 theta       = _mm_mul_ps(ratio, still_pip5);
 230                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
 231                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
 232                 ccf         = _mm_mul_ps(term, term);
 233                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
 234                                          _mm_mul_ps(sinq, theta));
 235             }
 236             if (0 == _mm_movemask_ps(mask_cmpB) )
 237             {
 238                 /* if ratio>still_p5inv for ALL elements */
 239                 ccfB        = one;
 240                 dccfB       = _mm_setzero_ps();
 241             }
 242             else
 243             {
 244                 ratioB      = _mm_min_ps(ratioB, still_p5inv);
 245                 thetaB      = _mm_mul_ps(ratioB, still_pip5);
 246                 gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
 247                 termB       = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
 248                 ccfB        = _mm_mul_ps(termB, termB);
 249                 dccfB       = _mm_mul_ps(_mm_mul_ps(two, termB),
 250                                          _mm_mul_ps(sinqB, thetaB));
 251             }
 252
 253             prod        = _mm_mul_ps(still_p4, vaj);
 254             prodB       = _mm_mul_ps(still_p4, vajB);
 255             icf4        = _mm_mul_ps(ccf, rinv4);
 256             icf4B       = _mm_mul_ps(ccfB, rinv4B);
 257             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
 258             icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);
 259
 260             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
 261             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));
 262
 263             gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );
 264
 265             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
 266             dadx += 4;
 267             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
 268             dadx += 4;
 269             _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
 270             dadx += 4;
 271             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
 272             dadx += 4;
 273         }
 274
 275         for (; k < nj1-offset; k += 4)
 276         {
 277             jnrA        = jjnr[k];
 278             jnrB        = jjnr[k+1];
 279             jnrC        = jjnr[k+2];
 280             jnrD        = jjnr[k+3];
 281
 282             j3A         = 3*jnrA;
 283             j3B         = 3*jnrB;
 284             j3C         = 3*jnrC;
 285             j3D         = 3*jnrD;
 286
 287             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 288
 289             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 290             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
 291
 292             dx          = _mm_sub_ps(ix, jx);
 293             dy          = _mm_sub_ps(iy, jy);
 294             dz          = _mm_sub_ps(iz, jz);
 295
 296             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 297             rinv        = gmx_mm_invsqrt_ps(rsq);
 298             rinv2       = _mm_mul_ps(rinv, rinv);
 299             rinv4       = _mm_mul_ps(rinv2, rinv2);
 300             rinv6       = _mm_mul_ps(rinv4, rinv2);
 301
 302             rvdw        = _mm_add_ps(rai, raj);
 303             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
 304
 305             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
 306
 307             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
 308             if (0 == _mm_movemask_ps(mask_cmp))
 309             {
 310                 /* if ratio>still_p5inv for ALL elements */
 311                 ccf         = one;
 312                 dccf        = _mm_setzero_ps();
 313             }
 314             else
 315             {
 316                 ratio       = _mm_min_ps(ratio, still_p5inv);
 317                 theta       = _mm_mul_ps(ratio, still_pip5);
 318                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
 319                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
 320                 ccf         = _mm_mul_ps(term, term);
 321                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
 322                                          _mm_mul_ps(sinq, theta));
 323             }
 324
 325             prod        = _mm_mul_ps(still_p4, vaj);
 326             icf4        = _mm_mul_ps(ccf, rinv4);
 327             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
 328
 329             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
 330
 331             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
 332
 333             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
 334             dadx += 4;
 335             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
 336             dadx += 4;
 337         }
 338
 339         if (offset != 0)
 340         {
 341             if (offset == 1)
 342             {
 343                 jnrA        = jjnr[k];
 344                 j3A         = 3*jnrA;
 345                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
 346                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
 347                 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
 348                 mask        = mask1;
 349             }
 350             else if (offset == 2)
 351             {
 352                 jnrA        = jjnr[k];
 353                 jnrB        = jjnr[k+1];
 354                 j3A         = 3*jnrA;
 355                 j3B         = 3*jnrB;
 356                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
 357                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
 358                 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
 359                 mask        = mask2;
 360             }
 361             else
 362             {
 363                 /* offset must be 3 */
 364                 jnrA        = jjnr[k];
 365                 jnrB        = jjnr[k+1];
 366                 jnrC        = jjnr[k+2];
 367                 j3A         = 3*jnrA;
 368                 j3B         = 3*jnrB;
 369                 j3C         = 3*jnrC;
 370                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
 371                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
 372                 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
 373                 mask        = mask3;
 374             }
 375
 376             dx          = _mm_sub_ps(ix, jx);
 377             dy          = _mm_sub_ps(iy, jy);
 378             dz          = _mm_sub_ps(iz, jz);
 379
 380             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 381             rinv        = gmx_mm_invsqrt_ps(rsq);
 382             rinv2       = _mm_mul_ps(rinv, rinv);
 383             rinv4       = _mm_mul_ps(rinv2, rinv2);
 384             rinv6       = _mm_mul_ps(rinv4, rinv2);
 385
 386             rvdw        = _mm_add_ps(rai, raj);
 387             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
 388
 389             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
 390
 391             if (0 == _mm_movemask_ps(mask_cmp))
 392             {
 393                 /* if ratio>still_p5inv for ALL elements */
 394                 ccf         = one;
 395                 dccf        = _mm_setzero_ps();
 396             }
 397             else
 398             {
 399                 ratio       = _mm_min_ps(ratio, still_p5inv);
 400                 theta       = _mm_mul_ps(ratio, still_pip5);
 401                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
 402                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
 403                 ccf         = _mm_mul_ps(term, term);
 404                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
 405                                          _mm_mul_ps(sinq, theta));
 406             }
 407
 408             prod        = _mm_mul_ps(still_p4, vaj);
 409             icf4        = _mm_mul_ps(ccf, rinv4);
 410             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
 411
 412             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
 413
 414             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
 415             dadx += 4;
 416             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
 417             dadx += 4;
 418
 419             tmp = _mm_mul_ps(prod_ai, icf4);
 420
 421             if (offset == 1)
 422             {
 423                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
 424             }
 425             else if (offset == 2)
 426             {
 427                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
 428             }
 429             else
 430             {
 431                 /* offset must be 3 */
 432                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
 433             }
 434         }
 435         GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
 436     }
 437
 438     /* Sum up the polarization energy from other nodes */
 439     if (DOMAINDECOMP(cr))
 440     {
 441         dd_atom_sum_real(cr->dd, work);
 442     }
 443
 444     /* Compute the radii */
 445     for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
 446     {
 447         if (born->use[i] != 0)
 448         {
 449             gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
 450             gpi2             = gpi_ai * gpi_ai;
 451             born->bRad[i]    = factor*gmx_invsqrt(gpi2);
 452             fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
 453         }
 454     }
 455
 456     /* Extra (local) communication required for DD */
 457     if (DOMAINDECOMP(cr))
 458     {
 459         dd_atom_spread_real(cr->dd, born->bRad);
 460         dd_atom_spread_real(cr->dd, fr->invsqrta);
 461     }
 462
 463     return 0;
 464 }
 465
 466
 467 int
 468 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
 469                                 float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
 470 {
 471     int          i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
 472     int          jnrA, jnrB, jnrC, jnrD;
 473     int          j3A, j3B, j3C, j3D;
 474     int          jnrE, jnrF, jnrG, jnrH;
 475     int          j3E, j3F, j3G, j3H;
 476     float        shX, shY, shZ;
 477     float        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
 478     float        sum_ai2, sum_ai3, tsum, tchain, doffset;
 479     float       *obc_param;
 480     float       *gb_radius;
 481     float       *work;
 482     int       *  jjnr;
 483     float       *dadx;
 484     float       *shiftvec;
 485     float        min_rad, rad;
 486
 487     __m128       ix, iy, iz, jx, jy, jz;
 488     __m128       dx, dy, dz, t1, t2, t3, t4;
 489     __m128       rsq, rinv, r;
 490     __m128       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
 491     __m128       uij, lij2, uij2, lij3, uij3, diff2;
 492     __m128       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
 493     __m128       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
 494     __m128       dadx1, dadx2;
 495     __m128       logterm;
 496     __m128       mask;
 497     __m128       obc_mask1, obc_mask2, obc_mask3;
 498     __m128       jxB, jyB, jzB, t1B, t2B, t3B, t4B;
 499     __m128       dxB, dyB, dzB, rsqB, rinvB, rB;
 500     __m128       rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
 501     __m128       uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
 502     __m128       lij_invB, sk2_invB, prodB;
 503     __m128       sk_ajB, sk2_ajB, sk2_rinvB;
 504     __m128       dadx1B, dadx2B;
 505     __m128       logtermB;
 506     __m128       obc_mask1B, obc_mask2B, obc_mask3B;
 507
 508     __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
 509     __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
 510     __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
 511
 512     __m128       oneeighth   = _mm_set1_ps(0.125);
 513     __m128       onefourth   = _mm_set1_ps(0.25);
 514
 515     const __m128 half  = _mm_set1_ps(0.5f);
 516     const __m128 three = _mm_set1_ps(3.0f);
 517     const __m128 one   = _mm_set1_ps(1.0f);
 518     const __m128 two   = _mm_set1_ps(2.0f);
 519     const __m128 zero  = _mm_set1_ps(0.0f);
 520     const __m128 neg   = _mm_set1_ps(-1.0f);
 521
 522     /* Set the dielectric offset */
 523     doffset   = born->gb_doffset;
 524     gb_radius = born->gb_radius;
 525     obc_param = born->param;
 526     work      = born->gpol_hct_work;
 527     jjnr      = nl->jjnr;
 528     dadx      = fr->dadx;
 529     shiftvec  = fr->shift_vec[0];
 530
 531     jx        = _mm_setzero_ps();
 532     jy        = _mm_setzero_ps();
 533     jz        = _mm_setzero_ps();
 534
 535     jnrA = jnrB = jnrC = jnrD = 0;
 536
 537     for (i = 0; i < born->nr; i++)
 538     {
 539         work[i] = 0;
 540     }
 541
 542     for (i = 0; i < nl->nri; i++)
 543     {
 544         ii     = nl->iinr[i];
 545         ii3    = ii*3;
 546         is3    = 3*nl->shift[i];
 547         shX    = shiftvec[is3];
 548         shY    = shiftvec[is3+1];
 549         shZ    = shiftvec[is3+2];
 550         nj0    = nl->jindex[i];
 551         nj1    = nl->jindex[i+1];
 552
 553         ix     = _mm_set1_ps(shX+x[ii3+0]);
 554         iy     = _mm_set1_ps(shY+x[ii3+1]);
 555         iz     = _mm_set1_ps(shZ+x[ii3+2]);
 556
 557         offset = (nj1-nj0)%4;
 558
 559         rai     = _mm_load1_ps(gb_radius+ii);
 560         rai_inv = gmx_mm_inv_ps(rai);
 561
 562         sum_ai = _mm_setzero_ps();
 563
 564         sk_ai  = _mm_load1_ps(born->param+ii);
 565         sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
 566
 567         for (k = nj0; k < nj1-4-offset; k += 8)
 568         {
 569             jnrA        = jjnr[k];
 570             jnrB        = jjnr[k+1];
 571             jnrC        = jjnr[k+2];
 572             jnrD        = jjnr[k+3];
 573             jnrE        = jjnr[k+4];
 574             jnrF        = jjnr[k+5];
 575             jnrG        = jjnr[k+6];
 576             jnrH        = jjnr[k+7];
 577
 578             j3A         = 3*jnrA;
 579             j3B         = 3*jnrB;
 580             j3C         = 3*jnrC;
 581             j3D         = 3*jnrD;
 582             j3E         = 3*jnrE;
 583             j3F         = 3*jnrF;
 584             j3G         = 3*jnrG;
 585             j3H         = 3*jnrH;
 586
 587             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 588             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
 589             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 590             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
 591             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
 592             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
 593
 594             dx    = _mm_sub_ps(ix, jx);
 595             dy    = _mm_sub_ps(iy, jy);
 596             dz    = _mm_sub_ps(iz, jz);
 597             dxB   = _mm_sub_ps(ix, jxB);
 598             dyB   = _mm_sub_ps(iy, jyB);
 599             dzB   = _mm_sub_ps(iz, jzB);
 600
 601             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 602             rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
 603
 604             rinv        = gmx_mm_invsqrt_ps(rsq);
 605             r           = _mm_mul_ps(rsq, rinv);
 606             rinvB       = gmx_mm_invsqrt_ps(rsqB);
 607             rB          = _mm_mul_ps(rsqB, rinvB);
 608
 609             /* Compute raj_inv aj1-4 */
 610             raj_inv     = gmx_mm_inv_ps(raj);
 611             raj_invB    = gmx_mm_inv_ps(rajB);
 612
 613             /* Evaluate influence of atom aj -> ai */
 614             t1            = _mm_add_ps(r, sk_aj);
 615             t2            = _mm_sub_ps(r, sk_aj);
 616             t3            = _mm_sub_ps(sk_aj, r);
 617             t1B           = _mm_add_ps(rB, sk_ajB);
 618             t2B           = _mm_sub_ps(rB, sk_ajB);
 619             t3B           = _mm_sub_ps(sk_ajB, rB);
 620             obc_mask1     = _mm_cmplt_ps(rai, t1);
 621             obc_mask2     = _mm_cmplt_ps(rai, t2);
 622             obc_mask3     = _mm_cmplt_ps(rai, t3);
 623             obc_mask1B    = _mm_cmplt_ps(rai, t1B);
 624             obc_mask2B    = _mm_cmplt_ps(rai, t2B);
 625             obc_mask3B    = _mm_cmplt_ps(rai, t3B);
 626
 627             uij           = gmx_mm_inv_ps(t1);
 628             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 629                                          _mm_andnot_ps(obc_mask2, rai_inv));
 630             dlij          = _mm_and_ps(one, obc_mask2);
 631             uij2          = _mm_mul_ps(uij, uij);
 632             uij3          = _mm_mul_ps(uij2, uij);
 633             lij2          = _mm_mul_ps(lij, lij);
 634             lij3          = _mm_mul_ps(lij2, lij);
 635
 636             uijB          = gmx_mm_inv_ps(t1B);
 637             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
 638                                          _mm_andnot_ps(obc_mask2B, rai_inv));
 639             dlijB         = _mm_and_ps(one, obc_mask2B);
 640             uij2B         = _mm_mul_ps(uijB, uijB);
 641             uij3B         = _mm_mul_ps(uij2B, uijB);
 642             lij2B         = _mm_mul_ps(lijB, lijB);
 643             lij3B         = _mm_mul_ps(lij2B, lijB);
 644
 645             diff2         = _mm_sub_ps(uij2, lij2);
 646             lij_inv       = gmx_mm_invsqrt_ps(lij2);
 647             sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
 648             sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
 649             prod          = _mm_mul_ps(onefourth, sk2_rinv);
 650
 651             diff2B        = _mm_sub_ps(uij2B, lij2B);
 652             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
 653             sk2_ajB       = _mm_mul_ps(sk_ajB, sk_ajB);
 654             sk2_rinvB     = _mm_mul_ps(sk2_ajB, rinvB);
 655             prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
 656
 657             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 658             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
 659
 660             t1            = _mm_sub_ps(lij, uij);
 661             t2            = _mm_mul_ps(diff2,
 662                                        _mm_sub_ps(_mm_mul_ps(onefourth, r),
 663                                                   prod));
 664             t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 665             t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 666             t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
 667             t4            = _mm_and_ps(t4, obc_mask3);
 668             t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 669
 670             t1B           = _mm_sub_ps(lijB, uijB);
 671             t2B           = _mm_mul_ps(diff2B,
 672                                        _mm_sub_ps(_mm_mul_ps(onefourth, rB),
 673                                                   prodB));
 674             t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
 675             t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
 676             t4B           = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
 677             t4B           = _mm_and_ps(t4B, obc_mask3B);
 678             t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
 679
 680             sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
 681
 682             t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 683                                        _mm_mul_ps(prod, lij3));
 684             t1            = _mm_sub_ps(t1,
 685                                        _mm_mul_ps(onefourth,
 686                                                   _mm_add_ps(_mm_mul_ps(lij, rinv),
 687                                                              _mm_mul_ps(lij3, r))));
 688             t2            = _mm_mul_ps(onefourth,
 689                                        _mm_add_ps(_mm_mul_ps(uij, rinv),
 690                                                   _mm_mul_ps(uij3, r)));
 691             t2            = _mm_sub_ps(t2,
 692                                        _mm_add_ps(_mm_mul_ps(half, uij2),
 693                                                   _mm_mul_ps(prod, uij3)));
 694             t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
 695                                        _mm_mul_ps(rinv, rinv));
 696             t3            = _mm_sub_ps(t3,
 697                                        _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
 698                                                   _mm_add_ps(one,
 699                                                              _mm_mul_ps(sk2_rinv, rinv))));
 700             t1            = _mm_mul_ps(rinv,
 701                                        _mm_add_ps(_mm_mul_ps(dlij, t1),
 702                                                   _mm_add_ps(t2, t3)));
 703
 704
 705
 706             t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
 707                                        _mm_mul_ps(prodB, lij3B));
 708             t1B           = _mm_sub_ps(t1B,
 709                                        _mm_mul_ps(onefourth,
 710                                                   _mm_add_ps(_mm_mul_ps(lijB, rinvB),
 711                                                              _mm_mul_ps(lij3B, rB))));
 712             t2B           = _mm_mul_ps(onefourth,
 713                                        _mm_add_ps(_mm_mul_ps(uijB, rinvB),
 714                                                   _mm_mul_ps(uij3B, rB)));
 715             t2B           = _mm_sub_ps(t2B,
 716                                        _mm_add_ps(_mm_mul_ps(half, uij2B),
 717                                                   _mm_mul_ps(prodB, uij3B)));
 718             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
 719                                        _mm_mul_ps(rinvB, rinvB));
 720             t3B           = _mm_sub_ps(t3B,
 721                                        _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
 722                                                   _mm_add_ps(one,
 723                                                              _mm_mul_ps(sk2_rinvB, rinvB))));
 724             t1B           = _mm_mul_ps(rinvB,
 725                                        _mm_add_ps(_mm_mul_ps(dlijB, t1B),
 726                                                   _mm_add_ps(t2B, t3B)));
 727
 728             dadx1         = _mm_and_ps(t1, obc_mask1);
 729             dadx1B        = _mm_and_ps(t1B, obc_mask1B);
 730
 731
 732             /* Evaluate influence of atom ai -> aj */
 733             t1            = _mm_add_ps(r, sk_ai);
 734             t2            = _mm_sub_ps(r, sk_ai);
 735             t3            = _mm_sub_ps(sk_ai, r);
 736             t1B           = _mm_add_ps(rB, sk_ai);
 737             t2B           = _mm_sub_ps(rB, sk_ai);
 738             t3B           = _mm_sub_ps(sk_ai, rB);
 739             obc_mask1     = _mm_cmplt_ps(raj, t1);
 740             obc_mask2     = _mm_cmplt_ps(raj, t2);
 741             obc_mask3     = _mm_cmplt_ps(raj, t3);
 742             obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
 743             obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
 744             obc_mask3B    = _mm_cmplt_ps(rajB, t3B);
 745
 746             uij           = gmx_mm_inv_ps(t1);
 747             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 748                                          _mm_andnot_ps(obc_mask2, raj_inv));
 749             dlij          = _mm_and_ps(one, obc_mask2);
 750             uij2          = _mm_mul_ps(uij, uij);
 751             uij3          = _mm_mul_ps(uij2, uij);
 752             lij2          = _mm_mul_ps(lij, lij);
 753             lij3          = _mm_mul_ps(lij2, lij);
 754
 755             uijB          = gmx_mm_inv_ps(t1B);
 756             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
 757                                          _mm_andnot_ps(obc_mask2B, raj_invB));
 758             dlijB         = _mm_and_ps(one, obc_mask2B);
 759             uij2B         = _mm_mul_ps(uijB, uijB);
 760             uij3B         = _mm_mul_ps(uij2B, uijB);
 761             lij2B         = _mm_mul_ps(lijB, lijB);
 762             lij3B         = _mm_mul_ps(lij2B, lijB);
 763
 764             diff2         = _mm_sub_ps(uij2, lij2);
 765             lij_inv       = gmx_mm_invsqrt_ps(lij2);
 766             sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
 767             prod          = _mm_mul_ps(onefourth, sk2_rinv);
 768
 769             diff2B        = _mm_sub_ps(uij2B, lij2B);
 770             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
 771             sk2_rinvB     = _mm_mul_ps(sk2_ai, rinvB);
 772             prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
 773
 774             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 775             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
 776
 777             t1            = _mm_sub_ps(lij, uij);
 778             t2            = _mm_mul_ps(diff2,
 779                                        _mm_sub_ps(_mm_mul_ps(onefourth, r),
 780                                                   prod));
 781             t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 782             t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 783             t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
 784             t4            = _mm_and_ps(t4, obc_mask3);
 785             t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 786
 787             t1B           = _mm_sub_ps(lijB, uijB);
 788             t2B           = _mm_mul_ps(diff2B,
 789                                        _mm_sub_ps(_mm_mul_ps(onefourth, rB),
 790                                                   prodB));
 791             t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
 792             t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
 793             t4B           = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
 794             t4B           = _mm_and_ps(t4B, obc_mask3B);
 795             t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
 796
 797             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
 798             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
 799
 800             t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 801                                        _mm_mul_ps(prod, lij3));
 802             t1            = _mm_sub_ps(t1,
 803                                        _mm_mul_ps(onefourth,
 804                                                   _mm_add_ps(_mm_mul_ps(lij, rinv),
 805                                                              _mm_mul_ps(lij3, r))));
 806             t2            = _mm_mul_ps(onefourth,
 807                                        _mm_add_ps(_mm_mul_ps(uij, rinv),
 808                                                   _mm_mul_ps(uij3, r)));
 809             t2            = _mm_sub_ps(t2,
 810                                        _mm_add_ps(_mm_mul_ps(half, uij2),
 811                                                   _mm_mul_ps(prod, uij3)));
 812             t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
 813                                        _mm_mul_ps(rinv, rinv));
 814             t3            = _mm_sub_ps(t3,
 815                                        _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
 816                                                   _mm_add_ps(one,
 817                                                              _mm_mul_ps(sk2_rinv, rinv))));
 818             t1            = _mm_mul_ps(rinv,
 819                                        _mm_add_ps(_mm_mul_ps(dlij, t1),
 820                                                   _mm_add_ps(t2, t3)));
 821
 822
 823             t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
 824                                        _mm_mul_ps(prodB, lij3B));
 825             t1B           = _mm_sub_ps(t1B,
 826                                        _mm_mul_ps(onefourth,
 827                                                   _mm_add_ps(_mm_mul_ps(lijB, rinvB),
 828                                                              _mm_mul_ps(lij3B, rB))));
 829             t2B           = _mm_mul_ps(onefourth,
 830                                        _mm_add_ps(_mm_mul_ps(uijB, rinvB),
 831                                                   _mm_mul_ps(uij3B, rB)));
 832             t2B           = _mm_sub_ps(t2B,
 833                                        _mm_add_ps(_mm_mul_ps(half, uij2B),
 834                                                   _mm_mul_ps(prodB, uij3B)));
 835             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
 836                                        _mm_mul_ps(rinvB, rinvB));
 837             t3B           = _mm_sub_ps(t3B,
 838                                        _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
 839                                                   _mm_add_ps(one,
 840                                                              _mm_mul_ps(sk2_rinvB, rinvB))));
 841             t1B           = _mm_mul_ps(rinvB,
 842                                        _mm_add_ps(_mm_mul_ps(dlijB, t1B),
 843                                                   _mm_add_ps(t2B, t3B)));
 844
 845
 846             dadx2         = _mm_and_ps(t1, obc_mask1);
 847             dadx2B        = _mm_and_ps(t1B, obc_mask1B);
 848
 849             _mm_store_ps(dadx, dadx1);
 850             dadx += 4;
 851             _mm_store_ps(dadx, dadx2);
 852             dadx += 4;
 853             _mm_store_ps(dadx, dadx1B);
 854             dadx += 4;
 855             _mm_store_ps(dadx, dadx2B);
 856             dadx += 4;
 857
 858         } /* end normal inner loop */
 859
 860         for (; k < nj1-offset; k += 4)
 861         {
 862             jnrA        = jjnr[k];
 863             jnrB        = jjnr[k+1];
 864             jnrC        = jjnr[k+2];
 865             jnrD        = jjnr[k+3];
 866
 867             j3A         = 3*jnrA;
 868             j3B         = 3*jnrB;
 869             j3C         = 3*jnrC;
 870             j3D         = 3*jnrD;
 871
 872             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 873             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 874             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
 875
 876             dx    = _mm_sub_ps(ix, jx);
 877             dy    = _mm_sub_ps(iy, jy);
 878             dz    = _mm_sub_ps(iz, jz);
 879
 880             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 881
 882             rinv        = gmx_mm_invsqrt_ps(rsq);
 883             r           = _mm_mul_ps(rsq, rinv);
 884
 885             /* Compute raj_inv aj1-4 */
 886             raj_inv     = gmx_mm_inv_ps(raj);
 887
 888             /* Evaluate influence of atom aj -> ai */
 889             t1            = _mm_add_ps(r, sk_aj);
 890             obc_mask1     = _mm_cmplt_ps(rai, t1);
 891
 892             if (_mm_movemask_ps(obc_mask1))
 893             {
 894                 /* If any of the elements has rai<dr+sk, this is executed */
 895                 t2            = _mm_sub_ps(r, sk_aj);
 896                 t3            = _mm_sub_ps(sk_aj, r);
 897
 898                 obc_mask2     = _mm_cmplt_ps(rai, t2);
 899                 obc_mask3     = _mm_cmplt_ps(rai, t3);
 900
 901                 uij           = gmx_mm_inv_ps(t1);
 902                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 903                                              _mm_andnot_ps(obc_mask2, rai_inv));
 904                 dlij          = _mm_and_ps(one, obc_mask2);
 905                 uij2          = _mm_mul_ps(uij, uij);
 906                 uij3          = _mm_mul_ps(uij2, uij);
 907                 lij2          = _mm_mul_ps(lij, lij);
 908                 lij3          = _mm_mul_ps(lij2, lij);
 909                 diff2         = _mm_sub_ps(uij2, lij2);
 910                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
 911                 sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
 912                 sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
 913                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
 914                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 915                 t1            = _mm_sub_ps(lij, uij);
 916                 t2            = _mm_mul_ps(diff2,
 917                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
 918                                                       prod));
 919                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 920                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 921                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
 922                 t4            = _mm_and_ps(t4, obc_mask3);
 923                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 924                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
 925                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 926                                            _mm_mul_ps(prod, lij3));
 927                 t1            = _mm_sub_ps(t1,
 928                                            _mm_mul_ps(onefourth,
 929                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
 930                                                                  _mm_mul_ps(lij3, r))));
 931                 t2            = _mm_mul_ps(onefourth,
 932                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
 933                                                       _mm_mul_ps(uij3, r)));
 934                 t2            = _mm_sub_ps(t2,
 935                                            _mm_add_ps(_mm_mul_ps(half, uij2),
 936                                                       _mm_mul_ps(prod, uij3)));
 937                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
 938                                            _mm_mul_ps(rinv, rinv));
 939                 t3            = _mm_sub_ps(t3,
 940                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
 941                                                       _mm_add_ps(one,
 942                                                                  _mm_mul_ps(sk2_rinv, rinv))));
 943                 t1            = _mm_mul_ps(rinv,
 944                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
 945                                                       _mm_add_ps(t2, t3)));
 946
 947                 dadx1         = _mm_and_ps(t1, obc_mask1);
 948             }
 949             else
 950             {
 951                 dadx1         = _mm_setzero_ps();
 952             }
 953
 954             /* Evaluate influence of atom ai -> aj */
 955             t1            = _mm_add_ps(r, sk_ai);
 956             obc_mask1     = _mm_cmplt_ps(raj, t1);
 957
 958             if (_mm_movemask_ps(obc_mask1))
 959             {
 960                 t2            = _mm_sub_ps(r, sk_ai);
 961                 t3            = _mm_sub_ps(sk_ai, r);
 962                 obc_mask2     = _mm_cmplt_ps(raj, t2);
 963                 obc_mask3     = _mm_cmplt_ps(raj, t3);
 964
 965                 uij           = gmx_mm_inv_ps(t1);
 966                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 967                                              _mm_andnot_ps(obc_mask2, raj_inv));
 968                 dlij          = _mm_and_ps(one, obc_mask2);
 969                 uij2          = _mm_mul_ps(uij, uij);
 970                 uij3          = _mm_mul_ps(uij2, uij);
 971                 lij2          = _mm_mul_ps(lij, lij);
 972                 lij3          = _mm_mul_ps(lij2, lij);
 973                 diff2         = _mm_sub_ps(uij2, lij2);
 974                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
 975                 sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
 976                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
 977                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 978                 t1            = _mm_sub_ps(lij, uij);
 979                 t2            = _mm_mul_ps(diff2,
 980                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
 981                                                       prod));
 982                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 983                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 984                 t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
 985                 t4            = _mm_and_ps(t4, obc_mask3);
 986                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 987
 988                 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
 989
 990                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 991                                            _mm_mul_ps(prod, lij3));
 992                 t1            = _mm_sub_ps(t1,
 993                                            _mm_mul_ps(onefourth,
 994                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
 995                                                                  _mm_mul_ps(lij3, r))));
 996                 t2            = _mm_mul_ps(onefourth,
 997                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
 998                                                       _mm_mul_ps(uij3, r)));
 999                 t2            = _mm_sub_ps(t2,
1000                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1001                                                       _mm_mul_ps(prod, uij3)));
1002                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1003                                            _mm_mul_ps(rinv, rinv));
1004                 t3            = _mm_sub_ps(t3,
1005                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1006                                                       _mm_add_ps(one,
1007                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1008                 t1            = _mm_mul_ps(rinv,
1009                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1010                                                       _mm_add_ps(t2, t3)));
1011                 dadx2         = _mm_and_ps(t1, obc_mask1);
1012             }
1013             else
1014             {
1015                 dadx2         = _mm_setzero_ps();
1016             }
1017
1018             _mm_store_ps(dadx, dadx1);
1019             dadx += 4;
1020             _mm_store_ps(dadx, dadx2);
1021             dadx += 4;
1022         } /* end normal inner loop */
1023
1024         if (offset != 0)
1025         {
1026             if (offset == 1)
1027             {
1028                 jnrA        = jjnr[k];
1029                 j3A         = 3*jnrA;
1030                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1031                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1032                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1033                 mask        = mask1;
1034             }
1035             else if (offset == 2)
1036             {
1037                 jnrA        = jjnr[k];
1038                 jnrB        = jjnr[k+1];
1039                 j3A         = 3*jnrA;
1040                 j3B         = 3*jnrB;
1041                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1042                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1043                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1044                 mask        = mask2;
1045             }
1046             else
1047             {
1048                 /* offset must be 3 */
1049                 jnrA        = jjnr[k];
1050                 jnrB        = jjnr[k+1];
1051                 jnrC        = jjnr[k+2];
1052                 j3A         = 3*jnrA;
1053                 j3B         = 3*jnrB;
1054                 j3C         = 3*jnrC;
1055                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1056                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1057                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1058                 mask        = mask3;
1059             }
1060
1061             dx    = _mm_sub_ps(ix, jx);
1062             dy    = _mm_sub_ps(iy, jy);
1063             dz    = _mm_sub_ps(iz, jz);
1064
1065             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
1066
1067             rinv        = gmx_mm_invsqrt_ps(rsq);
1068             r           = _mm_mul_ps(rsq, rinv);
1069
1070             /* Compute raj_inv aj1-4 */
1071             raj_inv     = gmx_mm_inv_ps(raj);
1072
1073             /* Evaluate influence of atom aj -> ai */
1074             t1            = _mm_add_ps(r, sk_aj);
1075             obc_mask1     = _mm_cmplt_ps(rai, t1);
1076             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1077
1078             if (_mm_movemask_ps(obc_mask1))
1079             {
1080                 t2            = _mm_sub_ps(r, sk_aj);
1081                 t3            = _mm_sub_ps(sk_aj, r);
1082                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1083                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1084
1085                 uij           = gmx_mm_inv_ps(t1);
1086                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1087                                              _mm_andnot_ps(obc_mask2, rai_inv));
1088                 dlij           = _mm_and_ps(one, obc_mask2);
1089                 uij2           = _mm_mul_ps(uij, uij);
1090                 uij3           = _mm_mul_ps(uij2, uij);
1091                 lij2           = _mm_mul_ps(lij, lij);
1092                 lij3           = _mm_mul_ps(lij2, lij);
1093                 diff2          = _mm_sub_ps(uij2, lij2);
1094                 lij_inv        = gmx_mm_invsqrt_ps(lij2);
1095                 sk2_aj         = _mm_mul_ps(sk_aj, sk_aj);
1096                 sk2_rinv       = _mm_mul_ps(sk2_aj, rinv);
1097                 prod           = _mm_mul_ps(onefourth, sk2_rinv);
1098                 logterm        = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1099                 t1             = _mm_sub_ps(lij, uij);
1100                 t2             = _mm_mul_ps(diff2,
1101                                             _mm_sub_ps(_mm_mul_ps(onefourth, r),
1102                                                        prod));
1103                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1104                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1105                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1106                 t4            = _mm_and_ps(t4, obc_mask3);
1107                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1108                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1109                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1110                                            _mm_mul_ps(prod, lij3));
1111                 t1            = _mm_sub_ps(t1,
1112                                            _mm_mul_ps(onefourth,
1113                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1114                                                                  _mm_mul_ps(lij3, r))));
1115                 t2            = _mm_mul_ps(onefourth,
1116                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1117                                                       _mm_mul_ps(uij3, r)));
1118                 t2            = _mm_sub_ps(t2,
1119                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1120                                                       _mm_mul_ps(prod, uij3)));
1121                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1122                                            _mm_mul_ps(rinv, rinv));
1123                 t3            = _mm_sub_ps(t3,
1124                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1125                                                       _mm_add_ps(one,
1126                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1127                 t1            = _mm_mul_ps(rinv,
1128                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1129                                                       _mm_add_ps(t2, t3)));
1130                 dadx1         = _mm_and_ps(t1, obc_mask1);
1131             }
1132             else
1133             {
1134                 dadx1         = _mm_setzero_ps();
1135             }
1136
1137             /* Evaluate influence of atom ai -> aj */
1138             t1            = _mm_add_ps(r, sk_ai);
1139             obc_mask1     = _mm_cmplt_ps(raj, t1);
1140             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1141
1142             if (_mm_movemask_ps(obc_mask1))
1143             {
1144                 t2            = _mm_sub_ps(r, sk_ai);
1145                 t3            = _mm_sub_ps(sk_ai, r);
1146                 obc_mask2     = _mm_cmplt_ps(raj, t2);
1147                 obc_mask3     = _mm_cmplt_ps(raj, t3);
1148
1149                 uij           = gmx_mm_inv_ps(t1);
1150                 lij           = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1151                                           _mm_andnot_ps(obc_mask2, raj_inv));
1152                 dlij          = _mm_and_ps(one, obc_mask2);
1153                 uij2          = _mm_mul_ps(uij, uij);
1154                 uij3          = _mm_mul_ps(uij2, uij);
1155                 lij2          = _mm_mul_ps(lij, lij);
1156                 lij3          = _mm_mul_ps(lij2, lij);
1157                 diff2         = _mm_sub_ps(uij2, lij2);
1158                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1159                 sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
1160                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
1161                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1162                 t1            = _mm_sub_ps(lij, uij);
1163                 t2            = _mm_mul_ps(diff2,
1164                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
1165                                                       prod));
1166                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1167                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1168                 t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
1169                 t4            = _mm_and_ps(t4, obc_mask3);
1170                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1171
1172                 tmp           = _mm_and_ps(t1, obc_mask1);
1173
1174                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1175                                            _mm_mul_ps(prod, lij3));
1176                 t1            = _mm_sub_ps(t1,
1177                                            _mm_mul_ps(onefourth,
1178                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1179                                                                  _mm_mul_ps(lij3, r))));
1180                 t2            = _mm_mul_ps(onefourth,
1181                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1182                                                       _mm_mul_ps(uij3, r)));
1183                 t2            = _mm_sub_ps(t2,
1184                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1185                                                       _mm_mul_ps(prod, uij3)));
1186                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1187                                            _mm_mul_ps(rinv, rinv));
1188                 t3            = _mm_sub_ps(t3,
1189                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1190                                                       _mm_add_ps(one,
1191                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1192                 t1            = _mm_mul_ps(rinv,
1193                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1194                                                       _mm_add_ps(t2, t3)));
1195                 dadx2         = _mm_and_ps(t1, obc_mask1);
1196             }
1197             else
1198             {
1199                 dadx2         = _mm_setzero_ps();
1200                 tmp           = _mm_setzero_ps();
1201             }
1202
1203             _mm_store_ps(dadx, dadx1);
1204             dadx += 4;
1205             _mm_store_ps(dadx, dadx2);
1206             dadx += 4;
1207
1208             if (offset == 1)
1209             {
1210                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
1211             }
1212             else if (offset == 2)
1213             {
1214                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
1215             }
1216             else
1217             {
1218                 /* offset must be 3 */
1219                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
1220             }
1221
1222         }
1223         GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
1224
1225     }
1226
1227     /* Parallel summations */
1228     if (DOMAINDECOMP(cr))
1229     {
1230         dd_atom_sum_real(cr->dd, work);
1231     }
1232
1233     if (gb_algorithm == egbHCT)
1234     {
1235         /* HCT */
1236         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1237         {
1238             if (born->use[i] != 0)
1239             {
1240                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1241                 sum     = 1.0/rr - work[i];
1242                 min_rad = rr + doffset;
1243                 rad     = 1.0/sum;
1244
1245                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
1246                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1247             }
1248         }
1249
1250         /* Extra communication required for DD */
1251         if (DOMAINDECOMP(cr))
1252         {
1253             dd_atom_spread_real(cr->dd, born->bRad);
1254             dd_atom_spread_real(cr->dd, fr->invsqrta);
1255         }
1256     }
1257     else
1258     {
1259         /* OBC */
1260         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1261         {
1262             if (born->use[i] != 0)
1263             {
1264                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
1265                 rr_inv2 = 1.0/rr;
1266                 rr      = rr-doffset;
1267                 rr_inv  = 1.0/rr;
1268                 sum     = rr * work[i];
1269                 sum2    = sum  * sum;
1270                 sum3    = sum2 * sum;
1271
1272                 tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1273                 born->bRad[i] = rr_inv - tsum*rr_inv2;
1274                 born->bRad[i] = 1.0 / born->bRad[i];
1275
1276                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1277
1278                 tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1279                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1280             }
1281         }
1282         /* Extra (local) communication required for DD */
1283         if (DOMAINDECOMP(cr))
1284         {
1285             dd_atom_spread_real(cr->dd, born->bRad);
1286             dd_atom_spread_real(cr->dd, fr->invsqrta);
1287             dd_atom_spread_real(cr->dd, born->drobc);
1288         }
1289     }
1290
1291
1292
1293     return 0;
1294 }
1295
1296
1297
1298 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1299                                     float *x, float *f, float *fshift, float *shiftvec,
1300                                     int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1301 {
1302     int          i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
1303     int          jnrA, jnrB, jnrC, jnrD;
1304     int          j3A, j3B, j3C, j3D;
1305     int          jnrE, jnrF, jnrG, jnrH;
1306     int          j3E, j3F, j3G, j3H;
1307     int       *  jjnr;
1308
1309     float        rbi, shX, shY, shZ;
1310     float       *rb;
1311
1312     __m128       ix, iy, iz;
1313     __m128       jx, jy, jz;
1314     __m128       jxB, jyB, jzB;
1315     __m128       fix, fiy, fiz;
1316     __m128       dx, dy, dz;
1317     __m128       tx, ty, tz;
1318     __m128       dxB, dyB, dzB;
1319     __m128       txB, tyB, tzB;
1320
1321     __m128       rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
1322     __m128       xmm1, xmm2, xmm3;
1323
1324     const __m128 two = _mm_set1_ps(2.0f);
1325
1326     rb     = born->work;
1327
1328     jjnr   = nl->jjnr;
1329
1330     /* Loop to get the proper form for the Born radius term, sse style */
1331     offset = natoms%4;
1332
1333     n0 = 0;
1334     n1 = natoms;
1335
1336     if (gb_algorithm == egbSTILL)
1337     {
1338         for (i = n0; i < n1; i++)
1339         {
1340             rbi   = born->bRad[i];
1341             rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1342         }
1343     }
1344     else if (gb_algorithm == egbHCT)
1345     {
1346         for (i = n0; i < n1; i++)
1347         {
1348             rbi   = born->bRad[i];
1349             rb[i] = rbi * rbi * dvda[i];
1350         }
1351     }
1352     else if (gb_algorithm == egbOBC)
1353     {
1354         for (i = n0; i < n1; i++)
1355         {
1356             rbi   = born->bRad[i];
1357             rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1358         }
1359     }
1360
1361     jz = _mm_setzero_ps();
1362
1363     n = j3A = j3B = j3C = j3D = 0;
1364
1365     for (i = 0; i < nl->nri; i++)
1366     {
1367         ii     = nl->iinr[i];
1368         ii3    = ii*3;
1369         is3    = 3*nl->shift[i];
1370         shX    = shiftvec[is3];
1371         shY    = shiftvec[is3+1];
1372         shZ    = shiftvec[is3+2];
1373         nj0    = nl->jindex[i];
1374         nj1    = nl->jindex[i+1];
1375
1376         ix     = _mm_set1_ps(shX+x[ii3+0]);
1377         iy     = _mm_set1_ps(shY+x[ii3+1]);
1378         iz     = _mm_set1_ps(shZ+x[ii3+2]);
1379
1380         offset = (nj1-nj0)%4;
1381
1382         rbai   = _mm_load1_ps(rb+ii);
1383         fix    = _mm_setzero_ps();
1384         fiy    = _mm_setzero_ps();
1385         fiz    = _mm_setzero_ps();
1386
1387
1388         for (k = nj0; k < nj1-offset; k += 4)
1389         {
1390             jnrA        = jjnr[k];
1391             jnrB        = jjnr[k+1];
1392             jnrC        = jjnr[k+2];
1393             jnrD        = jjnr[k+3];
1394
1395             j3A         = 3*jnrA;
1396             j3B         = 3*jnrB;
1397             j3C         = 3*jnrC;
1398             j3D         = 3*jnrD;
1399
1400             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
1401
1402             dx          = _mm_sub_ps(ix, jx);
1403             dy          = _mm_sub_ps(iy, jy);
1404             dz          = _mm_sub_ps(iz, jz);
1405
1406             GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
1407
1408             /* load chain rule terms for j1-4 */
1409             f_gb        = _mm_load_ps(dadx);
1410             dadx       += 4;
1411             f_gb_ai     = _mm_load_ps(dadx);
1412             dadx       += 4;
1413
1414             /* calculate scalar force */
1415             f_gb    = _mm_mul_ps(f_gb, rbai);
1416             f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1417             f_gb    = _mm_add_ps(f_gb, f_gb_ai);
1418
1419             tx     = _mm_mul_ps(f_gb, dx);
1420             ty     = _mm_mul_ps(f_gb, dy);
1421             tz     = _mm_mul_ps(f_gb, dz);
1422
1423             fix    = _mm_add_ps(fix, tx);
1424             fiy    = _mm_add_ps(fiy, ty);
1425             fiz    = _mm_add_ps(fiz, tz);
1426
1427             GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
1428         }
1429
1430         /*deal with odd elements */
1431         if (offset != 0)
1432         {
1433             if (offset == 1)
1434             {
1435                 jnrA        = jjnr[k];
1436                 j3A         = 3*jnrA;
1437                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1438                 GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
1439             }
1440             else if (offset == 2)
1441             {
1442                 jnrA        = jjnr[k];
1443                 jnrB        = jjnr[k+1];
1444                 j3A         = 3*jnrA;
1445                 j3B         = 3*jnrB;
1446                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1447                 GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
1448             }
1449             else
1450             {
1451                 /* offset must be 3 */
1452                 jnrA        = jjnr[k];
1453                 jnrB        = jjnr[k+1];
1454                 jnrC        = jjnr[k+2];
1455                 j3A         = 3*jnrA;
1456                 j3B         = 3*jnrB;
1457                 j3C         = 3*jnrC;
1458                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1459                 GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
1460             }
1461
1462             dx          = _mm_sub_ps(ix, jx);
1463             dy          = _mm_sub_ps(iy, jy);
1464             dz          = _mm_sub_ps(iz, jz);
1465
1466             /* load chain rule terms for j1-4 */
1467             f_gb        = _mm_load_ps(dadx);
1468             dadx       += 4;
1469             f_gb_ai     = _mm_load_ps(dadx);
1470             dadx       += 4;
1471
1472             /* calculate scalar force */
1473             f_gb    = _mm_mul_ps(f_gb, rbai);
1474             f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1475             f_gb    = _mm_add_ps(f_gb, f_gb_ai);
1476
1477             tx     = _mm_mul_ps(f_gb, dx);
1478             ty     = _mm_mul_ps(f_gb, dy);
1479             tz     = _mm_mul_ps(f_gb, dz);
1480
1481             fix    = _mm_add_ps(fix, tx);
1482             fiy    = _mm_add_ps(fiy, ty);
1483             fiz    = _mm_add_ps(fiz, tz);
1484
1485             if (offset == 1)
1486             {
1487                 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
1488             }
1489             else if (offset == 2)
1490             {
1491                 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
1492             }
1493             else
1494             {
1495                 /* offset must be 3 */
1496                 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
1497             }
1498         }
1499
1500         /* fix/fiy/fiz now contain four partial force terms, that all should be
1501          * added to the i particle forces and shift forces.
1502          */
1503         gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
1504     }
1505
1506     return 0;
1507 }
1508
1509
1510 #else
1511 /* keep compiler happy */
1512 int genborn_sse_dummy;
1513
1514 #endif /* SSE intrinsics available */