Remove unnecessary config.h includes
alexxy/gromacs.git: src/gromacs/mdlib/genborn_sse2_single.c
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include <math.h>
#include <string.h>

#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/math/vec.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/math/units.h"
#include "gromacs/legacyheaders/domdec.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/utility/fatalerror.h"

#include "gromacs/utility/gmxmpi.h"


/* Only compile this file if SSE intrinsics are available. Note that the
 * "0 &&" in the condition below deliberately disables this SIMD
 * implementation, so the generic C genborn kernels are used instead.
 */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)

#include <gmx_sse2_single.h>
#include <emmintrin.h>

#include "genborn_sse2_single.h"


int
calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              float *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int          i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int          jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
    int          shift;
    int         *mdtype;
    real         shX, shY, shZ;
    int         *jjnr;
    real        *shiftvec;

    float        gpi_ai, gpi2;
    float        factor;
    float       *gb_radius;
    float       *vsolv;
    float       *work;
    float       *dadx;

    __m128       ix, iy, iz;
    __m128       jx, jy, jz;
    __m128       dx, dy, dz;
    __m128       tx, ty, tz;
    __m128       jxB, jyB, jzB;
    __m128       dxB, dyB, dzB;
    __m128       txB, tyB, tzB;
    __m128       rsq, rinv, rinv2, rinv4, rinv6;
    __m128       rsqB, rinvB, rinv2B, rinv4B, rinv6B;
    __m128       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128       ratioB, rajB, vajB, rvdwB;
    __m128       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128       ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
    __m128       mask, icf4, icf6, mask_cmp;
    __m128       icf4B, icf6B, mask_cmpB;

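    /* mask1..mask3 select the lowest 1, 2 or 3 SIMD lanes; the epilogue
     * code below uses them to zero the unused lanes when the number of
     * remaining j-particles is not a multiple of 4.
     */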
    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );

    const __m128 half   = _mm_set1_ps(0.5f);
    const __m128 three  = _mm_set1_ps(3.0f);
    const __m128 one    = _mm_set1_ps(1.0f);
    const __m128 two    = _mm_set1_ps(2.0f);
    const __m128 zero   = _mm_set1_ps(0.0f);
    const __m128 four   = _mm_set1_ps(4.0f);

    const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
    const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
    const __m128 still_p4     = _mm_set1_ps(STILL_P4);

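    /* factor is (1/2)*k_e, with k_e = ONE_4PI_EPS0 the electric conversion
     * factor (about 138.935 kJ mol^-1 nm e^-2 in GROMACS units); it turns
     * the accumulated polarization sum into a Born radius further below.
     */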
    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = jnrC = jnrD = 0;
    jx   = _mm_setzero_ps();
    jy   = _mm_setzero_ps();
    jz   = _mm_setzero_ps();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

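        /* offset is the number of j-particles (0-3) left over after the
         * 4-wide SIMD loops; they are handled in a masked epilogue below.
         */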
        offset = (nj1-nj0)%4;

        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_ps();

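        /* i-atom quantities: the GB radius of ai, and STILL_P4 times its
         * solvation volume; the latter is the symmetric contribution that
         * is accumulated into the j-atoms' work[] entries in the loops.
         */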
        rai     = _mm_load1_ps(gb_radius+ii);
        prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);

        for (k = nj0; k < nj1-4-offset; k += 8)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];
            jnrE        = jjnr[k+4];
            jnrF        = jjnr[k+5];
            jnrG        = jjnr[k+6];
            jnrH        = jjnr[k+7];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;
            j3E         = 3*jnrE;
            j3F         = 3*jnrF;
            j3G         = 3*jnrG;
            j3H         = 3*jnrH;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);
            dxB         = _mm_sub_ps(ix, jxB);
            dyB         = _mm_sub_ps(iy, jyB);
            dzB         = _mm_sub_ps(iz, jzB);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinvB       = gmx_mm_invsqrt_ps(rsqB);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv2B      = _mm_mul_ps(rinvB, rinvB);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv4B      = _mm_mul_ps(rinv2B, rinv2B);
            rinv6       = _mm_mul_ps(rinv4, rinv2);
            rinv6B      = _mm_mul_ps(rinv4B, rinv2B);

            rvdw        = _mm_add_ps(rai, raj);
            rvdwB       = _mm_add_ps(rai, rajB);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
            ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
            mask_cmpB   = _mm_cmple_ps(ratioB, still_p5inv);

            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }
            if (0 == _mm_movemask_ps(mask_cmpB))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccfB        = one;
                dccfB       = _mm_setzero_ps();
            }
            else
            {
                ratioB      = _mm_min_ps(ratioB, still_p5inv);
                thetaB      = _mm_mul_ps(ratioB, still_pip5);
                gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
                termB       = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
                ccfB        = _mm_mul_ps(termB, termB);
                dccfB       = _mm_mul_ps(_mm_mul_ps(two, termB),
                                         _mm_mul_ps(sinqB, thetaB));
            }

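            /* icf4 and icf6 are the pair kernels: ccf/r^4 enters the
             * polarization sum, and (4*ccf-dccf)/r^6 is its distance
             * derivative. Here ccf appears to be Still's close-contact
             * function, [(1-cos(theta))/2]^2 with theta = ratio*STILL_PIP5,
             * which smoothly attenuates the 1/r^4 term for close pairs,
             * and dccf its derivative factor. The icf6 products are
             * streamed to dadx for the chain-rule force evaluation.
             */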
            prod        = _mm_mul_ps(still_p4, vaj);
            prodB       = _mm_mul_ps(still_p4, vajB);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf4B       = _mm_mul_ps(ccfB, rinv4B);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
            icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));

            gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
            dadx += 4;
        }

        for (; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv6       = _mm_mul_ps(rinv4, rinv2);

            rvdw        = _mm_add_ps(rai, raj);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);

            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));

            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
        }

        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA        = jjnr[k];
                j3A         = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
                GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
                mask        = mask1;
            }
            else if (offset == 2)
            {
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
                GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
                mask        = mask2;
            }
            else
            {
                /* offset must be 3 */
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                jnrC        = jjnr[k+2];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                j3C         = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
                GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
                mask        = mask3;
            }

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv6       = _mm_mul_ps(rinv4, rinv2);

            rvdw        = _mm_add_ps(rai, raj);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);

            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;

            tmp = _mm_mul_ps(prod_ai, icf4);

            if (offset == 1)
            {
                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
            }
            else if (offset == 2)
            {
                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
            }
        }
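        /* Reduce the four lanes of the SIMD accumulator gpi into work[ii] */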
        GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
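    /* In the Still model the effective Born radius follows from the atomic
     * polarization energy: R_i = factor/|gpi_ai|, and invsqrta caches
     * 1/sqrt(R_i) for use in the GB interaction kernels.
     */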
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}


int
calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int          i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int          jnrA, jnrB, jnrC, jnrD;
    int          j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH;
    int          j3E, j3F, j3G, j3H;
    float        shX, shY, shZ;
    float        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    float        sum_ai2, sum_ai3, tsum, tchain, doffset;
    float       *obc_param;
    float       *gb_radius;
    float       *work;
    int         *jjnr;
    float       *dadx;
    float       *shiftvec;
    float        min_rad, rad;

    __m128       ix, iy, iz, jx, jy, jz;
    __m128       dx, dy, dz, t1, t2, t3, t4;
    __m128       rsq, rinv, r;
    __m128       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128       uij, lij2, uij2, lij3, uij3, diff2;
    __m128       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128       dadx1, dadx2;
    __m128       logterm;
    __m128       mask;
    __m128       obc_mask1, obc_mask2, obc_mask3;
    __m128       jxB, jyB, jzB, t1B, t2B, t3B, t4B;
    __m128       dxB, dyB, dzB, rsqB, rinvB, rB;
    __m128       rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
    __m128       uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
    __m128       lij_invB, sk2_invB, prodB;
    __m128       sk_ajB, sk2_ajB, sk2_rinvB;
    __m128       dadx1B, dadx2B;
    __m128       logtermB;
    __m128       obc_mask1B, obc_mask2B, obc_mask3B;

    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );

    __m128       oneeighth   = _mm_set1_ps(0.125);
    __m128       onefourth   = _mm_set1_ps(0.25);

    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);
    const __m128 one   = _mm_set1_ps(1.0f);
    const __m128 two   = _mm_set1_ps(2.0f);
    const __m128 zero  = _mm_set1_ps(0.0f);
    const __m128 neg   = _mm_set1_ps(-1.0f);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_ps();
    jy        = _mm_setzero_ps();
    jz        = _mm_setzero_ps();

    jnrA = jnrB = jnrC = jnrD = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        rai     = _mm_load1_ps(gb_radius+ii);
        rai_inv = gmx_mm_inv_ps(rai);

        sum_ai = _mm_setzero_ps();

        sk_ai  = _mm_load1_ps(born->param+ii);
        sk2_ai = _mm_mul_ps(sk_ai, sk_ai);

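        /* HCT/OBC pairwise descreening: each neighbour aj reduces the Born
         * integral of ai by a closed-form integral over the part of the
         * scaled sphere of aj (radius sk_aj) that overlaps the solute shell
         * of ai. lij and uij below are the inverse lower/upper integration
         * limits, 1/(r-sk) (or 1/rai) and 1/(r+sk); the t1..t4 terms
         * evaluate the integral (accumulated into sum_ai and work[]) and
         * its distance derivative (streamed to dadx for the force
         * chain rule).
         */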
565         for (k = nj0; k < nj1-4-offset; k += 8)
566         {
567             jnrA        = jjnr[k];
568             jnrB        = jjnr[k+1];
569             jnrC        = jjnr[k+2];
570             jnrD        = jjnr[k+3];
571             jnrE        = jjnr[k+4];
572             jnrF        = jjnr[k+5];
573             jnrG        = jjnr[k+6];
574             jnrH        = jjnr[k+7];
575
576             j3A         = 3*jnrA;
577             j3B         = 3*jnrB;
578             j3C         = 3*jnrC;
579             j3D         = 3*jnrD;
580             j3E         = 3*jnrE;
581             j3F         = 3*jnrF;
582             j3G         = 3*jnrG;
583             j3H         = 3*jnrH;
584
585             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
586             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
587             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
588             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
589             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
590             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
591
592             dx    = _mm_sub_ps(ix, jx);
593             dy    = _mm_sub_ps(iy, jy);
594             dz    = _mm_sub_ps(iz, jz);
595             dxB   = _mm_sub_ps(ix, jxB);
596             dyB   = _mm_sub_ps(iy, jyB);
597             dzB   = _mm_sub_ps(iz, jzB);
598
599             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
600             rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
601
602             rinv        = gmx_mm_invsqrt_ps(rsq);
603             r           = _mm_mul_ps(rsq, rinv);
604             rinvB       = gmx_mm_invsqrt_ps(rsqB);
605             rB          = _mm_mul_ps(rsqB, rinvB);
606
607             /* Compute raj_inv aj1-4 */
608             raj_inv     = gmx_mm_inv_ps(raj);
609             raj_invB    = gmx_mm_inv_ps(rajB);
610
611             /* Evaluate influence of atom aj -> ai */
612             t1            = _mm_add_ps(r, sk_aj);
613             t2            = _mm_sub_ps(r, sk_aj);
614             t3            = _mm_sub_ps(sk_aj, r);
615             t1B           = _mm_add_ps(rB, sk_ajB);
616             t2B           = _mm_sub_ps(rB, sk_ajB);
617             t3B           = _mm_sub_ps(sk_ajB, rB);
618             obc_mask1     = _mm_cmplt_ps(rai, t1);
619             obc_mask2     = _mm_cmplt_ps(rai, t2);
620             obc_mask3     = _mm_cmplt_ps(rai, t3);
621             obc_mask1B    = _mm_cmplt_ps(rai, t1B);
622             obc_mask2B    = _mm_cmplt_ps(rai, t2B);
623             obc_mask3B    = _mm_cmplt_ps(rai, t3B);
624
625             uij           = gmx_mm_inv_ps(t1);
626             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
627                                          _mm_andnot_ps(obc_mask2, rai_inv));
628             dlij          = _mm_and_ps(one, obc_mask2);
629             uij2          = _mm_mul_ps(uij, uij);
630             uij3          = _mm_mul_ps(uij2, uij);
631             lij2          = _mm_mul_ps(lij, lij);
632             lij3          = _mm_mul_ps(lij2, lij);
633
634             uijB          = gmx_mm_inv_ps(t1B);
635             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
636                                          _mm_andnot_ps(obc_mask2B, rai_inv));
637             dlijB         = _mm_and_ps(one, obc_mask2B);
638             uij2B         = _mm_mul_ps(uijB, uijB);
639             uij3B         = _mm_mul_ps(uij2B, uijB);
640             lij2B         = _mm_mul_ps(lijB, lijB);
641             lij3B         = _mm_mul_ps(lij2B, lijB);
642
643             diff2         = _mm_sub_ps(uij2, lij2);
644             lij_inv       = gmx_mm_invsqrt_ps(lij2);
645             sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
646             sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
647             prod          = _mm_mul_ps(onefourth, sk2_rinv);
648
649             diff2B        = _mm_sub_ps(uij2B, lij2B);
650             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
651             sk2_ajB       = _mm_mul_ps(sk_ajB, sk_ajB);
652             sk2_rinvB     = _mm_mul_ps(sk2_ajB, rinvB);
653             prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
654
655             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
656             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
657
658             t1            = _mm_sub_ps(lij, uij);
659             t2            = _mm_mul_ps(diff2,
660                                        _mm_sub_ps(_mm_mul_ps(onefourth, r),
661                                                   prod));
662             t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
663             t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
664             t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
665             t4            = _mm_and_ps(t4, obc_mask3);
666             t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
667
668             t1B           = _mm_sub_ps(lijB, uijB);
669             t2B           = _mm_mul_ps(diff2B,
670                                        _mm_sub_ps(_mm_mul_ps(onefourth, rB),
671                                                   prodB));
672             t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
673             t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
674             t4B           = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
675             t4B           = _mm_and_ps(t4B, obc_mask3B);
676             t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
677
678             sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
679
680             t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
681                                        _mm_mul_ps(prod, lij3));
682             t1            = _mm_sub_ps(t1,
683                                        _mm_mul_ps(onefourth,
684                                                   _mm_add_ps(_mm_mul_ps(lij, rinv),
685                                                              _mm_mul_ps(lij3, r))));
686             t2            = _mm_mul_ps(onefourth,
687                                        _mm_add_ps(_mm_mul_ps(uij, rinv),
688                                                   _mm_mul_ps(uij3, r)));
689             t2            = _mm_sub_ps(t2,
690                                        _mm_add_ps(_mm_mul_ps(half, uij2),
691                                                   _mm_mul_ps(prod, uij3)));
692             t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
693                                        _mm_mul_ps(rinv, rinv));
694             t3            = _mm_sub_ps(t3,
695                                        _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
696                                                   _mm_add_ps(one,
697                                                              _mm_mul_ps(sk2_rinv, rinv))));
698             t1            = _mm_mul_ps(rinv,
699                                        _mm_add_ps(_mm_mul_ps(dlij, t1),
700                                                   _mm_add_ps(t2, t3)));
701
702
703
704             t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
705                                        _mm_mul_ps(prodB, lij3B));
706             t1B           = _mm_sub_ps(t1B,
707                                        _mm_mul_ps(onefourth,
708                                                   _mm_add_ps(_mm_mul_ps(lijB, rinvB),
709                                                              _mm_mul_ps(lij3B, rB))));
710             t2B           = _mm_mul_ps(onefourth,
711                                        _mm_add_ps(_mm_mul_ps(uijB, rinvB),
712                                                   _mm_mul_ps(uij3B, rB)));
713             t2B           = _mm_sub_ps(t2B,
714                                        _mm_add_ps(_mm_mul_ps(half, uij2B),
715                                                   _mm_mul_ps(prodB, uij3B)));
716             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
717                                        _mm_mul_ps(rinvB, rinvB));
718             t3B           = _mm_sub_ps(t3B,
719                                        _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
720                                                   _mm_add_ps(one,
721                                                              _mm_mul_ps(sk2_rinvB, rinvB))));
722             t1B           = _mm_mul_ps(rinvB,
723                                        _mm_add_ps(_mm_mul_ps(dlijB, t1B),
724                                                   _mm_add_ps(t2B, t3B)));
725
726             dadx1         = _mm_and_ps(t1, obc_mask1);
727             dadx1B        = _mm_and_ps(t1B, obc_mask1B);
728
729
730             /* Evaluate influence of atom ai -> aj */
731             t1            = _mm_add_ps(r, sk_ai);
732             t2            = _mm_sub_ps(r, sk_ai);
733             t3            = _mm_sub_ps(sk_ai, r);
734             t1B           = _mm_add_ps(rB, sk_ai);
735             t2B           = _mm_sub_ps(rB, sk_ai);
736             t3B           = _mm_sub_ps(sk_ai, rB);
737             obc_mask1     = _mm_cmplt_ps(raj, t1);
738             obc_mask2     = _mm_cmplt_ps(raj, t2);
739             obc_mask3     = _mm_cmplt_ps(raj, t3);
740             obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
741             obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
742             obc_mask3B    = _mm_cmplt_ps(rajB, t3B);
743
744             uij           = gmx_mm_inv_ps(t1);
745             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
746                                          _mm_andnot_ps(obc_mask2, raj_inv));
747             dlij          = _mm_and_ps(one, obc_mask2);
748             uij2          = _mm_mul_ps(uij, uij);
749             uij3          = _mm_mul_ps(uij2, uij);
750             lij2          = _mm_mul_ps(lij, lij);
751             lij3          = _mm_mul_ps(lij2, lij);
752
753             uijB          = gmx_mm_inv_ps(t1B);
754             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
755                                          _mm_andnot_ps(obc_mask2B, raj_invB));
756             dlijB         = _mm_and_ps(one, obc_mask2B);
757             uij2B         = _mm_mul_ps(uijB, uijB);
758             uij3B         = _mm_mul_ps(uij2B, uijB);
759             lij2B         = _mm_mul_ps(lijB, lijB);
760             lij3B         = _mm_mul_ps(lij2B, lijB);
761
762             diff2         = _mm_sub_ps(uij2, lij2);
763             lij_inv       = gmx_mm_invsqrt_ps(lij2);
764             sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
765             prod          = _mm_mul_ps(onefourth, sk2_rinv);
766
767             diff2B        = _mm_sub_ps(uij2B, lij2B);
768             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
769             sk2_rinvB     = _mm_mul_ps(sk2_ai, rinvB);
770             prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
771
772             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
773             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
774
775             t1            = _mm_sub_ps(lij, uij);
776             t2            = _mm_mul_ps(diff2,
777                                        _mm_sub_ps(_mm_mul_ps(onefourth, r),
778                                                   prod));
779             t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
780             t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
781             t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
782             t4            = _mm_and_ps(t4, obc_mask3);
783             t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
784
785             t1B           = _mm_sub_ps(lijB, uijB);
786             t2B           = _mm_mul_ps(diff2B,
787                                        _mm_sub_ps(_mm_mul_ps(onefourth, rB),
788                                                   prodB));
789             t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
790             t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
791             t4B           = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
792             t4B           = _mm_and_ps(t4B, obc_mask3B);
793             t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
794
795             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
796             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
797
798             t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
799                                        _mm_mul_ps(prod, lij3));
800             t1            = _mm_sub_ps(t1,
801                                        _mm_mul_ps(onefourth,
802                                                   _mm_add_ps(_mm_mul_ps(lij, rinv),
803                                                              _mm_mul_ps(lij3, r))));
804             t2            = _mm_mul_ps(onefourth,
805                                        _mm_add_ps(_mm_mul_ps(uij, rinv),
806                                                   _mm_mul_ps(uij3, r)));
807             t2            = _mm_sub_ps(t2,
808                                        _mm_add_ps(_mm_mul_ps(half, uij2),
809                                                   _mm_mul_ps(prod, uij3)));
810             t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
811                                        _mm_mul_ps(rinv, rinv));
812             t3            = _mm_sub_ps(t3,
813                                        _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
814                                                   _mm_add_ps(one,
815                                                              _mm_mul_ps(sk2_rinv, rinv))));
816             t1            = _mm_mul_ps(rinv,
817                                        _mm_add_ps(_mm_mul_ps(dlij, t1),
818                                                   _mm_add_ps(t2, t3)));
819
820
821             t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
822                                        _mm_mul_ps(prodB, lij3B));
823             t1B           = _mm_sub_ps(t1B,
824                                        _mm_mul_ps(onefourth,
825                                                   _mm_add_ps(_mm_mul_ps(lijB, rinvB),
826                                                              _mm_mul_ps(lij3B, rB))));
827             t2B           = _mm_mul_ps(onefourth,
828                                        _mm_add_ps(_mm_mul_ps(uijB, rinvB),
829                                                   _mm_mul_ps(uij3B, rB)));
830             t2B           = _mm_sub_ps(t2B,
831                                        _mm_add_ps(_mm_mul_ps(half, uij2B),
832                                                   _mm_mul_ps(prodB, uij3B)));
833             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
834                                        _mm_mul_ps(rinvB, rinvB));
835             t3B           = _mm_sub_ps(t3B,
836                                        _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
837                                                   _mm_add_ps(one,
838                                                              _mm_mul_ps(sk2_rinvB, rinvB))));
839             t1B           = _mm_mul_ps(rinvB,
840                                        _mm_add_ps(_mm_mul_ps(dlijB, t1B),
841                                                   _mm_add_ps(t2B, t3B)));
842
843
844             dadx2         = _mm_and_ps(t1, obc_mask1);
845             dadx2B        = _mm_and_ps(t1B, obc_mask1B);
846
847             _mm_store_ps(dadx, dadx1);
848             dadx += 4;
849             _mm_store_ps(dadx, dadx2);
850             dadx += 4;
851             _mm_store_ps(dadx, dadx1B);
852             dadx += 4;
853             _mm_store_ps(dadx, dadx2B);
854             dadx += 4;
855
856         } /* end normal inner loop */
857
858         for (; k < nj1-offset; k += 4)
859         {
860             jnrA        = jjnr[k];
861             jnrB        = jjnr[k+1];
862             jnrC        = jjnr[k+2];
863             jnrD        = jjnr[k+3];
864
865             j3A         = 3*jnrA;
866             j3B         = 3*jnrB;
867             j3C         = 3*jnrC;
868             j3D         = 3*jnrD;
869
870             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
871             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
872             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
873
874             dx    = _mm_sub_ps(ix, jx);
875             dy    = _mm_sub_ps(iy, jy);
876             dz    = _mm_sub_ps(iz, jz);
877
878             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
879
880             rinv        = gmx_mm_invsqrt_ps(rsq);
881             r           = _mm_mul_ps(rsq, rinv);
882
883             /* Compute raj_inv aj1-4 */
884             raj_inv     = gmx_mm_inv_ps(raj);
885
886             /* Evaluate influence of atom aj -> ai */
887             t1            = _mm_add_ps(r, sk_aj);
888             obc_mask1     = _mm_cmplt_ps(rai, t1);
889
890             if (_mm_movemask_ps(obc_mask1))
891             {
892                 /* If any of the elements has rai<dr+sk, this is executed */
893                 t2            = _mm_sub_ps(r, sk_aj);
894                 t3            = _mm_sub_ps(sk_aj, r);
895
896                 obc_mask2     = _mm_cmplt_ps(rai, t2);
897                 obc_mask3     = _mm_cmplt_ps(rai, t3);
898
899                 uij           = gmx_mm_inv_ps(t1);
900                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
901                                              _mm_andnot_ps(obc_mask2, rai_inv));
902                 dlij          = _mm_and_ps(one, obc_mask2);
903                 uij2          = _mm_mul_ps(uij, uij);
904                 uij3          = _mm_mul_ps(uij2, uij);
905                 lij2          = _mm_mul_ps(lij, lij);
906                 lij3          = _mm_mul_ps(lij2, lij);
907                 diff2         = _mm_sub_ps(uij2, lij2);
908                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
909                 sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
910                 sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
911                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
912                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
913                 t1            = _mm_sub_ps(lij, uij);
914                 t2            = _mm_mul_ps(diff2,
915                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
916                                                       prod));
917                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
918                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
919                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
920                 t4            = _mm_and_ps(t4, obc_mask3);
921                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
922                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
923                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
924                                            _mm_mul_ps(prod, lij3));
925                 t1            = _mm_sub_ps(t1,
926                                            _mm_mul_ps(onefourth,
927                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
928                                                                  _mm_mul_ps(lij3, r))));
929                 t2            = _mm_mul_ps(onefourth,
930                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
931                                                       _mm_mul_ps(uij3, r)));
932                 t2            = _mm_sub_ps(t2,
933                                            _mm_add_ps(_mm_mul_ps(half, uij2),
934                                                       _mm_mul_ps(prod, uij3)));
935                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
936                                            _mm_mul_ps(rinv, rinv));
937                 t3            = _mm_sub_ps(t3,
938                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
939                                                       _mm_add_ps(one,
940                                                                  _mm_mul_ps(sk2_rinv, rinv))));
941                 t1            = _mm_mul_ps(rinv,
942                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
943                                                       _mm_add_ps(t2, t3)));
944
945                 dadx1         = _mm_and_ps(t1, obc_mask1);
946             }
947             else
948             {
949                 dadx1         = _mm_setzero_ps();
950             }
951
952             /* Evaluate influence of atom ai -> aj */
953             t1            = _mm_add_ps(r, sk_ai);
954             obc_mask1     = _mm_cmplt_ps(raj, t1);
955
956             if (_mm_movemask_ps(obc_mask1))
957             {
958                 t2            = _mm_sub_ps(r, sk_ai);
959                 t3            = _mm_sub_ps(sk_ai, r);
960                 obc_mask2     = _mm_cmplt_ps(raj, t2);
961                 obc_mask3     = _mm_cmplt_ps(raj, t3);
962
963                 uij           = gmx_mm_inv_ps(t1);
964                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
965                                              _mm_andnot_ps(obc_mask2, raj_inv));
966                 dlij          = _mm_and_ps(one, obc_mask2);
967                 uij2          = _mm_mul_ps(uij, uij);
968                 uij3          = _mm_mul_ps(uij2, uij);
969                 lij2          = _mm_mul_ps(lij, lij);
970                 lij3          = _mm_mul_ps(lij2, lij);
971                 diff2         = _mm_sub_ps(uij2, lij2);
972                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
973                 sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
974                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
975                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
976                 t1            = _mm_sub_ps(lij, uij);
977                 t2            = _mm_mul_ps(diff2,
978                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
979                                                       prod));
980                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
981                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
982                 t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
983                 t4            = _mm_and_ps(t4, obc_mask3);
984                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
985
986                 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
987
988                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
989                                            _mm_mul_ps(prod, lij3));
990                 t1            = _mm_sub_ps(t1,
991                                            _mm_mul_ps(onefourth,
992                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
993                                                                  _mm_mul_ps(lij3, r))));
994                 t2            = _mm_mul_ps(onefourth,
995                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
996                                                       _mm_mul_ps(uij3, r)));
997                 t2            = _mm_sub_ps(t2,
998                                            _mm_add_ps(_mm_mul_ps(half, uij2),
999                                                       _mm_mul_ps(prod, uij3)));
1000                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1001                                            _mm_mul_ps(rinv, rinv));
1002                 t3            = _mm_sub_ps(t3,
1003                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1004                                                       _mm_add_ps(one,
1005                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1006                 t1            = _mm_mul_ps(rinv,
1007                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1008                                                       _mm_add_ps(t2, t3)));
1009                 dadx2         = _mm_and_ps(t1, obc_mask1);
1010             }
1011             else
1012             {
1013                 dadx2         = _mm_setzero_ps();
1014             }
1015
1016             _mm_store_ps(dadx, dadx1);
1017             dadx += 4;
1018             _mm_store_ps(dadx, dadx2);
1019             dadx += 4;
1020         } /* end normal inner loop */
1021
1022         if (offset != 0)
1023         {
1024             if (offset == 1)
1025             {
1026                 jnrA        = jjnr[k];
1027                 j3A         = 3*jnrA;
1028                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1029                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1030                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1031                 mask        = mask1;
1032             }
1033             else if (offset == 2)
1034             {
1035                 jnrA        = jjnr[k];
1036                 jnrB        = jjnr[k+1];
1037                 j3A         = 3*jnrA;
1038                 j3B         = 3*jnrB;
1039                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1040                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1041                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1042                 mask        = mask2;
1043             }
1044             else
1045             {
1046                 /* offset must be 3 */
1047                 jnrA        = jjnr[k];
1048                 jnrB        = jjnr[k+1];
1049                 jnrC        = jjnr[k+2];
1050                 j3A         = 3*jnrA;
1051                 j3B         = 3*jnrB;
1052                 j3C         = 3*jnrC;
1053                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1054                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1055                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1056                 mask        = mask3;
1057             }
1058
1059             dx    = _mm_sub_ps(ix, jx);
1060             dy    = _mm_sub_ps(iy, jy);
1061             dz    = _mm_sub_ps(iz, jz);
1062
1063             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
1064
1065             rinv        = gmx_mm_invsqrt_ps(rsq);
1066             r           = _mm_mul_ps(rsq, rinv);
1067
1068             /* Compute raj_inv aj1-4 */
1069             raj_inv     = gmx_mm_inv_ps(raj);
1070
1071             /* Evaluate influence of atom aj -> ai */
1072             t1            = _mm_add_ps(r, sk_aj);
1073             obc_mask1     = _mm_cmplt_ps(rai, t1);
1074             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1075
1076             if (_mm_movemask_ps(obc_mask1))
1077             {
1078                 t2            = _mm_sub_ps(r, sk_aj);
1079                 t3            = _mm_sub_ps(sk_aj, r);
1080                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1081                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1082
1083                 uij           = gmx_mm_inv_ps(t1);
1084                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1085                                              _mm_andnot_ps(obc_mask2, rai_inv));
1086                 dlij           = _mm_and_ps(one, obc_mask2);
1087                 uij2           = _mm_mul_ps(uij, uij);
1088                 uij3           = _mm_mul_ps(uij2, uij);
1089                 lij2           = _mm_mul_ps(lij, lij);
1090                 lij3           = _mm_mul_ps(lij2, lij);
1091                 diff2          = _mm_sub_ps(uij2, lij2);
1092                 lij_inv        = gmx_mm_invsqrt_ps(lij2);
1093                 sk2_aj         = _mm_mul_ps(sk_aj, sk_aj);
1094                 sk2_rinv       = _mm_mul_ps(sk2_aj, rinv);
1095                 prod           = _mm_mul_ps(onefourth, sk2_rinv);
1096                 logterm        = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1097                 t1             = _mm_sub_ps(lij, uij);
1098                 t2             = _mm_mul_ps(diff2,
1099                                             _mm_sub_ps(_mm_mul_ps(onefourth, r),
1100                                                        prod));
1101                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1102                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1103                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1104                 t4            = _mm_and_ps(t4, obc_mask3);
1105                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1106                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1107                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1108                                            _mm_mul_ps(prod, lij3));
1109                 t1            = _mm_sub_ps(t1,
1110                                            _mm_mul_ps(onefourth,
1111                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1112                                                                  _mm_mul_ps(lij3, r))));
1113                 t2            = _mm_mul_ps(onefourth,
1114                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1115                                                       _mm_mul_ps(uij3, r)));
1116                 t2            = _mm_sub_ps(t2,
1117                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1118                                                       _mm_mul_ps(prod, uij3)));
1119                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1120                                            _mm_mul_ps(rinv, rinv));
1121                 t3            = _mm_sub_ps(t3,
1122                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1123                                                       _mm_add_ps(one,
1124                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1125                 t1            = _mm_mul_ps(rinv,
1126                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1127                                                       _mm_add_ps(t2, t3)));
1128                 dadx1         = _mm_and_ps(t1, obc_mask1);
1129             }
1130             else
1131             {
1132                 dadx1         = _mm_setzero_ps();
1133             }
1134
1135             /* Evaluate influence of atom ai -> aj */
1136             t1            = _mm_add_ps(r, sk_ai);
1137             obc_mask1     = _mm_cmplt_ps(raj, t1);
1138             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1139
1140             if (_mm_movemask_ps(obc_mask1))
1141             {
1142                 t2            = _mm_sub_ps(r, sk_ai);
1143                 t3            = _mm_sub_ps(sk_ai, r);
1144                 obc_mask2     = _mm_cmplt_ps(raj, t2);
1145                 obc_mask3     = _mm_cmplt_ps(raj, t3);
1146
1147                 uij           = gmx_mm_inv_ps(t1);
1148                 lij           = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1149                                           _mm_andnot_ps(obc_mask2, raj_inv));
1150                 dlij          = _mm_and_ps(one, obc_mask2);
1151                 uij2          = _mm_mul_ps(uij, uij);
1152                 uij3          = _mm_mul_ps(uij2, uij);
1153                 lij2          = _mm_mul_ps(lij, lij);
1154                 lij3          = _mm_mul_ps(lij2, lij);
1155                 diff2         = _mm_sub_ps(uij2, lij2);
1156                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1157                 sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
1158                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
1159                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1160                 t1            = _mm_sub_ps(lij, uij);
1161                 t2            = _mm_mul_ps(diff2,
1162                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
1163                                                       prod));
1164                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1165                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1166                 t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
1167                 t4            = _mm_and_ps(t4, obc_mask3);
1168                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1169
1170                 tmp           = _mm_and_ps(t1, obc_mask1);
1171
1172                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1173                                            _mm_mul_ps(prod, lij3));
1174                 t1            = _mm_sub_ps(t1,
1175                                            _mm_mul_ps(onefourth,
1176                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1177                                                                  _mm_mul_ps(lij3, r))));
1178                 t2            = _mm_mul_ps(onefourth,
1179                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1180                                                       _mm_mul_ps(uij3, r)));
1181                 t2            = _mm_sub_ps(t2,
1182                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1183                                                       _mm_mul_ps(prod, uij3)));
1184                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1185                                            _mm_mul_ps(rinv, rinv));
1186                 t3            = _mm_sub_ps(t3,
1187                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1188                                                       _mm_add_ps(one,
1189                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1190                 t1            = _mm_mul_ps(rinv,
1191                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1192                                                       _mm_add_ps(t2, t3)));
1193                 dadx2         = _mm_and_ps(t1, obc_mask1);
1194             }
1195             else
1196             {
1197                 dadx2         = _mm_setzero_ps();
1198                 tmp           = _mm_setzero_ps();
1199             }
1200
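            /* Stream both derivative vectors into the dadx buffer in the
             * order the chain-rule kernel reads them back: four floats of
             * ai terms (dadx1) followed by four floats of aj terms (dadx2).
             */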
1201             _mm_store_ps(dadx, dadx1);
1202             dadx += 4;
1203             _mm_store_ps(dadx, dadx2);
1204             dadx += 4;
1205
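            /* Scatter the masked ai -> aj contributions in tmp back to the
             * per-atom work array; only the first 'offset' lanes are valid
             * in this remainder iteration.
             */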
1206             if (offset == 1)
1207             {
1208                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
1209             }
1210             else if (offset == 2)
1211             {
1212                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
1213             }
1214             else
1215             {
1216                 /* offset must be 3 */
1217                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
1218             }
1219
1220         }
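        /* Horizontally sum the four lanes of sum_ai and add the result to
         * atom ii's entry in the work array.
         */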
1221         GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
1222
1223     }
1224
1225     /* Parallel summations */
1226     if (DOMAINDECOMP(cr))
1227     {
1228         dd_atom_sum_real(cr->dd, work);
1229     }
1230
1231     if (gb_algorithm == egbHCT)
1232     {
1233         /* HCT */
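        /* HCT inverse Born radius: 1/(rho_i - doffset) minus the pairwise
         * descreening sum collected in work[i]; the resulting radius is
         * clamped from below at the full van der Waals radius
         * (min_rad = rr + doffset).
         */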
1234         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1235         {
1236             if (born->use[i] != 0)
1237             {
1238                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1239                 sum     = 1.0/rr - work[i];
1240                 min_rad = rr + doffset;
1241                 rad     = 1.0/sum;
1242
1243                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
1244                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1245             }
1246         }
1247
1248         /* Extra communication required for DD */
1249         if (DOMAINDECOMP(cr))
1250         {
1251             dd_atom_spread_real(cr->dd, born->bRad);
1252             dd_atom_spread_real(cr->dd, fr->invsqrta);
1253         }
1254     }
1255     else
1256     {
1257         /* OBC */
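        /* OBC rescales the HCT sum through a tanh switching function:
         *   1/R_i = 1/(rho_i - doffset)
         *           - tanh(alpha*psi - beta*psi^2 + gamma*psi^3) / rho_i
         * with psi = (rho_i - doffset)*work[i]. drobc stores the derivative
         * of the tanh term needed later by the chain rule.
         */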
1258         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1259         {
1260             if (born->use[i] != 0)
1261             {
1262                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
1263                 rr_inv2 = 1.0/rr;     /* 1/rho_i; not squared, despite the name */
1264                 rr      = rr-doffset; /* reduced radius rho_i - doffset */
1265                 rr_inv  = 1.0/rr;     /* 1/(rho_i - doffset) */
1266                 sum     = rr * work[i];
1267                 sum2    = sum  * sum;
1268                 sum3    = sum2 * sum;
1269
1270                 tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1271                 born->bRad[i] = rr_inv - tsum*rr_inv2;
1272                 born->bRad[i] = 1.0 / born->bRad[i];
1273
1274                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1275
1276                 tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1277                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1278             }
1279         }
1280         /* Extra (local) communication required for DD */
1281         if (DOMAINDECOMP(cr))
1282         {
1283             dd_atom_spread_real(cr->dd, born->bRad);
1284             dd_atom_spread_real(cr->dd, fr->invsqrta);
1285             dd_atom_spread_real(cr->dd, born->drobc);
1286         }
1287     }
1288
1289
1290
1291     return 0;
1292 }
1293
1294
1295
1296 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1297                                     float *x, float *f, float *fshift, float *shiftvec,
1298                                     int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1299 {
1300     int          i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
1301     int          jnrA, jnrB, jnrC, jnrD;
1302     int          j3A, j3B, j3C, j3D;
1303     int          jnrE, jnrF, jnrG, jnrH;
1304     int          j3E, j3F, j3G, j3H;
1305     int         *jjnr;
1306
1307     float        rbi, shX, shY, shZ;
1308     float       *rb;
1309
1310     __m128       ix, iy, iz;
1311     __m128       jx, jy, jz;
1312     __m128       jxB, jyB, jzB;
1313     __m128       fix, fiy, fiz;
1314     __m128       dx, dy, dz;
1315     __m128       tx, ty, tz;
1316     __m128       dxB, dyB, dzB;
1317     __m128       txB, tyB, tzB;
1318
1319     __m128       rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
1320     __m128       xmm1, xmm2, xmm3;
1321
1322     const __m128 two = _mm_set1_ps(2.0f);
1323
1324     rb     = born->work;
1325
1326     jjnr   = nl->jjnr;
1327
1328     /* Loop to get the proper form for the Born radius term, SSE style */
1329     offset = natoms%4;
1330
1331     n0 = 0;
1332     n1 = natoms;
1333
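    /* Premultiply dvda[i] by the algorithm-specific radius factor
     * (2*R_i^2/ONE_4PI_EPS0 for Still, R_i^2 for HCT, R_i^2*drobc[i] for
     * OBC) so the pair loop below needs only one multiply per interaction.
     */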
1334     if (gb_algorithm == egbSTILL)
1335     {
1336         for (i = n0; i < n1; i++)
1337         {
1338             rbi   = born->bRad[i];
1339             rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1340         }
1341     }
1342     else if (gb_algorithm == egbHCT)
1343     {
1344         for (i = n0; i < n1; i++)
1345         {
1346             rbi   = born->bRad[i];
1347             rb[i] = rbi * rbi * dvda[i];
1348         }
1349     }
1350     else if (gb_algorithm == egbOBC)
1351     {
1352         for (i = n0; i < n1; i++)
1353         {
1354             rbi   = born->bRad[i];
1355             rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1356         }
1357     }
1358
1359     jz = _mm_setzero_ps();
1360
1361     n = j3A = j3B = j3C = j3D = 0;
1362
1363     for (i = 0; i < nl->nri; i++)
1364     {
1365         ii     = nl->iinr[i];
1366         ii3    = ii*3;
1367         is3    = 3*nl->shift[i];
1368         shX    = shiftvec[is3];
1369         shY    = shiftvec[is3+1];
1370         shZ    = shiftvec[is3+2];
1371         nj0    = nl->jindex[i];
1372         nj1    = nl->jindex[i+1];
1373
1374         ix     = _mm_set1_ps(shX+x[ii3+0]);
1375         iy     = _mm_set1_ps(shY+x[ii3+1]);
1376         iz     = _mm_set1_ps(shZ+x[ii3+2]);
1377
1378         offset = (nj1-nj0)%4;
1379
1380         rbai   = _mm_load1_ps(rb+ii);
1381         fix    = _mm_setzero_ps();
1382         fiy    = _mm_setzero_ps();
1383         fiz    = _mm_setzero_ps();
1384
1385
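        /* Main loop: process j particles four at a time; the 1-3 leftover
         * particles (offset) are handled in the epilogue below.
         */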
1386         for (k = nj0; k < nj1-offset; k += 4)
1387         {
1388             jnrA        = jjnr[k];
1389             jnrB        = jjnr[k+1];
1390             jnrC        = jjnr[k+2];
1391             jnrD        = jjnr[k+3];
1392
1393             j3A         = 3*jnrA;
1394             j3B         = 3*jnrB;
1395             j3C         = 3*jnrC;
1396             j3D         = 3*jnrD;
1397
1398             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
1399
1400             dx          = _mm_sub_ps(ix, jx);
1401             dy          = _mm_sub_ps(iy, jy);
1402             dz          = _mm_sub_ps(iz, jz);
1403
1404             GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
1405
1406             /* load chain rule terms for j1-4 */
1407             f_gb        = _mm_load_ps(dadx);
1408             dadx       += 4;
1409             f_gb_ai     = _mm_load_ps(dadx);
1410             dadx       += 4;
1411
1412             /* calculate scalar force */
1413             f_gb    = _mm_mul_ps(f_gb, rbai);
1414             f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1415             f_gb    = _mm_add_ps(f_gb, f_gb_ai);
1416
1417             tx     = _mm_mul_ps(f_gb, dx);
1418             ty     = _mm_mul_ps(f_gb, dy);
1419             tz     = _mm_mul_ps(f_gb, dz);
1420
1421             fix    = _mm_add_ps(fix, tx);
1422             fiy    = _mm_add_ps(fiy, ty);
1423             fiz    = _mm_add_ps(fiz, tz);
1424
1425             GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
1426         }
1427
1428             /* deal with odd elements */
1429         if (offset != 0)
1430         {
1431             if (offset == 1)
1432             {
1433                 jnrA        = jjnr[k];
1434                 j3A         = 3*jnrA;
1435                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1436                 GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
1437             }
1438             else if (offset == 2)
1439             {
1440                 jnrA        = jjnr[k];
1441                 jnrB        = jjnr[k+1];
1442                 j3A         = 3*jnrA;
1443                 j3B         = 3*jnrB;
1444                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1445                 GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
1446             }
1447             else
1448             {
1449                 /* offset must be 3 */
1450                 jnrA        = jjnr[k];
1451                 jnrB        = jjnr[k+1];
1452                 jnrC        = jjnr[k+2];
1453                 j3A         = 3*jnrA;
1454                 j3B         = 3*jnrB;
1455                 j3C         = 3*jnrC;
1456                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1457                 GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
1458             }
1459
1460             dx          = _mm_sub_ps(ix, jx);
1461             dy          = _mm_sub_ps(iy, jy);
1462             dz          = _mm_sub_ps(iz, jz);
1463
1464             /* load chain rule terms for the remaining 1-3 j particles */
1465             f_gb        = _mm_load_ps(dadx);
1466             dadx       += 4;
1467             f_gb_ai     = _mm_load_ps(dadx);
1468             dadx       += 4;
1469
1470             /* calculate scalar force */
1471             f_gb    = _mm_mul_ps(f_gb, rbai);
1472             f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1473             f_gb    = _mm_add_ps(f_gb, f_gb_ai);
1474
1475             tx     = _mm_mul_ps(f_gb, dx);
1476             ty     = _mm_mul_ps(f_gb, dy);
1477             tz     = _mm_mul_ps(f_gb, dz);
1478
1479             fix    = _mm_add_ps(fix, tx);
1480             fiy    = _mm_add_ps(fiy, ty);
1481             fiz    = _mm_add_ps(fiz, tz);
1482
1483             if (offset == 1)
1484             {
1485                 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
1486             }
1487             else if (offset == 2)
1488             {
1489                 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
1490             }
1491             else
1492             {
1493                 /* offset must be 3 */
1494                 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
1495             }
1496         }
1497
1498         /* fix/fiy/fiz now contain four partial force terms that should all be
1499          * added to the i particle forces and shift forces.
1500          */
1501         gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
1502     }
1503
1504     return 0;
1505 }
1506
1507
1508 #else
1509 /* keep compiler happy */
1510 int genborn_sse_dummy;
1511
1512 #endif /* SSE intrinsics available */