src/gromacs/mdlib/genborn_sse2_single.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2008, The GROMACS development team.
   6  * Copyright (c) 2013,2014, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #ifdef HAVE_CONFIG_H
  38 #include <config.h>
  39 #endif
  40
  41 #include <math.h>
  42 #include <string.h>
  43
  44 #include "typedefs.h"
  45 #include "smalloc.h"
  46 #include "genborn.h"
  47 #include "vec.h"
  48 #include "gromacs/fileio/pdbio.h"
  49 #include "names.h"
  50 #include "physics.h"
  51 #include "partdec.h"
  52 #include "domdec.h"
  53 #include "network.h"
  54 #include "gmx_fatal.h"
  55 #include "mtop_util.h"
  56 #include "genborn.h"
  57
  58 #include "gromacs/utility/gmxmpi.h"
  59
  60
  61 /* Only compile this file if SSE intrinsics are available */
  62 #if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
  63
  64 #include <gmx_sse2_single.h>
  65 #include <emmintrin.h>
  66
  67 #include "genborn_sse2_single.h"
  68
  69
  70 int
  71 calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
  72                               int natoms, gmx_localtop_t *top,
  73                               float *x, t_nblist *nl,
  74                               gmx_genborn_t *born)
  75 {
  76     int          i, k, n, ii, is3, ii3, nj0, nj1, offset;
  77     int          jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
  78     int          jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
  79     int          shift;
  80     int         *mdtype;
  81     real         shX, shY, shZ;
  82     int         *jjnr;
  83     real        *shiftvec;
  84
  85     float        gpi_ai, gpi2;
  86     float        factor;
  87     float       *gb_radius;
  88     float       *vsolv;
  89     float       *work;
  90     float       *dadx;
  91
  92     __m128       ix, iy, iz;
  93     __m128       jx, jy, jz;
  94     __m128       dx, dy, dz;
  95     __m128       tx, ty, tz;
  96     __m128       jxB, jyB, jzB;
  97     __m128       dxB, dyB, dzB;
  98     __m128       txB, tyB, tzB;
  99     __m128       rsq, rinv, rinv2, rinv4, rinv6;
 100     __m128       rsqB, rinvB, rinv2B, rinv4B, rinv6B;
 101     __m128       ratio, gpi, rai, raj, vai, vaj, rvdw;
 102     __m128       ratioB, rajB, vajB, rvdwB;
 103     __m128       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
 104     __m128       ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
 105     __m128       mask, icf4, icf6, mask_cmp;
 106     __m128       icf4B, icf6B, mask_cmpB;
 107
 108     __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
 109     __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
 110     __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
 111
 112     const __m128 half   = _mm_set1_ps(0.5f);
 113     const __m128 three  = _mm_set1_ps(3.0f);
 114     const __m128 one    = _mm_set1_ps(1.0f);
 115     const __m128 two    = _mm_set1_ps(2.0f);
 116     const __m128 zero   = _mm_set1_ps(0.0f);
 117     const __m128 four   = _mm_set1_ps(4.0f);
 118
 119     const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
 120     const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
 121     const __m128 still_p4     = _mm_set1_ps(STILL_P4);
 122
 123     factor  = 0.5 * ONE_4PI_EPS0;
 124
 125     gb_radius = born->gb_radius;
 126     vsolv     = born->vsolv;
 127     work      = born->gpol_still_work;
 128     jjnr      = nl->jjnr;
 129     shiftvec  = fr->shift_vec[0];
 130     dadx      = fr->dadx;
 131
 132     jnrA = jnrB = jnrC = jnrD = 0;
 133     jx   = _mm_setzero_ps();
 134     jy   = _mm_setzero_ps();
 135     jz   = _mm_setzero_ps();
 136
 137     n = 0;
 138
 139     for (i = 0; i < natoms; i++)
 140     {
 141         work[i] = 0;
 142     }
 143
 144     for (i = 0; i < nl->nri; i++)
 145     {
 146         ii     = nl->iinr[i];
 147         ii3    = ii*3;
 148         is3    = 3*nl->shift[i];
 149         shX    = shiftvec[is3];
 150         shY    = shiftvec[is3+1];
 151         shZ    = shiftvec[is3+2];
 152         nj0    = nl->jindex[i];
 153         nj1    = nl->jindex[i+1];
 154
 155         ix     = _mm_set1_ps(shX+x[ii3+0]);
 156         iy     = _mm_set1_ps(shY+x[ii3+1]);
 157         iz     = _mm_set1_ps(shZ+x[ii3+2]);
 158
 159         offset = (nj1-nj0)%4;
 160
 161         /* Polarization energy for atom ai */
 162         gpi    = _mm_setzero_ps();
 163
 164         rai     = _mm_load1_ps(gb_radius+ii);
 165         prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);
 166
 167         for (k = nj0; k < nj1-4-offset; k += 8)
 168         {
 169             jnrA        = jjnr[k];
 170             jnrB        = jjnr[k+1];
 171             jnrC        = jjnr[k+2];
 172             jnrD        = jjnr[k+3];
 173             jnrE        = jjnr[k+4];
 174             jnrF        = jjnr[k+5];
 175             jnrG        = jjnr[k+6];
 176             jnrH        = jjnr[k+7];
 177
 178             j3A         = 3*jnrA;
 179             j3B         = 3*jnrB;
 180             j3C         = 3*jnrC;
 181             j3D         = 3*jnrD;
 182             j3E         = 3*jnrE;
 183             j3F         = 3*jnrF;
 184             j3G         = 3*jnrG;
 185             j3H         = 3*jnrH;
 186
 187             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 188             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
 189
 190             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 191             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
 192             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
 193             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);
 194
 195             dx          = _mm_sub_ps(ix, jx);
 196             dy          = _mm_sub_ps(iy, jy);
 197             dz          = _mm_sub_ps(iz, jz);
 198             dxB         = _mm_sub_ps(ix, jxB);
 199             dyB         = _mm_sub_ps(iy, jyB);
 200             dzB         = _mm_sub_ps(iz, jzB);
 201
 202             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 203             rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
 204             rinv        = gmx_mm_invsqrt_ps(rsq);
 205             rinvB       = gmx_mm_invsqrt_ps(rsqB);
 206             rinv2       = _mm_mul_ps(rinv, rinv);
 207             rinv2B      = _mm_mul_ps(rinvB, rinvB);
 208             rinv4       = _mm_mul_ps(rinv2, rinv2);
 209             rinv4B      = _mm_mul_ps(rinv2B, rinv2B);
 210             rinv6       = _mm_mul_ps(rinv4, rinv2);
 211             rinv6B      = _mm_mul_ps(rinv4B, rinv2B);
 212
 213             rvdw        = _mm_add_ps(rai, raj);
 214             rvdwB       = _mm_add_ps(rai, rajB);
 215             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
 216             ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));
 217
 218             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
 219             mask_cmpB   = _mm_cmple_ps(ratioB, still_p5inv);
 220
 221             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
 222             if (0 == _mm_movemask_ps(mask_cmp) )
 223             {
 224                 /* if ratio>still_p5inv for ALL elements */
 225                 ccf         = one;
 226                 dccf        = _mm_setzero_ps();
 227             }
 228             else
 229             {
 230                 ratio       = _mm_min_ps(ratio, still_p5inv);
 231                 theta       = _mm_mul_ps(ratio, still_pip5);
 232                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
 233                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
 234                 ccf         = _mm_mul_ps(term, term);
 235                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
 236                                          _mm_mul_ps(sinq, theta));
 237             }
 238             if (0 == _mm_movemask_ps(mask_cmpB) )
 239             {
 240                 /* if ratio>still_p5inv for ALL elements */
 241                 ccfB        = one;
 242                 dccfB       = _mm_setzero_ps();
 243             }
 244             else
 245             {
 246                 ratioB      = _mm_min_ps(ratioB, still_p5inv);
 247                 thetaB      = _mm_mul_ps(ratioB, still_pip5);
 248                 gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
 249                 termB       = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
 250                 ccfB        = _mm_mul_ps(termB, termB);
 251                 dccfB       = _mm_mul_ps(_mm_mul_ps(two, termB),
 252                                          _mm_mul_ps(sinqB, thetaB));
 253             }
 254
 255             prod        = _mm_mul_ps(still_p4, vaj);
 256             prodB       = _mm_mul_ps(still_p4, vajB);
 257             icf4        = _mm_mul_ps(ccf, rinv4);
 258             icf4B       = _mm_mul_ps(ccfB, rinv4B);
 259             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
 260             icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);
 261
 262             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
 263             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));
 264
 265             gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );
 266
 267             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
 268             dadx += 4;
 269             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
 270             dadx += 4;
 271             _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
 272             dadx += 4;
 273             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
 274             dadx += 4;
 275         }
 276
 277         for (; k < nj1-offset; k += 4)
 278         {
 279             jnrA        = jjnr[k];
 280             jnrB        = jjnr[k+1];
 281             jnrC        = jjnr[k+2];
 282             jnrD        = jjnr[k+3];
 283
 284             j3A         = 3*jnrA;
 285             j3B         = 3*jnrB;
 286             j3C         = 3*jnrC;
 287             j3D         = 3*jnrD;
 288
 289             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 290
 291             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 292             GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
 293
 294             dx          = _mm_sub_ps(ix, jx);
 295             dy          = _mm_sub_ps(iy, jy);
 296             dz          = _mm_sub_ps(iz, jz);
 297
 298             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 299             rinv        = gmx_mm_invsqrt_ps(rsq);
 300             rinv2       = _mm_mul_ps(rinv, rinv);
 301             rinv4       = _mm_mul_ps(rinv2, rinv2);
 302             rinv6       = _mm_mul_ps(rinv4, rinv2);
 303
 304             rvdw        = _mm_add_ps(rai, raj);
 305             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
 306
 307             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
 308
 309             /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
 310             if (0 == _mm_movemask_ps(mask_cmp))
 311             {
 312                 /* if ratio>still_p5inv for ALL elements */
 313                 ccf         = one;
 314                 dccf        = _mm_setzero_ps();
 315             }
 316             else
 317             {
 318                 ratio       = _mm_min_ps(ratio, still_p5inv);
 319                 theta       = _mm_mul_ps(ratio, still_pip5);
 320                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
 321                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
 322                 ccf         = _mm_mul_ps(term, term);
 323                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
 324                                          _mm_mul_ps(sinq, theta));
 325             }
 326
 327             prod        = _mm_mul_ps(still_p4, vaj);
 328             icf4        = _mm_mul_ps(ccf, rinv4);
 329             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
 330
 331             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
 332
 333             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
 334
 335             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
 336             dadx += 4;
 337             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
 338             dadx += 4;
 339         }
 340
 341         if (offset != 0)
 342         {
 343             if (offset == 1)
 344             {
 345                 jnrA        = jjnr[k];
 346                 j3A         = 3*jnrA;
 347                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
 348                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
 349                 GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
 350                 mask        = mask1;
 351             }
 352             else if (offset == 2)
 353             {
 354                 jnrA        = jjnr[k];
 355                 jnrB        = jjnr[k+1];
 356                 j3A         = 3*jnrA;
 357                 j3B         = 3*jnrB;
 358                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
 359                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
 360                 GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
 361                 mask        = mask2;
 362             }
 363             else
 364             {
 365                 /* offset must be 3 */
 366                 jnrA        = jjnr[k];
 367                 jnrB        = jjnr[k+1];
 368                 jnrC        = jjnr[k+2];
 369                 j3A         = 3*jnrA;
 370                 j3B         = 3*jnrB;
 371                 j3C         = 3*jnrC;
 372                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
 373                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
 374                 GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
 375                 mask        = mask3;
 376             }
 377
 378             dx          = _mm_sub_ps(ix, jx);
 379             dy          = _mm_sub_ps(iy, jy);
 380             dz          = _mm_sub_ps(iz, jz);
 381
 382             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 383             rinv        = gmx_mm_invsqrt_ps(rsq);
 384             rinv2       = _mm_mul_ps(rinv, rinv);
 385             rinv4       = _mm_mul_ps(rinv2, rinv2);
 386             rinv6       = _mm_mul_ps(rinv4, rinv2);
 387
 388             rvdw        = _mm_add_ps(rai, raj);
 389             ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
 390
 391             mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
 392
 393             if (0 == _mm_movemask_ps(mask_cmp))
 394             {
 395                 /* if ratio>still_p5inv for ALL elements */
 396                 ccf         = one;
 397                 dccf        = _mm_setzero_ps();
 398             }
 399             else
 400             {
 401                 ratio       = _mm_min_ps(ratio, still_p5inv);
 402                 theta       = _mm_mul_ps(ratio, still_pip5);
 403                 gmx_mm_sincos_ps(theta, &sinq, &cosq);
 404                 term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
 405                 ccf         = _mm_mul_ps(term, term);
 406                 dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
 407                                          _mm_mul_ps(sinq, theta));
 408             }
 409
 410             prod        = _mm_mul_ps(still_p4, vaj);
 411             icf4        = _mm_mul_ps(ccf, rinv4);
 412             icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
 413
 414             gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));
 415
 416             _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
 417             dadx += 4;
 418             _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
 419             dadx += 4;
 420
 421             tmp = _mm_mul_ps(prod_ai, icf4);
 422
 423             if (offset == 1)
 424             {
 425                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
 426             }
 427             else if (offset == 2)
 428             {
 429                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
 430             }
 431             else
 432             {
 433                 /* offset must be 3 */
 434                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
 435             }
 436         }
 437         GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
 438     }
 439
 440     /* Sum up the polarization energy from other nodes */
 441     if (PARTDECOMP(cr))
 442     {
 443         gmx_sum(natoms, work, cr);
 444     }
 445     else if (DOMAINDECOMP(cr))
 446     {
 447         dd_atom_sum_real(cr->dd, work);
 448     }
 449
 450     /* Compute the radii */
 451     for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
 452     {
 453         if (born->use[i] != 0)
 454         {
 455             gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
 456             gpi2             = gpi_ai * gpi_ai;
 457             born->bRad[i]    = factor*gmx_invsqrt(gpi2);
 458             fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
 459         }
 460     }
 461
 462     /* Extra (local) communication required for DD */
 463     if (DOMAINDECOMP(cr))
 464     {
 465         dd_atom_spread_real(cr->dd, born->bRad);
 466         dd_atom_spread_real(cr->dd, fr->invsqrta);
 467     }
 468
 469     return 0;
 470 }
 471
 472
 473 int
 474 calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
 475                                 float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
 476 {
 477     int          i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
 478     int          jnrA, jnrB, jnrC, jnrD;
 479     int          j3A, j3B, j3C, j3D;
 480     int          jnrE, jnrF, jnrG, jnrH;
 481     int          j3E, j3F, j3G, j3H;
 482     float        shX, shY, shZ;
 483     float        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
 484     float        sum_ai2, sum_ai3, tsum, tchain, doffset;
 485     float       *obc_param;
 486     float       *gb_radius;
 487     float       *work;
 488     int       *  jjnr;
 489     float       *dadx;
 490     float       *shiftvec;
 491     float        min_rad, rad;
 492
 493     __m128       ix, iy, iz, jx, jy, jz;
 494     __m128       dx, dy, dz, t1, t2, t3, t4;
 495     __m128       rsq, rinv, r;
 496     __m128       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
 497     __m128       uij, lij2, uij2, lij3, uij3, diff2;
 498     __m128       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
 499     __m128       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
 500     __m128       dadx1, dadx2;
 501     __m128       logterm;
 502     __m128       mask;
 503     __m128       obc_mask1, obc_mask2, obc_mask3;
 504     __m128       jxB, jyB, jzB, t1B, t2B, t3B, t4B;
 505     __m128       dxB, dyB, dzB, rsqB, rinvB, rB;
 506     __m128       rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
 507     __m128       uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
 508     __m128       lij_invB, sk2_invB, prodB;
 509     __m128       sk_ajB, sk2_ajB, sk2_rinvB;
 510     __m128       dadx1B, dadx2B;
 511     __m128       logtermB;
 512     __m128       obc_mask1B, obc_mask2B, obc_mask3B;
 513
 514     __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
 515     __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
 516     __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );
 517
 518     __m128       oneeighth   = _mm_set1_ps(0.125);
 519     __m128       onefourth   = _mm_set1_ps(0.25);
 520
 521     const __m128 half  = _mm_set1_ps(0.5f);
 522     const __m128 three = _mm_set1_ps(3.0f);
 523     const __m128 one   = _mm_set1_ps(1.0f);
 524     const __m128 two   = _mm_set1_ps(2.0f);
 525     const __m128 zero  = _mm_set1_ps(0.0f);
 526     const __m128 neg   = _mm_set1_ps(-1.0f);
 527
 528     /* Set the dielectric offset */
 529     doffset   = born->gb_doffset;
 530     gb_radius = born->gb_radius;
 531     obc_param = born->param;
 532     work      = born->gpol_hct_work;
 533     jjnr      = nl->jjnr;
 534     dadx      = fr->dadx;
 535     shiftvec  = fr->shift_vec[0];
 536
 537     jx        = _mm_setzero_ps();
 538     jy        = _mm_setzero_ps();
 539     jz        = _mm_setzero_ps();
 540
 541     jnrA = jnrB = jnrC = jnrD = 0;
 542
 543     for (i = 0; i < born->nr; i++)
 544     {
 545         work[i] = 0;
 546     }
 547
 548     for (i = 0; i < nl->nri; i++)
 549     {
 550         ii     = nl->iinr[i];
 551         ii3    = ii*3;
 552         is3    = 3*nl->shift[i];
 553         shX    = shiftvec[is3];
 554         shY    = shiftvec[is3+1];
 555         shZ    = shiftvec[is3+2];
 556         nj0    = nl->jindex[i];
 557         nj1    = nl->jindex[i+1];
 558
 559         ix     = _mm_set1_ps(shX+x[ii3+0]);
 560         iy     = _mm_set1_ps(shY+x[ii3+1]);
 561         iz     = _mm_set1_ps(shZ+x[ii3+2]);
 562
 563         offset = (nj1-nj0)%4;
 564
 565         rai     = _mm_load1_ps(gb_radius+ii);
 566         rai_inv = gmx_mm_inv_ps(rai);
 567
 568         sum_ai = _mm_setzero_ps();
 569
 570         sk_ai  = _mm_load1_ps(born->param+ii);
 571         sk2_ai = _mm_mul_ps(sk_ai, sk_ai);
 572
 573         for (k = nj0; k < nj1-4-offset; k += 8)
 574         {
 575             jnrA        = jjnr[k];
 576             jnrB        = jjnr[k+1];
 577             jnrC        = jjnr[k+2];
 578             jnrD        = jjnr[k+3];
 579             jnrE        = jjnr[k+4];
 580             jnrF        = jjnr[k+5];
 581             jnrG        = jjnr[k+6];
 582             jnrH        = jjnr[k+7];
 583
 584             j3A         = 3*jnrA;
 585             j3B         = 3*jnrB;
 586             j3C         = 3*jnrC;
 587             j3D         = 3*jnrD;
 588             j3E         = 3*jnrE;
 589             j3F         = 3*jnrF;
 590             j3G         = 3*jnrG;
 591             j3H         = 3*jnrH;
 592
 593             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 594             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
 595             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 596             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
 597             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
 598             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);
 599
 600             dx    = _mm_sub_ps(ix, jx);
 601             dy    = _mm_sub_ps(iy, jy);
 602             dz    = _mm_sub_ps(iz, jz);
 603             dxB   = _mm_sub_ps(ix, jxB);
 604             dyB   = _mm_sub_ps(iy, jyB);
 605             dzB   = _mm_sub_ps(iz, jzB);
 606
 607             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 608             rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
 609
 610             rinv        = gmx_mm_invsqrt_ps(rsq);
 611             r           = _mm_mul_ps(rsq, rinv);
 612             rinvB       = gmx_mm_invsqrt_ps(rsqB);
 613             rB          = _mm_mul_ps(rsqB, rinvB);
 614
 615             /* Compute raj_inv aj1-4 */
 616             raj_inv     = gmx_mm_inv_ps(raj);
 617             raj_invB    = gmx_mm_inv_ps(rajB);
 618
 619             /* Evaluate influence of atom aj -> ai */
 620             t1            = _mm_add_ps(r, sk_aj);
 621             t2            = _mm_sub_ps(r, sk_aj);
 622             t3            = _mm_sub_ps(sk_aj, r);
 623             t1B           = _mm_add_ps(rB, sk_ajB);
 624             t2B           = _mm_sub_ps(rB, sk_ajB);
 625             t3B           = _mm_sub_ps(sk_ajB, rB);
 626             obc_mask1     = _mm_cmplt_ps(rai, t1);
 627             obc_mask2     = _mm_cmplt_ps(rai, t2);
 628             obc_mask3     = _mm_cmplt_ps(rai, t3);
 629             obc_mask1B    = _mm_cmplt_ps(rai, t1B);
 630             obc_mask2B    = _mm_cmplt_ps(rai, t2B);
 631             obc_mask3B    = _mm_cmplt_ps(rai, t3B);
 632
 633             uij           = gmx_mm_inv_ps(t1);
 634             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 635                                          _mm_andnot_ps(obc_mask2, rai_inv));
 636             dlij          = _mm_and_ps(one, obc_mask2);
 637             uij2          = _mm_mul_ps(uij, uij);
 638             uij3          = _mm_mul_ps(uij2, uij);
 639             lij2          = _mm_mul_ps(lij, lij);
 640             lij3          = _mm_mul_ps(lij2, lij);
 641
 642             uijB          = gmx_mm_inv_ps(t1B);
 643             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
 644                                          _mm_andnot_ps(obc_mask2B, rai_inv));
 645             dlijB         = _mm_and_ps(one, obc_mask2B);
 646             uij2B         = _mm_mul_ps(uijB, uijB);
 647             uij3B         = _mm_mul_ps(uij2B, uijB);
 648             lij2B         = _mm_mul_ps(lijB, lijB);
 649             lij3B         = _mm_mul_ps(lij2B, lijB);
 650
 651             diff2         = _mm_sub_ps(uij2, lij2);
 652             lij_inv       = gmx_mm_invsqrt_ps(lij2);
 653             sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
 654             sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
 655             prod          = _mm_mul_ps(onefourth, sk2_rinv);
 656
 657             diff2B        = _mm_sub_ps(uij2B, lij2B);
 658             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
 659             sk2_ajB       = _mm_mul_ps(sk_ajB, sk_ajB);
 660             sk2_rinvB     = _mm_mul_ps(sk2_ajB, rinvB);
 661             prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
 662
 663             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 664             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
 665
 666             t1            = _mm_sub_ps(lij, uij);
 667             t2            = _mm_mul_ps(diff2,
 668                                        _mm_sub_ps(_mm_mul_ps(onefourth, r),
 669                                                   prod));
 670             t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 671             t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 672             t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
 673             t4            = _mm_and_ps(t4, obc_mask3);
 674             t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 675
 676             t1B           = _mm_sub_ps(lijB, uijB);
 677             t2B           = _mm_mul_ps(diff2B,
 678                                        _mm_sub_ps(_mm_mul_ps(onefourth, rB),
 679                                                   prodB));
 680             t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
 681             t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
 682             t4B           = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
 683             t4B           = _mm_and_ps(t4B, obc_mask3B);
 684             t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
 685
 686             sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));
 687
 688             t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 689                                        _mm_mul_ps(prod, lij3));
 690             t1            = _mm_sub_ps(t1,
 691                                        _mm_mul_ps(onefourth,
 692                                                   _mm_add_ps(_mm_mul_ps(lij, rinv),
 693                                                              _mm_mul_ps(lij3, r))));
 694             t2            = _mm_mul_ps(onefourth,
 695                                        _mm_add_ps(_mm_mul_ps(uij, rinv),
 696                                                   _mm_mul_ps(uij3, r)));
 697             t2            = _mm_sub_ps(t2,
 698                                        _mm_add_ps(_mm_mul_ps(half, uij2),
 699                                                   _mm_mul_ps(prod, uij3)));
 700             t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
 701                                        _mm_mul_ps(rinv, rinv));
 702             t3            = _mm_sub_ps(t3,
 703                                        _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
 704                                                   _mm_add_ps(one,
 705                                                              _mm_mul_ps(sk2_rinv, rinv))));
 706             t1            = _mm_mul_ps(rinv,
 707                                        _mm_add_ps(_mm_mul_ps(dlij, t1),
 708                                                   _mm_add_ps(t2, t3)));
 709
 710
 711
 712             t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
 713                                        _mm_mul_ps(prodB, lij3B));
 714             t1B           = _mm_sub_ps(t1B,
 715                                        _mm_mul_ps(onefourth,
 716                                                   _mm_add_ps(_mm_mul_ps(lijB, rinvB),
 717                                                              _mm_mul_ps(lij3B, rB))));
 718             t2B           = _mm_mul_ps(onefourth,
 719                                        _mm_add_ps(_mm_mul_ps(uijB, rinvB),
 720                                                   _mm_mul_ps(uij3B, rB)));
 721             t2B           = _mm_sub_ps(t2B,
 722                                        _mm_add_ps(_mm_mul_ps(half, uij2B),
 723                                                   _mm_mul_ps(prodB, uij3B)));
 724             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
 725                                        _mm_mul_ps(rinvB, rinvB));
 726             t3B           = _mm_sub_ps(t3B,
 727                                        _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
 728                                                   _mm_add_ps(one,
 729                                                              _mm_mul_ps(sk2_rinvB, rinvB))));
 730             t1B           = _mm_mul_ps(rinvB,
 731                                        _mm_add_ps(_mm_mul_ps(dlijB, t1B),
 732                                                   _mm_add_ps(t2B, t3B)));
 733
 734             dadx1         = _mm_and_ps(t1, obc_mask1);
 735             dadx1B        = _mm_and_ps(t1B, obc_mask1B);
 736
 737
 738             /* Evaluate influence of atom ai -> aj */
 739             t1            = _mm_add_ps(r, sk_ai);
 740             t2            = _mm_sub_ps(r, sk_ai);
 741             t3            = _mm_sub_ps(sk_ai, r);
 742             t1B           = _mm_add_ps(rB, sk_ai);
 743             t2B           = _mm_sub_ps(rB, sk_ai);
 744             t3B           = _mm_sub_ps(sk_ai, rB);
 745             obc_mask1     = _mm_cmplt_ps(raj, t1);
 746             obc_mask2     = _mm_cmplt_ps(raj, t2);
 747             obc_mask3     = _mm_cmplt_ps(raj, t3);
 748             obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
 749             obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
 750             obc_mask3B    = _mm_cmplt_ps(rajB, t3B);
 751
 752             uij           = gmx_mm_inv_ps(t1);
 753             lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 754                                          _mm_andnot_ps(obc_mask2, raj_inv));
 755             dlij          = _mm_and_ps(one, obc_mask2);
 756             uij2          = _mm_mul_ps(uij, uij);
 757             uij3          = _mm_mul_ps(uij2, uij);
 758             lij2          = _mm_mul_ps(lij, lij);
 759             lij3          = _mm_mul_ps(lij2, lij);
 760
 761             uijB          = gmx_mm_inv_ps(t1B);
 762             lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
 763                                          _mm_andnot_ps(obc_mask2B, raj_invB));
 764             dlijB         = _mm_and_ps(one, obc_mask2B);
 765             uij2B         = _mm_mul_ps(uijB, uijB);
 766             uij3B         = _mm_mul_ps(uij2B, uijB);
 767             lij2B         = _mm_mul_ps(lijB, lijB);
 768             lij3B         = _mm_mul_ps(lij2B, lijB);
 769
 770             diff2         = _mm_sub_ps(uij2, lij2);
 771             lij_inv       = gmx_mm_invsqrt_ps(lij2);
 772             sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
 773             prod          = _mm_mul_ps(onefourth, sk2_rinv);
 774
 775             diff2B        = _mm_sub_ps(uij2B, lij2B);
 776             lij_invB      = gmx_mm_invsqrt_ps(lij2B);
 777             sk2_rinvB     = _mm_mul_ps(sk2_ai, rinvB);
 778             prodB         = _mm_mul_ps(onefourth, sk2_rinvB);
 779
 780             logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 781             logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));
 782
 783             t1            = _mm_sub_ps(lij, uij);
 784             t2            = _mm_mul_ps(diff2,
 785                                        _mm_sub_ps(_mm_mul_ps(onefourth, r),
 786                                                   prod));
 787             t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 788             t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 789             t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
 790             t4            = _mm_and_ps(t4, obc_mask3);
 791             t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 792
 793             t1B           = _mm_sub_ps(lijB, uijB);
 794             t2B           = _mm_mul_ps(diff2B,
 795                                        _mm_sub_ps(_mm_mul_ps(onefourth, rB),
 796                                                   prodB));
 797             t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
 798             t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
 799             t4B           = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
 800             t4B           = _mm_and_ps(t4B, obc_mask3B);
 801             t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));
 802
 803             GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
 804             GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));
 805
 806             t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 807                                        _mm_mul_ps(prod, lij3));
 808             t1            = _mm_sub_ps(t1,
 809                                        _mm_mul_ps(onefourth,
 810                                                   _mm_add_ps(_mm_mul_ps(lij, rinv),
 811                                                              _mm_mul_ps(lij3, r))));
 812             t2            = _mm_mul_ps(onefourth,
 813                                        _mm_add_ps(_mm_mul_ps(uij, rinv),
 814                                                   _mm_mul_ps(uij3, r)));
 815             t2            = _mm_sub_ps(t2,
 816                                        _mm_add_ps(_mm_mul_ps(half, uij2),
 817                                                   _mm_mul_ps(prod, uij3)));
 818             t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
 819                                        _mm_mul_ps(rinv, rinv));
 820             t3            = _mm_sub_ps(t3,
 821                                        _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
 822                                                   _mm_add_ps(one,
 823                                                              _mm_mul_ps(sk2_rinv, rinv))));
 824             t1            = _mm_mul_ps(rinv,
 825                                        _mm_add_ps(_mm_mul_ps(dlij, t1),
 826                                                   _mm_add_ps(t2, t3)));
 827
 828
 829             t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
 830                                        _mm_mul_ps(prodB, lij3B));
 831             t1B           = _mm_sub_ps(t1B,
 832                                        _mm_mul_ps(onefourth,
 833                                                   _mm_add_ps(_mm_mul_ps(lijB, rinvB),
 834                                                              _mm_mul_ps(lij3B, rB))));
 835             t2B           = _mm_mul_ps(onefourth,
 836                                        _mm_add_ps(_mm_mul_ps(uijB, rinvB),
 837                                                   _mm_mul_ps(uij3B, rB)));
 838             t2B           = _mm_sub_ps(t2B,
 839                                        _mm_add_ps(_mm_mul_ps(half, uij2B),
 840                                                   _mm_mul_ps(prodB, uij3B)));
 841             t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
 842                                        _mm_mul_ps(rinvB, rinvB));
 843             t3B           = _mm_sub_ps(t3B,
 844                                        _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
 845                                                   _mm_add_ps(one,
 846                                                              _mm_mul_ps(sk2_rinvB, rinvB))));
 847             t1B           = _mm_mul_ps(rinvB,
 848                                        _mm_add_ps(_mm_mul_ps(dlijB, t1B),
 849                                                   _mm_add_ps(t2B, t3B)));
 850
 851
 852             dadx2         = _mm_and_ps(t1, obc_mask1);
 853             dadx2B        = _mm_and_ps(t1B, obc_mask1B);
 854
 855             _mm_store_ps(dadx, dadx1);
 856             dadx += 4;
 857             _mm_store_ps(dadx, dadx2);
 858             dadx += 4;
 859             _mm_store_ps(dadx, dadx1B);
 860             dadx += 4;
 861             _mm_store_ps(dadx, dadx2B);
 862             dadx += 4;
 863
 864         } /* end normal inner loop */
 865
 866         for (; k < nj1-offset; k += 4)
 867         {
 868             jnrA        = jjnr[k];
 869             jnrB        = jjnr[k+1];
 870             jnrC        = jjnr[k+2];
 871             jnrD        = jjnr[k+3];
 872
 873             j3A         = 3*jnrA;
 874             j3B         = 3*jnrB;
 875             j3C         = 3*jnrC;
 876             j3D         = 3*jnrD;
 877
 878             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
 879             GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
 880             GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
 881
 882             dx    = _mm_sub_ps(ix, jx);
 883             dy    = _mm_sub_ps(iy, jy);
 884             dz    = _mm_sub_ps(iz, jz);
 885
 886             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
 887
 888             rinv        = gmx_mm_invsqrt_ps(rsq);
 889             r           = _mm_mul_ps(rsq, rinv);
 890
 891             /* Compute raj_inv aj1-4 */
 892             raj_inv     = gmx_mm_inv_ps(raj);
 893
 894             /* Evaluate influence of atom aj -> ai */
 895             t1            = _mm_add_ps(r, sk_aj);
 896             obc_mask1     = _mm_cmplt_ps(rai, t1);
 897
 898             if (_mm_movemask_ps(obc_mask1))
 899             {
 900                 /* If any of the elements has rai<dr+sk, this is executed */
 901                 t2            = _mm_sub_ps(r, sk_aj);
 902                 t3            = _mm_sub_ps(sk_aj, r);
 903
 904                 obc_mask2     = _mm_cmplt_ps(rai, t2);
 905                 obc_mask3     = _mm_cmplt_ps(rai, t3);
 906
 907                 uij           = gmx_mm_inv_ps(t1);
 908                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 909                                              _mm_andnot_ps(obc_mask2, rai_inv));
 910                 dlij          = _mm_and_ps(one, obc_mask2);
 911                 uij2          = _mm_mul_ps(uij, uij);
 912                 uij3          = _mm_mul_ps(uij2, uij);
 913                 lij2          = _mm_mul_ps(lij, lij);
 914                 lij3          = _mm_mul_ps(lij2, lij);
 915                 diff2         = _mm_sub_ps(uij2, lij2);
 916                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
 917                 sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
 918                 sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
 919                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
 920                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 921                 t1            = _mm_sub_ps(lij, uij);
 922                 t2            = _mm_mul_ps(diff2,
 923                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
 924                                                       prod));
 925                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 926                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 927                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
 928                 t4            = _mm_and_ps(t4, obc_mask3);
 929                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 930                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
 931                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 932                                            _mm_mul_ps(prod, lij3));
 933                 t1            = _mm_sub_ps(t1,
 934                                            _mm_mul_ps(onefourth,
 935                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
 936                                                                  _mm_mul_ps(lij3, r))));
 937                 t2            = _mm_mul_ps(onefourth,
 938                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
 939                                                       _mm_mul_ps(uij3, r)));
 940                 t2            = _mm_sub_ps(t2,
 941                                            _mm_add_ps(_mm_mul_ps(half, uij2),
 942                                                       _mm_mul_ps(prod, uij3)));
 943                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
 944                                            _mm_mul_ps(rinv, rinv));
 945                 t3            = _mm_sub_ps(t3,
 946                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
 947                                                       _mm_add_ps(one,
 948                                                                  _mm_mul_ps(sk2_rinv, rinv))));
 949                 t1            = _mm_mul_ps(rinv,
 950                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
 951                                                       _mm_add_ps(t2, t3)));
 952
 953                 dadx1         = _mm_and_ps(t1, obc_mask1);
 954             }
 955             else
 956             {
 957                 dadx1         = _mm_setzero_ps();
 958             }
 959
 960             /* Evaluate influence of atom ai -> aj */
 961             t1            = _mm_add_ps(r, sk_ai);
 962             obc_mask1     = _mm_cmplt_ps(raj, t1);
 963
 964             if (_mm_movemask_ps(obc_mask1))
 965             {
 966                 t2            = _mm_sub_ps(r, sk_ai);
 967                 t3            = _mm_sub_ps(sk_ai, r);
 968                 obc_mask2     = _mm_cmplt_ps(raj, t2);
 969                 obc_mask3     = _mm_cmplt_ps(raj, t3);
 970
 971                 uij           = gmx_mm_inv_ps(t1);
 972                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
 973                                              _mm_andnot_ps(obc_mask2, raj_inv));
 974                 dlij          = _mm_and_ps(one, obc_mask2);
 975                 uij2          = _mm_mul_ps(uij, uij);
 976                 uij3          = _mm_mul_ps(uij2, uij);
 977                 lij2          = _mm_mul_ps(lij, lij);
 978                 lij3          = _mm_mul_ps(lij2, lij);
 979                 diff2         = _mm_sub_ps(uij2, lij2);
 980                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
 981                 sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
 982                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
 983                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
 984                 t1            = _mm_sub_ps(lij, uij);
 985                 t2            = _mm_mul_ps(diff2,
 986                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
 987                                                       prod));
 988                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
 989                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
 990                 t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
 991                 t4            = _mm_and_ps(t4, obc_mask3);
 992                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
 993
 994                 GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
 995
 996                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
 997                                            _mm_mul_ps(prod, lij3));
 998                 t1            = _mm_sub_ps(t1,
 999                                            _mm_mul_ps(onefourth,
1000                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1001                                                                  _mm_mul_ps(lij3, r))));
1002                 t2            = _mm_mul_ps(onefourth,
1003                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1004                                                       _mm_mul_ps(uij3, r)));
1005                 t2            = _mm_sub_ps(t2,
1006                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1007                                                       _mm_mul_ps(prod, uij3)));
1008                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1009                                            _mm_mul_ps(rinv, rinv));
1010                 t3            = _mm_sub_ps(t3,
1011                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1012                                                       _mm_add_ps(one,
1013                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1014                 t1            = _mm_mul_ps(rinv,
1015                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1016                                                       _mm_add_ps(t2, t3)));
1017                 dadx2         = _mm_and_ps(t1, obc_mask1);
1018             }
1019             else
1020             {
1021                 dadx2         = _mm_setzero_ps();
1022             }
1023
1024             _mm_store_ps(dadx, dadx1);
1025             dadx += 4;
1026             _mm_store_ps(dadx, dadx2);
1027             dadx += 4;
1028         } /* end normal inner loop */
1029
1030         if (offset != 0)
1031         {
1032             if (offset == 1)
1033             {
1034                 jnrA        = jjnr[k];
1035                 j3A         = 3*jnrA;
1036                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1037                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1038                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1039                 mask        = mask1;
1040             }
1041             else if (offset == 2)
1042             {
1043                 jnrA        = jjnr[k];
1044                 jnrB        = jjnr[k+1];
1045                 j3A         = 3*jnrA;
1046                 j3B         = 3*jnrB;
1047                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1048                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1049                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1050                 mask        = mask2;
1051             }
1052             else
1053             {
1054                 /* offset must be 3 */
1055                 jnrA        = jjnr[k];
1056                 jnrB        = jjnr[k+1];
1057                 jnrC        = jjnr[k+2];
1058                 j3A         = 3*jnrA;
1059                 j3B         = 3*jnrB;
1060                 j3C         = 3*jnrC;
1061                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1062                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1063                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1064                 mask        = mask3;
1065             }
1066
1067             dx    = _mm_sub_ps(ix, jx);
1068             dy    = _mm_sub_ps(iy, jy);
1069             dz    = _mm_sub_ps(iz, jz);
1070
1071             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
1072
1073             rinv        = gmx_mm_invsqrt_ps(rsq);
1074             r           = _mm_mul_ps(rsq, rinv);
1075
1076             /* Compute raj_inv aj1-4 */
1077             raj_inv     = gmx_mm_inv_ps(raj);
1078
1079             /* Evaluate influence of atom aj -> ai */
1080             t1            = _mm_add_ps(r, sk_aj);
1081             obc_mask1     = _mm_cmplt_ps(rai, t1);
1082             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1083
1084             if (_mm_movemask_ps(obc_mask1))
1085             {
1086                 t2            = _mm_sub_ps(r, sk_aj);
1087                 t3            = _mm_sub_ps(sk_aj, r);
1088                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1089                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1090
1091                 uij           = gmx_mm_inv_ps(t1);
1092                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1093                                              _mm_andnot_ps(obc_mask2, rai_inv));
1094                 dlij           = _mm_and_ps(one, obc_mask2);
1095                 uij2           = _mm_mul_ps(uij, uij);
1096                 uij3           = _mm_mul_ps(uij2, uij);
1097                 lij2           = _mm_mul_ps(lij, lij);
1098                 lij3           = _mm_mul_ps(lij2, lij);
1099                 diff2          = _mm_sub_ps(uij2, lij2);
1100                 lij_inv        = gmx_mm_invsqrt_ps(lij2);
1101                 sk2_aj         = _mm_mul_ps(sk_aj, sk_aj);
1102                 sk2_rinv       = _mm_mul_ps(sk2_aj, rinv);
1103                 prod           = _mm_mul_ps(onefourth, sk2_rinv);
1104                 logterm        = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1105                 t1             = _mm_sub_ps(lij, uij);
1106                 t2             = _mm_mul_ps(diff2,
1107                                             _mm_sub_ps(_mm_mul_ps(onefourth, r),
1108                                                        prod));
1109                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1110                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1111                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1112                 t4            = _mm_and_ps(t4, obc_mask3);
1113                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1114                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1115                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1116                                            _mm_mul_ps(prod, lij3));
1117                 t1            = _mm_sub_ps(t1,
1118                                            _mm_mul_ps(onefourth,
1119                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1120                                                                  _mm_mul_ps(lij3, r))));
1121                 t2            = _mm_mul_ps(onefourth,
1122                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1123                                                       _mm_mul_ps(uij3, r)));
1124                 t2            = _mm_sub_ps(t2,
1125                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1126                                                       _mm_mul_ps(prod, uij3)));
1127                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1128                                            _mm_mul_ps(rinv, rinv));
1129                 t3            = _mm_sub_ps(t3,
1130                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1131                                                       _mm_add_ps(one,
1132                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1133                 t1            = _mm_mul_ps(rinv,
1134                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1135                                                       _mm_add_ps(t2, t3)));
1136                 dadx1         = _mm_and_ps(t1, obc_mask1);
1137             }
1138             else
1139             {
1140                 dadx1         = _mm_setzero_ps();
1141             }
1142
1143             /* Evaluate influence of atom ai -> aj */
1144             t1            = _mm_add_ps(r, sk_ai);
1145             obc_mask1     = _mm_cmplt_ps(raj, t1);
1146             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1147
1148             if (_mm_movemask_ps(obc_mask1))
1149             {
1150                 t2            = _mm_sub_ps(r, sk_ai);
1151                 t3            = _mm_sub_ps(sk_ai, r);
1152                 obc_mask2     = _mm_cmplt_ps(raj, t2);
1153                 obc_mask3     = _mm_cmplt_ps(raj, t3);
1154
1155                 uij           = gmx_mm_inv_ps(t1);
1156                 lij           = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1157                                           _mm_andnot_ps(obc_mask2, raj_inv));
1158                 dlij          = _mm_and_ps(one, obc_mask2);
1159                 uij2          = _mm_mul_ps(uij, uij);
1160                 uij3          = _mm_mul_ps(uij2, uij);
1161                 lij2          = _mm_mul_ps(lij, lij);
1162                 lij3          = _mm_mul_ps(lij2, lij);
1163                 diff2         = _mm_sub_ps(uij2, lij2);
1164                 lij_inv       = gmx_mm_invsqrt_ps(lij2);
1165                 sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
1166                 prod          = _mm_mul_ps(onefourth, sk2_rinv);
1167                 logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1168                 t1            = _mm_sub_ps(lij, uij);
1169                 t2            = _mm_mul_ps(diff2,
1170                                            _mm_sub_ps(_mm_mul_ps(onefourth, r),
1171                                                       prod));
1172                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1173                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1174                 t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
1175                 t4            = _mm_and_ps(t4, obc_mask3);
1176                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1177
1178                 tmp           = _mm_and_ps(t1, obc_mask1);
1179
1180                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1181                                            _mm_mul_ps(prod, lij3));
1182                 t1            = _mm_sub_ps(t1,
1183                                            _mm_mul_ps(onefourth,
1184                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1185                                                                  _mm_mul_ps(lij3, r))));
1186                 t2            = _mm_mul_ps(onefourth,
1187                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1188                                                       _mm_mul_ps(uij3, r)));
1189                 t2            = _mm_sub_ps(t2,
1190                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1191                                                       _mm_mul_ps(prod, uij3)));
1192                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1193                                            _mm_mul_ps(rinv, rinv));
1194                 t3            = _mm_sub_ps(t3,
1195                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1196                                                       _mm_add_ps(one,
1197                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1198                 t1            = _mm_mul_ps(rinv,
1199                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1200                                                       _mm_add_ps(t2, t3)));
1201                 dadx2         = _mm_and_ps(t1, obc_mask1);
1202             }
1203             else
1204             {
1205                 dadx2         = _mm_setzero_ps();
1206                 tmp           = _mm_setzero_ps();
1207             }
1208
1209             _mm_store_ps(dadx, dadx1);
1210             dadx += 4;
1211             _mm_store_ps(dadx, dadx2);
1212             dadx += 4;
1213
1214             if (offset == 1)
1215             {
1216                 GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
1217             }
1218             else if (offset == 2)
1219             {
1220                 GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
1221             }
1222             else
1223             {
1224                 /* offset must be 3 */
1225                 GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
1226             }
1227
1228         }
1229         GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);
1230
1231     }
1232
1233     /* Parallel summations */
1234     if (PARTDECOMP(cr))
1235     {
1236         gmx_sum(natoms, work, cr);
1237     }
1238     else if (DOMAINDECOMP(cr))
1239     {
1240         dd_atom_sum_real(cr->dd, work);
1241     }
1242
1243     if (gb_algorithm == egbHCT)
1244     {
1245         /* HCT */
1246         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1247         {
1248             if (born->use[i] != 0)
1249             {
1250                 rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
1251                 sum     = 1.0/rr - work[i];
1252                 min_rad = rr + doffset;
1253                 rad     = 1.0/sum;
1254
1255                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
1256                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1257             }
1258         }
1259
1260         /* Extra communication required for DD */
1261         if (DOMAINDECOMP(cr))
1262         {
1263             dd_atom_spread_real(cr->dd, born->bRad);
1264             dd_atom_spread_real(cr->dd, fr->invsqrta);
1265         }
1266     }
1267     else
1268     {
1269         /* OBC */
1270         for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
1271         {
1272             if (born->use[i] != 0)
1273             {
1274                 rr      = top->atomtypes.gb_radius[md->typeA[i]];
1275                 rr_inv2 = 1.0/rr;
1276                 rr      = rr-doffset;
1277                 rr_inv  = 1.0/rr;
1278                 sum     = rr * work[i];
1279                 sum2    = sum  * sum;
1280                 sum3    = sum2 * sum;
1281
1282                 tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1283                 born->bRad[i] = rr_inv - tsum*rr_inv2;
1284                 born->bRad[i] = 1.0 / born->bRad[i];
1285
1286                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1287
1288                 tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1289                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1290             }
1291         }
1292         /* Extra (local) communication required for DD */
1293         if (DOMAINDECOMP(cr))
1294         {
1295             dd_atom_spread_real(cr->dd, born->bRad);
1296             dd_atom_spread_real(cr->dd, fr->invsqrta);
1297             dd_atom_spread_real(cr->dd, born->drobc);
1298         }
1299     }
1300
1301
1302
1303     return 0;
1304 }
1305
1306
1307
1308 float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
1309                                     float *x, float *f, float *fshift, float *shiftvec,
1310                                     int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
1311 {
1312     int          i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
1313     int          jnrA, jnrB, jnrC, jnrD;
1314     int          j3A, j3B, j3C, j3D;
1315     int          jnrE, jnrF, jnrG, jnrH;
1316     int          j3E, j3F, j3G, j3H;
1317     int       *  jjnr;
1318
1319     float        rbi, shX, shY, shZ;
1320     float       *rb;
1321
1322     __m128       ix, iy, iz;
1323     __m128       jx, jy, jz;
1324     __m128       jxB, jyB, jzB;
1325     __m128       fix, fiy, fiz;
1326     __m128       dx, dy, dz;
1327     __m128       tx, ty, tz;
1328     __m128       dxB, dyB, dzB;
1329     __m128       txB, tyB, tzB;
1330
1331     __m128       rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
1332     __m128       xmm1, xmm2, xmm3;
1333
1334     const __m128 two = _mm_set1_ps(2.0f);
1335
1336     rb     = born->work;
1337
1338     jjnr   = nl->jjnr;
1339
1340     /* Loop to get the proper form for the Born radius term, sse style */
1341     offset = natoms%4;
1342
1343     n0 = 0;
1344     n1 = natoms;
1345
1346     if (gb_algorithm == egbSTILL)
1347     {
1348         for (i = n0; i < n1; i++)
1349         {
1350             rbi   = born->bRad[i];
1351             rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1352         }
1353     }
1354     else if (gb_algorithm == egbHCT)
1355     {
1356         for (i = n0; i < n1; i++)
1357         {
1358             rbi   = born->bRad[i];
1359             rb[i] = rbi * rbi * dvda[i];
1360         }
1361     }
1362     else if (gb_algorithm == egbOBC)
1363     {
1364         for (i = n0; i < n1; i++)
1365         {
1366             rbi   = born->bRad[i];
1367             rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1368         }
1369     }
1370
1371     jz = _mm_setzero_ps();
1372
1373     n = j3A = j3B = j3C = j3D = 0;
1374
1375     for (i = 0; i < nl->nri; i++)
1376     {
1377         ii     = nl->iinr[i];
1378         ii3    = ii*3;
1379         is3    = 3*nl->shift[i];
1380         shX    = shiftvec[is3];
1381         shY    = shiftvec[is3+1];
1382         shZ    = shiftvec[is3+2];
1383         nj0    = nl->jindex[i];
1384         nj1    = nl->jindex[i+1];
1385
1386         ix     = _mm_set1_ps(shX+x[ii3+0]);
1387         iy     = _mm_set1_ps(shY+x[ii3+1]);
1388         iz     = _mm_set1_ps(shZ+x[ii3+2]);
1389
1390         offset = (nj1-nj0)%4;
1391
1392         rbai   = _mm_load1_ps(rb+ii);
1393         fix    = _mm_setzero_ps();
1394         fiy    = _mm_setzero_ps();
1395         fiz    = _mm_setzero_ps();
1396
1397
1398         for (k = nj0; k < nj1-offset; k += 4)
1399         {
1400             jnrA        = jjnr[k];
1401             jnrB        = jjnr[k+1];
1402             jnrC        = jjnr[k+2];
1403             jnrD        = jjnr[k+3];
1404
1405             j3A         = 3*jnrA;
1406             j3B         = 3*jnrB;
1407             j3C         = 3*jnrC;
1408             j3D         = 3*jnrD;
1409
1410             GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
1411
1412             dx          = _mm_sub_ps(ix, jx);
1413             dy          = _mm_sub_ps(iy, jy);
1414             dz          = _mm_sub_ps(iz, jz);
1415
1416             GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);
1417
1418             /* load chain rule terms for j1-4 */
1419             f_gb        = _mm_load_ps(dadx);
1420             dadx       += 4;
1421             f_gb_ai     = _mm_load_ps(dadx);
1422             dadx       += 4;
1423
1424             /* calculate scalar force */
1425             f_gb    = _mm_mul_ps(f_gb, rbai);
1426             f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1427             f_gb    = _mm_add_ps(f_gb, f_gb_ai);
1428
1429             tx     = _mm_mul_ps(f_gb, dx);
1430             ty     = _mm_mul_ps(f_gb, dy);
1431             tz     = _mm_mul_ps(f_gb, dz);
1432
1433             fix    = _mm_add_ps(fix, tx);
1434             fiy    = _mm_add_ps(fiy, ty);
1435             fiz    = _mm_add_ps(fiz, tz);
1436
1437             GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
1438         }
1439
1440         /*deal with odd elements */
1441         if (offset != 0)
1442         {
1443             if (offset == 1)
1444             {
1445                 jnrA        = jjnr[k];
1446                 j3A         = 3*jnrA;
1447                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1448                 GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
1449             }
1450             else if (offset == 2)
1451             {
1452                 jnrA        = jjnr[k];
1453                 jnrB        = jjnr[k+1];
1454                 j3A         = 3*jnrA;
1455                 j3B         = 3*jnrB;
1456                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1457                 GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
1458             }
1459             else
1460             {
1461                 /* offset must be 3 */
1462                 jnrA        = jjnr[k];
1463                 jnrB        = jjnr[k+1];
1464                 jnrC        = jjnr[k+2];
1465                 j3A         = 3*jnrA;
1466                 j3B         = 3*jnrB;
1467                 j3C         = 3*jnrC;
1468                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1469                 GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
1470             }
1471
1472             dx          = _mm_sub_ps(ix, jx);
1473             dy          = _mm_sub_ps(iy, jy);
1474             dz          = _mm_sub_ps(iz, jz);
1475
1476             /* load chain rule terms for j1-4 */
1477             f_gb        = _mm_load_ps(dadx);
1478             dadx       += 4;
1479             f_gb_ai     = _mm_load_ps(dadx);
1480             dadx       += 4;
1481
1482             /* calculate scalar force */
1483             f_gb    = _mm_mul_ps(f_gb, rbai);
1484             f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
1485             f_gb    = _mm_add_ps(f_gb, f_gb_ai);
1486
1487             tx     = _mm_mul_ps(f_gb, dx);
1488             ty     = _mm_mul_ps(f_gb, dy);
1489             tz     = _mm_mul_ps(f_gb, dz);
1490
1491             fix    = _mm_add_ps(fix, tx);
1492             fiy    = _mm_add_ps(fiy, ty);
1493             fiz    = _mm_add_ps(fiz, tz);
1494
1495             if (offset == 1)
1496             {
1497                 GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
1498             }
1499             else if (offset == 2)
1500             {
1501                 GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
1502             }
1503             else
1504             {
1505                 /* offset must be 3 */
1506                 GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
1507             }
1508         }
1509
1510         /* fix/fiy/fiz now contain four partial force terms, that all should be
1511          * added to the i particle forces and shift forces.
1512          */
1513         gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
1514     }
1515
1516     return 0;
1517 }
1518
1519
1520 #else
1521 /* keep compiler happy */
1522 int genborn_sse_dummy;
1523
1524 #endif /* SSE intrinsics available */