src/gromacs/mdlib/genborn_sse2_single.c
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>

#include "typedefs.h"
#include "smalloc.h"
#include "genborn.h"
#include "vec.h"
#include "grompp.h"
#include "gromacs/fileio/pdbio.h"
#include "names.h"
#include "physics.h"
#include "partdec.h"
#include "domdec.h"
#include "network.h"
#include "gmx_fatal.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"


/* Only compile this file if SSE intrinsics are available; note that the
 * 0 && in the condition below currently disables it unconditionally. */
#if 0 && defined (GMX_X86_SSE2)

#include <gmx_sse2_single.h>
#include <emmintrin.h>

#include "genborn_sse2_single.h"

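/* calc_gb_rad_still_sse2_single() computes Born radii with the Still
 * method, four (or eight, in the unrolled loop) neighbor atoms per
 * iteration. A scalar sketch of the per-pair math, as read from the
 * intrinsics below (names here are illustrative only):
 *
 *   ratio = r_ij^2 / (rai + raj)^2
 *   if (ratio <= 1/P5) {
 *       theta = ratio*PI*P5;
 *       term  = 0.5*(1 - cos(theta));
 *       ccf   = term*term;                  // close-contact correction
 *       dccf  = 2*term*sin(theta)*theta;    // its derivative factor
 *   } else { ccf = 1; dccf = 0; }
 *   gpi    += P4*vsolv_j * ccf / r_ij^4;    // accumulated for atom i
 *   work_j += P4*vsolv_i * ccf / r_ij^4;    // symmetric term for atom j
 *
 * The (4*ccf - dccf)/r^6 products stored through dadx are kept for the
 * later force (chain-rule) evaluation. The radii themselves are formed
 * at the end of the function from factor/|gpol_i + work_i|.
 */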
int
calc_gb_rad_still_sse2_single(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              float *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int          i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int          jnrA, jnrB, jnrC, jnrD, j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH, j3E, j3F, j3G, j3H;
    int          shift;
    int         *mdtype;
    real         shX, shY, shZ;
    int         *jjnr;
    real        *shiftvec;

    float        gpi_ai, gpi2;
    float        factor;
    float       *gb_radius;
    float       *vsolv;
    float       *work;
    float       *dadx;

    __m128       ix, iy, iz;
    __m128       jx, jy, jz;
    __m128       dx, dy, dz;
    __m128       tx, ty, tz;
    __m128       jxB, jyB, jzB;
    __m128       dxB, dyB, dzB;
    __m128       txB, tyB, tzB;
    __m128       rsq, rinv, rinv2, rinv4, rinv6;
    __m128       rsqB, rinvB, rinv2B, rinv4B, rinv6B;
    __m128       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128       ratioB, rajB, vajB, rvdwB;
    __m128       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128       ccfB, dccfB, thetaB, cosqB, termB, sinqB, resB, prodB;
    __m128       mask, icf4, icf6, mask_cmp;
    __m128       icf4B, icf6B, mask_cmpB;

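    /* Partial-lane masks: mask1/2/3 keep the first one, two or three SIMD
     * lanes and zero the rest; they are used in the epilogue below when
     * fewer than four j-atoms remain. */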
    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );

    const __m128 half   = _mm_set1_ps(0.5f);
    const __m128 three  = _mm_set1_ps(3.0f);
    const __m128 one    = _mm_set1_ps(1.0f);
    const __m128 two    = _mm_set1_ps(2.0f);
    const __m128 zero   = _mm_set1_ps(0.0f);
    const __m128 four   = _mm_set1_ps(4.0f);

    const __m128 still_p5inv  = _mm_set1_ps(STILL_P5INV);
    const __m128 still_pip5   = _mm_set1_ps(STILL_PIP5);
    const __m128 still_p4     = _mm_set1_ps(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = jnrC = jnrD = 0;
    jx   = _mm_setzero_ps();
    jy   = _mm_setzero_ps();
    jz   = _mm_setzero_ps();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_ps();

        rai     = _mm_load1_ps(gb_radius+ii);
        prod_ai = _mm_set1_ps(STILL_P4*vsolv[ii]);

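        /* Main loop: two SIMD quads (eight j-atoms) per iteration; the
         * B-suffixed variables carry the second quad. */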
        for (k = nj0; k < nj1-4-offset; k += 8)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];
            jnrE        = jjnr[k+4];
            jnrF        = jjnr[k+5];
            jnrG        = jjnr[k+6];
            jnrH        = jjnr[k+7];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;
            j3E         = 3*jnrE;
            j3F         = 3*jnrF;
            j3G         = 3*jnrG;
            j3H         = 3*jnrH;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrE, vsolv+jnrF, vsolv+jnrG, vsolv+jnrH, vajB);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);
            dxB         = _mm_sub_ps(ix, jxB);
            dyB         = _mm_sub_ps(iy, jyB);
            dzB         = _mm_sub_ps(iz, jzB);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinvB       = gmx_mm_invsqrt_ps(rsqB);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv2B      = _mm_mul_ps(rinvB, rinvB);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv4B      = _mm_mul_ps(rinv2B, rinv2B);
            rinv6       = _mm_mul_ps(rinv4, rinv2);
            rinv6B      = _mm_mul_ps(rinv4B, rinv2B);

            rvdw        = _mm_add_ps(rai, raj);
            rvdwB       = _mm_add_ps(rai, rajB);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));
            ratioB      = _mm_mul_ps(rsqB, gmx_mm_inv_ps( _mm_mul_ps(rvdwB, rvdwB)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);
            mask_cmpB   = _mm_cmple_ps(ratioB, still_p5inv);

            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
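            /* _mm_movemask_ps() collects the sign bits of the comparison
             * result; zero means no lane satisfied ratio <= 1/P5, so the
             * whole sincos branch can be skipped for this quad. */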
            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }
            if (0 == _mm_movemask_ps(mask_cmpB))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccfB        = one;
                dccfB       = _mm_setzero_ps();
            }
            else
            {
                ratioB      = _mm_min_ps(ratioB, still_p5inv);
                thetaB      = _mm_mul_ps(ratioB, still_pip5);
                gmx_mm_sincos_ps(thetaB, &sinqB, &cosqB);
                termB       = _mm_mul_ps(half, _mm_sub_ps(one, cosqB));
                ccfB        = _mm_mul_ps(termB, termB);
                dccfB       = _mm_mul_ps(_mm_mul_ps(two, termB),
                                         _mm_mul_ps(sinqB, thetaB));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            prodB       = _mm_mul_ps(still_p4, vajB);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf4B       = _mm_mul_ps(ccfB, rinv4B);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);
            icf6B       = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccfB), dccfB), rinv6B);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));
            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_mul_ps(prod_ai, icf4B));

            gpi           = _mm_add_ps(gpi, _mm_add_ps( _mm_mul_ps(prod, icf4), _mm_mul_ps(prodB, icf4B) ) );

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prodB, icf6B));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6B));
            dadx += 4;
        }

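        /* Tail loop: any remaining complete quads of four j-atoms */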
        for (; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);

            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vsolv+jnrD, vaj);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv6       = _mm_mul_ps(rinv4, rinv2);

            rvdw        = _mm_add_ps(rai, raj);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);

            /* gmx_mm_sincos_ps() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_mul_ps(prod_ai, icf4));

            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;
        }

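        /* Epilogue: the last 1-3 j-atoms, with unused lanes masked out */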
        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA        = jjnr[k];
                j3A         = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
                GMX_MM_LOAD_1VALUE_PS(vsolv+jnrA, vaj);
                mask        = mask1;
            }
            else if (offset == 2)
            {
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
                GMX_MM_LOAD_2VALUES_PS(vsolv+jnrA, vsolv+jnrB, vaj);
                mask        = mask2;
            }
            else
            {
                /* offset must be 3 */
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                jnrC        = jjnr[k+2];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                j3C         = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
                GMX_MM_LOAD_3VALUES_PS(vsolv+jnrA, vsolv+jnrB, vsolv+jnrC, vaj);
                mask        = mask3;
            }

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_ps(rsq);
            rinv2       = _mm_mul_ps(rinv, rinv);
            rinv4       = _mm_mul_ps(rinv2, rinv2);
            rinv6       = _mm_mul_ps(rinv4, rinv2);

            rvdw        = _mm_add_ps(rai, raj);
            ratio       = _mm_mul_ps(rsq, gmx_mm_inv_ps( _mm_mul_ps(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_ps(ratio, still_p5inv);

            if (0 == _mm_movemask_ps(mask_cmp))
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_ps();
            }
            else
            {
                ratio       = _mm_min_ps(ratio, still_p5inv);
                theta       = _mm_mul_ps(ratio, still_pip5);
                gmx_mm_sincos_ps(theta, &sinq, &cosq);
                term        = _mm_mul_ps(half, _mm_sub_ps(one, cosq));
                ccf         = _mm_mul_ps(term, term);
                dccf        = _mm_mul_ps(_mm_mul_ps(two, term),
                                         _mm_mul_ps(sinq, theta));
            }

            prod        = _mm_mul_ps(still_p4, vaj);
            icf4        = _mm_mul_ps(ccf, rinv4);
            icf6        = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four, ccf), dccf), rinv6);

            gpi           = _mm_add_ps(gpi, _mm_mul_ps(prod, icf4));

            _mm_store_ps(dadx, _mm_mul_ps(prod, icf6));
            dadx += 4;
            _mm_store_ps(dadx, _mm_mul_ps(prod_ai, icf6));
            dadx += 4;

            tmp = _mm_mul_ps(prod_ai, icf4);

            if (offset == 1)
            {
                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
            }
            else if (offset == 2)
            {
                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
            }
        }
        GMX_MM_UPDATE_1POT_PS(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
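    /* bRad_i = factor/|gpol_i + work_i|: squaring before gmx_invsqrt()
     * yields the absolute value of the accumulated polarization term. */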
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add work[i] to the initial polarization energy gpi_ai */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}

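/* calc_gb_rad_hct_obc_sse2_single() accumulates the HCT/OBC pairwise
 * descreening terms. For each pair, with U = 1/(r+sk) and L = 1/(r-sk)
 * when rai < r-sk (else L = 1/rai), the code below sums (scalar sketch,
 * as read from the intrinsics; illustrative only):
 *
 *   0.5 * [ (L - U) + 0.25*(U^2 - L^2)*(r - sk^2/r) + (1/(2r))*ln(U/L)
 *           + (if sk-r > rai) 2*(1/rai - L) ]
 *
 * into work[], once with sk = sk_aj for the aj -> ai direction and once
 * with sk = sk_ai for ai -> aj. The matching derivative terms go to dadx
 * for the force pass, and the Born radii are assembled from work[] after
 * the neighbor loops (the HCT branch is shown at the end of this listing).
 */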
int
calc_gb_rad_hct_obc_sse2_single(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                                float *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int          i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int          jnrA, jnrB, jnrC, jnrD;
    int          j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH;
    int          j3E, j3F, j3G, j3H;
    float        shX, shY, shZ;
    float        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    float        sum_ai2, sum_ai3, tsum, tchain, doffset;
    float       *obc_param;
    float       *gb_radius;
    float       *work;
    int         *jjnr;
    float       *dadx;
    float       *shiftvec;
    float        min_rad, rad;

    __m128       ix, iy, iz, jx, jy, jz;
    __m128       dx, dy, dz, t1, t2, t3, t4;
    __m128       rsq, rinv, r;
    __m128       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128       uij, lij2, uij2, lij3, uij3, diff2;
    __m128       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128       dadx1, dadx2;
    __m128       logterm;
    __m128       mask;
    __m128       obc_mask1, obc_mask2, obc_mask3;
    __m128       jxB, jyB, jzB, t1B, t2B, t3B, t4B;
    __m128       dxB, dyB, dzB, rsqB, rinvB, rB;
    __m128       rajB, raj_invB, rai_inv2B, sk2B, lijB, dlijB, duijB;
    __m128       uijB, lij2B, uij2B, lij3B, uij3B, diff2B;
    __m128       lij_invB, sk2_invB, prodB;
    __m128       sk_ajB, sk2_ajB, sk2_rinvB;
    __m128       dadx1B, dadx2B;
    __m128       logtermB;
    __m128       obc_mask1B, obc_mask2B, obc_mask3B;

    __m128       mask1 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0, 0xffffffff) );
    __m128       mask2 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff) );
    __m128       mask3 = gmx_mm_castsi128_ps( _mm_set_epi32(0, 0xffffffff, 0xffffffff, 0xffffffff) );

    __m128       oneeighth   = _mm_set1_ps(0.125);
    __m128       onefourth   = _mm_set1_ps(0.25);

    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);
    const __m128 one   = _mm_set1_ps(1.0f);
    const __m128 two   = _mm_set1_ps(2.0f);
    const __m128 zero  = _mm_set1_ps(0.0f);
    const __m128 neg   = _mm_set1_ps(-1.0f);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_ps();
    jy        = _mm_setzero_ps();
    jz        = _mm_setzero_ps();

    jnrA = jnrB = jnrC = jnrD = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        rai     = _mm_load1_ps(gb_radius+ii);
        rai_inv = gmx_mm_inv_ps(rai);

        sum_ai = _mm_setzero_ps();

        sk_ai  = _mm_load1_ps(born->param+ii);
        sk2_ai = _mm_mul_ps(sk_ai, sk_ai);

        for (k = nj0; k < nj1-4-offset; k += 8)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];
            jnrE        = jjnr[k+4];
            jnrF        = jjnr[k+5];
            jnrG        = jjnr[k+6];
            jnrH        = jjnr[k+7];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;
            j3E         = 3*jnrE;
            j3F         = 3*jnrF;
            j3G         = 3*jnrG;
            j3H         = 3*jnrH;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3E, x+j3F, x+j3G, x+j3H, jxB, jyB, jzB);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrE, gb_radius+jnrF, gb_radius+jnrG, gb_radius+jnrH, rajB);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrE, obc_param+jnrF, obc_param+jnrG, obc_param+jnrH, sk_ajB);

            dx    = _mm_sub_ps(ix, jx);
            dy    = _mm_sub_ps(iy, jy);
            dz    = _mm_sub_ps(iz, jz);
            dxB   = _mm_sub_ps(ix, jxB);
            dyB   = _mm_sub_ps(iy, jyB);
            dzB   = _mm_sub_ps(iz, jzB);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
            rsqB        = gmx_mm_calc_rsq_ps(dxB, dyB, dzB);

            rinv        = gmx_mm_invsqrt_ps(rsq);
            r           = _mm_mul_ps(rsq, rinv);
            rinvB       = gmx_mm_invsqrt_ps(rsqB);
            rB          = _mm_mul_ps(rsqB, rinvB);

            /* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_ps(raj);
            raj_invB    = gmx_mm_inv_ps(rajB);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_ps(r, sk_aj);
            t2            = _mm_sub_ps(r, sk_aj);
            t3            = _mm_sub_ps(sk_aj, r);
            t1B           = _mm_add_ps(rB, sk_ajB);
            t2B           = _mm_sub_ps(rB, sk_ajB);
            t3B           = _mm_sub_ps(sk_ajB, rB);
            obc_mask1     = _mm_cmplt_ps(rai, t1);
            obc_mask2     = _mm_cmplt_ps(rai, t2);
            obc_mask3     = _mm_cmplt_ps(rai, t3);
            obc_mask1B    = _mm_cmplt_ps(rai, t1B);
            obc_mask2B    = _mm_cmplt_ps(rai, t2B);
            obc_mask3B    = _mm_cmplt_ps(rai, t3B);

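            /* Scalar view of the masks: obc_mask1: rai < r+sk (aj descreens
             * ai at all); obc_mask2: rai < r-sk (ai lies fully outside the
             * descreening sphere, lower limit 1/(r-sk) instead of 1/rai);
             * obc_mask3: rai < sk-r (ai engulfed, extra 2*(1/rai - lij)
             * term below). */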
            uij           = gmx_mm_inv_ps(t1);
            lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                         _mm_andnot_ps(obc_mask2, rai_inv));
            dlij          = _mm_and_ps(one, obc_mask2);
            uij2          = _mm_mul_ps(uij, uij);
            uij3          = _mm_mul_ps(uij2, uij);
            lij2          = _mm_mul_ps(lij, lij);
            lij3          = _mm_mul_ps(lij2, lij);

            uijB          = gmx_mm_inv_ps(t1B);
            lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
                                         _mm_andnot_ps(obc_mask2B, rai_inv));
            dlijB         = _mm_and_ps(one, obc_mask2B);
            uij2B         = _mm_mul_ps(uijB, uijB);
            uij3B         = _mm_mul_ps(uij2B, uijB);
            lij2B         = _mm_mul_ps(lijB, lijB);
            lij3B         = _mm_mul_ps(lij2B, lijB);

            diff2         = _mm_sub_ps(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_ps(lij2);
            sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
            prod          = _mm_mul_ps(onefourth, sk2_rinv);

            diff2B        = _mm_sub_ps(uij2B, lij2B);
            lij_invB      = gmx_mm_invsqrt_ps(lij2B);
            sk2_ajB       = _mm_mul_ps(sk_ajB, sk_ajB);
            sk2_rinvB     = _mm_mul_ps(sk2_ajB, rinvB);
            prodB         = _mm_mul_ps(onefourth, sk2_rinvB);

            logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
            logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));

            t1            = _mm_sub_ps(lij, uij);
            t2            = _mm_mul_ps(diff2,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                  prod));
            t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
            t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
            t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
            t4            = _mm_and_ps(t4, obc_mask3);
            t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

            t1B           = _mm_sub_ps(lijB, uijB);
            t2B           = _mm_mul_ps(diff2B,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, rB),
                                                  prodB));
            t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
            t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
            t4B           = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lijB));
            t4B           = _mm_and_ps(t4B, obc_mask3B);
            t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));

            sum_ai        = _mm_add_ps(sum_ai, _mm_add_ps( _mm_and_ps(t1, obc_mask1), _mm_and_ps(t1B, obc_mask1B) ));

            t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                       _mm_mul_ps(prod, lij3));
            t1            = _mm_sub_ps(t1,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                             _mm_mul_ps(lij3, r))));
            t2            = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                  _mm_mul_ps(uij3, r)));
            t2            = _mm_sub_ps(t2,
                                       _mm_add_ps(_mm_mul_ps(half, uij2),
                                                  _mm_mul_ps(prod, uij3)));
            t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                       _mm_mul_ps(rinv, rinv));
            t3            = _mm_sub_ps(t3,
                                       _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinv, rinv))));
            t1            = _mm_mul_ps(rinv,
                                       _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                  _mm_add_ps(t2, t3)));

            t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
                                       _mm_mul_ps(prodB, lij3B));
            t1B           = _mm_sub_ps(t1B,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lijB, rinvB),
                                                             _mm_mul_ps(lij3B, rB))));
            t2B           = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uijB, rinvB),
                                                  _mm_mul_ps(uij3B, rB)));
            t2B           = _mm_sub_ps(t2B,
                                       _mm_add_ps(_mm_mul_ps(half, uij2B),
                                                  _mm_mul_ps(prodB, uij3B)));
            t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
                                       _mm_mul_ps(rinvB, rinvB));
            t3B           = _mm_sub_ps(t3B,
                                       _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinvB, rinvB))));
            t1B           = _mm_mul_ps(rinvB,
                                       _mm_add_ps(_mm_mul_ps(dlijB, t1B),
                                                  _mm_add_ps(t2B, t3B)));

            dadx1         = _mm_and_ps(t1, obc_mask1);
            dadx1B        = _mm_and_ps(t1B, obc_mask1B);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_ps(r, sk_ai);
            t2            = _mm_sub_ps(r, sk_ai);
            t3            = _mm_sub_ps(sk_ai, r);
            t1B           = _mm_add_ps(rB, sk_ai);
            t2B           = _mm_sub_ps(rB, sk_ai);
            t3B           = _mm_sub_ps(sk_ai, rB);
            obc_mask1     = _mm_cmplt_ps(raj, t1);
            obc_mask2     = _mm_cmplt_ps(raj, t2);
            obc_mask3     = _mm_cmplt_ps(raj, t3);
            obc_mask1B    = _mm_cmplt_ps(rajB, t1B);
            obc_mask2B    = _mm_cmplt_ps(rajB, t2B);
            obc_mask3B    = _mm_cmplt_ps(rajB, t3B);

            uij           = gmx_mm_inv_ps(t1);
            lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                         _mm_andnot_ps(obc_mask2, raj_inv));
            dlij          = _mm_and_ps(one, obc_mask2);
            uij2          = _mm_mul_ps(uij, uij);
            uij3          = _mm_mul_ps(uij2, uij);
            lij2          = _mm_mul_ps(lij, lij);
            lij3          = _mm_mul_ps(lij2, lij);

            uijB          = gmx_mm_inv_ps(t1B);
            lijB          = _mm_or_ps(   _mm_and_ps(obc_mask2B, gmx_mm_inv_ps(t2B)),
                                         _mm_andnot_ps(obc_mask2B, raj_invB));
            dlijB         = _mm_and_ps(one, obc_mask2B);
            uij2B         = _mm_mul_ps(uijB, uijB);
            uij3B         = _mm_mul_ps(uij2B, uijB);
            lij2B         = _mm_mul_ps(lijB, lijB);
            lij3B         = _mm_mul_ps(lij2B, lijB);

            diff2         = _mm_sub_ps(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_ps(lij2);
            sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
            prod          = _mm_mul_ps(onefourth, sk2_rinv);

            diff2B        = _mm_sub_ps(uij2B, lij2B);
            lij_invB      = gmx_mm_invsqrt_ps(lij2B);
            sk2_rinvB     = _mm_mul_ps(sk2_ai, rinvB);
            prodB         = _mm_mul_ps(onefourth, sk2_rinvB);

            logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
            logtermB      = gmx_mm_log_ps(_mm_mul_ps(uijB, lij_invB));

            t1            = _mm_sub_ps(lij, uij);
            t2            = _mm_mul_ps(diff2,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                  prod));
            t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
            t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
            t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
            t4            = _mm_and_ps(t4, obc_mask3);
            t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

            t1B           = _mm_sub_ps(lijB, uijB);
            t2B           = _mm_mul_ps(diff2B,
                                       _mm_sub_ps(_mm_mul_ps(onefourth, rB),
                                                  prodB));
            t3B           = _mm_mul_ps(half, _mm_mul_ps(rinvB, logtermB));
            t1B           = _mm_add_ps(t1B, _mm_add_ps(t2B, t3B));
            t4B           = _mm_mul_ps(two, _mm_sub_ps(raj_invB, lijB));
            t4B           = _mm_and_ps(t4B, obc_mask3B);
            t1B           = _mm_mul_ps(half, _mm_add_ps(t1B, t4B));

            GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));
            GMX_MM_INCREMENT_4VALUES_PS(work+jnrE, work+jnrF, work+jnrG, work+jnrH, _mm_and_ps(t1B, obc_mask1B));

            t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                       _mm_mul_ps(prod, lij3));
            t1            = _mm_sub_ps(t1,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                             _mm_mul_ps(lij3, r))));
            t2            = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                  _mm_mul_ps(uij3, r)));
            t2            = _mm_sub_ps(t2,
                                       _mm_add_ps(_mm_mul_ps(half, uij2),
                                                  _mm_mul_ps(prod, uij3)));
            t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                       _mm_mul_ps(rinv, rinv));
            t3            = _mm_sub_ps(t3,
                                       _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinv, rinv))));
            t1            = _mm_mul_ps(rinv,
                                       _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                  _mm_add_ps(t2, t3)));

            t1B           = _mm_add_ps(_mm_mul_ps(half, lij2B),
                                       _mm_mul_ps(prodB, lij3B));
            t1B           = _mm_sub_ps(t1B,
                                       _mm_mul_ps(onefourth,
                                                  _mm_add_ps(_mm_mul_ps(lijB, rinvB),
                                                             _mm_mul_ps(lij3B, rB))));
            t2B           = _mm_mul_ps(onefourth,
                                       _mm_add_ps(_mm_mul_ps(uijB, rinvB),
                                                  _mm_mul_ps(uij3B, rB)));
            t2B           = _mm_sub_ps(t2B,
                                       _mm_add_ps(_mm_mul_ps(half, uij2B),
                                                  _mm_mul_ps(prodB, uij3B)));
            t3B           = _mm_mul_ps(_mm_mul_ps(onefourth, logtermB),
                                       _mm_mul_ps(rinvB, rinvB));
            t3B           = _mm_sub_ps(t3B,
                                       _mm_mul_ps(_mm_mul_ps(diff2B, oneeighth),
                                                  _mm_add_ps(one,
                                                             _mm_mul_ps(sk2_rinvB, rinvB))));
            t1B           = _mm_mul_ps(rinvB,
                                       _mm_add_ps(_mm_mul_ps(dlijB, t1B),
                                                  _mm_add_ps(t2B, t3B)));

            dadx2         = _mm_and_ps(t1, obc_mask1);
            dadx2B        = _mm_and_ps(t1B, obc_mask1B);

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;
            _mm_store_ps(dadx, dadx1B);
            dadx += 4;
            _mm_store_ps(dadx, dadx2B);
            dadx += 4;

        } /* end normal inner loop */

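        /* Tail loop: remaining complete quads; the movemask tests below
         * skip the log/inv work entirely when no lane passes the
         * rai < r+sk overlap test. */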
        for (; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);
            GMX_MM_LOAD_4VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, gb_radius+jnrD, raj);
            GMX_MM_LOAD_4VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, obc_param+jnrD, sk_aj);

            dx    = _mm_sub_ps(ix, jx);
            dy    = _mm_sub_ps(iy, jy);
            dz    = _mm_sub_ps(iz, jz);

            rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_ps(rsq);
            r           = _mm_mul_ps(rsq, rinv);

            /* Compute raj_inv aj1-4 */
            raj_inv     = gmx_mm_inv_ps(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_ps(r, sk_aj);
            obc_mask1     = _mm_cmplt_ps(rai, t1);

            if (_mm_movemask_ps(obc_mask1))
            {
                /* If any of the elements has rai < r+sk, this is executed */
                t2            = _mm_sub_ps(r, sk_aj);
                t3            = _mm_sub_ps(sk_aj, r);

                obc_mask2     = _mm_cmplt_ps(rai, t2);
                obc_mask3     = _mm_cmplt_ps(rai, t3);

                uij           = gmx_mm_inv_ps(t1);
                lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                             _mm_andnot_ps(obc_mask2, rai_inv));
                dlij          = _mm_and_ps(one, obc_mask2);
                uij2          = _mm_mul_ps(uij, uij);
                uij3          = _mm_mul_ps(uij2, uij);
                lij2          = _mm_mul_ps(lij, lij);
                lij3          = _mm_mul_ps(lij2, lij);
                diff2         = _mm_sub_ps(uij2, lij2);
                lij_inv       = gmx_mm_invsqrt_ps(lij2);
                sk2_aj        = _mm_mul_ps(sk_aj, sk_aj);
                sk2_rinv      = _mm_mul_ps(sk2_aj, rinv);
                prod          = _mm_mul_ps(onefourth, sk2_rinv);
                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1            = _mm_sub_ps(lij, uij);
                t2            = _mm_mul_ps(diff2,
                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                      prod));
                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
                t4            = _mm_and_ps(t4, obc_mask3);
                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
                sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));

                dadx1         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx1         = _mm_setzero_ps();
            }

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_ps(r, sk_ai);
            obc_mask1     = _mm_cmplt_ps(raj, t1);

            if (_mm_movemask_ps(obc_mask1))
            {
                t2            = _mm_sub_ps(r, sk_ai);
                t3            = _mm_sub_ps(sk_ai, r);
                obc_mask2     = _mm_cmplt_ps(raj, t2);
                obc_mask3     = _mm_cmplt_ps(raj, t3);

                uij           = gmx_mm_inv_ps(t1);
                lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                             _mm_andnot_ps(obc_mask2, raj_inv));
                dlij          = _mm_and_ps(one, obc_mask2);
                uij2          = _mm_mul_ps(uij, uij);
                uij3          = _mm_mul_ps(uij2, uij);
                lij2          = _mm_mul_ps(lij, lij);
                lij3          = _mm_mul_ps(lij2, lij);
                diff2         = _mm_sub_ps(uij2, lij2);
                lij_inv       = gmx_mm_invsqrt_ps(lij2);
                sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
                prod          = _mm_mul_ps(onefourth, sk2_rinv);
                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1            = _mm_sub_ps(lij, uij);
                t2            = _mm_mul_ps(diff2,
                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                      prod));
                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
                t4            = _mm_and_ps(t4, obc_mask3);
                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

                GMX_MM_INCREMENT_4VALUES_PS(work+jnrA, work+jnrB, work+jnrC, work+jnrD, _mm_and_ps(t1, obc_mask1));

                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));
                dadx2         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx2         = _mm_setzero_ps();
            }

            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;
        } /* end normal inner loop */

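        /* Epilogue: the last 1-3 j-atoms; obc_mask1 is additionally AND-ed
         * with the partial-lane mask so unused lanes contribute nothing. */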
1031         if (offset != 0)
1032         {
1033             if (offset == 1)
1034             {
1035                 jnrA        = jjnr[k];
1036                 j3A         = 3*jnrA;
1037                 GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
1038                 GMX_MM_LOAD_1VALUE_PS(gb_radius+jnrA, raj);
1039                 GMX_MM_LOAD_1VALUE_PS(obc_param+jnrA, sk_aj);
1040                 mask        = mask1;
1041             }
1042             else if (offset == 2)
1043             {
1044                 jnrA        = jjnr[k];
1045                 jnrB        = jjnr[k+1];
1046                 j3A         = 3*jnrA;
1047                 j3B         = 3*jnrB;
1048                 GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
1049                 GMX_MM_LOAD_2VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, raj);
1050                 GMX_MM_LOAD_2VALUES_PS(obc_param+jnrA, obc_param+jnrB, sk_aj);
1051                 mask        = mask2;
1052             }
1053             else
1054             {
1055                 /* offset must be 3 */
1056                 jnrA        = jjnr[k];
1057                 jnrB        = jjnr[k+1];
1058                 jnrC        = jjnr[k+2];
1059                 j3A         = 3*jnrA;
1060                 j3B         = 3*jnrB;
1061                 j3C         = 3*jnrC;
1062                 GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
1063                 GMX_MM_LOAD_3VALUES_PS(gb_radius+jnrA, gb_radius+jnrB, gb_radius+jnrC, raj);
1064                 GMX_MM_LOAD_3VALUES_PS(obc_param+jnrA, obc_param+jnrB, obc_param+jnrC, sk_aj);
1065                 mask        = mask3;
1066             }
1067
1068             dx    = _mm_sub_ps(ix, jx);
1069             dy    = _mm_sub_ps(iy, jy);
1070             dz    = _mm_sub_ps(iz, jz);
1071
1072             rsq         = gmx_mm_calc_rsq_ps(dx, dy, dz);
1073
1074             rinv        = gmx_mm_invsqrt_ps(rsq);
1075             r           = _mm_mul_ps(rsq, rinv);
1076
1077             /* Compute raj_inv aj1-4 */
1078             raj_inv     = gmx_mm_inv_ps(raj);
1079
1080             /* Evaluate influence of atom aj -> ai */
1081             t1            = _mm_add_ps(r, sk_aj);
1082             obc_mask1     = _mm_cmplt_ps(rai, t1);
1083             obc_mask1     = _mm_and_ps(obc_mask1, mask);
1084
1085             if (_mm_movemask_ps(obc_mask1))
1086             {
1087                 t2            = _mm_sub_ps(r, sk_aj);
1088                 t3            = _mm_sub_ps(sk_aj, r);
1089                 obc_mask2     = _mm_cmplt_ps(rai, t2);
1090                 obc_mask3     = _mm_cmplt_ps(rai, t3);
1091
1092                 uij           = gmx_mm_inv_ps(t1);
1093                 lij           = _mm_or_ps(   _mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
1094                                              _mm_andnot_ps(obc_mask2, rai_inv));
1095                 dlij           = _mm_and_ps(one, obc_mask2);
1096                 uij2           = _mm_mul_ps(uij, uij);
1097                 uij3           = _mm_mul_ps(uij2, uij);
1098                 lij2           = _mm_mul_ps(lij, lij);
1099                 lij3           = _mm_mul_ps(lij2, lij);
1100                 diff2          = _mm_sub_ps(uij2, lij2);
1101                 lij_inv        = gmx_mm_invsqrt_ps(lij2);
1102                 sk2_aj         = _mm_mul_ps(sk_aj, sk_aj);
1103                 sk2_rinv       = _mm_mul_ps(sk2_aj, rinv);
1104                 prod           = _mm_mul_ps(onefourth, sk2_rinv);
1105                 logterm        = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
1106                 t1             = _mm_sub_ps(lij, uij);
1107                 t2             = _mm_mul_ps(diff2,
1108                                             _mm_sub_ps(_mm_mul_ps(onefourth, r),
1109                                                        prod));
1110                 t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
1111                 t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
1112                 t4            = _mm_mul_ps(two, _mm_sub_ps(rai_inv, lij));
1113                 t4            = _mm_and_ps(t4, obc_mask3);
1114                 t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));
1115                 sum_ai        = _mm_add_ps(sum_ai, _mm_and_ps(t1, obc_mask1));
1116                 t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
1117                                            _mm_mul_ps(prod, lij3));
1118                 t1            = _mm_sub_ps(t1,
1119                                            _mm_mul_ps(onefourth,
1120                                                       _mm_add_ps(_mm_mul_ps(lij, rinv),
1121                                                                  _mm_mul_ps(lij3, r))));
1122                 t2            = _mm_mul_ps(onefourth,
1123                                            _mm_add_ps(_mm_mul_ps(uij, rinv),
1124                                                       _mm_mul_ps(uij3, r)));
1125                 t2            = _mm_sub_ps(t2,
1126                                            _mm_add_ps(_mm_mul_ps(half, uij2),
1127                                                       _mm_mul_ps(prod, uij3)));
1128                 t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
1129                                            _mm_mul_ps(rinv, rinv));
1130                 t3            = _mm_sub_ps(t3,
1131                                            _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
1132                                                       _mm_add_ps(one,
1133                                                                  _mm_mul_ps(sk2_rinv, rinv))));
1134                 t1            = _mm_mul_ps(rinv,
1135                                            _mm_add_ps(_mm_mul_ps(dlij, t1),
1136                                                       _mm_add_ps(t2, t3)));
1137                 dadx1         = _mm_and_ps(t1, obc_mask1);
1138             }
1139             else
1140             {
1141                 dadx1         = _mm_setzero_ps();
1142             }
1143
            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_ps(r, sk_ai);
            obc_mask1     = _mm_cmplt_ps(raj, t1);
            obc_mask1     = _mm_and_ps(obc_mask1, mask);

            if (_mm_movemask_ps(obc_mask1))
            {
                t2            = _mm_sub_ps(r, sk_ai);
                t3            = _mm_sub_ps(sk_ai, r);
                obc_mask2     = _mm_cmplt_ps(raj, t2);
                obc_mask3     = _mm_cmplt_ps(raj, t3);

                uij           = gmx_mm_inv_ps(t1);
                lij           = _mm_or_ps(_mm_and_ps(obc_mask2, gmx_mm_inv_ps(t2)),
                                          _mm_andnot_ps(obc_mask2, raj_inv));
                dlij          = _mm_and_ps(one, obc_mask2);
                uij2          = _mm_mul_ps(uij, uij);
                uij3          = _mm_mul_ps(uij2, uij);
                lij2          = _mm_mul_ps(lij, lij);
                lij3          = _mm_mul_ps(lij2, lij);
                diff2         = _mm_sub_ps(uij2, lij2);
                lij_inv       = gmx_mm_invsqrt_ps(lij2);
                sk2_rinv      = _mm_mul_ps(sk2_ai, rinv);
                prod          = _mm_mul_ps(onefourth, sk2_rinv);
                logterm       = gmx_mm_log_ps(_mm_mul_ps(uij, lij_inv));
                t1            = _mm_sub_ps(lij, uij);
                t2            = _mm_mul_ps(diff2,
                                           _mm_sub_ps(_mm_mul_ps(onefourth, r),
                                                      prod));
                t3            = _mm_mul_ps(half, _mm_mul_ps(rinv, logterm));
                t1            = _mm_add_ps(t1, _mm_add_ps(t2, t3));
                t4            = _mm_mul_ps(two, _mm_sub_ps(raj_inv, lij));
                t4            = _mm_and_ps(t4, obc_mask3);
                t1            = _mm_mul_ps(half, _mm_add_ps(t1, t4));

                tmp           = _mm_and_ps(t1, obc_mask1);

                t1            = _mm_add_ps(_mm_mul_ps(half, lij2),
                                           _mm_mul_ps(prod, lij3));
                t1            = _mm_sub_ps(t1,
                                           _mm_mul_ps(onefourth,
                                                      _mm_add_ps(_mm_mul_ps(lij, rinv),
                                                                 _mm_mul_ps(lij3, r))));
                t2            = _mm_mul_ps(onefourth,
                                           _mm_add_ps(_mm_mul_ps(uij, rinv),
                                                      _mm_mul_ps(uij3, r)));
                t2            = _mm_sub_ps(t2,
                                           _mm_add_ps(_mm_mul_ps(half, uij2),
                                                      _mm_mul_ps(prod, uij3)));
                t3            = _mm_mul_ps(_mm_mul_ps(onefourth, logterm),
                                           _mm_mul_ps(rinv, rinv));
                t3            = _mm_sub_ps(t3,
                                           _mm_mul_ps(_mm_mul_ps(diff2, oneeighth),
                                                      _mm_add_ps(one,
                                                                 _mm_mul_ps(sk2_rinv, rinv))));
                t1            = _mm_mul_ps(rinv,
                                           _mm_add_ps(_mm_mul_ps(dlij, t1),
                                                      _mm_add_ps(t2, t3)));
                dadx2         = _mm_and_ps(t1, obc_mask1);
            }
            else
            {
                dadx2         = _mm_setzero_ps();
                tmp           = _mm_setzero_ps();
            }

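            /* Store both chain-rule terms for this quad of j atoms: first the
             * four d(sum_ai)/dr components (dadx1), then the four
             * d(sum_aj)/dr components (dadx2).
             * calc_gb_chainrule_sse2_single() reads them back in the same
             * order. */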
            _mm_store_ps(dadx, dadx1);
            dadx += 4;
            _mm_store_ps(dadx, dadx2);
            dadx += 4;

            if (offset == 1)
            {
                GMX_MM_INCREMENT_1VALUE_PS(work+jnrA, tmp);
            }
            else if (offset == 2)
            {
                GMX_MM_INCREMENT_2VALUES_PS(work+jnrA, work+jnrB, tmp);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_INCREMENT_3VALUES_PS(work+jnrA, work+jnrB, work+jnrC, tmp);
            }

        }
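        /* Horizontally reduce the four partial sums in sum_ai and add the
         * result to this i atom's entry in work[] */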
        GMX_MM_UPDATE_1POT_PS(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

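    /* Turn the accumulated pair integrals in work[] into Born radii. For HCT
     * the inverse radius is used directly:
     *   1/bRad_i = 1/(rho_i - doffset) - work_i,
     * clamped so the radius never drops below the intrinsic radius rho_i.
     */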
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
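        /* OBC rescales the inverse radius through a tanh function: with
         *   psi_i = (rho_i - doffset) * work_i,
         * the Born radius follows from
         *   1/bRad_i = 1/(rho_i - doffset)
         *              - tanh(alpha*psi - beta*psi^2 + gamma*psi^3) / rho_i,
         * and drobc[i] caches the derivative of the tanh term that the
         * chain rule in the force calculation needs.
         */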
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}

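/* Second phase of the GB force evaluation: combines the d(sum)/dr terms
 * stored in dadx by the radius kernel with the algorithm-dependent
 * dV/dbRad factors in dvda to produce Cartesian forces on all neighbor
 * list pairs.
 */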
float calc_gb_chainrule_sse2_single(int natoms, t_nblist *nl, float *dadx, float *dvda,
                                    float *x, float *f, float *fshift, float *shiftvec,
                                    int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int          i, k, n, ii, jnr, ii3, is3, nj0, nj1, offset, n0, n1;
    int          jnrA, jnrB, jnrC, jnrD;
    int          j3A, j3B, j3C, j3D;
    int          jnrE, jnrF, jnrG, jnrH;
    int          j3E, j3F, j3G, j3H;
    int         *jjnr;

    float        rbi, shX, shY, shZ;
    float       *rb;

    __m128       ix, iy, iz;
    __m128       jx, jy, jz;
    __m128       jxB, jyB, jzB;
    __m128       fix, fiy, fiz;
    __m128       dx, dy, dz;
    __m128       tx, ty, tz;
    __m128       dxB, dyB, dzB;
    __m128       txB, tyB, tzB;

    __m128       rbai, rbaj, rbajB, f_gb, f_gb_ai, f_gbB, f_gb_aiB;
    __m128       xmm1, xmm2, xmm3;

    const __m128 two = _mm_set1_ps(2.0f);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Precompute the algorithm-dependent chain-rule prefactor rb[i] from
     * dvda, so the SSE loop below only has to multiply and accumulate. */
    offset = natoms%4;

    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jz = _mm_setzero_ps();

    n = j3A = j3B = j3C = j3D = 0;

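    /* For each pair (ii, jnr) the loop below effectively computes the scalar
     * force
     *     fgb = dadx_ij * rb[ii] + dadx_ji * rb[jnr]
     * and applies fgb * (xi - xj) with opposite signs to the two atoms,
     * handling four j particles per iteration.
     */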
    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_ps(shX+x[ii3+0]);
        iy     = _mm_set1_ps(shY+x[ii3+1]);
        iz     = _mm_set1_ps(shZ+x[ii3+2]);

        offset = (nj1-nj0)%4;

        rbai   = _mm_load1_ps(rb+ii);
        fix    = _mm_setzero_ps();
        fiy    = _mm_setzero_ps();
        fiz    = _mm_setzero_ps();

        for (k = nj0; k < nj1-offset; k += 4)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];
            jnrC        = jjnr[k+2];
            jnrD        = jjnr[k+3];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;
            j3C         = 3*jnrC;
            j3D         = 3*jnrD;

            GMX_MM_LOAD_1RVEC_4POINTERS_PS(x+j3A, x+j3B, x+j3C, x+j3D, jx, jy, jz);

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);

            GMX_MM_LOAD_4VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rb+jnrD, rbaj);

            /* load chain rule terms for j1-4 */
            f_gb        = _mm_load_ps(dadx);
            dadx       += 4;
            f_gb_ai     = _mm_load_ps(dadx);
            dadx       += 4;

            /* calculate scalar force */
            f_gb    = _mm_mul_ps(f_gb, rbai);
            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
            f_gb    = _mm_add_ps(f_gb, f_gb_ai);

            tx     = _mm_mul_ps(f_gb, dx);
            ty     = _mm_mul_ps(f_gb, dy);
            tz     = _mm_mul_ps(f_gb, dz);

            fix    = _mm_add_ps(fix, tx);
            fiy    = _mm_add_ps(fiy, ty);
            fiz    = _mm_add_ps(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(f+j3A, f+j3B, f+j3C, f+j3D, tx, ty, tz);
        }

        /* Deal with the remaining 1-3 j elements */
        if (offset != 0)
        {
            if (offset == 1)
            {
                jnrA        = jjnr[k];
                j3A         = 3*jnrA;
                GMX_MM_LOAD_1RVEC_1POINTER_PS(x+j3A, jx, jy, jz);
                GMX_MM_LOAD_1VALUE_PS(rb+jnrA, rbaj);
            }
            else if (offset == 2)
            {
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                GMX_MM_LOAD_1RVEC_2POINTERS_PS(x+j3A, x+j3B, jx, jy, jz);
                GMX_MM_LOAD_2VALUES_PS(rb+jnrA, rb+jnrB, rbaj);
            }
            else
            {
                /* offset must be 3 */
                jnrA        = jjnr[k];
                jnrB        = jjnr[k+1];
                jnrC        = jjnr[k+2];
                j3A         = 3*jnrA;
                j3B         = 3*jnrB;
                j3C         = 3*jnrC;
                GMX_MM_LOAD_1RVEC_3POINTERS_PS(x+j3A, x+j3B, x+j3C, jx, jy, jz);
                GMX_MM_LOAD_3VALUES_PS(rb+jnrA, rb+jnrB, rb+jnrC, rbaj);
            }

            dx          = _mm_sub_ps(ix, jx);
            dy          = _mm_sub_ps(iy, jy);
            dz          = _mm_sub_ps(iz, jz);
            /* load chain rule terms for the remaining 1-3 j particles */
            f_gb        = _mm_load_ps(dadx);
            dadx       += 4;
            f_gb_ai     = _mm_load_ps(dadx);
            dadx       += 4;

            /* calculate scalar force */
            f_gb    = _mm_mul_ps(f_gb, rbai);
            f_gb_ai = _mm_mul_ps(f_gb_ai, rbaj);
            f_gb    = _mm_add_ps(f_gb, f_gb_ai);

            tx     = _mm_mul_ps(f_gb, dx);
            ty     = _mm_mul_ps(f_gb, dy);
            tz     = _mm_mul_ps(f_gb, dz);

            fix    = _mm_add_ps(fix, tx);
            fiy    = _mm_add_ps(fiy, ty);
            fiz    = _mm_add_ps(fiz, tz);

            if (offset == 1)
            {
                GMX_MM_DECREMENT_1RVEC_1POINTER_PS(f+j3A, tx, ty, tz);
            }
            else if (offset == 2)
            {
                GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(f+j3A, f+j3B, tx, ty, tz);
            }
            else
            {
                /* offset must be 3 */
                GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(f+j3A, f+j3B, f+j3C, tx, ty, tz);
            }
        }

        /* fix/fiy/fiz now contain four partial force terms that should all be
         * added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_ps(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* keep compiler happy */
int genborn_sse_dummy;

#endif /* SSE intrinsics available */