1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
9  * Copyright (c) 2001-2009, The GROMACS Development Team
10  *
11  * Gromacs is a library for molecular simulation and trajectory analysis,
12  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
13  * a full list of developers and information, check out http://www.gromacs.org
14  *
15  * This program is free software; you can redistribute it and/or modify it under 
16  * the terms of the GNU Lesser General Public License as published by the Free 
17  * Software Foundation; either version 2 of the License, or (at your option) any 
18  * later version.
19  * As a special exception, you may use this file as part of a free software
20  * library without restriction.  Specifically, if other files instantiate
21  * templates or use macros or inline functions from this file, or you compile
22  * this file and link it with other files to produce an executable, this
23  * file does not by itself cause the resulting executable to be covered by
24  * the GNU Lesser General Public License.  
25  *
26  * In plain-speak: do not worry about classes/macros/templates either - only
27  * changes to the library have to be LGPL, not an application linking with it.
28  *
29  * To help fund GROMACS development, we humbly ask that you cite
30  * the papers people have written on it - you can find them on the website!
31  */
32
33 /* This is the innermost loop contents for the n vs n atom
34  * x86 SIMD kernels (SSE2/SSE4.1/AVX, single and double precision).
35  */
36
37
38 /* When calculating RF or Ewald interactions we calculate the electrostatic
39  * forces on excluded atom pairs here in the non-bonded loops.
40  * But when energies and/or the virial are required we calculate them
41  * separately afterwards, as it is then easier to separate the energy
42  * and virial contributions.
43  */
44 #if defined CHECK_EXCLS && defined CALC_COULOMB
45 #define EXCL_FORCES
46 #endif
47
48 #if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_X86_SSE4_1 && !defined COUNT_PAIRS && !(defined __GNUC__ && (defined CALC_COUL_TAB || (defined CALC_COUL_RF && defined GMX_MM128_HERE)))
49 /* Without exclusions and energies we only need to mask the cut-off;
50  * this is faster with blendv (only available with SSE4.1 and later).
51  * With gcc and PME or RF in 128-bit, blendv is slower;
52  * tested with gcc 4.6.2, 4.6.3 and 4.7.1.
53  */
54 #define CUTOFF_BLENDV
55 #endif
56
57         {
58             int        cj,aj,ajx,ajy,ajz;
59
60 #ifdef ENERGY_GROUPS
61             int        egps_j;
62             int        egp_jj[UNROLLJ>>1];
63             int        jj;
64 #endif
65
66 #ifdef CHECK_EXCLS
67             /* Interaction (non-exclusion) mask of all 1's or 0's */
68             gmx_mm_pr  int_SSE0;
69             gmx_mm_pr  int_SSE1;
70             gmx_mm_pr  int_SSE2;
71             gmx_mm_pr  int_SSE3;
72 #endif
73
74             gmx_mm_pr  jxSSE,jySSE,jzSSE;
75             gmx_mm_pr  dx_SSE0,dy_SSE0,dz_SSE0;
76             gmx_mm_pr  dx_SSE1,dy_SSE1,dz_SSE1;
77             gmx_mm_pr  dx_SSE2,dy_SSE2,dz_SSE2;
78             gmx_mm_pr  dx_SSE3,dy_SSE3,dz_SSE3;
79             gmx_mm_pr  tx_SSE0,ty_SSE0,tz_SSE0;
80             gmx_mm_pr  tx_SSE1,ty_SSE1,tz_SSE1;
81             gmx_mm_pr  tx_SSE2,ty_SSE2,tz_SSE2;
82             gmx_mm_pr  tx_SSE3,ty_SSE3,tz_SSE3;
83             gmx_mm_pr  rsq_SSE0,rinv_SSE0,rinvsq_SSE0;
84             gmx_mm_pr  rsq_SSE1,rinv_SSE1,rinvsq_SSE1;
85             gmx_mm_pr  rsq_SSE2,rinv_SSE2,rinvsq_SSE2;
86             gmx_mm_pr  rsq_SSE3,rinv_SSE3,rinvsq_SSE3;
87 #ifndef CUTOFF_BLENDV
88             /* wco: within cut-off, mask of all 1's or 0's */
89             gmx_mm_pr  wco_SSE0;
90             gmx_mm_pr  wco_SSE1;
91             gmx_mm_pr  wco_SSE2;
92             gmx_mm_pr  wco_SSE3;
93 #endif
94 #ifdef VDW_CUTOFF_CHECK
95             gmx_mm_pr  wco_vdw_SSE0;
96             gmx_mm_pr  wco_vdw_SSE1;
97 #ifndef HALF_LJ
98             gmx_mm_pr  wco_vdw_SSE2;
99             gmx_mm_pr  wco_vdw_SSE3;
100 #endif
101 #endif
102 #ifdef CALC_COULOMB
103 #ifdef CHECK_EXCLS
104             /* 1/r masked with the interaction mask */
105             gmx_mm_pr  rinv_ex_SSE0;
106             gmx_mm_pr  rinv_ex_SSE1;
107             gmx_mm_pr  rinv_ex_SSE2;
108             gmx_mm_pr  rinv_ex_SSE3;
109 #endif
110             gmx_mm_pr  jq_SSE;
111             gmx_mm_pr  qq_SSE0;
112             gmx_mm_pr  qq_SSE1;
113             gmx_mm_pr  qq_SSE2;
114             gmx_mm_pr  qq_SSE3;
115 #ifdef CALC_COUL_TAB
116             /* The force (PME mesh force) we need to subtract from 1/r^2 */
117             gmx_mm_pr  fsub_SSE0;
118             gmx_mm_pr  fsub_SSE1;
119             gmx_mm_pr  fsub_SSE2;
120             gmx_mm_pr  fsub_SSE3;
121 #endif
122             /* frcoul: the Coulomb force times r, e.g. qq*(1/r^2 - fsub)*r for tables */
123             gmx_mm_pr  frcoul_SSE0;
124             gmx_mm_pr  frcoul_SSE1;
125             gmx_mm_pr  frcoul_SSE2;
126             gmx_mm_pr  frcoul_SSE3;
127 #ifdef CALC_COUL_TAB
128             /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
129             gmx_mm_pr  r_SSE0,rs_SSE0,rf_SSE0,frac_SSE0;
130             gmx_mm_pr  r_SSE1,rs_SSE1,rf_SSE1,frac_SSE1;
131             gmx_mm_pr  r_SSE2,rs_SSE2,rf_SSE2,frac_SSE2;
132             gmx_mm_pr  r_SSE3,rs_SSE3,rf_SSE3,frac_SSE3;
133             /* Table index: rs converted to an int */ 
134 #if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
135             gmx_epi32  ti_SSE0,ti_SSE1,ti_SSE2,ti_SSE3;
136 #else
137             __m128i    ti_SSE0,ti_SSE1,ti_SSE2,ti_SSE3;
138 #endif
139             /* Linear force table values */
140             gmx_mm_pr  ctab0_SSE0,ctab1_SSE0;
141             gmx_mm_pr  ctab0_SSE1,ctab1_SSE1;
142             gmx_mm_pr  ctab0_SSE2,ctab1_SSE2;
143             gmx_mm_pr  ctab0_SSE3,ctab1_SSE3;
144 #ifdef CALC_ENERGIES
145             /* Quadratic energy table value */
146             gmx_mm_pr  ctabv_SSE0;
147             gmx_mm_pr  ctabv_SSE1;
148             gmx_mm_pr  ctabv_SSE2;
149             gmx_mm_pr  ctabv_SSE3;
150             /* The potential (PME mesh) we need to subtract from 1/r */
151             gmx_mm_pr  vc_sub_SSE0;
152             gmx_mm_pr  vc_sub_SSE1;
153             gmx_mm_pr  vc_sub_SSE2;
154             gmx_mm_pr  vc_sub_SSE3;
155 #endif
156 #endif
157 #ifdef CALC_ENERGIES
158             /* Electrostatic potential */
159             gmx_mm_pr  vcoul_SSE0;
160             gmx_mm_pr  vcoul_SSE1;
161             gmx_mm_pr  vcoul_SSE2;
162             gmx_mm_pr  vcoul_SSE3;
163 #endif
164 #endif
165             /* The force times 1/r */
166             gmx_mm_pr  fscal_SSE0;
167             gmx_mm_pr  fscal_SSE1;
168             gmx_mm_pr  fscal_SSE2;
169             gmx_mm_pr  fscal_SSE3;
170
171 #ifdef CALC_LJ
172 #ifdef LJ_COMB_LB
173             /* LJ sigma_j/2 and sqrt(epsilon_j) */
174             gmx_mm_pr  hsig_j_SSE,seps_j_SSE;
175             /* LJ sigma_ij and epsilon_ij */
176             gmx_mm_pr  sig_SSE0,eps_SSE0;
177             gmx_mm_pr  sig_SSE1,eps_SSE1;
178 #ifndef HALF_LJ
179             gmx_mm_pr  sig_SSE2,eps_SSE2;
180             gmx_mm_pr  sig_SSE3,eps_SSE3;
181 #endif
182 #ifdef CALC_ENERGIES
183             gmx_mm_pr  sig2_SSE0,sig6_SSE0;
184             gmx_mm_pr  sig2_SSE1,sig6_SSE1;
185 #ifndef HALF_LJ
186             gmx_mm_pr  sig2_SSE2,sig6_SSE2;
187             gmx_mm_pr  sig2_SSE3,sig6_SSE3;
188 #endif
189 #endif /* CALC_ENERGIES */
190 #endif /* LJ_COMB_LB */
191
192 #ifdef LJ_COMB_GEOM
193             gmx_mm_pr  c6s_j_SSE,c12s_j_SSE;
194 #endif
195
196 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
197             /* Index for loading LJ parameters, complicated when interleaving */
198             int         aj2;
199 #endif
200
201 #ifndef FIX_LJ_C
202             /* LJ C6 and C12 parameters, used with geometric comb. rule */
203             gmx_mm_pr  c6_SSE0,c12_SSE0;
204             gmx_mm_pr  c6_SSE1,c12_SSE1;
205 #ifndef HALF_LJ
206             gmx_mm_pr  c6_SSE2,c12_SSE2;
207             gmx_mm_pr  c6_SSE3,c12_SSE3;
208 #endif
209 #endif
210
211             /* Intermediate variables for LJ calculation */
212 #ifndef LJ_COMB_LB
213             gmx_mm_pr  rinvsix_SSE0;
214             gmx_mm_pr  rinvsix_SSE1;
215 #ifndef HALF_LJ
216             gmx_mm_pr  rinvsix_SSE2;
217             gmx_mm_pr  rinvsix_SSE3;
218 #endif
219 #endif
220 #ifdef LJ_COMB_LB
221             gmx_mm_pr  sir_SSE0,sir2_SSE0,sir6_SSE0;
222             gmx_mm_pr  sir_SSE1,sir2_SSE1,sir6_SSE1;
223 #ifndef HALF_LJ
224             gmx_mm_pr  sir_SSE2,sir2_SSE2,sir6_SSE2;
225             gmx_mm_pr  sir_SSE3,sir2_SSE3,sir6_SSE3;
226 #endif
227 #endif
228
229             gmx_mm_pr  FrLJ6_SSE0,FrLJ12_SSE0;
230             gmx_mm_pr  FrLJ6_SSE1,FrLJ12_SSE1;
231 #ifndef HALF_LJ
232             gmx_mm_pr  FrLJ6_SSE2,FrLJ12_SSE2;
233             gmx_mm_pr  FrLJ6_SSE3,FrLJ12_SSE3;
234 #endif
235 #ifdef CALC_ENERGIES
236             gmx_mm_pr  VLJ6_SSE0,VLJ12_SSE0,VLJ_SSE0;
237             gmx_mm_pr  VLJ6_SSE1,VLJ12_SSE1,VLJ_SSE1;
238 #ifndef HALF_LJ
239             gmx_mm_pr  VLJ6_SSE2,VLJ12_SSE2,VLJ_SSE2;
240             gmx_mm_pr  VLJ6_SSE3,VLJ12_SSE3,VLJ_SSE3;
241 #endif
242 #endif
243 #endif /* CALC_LJ */
244
245             /* j-cluster index */
246             cj            = l_cj[cjind].cj;
247
248             /* Atom indices (of the first atom in the cluster) */
249             aj            = cj*UNROLLJ;
250 #if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
251 #if UNROLLJ == STRIDE
252             aj2           = aj*2;
253 #else
254             aj2           = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
255 #endif
256 #endif
257 #if UNROLLJ == STRIDE
258             ajx           = aj*DIM;
259 #else
260             ajx           = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
261 #endif
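            /* The coordinates are assumed to be stored as STRIDE x values,
             * then STRIDE y, then STRIDE z per chunk. When UNROLLJ != STRIDE,
             * two j-clusters share one chunk: cj>>1 selects the chunk and
             * (cj & 1)*UNROLLJ the half within it.
             */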
262             ajy           = ajx + STRIDE;
263             ajz           = ajy + STRIDE;
264
265 #ifdef CHECK_EXCLS
266 #ifndef GMX_MM256_HERE
267             {
268                 /* Load integer interaction mask */
269                 __m128i mask_int = _mm_set1_epi32(l_cj[cjind].excl);
270
271                 /* There is no not-equal SSE instruction, so we need a not here */
272                 int_SSE0  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask0),zeroi_SSE));
273                 int_SSE1  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask1),zeroi_SSE));
274                 int_SSE2  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask2),zeroi_SSE));
275                 int_SSE3  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask3),zeroi_SSE));
276             }
277 #else
278             {
279 #ifndef GMX_DOUBLE
280                 /* Load integer interaction mask */
281                 /* With AVX there are no integer operations, so cast to real */
282                 gmx_mm_pr mask_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl));
283                 /* We can't compare all 4*8=32 float bits: shift the mask */
284                 gmx_mm_pr masksh_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl>>(2*UNROLLJ)));
285                 /* Intel Compiler version 12.1.3 20120130 is buggy: use cast.
286                  * With gcc we don't need the cast, but it's faster.
287                  */
288 #define cast_cvt(x)  _mm256_cvtepi32_ps(_mm256_castps_si256(x))
289                 int_SSE0  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask0)),zero_SSE);
290                 int_SSE1  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask1)),zero_SSE);
291                 int_SSE2  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr,mask0)),zero_SSE);
292                 int_SSE3  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr,mask1)),zero_SSE);
293 #undef cast_cvt
294 #else
295                 /* Load integer interaction mask */
296                 /* With AVX there are no integer operations,
297                  * and there is no int to double conversion, so cast to float
298                  */
299                 __m256 mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(l_cj[cjind].excl));
300 #define cast_cvt(x)  _mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castps_si256(x)))
301                 int_SSE0  = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask0)),zero_SSE);
302                 int_SSE1  = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask1)),zero_SSE);
303                 int_SSE2  = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask2)),zero_SSE);
304                 int_SSE3  = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask3)),zero_SSE);
305 #undef cast_cvt
306 #endif
307             }
308 #endif
309 #endif
310             /* load j atom coordinates */
311             jxSSE         = gmx_load_pr(x+ajx);
312             jySSE         = gmx_load_pr(x+ajy);
313             jzSSE         = gmx_load_pr(x+ajz);
314             
315             /* Calculate distance */
316             dx_SSE0       = gmx_sub_pr(ix_SSE0,jxSSE);
317             dy_SSE0       = gmx_sub_pr(iy_SSE0,jySSE);
318             dz_SSE0       = gmx_sub_pr(iz_SSE0,jzSSE);
319             dx_SSE1       = gmx_sub_pr(ix_SSE1,jxSSE);
320             dy_SSE1       = gmx_sub_pr(iy_SSE1,jySSE);
321             dz_SSE1       = gmx_sub_pr(iz_SSE1,jzSSE);
322             dx_SSE2       = gmx_sub_pr(ix_SSE2,jxSSE);
323             dy_SSE2       = gmx_sub_pr(iy_SSE2,jySSE);
324             dz_SSE2       = gmx_sub_pr(iz_SSE2,jzSSE);
325             dx_SSE3       = gmx_sub_pr(ix_SSE3,jxSSE);
326             dy_SSE3       = gmx_sub_pr(iy_SSE3,jySSE);
327             dz_SSE3       = gmx_sub_pr(iz_SSE3,jzSSE);
328             
329             /* rsq = dx*dx+dy*dy+dz*dz */
330             rsq_SSE0      = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
331             rsq_SSE1      = gmx_calc_rsq_pr(dx_SSE1,dy_SSE1,dz_SSE1);
332             rsq_SSE2      = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
333             rsq_SSE3      = gmx_calc_rsq_pr(dx_SSE3,dy_SSE3,dz_SSE3);
334
335 #ifndef CUTOFF_BLENDV
336             wco_SSE0      = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
337             wco_SSE1      = gmx_cmplt_pr(rsq_SSE1,rc2_SSE);
338             wco_SSE2      = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
339             wco_SSE3      = gmx_cmplt_pr(rsq_SSE3,rc2_SSE);
340 #endif
341
342 #ifdef CHECK_EXCLS
343 #ifdef EXCL_FORCES
344             /* Only remove the (sub-)diagonal to avoid double counting */
345 #if UNROLLJ == UNROLLI
346             if (cj == ci_sh)
347             {
348                 wco_SSE0  = gmx_and_pr(wco_SSE0,diag_SSE0);
349                 wco_SSE1  = gmx_and_pr(wco_SSE1,diag_SSE1);
350                 wco_SSE2  = gmx_and_pr(wco_SSE2,diag_SSE2);
351                 wco_SSE3  = gmx_and_pr(wco_SSE3,diag_SSE3);
352             }
353 #else
354 #if UNROLLJ < UNROLLI
355             if (cj == ci_sh*2)
356             {
357                 wco_SSE0  = gmx_and_pr(wco_SSE0,diag0_SSE0);
358                 wco_SSE1  = gmx_and_pr(wco_SSE1,diag0_SSE1);
359                 wco_SSE2  = gmx_and_pr(wco_SSE2,diag0_SSE2);
360                 wco_SSE3  = gmx_and_pr(wco_SSE3,diag0_SSE3);
361             }
362             if (cj == ci_sh*2 + 1)
363             { 
364                 wco_SSE0  = gmx_and_pr(wco_SSE0,diag1_SSE0);
365                 wco_SSE1  = gmx_and_pr(wco_SSE1,diag1_SSE1);
366                 wco_SSE2  = gmx_and_pr(wco_SSE2,diag1_SSE2);
367                 wco_SSE3  = gmx_and_pr(wco_SSE3,diag1_SSE3);
368             }
369 #else
370             if (cj*2 == ci_sh)
371             {
372                 wco_SSE0  = gmx_and_pr(wco_SSE0,diag0_SSE0);
373                 wco_SSE1  = gmx_and_pr(wco_SSE1,diag0_SSE1);
374                 wco_SSE2  = gmx_and_pr(wco_SSE2,diag0_SSE2);
375                 wco_SSE3  = gmx_and_pr(wco_SSE3,diag0_SSE3);
376             }
377             else if (cj*2 + 1 == ci_sh)
378             {
379                 wco_SSE0  = gmx_and_pr(wco_SSE0,diag1_SSE0);
380                 wco_SSE1  = gmx_and_pr(wco_SSE1,diag1_SSE1);
381                 wco_SSE2  = gmx_and_pr(wco_SSE2,diag1_SSE2);
382                 wco_SSE3  = gmx_and_pr(wco_SSE3,diag1_SSE3);
383             }
384 #endif
385 #endif
386 #else /* EXCL_FORCES */
387             /* Remove all excluded atom pairs from the list */
388             wco_SSE0      = gmx_and_pr(wco_SSE0,int_SSE0);
389             wco_SSE1      = gmx_and_pr(wco_SSE1,int_SSE1);
390             wco_SSE2      = gmx_and_pr(wco_SSE2,int_SSE2);
391             wco_SSE3      = gmx_and_pr(wco_SSE3,int_SSE3);
392 #endif
393 #endif
394
395 #ifdef COUNT_PAIRS
396             {
397                 int i,j;
398                 real tmp[UNROLLJ];
399                 for(i=0; i<UNROLLI; i++)
400                 {
401                     gmx_storeu_pr(tmp,i==0 ? wco_SSE0 : (i==1 ? wco_SSE1 : (i==2 ? wco_SSE2 : wco_SSE3)));
402                     for(j=0; j<UNROLLJ; j++)
403                     {
404                         if (tmp[j] != 0)
405                         {
406                             npair++;
407                         }
408                     }
409                 }
410             }
411 #endif
412
413 #ifdef CHECK_EXCLS
414             /* For excluded pairs add a small number to avoid r^-6 = NaN */
415             rsq_SSE0      = gmx_add_pr(rsq_SSE0,gmx_andnot_pr(int_SSE0,avoid_sing_SSE));
416             rsq_SSE1      = gmx_add_pr(rsq_SSE1,gmx_andnot_pr(int_SSE1,avoid_sing_SSE));
417             rsq_SSE2      = gmx_add_pr(rsq_SSE2,gmx_andnot_pr(int_SSE2,avoid_sing_SSE));
418             rsq_SSE3      = gmx_add_pr(rsq_SSE3,gmx_andnot_pr(int_SSE3,avoid_sing_SSE));
419 #endif
420
421             /* Calculate 1/r */
422 #ifndef GMX_DOUBLE
423             rinv_SSE0     = gmx_invsqrt_pr(rsq_SSE0);
424             rinv_SSE1     = gmx_invsqrt_pr(rsq_SSE1);
425             rinv_SSE2     = gmx_invsqrt_pr(rsq_SSE2);
426             rinv_SSE3     = gmx_invsqrt_pr(rsq_SSE3);
427 #else
428             GMX_MM_INVSQRT2_PD(rsq_SSE0,rsq_SSE1,rinv_SSE0,rinv_SSE1);
429             GMX_MM_INVSQRT2_PD(rsq_SSE2,rsq_SSE3,rinv_SSE2,rinv_SSE3);
430 #endif
431
432 #ifdef CALC_COULOMB
433             /* Load parameters for j atom */
434             jq_SSE        = gmx_load_pr(q+aj);
435             qq_SSE0       = gmx_mul_pr(iq_SSE0,jq_SSE);
436             qq_SSE1       = gmx_mul_pr(iq_SSE1,jq_SSE);
437             qq_SSE2       = gmx_mul_pr(iq_SSE2,jq_SSE);
438             qq_SSE3       = gmx_mul_pr(iq_SSE3,jq_SSE);
439 #endif
440
441 #ifdef CALC_LJ
442
443 #if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
444             load_lj_pair_params(nbfp0,type,aj,c6_SSE0,c12_SSE0);
445             load_lj_pair_params(nbfp1,type,aj,c6_SSE1,c12_SSE1);
446 #ifndef HALF_LJ
447             load_lj_pair_params(nbfp2,type,aj,c6_SSE2,c12_SSE2);
448             load_lj_pair_params(nbfp3,type,aj,c6_SSE3,c12_SSE3);
449 #endif
450 #endif /* not defined any LJ rule */
451
452 #ifdef LJ_COMB_GEOM
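            /* Geometric combination rule: C6_ij = sqrt(C6_i*C6_j) and likewise
             * for C12. The per-atom c6s/c12s values are assumed to hold the
             * square roots, so each pair parameter is a single multiply.
             */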
453             c6s_j_SSE     = gmx_load_pr(ljc+aj2+0);
454             c12s_j_SSE    = gmx_load_pr(ljc+aj2+STRIDE);
455             c6_SSE0       = gmx_mul_pr(c6s_SSE0 ,c6s_j_SSE );
456             c6_SSE1       = gmx_mul_pr(c6s_SSE1 ,c6s_j_SSE );
457 #ifndef HALF_LJ
458             c6_SSE2       = gmx_mul_pr(c6s_SSE2 ,c6s_j_SSE );
459             c6_SSE3       = gmx_mul_pr(c6s_SSE3 ,c6s_j_SSE );
460 #endif
461             c12_SSE0      = gmx_mul_pr(c12s_SSE0,c12s_j_SSE);
462             c12_SSE1      = gmx_mul_pr(c12s_SSE1,c12s_j_SSE);
463 #ifndef HALF_LJ
464             c12_SSE2      = gmx_mul_pr(c12s_SSE2,c12s_j_SSE);
465             c12_SSE3      = gmx_mul_pr(c12s_SSE3,c12s_j_SSE);
466 #endif
467 #endif /* LJ_COMB_GEOM */
468
469 #ifdef LJ_COMB_LB
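            /* Lorentz-Berthelot combination rule: sigma_ij = (sigma_i + sigma_j)/2
             * (hence the half sigmas) and epsilon_ij = sqrt(epsilon_i*epsilon_j)
             * (hence the sqrt epsilons).
             */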
470             hsig_j_SSE    = gmx_load_pr(ljc+aj2+0);
471             seps_j_SSE    = gmx_load_pr(ljc+aj2+STRIDE);
472
473             sig_SSE0      = gmx_add_pr(hsig_i_SSE0,hsig_j_SSE);
474             sig_SSE1      = gmx_add_pr(hsig_i_SSE1,hsig_j_SSE);
475             eps_SSE0      = gmx_mul_pr(seps_i_SSE0,seps_j_SSE);
476             eps_SSE1      = gmx_mul_pr(seps_i_SSE1,seps_j_SSE);
477 #ifndef HALF_LJ
478             sig_SSE2      = gmx_add_pr(hsig_i_SSE2,hsig_j_SSE);
479             sig_SSE3      = gmx_add_pr(hsig_i_SSE3,hsig_j_SSE);
480             eps_SSE2      = gmx_mul_pr(seps_i_SSE2,seps_j_SSE);
481             eps_SSE3      = gmx_mul_pr(seps_i_SSE3,seps_j_SSE);
482 #endif
483 #endif /* LJ_COMB_LB */
484
485 #endif /* CALC_LJ */
486
487 #ifndef CUTOFF_BLENDV
488             rinv_SSE0     = gmx_and_pr(rinv_SSE0,wco_SSE0);
489             rinv_SSE1     = gmx_and_pr(rinv_SSE1,wco_SSE1);
490             rinv_SSE2     = gmx_and_pr(rinv_SSE2,wco_SSE2);
491             rinv_SSE3     = gmx_and_pr(rinv_SSE3,wco_SSE3);
492 #else
493             /* We only need to mask for the cut-off: blendv is faster */
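            /* gmx_blendv_pr(a,b,c) is assumed to select b where the sign bit of c
             * is set; rc2 - rsq is negative beyond the cut-off, so 1/r is zeroed there.
             */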
494             rinv_SSE0     = gmx_blendv_pr(rinv_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0));
495             rinv_SSE1     = gmx_blendv_pr(rinv_SSE1,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE1));
496             rinv_SSE2     = gmx_blendv_pr(rinv_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2));
497             rinv_SSE3     = gmx_blendv_pr(rinv_SSE3,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE3));
498 #endif
499
500             rinvsq_SSE0   = gmx_mul_pr(rinv_SSE0,rinv_SSE0);
501             rinvsq_SSE1   = gmx_mul_pr(rinv_SSE1,rinv_SSE1);
502             rinvsq_SSE2   = gmx_mul_pr(rinv_SSE2,rinv_SSE2);
503             rinvsq_SSE3   = gmx_mul_pr(rinv_SSE3,rinv_SSE3);
504
505 #ifdef CALC_COULOMB
506             /* Note that here we calculate force*r, not the usual force/r.
507              * This avoids having to mask the reaction-field contribution,
508              * as frcoul is later multiplied by rinvsq, which has already been
509              * masked with the cut-off check.
510              */
511
512 #ifdef EXCL_FORCES
513             /* Only add 1/r for non-excluded atom pairs */
514             rinv_ex_SSE0  = gmx_and_pr(rinv_SSE0,int_SSE0);
515             rinv_ex_SSE1  = gmx_and_pr(rinv_SSE1,int_SSE1);
516             rinv_ex_SSE2  = gmx_and_pr(rinv_SSE2,int_SSE2);
517             rinv_ex_SSE3  = gmx_and_pr(rinv_SSE3,int_SSE3);
518 #else
519             /* No exclusion forces, we always need 1/r */
520 #define     rinv_ex_SSE0    rinv_SSE0
521 #define     rinv_ex_SSE1    rinv_SSE1
522 #define     rinv_ex_SSE2    rinv_SSE2
523 #define     rinv_ex_SSE3    rinv_SSE3
524 #endif
525
526 #ifdef CALC_COUL_RF
527             /* Electrostatic interactions */
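            /* A sketch of the reaction-field expressions, assuming the usual
             * constants mrc_3 = -2*k_rf, hrc_3 = k_rf and moh_rc = -c_rf:
             *   F(r)*r = qq*(1/r - 2*k_rf*r^2)
             *   V(r)   = qq*(1/r + k_rf*r^2 - c_rf)
             */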
528             frcoul_SSE0   = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(rsq_SSE0,mrc_3_SSE)));
529             frcoul_SSE1   = gmx_mul_pr(qq_SSE1,gmx_add_pr(rinv_ex_SSE1,gmx_mul_pr(rsq_SSE1,mrc_3_SSE)));
530             frcoul_SSE2   = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(rsq_SSE2,mrc_3_SSE)));
531             frcoul_SSE3   = gmx_mul_pr(qq_SSE3,gmx_add_pr(rinv_ex_SSE3,gmx_mul_pr(rsq_SSE3,mrc_3_SSE)));
532
533 #ifdef CALC_ENERGIES
534             vcoul_SSE0    = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_add_pr(gmx_mul_pr(rsq_SSE0,hrc_3_SSE),moh_rc_SSE)));
535             vcoul_SSE1    = gmx_mul_pr(qq_SSE1,gmx_add_pr(rinv_ex_SSE1,gmx_add_pr(gmx_mul_pr(rsq_SSE1,hrc_3_SSE),moh_rc_SSE)));
536             vcoul_SSE2    = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_add_pr(gmx_mul_pr(rsq_SSE2,hrc_3_SSE),moh_rc_SSE)));
537             vcoul_SSE3    = gmx_mul_pr(qq_SSE3,gmx_add_pr(rinv_ex_SSE3,gmx_add_pr(gmx_mul_pr(rsq_SSE3,hrc_3_SSE),moh_rc_SSE)));
538 #endif
539 #endif
540
541 #ifdef CALC_COUL_TAB
542             /* Electrostatic interactions */
543             r_SSE0        = gmx_mul_pr(rsq_SSE0,rinv_SSE0);
544             r_SSE1        = gmx_mul_pr(rsq_SSE1,rinv_SSE1);
545             r_SSE2        = gmx_mul_pr(rsq_SSE2,rinv_SSE2);
546             r_SSE3        = gmx_mul_pr(rsq_SSE3,rinv_SSE3);
547             /* Convert r to scaled table units */
548             rs_SSE0       = gmx_mul_pr(r_SSE0,invtsp_SSE);
549             rs_SSE1       = gmx_mul_pr(r_SSE1,invtsp_SSE);
550             rs_SSE2       = gmx_mul_pr(r_SSE2,invtsp_SSE);
551             rs_SSE3       = gmx_mul_pr(r_SSE3,invtsp_SSE);
552             /* Truncate scaled r to an int */
553             ti_SSE0       = gmx_cvttpr_epi32(rs_SSE0);
554             ti_SSE1       = gmx_cvttpr_epi32(rs_SSE1);
555             ti_SSE2       = gmx_cvttpr_epi32(rs_SSE2);
556             ti_SSE3       = gmx_cvttpr_epi32(rs_SSE3);
557 #ifdef GMX_X86_SSE4_1
558             /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
559             rf_SSE0       = gmx_floor_pr(rs_SSE0);
560             rf_SSE1       = gmx_floor_pr(rs_SSE1);
561             rf_SSE2       = gmx_floor_pr(rs_SSE2);
562             rf_SSE3       = gmx_floor_pr(rs_SSE3);
563 #else
564             rf_SSE0       = gmx_cvtepi32_pr(ti_SSE0);
565             rf_SSE1       = gmx_cvtepi32_pr(ti_SSE1);
566             rf_SSE2       = gmx_cvtepi32_pr(ti_SSE2);
567             rf_SSE3       = gmx_cvtepi32_pr(ti_SSE3);
568 #endif
569             frac_SSE0     = gmx_sub_pr(rs_SSE0,rf_SSE0);
570             frac_SSE1     = gmx_sub_pr(rs_SSE1,rf_SSE1);
571             frac_SSE2     = gmx_sub_pr(rs_SSE2,rf_SSE2);
572             frac_SSE3     = gmx_sub_pr(rs_SSE3,rf_SSE3);
573
574             /* Load and interpolate table forces and possibly energies.
575              * Force and energy can be combined in one table, stride 4: FDV0
576  * or in two separate tables with stride 1: F and V.
577  * Currently single precision uses FDV0, double precision uses F and V.
578              */
579 #ifndef CALC_ENERGIES
580             load_table_f(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0);
581             load_table_f(tab_coul_F,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1);
582             load_table_f(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2);
583             load_table_f(tab_coul_F,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3);
584 #else
585 #ifdef TAB_FDV0
586             load_table_f_v(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
587             load_table_f_v(tab_coul_F,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1,ctabv_SSE1);
588             load_table_f_v(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
589             load_table_f_v(tab_coul_F,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3,ctabv_SSE3);
590 #else
591             load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
592             load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1,ctabv_SSE1);
593             load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
594             load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3,ctabv_SSE3);
595 #endif
596 #endif
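            /* Linear interpolation of the tabulated force, assuming invtsp = 1/h
             * with h the table spacing and ctab1 the stored difference F[i+1]-F[i]
             * (the D of FDV0):  fsub ~= F[i] + frac*(F[i+1] - F[i]).
             * The energy below then follows by integrating this linear force,
             * assuming mhalfsp = -h/2:  vc_sub ~= V[i] - (h/2)*frac*(F[i] + fsub).
             */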
597             fsub_SSE0     = gmx_add_pr(ctab0_SSE0,gmx_mul_pr(frac_SSE0,ctab1_SSE0));
598             fsub_SSE1     = gmx_add_pr(ctab0_SSE1,gmx_mul_pr(frac_SSE1,ctab1_SSE1));
599             fsub_SSE2     = gmx_add_pr(ctab0_SSE2,gmx_mul_pr(frac_SSE2,ctab1_SSE2));
600             fsub_SSE3     = gmx_add_pr(ctab0_SSE3,gmx_mul_pr(frac_SSE3,ctab1_SSE3));
601             frcoul_SSE0   = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,gmx_mul_pr(fsub_SSE0,r_SSE0)));
602             frcoul_SSE1   = gmx_mul_pr(qq_SSE1,gmx_sub_pr(rinv_ex_SSE1,gmx_mul_pr(fsub_SSE1,r_SSE1)));
603             frcoul_SSE2   = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,gmx_mul_pr(fsub_SSE2,r_SSE2)));
604             frcoul_SSE3   = gmx_mul_pr(qq_SSE3,gmx_sub_pr(rinv_ex_SSE3,gmx_mul_pr(fsub_SSE3,r_SSE3)));
605
606 #ifdef CALC_ENERGIES
607             vc_sub_SSE0   = gmx_add_pr(ctabv_SSE0,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE0),gmx_add_pr(ctab0_SSE0,fsub_SSE0)));
608             vc_sub_SSE1   = gmx_add_pr(ctabv_SSE1,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE1),gmx_add_pr(ctab0_SSE1,fsub_SSE1)));
609             vc_sub_SSE2   = gmx_add_pr(ctabv_SSE2,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE2),gmx_add_pr(ctab0_SSE2,fsub_SSE2)));
610             vc_sub_SSE3   = gmx_add_pr(ctabv_SSE3,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE3),gmx_add_pr(ctab0_SSE3,fsub_SSE3)));
611
612 #ifndef NO_SHIFT_EWALD
613             /* Add Ewald potential shift to vc_sub for convenience */
614 #ifdef CHECK_EXCLS
615             vc_sub_SSE0   = gmx_add_pr(vc_sub_SSE0,gmx_and_pr(sh_ewald_SSE,int_SSE0));
616             vc_sub_SSE1   = gmx_add_pr(vc_sub_SSE1,gmx_and_pr(sh_ewald_SSE,int_SSE1));
617             vc_sub_SSE2   = gmx_add_pr(vc_sub_SSE2,gmx_and_pr(sh_ewald_SSE,int_SSE2));
618             vc_sub_SSE3   = gmx_add_pr(vc_sub_SSE3,gmx_and_pr(sh_ewald_SSE,int_SSE3));
619 #else
620             vc_sub_SSE0   = gmx_add_pr(vc_sub_SSE0,sh_ewald_SSE);
621             vc_sub_SSE1   = gmx_add_pr(vc_sub_SSE1,sh_ewald_SSE);
622             vc_sub_SSE2   = gmx_add_pr(vc_sub_SSE2,sh_ewald_SSE);
623             vc_sub_SSE3   = gmx_add_pr(vc_sub_SSE3,sh_ewald_SSE);
624 #endif
625 #endif
626             
627             vcoul_SSE0    = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,vc_sub_SSE0));
628             vcoul_SSE1    = gmx_mul_pr(qq_SSE1,gmx_sub_pr(rinv_ex_SSE1,vc_sub_SSE1));
629             vcoul_SSE2    = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,vc_sub_SSE2));
630             vcoul_SSE3    = gmx_mul_pr(qq_SSE3,gmx_sub_pr(rinv_ex_SSE3,vc_sub_SSE3));
631
632 #endif
633 #endif
634
635 #ifdef CALC_ENERGIES
636             /* Mask energy for cut-off and diagonal */
637             vcoul_SSE0    = gmx_and_pr(vcoul_SSE0,wco_SSE0);
638             vcoul_SSE1    = gmx_and_pr(vcoul_SSE1,wco_SSE1);
639             vcoul_SSE2    = gmx_and_pr(vcoul_SSE2,wco_SSE2);
640             vcoul_SSE3    = gmx_and_pr(vcoul_SSE3,wco_SSE3);
641 #endif
642
643 #endif /* CALC_COULOMB */
644
645 #ifdef CALC_LJ
646             /* Lennard-Jones interaction */
647
648 #ifdef VDW_CUTOFF_CHECK
649             wco_vdw_SSE0  = gmx_cmplt_pr(rsq_SSE0,rcvdw2_SSE);
650             wco_vdw_SSE1  = gmx_cmplt_pr(rsq_SSE1,rcvdw2_SSE);
651 #ifndef HALF_LJ
652             wco_vdw_SSE2  = gmx_cmplt_pr(rsq_SSE2,rcvdw2_SSE);
653             wco_vdw_SSE3  = gmx_cmplt_pr(rsq_SSE3,rcvdw2_SSE);
654 #endif
655 #else
656             /* Same cut-off for Coulomb and VdW, reuse the registers */
657 #define     wco_vdw_SSE0    wco_SSE0
658 #define     wco_vdw_SSE1    wco_SSE1
659 #define     wco_vdw_SSE2    wco_SSE2
660 #define     wco_vdw_SSE3    wco_SSE3
661 #endif
662
663 #ifndef LJ_COMB_LB
664             rinvsix_SSE0  = gmx_mul_pr(rinvsq_SSE0,gmx_mul_pr(rinvsq_SSE0,rinvsq_SSE0));
665             rinvsix_SSE1  = gmx_mul_pr(rinvsq_SSE1,gmx_mul_pr(rinvsq_SSE1,rinvsq_SSE1));
666 #ifdef EXCL_FORCES
667             rinvsix_SSE0  = gmx_and_pr(rinvsix_SSE0,int_SSE0);
668             rinvsix_SSE1  = gmx_and_pr(rinvsix_SSE1,int_SSE1);
669 #endif
670 #ifndef HALF_LJ
671             rinvsix_SSE2  = gmx_mul_pr(rinvsq_SSE2,gmx_mul_pr(rinvsq_SSE2,rinvsq_SSE2));
672             rinvsix_SSE3  = gmx_mul_pr(rinvsq_SSE3,gmx_mul_pr(rinvsq_SSE3,rinvsq_SSE3));
673 #ifdef EXCL_FORCES
674             rinvsix_SSE2  = gmx_and_pr(rinvsix_SSE2,int_SSE2);
675             rinvsix_SSE3  = gmx_and_pr(rinvsix_SSE3,int_SSE3);
676 #endif
677 #endif
678 #ifdef VDW_CUTOFF_CHECK
679             rinvsix_SSE0  = gmx_and_pr(rinvsix_SSE0,wco_vdw_SSE0);
680             rinvsix_SSE1  = gmx_and_pr(rinvsix_SSE1,wco_vdw_SSE1);
681 #ifndef HALF_LJ
682             rinvsix_SSE2  = gmx_and_pr(rinvsix_SSE2,wco_vdw_SSE2);
683             rinvsix_SSE3  = gmx_and_pr(rinvsix_SSE3,wco_vdw_SSE3);
684 #endif
685 #endif
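            /* A sketch of the force terms, assuming the 6 and 12 prefactors are
             * already folded into the stored c6/c12 (c6 = 6*C6, c12 = 12*C12):
             *   FrLJ12 - FrLJ6 = 12*C12/r^12 - 6*C6/r^6 = F(r)*r,
             * which the rinvsq factor in fscal below turns into F/r.
             */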
686             FrLJ6_SSE0    = gmx_mul_pr(c6_SSE0,rinvsix_SSE0);
687             FrLJ6_SSE1    = gmx_mul_pr(c6_SSE1,rinvsix_SSE1);
688 #ifndef HALF_LJ
689             FrLJ6_SSE2    = gmx_mul_pr(c6_SSE2,rinvsix_SSE2);
690             FrLJ6_SSE3    = gmx_mul_pr(c6_SSE3,rinvsix_SSE3);
691 #endif
692             FrLJ12_SSE0   = gmx_mul_pr(c12_SSE0,gmx_mul_pr(rinvsix_SSE0,rinvsix_SSE0));
693             FrLJ12_SSE1   = gmx_mul_pr(c12_SSE1,gmx_mul_pr(rinvsix_SSE1,rinvsix_SSE1));
694 #ifndef HALF_LJ
695             FrLJ12_SSE2   = gmx_mul_pr(c12_SSE2,gmx_mul_pr(rinvsix_SSE2,rinvsix_SSE2));
696             FrLJ12_SSE3   = gmx_mul_pr(c12_SSE3,gmx_mul_pr(rinvsix_SSE3,rinvsix_SSE3));
697 #endif
698 #endif /* not LJ_COMB_LB */
699
700 #ifdef LJ_COMB_LB
701             sir_SSE0      = gmx_mul_pr(sig_SSE0,rinv_SSE0);
702             sir_SSE1      = gmx_mul_pr(sig_SSE1,rinv_SSE1);
703 #ifndef HALF_LJ
704             sir_SSE2      = gmx_mul_pr(sig_SSE2,rinv_SSE2);
705             sir_SSE3      = gmx_mul_pr(sig_SSE3,rinv_SSE3);
706 #endif
707             sir2_SSE0     = gmx_mul_pr(sir_SSE0,sir_SSE0);
708             sir2_SSE1     = gmx_mul_pr(sir_SSE1,sir_SSE1);
709 #ifndef HALF_LJ
710             sir2_SSE2     = gmx_mul_pr(sir_SSE2,sir_SSE2);
711             sir2_SSE3     = gmx_mul_pr(sir_SSE3,sir_SSE3);
712 #endif
713             sir6_SSE0     = gmx_mul_pr(sir2_SSE0,gmx_mul_pr(sir2_SSE0,sir2_SSE0));
714             sir6_SSE1     = gmx_mul_pr(sir2_SSE1,gmx_mul_pr(sir2_SSE1,sir2_SSE1));
715 #ifdef EXCL_FORCES
716             sir6_SSE0     = gmx_and_pr(sir6_SSE0,int_SSE0);
717             sir6_SSE1     = gmx_and_pr(sir6_SSE1,int_SSE1);
718 #endif
719 #ifndef HALF_LJ
720             sir6_SSE2     = gmx_mul_pr(sir2_SSE2,gmx_mul_pr(sir2_SSE2,sir2_SSE2));
721             sir6_SSE3     = gmx_mul_pr(sir2_SSE3,gmx_mul_pr(sir2_SSE3,sir2_SSE3));
722 #ifdef EXCL_FORCES
723             sir6_SSE2     = gmx_and_pr(sir6_SSE2,int_SSE2);
724             sir6_SSE3     = gmx_and_pr(sir6_SSE3,int_SSE3);
725 #endif
726 #endif
727 #ifdef VDW_CUTOFF_CHECK
728             sir6_SSE0     = gmx_and_pr(sir6_SSE0,wco_vdw_SSE0);
729             sir6_SSE1     = gmx_and_pr(sir6_SSE1,wco_vdw_SSE1);
730 #ifndef HALF_LJ
731             sir6_SSE2     = gmx_and_pr(sir6_SSE2,wco_vdw_SSE2);
732             sir6_SSE3     = gmx_and_pr(sir6_SSE3,wco_vdw_SSE3);
733 #endif
734 #endif
735             FrLJ6_SSE0    = gmx_mul_pr(eps_SSE0,sir6_SSE0);
736             FrLJ6_SSE1    = gmx_mul_pr(eps_SSE1,sir6_SSE1);
737 #ifndef HALF_LJ
738             FrLJ6_SSE2    = gmx_mul_pr(eps_SSE2,sir6_SSE2);
739             FrLJ6_SSE3    = gmx_mul_pr(eps_SSE3,sir6_SSE3);
740 #endif
741             FrLJ12_SSE0   = gmx_mul_pr(FrLJ6_SSE0,sir6_SSE0);
742             FrLJ12_SSE1   = gmx_mul_pr(FrLJ6_SSE1,sir6_SSE1);
743 #ifndef HALF_LJ
744             FrLJ12_SSE2   = gmx_mul_pr(FrLJ6_SSE2,sir6_SSE2);
745             FrLJ12_SSE3   = gmx_mul_pr(FrLJ6_SSE3,sir6_SSE3);
746 #endif
747 #if defined CALC_ENERGIES
748             /* We need C6 and C12 to calculate the LJ potential shift */
749             sig2_SSE0     = gmx_mul_pr(sig_SSE0,sig_SSE0);
750             sig2_SSE1     = gmx_mul_pr(sig_SSE1,sig_SSE1);
751 #ifndef HALF_LJ
752             sig2_SSE2     = gmx_mul_pr(sig_SSE2,sig_SSE2);
753             sig2_SSE3     = gmx_mul_pr(sig_SSE3,sig_SSE3);
754 #endif
755             sig6_SSE0     = gmx_mul_pr(sig2_SSE0,gmx_mul_pr(sig2_SSE0,sig2_SSE0));
756             sig6_SSE1     = gmx_mul_pr(sig2_SSE1,gmx_mul_pr(sig2_SSE1,sig2_SSE1));
757 #ifndef HALF_LJ
758             sig6_SSE2     = gmx_mul_pr(sig2_SSE2,gmx_mul_pr(sig2_SSE2,sig2_SSE2));
759             sig6_SSE3     = gmx_mul_pr(sig2_SSE3,gmx_mul_pr(sig2_SSE3,sig2_SSE3));
760 #endif
761             c6_SSE0       = gmx_mul_pr(eps_SSE0,sig6_SSE0);
762             c6_SSE1       = gmx_mul_pr(eps_SSE1,sig6_SSE1);
763 #ifndef HALF_LJ
764             c6_SSE2       = gmx_mul_pr(eps_SSE2,sig6_SSE2);
765             c6_SSE3       = gmx_mul_pr(eps_SSE3,sig6_SSE3);
766 #endif
767             c12_SSE0      = gmx_mul_pr(c6_SSE0,sig6_SSE0);
768             c12_SSE1      = gmx_mul_pr(c6_SSE1,sig6_SSE1);
769 #ifndef HALF_LJ
770             c12_SSE2      = gmx_mul_pr(c6_SSE2,sig6_SSE2);
771             c12_SSE3      = gmx_mul_pr(c6_SSE3,sig6_SSE3);
772 #endif
773 #endif
774 #endif /* LJ_COMB_LB */
775
776 #endif /* CALC_LJ */
777             
778 #ifdef CALC_ENERGIES
779 #ifdef ENERGY_GROUPS
780             /* Extract the group pair index per j pair */
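            /* nbat->energrp is assumed to pack egps_jshift-bit energy-group indices
             * for the atoms of a j-cluster into one int; each is converted to an
             * offset (times egps_jstride) into the per-i-group energy buffers.
             */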
781 #if UNROLLJ == 2
782             egps_j        = nbat->energrp[cj>>1];
783             egp_jj[0]     = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
784 #else
785             egps_j        = nbat->energrp[cj];
786             for(jj=0; jj<(UNROLLJ>>1); jj++)
787             {
788                 egp_jj[jj]  = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
789             }
790 #endif
791 #endif
792
793 #ifdef CALC_COULOMB
794 #ifndef ENERGY_GROUPS
795             vctotSSE      = gmx_add_pr(vctotSSE, gmx_sum4_pr(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3));
796 #else
797             add_ener_grp(vcoul_SSE0,vctp[0],egp_jj);
798             add_ener_grp(vcoul_SSE1,vctp[1],egp_jj);
799             add_ener_grp(vcoul_SSE2,vctp[2],egp_jj);
800             add_ener_grp(vcoul_SSE3,vctp[3],egp_jj);
801 #endif
802 #endif
803
804 #ifdef CALC_LJ
805             /* Calculate the LJ energies */
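            /* A sketch, under the same assumption that c6 = 6*C6 and c12 = 12*C12,
             * with sh_invrc6 = 1/rc^6 and sh_invrc12 = 1/rc^12:
             *   VLJ = C12*(1/r^12 - 1/rc^12) - C6*(1/r^6 - 1/rc^6),
             * i.e. the cut-off-shifted Lennard-Jones potential.
             */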
806             VLJ6_SSE0     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE0,gmx_mul_pr(c6_SSE0,sh_invrc6_SSE)));
807             VLJ6_SSE1     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE1,gmx_mul_pr(c6_SSE1,sh_invrc6_SSE)));
808 #ifndef HALF_LJ
809             VLJ6_SSE2     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE2,gmx_mul_pr(c6_SSE2,sh_invrc6_SSE)));
810             VLJ6_SSE3     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE3,gmx_mul_pr(c6_SSE3,sh_invrc6_SSE)));
811 #endif
812             VLJ12_SSE0    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE0,gmx_mul_pr(c12_SSE0,sh_invrc12_SSE)));
813             VLJ12_SSE1    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE1,gmx_mul_pr(c12_SSE1,sh_invrc12_SSE)));
814 #ifndef HALF_LJ
815             VLJ12_SSE2    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE2,gmx_mul_pr(c12_SSE2,sh_invrc12_SSE)));
816             VLJ12_SSE3    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE3,gmx_mul_pr(c12_SSE3,sh_invrc12_SSE)));
817 #endif
818
819             VLJ_SSE0      = gmx_sub_pr(VLJ12_SSE0,VLJ6_SSE0);
820             VLJ_SSE1      = gmx_sub_pr(VLJ12_SSE1,VLJ6_SSE1);
821 #ifndef HALF_LJ
822             VLJ_SSE2      = gmx_sub_pr(VLJ12_SSE2,VLJ6_SSE2);
823             VLJ_SSE3      = gmx_sub_pr(VLJ12_SSE3,VLJ6_SSE3);
824 #endif
825             /* The potential shift should be removed for pairs beyond cut-off */
826             VLJ_SSE0      = gmx_and_pr(VLJ_SSE0,wco_vdw_SSE0);
827             VLJ_SSE1      = gmx_and_pr(VLJ_SSE1,wco_vdw_SSE1);
828 #ifndef HALF_LJ
829             VLJ_SSE2      = gmx_and_pr(VLJ_SSE2,wco_vdw_SSE2);
830             VLJ_SSE3      = gmx_and_pr(VLJ_SSE3,wco_vdw_SSE3);
831 #endif
832 #ifdef CHECK_EXCLS
833             /* The potential shift should be removed for excluded pairs */
834             VLJ_SSE0      = gmx_and_pr(VLJ_SSE0,int_SSE0);
835             VLJ_SSE1      = gmx_and_pr(VLJ_SSE1,int_SSE1);
836 #ifndef HALF_LJ
837             VLJ_SSE2      = gmx_and_pr(VLJ_SSE2,int_SSE2);
838             VLJ_SSE3      = gmx_and_pr(VLJ_SSE3,int_SSE3);
839 #endif
840 #endif
841 #ifndef ENERGY_GROUPS
842             VvdwtotSSE    = gmx_add_pr(VvdwtotSSE,
843 #ifndef HALF_LJ
844                                        gmx_sum4_pr(VLJ_SSE0,VLJ_SSE1,VLJ_SSE2,VLJ_SSE3)
845 #else
846                                        gmx_add_pr(VLJ_SSE0,VLJ_SSE1)
847 #endif
848                                       );
849 #else
850             add_ener_grp(VLJ_SSE0,vvdwtp[0],egp_jj);
851             add_ener_grp(VLJ_SSE1,vvdwtp[1],egp_jj);
852 #ifndef HALF_LJ
853             add_ener_grp(VLJ_SSE2,vvdwtp[2],egp_jj);
854             add_ener_grp(VLJ_SSE3,vvdwtp[3],egp_jj);
855 #endif
856 #endif
857 #endif /* CALC_LJ */
858 #endif /* CALC_ENERGIES */
859
860 #ifdef CALC_LJ
861             fscal_SSE0    = gmx_mul_pr(rinvsq_SSE0,
862 #ifdef CALC_COULOMB
863                                                    gmx_add_pr(frcoul_SSE0,
864 #else
865                                                    (
866 #endif
867                                                     gmx_sub_pr(FrLJ12_SSE0,FrLJ6_SSE0)));
868             fscal_SSE1    = gmx_mul_pr(rinvsq_SSE1,
869 #ifdef CALC_COULOMB
870                                                    gmx_add_pr(frcoul_SSE1,
871 #else
872                                                    (
873 #endif
874                                                     gmx_sub_pr(FrLJ12_SSE1,FrLJ6_SSE1)));
875 #else
876             fscal_SSE0    = gmx_mul_pr(rinvsq_SSE0,frcoul_SSE0);
877             fscal_SSE1    = gmx_mul_pr(rinvsq_SSE1,frcoul_SSE1);
878 #endif /* CALC_LJ */
879 #if defined CALC_LJ && !defined HALF_LJ
880             fscal_SSE2    = gmx_mul_pr(rinvsq_SSE2,
881 #ifdef CALC_COULOMB
882                                                    gmx_add_pr(frcoul_SSE2,
883 #else
884                                                    (
885 #endif
886                                                     gmx_sub_pr(FrLJ12_SSE2,FrLJ6_SSE2)));
887             fscal_SSE3    = gmx_mul_pr(rinvsq_SSE3,
888 #ifdef CALC_COULOMB
889                                                    gmx_add_pr(frcoul_SSE3,
890 #else
891                                                    (
892 #endif
893                                                     gmx_sub_pr(FrLJ12_SSE3,FrLJ6_SSE3)));
894 #else
895             /* Atoms 2 and 3 don't have LJ, so only add Coulomb forces */
896             fscal_SSE2    = gmx_mul_pr(rinvsq_SSE2,frcoul_SSE2);
897             fscal_SSE3    = gmx_mul_pr(rinvsq_SSE3,frcoul_SSE3);
898 #endif
899             
900             /* Calculate temporary vectorial force */
901             tx_SSE0       = gmx_mul_pr(fscal_SSE0,dx_SSE0);
902             tx_SSE1       = gmx_mul_pr(fscal_SSE1,dx_SSE1);
903             tx_SSE2       = gmx_mul_pr(fscal_SSE2,dx_SSE2);
904             tx_SSE3       = gmx_mul_pr(fscal_SSE3,dx_SSE3);
905             ty_SSE0       = gmx_mul_pr(fscal_SSE0,dy_SSE0);
906             ty_SSE1       = gmx_mul_pr(fscal_SSE1,dy_SSE1);
907             ty_SSE2       = gmx_mul_pr(fscal_SSE2,dy_SSE2);
908             ty_SSE3       = gmx_mul_pr(fscal_SSE3,dy_SSE3);
909             tz_SSE0       = gmx_mul_pr(fscal_SSE0,dz_SSE0);
910             tz_SSE1       = gmx_mul_pr(fscal_SSE1,dz_SSE1);
911             tz_SSE2       = gmx_mul_pr(fscal_SSE2,dz_SSE2);
912             tz_SSE3       = gmx_mul_pr(fscal_SSE3,dz_SSE3);
913             
914             /* Increment i atom force */
915             fix_SSE0      = gmx_add_pr(fix_SSE0,tx_SSE0);
916             fix_SSE1      = gmx_add_pr(fix_SSE1,tx_SSE1);
917             fix_SSE2      = gmx_add_pr(fix_SSE2,tx_SSE2);
918             fix_SSE3      = gmx_add_pr(fix_SSE3,tx_SSE3);
919             fiy_SSE0      = gmx_add_pr(fiy_SSE0,ty_SSE0);
920             fiy_SSE1      = gmx_add_pr(fiy_SSE1,ty_SSE1);
921             fiy_SSE2      = gmx_add_pr(fiy_SSE2,ty_SSE2);
922             fiy_SSE3      = gmx_add_pr(fiy_SSE3,ty_SSE3);
923             fiz_SSE0      = gmx_add_pr(fiz_SSE0,tz_SSE0);
924             fiz_SSE1      = gmx_add_pr(fiz_SSE1,tz_SSE1);
925             fiz_SSE2      = gmx_add_pr(fiz_SSE2,tz_SSE2);
926             fiz_SSE3      = gmx_add_pr(fiz_SSE3,tz_SSE3);
927             
928             /* Decrement j atom force */
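            /* gmx_sum4_pr adds the contributions of the four i atoms, so each
             * j force component is updated with a single SIMD load and store.
             */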
929             gmx_store_pr(f+ajx,
930                          gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
931             gmx_store_pr(f+ajy,
932                          gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
933             gmx_store_pr(f+ajz,
934                          gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
935         }
936
937 #undef  rinv_ex_SSE0
938 #undef  rinv_ex_SSE1
939 #undef  rinv_ex_SSE2
940 #undef  rinv_ex_SSE3
941
942 #undef  wco_vdw_SSE0
943 #undef  wco_vdw_SSE1
944 #undef  wco_vdw_SSE2
945 #undef  wco_vdw_SSE3
946
947 #undef  CUTOFF_BLENDV
948
949 #undef  EXCL_FORCES