src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2009, The GROMACS Development Team
   6  * Copyright (c) 2012, by the GROMACS development team, led by
   7  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   8  * others, as listed in the AUTHORS file in the top-level source
   9  * directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38 /* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
  39 #include "gmx_simd_macros.h"
  40
  41 #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
  42
  43 #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
  44 #define UNROLLJ    GMX_SIMD_WIDTH_HERE
  45
  46 #if defined GMX_MM128_HERE || defined GMX_DOUBLE
  47 #define STRIDE     4
  48 #endif
  49 #if defined GMX_MM256_HERE && !defined GMX_DOUBLE
  50 #define STRIDE     8
  51 #endif
  52
  53 #ifdef GMX_MM128_HERE
  54 #ifndef GMX_DOUBLE
  55 /* single precision 4x4 kernel */
  56 #define SUM_SIMD(x) SUM_SIMD4(x)
  57 #define TAB_FDV0
  58 #else
  59 /* double precision 4x2 kernel */
  60 #define SUM_SIMD(x) (x[0]+x[1])
  61 #endif
  62 #endif
  63
  64 #ifdef GMX_MM256_HERE
  65 #ifndef GMX_DOUBLE
  66 /* single precision 4x8 kernel */
  67 #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
  68 #define TAB_FDV0
  69 #else
  70 /* double precision 4x4 kernel */
  71 #define SUM_SIMD(x) SUM_SIMD4(x)
  72 #endif
  73 #endif
  74
  75 #define SIMD_MASK_ALL   0xffffffff
  76
  77 #include "nbnxn_kernel_simd_utils.h"
  78
  79 /* All functionality defines are set here, except for:
  80  * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
  81  * CHECK_EXCLS, which is set just before including the inner loop contents.
  82  * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB are currently
  83  * set before calling the kernel function. We might want to move that
  84  * to inside the n-loop and have a different combination rule for different
  85  * ci's, as no combination rule gives a 50% performance hit for LJ.
  86  */
  87
  88 /* We always calculate shift forces, because it's cheap anyhow */
  89 #define CALC_SHIFTFORCES
  90
  91 /* Assumes all LJ parameters are identical */
  92 /* #define FIX_LJ_C */
  93
  94 /* The NBK_FUNC_NAME... macros below generate the whole zoo of kernels names
  95  * with all combinations off electrostatics (coul), LJ combination rules (ljc)
  96  * and energy calculations (ene), depending on the defines set.
  97  */
  98
  99 #define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene
 100
 101 #if defined LJ_COMB_GEOM
 102 #define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
 103 #else
 104 #if defined LJ_COMB_LB
 105 #define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
 106 #else
 107 #define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
 108 #endif
 109 #endif
 110
 111 #ifdef CALC_COUL_RF
 112 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
 113 #endif
 114 #ifdef CALC_COUL_TAB
 115 #ifndef VDW_CUTOFF_CHECK
 116 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
 117 #else
 118 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
 119 #endif
 120 #endif
 121 #ifdef CALC_COUL_EWALD
 122 #ifndef VDW_CUTOFF_CHECK
 123 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
 124 #else
 125 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
 126 #endif
 127 #endif
 128
 129 static void
 130 #ifndef CALC_ENERGIES
 131 NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, noener)
 132 #else
 133 #ifndef ENERGY_GROUPS
 134 NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, ener)
 135 #else
 136 NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, energrp)
 137 #endif
 138 #endif
 139 #undef NBK_FUNC_NAME
 140 #undef NBK_FUNC_NAME_C
 141 #undef NBK_FUNC_NAME_C_LJC
 142 (const nbnxn_pairlist_t     *nbl,
 143  const nbnxn_atomdata_t     *nbat,
 144  const interaction_const_t  *ic,
 145  rvec                       *shift_vec,
 146  real                       *f
 147 #ifdef CALC_SHIFTFORCES
 148  ,
 149  real                       *fshift
 150 #endif
 151 #ifdef CALC_ENERGIES
 152  ,
 153  real                       *Vvdw,
 154  real                       *Vc
 155 #endif
 156 )
 157 {
 158     const nbnxn_ci_t   *nbln;
 159     const nbnxn_cj_t   *l_cj;
 160     const int          *type;
 161     const real         *q;
 162     const real         *shiftvec;
 163     const real         *x;
 164     const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
 165     real                facel;
 166     real               *nbfp_ptr;
 167     int                 nbfp_stride;
 168     int                 n, ci, ci_sh;
 169     int                 ish, ish3;
 170     gmx_bool            do_LJ, half_LJ, do_coul;
 171     int                 sci, scix, sciy, sciz, sci2;
 172     int                 cjind0, cjind1, cjind;
 173     int                 ip, jp;
 174
 175 #ifdef ENERGY_GROUPS
 176     int         Vstride_i;
 177     int         egps_ishift, egps_imask;
 178     int         egps_jshift, egps_jmask, egps_jstride;
 179     int         egps_i;
 180     real       *vvdwtp[UNROLLI];
 181     real       *vctp[UNROLLI];
 182 #endif
 183
 184     gmx_mm_pr  shX_SSE;
 185     gmx_mm_pr  shY_SSE;
 186     gmx_mm_pr  shZ_SSE;
 187     gmx_mm_pr  ix_SSE0, iy_SSE0, iz_SSE0;
 188     gmx_mm_pr  ix_SSE1, iy_SSE1, iz_SSE1;
 189     gmx_mm_pr  ix_SSE2, iy_SSE2, iz_SSE2;
 190     gmx_mm_pr  ix_SSE3, iy_SSE3, iz_SSE3;
 191     gmx_mm_pr  fix_SSE0, fiy_SSE0, fiz_SSE0;
 192     gmx_mm_pr  fix_SSE1, fiy_SSE1, fiz_SSE1;
 193     gmx_mm_pr  fix_SSE2, fiy_SSE2, fiz_SSE2;
 194     gmx_mm_pr  fix_SSE3, fiy_SSE3, fiz_SSE3;
 195 #if UNROLLJ >= 4
 196 #ifndef GMX_DOUBLE
 197     __m128     fix_SSE, fiy_SSE, fiz_SSE;
 198 #else
 199     __m256d    fix_SSE, fiy_SSE, fiz_SSE;
 200 #endif
 201 #else
 202     __m128d    fix0_SSE, fiy0_SSE, fiz0_SSE;
 203     __m128d    fix2_SSE, fiy2_SSE, fiz2_SSE;
 204 #endif
 205
 206 #ifdef GMX_MM128_HERE
 207 #ifndef GMX_DOUBLE
 208     __m128i    mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
 209     __m128i    mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
 210     __m128i    mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
 211     __m128i    mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );
 212 #else
 213     /* For double precision we need to set two 32bit ints for one double */
 214     __m128i    mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
 215     __m128i    mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
 216     __m128i    mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
 217     __m128i    mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
 218 #endif
 219 #endif
 220 #ifdef GMX_MM256_HERE
 221     /* AVX: use floating point masks, as there are no integer instructions */
 222 #ifndef GMX_DOUBLE
 223     gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
 224     gmx_mm_pr  mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
 225 #else
 226     /* There is no 256-bit int to double conversion, so we use float here */
 227     __m256     mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
 228     __m256     mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
 229     __m256     mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
 230     __m256     mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
 231 #endif
 232 #endif
 233
 234     gmx_mm_pr diag_jmi_SSE;
 235 #if UNROLLI == UNROLLJ
 236     gmx_mm_pr diag_SSE0, diag_SSE1, diag_SSE2, diag_SSE3;
 237 #else
 238     gmx_mm_pr diag0_SSE0, diag0_SSE1, diag0_SSE2, diag0_SSE3;
 239     gmx_mm_pr diag1_SSE0, diag1_SSE1, diag1_SSE2, diag1_SSE3;
 240 #endif
 241
 242 #if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
 243     __m128i    zeroi_SSE = _mm_setzero_si128();
 244 #endif
 245     gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
 246
 247     gmx_mm_pr  one_SSE = gmx_set1_pr(1.0);
 248     gmx_mm_pr  iq_SSE0 = gmx_setzero_pr();
 249     gmx_mm_pr  iq_SSE1 = gmx_setzero_pr();
 250     gmx_mm_pr  iq_SSE2 = gmx_setzero_pr();
 251     gmx_mm_pr  iq_SSE3 = gmx_setzero_pr();
 252     gmx_mm_pr  mrc_3_SSE;
 253 #ifdef CALC_ENERGIES
 254     gmx_mm_pr  hrc_3_SSE, moh_rc_SSE;
 255 #endif
 256
 257 #ifdef CALC_COUL_TAB
 258     /* Coulomb table variables */
 259     gmx_mm_pr   invtsp_SSE;
 260     const real *tab_coul_F;
 261 #ifndef TAB_FDV0
 262     const real *tab_coul_V;
 263 #endif
 264 #ifdef GMX_MM256_HERE
 265     int        ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
 266     int        ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1;
 267     int        ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
 268     int        ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3;
 269 #endif
 270 #ifdef CALC_ENERGIES
 271     gmx_mm_pr  mhalfsp_SSE;
 272 #endif
 273 #endif
 274
 275 #ifdef CALC_COUL_EWALD
 276     gmx_mm_pr beta2_SSE, beta_SSE;
 277 #endif
 278
 279 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 280     gmx_mm_pr  sh_ewald_SSE;
 281 #endif
 282
 283 #ifdef LJ_COMB_LB
 284     const real *ljc;
 285
 286     gmx_mm_pr   hsig_i_SSE0, seps_i_SSE0;
 287     gmx_mm_pr   hsig_i_SSE1, seps_i_SSE1;
 288     gmx_mm_pr   hsig_i_SSE2, seps_i_SSE2;
 289     gmx_mm_pr   hsig_i_SSE3, seps_i_SSE3;
 290 #else
 291 #ifdef FIX_LJ_C
 292     real        pvdw_array[2*UNROLLI*UNROLLJ+3];
 293     real       *pvdw_c6, *pvdw_c12;
 294     gmx_mm_pr   c6_SSE0, c12_SSE0;
 295     gmx_mm_pr   c6_SSE1, c12_SSE1;
 296     gmx_mm_pr   c6_SSE2, c12_SSE2;
 297     gmx_mm_pr   c6_SSE3, c12_SSE3;
 298 #endif
 299
 300 #ifdef LJ_COMB_GEOM
 301     const real *ljc;
 302
 303     gmx_mm_pr   c6s_SSE0, c12s_SSE0;
 304     gmx_mm_pr   c6s_SSE1, c12s_SSE1;
 305     gmx_mm_pr   c6s_SSE2 = gmx_setzero_pr(), c12s_SSE2 = gmx_setzero_pr();
 306     gmx_mm_pr   c6s_SSE3 = gmx_setzero_pr(), c12s_SSE3 = gmx_setzero_pr();
 307 #endif
 308 #endif /* LJ_COMB_LB */
 309
 310     gmx_mm_pr  vctotSSE, VvdwtotSSE;
 311     gmx_mm_pr  sixthSSE, twelvethSSE;
 312
 313     gmx_mm_pr  avoid_sing_SSE;
 314     gmx_mm_pr  rc2_SSE;
 315 #ifdef VDW_CUTOFF_CHECK
 316     gmx_mm_pr  rcvdw2_SSE;
 317 #endif
 318
 319 #ifdef CALC_ENERGIES
 320     gmx_mm_pr  sh_invrc6_SSE, sh_invrc12_SSE;
 321
 322     /* cppcheck-suppress unassignedVariable */
 323     real       tmpsum_array[15], *tmpsum;
 324 #endif
 325 #ifdef CALC_SHIFTFORCES
 326     /* cppcheck-suppress unassignedVariable */
 327     real       shf_array[15], *shf;
 328 #endif
 329
 330     int ninner;
 331
 332 #ifdef COUNT_PAIRS
 333     int npair = 0;
 334 #endif
 335
 336 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
 337     ljc = nbat->lj_comb;
 338 #else
 339     /* No combination rule used */
 340 #ifndef GMX_DOUBLE
 341     nbfp_ptr    = nbat->nbfp_s4;
 342 #define NBFP_STRIDE  4
 343 #else
 344     nbfp_ptr    = nbat->nbfp;
 345 #define NBFP_STRIDE  2
 346 #endif
 347     nbfp_stride = NBFP_STRIDE;
 348 #endif
 349
 350     /* Load j-i for the first i */
 351     diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag);
 352     /* Generate all the diagonal masks as comparison results */
 353 #if UNROLLI == UNROLLJ
 354     diag_SSE0    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 355     diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
 356     diag_SSE1    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 357     diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
 358     diag_SSE2    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 359     diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
 360     diag_SSE3    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 361 #else
 362 #if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
 363     diag0_SSE0   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 364     diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
 365     diag0_SSE1   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 366     diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
 367     diag0_SSE2   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 368     diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
 369     diag0_SSE3   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 370     diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
 371
 372 #if UNROLLI == 2*UNROLLJ
 373     /* Load j-i for the second half of the j-cluster */
 374     diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
 375 #endif
 376
 377     diag1_SSE0   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 378     diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
 379     diag1_SSE1   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 380     diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
 381     diag1_SSE2   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 382     diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
 383     diag1_SSE3   = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
 384 #endif
 385 #endif
 386
 387 #ifdef CALC_COUL_TAB
 388 #ifdef GMX_MM256_HERE
 389     /* Generate aligned table index pointers */
 390     ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
 391     ti1 = (int *)(((size_t)(ti1_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
 392     ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
 393     ti3 = (int *)(((size_t)(ti3_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
 394 #endif
 395
 396     invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
 397 #ifdef CALC_ENERGIES
 398     mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
 399 #endif
 400
 401 #ifdef TAB_FDV0
 402     tab_coul_F = ic->tabq_coul_FDV0;
 403 #else
 404     tab_coul_F = ic->tabq_coul_F;
 405     tab_coul_V = ic->tabq_coul_V;
 406 #endif
 407 #endif /* CALC_COUL_TAB */
 408
 409 #ifdef CALC_COUL_EWALD
 410     beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
 411     beta_SSE  = gmx_set1_pr(ic->ewaldcoeff);
 412 #endif
 413
 414 #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
 415     sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
 416 #endif
 417
 418     q                   = nbat->q;
 419     type                = nbat->type;
 420     facel               = ic->epsfac;
 421     shiftvec            = shift_vec[0];
 422     x                   = nbat->x;
 423
 424     avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
 425
 426     /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
 427     rc2_SSE    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
 428 #ifdef VDW_CUTOFF_CHECK
 429     rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
 430 #endif
 431
 432 #ifdef CALC_ENERGIES
 433     sixthSSE    = gmx_set1_pr(1.0/6.0);
 434     twelvethSSE = gmx_set1_pr(1.0/12.0);
 435
 436     sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
 437     sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
 438 #endif
 439
 440     mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
 441
 442 #ifdef CALC_ENERGIES
 443     hrc_3_SSE = gmx_set1_pr(ic->k_rf);
 444
 445     moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
 446 #endif
 447
 448 #ifdef CALC_ENERGIES
 449     tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
 450 #endif
 451 #ifdef CALC_SHIFTFORCES
 452     shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
 453 #endif
 454
 455 #ifdef FIX_LJ_C
 456     pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
 457     pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
 458
 459     for (jp = 0; jp < UNROLLJ; jp++)
 460     {
 461         pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
 462         pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
 463         pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
 464         pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
 465
 466         pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 467         pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 468         pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 469         pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 470     }
 471     c6_SSE0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
 472     c6_SSE1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
 473     c6_SSE2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
 474     c6_SSE3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
 475
 476     c12_SSE0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
 477     c12_SSE1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
 478     c12_SSE2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
 479     c12_SSE3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
 480 #endif /* FIX_LJ_C */
 481
 482 #ifdef ENERGY_GROUPS
 483     egps_ishift  = nbat->neg_2log;
 484     egps_imask   = (1<<egps_ishift) - 1;
 485     egps_jshift  = 2*nbat->neg_2log;
 486     egps_jmask   = (1<<egps_jshift) - 1;
 487     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
 488     /* Major division is over i-particle energy groups, determine the stride */
 489     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 490 #endif
 491
 492     l_cj = nbl->cj;
 493
 494     ninner = 0;
 495     for (n = 0; n < nbl->nci; n++)
 496     {
 497         nbln = &nbl->ci[n];
 498
 499         ish              = (nbln->shift & NBNXN_CI_SHIFT);
 500         ish3             = ish*3;
 501         cjind0           = nbln->cj_ind_start;
 502         cjind1           = nbln->cj_ind_end;
 503         ci               = nbln->ci;
 504         ci_sh            = (ish == CENTRAL ? ci : -1);
 505
 506         shX_SSE = gmx_load1_pr(shiftvec+ish3);
 507         shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
 508         shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
 509
 510 #if UNROLLJ <= 4
 511         sci              = ci*STRIDE;
 512         scix             = sci*DIM;
 513         sci2             = sci*2;
 514 #else
 515         sci              = (ci>>1)*STRIDE;
 516         scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
 517         sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
 518         sci             += (ci & 1)*(STRIDE>>1);
 519 #endif
 520
 521         /* We have 5 LJ/C combinations, but use only three inner loops,
 522          * as the other combinations are unlikely and/or not much faster:
 523          * inner half-LJ + C for half-LJ + C / no-LJ + C
 524          * inner LJ + C      for full-LJ + C
 525          * inner LJ          for full-LJ + no-C / half-LJ + no-C
 526          */
 527         do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
 528         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
 529         half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 530
 531 #ifdef ENERGY_GROUPS
 532         egps_i = nbat->energrp[ci];
 533         {
 534             int ia, egp_ia;
 535
 536             for (ia = 0; ia < UNROLLI; ia++)
 537             {
 538                 egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
 539                 vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
 540                 vctp[ia]   = Vc   + egp_ia*Vstride_i;
 541             }
 542         }
 543 #endif
 544 #if defined CALC_ENERGIES
 545 #if UNROLLJ == 4
 546         if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
 547 #endif
 548 #if UNROLLJ == 2
 549         if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
 550 #endif
 551 #if UNROLLJ == 8
 552         if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
 553 #endif
 554         {
 555             int  ia;
 556             real Vc_sub_self;
 557
 558 #ifdef CALC_COUL_RF
 559             Vc_sub_self = 0.5*ic->c_rf;
 560 #endif
 561 #ifdef CALC_COUL_TAB
 562 #ifdef TAB_FDV0
 563             Vc_sub_self = 0.5*tab_coul_F[2];
 564 #else
 565             Vc_sub_self = 0.5*tab_coul_V[0];
 566 #endif
 567 #endif
 568 #ifdef CALC_COUL_EWALD
 569             /* beta/sqrt(pi) */
 570             Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
 571 #endif
 572
 573             for (ia = 0; ia < UNROLLI; ia++)
 574             {
 575                 real qi;
 576
 577                 qi = q[sci+ia];
 578 #ifdef ENERGY_GROUPS
 579                 vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
 580 #else
 581                 Vc[0]
 582 #endif
 583                     -= facel*qi*qi*Vc_sub_self;
 584             }
 585         }
 586 #endif
 587
 588         /* Load i atom data */
 589         sciy             = scix + STRIDE;
 590         sciz             = sciy + STRIDE;
 591         ix_SSE0          = gmx_add_pr(gmx_load1_pr(x+scix), shX_SSE);
 592         ix_SSE1          = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_SSE);
 593         ix_SSE2          = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_SSE);
 594         ix_SSE3          = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_SSE);
 595         iy_SSE0          = gmx_add_pr(gmx_load1_pr(x+sciy), shY_SSE);
 596         iy_SSE1          = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_SSE);
 597         iy_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_SSE);
 598         iy_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_SSE);
 599         iz_SSE0          = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_SSE);
 600         iz_SSE1          = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_SSE);
 601         iz_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_SSE);
 602         iz_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_SSE);
 603
 604         if (do_coul)
 605         {
 606             iq_SSE0      = gmx_set1_pr(facel*q[sci]);
 607             iq_SSE1      = gmx_set1_pr(facel*q[sci+1]);
 608             iq_SSE2      = gmx_set1_pr(facel*q[sci+2]);
 609             iq_SSE3      = gmx_set1_pr(facel*q[sci+3]);
 610         }
 611
 612 #ifdef LJ_COMB_LB
 613         hsig_i_SSE0      = gmx_load1_pr(ljc+sci2+0);
 614         hsig_i_SSE1      = gmx_load1_pr(ljc+sci2+1);
 615         hsig_i_SSE2      = gmx_load1_pr(ljc+sci2+2);
 616         hsig_i_SSE3      = gmx_load1_pr(ljc+sci2+3);
 617         seps_i_SSE0      = gmx_load1_pr(ljc+sci2+STRIDE+0);
 618         seps_i_SSE1      = gmx_load1_pr(ljc+sci2+STRIDE+1);
 619         seps_i_SSE2      = gmx_load1_pr(ljc+sci2+STRIDE+2);
 620         seps_i_SSE3      = gmx_load1_pr(ljc+sci2+STRIDE+3);
 621 #else
 622 #ifdef LJ_COMB_GEOM
 623         c6s_SSE0         = gmx_load1_pr(ljc+sci2+0);
 624         c6s_SSE1         = gmx_load1_pr(ljc+sci2+1);
 625         if (!half_LJ)
 626         {
 627             c6s_SSE2     = gmx_load1_pr(ljc+sci2+2);
 628             c6s_SSE3     = gmx_load1_pr(ljc+sci2+3);
 629         }
 630         c12s_SSE0        = gmx_load1_pr(ljc+sci2+STRIDE+0);
 631         c12s_SSE1        = gmx_load1_pr(ljc+sci2+STRIDE+1);
 632         if (!half_LJ)
 633         {
 634             c12s_SSE2    = gmx_load1_pr(ljc+sci2+STRIDE+2);
 635             c12s_SSE3    = gmx_load1_pr(ljc+sci2+STRIDE+3);
 636         }
 637 #else
 638         nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
 639         nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
 640         if (!half_LJ)
 641         {
 642             nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
 643             nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
 644         }
 645 #endif
 646 #endif
 647
 648         /* Zero the potential energy for this list */
 649         VvdwtotSSE       = gmx_setzero_pr();
 650         vctotSSE         = gmx_setzero_pr();
 651
 652         /* Clear i atom forces */
 653         fix_SSE0           = gmx_setzero_pr();
 654         fix_SSE1           = gmx_setzero_pr();
 655         fix_SSE2           = gmx_setzero_pr();
 656         fix_SSE3           = gmx_setzero_pr();
 657         fiy_SSE0           = gmx_setzero_pr();
 658         fiy_SSE1           = gmx_setzero_pr();
 659         fiy_SSE2           = gmx_setzero_pr();
 660         fiy_SSE3           = gmx_setzero_pr();
 661         fiz_SSE0           = gmx_setzero_pr();
 662         fiz_SSE1           = gmx_setzero_pr();
 663         fiz_SSE2           = gmx_setzero_pr();
 664         fiz_SSE3           = gmx_setzero_pr();
 665
 666         cjind = cjind0;
 667
 668         /* Currently all kernels use (at least half) LJ */
 669 #define CALC_LJ
 670         if (half_LJ)
 671         {
 672 #define CALC_COULOMB
 673 #define HALF_LJ
 674 #define CHECK_EXCLS
 675             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
 676             {
 677 #include "nbnxn_kernel_simd_4xn_inner.h"
 678                 cjind++;
 679             }
 680 #undef CHECK_EXCLS
 681             for (; (cjind < cjind1); cjind++)
 682             {
 683 #include "nbnxn_kernel_simd_4xn_inner.h"
 684             }
 685 #undef HALF_LJ
 686 #undef CALC_COULOMB
 687         }
 688         else if (do_coul)
 689         {
 690 #define CALC_COULOMB
 691 #define CHECK_EXCLS
 692             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
 693             {
 694 #include "nbnxn_kernel_simd_4xn_inner.h"
 695                 cjind++;
 696             }
 697 #undef CHECK_EXCLS
 698             for (; (cjind < cjind1); cjind++)
 699             {
 700 #include "nbnxn_kernel_simd_4xn_inner.h"
 701             }
 702 #undef CALC_COULOMB
 703         }
 704         else
 705         {
 706 #define CHECK_EXCLS
 707             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
 708             {
 709 #include "nbnxn_kernel_simd_4xn_inner.h"
 710                 cjind++;
 711             }
 712 #undef CHECK_EXCLS
 713             for (; (cjind < cjind1); cjind++)
 714             {
 715 #include "nbnxn_kernel_simd_4xn_inner.h"
 716             }
 717         }
 718 #undef CALC_LJ
 719         ninner += cjind1 - cjind0;
 720
 721         /* Add accumulated i-forces to the force array */
 722 #if UNROLLJ >= 4
 723 #ifndef GMX_DOUBLE
 724 #define gmx_load_ps4  _mm_load_ps
 725 #define gmx_store_ps4 _mm_store_ps
 726 #define gmx_add_ps4   _mm_add_ps
 727 #else
 728 #define gmx_load_ps4  _mm256_load_pd
 729 #define gmx_store_ps4 _mm256_store_pd
 730 #define gmx_add_ps4   _mm256_add_pd
 731 #endif
 732         GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3, fix_SSE);
 733         gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
 734
 735         GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3, fiy_SSE);
 736         gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
 737
 738         GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3, fiz_SSE);
 739         gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
 740
 741 #ifdef CALC_SHIFTFORCES
 742         gmx_store_ps4(shf, fix_SSE);
 743         fshift[ish3+0] += SUM_SIMD4(shf);
 744         gmx_store_ps4(shf, fiy_SSE);
 745         fshift[ish3+1] += SUM_SIMD4(shf);
 746         gmx_store_ps4(shf, fiz_SSE);
 747         fshift[ish3+2] += SUM_SIMD4(shf);
 748 #endif
 749 #else
 750         GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0, fix_SSE1, fix0_SSE);
 751         _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
 752         GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2, fix_SSE3, fix2_SSE);
 753         _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
 754
 755         GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0, fiy_SSE1, fiy0_SSE);
 756         _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
 757         GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2, fiy_SSE3, fiy2_SSE);
 758         _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
 759
 760         GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0, fiz_SSE1, fiz0_SSE);
 761         _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
 762         GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2, fiz_SSE3, fiz2_SSE);
 763         _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
 764
 765 #ifdef CALC_SHIFTFORCES
 766         _mm_store_pd(shf, _mm_add_pd(fix0_SSE, fix2_SSE));
 767         fshift[ish3+0] += shf[0] + shf[1];
 768         _mm_store_pd(shf, _mm_add_pd(fiy0_SSE, fiy2_SSE));
 769         fshift[ish3+1] += shf[0] + shf[1];
 770         _mm_store_pd(shf, _mm_add_pd(fiz0_SSE, fiz2_SSE));
 771         fshift[ish3+2] += shf[0] + shf[1];
 772 #endif
 773 #endif
 774
 775 #ifdef CALC_ENERGIES
 776         if (do_coul)
 777         {
 778             gmx_store_pr(tmpsum, vctotSSE);
 779             *Vc += SUM_SIMD(tmpsum);
 780         }
 781
 782         gmx_store_pr(tmpsum, VvdwtotSSE);
 783         *Vvdw += SUM_SIMD(tmpsum);
 784 #endif
 785
 786         /* Outer loop uses 6 flops/iteration */
 787     }
 788
 789 #ifdef COUNT_PAIRS
 790     printf("atom pairs %d\n", npair);
 791 #endif
 792 }
 793
 794 #undef gmx_load_ps4
 795 #undef gmx_store_ps4
 796 #undef gmx_store_ps4
 797
 798 #undef CALC_SHIFTFORCES
 799
 800 #undef UNROLLI
 801 #undef UNROLLJ
 802 #undef STRIDE
 803 #undef TAB_FDV0
 804 #undef NBFP_STRIDE