src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2009, The GROMACS Development Team
   6  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   7  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   8  * others, as listed in the AUTHORS file in the top-level source
   9  * directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38 #if !(GMX_NBNXN_SIMD_BITWIDTH == 128 || GMX_NBNXN_SIMD_BITWIDTH == 256)
  39 #error "unsupported GMX_NBNXN_SIMD_BITWIDTH"
  40 #endif
  41
  42 #ifdef GMX_NBNXN_HALF_WIDTH_SIMD
  43 #define GMX_USE_HALF_WIDTH_SIMD_HERE
  44 #endif
  45 #include "gmx_simd_macros.h"
  46
  47 #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
  48
  49 #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
  50 #define UNROLLJ    GMX_SIMD_WIDTH_HERE
  51
  52 /* The stride of all the atom data arrays is max(UNROLLI,UNROLLJ) */
  53 #if GMX_SIMD_WIDTH_HERE >= UNROLLI
  54 #define STRIDE     GMX_SIMD_WIDTH_HERE
  55 #else
  56 #define STRIDE     UNROLLI
  57 #endif
  58
  59 #if GMX_SIMD_WIDTH_HERE == 2
  60 #define SUM_SIMD(x)  (x[0]+x[1])
  61 #else
  62 #if GMX_SIMD_WIDTH_HERE == 4
  63 #define SUM_SIMD(x)  SUM_SIMD4(x)
  64 #else
  65 #if GMX_SIMD_WIDTH_HERE == 8
  66 #define SUM_SIMD(x)  (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
  67 #else
  68 #error "unsupported kernel configuration"
  69 #endif
  70 #endif
  71 #endif
  72
  73
  74 /* Decide if we should use the FDV0 table layout */
  75 #if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
  76 /* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
  77 #if GMX_SIMD_WIDTH_HERE/2 == 4
  78 #define TAB_FDV0
  79 #endif
  80 #else
  81 /* We use the FDV0 table layout when we can use aligned table loads */
  82 #if GMX_SIMD_WIDTH_HERE == 4
  83 #define TAB_FDV0
  84 #endif
  85 #endif
  86
  87
  88 #define SIMD_MASK_ALL   0xffffffff
  89
  90 #include "nbnxn_kernel_simd_utils.h"
  91
  92 /* All functionality defines are set here, except for:
  93  * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
  94  * CHECK_EXCLS, which is set just before including the inner loop contents.
  95  * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB are currently
  96  * set before calling the kernel function. We might want to move that
  97  * to inside the n-loop and have a different combination rule for different
  98  * ci's, as no combination rule gives a 50% performance hit for LJ.
  99  */
 100
 101 /* We always calculate shift forces, because it's cheap anyhow */
 102 #define CALC_SHIFTFORCES
 103
 104 /* Assumes all LJ parameters are identical */
 105 /* #define FIX_LJ_C */
 106
 107 /* The NBK_FUNC_NAME... macros below generate the whole zoo of kernels names
 108  * with all combinations off electrostatics (coul), LJ combination rules (ljc)
 109  * and energy calculations (ene), depending on the defines set.
 110  */
 111
 112 #define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene
 113
 114 #if defined LJ_COMB_GEOM
 115 #define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
 116 #else
 117 #if defined LJ_COMB_LB
 118 #define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
 119 #else
 120 #define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
 121 #endif
 122 #endif
 123
 124 #ifdef CALC_COUL_RF
 125 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
 126 #endif
 127 #ifdef CALC_COUL_TAB
 128 #ifndef VDW_CUTOFF_CHECK
 129 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
 130 #else
 131 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
 132 #endif
 133 #endif
 134 #ifdef CALC_COUL_EWALD
 135 #ifndef VDW_CUTOFF_CHECK
 136 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
 137 #else
 138 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
 139 #endif
 140 #endif
 141
 142 static void
 143 #ifndef CALC_ENERGIES
 144 NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, noener)
 145 #else
 146 #ifndef ENERGY_GROUPS
 147 NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, ener)
 148 #else
 149 NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, energrp)
 150 #endif
 151 #endif
 152 #undef NBK_FUNC_NAME
 153 #undef NBK_FUNC_NAME_C
 154 #undef NBK_FUNC_NAME_C_LJC
 155 (const nbnxn_pairlist_t     *nbl,
 156  const nbnxn_atomdata_t     *nbat,
 157  const interaction_const_t  *ic,
 158  rvec                       *shift_vec,
 159  real                       *f
 160 #ifdef CALC_SHIFTFORCES
 161  ,
 162  real                       *fshift
 163 #endif
 164 #ifdef CALC_ENERGIES
 165  ,
 166  real                       *Vvdw,
 167  real                       *Vc
 168 #endif
 169 )
 170 {
 171     const nbnxn_ci_t   *nbln;
 172     const nbnxn_cj_t   *l_cj;
 173     const int          *type;
 174     const real         *q;
 175     const real         *shiftvec;
 176     const real         *x;
 177     const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
 178     real                facel;
 179     real               *nbfp_ptr;
 180     int                 nbfp_stride;
 181     int                 n, ci, ci_sh;
 182     int                 ish, ish3;
 183     gmx_bool            do_LJ, half_LJ, do_coul;
 184     int                 sci, scix, sciy, sciz, sci2;
 185     int                 cjind0, cjind1, cjind;
 186     int                 ip, jp;
 187
 188 #ifdef ENERGY_GROUPS
 189     int         Vstride_i;
 190     int         egps_ishift, egps_imask;
 191     int         egps_jshift, egps_jmask, egps_jstride;
 192     int         egps_i;
 193     real       *vvdwtp[UNROLLI];
 194     real       *vctp[UNROLLI];
 195 #endif
 196
 197     gmx_mm_pr  shX_S;
 198     gmx_mm_pr  shY_S;
 199     gmx_mm_pr  shZ_S;
 200     gmx_mm_pr  ix_S0, iy_S0, iz_S0;
 201     gmx_mm_pr  ix_S1, iy_S1, iz_S1;
 202     gmx_mm_pr  ix_S2, iy_S2, iz_S2;
 203     gmx_mm_pr  ix_S3, iy_S3, iz_S3;
 204     gmx_mm_pr  fix_S0, fiy_S0, fiz_S0;
 205     gmx_mm_pr  fix_S1, fiy_S1, fiz_S1;
 206     gmx_mm_pr  fix_S2, fiy_S2, fiz_S2;
 207     gmx_mm_pr  fix_S3, fiy_S3, fiz_S3;
 208 #if UNROLLJ >= 4
 209 #ifndef GMX_DOUBLE
 210     __m128     fix_S, fiy_S, fiz_S;
 211 #else
 212     __m256d    fix_S, fiy_S, fiz_S;
 213 #endif
 214 #else
 215     __m128d    fix0_S, fiy0_S, fiz0_S;
 216     __m128d    fix2_S, fiy2_S, fiz2_S;
 217 #endif
 218
 219     gmx_mm_pr diag_jmi_S;
 220 #if UNROLLI == UNROLLJ
 221     gmx_mm_pr diag_S0, diag_S1, diag_S2, diag_S3;
 222 #else
 223     gmx_mm_pr diag0_S0, diag0_S1, diag0_S2, diag0_S3;
 224     gmx_mm_pr diag1_S0, diag1_S1, diag1_S2, diag1_S3;
 225 #endif
 226
 227 #ifdef gmx_checkbitmask_epi32
 228     gmx_epi32 mask_S0, mask_S1, mask_S2, mask_S3;
 229 #else
 230     gmx_mm_pr mask_S0, mask_S1, mask_S2, mask_S3;
 231 #endif
 232
 233     gmx_mm_pr  zero_S = gmx_set1_pr(0);
 234
 235     gmx_mm_pr  one_S  = gmx_set1_pr(1.0);
 236     gmx_mm_pr  iq_S0  = gmx_setzero_pr();
 237     gmx_mm_pr  iq_S1  = gmx_setzero_pr();
 238     gmx_mm_pr  iq_S2  = gmx_setzero_pr();
 239     gmx_mm_pr  iq_S3  = gmx_setzero_pr();
 240     gmx_mm_pr  mrc_3_S;
 241 #ifdef CALC_ENERGIES
 242     gmx_mm_pr  hrc_3_S, moh_rc_S;
 243 #endif
 244
 245 #ifdef CALC_COUL_TAB
 246     /* Coulomb table variables */
 247     gmx_mm_pr   invtsp_S;
 248     const real *tab_coul_F;
 249 #ifndef TAB_FDV0
 250     const real *tab_coul_V;
 251 #endif
 252 #if GMX_NBNXN_SIMD_BITWIDTH == 256
 253     int        ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
 254     int        ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1;
 255     int        ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
 256     int        ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3;
 257 #endif
 258 #ifdef CALC_ENERGIES
 259     gmx_mm_pr  mhalfsp_S;
 260 #endif
 261 #endif
 262
 263 #ifdef CALC_COUL_EWALD
 264     gmx_mm_pr beta2_S, beta_S;
 265 #endif
 266
 267 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 268     gmx_mm_pr  sh_ewald_S;
 269 #endif
 270
 271 #ifdef LJ_COMB_LB
 272     const real *ljc;
 273
 274     gmx_mm_pr   hsig_i_S0, seps_i_S0;
 275     gmx_mm_pr   hsig_i_S1, seps_i_S1;
 276     gmx_mm_pr   hsig_i_S2, seps_i_S2;
 277     gmx_mm_pr   hsig_i_S3, seps_i_S3;
 278 #else
 279 #ifdef FIX_LJ_C
 280     real        pvdw_array[2*UNROLLI*UNROLLJ+3];
 281     real       *pvdw_c6, *pvdw_c12;
 282     gmx_mm_pr   c6_S0, c12_S0;
 283     gmx_mm_pr   c6_S1, c12_S1;
 284     gmx_mm_pr   c6_S2, c12_S2;
 285     gmx_mm_pr   c6_S3, c12_S3;
 286 #endif
 287
 288 #ifdef LJ_COMB_GEOM
 289     const real *ljc;
 290
 291     gmx_mm_pr   c6s_S0, c12s_S0;
 292     gmx_mm_pr   c6s_S1, c12s_S1;
 293     gmx_mm_pr   c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
 294     gmx_mm_pr   c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
 295 #endif
 296 #endif /* LJ_COMB_LB */
 297
 298     gmx_mm_pr  vctot_S, Vvdwtot_S;
 299     gmx_mm_pr  sixth_S, twelveth_S;
 300
 301     gmx_mm_pr  avoid_sing_S;
 302     gmx_mm_pr  rc2_S;
 303 #ifdef VDW_CUTOFF_CHECK
 304     gmx_mm_pr  rcvdw2_S;
 305 #endif
 306
 307 #ifdef CALC_ENERGIES
 308     gmx_mm_pr  sh_invrc6_S, sh_invrc12_S;
 309
 310     /* cppcheck-suppress unassignedVariable */
 311     real       tmpsum_array[15], *tmpsum;
 312 #endif
 313 #ifdef CALC_SHIFTFORCES
 314     /* cppcheck-suppress unassignedVariable */
 315     real       shf_array[15], *shf;
 316 #endif
 317
 318     int ninner;
 319
 320 #ifdef COUNT_PAIRS
 321     int npair = 0;
 322 #endif
 323
 324 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
 325     ljc = nbat->lj_comb;
 326 #else
 327     /* No combination rule used */
 328 #ifndef GMX_DOUBLE
 329     nbfp_ptr    = nbat->nbfp_s4;
 330 #define NBFP_STRIDE  4
 331 #else
 332     nbfp_ptr    = nbat->nbfp;
 333 #define NBFP_STRIDE  2
 334 #endif
 335     nbfp_stride = NBFP_STRIDE;
 336 #endif
 337
 338     /* Load j-i for the first i */
 339     diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag);
 340     /* Generate all the diagonal masks as comparison results */
 341 #if UNROLLI == UNROLLJ
 342     diag_S0    = gmx_cmplt_pr(zero_S, diag_jmi_S);
 343     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
 344     diag_S1    = gmx_cmplt_pr(zero_S, diag_jmi_S);
 345     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
 346     diag_S2    = gmx_cmplt_pr(zero_S, diag_jmi_S);
 347     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
 348     diag_S3    = gmx_cmplt_pr(zero_S, diag_jmi_S);
 349 #else
 350 #if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
 351     diag0_S0   = gmx_cmplt_pr(zero_S, diag_jmi_S);
 352     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
 353     diag0_S1   = gmx_cmplt_pr(zero_S, diag_jmi_S);
 354     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
 355     diag0_S2   = gmx_cmplt_pr(zero_S, diag_jmi_S);
 356     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
 357     diag0_S3   = gmx_cmplt_pr(zero_S, diag_jmi_S);
 358     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
 359
 360 #if UNROLLI == 2*UNROLLJ
 361     /* Load j-i for the second half of the j-cluster */
 362     diag_jmi_S = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
 363 #endif
 364
 365     diag1_S0   = gmx_cmplt_pr(zero_S, diag_jmi_S);
 366     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
 367     diag1_S1   = gmx_cmplt_pr(zero_S, diag_jmi_S);
 368     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
 369     diag1_S2   = gmx_cmplt_pr(zero_S, diag_jmi_S);
 370     diag_jmi_S = gmx_sub_pr(diag_jmi_S, one_S);
 371     diag1_S3   = gmx_cmplt_pr(zero_S, diag_jmi_S);
 372 #endif
 373 #endif
 374
 375     /* Load masks for topology exclusion masking */
 376 #ifdef gmx_checkbitmask_epi32
 377     mask_S0    = gmx_load_si(nbat->simd_excl_mask + 0*GMX_NBNXN_SIMD_BITWIDTH/32);
 378     mask_S1    = gmx_load_si(nbat->simd_excl_mask + 1*GMX_NBNXN_SIMD_BITWIDTH/32);
 379     mask_S2    = gmx_load_si(nbat->simd_excl_mask + 2*GMX_NBNXN_SIMD_BITWIDTH/32);
 380     mask_S3    = gmx_load_si(nbat->simd_excl_mask + 3*GMX_NBNXN_SIMD_BITWIDTH/32);
 381 #else
 382     mask_S0    = gmx_load_pr((real *)nbat->simd_excl_mask + 0*UNROLLJ);
 383     mask_S1    = gmx_load_pr((real *)nbat->simd_excl_mask + 1*UNROLLJ);
 384     mask_S2    = gmx_load_pr((real *)nbat->simd_excl_mask + 2*UNROLLJ);
 385     mask_S3    = gmx_load_pr((real *)nbat->simd_excl_mask + 3*UNROLLJ);
 386 #endif
 387
 388 #ifdef CALC_COUL_TAB
 389 #if GMX_NBNXN_SIMD_BITWIDTH == 256
 390     /* Generate aligned table index pointers */
 391     ti0 = gmx_simd_align_int(ti0_array);
 392     ti1 = gmx_simd_align_int(ti1_array);
 393     ti2 = gmx_simd_align_int(ti2_array);
 394     ti3 = gmx_simd_align_int(ti3_array);
 395 #endif
 396
 397     invtsp_S  = gmx_set1_pr(ic->tabq_scale);
 398 #ifdef CALC_ENERGIES
 399     mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
 400 #endif
 401
 402 #ifdef TAB_FDV0
 403     tab_coul_F = ic->tabq_coul_FDV0;
 404 #else
 405     tab_coul_F = ic->tabq_coul_F;
 406     tab_coul_V = ic->tabq_coul_V;
 407 #endif
 408 #endif /* CALC_COUL_TAB */
 409
 410 #ifdef CALC_COUL_EWALD
 411     beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
 412     beta_S  = gmx_set1_pr(ic->ewaldcoeff);
 413 #endif
 414
 415 #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
 416     sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
 417 #endif
 418
 419     q                   = nbat->q;
 420     type                = nbat->type;
 421     facel               = ic->epsfac;
 422     shiftvec            = shift_vec[0];
 423     x                   = nbat->x;
 424
 425     avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
 426
 427     /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
 428     rc2_S    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
 429 #ifdef VDW_CUTOFF_CHECK
 430     rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
 431 #endif
 432
 433 #ifdef CALC_ENERGIES
 434     sixth_S      = gmx_set1_pr(1.0/6.0);
 435     twelveth_S   = gmx_set1_pr(1.0/12.0);
 436
 437     sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
 438     sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
 439 #endif
 440
 441     mrc_3_S  = gmx_set1_pr(-2*ic->k_rf);
 442
 443 #ifdef CALC_ENERGIES
 444     hrc_3_S  = gmx_set1_pr(ic->k_rf);
 445
 446     moh_rc_S = gmx_set1_pr(-ic->c_rf);
 447 #endif
 448
 449 #ifdef CALC_ENERGIES
 450     tmpsum   = gmx_simd_align_real(tmpsum_array);
 451 #endif
 452 #ifdef CALC_SHIFTFORCES
 453     shf      = gmx_simd_align_real(shf_array);
 454 #endif
 455
 456 #ifdef FIX_LJ_C
 457     pvdw_c6  = gmx_simd_align_real(pvdw_array+3);
 458     pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
 459
 460     for (jp = 0; jp < UNROLLJ; jp++)
 461     {
 462         pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
 463         pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
 464         pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
 465         pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
 466
 467         pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 468         pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 469         pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 470         pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 471     }
 472     c6_S0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
 473     c6_S1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
 474     c6_S2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
 475     c6_S3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
 476
 477     c12_S0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
 478     c12_S1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
 479     c12_S2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
 480     c12_S3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
 481 #endif /* FIX_LJ_C */
 482
 483 #ifdef ENERGY_GROUPS
 484     egps_ishift  = nbat->neg_2log;
 485     egps_imask   = (1<<egps_ishift) - 1;
 486     egps_jshift  = 2*nbat->neg_2log;
 487     egps_jmask   = (1<<egps_jshift) - 1;
 488     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
 489     /* Major division is over i-particle energy groups, determine the stride */
 490     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 491 #endif
 492
 493     l_cj = nbl->cj;
 494
 495     ninner = 0;
 496     for (n = 0; n < nbl->nci; n++)
 497     {
 498         nbln = &nbl->ci[n];
 499
 500         ish              = (nbln->shift & NBNXN_CI_SHIFT);
 501         ish3             = ish*3;
 502         cjind0           = nbln->cj_ind_start;
 503         cjind1           = nbln->cj_ind_end;
 504         ci               = nbln->ci;
 505         ci_sh            = (ish == CENTRAL ? ci : -1);
 506
 507         shX_S = gmx_load1_pr(shiftvec+ish3);
 508         shY_S = gmx_load1_pr(shiftvec+ish3+1);
 509         shZ_S = gmx_load1_pr(shiftvec+ish3+2);
 510
 511 #if UNROLLJ <= 4
 512         sci              = ci*STRIDE;
 513         scix             = sci*DIM;
 514         sci2             = sci*2;
 515 #else
 516         sci              = (ci>>1)*STRIDE;
 517         scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
 518         sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
 519         sci             += (ci & 1)*(STRIDE>>1);
 520 #endif
 521
 522         /* We have 5 LJ/C combinations, but use only three inner loops,
 523          * as the other combinations are unlikely and/or not much faster:
 524          * inner half-LJ + C for half-LJ + C / no-LJ + C
 525          * inner LJ + C      for full-LJ + C
 526          * inner LJ          for full-LJ + no-C / half-LJ + no-C
 527          */
 528         do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
 529         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
 530         half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 531
 532 #ifdef ENERGY_GROUPS
 533         egps_i = nbat->energrp[ci];
 534         {
 535             int ia, egp_ia;
 536
 537             for (ia = 0; ia < UNROLLI; ia++)
 538             {
 539                 egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
 540                 vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
 541                 vctp[ia]   = Vc   + egp_ia*Vstride_i;
 542             }
 543         }
 544 #endif
 545 #if defined CALC_ENERGIES
 546 #if UNROLLJ == 4
 547         if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
 548 #endif
 549 #if UNROLLJ == 2
 550         if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
 551 #endif
 552 #if UNROLLJ == 8
 553         if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
 554 #endif
 555         {
 556             int  ia;
 557             real Vc_sub_self;
 558
 559 #ifdef CALC_COUL_RF
 560             Vc_sub_self = 0.5*ic->c_rf;
 561 #endif
 562 #ifdef CALC_COUL_TAB
 563 #ifdef TAB_FDV0
 564             Vc_sub_self = 0.5*tab_coul_F[2];
 565 #else
 566             Vc_sub_self = 0.5*tab_coul_V[0];
 567 #endif
 568 #endif
 569 #ifdef CALC_COUL_EWALD
 570             /* beta/sqrt(pi) */
 571             Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
 572 #endif
 573
 574             for (ia = 0; ia < UNROLLI; ia++)
 575             {
 576                 real qi;
 577
 578                 qi = q[sci+ia];
 579 #ifdef ENERGY_GROUPS
 580                 vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
 581 #else
 582                 Vc[0]
 583 #endif
 584                     -= facel*qi*qi*Vc_sub_self;
 585             }
 586         }
 587 #endif
 588
 589         /* Load i atom data */
 590         sciy             = scix + STRIDE;
 591         sciz             = sciy + STRIDE;
 592         ix_S0          = gmx_add_pr(gmx_load1_pr(x+scix), shX_S);
 593         ix_S1          = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S);
 594         ix_S2          = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S);
 595         ix_S3          = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S);
 596         iy_S0          = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S);
 597         iy_S1          = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S);
 598         iy_S2          = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S);
 599         iy_S3          = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S);
 600         iz_S0          = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S);
 601         iz_S1          = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S);
 602         iz_S2          = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S);
 603         iz_S3          = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S);
 604
 605         if (do_coul)
 606         {
 607             iq_S0      = gmx_set1_pr(facel*q[sci]);
 608             iq_S1      = gmx_set1_pr(facel*q[sci+1]);
 609             iq_S2      = gmx_set1_pr(facel*q[sci+2]);
 610             iq_S3      = gmx_set1_pr(facel*q[sci+3]);
 611         }
 612
 613 #ifdef LJ_COMB_LB
 614         hsig_i_S0      = gmx_load1_pr(ljc+sci2+0);
 615         hsig_i_S1      = gmx_load1_pr(ljc+sci2+1);
 616         hsig_i_S2      = gmx_load1_pr(ljc+sci2+2);
 617         hsig_i_S3      = gmx_load1_pr(ljc+sci2+3);
 618         seps_i_S0      = gmx_load1_pr(ljc+sci2+STRIDE+0);
 619         seps_i_S1      = gmx_load1_pr(ljc+sci2+STRIDE+1);
 620         seps_i_S2      = gmx_load1_pr(ljc+sci2+STRIDE+2);
 621         seps_i_S3      = gmx_load1_pr(ljc+sci2+STRIDE+3);
 622 #else
 623 #ifdef LJ_COMB_GEOM
 624         c6s_S0         = gmx_load1_pr(ljc+sci2+0);
 625         c6s_S1         = gmx_load1_pr(ljc+sci2+1);
 626         if (!half_LJ)
 627         {
 628             c6s_S2     = gmx_load1_pr(ljc+sci2+2);
 629             c6s_S3     = gmx_load1_pr(ljc+sci2+3);
 630         }
 631         c12s_S0        = gmx_load1_pr(ljc+sci2+STRIDE+0);
 632         c12s_S1        = gmx_load1_pr(ljc+sci2+STRIDE+1);
 633         if (!half_LJ)
 634         {
 635             c12s_S2    = gmx_load1_pr(ljc+sci2+STRIDE+2);
 636             c12s_S3    = gmx_load1_pr(ljc+sci2+STRIDE+3);
 637         }
 638 #else
 639         nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
 640         nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
 641         if (!half_LJ)
 642         {
 643             nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
 644             nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
 645         }
 646 #endif
 647 #endif
 648
 649         /* Zero the potential energy for this list */
 650         Vvdwtot_S        = gmx_setzero_pr();
 651         vctot_S          = gmx_setzero_pr();
 652
 653         /* Clear i atom forces */
 654         fix_S0           = gmx_setzero_pr();
 655         fix_S1           = gmx_setzero_pr();
 656         fix_S2           = gmx_setzero_pr();
 657         fix_S3           = gmx_setzero_pr();
 658         fiy_S0           = gmx_setzero_pr();
 659         fiy_S1           = gmx_setzero_pr();
 660         fiy_S2           = gmx_setzero_pr();
 661         fiy_S3           = gmx_setzero_pr();
 662         fiz_S0           = gmx_setzero_pr();
 663         fiz_S1           = gmx_setzero_pr();
 664         fiz_S2           = gmx_setzero_pr();
 665         fiz_S3           = gmx_setzero_pr();
 666
 667         cjind = cjind0;
 668
 669         /* Currently all kernels use (at least half) LJ */
 670 #define CALC_LJ
 671         if (half_LJ)
 672         {
 673 #define CALC_COULOMB
 674 #define HALF_LJ
 675 #define CHECK_EXCLS
 676             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
 677             {
 678 #include "nbnxn_kernel_simd_4xn_inner.h"
 679                 cjind++;
 680             }
 681 #undef CHECK_EXCLS
 682             for (; (cjind < cjind1); cjind++)
 683             {
 684 #include "nbnxn_kernel_simd_4xn_inner.h"
 685             }
 686 #undef HALF_LJ
 687 #undef CALC_COULOMB
 688         }
 689         else if (do_coul)
 690         {
 691 #define CALC_COULOMB
 692 #define CHECK_EXCLS
 693             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
 694             {
 695 #include "nbnxn_kernel_simd_4xn_inner.h"
 696                 cjind++;
 697             }
 698 #undef CHECK_EXCLS
 699             for (; (cjind < cjind1); cjind++)
 700             {
 701 #include "nbnxn_kernel_simd_4xn_inner.h"
 702             }
 703 #undef CALC_COULOMB
 704         }
 705         else
 706         {
 707 #define CHECK_EXCLS
 708             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
 709             {
 710 #include "nbnxn_kernel_simd_4xn_inner.h"
 711                 cjind++;
 712             }
 713 #undef CHECK_EXCLS
 714             for (; (cjind < cjind1); cjind++)
 715             {
 716 #include "nbnxn_kernel_simd_4xn_inner.h"
 717             }
 718         }
 719 #undef CALC_LJ
 720         ninner += cjind1 - cjind0;
 721
 722         /* Add accumulated i-forces to the force array */
 723 #if UNROLLJ >= 4
 724 #ifndef GMX_DOUBLE
 725 #define gmx_load_pr4  _mm_load_ps
 726 #define gmx_store_pr4 _mm_store_ps
 727 #define gmx_add_pr4   _mm_add_ps
 728 #else
 729 #define gmx_load_pr4  _mm256_load_pd
 730 #define gmx_store_pr4 _mm256_store_pd
 731 #define gmx_add_pr4   _mm256_add_pd
 732 #endif
 733         GMX_MM_TRANSPOSE_SUM4_PR(fix_S0, fix_S1, fix_S2, fix_S3, fix_S);
 734         gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
 735
 736         GMX_MM_TRANSPOSE_SUM4_PR(fiy_S0, fiy_S1, fiy_S2, fiy_S3, fiy_S);
 737         gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
 738
 739         GMX_MM_TRANSPOSE_SUM4_PR(fiz_S0, fiz_S1, fiz_S2, fiz_S3, fiz_S);
 740         gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
 741
 742 #ifdef CALC_SHIFTFORCES
 743         gmx_store_pr4(shf, fix_S);
 744         fshift[ish3+0] += SUM_SIMD4(shf);
 745         gmx_store_pr4(shf, fiy_S);
 746         fshift[ish3+1] += SUM_SIMD4(shf);
 747         gmx_store_pr4(shf, fiz_S);
 748         fshift[ish3+2] += SUM_SIMD4(shf);
 749 #endif
 750 #else
 751         GMX_MM_TRANSPOSE_SUM2_PD(fix_S0, fix_S1, fix0_S);
 752         _mm_store_pd(f+scix, _mm_add_pd(fix0_S, _mm_load_pd(f+scix)));
 753         GMX_MM_TRANSPOSE_SUM2_PD(fix_S2, fix_S3, fix2_S);
 754         _mm_store_pd(f+scix+2, _mm_add_pd(fix2_S, _mm_load_pd(f+scix+2)));
 755
 756         GMX_MM_TRANSPOSE_SUM2_PD(fiy_S0, fiy_S1, fiy0_S);
 757         _mm_store_pd(f+sciy, _mm_add_pd(fiy0_S, _mm_load_pd(f+sciy)));
 758         GMX_MM_TRANSPOSE_SUM2_PD(fiy_S2, fiy_S3, fiy2_S);
 759         _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_S, _mm_load_pd(f+sciy+2)));
 760
 761         GMX_MM_TRANSPOSE_SUM2_PD(fiz_S0, fiz_S1, fiz0_S);
 762         _mm_store_pd(f+sciz, _mm_add_pd(fiz0_S, _mm_load_pd(f+sciz)));
 763         GMX_MM_TRANSPOSE_SUM2_PD(fiz_S2, fiz_S3, fiz2_S);
 764         _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_S, _mm_load_pd(f+sciz+2)));
 765
 766 #ifdef CALC_SHIFTFORCES
 767         _mm_store_pd(shf, _mm_add_pd(fix0_S, fix2_S));
 768         fshift[ish3+0] += shf[0] + shf[1];
 769         _mm_store_pd(shf, _mm_add_pd(fiy0_S, fiy2_S));
 770         fshift[ish3+1] += shf[0] + shf[1];
 771         _mm_store_pd(shf, _mm_add_pd(fiz0_S, fiz2_S));
 772         fshift[ish3+2] += shf[0] + shf[1];
 773 #endif
 774 #endif
 775
 776 #ifdef CALC_ENERGIES
 777         if (do_coul)
 778         {
 779             gmx_store_pr(tmpsum, vctot_S);
 780             *Vc += SUM_SIMD(tmpsum);
 781         }
 782
 783         gmx_store_pr(tmpsum, Vvdwtot_S);
 784         *Vvdw += SUM_SIMD(tmpsum);
 785 #endif
 786
 787         /* Outer loop uses 6 flops/iteration */
 788     }
 789
 790 #ifdef COUNT_PAIRS
 791     printf("atom pairs %d\n", npair);
 792 #endif
 793 }
 794
 795
 796 #undef gmx_load_pr4
 797 #undef gmx_store_pr4
 798 #undef gmx_store_pr4
 799
 800 #undef CALC_SHIFTFORCES
 801
 802 #undef UNROLLI
 803 #undef UNROLLJ
 804 #undef STRIDE
 805 #undef TAB_FDV0
 806 #undef NBFP_STRIDE
 807
 808 #undef GMX_USE_HALF_WIDTH_SIMD_HERE