src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2009, The GROMACS Development Team
   6  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   7  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   8  * others, as listed in the AUTHORS file in the top-level source
   9  * directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38
  39 /* Half-width SIMD operations are required here.
  40  * As the 4xn kernels are the "standard" kernels and some special operations
  41  * are required only here, we define those in nbnxn_kernel_simd_utils_...
  42  *
  43  * Half-width SIMD real type:
  44  * gmx_mm_hpr
  45  *
  46  * Half-width SIMD operations
  47  * Load reals at half-width aligned pointer b into half-width SIMD register a:
  48  * gmx_load_hpr(a, b)
  49  * Set all entries in half-width SIMD register *a to b:
  50  * gmx_set1_hpr(a, b)
  51  * Load one real at b and one real at b+1 into halves of a, respectively:
  52  * gmx_load1p1_pr(a, b)
  53  * Load reals at half-width aligned pointer b into two halves of a:
  54  * gmx_loaddh_pr(a, b)
  55  * Store half-width SIMD register b into half width aligned memory a:
  56  * gmx_store_hpr(a, b)
  57  * gmx_add_hpr(a, b)
  58  * gmx_sub_hpr(a, b)
  59  * Sum over 4 half SIMD registers:
  60  * gmx_sum4_hpr(a, b)
  61  * Sum the elements of halfs of each input register and store sums in out:
  62  * gmx_mm_transpose_sum4h_pr(a, b)
  63  * Extract two half-width registers *b, *c from a full width register a:
  64  * gmx_pr_to_2hpr(a, b, c)
  65  */
  66
  67
  68 #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
  69
  70 #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
  71 #define UNROLLJ    (GMX_SIMD_WIDTH_HERE/2)
  72
  73 /* The stride of all the atom data arrays is equal to half the SIMD width */
  74 #define STRIDE     (GMX_SIMD_WIDTH_HERE/2)
  75
  76 #if GMX_SIMD_WIDTH_HERE == 8
  77 #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
  78 #else
  79 #if GMX_SIMD_WIDTH_HERE == 16
  80 /* This is getting ridiculous, SIMD horizontal adds would help,
  81  * but this is not performance critical (only used to reduce energies)
  82  */
  83 #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7]+x[8]+x[9]+x[10]+x[11]+x[12]+x[13]+x[14]+x[15])
  84 #else
  85 #error "unsupported kernel configuration"
  86 #endif
  87 #endif
  88
  89
  90 #if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
  91 /* AVX-256 single precision 2x(4+4) kernel,
  92  * we can do half SIMD-width aligned FDV0 table loads.
  93  */
  94 #define TAB_FDV0
  95 #endif
  96
  97 /* Currently stride 4 for the 2 LJ parameters is hard coded */
  98 #define NBFP_STRIDE  4
  99
 100
 101 #include "nbnxn_kernel_simd_utils.h"
 102
 103 /* All functionality defines are set here, except for:
 104  * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
 105  * CHECK_EXCLS, which is set just before including the inner loop contents.
 106  * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB are currently
 107  * set before calling the kernel function. We might want to move that
 108  * to inside the n-loop and have a different combination rule for different
 109  * ci's, as no combination rule gives a 50% performance hit for LJ.
 110  */
 111
 112 /* We always calculate shift forces, because it's cheap anyhow */
 113 #define CALC_SHIFTFORCES
 114
 115 /* Assumes all LJ parameters are identical */
 116 /* #define FIX_LJ_C */
 117
 118 /* The NBK_FUNC_NAME... macros below generate the whole zoo of kernels names
 119  * with all combinations off electrostatics (coul), LJ combination rules (ljc)
 120  * and energy calculations (ene), depending on the defines set.
 121  */
 122
 123 #define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene
 124
 125 #if defined LJ_COMB_GEOM
 126 #define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
 127 #else
 128 #if defined LJ_COMB_LB
 129 #define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
 130 #else
 131 #define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
 132 #endif
 133 #endif
 134
 135 #ifdef CALC_COUL_RF
 136 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
 137 #endif
 138 #ifdef CALC_COUL_TAB
 139 #ifndef VDW_CUTOFF_CHECK
 140 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
 141 #else
 142 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
 143 #endif
 144 #endif
 145 #ifdef CALC_COUL_EWALD
 146 #ifndef VDW_CUTOFF_CHECK
 147 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
 148 #else
 149 #define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
 150 #endif
 151 #endif
 152
 153 static void
 154 #ifndef CALC_ENERGIES
 155 NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, noener)
 156 #else
 157 #ifndef ENERGY_GROUPS
 158 NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, ener)
 159 #else
 160 NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, energrp)
 161 #endif
 162 #endif
 163 #undef NBK_FUNC_NAME
 164 #undef NBK_FUNC_NAME_C
 165 #undef NBK_FUNC_NAME_C_LJC
 166 (const nbnxn_pairlist_t     *nbl,
 167  const nbnxn_atomdata_t     *nbat,
 168  const interaction_const_t  *ic,
 169  rvec                       *shift_vec,
 170  real                       *f
 171 #ifdef CALC_SHIFTFORCES
 172  ,
 173  real                       *fshift
 174 #endif
 175 #ifdef CALC_ENERGIES
 176  ,
 177  real                       *Vvdw,
 178  real                       *Vc
 179 #endif
 180 )
 181 {
 182     const nbnxn_ci_t   *nbln;
 183     const nbnxn_cj_t   *l_cj;
 184     const int          *type;
 185     const real         *q;
 186     const real         *shiftvec;
 187     const real         *x;
 188     const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
 189     real                facel;
 190     real               *nbfp_ptr;
 191     int                 nbfp_stride;
 192     int                 n, ci, ci_sh;
 193     int                 ish, ish3;
 194     gmx_bool            do_LJ, half_LJ, do_coul;
 195     int                 sci, scix, sciy, sciz, sci2;
 196     int                 cjind0, cjind1, cjind;
 197     int                 ip, jp;
 198
 199 #ifdef ENERGY_GROUPS
 200     int         Vstride_i;
 201     int         egps_ishift, egps_imask;
 202     int         egps_jshift, egps_jmask, egps_jstride;
 203     int         egps_i;
 204     real       *vvdwtp[UNROLLI];
 205     real       *vctp[UNROLLI];
 206 #endif
 207
 208     gmx_mm_pr  shX_S;
 209     gmx_mm_pr  shY_S;
 210     gmx_mm_pr  shZ_S;
 211     gmx_mm_pr  ix_S0, iy_S0, iz_S0;
 212     gmx_mm_pr  ix_S2, iy_S2, iz_S2;
 213     gmx_mm_pr  fix_S0, fiy_S0, fiz_S0;
 214     gmx_mm_pr  fix_S2, fiy_S2, fiz_S2;
 215     /* We use an i-force SIMD register width of 4 */
 216     /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
 217     gmx_mm_pr4 fix_S, fiy_S, fiz_S;
 218
 219     gmx_mm_pr  diagonal_jmi_S;
 220 #if UNROLLI == UNROLLJ
 221     gmx_mm_pb  diagonal_mask_S0, diagonal_mask_S2;
 222 #else
 223     gmx_mm_pb  diagonal_mask0_S0, diagonal_mask0_S2;
 224     gmx_mm_pb  diagonal_mask1_S0, diagonal_mask1_S2;
 225 #endif
 226
 227     unsigned   *excl_filter;
 228 #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
 229     gmx_epi32  filter_S0, filter_S2;
 230 #else
 231     gmx_mm_pr  filter_S0, filter_S2;
 232 #endif
 233
 234     gmx_mm_pr  zero_S = gmx_set1_pr(0);
 235
 236     gmx_mm_pr  one_S = gmx_set1_pr(1.0);
 237     gmx_mm_pr  iq_S0 = gmx_setzero_pr();
 238     gmx_mm_pr  iq_S2 = gmx_setzero_pr();
 239     gmx_mm_pr  mrc_3_S;
 240 #ifdef CALC_ENERGIES
 241     gmx_mm_pr  hrc_3_S, moh_rc_S;
 242 #endif
 243
 244 #ifdef CALC_COUL_TAB
 245     /* Coulomb table variables */
 246     gmx_mm_pr   invtsp_S;
 247     const real *tab_coul_F;
 248 #ifndef TAB_FDV0
 249     const real *tab_coul_V;
 250 #endif
 251     int        ti0_array[2*GMX_SIMD_WIDTH_HERE], *ti0;
 252     int        ti2_array[2*GMX_SIMD_WIDTH_HERE], *ti2;
 253 #ifdef CALC_ENERGIES
 254     gmx_mm_pr  mhalfsp_S;
 255 #endif
 256 #endif
 257
 258 #ifdef CALC_COUL_EWALD
 259     gmx_mm_pr beta2_S, beta_S;
 260 #endif
 261
 262 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 263     gmx_mm_pr  sh_ewald_S;
 264 #endif
 265
 266 #ifdef LJ_COMB_LB
 267     const real *ljc;
 268
 269     gmx_mm_pr   hsig_i_S0, seps_i_S0;
 270     gmx_mm_pr   hsig_i_S2, seps_i_S2;
 271 #else
 272 #ifdef FIX_LJ_C
 273     real        pvdw_array[2*UNROLLI*UNROLLJ+GMX_SIMD_WIDTH_HERE];
 274     real       *pvdw_c6, *pvdw_c12;
 275     gmx_mm_pr   c6_S0, c12_S0;
 276     gmx_mm_pr   c6_S2, c12_S2;
 277 #endif
 278
 279 #ifdef LJ_COMB_GEOM
 280     const real *ljc;
 281
 282     gmx_mm_pr   c6s_S0, c12s_S0;
 283     gmx_mm_pr   c6s_S1, c12s_S1;
 284     gmx_mm_pr   c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
 285     gmx_mm_pr   c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
 286 #endif
 287 #endif /* LJ_COMB_LB */
 288
 289     gmx_mm_pr  vctot_S, Vvdwtot_S;
 290     gmx_mm_pr  sixth_S, twelveth_S;
 291
 292     gmx_mm_pr  avoid_sing_S;
 293     gmx_mm_pr  rc2_S;
 294 #ifdef VDW_CUTOFF_CHECK
 295     gmx_mm_pr  rcvdw2_S;
 296 #endif
 297
 298 #ifdef CALC_ENERGIES
 299     gmx_mm_pr  sh_invrc6_S, sh_invrc12_S;
 300
 301     /* cppcheck-suppress unassignedVariable */
 302     real       tmpsum_array[2*GMX_SIMD_WIDTH_HERE], *tmpsum;
 303 #endif
 304 #ifdef CALC_SHIFTFORCES
 305     /* cppcheck-suppress unassignedVariable */
 306     real       shf_array[2*GMX_SIMD_WIDTH_HERE], *shf;
 307 #endif
 308
 309     int ninner;
 310
 311 #ifdef COUNT_PAIRS
 312     int npair = 0;
 313 #endif
 314
 315 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
 316     ljc = nbat->lj_comb;
 317 #else
 318     /* No combination rule used */
 319 #if NBFP_STRIDE == 2
 320     nbfp_ptr    = nbat->nbfp;
 321 #else
 322 #if NBFP_STRIDE == 4
 323     nbfp_ptr    = nbat->nbfp_s4;
 324 #else
 325 #error "Only NBFP_STRIDE 2 and 4 are currently supported"
 326 #endif
 327 #endif
 328     nbfp_stride = NBFP_STRIDE;
 329 #endif
 330
 331     /* Load j-i for the first i */
 332     diagonal_jmi_S    = gmx_load_pr(nbat->simd_2xnn_diagonal_j_minus_i);
 333     /* Generate all the diagonal masks as comparison results */
 334 #if UNROLLI == UNROLLJ
 335     diagonal_mask_S0  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 336     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 337     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 338     diagonal_mask_S2  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 339 #else
 340 #if 2*UNROLLI == UNROLLJ
 341     diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 342     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 343     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 344     diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 345     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 346     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 347     diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 348     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 349     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 350     diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 351 #endif
 352 #endif
 353
 354     /* Load masks for topology exclusion masking */
 355 #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
 356 #define FILTER_STRIDE  (GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE)
 357 #else
 358 #ifdef GMX_DOUBLE
 359 #define FILTER_STRIDE  2
 360 #else
 361 #define FILTER_STRIDE  1
 362 #endif
 363 #endif
 364 #if FILTER_STRIDE == 1
 365     excl_filter = nbat->simd_exclusion_filter1;
 366 #else
 367     excl_filter = nbat->simd_exclusion_filter2;
 368 #endif
 369     /* Here we cast the exclusion filters from unsigned * to int * or real *.
 370      * Since we only check bits, the actual value they represent does not
 371      * matter, as long as both filter and mask data are treated the same way.
 372      */
 373 #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
 374     filter_S0 = gmx_load_si((int *)excl_filter + 0*2*UNROLLJ*FILTER_STRIDE);
 375     filter_S2 = gmx_load_si((int *)excl_filter + 1*2*UNROLLJ*FILTER_STRIDE);
 376 #else
 377     filter_S0 = gmx_load_pr((real *)excl_filter + 0*2*UNROLLJ);
 378     filter_S2 = gmx_load_pr((real *)excl_filter + 1*2*UNROLLJ);
 379 #endif
 380 #undef FILTER_STRIDE
 381
 382 #ifdef CALC_COUL_TAB
 383     /* Generate aligned table index pointers */
 384     ti0 = gmx_simd_align_int(ti0_array);
 385     ti2 = gmx_simd_align_int(ti2_array);
 386
 387     invtsp_S  = gmx_set1_pr(ic->tabq_scale);
 388 #ifdef CALC_ENERGIES
 389     mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
 390 #endif
 391
 392 #ifdef TAB_FDV0
 393     tab_coul_F = ic->tabq_coul_FDV0;
 394 #else
 395     tab_coul_F = ic->tabq_coul_F;
 396     tab_coul_V = ic->tabq_coul_V;
 397 #endif
 398 #endif /* CALC_COUL_TAB */
 399
 400 #ifdef CALC_COUL_EWALD
 401     beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
 402     beta_S  = gmx_set1_pr(ic->ewaldcoeff);
 403 #endif
 404
 405 #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
 406     sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
 407 #endif
 408
 409     q                   = nbat->q;
 410     type                = nbat->type;
 411     facel               = ic->epsfac;
 412     shiftvec            = shift_vec[0];
 413     x                   = nbat->x;
 414
 415     avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
 416
 417     /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
 418     rc2_S    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
 419 #ifdef VDW_CUTOFF_CHECK
 420     rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
 421 #endif
 422
 423 #ifdef CALC_ENERGIES
 424     sixth_S      = gmx_set1_pr(1.0/6.0);
 425     twelveth_S   = gmx_set1_pr(1.0/12.0);
 426
 427     sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
 428     sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
 429 #endif
 430
 431     mrc_3_S  = gmx_set1_pr(-2*ic->k_rf);
 432
 433 #ifdef CALC_ENERGIES
 434     hrc_3_S  = gmx_set1_pr(ic->k_rf);
 435
 436     moh_rc_S = gmx_set1_pr(-ic->c_rf);
 437 #endif
 438
 439 #ifdef CALC_ENERGIES
 440     tmpsum   = gmx_simd_align_real(tmpsum_array);
 441 #endif
 442 #ifdef CALC_SHIFTFORCES
 443     shf      = gmx_simd_align_real(shf_array);
 444 #endif
 445
 446 #ifdef FIX_LJ_C
 447     pvdw_c6  = gmx_simd_align_real(pvdw_array);
 448     pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
 449
 450     for (jp = 0; jp < UNROLLJ; jp++)
 451     {
 452         pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
 453         pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
 454         pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
 455         pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
 456
 457         pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 458         pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 459         pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 460         pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 461     }
 462     c6_S0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
 463     c6_S1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
 464     c6_S2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
 465     c6_S3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
 466
 467     c12_S0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
 468     c12_S1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
 469     c12_S2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
 470     c12_S3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
 471 #endif /* FIX_LJ_C */
 472
 473 #ifdef ENERGY_GROUPS
 474     egps_ishift  = nbat->neg_2log;
 475     egps_imask   = (1<<egps_ishift) - 1;
 476     egps_jshift  = 2*nbat->neg_2log;
 477     egps_jmask   = (1<<egps_jshift) - 1;
 478     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
 479     /* Major division is over i-particle energy groups, determine the stride */
 480     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 481 #endif
 482
 483     l_cj = nbl->cj;
 484
 485     ninner = 0;
 486     for (n = 0; n < nbl->nci; n++)
 487     {
 488         nbln = &nbl->ci[n];
 489
 490         ish              = (nbln->shift & NBNXN_CI_SHIFT);
 491         ish3             = ish*3;
 492         cjind0           = nbln->cj_ind_start;
 493         cjind1           = nbln->cj_ind_end;
 494         ci               = nbln->ci;
 495         ci_sh            = (ish == CENTRAL ? ci : -1);
 496
 497         shX_S = gmx_load1_pr(shiftvec+ish3);
 498         shY_S = gmx_load1_pr(shiftvec+ish3+1);
 499         shZ_S = gmx_load1_pr(shiftvec+ish3+2);
 500
 501 #if UNROLLJ <= 4
 502         sci              = ci*STRIDE;
 503         scix             = sci*DIM;
 504         sci2             = sci*2;
 505 #else
 506         sci              = (ci>>1)*STRIDE;
 507         scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
 508         sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
 509         sci             += (ci & 1)*(STRIDE>>1);
 510 #endif
 511
 512         /* We have 5 LJ/C combinations, but use only three inner loops,
 513          * as the other combinations are unlikely and/or not much faster:
 514          * inner half-LJ + C for half-LJ + C / no-LJ + C
 515          * inner LJ + C      for full-LJ + C
 516          * inner LJ          for full-LJ + no-C / half-LJ + no-C
 517          */
 518         do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
 519         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
 520         half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 521
 522 #ifdef ENERGY_GROUPS
 523         egps_i = nbat->energrp[ci];
 524         {
 525             int ia, egp_ia;
 526
 527             for (ia = 0; ia < UNROLLI; ia++)
 528             {
 529                 egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
 530                 vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
 531                 vctp[ia]   = Vc   + egp_ia*Vstride_i;
 532             }
 533         }
 534 #endif
 535 #if defined CALC_ENERGIES
 536 #if UNROLLJ == 4
 537         if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
 538 #endif
 539 #if UNROLLJ == 8
 540         if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
 541 #endif
 542         {
 543             int  ia;
 544             real Vc_sub_self;
 545
 546 #ifdef CALC_COUL_RF
 547             Vc_sub_self = 0.5*ic->c_rf;
 548 #endif
 549 #ifdef CALC_COUL_TAB
 550 #ifdef TAB_FDV0
 551             Vc_sub_self = 0.5*tab_coul_F[2];
 552 #else
 553             Vc_sub_self = 0.5*tab_coul_V[0];
 554 #endif
 555 #endif
 556 #ifdef CALC_COUL_EWALD
 557             /* beta/sqrt(pi) */
 558             Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
 559 #endif
 560
 561             for (ia = 0; ia < UNROLLI; ia++)
 562             {
 563                 real qi;
 564
 565                 qi = q[sci+ia];
 566 #ifdef ENERGY_GROUPS
 567                 vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
 568 #else
 569                 Vc[0]
 570 #endif
 571                     -= facel*qi*qi*Vc_sub_self;
 572             }
 573         }
 574 #endif
 575
 576         /* Load i atom data */
 577         sciy             = scix + STRIDE;
 578         sciz             = sciy + STRIDE;
 579         gmx_load1p1_pr(&ix_S0, x+scix);
 580         gmx_load1p1_pr(&ix_S2, x+scix+2);
 581         gmx_load1p1_pr(&iy_S0, x+sciy);
 582         gmx_load1p1_pr(&iy_S2, x+sciy+2);
 583         gmx_load1p1_pr(&iz_S0, x+sciz);
 584         gmx_load1p1_pr(&iz_S2, x+sciz+2);
 585         ix_S0          = gmx_add_pr(ix_S0, shX_S);
 586         ix_S2          = gmx_add_pr(ix_S2, shX_S);
 587         iy_S0          = gmx_add_pr(iy_S0, shY_S);
 588         iy_S2          = gmx_add_pr(iy_S2, shY_S);
 589         iz_S0          = gmx_add_pr(iz_S0, shZ_S);
 590         iz_S2          = gmx_add_pr(iz_S2, shZ_S);
 591
 592         if (do_coul)
 593         {
 594             gmx_mm_pr facel_S;
 595
 596             facel_S    = gmx_set1_pr(facel);
 597
 598             gmx_load1p1_pr(&iq_S0, q+sci);
 599             gmx_load1p1_pr(&iq_S2, q+sci+2);
 600             iq_S0      = gmx_mul_pr(facel_S, iq_S0);
 601             iq_S2      = gmx_mul_pr(facel_S, iq_S2);
 602         }
 603
 604 #ifdef LJ_COMB_LB
 605         gmx_load1p1_pr(&hsig_i_S0, ljc+sci2+0);
 606         gmx_load1p1_pr(&hsig_i_S2, ljc+sci2+2);
 607         gmx_load1p1_pr(&seps_i_S0, ljc+sci2+STRIDE+0);
 608         gmx_load1p1_pr(&seps_i_S2, ljc+sci2+STRIDE+2);
 609 #else
 610 #ifdef LJ_COMB_GEOM
 611         gmx_load1p1_pr(&c6s_S0, ljc+sci2+0);
 612         if (!half_LJ)
 613         {
 614             gmx_load1p1_pr(&c6s_S2, ljc+sci2+2);
 615         }
 616         gmx_load1p1_pr(&c12s_S0, ljc+sci2+STRIDE+0);
 617         if (!half_LJ)
 618         {
 619             gmx_load1p1_pr(&c12s_S2, ljc+sci2+STRIDE+2);
 620         }
 621 #else
 622         nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
 623         nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
 624         if (!half_LJ)
 625         {
 626             nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
 627             nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
 628         }
 629 #endif
 630 #endif
 631
 632         /* Zero the potential energy for this list */
 633         Vvdwtot_S        = gmx_setzero_pr();
 634         vctot_S          = gmx_setzero_pr();
 635
 636         /* Clear i atom forces */
 637         fix_S0           = gmx_setzero_pr();
 638         fix_S2           = gmx_setzero_pr();
 639         fiy_S0           = gmx_setzero_pr();
 640         fiy_S2           = gmx_setzero_pr();
 641         fiz_S0           = gmx_setzero_pr();
 642         fiz_S2           = gmx_setzero_pr();
 643
 644         cjind = cjind0;
 645
 646         /* Currently all kernels use (at least half) LJ */
 647 #define CALC_LJ
 648         if (half_LJ)
 649         {
 650 #define CALC_COULOMB
 651 #define HALF_LJ
 652 #define CHECK_EXCLS
 653             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 654             {
 655 #include "nbnxn_kernel_simd_2xnn_inner.h"
 656                 cjind++;
 657             }
 658 #undef CHECK_EXCLS
 659             for (; (cjind < cjind1); cjind++)
 660             {
 661 #include "nbnxn_kernel_simd_2xnn_inner.h"
 662             }
 663 #undef HALF_LJ
 664 #undef CALC_COULOMB
 665         }
 666         else if (do_coul)
 667         {
 668 #define CALC_COULOMB
 669 #define CHECK_EXCLS
 670             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 671             {
 672 #include "nbnxn_kernel_simd_2xnn_inner.h"
 673                 cjind++;
 674             }
 675 #undef CHECK_EXCLS
 676             for (; (cjind < cjind1); cjind++)
 677             {
 678 #include "nbnxn_kernel_simd_2xnn_inner.h"
 679             }
 680 #undef CALC_COULOMB
 681         }
 682         else
 683         {
 684 #define CHECK_EXCLS
 685             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 686             {
 687 #include "nbnxn_kernel_simd_2xnn_inner.h"
 688                 cjind++;
 689             }
 690 #undef CHECK_EXCLS
 691             for (; (cjind < cjind1); cjind++)
 692             {
 693 #include "nbnxn_kernel_simd_2xnn_inner.h"
 694             }
 695         }
 696 #undef CALC_LJ
 697         ninner += cjind1 - cjind0;
 698
 699         /* Add accumulated i-forces to the force array */
 700         fix_S = gmx_mm_transpose_sum4h_pr(fix_S0, fix_S2);
 701         gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
 702
 703         fiy_S = gmx_mm_transpose_sum4h_pr(fiy_S0, fiy_S2);
 704         gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
 705
 706         fiz_S = gmx_mm_transpose_sum4h_pr(fiz_S0, fiz_S2);
 707         gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
 708
 709 #ifdef CALC_SHIFTFORCES
 710         gmx_store_pr4(shf, fix_S);
 711         fshift[ish3+0] += SUM_SIMD4(shf);
 712         gmx_store_pr4(shf, fiy_S);
 713         fshift[ish3+1] += SUM_SIMD4(shf);
 714         gmx_store_pr4(shf, fiz_S);
 715         fshift[ish3+2] += SUM_SIMD4(shf);
 716 #endif
 717
 718 #ifdef CALC_ENERGIES
 719         if (do_coul)
 720         {
 721             gmx_store_pr(tmpsum, vctot_S);
 722             *Vc += SUM_SIMD(tmpsum);
 723         }
 724
 725         gmx_store_pr(tmpsum, Vvdwtot_S);
 726         *Vvdw += SUM_SIMD(tmpsum);
 727 #endif
 728
 729         /* Outer loop uses 6 flops/iteration */
 730     }
 731
 732 #ifdef COUNT_PAIRS
 733     printf("atom pairs %d\n", npair);
 734 #endif
 735 }
 736
 737
 738 #undef CALC_SHIFTFORCES
 739
 740 #undef UNROLLI
 741 #undef UNROLLJ
 742 #undef STRIDE
 743 #undef TAB_FDV0
 744 #undef NBFP_STRIDE