/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team
 * Copyright (c) 2012, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* GMX_MM256_HERE should be set before including this file */
#include "gmx_simd_macros.h"

#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])

#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
#define UNROLLJ    (GMX_SIMD_WIDTH_HERE/2)
#if defined GMX_MM256_HERE
/* single precision 2x(4+4) kernel */
#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
#else
#error "unsupported kernel configuration"
#endif

#define SIMD_MASK_ALL 0xffffffff
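/* Illustration, not from the original source: with 8-wide AVX-256 single
 * precision, GMX_SIMD_WIDTH_HERE is 8, so UNROLLJ = 8/2 = 4 and one 256-bit
 * register holds two i-atoms times one 4-atom j-cluster, which is the
 * 2x(4+4) layout named above; SUM_SIMD then reduces all 8 lanes and
 * SUM_SIMD4 reduces a 4-wide quarter.
 */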
#include "nbnxn_kernel_simd_utils.h"
/* All functionality defines are set here, except for:
 * CALC_ENERGIES and ENERGY_GROUPS, which are defined before this point.
 * CHECK_EXCLS, which is set just before including the inner-loop contents.
 * The combination-rule defines, LJ_COMB_GEOM and LJ_COMB_LB, are currently
 * set before calling the kernel function. We might want to move that
 * to inside the n-loop and use a different combination rule for different
 * ci's, since using no combination rule gives a 50% performance hit for LJ.
 */
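/* Illustrative sketch, not part of the original source: a driver file
 * typically selects one kernel flavor by setting the defines above and then
 * including this file, e.g.
 *
 *   #define CALC_COUL_RF
 *   #define LJ_COMB_GEOM
 *   #define CALC_ENERGIES
 *   #include "nbnxn_kernel_simd_2xnn_outer.h"
 *
 * The included file name is hypothetical here; the define names are the
 * ones documented above.
 */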
/* We always calculate shift forces, because it's cheap anyhow */
#define CALC_SHIFTFORCES

/* Assumes all LJ parameters are identical */
/* #define FIX_LJ_C */
/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
 * with all combinations of electrostatics (coul), LJ combination rules (ljc)
 * and energy calculations (ene), depending on the defines set.
 */
#define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene

#if defined LJ_COMB_GEOM
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
#else
#if defined LJ_COMB_LB
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
#else
#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
#endif
#endif

#ifdef CALC_COUL_RF
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
#endif
#ifdef CALC_COUL_TAB
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
#else
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
#endif
#endif
#ifdef CALC_COUL_EWALD
#ifndef VDW_CUTOFF_CHECK
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
#else
#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
#endif
#endif
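/* For example, with CALC_COUL_RF, LJ_COMB_GEOM and CALC_ENERGIES defined,
 * NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, ener) expands to the kernel name
 * nbnxn_kernel_simd_2xnn_rf_comb_geom_ener.
 */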
static void
#ifndef CALC_ENERGIES
NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, noener)
#else
#ifndef ENERGY_GROUPS
NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, ener)
#else
NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, energrp)
#endif
#endif
#undef NBK_FUNC_NAME
#undef NBK_FUNC_NAME_C
#undef NBK_FUNC_NAME_C_LJC
(const nbnxn_pairlist_t     *nbl,
 const nbnxn_atomdata_t     *nbat,
 const interaction_const_t  *ic,
 rvec                       *shift_vec,
 real                       *f
#ifdef CALC_SHIFTFORCES
 ,
 real                       *fshift
#endif
#ifdef CALC_ENERGIES
 ,
 real                       *Vvdw,
 real                       *Vc
#endif
)
{
    const nbnxn_ci_t   *nbln;
    const nbnxn_cj_t   *l_cj;
    const int          *type;
    const real         *q;
    const real         *shiftvec;
    const real         *x;
    const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
    real                facel;
    real               *nbfp_ptr;
    int                 nbfp_stride;
    int                 n, ci, ci_sh;
    int                 ish, ish3;
    gmx_bool            do_LJ, half_LJ, do_coul;
    int                 sci, scix, sciy, sciz, sci2;
    int                 cjind0, cjind1, cjind;
    int                 ninner;

#ifdef ENERGY_GROUPS
    int                 Vstride_i;
    int                 egps_ishift, egps_imask;
    int                 egps_jshift, egps_jmask, egps_jstride;
    int                 egps_i;
    real               *vvdwtp[UNROLLI];
    real               *vctp[UNROLLI];
#endif
    gmx_mm_pr  shX_SSE;
    gmx_mm_pr  shY_SSE;
    gmx_mm_pr  shZ_SSE;
    gmx_mm_pr  ix_SSE0, iy_SSE0, iz_SSE0;
    gmx_mm_pr  ix_SSE2, iy_SSE2, iz_SSE2;
    gmx_mm_pr  fix_SSE0, fiy_SSE0, fiz_SSE0;
    gmx_mm_pr  fix_SSE2, fiy_SSE2, fiz_SSE2;
#ifndef GMX_DOUBLE
    __m128     fix_SSE, fiy_SSE, fiz_SSE;
#else
    __m256d    fix_SSE, fiy_SSE, fiz_SSE;
#endif
    __m128d    fix0_SSE, fiy0_SSE, fiz0_SSE;
    __m128d    fix2_SSE, fiy2_SSE, fiz2_SSE;
    /* AVX: use floating point masks, as there are no integer instructions */
    gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32(0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001));
    gmx_mm_pr  mask2 = _mm256_castsi256_ps(_mm256_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100));
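    /* Explanatory note, inferred from the constants above: a 4x4 cluster
     * pair has 16 interaction bits in its exclusion mask. mask0 picks out
     * bits 0-7 (i-atoms 0 and 1, each against j-atoms 0-3), mask2 picks
     * out bits 8-15 (i-atoms 2 and 3). Note that _mm256_set_epi32 lists
     * its arguments from the highest lane down to the lowest.
     */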
    gmx_mm_pr  diag_jmi_SSE;
#if UNROLLI == UNROLLJ
    gmx_mm_pr  diag_SSE0, diag_SSE2;
#else
    gmx_mm_pr  diag0_SSE0, diag0_SSE2;
    gmx_mm_pr  diag1_SSE0, diag1_SSE2;
#endif

    gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
    gmx_mm_pr  one_SSE  = gmx_set1_pr(1.0);
    gmx_mm_pr  iq_SSE0  = gmx_setzero_pr();
    gmx_mm_pr  iq_SSE2  = gmx_setzero_pr();
    gmx_mm_pr  facel_SSE;
#ifdef CALC_COUL_RF
    gmx_mm_pr  mrc_3_SSE;
#ifdef CALC_ENERGIES
    gmx_mm_pr  hrc_3_SSE, moh_rc_SSE;
#endif
#endif

#ifdef CALC_COUL_TAB
    /* Coulomb table variables */
    gmx_mm_pr   invtsp_SSE;
    const real *tab_coul_F;
    const real *tab_coul_V;
#ifdef GMX_MM256_HERE
    int         ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
    int         ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
#endif
    gmx_mm_pr   mhalfsp_SSE;
#endif
#ifdef CALC_COUL_EWALD
    gmx_mm_pr  beta2_SSE, beta_SSE;
#endif

#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
    gmx_mm_pr  sh_ewald_SSE;
#endif

#ifdef LJ_COMB_LB
    gmx_mm_pr  hsig_i_SSE0, seps_i_SSE0;
    gmx_mm_pr  hsig_i_SSE2, seps_i_SSE2;
#else
#ifdef FIX_LJ_C
    real       pvdw_array[2*UNROLLI*UNROLLJ+3];
    real      *pvdw_c6, *pvdw_c12;
    gmx_mm_pr  c6_SSE0, c12_SSE0;
    gmx_mm_pr  c6_SSE2, c12_SSE2;
    gmx_mm_pr  c6_SSE1, c12_SSE1;
    gmx_mm_pr  c6_SSE3, c12_SSE3;
#endif

#ifdef LJ_COMB_GEOM
    gmx_mm_pr  c6s_SSE0, c12s_SSE0;
    gmx_mm_pr  c6s_SSE1, c12s_SSE1;
    gmx_mm_pr  c6s_SSE2 = gmx_setzero_pr(), c12s_SSE2 = gmx_setzero_pr();
    gmx_mm_pr  c6s_SSE3 = gmx_setzero_pr(), c12s_SSE3 = gmx_setzero_pr();
#endif
#endif /* LJ_COMB_LB */
    gmx_mm_pr  vctotSSE, VvdwtotSSE;
    gmx_mm_pr  sixthSSE, twelvethSSE;

    gmx_mm_pr  avoid_sing_SSE;
    gmx_mm_pr  rc2_SSE;
#ifdef VDW_CUTOFF_CHECK
    gmx_mm_pr  rcvdw2_SSE;
#endif

    gmx_mm_pr  sh_invrc6_SSE, sh_invrc12_SSE;

    /* cppcheck-suppress unassignedVariable */
    real       tmpsum_array[15], *tmpsum;
#ifdef CALC_SHIFTFORCES
    /* cppcheck-suppress unassignedVariable */
    real       shf_array[15], *shf;
#endif
#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
    ljc = nbat->lj_comb;
#else
    /* No combination rule used */
#ifndef GMX_DOUBLE
    nbfp_ptr    = nbat->nbfp_s4;
#define NBFP_STRIDE 4
#else
    nbfp_ptr    = nbat->nbfp;
#define NBFP_STRIDE 2
#endif
    nbfp_stride = NBFP_STRIDE;
#endif
    /* Load j-i for the first i */
    diag_jmi_SSE = gmx_load_pr(nbat->simd_2xnn_diag);
    /* Generate all the diagonal masks as comparison results */
#if UNROLLI == UNROLLJ
    diag_SSE0    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE, one_SSE);
    diag_SSE2    = gmx_cmplt_pr(zero_SSE, diag_jmi_SSE);
#else
#if 2*UNROLLI == UNROLLJ
    diag0_SSE0 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag0_SSE2 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag1_SSE0 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag_i_SSE = gmx_add_pr(diag_i_SSE, one_SSE);
    diag1_SSE2 = gmx_cmplt_pr(diag_i_SSE, diag_j_SSE);
#endif
#endif
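    /* Explanatory note, inferred from the code above: the diagonal masks
     * handle the cluster pair on the list diagonal, where each i-j pair
     * must only be counted once. diag_SSE0 is true where j-i > 0 for
     * i-atoms 0 (low half) and 1 (high half); subtracting 1 twice turns
     * j-i into j-(i+2), giving the mask for i-atoms 2 and 3 in diag_SSE2.
     */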
#ifdef CALC_COUL_TAB
#ifdef GMX_MM256_HERE
    /* Generate aligned table index pointers */
    ti0 = (int *)(((size_t)(ti0_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
    ti2 = (int *)(((size_t)(ti2_array+GMX_SIMD_WIDTH_HERE-1)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int)-1))));
#endif
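    /* Explanatory note on the alignment idiom above: the arrays are
     * over-allocated by GMX_SIMD_WIDTH_HERE-1 ints, so adding
     * GMX_SIMD_WIDTH_HERE-1 elements and masking the low address bits
     * rounds down to a GMX_SIMD_WIDTH_HERE*sizeof(int) boundary while
     * leaving GMX_SIMD_WIDTH_HERE in-bounds elements. With an 8-wide SIMD
     * width and 4-byte ints this yields 32-byte alignment.
     */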
    invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
#ifdef CALC_ENERGIES
    mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
#endif

#ifdef TAB_FDV0
    tab_coul_F = ic->tabq_coul_FDV0;
#else
    tab_coul_F = ic->tabq_coul_F;
    tab_coul_V = ic->tabq_coul_V;
#endif
#endif /* CALC_COUL_TAB */
#ifdef CALC_COUL_EWALD
    beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
    beta_SSE  = gmx_set1_pr(ic->ewaldcoeff);
#endif

#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
    sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
#endif
    q        = nbat->q;
    type     = nbat->type;
    facel    = ic->epsfac;
    shiftvec = shift_vec[0];
    x        = nbat->x;

    avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);

    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
    rc2_SSE = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
#ifdef VDW_CUTOFF_CHECK
    rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
#endif

#ifdef CALC_ENERGIES
    sixthSSE    = gmx_set1_pr(1.0/6.0);
    twelvethSSE = gmx_set1_pr(1.0/12.0);

    sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
    sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
#endif

#ifdef CALC_COUL_RF
    /* Reaction-field constants */
    mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
#ifdef CALC_ENERGIES
    hrc_3_SSE  = gmx_set1_pr(ic->k_rf);
    moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
#endif
#endif
#ifdef CALC_ENERGIES
    tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
#endif
#ifdef CALC_SHIFTFORCES
    shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
#endif

#ifdef FIX_LJ_C
    pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
    for (jp = 0; jp < UNROLLJ; jp++)
    {
        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];

        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
    }

    c6_SSE0  = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
    c6_SSE1  = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
    c6_SSE2  = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
    c6_SSE3  = gmx_load_pr(pvdw_c6 +3*UNROLLJ);

    c12_SSE0 = gmx_load_pr(pvdw_c12+0*UNROLLJ);
    c12_SSE1 = gmx_load_pr(pvdw_c12+1*UNROLLJ);
    c12_SSE2 = gmx_load_pr(pvdw_c12+2*UNROLLJ);
    c12_SSE3 = gmx_load_pr(pvdw_c12+3*UNROLLJ);
#endif /* FIX_LJ_C */
#ifdef ENERGY_GROUPS
    egps_ishift  = nbat->neg_2log;
    egps_imask   = (1<<egps_ishift) - 1;
    egps_jshift  = 2*nbat->neg_2log;
    egps_jmask   = (1<<egps_jshift) - 1;
    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
    /* Major division is over i-particle energy groups, determine the stride */
    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
#endif
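    /* Worked example, not from the original source: with neg_2log = 2,
     * egps_imask = (1<<2)-1 = 0x3, so each i-atom's energy-group index
     * occupies 2 bits of nbat->energrp[ci] and is extracted further down
     * as (egps_i >> (ia*egps_ishift)) & egps_imask.
     */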
    l_cj = nbl->cj;

    ninner = 0;
    for (n = 0; n < nbl->nci; n++)
    {
        nbln = &nbl->ci[n];

        ish    = (nbln->shift & NBNXN_CI_SHIFT);
        ish3   = ish*3;
        cjind0 = nbln->cj_ind_start;
        cjind1 = nbln->cj_ind_end;
        ci     = nbln->ci;
        ci_sh  = (ish == CENTRAL ? ci : -1);

        shX_SSE = gmx_load1_pr(shiftvec+ish3);
        shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
        shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);

#if UNROLLJ <= 4
        sci  = ci*STRIDE;
        scix = sci*DIM;
        sci2 = sci*2;
#else
        sci  = (ci>>1)*STRIDE;
        scix = sci*DIM + (ci & 1)*(STRIDE>>1);
        sci2 = sci*2   + (ci & 1)*(STRIDE>>1);
        sci += (ci & 1)*(STRIDE>>1);
#endif
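        /* Explanatory note, inferred from the index math above: when two
         * 4-atom i-clusters share one memory cluster of STRIDE atoms,
         * ci>>1 selects the memory cluster and (ci & 1)*(STRIDE>>1) its
         * lower or upper half. scix indexes the packed x,y,z coordinates
         * and sci2 the LJ combination-rule parameters, stored two per atom.
         */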
        /* We have 5 LJ/C combinations, but use only three inner loops,
         * as the other combinations are unlikely and/or not much faster:
         * inner half-LJ + C for half-LJ + C / no-LJ + C
         * inner LJ + C      for full-LJ + C
         * inner LJ          for full-LJ + no-C / half-LJ + no-C
         */
        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
#ifdef ENERGY_GROUPS
        egps_i = nbat->energrp[ci];
        {
            int ia, egp_ia;

            for (ia = 0; ia < UNROLLI; ia++)
            {
                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
                vctp[ia]   = Vc   + egp_ia*Vstride_i;
            }
        }
#endif
#if defined CALC_ENERGIES
#if UNROLLJ == 4
        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
#endif
#if UNROLLJ == 2
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
#endif
#if UNROLLJ == 8
        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
#endif
        {
            int  ia;
            real Vc_sub_self;

#ifdef CALC_COUL_RF
            Vc_sub_self = 0.5*ic->c_rf;
#endif
#ifdef CALC_COUL_TAB
#ifdef TAB_FDV0
            Vc_sub_self = 0.5*tab_coul_F[2];
#else
            Vc_sub_self = 0.5*tab_coul_V[0];
#endif
#endif
#ifdef CALC_COUL_EWALD
            /* beta/sqrt(pi) */
            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
#endif

            for (ia = 0; ia < UNROLLI; ia++)
            {
                real qi;

                qi = q[sci+ia];
#ifdef ENERGY_GROUPS
                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
#else
                Vc[0]
#endif
                    -= facel*qi*qi*Vc_sub_self;
            }
        }
#endif
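        /* Explanatory note, inferred from the block above: the correction
         * is only applied to diagonal list entries, i.e. when the first
         * j-cluster is the i-cluster itself. Each Coulomb scheme supplies
         * its r = 0 kernel value (c_rf for reaction-field, the r = 0 table
         * entry for tables, ewaldcoeff*M_2_SQRTPI for Ewald), halved into
         * Vc_sub_self, and the self-interaction energy
         * facel*qi*qi*Vc_sub_self is subtracted per i-atom.
         */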
#define gmx_load2_hpr(x) _mm256_insertf128_ps(gmx_load1_pr(x), gmx_load1_hpr(x+1), 1)
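/* Explanatory note, inferred from the intrinsics above: gmx_load2_hpr first
 * broadcasts x[0] across a full 256-bit register, then overwrites the upper
 * 128-bit lane with a broadcast of x[1]. The result is
 * {x0,x0,x0,x0, x1,x1,x1,x1}: each of the two i-atoms fills one 4-wide
 * half, ready to interact with a 4-atom j-cluster.
 */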
        /* Load i atom data */
        sciy = scix + STRIDE;
        sciz = sciy + STRIDE;
        ix_SSE0 = gmx_add_pr(gmx_load2_hpr(x+scix),   shX_SSE);
        ix_SSE2 = gmx_add_pr(gmx_load2_hpr(x+scix+2), shX_SSE);
        iy_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciy),   shY_SSE);
        iy_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciy+2), shY_SSE);
        iz_SSE0 = gmx_add_pr(gmx_load2_hpr(x+sciz),   shZ_SSE);
        iz_SSE2 = gmx_add_pr(gmx_load2_hpr(x+sciz+2), shZ_SSE);
        facel_SSE = gmx_set1_pr(facel);

        iq_SSE0 = gmx_mul_pr(facel_SSE, gmx_load2_hpr(q+sci));
        iq_SSE2 = gmx_mul_pr(facel_SSE, gmx_load2_hpr(q+sci+2));
#ifdef LJ_COMB_LB
        hsig_i_SSE0 = gmx_load2_hpr(ljc+sci2+0);
        hsig_i_SSE2 = gmx_load2_hpr(ljc+sci2+2);
        seps_i_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
        seps_i_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
#else
#ifdef LJ_COMB_GEOM
        c6s_SSE0 = gmx_load2_hpr(ljc+sci2+0);
        if (!half_LJ)
        {
            c6s_SSE2 = gmx_load2_hpr(ljc+sci2+2);
        }
        c12s_SSE0 = gmx_load2_hpr(ljc+sci2+STRIDE+0);
        if (!half_LJ)
        {
            c12s_SSE2 = gmx_load2_hpr(ljc+sci2+STRIDE+2);
        }
#else
        nbfp0 = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
        nbfp1 = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
        if (!half_LJ)
        {
            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
        }
#endif
#endif
        /* Zero the potential energy for this list */
        VvdwtotSSE = gmx_setzero_pr();
        vctotSSE   = gmx_setzero_pr();

        /* Clear i atom forces */
        fix_SSE0 = gmx_setzero_pr();
        fix_SSE2 = gmx_setzero_pr();
        fiy_SSE0 = gmx_setzero_pr();
        fiy_SSE2 = gmx_setzero_pr();
        fiz_SSE0 = gmx_setzero_pr();
        fiz_SSE2 = gmx_setzero_pr();
        cjind = cjind0;

        /* Currently all kernels use (at least half) LJ */
#define CALC_LJ
        if (half_LJ)
        {
#define CALC_COULOMB
#define HALF_LJ
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
            }
#undef HALF_LJ
#undef CALC_COULOMB
        }
        else if (do_coul)
        {
#define CALC_COULOMB
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
            }
#undef CALC_COULOMB
        }
        else
        {
#define CHECK_EXCLS
            while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
                cjind++;
            }
#undef CHECK_EXCLS
            for (; (cjind < cjind1); cjind++)
            {
#include "nbnxn_kernel_simd_2xnn_inner.h"
            }
        }
#undef CALC_LJ
        ninner += cjind1 - cjind0;
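        /* Explanatory note, inferred from the loop split above: the pair
         * list evidently stores j-clusters that carry exclusion bits at
         * the front of each entry, so the first loop runs the inner kernel
         * compiled with CHECK_EXCLS until it sees a fully-interacting mask
         * (SIMD_MASK_ALL), after which the second loop runs the cheaper
         * variant without exclusion handling.
         */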
        /* Add accumulated i-forces to the force array */
#if UNROLLJ >= 4
#ifndef GMX_DOUBLE
#define gmx_load_ps4  _mm_load_ps
#define gmx_store_ps4 _mm_store_ps
#define gmx_add_ps4   _mm_add_ps
#else
#define gmx_load_ps4  _mm256_load_pd
#define gmx_store_ps4 _mm256_store_pd
#define gmx_add_ps4   _mm256_add_pd
#endif
        GMX_MM_TRANSPOSE_SUM4H_PR(fix_SSE0, fix_SSE2, fix_SSE);
        gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));

        GMX_MM_TRANSPOSE_SUM4H_PR(fiy_SSE0, fiy_SSE2, fiy_SSE);
        gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));

        GMX_MM_TRANSPOSE_SUM4H_PR(fiz_SSE0, fiz_SSE2, fiz_SSE);
        gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));

#ifdef CALC_SHIFTFORCES
        gmx_store_ps4(shf, fix_SSE);
        fshift[ish3+0] += SUM_SIMD4(shf);
        gmx_store_ps4(shf, fiy_SSE);
        fshift[ish3+1] += SUM_SIMD4(shf);
        gmx_store_ps4(shf, fiz_SSE);
        fshift[ish3+2] += SUM_SIMD4(shf);
#endif
#else /* UNROLLJ == 2 */
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0, fix_SSE1, fix0_SSE);
        _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
        GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2, fix_SSE3, fix2_SSE);
        _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0, fiy_SSE1, fiy0_SSE);
        _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2, fiy_SSE3, fiy2_SSE);
        _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));

        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0, fiz_SSE1, fiz0_SSE);
        _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
        GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2, fiz_SSE3, fiz2_SSE);
        _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));

#ifdef CALC_SHIFTFORCES
        _mm_store_pd(shf, _mm_add_pd(fix0_SSE, fix2_SSE));
        fshift[ish3+0] += shf[0] + shf[1];
        _mm_store_pd(shf, _mm_add_pd(fiy0_SSE, fiy2_SSE));
        fshift[ish3+1] += shf[0] + shf[1];
        _mm_store_pd(shf, _mm_add_pd(fiz0_SSE, fiz2_SSE));
        fshift[ish3+2] += shf[0] + shf[1];
#endif
#endif
#ifdef CALC_ENERGIES
        if (do_coul)
        {
            gmx_store_pr(tmpsum, vctotSSE);
            *Vc += SUM_SIMD(tmpsum);
        }

        gmx_store_pr(tmpsum, VvdwtotSSE);
        *Vvdw += SUM_SIMD(tmpsum);
#endif
        /* Outer loop uses 6 flops/iteration */
    }

#ifdef COUNT_PAIRS
    printf("atom pairs %d\n", npair);
#endif
}

#undef CALC_SHIFTFORCES