src/gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_outer.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2009, The GROMACS Development Team
   6  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   7  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   8  * others, as listed in the AUTHORS file in the top-level source
   9  * directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
  38
  39 /* Half-width SIMD operations are required here.
  40  * As the 4xn kernels are the "standard" kernels and some special operations
  41  * are required only here, we define those in nbnxn_kernel_simd_utils_...
  42  *
  43  * Half-width SIMD real type:
  44  * gmx_mm_hpr
  45  *
  46  * Half-width SIMD operations
  47  * Load reals at half-width aligned pointer b into half-width SIMD register a:
  48  * gmx_load_hpr(a, b)
  49  * Set all entries in half-width SIMD register *a to b:
  50  * gmx_set1_hpr(a, b)
  51  * Load one real at b and one real at b+1 into halves of a, respectively:
  52  * gmx_load1p1_pr(a, b)
  53  * Load reals at half-width aligned pointer b into two halves of a:
  54  * gmx_loaddh_pr(a, b)
  55  * Store half-width SIMD register b into half width aligned memory a:
  56  * gmx_store_hpr(a, b)
  57  * gmx_add_hpr(a, b)
  58  * gmx_sub_hpr(a, b)
  59  * Sum over 4 half SIMD registers:
  60  * gmx_sum4_hpr(a, b)
  61  * Sum the elements of halfs of each input register and store sums in out:
  62  * gmx_mm_transpose_sum4h_pr(a, b)
  63  * Extract two half-width registers *b, *c from a full width register a:
  64  * gmx_pr_to_2hpr(a, b, c)
  65  */
  66
  67
  68 {
  69     const nbnxn_ci_t   *nbln;
  70     const nbnxn_cj_t   *l_cj;
  71     const int          *type;
  72     const real         *q;
  73     const real         *shiftvec;
  74     const real         *x;
  75     const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
  76     real                facel;
  77     real               *nbfp_ptr;
  78     int                 n, ci, ci_sh;
  79     int                 ish, ish3;
  80     gmx_bool            do_LJ, half_LJ, do_coul;
  81     int                 sci, scix, sciy, sciz, sci2;
  82     int                 cjind0, cjind1, cjind;
  83     int                 ip, jp;
  84
  85 #ifdef ENERGY_GROUPS
  86     int         Vstride_i;
  87     int         egps_ishift, egps_imask;
  88     int         egps_jshift, egps_jmask, egps_jstride;
  89     int         egps_i;
  90     real       *vvdwtp[UNROLLI];
  91     real       *vctp[UNROLLI];
  92 #endif
  93
  94     gmx_mm_pr  shX_S;
  95     gmx_mm_pr  shY_S;
  96     gmx_mm_pr  shZ_S;
  97     gmx_mm_pr  ix_S0, iy_S0, iz_S0;
  98     gmx_mm_pr  ix_S2, iy_S2, iz_S2;
  99     gmx_mm_pr  fix_S0, fiy_S0, fiz_S0;
 100     gmx_mm_pr  fix_S2, fiy_S2, fiz_S2;
 101     /* We use an i-force SIMD register width of 4 */
 102     /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
 103     gmx_mm_pr4 fix_S, fiy_S, fiz_S;
 104
 105     gmx_mm_pr  diagonal_jmi_S;
 106 #if UNROLLI == UNROLLJ
 107     gmx_mm_pb  diagonal_mask_S0, diagonal_mask_S2;
 108 #else
 109     gmx_mm_pb  diagonal_mask0_S0, diagonal_mask0_S2;
 110     gmx_mm_pb  diagonal_mask1_S0, diagonal_mask1_S2;
 111 #endif
 112
 113     unsigned      *exclusion_filter;
 114     gmx_exclfilter filter_S0, filter_S2;
 115
 116     gmx_mm_pr   zero_S = gmx_set1_pr(0);
 117
 118     gmx_mm_pr   one_S = gmx_set1_pr(1.0);
 119     gmx_mm_pr   iq_S0 = gmx_setzero_pr();
 120     gmx_mm_pr   iq_S2 = gmx_setzero_pr();
 121     gmx_mm_pr   mrc_3_S;
 122 #ifdef CALC_ENERGIES
 123     gmx_mm_pr   hrc_3_S, moh_rc_S;
 124 #endif
 125
 126 #ifdef CALC_COUL_TAB
 127     /* Coulomb table variables */
 128     gmx_mm_pr   invtsp_S;
 129     const real *tab_coul_F;
 130 #ifndef TAB_FDV0
 131     const real *tab_coul_V;
 132 #endif
 133     int         ti0_array[2*GMX_SIMD_WIDTH_HERE], *ti0;
 134     int         ti2_array[2*GMX_SIMD_WIDTH_HERE], *ti2;
 135 #ifdef CALC_ENERGIES
 136     gmx_mm_pr   mhalfsp_S;
 137 #endif
 138 #endif
 139
 140 #ifdef CALC_COUL_EWALD
 141     gmx_mm_pr beta2_S, beta_S;
 142 #endif
 143
 144 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 145     gmx_mm_pr  sh_ewald_S;
 146 #endif
 147
 148 #ifdef LJ_COMB_LB
 149     const real *ljc;
 150
 151     gmx_mm_pr   hsig_i_S0, seps_i_S0;
 152     gmx_mm_pr   hsig_i_S2, seps_i_S2;
 153 #else
 154 #ifdef FIX_LJ_C
 155     real        pvdw_array[2*UNROLLI*UNROLLJ+GMX_SIMD_WIDTH_HERE];
 156     real       *pvdw_c6, *pvdw_c12;
 157     gmx_mm_pr   c6_S0, c12_S0;
 158     gmx_mm_pr   c6_S2, c12_S2;
 159 #endif
 160
 161 #ifdef LJ_COMB_GEOM
 162     const real *ljc;
 163
 164     gmx_mm_pr   c6s_S0, c12s_S0;
 165     gmx_mm_pr   c6s_S1, c12s_S1;
 166     gmx_mm_pr   c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
 167     gmx_mm_pr   c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
 168 #endif
 169 #endif /* LJ_COMB_LB */
 170
 171     gmx_mm_pr  vctot_S, Vvdwtot_S;
 172     gmx_mm_pr  sixth_S, twelveth_S;
 173
 174     gmx_mm_pr  avoid_sing_S;
 175     gmx_mm_pr  rc2_S;
 176 #ifdef VDW_CUTOFF_CHECK
 177     gmx_mm_pr  rcvdw2_S;
 178 #endif
 179
 180 #ifdef CALC_ENERGIES
 181     gmx_mm_pr  sh_invrc6_S, sh_invrc12_S;
 182
 183     /* cppcheck-suppress unassignedVariable */
 184     real       tmpsum_array[2*GMX_SIMD_WIDTH_HERE], *tmpsum;
 185 #endif
 186 #ifdef CALC_SHIFTFORCES
 187     /* cppcheck-suppress unassignedVariable */
 188     real       shf_array[2*GMX_SIMD_WIDTH_HERE], *shf;
 189 #endif
 190
 191     int ninner;
 192
 193 #ifdef COUNT_PAIRS
 194     int npair = 0;
 195 #endif
 196
 197 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
 198     ljc = nbat->lj_comb;
 199 #else
 200     /* No combination rule used */
 201     nbfp_ptr    = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
 202 #endif
 203
 204     /* Load j-i for the first i */
 205     diagonal_jmi_S    = gmx_load_pr(nbat->simd_2xnn_diagonal_j_minus_i);
 206     /* Generate all the diagonal masks as comparison results */
 207 #if UNROLLI == UNROLLJ
 208     diagonal_mask_S0  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 209     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 210     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 211     diagonal_mask_S2  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 212 #else
 213 #if 2*UNROLLI == UNROLLJ
 214     diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 215     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 216     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 217     diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 218     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 219     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 220     diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 221     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 222     diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 223     diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 224 #endif
 225 #endif
 226
 227     /* Load masks for topology exclusion masking. filter_stride is
 228        static const, so the conditional will be optimized away. */
 229     if (1 == filter_stride)
 230     {
 231         exclusion_filter = nbat->simd_exclusion_filter1;
 232     }
 233     else /* (2 == filter_stride) */
 234     {
 235         exclusion_filter = nbat->simd_exclusion_filter2;
 236     }
 237
 238     /* Here we cast the exclusion masks from unsigned * to int * or
 239      * real *.  Since we only check bits, the actual value they
 240      * represent does not matter, as long as both mask and exclusion
 241      * info are treated the same way.
 242      */
 243     filter_S0 = gmx_load_exclusion_filter(exclusion_filter + 0*2*UNROLLJ*filter_stride);
 244     filter_S2 = gmx_load_exclusion_filter(exclusion_filter + 1*2*UNROLLJ*filter_stride);
 245
 246 #ifdef CALC_COUL_TAB
 247     /* Generate aligned table index pointers */
 248     ti0 = prepare_table_load_buffer(ti0_array);
 249     ti2 = prepare_table_load_buffer(ti2_array);
 250
 251     invtsp_S  = gmx_set1_pr(ic->tabq_scale);
 252 #ifdef CALC_ENERGIES
 253     mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
 254 #endif
 255
 256 #ifdef TAB_FDV0
 257     tab_coul_F = ic->tabq_coul_FDV0;
 258 #else
 259     tab_coul_F = ic->tabq_coul_F;
 260     tab_coul_V = ic->tabq_coul_V;
 261 #endif
 262 #endif /* CALC_COUL_TAB */
 263
 264 #ifdef CALC_COUL_EWALD
 265     beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
 266     beta_S  = gmx_set1_pr(ic->ewaldcoeff);
 267 #endif
 268
 269 #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
 270     sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
 271 #endif
 272
 273     q                   = nbat->q;
 274     type                = nbat->type;
 275     facel               = ic->epsfac;
 276     shiftvec            = shift_vec[0];
 277     x                   = nbat->x;
 278
 279     avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
 280
 281     /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
 282     rc2_S    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
 283 #ifdef VDW_CUTOFF_CHECK
 284     rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
 285 #endif
 286
 287 #ifdef CALC_ENERGIES
 288     sixth_S      = gmx_set1_pr(1.0/6.0);
 289     twelveth_S   = gmx_set1_pr(1.0/12.0);
 290
 291     sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
 292     sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
 293 #endif
 294
 295     mrc_3_S  = gmx_set1_pr(-2*ic->k_rf);
 296
 297 #ifdef CALC_ENERGIES
 298     hrc_3_S  = gmx_set1_pr(ic->k_rf);
 299
 300     moh_rc_S = gmx_set1_pr(-ic->c_rf);
 301 #endif
 302
 303 #ifdef CALC_ENERGIES
 304     tmpsum   = gmx_simd_align_real(tmpsum_array);
 305 #endif
 306 #ifdef CALC_SHIFTFORCES
 307     shf      = gmx_simd_align_real(shf_array);
 308 #endif
 309
 310 #ifdef FIX_LJ_C
 311     pvdw_c6  = gmx_simd_align_real(pvdw_array);
 312     pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
 313
 314     for (jp = 0; jp < UNROLLJ; jp++)
 315     {
 316         pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
 317         pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
 318         pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
 319         pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
 320
 321         pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 322         pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 323         pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 324         pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 325     }
 326     c6_S0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
 327     c6_S1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
 328     c6_S2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
 329     c6_S3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
 330
 331     c12_S0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
 332     c12_S1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
 333     c12_S2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
 334     c12_S3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
 335 #endif /* FIX_LJ_C */
 336
 337 #ifdef ENERGY_GROUPS
 338     egps_ishift  = nbat->neg_2log;
 339     egps_imask   = (1<<egps_ishift) - 1;
 340     egps_jshift  = 2*nbat->neg_2log;
 341     egps_jmask   = (1<<egps_jshift) - 1;
 342     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
 343     /* Major division is over i-particle energy groups, determine the stride */
 344     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 345 #endif
 346
 347     l_cj = nbl->cj;
 348
 349     ninner = 0;
 350     for (n = 0; n < nbl->nci; n++)
 351     {
 352         nbln = &nbl->ci[n];
 353
 354         ish              = (nbln->shift & NBNXN_CI_SHIFT);
 355         ish3             = ish*3;
 356         cjind0           = nbln->cj_ind_start;
 357         cjind1           = nbln->cj_ind_end;
 358         ci               = nbln->ci;
 359         ci_sh            = (ish == CENTRAL ? ci : -1);
 360
 361         shX_S = gmx_load1_pr(shiftvec+ish3);
 362         shY_S = gmx_load1_pr(shiftvec+ish3+1);
 363         shZ_S = gmx_load1_pr(shiftvec+ish3+2);
 364
 365 #if UNROLLJ <= 4
 366         sci              = ci*STRIDE;
 367         scix             = sci*DIM;
 368         sci2             = sci*2;
 369 #else
 370         sci              = (ci>>1)*STRIDE;
 371         scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
 372         sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
 373         sci             += (ci & 1)*(STRIDE>>1);
 374 #endif
 375
 376         /* We have 5 LJ/C combinations, but use only three inner loops,
 377          * as the other combinations are unlikely and/or not much faster:
 378          * inner half-LJ + C for half-LJ + C / no-LJ + C
 379          * inner LJ + C      for full-LJ + C
 380          * inner LJ          for full-LJ + no-C / half-LJ + no-C
 381          */
 382         do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
 383         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
 384         half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 385
 386 #ifdef ENERGY_GROUPS
 387         egps_i = nbat->energrp[ci];
 388         {
 389             int ia, egp_ia;
 390
 391             for (ia = 0; ia < UNROLLI; ia++)
 392             {
 393                 egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
 394                 vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
 395                 vctp[ia]   = Vc   + egp_ia*Vstride_i;
 396             }
 397         }
 398 #endif
 399 #if defined CALC_ENERGIES
 400 #if UNROLLJ == 4
 401         if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
 402 #endif
 403 #if UNROLLJ == 8
 404         if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
 405 #endif
 406         {
 407             int  ia;
 408             real Vc_sub_self;
 409
 410 #ifdef CALC_COUL_RF
 411             Vc_sub_self = 0.5*ic->c_rf;
 412 #endif
 413 #ifdef CALC_COUL_TAB
 414 #ifdef TAB_FDV0
 415             Vc_sub_self = 0.5*tab_coul_F[2];
 416 #else
 417             Vc_sub_self = 0.5*tab_coul_V[0];
 418 #endif
 419 #endif
 420 #ifdef CALC_COUL_EWALD
 421             /* beta/sqrt(pi) */
 422             Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
 423 #endif
 424
 425             for (ia = 0; ia < UNROLLI; ia++)
 426             {
 427                 real qi;
 428
 429                 qi = q[sci+ia];
 430 #ifdef ENERGY_GROUPS
 431                 vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
 432 #else
 433                 Vc[0]
 434 #endif
 435                     -= facel*qi*qi*Vc_sub_self;
 436             }
 437         }
 438 #endif
 439
 440         /* Load i atom data */
 441         sciy             = scix + STRIDE;
 442         sciz             = sciy + STRIDE;
 443         gmx_load1p1_pr(&ix_S0, x+scix);
 444         gmx_load1p1_pr(&ix_S2, x+scix+2);
 445         gmx_load1p1_pr(&iy_S0, x+sciy);
 446         gmx_load1p1_pr(&iy_S2, x+sciy+2);
 447         gmx_load1p1_pr(&iz_S0, x+sciz);
 448         gmx_load1p1_pr(&iz_S2, x+sciz+2);
 449         ix_S0          = gmx_add_pr(ix_S0, shX_S);
 450         ix_S2          = gmx_add_pr(ix_S2, shX_S);
 451         iy_S0          = gmx_add_pr(iy_S0, shY_S);
 452         iy_S2          = gmx_add_pr(iy_S2, shY_S);
 453         iz_S0          = gmx_add_pr(iz_S0, shZ_S);
 454         iz_S2          = gmx_add_pr(iz_S2, shZ_S);
 455
 456         if (do_coul)
 457         {
 458             gmx_mm_pr facel_S;
 459
 460             facel_S    = gmx_set1_pr(facel);
 461
 462             gmx_load1p1_pr(&iq_S0, q+sci);
 463             gmx_load1p1_pr(&iq_S2, q+sci+2);
 464             iq_S0      = gmx_mul_pr(facel_S, iq_S0);
 465             iq_S2      = gmx_mul_pr(facel_S, iq_S2);
 466         }
 467
 468 #ifdef LJ_COMB_LB
 469         gmx_load1p1_pr(&hsig_i_S0, ljc+sci2+0);
 470         gmx_load1p1_pr(&hsig_i_S2, ljc+sci2+2);
 471         gmx_load1p1_pr(&seps_i_S0, ljc+sci2+STRIDE+0);
 472         gmx_load1p1_pr(&seps_i_S2, ljc+sci2+STRIDE+2);
 473 #else
 474 #ifdef LJ_COMB_GEOM
 475         gmx_load1p1_pr(&c6s_S0, ljc+sci2+0);
 476         if (!half_LJ)
 477         {
 478             gmx_load1p1_pr(&c6s_S2, ljc+sci2+2);
 479         }
 480         gmx_load1p1_pr(&c12s_S0, ljc+sci2+STRIDE+0);
 481         if (!half_LJ)
 482         {
 483             gmx_load1p1_pr(&c12s_S2, ljc+sci2+STRIDE+2);
 484         }
 485 #else
 486         nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
 487         nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
 488         if (!half_LJ)
 489         {
 490             nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
 491             nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
 492         }
 493 #endif
 494 #endif
 495
 496         /* Zero the potential energy for this list */
 497         Vvdwtot_S        = gmx_setzero_pr();
 498         vctot_S          = gmx_setzero_pr();
 499
 500         /* Clear i atom forces */
 501         fix_S0           = gmx_setzero_pr();
 502         fix_S2           = gmx_setzero_pr();
 503         fiy_S0           = gmx_setzero_pr();
 504         fiy_S2           = gmx_setzero_pr();
 505         fiz_S0           = gmx_setzero_pr();
 506         fiz_S2           = gmx_setzero_pr();
 507
 508         cjind = cjind0;
 509
 510         /* Currently all kernels use (at least half) LJ */
 511 #define CALC_LJ
 512         if (half_LJ)
 513         {
 514 #define CALC_COULOMB
 515 #define HALF_LJ
 516 #define CHECK_EXCLS
 517             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 518             {
 519 #include "nbnxn_kernel_simd_2xnn_inner.h"
 520                 cjind++;
 521             }
 522 #undef CHECK_EXCLS
 523             for (; (cjind < cjind1); cjind++)
 524             {
 525 #include "nbnxn_kernel_simd_2xnn_inner.h"
 526             }
 527 #undef HALF_LJ
 528 #undef CALC_COULOMB
 529         }
 530         else if (do_coul)
 531         {
 532 #define CALC_COULOMB
 533 #define CHECK_EXCLS
 534             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 535             {
 536 #include "nbnxn_kernel_simd_2xnn_inner.h"
 537                 cjind++;
 538             }
 539 #undef CHECK_EXCLS
 540             for (; (cjind < cjind1); cjind++)
 541             {
 542 #include "nbnxn_kernel_simd_2xnn_inner.h"
 543             }
 544 #undef CALC_COULOMB
 545         }
 546         else
 547         {
 548 #define CHECK_EXCLS
 549             while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 550             {
 551 #include "nbnxn_kernel_simd_2xnn_inner.h"
 552                 cjind++;
 553             }
 554 #undef CHECK_EXCLS
 555             for (; (cjind < cjind1); cjind++)
 556             {
 557 #include "nbnxn_kernel_simd_2xnn_inner.h"
 558             }
 559         }
 560 #undef CALC_LJ
 561         ninner += cjind1 - cjind0;
 562
 563         /* Add accumulated i-forces to the force array */
 564         fix_S = gmx_mm_transpose_sum4h_pr(fix_S0, fix_S2);
 565         gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
 566
 567         fiy_S = gmx_mm_transpose_sum4h_pr(fiy_S0, fiy_S2);
 568         gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
 569
 570         fiz_S = gmx_mm_transpose_sum4h_pr(fiz_S0, fiz_S2);
 571         gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
 572
 573 #ifdef CALC_SHIFTFORCES
 574         gmx_store_pr4(shf, fix_S);
 575         fshift[ish3+0] += SUM_SIMD4(shf);
 576         gmx_store_pr4(shf, fiy_S);
 577         fshift[ish3+1] += SUM_SIMD4(shf);
 578         gmx_store_pr4(shf, fiz_S);
 579         fshift[ish3+2] += SUM_SIMD4(shf);
 580 #endif
 581
 582 #ifdef CALC_ENERGIES
 583         if (do_coul)
 584         {
 585             gmx_store_pr(tmpsum, vctot_S);
 586             *Vc += SUM_SIMD(tmpsum);
 587         }
 588
 589         gmx_store_pr(tmpsum, Vvdwtot_S);
 590         *Vvdw += SUM_SIMD(tmpsum);
 591 #endif
 592
 593         /* Outer loop uses 6 flops/iteration */
 594     }
 595
 596 #ifdef COUNT_PAIRS
 597     printf("atom pairs %d\n", npair);
 598 #endif
 599 }