src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd_outer.h

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   9  * Copyright (c) 2001-2009, The GROMACS Development Team
  10  *
  11  * Gromacs is a library for molecular simulation and trajectory analysis,
  12  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  13  * a full list of developers and information, check out http://www.gromacs.org
  14  *
  15  * This program is free software; you can redistribute it and/or modify it under
  16  * the terms of the GNU Lesser General Public License as published by the Free
  17  * Software Foundation; either version 2 of the License, or (at your option) any
  18  * later version.
  19  * As a special exception, you may use this file as part of a free software
  20  * library without restriction.  Specifically, if other files instantiate
  21  * templates or use macros or inline functions from this file, or you compile
  22  * this file and link it with other files to produce an executable, this
  23  * file does not by itself cause the resulting executable to be covered by
  24  * the GNU Lesser General Public License.
  25  *
  26  * In plain-speak: do not worry about classes/macros/templates either - only
  27  * changes to the library have to be LGPL, not an application linking with it.
  28  *
  29  * To help fund GROMACS development, we humbly ask that you cite
  30  * the papers people have written on it - you can find them on the website!
  31  */
  32
  33 /* GMX_MM128_HERE or GMX_MM256_HERE should be set before including this file */
  34 #include "gmx_x86_simd_macros.h"
  35
  36 #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
  37
  38 #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
  39 #define UNROLLJ    GMX_X86_SIMD_WIDTH_HERE
  40
  41 #if defined GMX_MM128_HERE || defined GMX_DOUBLE
  42 #define STRIDE     4
  43 #endif
  44 #if defined GMX_MM256_HERE && !defined GMX_DOUBLE
  45 #define STRIDE     8
  46 #endif
  47
  48 #ifdef GMX_MM128_HERE
  49 #ifndef GMX_DOUBLE
  50 /* SSE single precision 4x4 kernel */
  51 #define SUM_SIMD(x) SUM_SIMD4(x)
  52 #define TAB_FDV0
  53 #else
  54 /* SSE double precision 4x2 kernel */
  55 #define SUM_SIMD(x) (x[0]+x[1])
  56 #endif
  57 #endif
  58
  59 #ifdef GMX_MM256_HERE
  60 #ifndef GMX_DOUBLE
  61 /* AVX single precision 4x8 kernel */
  62 #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
  63 #define TAB_FDV0
  64 #else
  65 /* AVX double precision 4x4 kernel */
  66 #define SUM_SIMD(x) SUM_SIMD4(x)
  67 #endif
  68 #endif
  69
  70 #define SIMD_MASK_ALL   0xffffffff
  71
  72 #include "nbnxn_kernel_x86_simd_utils.h"
  73
  74 /* All functionality defines are set here, except for:
  75  * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
  76  * CHECK_EXCLS, which is set just before including the inner loop contents.
  77  * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB are currently
  78  * set before calling the kernel function. We might want to move that
  79  * to inside the n-loop and have a different combination rule for different
  80  * ci's, as no combination rule gives a 50% performance hit for LJ.
  81  */
  82
  83 /* We always calculate shift forces, because it's cheap anyhow */
  84 #define CALC_SHIFTFORCES
  85
  86 /* Assumes all LJ parameters are identical */
  87 /* #define FIX_LJ_C */
  88
  89 #define NBK_FUNC_NAME_C_LJC(b,s,c,ljc,e) b##_##s##_##c##_comb_##ljc##_##e
  90
  91 #if defined LJ_COMB_GEOM
  92 #define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,geom,e)
  93 #else
  94 #if defined LJ_COMB_LB
  95 #define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,lb,e)
  96 #else
  97 #define NBK_FUNC_NAME_C(b,s,c,e) NBK_FUNC_NAME_C_LJC(b,s,c,none,e)
  98 #endif
  99 #endif
 100
 101 #ifdef CALC_COUL_RF
 102 #define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,rf,e)
 103 #endif
 104 #ifdef CALC_COUL_TAB
 105 #ifndef VDW_CUTOFF_CHECK
 106 #define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab,e)
 107 #else
 108 #define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,tab_twin,e)
 109 #endif
 110 #endif
 111 #ifdef CALC_COUL_EWALD
 112 #ifndef VDW_CUTOFF_CHECK
 113 #define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald,e)
 114 #else
 115 #define NBK_FUNC_NAME(b,s,e) NBK_FUNC_NAME_C(b,s,ewald_twin,e)
 116 #endif
 117 #endif
 118
 119 #ifdef GMX_MM128_HERE
 120 #define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd128,e)
 121 #endif
 122 #ifdef GMX_MM256_HERE
 123 #define NBK_FUNC_NAME_S128_OR_S256(b,e) NBK_FUNC_NAME(b,x86_simd256,e)
 124 #endif
 125
 126 static void
 127 #ifndef CALC_ENERGIES
 128 NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,noener)
 129 #else
 130 #ifndef ENERGY_GROUPS
 131 NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,ener)
 132 #else
 133 NBK_FUNC_NAME_S128_OR_S256(nbnxn_kernel,energrp)
 134 #endif
 135 #endif
 136 #undef NBK_FUNC_NAME
 137 #undef NBK_FUNC_NAME_C
 138 #undef NBK_FUNC_NAME_C_LJC
 139                             (const nbnxn_pairlist_t     *nbl,
 140                              const nbnxn_atomdata_t     *nbat,
 141                              const interaction_const_t  *ic,
 142                              rvec                       *shift_vec,
 143                              real                       *f
 144 #ifdef CALC_SHIFTFORCES
 145                              ,
 146                              real                       *fshift
 147 #endif
 148 #ifdef CALC_ENERGIES
 149                              ,
 150                              real                       *Vvdw,
 151                              real                       *Vc
 152 #endif
 153                             )
 154 {
 155     const nbnxn_ci_t   *nbln;
 156     const nbnxn_cj_t   *l_cj;
 157     const int          *type;
 158     const real         *q;
 159     const real         *shiftvec;
 160     const real         *x;
 161     const real         *nbfp0,*nbfp1,*nbfp2=NULL,*nbfp3=NULL;
 162     real       facel;
 163     real       *nbfp_ptr;
 164     int        nbfp_stride;
 165     int        n,ci,ci_sh;
 166     int        ish,ish3;
 167     gmx_bool   half_LJ,do_coul;
 168     int        sci,scix,sciy,sciz,sci2;
 169     int        cjind0,cjind1,cjind;
 170     int        ip,jp;
 171
 172 #ifdef ENERGY_GROUPS
 173     int        Vstride_i;
 174     int        egps_ishift,egps_imask;
 175     int        egps_jshift,egps_jmask,egps_jstride;
 176     int        egps_i;
 177     real       *vvdwtp[UNROLLI];
 178     real       *vctp[UNROLLI];
 179 #endif
 180
 181     gmx_mm_pr  shX_SSE;
 182     gmx_mm_pr  shY_SSE;
 183     gmx_mm_pr  shZ_SSE;
 184     gmx_mm_pr  ix_SSE0,iy_SSE0,iz_SSE0;
 185     gmx_mm_pr  ix_SSE1,iy_SSE1,iz_SSE1;
 186     gmx_mm_pr  ix_SSE2,iy_SSE2,iz_SSE2;
 187     gmx_mm_pr  ix_SSE3,iy_SSE3,iz_SSE3;
 188     gmx_mm_pr  fix_SSE0,fiy_SSE0,fiz_SSE0;
 189     gmx_mm_pr  fix_SSE1,fiy_SSE1,fiz_SSE1;
 190     gmx_mm_pr  fix_SSE2,fiy_SSE2,fiz_SSE2;
 191     gmx_mm_pr  fix_SSE3,fiy_SSE3,fiz_SSE3;
 192 #if UNROLLJ >= 4
 193 #ifndef GMX_DOUBLE
 194     __m128     fix_SSE,fiy_SSE,fiz_SSE;
 195 #else
 196     __m256d    fix_SSE,fiy_SSE,fiz_SSE;
 197 #endif
 198 #else
 199     __m128d    fix0_SSE,fiy0_SSE,fiz0_SSE;
 200     __m128d    fix2_SSE,fiy2_SSE,fiz2_SSE;
 201 #endif
 202
 203 #ifndef GMX_MM256_HERE
 204 #ifndef GMX_DOUBLE
 205     __m128i    mask0 = _mm_set_epi32( 0x0008, 0x0004, 0x0002, 0x0001 );
 206     __m128i    mask1 = _mm_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010 );
 207     __m128i    mask2 = _mm_set_epi32( 0x0800, 0x0400, 0x0200, 0x0100 );
 208     __m128i    mask3 = _mm_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000 );
 209 #else
 210     /* For double precision we need to set two 32bit ints for one double */
 211     __m128i    mask0 = _mm_set_epi32( 0x0002, 0x0002, 0x0001, 0x0001 );
 212     __m128i    mask1 = _mm_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004 );
 213     __m128i    mask2 = _mm_set_epi32( 0x0020, 0x0020, 0x0010, 0x0010 );
 214     __m128i    mask3 = _mm_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040 );
 215 #endif
 216 #else
 217     /* AVX: use floating point masks, as there are no integer instructions */
 218 #ifndef GMX_DOUBLE
 219     gmx_mm_pr  mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
 220     gmx_mm_pr  mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
 221 #else
 222     /* There is no 256-bit int to double conversion, so we use float here */
 223     __m256     mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0008, 0x0008, 0x0004, 0x0004, 0x0002, 0x0002, 0x0001, 0x0001 ));
 224     __m256     mask1 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0080, 0x0040, 0x0040, 0x0020, 0x0020, 0x0010, 0x0010 ));
 225     __m256     mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100 ));
 226     __m256     mask3 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000 ));
 227 #endif
 228 #endif
 229
 230 #ifndef GMX_MM256_HERE
 231 #ifndef GMX_DOUBLE
 232     __m128     diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
 233     __m128     diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
 234     __m128     diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
 235     __m128     diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 236 #else
 237     __m128d    diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
 238     __m128d    diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 239     __m128d    diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 240     __m128d    diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 241     __m128d    diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
 242     __m128d    diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
 243     __m128d    diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
 244     __m128d    diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 245 #endif
 246 #else /* GMX_MM256_HERE */
 247 #ifndef GMX_DOUBLE
 248     gmx_mm_pr  diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
 249     gmx_mm_pr  diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
 250     gmx_mm_pr  diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
 251     gmx_mm_pr  diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 252     gmx_mm_pr  diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 253     gmx_mm_pr  diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 254     gmx_mm_pr  diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 255     gmx_mm_pr  diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 256 #else
 257     gmx_mm_pr  diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
 258     gmx_mm_pr  diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 259     gmx_mm_pr  diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 260     gmx_mm_pr  diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
 261 #endif
 262 #endif
 263
 264 #ifndef GMX_MM256_HERE
 265     __m128i    zeroi_SSE = _mm_setzero_si128();
 266 #endif
 267 #ifdef GMX_X86_SSE4_1
 268     gmx_mm_pr  zero_SSE = gmx_set1_pr(0);
 269 #endif
 270
 271     gmx_mm_pr  one_SSE=gmx_set1_pr(1.0);
 272     gmx_mm_pr  iq_SSE0=gmx_setzero_pr();
 273     gmx_mm_pr  iq_SSE1=gmx_setzero_pr();
 274     gmx_mm_pr  iq_SSE2=gmx_setzero_pr();
 275     gmx_mm_pr  iq_SSE3=gmx_setzero_pr();
 276     gmx_mm_pr  mrc_3_SSE;
 277 #ifdef CALC_ENERGIES
 278     gmx_mm_pr  hrc_3_SSE,moh_rc_SSE;
 279 #endif
 280
 281 #ifdef CALC_COUL_TAB
 282     /* Coulomb table variables */
 283     gmx_mm_pr  invtsp_SSE;
 284     const real *tab_coul_F;
 285 #ifndef TAB_FDV0
 286     const real *tab_coul_V;
 287 #endif
 288 #ifdef GMX_MM256_HERE
 289     int        ti0_array[2*UNROLLJ-1],*ti0;
 290     int        ti1_array[2*UNROLLJ-1],*ti1;
 291     int        ti2_array[2*UNROLLJ-1],*ti2;
 292     int        ti3_array[2*UNROLLJ-1],*ti3;
 293 #endif
 294 #ifdef CALC_ENERGIES
 295     gmx_mm_pr  mhalfsp_SSE;
 296 #endif
 297 #endif
 298
 299 #ifdef CALC_COUL_EWALD
 300     gmx_mm_pr beta2_SSE,beta_SSE;
 301 #endif
 302
 303 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 304     gmx_mm_pr  sh_ewald_SSE;
 305 #endif
 306
 307 #ifdef LJ_COMB_LB
 308     const real *ljc;
 309
 310     gmx_mm_pr  hsig_i_SSE0,seps_i_SSE0;
 311     gmx_mm_pr  hsig_i_SSE1,seps_i_SSE1;
 312     gmx_mm_pr  hsig_i_SSE2,seps_i_SSE2;
 313     gmx_mm_pr  hsig_i_SSE3,seps_i_SSE3;
 314 #else
 315 #ifdef FIX_LJ_C
 316     real       pvdw_array[2*UNROLLI*UNROLLJ+3];
 317     real       *pvdw_c6,*pvdw_c12;
 318     gmx_mm_pr  c6_SSE0,c12_SSE0;
 319     gmx_mm_pr  c6_SSE1,c12_SSE1;
 320     gmx_mm_pr  c6_SSE2,c12_SSE2;
 321     gmx_mm_pr  c6_SSE3,c12_SSE3;
 322 #endif
 323
 324 #ifdef LJ_COMB_GEOM
 325     const real *ljc;
 326
 327     gmx_mm_pr  c6s_SSE0,c12s_SSE0;
 328     gmx_mm_pr  c6s_SSE1,c12s_SSE1;
 329     gmx_mm_pr  c6s_SSE2=gmx_setzero_pr(),c12s_SSE2=gmx_setzero_pr();
 330     gmx_mm_pr  c6s_SSE3=gmx_setzero_pr(),c12s_SSE3=gmx_setzero_pr();
 331 #endif
 332 #endif /* LJ_COMB_LB */
 333
 334     gmx_mm_pr  vctotSSE,VvdwtotSSE;
 335     gmx_mm_pr  sixthSSE,twelvethSSE;
 336
 337     gmx_mm_pr  avoid_sing_SSE;
 338     gmx_mm_pr  rc2_SSE;
 339 #ifdef VDW_CUTOFF_CHECK
 340     gmx_mm_pr  rcvdw2_SSE;
 341 #endif
 342
 343 #ifdef CALC_ENERGIES
 344     gmx_mm_pr  sh_invrc6_SSE,sh_invrc12_SSE;
 345
 346     /* cppcheck-suppress unassignedVariable */
 347     real       tmpsum_array[15],*tmpsum;
 348 #endif
 349 #ifdef CALC_SHIFTFORCES
 350     /* cppcheck-suppress unassignedVariable */
 351     real       shf_array[15],*shf;
 352 #endif
 353
 354     int ninner;
 355
 356 #ifdef COUNT_PAIRS
 357     int npair=0;
 358 #endif
 359
 360 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
 361     ljc = nbat->lj_comb;
 362 #else
 363     /* No combination rule used */
 364 #ifndef GMX_DOUBLE
 365     nbfp_ptr    = nbat->nbfp_s4;
 366 #define NBFP_STRIDE  4
 367 #else
 368     nbfp_ptr    = nbat->nbfp;
 369 #define NBFP_STRIDE  2
 370 #endif
 371     nbfp_stride = NBFP_STRIDE;
 372 #endif
 373
 374 #ifdef CALC_COUL_TAB
 375 #ifdef GMX_MM256_HERE
 376     /* Generate aligned table pointers */
 377     ti0 = (int *)(((size_t)(ti0_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
 378     ti1 = (int *)(((size_t)(ti1_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
 379     ti2 = (int *)(((size_t)(ti2_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
 380     ti3 = (int *)(((size_t)(ti3_array+UNROLLJ-1)) & (~((size_t)(UNROLLJ*sizeof(real)-1))));
 381 #endif
 382
 383     invtsp_SSE  = gmx_set1_pr(ic->tabq_scale);
 384 #ifdef CALC_ENERGIES
 385     mhalfsp_SSE = gmx_set1_pr(-0.5/ic->tabq_scale);
 386 #endif
 387
 388 #ifdef TAB_FDV0
 389     tab_coul_F = ic->tabq_coul_FDV0;
 390 #else
 391     tab_coul_F = ic->tabq_coul_F;
 392     tab_coul_V = ic->tabq_coul_V;
 393 #endif
 394 #endif /* CALC_COUL_TAB */
 395
 396 #ifdef CALC_COUL_EWALD
 397     beta2_SSE = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
 398     beta_SSE  = gmx_set1_pr(ic->ewaldcoeff);
 399 #endif
 400
 401 #if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
 402     sh_ewald_SSE = gmx_set1_pr(ic->sh_ewald);
 403 #endif
 404
 405     q                   = nbat->q;
 406     type                = nbat->type;
 407     facel               = ic->epsfac;
 408     shiftvec            = shift_vec[0];
 409     x                   = nbat->x;
 410
 411     avoid_sing_SSE = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
 412
 413     /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
 414     rc2_SSE    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
 415 #ifdef VDW_CUTOFF_CHECK
 416     rcvdw2_SSE = gmx_set1_pr(ic->rvdw*ic->rvdw);
 417 #endif
 418
 419 #ifdef CALC_ENERGIES
 420     sixthSSE    = gmx_set1_pr(1.0/6.0);
 421     twelvethSSE = gmx_set1_pr(1.0/12.0);
 422
 423     sh_invrc6_SSE  = gmx_set1_pr(ic->sh_invrc6);
 424     sh_invrc12_SSE = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
 425 #endif
 426
 427     mrc_3_SSE = gmx_set1_pr(-2*ic->k_rf);
 428
 429 #ifdef CALC_ENERGIES
 430     hrc_3_SSE = gmx_set1_pr(ic->k_rf);
 431
 432     moh_rc_SSE = gmx_set1_pr(-ic->c_rf);
 433 #endif
 434
 435 #ifdef CALC_ENERGIES
 436     tmpsum = (real *)(((size_t)(tmpsum_array+7)) & (~((size_t)31)));
 437 #endif
 438 #ifdef CALC_SHIFTFORCES
 439     shf = (real *)(((size_t)(shf_array+7)) & (~((size_t)31)));
 440 #endif
 441
 442 #ifdef FIX_LJ_C
 443     pvdw_c6  = (real *)(((size_t)(pvdw_array+3)) & (~((size_t)15)));
 444     pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
 445
 446     for(jp=0; jp<UNROLLJ; jp++)
 447     {
 448         pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
 449         pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
 450         pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
 451         pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
 452
 453         pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 454         pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 455         pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 456         pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 457     }
 458     c6_SSE0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
 459     c6_SSE1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
 460     c6_SSE2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
 461     c6_SSE3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
 462
 463     c12_SSE0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
 464     c12_SSE1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
 465     c12_SSE2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
 466     c12_SSE3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
 467 #endif /* FIX_LJ_C */
 468
 469 #ifdef ENERGY_GROUPS
 470     egps_ishift  = nbat->neg_2log;
 471     egps_imask   = (1<<egps_ishift) - 1;
 472     egps_jshift  = 2*nbat->neg_2log;
 473     egps_jmask   = (1<<egps_jshift) - 1;
 474     egps_jstride = (UNROLLJ>>1)*UNROLLJ;
 475     /* Major division is over i-particles: divide nVS by 4 for i-stride */
 476     Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 477 #endif
 478
 479     l_cj = nbl->cj;
 480
 481     ninner = 0;
 482     for(n=0; n<nbl->nci; n++)
 483     {
 484         nbln = &nbl->ci[n];
 485
 486         ish              = (nbln->shift & NBNXN_CI_SHIFT);
 487         ish3             = ish*3;
 488         cjind0           = nbln->cj_ind_start;
 489         cjind1           = nbln->cj_ind_end;
 490         /* Currently only works super-cells equal to sub-cells */
 491         ci               = nbln->ci;
 492         ci_sh            = (ish == CENTRAL ? ci : -1);
 493
 494         shX_SSE = gmx_load1_pr(shiftvec+ish3);
 495         shY_SSE = gmx_load1_pr(shiftvec+ish3+1);
 496         shZ_SSE = gmx_load1_pr(shiftvec+ish3+2);
 497
 498 #if UNROLLJ <= 4
 499         sci              = ci*STRIDE;
 500         scix             = sci*DIM;
 501         sci2             = sci*2;
 502 #else
 503         sci              = (ci>>1)*STRIDE;
 504         scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
 505         sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
 506         sci             += (ci & 1)*(STRIDE>>1);
 507 #endif
 508
 509         half_LJ = (nbln->shift & NBNXN_CI_HALF_LJ(0));
 510         do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
 511
 512 #ifdef ENERGY_GROUPS
 513         egps_i = nbat->energrp[ci];
 514         {
 515             int ia,egp_ia;
 516
 517             for(ia=0; ia<UNROLLI; ia++)
 518             {
 519                 egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
 520                 vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
 521                 vctp[ia]   = Vc   + egp_ia*Vstride_i;
 522             }
 523         }
 524 #endif
 525 #if defined CALC_ENERGIES
 526 #if UNROLLJ == 4
 527         if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
 528 #endif
 529 #if UNROLLJ == 2
 530         if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
 531 #endif
 532 #if UNROLLJ == 8
 533         if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
 534 #endif
 535         {
 536             int  ia;
 537             real Vc_sub_self;
 538
 539 #ifdef CALC_COUL_RF
 540             Vc_sub_self = 0.5*ic->c_rf;
 541 #endif
 542 #ifdef CALC_COUL_TAB
 543 #ifdef TAB_FDV0
 544             Vc_sub_self = 0.5*tab_coul_F[2];
 545 #else
 546             Vc_sub_self = 0.5*tab_coul_V[0];
 547 #endif
 548 #endif
 549 #ifdef CALC_COUL_EWALD
 550             /* beta/sqrt(pi) */
 551             Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
 552 #endif
 553
 554             for(ia=0; ia<UNROLLI; ia++)
 555             {
 556                 real qi;
 557
 558                 qi = q[sci+ia];
 559 #ifdef ENERGY_GROUPS
 560                 vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
 561 #else
 562                 Vc[0]
 563 #endif
 564                     -= facel*qi*qi*Vc_sub_self;
 565             }
 566         }
 567 #endif
 568
 569                 /* Load i atom data */
 570         sciy             = scix + STRIDE;
 571         sciz             = sciy + STRIDE;
 572         ix_SSE0          = gmx_add_pr(gmx_load1_pr(x+scix)  ,shX_SSE);
 573         ix_SSE1          = gmx_add_pr(gmx_load1_pr(x+scix+1),shX_SSE);
 574         ix_SSE2          = gmx_add_pr(gmx_load1_pr(x+scix+2),shX_SSE);
 575         ix_SSE3          = gmx_add_pr(gmx_load1_pr(x+scix+3),shX_SSE);
 576         iy_SSE0          = gmx_add_pr(gmx_load1_pr(x+sciy)  ,shY_SSE);
 577         iy_SSE1          = gmx_add_pr(gmx_load1_pr(x+sciy+1),shY_SSE);
 578         iy_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciy+2),shY_SSE);
 579         iy_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciy+3),shY_SSE);
 580         iz_SSE0          = gmx_add_pr(gmx_load1_pr(x+sciz)  ,shZ_SSE);
 581         iz_SSE1          = gmx_add_pr(gmx_load1_pr(x+sciz+1),shZ_SSE);
 582         iz_SSE2          = gmx_add_pr(gmx_load1_pr(x+sciz+2),shZ_SSE);
 583         iz_SSE3          = gmx_add_pr(gmx_load1_pr(x+sciz+3),shZ_SSE);
 584
 585         /* With half_LJ we currently always calculate Coulomb interactions */
 586         if (do_coul || half_LJ)
 587         {
 588             iq_SSE0      = gmx_set1_pr(facel*q[sci]);
 589             iq_SSE1      = gmx_set1_pr(facel*q[sci+1]);
 590             iq_SSE2      = gmx_set1_pr(facel*q[sci+2]);
 591             iq_SSE3      = gmx_set1_pr(facel*q[sci+3]);
 592         }
 593
 594 #ifdef LJ_COMB_LB
 595         hsig_i_SSE0      = gmx_load1_pr(ljc+sci2+0);
 596         hsig_i_SSE1      = gmx_load1_pr(ljc+sci2+1);
 597         hsig_i_SSE2      = gmx_load1_pr(ljc+sci2+2);
 598         hsig_i_SSE3      = gmx_load1_pr(ljc+sci2+3);
 599         seps_i_SSE0      = gmx_load1_pr(ljc+sci2+STRIDE+0);
 600         seps_i_SSE1      = gmx_load1_pr(ljc+sci2+STRIDE+1);
 601         seps_i_SSE2      = gmx_load1_pr(ljc+sci2+STRIDE+2);
 602         seps_i_SSE3      = gmx_load1_pr(ljc+sci2+STRIDE+3);
 603 #else
 604 #ifdef LJ_COMB_GEOM
 605         c6s_SSE0         = gmx_load1_pr(ljc+sci2+0);
 606         c6s_SSE1         = gmx_load1_pr(ljc+sci2+1);
 607         if (!half_LJ)
 608         {
 609             c6s_SSE2     = gmx_load1_pr(ljc+sci2+2);
 610             c6s_SSE3     = gmx_load1_pr(ljc+sci2+3);
 611         }
 612         c12s_SSE0        = gmx_load1_pr(ljc+sci2+STRIDE+0);
 613         c12s_SSE1        = gmx_load1_pr(ljc+sci2+STRIDE+1);
 614         if (!half_LJ)
 615         {
 616             c12s_SSE2    = gmx_load1_pr(ljc+sci2+STRIDE+2);
 617             c12s_SSE3    = gmx_load1_pr(ljc+sci2+STRIDE+3);
 618         }
 619 #else
 620         nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
 621         nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
 622         if (!half_LJ)
 623         {
 624             nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
 625             nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
 626         }
 627 #endif
 628 #endif
 629
 630         /* Zero the potential energy for this list */
 631         VvdwtotSSE       = gmx_setzero_pr();
 632         vctotSSE         = gmx_setzero_pr();
 633
 634         /* Clear i atom forces */
 635         fix_SSE0           = gmx_setzero_pr();
 636         fix_SSE1           = gmx_setzero_pr();
 637         fix_SSE2           = gmx_setzero_pr();
 638         fix_SSE3           = gmx_setzero_pr();
 639         fiy_SSE0           = gmx_setzero_pr();
 640         fiy_SSE1           = gmx_setzero_pr();
 641         fiy_SSE2           = gmx_setzero_pr();
 642         fiy_SSE3           = gmx_setzero_pr();
 643         fiz_SSE0           = gmx_setzero_pr();
 644         fiz_SSE1           = gmx_setzero_pr();
 645         fiz_SSE2           = gmx_setzero_pr();
 646         fiz_SSE3           = gmx_setzero_pr();
 647
 648         cjind = cjind0;
 649
 650         /* Currently all kernels use (at least half) LJ */
 651 #define CALC_LJ
 652         if (half_LJ)
 653         {
 654 #define CALC_COULOMB
 655 #define HALF_LJ
 656 #define CHECK_EXCLS
 657             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
 658             {
 659 #include "nbnxn_kernel_x86_simd_inner.h"
 660                 cjind++;
 661             }
 662 #undef CHECK_EXCLS
 663             for(; (cjind<cjind1); cjind++)
 664             {
 665 #include "nbnxn_kernel_x86_simd_inner.h"
 666             }
 667 #undef HALF_LJ
 668 #undef CALC_COULOMB
 669         }
 670         else if (do_coul)
 671         {
 672 #define CALC_COULOMB
 673 #define CHECK_EXCLS
 674             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
 675             {
 676 #include "nbnxn_kernel_x86_simd_inner.h"
 677                 cjind++;
 678             }
 679 #undef CHECK_EXCLS
 680             for(; (cjind<cjind1); cjind++)
 681             {
 682 #include "nbnxn_kernel_x86_simd_inner.h"
 683             }
 684 #undef CALC_COULOMB
 685         }
 686         else
 687         {
 688 #define CHECK_EXCLS
 689             while (cjind < cjind1 && nbl->cj[cjind].excl != SIMD_MASK_ALL)
 690             {
 691 #include "nbnxn_kernel_x86_simd_inner.h"
 692                 cjind++;
 693             }
 694 #undef CHECK_EXCLS
 695             for(; (cjind<cjind1); cjind++)
 696             {
 697 #include "nbnxn_kernel_x86_simd_inner.h"
 698             }
 699         }
 700 #undef CALC_LJ
 701         ninner += cjind1 - cjind0;
 702
 703         /* Add accumulated i-forces to the force array */
 704 #if UNROLLJ >= 4
 705 #ifndef GMX_DOUBLE
 706 #define gmx_load_ps4  _mm_load_ps
 707 #define gmx_store_ps4 _mm_store_ps
 708 #define gmx_add_ps4   _mm_add_ps
 709 #else
 710 #define gmx_load_ps4  _mm256_load_pd
 711 #define gmx_store_ps4 _mm256_store_pd
 712 #define gmx_add_ps4   _mm256_add_pd
 713 #endif
 714         GMX_MM_TRANSPOSE_SUM4_PR(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3,fix_SSE);
 715         gmx_store_ps4(f+scix, gmx_add_ps4(fix_SSE, gmx_load_ps4(f+scix)));
 716
 717         GMX_MM_TRANSPOSE_SUM4_PR(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3,fiy_SSE);
 718         gmx_store_ps4(f+sciy, gmx_add_ps4(fiy_SSE, gmx_load_ps4(f+sciy)));
 719
 720         GMX_MM_TRANSPOSE_SUM4_PR(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3,fiz_SSE);
 721         gmx_store_ps4(f+sciz, gmx_add_ps4(fiz_SSE, gmx_load_ps4(f+sciz)));
 722
 723 #ifdef CALC_SHIFTFORCES
 724         gmx_store_ps4(shf,fix_SSE);
 725         fshift[ish3+0] += SUM_SIMD4(shf);
 726         gmx_store_ps4(shf,fiy_SSE);
 727         fshift[ish3+1] += SUM_SIMD4(shf);
 728         gmx_store_ps4(shf,fiz_SSE);
 729         fshift[ish3+2] += SUM_SIMD4(shf);
 730 #endif
 731 #else
 732         GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE0,fix_SSE1,fix0_SSE);
 733         _mm_store_pd(f+scix, _mm_add_pd(fix0_SSE, _mm_load_pd(f+scix)));
 734         GMX_MM_TRANSPOSE_SUM2_PD(fix_SSE2,fix_SSE3,fix2_SSE);
 735         _mm_store_pd(f+scix+2, _mm_add_pd(fix2_SSE, _mm_load_pd(f+scix+2)));
 736
 737         GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE0,fiy_SSE1,fiy0_SSE);
 738         _mm_store_pd(f+sciy, _mm_add_pd(fiy0_SSE, _mm_load_pd(f+sciy)));
 739         GMX_MM_TRANSPOSE_SUM2_PD(fiy_SSE2,fiy_SSE3,fiy2_SSE);
 740         _mm_store_pd(f+sciy+2, _mm_add_pd(fiy2_SSE, _mm_load_pd(f+sciy+2)));
 741
 742         GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE0,fiz_SSE1,fiz0_SSE);
 743         _mm_store_pd(f+sciz, _mm_add_pd(fiz0_SSE, _mm_load_pd(f+sciz)));
 744         GMX_MM_TRANSPOSE_SUM2_PD(fiz_SSE2,fiz_SSE3,fiz2_SSE);
 745         _mm_store_pd(f+sciz+2, _mm_add_pd(fiz2_SSE, _mm_load_pd(f+sciz+2)));
 746
 747 #ifdef CALC_SHIFTFORCES
 748         _mm_store_pd(shf,_mm_add_pd(fix0_SSE,fix2_SSE));
 749         fshift[ish3+0] += shf[0] + shf[1];
 750         _mm_store_pd(shf,_mm_add_pd(fiy0_SSE,fiy2_SSE));
 751         fshift[ish3+1] += shf[0] + shf[1];
 752         _mm_store_pd(shf,_mm_add_pd(fiz0_SSE,fiz2_SSE));
 753         fshift[ish3+2] += shf[0] + shf[1];
 754 #endif
 755 #endif
 756
 757 #ifdef CALC_ENERGIES
 758         if (do_coul)
 759         {
 760             gmx_store_pr(tmpsum,vctotSSE);
 761             *Vc += SUM_SIMD(tmpsum);
 762         }
 763
 764         gmx_store_pr(tmpsum,VvdwtotSSE);
 765         *Vvdw += SUM_SIMD(tmpsum);
 766 #endif
 767
 768                 /* Outer loop uses 6 flops/iteration */
 769         }
 770
 771 #ifdef COUNT_PAIRS
 772     printf("atom pairs %d\n",npair);
 773 #endif
 774 }
 775
 776 #undef gmx_load_ps4
 777 #undef gmx_store_ps4
 778 #undef gmx_store_ps4
 779
 780 #undef CALC_SHIFTFORCES
 781
 782 #undef UNROLLI
 783 #undef UNROLLJ
 784 #undef STRIDE
 785 #undef TAB_FDV0
 786 #undef NBFP_STRIDE