Put #ifdef GMX_X86_SSE2 around all x86-specific code.
Memory alignment is now determined from the SIMD width instead of being hard-coded.
The diagonal exclusion masks are now calculated in a generic fashion.
Also made the GPU setup constants fully macro driven.
Added documentation for most of the nbnxn macro setup.
Change-Id: Ib59fe5cec3cf2402f4701e58de59f5707cb797b3
* with different settings from the same source file.
*/
-/* NOTE: floor and blend are NOT available with SSE2 only acceleration */
+/* NOTE: floor and blendv are NOT available with SSE2 only acceleration */
#undef GMX_SIMD_WIDTH_HERE
#undef gmx_epi32
+/* float/double SIMD register type */
#undef gmx_mm_pr
#undef gmx_load_pr
#undef gmx_set1_pr
#undef gmx_setzero_pr
#undef gmx_store_pr
+/* Only used for debugging */
#undef gmx_storeu_pr
#undef gmx_add_pr
#undef gmx_or_pr
#undef gmx_andnot_pr
+/* Only used to speed up the nbnxn tabulated PME kernels */
#undef gmx_floor_pr
+/* Only used with x86 when blendv is faster than comparison */
#undef gmx_blendv_pr
#undef gmx_movemask_pr
+/* Integer casts are only used for nbnxn x86 exclusion masks */
#undef gmx_mm_castsi128_pr
+#undef gmx_mm_castsi256_pr
+/* Conversions only used for nbnxn x86 exclusion masks and PME table lookup */
#undef gmx_cvttpr_epi32
#undef gmx_cvtepi32_pr
#undef gmx_calc_rsq_pr
#undef gmx_sum4_pr
+/* Only required for nbnxn analytical PME kernels */
#undef gmx_pmecorrF_pr
#undef gmx_pmecorrV_pr
+/* Half SIMD-width types and operations only for nbnxn 2xnn search+kernels */
+#undef gmx_mm_hpr
+
+#undef gmx_load_hpr
+#undef gmx_load1_hpr
+#undef gmx_store_hpr
+#undef gmx_add_hpr
+#undef gmx_sub_hpr
+
+#undef gmx_sum4_hpr
+
+#undef gmx_2hpr_to_pr
+
+
/* By defining GMX_MM128_HERE or GMX_MM256_HERE before including this file
* the same intrinsics, with defines, can be compiled for either 128 or 256
* bit wide SSE or AVX instructions.
#error "You should not define both GMX_MM128_HERE and GMX_MM256_HERE"
#endif
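For illustration, a minimal usage sketch of the include pattern described above; the helper below and its assumptions (an aligned array whose length is a multiple of GMX_SIMD_WIDTH_HERE) are hypothetical and not part of this change:

    #define GMX_MM128_HERE
    #include "gmx_simd_macros.h"

    /* Hypothetical example: add a constant to n reals using the
     * width-agnostic macros. Assumes x is NBNXN_MEM_ALIGN aligned and
     * n is a multiple of GMX_SIMD_WIDTH_HERE.
     */
    static void shift_reals(real *x, int n, real s)
    {
        int       i;
        gmx_mm_pr s_S = gmx_set1_pr(s);

        for (i = 0; i < n; i += GMX_SIMD_WIDTH_HERE)
        {
            gmx_store_pr(x + i, gmx_add_pr(gmx_load_pr(x + i), s_S));
        }
    }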
+
+#ifdef GMX_X86_SSE2
+
#ifdef GMX_MM128_HERE
#define gmx_epi32 __m128i
#endif
#endif /* GMX_MM256_HERE */
+
+#endif /* GMX_X86_SSE2 */
#define GMX_NBNXN_SIMD
#ifdef GMX_X86_AVX_256
-/* Comment out this define to use AVX-128 kernels with AVX-256 acceleration */
+/* Note that setting this to 128 will also work with AVX-256, but will be slower */
#define GMX_NBNXN_SIMD_BITWIDTH 256
#else
#define GMX_NBNXN_SIMD_BITWIDTH 128
#endif
/* The nbnxn SIMD 4xN and 2x(N+N) kernels can be added independently.
- * Currently the 2xNN SIMD kernels only make sense and are only implemented
- * with AVX-256 in single precision using a 4x4 cluster setup instead of 4x8.
+ * Currently the 2xNN SIMD kernels only make sense with:
+ * 8-way SIMD: 4x4 setup, works with AVX-256 in single precision
+ * 16-way SIMD: 4x8 setup, not used, but most of the kernel code is there
*/
#define GMX_NBNXN_SIMD_4XN
#if GMX_NBNXN_SIMD_BITWIDTH == 256 && !defined GMX_DOUBLE
nbnxnkNR
} nbnxn_kernel_type;
-/* Note that _mm_... intrinsics can be converted to either SSE or AVX
- * depending on compiler flags.
- * For gcc we check for __AVX__
- * At least a check for icc should be added (if there is a macro)
- */
+/* Define the nbnxn kernel names for all different types defined above */
static const char *nbnxn_kernel_name[nbnxnkNR] =
- { "not set", "plain C",
-#if !(defined GMX_X86_SSE2)
+{
+ "not set",
+ "plain C",
+#ifndef GMX_NBNXN_SIMD
"not available", "not available",
#else
+#ifdef GMX_X86_SSE2
#if GMX_NBNXN_SIMD_BITWIDTH == 128
+/* x86 SIMD intrinsics can be converted to either SSE or AVX depending
+ * on compiler flags. As we use nearly identical intrinsics, using an AVX
+ * compiler flag without an AVX macro effectively results in AVX kernels.
+ * For gcc we check for __AVX__.
+ * At least a check for icc should be added (if there is a macro).
+ */
#if !(defined GMX_X86_AVX_128_FMA || defined __AVX__)
#ifndef GMX_X86_SSE4_1
"SSE2", "SSE2",
#else
"AVX-128", "AVX-128",
#endif
-#else
- "AVX-256", "AVX-256",
+#endif
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+ "AVX-256", "AVX-256",
+#endif
+#else /* not GMX_X86_SSE2 */
+ "SIMD", "SIMD",
#endif
#endif
- "CUDA", "plain C" };
+ "CUDA",
+ "plain C"
+};
enum { ewaldexclTable, ewaldexclAnalytical };
int xstride; /* stride for a coordinate in x (usually 3 or 4) */
int fstride; /* stride for a coordinate in f (usually 3 or 4) */
real *x; /* x and possibly q, size natoms*xstride */
+ real *simd_4xn_diag; /* indices to set the SIMD 4xN diagonal masks */
+ real *simd_2xnn_diag; /* indices to set the SIMD 2x(N+N) diagonal masks */
int nout; /* The number of force arrays */
nbnxn_atomdata_output_t *out; /* Output data structures */
int nalloc; /* Allocation size of all arrays (for x/f *x/fstride) */
#include "nbnxn_atomdata.h"
#include "gmx_omp_nthreads.h"
-/* Default nbnxn allocation routine, allocates 32 byte aligned,
- * which works for plain C and aligned SSE and AVX loads/stores.
- */
+/* Default nbnxn allocation routine, allocates memory aligned to NBNXN_MEM_ALIGN bytes */
void nbnxn_alloc_aligned(void **ptr,size_t nbytes)
{
- *ptr = save_malloc_aligned("ptr",__FILE__,__LINE__,nbytes,1,32);
+ *ptr = save_malloc_aligned("ptr",__FILE__,__LINE__,nbytes,1,NBNXN_MEM_ALIGN);
}
/* Free function for memory allocated with nbnxn_alloc_aligned */
nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
nbat->x = NULL;
+
+#ifdef GMX_NBNXN_SIMD
+ if (simple)
+ {
+ /* Set the diagonal cluster pair exclusion mask setup data.
+ * In the kernel we check 0 < j - i to generate the masks.
+ * Here we store j - i for generating the mask for the first i;
+ * we subtract 0.5 to avoid rounding issues.
+ * In the kernel we can subtract 1 to generate the mask for each
+ * subsequent i.
+ */
+ const int simd_width=GMX_NBNXN_SIMD_BITWIDTH/(sizeof(real)*8);
+ int simd_4xn_diag_size,j;
+
+ simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE,simd_width);
+ snew_aligned(nbat->simd_4xn_diag,simd_4xn_diag_size,NBNXN_MEM_ALIGN);
+ for(j=0; j<simd_4xn_diag_size; j++)
+ {
+ nbat->simd_4xn_diag[j] = j - 0.5;
+ }
+
+ snew_aligned(nbat->simd_2xnn_diag,simd_width,NBNXN_MEM_ALIGN);
+ for(j=0; j<simd_width/2; j++)
+ {
+ /* The j-cluster size is half the SIMD width */
+ nbat->simd_2xnn_diag[j] = j - 0.5;
+ /* The second half of the SIMD width is for i + 1 */
+ nbat->simd_2xnn_diag[simd_width/2+j] = j - 1 - 0.5;
+ }
+ }
+#endif
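For reference, the values set above work out as follows for 4-wide SIMD; this only spells out the arithmetic already described in the comment:

    /* simd_4xn_diag = { -0.5, 0.5, 1.5, 2.5 }     (j - i - 0.5 for i = 0)
     * i = 0:  0 < diag  ->  { F, T, T, T }         j = 0 masked out
     * subtract 1:  { -1.5, -0.5, 0.5, 1.5 }
     * i = 1:  0 < diag  ->  { F, F, T, T }         j = 0, 1 masked out
     * and so on for i = 2, 3.
     */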
+
+ /* Initialize the output data structures */
nbat->nout = nout;
snew(nbat->out,nbat->nout);
nbat->nalloc = 0;
}
static void
-nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
- gmx_bool bDestSet,
- real ** gmx_restrict src,
- int nsrc,
- int i0, int i1)
+nbnxn_atomdata_reduce_reals_simd(real * gmx_restrict dest,
+ gmx_bool bDestSet,
+ real ** gmx_restrict src,
+ int nsrc,
+ int i0, int i1)
{
-#ifdef NBNXN_SEARCH_SSE
-/* We can use AVX256 here, but not when AVX128 kernels are selected.
- * As this reduction is not faster with AVX256 anyway, we use 128-bit SIMD.
+#ifdef GMX_NBNXN_SIMD
+/* The SIMD width here is actually independent of that in the kernels,
+ * but we use the same width for simplicity (usually optimal anyhow).
*/
-#ifdef GMX_X86_AVX_256
-#define GMX_MM256_HERE
-#else
+#if GMX_NBNXN_SIMD_BITWIDTH == 128
#define GMX_MM128_HERE
#endif
+#if GMX_NBNXN_SIMD_BITWIDTH == 256
+#define GMX_MM256_HERE
+#endif
#include "gmx_simd_macros.h"
int i,s;
}
if (nfptr > 0)
{
-#ifdef NBNXN_SEARCH_SSE
- nbnxn_atomdata_reduce_reals_x86_simd
+#ifdef GMX_NBNXN_SIMD
+ nbnxn_atomdata_reduce_reals_simd
#else
nbnxn_atomdata_reduce_reals
#endif
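For reference, a minimal sketch of what the SIMD reduction loop does (assuming i0 and i1 are multiples of GMX_SIMD_WIDTH_HERE and the arrays are NBNXN_MEM_ALIGN aligned; the bDestSet==TRUE case, where dest is accumulated into rather than overwritten, is omitted here):

    /* Sketch only: sum nsrc force buffers into dest over [i0,i1) */
    for (i = i0; i < i1; i += GMX_SIMD_WIDTH_HERE)
    {
        gmx_mm_pr sum_S = gmx_load_pr(src[0] + i);

        for (s = 1; s < nsrc; s++)
        {
            sum_S = gmx_add_pr(sum_S, gmx_load_pr(src[s] + i));
        }
        gmx_store_pr(dest + i, sum_S);
    }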
/* With GPU kernels the cluster size is 8 atoms */
#define NBNXN_GPU_CLUSTER_SIZE 8
-/* With GPU kernels we group cluster pairs in 4 to optimize memory usage */
-#define NBNXN_GPU_JGROUP_SIZE 4
+/* With GPU kernels we group cluster pairs in 4 to optimize memory usage.
+ * To change this, also change nbnxn_cj4_t in include/types/nbnxn_pairlist.h.
+ */
+#define NBNXN_GPU_JGROUP_SIZE 4
+#define NBNXN_GPU_JGROUP_SIZE_2LOG 2
/* To avoid NaN when excluded atoms are at zero distance, we add a small
* number to r^2. NBNXN_AVOID_SING_R2_INC^-3 should fit in real.
#if !defined PRUNE_NBL && __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
#pragma unroll 4
#endif
- for (jm = 0; jm < 4; jm++)
+ for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
{
- if (imask & (255U << (jm * NCL_PER_SUPERCL)))
+ if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
{
mask_ji = (1U << (jm * NCL_PER_SUPERCL));
#if CUDA_VERSION >= 4010
#pragma unroll 4
#endif
- for (jm = 0; jm < 4; jm++)
+ for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
{
- imask_j = (imask >> (jm * 8)) & 255U;
+ imask_j = (imask >> (jm * NCL_PER_SUPERCL)) & supercl_interaction_mask;
if (imask_j)
{
nsubi = __popc(imask_j);
#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE (CL_SIZE_SQ)
+/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
+const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+
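With the default NCL_PER_SUPERCL of 8 (implied by the 255U literal this replaces) the constant evaluates to 0xff, so the mask tests in the kernel are unchanged for the standard setup:

    /* (1U << 8) - 1U == 255U, hence
     * imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL))
     * is equivalent to the previous imask & (255U << (jm * 8)).
     */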
/*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
* Original idea: OpenMM
*/
#ifdef GMX_X86_SSE2
-#define NBNXN_SEARCH_SSE
+/* Bounding box calculations always use 4-way SIMD in single precision */
+#define NBNXN_SEARCH_BB_SSE
+#endif
+
+
+#ifdef GMX_NBNXN_SIMD
+/* Memory alignment in bytes as required by SIMD aligned loads/stores */
+#define NBNXN_MEM_ALIGN (GMX_NBNXN_SIMD_BITWIDTH/8)
+#else
+/* No alignment required, but set it so we can call the same routines */
+#define NBNXN_MEM_ALIGN 32
#endif
excl[0] = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
excl[1] = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
- for(jm=0; jm<4; jm++)
+ for(jm=0; jm<NBNXN_GPU_JGROUP_SIZE; jm++)
{
cj = nbl->cj4[cj4_ind].cj[jm];
gmx_mm_pr mask0 = _mm256_castsi256_ps(_mm256_set_epi32( 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001 ));
gmx_mm_pr mask2 = _mm256_castsi256_ps(_mm256_set_epi32( 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 ));
- gmx_mm_pr diag_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
- gmx_mm_pr diag_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
+ gmx_mm_pr diag_jmi_SSE;
+#if UNROLLI == UNROLLJ
+ gmx_mm_pr diag_SSE0,diag_SSE2;
+#else
+ gmx_mm_pr diag0_SSE0,diag0_SSE2;
+ gmx_mm_pr diag1_SSE0,diag1_SSE2;
+#endif
-#ifdef GMX_X86_SSE4_1
gmx_mm_pr zero_SSE = gmx_set1_pr(0);
-#endif
gmx_mm_pr one_SSE=gmx_set1_pr(1.0);
gmx_mm_pr iq_SSE0=gmx_setzero_pr();
nbfp_stride = NBFP_STRIDE;
#endif
+ /* Load j-i for the first i */
+ diag_jmi_SSE = gmx_load_pr(nbat->simd_2xnn_diag);
+ /* Generate all the diagonal masks as comparison results */
+#if UNROLLI == UNROLLJ
+ diag_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#else
+#if 2*UNROLLI == UNROLLJ
+ diag0_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag0_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag1_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag1_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#endif
+#endif
+
#ifdef CALC_COUL_TAB
#ifdef GMX_MM256_HERE
/* Generate aligned table index pointers */
ajz = ajy + STRIDE;
#ifdef CHECK_EXCLS
-#ifndef GMX_MM256_HERE
+#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
{
/* Load integer interaction mask */
__m128i mask_int = _mm_set1_epi32(l_cj[cjind].excl);
- /* The is no unequal sse instruction, so we need a not here */
int_SSE0 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask0),zeroi_SSE));
int_SSE1 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask1),zeroi_SSE));
int_SSE2 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask2),zeroi_SSE));
int_SSE3 = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask3),zeroi_SSE));
}
-#else
+#endif
+#if defined GMX_X86_SSE2 && defined GMX_MM256_HERE
{
#ifndef GMX_DOUBLE
/* Load integer interaction mask */
#endif
#endif
-#ifdef GMX_MM128_HERE
-#ifndef GMX_DOUBLE
- __m128 diag_SSE0 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
- __m128 diag_SSE1 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- __m128 diag_SSE2 = gmx_mm_castsi128_pr( _mm_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
- __m128 diag_SSE3 = gmx_mm_castsi128_pr( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#else
- __m128d diag0_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- __m128d diag0_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- __m128d diag0_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- __m128d diag0_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- __m128d diag1_SSE0 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
- __m128d diag1_SSE1 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ));
- __m128d diag1_SSE2 = gmx_mm_castsi128_pd( _mm_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- __m128d diag1_SSE3 = gmx_mm_castsi128_pd( _mm_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#endif
-#endif
-#ifdef GMX_MM256_HERE
-#ifndef GMX_DOUBLE
- gmx_mm_pr diag0_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000 ));
- gmx_mm_pr diag0_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag0_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag0_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE0 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE1 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE2 = _mm256_castsi256_ps( _mm256_set_epi32( 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag1_SSE3 = _mm256_castsi256_ps( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
+ gmx_mm_pr diag_jmi_SSE;
+#if UNROLLI == UNROLLJ
+ gmx_mm_pr diag_SSE0,diag_SSE1,diag_SSE2,diag_SSE3;
#else
- gmx_mm_pr diag_SSE0 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag_SSE1 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag_SSE2 = _mm256_castsi256_pd( _mm256_set_epi32( 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
- gmx_mm_pr diag_SSE3 = _mm256_castsi256_pd( _mm256_set_epi32( 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 ));
-#endif
+ gmx_mm_pr diag0_SSE0,diag0_SSE1,diag0_SSE2,diag0_SSE3;
+ gmx_mm_pr diag1_SSE0,diag1_SSE1,diag1_SSE2,diag1_SSE3;
#endif
-#ifdef GMX_MM128_HERE
+#if defined GMX_X86_SSE2 && defined GMX_MM128_HERE
__m128i zeroi_SSE = _mm_setzero_si128();
#endif
-#ifdef GMX_X86_SSE4_1
gmx_mm_pr zero_SSE = gmx_set1_pr(0);
-#endif
gmx_mm_pr one_SSE=gmx_set1_pr(1.0);
gmx_mm_pr iq_SSE0=gmx_setzero_pr();
nbfp_stride = NBFP_STRIDE;
#endif
+ /* Load j-i for the first i */
+ diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag);
+ /* Generate all the diagonal masks as comparison results */
+#if UNROLLI == UNROLLJ
+ diag_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_SSE1 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag_SSE3 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#else
+#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
+ diag0_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag0_SSE1 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag0_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag0_SSE3 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+
+#if UNROLLI == 2*UNROLLJ
+ /* Load j-i for the second half of the j-cluster */
+ diag_jmi_SSE = gmx_load_pr(nbat->simd_4xn_diag+UNROLLJ);
+#endif
+
+ diag1_SSE0 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag1_SSE1 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag1_SSE2 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+ diag_jmi_SSE = gmx_sub_pr(diag_jmi_SSE,one_SSE);
+ diag1_SSE3 = gmx_cmplt_pr(zero_SSE,diag_jmi_SSE);
+#endif
+#endif
+
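For reference, the UNROLLI == 2*UNROLLJ path above works out as follows for 2-wide SIMD (SSE in double precision); this only traces the arithmetic already in the code:

    /* simd_4xn_diag = { -0.5, 0.5, 1.5, 2.5 },  load { -0.5, 0.5 } (j = 0,1)
     * diag0_SSE0..3: i = 0..3 vs j = 0,1  ->  {F,T} {F,F} {F,F} {F,F}
     * reload simd_4xn_diag+UNROLLJ = { 1.5, 2.5 }  (j = 2,3)
     * diag1_SSE0..3: i = 0..3 vs j = 2,3  ->  {T,T} {T,T} {F,T} {F,F}
     */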
#ifdef CALC_COUL_TAB
#ifdef GMX_MM256_HERE
/* Generate aligned table index pointers */
#ifndef _nbnxn_kernel_sse_utils_h_
#define _nbnxn_kernel_sse_utils_h_
-/* This files contains all functions/macros for the SSE/AVX kernels
- * which have explicit dependencies on the j-size / SIMD-width, which
- * can be 2 (SSE-double), 4 (SSE-single,AVX-double) or 8 (AVX-single).
+/* This file contains all functions/macros for the SIMD kernels
+ * which have explicit dependencies on the j-cluster size and/or SIMD-width.
* The functionality which depends on the j-cluster size is:
* LJ-parameter lookup
* force table lookup
* energy group pair energy storage
*/
+#ifdef GMX_X86_SSE2
+
/* Transpose 2 double precision registers */
#define GMX_MM_TRANSPOSE2_OP_PD(in0,in1,out0,out1) \
{ \
}
#endif
+#endif /* GMX_X86_SSE2 */
+
#endif /* _nbnxn_kernel_sse_utils_h_ */
#define BBU_Z 6
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
+/* We use SSE or 128-bit AVX for bounding box calculations */
#ifndef GMX_DOUBLE
+/* Single precision BBs + coordinates: we can also load coordinates using SSE */
#define NBNXN_SEARCH_SSE_SINGLE
#endif
/* Include basic SSE2 stuff */
#include <emmintrin.h>
-#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
-#define NBNXN_8BB_SSE
+#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
+/* Store bounding boxes with x, y and z coordinates in packs of 4 */
+#define NBNXN_PBB_SSE
#endif
/* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
* Here AVX-256 turns out to be slightly slower than AVX-128.
*/
-#define STRIDE_8BB 4
-#define STRIDE_8BB_2LOG 2
+#define STRIDE_PBB 4
+#define STRIDE_PBB_2LOG 2
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
#ifdef GMX_NBNXN_SIMD
#define NBNXN_INT_MASK_DIAG_J8_1 0x0080c0e0
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
#define NBNXN_BBXXXX
/* Size of bounding box corners quadruplet */
-#define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_8BB)
+#define NNBSBB_XXXX (NNBSBB_D*DIM*STRIDE_PBB)
#endif
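A short indexing sketch for the packed xxxxyyyyzzzz layout defined above; the names mirror the existing loop code further down (grid, c, s) and nothing beyond that is assumed:

    /* Bounding box of sub-cell s in super-cell c with the packed layout:
     * cs_w selects the quadruplet pack, slot the position within it.
     */
    int   cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
    int   slot = (c*GPU_NSUBCELL + s) & (STRIDE_PBB - 1);
    float x_lo = grid->bb[cs_w*NNBSBB_XXXX + 0*STRIDE_PBB + slot];
    float x_hi = grid->bb[cs_w*NNBSBB_XXXX + 3*STRIDE_PBB + slot];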
/* We shift the i-particles backward for PBC.
grid->nc_nalloc = over_alloc_large(nc_max);
srenew(grid->nsubc,grid->nc_nalloc);
srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
-#ifdef NBNXN_8BB_SSE
- bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
+#ifdef NBNXN_PBB_SSE
+ bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
#else
bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
#endif
bb[BBU_Z] = R2F_U(zh);
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na,const real *x,
i += stride;
}
/* Note: possible double to float conversion here */
- bb[0*STRIDE_8BB] = R2F_D(xl);
- bb[1*STRIDE_8BB] = R2F_D(yl);
- bb[2*STRIDE_8BB] = R2F_D(zl);
- bb[3*STRIDE_8BB] = R2F_U(xh);
- bb[4*STRIDE_8BB] = R2F_U(yh);
- bb[5*STRIDE_8BB] = R2F_U(zh);
+ bb[0*STRIDE_PBB] = R2F_D(xl);
+ bb[1*STRIDE_PBB] = R2F_D(yl);
+ bb[2*STRIDE_PBB] = R2F_D(zl);
+ bb[3*STRIDE_PBB] = R2F_U(xh);
+ bb[4*STRIDE_PBB] = R2F_U(yh);
+ bb[5*STRIDE_PBB] = R2F_U(zh);
}
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
#ifdef NBNXN_SEARCH_SSE_SINGLE
{
calc_bounding_box_sse(na,x,bb_work);
- bb[0*STRIDE_8BB] = bb_work[BBL_X];
- bb[1*STRIDE_8BB] = bb_work[BBL_Y];
- bb[2*STRIDE_8BB] = bb_work[BBL_Z];
- bb[3*STRIDE_8BB] = bb_work[BBU_X];
- bb[4*STRIDE_8BB] = bb_work[BBU_Y];
- bb[5*STRIDE_8BB] = bb_work[BBU_Z];
+ bb[0*STRIDE_PBB] = bb_work[BBL_X];
+ bb[1*STRIDE_PBB] = bb_work[BBL_Y];
+ bb[2*STRIDE_PBB] = bb_work[BBL_Z];
+ bb[3*STRIDE_PBB] = bb_work[BBU_X];
+ bb[4*STRIDE_PBB] = bb_work[BBU_Y];
+ bb[5*STRIDE_PBB] = bb_work[BBU_Z];
}
#endif /* NBNXN_SEARCH_SSE_SINGLE */
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
for(c=0; c<grid->nc; c++)
{
#ifdef NBNXN_BBXXXX
- for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
+ for(s=0; s<grid->nsubc[c]; s+=STRIDE_PBB)
{
int cs_w,i,d;
- cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
- for(i=0; i<STRIDE_8BB; i++)
+ cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
+ for(i=0; i<STRIDE_PBB; i++)
{
for(d=0; d<DIM; d++)
{
ba[d] +=
- grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
- grid->bb[cs_w*NNBSBB_XXXX+ d *STRIDE_8BB+i];
+ grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
+ grid->bb[cs_w*NNBSBB_XXXX+ d *STRIDE_PBB+i];
}
}
}
offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
bb_ptr = grid->bb + offset;
-#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
+#if defined GMX_DOUBLE && defined NBNXN_SEARCH_BB_SSE
if (2*grid->na_cj == grid->na_c)
{
calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
*/
bb_ptr =
grid->bb +
- ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
- (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));
+ ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
+ (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
#ifdef NBNXN_SEARCH_SSE_SINGLE
if (nbat->XFormat == nbatXYZQ)
{
fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
sx,sy,sz,
- bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
- bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
- bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
+ bb_ptr[0*STRIDE_PBB],bb_ptr[3*STRIDE_PBB],
+ bb_ptr[1*STRIDE_PBB],bb_ptr[4*STRIDE_PBB],
+ bb_ptr[2*STRIDE_PBB],bb_ptr[5*STRIDE_PBB]);
}
}
#endif
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
if (grid->bSimple && nbat->XFormat == nbatX8)
{
combine_bounding_box_pairs(grid,grid->bb);
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
if (grid->bSimple && nbat->XFormat == nbatX8)
{
combine_bounding_box_pairs(grid,grid->bb_simple);
return d2;
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* SSE code for bb distance for bb format xyz0 */
static float subc_bb_dist2_sse(int na_c,
\
shi = si*NNBSBB_D*DIM; \
\
- xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB); \
- yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB); \
- zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB); \
- xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB); \
- yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB); \
- zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB); \
+ xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB); \
+ yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB); \
+ zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB); \
+ xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB); \
+ yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB); \
+ zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB); \
\
dx_0 = _mm_sub_ps(xi_l,xj_h); \
dy_0 = _mm_sub_ps(yi_l,yj_h); \
zero = _mm_setzero_ps();
- xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
- yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
- zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
- xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
- yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
- zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
+ xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
+ yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
+ zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
+ xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
+ yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
+ zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
- /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
+ /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
* But as we know the number of iterations is 1 or 2, we unroll manually.
*/
SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
- if (STRIDE_8BB < nsi)
+ if (STRIDE_PBB < nsi)
{
- SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
+ SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB,bb_i,d2);
}
}
-#endif /* NBNXN_SEARCH_SSE */
+#endif /* NBNXN_SEARCH_BB_SSE */
/* Plain C function which determines if any atom pair between two cells
* is within distance sqrt(rl2).
rc2_SSE = _mm_set1_ps(rl2);
- na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
- ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
- iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
- iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
- ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
- iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
- iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
+ na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
+ ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
+ iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
+ iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
+ ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
+ iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
+ iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
/* We loop from the outer to the inner particles to maximize
* the chance that we find a pair in range quickly and return.
/* Returns the j sub-cell for index cj_ind */
static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
{
- return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
+ return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
}
/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
{
- return nbl->cj4[cj_ind>>2].imei[0].imask;
+ return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
}
/* Ensures there is enough space for extra extra exclusion masks */
/* We can store 4 j-subcell - i-supercell pairs in one struct.
 * Since we round down, we need one extra entry.
*/
- ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
+ ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
if (ncj4_max > nbl->cj4_nalloc)
{
snew(nbl->work,1);
#ifdef NBNXN_BBXXXX
- snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,32);
+ snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX,NBNXN_MEM_ALIGN);
#else
- snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,32);
+ snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,NBNXN_MEM_ALIGN);
#endif
- snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,32);
+ snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,NBNXN_MEM_ALIGN);
#ifdef GMX_NBNXN_SIMD
- snew_aligned(nbl->work->x_ci_simd_4xn,1,32);
- snew_aligned(nbl->work->x_ci_simd_2xnn,1,32);
+ snew_aligned(nbl->work->x_ci_simd_4xn,1,NBNXN_MEM_ALIGN);
+ snew_aligned(nbl->work->x_ci_simd_2xnn,1,NBNXN_MEM_ALIGN);
#endif
- snew_aligned(nbl->work->d2,GPU_NSUBCELL,32);
+ snew_aligned(nbl->work->d2,GPU_NSUBCELL,NBNXN_MEM_ALIGN);
}
void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
fprintf(fp,"nbl average j super cell list length %.1f\n",
0.25*nbl->ncj4/(double)nbl->nsci);
fprintf(fp,"nbl average i sub cell list length %.1f\n",
- nbl->nci_tot/(0.25*nbl->ncj4));
+ nbl->nci_tot/((double)nbl->ncj4));
for(si=0; si<=GPU_NSUBCELL; si++)
{
{
for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
{
- for(j=0; j<4; j++)
+ for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
{
b = 0;
for(si=0; si<GPU_NSUBCELL; si++)
w = (ej>>2);
for(ei=ej; ei<nbl->na_ci; ei++)
{
- excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
- ~(1U << (sj_offset*GPU_NSUBCELL+si));
+ excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
+ ~(1U << (sj_offset*GPU_NSUBCELL + si));
}
}
}
for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
{
- cj4_ind = (nbl->work->cj_ind >> 2);
+ cj4_ind = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
cj4 = &nbl->cj4[cj4_ind];
#ifdef NBNXN_BBXXXX
/* Determine all ci1 bb distances in one call with SSE */
- subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
+ subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
ci1,bb_ci,d2l);
*ndistc += na_c*2;
#endif
{
/* Avoid using function pointers here, as it's slower */
if (
-#ifdef NBNXN_8BB_SSE
+#ifdef NBNXN_PBB_SSE
!subc_in_range_sse8
#else
!subc_in_range_x
nbl->nci_tot += npair;
/* Increase the closing index in i super-cell list */
- nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
+ nbl->sci[nbl->nsci].cj4_ind_end =
+ ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
}
}
}
ndirect++;
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
else
{
while (cj_ind_first + ndirect <= cj_ind_last &&
inner_e = ge - se*na_c;
/* Macro for getting the index of atom a within a cluster */
-#define AMODI(a) ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
+#define AMODCJ4(a) ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
/* Macro for converting an atom number to a cluster number */
-#define A2CI(a) ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
+#define A2CJ4(a) ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
+/* Macro for getting the index of an i-atom within a warp */
+#define AMODWI(a) ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
- if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
+ if (nbl_imask0(nbl,found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
{
w = (inner_e >> 2);
- get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
+ get_nbl_exclusions_1(nbl,A2CJ4(found),w,&nbl_excl);
- nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
- ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
+ nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
+ ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
}
-#undef AMODI
-#undef A2CI
+#undef AMODCJ4
+#undef A2CJ4
+#undef AMODWI
}
}
}
/* We can only have complete blocks of 4 j-entries in a list,
* so round the count up before closing.
*/
- nbl->ncj4 = ((nbl->work->cj_ind + 4-1) >> 2);
+ nbl->ncj4 = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
nbl->nsci++;
int ia,m,i;
#ifdef NBNXN_BBXXXX
- ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
- for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
+ ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
+ for(m=0; m<(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
{
- for(i=0; i<STRIDE_8BB; i++)
+ for(i=0; i<STRIDE_PBB; i++)
{
- bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
- bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
- bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
- bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
- bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
- bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
+ bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
+ bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
+ bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
+ bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
+ bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
+ bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
}
}
#else
}
}
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
/* Copies PBC shifted super-cell packed atom coordinates to working array */
static void icell_set_x_supersub_sse8(int ci,
real shx,real shy,real shz,
for(si=0; si<GPU_NSUBCELL; si++)
{
- for(i=0; i<na_c; i+=STRIDE_8BB)
+ for(i=0; i<na_c; i+=STRIDE_PBB)
{
io = si*na_c + i;
ia = ci*GPU_NSUBCELL*na_c + io;
- for(j=0; j<STRIDE_8BB; j++)
+ for(j=0; j<STRIDE_PBB; j++)
{
- x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
- x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
- x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
+ x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
+ x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
+ x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
}
}
}
for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
{
- for(j=0; j<4; j++)
+ for(j=0; j<NBNXN_GPU_JGROUP_SIZE; j++)
{
fprintf(fp," sj %5d imask %x\n",
nbl->cj4[j4].cj[j],
}
else
{
-#ifdef NBNXN_SEARCH_SSE
+#ifdef NBNXN_SEARCH_BB_SSE
nbs->icell_set_x = icell_set_x_supersub_sse8;
#else
nbs->icell_set_x = icell_set_x_supersub;