Ported back some AVX256 optimizations to SSE2/SSE4.1/AVX128.
author Erik Lindahl <erik@kth.se>
Mon, 26 Nov 2012 17:56:49 +0000 (18:56 +0100)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Wed, 28 Nov 2012 21:30:50 +0000 (22:30 +0100)
commit 3a07883e9daeae249e970ed727856c631dc1a2e3
tree 3676a5ab3d922416491585cbd94353106aa4ce89
parent 84513d5a3d147c575e47e744028bc32a5acbc28e
Ported back some AVX256 optimizations to SSE2/SSE4.1/AVX128.

We can gain a couple of percent in the group kernels for some cases
by only storing j-particle forces once in water kernels. In addition,
I have updated the last few routines that used constructs that
some compilers considered fragile.

Change-Id: Ic69307c735c57dd92bf0b35e88550252447e4f7f
126 files changed:
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJ_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwLJ_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEw_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwLJ_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwLJ_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwNone_GeomW3P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwNone_GeomW4P1_avx_128_fma_single.c
src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_template_avx_128_fma_single.pre
src/gmxlib/nonbonded/nb_kernel_sse2_single/kernelutil_x86_sse2_single.h
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecEw_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW3P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_ElecRF_VdwNone_GeomW4P1_sse2_single.c
src/gmxlib/nonbonded/nb_kernel_sse2_single/nb_kernel_template_sse2_single.pre
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/kernelutil_x86_sse4_1_single.h
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwCSTab_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwLJ_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCSTab_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwLJ_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwLJ_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecCoul_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSh_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwLJSw_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEwSw_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwCSTab_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwCSTab_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJ_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwLJ_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecEw_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwCSTab_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSh_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwLJSw_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRFCut_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwCSTab_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwCSTab_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwLJ_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwLJ_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwNone_GeomW3P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_ElecRF_VdwNone_GeomW4P1_sse4_1_single.c
src/gmxlib/nonbonded/nb_kernel_sse4_1_single/nb_kernel_template_sse4_1_single.pre