OFF)
mark_as_advanced(GMX_BUILD_FOR_COVERAGE)
+option(GMX_DEVELOPER_BUILD
+       "Enable developer convenience features: always build unit tests"
+ OFF)
+mark_as_advanced(GMX_DEVELOPER_BUILD)
######################################################################
# Detect OpenMP support
# The cmake/Check{C,CXX}CompilerFlag.cmake files in the GROMACS distribution
# are used with permission from CMake v3.0.0 so that GROMACS can detect
# invalid options with the Intel Compilers, and we have added a line
-# to detect warnings with the Fujitsu compilers on K computer.
+# to detect warnings with the Fujitsu compilers on the K computer and with ICC.
# CMake-3.0 also has a bug where the FAIL_REGEX pattern for AIX contains
# a semicolon. Since this is also used as a separator in lists inside CMake,
# that string ends up being split into two separate patterns, and the last
# so, should we use plug-ins?
if((WIN32 AND NOT CYGWIN) OR (HAVE_DLOPEN AND BUILD_SHARED_LIBS))
if(NOT VMD_QUIETLY)
- MESSAGE(STATUS
- "Found the ability to use plug-ins when building shared libaries, "
- "so will compile to use plug-ins (e.g. to read VMD-supported file "
- "formats).")
+        MESSAGE(STATUS "Using dynamic plugins (e.g. VMD-supported file formats)")
endif()
if(NOT GMX_VMD_PLUGIN_PATH)
find_package(VMD)
FAIL_REGEX "option.*not supported" # Intel
FAIL_REGEX "invalid argument .*option" # Intel
FAIL_REGEX "ignoring option .*argument required" # Intel
+ FAIL_REGEX "ignoring option .*argument is of wrong type" # Intel
FAIL_REGEX "[Uu]nknown option" # HP
FAIL_REGEX "[Ww]arning: [Oo]ption" # SunPro
FAIL_REGEX "command option .* is not recognized" # XL
message(STATUS "${SIMD_STATUS_MESSAGE}")
endif()
+# By default, 32-bit Windows cannot pass SIMD (SSE/AVX) arguments in registers,
+# and even on 64-bit (all platforms) registers are only used for a handful of
+# arguments. The __vectorcall (MSVC, from MSVC2013) or __regcall (ICC) calling
+# conventions enable register passing, which is critical to enable 32-bit SIMD
+# and improves performance for 64-bit SIMD.
+# Check whether the compiler supports one of these, and if so set gmx_simdcall
+# to that string. If no such calling-convention modifier is available, set it
+# to an empty string.
+if(NOT DEFINED GMX_SIMD_CALLING_CONVENTION)
+ foreach(callconv __vectorcall __regcall "")
+ set(callconv_compile_var "_callconv_${callconv}")
+ check_c_source_compiles("int ${callconv} f(int i) {return i;} int main(void) {return f(0);}" ${callconv_compile_var})
+ if(${callconv_compile_var})
+ set(GMX_SIMD_CALLING_CONVENTION "${callconv}" CACHE INTERNAL "Calling convention for SIMD routines" FORCE)
+ break()
+ endif()
+ endforeach()
+endif()
+
endmacro()
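A note on the detection loop above: the foreach() list deliberately ends with an
empty string, and compiling the probe with no modifier always succeeds, so
GMX_SIMD_CALLING_CONVENTION is guaranteed to be set (possibly to "") once the
loop finishes. A minimal sketch of how one could surface the cached result
(illustrative only, not part of the change):

    # Hypothetical status line after the detection block:
    message(STATUS "SIMD calling convention: '${GMX_SIMD_CALLING_CONVENTION}'")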
3. Get and unpack the latest version of the GROMACS tarball.
4. Make a separate build directory and change to it.
5. Run `cmake` with the path to the source as an argument
-6. Run `make`, `make test`, and `make install`
+6. Run `make`, `make check`, and `make install`
Or, as a sequence of commands to execute:
cd build
cmake .. -DGMX_BUILD_OWN_FFTW=ON -DREGRESSIONTEST_DOWNLOAD=ON
make
- make test
+ make check
sudo make install
source /usr/local/gromacs/bin/GMXRC
configure_file(buildinfo.h.cmakein buildinfo.h ESCAPE_QUOTES)
if (BUILD_TESTING)
+ if(NOT GMX_DEVELOPER_BUILD)
+ set(UNITTEST_TARGET_OPTIONS EXCLUDE_FROM_ALL)
+ endif()
if (GMX_BUILD_UNITTESTS)
add_subdirectory(external/gmock-1.7.0)
endif()
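For reference, EXCLUDE_FROM_ALL removes a target from the default build, so with
the hunk above the unit-test targets are compiled only on demand (e.g. via the
`make check` target mentioned in the install guide) unless GMX_DEVELOPER_BUILD
is ON. A hypothetical target using the option list (the name example-test is
illustrative, not from this change):

    # Skipped by plain "make" when UNITTEST_TARGET_OPTIONS is EXCLUDE_FROM_ALL;
    # built on demand when a "check"-style target depends on it.
    add_executable(example-test ${UNITTEST_TARGET_OPTIONS} example-test.cpp)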
/* String for SIMD instruction choice (for writing to log files and stdout) */
#define GMX_SIMD_STRING "@GMX_SIMD@"
+/* Calling-convention string (if any) for routines taking SIMD arguments */
+#define gmx_simdcall @GMX_SIMD_CALLING_CONVENTION@
+
/* Integer byte order is big endian. */
#cmakedefine GMX_INTEGER_BIG_ENDIAN
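After configure_file() substitution, gmx_simdcall therefore expands to
__vectorcall, __regcall, or nothing, which is why the SIMD helper declarations
changed below take the form (adapted from a later hunk in this change):

    static gmx_inline __m128d gmx_simdcall
    gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz);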
include(gmxGetGmockTupleWorkaround)
get_gmock_tuple_workaround(GMOCK_COMPILE_DEFINITIONS)
+set(GMOCK_COMPILE_DEFINITIONS "_GNU_SOURCE=1;${GMOCK_COMPILE_DEFINITIONS}")
set(GMOCK_COMPILE_DEFINITIONS ${GMOCK_COMPILE_DEFINITIONS} PARENT_SCOPE)
# GTest/GMock suggest linking with pthreads when available for thread safety
include_directories(BEFORE ${GTEST_DIR})
include_directories(BEFORE ${GMOCK_INCLUDE_DIRS})
include_directories(BEFORE ${GMOCK_DIR})
-add_library(gmock STATIC ${GMOCK_SOURCES} ${GTEST_SOURCES})
+add_library(gmock STATIC ${UNITTEST_TARGET_OPTIONS} ${GMOCK_SOURCES} ${GTEST_SOURCES})
set_property(TARGET gmock APPEND PROPERTY COMPILE_DEFINITIONS "${GMOCK_COMPILE_DEFINITIONS}")
set(GMOCK_LIBRARIES gmock ${PTHREADS_LIBRARIES} PARENT_SCOPE)
EXPECT_NO_THROW_GMX(data.addModule(mod2));
}
+#if GTEST_HAS_TYPED_TEST
/********************************************************************
* Input data for tests below.
ASSERT_NO_THROW_GMX(AnalysisDataTest::presentAllData());
}
+#else
+
+/* A dummy test that at least signals that something is missing if one runs the
+ * unit test executable directly.
+ */
+TEST(DISABLED_AnalysisDataCommonTest, GenericTests)
+{
+ ADD_FAILURE()
+ << "Tests for generic AnalysisData functionality require support for "
+ << "Google Test typed tests, which was not available when the tests "
+ << "were compiled.";
+}
+
+#endif
+
} // namespace
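For context, a Google Test typed test (what GTEST_HAS_TYPED_TEST guards) runs
the same test body once per type in a list. A self-contained sketch with
hypothetical names, not taken from this change (using the gtest-1.7-era
TYPED_TEST_CASE macro, matching the bundled gmock-1.7.0):

    #include <gtest/gtest.h>

    template <typename T>
    class HypotheticalTypedTest : public ::testing::Test {};

    typedef ::testing::Types<int, float, double> TestTypes;
    TYPED_TEST_CASE(HypotheticalTypedTest, TestTypes);

    /* Instantiated three times, once per type in TestTypes. */
    TYPED_TEST(HypotheticalTypedTest, ValueInitializesToZero)
    {
        EXPECT_EQ(TypeParam(), TypeParam(0));
    }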
class MockAverageHistogram : public gmx::AbstractAverageHistogram
{
public:
- MockAverageHistogram() {}
//! Creates a histogram module with defined bin parameters.
explicit MockAverageHistogram(const gmx::AnalysisHistogramSettings &settings)
: AbstractAverageHistogram(settings)
#define FFTW_UNLOCK try { big_fftw_mutex.unlock(); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
#endif /* GMX_FFT_FFTW3 */
+#ifdef GMX_MPI
/* largest factor smaller than sqrt */
static int lfactor(int z)
{
- int i;
- for (i = static_cast<int>(sqrt(static_cast<double>(z)));; i--)
+ int i = static_cast<int>(sqrt(static_cast<double>(z)));
+ while (z%i != 0)
{
- if (z%i == 0)
- {
- return i;
- }
+ i--;
}
- return 1;
+ return i;
}
+#endif
/* largest factor */
static int l2factor(int z)
int i;
if (z == 1)
{
- return 1;
+ i = 1;
}
- for (i = z/2;; i--)
+ else
{
- if (z%i == 0)
+ i = z/2;
+ while (z%i != 0)
{
- return i;
+ i--;
}
}
- return 1;
+ return i;
}
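A quick standalone check of the rewritten factor loops (hand-computed values,
not part of the change); both loops terminate because i = 1 divides every z:

    #include <cassert>
    #include <cmath>

    /* Local copy of the rewritten lfactor, for illustration only. */
    static int lfactor(int z)
    {
        int i = static_cast<int>(sqrt(static_cast<double>(z)));
        while (z % i != 0)
        {
            i--;
        }
        return i;
    }

    int main()
    {
        assert(lfactor(12) == 3); /* starts at (int)sqrt(12) == 3 */
        assert(lfactor(13) == 1); /* prime: loop walks down to 1 */
        assert(lfactor(36) == 6); /* perfect square: sqrt hit directly */
        return 0;
    }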
/* largest prime factor: WARNING: slow recursion, only use for small numbers */
static const char *modeToVerb(char mode)
{
+ const char *p;
switch (mode)
{
case 'r':
- return "reading";
+ p = "reading";
break;
case 'w':
- return "writing";
+ p = "writing";
break;
case 'a':
- return "appending";
+ p = "appending";
break;
default:
gmx_fatal(FARGS, "Invalid file opening mode %c", mode);
- return "";
+ p = "";
+ break;
}
+ return p;
}
void gmx_tng_open(const char *filename,
sfree(ir);
}
+static gmx_bool can_scale_rvdw(int vdwtype)
+{
+ return (evdwCUT == vdwtype ||
+ evdwPME == vdwtype);
+}
#define EPME_SWITCHED(e) ((e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
fprintf(fp, " No. scaling rcoulomb");
fprintf(fp, " nkx nky nkz");
fprintf(fp, " spacing");
- if (evdwCUT == ir->vdwtype)
+ if (can_scale_rvdw(ir->vdwtype))
{
fprintf(fp, " rvdw");
}
ir->rlist = ir->rcoulomb + nlist_buffer;
}
- if (bScaleRvdw && evdwCUT == ir->vdwtype)
+ if (bScaleRvdw && can_scale_rvdw(ir->vdwtype))
{
- if (ecutsVERLET == ir->cutoff_scheme)
+ if (ecutsVERLET == ir->cutoff_scheme ||
+ evdwPME == ir->vdwtype)
{
- /* With Verlet, the van der Waals radius must always equal the Coulomb radius */
+ /* With either the Verlet cutoff-scheme or LJ-PME,
+ the van der Waals radius must always equal the
+ Coulomb radius */
ir->rvdw = ir->rcoulomb;
}
else
fprintf(fp, "%4d%10f%10f", j, fac, ir->rcoulomb);
fprintf(fp, "%5d%5d%5d", ir->nkx, ir->nky, ir->nkz);
fprintf(fp, " %9f ", info->fsx[j]);
- if (evdwCUT == ir->vdwtype)
+ if (can_scale_rvdw(ir->vdwtype))
{
fprintf(fp, "%10f", ir->rvdw);
}
* OMP_NUM_THREADS also has to be set */
if (bFullOmpSupport && getenv("OMP_NUM_THREADS") == NULL)
{
- gmx_fatal(FARGS, "%s=%d is set, the default number of threads also "
- "needs to be set with OMP_NUM_THREADS!",
- modth_env_var[m], nth);
+        gmx_warning("%s=%d is set, but the default number of threads also "
+                    "needs to be set with OMP_NUM_THREADS!",
+                    modth_env_var[m], nth);
}
/* with the group scheme warn if any env var except PME is set */
row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
}
-static int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
return _mm_macc_pd(dx, dx, _mm_macc_pd(dy, dy, _mm_mul_pd(dz, dz)));
/* Load a double value from 1-2 places, merge into xmm register */
-static __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB)
{
return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}
-static __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
return _mm_load_sd(ptrA);
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB,
__m128d xmm1)
_mm_store_sd(ptrB, t2);
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
_mm_store_sd(ptrA, xmm1);
/* Similar to store, but increments value in memory */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB, __m128d xmm1)
{
_mm_store_sd(ptrB, t1);
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
__m128d tmp;
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
const double * gmx_restrict p2,
__m128d * gmx_restrict c6,
*c12 = _mm_unpackhi_pd(t1, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict c6,
__m128d * gmx_restrict c12)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
*z = _mm_load_sd(p1+2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
*z3 = _mm_load_sd(p1+8);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
*z1 = _mm_unpacklo_pd(t3, t4);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
/* Routines to decrement rvec in memory, typically use for j particle force updates */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1)
{
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _t1 = _mm_sub_pd(_t1, _x1); \
- _t2 = _mm_sub_pd(_t2, _z1); \
- _t3 = _mm_sub_pd(_t3, _y2); \
- _t4 = _mm_sub_pd(_t4, _x3); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+6, t4);
_mm_store_sd(ptrA+8, t5);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _z3 = _mm_unpacklo_pd(_z3, _x4); \
- _y4 = _mm_unpacklo_pd(_y4, _z4); \
- _mm_storeu_pd(ptrA, _mm_sub_pd( _t1, _x1 )); \
- _mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2, _z1 )); \
- _mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3, _y2 )); \
- _mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4, _x3 )); \
- _mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5, _z3 )); \
- _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
- }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5, z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
-#endif
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1)
{
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrB); \
- _t7 = _mm_loadu_pd(ptrB+2); \
- _t8 = _mm_loadu_pd(ptrB+4); \
- _t9 = _mm_loadu_pd(ptrB+6); \
- _t10 = _mm_load_sd(ptrB+8); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpackhi_pd(_z3, _z3); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _t6 = _mm_sub_pd(_t6, _tB); \
- _t7 = _mm_sub_pd(_t7, _tD); \
- _t8 = _mm_sub_pd(_t8, _tF); \
- _t9 = _mm_sub_pd(_t9, _tH); \
- _t10 = _mm_sub_sd(_t10, _tI); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrB, _t6); \
- _mm_storeu_pd(ptrB+2, _t7); \
- _mm_storeu_pd(ptrB+4, _t8); \
- _mm_storeu_pd(ptrB+6, _t9); \
- _mm_store_sd(ptrB+8, _t10); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+6, t9);
_mm_store_sd(ptrB+8, t10);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _t7 = _mm_loadu_pd(ptrB); \
- _t8 = _mm_loadu_pd(ptrB+2); \
- _t9 = _mm_loadu_pd(ptrB+4); \
- _t10 = _mm_loadu_pd(ptrB+6); \
- _t11 = _mm_loadu_pd(ptrB+8); \
- _t12 = _mm_loadu_pd(ptrB+10); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpacklo_pd(_z3, _x4); \
- _tJ = _mm_unpackhi_pd(_z3, _x4); \
- _tK = _mm_unpacklo_pd(_y4, _z4); \
- _tL = _mm_unpackhi_pd(_y4, _z4); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_pd(_t5, _tI); \
- _t6 = _mm_sub_pd(_t6, _tK); \
- _t7 = _mm_sub_pd(_t7, _tB); \
- _t8 = _mm_sub_pd(_t8, _tD); \
- _t9 = _mm_sub_pd(_t9, _tF); \
- _t10 = _mm_sub_pd(_t10, _tH); \
- _t11 = _mm_sub_pd(_t11, _tJ); \
- _t12 = _mm_sub_pd(_t12, _tL); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_storeu_pd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrA+10, _t6); \
- _mm_storeu_pd(ptrB, _t7); \
- _mm_storeu_pd(ptrB+2, _t8); \
- _mm_storeu_pd(ptrB+4, _t9); \
- _mm_storeu_pd(ptrB+6, _t10); \
- _mm_storeu_pd(ptrB+8, _t11); \
- _mm_storeu_pd(ptrB+10, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
double * gmx_restrict fptr,
double * gmx_restrict fshiftptr)
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- fix1 = _mm_hadd_pd(fix1, fiy1); \
- fiz1 = _mm_hadd_pd(fiz1, fix2); \
- fiy2 = _mm_hadd_pd(fiy2, fiz2); \
- fix3 = _mm_hadd_pd(fix3, fiy3); \
- fiz3 = _mm_hadd_pd(fiz3, fiz3); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
- fix1 = _mm_add_pd(fix1, fix3); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- fiz1 = _mm_add_sd(fiz1, _t2); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- fix1 = _mm_hadd_pd(fix1, fiy1); \
- fiz1 = _mm_hadd_pd(fiz1, fix2); \
- fiy2 = _mm_hadd_pd(fiy2, fiz2); \
- fix3 = _mm_hadd_pd(fix3, fiy3); \
- fiz3 = _mm_hadd_pd(fiz3, fix4); \
- fiy4 = _mm_hadd_pd(fiy4, fiz4); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); \
- _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
- fix3 = _mm_add_pd(fix3, _t2); \
- fix1 = _mm_add_pd(fix1, fix3); \
- fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
- fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
pot1 = _mm_hadd_pd(pot1, pot1);
_mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
__m128d pot2, double * gmx_restrict ptrB)
{
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3) _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
-static gmx_inline int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128 a, __m128 b)
{
return _mm_movemask_ps(_mm_cmplt_ps(a, b));
}
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
return _mm_macc_ps(dx, dx, _mm_macc_ps(dy, dy, _mm_mul_ps(dz, dz)));
/* Load a single value from 1-4 places, merge into xmm register */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
const float * gmx_restrict p2,
const float * gmx_restrict p3,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1)
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
- __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_permute_ps(_z3, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t18 = _mm_movehl_ps(_z3, _z3); \
- _t19 = _mm_permute_ps(_t18, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t20 = _mm_movelh_ps(_x1, _z1); \
- _t21 = _mm_movehl_ps(_z1, _x1); \
- _t22 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t23 = _mm_movelh_ps(_y2, _x3); \
- _t24 = _mm_movehl_ps(_x3, _y2); \
- _t25 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_load_ss(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t20); \
- _t2 = _mm_sub_ps(_t2, _t23); \
- _t3 = _mm_sub_ss(_t3, _z3); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_store_ss(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_load_ss(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _t21); \
- _t5 = _mm_sub_ps(_t5, _t24); \
- _t6 = _mm_sub_ss(_t6, _t17); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_store_ss(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_load_ss(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t22); \
- _t8 = _mm_sub_ps(_t8, _t25); \
- _t9 = _mm_sub_ss(_t9, _t18); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_store_ss(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_load_ss(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ss(_t12, _t19); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_store_ss(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_store_ss(ptrD+8, t12);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
- __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
- __m128 _t23, _t24; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_unpackhi_ps(_z3, _x4); \
- _z3 = _mm_unpacklo_ps(_z3, _x4); \
- _t18 = _mm_unpackhi_ps(_y4, _z4); \
- _y4 = _mm_unpacklo_ps(_y4, _z4); \
- _t19 = _mm_movelh_ps(_x1, _z1); \
- _z1 = _mm_movehl_ps(_z1, _x1); \
- _t20 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t21 = _mm_movelh_ps(_y2, _x3); \
- _x3 = _mm_movehl_ps(_x3, _y2); \
- _t22 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t23 = _mm_movelh_ps(_z3, _y4); \
- _y4 = _mm_movehl_ps(_y4, _z3); \
- _t24 = _mm_movelh_ps(_t17, _t18); \
- _t18 = _mm_movehl_ps(_t18, _t17); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_loadu_ps(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t19); \
- _t2 = _mm_sub_ps(_t2, _t21); \
- _t3 = _mm_sub_ps(_t3, _t23); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_storeu_ps(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_loadu_ps(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _z1); \
- _t5 = _mm_sub_ps(_t5, _x3); \
- _t6 = _mm_sub_ps(_t6, _y4); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_storeu_ps(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_loadu_ps(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t20); \
- _t8 = _mm_sub_ps(_t8, _t22); \
- _t9 = _mm_sub_ps(_t9, _t24); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_storeu_ps(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_loadu_ps(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ps(_t12, _t18); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_storeu_ps(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_storeu_ps(ptrD+8, t12);
}
-#endif
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
_mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4; \
-\
- fix1 = _mm_hadd_ps(fix1, fiy1); \
- fiz1 = _mm_hadd_ps(fiz1, fix2); \
- fiy2 = _mm_hadd_ps(fiy2, fiz2); \
- fix3 = _mm_hadd_ps(fix3, fiy3); \
- fiz3 = _mm_hadd_ps(fiz3, fiz3); \
- fix1 = _mm_hadd_ps(fix1, fiz1); \
- fiy2 = _mm_hadd_ps(fiy2, fix3); \
- fiz3 = _mm_hadd_ps(fiz3, fiz3); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
- _t4 = _mm_load_ss(fshiftptr+2); \
- _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
- _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
- _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
- _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
- _t3 = _mm_permute_ps(_t3, _MM_SHUFFLE(1, 2, 0, 0)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _mm_store_ss(fshiftptr+2, _t1); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
_mm_store_ss(fshiftptr+2, t1);
_mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5; \
-\
- fix1 = _mm_hadd_ps(fix1, fiy1); \
- fiz1 = _mm_hadd_ps(fiz1, fix2); \
- fiy2 = _mm_hadd_ps(fiy2, fiz2); \
- fix3 = _mm_hadd_ps(fix3, fiy3); \
- fiz3 = _mm_hadd_ps(fiz3, fix4); \
- fiy4 = _mm_hadd_ps(fiy4, fiz4); \
- fix1 = _mm_hadd_ps(fix1, fiz1); \
- fiy2 = _mm_hadd_ps(fiy2, fix3); \
- fiz3 = _mm_hadd_ps(fiz3, fiy4); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
- _t5 = _mm_load_ss(fshiftptr+2); \
- _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
- _t1 = _mm_permute_ps(fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
- _t2 = _mm_permute_ps(fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
- _t3 = _mm_permute_ps(fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
- _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
- _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _t5 = _mm_add_ps(_t5, _t1); \
- _mm_store_ss(fshiftptr+2, _t5); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
pot1 = _mm_hadd_ps(pot1, pot1);
_mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
__m128 pot2, float * gmx_restrict ptrB)
{
#define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32((x), (imm))
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_unpack128lo_pd(__m256d xmm1, __m256d xmm2)
{
return _mm256_permute2f128_pd(xmm1, xmm2, 0x20);
}
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_unpack128hi_pd(__m256d xmm1, __m256d xmm2)
{
return _mm256_permute2f128_pd(xmm1, xmm2, 0x31);
}
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_set_m128d(__m128d hi, __m128d lo)
{
return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 0x1);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_set_m128(__m128 hi, __m128 lo)
{
return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
}
-static int
+static gmx_inline int gmx_simdcall
gmx_mm256_any_lt(__m256d a, __m256d b)
{
return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_LT_OQ));
}
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_calc_rsq_pd(__m256d dx, __m256d dy, __m256d dz)
{
return _mm256_add_pd( _mm256_add_pd( _mm256_mul_pd(dx, dx), _mm256_mul_pd(dy, dy) ), _mm256_mul_pd(dz, dz) );
/* Load a single value from 1-4 places, merge into xmm register */
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_1real_pd(const double * gmx_restrict ptrA)
{
return _mm256_castpd128_pd256(_mm_load_sd(ptrA));
}
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB)
{
}
-static __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_mm256_load_4real_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
const double * gmx_restrict ptrC, const double * gmx_restrict ptrD)
{
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_store_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
_mm_store_sd(ptrA, _mm256_castpd256_pd128(xmm1));
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_store_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
__m256d t2;
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_store_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_increment_1real_pd(double * gmx_restrict ptrA, __m256d xmm1)
{
__m128d t1;
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_increment_2real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB, __m256d xmm1)
{
__m128d t1, t2;
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_increment_4real_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
double * gmx_restrict ptrC, double * gmx_restrict ptrD, __m256d xmm1)
{
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_1pair_swizzle_pd(const double * gmx_restrict p1, __m256d *c6, __m256d *c12)
{
*c6 = _mm256_castpd128_pd256(_mm_load_sd(p1));
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_2pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2, __m256d *c6, __m256d *c12)
{
__m128d t1, t2, t3;
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4pair_swizzle_pd(const double * gmx_restrict p1, const double * gmx_restrict p2,
const double * gmx_restrict p3, const double * gmx_restrict p4,
__m256d * gmx_restrict c6, __m256d * gmx_restrict c12)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m256d * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m256d * gmx_restrict x, __m256d * gmx_restrict y, __m256d * gmx_restrict z)
{
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
__m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
*z3 = _mm256_castpd128_pd256(_mm_load_sd(p1+8));
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
__m256d * gmx_restrict x2, __m256d * gmx_restrict y2, __m256d * gmx_restrict z2,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1)
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_4ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
const double * gmx_restrict ptrC, const double * gmx_restrict ptrD,
__m256d * gmx_restrict x1, __m256d * gmx_restrict y1, __m256d * gmx_restrict z1,
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
double * gmx_restrict ptrC, double * gmx_restrict ptrD,
__m256d x1, __m256d y1, __m256d z1)
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128d _tA, _tB, _tC, _tD, _tE; \
- _t1 = _mm256_loadu_pd(ptrA); \
- _t2 = _mm256_loadu_pd(ptrB); \
- _t3 = _mm256_loadu_pd(ptrC); \
- _t4 = _mm256_loadu_pd(ptrD); \
- _t5 = _mm256_loadu_pd(ptrA+4); \
- _t6 = _mm256_loadu_pd(ptrB+4); \
- _t7 = _mm256_loadu_pd(ptrC+4); \
- _t8 = _mm256_loadu_pd(ptrD+4); \
- _tA = _mm_load_sd(ptrA+8); \
- _tB = _mm_load_sd(ptrB+8); \
- _tC = _mm_load_sd(ptrC+8); \
- _tD = _mm_load_sd(ptrD+8); \
- _t9 = _mm256_unpacklo_pd(_x1, _y1); \
- _x1 = _mm256_unpackhi_pd(_x1, _y1); \
- _y1 = _mm256_unpacklo_pd(_z1, _x2); \
- _z1 = _mm256_unpackhi_pd(_z1, _x2); \
- _x2 = _mm256_unpacklo_pd(_y2, _z2); \
- _y2 = _mm256_unpackhi_pd(_y2, _z2); \
- _z2 = _mm256_unpacklo_pd(_x3, _y3); \
- _x3 = _mm256_unpackhi_pd(_x3, _y3); \
- _t10 = gmx_mm256_unpack128lo_pd(_t9, _y1); \
- _y3 = gmx_mm256_unpack128hi_pd(_t9, _y1); \
- _t9 = gmx_mm256_unpack128lo_pd(_x1, _z1); \
- _y1 = gmx_mm256_unpack128hi_pd(_x1, _z1); \
- _x1 = gmx_mm256_unpack128lo_pd(_x2, _z2); \
- _z1 = gmx_mm256_unpack128hi_pd(_x2, _z2); \
- _x2 = gmx_mm256_unpack128lo_pd(_y2, _x3); \
- _z2 = gmx_mm256_unpack128hi_pd(_y2, _x3); \
- _t1 = _mm256_sub_pd(_t1, _t10); \
- _t2 = _mm256_sub_pd(_t2, _t9); \
- _t3 = _mm256_sub_pd(_t3, _y3); \
- _t4 = _mm256_sub_pd(_t4, _y1); \
- _t5 = _mm256_sub_pd(_t5, _x1); \
- _t6 = _mm256_sub_pd(_t6, _x2); \
- _t7 = _mm256_sub_pd(_t7, _z1); \
- _t8 = _mm256_sub_pd(_t8, _z2); \
- _tA = _mm_sub_sd(_tA, _mm256_castpd256_pd128(_z3)); \
- _tB = _mm_sub_sd(_tB, _mm_permute_pd(_mm256_castpd256_pd128(_z3), _GMX_MM_PERMUTE128D(1, 1))); \
- _tE = _mm256_extractf128_pd(_z3, 0x1); \
- _tC = _mm_sub_sd(_tC, _tE); \
- _tD = _mm_sub_sd(_tD, _mm_permute_pd(_tE, _GMX_MM_PERMUTE128D(1, 1))); \
- _mm256_storeu_pd(ptrA, _t1); \
- _mm256_storeu_pd(ptrB, _t2); \
- _mm256_storeu_pd(ptrC, _t3); \
- _mm256_storeu_pd(ptrD, _t4); \
- _mm256_storeu_pd(ptrA+4, _t5); \
- _mm256_storeu_pd(ptrB+4, _t6); \
- _mm256_storeu_pd(ptrC+4, _t7); \
- _mm256_storeu_pd(ptrD+4, _t8); \
- _mm_store_sd(ptrA+8, _tA); \
- _mm_store_sd(ptrB+8, _tB); \
- _mm_store_sd(ptrC+8, _tC); \
- _mm_store_sd(ptrD+8, _tD); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
double * gmx_restrict ptrC, double * gmx_restrict ptrD,
__m256d x1, __m256d y1, __m256d z1,
_mm_store_sd(ptrC+8, tC);
_mm_store_sd(ptrD+8, tD);
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m256d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12, _t13, _t14; \
- __m128d _tA, _tB, _tC, _tD, _tE; \
- _t1 = _mm256_loadu_pd(ptrA); \
- _t2 = _mm256_loadu_pd(ptrB); \
- _t3 = _mm256_loadu_pd(ptrC); \
- _t4 = _mm256_loadu_pd(ptrD); \
- _t5 = _mm256_loadu_pd(ptrA+4); \
- _t6 = _mm256_loadu_pd(ptrB+4); \
- _t7 = _mm256_loadu_pd(ptrC+4); \
- _t8 = _mm256_loadu_pd(ptrD+4); \
- _t9 = _mm256_loadu_pd(ptrA+8); \
- _t10 = _mm256_loadu_pd(ptrB+8); \
- _t11 = _mm256_loadu_pd(ptrC+8); \
- _t12 = _mm256_loadu_pd(ptrD+8); \
- _t13 = _mm256_unpacklo_pd(_x1, _y1); \
- _x1 = _mm256_unpackhi_pd(_x1, _y1); \
- _y1 = _mm256_unpacklo_pd(_z1, _x2); \
- _z1 = _mm256_unpackhi_pd(_z1, _x2); \
- _x2 = _mm256_unpacklo_pd(_y2, _z2); \
- _y2 = _mm256_unpackhi_pd(_y2, _z2); \
- _z2 = _mm256_unpacklo_pd(_x3, _y3); \
- _x3 = _mm256_unpackhi_pd(_x3, _y3); \
- _y3 = _mm256_unpacklo_pd(_z3, _x4); \
- _z3 = _mm256_unpackhi_pd(_z3, _x4); \
- _x4 = _mm256_unpacklo_pd(_y4, _z4); \
- _y4 = _mm256_unpackhi_pd(_y4, _z4); \
- _z4 = gmx_mm256_unpack128lo_pd(_t13, _y1); \
- _t13 = gmx_mm256_unpack128hi_pd(_t13, _y1); \
- _y1 = gmx_mm256_unpack128lo_pd(_x1, _z1); \
- _x1 = gmx_mm256_unpack128hi_pd(_x1, _z1); \
- _z1 = gmx_mm256_unpack128lo_pd(_x2, _z2); \
- _x2 = gmx_mm256_unpack128hi_pd(_x2, _z2); \
- _z2 = gmx_mm256_unpack128lo_pd(_y2, _x3); \
- _y2 = gmx_mm256_unpack128hi_pd(_y2, _x3); \
- _x3 = gmx_mm256_unpack128lo_pd(_y3, _x4); \
- _y3 = gmx_mm256_unpack128hi_pd(_y3, _x4); \
- _x4 = gmx_mm256_unpack128lo_pd(_z3, _y4); \
- _z3 = gmx_mm256_unpack128hi_pd(_z3, _y4); \
- _t1 = _mm256_sub_pd(_t1, _z4); \
- _t2 = _mm256_sub_pd(_t2, _y1); \
- _t3 = _mm256_sub_pd(_t3, _t13); \
- _t4 = _mm256_sub_pd(_t4, _x1); \
- _t5 = _mm256_sub_pd(_t5, _z1); \
- _t6 = _mm256_sub_pd(_t6, _z2); \
- _t7 = _mm256_sub_pd(_t7, _x2); \
- _t8 = _mm256_sub_pd(_t8, _y2); \
- _t9 = _mm256_sub_pd(_t9, _x3); \
- _t10 = _mm256_sub_pd(_t10, _x4); \
- _t11 = _mm256_sub_pd(_t11, _y3); \
- _t12 = _mm256_sub_pd(_t12, _z3); \
- _mm256_storeu_pd(ptrA, _t1); \
- _mm256_storeu_pd(ptrB, _t2); \
- _mm256_storeu_pd(ptrC, _t3); \
- _mm256_storeu_pd(ptrD, _t4); \
- _mm256_storeu_pd(ptrA+4, _t5); \
- _mm256_storeu_pd(ptrB+4, _t6); \
- _mm256_storeu_pd(ptrC+4, _t7); \
- _mm256_storeu_pd(ptrD+4, _t8); \
- _mm256_storeu_pd(ptrA+8, _t9); \
- _mm256_storeu_pd(ptrB+8, _t10); \
- _mm256_storeu_pd(ptrC+8, _t11); \
- _mm256_storeu_pd(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
double * gmx_restrict ptrC, double * gmx_restrict ptrD,
__m256d x1, __m256d y1, __m256d z1,
_mm256_storeu_pd(ptrC+8, t11);
_mm256_storeu_pd(ptrD+8, t12);
}
-#endif
-
-
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_1atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
double * gmx_restrict fptr,
double * gmx_restrict fshiftptr)
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m256d _t1, _t2, _t3, _t4; \
- __m128d _tz3, _tA, _tB, _tC, _tD; \
- fix1 = _mm256_hadd_pd(fix1, fiy1); \
- fiz1 = _mm256_hadd_pd(fiz1, fix2); \
- fiy2 = _mm256_hadd_pd(fiy2, fiz2); \
- fix3 = _mm256_hadd_pd(fix3, fiy3); \
- fiz3 = _mm256_hadd_pd(fiz3, _mm256_setzero_pd()); \
- _t1 = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
- _t2 = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
- _t1 = _mm256_add_pd(_t1, _t2); \
- _t3 = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
- _t4 = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
- _t3 = _mm256_add_pd(_t3, _t4); \
- _tz3 = _mm_add_pd(_mm256_castpd256_pd128(fiz3), _mm256_extractf128_pd(fiz3, 0x1)); \
- _t2 = _mm256_loadu_pd(fptr); \
- _t4 = _mm256_loadu_pd(fptr+4); \
- _tA = _mm_load_sd(fptr+8); \
- _t2 = _mm256_add_pd(_t2, _t1); \
- _t4 = _mm256_add_pd(_t4, _t3); \
- _tA = _mm_add_sd(_tA, _tz3); \
- _mm256_storeu_pd(fptr, _t2); \
- _mm256_storeu_pd(fptr+4, _t4); \
- _mm_store_sd(fptr+8, _tA); \
- _tB = _mm256_extractf128_pd(_t1, 0x1); \
- _tC = _mm256_extractf128_pd(_t3, 0x1); \
- _tz3 = _mm_add_sd(_tz3, _tB); \
- _tD = _mm_permute_pd(_mm256_castpd256_pd128(_t3), _GMX_MM_PERMUTE128D(1, 1)); \
- _tz3 = _mm_add_sd(_tz3, _tD); \
- _tC = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t1)); \
- _tD = _mm_shuffle_pd(_tB, _mm256_castpd256_pd128(_t3), _MM_SHUFFLE2(0, 1)); \
- _tC = _mm_add_pd(_tC, _tD); \
- _tA = _mm_loadu_pd(fshiftptr); \
- _tB = _mm_load_sd(fshiftptr+2); \
- _tA = _mm_add_pd(_tA, _tC); \
- _tB = _mm_add_sd(_tB, _tz3); \
- _mm_storeu_pd(fshiftptr, _tA); \
- _mm_store_sd(fshiftptr+2, _tB); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_3atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
__m256d fix2, __m256d fiy2, __m256d fiz2,
__m256d fix3, __m256d fiy3, __m256d fiz3,
_mm_storeu_pd(fshiftptr, tA);
_mm_store_sd(fshiftptr+2, tB);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m256d _t1, _t2, _t3, _t4, _t5, _t6; \
- __m128d _tA, _tB, _tC, _tD; \
- fix1 = _mm256_hadd_pd(fix1, fiy1); \
- fiz1 = _mm256_hadd_pd(fiz1, fix2); \
- fiy2 = _mm256_hadd_pd(fiy2, fiz2); \
- fix3 = _mm256_hadd_pd(fix3, fiy3); \
- fiz3 = _mm256_hadd_pd(fiz3, fix4); \
- fiy4 = _mm256_hadd_pd(fiy4, fiz4); \
- _t1 = gmx_mm256_unpack128lo_pd(fix1, fiz1); \
- _t2 = gmx_mm256_unpack128hi_pd(fix1, fiz1); \
- _t1 = _mm256_add_pd(_t1, _t2); \
- _t3 = gmx_mm256_unpack128lo_pd(fiy2, fix3); \
- _t4 = gmx_mm256_unpack128hi_pd(fiy2, fix3); \
- _t3 = _mm256_add_pd(_t3, _t4); \
- _t5 = gmx_mm256_unpack128lo_pd(fiz3, fiy4); \
- _t6 = gmx_mm256_unpack128hi_pd(fiz3, fiy4); \
- _t5 = _mm256_add_pd(_t5, _t6); \
- _t2 = _mm256_loadu_pd(fptr); \
- _t4 = _mm256_loadu_pd(fptr+4); \
- _t6 = _mm256_loadu_pd(fptr+8); \
- _t2 = _mm256_add_pd(_t2, _t1); \
- _t4 = _mm256_add_pd(_t4, _t3); \
- _t6 = _mm256_add_pd(_t6, _t5); \
- _mm256_storeu_pd(fptr, _t2); \
- _mm256_storeu_pd(fptr+4, _t4); \
- _mm256_storeu_pd(fptr+8, _t6); \
- _tA = _mm256_extractf128_pd(_t1, 0x1); \
- _tB = _mm256_extractf128_pd(_t3, 0x1); \
- _tC = _mm256_extractf128_pd(_t5, 0x1); \
- _tB = _mm_add_pd(_tB, _mm256_castpd256_pd128(_t1)); \
- _tA = _mm_add_pd(_tA, _mm256_castpd256_pd128(_t5)); \
- _tC = _mm_add_pd(_tC, _mm256_castpd256_pd128(_t3)); \
- _tD = _mm_shuffle_pd(_tA, _tC, _MM_SHUFFLE2(0, 1)); \
- _tB = _mm_add_pd(_tB, _tD); \
- _tC = _mm_permute_pd(_tC, _GMX_MM_PERMUTE128D(1, 1)); \
- _tC = _mm_add_sd(_tC, _tA); \
- _tA = _mm_loadu_pd(fshiftptr); \
- _tD = _mm_load_sd(fshiftptr+2); \
- _tA = _mm_add_pd(_tA, _tB); \
- _tD = _mm_add_sd(_tD, _tC); \
- _mm_storeu_pd(fshiftptr, _tA); \
- _mm_store_sd(fshiftptr+2, _tD); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_4atom_swizzle_pd(__m256d fix1, __m256d fiy1, __m256d fiz1,
__m256d fix2, __m256d fiy2, __m256d fiz2,
__m256d fix3, __m256d fiy3, __m256d fiz3,
_mm_storeu_pd(fshiftptr, tA);
_mm_store_sd(fshiftptr+2, tD);
}
-#endif
-
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_1pot_pd(__m256d pot1, double * gmx_restrict ptrA)
{
__m128d t1;
_mm_store_sd(ptrA, _mm_add_sd(_mm_load_sd(ptrA), t1));
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_2pot_pd(__m256d pot1, double * gmx_restrict ptrA,
__m256d pot2, double * gmx_restrict ptrB)
{
#define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_unpack128lo_ps(__m256 xmm1, __m256 xmm2)
{
return _mm256_permute2f128_ps(xmm1, xmm2, 0x20);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_unpack128hi_ps(__m256 xmm1, __m256 xmm2)
{
return _mm256_permute2f128_ps(xmm1, xmm2, 0x31);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_set_m128(__m128 hi, __m128 lo)
{
return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 0x1);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_calc_rsq_ps(__m256 dx, __m256 dy, __m256 dz)
{
return _mm256_add_ps( _mm256_add_ps( _mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy) ), _mm256_mul_ps(dz, dz) );
#define gmx_mm256_sum4_ps(t0, t1, t2, t3) _mm256_add_ps(_mm256_add_ps(t0, t1), _mm256_add_ps(t2, t3))
-static gmx_inline int
+static gmx_inline int gmx_simdcall
gmx_mm256_any_lt(__m256 a, __m256 b)
{
return _mm256_movemask_ps(_mm256_cmp_ps(a, b, _CMP_LT_OQ));
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_load_4real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD)
{
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_mm256_load_8real_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_store_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD, __m256 xmm1)
{
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_store_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
float * gmx_restrict ptrE, float * gmx_restrict ptrF,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_increment_4real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m256 xmm1)
_mm_store_ss(ptrD, t4);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_increment_8real_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
float * gmx_restrict ptrE, float * gmx_restrict ptrF,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
const float * gmx_restrict p3, const float * gmx_restrict p4,
__m256 * gmx_restrict c6, __m256 * gmx_restrict c12)
*c12 = _mm256_castps128_ps256(_mm_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_8pair_swizzle_ps(const float * gmx_restrict p1, const float * gmx_restrict p2,
const float * gmx_restrict p3, const float * gmx_restrict p4,
const float * gmx_restrict p5, const float * gmx_restrict p6,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m256 * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
__m256 * gmx_restrict x1, __m256 * gmx_restrict y1, __m256 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_3rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_load_4rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
const float * gmx_restrict ptrE, const float * gmx_restrict ptrF,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m256 x1, __m256 y1, __m256 z1)
gmx_mm_maskstore_ps(ptrD, mask, t8);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- x1, y1, z1, x2, y2, z2, x3, y3, z3) \
- { \
- __m256 _t1, _t2, _t3, _t4, _t5, _t6; \
- __m128 _tA, _tB, _tC, _tD; \
-\
- _t1 = _mm256_loadu_ps(ptrA); \
- _t2 = _mm256_loadu_ps(ptrB); \
- _t3 = _mm256_loadu_ps(ptrC); \
- _t4 = _mm256_loadu_ps(ptrD); \
- _tA = _mm_load_ss(ptrA+8); \
- _tB = _mm_load_ss(ptrB+8); \
- _tC = _mm_load_ss(ptrC+8); \
- _tD = _mm_load_ss(ptrD+8); \
- _t5 = _mm256_unpacklo_ps(x1, y1); \
- x1 = _mm256_unpackhi_ps(x1, y1); \
- y1 = _mm256_unpacklo_ps(z1, x2); \
- z1 = _mm256_unpackhi_ps(z1, x2); \
- x2 = _mm256_unpacklo_ps(y2, z2); \
- y2 = _mm256_unpackhi_ps(y2, z2); \
- _t6 = _mm256_unpacklo_ps(x3, y3); \
- x3 = _mm256_unpackhi_ps(x3, y3); \
- _t5 = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
- x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
- y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(_t6), 0x1); \
- z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
- z2 = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t5 = _mm256_shuffle_ps(_t5, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
- y1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
- x1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t1 = _mm256_sub_ps(_t1, z2); \
- _t2 = _mm256_sub_ps(_t2, _t5); \
- _t3 = _mm256_sub_ps(_t3, y1); \
- _t4 = _mm256_sub_ps(_t4, x1); \
- _tA = _mm_sub_ss(_tA, _mm256_castps256_ps128(z3)); \
- _tB = _mm_sub_ss(_tB, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(1, 1, 1, 1))); \
- _tC = _mm_sub_ss(_tC, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(2, 2, 2, 2))); \
- _tD = _mm_sub_ss(_tD, _mm_permute_ps(_mm256_castps256_ps128(z3), _MM_SHUFFLE(3, 3, 3, 3))); \
- _mm256_storeu_ps(ptrA, _t1); \
- _mm256_storeu_ps(ptrB, _t2); \
- _mm256_storeu_ps(ptrC, _t3); \
- _mm256_storeu_ps(ptrD, _t4); \
- _mm_store_ss(ptrA+8, _tA); \
- _mm_store_ss(ptrB+8, _tB); \
- _mm_store_ss(ptrC+8, _tC); \
- _mm_store_ss(ptrD+8, _tD); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m256 x1, __m256 y1, __m256 z1,
_mm_store_ss(ptrC+8, tC);
_mm_store_ss(ptrD+8, tD);
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- x1, y1, z1, x2, y2, z2, x3, y3, z3, x4, y4, z4) \
- { \
- __m256 _t1, _t2, _t3, _t4, _t5; \
- __m128 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH; \
-\
- _t1 = _mm256_loadu_ps(ptrA); \
- _t2 = _mm256_loadu_ps(ptrB); \
- _t3 = _mm256_loadu_ps(ptrC); \
- _t4 = _mm256_loadu_ps(ptrD); \
- _tA = _mm_loadu_ps(ptrA+8); \
- _tB = _mm_loadu_ps(ptrB+8); \
- _tC = _mm_loadu_ps(ptrC+8); \
- _tD = _mm_loadu_ps(ptrD+8); \
- _t5 = _mm256_unpacklo_ps(x1, y1); \
- x1 = _mm256_unpackhi_ps(x1, y1); \
- y1 = _mm256_unpacklo_ps(z1, x2); \
- z1 = _mm256_unpackhi_ps(z1, x2); \
- x2 = _mm256_unpacklo_ps(y2, z2); \
- y2 = _mm256_unpackhi_ps(y2, z2); \
- z2 = _mm256_unpacklo_ps(x3, y3); \
- x3 = _mm256_unpackhi_ps(x3, y3); \
- y3 = _mm256_unpacklo_ps(z3, x4); \
- z3 = _mm256_unpackhi_ps(z3, x4); \
- x4 = _mm256_unpacklo_ps(y4, z4); \
- y4 = _mm256_unpackhi_ps(y4, z4); \
- x2 = _mm256_insertf128_ps(_t5, _mm256_castps256_ps128(x2), 0x1); \
- x1 = _mm256_insertf128_ps(x1, _mm256_castps256_ps128(y2), 0x1); \
- y1 = _mm256_insertf128_ps(y1, _mm256_castps256_ps128(z2), 0x1); \
- z1 = _mm256_insertf128_ps(z1, _mm256_castps256_ps128(x3), 0x1); \
- z2 = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t5 = _mm256_shuffle_ps(x2, y1, _MM_SHUFFLE(3, 2, 3, 2)); \
- y1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(1, 0, 1, 0)); \
- x1 = _mm256_shuffle_ps(x1, z1, _MM_SHUFFLE(3, 2, 3, 2)); \
- _tE = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(1, 0, 1, 0)); \
- _tF = _mm_shuffle_ps(_mm256_castps256_ps128(y3), _mm256_castps256_ps128(x4), _MM_SHUFFLE(3, 2, 3, 2)); \
- _tG = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(1, 0, 1, 0)); \
- _tH = _mm_shuffle_ps(_mm256_castps256_ps128(z3), _mm256_castps256_ps128(y4), _MM_SHUFFLE(3, 2, 3, 2)); \
- _t1 = _mm256_sub_ps(_t1, z2); \
- _t2 = _mm256_sub_ps(_t2, _t5); \
- _t3 = _mm256_sub_ps(_t3, y1); \
- _t4 = _mm256_sub_ps(_t4, x1); \
- _tA = _mm_sub_ps(_tA, _tE); \
- _tB = _mm_sub_ps(_tB, _tF); \
- _tC = _mm_sub_ps(_tC, _tG); \
- _tD = _mm_sub_ps(_tD, _tH); \
- _mm256_storeu_ps(ptrA, _t1); \
- _mm256_storeu_ps(ptrB, _t2); \
- _mm256_storeu_ps(ptrC, _t3); \
- _mm256_storeu_ps(ptrD, _t4); \
- _mm_storeu_ps(ptrA+8, _tA); \
- _mm_storeu_ps(ptrB+8, _tB); \
- _mm_storeu_ps(ptrC+8, _tC); \
- _mm_storeu_ps(ptrD+8, _tD); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m256 x1, __m256 y1, __m256 z1,
_mm_storeu_ps(ptrC+8, tC);
_mm_storeu_ps(ptrD+8, tD);
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
float * gmx_restrict ptrE, float * gmx_restrict ptrF,
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
-\
- _tA = _mm256_loadu_ps(ptrA); \
- _tB = _mm256_loadu_ps(ptrB); \
- _tC = _mm256_loadu_ps(ptrC); \
- _tD = _mm256_loadu_ps(ptrD); \
- _tE = _mm256_loadu_ps(ptrE); \
- _tF = _mm256_loadu_ps(ptrF); \
- _tG = _mm256_loadu_ps(ptrG); \
- _tH = _mm256_loadu_ps(ptrH); \
- _t1 = _mm256_unpacklo_ps(_x1, _y1); \
- _t2 = _mm256_unpackhi_ps(_x1, _y1); \
- _t3 = _mm256_unpacklo_ps(_z1, _x2); \
- _t4 = _mm256_unpackhi_ps(_z1, _x2); \
- _t5 = _mm256_unpacklo_ps(_y2, _z2); \
- _t6 = _mm256_unpackhi_ps(_y2, _z2); \
- _t7 = _mm256_unpacklo_ps(_x3, _y3); \
- _t8 = _mm256_unpackhi_ps(_x3, _y3); \
- _t9 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t10 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t11 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t12 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t1 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t2 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t3 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t4 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t5 = gmx_mm256_unpack128lo_ps(_t9, _t1); \
- _t6 = gmx_mm256_unpack128hi_ps(_t9, _t1); \
- _t7 = gmx_mm256_unpack128lo_ps(_t10, _t2); \
- _t8 = gmx_mm256_unpack128hi_ps(_t10, _t2); \
- _t1 = gmx_mm256_unpack128lo_ps(_t11, _t3); \
- _t2 = gmx_mm256_unpack128hi_ps(_t11, _t3); \
- _t9 = gmx_mm256_unpack128lo_ps(_t12, _t4); \
- _t10 = gmx_mm256_unpack128hi_ps(_t12, _t4); \
- _tA = _mm256_sub_ps(_tA, _t5); \
- _tB = _mm256_sub_ps(_tB, _t7); \
- _tC = _mm256_sub_ps(_tC, _t1); \
- _tD = _mm256_sub_ps(_tD, _t9); \
- _tE = _mm256_sub_ps(_tE, _t6); \
- _tF = _mm256_sub_ps(_tF, _t8); \
- _tG = _mm256_sub_ps(_tG, _t2); \
- _tH = _mm256_sub_ps(_tH, _t10); \
- _mm256_storeu_ps(ptrA, _tA); \
- _mm256_storeu_ps(ptrB, _tB); \
- _mm256_storeu_ps(ptrC, _tC); \
- _mm256_storeu_ps(ptrD, _tD); \
- _mm256_storeu_ps(ptrE, _tE); \
- _mm256_storeu_ps(ptrF, _tF); \
- _mm256_storeu_ps(ptrG, _tG); \
- _mm256_storeu_ps(ptrH, _tH); \
- _tI = gmx_mm256_set_m128(_mm_load_ss(ptrE+8), _mm_load_ss(ptrA+8)); \
- _tJ = gmx_mm256_set_m128(_mm_load_ss(ptrF+8), _mm_load_ss(ptrB+8)); \
- _tK = gmx_mm256_set_m128(_mm_load_ss(ptrG+8), _mm_load_ss(ptrC+8)); \
- _tL = gmx_mm256_set_m128(_mm_load_ss(ptrH+8), _mm_load_ss(ptrD+8)); \
- _tI = _mm256_unpacklo_ps(_tI, _tK); \
- _tJ = _mm256_unpacklo_ps(_tJ, _tL); \
- _tI = _mm256_unpacklo_ps(_tI, _tJ); \
- _tI = _mm256_sub_ps(_tI, _z3); \
- _tJ = _mm256_permute_ps(_tI, _MM_SHUFFLE(1, 1, 1, 1)); \
- _tK = _mm256_permute_ps(_tI, _MM_SHUFFLE(2, 2, 2, 2)); \
- _tL = _mm256_permute_ps(_tI, _MM_SHUFFLE(3, 3, 3, 3)); \
- _mm_store_ss(ptrA+8, _mm256_castps256_ps128(_tI)); \
- _mm_store_ss(ptrB+8, _mm256_castps256_ps128(_tJ)); \
- _mm_store_ss(ptrC+8, _mm256_castps256_ps128(_tK)); \
- _mm_store_ss(ptrD+8, _mm256_castps256_ps128(_tL)); \
- _mm_store_ss(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
- _mm_store_ss(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
- _mm_store_ss(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
- _mm_store_ss(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
float * gmx_restrict ptrE, float * gmx_restrict ptrF,
_mm_store_ss(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
_mm_store_ss(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, ptrE, ptrF, ptrG, ptrH, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m256 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m256 _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
-\
- _tA = _mm256_loadu_ps(ptrA); \
- _tB = _mm256_loadu_ps(ptrB); \
- _tC = _mm256_loadu_ps(ptrC); \
- _tD = _mm256_loadu_ps(ptrD); \
- _tE = _mm256_loadu_ps(ptrE); \
- _tF = _mm256_loadu_ps(ptrF); \
- _tG = _mm256_loadu_ps(ptrG); \
- _tH = _mm256_loadu_ps(ptrH); \
- _t1 = _mm256_unpacklo_ps(_x1, _y1); \
- _t2 = _mm256_unpackhi_ps(_x1, _y1); \
- _t3 = _mm256_unpacklo_ps(_z1, _x2); \
- _t4 = _mm256_unpackhi_ps(_z1, _x2); \
- _t5 = _mm256_unpacklo_ps(_y2, _z2); \
- _t6 = _mm256_unpackhi_ps(_y2, _z2); \
- _t7 = _mm256_unpacklo_ps(_x3, _y3); \
- _t8 = _mm256_unpackhi_ps(_x3, _y3); \
- _t9 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t10 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t11 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t12 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t1 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t2 = _mm256_shuffle_ps(_t5, _t7, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t3 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t4 = _mm256_shuffle_ps(_t6, _t8, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t5 = gmx_mm256_unpack128lo_ps(_t9, _t1); \
- _t6 = gmx_mm256_unpack128hi_ps(_t9, _t1); \
- _t7 = gmx_mm256_unpack128lo_ps(_t10, _t2); \
- _t8 = gmx_mm256_unpack128hi_ps(_t10, _t2); \
- _t1 = gmx_mm256_unpack128lo_ps(_t11, _t3); \
- _t2 = gmx_mm256_unpack128hi_ps(_t11, _t3); \
- _t9 = gmx_mm256_unpack128lo_ps(_t12, _t4); \
- _t10 = gmx_mm256_unpack128hi_ps(_t12, _t4); \
- _tA = _mm256_sub_ps(_tA, _t5); \
- _tB = _mm256_sub_ps(_tB, _t7); \
- _tC = _mm256_sub_ps(_tC, _t1); \
- _tD = _mm256_sub_ps(_tD, _t9); \
- _tE = _mm256_sub_ps(_tE, _t6); \
- _tF = _mm256_sub_ps(_tF, _t8); \
- _tG = _mm256_sub_ps(_tG, _t2); \
- _tH = _mm256_sub_ps(_tH, _t10); \
- _mm256_storeu_ps(ptrA, _tA); \
- _mm256_storeu_ps(ptrB, _tB); \
- _mm256_storeu_ps(ptrC, _tC); \
- _mm256_storeu_ps(ptrD, _tD); \
- _mm256_storeu_ps(ptrE, _tE); \
- _mm256_storeu_ps(ptrF, _tF); \
- _mm256_storeu_ps(ptrG, _tG); \
- _mm256_storeu_ps(ptrH, _tH); \
- _tI = gmx_mm256_set_m128(_mm_loadu_ps(ptrE+8), _mm_loadu_ps(ptrA+8)); \
- _tJ = gmx_mm256_set_m128(_mm_loadu_ps(ptrF+8), _mm_loadu_ps(ptrB+8)); \
- _tK = gmx_mm256_set_m128(_mm_loadu_ps(ptrG+8), _mm_loadu_ps(ptrC+8)); \
- _tL = gmx_mm256_set_m128(_mm_loadu_ps(ptrH+8), _mm_loadu_ps(ptrD+8)); \
- _t1 = _mm256_unpacklo_ps(_z3, _x4); \
- _t2 = _mm256_unpackhi_ps(_z3, _x4); \
- _t3 = _mm256_unpacklo_ps(_y4, _z4); \
- _t4 = _mm256_unpackhi_ps(_y4, _z4); \
- _t5 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t6 = _mm256_shuffle_ps(_t1, _t3, _MM_SHUFFLE(3, 2, 3, 2)); \
- _t7 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(1, 0, 1, 0)); \
- _t8 = _mm256_shuffle_ps(_t2, _t4, _MM_SHUFFLE(3, 2, 3, 2)); \
- _tI = _mm256_sub_ps(_tI, _t5); \
- _tJ = _mm256_sub_ps(_tJ, _t6); \
- _tK = _mm256_sub_ps(_tK, _t7); \
- _tL = _mm256_sub_ps(_tL, _t8); \
- _mm_storeu_ps(ptrA+8, _mm256_castps256_ps128(_tI)); \
- _mm_storeu_ps(ptrB+8, _mm256_castps256_ps128(_tJ)); \
- _mm_storeu_ps(ptrC+8, _mm256_castps256_ps128(_tK)); \
- _mm_storeu_ps(ptrD+8, _mm256_castps256_ps128(_tL)); \
- _mm_storeu_ps(ptrE+8, _mm256_extractf128_ps(_tI, 0x1)); \
- _mm_storeu_ps(ptrF+8, _mm256_extractf128_ps(_tJ, 0x1)); \
- _mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(_tK, 0x1)); \
- _mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(_tL, 0x1)); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
float * gmx_restrict ptrE, float * gmx_restrict ptrF,
_mm_storeu_ps(ptrG+8, _mm256_extractf128_ps(tK, 0x1));
_mm_storeu_ps(ptrH+8, _mm256_extractf128_ps(tL, 0x1));
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_1atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
_mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m256 _t1, _t2, _t3; \
- __m128 _tA, _tB, _tC; \
-\
- fix1 = _mm256_hadd_ps(fix1, fiy1); \
- fiz1 = _mm256_hadd_ps(fiz1, fix2); \
- fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
- fix3 = _mm256_hadd_ps(fix3, fiy3); \
- fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
- fix1 = _mm256_hadd_ps(fix1, fiz1); \
- fiy2 = _mm256_hadd_ps(fiy2, fix3); \
- fiz3 = _mm256_hadd_ps(fiz3, _mm256_setzero_ps()); \
-\
- _t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
- _t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
- _t1 = _mm256_add_ps(_t1, _t2); \
- _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
- _t3 = _mm256_loadu_ps(fptr); \
- _t3 = _mm256_add_ps(_t3, _t1); \
- _mm256_storeu_ps(fptr, _t3); \
- _tB = _mm_load_ss(fptr+8); \
- _tB = _mm_add_ss(_tB, _tA); \
- _mm_store_ss(fptr+8, _tB); \
-\
- _tB = _mm256_extractf128_ps(_t1, 0x1); \
- _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
- _tB = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
- _tC = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
- _tB = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
- _tA = _mm_add_ps(_tB, _tC); \
- _tA = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
- _tC = _mm_loadu_ps(fshiftptr); \
- _tC = _mm_add_ps(_tC, _tA); \
- _mm_storeu_ps(fshiftptr, _tC); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_3atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
__m256 fix2, __m256 fiy2, __m256 fiz2,
__m256 fix3, __m256 fiy3, __m256 fiz3,
tC = _mm_add_ps(tC, tA);
_mm_storeu_ps(fshiftptr, tC);
}
-#endif
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm256_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m256 _t1, _t2, _t3; \
- __m128 _tA, _tB, _tC; \
-\
- fix1 = _mm256_hadd_ps(fix1, fiy1); \
- fiz1 = _mm256_hadd_ps(fiz1, fix2); \
- fiy2 = _mm256_hadd_ps(fiy2, fiz2); \
- fix3 = _mm256_hadd_ps(fix3, fiy3); \
- fiz3 = _mm256_hadd_ps(fiz3, fix4); \
- fiy4 = _mm256_hadd_ps(fiy4, fiz4); \
-\
- fix1 = _mm256_hadd_ps(fix1, fiz1); \
- fiy2 = _mm256_hadd_ps(fiy2, fix3); \
- fiz3 = _mm256_hadd_ps(fiz3, fiy4); \
-\
- _t1 = gmx_mm256_unpack128lo_ps(fix1, fiy2); \
- _t2 = gmx_mm256_unpack128hi_ps(fix1, fiy2); \
- _t1 = _mm256_add_ps(_t1, _t2); \
- _tA = _mm_add_ps(_mm256_castps256_ps128(fiz3), _mm256_extractf128_ps(fiz3, 0x1)); \
- _t3 = _mm256_loadu_ps(fptr); \
- _t3 = _mm256_add_ps(_t3, _t1); \
- _mm256_storeu_ps(fptr, _t3); \
- _tB = _mm_loadu_ps(fptr+8); \
- _tB = _mm_add_ps(_tB, _tA); \
- _mm_storeu_ps(fptr+8, _tB); \
-\
- _tB = _mm256_extractf128_ps(_t1, 0x1); \
- _tC = _mm_shuffle_ps(_mm256_castps256_ps128(_t1), _tB, _MM_SHUFFLE(1, 0, 3, 3)); \
- _tB = _mm_shuffle_ps(_tB, _tA, _MM_SHUFFLE(1, 0, 3, 2)); \
- _tC = _mm_permute_ps(_tC, _MM_SHUFFLE(3, 3, 2, 0)); \
- _tA = _mm_permute_ps(_tA, _MM_SHUFFLE(0, 3, 2, 1)); \
- _tB = _mm_add_ps(_tB, _mm256_castps256_ps128(_t1)); \
- _tA = _mm_add_ps(_tA, _tC); \
- _tA = _mm_add_ps(_tA, _tB); \
- _tA = _mm_blend_ps(_mm_setzero_ps(), _tA, 0x7); \
- _tC = _mm_loadu_ps(fshiftptr); \
- _tC = _mm_add_ps(_tC, _tA); \
- _mm_storeu_ps(fshiftptr, _tC); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_iforce_4atom_swizzle_ps(__m256 fix1, __m256 fiy1, __m256 fiz1,
__m256 fix2, __m256 fiy2, __m256 fiz2,
__m256 fix3, __m256 fiy3, __m256 fiz3,
tC = _mm_add_ps(tC, tA);
_mm_storeu_ps(fshiftptr, tC);
}
-#endif
-
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_1pot_ps(__m256 pot1, float * gmx_restrict ptrA)
{
__m128 t1;
_mm_store_ss(ptrA, _mm_add_ss(_mm_load_ss(ptrA), t1));
}
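
In scalar terms, gmx_mm256_update_1pot_ps is a horizontal reduction: it sums all eight lanes of pot1 and accumulates the single total into *ptrA. An equivalent scalar sketch (illustrative only; the helper name is hypothetical):

    /* Scalar picture of gmx_mm256_update_1pot_ps (illustrative):
     * sum the eight float lanes, then accumulate into memory. */
    static void update_1pot_scalar(const float pot1[8], float *ptrA)
    {
        float sum = 0.0f;
        for (int i = 0; i < 8; i++)
        {
            sum += pot1[i];
        }
        *ptrA += sum;
    }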
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm256_update_2pot_ps(__m256 pot1, float * gmx_restrict ptrA,
__m256 pot2, float * gmx_restrict ptrB)
{
row1 = _mm_unpackhi_pd(__gmx_t1, row1); \
}
-static int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
/* Load a double value from 1-2 places, merge into xmm register */
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB)
{
return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
return _mm_load_sd(ptrA);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB,
__m128d xmm1)
_mm_store_sd(ptrB, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
_mm_store_sd(ptrA, xmm1);
/* Similar to store, but increments value in memory */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB, __m128d xmm1)
{
_mm_store_sd(ptrB, t1);
}
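
The increment routines read-modify-write: gmx_mm_increment_2real_swizzle_pd adds the low element of xmm1 into *ptrA and the high element into *ptrB. A scalar sketch (illustrative, not part of the patch; the helper name is hypothetical):

    /* Scalar equivalent of gmx_mm_increment_2real_swizzle_pd (illustrative);
     * xmm1[0] is the low element, xmm1[1] the high element. */
    static void increment_2real_scalar(double *ptrA, double *ptrB,
                                       const double xmm1[2])
    {
        *ptrA += xmm1[0];   /* low element accumulates into ptrA */
        *ptrB += xmm1[1];   /* high element accumulates into ptrB */
    }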
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
__m128d tmp;
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
const double * gmx_restrict p2,
__m128d * gmx_restrict c6,
*c12 = _mm_unpackhi_pd(t1, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict c6,
__m128d * gmx_restrict c12)
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
*z = _mm_load_sd(p1+2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
*z3 = _mm_load_sd(p1+8);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
/* Routines to decrement rvec in memory, typically used for j particle force updates */

-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
__m128d xy, __m128d z)
{
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1)
{
}
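
Here an rvec is three consecutive reals (x, y, z), so decrementing one rvec at one pointer amounts to the following scalar picture (illustrative sketch, not part of the patch; the helper name is hypothetical):

    /* Scalar picture of gmx_mm_decrement_1rvec_1ptr_swizzle_pd: subtract the
     * accumulated j-particle force components from one rvec in memory.
     * x, y, z stand for element 0 of the x1, y1, z1 registers. */
    static void decrement_1rvec_scalar(double *ptrA,
                                       double x, double y, double z)
    {
        ptrA[0] -= x;
        ptrA[1] -= y;
        ptrA[2] -= z;
    }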
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _t1 = _mm_sub_pd(_t1, _x1); \
- _t2 = _mm_sub_pd(_t2, _z1); \
- _t3 = _mm_sub_pd(_t3, _y2); \
- _t4 = _mm_sub_pd(_t4, _x3); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+6, t4);
_mm_store_sd(ptrA+8, t5);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _z3 = _mm_unpacklo_pd(_z3, _x4); \
- _y4 = _mm_unpacklo_pd(_y4, _z4); \
- _mm_storeu_pd(ptrA, _mm_sub_pd( _t1, _x1 )); \
- _mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2, _z1 )); \
- _mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3, _y2 )); \
- _mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4, _x3 )); \
- _mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5, _z3 )); \
- _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5, z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1)
{
_mm_store_sd(ptrB+2, t4);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrB); \
- _t7 = _mm_loadu_pd(ptrB+2); \
- _t8 = _mm_loadu_pd(ptrB+4); \
- _t9 = _mm_loadu_pd(ptrB+6); \
- _t10 = _mm_load_sd(ptrB+8); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpackhi_pd(_z3, _z3); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _t6 = _mm_sub_pd(_t6, _tB); \
- _t7 = _mm_sub_pd(_t7, _tD); \
- _t8 = _mm_sub_pd(_t8, _tF); \
- _t9 = _mm_sub_pd(_t9, _tH); \
- _t10 = _mm_sub_sd(_t10, _tI); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrB, _t6); \
- _mm_storeu_pd(ptrB+2, _t7); \
- _mm_storeu_pd(ptrB+4, _t8); \
- _mm_storeu_pd(ptrB+6, _t9); \
- _mm_store_sd(ptrB+8, _t10); \
- }
-#else
+
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+6, t9);
_mm_store_sd(ptrB+8, t10);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _t7 = _mm_loadu_pd(ptrB); \
- _t8 = _mm_loadu_pd(ptrB+2); \
- _t9 = _mm_loadu_pd(ptrB+4); \
- _t10 = _mm_loadu_pd(ptrB+6); \
- _t11 = _mm_loadu_pd(ptrB+8); \
- _t12 = _mm_loadu_pd(ptrB+10); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpacklo_pd(_z3, _x4); \
- _tJ = _mm_unpackhi_pd(_z3, _x4); \
- _tK = _mm_unpacklo_pd(_y4, _z4); \
- _tL = _mm_unpackhi_pd(_y4, _z4); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_pd(_t5, _tI); \
- _t6 = _mm_sub_pd(_t6, _tK); \
- _t7 = _mm_sub_pd(_t7, _tB); \
- _t8 = _mm_sub_pd(_t8, _tD); \
- _t9 = _mm_sub_pd(_t9, _tF); \
- _t10 = _mm_sub_pd(_t10, _tH); \
- _t11 = _mm_sub_pd(_t11, _tJ); \
- _t12 = _mm_sub_pd(_t12, _tL); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_storeu_pd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrA+10, _t6); \
- _mm_storeu_pd(ptrB, _t7); \
- _mm_storeu_pd(ptrB+2, _t8); \
- _mm_storeu_pd(ptrB+4, _t9); \
- _mm_storeu_pd(ptrB+6, _t10); \
- _mm_storeu_pd(ptrB+8, _t11); \
- _mm_storeu_pd(ptrB+10, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+8, t11);
_mm_storeu_pd(ptrB+10, t12);
}
-#endif
-
-
-
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
double * gmx_restrict fptr,
double * gmx_restrict fshiftptr)
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- GMX_MM_TRANSPOSE2_PD(fix1, fiy1); \
- GMX_MM_TRANSPOSE2_PD(fiz1, fix2); \
- GMX_MM_TRANSPOSE2_PD(fiy2, fiz2); \
- _t1 = fix3; \
- fix3 = _mm_unpacklo_pd(fix3, fiy3); \
- fiy3 = _mm_unpackhi_pd(_t1, fiy3); \
- fix1 = _mm_add_pd(fix1, fiy1); \
- fiz1 = _mm_add_pd(fiz1, fix2); \
- fiy2 = _mm_add_pd(fiy2, fiz2); \
- fix3 = _mm_add_pd(fix3, fiy3); \
- fiz3 = _mm_add_sd( fiz3, _mm_unpackhi_pd(fiz3, fiz3)); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
- fix1 = _mm_add_pd(fix1, fix3); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- fiz1 = _mm_add_sd(fiz1, _t2); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- GMX_MM_TRANSPOSE2_PD(fix1, fiy1); \
- GMX_MM_TRANSPOSE2_PD(fiz1, fix2); \
- GMX_MM_TRANSPOSE2_PD(fiy2, fiz2); \
- GMX_MM_TRANSPOSE2_PD(fix3, fiy3); \
- GMX_MM_TRANSPOSE2_PD(fiz3, fix4); \
- GMX_MM_TRANSPOSE2_PD(fiy4, fiz4); \
- fix1 = _mm_add_pd(fix1, fiy1); \
- fiz1 = _mm_add_pd(fiz1, fix2); \
- fiy2 = _mm_add_pd(fiy2, fiz2); \
- fix3 = _mm_add_pd(fix3, fiy3); \
- fiz3 = _mm_add_pd(fiz3, fix4); \
- fiy4 = _mm_add_pd(fiy4, fiz4); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); \
- _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
- fix3 = _mm_add_pd(fix3, _t2); \
- fix1 = _mm_add_pd(fix1, fix3); \
- fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
- fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
pot1 = _mm_add_pd(pot1, _mm_unpackhi_pd(pot1, pot1));
_mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
__m128d pot2, double * gmx_restrict ptrB)
{
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3) _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
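
gmx_mm_sum4_ps is a plain element-wise reduction tree: the two inner adds are independent and can issue in parallel, followed by one combining add. A usage sketch (register names are illustrative):

    /* Illustrative: combine four partial force accumulators element-wise,
     * evaluated as (f0 + f1) + (f2 + f3). */
    __m128 fsum = gmx_mm_sum4_ps(f0, f1, f2, f3);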
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy) ), _mm_mul_ps(dz, dz) );
}
-static int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128 a, __m128 b)
{
return _mm_movemask_ps(_mm_cmplt_ps(a, b));
/* Load a single value from 1-4 places, merge into xmm register */
-static __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
return _mm_unpacklo_ps(t1, t2);
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
/* Similar to store, but increments value in memory */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
const float * gmx_restrict p2,
const float * gmx_restrict p3,
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
- __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_shuffle_ps(_z3, _z3, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t18 = _mm_movehl_ps(_z3, _z3); \
- _t19 = _mm_shuffle_ps(_t18, _t18, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t20 = _mm_movelh_ps(_x1, _z1); \
- _t21 = _mm_movehl_ps(_z1, _x1); \
- _t22 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t23 = _mm_movelh_ps(_y2, _x3); \
- _t24 = _mm_movehl_ps(_x3, _y2); \
- _t25 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_load_ss(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t20); \
- _t2 = _mm_sub_ps(_t2, _t23); \
- _t3 = _mm_sub_ss(_t3, _z3); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_store_ss(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_load_ss(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _t21); \
- _t5 = _mm_sub_ps(_t5, _t24); \
- _t6 = _mm_sub_ss(_t6, _t17); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_store_ss(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_load_ss(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t22); \
- _t8 = _mm_sub_ps(_t8, _t25); \
- _t9 = _mm_sub_ss(_t9, _t18); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_store_ss(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_load_ss(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ss(_t12, _t19); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_store_ss(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_store_ss(ptrD+8, t12);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
- __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
- __m128 _t23, _t24; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_unpackhi_ps(_z3, _x4); \
- _z3 = _mm_unpacklo_ps(_z3, _x4); \
- _t18 = _mm_unpackhi_ps(_y4, _z4); \
- _y4 = _mm_unpacklo_ps(_y4, _z4); \
- _t19 = _mm_movelh_ps(_x1, _z1); \
- _z1 = _mm_movehl_ps(_z1, _x1); \
- _t20 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t21 = _mm_movelh_ps(_y2, _x3); \
- _x3 = _mm_movehl_ps(_x3, _y2); \
- _t22 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t23 = _mm_movelh_ps(_z3, _y4); \
- _y4 = _mm_movehl_ps(_y4, _z3); \
- _t24 = _mm_movelh_ps(_t17, _t18); \
- _t18 = _mm_movehl_ps(_t18, _t17); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_loadu_ps(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t19); \
- _t2 = _mm_sub_ps(_t2, _t21); \
- _t3 = _mm_sub_ps(_t3, _t23); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_storeu_ps(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_loadu_ps(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _z1); \
- _t5 = _mm_sub_ps(_t5, _x3); \
- _t6 = _mm_sub_ps(_t6, _y4); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_storeu_ps(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_loadu_ps(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t20); \
- _t8 = _mm_sub_ps(_t8, _t22); \
- _t9 = _mm_sub_ps(_t9, _t24); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_storeu_ps(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_loadu_ps(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ps(_t12, _t18); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_storeu_ps(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_storeu_ps(ptrD+8, t12);
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
_mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4; \
-\
- _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2); \
- _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3); \
- _t2 = _mm_movehl_ps(_mm_setzero_ps(), fiz3); \
- _t1 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t3 = _mm_shuffle_ps(_t2, _t2, _MM_SHUFFLE(0, 0, 0, 1)); \
- fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2)); \
- fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3)); \
- fiz3 = _mm_add_ss(_mm_add_ps(fiz3, _t1), _mm_add_ps(_t2, _t3)); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
- _t4 = _mm_load_ss(fshiftptr+2); \
- _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
- _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
- _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
- _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
- _t3 = _mm_shuffle_ps(_t3, _t3, _MM_SHUFFLE(1, 2, 0, 0)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _mm_store_ss(fshiftptr+2, _t1); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
_mm_store_ss(fshiftptr+2, t1);
_mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5; \
- _MM_TRANSPOSE4_PS(fix1, fiy1, fiz1, fix2); \
- _MM_TRANSPOSE4_PS(fiy2, fiz2, fix3, fiy3); \
- _MM_TRANSPOSE4_PS(fiz3, fix4, fiy4, fiz4); \
- fix1 = _mm_add_ps(_mm_add_ps(fix1, fiy1), _mm_add_ps(fiz1, fix2)); \
- fiy2 = _mm_add_ps(_mm_add_ps(fiy2, fiz2), _mm_add_ps(fix3, fiy3)); \
- fiz3 = _mm_add_ps(_mm_add_ps(fiz3, fix4), _mm_add_ps(fiy4, fiz4)); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
- _t5 = _mm_load_ss(fshiftptr+2); \
- _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
- _t1 = _mm_shuffle_ps(fix1, fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
- _t2 = _mm_shuffle_ps(fiy2, fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
- _t3 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
- _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
- _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _t5 = _mm_add_ps(_t5, _t1); \
- _mm_store_ss(fshiftptr+2, _t5); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
_mm_store_ss(fshiftptr+2, t5);
_mm_storeh_pi((__m64 *)(fshiftptr), t5);
}
-#endif
-static void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
pot1 = _mm_add_ps(pot1, _mm_movehl_ps(_mm_setzero_ps(), pot1));
_mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}
-static void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
__m128 pot2, float * gmx_restrict ptrB)
{
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_pd(t0, t1, t2, t3) _mm_add_pd(_mm_add_pd(t0, t1), _mm_add_pd(t2, t3))
-static int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128d a, __m128d b)
{
return _mm_movemask_pd(_mm_cmplt_pd(a, b));
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_calc_rsq_pd(__m128d dx, __m128d dy, __m128d dz)
{
return _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy) ), _mm_mul_pd(dz, dz) );
/* Load a double value from 1-2 places, merge into xmm register */
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_2real_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB)
{
return _mm_unpacklo_pd(_mm_load_sd(ptrA), _mm_load_sd(ptrB));
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_mm_load_1real_pd(const double * gmx_restrict ptrA)
{
return _mm_load_sd(ptrA);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB,
__m128d xmm1)
_mm_store_sd(ptrB, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
_mm_store_sd(ptrA, xmm1);
/* Similar to store, but increments value in memory */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_2real_swizzle_pd(double * gmx_restrict ptrA,
double * gmx_restrict ptrB, __m128d xmm1)
{
_mm_store_sd(ptrB, t1);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_1real_pd(double * gmx_restrict ptrA, __m128d xmm1)
{
__m128d tmp;
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_2pair_swizzle_pd(const double * gmx_restrict p1,
const double * gmx_restrict p2,
__m128d * gmx_restrict c6,
*c12 = _mm_unpackhi_pd(t1, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1pair_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict c6,
__m128d * gmx_restrict c12)
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_pd(const double * gmx_restrict xyz_shift,
const double * gmx_restrict xyz,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x, __m128d * gmx_restrict y, __m128d * gmx_restrict z)
{
*z = _mm_load_sd(p1+2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
*z3 = _mm_load_sd(p1+8);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_1ptr_swizzle_pd(const double * gmx_restrict p1,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA,
const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1)
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_2ptr_swizzle_pd(const double * gmx_restrict ptrA, const double * gmx_restrict ptrB,
__m128d * gmx_restrict x1, __m128d * gmx_restrict y1, __m128d * gmx_restrict z1,
__m128d * gmx_restrict x2, __m128d * gmx_restrict y2, __m128d * gmx_restrict z2,
/* Routines to decrement rvec in memory, typically used for j particle force updates */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
__m128d xy, __m128d z)
{
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1)
{
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _t1 = _mm_sub_pd(_t1, _x1); \
- _t2 = _mm_sub_pd(_t2, _z1); \
- _t3 = _mm_sub_pd(_t3, _y2); \
- _t4 = _mm_sub_pd(_t4, _x3); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+6, t4);
_mm_store_sd(ptrA+8, t5);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_1ptr_swizzle_pd(ptrA, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _x1 = _mm_unpacklo_pd(_x1, _y1); \
- _z1 = _mm_unpacklo_pd(_z1, _x2); \
- _y2 = _mm_unpacklo_pd(_y2, _z2); \
- _x3 = _mm_unpacklo_pd(_x3, _y3); \
- _z3 = _mm_unpacklo_pd(_z3, _x4); \
- _y4 = _mm_unpacklo_pd(_y4, _z4); \
- _mm_storeu_pd(ptrA, _mm_sub_pd( _t1, _x1 )); \
- _mm_storeu_pd(ptrA+2, _mm_sub_pd( _t2, _z1 )); \
- _mm_storeu_pd(ptrA+4, _mm_sub_pd( _t3, _y2 )); \
- _mm_storeu_pd(ptrA+6, _mm_sub_pd( _t4, _x3 )); \
- _mm_storeu_pd(ptrA+8, _mm_sub_pd( _t5, _z3 )); \
- _mm_storeu_pd(ptrA+10, _mm_sub_pd( _t6, _y4 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrA+8, _mm_sub_pd( t5, z3 ));
_mm_storeu_pd(ptrA+10, _mm_sub_pd( t6, y4 ));
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1)
{
_mm_store_sd(ptrB+2, t4);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_load_sd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrB); \
- _t7 = _mm_loadu_pd(ptrB+2); \
- _t8 = _mm_loadu_pd(ptrB+4); \
- _t9 = _mm_loadu_pd(ptrB+6); \
- _t10 = _mm_load_sd(ptrB+8); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpackhi_pd(_z3, _z3); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_sd(_t5, _z3); \
- _t6 = _mm_sub_pd(_t6, _tB); \
- _t7 = _mm_sub_pd(_t7, _tD); \
- _t8 = _mm_sub_pd(_t8, _tF); \
- _t9 = _mm_sub_pd(_t9, _tH); \
- _t10 = _mm_sub_sd(_t10, _tI); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_store_sd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrB, _t6); \
- _mm_storeu_pd(ptrB+2, _t7); \
- _mm_storeu_pd(ptrB+4, _t8); \
- _mm_storeu_pd(ptrB+6, _t9); \
- _mm_store_sd(ptrB+8, _t10); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+6, t9);
_mm_store_sd(ptrB+8, t10);
}
-#endif
-
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_2ptr_swizzle_pd(ptrA, ptrB, _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128d _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11, _t12; \
- __m128d _tA, _tB, _tC, _tD, _tE, _tF, _tG, _tH, _tI, _tJ, _tK, _tL; \
- _t1 = _mm_loadu_pd(ptrA); \
- _t2 = _mm_loadu_pd(ptrA+2); \
- _t3 = _mm_loadu_pd(ptrA+4); \
- _t4 = _mm_loadu_pd(ptrA+6); \
- _t5 = _mm_loadu_pd(ptrA+8); \
- _t6 = _mm_loadu_pd(ptrA+10); \
- _t7 = _mm_loadu_pd(ptrB); \
- _t8 = _mm_loadu_pd(ptrB+2); \
- _t9 = _mm_loadu_pd(ptrB+4); \
- _t10 = _mm_loadu_pd(ptrB+6); \
- _t11 = _mm_loadu_pd(ptrB+8); \
- _t12 = _mm_loadu_pd(ptrB+10); \
- _tA = _mm_unpacklo_pd(_x1, _y1); \
- _tB = _mm_unpackhi_pd(_x1, _y1); \
- _tC = _mm_unpacklo_pd(_z1, _x2); \
- _tD = _mm_unpackhi_pd(_z1, _x2); \
- _tE = _mm_unpacklo_pd(_y2, _z2); \
- _tF = _mm_unpackhi_pd(_y2, _z2); \
- _tG = _mm_unpacklo_pd(_x3, _y3); \
- _tH = _mm_unpackhi_pd(_x3, _y3); \
- _tI = _mm_unpacklo_pd(_z3, _x4); \
- _tJ = _mm_unpackhi_pd(_z3, _x4); \
- _tK = _mm_unpacklo_pd(_y4, _z4); \
- _tL = _mm_unpackhi_pd(_y4, _z4); \
- _t1 = _mm_sub_pd(_t1, _tA); \
- _t2 = _mm_sub_pd(_t2, _tC); \
- _t3 = _mm_sub_pd(_t3, _tE); \
- _t4 = _mm_sub_pd(_t4, _tG); \
- _t5 = _mm_sub_pd(_t5, _tI); \
- _t6 = _mm_sub_pd(_t6, _tK); \
- _t7 = _mm_sub_pd(_t7, _tB); \
- _t8 = _mm_sub_pd(_t8, _tD); \
- _t9 = _mm_sub_pd(_t9, _tF); \
- _t10 = _mm_sub_pd(_t10, _tH); \
- _t11 = _mm_sub_pd(_t11, _tJ); \
- _t12 = _mm_sub_pd(_t12, _tL); \
- _mm_storeu_pd(ptrA, _t1); \
- _mm_storeu_pd(ptrA+2, _t2); \
- _mm_storeu_pd(ptrA+4, _t3); \
- _mm_storeu_pd(ptrA+6, _t4); \
- _mm_storeu_pd(ptrA+8, _t5); \
- _mm_storeu_pd(ptrA+10, _t6); \
- _mm_storeu_pd(ptrB, _t7); \
- _mm_storeu_pd(ptrB+2, _t8); \
- _mm_storeu_pd(ptrB+4, _t9); \
- _mm_storeu_pd(ptrB+6, _t10); \
- _mm_storeu_pd(ptrB+8, _t11); \
- _mm_storeu_pd(ptrB+10, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
__m128d x1, __m128d y1, __m128d z1,
__m128d x2, __m128d y2, __m128d z2,
_mm_storeu_pd(ptrB+8, t11);
_mm_storeu_pd(ptrB+10, t12);
}
-#endif
-
-
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
double * gmx_restrict fptr,
double * gmx_restrict fshiftptr)
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- fix1 = _mm_hadd_pd(fix1, fiy1); \
- fiz1 = _mm_hadd_pd(fiz1, fix2); \
- fiy2 = _mm_hadd_pd(fiy2, fiz2); \
- fix3 = _mm_hadd_pd(fix3, fiy3); \
- fiz3 = _mm_hadd_pd(fiz3, fiz3); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_store_sd( fptr+8, _mm_add_sd( _mm_load_sd(fptr+8), fiz3 )); \
- fix1 = _mm_add_pd(fix1, fix3); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiy2, fiy2, _MM_SHUFFLE2(1, 1)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- fiz1 = _mm_add_sd(fiz1, _t2); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_pd(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128d _t1, _t2; \
- fix1 = _mm_hadd_pd(fix1, fiy1); \
- fiz1 = _mm_hadd_pd(fiz1, fix2); \
- fiy2 = _mm_hadd_pd(fiy2, fiz2); \
- fix3 = _mm_hadd_pd(fix3, fiy3); \
- fiz3 = _mm_hadd_pd(fiz3, fix4); \
- fiy4 = _mm_hadd_pd(fiy4, fiz4); \
- _mm_storeu_pd( fptr, _mm_add_pd( _mm_loadu_pd(fptr), fix1 )); \
- _mm_storeu_pd( fptr+2, _mm_add_pd( _mm_loadu_pd(fptr+2), fiz1 )); \
- _mm_storeu_pd( fptr+4, _mm_add_pd( _mm_loadu_pd(fptr+4), fiy2 )); \
- _mm_storeu_pd( fptr+6, _mm_add_pd( _mm_loadu_pd(fptr+6), fix3 )); \
- _mm_storeu_pd( fptr+8, _mm_add_pd( _mm_loadu_pd(fptr+8), fiz3 )); \
- _mm_storeu_pd( fptr+10, _mm_add_pd( _mm_loadu_pd(fptr+10), fiy4 )); \
- _t1 = _mm_shuffle_pd(fiz1, fiy2, _MM_SHUFFLE2(0, 1)); \
- fix1 = _mm_add_pd(fix1, _t1); \
- _t2 = _mm_shuffle_pd(fiz3, fiy4, _MM_SHUFFLE2(0, 1)); \
- fix3 = _mm_add_pd(fix3, _t2); \
- fix1 = _mm_add_pd(fix1, fix3); \
- fiz1 = _mm_add_sd(fiz1, _mm_unpackhi_pd(fiy2, fiy2)); \
- fiz3 = _mm_add_sd(fiz3, _mm_unpackhi_pd(fiy4, fiy4)); \
- fiz1 = _mm_add_sd(fiz1, fiz3); \
- _mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 )); \
- _mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 )); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_pd(__m128d fix1, __m128d fiy1, __m128d fiz1,
__m128d fix2, __m128d fiy2, __m128d fiz2,
__m128d fix3, __m128d fiy3, __m128d fiz3,
_mm_storeu_pd( fshiftptr, _mm_add_pd( _mm_loadu_pd(fshiftptr), fix1 ));
_mm_store_sd( fshiftptr+2, _mm_add_sd( _mm_load_sd(fshiftptr+2), fiz1 ));
}
-#endif
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_pd(__m128d pot1, double * gmx_restrict ptrA)
{
pot1 = _mm_hadd_pd(pot1, pot1);
_mm_store_sd(ptrA, _mm_add_sd(pot1, _mm_load_sd(ptrA)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_pd(__m128d pot1, double * gmx_restrict ptrA,
__m128d pot2, double * gmx_restrict ptrB)
{
/* Normal sum of four xmm registers */
#define gmx_mm_sum4_ps(t0, t1, t2, t3) _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy) ), _mm_mul_ps(dz, dz) );
}
-static gmx_inline int
+static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128 a, __m128 b)
{
return _mm_movemask_ps(_mm_cmplt_ps(a, b));
/* Load a single value from 1-4 places, merge into xmm register */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
return _mm_unpacklo_ps(t1, t2);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
/* Similar to store, but increments the value in memory */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
float * gmx_restrict ptrB,
float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
const float * gmx_restrict p2,
const float * gmx_restrict p3,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
const float * gmx_restrict xyz,
__m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA,
const float * gmx_restrict ptrB,
const float * gmx_restrict ptrC,
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * ptrA,
float * ptrB,
float * ptrC,
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_3rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10; \
- __m128 _t11, _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19; \
- __m128 _t20, _t21, _t22, _t23, _t24, _t25; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_shuffle_ps(_z3, _z3, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t18 = _mm_movehl_ps(_z3, _z3); \
- _t19 = _mm_shuffle_ps(_t18, _t18, _MM_SHUFFLE(0, 0, 0, 1)); \
- _t20 = _mm_movelh_ps(_x1, _z1); \
- _t21 = _mm_movehl_ps(_z1, _x1); \
- _t22 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t23 = _mm_movelh_ps(_y2, _x3); \
- _t24 = _mm_movehl_ps(_x3, _y2); \
- _t25 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_load_ss(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t20); \
- _t2 = _mm_sub_ps(_t2, _t23); \
- _t3 = _mm_sub_ss(_t3, _z3); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_store_ss(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_load_ss(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _t21); \
- _t5 = _mm_sub_ps(_t5, _t24); \
- _t6 = _mm_sub_ss(_t6, _t17); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_store_ss(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_load_ss(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t22); \
- _t8 = _mm_sub_ps(_t8, _t25); \
- _t9 = _mm_sub_ss(_t9, _t18); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_store_ss(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_load_ss(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ss(_t12, _t19); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_store_ss(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_store_ss(ptrD+8, t12);
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_decrement_4rvec_4ptr_swizzle_ps(ptrA, ptrB, ptrC, ptrD, \
- _x1, _y1, _z1, _x2, _y2, _z2, _x3, _y3, _z3, _x4, _y4, _z4) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _t10, _t11; \
- __m128 _t12, _t13, _t14, _t15, _t16, _t17, _t18, _t19, _t20, _t21, _t22; \
- __m128 _t23, _t24; \
- _t13 = _mm_unpackhi_ps(_x1, _y1); \
- _x1 = _mm_unpacklo_ps(_x1, _y1); \
- _t14 = _mm_unpackhi_ps(_z1, _x2); \
- _z1 = _mm_unpacklo_ps(_z1, _x2); \
- _t15 = _mm_unpackhi_ps(_y2, _z2); \
- _y2 = _mm_unpacklo_ps(_y2, _z2); \
- _t16 = _mm_unpackhi_ps(_x3, _y3); \
- _x3 = _mm_unpacklo_ps(_x3, _y3); \
- _t17 = _mm_unpackhi_ps(_z3, _x4); \
- _z3 = _mm_unpacklo_ps(_z3, _x4); \
- _t18 = _mm_unpackhi_ps(_y4, _z4); \
- _y4 = _mm_unpacklo_ps(_y4, _z4); \
- _t19 = _mm_movelh_ps(_x1, _z1); \
- _z1 = _mm_movehl_ps(_z1, _x1); \
- _t20 = _mm_movelh_ps(_t13, _t14); \
- _t14 = _mm_movehl_ps(_t14, _t13); \
- _t21 = _mm_movelh_ps(_y2, _x3); \
- _x3 = _mm_movehl_ps(_x3, _y2); \
- _t22 = _mm_movelh_ps(_t15, _t16); \
- _t16 = _mm_movehl_ps(_t16, _t15); \
- _t23 = _mm_movelh_ps(_z3, _y4); \
- _y4 = _mm_movehl_ps(_y4, _z3); \
- _t24 = _mm_movelh_ps(_t17, _t18); \
- _t18 = _mm_movehl_ps(_t18, _t17); \
- _t1 = _mm_loadu_ps(ptrA); \
- _t2 = _mm_loadu_ps(ptrA+4); \
- _t3 = _mm_loadu_ps(ptrA+8); \
- _t1 = _mm_sub_ps(_t1, _t19); \
- _t2 = _mm_sub_ps(_t2, _t21); \
- _t3 = _mm_sub_ps(_t3, _t23); \
- _mm_storeu_ps(ptrA, _t1); \
- _mm_storeu_ps(ptrA+4, _t2); \
- _mm_storeu_ps(ptrA+8, _t3); \
- _t4 = _mm_loadu_ps(ptrB); \
- _t5 = _mm_loadu_ps(ptrB+4); \
- _t6 = _mm_loadu_ps(ptrB+8); \
- _t4 = _mm_sub_ps(_t4, _z1); \
- _t5 = _mm_sub_ps(_t5, _x3); \
- _t6 = _mm_sub_ps(_t6, _y4); \
- _mm_storeu_ps(ptrB, _t4); \
- _mm_storeu_ps(ptrB+4, _t5); \
- _mm_storeu_ps(ptrB+8, _t6); \
- _t7 = _mm_loadu_ps(ptrC); \
- _t8 = _mm_loadu_ps(ptrC+4); \
- _t9 = _mm_loadu_ps(ptrC+8); \
- _t7 = _mm_sub_ps(_t7, _t20); \
- _t8 = _mm_sub_ps(_t8, _t22); \
- _t9 = _mm_sub_ps(_t9, _t24); \
- _mm_storeu_ps(ptrC, _t7); \
- _mm_storeu_ps(ptrC+4, _t8); \
- _mm_storeu_ps(ptrC+8, _t9); \
- _t10 = _mm_loadu_ps(ptrD); \
- _t11 = _mm_loadu_ps(ptrD+4); \
- _t12 = _mm_loadu_ps(ptrD+8); \
- _t10 = _mm_sub_ps(_t10, _t14); \
- _t11 = _mm_sub_ps(_t11, _t16); \
- _t12 = _mm_sub_ps(_t12, _t18); \
- _mm_storeu_ps(ptrD, _t10); \
- _mm_storeu_ps(ptrD+4, _t11); \
- _mm_storeu_ps(ptrD+8, _t12); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
float * gmx_restrict ptrC, float * gmx_restrict ptrD,
__m128 x1, __m128 y1, __m128 z1,
_mm_storeu_ps(ptrD+4, t11);
_mm_storeu_ps(ptrD+8, t12);
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
float * gmx_restrict fptr,
float * gmx_restrict fshiftptr)
_mm_storeh_pi((__m64 *)(fshiftptr+1), t3);
}
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_3atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4; \
-\
- fix1 = _mm_hadd_ps(fix1, fiy1); \
- fiz1 = _mm_hadd_ps(fiz1, fix2); \
- fiy2 = _mm_hadd_ps(fiy2, fiz2); \
- fix3 = _mm_hadd_ps(fix3, fiy3); \
- fiz3 = _mm_hadd_ps(fiz3, fiz3); \
- fix1 = _mm_hadd_ps(fix1, fiz1); \
- fiy2 = _mm_hadd_ps(fiy2, fix3); \
- fiz3 = _mm_hadd_ps(fiz3, fiz3); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_store_ss (fptr+8, _mm_add_ss(fiz3, _mm_load_ss(fptr+8) )); \
- _t4 = _mm_load_ss(fshiftptr+2); \
- _t4 = _mm_loadh_pi(_t4, (__m64 *)(fshiftptr)); \
- _t1 = _mm_shuffle_ps(fiz3, fix1, _MM_SHUFFLE(1, 0, 0, 0)); \
- _t2 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(3, 2, 2, 2)); \
- _t3 = _mm_shuffle_ps(fiy2, fix1, _MM_SHUFFLE(3, 3, 0, 1)); \
- _t3 = _mm_shuffle_ps(_t3, _t3, _MM_SHUFFLE(1, 2, 0, 0)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _mm_store_ss(fshiftptr+2, _t1); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t1); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
_mm_store_ss(fshiftptr+2, t1);
_mm_storeh_pi((__m64 *)(fshiftptr), t1);
}
-#endif
-
-#if defined (_MSC_VER) && defined(_M_IX86)
-/* Macro work-around since 32-bit MSVC cannot handle >3 xmm/ymm parameters */
-#define gmx_mm_update_iforce_4atom_swizzle_ps(fix1, fiy1, fiz1, fix2, fiy2, fiz2, fix3, fiy3, fiz3, fix4, fiy4, fiz4, \
- fptr, fshiftptr) \
- { \
- __m128 _t1, _t2, _t3, _t4, _t5; \
-\
- fix1 = _mm_hadd_ps(fix1, fiy1); \
- fiz1 = _mm_hadd_ps(fiz1, fix2); \
- fiy2 = _mm_hadd_ps(fiy2, fiz2); \
- fix3 = _mm_hadd_ps(fix3, fiy3); \
- fiz3 = _mm_hadd_ps(fiz3, fix4); \
- fiy4 = _mm_hadd_ps(fiy4, fiz4); \
- fix1 = _mm_hadd_ps(fix1, fiz1); \
- fiy2 = _mm_hadd_ps(fiy2, fix3); \
- fiz3 = _mm_hadd_ps(fiz3, fiy4); \
- _mm_storeu_ps(fptr, _mm_add_ps(fix1, _mm_loadu_ps(fptr) )); \
- _mm_storeu_ps(fptr+4, _mm_add_ps(fiy2, _mm_loadu_ps(fptr+4))); \
- _mm_storeu_ps(fptr+8, _mm_add_ps(fiz3, _mm_loadu_ps(fptr+8))); \
- _t5 = _mm_load_ss(fshiftptr+2); \
- _t5 = _mm_loadh_pi(_t5, (__m64 *)(fshiftptr)); \
- _t1 = _mm_shuffle_ps(fix1, fix1, _MM_SHUFFLE(1, 0, 2, 2)); \
- _t2 = _mm_shuffle_ps(fiy2, fiy2, _MM_SHUFFLE(3, 2, 1, 1)); \
- _t3 = _mm_shuffle_ps(fiz3, fiz3, _MM_SHUFFLE(2, 1, 0, 0)); \
- _t4 = _mm_shuffle_ps(fix1, fiy2, _MM_SHUFFLE(0, 0, 3, 3)); \
- _t4 = _mm_shuffle_ps(fiz3, _t4, _MM_SHUFFLE(2, 0, 3, 3)); \
- _t1 = _mm_add_ps(_t1, _t2); \
- _t3 = _mm_add_ps(_t3, _t4); \
- _t1 = _mm_add_ps(_t1, _t3); \
- _t5 = _mm_add_ps(_t5, _t1); \
- _mm_store_ss(fshiftptr+2, _t5); \
- _mm_storeh_pi((__m64 *)(fshiftptr), _t5); \
- }
-#else
-/* Real function for sane compilers */
-static gmx_inline void
+
+
+static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
__m128 fix2, __m128 fiy2, __m128 fiz2,
__m128 fix3, __m128 fiy3, __m128 fiz3,
_mm_store_ss(fshiftptr+2, t5);
_mm_storeh_pi((__m64 *)(fshiftptr), t5);
}
-#endif
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
pot1 = _mm_add_ps(pot1, _mm_movehl_ps(_mm_setzero_ps(), pot1));
_mm_store_ss(ptrA, _mm_add_ss(pot1, _mm_load_ss(ptrA)));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
__m128 pot2, float * gmx_restrict ptrB)
{
t_pull_coord *coord; /* the pull coordinates */
/* Variables not present in mdp, but used at run time */
- t_pull_group *dyna; /* dynamic groups for use with local constraints */
- rvec *rbuf; /* COM calculation buffer */
- dvec *dbuf; /* COM calculation buffer */
- double *dbuf_cyl; /* cylinder ref. groups COM calculation buffer */
+ t_pull_group *dyna; /* dynamic groups for use with local constraints */
+ gmx_bool bSetPBCatoms; /* Do we need to set x_pbc for the groups? */
- FILE *out_x; /* output file for pull data */
- FILE *out_f; /* output file for pull data */
+ rvec *rbuf; /* COM calculation buffer */
+ dvec *dbuf; /* COM calculation buffer */
+ double *dbuf_cyl; /* cylinder ref. groups COM calculation buffer */
+
+ FILE *out_x; /* output file for pull data */
+ FILE *out_f; /* output file for pull data */
} t_pull;
static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
/* Collect elements 0 and 1 of the 4 inputs to out0 and out1, respectively */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
__m128 *out0, __m128 *out1)
{
}
/* Collect element 2 of the 4 inputs to out */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_shuffle_4_ps_fil2_to_1_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3)
{
__m128 _c01, _c23;
}
/* Sum the elements within each input register and store the sums in out */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_transpose_sum4_pr(__m128 in0, __m128 in1,
__m128 in2, __m128 in3)
{
* prepare_table_load_buffer(), but it is only used with full-width
* AVX_256. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *ti,
__m128 *ctab0_S, __m128 *ctab1_S)
{
gmx_shuffle_4_ps_fil01_to_2_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3], ctab0_S, ctab1_S);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int gmx_unused *ti,
__m128 *ctab0_S, __m128 *ctab1_S, __m128 *ctabv_S)
{
*ctabv_S = gmx_shuffle_4_ps_fil2_to_1_ps(ctab_S[0], ctab_S[1], ctab_S[2], ctab_S[3]);
}
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load1_exclfilter(int e)
{
return _mm_set1_epi32(e);
}
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load_exclusion_filter(const unsigned *i)
{
return gmx_simd_load_i(i);
}
-static gmx_inline gmx_simd_bool_t
+static gmx_inline gmx_simd_bool_t gmx_simdcall
gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
{
return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()));
#define gmx_sub_hpr _mm_sub_ps
/* Sum over 4 half SIMD registers */
-static __m128 gmx_sum4_hpr(__m256 x, __m256 y)
+static __m128 gmx_simdcall gmx_sum4_hpr(__m256 x, __m256 y)
{
__m256 sum;
*a = _mm256_insertf128_ps(_mm256_castps128_ps256(tmp), tmp, 0x1);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
{
*b = _mm256_extractf128_ps(a, 0);
}
/* Store half width SIMD registers a and b in full width register *c */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_real_t *c)
{
*c = _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 0x1);
#endif /* GMX_NBNXN_SIMD_2XNN */
/* Collect elements 0 and 1 of the 4 inputs to out0 and out1, respectively */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_shuffle_4_ps_fil01_to_2_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3,
__m128 *out0, __m128 *out1)
{
}
/* Collect element 2 of the 4 inputs to out */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_shuffle_4_ps_fil2_to_1_ps(__m128 in0, __m128 in1, __m128 in2, __m128 in3)
{
__m128 _c01, _c23;
}
/* Sum the elements within each input register and return the sums */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_transpose_sum4_pr(__m256 in0, __m256 in1,
__m256 in2, __m256 in3)
{
}
/* Sum the elements of halves of each input register and return the sums */
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_mm_transpose_sum4h_pr(__m256 in0, __m256 in2)
{
in0 = _mm256_hadd_ps(in0, _mm256_setzero_ps());
}
/* Put two 128-bit 4-float registers into one 256-bit 8-float register */
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_2_mm_to_m256(__m128 in0, __m128 in1)
{
return _mm256_insertf128_ps(_mm256_castps128_ps256(in0), in1, 1);
* prepare_table_load_buffer(), but it is only used with full-width
* AVX_256. */
-static gmx_inline void
+static gmx_inline void gmx_simdcall
load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
__m256 *ctab0_S, __m256 *ctab1_S)
{
*ctab1_S = gmx_2_mm_to_m256(ctabt_S[2], ctabt_S[3]);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
load_table_f_v(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
__m256 *ctab0_S, __m256 *ctab1_S, __m256 *ctabv_S)
{
typedef gmx_simd_int32_t gmx_exclfilter;
static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load1_exclfilter(int e)
{
return _mm256_set1_epi32(e);
}
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load_exclusion_filter(const unsigned *i)
{
return gmx_simd_load_i(i);
}
-static gmx_inline gmx_simd_bool_t
+static gmx_inline gmx_simd_bool_t gmx_simdcall
gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
{
return _mm256_castsi256_ps(_mm256_cmpeq_epi32(_mm256_andnot_si256(m0, m1), _mm256_setzero_si256()));
typedef gmx_simd_real_t gmx_exclfilter;
static const int filter_stride = 1;
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load1_exclfilter(int e)
{
return _mm256_castsi256_ps(_mm256_set1_epi32(e));
}
-static gmx_inline gmx_exclfilter
+static gmx_inline gmx_exclfilter gmx_simdcall
gmx_load_exclusion_filter(const unsigned *i)
{
return gmx_simd_load_r((real *) (i));
}
-static gmx_inline gmx_simd_bool_t
+static gmx_inline gmx_simd_bool_t gmx_simdcall
gmx_checkbitmask_pb(gmx_exclfilter m0, gmx_exclfilter m1)
{
return _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c);
#include "../nbnxn_kernel_simd_utils.h"
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_load_simd_2xnn_interactions(int excl,
gmx_exclfilter filter_S0,
gmx_exclfilter filter_S2,
#include "../nbnxn_kernel_simd_utils.h"
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_load_simd_4xn_interactions(int gmx_unused excl,
gmx_exclfilter gmx_unused filter_S0,
gmx_exclfilter gmx_unused filter_S1,
t_mdatoms *mdatoms,
gmx_enerdata_t *enerd,
real *lambda,
- double t)
+ double t,
+ gmx_wallcycle_t wcycle)
{
t_pbc pbc;
real dvdl;
* The virial contribution is calculated directly,
* which is why we call pull_potential after calc_virial.
*/
+ wallcycle_start(wcycle, ewcPULLPOT);
set_pbc(&pbc, ir->ePBC, box);
dvdl = 0;
enerd->term[F_COM_PULL] +=
pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
enerd->dvdl_lin[efptRESTRAINT] += dvdl;
+ wallcycle_stop(wcycle, ewcPULLPOT);
}
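The new wallcycle_start/wallcycle_stop pair brackets the pull-potential work so it shows up in the cycle accounting. As a minimal sketch of the same paired-counter convention (demo_timer_t and friends are hypothetical stand-ins, not the GROMACS API):

#include <stdio.h>
#include <time.h>

typedef struct { clock_t t0; double total; } demo_timer_t;

static void demo_start(demo_timer_t *t) { t->t0 = clock(); }
static void demo_stop(demo_timer_t *t)
{
    t->total += (double)(clock() - t->t0) / CLOCKS_PER_SEC;
}

int main(void)
{
    demo_timer_t pullpot = {0, 0.0};
    demo_start(&pullpot);
    /* ... the timed region, e.g. evaluating the pull potential ... */
    demo_stop(&pullpot);
    printf("accumulated: %g s\n", pullpot.total);
    return 0;
}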
static void pme_receive_force_ener(t_commrec *cr,
if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
{
+ /* Since the COM pulling is always done mass-weighted, no forces are
+ * applied to vsites and this call can be done after vsite spreading.
+ */
pull_potential_wrapper(cr, inputrec, box, x,
- f, vir_force, mdatoms, enerd, lambda, t);
+ f, vir_force, mdatoms, enerd, lambda, t,
+ wcycle);
}
/* Add the forces from enforced rotation potentials (if any) */
if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
{
pull_potential_wrapper(cr, inputrec, box, x,
- f, vir_force, mdatoms, enerd, lambda, t);
+ f, vir_force, mdatoms, enerd, lambda, t,
+ wcycle);
}
/* Add the forces from enforced rotation potentials (if any) */
x[i][YY] = mu[YY][YY]*x[i][YY]+mu[ZZ][YY]*x[i][ZZ];
x[i][ZZ] = mu[ZZ][ZZ]*x[i][ZZ];
}
- if (*scale_tot)
+ if (scale_tot != NULL)
{
/* The transposes of the scaling matrices are stored,
* so we need to do matrix multiplication in the inverse order.
if (inputrec->eI == eiSD1 && bDoConstr && !bFirstHalf)
{
+ wallcycle_start(wcycle, ewcUPDATE);
xprime = get_xprime(state, upd);
nth = gmx_omp_nthreads_get(emntUpdate);
DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
}
inc_nrnb(nrnb, eNR_UPDATE, homenr);
+ wallcycle_stop(wcycle, ewcUPDATE);
if (bDoConstr)
{
if ((inputrec->eI == eiSD2) && !(bFirstHalf))
{
+ wallcycle_start(wcycle, ewcUPDATE);
xprime = get_xprime(state, upd);
nth = gmx_omp_nthreads_get(emntUpdate);
DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
}
inc_nrnb(nrnb, eNR_UPDATE, homenr);
+ wallcycle_stop(wcycle, ewcUPDATE);
if (bDoConstr)
{
r_ij[c][m] = a*pull->coord[c].vec[m];
}
}
+
+ if (dnorm2(r_ij[c]) == 0)
+ {
+ gmx_fatal(FARGS, "Distance for pull coordinate %d is zero with constraint pulling, which is not allowed.", c + 1);
+ }
}
bConverged_all = FALSE;
double f_invr;
/* Add the pull contribution to the virial */
+ /* We have already checked above that r_ij[c] != 0 */
f_invr = pcrd->f_scal/dnorm(r_ij[c]);
for (j = 0; j < DIM; j++)
{
case epullgDIST:
ndr = dnorm(pcrd->dr);
- invdr = 1/ndr;
+ if (ndr > 0)
+ {
+ invdr = 1/ndr;
+ }
+ else
+ {
+ /* With a harmonic umbrella, the force is 0 at r=0,
+ * so we can set invdr to any value.
+ * With a constant force, the force at r=0 is not defined,
+ * so we zero it (this is in any case a very rare event).
+ */
+ invdr = 0;
+ }
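The guard above keeps the force well-defined when the pull distance vanishes: the force vector is assembled as f_scal*invdr*dr, and dr is the zero vector exactly when ndr is zero, so invdr = 0 simply yields a zero force. A scalar sketch of the same guard (names are illustrative, not from the patch):

#include <math.h>
#include <stdio.h>

/* Return 1/|r| when defined, else 0; with the force built as
 * f_scal*invdr*r, the degenerate r = 0 case then gives zero force. */
static double safe_invdr(const double r[3])
{
    double n = sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);
    return (n > 0.0) ? 1.0/n : 0.0;
}

int main(void)
{
    const double dr0[3] = {0, 0, 0}, dr1[3] = {3, 0, 4};
    printf("%g %g\n", safe_invdr(dr0), safe_invdr(dr1)); /* prints 0 0.2 */
    return 0;
}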
if (ePull == epullUMBRELLA)
{
pcrd->f_scal = -k*dev;
make_local_pull_group(ga2la, &pull->group[g],
0, md->homenr);
}
+
+ /* Since the PBC of the atoms might have changed, we need to update the PBC reference coordinates */
+ pull->bSetPBCatoms = TRUE;
}
static void init_pull_group_index(FILE *fplog, t_commrec *cr,
snew(pull->dyna, pull->ncoord);
}
+ /* We still need to initialize the PBC reference coordinates */
+ pull->bSetPBCatoms = TRUE;
+
/* Only do I/O when we are doing dynamics and if we are the MASTER */
pull->out_x = NULL;
pull->out_f = NULL;
#include "gromacs/legacyheaders/gmx_ga2la.h"
static void pull_set_pbcatom(t_commrec *cr, t_pull_group *pgrp,
- t_mdatoms *md, rvec *x,
+ rvec *x,
rvec x_pbc)
{
int a, m;
- if (cr && PAR(cr))
+ if (cr != NULL && DOMAINDECOMP(cr))
{
- if (DOMAINDECOMP(cr))
- {
- if (!ga2la_get_home(cr->dd->ga2la, pgrp->pbcatom, &a))
- {
- a = -1;
- }
- }
- else
- {
- a = pgrp->pbcatom;
- }
-
- if (a >= 0 && a < md->homenr)
+ if (ga2la_get_home(cr->dd->ga2la, pgrp->pbcatom, &a))
{
copy_rvec(x[a], x_pbc);
}
}
static void pull_set_pbcatoms(t_commrec *cr, t_pull *pull,
- t_mdatoms *md, rvec *x,
+ rvec *x,
rvec *x_pbc)
{
int g, n, m;
}
else
{
- pull_set_pbcatom(cr, &pull->group[g], md, x, x_pbc[g]);
+ pull_set_pbcatom(cr, &pull->group[g], x, x_pbc[g]);
for (m = 0; m < DIM; m++)
{
if (pull->dim[m] == 0)
snew(pull->dbuf, 3*pull->ngroup);
}
- if (pull->bRefAt)
+ if (pull->bRefAt && pull->bSetPBCatoms)
{
- pull_set_pbcatoms(cr, pull, md, x, pull->rbuf);
+ pull_set_pbcatoms(cr, pull, x, pull->rbuf);
+
+ if (cr != NULL && DOMAINDECOMP(cr))
+ {
+ /* We can keep these PBC reference coordinates fixed for nstlist
+ * steps, since atoms won't jump across periodic boundaries.
+ * This avoids a global reduction during the next nstlist-1 steps.
+ * Note that the exact values of the PBC reference coordinates
+ * are irrelevant, as long as all atoms in the group are within
+ * half a box distance of the reference coordinate.
+ */
+ pull->bSetPBCatoms = FALSE;
+ }
}
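The bSetPBCatoms flag is a recompute-on-invalidation cache: the global reduction runs once, later calls reuse the result, and repartitioning re-arms the flag. A compact sketch of the pattern with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

static bool   ref_dirty = true;   /* analogous to bSetPBCatoms */
static double ref_value;

static double expensive_global_reduction(void) { return 42.0; /* stand-in */ }

static double get_reference(void)
{
    if (ref_dirty)
    {
        ref_value = expensive_global_reduction(); /* once per invalidation */
        ref_dirty = false;
    }
    return ref_value;
}

int main(void)
{
    printf("%g %g\n", get_reference(), get_reference()); /* reduction runs once */
    ref_dirty = true; /* e.g. after domain repartitioning */
    printf("%g\n", get_reference());
    return 0;
}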
if (pull->cosdim >= 0)
void *scanner;
#endif
{
- FILE *yyo = yyoutput;
+ FILE *yyo gmx_unused = yyoutput;
YYUSE (yyo);
if (!yyvaluep)
return;
--- /dev/null
+--- parser.cpp 2014-08-11 22:24:31.000000000 +0200
++++ parser.cpp 2014-08-11 22:24:40.000000000 +0200
+@@ -963,7 +963,7 @@
+ void *scanner;
+ #endif
+ {
+- FILE *yyo = yyoutput;
++ FILE *yyo gmx_unused = yyoutput;
+ YYUSE (yyo);
+ if (!yyvaluep)
+ return;
# The commands are run only if the generated files are older than the
# Bison/Flex input files, or if a '-f' flag is provided.
+# Note: You can check parser.cpp/scanner.cpp for the exact versions of
+# bison/flex that were used in the generation. Some OSs have older versions
+# of these tools installed.
+
FORCE=
if [ "x$1" == "x-f" ] ; then
FORCE=1
cd $dirname
fi
-[[ $FORCE || parser.y -nt parser.cpp ]] && $BISON -t -o parser.cpp --defines=parser.h parser.y
-[[ $FORCE || scanner.l -nt scanner.cpp ]] && $FLEX -o scanner.cpp scanner.l
+# We apply some trivial patches to the output to avoid warnings for PGI
+# (and maybe other) compilers
+[[ $FORCE || parser.y -nt parser.cpp ]] && $BISON -t -o parser.cpp --defines=parser.h parser.y && patch -p0 < parser.patch && rm -f parser.cpp.orig
+[[ $FORCE || scanner.l -nt scanner.cpp ]] && $FLEX -o scanner.cpp scanner.l && patch -p0 < scanner.patch && rm -f scanner.cpp.orig
#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
-#define _gmx_sel_yywrap(yyscanner) 1
+static inline int _gmx_sel_yywrap(yyscan_t yyscanner) { return 1; }
#define YY_SKIP_YYWRAP
typedef unsigned char YY_CHAR;
// this call.
#define ADD_TOKEN _gmx_sel_lexer_add_token(yytext, yyleng, state)
+// Set YY_BREAK to an empty value to avoid warnings (for the PGI compiler)
+// when we have return statements followed by break. Instead, we add breaks
+// manually.
+#define YY_BREAK
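flex normally expands YY_BREAK to "break;" after every rule action, so an action that ends in a return statement is followed by unreachable code, which PGI (and some other compilers) flag. With YY_BREAK defined empty, only the actions that actually fall off the end need an explicit break, as the rule edits below show. A self-contained toy model of the resulting switch:

#include <stdio.h>

#define YY_BREAK   /* empty, as in the patch: no automatic break after actions */

static int action(int rule)   /* toy model of a generated scanner switch */
{
    switch (rule)
    {
        case 1: return 100;                    /* returning rule: no dead break */
            YY_BREAK
        case 2: puts("skipped token"); break;  /* non-returning rule: manual break */
            YY_BREAK
    }
    return 0;
}

int main(void) { printf("%d %d\n", action(1), action(2)); return 0; }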
#define YY_NO_UNISTD_H 1
-#line 556 "scanner.cpp"
+#line 560 "scanner.cpp"
#define INITIAL 0
#define matchof 1
register int yy_act;
struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
-#line 90 "scanner.l"
+#line 94 "scanner.l"
}
-#line 818 "scanner.cpp"
+#line 822 "scanner.cpp"
if ( !yyg->yy_init )
{
case 1:
YY_RULE_SETUP
-#line 123 "scanner.l"
-
+#line 127 "scanner.l"
+break;
YY_BREAK
case 2:
YY_RULE_SETUP
-#line 124 "scanner.l"
+#line 128 "scanner.l"
{ yylval->i = strtol(yytext, NULL, 10); ADD_TOKEN; return TOK_INT; }
YY_BREAK
case 3:
YY_RULE_SETUP
-#line 125 "scanner.l"
+#line 129 "scanner.l"
{ yylval->r = strtod(yytext, NULL); ADD_TOKEN; return TOK_REAL; }
YY_BREAK
case 4:
YY_RULE_SETUP
-#line 126 "scanner.l"
+#line 130 "scanner.l"
{ yylval->str = gmx_strndup(yytext+1, yyleng-2); ADD_TOKEN; return STR; }
YY_BREAK
case 5:
/* rule 5 can match eol */
YY_RULE_SETUP
-#line 128 "scanner.l"
-{ _gmx_sel_lexer_add_token(" ", 1, state); }
+#line 132 "scanner.l"
+{ _gmx_sel_lexer_add_token(" ", 1, state); break; }
YY_BREAK
case 6:
/* rule 6 can match eol */
YY_RULE_SETUP
-#line 129 "scanner.l"
+#line 133 "scanner.l"
{
if (yytext[0] == ';' || state->bInteractive)
{
{
_gmx_sel_lexer_add_token(" ", 1, state);
}
+ break;
}
YY_BREAK
case YY_STATE_EOF(cmdstart):
-#line 142 "scanner.l"
+#line 147 "scanner.l"
{ state->bCmdStart = true; yyterminate(); }
YY_BREAK
case YY_STATE_EOF(INITIAL):
case YY_STATE_EOF(matchof):
case YY_STATE_EOF(matchbool):
-#line 143 "scanner.l"
+#line 148 "scanner.l"
{ state->bCmdStart = true; return CMD_SEP; }
YY_BREAK
case 7:
YY_RULE_SETUP
-#line 146 "scanner.l"
+#line 151 "scanner.l"
{ ADD_TOKEN; yylval->i = 1; return TOK_INT; }
YY_BREAK
case 8:
YY_RULE_SETUP
-#line 147 "scanner.l"
+#line 152 "scanner.l"
{ ADD_TOKEN; yylval->i = 0; return TOK_INT; }
YY_BREAK
case 9:
YY_RULE_SETUP
-#line 149 "scanner.l"
+#line 154 "scanner.l"
{ ADD_TOKEN; return GROUP; }
YY_BREAK
case 10:
YY_RULE_SETUP
-#line 150 "scanner.l"
+#line 155 "scanner.l"
{ ADD_TOKEN; return TO; }
YY_BREAK
case 11:
YY_RULE_SETUP
-#line 151 "scanner.l"
+#line 156 "scanner.l"
{ ADD_TOKEN; BEGIN(0); return OF; }
YY_BREAK
case 12:
YY_RULE_SETUP
-#line 152 "scanner.l"
+#line 157 "scanner.l"
{ ADD_TOKEN; return AND; }
YY_BREAK
case 13:
YY_RULE_SETUP
-#line 153 "scanner.l"
+#line 158 "scanner.l"
{ ADD_TOKEN; return OR; }
YY_BREAK
case 14:
YY_RULE_SETUP
-#line 154 "scanner.l"
+#line 159 "scanner.l"
{ ADD_TOKEN; return XOR; }
YY_BREAK
case 15:
YY_RULE_SETUP
-#line 155 "scanner.l"
+#line 160 "scanner.l"
{ ADD_TOKEN; return NOT; }
YY_BREAK
case 16:
YY_RULE_SETUP
-#line 156 "scanner.l"
+#line 161 "scanner.l"
{ yylval->str = gmx_strndup(yytext, yyleng); ADD_TOKEN; return CMP_OP; }
YY_BREAK
case 17:
YY_RULE_SETUP
-#line 158 "scanner.l"
+#line 163 "scanner.l"
{ return _gmx_sel_lexer_process_identifier(yylval, yytext, yyleng, state); }
YY_BREAK
case 18:
/* rule 18 can match eol */
YY_RULE_SETUP
-#line 160 "scanner.l"
-{ _gmx_sel_lexer_add_token(" ", 1, state); }
+#line 165 "scanner.l"
+{ _gmx_sel_lexer_add_token(" ", 1, state); break; }
YY_BREAK
case 19:
YY_RULE_SETUP
-#line 161 "scanner.l"
+#line 166 "scanner.l"
{ yylval->str = gmx_strndup(yytext, yyleng); ADD_TOKEN; return STR; }
YY_BREAK
case 20:
YY_RULE_SETUP
-#line 162 "scanner.l"
+#line 167 "scanner.l"
{ ADD_TOKEN; return yytext[0]; }
YY_BREAK
case 21:
YY_RULE_SETUP
-#line 163 "scanner.l"
+#line 168 "scanner.l"
YY_FATAL_ERROR( "flex scanner jammed" );
YY_BREAK
-#line 1028 "scanner.cpp"
+#line 1033 "scanner.cpp"
case YY_END_OF_BUFFER:
{
#define YYTABLES_NAME "yytables"
-#line 163 "scanner.l"
+#line 168 "scanner.l"
// this call.
#define ADD_TOKEN _gmx_sel_lexer_add_token(yytext, yyleng, state)
+// Set YY_BREAK to an empty value to avoid warnings (for the PGI compiler)
+// when we have return statements followed by break. Instead, we add breaks
+// manually.
+#define YY_BREAK
%}
INTEGER [[:digit:]]+
}
%}
-{COMMENT}
+{COMMENT} break;
{INTEGER} { yylval->i = strtol(yytext, NULL, 10); ADD_TOKEN; return TOK_INT; }
{REAL} { yylval->r = strtod(yytext, NULL); ADD_TOKEN; return TOK_REAL; }
{STRING} { yylval->str = gmx_strndup(yytext+1, yyleng-2); ADD_TOKEN; return STR; }
-\\\n { _gmx_sel_lexer_add_token(" ", 1, state); }
+\\\n { _gmx_sel_lexer_add_token(" ", 1, state); break; }
";"|\n {
if (yytext[0] == ';' || state->bInteractive)
{
{
_gmx_sel_lexer_add_token(" ", 1, state);
}
+ break;
}
<cmdstart><<EOF>> { state->bCmdStart = true; yyterminate(); }
{IDENTIFIER} { return _gmx_sel_lexer_process_identifier(yylval, yytext, yyleng, state); }
-[[:space:]]+ { _gmx_sel_lexer_add_token(" ", 1, state); }
+[[:space:]]+ { _gmx_sel_lexer_add_token(" ", 1, state); break; }
[_[:alnum:]]+ { yylval->str = gmx_strndup(yytext, yyleng); ADD_TOKEN; return STR; }
. { ADD_TOKEN; return yytext[0]; }
--- /dev/null
+--- scanner.cpp 2014-08-12 22:12:01.000000000 +0300
++++ scanner.cpp 2014-08-12 22:17:03.000000000 +0300
+@@ -331,7 +331,7 @@
+
+ #define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+-#define _gmx_sel_yywrap(yyscanner) 1
++static inline int _gmx_sel_yywrap(yyscan_t yyscanner) { return 1; }
+ #define YY_SKIP_YYWRAP
+
+ typedef unsigned char YY_CHAR;
+@@ -1807,7 +1807,7 @@
+ YY_BUFFER_STATE b;
+ char *buf;
+ yy_size_t n;
+- int i;
++ yy_size_t i;
+
+ /* Get memory for full buffer, including space for trailing EOB's. */
+ n = _yybytes_len + 2;
#undef YY_DECL
#endif
-#line 163 "scanner.l"
+#line 168 "scanner.l"
#line 346 "scanner_flex.h"
#undef _gmx_sel_yyIN_HEADER
GMX_THROW(gmx::InternalError("Unsupported variable type"));
return INVALID;
}
- delete yylval->sel;
- return INVALID; /* Should not be reached. */
+ /* This position should not be reached. */
}
/* For method symbols, return the correct type */
if (symtype == gmx::SelectionParserSymbol::MethodSymbol)
const char *
_gmx_selelem_type_str(const gmx::SelectionTreeElement &sel)
{
+ const char *p = NULL;
switch (sel.type)
{
- case SEL_CONST: return "CONST";
- case SEL_EXPRESSION: return "EXPR";
- case SEL_BOOLEAN: return "BOOL";
- case SEL_ARITHMETIC: return "ARITH";
- case SEL_ROOT: return "ROOT";
- case SEL_SUBEXPR: return "SUBEXPR";
- case SEL_SUBEXPRREF: return "REF";
- case SEL_GROUPREF: return "GROUPREF";
- case SEL_MODIFIER: return "MODIFIER";
- }
- return NULL;
+ case SEL_CONST: p = "CONST"; break;
+ case SEL_EXPRESSION: p = "EXPR"; break;
+ case SEL_BOOLEAN: p = "BOOL"; break;
+ case SEL_ARITHMETIC: p = "ARITH"; break;
+ case SEL_ROOT: p = "ROOT"; break;
+ case SEL_SUBEXPR: p = "SUBEXPR"; break;
+ case SEL_SUBEXPRREF: p = "REF"; break;
+ case SEL_GROUPREF: p = "GROUPREF"; break;
+ case SEL_MODIFIER: p = "MODIFIER"; break;
+ // No default clause so we intentionally get compiler errors
+ // if new selection choices are added later.
+ }
+ return p;
}
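Collecting the result in a pointer and omitting the default clause trades the old unreachable "return NULL" for a switch that -Wswitch (and similar diagnostics) can check for exhaustiveness: an enumerator added later is reported at compile time rather than silently mapped to NULL. A self-contained sketch with a hypothetical enum:

#include <stdio.h>

typedef enum { DEMO_A, DEMO_B, DEMO_C } demo_t;

/* No default clause: if demo_t grows a new value, -Wswitch points here. */
static const char *demo_str(demo_t t)
{
    const char *p = NULL;
    switch (t)
    {
        case DEMO_A: p = "A"; break;
        case DEMO_B: p = "B"; break;
        case DEMO_C: p = "C"; break;
    }
    return p;
}

int main(void) { printf("%s\n", demo_str(DEMO_B)); return 0; }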
/*!
const char *
_gmx_sel_value_type_str(const gmx_ana_selvalue_t *val)
{
+ const char *p = NULL;
switch (val->type)
{
- case NO_VALUE: return "NONE";
- case INT_VALUE: return "INT";
- case REAL_VALUE: return "REAL";
- case STR_VALUE: return "STR";
- case POS_VALUE: return "VEC";
- case GROUP_VALUE: return "GROUP";
+ case NO_VALUE: p = "NONE"; break;
+ case INT_VALUE: p = "INT"; break;
+ case REAL_VALUE: p = "REAL"; break;
+ case STR_VALUE: p = "STR"; break;
+ case POS_VALUE: p = "VEC"; break;
+ case GROUP_VALUE: p = "GROUP"; break;
+ // No default clause so we intentionally get compiler errors
+ // if new selection choices are added later.
}
- return NULL;
+ return p;
}
/*! \copydoc _gmx_selelem_type_str() */
const char *
_gmx_selelem_boolean_type_str(const gmx::SelectionTreeElement &sel)
{
+ const char *p = NULL;
switch (sel.u.boolt)
{
- case BOOL_NOT: return "NOT"; break;
- case BOOL_AND: return "AND"; break;
- case BOOL_OR: return "OR"; break;
- case BOOL_XOR: return "XOR"; break;
+ case BOOL_NOT: p = "NOT"; break;
+ case BOOL_AND: p = "AND"; break;
+ case BOOL_OR: p = "OR"; break;
+ case BOOL_XOR: p = "XOR"; break;
+ // No default clause so we intentionally get compiler errors
+ // if new selection choices are added later.
}
- return NULL;
+ return p;
}
static const char *
comparison_type_str(e_comparison_t cmpt)
{
+ const char *p = NULL;
switch (cmpt)
{
- case CMP_INVALID: return "INVALID"; break;
- case CMP_LESS: return "<"; break;
- case CMP_LEQ: return "<="; break;
- case CMP_GTR: return ">"; break;
- case CMP_GEQ: return ">="; break;
- case CMP_EQUAL: return "=="; break;
- case CMP_NEQ: return "!="; break;
+ case CMP_INVALID: p = "INVALID"; break;
+ case CMP_LESS: p = "<"; break;
+ case CMP_LEQ: p = "<="; break;
+ case CMP_GTR: p = ">"; break;
+ case CMP_GEQ: p = ">="; break;
+ case CMP_EQUAL: p = "=="; break;
+ case CMP_NEQ: p = "!="; break;
+ // No default clause so we intentionally get compiler errors
+ // if new selection choices are added later.
}
- return NULL;
+ return p;
}
/*!
struct TestPosition
{
- TestPosition() : refMinDist(0.0), refNearestPoint(-1)
- {
- clear_rvec(x);
- }
explicit TestPosition(const rvec x)
: refMinDist(0.0), refNearestPoint(-1)
{
/****************************************************
* IMPLEMENTATION HELPER FUNCTIONS *
****************************************************/
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
gmx_simd_setzero_ibm_qpx(void)
{
return vec_splats(0.0);
}
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
gmx_simd_get_exponent_ibm_qpx(vector4double x)
{
const gmx_int64_t expmask = 0x7ff0000000000000LL;
return vec_cfid(vec_ld(0, idata));
}
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
gmx_simd_get_mantissa_ibm_qpx(vector4double x)
{
const gmx_int64_t exp_and_sign_mask = 0xfff0000000000000LL;
return vec_ld(0, idata);
}
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
gmx_simd_set_exponent_ibm_qpx(vector4double x)
{
const gmx_int64_t expbase = 1023;
return vec_ld(0, idata);
}
-static __attribute__((always_inline)) double
+static __attribute__((always_inline)) double gmx_simdcall
gmx_simd_reduce_ibm_qpx(vector4double x)
{
vector4double y = vec_sldw(x, x, 2);
return vec_extract(y, 0);
}
-static __attribute__((always_inline)) vector4double
+static __attribute__((always_inline)) vector4double gmx_simdcall
gmx_simd_set1_int_ibm_qpx(int i)
{
int idata[4] __attribute__((aligned(32)));
}
/* This works in both single and double */
-static __attribute__((always_inline)) int
+static __attribute__((always_inline)) int gmx_simdcall
gmx_simd_anytrue_bool_ibm_qpx(vector4double a)
{
vector4double b = vec_sldw(a, a, 2);
#define gmx_simd4_blendv_d gmx_simd_blendv_d
#define gmx_simd4_reduce_d gmx_simd_reduce_d
-static __attribute__((always_inline)) double
+static __attribute__((always_inline)) double gmx_simdcall
gmx_simd4_dotproduct3_d_ibm_qpx(vector4double a, vector4double b)
{
vector4double dp_sh0 = vec_mul(a, b);
return vec_extract(dp, 0);
}
-static __attribute__((always_inline)) float
+static __attribute__((always_inline)) float gmx_simdcall
gmx_simd4_dotproduct3_f_ibm_qpx(vector4double a, vector4double b)
{
return (float)gmx_simd4_dotproduct3_d_ibm_qpx(a, b);
#define mask_hih _mm512_int2mask(0xFF00)
/* load store float */
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_loadu_f_mic(const float * m)
{
return _mm512_loadunpackhi_ps(_mm512_loadunpacklo_ps(_mm512_undefined_ps(), m), m+16);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_storeu_f_mic(float * m, __m512 s)
{
_mm512_packstorelo_ps(m, s);
}
/* load store fint32 */
-static gmx_inline __m512i
+static gmx_inline __m512i gmx_simdcall
gmx_simd_loadu_fi_mic(const gmx_int32_t * m)
{
return _mm512_loadunpackhi_epi32(_mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), m), m+16);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_storeu_fi_mic(gmx_int32_t * m, __m512i s)
{
_mm512_packstorelo_epi32(m, s);
}
/* load store double */
-static gmx_inline __m512d
+static gmx_inline __m512d gmx_simdcall
gmx_simd_loadu_d_mic(const double * m)
{
return _mm512_loadunpackhi_pd(_mm512_loadunpacklo_pd(_mm512_undefined_pd(), m), m+8);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_storeu_d_mic(double * m, __m512d s)
{
_mm512_packstorelo_pd(m, s);
}
/* load store dint32 */
-static gmx_inline __m512i
+static gmx_inline __m512i gmx_simdcall
gmx_simd_loadu_di_mic(const gmx_int32_t * m)
{
return _mm512_mask_loadunpackhi_epi32(_mm512_mask_loadunpacklo_epi32(_mm512_undefined_epi32(), mask_loh, m), mask_loh, m+16);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_storeu_di_mic(gmx_int32_t * m, __m512i s)
{
_mm512_mask_packstorelo_epi32(m, mask_loh, s);
}
/* load store simd4 */
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd4_loadu_f_mic(const float * m)
{
return _mm512_mask_loadunpackhi_ps(_mm512_mask_loadunpacklo_ps(_mm512_undefined_ps(), gmx_simd4_mask, m), gmx_simd4_mask, m+16);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd4_storeu_f_mic(float * m, __m512 s)
{
_mm512_mask_packstorelo_ps(m, gmx_simd4_mask, s);
_mm512_mask_packstorehi_ps(m+16, gmx_simd4_mask, s);
}
-static gmx_inline __m512d
+static gmx_inline __m512d gmx_simdcall
gmx_simd4_loadu_d_mic(const double * m)
{
return _mm512_mask_loadunpackhi_pd(_mm512_mask_loadunpacklo_pd(_mm512_undefined_pd(), gmx_simd4_mask, m), gmx_simd4_mask, m+8);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd4_storeu_d_mic(double * m, __m512d s)
{
_mm512_mask_packstorelo_pd(m, gmx_simd4_mask, s);
}
/* extract */
-static gmx_inline gmx_int32_t
+static gmx_inline gmx_int32_t gmx_simdcall
gmx_simd_extract_fi_mic(gmx_simd_fint32_t a, int index)
{
int r;
return r;
}
-static gmx_inline gmx_int32_t
+static gmx_inline gmx_int32_t gmx_simdcall
gmx_simd_extract_di_mic(gmx_simd_dint32_t a, int index)
{
int r;
/* This is likely faster than the built-in scale operation (lat 8, t-put 3),
 * since we only work on the integer part and use shifts. TODO: check, given that scale also only operates on the integer part.
*/
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_set_exponent_f_mic(__m512 a)
{
__m512i iexp = gmx_simd_cvt_f2i(a);
*/
}
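The helper above builds powers of two by shifting the biased integer exponent straight into the exponent field rather than using the scale operation. The scalar version of the same IEEE-754 trick (standard bit manipulation, not taken from the patch):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Build the float 2^i by writing (i + 127) into the 8 exponent bits;
 * 127 is the single-precision exponent bias. Valid for i in -126..127. */
static float exp2_from_bits(int i)
{
    uint32_t bits = (uint32_t)(i + 127) << 23;
    float    f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

int main(void) { printf("%g %g\n", exp2_from_bits(3), exp2_from_bits(-2)); return 0; }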
-static gmx_inline __m512d
+static gmx_inline __m512d gmx_simdcall
gmx_simd_set_exponent_d_mic(__m512d a)
{
const __m512i expbias = _mm512_set1_epi32(1023);
return _mm512_castsi512_pd(iexp);
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_cvt_f2dd_mic(__m512 f, __m512d * d0, __m512d * d1)
{
__m512i i1 = _mm512_permute4f128_epi32(_mm512_castps_si512(f), _MM_PERM_CDCD);
*d1 = _mm512_cvtpslo_pd(_mm512_castsi512_ps(i1));
}
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_cvt_dd2f_mic(__m512d d0, __m512d d1)
{
__m512 f0 = _mm512_cvtpd_pslo(d0);
return _mm512_mask_permute4f128_ps(f0, mask_hih, f1, PERM_LOW2HIGH);
}
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_exp2_f_mic(__m512 x)
{
return _mm512_exp223_ps(_mm512_cvtfxpnt_round_adjustps_epi32(x, _MM_ROUND_MODE_NEAREST, _MM_EXPADJ_24));
}
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_exp_f_mic(__m512 x)
{
/* Only 59 ulp accuracy, so we need to do an extra iteration
return _mm512_mask_fmadd_ps(r, m, t, r);
}
-static gmx_inline __m512
+static gmx_inline __m512 gmx_simdcall
gmx_simd_log_f_mic(__m512 x)
{
return _mm512_mul_ps(_mm512_set1_ps(0.693147180559945286226764), _mm512_log2ae23_ps(x));
/*********************************************************
* SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
*********************************************************/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_get_exponent_f_avx2_256(gmx_simd_float_t x)
{
const __m256 expmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
return _mm256_cvtepi32_ps(iexp);
}
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_set_exponent_f_avx2_256(gmx_simd_float_t x)
{
const __m256i expbias = _mm256_set1_epi32(127);
/*********************************************************
* SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
*********************************************************/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_get_exponent_d_avx2_256(gmx_simd_double_t x)
{
const __m256d expmask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FF0000000000000LL));
return _mm256_cvtepi32_pd(iexp128);
}
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_set_exponent_d_avx2_256(gmx_simd_double_t x)
{
const __m256i expbias = _mm256_set1_epi64x(1023LL);
return _mm256_castsi256_pd(iexp);
}
-static gmx_inline gmx_simd_dibool_t
+static gmx_inline gmx_simd_dibool_t gmx_simdcall
gmx_simd_cvt_db2dib_avx2_256(gmx_simd_dbool_t a)
{
__m128i ia = _mm256_castsi256_si128(_mm256_castpd_si256(a));
return ia;
}
-static gmx_inline gmx_simd_dbool_t
+static gmx_inline gmx_simd_dbool_t gmx_simdcall
gmx_simd_cvt_dib2db_avx2_256(gmx_simd_dibool_t ia)
{
__m128d lo = _mm_castsi128_pd(_mm_unpacklo_epi32(ia, ia));
#define gmx_simd4_cvt_f2d _mm256_cvtps_pd
#define gmx_simd4_cvt_d2f _mm256_cvtpd_ps
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd4_reduce_d_avx_128_fma(__m256d a)
{
double f;
return f;
}
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd4_dotproduct3_d_avx_128_fma(__m256d a, __m256d b)
{
double d;
/*********************************************************
* SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
*********************************************************/
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_simd_get_exponent_f_avx_256(__m256 x)
{
const __m256 expmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
return _mm256_cvtepi32_ps(iexp256);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_simd_get_mantissa_f_avx_256(__m256 x)
{
const __m256 mantmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x007FFFFF));
return _mm256_or_ps(x, one);
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_simd_set_exponent_f_avx_256(__m256 x)
{
const __m128i expbias = _mm_set1_epi32(127);
return _mm256_castsi256_ps(iexp256);
}
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd_reduce_f_avx_256(__m256 a)
{
float f;
/*********************************************************
* SIMD DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
*********************************************************/
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_simd_get_exponent_d_avx_256(__m256d x)
{
const __m256d expmask = _mm256_castsi256_pd( _mm256_set1_epi64x(0x7FF0000000000000LL));
return _mm256_cvtepi32_pd(iexp128a);
}
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_simd_get_mantissa_d_avx_256(__m256d x)
{
const __m256d mantmask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x000FFFFFFFFFFFFFLL));
return _mm256_or_pd(x, one);
}
-static gmx_inline __m256d
+static gmx_inline __m256d gmx_simdcall
gmx_simd_set_exponent_d_avx_256(__m256d x)
{
const __m128i expbias = _mm_set1_epi32(1023);
return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(iexp128a), iexp128b, 0x1));
}
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd_reduce_d_avx_256(__m256d a)
{
double f;
return f;
}
-static gmx_inline gmx_simd_dibool_t
+static gmx_inline gmx_simd_dibool_t gmx_simdcall
gmx_simd_cvt_db2dib_avx_256(gmx_simd_dbool_t a)
{
__m128i a1 = _mm256_extractf128_si256(_mm256_castpd_si256(a), 0x1);
return _mm_blend_epi16(a0, a1, 0xF0);
}
-static gmx_inline gmx_simd_dbool_t
+static gmx_inline gmx_simd_dbool_t gmx_simdcall
gmx_simd_cvt_dib2db_avx_256(gmx_simd_dibool_t a)
{
__m128i a1 = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2));
return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(a0), a1, 0x1));
}
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_cvt_f2dd_avx_256(__m256 f, __m256d *d0, __m256d *d1)
{
*d0 = _mm256_cvtps_pd(_mm256_castps256_ps128(f));
*d1 = _mm256_cvtps_pd(_mm256_extractf128_ps(f, 0x1));
}
-static gmx_inline __m256
+static gmx_inline __m256 gmx_simdcall
gmx_simd_cvt_dd2f_avx_256(__m256d d0, __m256d d1)
{
__m128 f0 = _mm256_cvtpd_ps(d0);
}
/* SIMD4 reduce helper */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd4_reduce_f_avx_256(__m128 a)
{
float f;
}
/* SIMD4 Dotproduct helper function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd4_dotproduct3_f_avx_256(__m128 a, __m128 b)
{
float f;
return f;
}
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd4_dotproduct3_d_avx_256(__m256d a, __m256d b)
{
double d;
/****************************************************
* SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
****************************************************/
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_simd_get_exponent_f_sse2(__m128 x)
{
const __m128 expmask = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
return _mm_cvtepi32_ps(iexp);
}
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_simd_get_mantissa_f_sse2(__m128 x)
{
const __m128 mantmask = _mm_castsi128_ps(_mm_set1_epi32(0x007FFFFF));
return _mm_or_ps(x, one);
}
-static gmx_inline __m128
+static gmx_inline __m128 gmx_simdcall
gmx_simd_set_exponent_f_sse2(__m128 x)
{
const __m128i expbias = _mm_set1_epi32(127);
return _mm_castsi128_ps(iexp);
}
-static gmx_inline __m128i
+static gmx_inline __m128i gmx_simdcall
gmx_simd_mul_fi_sse2(__m128i a, __m128i b)
{
__m128i a1 = _mm_srli_si128(a, 4); /* - a[3] a[2] a[1] */
return _mm_unpacklo_epi32(c, c1);
}
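/* Editor's note: SSE2 has no packed 32-bit multiply, so the elided body
 * multiplies even and odd lanes separately with _mm_mul_epu32 and interleaves
 * the low 32 bits of each 64-bit product. A hedged sketch of the standard
 * emulation (not necessarily the exact elided code):
 */
static __m128i mul_epi32_sse2_sketch(__m128i a, __m128i b)
{
    __m128i a1 = _mm_srli_si128(a, 4);                   /* odd lanes of a */
    __m128i b1 = _mm_srli_si128(b, 4);                   /* odd lanes of b */
    __m128i c  = _mm_mul_epu32(a, b);                    /* a[0]*b[0], a[2]*b[2] as 64-bit */
    __m128i c1 = _mm_mul_epu32(a1, b1);                  /* a[1]*b[1], a[3]*b[3] as 64-bit */
    c  = _mm_shuffle_epi32(c,  _MM_SHUFFLE(3, 1, 2, 0)); /* pack low halves together */
    c1 = _mm_shuffle_epi32(c1, _MM_SHUFFLE(3, 1, 2, 0));
    return _mm_unpacklo_epi32(c, c1);
}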
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd_reduce_f_sse2(__m128 a)
{
__m128 b;
/****************************************************
* DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
****************************************************/
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_simd_get_exponent_d_sse2(__m128d x)
{
/* Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds */
return _mm_cvtepi32_pd(iexp);
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_simd_get_mantissa_d_sse2(__m128d x)
{
/* Don't use _mm_set1_epi64x() - on MSVC it is only supported for 64-bit builds */
return _mm_or_pd(x, one);
}
-static gmx_inline __m128d
+static gmx_inline __m128d gmx_simdcall
gmx_simd_set_exponent_d_sse2(__m128d x)
{
const __m128i expbias = _mm_set1_epi32(1023);
return _mm_castsi128_pd(iexp);
}
-static gmx_inline __m128i
+static gmx_inline __m128i gmx_simdcall
gmx_simd_mul_di_sse2(__m128i a, __m128i b)
{
__m128i c;
return _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 1, 2, 0)); /* 0 0 a[1]*b[1] a[0]*b[0] */
}
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd_reduce_d_sse2(__m128d a)
{
__m128d b;
#define gmx_simd4_reduce_f gmx_simd_reduce_f
/* SIMD4 Dotproduct helper function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd4_dotproduct3_f_sse2(__m128 a, __m128 b)
{
float f;
#define gmx_simd4_dotproduct3_f gmx_simd4_dotproduct3_f_sse4_1
/* SIMD reduction function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd_reduce_f_sse4_1(__m128 a)
{
float f;
}
/* SIMD4 Dotproduct helper function */
-static gmx_inline float
+static gmx_inline float gmx_simdcall
gmx_simd4_dotproduct3_f_sse4_1(__m128 a, __m128 b)
{
float f;
return f;
}
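/* Editor's note: with SSE4.1 available, a three-component dot product can use
 * the dedicated _mm_dp_ps instruction; a hedged sketch of the idiom (mask
 * 0x71 multiplies elements 0-2 and places the sum in element 0):
 */
static float dotproduct3_sse4_1_sketch(__m128 a, __m128 b)
{
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x71));
}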
-static gmx_inline double
+static gmx_inline double gmx_simdcall
gmx_simd_reduce_d_sse4_1(__m128d a)
{
double f;
* \param d term 4 (multiple values)
* \return sum of terms 1-4 (multiple values)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_sum4_f(gmx_simd_float_t a, gmx_simd_float_t b,
gmx_simd_float_t c, gmx_simd_float_t d)
{
* with the exception that negative zero is not considered to be negative
* on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_xor_sign_f(gmx_simd_float_t a, gmx_simd_float_t b)
{
#ifdef GMX_SIMD_HAVE_LOGICAL
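/* Editor's note: with logical operations available, xor_sign is typically a
 * single xor with the sign bit of b (hedged sketch, not necessarily the exact
 * elided code):
 *
 *     return gmx_simd_xor_f(a, gmx_simd_and_f(gmx_simd_set1_f(-0.0f), b));
 *
 * i.e. the sign of a is flipped exactly where b is negative, computing
 * a*sign(b) without a multiply.
 */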
* \param x The reference (starting) value x for which we want 1/sqrt(x).
* \return An improved approximation with roughly twice as many bits of accuracy.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_rsqrt_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
{
# ifdef GMX_SIMD_HAVE_FMA
* \param x Argument that must be >0. This routine does not check arguments.
* \return 1/sqrt(x). Result is undefined if your argument was invalid.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_invsqrt_f(gmx_simd_float_t x)
{
gmx_simd_float_t lu = gmx_simd_rsqrt_f(x);
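/* Editor's note: the elided remainder applies gmx_simd_rsqrt_iter_f as many
 * times as needed to refine the hardware lookup to full single precision;
 * each Newton-Raphson step, lu' = lu*(1.5 - 0.5*x*lu*lu), roughly doubles
 * the bits of accuracy. A hedged scalar sketch of one iteration:
 *
 *     float rsqrt_iter_scalar(float lu, float x)
 *     {
 *         return lu*(1.5f - 0.5f*x*lu*lu);
 *     }
 */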
* In particular for double precision we can sometimes calculate square root
* pairs slightly faster by using single precision until the very last step.
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_invsqrt_pair_f(gmx_simd_float_t x0, gmx_simd_float_t x1,
gmx_simd_float_t *out0, gmx_simd_float_t *out1)
{
* \param x The reference (starting) value x for which we want 1/x.
* \return An improved approximation with roughly twice as many bits of accuracy.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_rcp_iter_f(gmx_simd_float_t lu, gmx_simd_float_t x)
{
return gmx_simd_mul_f(lu, gmx_simd_fnmadd_f(lu, x, gmx_simd_set1_f(2.0f)));
* \param x Argument that must be nonzero. This routine does not check arguments.
* \return 1/x. Result is undefined if your argument was invalid.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_inv_f(gmx_simd_float_t x)
{
gmx_simd_float_t lu = gmx_simd_rcp_f(x);
* \return sqrt(x). If x=0, the result will correctly be set to 0.
* The result is undefined if the input value is negative.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_sqrt_f(gmx_simd_float_t x)
{
gmx_simd_fbool_t mask;
* \result The natural logarithm of x. Undefined if argument is invalid.
*/
#ifndef gmx_simd_log_f
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_log_f(gmx_simd_float_t x)
{
const gmx_simd_float_t half = gmx_simd_set1_f(0.5f);
* \param x Argument.
* \result 2^x. Undefined if input argument caused overflow.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_exp2_f(gmx_simd_float_t x)
{
/* Lower bound: Disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
* \result exp(x). Undefined if input argument caused overflow,
* which can happen if abs(x) \> 7e13.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_exp_f(gmx_simd_float_t x)
{
const gmx_simd_float_t argscale = gmx_simd_set1_f(1.44269504088896341f);
* This routine achieves very close to full precision, but we do not care about
* the last bit or the subnormal result range.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_erf_f(gmx_simd_float_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
* (think results that are in the ballpark of 10^-30 for single precision,
* or 10^-200 for double) since that is not relevant for MD.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_erfc_f(gmx_simd_float_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*P(x^2) in range [-1,1] */
* magnitudes of the argument we inherently begin to lose accuracy due to the
* argument reduction, despite using extended precision arithmetics internally.
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_sincos_f(gmx_simd_float_t x, gmx_simd_float_t *sinval, gmx_simd_float_t *cosval)
{
/* Constants to subtract Pi/4*x from y while minimizing precision loss */
* \attention Do NOT call both sin & cos if you need both results, since each of them
* will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_sin_f(gmx_simd_float_t x)
{
gmx_simd_float_t s, c;
* \attention Do NOT call both sin & cos if you need both results, since each of them
* will then call \ref gmx_simd_sincos_r and waste a factor 2 in performance.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_cos_f(gmx_simd_float_t x)
{
gmx_simd_float_t s, c;
* \param x The argument to evaluate tan for
* \result Tan(x)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_tan_f(gmx_simd_float_t x)
{
const gmx_simd_float_t argred0 = gmx_simd_set1_f(1.5703125);
* \param x The argument to evaluate asin for
* \result Asin(x)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_asin_f(gmx_simd_float_t x)
{
const gmx_simd_float_t limitlow = gmx_simd_set1_f(1e-4f);
* \param x The argument to evaluate acos for
* \result Acos(x)
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_acos_f(gmx_simd_float_t x)
{
const gmx_simd_float_t one = gmx_simd_set1_f(1.0f);
* \param x The argument to evaluate atan for
* \result Atan(x), same argument/value range as standard math library.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_atan_f(gmx_simd_float_t x)
{
const gmx_simd_float_t halfpi = gmx_simd_set1_f(M_PI/2);
* of any concern in Gromacs, and in particular it will not affect calculations
* of angles from vectors.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_atan2_f(gmx_simd_float_t y, gmx_simd_float_t x)
{
const gmx_simd_float_t pi = gmx_simd_set1_f(M_PI);
* For \f$\beta r \geq 7206\f$ the return value can be inf or NaN.
*
*/
-static gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_pmecorrF_f(gmx_simd_float_t z2)
{
const gmx_simd_float_t FN6 = gmx_simd_set1_f(-1.7357322914161492954e-8f);
* when added to \f$1/r\f$ the error will be insignificant.
* For \f$\beta r \geq 7142\f$ the return value can be inf or NaN.
*/
-static gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_pmecorrV_f(gmx_simd_float_t z2)
{
const gmx_simd_float_t VN6 = gmx_simd_set1_f(1.9296833005951166339e-8f);
*
* \copydetails gmx_simd_sum4_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_sum4_d(gmx_simd_double_t a, gmx_simd_double_t b,
gmx_simd_double_t c, gmx_simd_double_t d)
{
* with the exception that negative zero is not considered to be negative
* on architectures where \ref GMX_SIMD_HAVE_LOGICAL is not set.
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_xor_sign_d(gmx_simd_double_t a, gmx_simd_double_t b)
{
#ifdef GMX_SIMD_HAVE_LOGICAL
*
* \copydetails gmx_simd_rsqrt_iter_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_rsqrt_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
{
#ifdef GMX_SIMD_HAVE_FMA
*
* \copydetails gmx_simd_invsqrt_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_invsqrt_d(gmx_simd_double_t x)
{
gmx_simd_double_t lu = gmx_simd_rsqrt_d(x);
*
* \copydetails gmx_simd_invsqrt_pair_f
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_invsqrt_pair_d(gmx_simd_double_t x0, gmx_simd_double_t x1,
gmx_simd_double_t *out0, gmx_simd_double_t *out1)
{
*
* \copydetails gmx_simd_rcp_iter_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_rcp_iter_d(gmx_simd_double_t lu, gmx_simd_double_t x)
{
return gmx_simd_mul_d(lu, gmx_simd_fnmadd_d(lu, x, gmx_simd_set1_d(2.0)));
*
* \copydetails gmx_simd_inv_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_inv_d(gmx_simd_double_t x)
{
gmx_simd_double_t lu = gmx_simd_rcp_d(x);
*
* \copydetails gmx_simd_sqrt_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_sqrt_d(gmx_simd_double_t x)
{
gmx_simd_dbool_t mask;
*
* \copydetails gmx_simd_log_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_log_d(gmx_simd_double_t x)
{
const gmx_simd_double_t half = gmx_simd_set1_d(0.5);
*
* \copydetails gmx_simd_exp2_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_exp2_d(gmx_simd_double_t x)
{
const gmx_simd_double_t arglimit = gmx_simd_set1_d(1022.0);
*
* \copydetails gmx_simd_exp_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_exp_d(gmx_simd_double_t x)
{
const gmx_simd_double_t argscale = gmx_simd_set1_d(1.44269504088896340735992468100);
*
* \copydetails gmx_simd_erf_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_erf_d(gmx_simd_double_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
*
* \copydetails gmx_simd_erfc_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_erfc_d(gmx_simd_double_t x)
{
/* Coefficients for minimax approximation of erf(x)=x*(CAoffset + P(x^2)/Q(x^2)) in range [-0.75,0.75] */
*
* \copydetails gmx_simd_sincos_f
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_sincos_d(gmx_simd_double_t x, gmx_simd_double_t *sinval, gmx_simd_double_t *cosval)
{
/* Constants to subtract Pi/4*x from y while minimizing precision loss */
*
* \copydetails gmx_simd_sin_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_sin_d(gmx_simd_double_t x)
{
gmx_simd_double_t s, c;
*
* \copydetails gmx_simd_cos_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_cos_d(gmx_simd_double_t x)
{
gmx_simd_double_t s, c;
*
* \copydetails gmx_simd_tan_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_tan_d(gmx_simd_double_t x)
{
const gmx_simd_double_t argred0 = gmx_simd_set1_d(2*0.78539816290140151978);
*
* \copydetails gmx_simd_asin_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_asin_d(gmx_simd_double_t x)
{
/* Same algorithm as cephes library */
*
* \copydetails gmx_simd_acos_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_acos_d(gmx_simd_double_t x)
{
const gmx_simd_double_t one = gmx_simd_set1_d(1.0);
*
* \copydetails gmx_simd_atan_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_atan_d(gmx_simd_double_t x)
{
/* Same algorithm as cephes library */
*
* \copydetails gmx_simd_atan2_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_atan2_d(gmx_simd_double_t y, gmx_simd_double_t x)
{
const gmx_simd_double_t pi = gmx_simd_set1_d(M_PI);
*
* \copydetails gmx_simd_pmecorrF_f
*/
-static gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_pmecorrF_d(gmx_simd_double_t z2)
{
const gmx_simd_double_t FN10 = gmx_simd_set1_d(-8.0072854618360083154e-14);
*
* \copydetails gmx_simd_pmecorrV_f
*/
-static gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_pmecorrV_d(gmx_simd_double_t z2)
{
const gmx_simd_double_t VN9 = gmx_simd_set1_d(-9.3723776169321855475e-13);
*
* \copydetails gmx_simd_sum4_f
*/
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
gmx_simd4_sum4_f(gmx_simd4_float_t a, gmx_simd4_float_t b,
gmx_simd4_float_t c, gmx_simd4_float_t d)
{
*
* \copydetails gmx_simd_rsqrt_iter_f
*/
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
gmx_simd4_rsqrt_iter_f(gmx_simd4_float_t lu, gmx_simd4_float_t x)
{
# ifdef GMX_SIMD_HAVE_FMA
*
* \copydetails gmx_simd_invsqrt_f
*/
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
gmx_simd4_invsqrt_f(gmx_simd4_float_t x)
{
gmx_simd4_float_t lu = gmx_simd4_rsqrt_f(x);
*
* \copydetails gmx_simd_sum4_f
*/
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
gmx_simd4_sum4_d(gmx_simd4_double_t a, gmx_simd4_double_t b,
gmx_simd4_double_t c, gmx_simd4_double_t d)
{
*
* \copydetails gmx_simd_rsqrt_iter_f
*/
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
gmx_simd4_rsqrt_iter_d(gmx_simd4_double_t lu, gmx_simd4_double_t x)
{
#ifdef GMX_SIMD_HAVE_FMA
*
* \copydetails gmx_simd_invsqrt_f
*/
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
gmx_simd4_invsqrt_d(gmx_simd4_double_t x)
{
gmx_simd4_double_t lu = gmx_simd4_rsqrt_d(x);
public:
::testing::AssertionResult
compareSimd4MathFunction(const char * refFuncExpr, const char *simd4FuncExpr,
- real refFunc(real x), gmx_simd4_real_t simd4Func(gmx_simd4_real_t x));
+ real refFunc(real x), gmx_simd4_real_t gmx_simdcall simd4Func(gmx_simd4_real_t x));
};
/*! \brief Test approximate equality of SIMD4 vs reference version of a function.
*/
::testing::AssertionResult
Simd4MathTest::compareSimd4MathFunction(const char * refFuncExpr, const char *simd4FuncExpr,
- real refFunc(real x), gmx_simd4_real_t simd4Func(gmx_simd4_real_t x))
+ real refFunc(real x), gmx_simd4_real_t gmx_simdcall simd4Func(gmx_simd4_real_t x))
{
std::vector<real> vx(GMX_SIMD4_WIDTH);
std::vector<real> vref(GMX_SIMD4_WIDTH);
public:
::testing::AssertionResult
compareSimdMathFunction(const char * refFuncExpr, const char *simdFuncExpr,
- real refFunc(real x), gmx_simd_real_t simdFunc(gmx_simd_real_t x));
+ real refFunc(real x), gmx_simd_real_t gmx_simdcall simdFunc(gmx_simd_real_t x));
};
/*! \brief Test approximate equality of SIMD vs reference version of a function.
*/
::testing::AssertionResult
SimdMathTest::compareSimdMathFunction(const char * refFuncExpr, const char *simdFuncExpr,
- real refFunc(real x), gmx_simd_real_t simdFunc(gmx_simd_real_t x))
+ real refFunc(real x), gmx_simd_real_t gmx_simdcall simdFunc(gmx_simd_real_t x))
{
std::vector<real> vx(GMX_SIMD_REAL_WIDTH);
std::vector<real> vref(GMX_SIMD_REAL_WIDTH);
}
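/* Editor's note: gmx_simdcall is added to these function-type parameters as
 * well as to the definitions of the functions passed in, because the calling
 * convention is part of the function type; when gmx_simdcall expands to
 * __vectorcall (MSVC) or __regcall (ICC), a pointer to a plain-convention
 * function would not match. A hedged sketch of the matching pointer type:
 *
 *     typedef gmx_simd_real_t (gmx_simdcall *simd_func_t)(gmx_simd_real_t);
 *     simd_func_t f = tst_invsqrt_pair0; // type-checks only with gmx_simdcall
 */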
/*! \brief Function wrapper to return first result when testing \ref gmx_simd_invsqrt_pair_r */
-gmx_simd_real_t
+gmx_simd_real_t gmx_simdcall
tst_invsqrt_pair0(gmx_simd_real_t x)
{
gmx_simd_real_t r0, r1;
}
/*! \brief Function wrapper to return second result when testing \ref gmx_simd_invsqrt_pair_r */
-gmx_simd_real_t
+gmx_simd_real_t gmx_simdcall
tst_invsqrt_pair1(gmx_simd_real_t x)
{
gmx_simd_real_t r0, r1;
*
 * \note The SIMD aspect is that a single call calculates many scalar products, one per SIMD lane.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_iprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
gmx_simd_float_t bx, gmx_simd_float_t by, gmx_simd_float_t bz)
{
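/* Editor's note: the elided body is the straightforward fused expression
 * evaluating ax*bx + ay*by + az*bz in every SIMD lane; a hedged sketch using
 * the module's FMA helper (gmx_simd_fmadd_f(a, b, c) computes a*b + c):
 *
 *     return gmx_simd_fmadd_f(ax, bx, gmx_simd_fmadd_f(ay, by, gmx_simd_mul_f(az, bz)));
 */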
* \note This corresponds to the scalar product of the vector with itself, but
* the compiler might be able to optimize it better with identical vectors.
*/
-static gmx_inline gmx_simd_float_t
+static gmx_inline gmx_simd_float_t gmx_simdcall
gmx_simd_norm2_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az)
{
gmx_simd_float_t ret;
* The arguments x/y/z denotes the different components, and each element
* corresponds to a separate vector.
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_cprod_f(gmx_simd_float_t ax, gmx_simd_float_t ay, gmx_simd_float_t az,
gmx_simd_float_t bx, gmx_simd_float_t by, gmx_simd_float_t bz,
gmx_simd_float_t *cx, gmx_simd_float_t *cy, gmx_simd_float_t *cz)
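/* Editor's note: the elided body computes the standard per-lane cross
 * product; a hedged sketch of the x component with the FMA helpers
 * (gmx_simd_fmsub_f(a, b, c) computes a*b - c):
 *
 *     *cx = gmx_simd_fmsub_f(ay, bz, gmx_simd_mul_f(az, by));
 *
 * with the cyclic permutations of the inputs giving *cy and *cz.
 */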
*
* \copydetails gmx_simd_iprod_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_iprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az,
gmx_simd_double_t bx, gmx_simd_double_t by, gmx_simd_double_t bz)
{
*
* \copydetails gmx_simd_norm2_f
*/
-static gmx_inline gmx_simd_double_t
+static gmx_inline gmx_simd_double_t gmx_simdcall
gmx_simd_norm2_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az)
{
gmx_simd_double_t ret;
*
* \copydetails gmx_simd_cprod_f
*/
-static gmx_inline void
+static gmx_inline void gmx_simdcall
gmx_simd_cprod_d(gmx_simd_double_t ax, gmx_simd_double_t ay, gmx_simd_double_t az,
gmx_simd_double_t bx, gmx_simd_double_t by, gmx_simd_double_t bz,
gmx_simd_double_t *cx, gmx_simd_double_t *cy, gmx_simd_double_t *cz)
*
* \copydetails gmx_simd_norm2_f
*/
-static gmx_inline gmx_simd4_float_t
+static gmx_inline gmx_simd4_float_t gmx_simdcall
gmx_simd4_norm2_f(gmx_simd4_float_t ax, gmx_simd4_float_t ay, gmx_simd4_float_t az)
{
gmx_simd4_float_t ret;
*
* \copydetails gmx_simd_norm2_f
*/
-static gmx_inline gmx_simd4_double_t
+static gmx_inline gmx_simd4_double_t gmx_simdcall
gmx_simd4_norm2_d(gmx_simd4_double_t ax, gmx_simd4_double_t ay, gmx_simd4_double_t az)
{
gmx_simd4_double_t ret;
typedef unsigned long long
gmx_cycles_t;
+#elif ((defined __aarch64__) && (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__) || defined(__PGIC__)))
+/* 64-bit ARM cycle counters with GCC inline assembly */
+typedef unsigned long long
+ gmx_cycles_t;
+
#elif defined(_MSC_VER)
#include <windows.h>
typedef __int64
/* x86 or x86-64 with GCC inline assembly - pentium TSC register */
return 1;
}
+#elif ((defined __aarch64__) && (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__) || defined(__PGIC__)))
+static __inline int gmx_cycles_have_counter(void)
+{
+ /* 64-bit ARM cycle counters with GCC inline assembly */
+ return 1;
+}
#elif (defined(_MSC_VER))
static __inline int gmx_cycles_have_counter(void)
{
return cycle;
}
+#elif ((defined __aarch64__) && (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__) || defined(__PGIC__)))
+static __inline__ gmx_cycles_t gmx_cycles_read(void)
+{
+ /* 64-bit ARM cycle counters with GCC inline assembly */
+ gmx_cycles_t cycle;
+ __asm__ __volatile__("mrs %0, cntvct_el0" : "=r" (cycle) );
+
+ return cycle;
+}
+
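/* Editor's note: cntvct_el0 is the AArch64 generic-timer virtual count; it
 * ticks at the fixed frequency reported by cntfrq_el0 rather than at the CPU
 * clock, but remains a monotonic, low-overhead counter suitable for relative
 * timing. A hedged sketch for converting ticks to seconds (hypothetical
 * helper, not part of this patch):
 */
static __inline__ double cycles_to_seconds_sketch(gmx_cycles_t ticks)
{
    unsigned long long freq;
    __asm__ __volatile__("mrs %0, cntfrq_el0" : "=r" (freq));
    return (double)ticks/(double)freq;
}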
#elif defined(_MSC_VER)
static __inline gmx_cycles_t gmx_cycles_read(void)
{
-#ifdef HAVE_RDTSCP
+#ifdef _M_ARM
+ /* Windows on ARM (_M_ARM): read the 64-bit cycle counter via __rdpmccntr64() */
+ return __rdpmccntr64();
+#else
+ /* x86 */
+# ifdef HAVE_RDTSCP
unsigned int ui;
return __rdtscp(&ui);
-#else
+# else
return __rdtsc();
+# endif
#endif
}
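/* Editor's note: a minimal usage sketch for the counters above (hedged); the
 * delta is in timer ticks: TSC cycles on x86, generic-timer ticks on AArch64:
 */
static __inline gmx_cycles_t count_ticks_sketch(void)
{
    gmx_cycles_t c0 = gmx_cycles_read();
    /* ... code to be timed ... */
    return gmx_cycles_read() - c0;
}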
#elif (defined(__hpux) || defined(__HP_cc)) && defined(__ia64)
"Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
"PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
"PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
- "Vsite spread", "Write traj.", "Update", "Constraints", "Comm. energies",
+ "Vsite spread", "COM pull force",
+ "Write traj.", "Update", "Constraints", "Comm. energies",
"Enforced rotation", "Add rot. forces", "Coordinate swapping", "IMD", "Test"
};
ewcMOVEX, ewcGB, ewcFORCE, ewcMOVEF, ewcPMEMESH,
ewcPME_REDISTXF, ewcPME_SPREADGATHER, ewcPME_FFT, ewcPME_FFTCOMM, ewcLJPME, ewcPME_SOLVE,
ewcPMEWAITCOMM, ewcPP_PMEWAITRECVF, ewcWAIT_GPU_NB_NL, ewcWAIT_GPU_NB_L, ewcNB_XF_BUF_OPS,
- ewcVSITESPREAD, ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD,
+ ewcVSITESPREAD, ewcPULLPOT,
+ ewcTRAJ, ewcUPDATE, ewcCONSTR, ewcMoveE, ewcROT, ewcROTadd, ewcSWAP, ewcIMD,
ewcTEST, ewcNR
};
select.cpp
$<TARGET_OBJECTS:analysisdata-test-shared>)
-add_executable(test_selection test_selection.cpp)
+add_executable(test_selection ${UNITTEST_TARGET_OPTIONS} test_selection.cpp)
target_link_libraries(test_selection libgromacs ${GMX_EXE_LINKER_FLAGS})
#elif (defined(__INTEL_COMPILER) || defined(__ECC)) && !defined(_MSC_VER)
/* ICC on *nix */
# define gmx_unused __attribute__ ((unused))
+#elif defined(__PGI)
+/* Portland group compilers */
+# define gmx_unused __attribute__ ((unused))
#elif defined _MSC_VER
/* MSVC */
# define gmx_unused /*@unused@*/
if (bVV && !bStartingFromCpt && !bRerunMD)
/* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
{
+ wallcycle_start(wcycle, ewcUPDATE);
if (ir->eI == eiVV && bInitStep)
{
/* if using velocity verlet with full time step Ekin,
if (!bRerunMD || rerun_fr.bV || bForceUpdate) /* Why is rerun_fr.bV here? Unclear. */
{
+ wallcycle_stop(wcycle, ewcUPDATE);
update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
state, fr->bMolPBC, graph, f,
&top->idef, shake_vir,
cr, nrnb, wcycle, upd, constr,
TRUE, bCalcVir, vetanew);
+ wallcycle_start(wcycle, ewcUPDATE);
if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
{
So we need information from the last step in the first half of the integration */
if (bGStat || do_per_step(step-1, nstglobalcomm))
{
+ wallcycle_stop(wcycle, ewcUPDATE);
compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
constr, NULL, FALSE, state->box,
time step kinetic energy for the pressure (always true now, since we want accurate statistics).
b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
EkinAveVel because it's needed for the pressure */
+ wallcycle_start(wcycle, ewcUPDATE);
}
/* temperature scaling and pressure scaling to produce the extended variables at t+dt */
if (!bInitStep)
{
if (bExchanged)
{
-
+ wallcycle_stop(wcycle, ewcUPDATE);
/* We need the kinetic energy at minus the half step for determining
* the full step kinetic energy and possibly for T-coupling.*/
/* This may not be quite working correctly yet . . . . */
constr, NULL, FALSE, state->box,
top_global, &bSumEkinhOld,
CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ wallcycle_start(wcycle, ewcUPDATE);
}
}
}
{
copy_rvecn(cbuf, state->v, 0, state->natoms);
}
+ wallcycle_stop(wcycle, ewcUPDATE);
}
/* MRS -- now done iterating -- compute the conserved quantity */
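/* Editor's note: the wallcycle edits above pause the ewcUPDATE counter around
 * calls that are timed under their own counters, so ewcUPDATE accumulates
 * only time spent in the update itself. A hedged sketch of the pairing
 * pattern (do_update_work/do_other_timed_work are hypothetical placeholders):
 */
void do_update_work(void);      /* hypothetical */
void do_other_timed_work(void); /* hypothetical */
static void timed_update_sketch(gmx_wallcycle_t wcycle)
{
    wallcycle_start(wcycle, ewcUPDATE);
    do_update_work();           /* counted in ewcUPDATE */
    wallcycle_stop(wcycle, ewcUPDATE);
    do_other_timed_work();      /* counted under its own counter */
    wallcycle_start(wcycle, ewcUPDATE);
    do_update_work();           /* counted in ewcUPDATE again */
    wallcycle_stop(wcycle, ewcUPDATE);
}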
namespace
{
+#if defined(GMX_THREAD_MPI) || defined(DOXYGEN)
//! Number of tMPI threads for child mdrun call.
-int gmx_unused g_numThreads = 1;
+int g_numThreads = 1;
+#endif
+#if defined(GMX_OPENMP) || defined(DOXYGEN)
//! Number of OpenMP threads for child mdrun call.
-int gmx_unused g_numOpenMPThreads = 1;
-
+int g_numOpenMPThreads = 1;
+#endif
//! \cond
GMX_TEST_OPTIONS(MdrunTestOptions, options)
{
include_directories(${LIBXML2_INCLUDE_DIR})
file(GLOB TESTUTILS_SOURCES *.cpp)
-add_library(testutils STATIC ${TESTUTILS_SOURCES})
+add_library(testutils STATIC ${UNITTEST_TARGET_OPTIONS} ${TESTUTILS_SOURCES})
set(TESTUTILS_LIBS testutils ${GMOCK_LIBRARIES} ${LIBXML2_LIBRARIES})
set_property(TARGET testutils APPEND PROPERTY COMPILE_DEFINITIONS "${GMOCK_COMPILE_DEFINITIONS}")
target_link_libraries(testutils libgromacs ${GMOCK_LIBRARIES} ${LIBXML2_LIBRARIES})
function (gmx_add_unit_test_object_library NAME)
if (GMX_BUILD_UNITTESTS AND BUILD_TESTING)
include_directories(BEFORE ${GMOCK_INCLUDE_DIRS})
- add_library(${NAME} OBJECT ${ARGN})
+ add_library(${NAME} OBJECT ${UNITTEST_TARGET_OPTIONS} ${ARGN})
set_property(TARGET ${NAME} APPEND PROPERTY COMPILE_DEFINITIONS "${GMOCK_COMPILE_DEFINITIONS}")
endif()
endfunction ()
function (gmx_build_unit_test NAME EXENAME)
if (GMX_BUILD_UNITTESTS AND BUILD_TESTING)
include_directories(BEFORE ${GMOCK_INCLUDE_DIRS})
- add_executable(${EXENAME} ${ARGN} ${TESTUTILS_DIR}/unittest_main.cpp)
+ add_executable(${EXENAME} ${UNITTEST_TARGET_OPTIONS} ${ARGN} ${TESTUTILS_DIR}/unittest_main.cpp)
set_property(TARGET ${EXENAME} APPEND PROPERTY COMPILE_DEFINITIONS "${GMOCK_COMPILE_DEFINITIONS}")
target_link_libraries(${EXENAME} libgromacs ${TESTUTILS_LIBS} ${GMOCK_LIBRARIES} ${GMX_EXE_LINKER_FLAGS})
set(_temporary_files_path "${CMAKE_CURRENT_BINARY_DIR}/Testing/Temporary")