/* Routines to decrement rvec in memory, typically used for j particle force updates */
-static void
-gmx_mm256_decrement_1rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA, __m256d xyz)
-{
- __m256d t1,t2;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_blend_pd(_mm256_setzero_pd(),xyz,0x7);
- t1 = _mm256_sub_pd(t1,t2);
- /* OK to add zeros and store more values here, since we only do a single store that cannot overlap */
- _mm256_storeu_pd(ptrA,t1);
-}
-
-
-
/* Decrement three consecutive rvecs at ptrA (9 doubles) by the packed
 * coordinates xyz1..xyz3, each holding one rvec as (x,y,z,-).
 * The inputs are shuffled into the memory layout
 *   ptrA[0..3] = x1 y1 z1 x2,  ptrA[4..7] = y2 z2 x3 y3,  ptrA[8] = z3
 * before subtracting; the last element is handled as a scalar so only
 * 9 doubles are ever written.  Lane comments read high-to-low. */
static void
gmx_mm256_decrement_3rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
                                            __m256d xyz1, __m256d xyz2, __m256d xyz3)
{
    __m256d t1,t2;
    __m256d tA,tB;
    __m128d tC;

    tA = _mm256_loadu_pd(ptrA);
    tB = _mm256_loadu_pd(ptrA+4);
    tC = _mm_load_sd(ptrA+8);

    /* xyz1:  -  z1 | y1 x1 */
    /* xyz2:  -  z2 | y2 x2 */
    /* xyz3:  -  z3 | y3 x3 */

    xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z2 -  | x2 y2 */
    t1 = _mm256_permute2f128_pd(xyz2,xyz2,0x21);                 /* x2 y2 | z2 -  */
    xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
    xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /* -  -  | z2 y2 */
    t2 = _mm256_permute2f128_pd(xyz3,xyz3,0x21);                /* y3 x3 | -  z3 */
    xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /* y3 x3 | z2 y2 */

    tA = _mm256_sub_pd(tA,xyz1);
    tB = _mm256_sub_pd(tB,xyz2);
    /* low element of t2 is z3; only that lane of the result is stored */
    tC = _mm_sub_sd(tC, _mm256_castpd256_pd128(t2));

    _mm256_storeu_pd(ptrA,tA);
    _mm256_storeu_pd(ptrA+4,tB);
    _mm_store_sd(ptrA+8,tC);
}
-
/* Decrement four consecutive rvecs at ptrA (12 doubles) by the packed
 * coordinates xyz1..xyz4, each holding one rvec as (x,y,z,-).
 * The inputs are shuffled into the memory layout
 *   ptrA[0..3] = x1 y1 z1 x2, ptrA[4..7] = y2 z2 x3 y3, ptrA[8..11] = z3 x4 y4 z4
 * before subtracting.  Exactly 12 doubles are written, so full-width stores
 * are safe.  Lane comments read high-to-low. */
static void
gmx_mm256_decrement_4rvec_1ptr_noswizzle_pd(double * gmx_restrict ptrA,
                                            __m256d xyz1, __m256d xyz2, __m256d xyz3, __m256d xyz4)
{
    __m256d t1,t2,t3;
    __m256d tA,tB,tC;

    tA = _mm256_loadu_pd(ptrA);
    tB = _mm256_loadu_pd(ptrA+4);
    tC = _mm256_loadu_pd(ptrA+8);

    /* xyz1:  -  z1 | y1 x1 */
    /* xyz2:  -  z2 | y2 x2 */
    /* xyz3:  -  z3 | y3 x3 */
    /* xyz4:  -  z4 | y4 x4 */

    xyz2 = _mm256_permute_pd(xyz2,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z2 -  | x2 y2 */
    t1 = _mm256_permute2f128_pd(xyz2,xyz2,0x21);                 /* x2 y2 | z2 -  */
    xyz1 = _mm256_blend_pd(xyz1,t1,_GMX_MM_BLEND256D(1,0,0,0)); /* x2 z1 | y1 x1 */
    xyz2 = _mm256_blend_pd(xyz2,t1,_GMX_MM_BLEND256D(0,0,1,0)); /* -  -  | z2 y2 */
    t2 = _mm256_permute2f128_pd(xyz3,xyz3,0x21);                /* y3 x3 | -  z3 */
    xyz2 = _mm256_blend_pd(xyz2,t2,_GMX_MM_BLEND256D(1,1,0,0)); /* y3 x3 | z2 y2 */
    xyz4 = _mm256_permute_pd(xyz4,_GMX_MM_PERMUTE256D(0,1,0,1)); /* z4 -  | x4 y4 */
    t3 = _mm256_permute2f128_pd(xyz4,xyz4,0x21);                /* x4 y4 | z4 -  */
    t3 = _mm256_blend_pd(t3,xyz4,_GMX_MM_BLEND256D(1,0,1,0));   /* z4 y4 | x4 -  */
    xyz4 = _mm256_blend_pd(t3,t2,_GMX_MM_BLEND256D(0,0,0,1));   /* z4 y4 | x4 z3 */

    tA = _mm256_sub_pd(tA,xyz1);
    tB = _mm256_sub_pd(tB,xyz2);
    tC = _mm256_sub_pd(tC,xyz4);

    _mm256_storeu_pd(ptrA,tA);
    _mm256_storeu_pd(ptrA+4,tB);
    _mm256_storeu_pd(ptrA+8,tC);
}
-
-
-
-static void
-gmx_mm256_decrement_1rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1)
-{
- __m128d t1,t2,t3;
-
- t1 = _mm_sub_sd(_mm256_castpd256_pd128(x1),_mm_load_sd(ptrA));
- t2 = _mm_sub_sd(_mm256_castpd256_pd128(y1),_mm_load_sd(ptrA+1));
- t3 = _mm_sub_sd(_mm256_castpd256_pd128(z1),_mm_load_sd(ptrA+2));
- _mm_store_sd(ptrA,t1);
- _mm_store_sd(ptrA+1,t2);
- _mm_store_sd(ptrA+2,t3);
-}
-
-
-static void
-gmx_mm256_decrement_2rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2)
-{
- __m256d t1;
- __m128d tA;
- t1 = _mm256_loadu_pd(ptrA);
- tA = _mm_loadu_pd(ptrA+4);
-
- x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
-
- x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
-
- t1 = _mm256_sub_pd(x1,t1);
- tA = _mm_sub_pd(tA,_mm256_castpd256_pd128(y2));
-
- _mm256_storeu_pd(ptrA,t1);
- _mm_storeu_pd(ptrA+4,tA);
-}
-
-
-static void
-gmx_mm256_decrement_3rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3)
-{
- __m256d t1,t2;
- __m128d tA;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrA+4);
- tA = _mm_load_sd(ptrA+8);
-
- x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- x3 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
-
- x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
- y2 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
- t1 = _mm256_sub_pd(t1,x1);
- t2 = _mm256_sub_pd(t2,y2);
- tA = _mm_sub_sd(tA,_mm256_castpd256_pd128(z3));
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrA+4,t2);
- _mm_store_sd(ptrA+8,tA);
-}
-
-
-static void
-gmx_mm256_decrement_4rvec_1ptr_swizzle_pd(double * gmx_restrict ptrA,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3,
- __m256d x4, __m256d y4, __m256d z4)
-{
- __m256d t1,t2,t3;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrA+4);
- t3 = _mm256_loadu_pd(ptrA+8);
-
- x1 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- z1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- y2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- x3 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
- z3 = _mm256_unpacklo_pd(z3,x4); /* - - | x4a z3a */
- y4 = _mm256_unpacklo_pd(y4,z4); /* - - | z4a y4a */
-
- x1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2a z1a | y1a x1a */
- y2 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3a x3a | z2a y2a */
- z3 = gmx_mm256_unpack128lo_pd(z3,y4); /* z4a y4a | x4a z3a */
-
- t1 = _mm256_sub_pd(t1,x1);
- t2 = _mm256_sub_pd(t2,y2);
- t3 = _mm256_sub_pd(t3,z3);
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrA+4,t2);
- _mm256_storeu_pd(ptrA+8,t3);
-}
-
/* Decrement one rvec at each of two pointers: ptrA gets lane-a of x1/y1/z1,
 * ptrB gets lane-b.  The stores are masked to the low three doubles because
 * the two 3-double targets may be adjacent in memory; a full 4-double store
 * could clobber a neighbour's data. */
static void
gmx_mm256_decrement_1rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA,
                                          double * gmx_restrict ptrB,
                                          __m256d x1, __m256d y1, __m256d z1)
{
    __m256d t1,t2,t3,t4;
    __m256i mask;

    t3 = _mm256_loadu_pd(ptrA);
    t4 = _mm256_loadu_pd(ptrB);

    t1 = _mm256_unpacklo_pd(x1,y1);                     /* -   -   | y1a x1a */
    t2 = _mm256_unpackhi_pd(x1,y1);                     /* -   -   | y1b x1b */

    t1 = gmx_mm256_unpack128lo_pd(t1,z1);               /* z1b z1a | y1a x1a  (elem 3 masked off) */
    /* duplicate the odd (lane-b) element within each 128-bit half */
    z1 = _mm256_permute_pd(z1,_GMX_MM_PERMUTE256D(1,1,1,1));
    t2 = gmx_mm256_unpack128lo_pd(t2,z1);               /* z1b z1b | y1b x1b  (elem 3 masked off) */

    /* Construct a mask without executing any data loads: 0==0 yields all-ones,
     * blended with zero so only the low three elements of the mask are set */
    mask = _mm256_castpd_si256(_mm256_blend_pd(_mm256_setzero_pd(),
                                               _mm256_cmp_pd(_mm256_setzero_pd(),_mm256_setzero_pd(),_CMP_EQ_OQ),0x7));

    t3 = _mm256_sub_pd(t3,t1);
    t4 = _mm256_sub_pd(t4,t2);

    /* Careful with potentially overlapping stores, need to be masked */
    _mm256_maskstore_pd(ptrA,mask,t3);
    _mm256_maskstore_pd(ptrB,mask,t4);
}
-
-static void
-gmx_mm256_decrement_2rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2)
-{
- __m256d t1,t2,t5;
- __m128d t3,t4;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrB);
- t3 = _mm_loadu_pd(ptrA+4);
- t4 = _mm_loadu_pd(ptrB+4);
-
- t5 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- x1 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */
-
- y1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- z1 = _mm256_unpackhi_pd(z1,x2); /* - - | x2b z1b */
-
- x2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- y2 = _mm256_unpackhi_pd(y2,z2); /* - - | z2b y2b */
-
- z2 = gmx_mm256_unpack128lo_pd(t5,y1); /* x2a z1a | y1a x1a */
- y1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
- t1 = _mm256_sub_pd(t1,z2);
- t2 = _mm256_sub_pd(t2,y1);
- t3 = _mm_sub_pd(t3,_mm256_castpd256_pd128(x2));
- t4 = _mm_sub_pd(t4,_mm256_castpd256_pd128(y2));
-
- /* Careful with potentially overlapping stores, need to be masked */
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrB,t2);
- _mm_storeu_pd(ptrA+4,t3);
- _mm_storeu_pd(ptrB+4,t4);
-}
-
/* Decrement three consecutive rvecs at each of two pointers: ptrA receives
 * the lane-a components, ptrB the lane-b components of the swizzled
 * coordinates x1..z3.  Each target is updated as 4 + 4 + 1 doubles — exactly
 * the 9 doubles of three rvecs — so the scalar tail keeps the stores from
 * touching neighbouring memory.  NOTE: input registers are aggressively
 * reused as temporaries below; the lane comments track current contents. */
static void
gmx_mm256_decrement_3rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
                                          __m256d x1, __m256d y1, __m256d z1,
                                          __m256d x2, __m256d y2, __m256d z2,
                                          __m256d x3, __m256d y3, __m256d z3)
{
    __m256d t1,t2,t3,t4,t5,t6;
    __m128d tA,tB;

    t1 = _mm256_loadu_pd(ptrA);
    t2 = _mm256_loadu_pd(ptrB);
    t3 = _mm256_loadu_pd(ptrA+4);
    t4 = _mm256_loadu_pd(ptrB+4);
    tA = _mm_load_sd(ptrA+8);
    tB = _mm_load_sd(ptrB+8);

    t5 = _mm256_unpacklo_pd(x1,y1);                     /* -   -   | y1a x1a */
    x1 = _mm256_unpackhi_pd(x1,y1);                     /* -   -   | y1b x1b */

    y1 = _mm256_unpacklo_pd(z1,x2);                     /* -   -   | x2a z1a */
    z1 = _mm256_unpackhi_pd(z1,x2);                     /* -   -   | x2b z1b */

    x2 = _mm256_unpacklo_pd(y2,z2);                     /* -   -   | z2a y2a */
    y2 = _mm256_unpackhi_pd(y2,z2);                     /* -   -   | z2b y2b */

    z2 = _mm256_unpacklo_pd(x3,y3);                     /* -   -   | y3a x3a */
    x3 = _mm256_unpackhi_pd(x3,y3);                     /* -   -   | y3b x3b */

    /* move z3's lane-b value into element 0 for the ptrB scalar tail */
    t6 = _mm256_permute_pd(z3,_GMX_MM_PERMUTE256D(1,1,1,1)); /* -   -   | -   z3b */

    y3 = gmx_mm256_unpack128lo_pd(t5,y1);               /* x2a z1a | y1a x1a */
    y1 = gmx_mm256_unpack128lo_pd(x1,z1);               /* x2b z1b | y1b x1b */

    t5 = gmx_mm256_unpack128lo_pd(x2,z2);               /* y3a x3a | z2a y2a */
    x1 = gmx_mm256_unpack128lo_pd(y2,x3);               /* y3b x3b | z2b y2b */

    t1 = _mm256_sub_pd(t1,y3);
    t2 = _mm256_sub_pd(t2,y1);
    t3 = _mm256_sub_pd(t3,t5);
    t4 = _mm256_sub_pd(t4,x1);
    /* full 128-bit subtract, but only element 0 (z3a / z3b) is stored */
    tA = _mm_sub_pd(tA,_mm256_castpd256_pd128(z3));
    tB = _mm_sub_pd(tB,_mm256_castpd256_pd128(t6));

    _mm256_storeu_pd(ptrA,t1);
    _mm256_storeu_pd(ptrB,t2);
    _mm256_storeu_pd(ptrA+4,t3);
    _mm256_storeu_pd(ptrB+4,t4);
    _mm_store_sd(ptrA+8,tA);
    _mm_store_sd(ptrB+8,tB);
}
-
-
-static void
-gmx_mm256_decrement_4rvec_2ptr_swizzle_pd(double * gmx_restrict ptrA, double * gmx_restrict ptrB,
- __m256d x1, __m256d y1, __m256d z1,
- __m256d x2, __m256d y2, __m256d z2,
- __m256d x3, __m256d y3, __m256d z3,
- __m256d x4, __m256d y4, __m256d z4)
-{
- __m256d t1,t2,t3,t4,t5,t6,t7;
-
- t1 = _mm256_loadu_pd(ptrA);
- t2 = _mm256_loadu_pd(ptrB);
- t3 = _mm256_loadu_pd(ptrA+4);
- t4 = _mm256_loadu_pd(ptrB+4);
- t5 = _mm256_loadu_pd(ptrA+8);
- t6 = _mm256_loadu_pd(ptrB+8);
-
- t7 = _mm256_unpacklo_pd(x1,y1); /* - - | y1a x1a */
- x1 = _mm256_unpackhi_pd(x1,y1); /* - - | y1b x1b */
-
- y1 = _mm256_unpacklo_pd(z1,x2); /* - - | x2a z1a */
- z1 = _mm256_unpackhi_pd(z1,x2); /* - - | x2b z1b */
-
- x2 = _mm256_unpacklo_pd(y2,z2); /* - - | z2a y2a */
- y2 = _mm256_unpackhi_pd(y2,z2); /* - - | z2b y2b */
-
- z2 = _mm256_unpacklo_pd(x3,y3); /* - - | y3a x3a */
- x3 = _mm256_unpackhi_pd(x3,y3); /* - - | y3b x3b */
-
- y3 = _mm256_unpacklo_pd(z3,x4); /* - - | x4a z3a */
- z3 = _mm256_unpackhi_pd(z3,x4); /* - - | x4b z3b */
- x4 = _mm256_unpacklo_pd(y4,z4); /* - - | z4a y4a */
- y4 = _mm256_unpackhi_pd(y4,z4); /* - - | z4b y4b */
-
- z4 = gmx_mm256_unpack128lo_pd(t7,y1); /* x2a z1a | y1a x1a */
- y1 = gmx_mm256_unpack128lo_pd(x1,z1); /* x2b z1b | y1b x1b */
-
- t7 = gmx_mm256_unpack128lo_pd(x2,z2); /* y3a x3a | z2a y2a */
- x1 = gmx_mm256_unpack128lo_pd(y2,x3); /* y3b x3b | z2b y2b */
-
- x2 = gmx_mm256_unpack128lo_pd(y3,x4); /* z4a y4a | x4a z3a */
- y2 = gmx_mm256_unpack128lo_pd(z3,y4); /* z4b y4b | x4b z3b */
-
- t1 = _mm256_sub_pd(t1,z4);
- t2 = _mm256_sub_pd(t2,y1);
- t3 = _mm256_sub_pd(t3,t7);
- t4 = _mm256_sub_pd(t4,x1);
- t5 = _mm256_sub_pd(t5,x2);
- t6 = _mm256_sub_pd(t6,y2);
-
- _mm256_storeu_pd(ptrA,t1);
- _mm256_storeu_pd(ptrB,t2);
- _mm256_storeu_pd(ptrA+4,t3);
- _mm256_storeu_pd(ptrB+4,t4);
- _mm256_storeu_pd(ptrA+8,t5);
- _mm256_storeu_pd(ptrB+8,t6);
-}
-
-
-