Fix aligned store to unaligned memory
author Roland Schulz <roland@utk.edu>
Mon, 17 Feb 2014 22:23:33 +0000 (17:23 -0500)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Thu, 20 Feb 2014 17:55:00 +0000 (18:55 +0100)
Also fixes an unaligned store being used where it was not necessary.

Change-Id: I44bb222a07ec0af65198667787b8673b3c6cd2e7

src/mdlib/pme_simd4.h

index c66f8e99f6af844a5a275fd6d201b939c33774ab..0a6d0e4644e84e66d47089d2adc9075d6f8770db 100644 (file)
@@ -95,7 +95,7 @@
  * This code does not assume any memory alignment for the grid.
  */
 {
-    real         fx_tmp[4], fy_tmp[4], fz_tmp[4];
+    real         tmp[8], *tmp_aligned;
 
     gmx_simd4_pr fx_S, fy_S, fz_S;
 
     gmx_simd4_pr fxy1_S;
     gmx_simd4_pr fz1_S;
 
+    tmp_aligned = gmx_simd4_align_real(tmp);
+
     fx_S = gmx_simd4_setzero_pr();
     fy_S = gmx_simd4_setzero_pr();
     fz_S = gmx_simd4_setzero_pr();
         }
     }
 
-    gmx_simd4_storeu_pr(fx_tmp, fx_S);
-    gmx_simd4_storeu_pr(fy_tmp, fy_S);
-    gmx_simd4_storeu_pr(fz_tmp, fz_S);
+    gmx_simd4_store_pr(tmp_aligned, fx_S);
+    fx += tmp_aligned[0]+tmp_aligned[1]+tmp_aligned[2]+tmp_aligned[3];
+
+    gmx_simd4_store_pr(tmp_aligned, fy_S);
+    fy += tmp_aligned[0]+tmp_aligned[1]+tmp_aligned[2]+tmp_aligned[3];
 
-    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
-    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
-    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
+    gmx_simd4_store_pr(tmp_aligned, fz_S);
+    fz += tmp_aligned[0]+tmp_aligned[1]+tmp_aligned[2]+tmp_aligned[3];
 }
 #undef PME_GATHER_F_SIMD4_ORDER4
 #endif
  */
 {
     int    offset;
-
-    real         fx_tmp[4], fy_tmp[4], fz_tmp[4];
+    real   tmp[8], *tmp_aligned;
 
     gmx_simd4_pr fx_S, fy_S, fz_S;
 
     gmx_simd4_pr fxy1_S;
     gmx_simd4_pr fz1_S;
 
+    tmp_aligned = gmx_simd4_align_real(tmp);
+
     offset = k0 & 3;
 
     fx_S = gmx_simd4_setzero_pr();
         }
     }
 
-    gmx_simd4_store_pr(fx_tmp, fx_S);
-    gmx_simd4_store_pr(fy_tmp, fy_S);
-    gmx_simd4_store_pr(fz_tmp, fz_S);
+    gmx_simd4_store_pr(tmp_aligned, fx_S);
+    fx += tmp_aligned[0]+tmp_aligned[1]+tmp_aligned[2]+tmp_aligned[3];
+
+    gmx_simd4_store_pr(tmp_aligned, fy_S);
+    fy += tmp_aligned[0]+tmp_aligned[1]+tmp_aligned[2]+tmp_aligned[3];
 
-    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
-    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
-    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
+    gmx_simd4_store_pr(tmp_aligned, fz_S);
+    fz += tmp_aligned[0]+tmp_aligned[1]+tmp_aligned[2]+tmp_aligned[3];
 }
 #undef PME_ORDER
 #undef PME_GATHER_F_SIMD4_ALIGNED