src/mdlib/pme_sse_single.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team,
   6  * check out http://www.gromacs.org for more information.
   7  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   8  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   9  * others, as listed in the AUTHORS file in the top-level source
  10  * directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38
  39 /* This include file has code between ifdef's to make sure
  40  * that this performance sensitive code is inlined
  41  * and to remove conditionals and variable loop bounds at compile time.
  42  */
  43
  44 #ifdef PME_SPREAD_SSE_ORDER4
  45 /* This code does not assume any memory alignment.
  46  * This code only works for pme_order = 4.
  47  */
  48 {
  49     __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3;
  50     __m128 tz_SSE;
  51     __m128 vx_SSE;
  52     __m128 vx_tz_SSE;
  53     __m128 sum_SSE0, sum_SSE1, sum_SSE2, sum_SSE3;
  54     __m128 gri_SSE0, gri_SSE1, gri_SSE2, gri_SSE3;
  55
  56     ty_SSE0 = _mm_load1_ps(&thy[0]);
  57     ty_SSE1 = _mm_load1_ps(&thy[1]);
  58     ty_SSE2 = _mm_load1_ps(&thy[2]);
  59     ty_SSE3 = _mm_load1_ps(&thy[3]);
  60
  61     tz_SSE  = _mm_loadu_ps(thz);
  62
  63     for (ithx = 0; (ithx < 4); ithx++)
  64     {
  65         index_x = (i0+ithx)*pny*pnz;
  66         valx    = qn*thx[ithx];
  67
  68         vx_SSE   = _mm_load1_ps(&valx);
  69
  70         vx_tz_SSE = _mm_mul_ps(vx_SSE, tz_SSE);
  71
  72         gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0);
  73         gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0);
  74         gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0);
  75         gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0);
  76
  77         sum_SSE0 = _mm_add_ps(gri_SSE0, _mm_mul_ps(vx_tz_SSE, ty_SSE0));
  78         sum_SSE1 = _mm_add_ps(gri_SSE1, _mm_mul_ps(vx_tz_SSE, ty_SSE1));
  79         sum_SSE2 = _mm_add_ps(gri_SSE2, _mm_mul_ps(vx_tz_SSE, ty_SSE2));
  80         sum_SSE3 = _mm_add_ps(gri_SSE3, _mm_mul_ps(vx_tz_SSE, ty_SSE3));
  81
  82         _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0, sum_SSE0);
  83         _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0, sum_SSE1);
  84         _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0, sum_SSE2);
  85         _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0, sum_SSE3);
  86     }
  87 }
  88 #undef PME_SPREAD_SSE_ORDER4
  89 #endif
  90
  91
  92 #ifdef PME_GATHER_F_SSE_ORDER4
  93 /* This code does not assume any memory alignment.
  94  * This code only works for pme_order = 4.
  95  */
  96 {
  97     float  fx_tmp[4], fy_tmp[4], fz_tmp[4];
  98
  99     __m128 fx_SSE, fy_SSE, fz_SSE;
 100
 101     __m128 tx_SSE, ty_SSE, tz_SSE;
 102     __m128 dx_SSE, dy_SSE, dz_SSE;
 103
 104     __m128 gval_SSE;
 105
 106     __m128 fxy1_SSE;
 107     __m128 fz1_SSE;
 108
 109     fx_SSE = _mm_setzero_ps();
 110     fy_SSE = _mm_setzero_ps();
 111     fz_SSE = _mm_setzero_ps();
 112
 113     tz_SSE  = _mm_loadu_ps(thz);
 114     dz_SSE  = _mm_loadu_ps(dthz);
 115
 116     for (ithx = 0; (ithx < 4); ithx++)
 117     {
 118         index_x  = (i0+ithx)*pny*pnz;
 119         tx_SSE   = _mm_load1_ps(thx+ithx);
 120         dx_SSE   = _mm_load1_ps(dthx+ithx);
 121
 122         for (ithy = 0; (ithy < 4); ithy++)
 123         {
 124             index_xy = index_x+(j0+ithy)*pnz;
 125             ty_SSE   = _mm_load1_ps(thy+ithy);
 126             dy_SSE   = _mm_load1_ps(dthy+ithy);
 127
 128             gval_SSE = _mm_loadu_ps(grid+index_xy+k0);
 129
 130             fxy1_SSE = _mm_mul_ps(tz_SSE, gval_SSE);
 131             fz1_SSE  = _mm_mul_ps(dz_SSE, gval_SSE);
 132
 133             fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE));
 134             fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE));
 135             fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE));
 136         }
 137     }
 138
 139     _mm_storeu_ps(fx_tmp, fx_SSE);
 140     _mm_storeu_ps(fy_tmp, fy_SSE);
 141     _mm_storeu_ps(fz_tmp, fz_SSE);
 142
 143     fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
 144     fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
 145     fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
 146 }
 147 #undef PME_GATHER_F_SSE_ORDER4
 148 #endif
 149
 150
 151 #ifdef PME_SPREAD_SSE_ALIGNED
 152 /* This code assumes that the grid is allocated 16 bit aligned
 153  * and that pnz is a multiple of 4.
 154  * This code supports pme_order <= 5.
 155  */
 156 {
 157     int    offset;
 158     int    index;
 159     __m128 ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3, ty_SSE4;
 160     __m128 tz_SSE0;
 161     __m128 tz_SSE1;
 162     __m128 vx_SSE;
 163     __m128 vx_tz_SSE0;
 164     __m128 vx_tz_SSE1;
 165     __m128 sum_SSE00, sum_SSE01, sum_SSE02, sum_SSE03, sum_SSE04;
 166     __m128 sum_SSE10, sum_SSE11, sum_SSE12, sum_SSE13, sum_SSE14;
 167     __m128 gri_SSE00, gri_SSE01, gri_SSE02, gri_SSE03, gri_SSE04;
 168     __m128 gri_SSE10, gri_SSE11, gri_SSE12, gri_SSE13, gri_SSE14;
 169
 170     offset = k0 & 3;
 171
 172     ty_SSE0 = _mm_load1_ps(&thy[0]);
 173     ty_SSE1 = _mm_load1_ps(&thy[1]);
 174     ty_SSE2 = _mm_load1_ps(&thy[2]);
 175     ty_SSE3 = _mm_load1_ps(&thy[3]);
 176 #if PME_ORDER == 5
 177     ty_SSE4 = _mm_load1_ps(&thy[4]);
 178 #endif
 179
 180     tz_SSE0 = _mm_loadu_ps(thz-offset);
 181     tz_SSE1 = _mm_loadu_ps(thz-offset+4);
 182     tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]);
 183     tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]);
 184
 185     for (ithx = 0; (ithx < PME_ORDER); ithx++)
 186     {
 187         index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
 188         valx  = qn*thx[ithx];
 189
 190         vx_SSE   = _mm_load1_ps(&valx);
 191
 192         vx_tz_SSE0 = _mm_mul_ps(vx_SSE, tz_SSE0);
 193         vx_tz_SSE1 = _mm_mul_ps(vx_SSE, tz_SSE1);
 194
 195         gri_SSE00 = _mm_load_ps(grid+index+0*pnz);
 196         gri_SSE01 = _mm_load_ps(grid+index+1*pnz);
 197         gri_SSE02 = _mm_load_ps(grid+index+2*pnz);
 198         gri_SSE03 = _mm_load_ps(grid+index+3*pnz);
 199 #if PME_ORDER == 5
 200         gri_SSE04 = _mm_load_ps(grid+index+4*pnz);
 201 #endif
 202         gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4);
 203         gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4);
 204         gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4);
 205         gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4);
 206 #if PME_ORDER == 5
 207         gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4);
 208 #endif
 209
 210         sum_SSE00 = _mm_add_ps(gri_SSE00, _mm_mul_ps(vx_tz_SSE0, ty_SSE0));
 211         sum_SSE01 = _mm_add_ps(gri_SSE01, _mm_mul_ps(vx_tz_SSE0, ty_SSE1));
 212         sum_SSE02 = _mm_add_ps(gri_SSE02, _mm_mul_ps(vx_tz_SSE0, ty_SSE2));
 213         sum_SSE03 = _mm_add_ps(gri_SSE03, _mm_mul_ps(vx_tz_SSE0, ty_SSE3));
 214 #if PME_ORDER == 5
 215         sum_SSE04 = _mm_add_ps(gri_SSE04, _mm_mul_ps(vx_tz_SSE0, ty_SSE4));
 216 #endif
 217         sum_SSE10 = _mm_add_ps(gri_SSE10, _mm_mul_ps(vx_tz_SSE1, ty_SSE0));
 218         sum_SSE11 = _mm_add_ps(gri_SSE11, _mm_mul_ps(vx_tz_SSE1, ty_SSE1));
 219         sum_SSE12 = _mm_add_ps(gri_SSE12, _mm_mul_ps(vx_tz_SSE1, ty_SSE2));
 220         sum_SSE13 = _mm_add_ps(gri_SSE13, _mm_mul_ps(vx_tz_SSE1, ty_SSE3));
 221 #if PME_ORDER == 5
 222         sum_SSE14 = _mm_add_ps(gri_SSE14, _mm_mul_ps(vx_tz_SSE1, ty_SSE4));
 223 #endif
 224
 225         _mm_store_ps(grid+index+0*pnz, sum_SSE00);
 226         _mm_store_ps(grid+index+1*pnz, sum_SSE01);
 227         _mm_store_ps(grid+index+2*pnz, sum_SSE02);
 228         _mm_store_ps(grid+index+3*pnz, sum_SSE03);
 229 #if PME_ORDER == 5
 230         _mm_store_ps(grid+index+4*pnz, sum_SSE04);
 231 #endif
 232         _mm_store_ps(grid+index+0*pnz+4, sum_SSE10);
 233         _mm_store_ps(grid+index+1*pnz+4, sum_SSE11);
 234         _mm_store_ps(grid+index+2*pnz+4, sum_SSE12);
 235         _mm_store_ps(grid+index+3*pnz+4, sum_SSE13);
 236 #if PME_ORDER == 5
 237         _mm_store_ps(grid+index+4*pnz+4, sum_SSE14);
 238 #endif
 239     }
 240 }
 241 #undef PME_ORDER
 242 #undef PME_SPREAD_SSE_ALIGNED
 243 #endif
 244
 245
 246 #ifdef PME_GATHER_F_SSE_ALIGNED
 247 /* This code assumes that the grid is allocated 16 bit aligned
 248  * and that pnz is a multiple of 4.
 249  * This code supports pme_order <= 5.
 250  */
 251 {
 252     int    offset;
 253
 254     float  fx_tmp[4], fy_tmp[4], fz_tmp[4];
 255
 256     __m128 fx_SSE, fy_SSE, fz_SSE;
 257
 258     __m128 tx_SSE, ty_SSE, tz_SSE0, tz_SSE1;
 259     __m128 dx_SSE, dy_SSE, dz_SSE0, dz_SSE1;
 260
 261     __m128 gval_SSE0;
 262     __m128 gval_SSE1;
 263
 264     __m128 fxy1_SSE0;
 265     __m128 fz1_SSE0;
 266     __m128 fxy1_SSE1;
 267     __m128 fz1_SSE1;
 268     __m128 fxy1_SSE;
 269     __m128 fz1_SSE;
 270
 271     offset = k0 & 3;
 272
 273     fx_SSE = _mm_setzero_ps();
 274     fy_SSE = _mm_setzero_ps();
 275     fz_SSE = _mm_setzero_ps();
 276
 277     tz_SSE0 = _mm_loadu_ps(thz-offset);
 278     dz_SSE0 = _mm_loadu_ps(dthz-offset);
 279     tz_SSE1 = _mm_loadu_ps(thz-offset+4);
 280     dz_SSE1 = _mm_loadu_ps(dthz-offset+4);
 281     tz_SSE0 = _mm_and_ps(tz_SSE0, work->mask_SSE0[offset]);
 282     dz_SSE0 = _mm_and_ps(dz_SSE0, work->mask_SSE0[offset]);
 283     tz_SSE1 = _mm_and_ps(tz_SSE1, work->mask_SSE1[offset]);
 284     dz_SSE1 = _mm_and_ps(dz_SSE1, work->mask_SSE1[offset]);
 285
 286     for (ithx = 0; (ithx < PME_ORDER); ithx++)
 287     {
 288         index_x  = (i0+ithx)*pny*pnz;
 289         tx_SSE   = _mm_load1_ps(thx+ithx);
 290         dx_SSE   = _mm_load1_ps(dthx+ithx);
 291
 292         for (ithy = 0; (ithy < PME_ORDER); ithy++)
 293         {
 294             index_xy = index_x+(j0+ithy)*pnz;
 295             ty_SSE   = _mm_load1_ps(thy+ithy);
 296             dy_SSE   = _mm_load1_ps(dthy+ithy);
 297
 298             gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset);
 299             gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4);
 300
 301             fxy1_SSE0 = _mm_mul_ps(tz_SSE0, gval_SSE0);
 302             fz1_SSE0  = _mm_mul_ps(dz_SSE0, gval_SSE0);
 303             fxy1_SSE1 = _mm_mul_ps(tz_SSE1, gval_SSE1);
 304             fz1_SSE1  = _mm_mul_ps(dz_SSE1, gval_SSE1);
 305
 306             fxy1_SSE = _mm_add_ps(fxy1_SSE0, fxy1_SSE1);
 307             fz1_SSE  = _mm_add_ps(fz1_SSE0, fz1_SSE1);
 308
 309             fx_SSE = _mm_add_ps(fx_SSE, _mm_mul_ps(_mm_mul_ps(dx_SSE, ty_SSE), fxy1_SSE));
 310             fy_SSE = _mm_add_ps(fy_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, dy_SSE), fxy1_SSE));
 311             fz_SSE = _mm_add_ps(fz_SSE, _mm_mul_ps(_mm_mul_ps(tx_SSE, ty_SSE), fz1_SSE));
 312         }
 313     }
 314
 315     _mm_store_ps(fx_tmp, fx_SSE);
 316     _mm_store_ps(fy_tmp, fy_SSE);
 317     _mm_store_ps(fz_tmp, fz_SSE);
 318
 319     fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
 320     fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
 321     fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
 322 }
 323 #undef PME_ORDER
 324 #undef PME_GATHER_F_SSE_ALIGNED
 325 #endif