/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* This include file has code between #ifdef's to make sure
 * that this performance-sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
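
/* A sketch of the intended usage from the including .c file (the guard
 * names are the real ones below; the file name, the surrounding
 * spread/gather function and its local variables are assumptions):
 *
 *     #define PME_ORDER 4
 *     #define PME_SPREAD_SIMD4_ALIGNED
 *     #include "pme_simd4.h"
 *
 * Each block below #undef's its own guard, and with PME_ORDER fixed the
 * compiler sees constant loop bounds it can fully unroll.
 */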
#ifdef PME_SPREAD_SIMD4_ORDER4
/* Spread one charge with pme_order=4 with unaligned SIMD4 load+store.
 * This code does not assume any memory alignment for the grid.
 */
{
    gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3;
    gmx_simd4_pr tz_S;
    gmx_simd4_pr vx_S;
    gmx_simd4_pr vx_tz_S;
    gmx_simd4_pr sum_S0, sum_S1, sum_S2, sum_S3;
    gmx_simd4_pr gri_S0, gri_S1, gri_S2, gri_S3;

    ty_S0 = gmx_simd4_set1_pr(thy[0]);
    ty_S1 = gmx_simd4_set1_pr(thy[1]);
    ty_S2 = gmx_simd4_set1_pr(thy[2]);
    ty_S3 = gmx_simd4_set1_pr(thy[3]);

    /* With order 4 the z-spline is actually aligned */
    tz_S = gmx_simd4_load_pr(thz);
    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        valx    = qn*thx[ithx];  /* qn: charge weight from the including code */

        vx_S    = gmx_simd4_set1_pr(valx);

        vx_tz_S = gmx_simd4_mul_pr(vx_S, tz_S);
        gri_S0 = gmx_simd4_loadu_pr(grid+index_x+(j0+0)*pnz+k0);
        gri_S1 = gmx_simd4_loadu_pr(grid+index_x+(j0+1)*pnz+k0);
        gri_S2 = gmx_simd4_loadu_pr(grid+index_x+(j0+2)*pnz+k0);
        gri_S3 = gmx_simd4_loadu_pr(grid+index_x+(j0+3)*pnz+k0);

        sum_S0 = gmx_simd4_madd_pr(vx_tz_S, ty_S0, gri_S0);
        sum_S1 = gmx_simd4_madd_pr(vx_tz_S, ty_S1, gri_S1);
        sum_S2 = gmx_simd4_madd_pr(vx_tz_S, ty_S2, gri_S2);
        sum_S3 = gmx_simd4_madd_pr(vx_tz_S, ty_S3, gri_S3);

        gmx_simd4_storeu_pr(grid+index_x+(j0+0)*pnz+k0, sum_S0);
        gmx_simd4_storeu_pr(grid+index_x+(j0+1)*pnz+k0, sum_S1);
        gmx_simd4_storeu_pr(grid+index_x+(j0+2)*pnz+k0, sum_S2);
        gmx_simd4_storeu_pr(grid+index_x+(j0+3)*pnz+k0, sum_S3);
    }
}
#undef PME_SPREAD_SIMD4_ORDER4
#endif
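
/* Scalar reference for the spread block above (a sketch; uses the same
 * surrounding variables, with qn assumed to be the charge prefactor):
 *
 *     for (ithx = 0; ithx < 4; ithx++)
 *     {
 *         valx = qn*thx[ithx];
 *         for (ithy = 0; ithy < 4; ithy++)
 *         {
 *             for (ithz = 0; ithz < 4; ithz++)
 *             {
 *                 grid[(i0+ithx)*pny*pnz + (j0+ithy)*pnz + k0 + ithz] +=
 *                     valx*thy[ithy]*thz[ithz];
 *             }
 *         }
 *     }
 *
 * The SIMD4 version computes the four ithz values of one (ithx,ithy) row
 * in a single fused multiply-add.
 */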
#ifdef PME_GATHER_F_SIMD4_ORDER4
/* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store.
 * This code does not assume any memory alignment for the grid.
 */
{
    real         fx_tmp[4], fy_tmp[4], fz_tmp[4];

    gmx_simd4_pr fx_S, fy_S, fz_S;

    gmx_simd4_pr tx_S, ty_S, tz_S;
    gmx_simd4_pr dx_S, dy_S, dz_S;

    gmx_simd4_pr gval_S;

    gmx_simd4_pr fxy1_S;
    gmx_simd4_pr fz1_S;

    fx_S = gmx_simd4_setzero_pr();
    fy_S = gmx_simd4_setzero_pr();
    fz_S = gmx_simd4_setzero_pr();

    /* With order 4 the z-spline is actually aligned */
    tz_S = gmx_simd4_load_pr(thz);
    dz_S = gmx_simd4_load_pr(dthz);
    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_S    = gmx_simd4_set1_pr(thx[ithx]);
        dx_S    = gmx_simd4_set1_pr(dthx[ithx]);

        for (ithy = 0; (ithy < 4); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_S     = gmx_simd4_set1_pr(thy[ithy]);
            dy_S     = gmx_simd4_set1_pr(dthy[ithy]);
            gval_S = gmx_simd4_loadu_pr(grid+index_xy+k0);

            fxy1_S = gmx_simd4_mul_pr(tz_S, gval_S);
            fz1_S  = gmx_simd4_mul_pr(dz_S, gval_S);

            fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S);
            fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S);
            fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S);
        }
    }

    gmx_simd4_storeu_pr(fx_tmp, fx_S);
    gmx_simd4_storeu_pr(fy_tmp, fy_S);
    gmx_simd4_storeu_pr(fz_tmp, fz_S);

    /* Horizontal reduction of the four lane partial sums */
    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SIMD4_ORDER4
#endif
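
/* Scalar reference for the gather block above (a sketch; same surrounding
 * variables, with gval introduced here for illustration only):
 *
 *     for (ithx = 0; ithx < 4; ithx++)
 *     {
 *         for (ithy = 0; ithy < 4; ithy++)
 *         {
 *             for (ithz = 0; ithz < 4; ithz++)
 *             {
 *                 gval = grid[(i0+ithx)*pny*pnz + (j0+ithy)*pnz + k0 + ithz];
 *                 fx  += dthx[ithx]* thy[ithy]* thz[ithz]*gval;
 *                 fy  +=  thx[ithx]*dthy[ithy]* thz[ithz]*gval;
 *                 fz  +=  thx[ithx]* thy[ithy]*dthz[ithz]*gval;
 *             }
 *         }
 *     }
 */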
#ifdef PME_SPREAD_SIMD4_ALIGNED
/* This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
{
    int          offset;
    int          index;
    gmx_simd4_pr ty_S0, ty_S1, ty_S2, ty_S3, ty_S4;
    gmx_simd4_pr tz_S0, tz_S1;
    gmx_simd4_pr vx_S;
    gmx_simd4_pr vx_tz_S0;
    gmx_simd4_pr vx_tz_S1;
    gmx_simd4_pr sum_S00, sum_S01, sum_S02, sum_S03, sum_S04;
    gmx_simd4_pr sum_S10, sum_S11, sum_S12, sum_S13, sum_S14;
    gmx_simd4_pr gri_S00, gri_S01, gri_S02, gri_S03, gri_S04;
    gmx_simd4_pr gri_S10, gri_S11, gri_S12, gri_S13, gri_S14;

    /* Round k0 down to a multiple of 4 for aligned loads; offset in [0,3] */
    offset = k0 & 3;

    ty_S0 = gmx_simd4_set1_pr(thy[0]);
    ty_S1 = gmx_simd4_set1_pr(thy[1]);
    ty_S2 = gmx_simd4_set1_pr(thy[2]);
    ty_S3 = gmx_simd4_set1_pr(thy[3]);
#if PME_ORDER == 5
    ty_S4 = gmx_simd4_set1_pr(thy[4]);
#endif
#ifdef GMX_SIMD4_HAVE_UNALIGNED
    tz_S0 = gmx_simd4_loadu_pr(thz-offset);
    tz_S1 = gmx_simd4_loadu_pr(thz-offset+4);
#else
    {
        int i;
        /* Copy thz to an aligned buffer (unused buffer parts are masked) */
        for (i = 0; i < PME_ORDER; i++)
        {
            thz_aligned[offset+i] = thz[i];
        }
        tz_S0 = gmx_simd4_load_pr(thz_aligned);
        tz_S1 = gmx_simd4_load_pr(thz_aligned+4);
    }
#endif
    tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]);
    tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]);
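
    /* Worked example of the masking above (numbers illustrative only):
     * with PME_ORDER = 5 and k0 = 6, offset = k0 & 3 = 2, and the eight
     * lanes of tz_S0:tz_S1 cover grid z-indices 4..11.  mask_S0[2] zeroes
     * the first two lanes (z = 4,5) and mask_S1[2] the last lane (z = 11),
     * leaving thz[0..4] in the lanes for z = 6..10.  A zeroed spline
     * weight turns the madd below into a plain reload, so those grid
     * cells are stored back unchanged. */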
    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        valx  = qn*thx[ithx];  /* qn: charge weight from the including code */

        vx_S  = gmx_simd4_set1_pr(valx);

        vx_tz_S0 = gmx_simd4_mul_pr(vx_S, tz_S0);
        vx_tz_S1 = gmx_simd4_mul_pr(vx_S, tz_S1);
        gri_S00 = gmx_simd4_load_pr(grid+index+0*pnz);
        gri_S01 = gmx_simd4_load_pr(grid+index+1*pnz);
        gri_S02 = gmx_simd4_load_pr(grid+index+2*pnz);
        gri_S03 = gmx_simd4_load_pr(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_S04 = gmx_simd4_load_pr(grid+index+4*pnz);
#endif
        gri_S10 = gmx_simd4_load_pr(grid+index+0*pnz+4);
        gri_S11 = gmx_simd4_load_pr(grid+index+1*pnz+4);
        gri_S12 = gmx_simd4_load_pr(grid+index+2*pnz+4);
        gri_S13 = gmx_simd4_load_pr(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_S14 = gmx_simd4_load_pr(grid+index+4*pnz+4);
#endif

        sum_S00 = gmx_simd4_madd_pr(vx_tz_S0, ty_S0, gri_S00);
        sum_S01 = gmx_simd4_madd_pr(vx_tz_S0, ty_S1, gri_S01);
        sum_S02 = gmx_simd4_madd_pr(vx_tz_S0, ty_S2, gri_S02);
        sum_S03 = gmx_simd4_madd_pr(vx_tz_S0, ty_S3, gri_S03);
#if PME_ORDER == 5
        sum_S04 = gmx_simd4_madd_pr(vx_tz_S0, ty_S4, gri_S04);
#endif
        sum_S10 = gmx_simd4_madd_pr(vx_tz_S1, ty_S0, gri_S10);
        sum_S11 = gmx_simd4_madd_pr(vx_tz_S1, ty_S1, gri_S11);
        sum_S12 = gmx_simd4_madd_pr(vx_tz_S1, ty_S2, gri_S12);
        sum_S13 = gmx_simd4_madd_pr(vx_tz_S1, ty_S3, gri_S13);
#if PME_ORDER == 5
        sum_S14 = gmx_simd4_madd_pr(vx_tz_S1, ty_S4, gri_S14);
#endif

        gmx_simd4_store_pr(grid+index+0*pnz, sum_S00);
        gmx_simd4_store_pr(grid+index+1*pnz, sum_S01);
        gmx_simd4_store_pr(grid+index+2*pnz, sum_S02);
        gmx_simd4_store_pr(grid+index+3*pnz, sum_S03);
#if PME_ORDER == 5
        gmx_simd4_store_pr(grid+index+4*pnz, sum_S04);
#endif
        gmx_simd4_store_pr(grid+index+0*pnz+4, sum_S10);
        gmx_simd4_store_pr(grid+index+1*pnz+4, sum_S11);
        gmx_simd4_store_pr(grid+index+2*pnz+4, sum_S12);
        gmx_simd4_store_pr(grid+index+3*pnz+4, sum_S13);
#if PME_ORDER == 5
        gmx_simd4_store_pr(grid+index+4*pnz+4, sum_S14);
#endif
    }
}
#undef PME_SPREAD_SIMD4_ALIGNED
#endif
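
/* Note on the alignment preconditions above: the caller is assumed to pad
 * each grid row to a multiple of 4 reals and to allocate the grid 4-real
 * aligned, e.g. something like
 *     pnz = 4*((nz + 3)/4);
 *     snew_aligned(grid, pnx*pny*pnz, 16);  // 16 bytes = 4 single-prec reals
 * (illustrative only; the actual allocation is done in the PME setup code),
 * so that grid+index+n*pnz stays 4-real aligned whenever index is. */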
#ifdef PME_GATHER_F_SIMD4_ALIGNED
/* This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
{
    int          offset;
    real         fx_tmp[4], fy_tmp[4], fz_tmp[4];

    gmx_simd4_pr fx_S, fy_S, fz_S;

    gmx_simd4_pr tx_S, ty_S, tz_S0, tz_S1;
    gmx_simd4_pr dx_S, dy_S, dz_S0, dz_S1;

    gmx_simd4_pr gval_S0;
    gmx_simd4_pr gval_S1;

    gmx_simd4_pr fxy1_S0;
    gmx_simd4_pr fz1_S0;
    gmx_simd4_pr fxy1_S1;
    gmx_simd4_pr fz1_S1;
    gmx_simd4_pr fxy1_S;
    gmx_simd4_pr fz1_S;

    /* Round k0 down to a multiple of 4 for aligned loads; offset in [0,3] */
    offset = k0 & 3;

    fx_S = gmx_simd4_setzero_pr();
    fy_S = gmx_simd4_setzero_pr();
    fz_S = gmx_simd4_setzero_pr();
#ifdef GMX_SIMD4_HAVE_UNALIGNED
    tz_S0 = gmx_simd4_loadu_pr(thz-offset);
    tz_S1 = gmx_simd4_loadu_pr(thz-offset+4);
    dz_S0 = gmx_simd4_loadu_pr(dthz-offset);
    dz_S1 = gmx_simd4_loadu_pr(dthz-offset+4);
#else
    {
        int i;
        /* Copy (d)thz to an aligned buffer (unused buffer parts are masked) */
        for (i = 0; i < PME_ORDER; i++)
        {
            thz_aligned[offset+i]  = thz[i];
            dthz_aligned[offset+i] = dthz[i];
        }
        tz_S0 = gmx_simd4_load_pr(thz_aligned);
        tz_S1 = gmx_simd4_load_pr(thz_aligned+4);
        dz_S0 = gmx_simd4_load_pr(dthz_aligned);
        dz_S1 = gmx_simd4_load_pr(dthz_aligned+4);
    }
#endif
    tz_S0 = gmx_simd4_blendzero_pr(tz_S0, work->mask_S0[offset]);
    dz_S0 = gmx_simd4_blendzero_pr(dz_S0, work->mask_S0[offset]);
    tz_S1 = gmx_simd4_blendzero_pr(tz_S1, work->mask_S1[offset]);
    dz_S1 = gmx_simd4_blendzero_pr(dz_S1, work->mask_S1[offset]);
    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_S    = gmx_simd4_set1_pr(thx[ithx]);
        dx_S    = gmx_simd4_set1_pr(dthx[ithx]);

        for (ithy = 0; (ithy < PME_ORDER); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_S     = gmx_simd4_set1_pr(thy[ithy]);
            dy_S     = gmx_simd4_set1_pr(dthy[ithy]);
            gval_S0 = gmx_simd4_load_pr(grid+index_xy+k0-offset);
            gval_S1 = gmx_simd4_load_pr(grid+index_xy+k0-offset+4);

            fxy1_S0 = gmx_simd4_mul_pr(tz_S0, gval_S0);
            fz1_S0  = gmx_simd4_mul_pr(dz_S0, gval_S0);
            fxy1_S1 = gmx_simd4_mul_pr(tz_S1, gval_S1);
            fz1_S1  = gmx_simd4_mul_pr(dz_S1, gval_S1);

            fxy1_S = gmx_simd4_add_pr(fxy1_S0, fxy1_S1);
            fz1_S  = gmx_simd4_add_pr(fz1_S0, fz1_S1);

            fx_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(dx_S, ty_S), fxy1_S, fx_S);
            fy_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, dy_S), fxy1_S, fy_S);
            fz_S = gmx_simd4_madd_pr(gmx_simd4_mul_pr(tx_S, ty_S), fz1_S, fz_S);
        }
    }

    gmx_simd4_store_pr(fx_tmp, fx_S);
    gmx_simd4_store_pr(fy_tmp, fy_S);
    gmx_simd4_store_pr(fz_tmp, fz_S);

    /* Horizontal reduction of the four lane partial sums */
    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SIMD4_ALIGNED
#endif