/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/* This include file has code between ifdef's to make sure
 * that this performance sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
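
/* A caller selects one kernel by defining the corresponding macro before
 * including this file, e.g. (a sketch of the pattern; the actual call
 * sites live in the PME spread/gather routines, where i0, j0, k0, the
 * spline arrays and the grid pointer are in scope):
 *
 *     #define PME_SPREAD_SIMD4_ORDER4
 *     #include "pme_simd4.h"
 *
 * Each block below #undef's its selector macro, so the file can be
 * included several times with different kernels chosen.
 */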

#ifdef PME_SPREAD_SIMD4_ORDER4
/* Spread one charge with pme_order=4 with unaligned SIMD4 load+store.
 * This code does not assume any memory alignment for the grid.
 */
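/* In scalar form, the block below is equivalent to this loop nest, which
 * adds one charge, weighted by the B-spline coefficients thx/thy/thz,
 * to the 4x4x4 grid region starting at (i0, j0, k0) (a sketch, assuming
 * the same surrounding scope as the SIMD code, with qn the charge):
 *
 *     for (ithx = 0; ithx < 4; ithx++)
 *         for (ithy = 0; ithy < 4; ithy++)
 *             for (ithz = 0; ithz < 4; ithz++)
 *                 grid[(i0+ithx)*pny*pnz + (j0+ithy)*pnz + (k0+ithz)] +=
 *                     qn*thx[ithx]*thy[ithy]*thz[ithz];
 *
 * The SIMD4 version handles the four z-points of one (x,y) line in a
 * single vector, so only the x loop remains, with y unrolled four-fold.
 */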
{
    gmx_simd4_real_t ty_S0, ty_S1, ty_S2, ty_S3;
    gmx_simd4_real_t tz_S;
    gmx_simd4_real_t vx_S;
    gmx_simd4_real_t vx_tz_S;
    gmx_simd4_real_t sum_S0, sum_S1, sum_S2, sum_S3;
    gmx_simd4_real_t gri_S0, gri_S1, gri_S2, gri_S3;

    ty_S0 = gmx_simd4_set1_r(thy[0]);
    ty_S1 = gmx_simd4_set1_r(thy[1]);
    ty_S2 = gmx_simd4_set1_r(thy[2]);
    ty_S3 = gmx_simd4_set1_r(thy[3]);

    /* With order 4 the z-spline is actually aligned */
    tz_S  = gmx_simd4_load_r(thz);

    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        /* valx: the charge (qn in the including scope) times the x-spline */
        valx    = qn*thx[ithx];

        vx_S    = gmx_simd4_set1_r(valx);

        vx_tz_S = gmx_simd4_mul_r(vx_S, tz_S);

        gri_S0 = gmx_simd4_loadu_r(grid+index_x+(j0+0)*pnz+k0);
        gri_S1 = gmx_simd4_loadu_r(grid+index_x+(j0+1)*pnz+k0);
        gri_S2 = gmx_simd4_loadu_r(grid+index_x+(j0+2)*pnz+k0);
        gri_S3 = gmx_simd4_loadu_r(grid+index_x+(j0+3)*pnz+k0);

        sum_S0 = gmx_simd4_fmadd_r(vx_tz_S, ty_S0, gri_S0);
        sum_S1 = gmx_simd4_fmadd_r(vx_tz_S, ty_S1, gri_S1);
        sum_S2 = gmx_simd4_fmadd_r(vx_tz_S, ty_S2, gri_S2);
        sum_S3 = gmx_simd4_fmadd_r(vx_tz_S, ty_S3, gri_S3);

        gmx_simd4_storeu_r(grid+index_x+(j0+0)*pnz+k0, sum_S0);
        gmx_simd4_storeu_r(grid+index_x+(j0+1)*pnz+k0, sum_S1);
        gmx_simd4_storeu_r(grid+index_x+(j0+2)*pnz+k0, sum_S2);
        gmx_simd4_storeu_r(grid+index_x+(j0+3)*pnz+k0, sum_S3);
    }
}
#undef PME_SPREAD_SIMD4_ORDER4
#endif

#ifdef PME_GATHER_F_SIMD4_ORDER4
/* Gather for one charge with pme_order=4 with unaligned SIMD4 load+store.
 * This code does not assume any memory alignment for the grid.
 */
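/* In scalar form, the block below accumulates, over the 4x4x4 region,
 *
 *     fx += dthx[ithx]* thy[ithy]* thz[ithz]*gval;
 *     fy +=  thx[ithx]*dthy[ithy]* thz[ithz]*gval;
 *     fz +=  thx[ithx]* thy[ithy]*dthz[ithz]*gval;
 *
 * where gval is the grid value and dthx/dthy/dthz are the B-spline
 * derivatives: for each force component the spline factor along that
 * axis is replaced by its derivative (a sketch of what the SIMD code
 * computes; the caller turns these sums into the actual force).
 */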
{
    real             fx_tmp[4], fy_tmp[4], fz_tmp[4];

    gmx_simd4_real_t fx_S, fy_S, fz_S;

    gmx_simd4_real_t tx_S, ty_S, tz_S;
    gmx_simd4_real_t dx_S, dy_S, dz_S;

    gmx_simd4_real_t gval_S;

    gmx_simd4_real_t fxy1_S;
    gmx_simd4_real_t fz1_S;

    fx_S = gmx_simd4_setzero_r();
    fy_S = gmx_simd4_setzero_r();
    fz_S = gmx_simd4_setzero_r();

    /* With order 4 the z-spline is actually aligned */
    tz_S  = gmx_simd4_load_r(thz);
    dz_S  = gmx_simd4_load_r(dthz);

    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_S    = gmx_simd4_set1_r(thx[ithx]);
        dx_S    = gmx_simd4_set1_r(dthx[ithx]);

        for (ithy = 0; (ithy < 4); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_S     = gmx_simd4_set1_r(thy[ithy]);
            dy_S     = gmx_simd4_set1_r(dthy[ithy]);

            gval_S   = gmx_simd4_loadu_r(grid+index_xy+k0);

            fxy1_S   = gmx_simd4_mul_r(tz_S, gval_S);
            fz1_S    = gmx_simd4_mul_r(dz_S, gval_S);

            fx_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(dx_S, ty_S), fxy1_S, fx_S);
            fy_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, dy_S), fxy1_S, fy_S);
            fz_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, ty_S), fz1_S, fz_S);
        }
    }

    /* Horizontal reduction: store the vector accumulators and sum their lanes */
    gmx_simd4_storeu_r(fx_tmp, fx_S);
    gmx_simd4_storeu_r(fy_tmp, fy_S);
    gmx_simd4_storeu_r(fz_tmp, fz_S);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SIMD4_ORDER4
#endif

#ifdef PME_SPREAD_SIMD4_ALIGNED
/* This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
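/* Since k0 need not be a multiple of 4, the start of the z-line is
 * rounded down to the aligned element k0 - offset, with offset = k0 & 3,
 * and up to 5 spline points are covered by two aligned SIMD4 vectors
 * (8 reals).  The z-spline vectors are zeroed outside the window
 * [offset, offset+PME_ORDER) by the precomputed masks in work->mask_S0
 * and work->mask_S1, so the extra grid elements are loaded and stored
 * unchanged (their increment is exactly zero).
 */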
{
    int              offset;
    int              index;
    gmx_simd4_real_t ty_S0, ty_S1, ty_S2, ty_S3, ty_S4;
    gmx_simd4_real_t tz_S0;
    gmx_simd4_real_t tz_S1;
    gmx_simd4_real_t vx_S;
    gmx_simd4_real_t vx_tz_S0;
    gmx_simd4_real_t vx_tz_S1;
    gmx_simd4_real_t sum_S00, sum_S01, sum_S02, sum_S03, sum_S04;
    gmx_simd4_real_t sum_S10, sum_S11, sum_S12, sum_S13, sum_S14;
    gmx_simd4_real_t gri_S00, gri_S01, gri_S02, gri_S03, gri_S04;
    gmx_simd4_real_t gri_S10, gri_S11, gri_S12, gri_S13, gri_S14;

    /* offset = k0 modulo 4: the position of k0 within its aligned chunk */
    offset = k0 & 3;

    ty_S0 = gmx_simd4_set1_r(thy[0]);
    ty_S1 = gmx_simd4_set1_r(thy[1]);
    ty_S2 = gmx_simd4_set1_r(thy[2]);
    ty_S3 = gmx_simd4_set1_r(thy[3]);
#if PME_ORDER == 5
    ty_S4 = gmx_simd4_set1_r(thy[4]);
#endif

#ifdef GMX_SIMD4_HAVE_UNALIGNED
    tz_S0 = gmx_simd4_loadu_r(thz-offset);
    tz_S1 = gmx_simd4_loadu_r(thz-offset+4);
#else
    {
        int i;
        /* Copy thz to an aligned buffer (unused buffer parts are masked) */
        for (i = 0; i < PME_ORDER; i++)
        {
            thz_aligned[offset+i] = thz[i];
        }
        tz_S0 = gmx_simd4_load_r(thz_aligned);
        tz_S1 = gmx_simd4_load_r(thz_aligned+4);
    }
#endif

    /* Zero the z-spline lanes outside the window [offset, offset+PME_ORDER) */
    tz_S0 = gmx_simd4_blendzero_r(tz_S0, work->mask_S0[offset]);
    tz_S1 = gmx_simd4_blendzero_r(tz_S1, work->mask_S1[offset]);
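    /* For example, with PME_ORDER 5 and offset 1 the window [1, 6) keeps
     * lanes 1-3 of tz_S0 and lanes 0-1 of tz_S1; all other lanes are set
     * to zero (assuming the masks were precomputed with this layout,
     * which is not defined in this file).
     */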

    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        /* valx: the charge (qn in the including scope) times the x-spline */
        valx  = qn*thx[ithx];

        vx_S  = gmx_simd4_set1_r(valx);

        vx_tz_S0 = gmx_simd4_mul_r(vx_S, tz_S0);
        vx_tz_S1 = gmx_simd4_mul_r(vx_S, tz_S1);

        gri_S00 = gmx_simd4_load_r(grid+index+0*pnz);
        gri_S01 = gmx_simd4_load_r(grid+index+1*pnz);
        gri_S02 = gmx_simd4_load_r(grid+index+2*pnz);
        gri_S03 = gmx_simd4_load_r(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_S04 = gmx_simd4_load_r(grid+index+4*pnz);
#endif
        gri_S10 = gmx_simd4_load_r(grid+index+0*pnz+4);
        gri_S11 = gmx_simd4_load_r(grid+index+1*pnz+4);
        gri_S12 = gmx_simd4_load_r(grid+index+2*pnz+4);
        gri_S13 = gmx_simd4_load_r(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_S14 = gmx_simd4_load_r(grid+index+4*pnz+4);
#endif

        sum_S00 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S0, gri_S00);
        sum_S01 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S1, gri_S01);
        sum_S02 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S2, gri_S02);
        sum_S03 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S3, gri_S03);
#if PME_ORDER == 5
        sum_S04 = gmx_simd4_fmadd_r(vx_tz_S0, ty_S4, gri_S04);
#endif
        sum_S10 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S0, gri_S10);
        sum_S11 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S1, gri_S11);
        sum_S12 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S2, gri_S12);
        sum_S13 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S3, gri_S13);
#if PME_ORDER == 5
        sum_S14 = gmx_simd4_fmadd_r(vx_tz_S1, ty_S4, gri_S14);
#endif

        gmx_simd4_store_r(grid+index+0*pnz, sum_S00);
        gmx_simd4_store_r(grid+index+1*pnz, sum_S01);
        gmx_simd4_store_r(grid+index+2*pnz, sum_S02);
        gmx_simd4_store_r(grid+index+3*pnz, sum_S03);
#if PME_ORDER == 5
        gmx_simd4_store_r(grid+index+4*pnz, sum_S04);
#endif
        gmx_simd4_store_r(grid+index+0*pnz+4, sum_S10);
        gmx_simd4_store_r(grid+index+1*pnz+4, sum_S11);
        gmx_simd4_store_r(grid+index+2*pnz+4, sum_S12);
        gmx_simd4_store_r(grid+index+3*pnz+4, sum_S13);
#if PME_ORDER == 5
        gmx_simd4_store_r(grid+index+4*pnz+4, sum_S14);
#endif
    }
}
#undef PME_SPREAD_SIMD4_ALIGNED
#undef PME_ORDER
#endif

#ifdef PME_GATHER_F_SIMD4_ALIGNED
/* This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
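/* As in the aligned spread kernel above, each z-line is processed as two
 * aligned SIMD4 vectors starting at k0 - offset; the masked-out spline
 * lanes contribute exact zeros to the force accumulators, so no scalar
 * tail handling is needed.
 */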
{
    int              offset;

    real             fx_tmp[4], fy_tmp[4], fz_tmp[4];

    gmx_simd4_real_t fx_S, fy_S, fz_S;

    gmx_simd4_real_t tx_S, ty_S, tz_S0, tz_S1;
    gmx_simd4_real_t dx_S, dy_S, dz_S0, dz_S1;

    gmx_simd4_real_t gval_S0;
    gmx_simd4_real_t gval_S1;

    gmx_simd4_real_t fxy1_S0;
    gmx_simd4_real_t fz1_S0;
    gmx_simd4_real_t fxy1_S1;
    gmx_simd4_real_t fz1_S1;
    gmx_simd4_real_t fxy1_S;
    gmx_simd4_real_t fz1_S;

    /* offset = k0 modulo 4: the position of k0 within its aligned chunk */
    offset = k0 & 3;

    fx_S = gmx_simd4_setzero_r();
    fy_S = gmx_simd4_setzero_r();
    fz_S = gmx_simd4_setzero_r();

#ifdef GMX_SIMD4_HAVE_UNALIGNED
    tz_S0 = gmx_simd4_loadu_r(thz-offset);
    tz_S1 = gmx_simd4_loadu_r(thz-offset+4);
    dz_S0 = gmx_simd4_loadu_r(dthz-offset);
    dz_S1 = gmx_simd4_loadu_r(dthz-offset+4);
#else
    {
        int i;
        /* Copy (d)thz to an aligned buffer (unused buffer parts are masked) */
        for (i = 0; i < PME_ORDER; i++)
        {
            thz_aligned[offset+i]  = thz[i];
            dthz_aligned[offset+i] = dthz[i];
        }
        tz_S0 = gmx_simd4_load_r(thz_aligned);
        tz_S1 = gmx_simd4_load_r(thz_aligned+4);
        dz_S0 = gmx_simd4_load_r(dthz_aligned);
        dz_S1 = gmx_simd4_load_r(dthz_aligned+4);
    }
#endif

    /* Zero the (derivative) z-spline lanes outside [offset, offset+PME_ORDER) */
    tz_S0 = gmx_simd4_blendzero_r(tz_S0, work->mask_S0[offset]);
    dz_S0 = gmx_simd4_blendzero_r(dz_S0, work->mask_S0[offset]);
    tz_S1 = gmx_simd4_blendzero_r(tz_S1, work->mask_S1[offset]);
    dz_S1 = gmx_simd4_blendzero_r(dz_S1, work->mask_S1[offset]);

    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        tx_S    = gmx_simd4_set1_r(thx[ithx]);
        dx_S    = gmx_simd4_set1_r(dthx[ithx]);

        for (ithy = 0; (ithy < PME_ORDER); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_S     = gmx_simd4_set1_r(thy[ithy]);
            dy_S     = gmx_simd4_set1_r(dthy[ithy]);

            gval_S0  = gmx_simd4_load_r(grid+index_xy+k0-offset);
            gval_S1  = gmx_simd4_load_r(grid+index_xy+k0-offset+4);

            fxy1_S0  = gmx_simd4_mul_r(tz_S0, gval_S0);
            fz1_S0   = gmx_simd4_mul_r(dz_S0, gval_S0);
            fxy1_S1  = gmx_simd4_mul_r(tz_S1, gval_S1);
            fz1_S1   = gmx_simd4_mul_r(dz_S1, gval_S1);

            fxy1_S   = gmx_simd4_add_r(fxy1_S0, fxy1_S1);
            fz1_S    = gmx_simd4_add_r(fz1_S0, fz1_S1);

            fx_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(dx_S, ty_S), fxy1_S, fx_S);
            fy_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, dy_S), fxy1_S, fy_S);
            fz_S     = gmx_simd4_fmadd_r(gmx_simd4_mul_r(tx_S, ty_S), fz1_S, fz_S);
        }
    }

    /* Horizontal reduction: store the vector accumulators and sum their lanes */
    gmx_simd4_store_r(fx_tmp, fx_S);
    gmx_simd4_store_r(fy_tmp, fy_S);
    gmx_simd4_store_r(fz_tmp, fz_S);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SIMD4_ALIGNED
#undef PME_ORDER
#endif