src/gromacs/ewald/pme-simd4.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37
/* This include file places its code between #ifdef blocks to make sure
 * that this performance-sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
  42
  43 #ifdef PME_SPREAD_SIMD4_ORDER4
  44 /* Spread one charge with pme_order=4 with unaligned SIMD4 load+store.
  45  * This code does not assume any memory alignment for the grid.
  46  */
  47 {
  48     using namespace gmx;
  49     Simd4Real ty_S0(thy[0]);
  50     Simd4Real ty_S1(thy[1]);
  51     Simd4Real ty_S2(thy[2]);
  52     Simd4Real ty_S3(thy[3]);
  53     Simd4Real tz_S;
  54     Simd4Real vx_S;
  55     Simd4Real vx_tz_S;
  56     Simd4Real sum_S0, sum_S1, sum_S2, sum_S3;
  57     Simd4Real gri_S0, gri_S1, gri_S2, gri_S3;
  58
  59     /* With order 4 the z-spline is actually aligned */
  60     tz_S  = load4(thz);
  61
  62     for (ithx = 0; (ithx < 4); ithx++)
  63     {
  64         index_x = (i0+ithx)*pny*pnz;
  65         valx    = coefficient*thx[ithx];
  66
  67         vx_S   = Simd4Real(valx);
  68
  69         vx_tz_S = vx_S * tz_S;
  70
  71         gri_S0 = load4U(grid+index_x+(j0+0)*pnz+k0);
  72         gri_S1 = load4U(grid+index_x+(j0+1)*pnz+k0);
  73         gri_S2 = load4U(grid+index_x+(j0+2)*pnz+k0);
  74         gri_S3 = load4U(grid+index_x+(j0+3)*pnz+k0);
  75
  76         sum_S0 = fma(vx_tz_S, ty_S0, gri_S0);
  77         sum_S1 = fma(vx_tz_S, ty_S1, gri_S1);
  78         sum_S2 = fma(vx_tz_S, ty_S2, gri_S2);
  79         sum_S3 = fma(vx_tz_S, ty_S3, gri_S3);
  80
  81         store4U(grid+index_x+(j0+0)*pnz+k0, sum_S0);
  82         store4U(grid+index_x+(j0+1)*pnz+k0, sum_S1);
  83         store4U(grid+index_x+(j0+2)*pnz+k0, sum_S2);
  84         store4U(grid+index_x+(j0+3)*pnz+k0, sum_S3);
  85     }
  86 }
  87 #undef PME_SPREAD_SIMD4_ORDER4
  88 #endif
  89
  90
  91 #ifdef PME_SPREAD_SIMD4_ALIGNED
  92 /* This code assumes that the grid is allocated 4-real aligned
  93  * and that pnz is a multiple of 4.
  94  * This code supports pme_order <= 5.
  95  */
  96 {
  97     using namespace gmx;
  98     int              offset;
  99     int              index;
 100     Simd4Real        ty_S0(thy[0]);
 101     Simd4Real        ty_S1(thy[1]);
 102     Simd4Real        ty_S2(thy[2]);
 103     Simd4Real        ty_S3(thy[3]);
 104     Simd4Real        tz_S0;
 105     Simd4Real        tz_S1;
 106     Simd4Real        vx_S;
 107     Simd4Real        vx_tz_S0;
 108     Simd4Real        vx_tz_S1;
 109     Simd4Real        sum_S00, sum_S01, sum_S02, sum_S03;
 110     Simd4Real        sum_S10, sum_S11, sum_S12, sum_S13;
 111     Simd4Real        gri_S00, gri_S01, gri_S02, gri_S03;
 112     Simd4Real        gri_S10, gri_S11, gri_S12, gri_S13;
 113 #if PME_ORDER == 5
 114     Simd4Real        ty_S4(thy[4]);
 115     Simd4Real        sum_S04;
 116     Simd4Real        sum_S14;
 117     Simd4Real        gri_S04;
 118     Simd4Real        gri_S14;
 119 #endif
 120
 121     offset = k0 & 3;
 122
 123 #ifdef PME_SIMD4_UNALIGNED
 124     tz_S0 = load4U(thz-offset);
 125     tz_S1 = load4U(thz-offset+4);
 126 #else
 127     {
 128         int i;
 129         /* Copy thz to an aligned buffer (unused buffer parts are masked) */
 130         for (i = 0; i < PME_ORDER; i++)
 131         {
 132             thz_aligned[offset+i] = thz[i];
 133         }
 134         tz_S0 = load4(thz_aligned);
 135         tz_S1 = load4(thz_aligned+4);
 136     }
 137 #endif
 138     tz_S0 = selectByMask(tz_S0, work->mask_S0[offset]);
 139     tz_S1 = selectByMask(tz_S1, work->mask_S1[offset]);
 140
 141     for (ithx = 0; (ithx < PME_ORDER); ithx++)
 142     {
 143         index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
 144         valx  = coefficient*thx[ithx];
 145
 146         vx_S   = Simd4Real(valx);
 147
 148         vx_tz_S0 = vx_S * tz_S0;
 149         vx_tz_S1 = vx_S * tz_S1;
 150
 151         gri_S00 = load4(grid+index+0*pnz);
 152         gri_S01 = load4(grid+index+1*pnz);
 153         gri_S02 = load4(grid+index+2*pnz);
 154         gri_S03 = load4(grid+index+3*pnz);
 155 #if PME_ORDER == 5
 156         gri_S04 = load4(grid+index+4*pnz);
 157 #endif
 158         gri_S10 = load4(grid+index+0*pnz+4);
 159         gri_S11 = load4(grid+index+1*pnz+4);
 160         gri_S12 = load4(grid+index+2*pnz+4);
 161         gri_S13 = load4(grid+index+3*pnz+4);
 162 #if PME_ORDER == 5
 163         gri_S14 = load4(grid+index+4*pnz+4);
 164 #endif
 165
 166         sum_S00 = fma(vx_tz_S0, ty_S0, gri_S00);
 167         sum_S01 = fma(vx_tz_S0, ty_S1, gri_S01);
 168         sum_S02 = fma(vx_tz_S0, ty_S2, gri_S02);
 169         sum_S03 = fma(vx_tz_S0, ty_S3, gri_S03);
 170 #if PME_ORDER == 5
 171         sum_S04 = fma(vx_tz_S0, ty_S4, gri_S04);
 172 #endif
 173         sum_S10 = fma(vx_tz_S1, ty_S0, gri_S10);
 174         sum_S11 = fma(vx_tz_S1, ty_S1, gri_S11);
 175         sum_S12 = fma(vx_tz_S1, ty_S2, gri_S12);
 176         sum_S13 = fma(vx_tz_S1, ty_S3, gri_S13);
 177 #if PME_ORDER == 5
 178         sum_S14 = fma(vx_tz_S1, ty_S4, gri_S14);
 179 #endif
 180
 181         store4(grid+index+0*pnz, sum_S00);
 182         store4(grid+index+1*pnz, sum_S01);
 183         store4(grid+index+2*pnz, sum_S02);
 184         store4(grid+index+3*pnz, sum_S03);
 185 #if PME_ORDER == 5
 186         store4(grid+index+4*pnz, sum_S04);
 187 #endif
 188         store4(grid+index+0*pnz+4, sum_S10);
 189         store4(grid+index+1*pnz+4, sum_S11);
 190         store4(grid+index+2*pnz+4, sum_S12);
 191         store4(grid+index+3*pnz+4, sum_S13);
 192 #if PME_ORDER == 5
 193         store4(grid+index+4*pnz+4, sum_S14);
 194 #endif
 195     }
 196 }
 197 #undef PME_ORDER
 198 #undef PME_SPREAD_SIMD4_ALIGNED
 199 #endif