/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 *          GROningen MAchine for Chemical Simulations
 *
 *                        VERSION 4.5
 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team,
 * check out http://www.gromacs.org for more information.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * If you want to redistribute modifications, please consider that
 * scientific software is very special. Version control is crucial -
 * bugs must be traceable. We will be happy to consider code for
 * inclusion in the official distribution, but derived work must not
 * be called official GROMACS. Details are found in the README & COPYING
 * files - if they are missing, get the official version at www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the papers on the package - you can find them in the top README file.
 *
 * For more info, check our website at http://www.gromacs.org
 *
 * And Hey:
 * GROwing Monsters And Cloning Shrimps
 */

/* This include file has code between #ifdef's to ensure
 * that this performance-sensitive code is inlined,
 * and that conditionals and variable loop bounds are removed at compile time.
 */
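
/* A minimal sketch of the intended usage from an including source file
 * (an assumption for illustration; the actual include sites in the PME
 * spread/gather routines are not shown here):
 *
 *     #define PME_SPREAD_SSE_ORDER4
 *     #include "pme_sse_single.h"
 *
 * or, for the aligned variants, which also need the interpolation order:
 *
 *     #define PME_ORDER 5
 *     #define PME_SPREAD_SSE_ALIGNED
 *     #include "pme_sse_single.h"
 *
 * Each block #undef's its controlling macros at the end, so this file can
 * be included several times with different settings in one compilation unit.
 */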

#ifdef PME_SPREAD_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
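/* The including scope is expected to provide (not declared here):
 *   thx,thy,thz - the spline weights along x, y and z for this charge,
 *   qn          - the charge,
 *   grid        - the charge grid; pny and pnz are its allocated y/z sizes,
 *   i0,j0,k0    - the lower corner of this charge's grid stencil,
 *   ithx, index_x, valx - loop and scratch variables.
 */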
{
    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3;
    __m128 tz_SSE;
    __m128 vx_SSE;
    __m128 vx_tz_SSE;
    __m128 sum_SSE0,sum_SSE1,sum_SSE2,sum_SSE3;
    __m128 gri_SSE0,gri_SSE1,gri_SSE2,gri_SSE3;

    /* Broadcast each of the 4 y spline weights to all four SSE lanes */
    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);

    /* Load all 4 z spline weights with one unaligned load */
    tz_SSE  = _mm_loadu_ps(thz);

    for(ithx=0; (ithx<4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        valx    = qn*thx[ithx];

        vx_SSE   = _mm_load1_ps(&valx);

        /* charge*thx[ithx]*thz[0..3], shared by all four y rows */
        vx_tz_SSE = _mm_mul_ps(vx_SSE,tz_SSE);

        /* Read-modify-write one 4-element z row per y grid line */
        gri_SSE0 = _mm_loadu_ps(grid+index_x+(j0+0)*pnz+k0);
        gri_SSE1 = _mm_loadu_ps(grid+index_x+(j0+1)*pnz+k0);
        gri_SSE2 = _mm_loadu_ps(grid+index_x+(j0+2)*pnz+k0);
        gri_SSE3 = _mm_loadu_ps(grid+index_x+(j0+3)*pnz+k0);

        sum_SSE0 = _mm_add_ps(gri_SSE0,_mm_mul_ps(vx_tz_SSE,ty_SSE0));
        sum_SSE1 = _mm_add_ps(gri_SSE1,_mm_mul_ps(vx_tz_SSE,ty_SSE1));
        sum_SSE2 = _mm_add_ps(gri_SSE2,_mm_mul_ps(vx_tz_SSE,ty_SSE2));
        sum_SSE3 = _mm_add_ps(gri_SSE3,_mm_mul_ps(vx_tz_SSE,ty_SSE3));

        _mm_storeu_ps(grid+index_x+(j0+0)*pnz+k0,sum_SSE0);
        _mm_storeu_ps(grid+index_x+(j0+1)*pnz+k0,sum_SSE1);
        _mm_storeu_ps(grid+index_x+(j0+2)*pnz+k0,sum_SSE2);
        _mm_storeu_ps(grid+index_x+(j0+3)*pnz+k0,sum_SSE3);
    }
}
#undef PME_SPREAD_SSE_ORDER4
#endif


#ifdef PME_GATHER_F_SSE_ORDER4
/* This code does not assume any memory alignment.
 * This code only works for pme_order = 4.
 */
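/* The force components are accumulated over the stencil as sums of grid
 * values weighted by spline products, with the derivative spline (dth*)
 * taken along the component's own dimension:
 *   fx += dthx*thy*thz*grid
 *   fy += thx*dthy*thz*grid
 *   fz += thx*thy*dthz*grid
 */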
{
    float fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE;
    __m128 dx_SSE,dy_SSE,dz_SSE;

    __m128 gval_SSE;

    __m128 fxy1_SSE;
    __m128 fz1_SSE;

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    /* Load the z spline weights and their derivatives (unaligned) */
    tz_SSE  = _mm_loadu_ps(thz);
    dz_SSE  = _mm_loadu_ps(dthz);

    for(ithx=0; (ithx<4); ithx++)
    {
        index_x  = (i0+ithx)*pny*pnz;
        tx_SSE   = _mm_load1_ps(thx+ithx);
        dx_SSE   = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<4); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            gval_SSE = _mm_loadu_ps(grid+index_xy+k0);

            fxy1_SSE = _mm_mul_ps(tz_SSE,gval_SSE);
            fz1_SSE  = _mm_mul_ps(dz_SSE,gval_SSE);

            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }

    /* Horizontal sum: reduce the four SSE lanes to scalar force contributions */
    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_GATHER_F_SSE_ORDER4
#endif


#ifdef PME_SPREAD_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
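/* Instead of unaligned accesses, k0 is rounded down to a multiple of 4
 * (offset = k0 & 3) so that all grid loads and stores are 16-byte aligned.
 * The z spline weights are shifted by the same offset and the elements
 * outside [k0,k0+PME_ORDER) are zeroed with the precomputed masks
 * work->mask_SSE0[offset] and work->mask_SSE1[offset], so the padding
 * elements are read and written back unchanged.
 */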
{
    int offset;
    int index;
    __m128 ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3,ty_SSE4;
    __m128 tz_SSE0;
    __m128 tz_SSE1;
    __m128 vx_SSE;
    __m128 vx_tz_SSE0;
    __m128 vx_tz_SSE1;
    __m128 sum_SSE00,sum_SSE01,sum_SSE02,sum_SSE03,sum_SSE04;
    __m128 sum_SSE10,sum_SSE11,sum_SSE12,sum_SSE13,sum_SSE14;
    __m128 gri_SSE00,gri_SSE01,gri_SSE02,gri_SSE03,gri_SSE04;
    __m128 gri_SSE10,gri_SSE11,gri_SSE12,gri_SSE13,gri_SSE14;

    /* Round k0 down to the nearest multiple of 4 for aligned grid access */
    offset = k0 & 3;

    ty_SSE0 = _mm_load1_ps(&thy[0]);
    ty_SSE1 = _mm_load1_ps(&thy[1]);
    ty_SSE2 = _mm_load1_ps(&thy[2]);
    ty_SSE3 = _mm_load1_ps(&thy[3]);
#if PME_ORDER == 5
    ty_SSE4 = _mm_load1_ps(&thy[4]);
#endif

    /* Shift the z weights by offset and zero the out-of-stencil elements */
    tz_SSE0 = _mm_loadu_ps(thz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);

    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        valx  = qn*thx[ithx];

        vx_SSE   = _mm_load1_ps(&valx);

        vx_tz_SSE0 = _mm_mul_ps(vx_SSE,tz_SSE0);
        vx_tz_SSE1 = _mm_mul_ps(vx_SSE,tz_SSE1);

        /* Aligned loads of two 4-element z chunks per y grid line */
        gri_SSE00 = _mm_load_ps(grid+index+0*pnz);
        gri_SSE01 = _mm_load_ps(grid+index+1*pnz);
        gri_SSE02 = _mm_load_ps(grid+index+2*pnz);
        gri_SSE03 = _mm_load_ps(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_SSE04 = _mm_load_ps(grid+index+4*pnz);
#endif
        gri_SSE10 = _mm_load_ps(grid+index+0*pnz+4);
        gri_SSE11 = _mm_load_ps(grid+index+1*pnz+4);
        gri_SSE12 = _mm_load_ps(grid+index+2*pnz+4);
        gri_SSE13 = _mm_load_ps(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_SSE14 = _mm_load_ps(grid+index+4*pnz+4);
#endif

        sum_SSE00 = _mm_add_ps(gri_SSE00,_mm_mul_ps(vx_tz_SSE0,ty_SSE0));
        sum_SSE01 = _mm_add_ps(gri_SSE01,_mm_mul_ps(vx_tz_SSE0,ty_SSE1));
        sum_SSE02 = _mm_add_ps(gri_SSE02,_mm_mul_ps(vx_tz_SSE0,ty_SSE2));
        sum_SSE03 = _mm_add_ps(gri_SSE03,_mm_mul_ps(vx_tz_SSE0,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE04 = _mm_add_ps(gri_SSE04,_mm_mul_ps(vx_tz_SSE0,ty_SSE4));
#endif
        sum_SSE10 = _mm_add_ps(gri_SSE10,_mm_mul_ps(vx_tz_SSE1,ty_SSE0));
        sum_SSE11 = _mm_add_ps(gri_SSE11,_mm_mul_ps(vx_tz_SSE1,ty_SSE1));
        sum_SSE12 = _mm_add_ps(gri_SSE12,_mm_mul_ps(vx_tz_SSE1,ty_SSE2));
        sum_SSE13 = _mm_add_ps(gri_SSE13,_mm_mul_ps(vx_tz_SSE1,ty_SSE3));
#if PME_ORDER == 5
        sum_SSE14 = _mm_add_ps(gri_SSE14,_mm_mul_ps(vx_tz_SSE1,ty_SSE4));
#endif

        _mm_store_ps(grid+index+0*pnz,sum_SSE00);
        _mm_store_ps(grid+index+1*pnz,sum_SSE01);
        _mm_store_ps(grid+index+2*pnz,sum_SSE02);
        _mm_store_ps(grid+index+3*pnz,sum_SSE03);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz,sum_SSE04);
#endif
        _mm_store_ps(grid+index+0*pnz+4,sum_SSE10);
        _mm_store_ps(grid+index+1*pnz+4,sum_SSE11);
        _mm_store_ps(grid+index+2*pnz+4,sum_SSE12);
        _mm_store_ps(grid+index+3*pnz+4,sum_SSE13);
#if PME_ORDER == 5
        _mm_store_ps(grid+index+4*pnz+4,sum_SSE14);
#endif
    }
}
#undef PME_ORDER
#undef PME_SPREAD_SSE_ALIGNED
#endif


#ifdef PME_GATHER_F_SSE_ALIGNED
/* This code assumes that the grid is allocated 16-byte aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
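/* The same alignment trick as in the spread kernel above: k0 is rounded
 * down to a multiple of 4, and the z weights thz and their derivatives
 * dthz are shifted and masked, so the out-of-stencil lanes contribute
 * exactly zero to the force accumulation.
 */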
{
    int   offset;

    float fx_tmp[4],fy_tmp[4],fz_tmp[4];

    __m128 fx_SSE,fy_SSE,fz_SSE;

    __m128 tx_SSE,ty_SSE,tz_SSE0,tz_SSE1;
    __m128 dx_SSE,dy_SSE,dz_SSE0,dz_SSE1;

    __m128 gval_SSE0;
    __m128 gval_SSE1;

    __m128 fxy1_SSE0;
    __m128 fz1_SSE0;
    __m128 fxy1_SSE1;
    __m128 fz1_SSE1;
    __m128 fxy1_SSE;
    __m128 fz1_SSE;

    offset = k0 & 3;

    fx_SSE = _mm_setzero_ps();
    fy_SSE = _mm_setzero_ps();
    fz_SSE = _mm_setzero_ps();

    /* Shift and mask the z weights and derivatives, as in the spread code */
    tz_SSE0 = _mm_loadu_ps(thz-offset);
    dz_SSE0 = _mm_loadu_ps(dthz-offset);
    tz_SSE1 = _mm_loadu_ps(thz-offset+4);
    dz_SSE1 = _mm_loadu_ps(dthz-offset+4);
    tz_SSE0 = _mm_and_ps(tz_SSE0,work->mask_SSE0[offset]);
    dz_SSE0 = _mm_and_ps(dz_SSE0,work->mask_SSE0[offset]);
    tz_SSE1 = _mm_and_ps(tz_SSE1,work->mask_SSE1[offset]);
    dz_SSE1 = _mm_and_ps(dz_SSE1,work->mask_SSE1[offset]);

    for(ithx=0; (ithx<PME_ORDER); ithx++)
    {
        index_x  = (i0+ithx)*pny*pnz;
        tx_SSE   = _mm_load1_ps(thx+ithx);
        dx_SSE   = _mm_load1_ps(dthx+ithx);

        for(ithy=0; (ithy<PME_ORDER); ithy++)
        {
            index_xy = index_x+(j0+ithy)*pnz;
            ty_SSE   = _mm_load1_ps(thy+ithy);
            dy_SSE   = _mm_load1_ps(dthy+ithy);

            /* Aligned loads of the two 4-element z chunks */
            gval_SSE0 = _mm_load_ps(grid+index_xy+k0-offset);
            gval_SSE1 = _mm_load_ps(grid+index_xy+k0-offset+4);

            fxy1_SSE0 = _mm_mul_ps(tz_SSE0,gval_SSE0);
            fz1_SSE0  = _mm_mul_ps(dz_SSE0,gval_SSE0);
            fxy1_SSE1 = _mm_mul_ps(tz_SSE1,gval_SSE1);
            fz1_SSE1  = _mm_mul_ps(dz_SSE1,gval_SSE1);

            fxy1_SSE = _mm_add_ps(fxy1_SSE0,fxy1_SSE1);
            fz1_SSE  = _mm_add_ps(fz1_SSE0,fz1_SSE1);

            fx_SSE = _mm_add_ps(fx_SSE,_mm_mul_ps(_mm_mul_ps(dx_SSE,ty_SSE),fxy1_SSE));
            fy_SSE = _mm_add_ps(fy_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,dy_SSE),fxy1_SSE));
            fz_SSE = _mm_add_ps(fz_SSE,_mm_mul_ps(_mm_mul_ps(tx_SSE,ty_SSE),fz1_SSE));
        }
    }

    /* fx_tmp etc. are not guaranteed 16-byte aligned, so use unaligned
     * stores for the horizontal reduction of the four SSE lanes.
     */
    _mm_storeu_ps(fx_tmp,fx_SSE);
    _mm_storeu_ps(fy_tmp,fy_SSE);
    _mm_storeu_ps(fz_tmp,fz_SSE);

    fx += fx_tmp[0]+fx_tmp[1]+fx_tmp[2]+fx_tmp[3];
    fy += fy_tmp[0]+fy_tmp[1]+fy_tmp[2]+fy_tmp[3];
    fz += fz_tmp[0]+fz_tmp[1]+fz_tmp[2]+fz_tmp[3];
}
#undef PME_ORDER
#undef PME_GATHER_F_SSE_ALIGNED
#endif