src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef _nbnxn_kernel_simd_utils_ref_h_
  36 #define _nbnxn_kernel_simd_utils_ref_h_
  37
  38 typedef gmx_simd_ref_epi32      gmx_simd_ref_exclfilter;
  39 typedef gmx_simd_ref_exclfilter gmx_exclfilter;
  40 static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
  41
  42 /* Set the stride for the lookup of the two LJ parameters from their
  43    (padded) array. Only strides of 2 and 4 are currently supported. */
  44 #if defined GMX_NBNXN_SIMD_2XNN
  45 static const int nbfp_stride = 4;
  46 #elif defined GMX_DOUBLE
  47 static const int nbfp_stride = 2;
  48 #else
  49 static const int nbfp_stride = 4;
  50 #endif
  51
  52 #if GMX_SIMD_WIDTH_HERE > 4
  53 /* The 4xn kernel operates on 4-wide i-force registers */
  54
  55 /* float/double SIMD register type */
  56 typedef struct {
  57     real r[4];
  58 } gmx_mm_pr4;
  59
  60 static gmx_inline gmx_mm_pr4
  61 gmx_load_pr4(const real *r)
  62 {
  63     gmx_mm_pr4 a;
  64     int        i;
  65
  66     for (i = 0; i < 4; i++)
  67     {
  68         a.r[i] = r[i];
  69     }
  70
  71     return a;
  72 }
  73
  74 static gmx_inline void
  75 gmx_store_pr4(real *dest, gmx_mm_pr4 src)
  76 {
  77     gmx_mm_pr4 a;
  78     int        i;
  79
  80     for (i = 0; i < 4; i++)
  81     {
  82         dest[i] = src.r[i];
  83     }
  84 }
  85
  86 static gmx_inline gmx_mm_pr4
  87 gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b)
  88 {
  89     gmx_mm_pr4 c;
  90     int        i;
  91
  92     for (i = 0; i < 4; i++)
  93     {
  94         c.r[i] = a.r[i] + b.r[i];
  95     }
  96
  97     return c;
  98 }
  99
 100 #else
 101
 102 typedef gmx_simd_ref_pr gmx_simd_ref_pr4;
 103
 104 #endif
 105
 106
 107 #ifdef GMX_NBNXN_SIMD_2XNN
 108
 109 /* Half-width operations are required for the 2xnn kernels */
 110
 111 /* Half-width SIMD real type */
 112 /* float/double SIMD register type */
 113 typedef struct {
 114     real r[GMX_SIMD_WIDTH_HERE/2];
 115 } gmx_mm_hpr;
 116
 117 /* Half-width SIMD operations */
 118
 119 /* Load reals at half-width aligned pointer b into half-width SIMD register a */
 120 static gmx_inline void
 121 gmx_load_hpr(gmx_mm_hpr *a, const real *b)
 122 {
 123     int i;
 124
 125     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 126     {
 127         a->r[i] = b[i];
 128     }
 129 }
 130
 131 /* Set all entries in half-width SIMD register *a to b */
 132 static gmx_inline void
 133 gmx_set1_hpr(gmx_mm_hpr *a, real b)
 134 {
 135     int i;
 136
 137     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 138     {
 139         a->r[i] = b;
 140     }
 141 }
 142
 143 /* Load one real at b and one real at b+1 into halves of a, respectively */
 144 static gmx_inline void
 145 gmx_load1p1_pr(gmx_simd_ref_pr *a, const real *b)
 146 {
 147     int i;
 148
 149     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 150     {
 151         a->r[                        i] = b[0];
 152         a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[1];
 153     }
 154 }
 155
 156 /* Load reals at half-width aligned pointer b into two halves of a */
 157 static gmx_inline void
 158 gmx_loaddh_pr(gmx_simd_ref_pr *a, const real *b)
 159 {
 160     int i;
 161
 162     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 163     {
 164         a->r[i]                         = b[i];
 165         a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[i];
 166     }
 167 }
 168
 169 /* Store half-width SIMD register b into half width aligned memory a */
 170 static gmx_inline void
 171 gmx_store_hpr(real *a, gmx_mm_hpr b)
 172 {
 173     int i;
 174
 175     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 176     {
 177         a[i] = b.r[i];
 178     }
 179 }
 180
 181 static gmx_inline gmx_mm_hpr
 182 gmx_add_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
 183 {
 184     gmx_mm_hpr c;
 185     int        i;
 186
 187     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 188     {
 189         c.r[i] = a.r[i] + b.r[i];
 190     }
 191
 192     return c;
 193 }
 194
 195 static gmx_inline gmx_mm_hpr
 196 gmx_sub_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
 197 {
 198     gmx_mm_hpr c;
 199     int        i;
 200
 201     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 202     {
 203         c.r[i] = a.r[i] - b.r[i];
 204     }
 205
 206     return c;
 207 }
 208
 209 /* Sum over 4 half SIMD registers */
 210 static gmx_inline gmx_mm_hpr
 211 gmx_sum4_hpr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
 212 {
 213     gmx_mm_hpr c;
 214     int        i;
 215
 216     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 217     {
 218         c.r[i] =
 219             a.r[i] +
 220             a.r[GMX_SIMD_WIDTH_HERE/2+i] +
 221             b.r[i] +
 222             b.r[GMX_SIMD_WIDTH_HERE/2+i];
 223     }
 224
 225     return c;
 226 }
 227
 228 /* Sum the elements of halfs of each input register and store sums in out */
 229 static gmx_inline gmx_mm_pr4
 230 gmx_mm_transpose_sum4h_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
 231 {
 232     gmx_mm_pr4 sum;
 233     int        i;
 234
 235     sum.r[0] = 0;
 236     sum.r[1] = 0;
 237     sum.r[2] = 0;
 238     sum.r[3] = 0;
 239
 240     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 241     {
 242         sum.r[0] += a.r[i];
 243         sum.r[1] += a.r[GMX_SIMD_WIDTH_HERE/2+i];
 244         sum.r[2] += b.r[i];
 245         sum.r[3] += b.r[GMX_SIMD_WIDTH_HERE/2+i];
 246     }
 247
 248     return sum;
 249 }
 250
 251 static gmx_inline void
 252 gmx_pr_to_2hpr(gmx_simd_ref_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
 253 {
 254     int i;
 255
 256     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 257     {
 258         b->r[i] = a.r[i];
 259         c->r[i] = a.r[GMX_SIMD_WIDTH_HERE/2 + i];
 260     }
 261 }
 262 static gmx_inline void
 263 gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_ref_pr *c)
 264 {
 265     int i;
 266
 267     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 268     {
 269         c->r[i]                         = a.r[i];
 270         c->r[GMX_SIMD_WIDTH_HERE/2 + i] = b.r[i];
 271     }
 272 }
 273
 274 #endif /* GMX_NBNXN_SIMD_2XNN */
 275
 276
 277 #ifndef TAB_FDV0
 278 static gmx_inline void
 279 load_table_f(const real *tab_coul_F, gmx_simd_ref_epi32 ti_S, int *ti,
 280              gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
 281 {
 282     int i;
 283
 284     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 285     {
 286         ctab0_S->r[i] = tab_coul_F[ti_S.r[i]];
 287         ctab1_S->r[i] = tab_coul_F[ti_S.r[i]+1];
 288     }
 289
 290     *ctab1_S  = gmx_sub_pr(*ctab1_S, *ctab0_S);
 291 }
 292
 293 static gmx_inline void
 294 load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
 295                gmx_simd_ref_epi32 ti_S, int *ti,
 296                gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
 297                gmx_simd_ref_pr *ctabv_S)
 298 {
 299     int i;
 300
 301     load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S);
 302
 303     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 304     {
 305         ctabv_S->r[i] = tab_coul_V[ti_S.r[i]];
 306     }
 307 }
 308 #endif
 309
 310 #ifdef TAB_FDV0
 311 static gmx_inline void
 312 load_table_f(const real *tab_coul_FDV0, gmx_simd_ref_epi32 ti_S, int *ti,
 313              gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
 314 {
 315     int i;
 316
 317     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 318     {
 319         ctab0_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4];
 320         ctab1_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+1];
 321     }
 322 }
 323
 324 static gmx_inline void
 325 load_table_f_v(const real *tab_coul_FDV0,
 326                gmx_simd_ref_epi32 ti_S, int *ti,
 327                gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
 328                gmx_simd_ref_pr *ctabv_S)
 329 {
 330     int i;
 331
 332     load_table_f(tab_coul_FDV0, ti_S, ti, ctab0_S, ctab1_S);
 333
 334     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 335     {
 336         ctabv_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+2];
 337     }
 338 }
 339 #endif
 340
 341 /* Sum the elements within each input register and store the sums in out.
 342  * Note that 4/8-way SIMD requires gmx_mm_transpose_sum4_pr instead.
 343  */
 344 #if GMX_SIMD_WIDTH_HERE == 2
 345 static gmx_inline gmx_simd_ref_pr
 346 gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1)
 347 {
 348     gmx_simd_ref_pr sum;
 349
 350     sum.r[0] = in0.r[0] + in0.r[1];
 351     sum.r[1] = in1.r[0] + in1.r[1];
 352
 353     return sum;
 354 }
 355 #endif
 356
 357 #if GMX_SIMD_WIDTH_HERE >= 4
 358 #if GMX_SIMD_WIDTH_HERE == 4
 359 static gmx_inline gmx_simd_ref_pr
 360 #else
 361 static gmx_inline gmx_mm_pr4
 362 #endif
 363 gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
 364                          gmx_simd_ref_pr in2, gmx_simd_ref_pr in3)
 365 {
 366 #if GMX_SIMD_WIDTH_HERE == 4
 367     gmx_simd_ref_pr sum;
 368 #else
 369     gmx_mm_pr4      sum;
 370 #endif
 371     int             i;
 372
 373     sum.r[0] = 0;
 374     sum.r[1] = 0;
 375     sum.r[2] = 0;
 376     sum.r[3] = 0;
 377
 378     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 379     {
 380         sum.r[0] += in0.r[i];
 381         sum.r[1] += in1.r[i];
 382         sum.r[2] += in2.r[i];
 383         sum.r[3] += in3.r[i];
 384     }
 385
 386     return sum;
 387 }
 388 #endif
 389
 390 #ifdef GMX_DOUBLE
 391 /* In double precision it can be faster to first calculate single precision
 392  * square roots for two double precision registers at once and then use
 393  * double precision Newton-Raphson iteration to reach full double precision.
 394  * For this reference code we just use a plain-C sqrt.
 395  */
 396 static gmx_inline void
 397 gmx_mm_invsqrt2_pd(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
 398                    gmx_simd_ref_pr *out0, gmx_simd_ref_pr *out1)
 399 {
 400     out0 = gmx_invsqrt_pr(in0);
 401     out1 = gmx_invsqrt_pr(in1);
 402 }
 403 #endif
 404
 405 static gmx_inline void
 406 load_lj_pair_params(const real *nbfp, const int *type, int aj,
 407                     gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
 408 {
 409     int i;
 410
 411     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 412     {
 413         c6_S->r[i]  = nbfp[type[aj+i]*nbfp_stride];
 414         c12_S->r[i] = nbfp[type[aj+i]*nbfp_stride+1];
 415     }
 416 }
 417
 418 #ifdef GMX_NBNXN_SIMD_2XNN
 419 static gmx_inline void
 420 load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
 421                      const int *type, int aj,
 422                      gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
 423 {
 424     int i;
 425
 426     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 427     {
 428         c6_S->r[i]                          = nbfp0[type[aj+i]*nbfp_stride];
 429         c6_S->r[GMX_SIMD_WIDTH_HERE/2 + i]  = nbfp1[type[aj+i]*nbfp_stride];
 430         c12_S->r[i]                         = nbfp0[type[aj+i]*nbfp_stride+1];
 431         c12_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*nbfp_stride+1];
 432     }
 433 }
 434 #endif
 435
 436 /* Code for handling loading exclusions and converting them into
 437    interactions. The x86 code might use either integer- or real-type
 438    SIMD, but the reference code does not need to know. */
 439
 440 #define gmx_load1_exclfilter(e)       gmx_simd_ref_load1_exclfilter(e)
 441 #define gmx_load_exclusion_filter(e)  gmx_simd_ref_load_exclusion_filter((int *) e)
 442 #define gmx_checkbitmask_pb(m0, m1)   gmx_simd_ref_checkbitmask_pb(m0, m1)
 443
 444 static gmx_inline gmx_simd_ref_exclfilter
 445 gmx_simd_ref_load1_exclfilter(int src)
 446 {
 447     gmx_simd_ref_exclfilter a;
 448     int                     i;
 449
 450     for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
 451     {
 452         a.r[i] = src;
 453     }
 454
 455     return a;
 456 }
 457
 458 static gmx_inline gmx_simd_ref_exclfilter
 459 gmx_simd_ref_load_exclusion_filter(const int *src)
 460 {
 461     gmx_simd_ref_exclfilter a;
 462     int                     i;
 463
 464     for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
 465     {
 466         a.r[i] = src[i];
 467     }
 468
 469     return a;
 470 }
 471
 472 /* For topology exclusion-pair checking we need: ((a & b) ? True :
 473  * False). The x86 implementations use hardware-suitable integer-
 474  * and/or real-valued SIMD operations and a bit-wise "and" to do
 475  * this. The reference implementation normally uses logical operations
 476  * for logic, but in this case the i- and j-atom exclusion masks
 477  * computed during searching expect to be combined with bit-wise
 478  * "and".
 479  *
 480  * If the same bit is set in both input masks, return TRUE, else
 481  * FALSE. This function is only called with a single bit set in b.
 482  */
 483 static gmx_inline gmx_simd_ref_pb
 484 gmx_simd_ref_checkbitmask_pb(gmx_simd_ref_exclfilter a, gmx_simd_ref_exclfilter b)
 485 {
 486     gmx_simd_ref_pb c;
 487     int             i;
 488
 489     for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
 490     {
 491         c.r[i] = ((a.r[i] & b.r[i]) != 0);
 492     }
 493
 494     return c;
 495 }
 496
 497 #endif /* _nbnxn_kernel_simd_utils_ref_h_ */