src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef _nbnxn_kernel_simd_utils_ref_h_
  36 #define _nbnxn_kernel_simd_utils_ref_h_
  37
  38 #
  39 #include "gromacs/simd/simd_math.h"
  40
  41 typedef gmx_simd_int32_t        gmx_simd_ref_exclfilter;
  42 typedef gmx_simd_ref_exclfilter gmx_exclfilter;
  43 static const int filter_stride = GMX_SIMD_INT32_WIDTH/GMX_SIMD_REAL_WIDTH;
  44
  45 /* Set the stride for the lookup of the two LJ parameters from their
  46    (padded) array. Only strides of 2 and 4 are currently supported. */
  47 #if defined GMX_NBNXN_SIMD_2XNN
  48 static const int nbfp_stride = 4;
  49 #elif defined GMX_DOUBLE
  50 static const int nbfp_stride = 2;
  51 #else
  52 static const int nbfp_stride = 4;
  53 #endif
  54
  55 #if GMX_SIMD_REAL_WIDTH > 4
  56 /* The 4xn kernel operates on 4-wide i-force registers */
  57
  58 /* float/double SIMD register type */
  59 typedef struct {
  60     real r[4];
  61 } gmx_simd4_real_t;
  62
  63 static gmx_inline gmx_simd4_real_t
  64 gmx_simd4_load_r(const real *r)
  65 {
  66     gmx_simd4_real_t a;
  67     int              i;
  68
  69     for (i = 0; i < 4; i++)
  70     {
  71         a.r[i] = r[i];
  72     }
  73
  74     return a;
  75 }
  76
  77 static gmx_inline void
  78 gmx_simd4_store_r(real *dest, gmx_simd4_real_t src)
  79 {
  80     gmx_simd4_real_t a;
  81     int              i;
  82
  83     for (i = 0; i < 4; i++)
  84     {
  85         dest[i] = src.r[i];
  86     }
  87 }
  88
  89 static gmx_inline gmx_simd4_real_t
  90 gmx_simd4_add_r(gmx_simd4_real_t a, gmx_simd4_real_t b)
  91 {
  92     gmx_simd4_real_t c;
  93     int              i;
  94
  95     for (i = 0; i < 4; i++)
  96     {
  97         c.r[i] = a.r[i] + b.r[i];
  98     }
  99
 100     return c;
 101 }
 102
 103 static gmx_inline real
 104 gmx_simd4_reduce_r(gmx_simd4_real_t a)
 105 {
 106     return a.r[0] + a.r[1] + a.r[2] + a.r[3];
 107 }
 108
 109 #endif
 110
 111
 112 #ifdef GMX_NBNXN_SIMD_2XNN
 113
 114 /* Half-width operations are required for the 2xnn kernels */
 115
 116 /* Half-width SIMD real type */
 117 /* float/double SIMD register type */
 118 typedef struct {
 119     real r[GMX_SIMD_REAL_WIDTH/2];
 120 } gmx_mm_hpr;
 121
 122 /* Half-width SIMD operations */
 123
 124 /* Load reals at half-width aligned pointer b into half-width SIMD register a */
 125 static gmx_inline void
 126 gmx_load_hpr(gmx_mm_hpr *a, const real *b)
 127 {
 128     int i;
 129
 130     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 131     {
 132         a->r[i] = b[i];
 133     }
 134 }
 135
 136 /* Set all entries in half-width SIMD register *a to b */
 137 static gmx_inline void
 138 gmx_set1_hpr(gmx_mm_hpr *a, real b)
 139 {
 140     int i;
 141
 142     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 143     {
 144         a->r[i] = b;
 145     }
 146 }
 147
 148 /* Load one real at b and one real at b+1 into halves of a, respectively */
 149 static gmx_inline void
 150 gmx_load1p1_pr(gmx_simd_real_t *a, const real *b)
 151 {
 152     int i;
 153
 154     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 155     {
 156         a->r[                        i] = b[0];
 157         a->r[GMX_SIMD_REAL_WIDTH/2 + i] = b[1];
 158     }
 159 }
 160
 161 /* Load reals at half-width aligned pointer b into two halves of a */
 162 static gmx_inline void
 163 gmx_loaddh_pr(gmx_simd_real_t *a, const real *b)
 164 {
 165     int i;
 166
 167     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 168     {
 169         a->r[i]                         = b[i];
 170         a->r[GMX_SIMD_REAL_WIDTH/2 + i] = b[i];
 171     }
 172 }
 173
 174 /* Store half-width SIMD register b into half width aligned memory a */
 175 static gmx_inline void
 176 gmx_store_hpr(real *a, gmx_mm_hpr b)
 177 {
 178     int i;
 179
 180     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 181     {
 182         a[i] = b.r[i];
 183     }
 184 }
 185
 186 static gmx_inline gmx_mm_hpr
 187 gmx_add_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
 188 {
 189     gmx_mm_hpr c;
 190     int        i;
 191
 192     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 193     {
 194         c.r[i] = a.r[i] + b.r[i];
 195     }
 196
 197     return c;
 198 }
 199
 200 static gmx_inline gmx_mm_hpr
 201 gmx_sub_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
 202 {
 203     gmx_mm_hpr c;
 204     int        i;
 205
 206     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 207     {
 208         c.r[i] = a.r[i] - b.r[i];
 209     }
 210
 211     return c;
 212 }
 213
 214 /* Sum over 4 half SIMD registers */
 215 static gmx_inline gmx_mm_hpr
 216 gmx_sum4_hpr(gmx_simd_real_t a, gmx_simd_real_t b)
 217 {
 218     gmx_mm_hpr c;
 219     int        i;
 220
 221     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 222     {
 223         c.r[i] =
 224             a.r[i] +
 225             a.r[GMX_SIMD_REAL_WIDTH/2+i] +
 226             b.r[i] +
 227             b.r[GMX_SIMD_REAL_WIDTH/2+i];
 228     }
 229
 230     return c;
 231 }
 232
 233 #ifdef GMX_NBNXN_SIMD_2XNN
 234 /* Sum the elements of halfs of each input register and store sums in out */
 235 static gmx_inline gmx_simd4_real_t
 236 gmx_mm_transpose_sum4h_pr(gmx_simd_real_t a, gmx_simd_real_t b)
 237 {
 238     gmx_simd4_real_t sum;
 239     int              i;
 240
 241     sum.r[0] = 0;
 242     sum.r[1] = 0;
 243     sum.r[2] = 0;
 244     sum.r[3] = 0;
 245
 246     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 247     {
 248         sum.r[0] += a.r[i];
 249         sum.r[1] += a.r[GMX_SIMD_REAL_WIDTH/2+i];
 250         sum.r[2] += b.r[i];
 251         sum.r[3] += b.r[GMX_SIMD_REAL_WIDTH/2+i];
 252     }
 253
 254     return sum;
 255 }
 256 #endif
 257
 258 static gmx_inline void
 259 gmx_pr_to_2hpr(gmx_simd_real_t a, gmx_mm_hpr *b, gmx_mm_hpr *c)
 260 {
 261     int i;
 262
 263     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 264     {
 265         b->r[i] = a.r[i];
 266         c->r[i] = a.r[GMX_SIMD_REAL_WIDTH/2 + i];
 267     }
 268 }
 269 static gmx_inline void
 270 gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_real_t *c)
 271 {
 272     int i;
 273
 274     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 275     {
 276         c->r[i]                         = a.r[i];
 277         c->r[GMX_SIMD_REAL_WIDTH/2 + i] = b.r[i];
 278     }
 279 }
 280
 281 #endif /* GMX_NBNXN_SIMD_2XNN */
 282
 283
 284 #ifndef TAB_FDV0
 285 static gmx_inline void
 286 load_table_f(const real *tab_coul_F, gmx_simd_int32_t ti_S,
 287              int gmx_unused *ti,
 288              gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S)
 289 {
 290     int i;
 291
 292     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 293     {
 294         ctab0_S->r[i] = tab_coul_F[ti_S.i[i]];
 295         ctab1_S->r[i] = tab_coul_F[ti_S.i[i]+1];
 296     }
 297
 298     *ctab1_S  = gmx_simd_sub_r(*ctab1_S, *ctab0_S);
 299 }
 300
 301 static gmx_inline void
 302 load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
 303                gmx_simd_int32_t ti_S, int *ti,
 304                gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S,
 305                gmx_simd_real_t *ctabv_S)
 306 {
 307     int i;
 308
 309     load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S);
 310
 311     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 312     {
 313         ctabv_S->r[i] = tab_coul_V[ti_S.i[i]];
 314     }
 315 }
 316 #endif
 317
 318 #ifdef TAB_FDV0
 319 static gmx_inline void
 320 load_table_f(const real *tab_coul_FDV0, gmx_simd_int32_t ti_S, int *ti,
 321              gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S)
 322 {
 323     int i;
 324
 325     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 326     {
 327         ctab0_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4];
 328         ctab1_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4+1];
 329     }
 330 }
 331
 332 static gmx_inline void
 333 load_table_f_v(const real *tab_coul_FDV0,
 334                gmx_simd_int32_t ti_S, int *ti,
 335                gmx_simd_real_t *ctab0_S, gmx_simd_real_t *ctab1_S,
 336                gmx_simd_real_t *ctabv_S)
 337 {
 338     int i;
 339
 340     load_table_f(tab_coul_FDV0, ti_S, ti, ctab0_S, ctab1_S);
 341
 342     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 343     {
 344         ctabv_S->r[i] = tab_coul_FDV0[ti_S.i[i]*4+2];
 345     }
 346 }
 347 #endif
 348
 349 /* Sum the elements within each input register and store the sums in out.
 350  * Note that 4/8-way SIMD requires gmx_mm_transpose_sum4_pr instead.
 351  */
 352 #if GMX_SIMD_REAL_WIDTH == 2
 353 static gmx_inline gmx_simd_real_t
 354 gmx_mm_transpose_sum2_pr(gmx_simd_real_t in0, gmx_simd_real_t in1)
 355 {
 356     gmx_simd_real_t sum;
 357
 358     sum.r[0] = in0.r[0] + in0.r[1];
 359     sum.r[1] = in1.r[0] + in1.r[1];
 360
 361     return sum;
 362 }
 363 #endif
 364
 365 #if GMX_SIMD_REAL_WIDTH >= 4
 366 #if GMX_SIMD_REAL_WIDTH == 4
 367 static gmx_inline gmx_simd_real_t
 368 #else
 369 static gmx_inline gmx_simd4_real_t
 370 #endif
 371 gmx_mm_transpose_sum4_pr(gmx_simd_real_t in0, gmx_simd_real_t in1,
 372                          gmx_simd_real_t in2, gmx_simd_real_t in3)
 373 {
 374 #if GMX_SIMD_REAL_WIDTH == 4
 375     gmx_simd_real_t  sum;
 376 #else
 377     gmx_simd4_real_t sum;
 378 #endif
 379     int              i;
 380
 381     sum.r[0] = 0;
 382     sum.r[1] = 0;
 383     sum.r[2] = 0;
 384     sum.r[3] = 0;
 385
 386     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 387     {
 388         sum.r[0] += in0.r[i];
 389         sum.r[1] += in1.r[i];
 390         sum.r[2] += in2.r[i];
 391         sum.r[3] += in3.r[i];
 392     }
 393
 394     return sum;
 395 }
 396 #endif
 397
 398 #ifdef GMX_DOUBLE
 399 /* In double precision it can be faster to first calculate single precision
 400  * square roots for two double precision registers at once and then use
 401  * double precision Newton-Raphson iteration to reach full double precision.
 402  * For this reference code we just use a plain-C sqrt.
 403  */
 404 static gmx_inline void
 405 gmx_mm_invsqrt2_pd(gmx_simd_real_t in0, gmx_simd_real_t in1,
 406                    gmx_simd_real_t *out0, gmx_simd_real_t *out1)
 407 {
 408     *out0 = gmx_simd_invsqrt_r(in0);
 409     *out1 = gmx_simd_invsqrt_r(in1);
 410 }
 411 #endif
 412
 413 static gmx_inline void
 414 load_lj_pair_params(const real *nbfp, const int *type, int aj,
 415                     gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S)
 416 {
 417     int i;
 418
 419     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 420     {
 421         c6_S->r[i]  = nbfp[type[aj+i]*nbfp_stride];
 422         c12_S->r[i] = nbfp[type[aj+i]*nbfp_stride+1];
 423     }
 424 }
 425
 426 #ifdef GMX_NBNXN_SIMD_2XNN
 427 static gmx_inline void
 428 load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
 429                      const int *type, int aj,
 430                      gmx_simd_real_t *c6_S, gmx_simd_real_t *c12_S)
 431 {
 432     int i;
 433
 434     for (i = 0; i < GMX_SIMD_REAL_WIDTH/2; i++)
 435     {
 436         c6_S->r[i]                          = nbfp0[type[aj+i]*nbfp_stride];
 437         c6_S->r[GMX_SIMD_REAL_WIDTH/2 + i]  = nbfp1[type[aj+i]*nbfp_stride];
 438         c12_S->r[i]                         = nbfp0[type[aj+i]*nbfp_stride+1];
 439         c12_S->r[GMX_SIMD_REAL_WIDTH/2 + i] = nbfp1[type[aj+i]*nbfp_stride+1];
 440     }
 441 }
 442 #endif
 443
 444 /* Code for handling loading exclusions and converting them into
 445    interactions. The x86 code might use either integer- or real-type
 446    SIMD, but the reference code does not need to know. */
 447
 448 #define gmx_load1_exclfilter(e)       gmx_simd_ref_load1_exclfilter(e)
 449 #define gmx_load_exclusion_filter(e)  gmx_simd_ref_load_exclusion_filter((int *) e)
 450 #define gmx_checkbitmask_pb(m0, m1)   gmx_simd_ref_checkbitmask_pb(m0, m1)
 451
 452 static gmx_inline gmx_simd_ref_exclfilter
 453 gmx_simd_ref_load1_exclfilter(int src)
 454 {
 455     gmx_simd_ref_exclfilter a;
 456     int                     i;
 457
 458     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 459     {
 460         a.i[i] = src;
 461     }
 462
 463     return a;
 464 }
 465
 466 static gmx_inline gmx_simd_ref_exclfilter
 467 gmx_simd_ref_load_exclusion_filter(const int *src)
 468 {
 469     gmx_simd_ref_exclfilter a;
 470     int                     i;
 471
 472     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 473     {
 474         a.i[i] = src[i];
 475     }
 476
 477     return a;
 478 }
 479
 480 /* For topology exclusion-pair checking we need: ((a & b) ? True :
 481  * False). The x86 implementations use hardware-suitable integer-
 482  * and/or real-valued SIMD operations and a bit-wise "and" to do
 483  * this. The reference implementation normally uses logical operations
 484  * for logic, but in this case the i- and j-atom exclusion masks
 485  * computed during searching expect to be combined with bit-wise
 486  * "and".
 487  *
 488  * If the same bit is set in both input masks, return TRUE, else
 489  * FALSE. This function is only called with a single bit set in b.
 490  */
 491 static gmx_inline gmx_simd_bool_t
 492 gmx_simd_ref_checkbitmask_pb(gmx_simd_ref_exclfilter a, gmx_simd_ref_exclfilter b)
 493 {
 494     gmx_simd_bool_t c;
 495     int             i;
 496
 497     for (i = 0; i < GMX_SIMD_REAL_WIDTH; i++)
 498     {
 499         c.b[i] = ((a.i[i] & b.i[i]) != 0);
 500     }
 501
 502     return c;
 503 }
 504
 505 #endif /* _nbnxn_kernel_simd_utils_ref_h_ */