src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2012, The GROMACS Development Team
   6  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   7  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   8  * others, as listed in the AUTHORS file in the top-level source
   9  * directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 #ifndef _nbnxn_kernel_simd_utils_ref_h_
  38 #define _nbnxn_kernel_simd_utils_ref_h_
  39
  40 typedef gmx_simd_ref_epi32      gmx_simd_ref_exclfilter;
  41 typedef gmx_simd_ref_exclfilter gmx_exclfilter;
  42 static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
  43
  44 /* Set the stride for the lookup of the two LJ parameters from their
  45    (padded) array. Only strides of 2 and 4 are currently supported. */
  46 #if defined GMX_NBNXN_SIMD_2XNN
  47 static const int nbfp_stride = 4;
  48 #elif defined GMX_DOUBLE
  49 static const int nbfp_stride = 2;
  50 #else
  51 static const int nbfp_stride = 4;
  52 #endif
  53
  54 #if GMX_SIMD_WIDTH_HERE > 4
  55 /* The 4xn kernel operates on 4-wide i-force registers */
  56
  57 /* float/double SIMD register type */
  58 typedef struct {
  59     real r[4];
  60 } gmx_mm_pr4;
  61
  62 static gmx_inline gmx_mm_pr4
  63 gmx_load_pr4(const real *r)
  64 {
  65     gmx_mm_pr4 a;
  66     int        i;
  67
  68     for (i = 0; i < 4; i++)
  69     {
  70         a.r[i] = r[i];
  71     }
  72
  73     return a;
  74 }
  75
  76 static gmx_inline void
  77 gmx_store_pr4(real *dest, gmx_mm_pr4 src)
  78 {
  79     gmx_mm_pr4 a;
  80     int        i;
  81
  82     for (i = 0; i < 4; i++)
  83     {
  84         dest[i] = src.r[i];
  85     }
  86 }
  87
  88 static gmx_inline gmx_mm_pr4
  89 gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b)
  90 {
  91     gmx_mm_pr4 c;
  92     int        i;
  93
  94     for (i = 0; i < 4; i++)
  95     {
  96         c.r[i] = a.r[i] + b.r[i];
  97     }
  98
  99     return c;
 100 }
 101
 102 #else
 103
 104 typedef gmx_simd_ref_pr gmx_simd_ref_pr4;
 105
 106 #endif
 107
 108
 109 #ifdef GMX_NBNXN_SIMD_2XNN
 110
 111 /* Half-width operations are required for the 2xnn kernels */
 112
 113 /* Half-width SIMD real type */
 114 /* float/double SIMD register type */
 115 typedef struct {
 116     real r[GMX_SIMD_WIDTH_HERE/2];
 117 } gmx_mm_hpr;
 118
 119 /* Half-width SIMD operations */
 120
 121 /* Load reals at half-width aligned pointer b into half-width SIMD register a */
 122 static gmx_inline void
 123 gmx_load_hpr(gmx_mm_hpr *a, const real *b)
 124 {
 125     int i;
 126
 127     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 128     {
 129         a->r[i] = b[i];
 130     }
 131 }
 132
 133 /* Set all entries in half-width SIMD register *a to b */
 134 static gmx_inline void
 135 gmx_set1_hpr(gmx_mm_hpr *a, real b)
 136 {
 137     int i;
 138
 139     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 140     {
 141         a->r[i] = b;
 142     }
 143 }
 144
 145 /* Load one real at b and one real at b+1 into halves of a, respectively */
 146 static gmx_inline void
 147 gmx_load1p1_pr(gmx_simd_ref_pr *a, const real *b)
 148 {
 149     int i;
 150
 151     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 152     {
 153         a->r[                        i] = b[0];
 154         a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[1];
 155     }
 156 }
 157
 158 /* Load reals at half-width aligned pointer b into two halves of a */
 159 static gmx_inline void
 160 gmx_loaddh_pr(gmx_simd_ref_pr *a, const real *b)
 161 {
 162     int i;
 163
 164     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 165     {
 166         a->r[i]                         = b[i];
 167         a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[i];
 168     }
 169 }
 170
 171 /* Store half-width SIMD register b into half width aligned memory a */
 172 static gmx_inline void
 173 gmx_store_hpr(real *a, gmx_mm_hpr b)
 174 {
 175     int i;
 176
 177     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 178     {
 179         a[i] = b.r[i];
 180     }
 181 }
 182
 183 static gmx_inline gmx_mm_hpr
 184 gmx_add_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
 185 {
 186     gmx_mm_hpr c;
 187     int        i;
 188
 189     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 190     {
 191         c.r[i] = a.r[i] + b.r[i];
 192     }
 193
 194     return c;
 195 }
 196
 197 static gmx_inline gmx_mm_hpr
 198 gmx_sub_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
 199 {
 200     gmx_mm_hpr c;
 201     int        i;
 202
 203     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 204     {
 205         c.r[i] = a.r[i] - b.r[i];
 206     }
 207
 208     return c;
 209 }
 210
 211 /* Sum over 4 half SIMD registers */
 212 static gmx_inline gmx_mm_hpr
 213 gmx_sum4_hpr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
 214 {
 215     gmx_mm_hpr c;
 216     int        i;
 217
 218     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 219     {
 220         c.r[i] =
 221             a.r[i] +
 222             a.r[GMX_SIMD_WIDTH_HERE/2+i] +
 223             b.r[i] +
 224             b.r[GMX_SIMD_WIDTH_HERE/2+i];
 225     }
 226
 227     return c;
 228 }
 229
 230 /* Sum the elements of halfs of each input register and store sums in out */
 231 static gmx_inline gmx_mm_pr4
 232 gmx_mm_transpose_sum4h_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
 233 {
 234     gmx_mm_pr4 sum;
 235     int        i;
 236
 237     sum.r[0] = 0;
 238     sum.r[1] = 0;
 239     sum.r[2] = 0;
 240     sum.r[3] = 0;
 241
 242     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 243     {
 244         sum.r[0] += a.r[i];
 245         sum.r[1] += a.r[GMX_SIMD_WIDTH_HERE/2+i];
 246         sum.r[2] += b.r[i];
 247         sum.r[3] += b.r[GMX_SIMD_WIDTH_HERE/2+i];
 248     }
 249
 250     return sum;
 251 }
 252
 253 static gmx_inline void
 254 gmx_pr_to_2hpr(gmx_simd_ref_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
 255 {
 256     int i;
 257
 258     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 259     {
 260         b->r[i] = a.r[i];
 261         c->r[i] = a.r[GMX_SIMD_WIDTH_HERE/2 + i];
 262     }
 263 }
 264 static gmx_inline void
 265 gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_ref_pr *c)
 266 {
 267     int i;
 268
 269     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 270     {
 271         c->r[i]                         = a.r[i];
 272         c->r[GMX_SIMD_WIDTH_HERE/2 + i] = b.r[i];
 273     }
 274 }
 275
 276 #endif /* GMX_NBNXN_SIMD_2XNN */
 277
 278
 279 #ifndef TAB_FDV0
 280 static gmx_inline void
 281 load_table_f(const real *tab_coul_F, gmx_simd_ref_epi32 ti_S, int *ti,
 282              gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
 283 {
 284     int i;
 285
 286     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 287     {
 288         ctab0_S->r[i] = tab_coul_F[ti_S.r[i]];
 289         ctab1_S->r[i] = tab_coul_F[ti_S.r[i]+1];
 290     }
 291
 292     *ctab1_S  = gmx_sub_pr(*ctab1_S, *ctab0_S);
 293 }
 294
 295 static gmx_inline void
 296 load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
 297                gmx_simd_ref_epi32 ti_S, int *ti,
 298                gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
 299                gmx_simd_ref_pr *ctabv_S)
 300 {
 301     int i;
 302
 303     load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S);
 304
 305     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 306     {
 307         ctabv_S->r[i] = tab_coul_V[ti_S.r[i]];
 308     }
 309 }
 310 #endif
 311
 312 #ifdef TAB_FDV0
 313 static gmx_inline void
 314 load_table_f(const real *tab_coul_FDV0, gmx_simd_ref_epi32 ti_S, int *ti,
 315              gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
 316 {
 317     int i;
 318
 319     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 320     {
 321         ctab0_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4];
 322         ctab1_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+1];
 323     }
 324 }
 325
 326 static gmx_inline void
 327 load_table_f_v(const real *tab_coul_FDV0,
 328                gmx_simd_ref_epi32 ti_S, int *ti,
 329                gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
 330                gmx_simd_ref_pr *ctabv_S)
 331 {
 332     int i;
 333
 334     load_table_f(tab_coul_FDV0, ti_S, ti, ctab0_S, ctab1_S);
 335
 336     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 337     {
 338         ctabv_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+2];
 339     }
 340 }
 341 #endif
 342
 343 /* Sum the elements within each input register and store the sums in out.
 344  * Note that 4/8-way SIMD requires gmx_mm_transpose_sum4_pr instead.
 345  */
 346 #if GMX_SIMD_WIDTH_HERE == 2
 347 static gmx_inline gmx_simd_ref_pr
 348 gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1)
 349 {
 350     gmx_simd_ref_pr sum;
 351
 352     sum.r[0] = in0.r[0] + in0.r[1];
 353     sum.r[1] = in1.r[0] + in1.r[1];
 354
 355     return sum;
 356 }
 357 #endif
 358
 359 #if GMX_SIMD_WIDTH_HERE >= 4
 360 #if GMX_SIMD_WIDTH_HERE == 4
 361 static gmx_inline gmx_simd_ref_pr
 362 #else
 363 static gmx_inline gmx_mm_pr4
 364 #endif
 365 gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
 366                          gmx_simd_ref_pr in2, gmx_simd_ref_pr in3)
 367 {
 368 #if GMX_SIMD_WIDTH_HERE == 4
 369     gmx_simd_ref_pr sum;
 370 #else
 371     gmx_mm_pr4      sum;
 372 #endif
 373     int             i;
 374
 375     sum.r[0] = 0;
 376     sum.r[1] = 0;
 377     sum.r[2] = 0;
 378     sum.r[3] = 0;
 379
 380     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 381     {
 382         sum.r[0] += in0.r[i];
 383         sum.r[1] += in1.r[i];
 384         sum.r[2] += in2.r[i];
 385         sum.r[3] += in3.r[i];
 386     }
 387
 388     return sum;
 389 }
 390 #endif
 391
 392 #ifdef GMX_DOUBLE
 393 /* In double precision it can be faster to first calculate single precision
 394  * square roots for two double precision registers at once and then use
 395  * double precision Newton-Raphson iteration to reach full double precision.
 396  * For this reference code we just use a plain-C sqrt.
 397  */
 398 static gmx_inline void
 399 gmx_mm_invsqrt2_pd(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
 400                    gmx_simd_ref_pr *out0, gmx_simd_ref_pr *out1)
 401 {
 402     out0 = gmx_invsqrt_pr(in0);
 403     out1 = gmx_invsqrt_pr(in1);
 404 }
 405 #endif
 406
 407 static gmx_inline void
 408 load_lj_pair_params(const real *nbfp, const int *type, int aj,
 409                     gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
 410 {
 411     int i;
 412
 413     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 414     {
 415         c6_S->r[i]  = nbfp[type[aj+i]*nbfp_stride];
 416         c12_S->r[i] = nbfp[type[aj+i]*nbfp_stride+1];
 417     }
 418 }
 419
 420 #ifdef GMX_NBNXN_SIMD_2XNN
 421 static gmx_inline void
 422 load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
 423                      const int *type, int aj,
 424                      gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
 425 {
 426     int i;
 427
 428     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 429     {
 430         c6_S->r[i]                          = nbfp0[type[aj+i]*nbfp_stride];
 431         c6_S->r[GMX_SIMD_WIDTH_HERE/2 + i]  = nbfp1[type[aj+i]*nbfp_stride];
 432         c12_S->r[i]                         = nbfp0[type[aj+i]*nbfp_stride+1];
 433         c12_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*nbfp_stride+1];
 434     }
 435 }
 436 #endif
 437
 438 /* Code for handling loading exclusions and converting them into
 439    interactions. The x86 code might use either integer- or real-type
 440    SIMD, but the reference code does not need to know. */
 441
 442 #define gmx_load1_exclfilter(e)       gmx_simd_ref_load1_exclfilter(e)
 443 #define gmx_load_exclusion_filter(e)  gmx_simd_ref_load_exclusion_filter((int *) e)
 444 #define gmx_checkbitmask_pb(m0, m1)   gmx_simd_ref_checkbitmask_pb(m0, m1)
 445
 446 static gmx_inline gmx_simd_ref_exclfilter
 447 gmx_simd_ref_load1_exclfilter(int src)
 448 {
 449     gmx_simd_ref_exclfilter a;
 450     int                     i;
 451
 452     for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
 453     {
 454         a.r[i] = src;
 455     }
 456
 457     return a;
 458 }
 459
 460 static gmx_inline gmx_simd_ref_exclfilter
 461 gmx_simd_ref_load_exclusion_filter(const int *src)
 462 {
 463     gmx_simd_ref_exclfilter a;
 464     int                     i;
 465
 466     for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
 467     {
 468         a.r[i] = src[i];
 469     }
 470
 471     return a;
 472 }
 473
 474 /* For topology exclusion-pair checking we need: ((a & b) ? True :
 475  * False). The x86 implementations use hardware-suitable integer-
 476  * and/or real-valued SIMD operations and a bit-wise "and" to do
 477  * this. The reference implementation normally uses logical operations
 478  * for logic, but in this case the i- and j-atom exclusion masks
 479  * computed during searching expect to be combined with bit-wise
 480  * "and".
 481  *
 482  * If the same bit is set in both input masks, return TRUE, else
 483  * FALSE. This function is only called with a single bit set in b.
 484  */
 485 static gmx_inline gmx_simd_ref_pb
 486 gmx_simd_ref_checkbitmask_pb(gmx_simd_ref_exclfilter a, gmx_simd_ref_exclfilter b)
 487 {
 488     gmx_simd_ref_pb c;
 489     int             i;
 490
 491     for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
 492     {
 493         c.r[i] = ((a.r[i] & b.r[i]) != 0);
 494     }
 495
 496     return c;
 497 }
 498
 499 #endif /* _nbnxn_kernel_simd_utils_ref_h_ */