src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 #ifndef _nbnxn_kernel_simd_utils_ref_h_
  36 #define _nbnxn_kernel_simd_utils_ref_h_
  37
  38 typedef gmx_simd_ref_epi32      gmx_simd_ref_exclfilter;
  39 typedef gmx_simd_ref_exclfilter gmx_exclfilter;
  40 static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
  41
  42 /* Set the stride for the lookup of the two LJ parameters from their
  43    (padded) array. Only strides of 2 and 4 are currently supported. */
  44 #if defined GMX_NBNXN_SIMD_2XNN
  45 static const int nbfp_stride = 4;
  46 #elif defined GMX_DOUBLE
  47 static const int nbfp_stride = 2;
  48 #else
  49 static const int nbfp_stride = 4;
  50 #endif
  51
  52 #if GMX_SIMD_WIDTH_HERE > 4
  53 /* The 4xn kernel operates on 4-wide i-force registers */
  54
  55 /* float/double SIMD register type */
  56 typedef struct {
  57     real r[4];
  58 } gmx_mm_pr4;
  59
  60 static gmx_inline gmx_mm_pr4
  61 gmx_load_pr4(const real *r)
  62 {
  63     gmx_mm_pr4 a;
  64     int        i;
  65
  66     for (i = 0; i < 4; i++)
  67     {
  68         a.r[i] = r[i];
  69     }
  70
  71     return a;
  72 }
  73
  74 static gmx_inline void
  75 gmx_store_pr4(real *dest, gmx_mm_pr4 src)
  76 {
  77     gmx_mm_pr4 a;
  78     int        i;
  79
  80     for (i = 0; i < 4; i++)
  81     {
  82         dest[i] = src.r[i];
  83     }
  84 }
  85
  86 static gmx_inline gmx_mm_pr4
  87 gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b)
  88 {
  89     gmx_mm_pr4 c;
  90     int        i;
  91
  92     for (i = 0; i < 4; i++)
  93     {
  94         c.r[i] = a.r[i] + b.r[i];
  95     }
  96
  97     return c;
  98 }
  99 #else
 100
 101 typedef gmx_simd_ref_pr gmx_mm_pr4;
 102
 103 #define gmx_load_pr4   gmx_load_pr
 104 #define gmx_store_pr4  gmx_store_pr
 105 #define gmx_add_pr4    gmx_add_pr
 106
 107 #endif
 108
 109
 110 #ifdef GMX_NBNXN_SIMD_2XNN
 111
 112 /* Half-width operations are required for the 2xnn kernels */
 113
 114 /* Half-width SIMD real type */
 115 /* float/double SIMD register type */
 116 typedef struct {
 117     real r[GMX_SIMD_WIDTH_HERE/2];
 118 } gmx_mm_hpr;
 119
 120 /* Half-width SIMD operations */
 121
 122 /* Load reals at half-width aligned pointer b into half-width SIMD register a */
 123 static gmx_inline void
 124 gmx_load_hpr(gmx_mm_hpr *a, const real *b)
 125 {
 126     int i;
 127
 128     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 129     {
 130         a->r[i] = b[i];
 131     }
 132 }
 133
 134 /* Set all entries in half-width SIMD register *a to b */
 135 static gmx_inline void
 136 gmx_set1_hpr(gmx_mm_hpr *a, real b)
 137 {
 138     int i;
 139
 140     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 141     {
 142         a->r[i] = b;
 143     }
 144 }
 145
 146 /* Load one real at b and one real at b+1 into halves of a, respectively */
 147 static gmx_inline void
 148 gmx_load1p1_pr(gmx_simd_ref_pr *a, const real *b)
 149 {
 150     int i;
 151
 152     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 153     {
 154         a->r[                        i] = b[0];
 155         a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[1];
 156     }
 157 }
 158
 159 /* Load reals at half-width aligned pointer b into two halves of a */
 160 static gmx_inline void
 161 gmx_loaddh_pr(gmx_simd_ref_pr *a, const real *b)
 162 {
 163     int i;
 164
 165     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 166     {
 167         a->r[i]                         = b[i];
 168         a->r[GMX_SIMD_WIDTH_HERE/2 + i] = b[i];
 169     }
 170 }
 171
 172 /* Store half-width SIMD register b into half width aligned memory a */
 173 static gmx_inline void
 174 gmx_store_hpr(real *a, gmx_mm_hpr b)
 175 {
 176     int i;
 177
 178     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 179     {
 180         a[i] = b.r[i];
 181     }
 182 }
 183
 184 static gmx_inline gmx_mm_hpr
 185 gmx_add_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
 186 {
 187     gmx_mm_hpr c;
 188     int        i;
 189
 190     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 191     {
 192         c.r[i] = a.r[i] + b.r[i];
 193     }
 194
 195     return c;
 196 }
 197
 198 static gmx_inline gmx_mm_hpr
 199 gmx_sub_hpr(gmx_mm_hpr a, gmx_mm_hpr b)
 200 {
 201     gmx_mm_hpr c;
 202     int        i;
 203
 204     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 205     {
 206         c.r[i] = a.r[i] - b.r[i];
 207     }
 208
 209     return c;
 210 }
 211
 212 /* Sum over 4 half SIMD registers */
 213 static gmx_inline gmx_mm_hpr
 214 gmx_sum4_hpr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
 215 {
 216     gmx_mm_hpr c;
 217     int        i;
 218
 219     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 220     {
 221         c.r[i] =
 222             a.r[i] +
 223             a.r[GMX_SIMD_WIDTH_HERE/2+i] +
 224             b.r[i] +
 225             b.r[GMX_SIMD_WIDTH_HERE/2+i];
 226     }
 227
 228     return c;
 229 }
 230
 231 #ifdef GMX_NBNXN_SIMD_2XNN
 232 /* Sum the elements of halfs of each input register and store sums in out */
 233 static gmx_inline gmx_mm_pr4
 234 gmx_mm_transpose_sum4h_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b)
 235 {
 236     gmx_mm_pr4 sum;
 237     int        i;
 238
 239     sum.r[0] = 0;
 240     sum.r[1] = 0;
 241     sum.r[2] = 0;
 242     sum.r[3] = 0;
 243
 244     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 245     {
 246         sum.r[0] += a.r[i];
 247         sum.r[1] += a.r[GMX_SIMD_WIDTH_HERE/2+i];
 248         sum.r[2] += b.r[i];
 249         sum.r[3] += b.r[GMX_SIMD_WIDTH_HERE/2+i];
 250     }
 251
 252     return sum;
 253 }
 254 #endif
 255
 256 static gmx_inline void
 257 gmx_pr_to_2hpr(gmx_simd_ref_pr a, gmx_mm_hpr *b, gmx_mm_hpr *c)
 258 {
 259     int i;
 260
 261     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 262     {
 263         b->r[i] = a.r[i];
 264         c->r[i] = a.r[GMX_SIMD_WIDTH_HERE/2 + i];
 265     }
 266 }
 267 static gmx_inline void
 268 gmx_2hpr_to_pr(gmx_mm_hpr a, gmx_mm_hpr b, gmx_simd_ref_pr *c)
 269 {
 270     int i;
 271
 272     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 273     {
 274         c->r[i]                         = a.r[i];
 275         c->r[GMX_SIMD_WIDTH_HERE/2 + i] = b.r[i];
 276     }
 277 }
 278
 279 #endif /* GMX_NBNXN_SIMD_2XNN */
 280
 281
 282 #ifndef TAB_FDV0
 283 static gmx_inline void
 284 load_table_f(const real *tab_coul_F, gmx_simd_ref_epi32 ti_S,
 285              int gmx_unused *ti,
 286              gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
 287 {
 288     int i;
 289
 290     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 291     {
 292         ctab0_S->r[i] = tab_coul_F[ti_S.r[i]];
 293         ctab1_S->r[i] = tab_coul_F[ti_S.r[i]+1];
 294     }
 295
 296     *ctab1_S  = gmx_sub_pr(*ctab1_S, *ctab0_S);
 297 }
 298
 299 static gmx_inline void
 300 load_table_f_v(const real *tab_coul_F, const real *tab_coul_V,
 301                gmx_simd_ref_epi32 ti_S, int *ti,
 302                gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
 303                gmx_simd_ref_pr *ctabv_S)
 304 {
 305     int i;
 306
 307     load_table_f(tab_coul_F, ti_S, ti, ctab0_S, ctab1_S);
 308
 309     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 310     {
 311         ctabv_S->r[i] = tab_coul_V[ti_S.r[i]];
 312     }
 313 }
 314 #endif
 315
 316 #ifdef TAB_FDV0
 317 static gmx_inline void
 318 load_table_f(const real *tab_coul_FDV0, gmx_simd_ref_epi32 ti_S, int *ti,
 319              gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S)
 320 {
 321     int i;
 322
 323     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 324     {
 325         ctab0_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4];
 326         ctab1_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+1];
 327     }
 328 }
 329
 330 static gmx_inline void
 331 load_table_f_v(const real *tab_coul_FDV0,
 332                gmx_simd_ref_epi32 ti_S, int *ti,
 333                gmx_simd_ref_pr *ctab0_S, gmx_simd_ref_pr *ctab1_S,
 334                gmx_simd_ref_pr *ctabv_S)
 335 {
 336     int i;
 337
 338     load_table_f(tab_coul_FDV0, ti_S, ti, ctab0_S, ctab1_S);
 339
 340     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 341     {
 342         ctabv_S->r[i] = tab_coul_FDV0[ti_S.r[i]*4+2];
 343     }
 344 }
 345 #endif
 346
 347 /* Sum the elements within each input register and store the sums in out.
 348  * Note that 4/8-way SIMD requires gmx_mm_transpose_sum4_pr instead.
 349  */
 350 #if GMX_SIMD_WIDTH_HERE == 2
 351 static gmx_inline gmx_simd_ref_pr
 352 gmx_mm_transpose_sum2_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1)
 353 {
 354     gmx_simd_ref_pr sum;
 355
 356     sum.r[0] = in0.r[0] + in0.r[1];
 357     sum.r[1] = in1.r[0] + in1.r[1];
 358
 359     return sum;
 360 }
 361 #endif
 362
 363 #if GMX_SIMD_WIDTH_HERE >= 4
 364 #if GMX_SIMD_WIDTH_HERE == 4
 365 static gmx_inline gmx_simd_ref_pr
 366 #else
 367 static gmx_inline gmx_mm_pr4
 368 #endif
 369 gmx_mm_transpose_sum4_pr(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
 370                          gmx_simd_ref_pr in2, gmx_simd_ref_pr in3)
 371 {
 372 #if GMX_SIMD_WIDTH_HERE == 4
 373     gmx_simd_ref_pr sum;
 374 #else
 375     gmx_mm_pr4      sum;
 376 #endif
 377     int             i;
 378
 379     sum.r[0] = 0;
 380     sum.r[1] = 0;
 381     sum.r[2] = 0;
 382     sum.r[3] = 0;
 383
 384     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 385     {
 386         sum.r[0] += in0.r[i];
 387         sum.r[1] += in1.r[i];
 388         sum.r[2] += in2.r[i];
 389         sum.r[3] += in3.r[i];
 390     }
 391
 392     return sum;
 393 }
 394 #endif
 395
 396 #ifdef GMX_DOUBLE
 397 /* In double precision it can be faster to first calculate single precision
 398  * square roots for two double precision registers at once and then use
 399  * double precision Newton-Raphson iteration to reach full double precision.
 400  * For this reference code we just use a plain-C sqrt.
 401  */
 402 static gmx_inline void
 403 gmx_mm_invsqrt2_pd(gmx_simd_ref_pr in0, gmx_simd_ref_pr in1,
 404                    gmx_simd_ref_pr *out0, gmx_simd_ref_pr *out1)
 405 {
 406     *out0 = gmx_invsqrt_pr(in0);
 407     *out1 = gmx_invsqrt_pr(in1);
 408 }
 409 #endif
 410
 411 static gmx_inline void
 412 load_lj_pair_params(const real *nbfp, const int *type, int aj,
 413                     gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
 414 {
 415     int i;
 416
 417     for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
 418     {
 419         c6_S->r[i]  = nbfp[type[aj+i]*nbfp_stride];
 420         c12_S->r[i] = nbfp[type[aj+i]*nbfp_stride+1];
 421     }
 422 }
 423
 424 #ifdef GMX_NBNXN_SIMD_2XNN
 425 static gmx_inline void
 426 load_lj_pair_params2(const real *nbfp0, const real *nbfp1,
 427                      const int *type, int aj,
 428                      gmx_simd_ref_pr *c6_S, gmx_simd_ref_pr *c12_S)
 429 {
 430     int i;
 431
 432     for (i = 0; i < GMX_SIMD_WIDTH_HERE/2; i++)
 433     {
 434         c6_S->r[i]                          = nbfp0[type[aj+i]*nbfp_stride];
 435         c6_S->r[GMX_SIMD_WIDTH_HERE/2 + i]  = nbfp1[type[aj+i]*nbfp_stride];
 436         c12_S->r[i]                         = nbfp0[type[aj+i]*nbfp_stride+1];
 437         c12_S->r[GMX_SIMD_WIDTH_HERE/2 + i] = nbfp1[type[aj+i]*nbfp_stride+1];
 438     }
 439 }
 440 #endif
 441
 442 /* Code for handling loading exclusions and converting them into
 443    interactions. The x86 code might use either integer- or real-type
 444    SIMD, but the reference code does not need to know. */
 445
 446 #define gmx_load1_exclfilter(e)       gmx_simd_ref_load1_exclfilter(e)
 447 #define gmx_load_exclusion_filter(e)  gmx_simd_ref_load_exclusion_filter((int *) e)
 448 #define gmx_checkbitmask_pb(m0, m1)   gmx_simd_ref_checkbitmask_pb(m0, m1)
 449
 450 static gmx_inline gmx_simd_ref_exclfilter
 451 gmx_simd_ref_load1_exclfilter(int src)
 452 {
 453     gmx_simd_ref_exclfilter a;
 454     int                     i;
 455
 456     for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
 457     {
 458         a.r[i] = src;
 459     }
 460
 461     return a;
 462 }
 463
 464 static gmx_inline gmx_simd_ref_exclfilter
 465 gmx_simd_ref_load_exclusion_filter(const int *src)
 466 {
 467     gmx_simd_ref_exclfilter a;
 468     int                     i;
 469
 470     for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
 471     {
 472         a.r[i] = src[i];
 473     }
 474
 475     return a;
 476 }
 477
 478 /* For topology exclusion-pair checking we need: ((a & b) ? True :
 479  * False). The x86 implementations use hardware-suitable integer-
 480  * and/or real-valued SIMD operations and a bit-wise "and" to do
 481  * this. The reference implementation normally uses logical operations
 482  * for logic, but in this case the i- and j-atom exclusion masks
 483  * computed during searching expect to be combined with bit-wise
 484  * "and".
 485  *
 486  * If the same bit is set in both input masks, return TRUE, else
 487  * FALSE. This function is only called with a single bit set in b.
 488  */
 489 static gmx_inline gmx_simd_ref_pb
 490 gmx_simd_ref_checkbitmask_pb(gmx_simd_ref_exclfilter a, gmx_simd_ref_exclfilter b)
 491 {
 492     gmx_simd_ref_pb c;
 493     int             i;
 494
 495     for (i = 0; i < GMX_SIMD_REF_WIDTH; i++)
 496     {
 497         c.r[i] = ((a.r[i] & b.r[i]) != 0);
 498     }
 499
 500     return c;
 501 }
 502
 503 #endif /* _nbnxn_kernel_simd_utils_ref_h_ */