src/gromacs/simd/general_x86_mic.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2013, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef _general_x86_mic_h_
  37 #define _general_x86_mic_h_
  38
   39 /* This file contains the SIMD implementation for Intel MIC
   40  */
  41
  42 #include <math.h>
  43 #include <immintrin.h>
  44
   45 #ifdef GMX_DOUBLE /* every SIMD type defined in this header is single precision, so refuse double-precision builds */
   46 #error "Double precision isn't supported on Intel Phi yet"
   47 #endif
  48
   49 typedef __m512 gmx_mm_ps; /* SIMD register of packed single-precision floats */
   50 typedef __m512 gmx_mm_pr; /* SIMD register of reals; same underlying type as gmx_mm_ps in this header */
   51 /* boolean SIMD register type: a 16-bit mask value (__mmask16) */
   52 typedef __mmask16 gmx_mm_pb;
   53 typedef __m512i gmx_epi32; /* SIMD register used for 32-bit integer results, e.g. by gmx_cvttpr_epi32 */
  54
   55 #define GMX_HAVE_SIMD_MACROS /* NOTE(review): presumably tells including code that the gmx_*_pr SIMD macros are available - confirm against users of this flag */
   56 #define GMX_SIMD_WIDTH_HERE  16 /* matches the 16 floats held by a 512-bit __m512 register */
   57 #define GMX_SIMD_EPI32_WIDTH 16 /* matches the 16 32-bit integers held by a 512-bit __m512i register */
  58
   59 #define gmx_load_pr _mm512_load_ps /* load a full SIMD register from memory; the intrinsic expects a 64-byte aligned address */
  60
  61 /* Set all SIMD register elements to *r */
  62 static gmx_inline gmx_mm_ps
  63 gmx_load1_pr(const real *r)
  64 {
  65     return _mm512_extload_ps(r, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
  66 }
  67
   68 #define gmx_set1_pr _mm512_set1_ps /* set all SIMD register elements to the given value */
   69 /* Set all SIMD register elements to 0 */
   70 #define gmx_setzero_pr _mm512_setzero_ps
   71 #define gmx_store_pr _mm512_store_ps /* store a full SIMD register to memory; the intrinsic expects a 64-byte aligned address */
  72
   73 #define gmx_add_pr _mm512_add_ps /* element-wise a + b */
   74 #define gmx_sub_pr _mm512_sub_ps /* element-wise a - b */
   75 #define gmx_mul_pr _mm512_mul_ps /* element-wise a * b */
  76
   77 #define GMX_SIMD_HAVE_FMA /* the two macros below map directly onto fused multiply-add intrinsics */
   78 #define gmx_madd_pr _mm512_fmadd_ps /* (a, b, c) -> a*b + c */
   79 #define gmx_nmsub_pr _mm512_fnmadd_ps /* (a, b, c) -> -(a*b) + c */
  80
   81 #define gmx_max_pr _mm512_max_ps /* element-wise maximum of a and b */
  82
  83 static gmx_inline gmx_mm_ps
  84 gmx_blendzero_pr(gmx_mm_ps a, gmx_mm_pb b)
  85 {
  86     return _mm512_mask_mov_ps(_mm512_setzero_ps(), b, a);
  87 }
  88
   89 #define gmx_round_pr _mm512_rint_ps /* round each element to an integral value (result is still a float register) */
   90
   91 #define GMX_SIMD_HAVE_FLOOR /* a SIMD floor is provided by gmx_floor_pr below */
   92 #define gmx_floor_pr _mm512_floor_ps /* round each element down to an integral value */
  93
  94 /* Copy the sign of a to b, assumes b >= 0 for efficiency */
  95 static gmx_inline gmx_mm_ps
  96 gmx_cpsgn_nonneg_pr(gmx_mm_ps a, gmx_mm_ps b)
  97 {
  98     __m512 zero = _mm512_setzero_ps();
  99     __m512 neg1 = _mm512_set1_ps(-1);
 100     /* TODO (only bond): Bitwise operations on floating points can be done after casting to int.
 101        That allows us to do it the same way as AVX which might be faster. */
 102     return _mm512_mask_mul_ps(b, _mm512_cmplt_ps_mask(a, zero), b, neg1);
 103 }
 104
 105 /* Very specific operation required in the non-bonded kernels */
 106 static gmx_inline gmx_mm_ps
 107 gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_ps b, gmx_mm_ps c)
 108 {
 109     return _mm512_mask_add_ps(b, _mm512_knot(a), b, c);
 110 }
 111
  112 /* Comparison: element-wise a < b, the result is a SIMD boolean (mask) */
  113 #define gmx_cmplt_pr _mm512_cmplt_ps_mask
  114
  115 /* Logical AND on SIMD booleans. */
  116 #define gmx_and_pb _mm512_kand
  117
  118 /* Logical OR on SIMD booleans. */
  119 #define gmx_or_pb _mm512_kor
 120
  121 /* Returns a single int that is non-zero if any of the booleans is True.
  122    The value is the full mask converted to int, not just 0/1; since any non-zero value counts as True this is OK. */
  123 #define gmx_anytrue_pb _mm512_mask2int
 124
 125 /* Conversions only used for PME table lookup */
 126 static gmx_inline gmx_epi32
 127 gmx_cvttpr_epi32(gmx_mm_ps a)
 128 {
 129     return _mm512_cvtfxpnt_round_adjustps_epi32(a, _MM_ROUND_MODE_DOWN, _MM_EXPADJ_NONE);
 130 };
 131
  132 /* These two functions only need to be approximate, Newton-Raphson iteration
  133  * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
  134  */
  135 #define gmx_rsqrt_pr _mm512_rsqrt23_ps /* approximate 1/sqrt(x) per element */
  136 #define gmx_rcp_pr _mm512_rcp23_ps /* approximate 1/x per element */
 137
  138 #define GMX_SIMD_HAVE_EXP /* a SIMD exponential is provided by gmx_exp_pr below */
  139 #define gmx_exp_pr _mm512_exp_ps /* element-wise exp(x) */
  140 #define gmx_erfc_pr _mm512_erfc_ps /* element-wise complementary error function */
 141
  142 #define GMX_SIMD_HAVE_TRIGONOMETRIC /* NOTE(review): presumably advertises the SIMD math functions defined after this point (sqrt, sincos, acos, atan2) - confirm against users of this flag */
  143 #define gmx_sqrt_pr  _mm512_sqrt_ps /* element-wise square root */
 144
 145 static gmx_inline int
 146 gmx_sincos_pr(gmx_mm_ps a,
 147               gmx_mm_ps *s, gmx_mm_ps *c)
 148 {
 149     /* TODO (only bond): optimize that both are calculated together.
 150        Or (if if that isn't fast on MIC) don't call sincos if only one is needed. */
 151     *s = _mm512_sin_ps(a);
 152     *c = _mm512_cos_ps(a);
 153     return 0;
 154 }
 155
  156 #define gmx_acos_pr _mm512_acos_ps /* element-wise arc cosine */
  157 #define gmx_atan2_pr _mm512_atan2_ps /* element-wise two-argument arc tangent */
 158
 159 #endif /* _general_x86_mic_h_ */