Apply clang-format to source tree

[alexxy/gromacs.git] / src / gromacs / nbnxm / kernels_simd_2xmm / kernel_inner.h
diff --git a/src/gromacs/nbnxm/kernels_simd_2xmm/kernel_inner.h b/src/gromacs/nbnxm/kernels_simd_2xmm/kernel_inner.h

index d4a11e64d306b1867bcec01a4621608c593ef4d7..f109430c019fd62c96c199ed4362454ae984ea7c 100644 (file)
--- a/src/gromacs/nbnxm/kernels_simd_2xmm/kernel_inner.h
+++ b/src/gromacs/nbnxm/kernels_simd_2xmm/kernel_inner.h
@@ -59,232 +59,231 @@
   * contributions.
   */
  #if defined CHECK_EXCLS && (defined CALC_COULOMB || defined LJ_EWALD_GEOM)
-#define EXCL_FORCES
+#    define EXCL_FORCES
  #endif
  
  {
-    int        cj, aj, ajx, ajy, ajz;
+    int cj, aj, ajx, ajy, ajz;
  
  #ifdef ENERGY_GROUPS
      /* Energy group indices for two atoms packed into one int */
-    int        egp_jj[UNROLLJ/2];
+    int egp_jj[UNROLLJ / 2];
  #endif
  
  #ifdef CHECK_EXCLS
      /* Interaction (non-exclusion) mask of all 1's or 0's */
-    SimdBool  interact_S0;
-    SimdBool  interact_S2;
+    SimdBool interact_S0;
+    SimdBool interact_S2;
  #endif
  
-    SimdReal  jx_S, jy_S, jz_S;
-    SimdReal  dx_S0, dy_S0, dz_S0;
-    SimdReal  dx_S2, dy_S2, dz_S2;
-    SimdReal  tx_S0, ty_S0, tz_S0;
-    SimdReal  tx_S2, ty_S2, tz_S2;
-    SimdReal  rsq_S0, rinv_S0, rinvsq_S0;
-    SimdReal  rsq_S2, rinv_S2, rinvsq_S2;
+    SimdReal jx_S, jy_S, jz_S;
+    SimdReal dx_S0, dy_S0, dz_S0;
+    SimdReal dx_S2, dy_S2, dz_S2;
+    SimdReal tx_S0, ty_S0, tz_S0;
+    SimdReal tx_S2, ty_S2, tz_S2;
+    SimdReal rsq_S0, rinv_S0, rinvsq_S0;
+    SimdReal rsq_S2, rinv_S2, rinvsq_S2;
      /* wco: within cut-off, mask of all 1's or 0's */
-    SimdBool  wco_S0;
-    SimdBool  wco_S2;
+    SimdBool wco_S0;
+    SimdBool wco_S2;
  #ifdef VDW_CUTOFF_CHECK
-    SimdBool  wco_vdw_S0;
-#ifndef HALF_LJ
-    SimdBool  wco_vdw_S2;
-#endif
+    SimdBool wco_vdw_S0;
+#    ifndef HALF_LJ
+    SimdBool wco_vdw_S2;
+#    endif
  #endif
  
  #if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
-    SimdReal r_S0;
-#if (defined CALC_COULOMB && defined CALC_COUL_TAB) || !defined HALF_LJ
-    SimdReal r_S2;
-#endif
+    SimdReal                                                        r_S0;
+#    if (defined CALC_COULOMB && defined CALC_COUL_TAB) || !defined HALF_LJ
+    SimdReal                                                        r_S2;
+#    endif
  #endif
  
  #if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
-    SimdReal  rsw_S0, rsw2_S0;
-#ifndef HALF_LJ
-    SimdReal  rsw_S2, rsw2_S2;
-#endif
+    SimdReal rsw_S0, rsw2_S0;
+#    ifndef HALF_LJ
+    SimdReal rsw_S2, rsw2_S2;
+#    endif
  #endif
  
  #ifdef CALC_COULOMB
-#ifdef CHECK_EXCLS
+#    ifdef CHECK_EXCLS
      /* 1/r masked with the interaction mask */
-    SimdReal  rinv_ex_S0;
-    SimdReal  rinv_ex_S2;
-#endif
-    SimdReal  jq_S;
-    SimdReal  qq_S0;
-    SimdReal  qq_S2;
-#ifdef CALC_COUL_TAB
+    SimdReal rinv_ex_S0;
+    SimdReal rinv_ex_S2;
+#    endif
+    SimdReal jq_S;
+    SimdReal qq_S0;
+    SimdReal qq_S2;
+#    ifdef CALC_COUL_TAB
      /* The force (PME mesh force) we need to subtract from 1/r^2 */
-    SimdReal  fsub_S0;
-    SimdReal  fsub_S2;
-#endif
-#ifdef CALC_COUL_EWALD
-    SimdReal  brsq_S0, brsq_S2;
-    SimdReal  ewcorr_S0, ewcorr_S2;
-#endif
+    SimdReal fsub_S0;
+    SimdReal fsub_S2;
+#    endif
+#    ifdef CALC_COUL_EWALD
+    SimdReal brsq_S0, brsq_S2;
+    SimdReal ewcorr_S0, ewcorr_S2;
+#    endif
  
      /* frcoul = (1/r - fsub)*r */
-    SimdReal  frcoul_S0;
-    SimdReal  frcoul_S2;
-#ifdef CALC_COUL_TAB
+    SimdReal frcoul_S0;
+    SimdReal frcoul_S2;
+#    ifdef CALC_COUL_TAB
      /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
-    SimdReal         rs_S0, rf_S0, frac_S0;
-    SimdReal         rs_S2, rf_S2, frac_S2;
+    SimdReal rs_S0, rf_S0, frac_S0;
+    SimdReal rs_S2, rf_S2, frac_S2;
      /* Table index: rs truncated to an int */
-    SimdInt32        ti_S0, ti_S2;
+    SimdInt32 ti_S0, ti_S2;
      /* Linear force table values */
-    SimdReal         ctab0_S0, ctab1_S0;
-    SimdReal         ctab0_S2, ctab1_S2;
-#ifdef CALC_ENERGIES
+    SimdReal ctab0_S0, ctab1_S0;
+    SimdReal ctab0_S2, ctab1_S2;
+#        ifdef CALC_ENERGIES
      /* Quadratic energy table value */
-    SimdReal  ctabv_S0, dum_S0;
-    SimdReal  ctabv_S2, dum_S2;
-#endif
-#endif
-#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+    SimdReal ctabv_S0, dum_S0;
+    SimdReal ctabv_S2, dum_S2;
+#        endif
+#    endif
+#    if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
      /* The potential (PME mesh) we need to subtract from 1/r */
-    SimdReal  vc_sub_S0;
-    SimdReal  vc_sub_S2;
-#endif
-#ifdef CALC_ENERGIES
+    SimdReal vc_sub_S0;
+    SimdReal vc_sub_S2;
+#    endif
+#    ifdef CALC_ENERGIES
      /* Electrostatic potential */
-    SimdReal  vcoul_S0;
-    SimdReal  vcoul_S2;
-#endif
+    SimdReal vcoul_S0;
+    SimdReal vcoul_S2;
+#    endif
  #endif
      /* The force times 1/r */
-    SimdReal  fscal_S0;
-    SimdReal  fscal_S2;
+    SimdReal fscal_S0;
+    SimdReal fscal_S2;
  
  #ifdef CALC_LJ
-#ifdef LJ_COMB_LB
+#    ifdef LJ_COMB_LB
      /* LJ sigma_j/2 and sqrt(epsilon_j) */
-    SimdReal  hsig_j_S, seps_j_S;
+    SimdReal hsig_j_S, seps_j_S;
      /* LJ sigma_ij and epsilon_ij */
-    SimdReal  sig_S0, eps_S0;
-#ifndef HALF_LJ
-    SimdReal  sig_S2, eps_S2;
-#endif
-#ifdef CALC_ENERGIES
-    SimdReal  sig2_S0, sig6_S0;
-#ifndef HALF_LJ
-    SimdReal  sig2_S2, sig6_S2;
-#endif
-#endif /* LJ_COMB_LB */
-#endif /* CALC_LJ */
-
-#ifdef LJ_COMB_GEOM
-    SimdReal  c6s_j_S, c12s_j_S;
-#endif
-
-#if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
+    SimdReal sig_S0, eps_S0;
+#        ifndef HALF_LJ
+    SimdReal sig_S2, eps_S2;
+#        endif
+#        ifdef CALC_ENERGIES
+    SimdReal sig2_S0, sig6_S0;
+#            ifndef HALF_LJ
+    SimdReal sig2_S2, sig6_S2;
+#            endif
+#        endif /* LJ_COMB_LB */
+#    endif     /* CALC_LJ */
+
+#    ifdef LJ_COMB_GEOM
+    SimdReal c6s_j_S, c12s_j_S;
+#    endif
+
+#    if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
      /* Index for loading LJ parameters, complicated when interleaving */
-    int         aj2;
-#endif
+    int aj2;
+#    endif
  
      /* Intermediate variables for LJ calculation */
-#ifndef LJ_COMB_LB
-    SimdReal  rinvsix_S0;
-#ifndef HALF_LJ
-    SimdReal  rinvsix_S2;
-#endif
-#endif
-#ifdef LJ_COMB_LB
-    SimdReal  sir_S0, sir2_S0, sir6_S0;
-#ifndef HALF_LJ
-    SimdReal  sir_S2, sir2_S2, sir6_S2;
-#endif
-#endif
-
-    SimdReal  FrLJ6_S0, FrLJ12_S0, frLJ_S0;
-#ifndef HALF_LJ
-    SimdReal  FrLJ6_S2, FrLJ12_S2, frLJ_S2;
-#endif
+#    ifndef LJ_COMB_LB
+    SimdReal rinvsix_S0;
+#        ifndef HALF_LJ
+    SimdReal rinvsix_S2;
+#        endif
+#    endif
+#    ifdef LJ_COMB_LB
+    SimdReal sir_S0, sir2_S0, sir6_S0;
+#        ifndef HALF_LJ
+    SimdReal sir_S2, sir2_S2, sir6_S2;
+#        endif
+#    endif
+
+    SimdReal FrLJ6_S0, FrLJ12_S0, frLJ_S0;
+#    ifndef HALF_LJ
+    SimdReal FrLJ6_S2, FrLJ12_S2, frLJ_S2;
+#    endif
  #endif /* CALC_LJ */
  
      /* j-cluster index */
-    cj            = l_cj[cjind].cj;
+    cj = l_cj[cjind].cj;
  
      /* Atom indices (of the first atom in the cluster) */
-    aj            = cj*UNROLLJ;
+    aj = cj * UNROLLJ;
  #if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM)
-    aj2           = aj*2;
+    aj2 = aj * 2;
  #endif
-    ajx           = aj*DIM;
-    ajy           = ajx + STRIDE;
-    ajz           = ajy + STRIDE;
+    ajx = aj * DIM;
+    ajy = ajx + STRIDE;
+    ajz = ajy + STRIDE;
  
  #ifdef CHECK_EXCLS
-    gmx_load_simd_2xnn_interactions(static_cast<int>(l_cj[cjind].excl),
-                                    filter_S0, filter_S2,
+    gmx_load_simd_2xnn_interactions(static_cast<int>(l_cj[cjind].excl), filter_S0, filter_S2,
                                      &interact_S0, &interact_S2);
  #endif /* CHECK_EXCLS */
  
      /* load j atom coordinates */
-    jx_S = loadDuplicateHsimd(x+ajx);
-    jy_S = loadDuplicateHsimd(x+ajy);
-    jz_S = loadDuplicateHsimd(x+ajz);
+    jx_S = loadDuplicateHsimd(x + ajx);
+    jy_S = loadDuplicateHsimd(x + ajy);
+    jz_S = loadDuplicateHsimd(x + ajz);
  
      /* Calculate distance */
-    dx_S0       = ix_S0 - jx_S;
-    dy_S0       = iy_S0 - jy_S;
-    dz_S0       = iz_S0 - jz_S;
-    dx_S2       = ix_S2 - jx_S;
-    dy_S2       = iy_S2 - jy_S;
-    dz_S2       = iz_S2 - jz_S;
+    dx_S0 = ix_S0 - jx_S;
+    dy_S0 = iy_S0 - jy_S;
+    dz_S0 = iz_S0 - jz_S;
+    dx_S2 = ix_S2 - jx_S;
+    dy_S2 = iy_S2 - jy_S;
+    dz_S2 = iz_S2 - jz_S;
  
      /* rsq = dx*dx+dy*dy+dz*dz */
-    rsq_S0      = norm2(dx_S0, dy_S0, dz_S0);
-    rsq_S2      = norm2(dx_S2, dy_S2, dz_S2);
+    rsq_S0 = norm2(dx_S0, dy_S0, dz_S0);
+    rsq_S2 = norm2(dx_S2, dy_S2, dz_S2);
  
      /* Do the cut-off check */
-    wco_S0      = (rsq_S0 < rc2_S);
-    wco_S2      = (rsq_S2 < rc2_S);
+    wco_S0 = (rsq_S0 < rc2_S);
+    wco_S2 = (rsq_S2 < rc2_S);
  
  #ifdef CHECK_EXCLS
-#ifdef EXCL_FORCES
+#    ifdef EXCL_FORCES
      /* Only remove the (sub-)diagonal to avoid double counting */
-#if UNROLLJ == UNROLLI
+#        if UNROLLJ == UNROLLI
      if (cj == ci_sh)
      {
-        wco_S0  = wco_S0 && diagonal_mask_S0;
-        wco_S2  = wco_S2 && diagonal_mask_S2;
+        wco_S0 = wco_S0 && diagonal_mask_S0;
+        wco_S2 = wco_S2 && diagonal_mask_S2;
      }
-#else
-#if UNROLLJ == 2*UNROLLI
-    if (cj*2 == ci_sh)
+#        else
+#            if UNROLLJ == 2 * UNROLLI
+    if (cj * 2 == ci_sh)
      {
-        wco_S0  = wco_S0 && diagonal_mask0_S0;
-        wco_S2  = wco_S2 && diagonal_mask0_S2;
+        wco_S0 = wco_S0 && diagonal_mask0_S0;
+        wco_S2 = wco_S2 && diagonal_mask0_S2;
      }
-    else if (cj*2 + 1 == ci_sh)
+    else if (cj * 2 + 1 == ci_sh)
      {
-        wco_S0  = wco_S0 && diagonal_mask1_S0;
-        wco_S2  = wco_S2 && diagonal_mask1_S2;
+        wco_S0 = wco_S0 && diagonal_mask1_S0;
+        wco_S2 = wco_S2 && diagonal_mask1_S2;
      }
-#else
-#error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
-#endif
-#endif
-#else /* EXCL_FORCES */
-      /* No exclusion forces: remove all excluded atom pairs from the list */
-    wco_S0      = wco_S0 && interact_S0;
-    wco_S2      = wco_S2 && interact_S2;
-#endif
+#            else
+#                error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
+#            endif
+#        endif
+#    else /* EXCL_FORCES */
+    /* No exclusion forces: remove all excluded atom pairs from the list */
+    wco_S0 = wco_S0 && interact_S0;
+    wco_S2 = wco_S2 && interact_S2;
+#    endif
  #endif
  
  #ifdef COUNT_PAIRS
      {
-        int  i, j;
-        alignas(GMX_SIMD_ALIGNMENT) real  tmp[GMX_SIMD_REAL_WIDTH];
+        int                              i, j;
+        alignas(GMX_SIMD_ALIGNMENT) real tmp[GMX_SIMD_REAL_WIDTH];
  
          for (i = 0; i < UNROLLI; i += 2)
          {
              store(tmp, rc2_S - (i == 0 ? rsq_S0 : rsq_S2));
-            for (j = 0; j < 2*UNROLLJ; j++)
+            for (j = 0; j < 2 * UNROLLJ; j++)
              {
                  if (tmp[j] >= 0)
                  {
@@ -297,63 +296,63 @@
  
      // Ensure the distances do not fall below the limit where r^-12 overflows.
      // This should never happen for normal interactions.
-    rsq_S0      = max(rsq_S0, minRsq_S);
-    rsq_S2      = max(rsq_S2, minRsq_S);
+    rsq_S0 = max(rsq_S0, minRsq_S);
+    rsq_S2 = max(rsq_S2, minRsq_S);
  
      /* Calculate 1/r */
-    rinv_S0     = invsqrt(rsq_S0);
-    rinv_S2     = invsqrt(rsq_S2);
+    rinv_S0 = invsqrt(rsq_S0);
+    rinv_S2 = invsqrt(rsq_S2);
  
  #ifdef CALC_COULOMB
      /* Load parameters for j atom */
-    jq_S        = loadDuplicateHsimd(q+aj);
-    qq_S0       = iq_S0 * jq_S;
-    qq_S2       = iq_S2 * jq_S;
+    jq_S  = loadDuplicateHsimd(q + aj);
+    qq_S0 = iq_S0 * jq_S;
+    qq_S2 = iq_S2 * jq_S;
  #endif
  
  #ifdef CALC_LJ
-#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
-    SimdReal c6_S0, c12_S0;
-    gatherLoadTransposeHsimd<c_simdBestPairAlignment>(nbfp0, nbfp1, type+aj, &c6_S0, &c12_S0);
-#ifndef HALF_LJ
+#    if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
+    SimdReal                                                     c6_S0, c12_S0;
+    gatherLoadTransposeHsimd<c_simdBestPairAlignment>(nbfp0, nbfp1, type + aj, &c6_S0, &c12_S0);
+#        ifndef HALF_LJ
      SimdReal c6_S2, c12_S2;
-    gatherLoadTransposeHsimd<c_simdBestPairAlignment>(nbfp2, nbfp3, type+aj, &c6_S2, &c12_S2);
-#endif
-#endif /* not defined any LJ rule */
-
-#ifdef LJ_COMB_GEOM
-    c6s_j_S  = loadDuplicateHsimd(ljc+aj2);
-    c12s_j_S = loadDuplicateHsimd(ljc+aj2+STRIDE);
-    SimdReal c6_S0       = c6s_S0 * c6s_j_S;
-#ifndef HALF_LJ
-    SimdReal c6_S2       = c6s_S2 * c6s_j_S;
-#endif
-    SimdReal c12_S0      = c12s_S0 * c12s_j_S;
-#ifndef HALF_LJ
-    SimdReal c12_S2      = c12s_S2 * c12s_j_S;
-#endif
-#endif /* LJ_COMB_GEOM */
-
-#ifdef LJ_COMB_LB
-    hsig_j_S = loadDuplicateHsimd(ljc+aj2);
-    seps_j_S = loadDuplicateHsimd(ljc+aj2+STRIDE);
-
-    sig_S0      = hsig_i_S0 + hsig_j_S;
-    eps_S0      = seps_i_S0 * seps_j_S;
-#ifndef HALF_LJ
-    sig_S2      = hsig_i_S2 + hsig_j_S;
-    eps_S2      = seps_i_S2 * seps_j_S;
-#endif
-#endif /* LJ_COMB_LB */
+    gatherLoadTransposeHsimd<c_simdBestPairAlignment>(nbfp2, nbfp3, type + aj, &c6_S2, &c12_S2);
+#        endif
+#    endif /* not defined any LJ rule */
+
+#    ifdef LJ_COMB_GEOM
+    c6s_j_S        = loadDuplicateHsimd(ljc + aj2);
+    c12s_j_S       = loadDuplicateHsimd(ljc + aj2 + STRIDE);
+    SimdReal c6_S0 = c6s_S0 * c6s_j_S;
+#        ifndef HALF_LJ
+    SimdReal c6_S2 = c6s_S2 * c6s_j_S;
+#        endif
+    SimdReal c12_S0 = c12s_S0 * c12s_j_S;
+#        ifndef HALF_LJ
+    SimdReal c12_S2 = c12s_S2 * c12s_j_S;
+#        endif
+#    endif /* LJ_COMB_GEOM */
+
+#    ifdef LJ_COMB_LB
+    hsig_j_S = loadDuplicateHsimd(ljc + aj2);
+    seps_j_S = loadDuplicateHsimd(ljc + aj2 + STRIDE);
+
+    sig_S0 = hsig_i_S0 + hsig_j_S;
+    eps_S0 = seps_i_S0 * seps_j_S;
+#        ifndef HALF_LJ
+    sig_S2 = hsig_i_S2 + hsig_j_S;
+    eps_S2 = seps_i_S2 * seps_j_S;
+#        endif
+#    endif /* LJ_COMB_LB */
  
  #endif /* CALC_LJ */
  
      /* Set rinv to zero for r beyond the cut-off */
-    rinv_S0     = selectByMask(rinv_S0, wco_S0);
-    rinv_S2     = selectByMask(rinv_S2, wco_S2);
+    rinv_S0 = selectByMask(rinv_S0, wco_S0);
+    rinv_S2 = selectByMask(rinv_S2, wco_S2);
  
-    rinvsq_S0   = rinv_S0 * rinv_S0;
-    rinvsq_S2   = rinv_S2 * rinv_S2;
+    rinvsq_S0 = rinv_S0 * rinv_S0;
+    rinvsq_S2 = rinv_S2 * rinv_S2;
  
  #ifdef CALC_COULOMB
      /* Note that here we calculate force*r, not the usual force/r.
@@ -362,536 +361,548 @@
       * masked with the cut-off check.
       */
  
-#ifdef EXCL_FORCES
+#    ifdef EXCL_FORCES
      /* Only add 1/r for non-excluded atom pairs */
-    rinv_ex_S0  = selectByMask(rinv_S0, interact_S0);
-    rinv_ex_S2  = selectByMask(rinv_S2, interact_S2);
-#else
+    rinv_ex_S0 = selectByMask(rinv_S0, interact_S0);
+    rinv_ex_S2 = selectByMask(rinv_S2, interact_S2);
+#    else
      /* No exclusion forces, we always need 1/r */
-#define     rinv_ex_S0    rinv_S0
-#define     rinv_ex_S2    rinv_S2
-#endif
+#        define rinv_ex_S0 rinv_S0
+#        define rinv_ex_S2 rinv_S2
+#    endif
  
-#ifdef CALC_COUL_RF
+#    ifdef CALC_COUL_RF
      /* Electrostatic interactions */
-    frcoul_S0   = qq_S0 * fma(rsq_S0, mrc_3_S, rinv_ex_S0);
-    frcoul_S2   = qq_S2 * fma(rsq_S2, mrc_3_S, rinv_ex_S2);
+    frcoul_S0 = qq_S0 * fma(rsq_S0, mrc_3_S, rinv_ex_S0);
+    frcoul_S2 = qq_S2 * fma(rsq_S2, mrc_3_S, rinv_ex_S2);
  
-#ifdef CALC_ENERGIES
-    vcoul_S0    = qq_S0 * (rinv_ex_S0 + fma(rsq_S0, hrc_3_S, moh_rc_S));
-    vcoul_S2    = qq_S2 * (rinv_ex_S2 + fma(rsq_S2, hrc_3_S, moh_rc_S));
-#endif
-#endif
+#        ifdef CALC_ENERGIES
+    vcoul_S0 = qq_S0 * (rinv_ex_S0 + fma(rsq_S0, hrc_3_S, moh_rc_S));
+    vcoul_S2 = qq_S2 * (rinv_ex_S2 + fma(rsq_S2, hrc_3_S, moh_rc_S));
+#        endif
+#    endif
  
-#ifdef CALC_COUL_EWALD
+#    ifdef CALC_COUL_EWALD
      /* We need to mask (or limit) rsq for the cut-off,
       * as large distances can cause an overflow in gmx_pmecorrF/V.
       */
-    brsq_S0     = beta2_S * selectByMask(rsq_S0, wco_S0);
-    brsq_S2     = beta2_S * selectByMask(rsq_S2, wco_S2);
-    ewcorr_S0   = beta_S * pmeForceCorrection(brsq_S0);
-    ewcorr_S2   = beta_S * pmeForceCorrection(brsq_S2);
-    frcoul_S0   = qq_S0 * fma(ewcorr_S0, brsq_S0, rinv_ex_S0);
-    frcoul_S2   = qq_S2 * fma(ewcorr_S2, brsq_S2, rinv_ex_S2);
+    brsq_S0   = beta2_S * selectByMask(rsq_S0, wco_S0);
+    brsq_S2   = beta2_S * selectByMask(rsq_S2, wco_S2);
+    ewcorr_S0 = beta_S * pmeForceCorrection(brsq_S0);
+    ewcorr_S2 = beta_S * pmeForceCorrection(brsq_S2);
+    frcoul_S0 = qq_S0 * fma(ewcorr_S0, brsq_S0, rinv_ex_S0);
+    frcoul_S2 = qq_S2 * fma(ewcorr_S2, brsq_S2, rinv_ex_S2);
  
-#ifdef CALC_ENERGIES
-    vc_sub_S0   = beta_S * pmePotentialCorrection(brsq_S0);
-    vc_sub_S2   = beta_S * pmePotentialCorrection(brsq_S2);
-#endif
+#        ifdef CALC_ENERGIES
+    vc_sub_S0 = beta_S * pmePotentialCorrection(brsq_S0);
+    vc_sub_S2 = beta_S * pmePotentialCorrection(brsq_S2);
+#        endif
  
-#endif /* CALC_COUL_EWALD */
+#    endif /* CALC_COUL_EWALD */
  
-#ifdef CALC_COUL_TAB
+#    ifdef CALC_COUL_TAB
      /* Electrostatic interactions */
-    r_S0        = rsq_S0 * rinv_S0;
-    r_S2        = rsq_S2 * rinv_S2;
+    r_S0 = rsq_S0 * rinv_S0;
+    r_S2 = rsq_S2 * rinv_S2;
      /* Convert r to scaled table units */
-    rs_S0       = r_S0 * invtsp_S;
-    rs_S2       = r_S2 * invtsp_S;
+    rs_S0 = r_S0 * invtsp_S;
+    rs_S2 = r_S2 * invtsp_S;
      /* Truncate scaled r to an int */
-    ti_S0       = cvttR2I(rs_S0);
-    ti_S2       = cvttR2I(rs_S2);
+    ti_S0 = cvttR2I(rs_S0);
+    ti_S2 = cvttR2I(rs_S2);
  
-    rf_S0       = trunc(rs_S0);
-    rf_S2       = trunc(rs_S2);
+    rf_S0 = trunc(rs_S0);
+    rf_S2 = trunc(rs_S2);
  
-    frac_S0     = rs_S0 - rf_S0;
-    frac_S2     = rs_S2 - rf_S2;
+    frac_S0 = rs_S0 - rf_S0;
+    frac_S2 = rs_S2 - rf_S2;
  
      /* Load and interpolate table forces and possibly energies.
       * Force and energy can be combined in one table, stride 4: FDV0
       * or in two separate tables with stride 1: F and V
       * Currently single precision uses FDV0, double F and V.
       */
-#ifndef CALC_ENERGIES
-#ifdef TAB_FDV0
+#        ifndef CALC_ENERGIES
+#            ifdef TAB_FDV0
      gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
      gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
-#else
+#            else
      gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
      gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
-    ctab1_S0 = ctab1_S0 - ctab0_S0;
-    ctab1_S2 = ctab1_S2 - ctab0_S2;
-#endif
-#else
-#ifdef TAB_FDV0
+    ctab1_S0  = ctab1_S0 - ctab0_S0;
+    ctab1_S2  = ctab1_S2 - ctab0_S2;
+#            endif
+#        else
+#            ifdef TAB_FDV0
      gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0, &ctabv_S0, &dum_S0);
      gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2, &ctabv_S2, &dum_S2);
-#else
+#            else
      gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
      gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S0, &ctabv_S0, &dum_S0);
      gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
      gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S2, &ctabv_S2, &dum_S2);
      ctab1_S0 = ctab1_S0 - ctab0_S0;
      ctab1_S2 = ctab1_S2 - ctab0_S2;
-#endif
-#endif
-    fsub_S0     = fma(frac_S0, ctab1_S0, ctab0_S0);
-    fsub_S2     = fma(frac_S2, ctab1_S2, ctab0_S2);
-    frcoul_S0   = qq_S0 * fnma(fsub_S0, r_S0, rinv_ex_S0);
-    frcoul_S2   = qq_S2 * fnma(fsub_S2, r_S2, rinv_ex_S2);
-
-#ifdef CALC_ENERGIES
-    vc_sub_S0   = fma((mhalfsp_S * frac_S0), (ctab0_S0 + fsub_S0), ctabv_S0);
-    vc_sub_S2   = fma((mhalfsp_S * frac_S2), (ctab0_S2 + fsub_S2), ctabv_S2);
-#endif
-#endif /* CALC_COUL_TAB */
-
-#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
-#ifndef NO_SHIFT_EWALD
+#            endif
+#        endif
+    fsub_S0   = fma(frac_S0, ctab1_S0, ctab0_S0);
+    fsub_S2   = fma(frac_S2, ctab1_S2, ctab0_S2);
+    frcoul_S0 = qq_S0 * fnma(fsub_S0, r_S0, rinv_ex_S0);
+    frcoul_S2 = qq_S2 * fnma(fsub_S2, r_S2, rinv_ex_S2);
+
+#        ifdef CALC_ENERGIES
+    vc_sub_S0 = fma((mhalfsp_S * frac_S0), (ctab0_S0 + fsub_S0), ctabv_S0);
+    vc_sub_S2 = fma((mhalfsp_S * frac_S2), (ctab0_S2 + fsub_S2), ctabv_S2);
+#        endif
+#    endif /* CALC_COUL_TAB */
+
+#    if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
+#        ifndef NO_SHIFT_EWALD
      /* Add Ewald potential shift to vc_sub for convenience */
-#ifdef CHECK_EXCLS
-    vc_sub_S0   = vc_sub_S0 + selectByMask(sh_ewald_S, interact_S0);
-    vc_sub_S2   = vc_sub_S2 + selectByMask(sh_ewald_S, interact_S2);
-#else
-    vc_sub_S0   = vc_sub_S0 + sh_ewald_S;
-    vc_sub_S2   = vc_sub_S2 + sh_ewald_S;
-#endif
-#endif
+#            ifdef CHECK_EXCLS
+    vc_sub_S0 = vc_sub_S0 + selectByMask(sh_ewald_S, interact_S0);
+    vc_sub_S2 = vc_sub_S2 + selectByMask(sh_ewald_S, interact_S2);
+#            else
+    vc_sub_S0 = vc_sub_S0 + sh_ewald_S;
+    vc_sub_S2 = vc_sub_S2 + sh_ewald_S;
+#            endif
+#        endif
  
-    vcoul_S0    = qq_S0 * (rinv_ex_S0 - vc_sub_S0);
-    vcoul_S2    = qq_S2 * (rinv_ex_S2 - vc_sub_S2);
+    vcoul_S0 = qq_S0 * (rinv_ex_S0 - vc_sub_S0);
+    vcoul_S2 = qq_S2 * (rinv_ex_S2 - vc_sub_S2);
  
-#endif
+#    endif
  
-#ifdef CALC_ENERGIES
+#    ifdef CALC_ENERGIES
      /* Mask energy for cut-off and diagonal */
-    vcoul_S0    = selectByMask(vcoul_S0, wco_S0);
-    vcoul_S2    = selectByMask(vcoul_S2, wco_S2);
-#endif
+    vcoul_S0 = selectByMask(vcoul_S0, wco_S0);
+    vcoul_S2 = selectByMask(vcoul_S2, wco_S2);
+#    endif
  
  #endif /* CALC_COULOMB */
  
  #ifdef CALC_LJ
      /* Lennard-Jones interaction */
  
-#ifdef VDW_CUTOFF_CHECK
-    wco_vdw_S0  = (rsq_S0 < rcvdw2_S);
-#ifndef HALF_LJ
-    wco_vdw_S2  = (rsq_S2 < rcvdw2_S);
-#endif
-#else
+#    ifdef VDW_CUTOFF_CHECK
+    wco_vdw_S0 = (rsq_S0 < rcvdw2_S);
+#        ifndef HALF_LJ
+    wco_vdw_S2 = (rsq_S2 < rcvdw2_S);
+#        endif
+#    else
      /* Same cut-off for Coulomb and VdW, reuse the registers */
-#define     wco_vdw_S0    wco_S0
-#define     wco_vdw_S2    wco_S2
-#endif
-
-#ifndef LJ_COMB_LB
-    rinvsix_S0  = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
-#ifdef EXCL_FORCES
-    rinvsix_S0  = selectByMask(rinvsix_S0, interact_S0);
-#endif
-#ifndef HALF_LJ
-    rinvsix_S2  = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
-#ifdef EXCL_FORCES
-    rinvsix_S2  = selectByMask(rinvsix_S2, interact_S2);
-#endif
-#endif
-
-#if defined LJ_CUT || defined LJ_POT_SWITCH
+#        define wco_vdw_S0 wco_S0
+#        define wco_vdw_S2 wco_S2
+#    endif
+
+#    ifndef LJ_COMB_LB
+    rinvsix_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
+#        ifdef EXCL_FORCES
+    rinvsix_S0 = selectByMask(rinvsix_S0, interact_S0);
+#        endif
+#        ifndef HALF_LJ
+    rinvsix_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
+#            ifdef EXCL_FORCES
+    rinvsix_S2 = selectByMask(rinvsix_S2, interact_S2);
+#            endif
+#        endif
+
+#        if defined LJ_CUT || defined LJ_POT_SWITCH
      /* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */
-    FrLJ6_S0    = c6_S0 * rinvsix_S0;
-#ifndef HALF_LJ
-    FrLJ6_S2    = c6_S2 * rinvsix_S2;
-#endif
-    FrLJ12_S0   = c12_S0 * rinvsix_S0 * rinvsix_S0;
-#ifndef HALF_LJ
-    FrLJ12_S2   = c12_S2 * rinvsix_S2 * rinvsix_S2;
-#endif
-#endif
-
-#if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+    FrLJ6_S0 = c6_S0 * rinvsix_S0;
+#            ifndef HALF_LJ
+    FrLJ6_S2 = c6_S2 * rinvsix_S2;
+#            endif
+    FrLJ12_S0 = c12_S0 * rinvsix_S0 * rinvsix_S0;
+#            ifndef HALF_LJ
+    FrLJ12_S2 = c12_S2 * rinvsix_S2 * rinvsix_S2;
+#            endif
+#        endif
+
+#        if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
      /* We switch the LJ force */
-    r_S0        = rsq_S0 * rinv_S0;
-    rsw_S0      = max(r_S0 - rswitch_S, zero_S);
-    rsw2_S0     = rsw_S0 * rsw_S0;
-#ifndef HALF_LJ
-    r_S2        = rsq_S2 * rinv_S2;
-    rsw_S2      = max(r_S2 - rswitch_S, zero_S);
-    rsw2_S2     = rsw_S2 * rsw_S2;
-#endif
-#endif
-
-#ifdef LJ_FORCE_SWITCH
-
-#define add_fr_switch(fr, rsw, rsw2_r, c2, c3) fma(fma(c3, rsw, c2), rsw2_r, fr)
+    r_S0    = rsq_S0 * rinv_S0;
+    rsw_S0  = max(r_S0 - rswitch_S, zero_S);
+    rsw2_S0 = rsw_S0 * rsw_S0;
+#            ifndef HALF_LJ
+    r_S2    = rsq_S2 * rinv_S2;
+    rsw_S2  = max(r_S2 - rswitch_S, zero_S);
+    rsw2_S2 = rsw_S2 * rsw_S2;
+#            endif
+#        endif
+
+#        ifdef LJ_FORCE_SWITCH
+
+#            define add_fr_switch(fr, rsw, rsw2_r, c2, c3) fma(fma(c3, rsw, c2), rsw2_r, fr)
      SimdReal rsw2_r_S0 = rsw2_S0 * r_S0;
-    FrLJ6_S0    = c6_S0 * add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S);
-#ifndef HALF_LJ
+    FrLJ6_S0           = c6_S0 * add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S);
+#            ifndef HALF_LJ
      SimdReal rsw2_r_S2 = rsw2_S2 * r_S2;
-    FrLJ6_S2    = c6_S2 * add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S);
-#endif
-    FrLJ12_S0   = c12_S0 * add_fr_switch(rinvsix_S0 * rinvsix_S0, rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S);
-#ifndef HALF_LJ
-    FrLJ12_S2   = c12_S2 * add_fr_switch(rinvsix_S2 * rinvsix_S2, rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S);
-#endif
-#undef add_fr_switch
-#endif /* LJ_FORCE_SWITCH */
-
-#endif /* not LJ_COMB_LB */
-
-#ifdef LJ_COMB_LB
-    sir_S0      = sig_S0 * rinv_S0;
-#ifndef HALF_LJ
-    sir_S2      = sig_S2 * rinv_S2;
-#endif
-    sir2_S0     = sir_S0 * sir_S0;
-#ifndef HALF_LJ
-    sir2_S2     = sir_S2 * sir_S2;
-#endif
-    sir6_S0     = sir2_S0 * sir2_S0 * sir2_S0;
-#ifdef EXCL_FORCES
-    sir6_S0     = selectByMask(sir6_S0, interact_S0);
-#endif
-#ifndef HALF_LJ
-    sir6_S2     = sir2_S2 * sir2_S2 * sir2_S2;
-#ifdef EXCL_FORCES
-    sir6_S2     = selectByMask(sir6_S2, interact_S2);
-#endif
-#endif
-#ifdef VDW_CUTOFF_CHECK
-    sir6_S0     = selectByMask(sir6_S0, wco_vdw_S0);
-#ifndef HALF_LJ
-    sir6_S2     = selectByMask(sir6_S2, wco_vdw_S2);
-#endif
-#endif
-    FrLJ6_S0    = eps_S0 * sir6_S0;
-#ifndef HALF_LJ
-    FrLJ6_S2    = eps_S2 * sir6_S2;
-#endif
-    FrLJ12_S0   = FrLJ6_S0 * sir6_S0;
-#ifndef HALF_LJ
-    FrLJ12_S2   = FrLJ6_S2 * sir6_S2;
-#endif
-#if defined CALC_ENERGIES
+    FrLJ6_S2           = c6_S2 * add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S);
+#            endif
+    FrLJ12_S0 = c12_S0 * add_fr_switch(rinvsix_S0 * rinvsix_S0, rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S);
+#            ifndef HALF_LJ
+    FrLJ12_S2 = c12_S2 * add_fr_switch(rinvsix_S2 * rinvsix_S2, rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S);
+#            endif
+#            undef add_fr_switch
+#        endif /* LJ_FORCE_SWITCH */
+
+#    endif /* not LJ_COMB_LB */
+
+#    ifdef LJ_COMB_LB
+    sir_S0 = sig_S0 * rinv_S0;
+#        ifndef HALF_LJ
+    sir_S2 = sig_S2 * rinv_S2;
+#        endif
+    sir2_S0 = sir_S0 * sir_S0;
+#        ifndef HALF_LJ
+    sir2_S2 = sir_S2 * sir_S2;
+#        endif
+    sir6_S0 = sir2_S0 * sir2_S0 * sir2_S0;
+#        ifdef EXCL_FORCES
+    sir6_S0 = selectByMask(sir6_S0, interact_S0);
+#        endif
+#        ifndef HALF_LJ
+    sir6_S2 = sir2_S2 * sir2_S2 * sir2_S2;
+#            ifdef EXCL_FORCES
+    sir6_S2 = selectByMask(sir6_S2, interact_S2);
+#            endif
+#        endif
+#        ifdef VDW_CUTOFF_CHECK
+    sir6_S0 = selectByMask(sir6_S0, wco_vdw_S0);
+#            ifndef HALF_LJ
+    sir6_S2 = selectByMask(sir6_S2, wco_vdw_S2);
+#            endif
+#        endif
+    FrLJ6_S0 = eps_S0 * sir6_S0;
+#        ifndef HALF_LJ
+    FrLJ6_S2 = eps_S2 * sir6_S2;
+#        endif
+    FrLJ12_S0 = FrLJ6_S0 * sir6_S0;
+#        ifndef HALF_LJ
+    FrLJ12_S2 = FrLJ6_S2 * sir6_S2;
+#        endif
+#        if defined CALC_ENERGIES
      /* We need C6 and C12 to calculate the LJ potential shift */
-    sig2_S0     = sig_S0 * sig_S0;
-#ifndef HALF_LJ
-    sig2_S2     = sig_S2 * sig_S2;
-#endif
-    sig6_S0     = sig2_S0 * sig2_S0 * sig2_S0;
-#ifndef HALF_LJ
-    sig6_S2     = sig2_S2 * sig2_S2 * sig2_S2;
-#endif
-    SimdReal c6_S0  = eps_S0 * sig6_S0;
-#ifndef HALF_LJ
-    SimdReal c6_S2  = eps_S2 * sig6_S2;
-#endif
+    sig2_S0 = sig_S0 * sig_S0;
+#            ifndef HALF_LJ
+    sig2_S2 = sig_S2 * sig_S2;
+#            endif
+    sig6_S0 = sig2_S0 * sig2_S0 * sig2_S0;
+#            ifndef HALF_LJ
+    sig6_S2 = sig2_S2 * sig2_S2 * sig2_S2;
+#            endif
+    SimdReal c6_S0 = eps_S0 * sig6_S0;
+#            ifndef HALF_LJ
+    SimdReal c6_S2 = eps_S2 * sig6_S2;
+#            endif
      SimdReal c12_S0 = c6_S0 * sig6_S0;
-#ifndef HALF_LJ
+#            ifndef HALF_LJ
      SimdReal c12_S2 = c6_S2 * sig6_S2;
-#endif
-#endif
-#endif /* LJ_COMB_LB */
+#            endif
+#        endif
+#    endif /* LJ_COMB_LB */
  
      /* Determine the total scalar LJ force*r */
-    frLJ_S0     = FrLJ12_S0 - FrLJ6_S0;
-#ifndef HALF_LJ
-    frLJ_S2     = FrLJ12_S2 - FrLJ6_S2;
-#endif
+    frLJ_S0 = FrLJ12_S0 - FrLJ6_S0;
+#    ifndef HALF_LJ
+    frLJ_S2 = FrLJ12_S2 - FrLJ6_S2;
+#    endif
  
-#if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
+#    if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
  
-#ifdef LJ_CUT
+#        ifdef LJ_CUT
      /* Calculate the LJ energies, with constant potential shift */
-    SimdReal VLJ6_S0  = sixth_S * fma(c6_S0, p6_cpot_S, FrLJ6_S0);
-#ifndef HALF_LJ
-    SimdReal VLJ6_S2  = sixth_S * fma(c6_S2, p6_cpot_S, FrLJ6_S2);
-#endif
+    SimdReal VLJ6_S0 = sixth_S * fma(c6_S0, p6_cpot_S, FrLJ6_S0);
+#            ifndef HALF_LJ
+    SimdReal VLJ6_S2 = sixth_S * fma(c6_S2, p6_cpot_S, FrLJ6_S2);
+#            endif
      SimdReal VLJ12_S0 = twelveth_S * fma(c12_S0, p12_cpot_S, FrLJ12_S0);
-#ifndef HALF_LJ
+#            ifndef HALF_LJ
      SimdReal VLJ12_S2 = twelveth_S * fma(c12_S2, p12_cpot_S, FrLJ12_S2);
-#endif
-#endif /* LJ_CUT */
-
-#ifdef LJ_FORCE_SWITCH
-#define v_fswitch_pr(rsw, rsw2, c0, c3, c4) fma(fma(c4, rsw, c3), (rsw2) * (rsw), c0)
-
-    SimdReal VLJ6_S0     = c6_S0 * fma(sixth_S, rinvsix_S0, v_fswitch_pr(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
-#ifndef HALF_LJ
-    SimdReal VLJ6_S2     = c6_S2 * fma(sixth_S, rinvsix_S2, v_fswitch_pr(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
-#endif
-    SimdReal VLJ12_S0    = c12_S0 * fma(twelveth_S, rinvsix_S0 * rinvsix_S0, v_fswitch_pr(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
-#ifndef HALF_LJ
-    SimdReal VLJ12_S2    = c12_S2 * fma(twelveth_S, rinvsix_S2 * rinvsix_S2, v_fswitch_pr(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
-#endif
-#undef v_fswitch_pr
-#endif /* LJ_FORCE_SWITCH */
+#            endif
+#        endif /* LJ_CUT */
+
+#        ifdef LJ_FORCE_SWITCH
+#            define v_fswitch_pr(rsw, rsw2, c0, c3, c4) fma(fma(c4, rsw, c3), (rsw2) * (rsw), c0)
+
+    SimdReal VLJ6_S0 =
+            c6_S0 * fma(sixth_S, rinvsix_S0, v_fswitch_pr(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
+#            ifndef HALF_LJ
+    SimdReal VLJ6_S2 =
+            c6_S2 * fma(sixth_S, rinvsix_S2, v_fswitch_pr(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
+#            endif
+    SimdReal VLJ12_S0 = c12_S0
+                        * fma(twelveth_S, rinvsix_S0 * rinvsix_S0,
+                              v_fswitch_pr(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
+#            ifndef HALF_LJ
+    SimdReal VLJ12_S2 = c12_S2
+                        * fma(twelveth_S, rinvsix_S2 * rinvsix_S2,
+                              v_fswitch_pr(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
+#            endif
+#            undef v_fswitch_pr
+#        endif /* LJ_FORCE_SWITCH */
  
      /* Add up the repulsion and dispersion */
-    SimdReal VLJ_S0      = VLJ12_S0 - VLJ6_S0;
-#ifndef HALF_LJ
-    SimdReal VLJ_S2      = VLJ12_S2 - VLJ6_S2;
-#endif
+    SimdReal VLJ_S0 = VLJ12_S0 - VLJ6_S0;
+#        ifndef HALF_LJ
+    SimdReal VLJ_S2 = VLJ12_S2 - VLJ6_S2;
+#        endif
  
-#endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
+#    endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
  
-#ifdef LJ_POT_SWITCH
+#    ifdef LJ_POT_SWITCH
      /* We always need the potential, since it is needed for the force */
      SimdReal VLJ_S0 = fnma(sixth_S, FrLJ6_S0, twelveth_S * FrLJ12_S0);
-#ifndef HALF_LJ
+#        ifndef HALF_LJ
      SimdReal VLJ_S2 = fnma(sixth_S, FrLJ6_S2, twelveth_S * FrLJ12_S2);
-#endif
+#        endif
  
      {
          SimdReal sw_S0, dsw_S0;
-#ifndef HALF_LJ
+#        ifndef HALF_LJ
          SimdReal sw_S2, dsw_S2;
-#endif
+#        endif
  
-#define switch_pr(rsw, rsw2, c3, c4, c5) fma(fma(fma(c5, rsw, c4), rsw, c3), (rsw2) * (rsw), one_S)
-#define dswitch_pr(rsw, rsw2, c2, c3, c4) fma(fma(c4, rsw, c3), rsw, c2)*(rsw2)
+#        define switch_pr(rsw, rsw2, c3, c4, c5) \
+            fma(fma(fma(c5, rsw, c4), rsw, c3), (rsw2) * (rsw), one_S)
+#        define dswitch_pr(rsw, rsw2, c2, c3, c4) fma(fma(c4, rsw, c3), rsw, c2) * (rsw2)
  
          sw_S0  = switch_pr(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S);
          dsw_S0 = dswitch_pr(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S);
-#ifndef HALF_LJ
+#        ifndef HALF_LJ
          sw_S2  = switch_pr(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S);
          dsw_S2 = dswitch_pr(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S);
-#endif
+#        endif
          frLJ_S0 = fnma(dsw_S0 * VLJ_S0, r_S0, sw_S0 * frLJ_S0);
-#ifndef HALF_LJ
+#        ifndef HALF_LJ
          frLJ_S2 = fnma(dsw_S2 * VLJ_S2, r_S2, sw_S2 * frLJ_S2);
-#endif
-#ifdef CALC_ENERGIES
-        VLJ_S0  = sw_S0 * VLJ_S0;
-#ifndef HALF_LJ
-        VLJ_S2  = sw_S2 * VLJ_S2;
-#endif
-#endif
-
-#undef switch_pr
-#undef dswitch_pr
+#        endif
+#        ifdef CALC_ENERGIES
+        VLJ_S0 = sw_S0 * VLJ_S0;
+#            ifndef HALF_LJ
+        VLJ_S2 = sw_S2 * VLJ_S2;
+#            endif
+#        endif
+
+#        undef switch_pr
+#        undef dswitch_pr
      }
-#endif /* LJ_POT_SWITCH */
+#    endif /* LJ_POT_SWITCH */
  
-#if defined CALC_ENERGIES && defined CHECK_EXCLS
+#    if defined CALC_ENERGIES && defined CHECK_EXCLS
      /* The potential shift should be removed for excluded pairs */
-    VLJ_S0      = selectByMask(VLJ_S0, interact_S0);
-#ifndef HALF_LJ
-    VLJ_S2      = selectByMask(VLJ_S2, interact_S2);
-#endif
-#endif
+    VLJ_S0 = selectByMask(VLJ_S0, interact_S0);
+#        ifndef HALF_LJ
+    VLJ_S2 = selectByMask(VLJ_S2, interact_S2);
+#        endif
+#    endif
  
-#ifdef LJ_EWALD_GEOM
+#    ifdef LJ_EWALD_GEOM
      {
          SimdReal c6s_j_S;
          SimdReal c6grid_S0, rinvsix_nm_S0, cr2_S0, expmcr2_S0, poly_S0;
-#ifndef HALF_LJ
+#        ifndef HALF_LJ
          SimdReal c6grid_S2, rinvsix_nm_S2, cr2_S2, expmcr2_S2, poly_S2;
-#endif
-#ifdef CALC_ENERGIES
+#        endif
+#        ifdef CALC_ENERGIES
          SimdReal sh_mask_S0;
-#ifndef HALF_LJ
+#            ifndef HALF_LJ
          SimdReal sh_mask_S2;
-#endif
-#endif
+#            endif
+#        endif
  
          /* Determine C6 for the grid using the geometric combination rule */
-        c6s_j_S         = loadDuplicateHsimd(ljc+aj2);
-        c6grid_S0       = c6s_S0 * c6s_j_S;
-#ifndef HALF_LJ
-        c6grid_S2       = c6s_S2 * c6s_j_S;
-#endif
+        c6s_j_S   = loadDuplicateHsimd(ljc + aj2);
+        c6grid_S0 = c6s_S0 * c6s_j_S;
+#        ifndef HALF_LJ
+        c6grid_S2 = c6s_S2 * c6s_j_S;
+#        endif
  
-#ifdef CHECK_EXCLS
+#        ifdef CHECK_EXCLS
          /* Recalculate rinvsix without exclusion mask (compiler might optimize) */
          rinvsix_nm_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
-#ifndef HALF_LJ
+#            ifndef HALF_LJ
          rinvsix_nm_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
-#endif
-#else
+#            endif
+#        else
          /* We didn't use a mask, so we can copy */
          rinvsix_nm_S0 = rinvsix_S0;
-#ifndef HALF_LJ
+#            ifndef HALF_LJ
          rinvsix_nm_S2 = rinvsix_S2;
-#endif
-#endif
+#            endif
+#        endif
  
          /* Mask for the cut-off to avoid overflow of cr2^2 */
-        cr2_S0        = lje_c2_S * selectByMask(rsq_S0, wco_vdw_S0);
-#ifndef HALF_LJ
-        cr2_S2        = lje_c2_S * selectByMask(rsq_S2, wco_vdw_S2);
-#endif
+        cr2_S0 = lje_c2_S * selectByMask(rsq_S0, wco_vdw_S0);
+#        ifndef HALF_LJ
+        cr2_S2 = lje_c2_S * selectByMask(rsq_S2, wco_vdw_S2);
+#        endif
          // Unsafe version of our exp() should be fine, since these arguments should never
          // be smaller than -127 for any reasonable choice of cutoff or ewald coefficients.
-        expmcr2_S0    = exp<MathOptimization::Unsafe>( -cr2_S0);
-#ifndef HALF_LJ
-        expmcr2_S2    = exp<MathOptimization::Unsafe>( -cr2_S2);
-#endif
+        expmcr2_S0 = exp<MathOptimization::Unsafe>(-cr2_S0);
+#        ifndef HALF_LJ
+        expmcr2_S2 = exp<MathOptimization::Unsafe>(-cr2_S2);
+#        endif
  
          /* 1 + cr2 + 1/2*cr2^2 */
-        poly_S0       = fma(fma(half_S, cr2_S0, one_S), cr2_S0, one_S);
-#ifndef HALF_LJ
-        poly_S2       = fma(fma(half_S, cr2_S2, one_S), cr2_S2, one_S);
-#endif
+        poly_S0 = fma(fma(half_S, cr2_S0, one_S), cr2_S0, one_S);
+#        ifndef HALF_LJ
+        poly_S2 = fma(fma(half_S, cr2_S2, one_S), cr2_S2, one_S);
+#        endif
  
          /* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use:
           * r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6)
           */
-        frLJ_S0       = fma(c6grid_S0, fnma(expmcr2_S0, fma(rinvsix_nm_S0, poly_S0, lje_c6_6_S), rinvsix_nm_S0), frLJ_S0);
-#ifndef HALF_LJ
-        frLJ_S2       = fma(c6grid_S2, fnma(expmcr2_S2, fma(rinvsix_nm_S2, poly_S2, lje_c6_6_S), rinvsix_nm_S2), frLJ_S2);
-#endif
-
-#ifdef CALC_ENERGIES
-#ifdef CHECK_EXCLS
-        sh_mask_S0    = selectByMask(lje_vc_S, interact_S0);
-#ifndef HALF_LJ
-        sh_mask_S2    = selectByMask(lje_vc_S, interact_S2);
-#endif
-#else
-        sh_mask_S0    = lje_vc_S;
-#ifndef HALF_LJ
-        sh_mask_S2    = lje_vc_S;
-#endif
-#endif
-
-        VLJ_S0        = fma(sixth_S * c6grid_S0, fma(rinvsix_nm_S0, fnma(expmcr2_S0, poly_S0, one_S), sh_mask_S0), VLJ_S0);
-#ifndef HALF_LJ
-        VLJ_S2        = fma(sixth_S * c6grid_S2, fma(rinvsix_nm_S2, fnma(expmcr2_S2, poly_S2, one_S), sh_mask_S2), VLJ_S2);
-#endif
-#endif /* CALC_ENERGIES */
+        frLJ_S0 = fma(c6grid_S0,
+                      fnma(expmcr2_S0, fma(rinvsix_nm_S0, poly_S0, lje_c6_6_S), rinvsix_nm_S0), frLJ_S0);
+#        ifndef HALF_LJ
+        frLJ_S2 = fma(c6grid_S2,
+                      fnma(expmcr2_S2, fma(rinvsix_nm_S2, poly_S2, lje_c6_6_S), rinvsix_nm_S2), frLJ_S2);
+#        endif
+
+#        ifdef CALC_ENERGIES
+#            ifdef CHECK_EXCLS
+        sh_mask_S0 = selectByMask(lje_vc_S, interact_S0);
+#                ifndef HALF_LJ
+        sh_mask_S2 = selectByMask(lje_vc_S, interact_S2);
+#                endif
+#            else
+        sh_mask_S0 = lje_vc_S;
+#                ifndef HALF_LJ
+        sh_mask_S2 = lje_vc_S;
+#                endif
+#            endif
+
+        VLJ_S0 = fma(sixth_S * c6grid_S0,
+                     fma(rinvsix_nm_S0, fnma(expmcr2_S0, poly_S0, one_S), sh_mask_S0), VLJ_S0);
+#            ifndef HALF_LJ
+        VLJ_S2 = fma(sixth_S * c6grid_S2,
+                     fma(rinvsix_nm_S2, fnma(expmcr2_S2, poly_S2, one_S), sh_mask_S2), VLJ_S2);
+#            endif
+#        endif /* CALC_ENERGIES */
      }
-#endif /* LJ_EWALD_GEOM */
+#    endif /* LJ_EWALD_GEOM */
  
-#if defined VDW_CUTOFF_CHECK
+#    if defined VDW_CUTOFF_CHECK
      /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
       * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
       */
-    frLJ_S0     = selectByMask(frLJ_S0, wco_vdw_S0);
-#ifndef HALF_LJ
-    frLJ_S2     = selectByMask(frLJ_S2, wco_vdw_S2);
-#endif
-#endif
+    frLJ_S0 = selectByMask(frLJ_S0, wco_vdw_S0);
+#        ifndef HALF_LJ
+    frLJ_S2 = selectByMask(frLJ_S2, wco_vdw_S2);
+#        endif
+#    endif
  
-#ifdef CALC_ENERGIES
+#    ifdef CALC_ENERGIES
      /* The potential shift should be removed for pairs beyond cut-off */
-    VLJ_S0      = selectByMask(VLJ_S0, wco_vdw_S0);
-#ifndef HALF_LJ
-    VLJ_S2      = selectByMask(VLJ_S2, wco_vdw_S2);
-#endif
-#endif
+    VLJ_S0 = selectByMask(VLJ_S0, wco_vdw_S0);
+#        ifndef HALF_LJ
+    VLJ_S2 = selectByMask(VLJ_S2, wco_vdw_S2);
+#        endif
+#    endif
  
  #endif /* CALC_LJ */
  
  #ifdef CALC_ENERGIES
-#ifdef ENERGY_GROUPS
+#    ifdef ENERGY_GROUPS
      /* Extract the group pair index per j pair.
       * Energy groups are stored per i-cluster, so things get
       * complicated when the i- and j-cluster size don't match.
       */
      {
          int egps_j;
-#if UNROLLJ == 2
+#        if UNROLLJ == 2
          egps_j    = nbatParams.energrp[cj >> 1];
-        egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
-#else
+        egp_jj[0] = ((egps_j >> ((cj & 1) * egps_jshift)) & egps_jmask) * egps_jstride;
+#        else
          /* We assume UNROLLI <= UNROLLJ */
          int jdi;
-        for (jdi = 0; jdi < UNROLLJ/UNROLLI; jdi++)
+        for (jdi = 0; jdi < UNROLLJ / UNROLLI; jdi++)
          {
              int jj;
-            egps_j = nbatParams.energrp[cj*(UNROLLJ/UNROLLI) + jdi];
-            for (jj = 0; jj < (UNROLLI/2); jj++)
+            egps_j = nbatParams.energrp[cj * (UNROLLJ / UNROLLI) + jdi];
+            for (jj = 0; jj < (UNROLLI / 2); jj++)
              {
-                egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+                egp_jj[jdi * (UNROLLI / 2) + jj] =
+                        ((egps_j >> (jj * egps_jshift)) & egps_jmask) * egps_jstride;
              }
          }
-#endif
+#        endif
      }
-#endif
+#    endif
  
-#ifdef CALC_COULOMB
-#ifndef ENERGY_GROUPS
-    vctot_S      = vctot_S + vcoul_S0 + vcoul_S2;
-#else
+#    ifdef CALC_COULOMB
+#        ifndef ENERGY_GROUPS
+    vctot_S = vctot_S + vcoul_S0 + vcoul_S2;
+#        else
      add_ener_grp_halves(vcoul_S0, vctp[0], vctp[1], egp_jj);
      add_ener_grp_halves(vcoul_S2, vctp[2], vctp[3], egp_jj);
-#endif
-#endif
-
-#ifdef CALC_LJ
-#ifndef ENERGY_GROUPS
-    Vvdwtot_S    = Vvdwtot_S + VLJ_S0
-#ifndef HALF_LJ
-        + VLJ_S2
-#endif
-    ;
-#else
+#        endif
+#    endif
+
+#    ifdef CALC_LJ
+#        ifndef ENERGY_GROUPS
+    Vvdwtot_S = Vvdwtot_S + VLJ_S0
+#            ifndef HALF_LJ
+                + VLJ_S2
+#            endif
+            ;
+#        else
      add_ener_grp_halves(VLJ_S0, vvdwtp[0], vvdwtp[1], egp_jj);
-#ifndef HALF_LJ
+#            ifndef HALF_LJ
      add_ener_grp_halves(VLJ_S2, vvdwtp[2], vvdwtp[3], egp_jj);
-#endif
-#endif
-#endif /* CALC_LJ */
-#endif /* CALC_ENERGIES */
+#            endif
+#        endif
+#    endif /* CALC_LJ */
+#endif     /* CALC_ENERGIES */
  
  #ifdef CALC_LJ
-#ifdef CALC_COULOMB
-    fscal_S0    = rinvsq_S0 * (frcoul_S0 + frLJ_S0);
+#    ifdef CALC_COULOMB
+    fscal_S0 = rinvsq_S0 * (frcoul_S0 + frLJ_S0);
+#    else
+    fscal_S0 = rinvsq_S0 * frLJ_S0;
+#    endif
  #else
-    fscal_S0    = rinvsq_S0 * frLJ_S0;
-#endif
-#else
-    fscal_S0    = rinvsq_S0 * frcoul_S0;
+    fscal_S0 = rinvsq_S0 * frcoul_S0;
  #endif /* CALC_LJ */
  #if defined CALC_LJ && !defined HALF_LJ
-#ifdef CALC_COULOMB
-    fscal_S2    = rinvsq_S2 * (frcoul_S2 + frLJ_S2);
-#else
-    fscal_S2    = rinvsq_S2 * frLJ_S2;
-#endif
+#    ifdef CALC_COULOMB
+    fscal_S2 = rinvsq_S2 * (frcoul_S2 + frLJ_S2);
+#    else
+    fscal_S2 = rinvsq_S2 * frLJ_S2;
+#    endif
  #else
      /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
-    fscal_S2    = rinvsq_S2 * frcoul_S2;
+    fscal_S2 = rinvsq_S2 * frcoul_S2;
  #endif
  
      /* Calculate temporary vectorial force */
-    tx_S0       = fscal_S0 * dx_S0;
-    tx_S2       = fscal_S2 * dx_S2;
-    ty_S0       = fscal_S0 * dy_S0;
-    ty_S2       = fscal_S2 * dy_S2;
-    tz_S0       = fscal_S0 * dz_S0;
-    tz_S2       = fscal_S2 * dz_S2;
+    tx_S0 = fscal_S0 * dx_S0;
+    tx_S2 = fscal_S2 * dx_S2;
+    ty_S0 = fscal_S0 * dy_S0;
+    ty_S2 = fscal_S2 * dy_S2;
+    tz_S0 = fscal_S0 * dz_S0;
+    tz_S2 = fscal_S2 * dz_S2;
  
      /* Increment i atom force */
-    fix_S0      = fix_S0 + tx_S0;
-    fix_S2      = fix_S2 + tx_S2;
-    fiy_S0      = fiy_S0 + ty_S0;
-    fiy_S2      = fiy_S2 + ty_S2;
-    fiz_S0      = fiz_S0 + tz_S0;
-    fiz_S2      = fiz_S2 + tz_S2;
+    fix_S0 = fix_S0 + tx_S0;
+    fix_S2 = fix_S2 + tx_S2;
+    fiy_S0 = fiy_S0 + ty_S0;
+    fiy_S2 = fiy_S2 + ty_S2;
+    fiz_S0 = fiz_S0 + tz_S0;
+    fiz_S2 = fiz_S2 + tz_S2;
  
      /* Decrement j atom force */
-    decrHsimd(f+ajx, tx_S0 + tx_S2);
-    decrHsimd(f+ajy, ty_S0 + ty_S2);
-    decrHsimd(f+ajz, tz_S0 + tz_S2);
+    decrHsimd(f + ajx, tx_S0 + tx_S2);
+    decrHsimd(f + ajy, ty_S0 + ty_S2);
+    decrHsimd(f + ajz, tz_S0 + tz_S2);
  }
  
-#undef  rinv_ex_S0
-#undef  rinv_ex_S2
+#undef rinv_ex_S0
+#undef rinv_ex_S2
  
-#undef  wco_vdw_S0
-#undef  wco_vdw_S2
+#undef wco_vdw_S0
+#undef wco_vdw_S2
  
-#undef  EXCL_FORCES
+#undef EXCL_FORCES