/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include <math.h>
#include <string.h>

#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/math/vec.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/math/units.h"
#include "gromacs/legacyheaders/domdec.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/utility/fatalerror.h"

#include "gromacs/utility/gmxmpi.h"

/* This file is explicitly disabled by the "0 &&" in the condition below;
 * even where SSE2 intrinsics are available it is kept for reference only.
 */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"

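/* SSE2 double-precision kernels for the generalized-Born (implicit solvent)
 * path: Born radii for the Still, HCT and OBC models, plus the chain-rule
 * force contribution from the radii. Each kernel processes two j particles
 * per iteration in the two lanes of an __m128d, with a scalar epilogue when
 * the neighbor count is odd.
 */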
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);


        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

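        /* Each pair contributes STILL_P4*V_j*CCF/r^4 to the polarization sum
         * of atom i, and the corresponding V_i term to atom j (via work[]).
         * The r^-6 derivative factors are stored in dadx for the chain-rule
         * pass.
         */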
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }
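            /* Note that the min() clamp above makes the lanes with
             * ratio > STILL_P5INV come out as ccf = 1 and dccf = 0 (theta = pi,
             * assuming STILL_PIP5 == M_PI*STILL_P5 and STILL_P5INV == 1/STILL_P5),
             * matching the cheap branch. A scalar sketch of the same logic:
             *   if (ratio > STILL_P5INV) { ccf = 1.0; dccf = 0.0; }
             *   else { theta = ratio*STILL_PIP5; term = 0.5*(1.0 - cos(theta));
             *          ccf = term*term; dccf = 2.0*term*sin(theta)*theta; }
             */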

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

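        /* Epilogue: handle the last j particle when the neighbor count is odd */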
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for the remaining element */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
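    /* Still model: bRad_i = (ONE_4PI_EPS0/2)/gpi_ai, where gpi_ai is the
     * accumulated polarization sum; factor*gmx_invsqrt(gpi2) evaluates this
     * as factor/|gpi_ai|.
     */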
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}


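/* Born radii for the HCT and OBC models. The inner loop accumulates the HCT
 * pairwise descreening integral for each pair in both directions (aj -> ai
 * into sum_ai, ai -> aj into work[]) and stores the derivative of the
 * integrand with respect to r in dadx for the chain-rule pass; the sums are
 * turned into radii after the loop, differently for HCT and OBC.
 */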
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int        *  jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for the two aj particles */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);

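            /* SSE2 has no blend instruction, so selection is done with
             * and/andnot/or on the comparison masks:
             *   obc_mask1: rai < r+sk_aj, i.e. atom j descreens atom i at all;
             *   obc_mask2: rai < r-sk_aj, i.e. atom i lies fully outside the
             *              descreening sphere, so lij = 1/(r-sk_aj); otherwise
             *              lij = 1/rai;
             *   obc_mask3: rai < sk_aj-r, i.e. atom i is engulfed, adding the
             *              2*(1/rai - lij) correction term below.
             */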
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

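            /* Store both derivative terms for this pair: first d(sum)/dr for
             * aj -> ai, then for ai -> aj, two doubles each; the chain-rule
             * kernel below reads them back in exactly this order.
             */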
            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

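        /* Epilogue: one j particle remains when the neighbor count is odd */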
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for the single remaining aj */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

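    /* Turn the accumulated descreening sums into Born radii. HCT inverts the
     * sum directly, clamped from below at the vdW radius; OBC first rescales
     * the sum through a tanh polynomial, which keeps the radius finite and
     * smooth for deeply buried atoms.
     */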
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

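                /* OBC (Onufriev-Bashford-Case) rescaling, with psi = rr*work[i]:
                 *   1/bRad = 1/(rho-doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho
                 * where rho is the vdW radius. Note that rr_inv2 holds 1/rho
                 * (it is not a square), and drobc stores the derivative of the
                 * tanh rescaling with respect to work[i], for the chain-rule pass.
                 */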
                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}


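/* Convert the stored d(sum)/dr terms (dadx) and the derivative of the
 * potential with respect to the Born radii (dvda) into Cartesian forces.
 */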
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int        *  jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Fold dvda (dV/d(bRad)) together with d(bRad)/d(sum), which differs per
     * GB model, into the per-atom prefactor rb[] used by the SSE loop below.
     */
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();


        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for the two j particles */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with an odd element */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms each, which must be
         * summed and added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

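/* Rough calling sequence (see genborn.c for the actual driver): one of the
 * calc_gb_rad_*() kernels above runs before the nonbonded kernels to produce
 * the Born radii, the GB nonbonded kernels then accumulate dvda, and
 * calc_gb_chainrule_sse2_double() finally adds the force contribution from
 * the coordinate dependence of the radii.
 */
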
#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */