/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "gmxpre.h"

#include "config.h"

#include <math.h>
#include <string.h>

#include "gromacs/legacyheaders/typedefs.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/legacyheaders/genborn.h"
#include "gromacs/math/vec.h"
#include "gromacs/fileio/pdbio.h"
#include "gromacs/legacyheaders/names.h"
#include "gromacs/math/units.h"
#include "gromacs/legacyheaders/domdec.h"
#include "gromacs/legacyheaders/network.h"
#include "gromacs/utility/fatalerror.h"

#include "gromacs/utility/gmxmpi.h"

/* Only compile this file if SSE2 intrinsics are available. Note that the
 * 0 && below deliberately disables these kernels even when SSE2 is present.
 */
#if 0 && defined (GMX_SIMD_X86_SSE2_OR_HIGHER)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"

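/* Compute Born radii for all atoms using the Still method.
 *
 * For each i atom in the neighbor list, the pairwise polarization
 * contributions are accumulated two j atoms at a time in the two SSE2
 * double-precision lanes, with a single-width epilogue when the neighbor
 * count is odd. The per-pair derivative factors are streamed to fr->dadx
 * for the later chain-rule force pass in calc_gb_chainrule_sse2_double().
 */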
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

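        /* Main SIMD loop: two neighbor-list entries per iteration; a
         * trailing odd entry is handled in the k < nj1 block below.
         */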
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

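        /* Epilogue for an odd neighbor count: the same math as above, done
         * with scalar (_sd) operations in the low lane. The dadx stores
         * still advance in strides of two doubles, so the chain-rule pass
         * can rely on a fixed layout.
         */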
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add the accumulated pair sum to the self polarization term */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}


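/* Compute Born radii for all atoms using the HCT or OBC method.
 *
 * The inner loops accumulate the HCT pairwise descreening integral in both
 * directions for each pair (aj onto ai into sum_ai, ai onto aj into work[]),
 * again two neighbors per iteration with a scalar epilogue. The final
 * transform from the accumulated sum to a Born radius depends on
 * gb_algorithm and is applied after the (optional) domain decomposition
 * summation.
 */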
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int          *jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for aj1-2 */
            raj_inv     = gmx_mm_inv_pd(raj);

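            /* lij and uij below are the reciprocals of the lower and upper
             * limits of the HCT pairwise descreening integral; the t1-t4
             * terms assemble its closed form, and the following t1-t3 group
             * assembles its derivative with respect to r, which is stored
             * in dadx for the chain-rule pass.
             */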
            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for the single remaining aj */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);
    }

    /* Parallel summations */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

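    /* Transform the accumulated descreening sums into Born radii.
     * HCT inverts the sum directly, clamping the radius from below at the
     * full van der Waals radius; OBC applies the tanh rescaling and also
     * stores the derivative term drobc needed later by the chain rule.
     */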
    if (gb_algorithm == egbHCT)
    {
        /* HCT */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}


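/* Apply the chain rule to convert dvda (dV/d(Born radius)) into Cartesian
 * forces, using the per-pair derivatives that the radius kernels streamed
 * to dadx. Forces on j atoms are decremented directly in f; the i-atom
 * force and the shift force are updated once per neighbor-list entry.
 */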
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int          *jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Convert dvda into the algorithm-specific chain-rule prefactor rb[i],
     * in an SSE-friendly form
     */
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

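    /* dadx is consumed in exactly the order the radius kernels wrote it:
     * for every pair, two doubles with the j-side derivative followed by
     * two doubles with the i-side derivative.
     */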
    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for j1-2 */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms that both need to
         * be added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */