/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013,2014, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>

#include "typedefs.h"
#include "smalloc.h"
#include "genborn.h"
#include "vec.h"
#include "gromacs/fileio/pdbio.h"
#include "names.h"
#include "physics.h"
#include "domdec.h"
#include "partdec.h"
#include "network.h"
#include "gmx_fatal.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"

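/* SSE2 double-precision kernels for the Generalized Born (implicit solvent)
 * code: Born-radius calculation for the Still and HCT/OBC models, plus the
 * chain-rule force evaluation. Each __m128d holds two doubles, so the inner
 * loops below process two j atoms per iteration and handle an odd last
 * neighbor with scalar (_sd) intrinsics that operate on the low lane only.
 */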
/* Only compile this file if SSE2 intrinsics are available; note that the
 * "0 &&" in the guard below currently disables these kernels unconditionally.
 */
#if 0 && defined (GMX_X86_SSE2)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"

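/* Compute Born radii with the Still model (Qiu et al.): for each atom i we
 * accumulate the pairwise polarization term P4*V_j*ccf/r^4 into gpi, and the
 * symmetric i -> j contribution into work[]. The STILL_P4/P5INV/PIP5
 * constants come from the Still parameterization as defined elsewhere in
 * the GB code.
 */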
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

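    /* Outer loop over neighbor-list i entries; the inner j loop is unrolled
     * two-wide for SSE2, with a scalar epilogue for an odd last neighbor.
     */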
    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);


        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
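                /* Close contact: scale the 1/r^4 term by Still's smooth
                 * switching function ccf = (0.5*(1-cos(theta)))^2, with
                 * theta proportional to rsq/rvdw^2; dccf is the matching
                 * derivative factor used for the dadx force terms below.
                 */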
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

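            /* Store the chain-rule terms d(sum)/dr for this pair: first the
             * two values for atom i's sum, then the two for the j atoms,
             * in the order consumed by calc_gb_chainrule_sse2_double().
             */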
            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
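    /* Still Born radius: b_i = (ONE_4PI_EPS0/2)/gpi_ai, with gpi_ai the
     * self term plus the accumulated pair sum (the invsqrt of the square
     * gives the magnitude); invsqrta caches 1/sqrt(b_i) for the GB kernels.
     */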
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add the pair sum to the self polarization term */
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}


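/* Compute Born radii for the HCT and OBC models. For each pair, the
 * Hawkins-Cramer-Truhlar descreening integral is evaluated analytically
 * between the limits lij and uij = 1/(r+sk), in both directions:
 * aj -> ai is accumulated into sum_ai, ai -> aj into work[]. The
 * derivatives needed for the forces are stored in fr->dadx, and the final
 * radii are formed after the loop (HCT directly, OBC via the tanh formula).
 */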
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int        *  jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for aj1-aj2 */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);
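            /* Mask logic: obc_mask1 selects pairs that contribute at all
             * (rai < r+sk_aj); obc_mask2 selects rai < r-sk_aj, where the
             * lower integration limit is 1/(r-sk_aj) rather than 1/rai;
             * obc_mask3 selects rai < sk_aj-r, i.e. atom i inside the
             * descreening sphere of j, adding the 2*(1/rai-lij) term below.
             */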

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for the single remaining j atom */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
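        /* HCT: 1/b_i = 1/(rho_i - doffset) - sum_i, with the resulting
         * radius clamped from below by the full vdW radius rho_i.
         */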
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
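        /* OBC: 1/b_i = 1/(rho_i - doffset)
         *            - tanh(alpha*Psi - beta*Psi^2 + gamma*Psi^3)/rho_i,
         * with Psi = (rho_i - doffset)*sum_i; drobc stores the corresponding
         * chain-rule derivative factor used in the force calculation.
         */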
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}


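/* Convert dV/d(born radius), passed in dvda, into Cartesian forces using
 * the d(sum)/dr terms stored in dadx by the radius kernels above. rb[]
 * holds dvda premultiplied by the algorithm-specific derivative of the
 * Born radius with respect to the accumulated sum, so the inner loop
 * reduces to a single multiply per pair.
 */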
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int        *  jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, sse style */
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

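    /* The loop below forms f = (dadx_i*rb_i + dadx_j*rb_j)*dr per pair and
     * applies it with opposite signs to atoms i and j (Newton's third law),
     * accumulating the i contributions into the shift forces as well.
     */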
    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();


        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for j1-j2 */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with the odd element */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms each, which all
         * should be added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */