src/gromacs/mdlib/genborn_sse2_double.c
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2008, The GROMACS development team.
 * Copyright (c) 2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include <string.h>

#include "typedefs.h"
#include "smalloc.h"
#include "genborn.h"
#include "vec.h"
#include "grompp.h"
#include "gromacs/fileio/pdbio.h"
#include "names.h"
#include "physics.h"
#include "domdec.h"
#include "partdec.h"
#include "network.h"
#include "gmx_fatal.h"
#include "mtop_util.h"

#include "gromacs/utility/gmxmpi.h"

/* Only compile this file if SSE2 intrinsics are available. Note that the
 * "0 &&" below currently keeps these kernels disabled, so the dummy symbol
 * in the #else branch at the end of the file is built instead.
 */
#if 0 && defined (GMX_X86_SSE2)
#include <gmx_sse2_double.h>
#include <emmintrin.h>

#include "genborn_sse2_double.h"

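/* Compute Born radii for all atoms in the neighbour list nl with the Still
 * (GBSA) method, using SSE2 double-precision intrinsics that process two
 * j atoms per iteration. The per-pair derivative factors are streamed into
 * fr->dadx for later use in calc_gb_chainrule_sse2_double(). Returns 0.
 */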
int
calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,
                              int natoms, gmx_localtop_t *top,
                              double *x, t_nblist *nl,
                              gmx_genborn_t *born)
{
    int           i, k, n, ii, is3, ii3, nj0, nj1, offset;
    int           jnrA, jnrB, j3A, j3B;
    int          *mdtype;
    double        shX, shY, shZ;
    int          *jjnr;
    double       *shiftvec;

    double        gpi_ai, gpi2;
    double        factor;
    double       *gb_radius;
    double       *vsolv;
    double       *work;
    double       *dadx;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;
    __m128d       rsq, rinv, rinv2, rinv4, rinv6;
    __m128d       ratio, gpi, rai, raj, vai, vaj, rvdw;
    __m128d       ccf, dccf, theta, cosq, term, sinq, res, prod, prod_ai, tmp;
    __m128d       mask, icf4, icf6, mask_cmp;

    const __m128d half   = _mm_set1_pd(0.5);
    const __m128d three  = _mm_set1_pd(3.0);
    const __m128d one    = _mm_set1_pd(1.0);
    const __m128d two    = _mm_set1_pd(2.0);
    const __m128d zero   = _mm_set1_pd(0.0);
    const __m128d four   = _mm_set1_pd(4.0);

    const __m128d still_p5inv  = _mm_set1_pd(STILL_P5INV);
    const __m128d still_pip5   = _mm_set1_pd(STILL_PIP5);
    const __m128d still_p4     = _mm_set1_pd(STILL_P4);

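    /* factor is 0.5 * 1/(4*pi*eps0); it converts the accumulated Still
     * polarization sum into a Born radius further down.
     */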
    factor  = 0.5 * ONE_4PI_EPS0;

    gb_radius = born->gb_radius;
    vsolv     = born->vsolv;
    work      = born->gpol_still_work;
    jjnr      = nl->jjnr;
    shiftvec  = fr->shift_vec[0];
    dadx      = fr->dadx;

    jnrA = jnrB = 0;
    jx   = _mm_setzero_pd();
    jy   = _mm_setzero_pd();
    jz   = _mm_setzero_pd();

    n = 0;

    for (i = 0; i < natoms; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);


        /* Polarization energy for atom ai */
        gpi    = _mm_setzero_pd();

        rai     = _mm_load1_pd(gb_radius+ii);
        prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]);

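        /* Inner loop over pairs of j atoms, two per iteration in the SSE2
         * double lanes. Each pair adds P4*vsolv*ccf/r^4 to the polarization
         * sums of both i and j; ccf is a cosine switching term that ramps
         * from 0 at close contact up to 1 once r^2/(rai+raj)^2 exceeds 1/P5.
         * The matching r^-6 derivative factors are streamed into dadx
         * (two __m128d values per pair: the i->j and j->i contributions).
         */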
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA, vsolv+jnrB, vaj);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_pd(rinv, rinv);
            rinv4       = _mm_mul_pd(rinv2, rinv2);
            rinv6       = _mm_mul_pd(rinv4, rinv2);

            rvdw        = _mm_add_pd(rai, raj);
            ratio       = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_pd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_pd(ratio, still_p5inv);
                theta       = _mm_mul_pd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_pd(half, _mm_sub_pd(one, cosq));
                ccf         = _mm_mul_pd(term, term);
                dccf        = _mm_mul_pd(_mm_mul_pd(two, term),
                                         _mm_mul_pd(sinq, theta));
            }

            prod        = _mm_mul_pd(still_p4, vaj);
            icf4        = _mm_mul_pd(ccf, rinv4);
            icf6        = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_mul_pd(prod_ai, icf4));

            gpi           = _mm_add_pd(gpi, _mm_mul_pd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }

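        /* Epilogue: if the number of j atoms is odd, handle the last one
         * with single-lane (_sd) operations.
         */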
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA, vaj);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);
            rinv        = gmx_mm_invsqrt_pd(rsq);
            rinv2       = _mm_mul_sd(rinv, rinv);
            rinv4       = _mm_mul_sd(rinv2, rinv2);
            rinv6       = _mm_mul_sd(rinv4, rinv2);

            rvdw        = _mm_add_sd(rai, raj);
            ratio       = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw, rvdw)));

            mask_cmp    = _mm_cmple_sd(ratio, still_p5inv);

            /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */
            if (0 == _mm_movemask_pd(mask_cmp) )
            {
                /* if ratio>still_p5inv for ALL elements */
                ccf         = one;
                dccf        = _mm_setzero_pd();
            }
            else
            {
                ratio       = _mm_min_sd(ratio, still_p5inv);
                theta       = _mm_mul_sd(ratio, still_pip5);
                gmx_mm_sincos_pd(theta, &sinq, &cosq);
                term        = _mm_mul_sd(half, _mm_sub_sd(one, cosq));
                ccf         = _mm_mul_sd(term, term);
                dccf        = _mm_mul_sd(_mm_mul_sd(two, term),
                                         _mm_mul_sd(sinq, theta));
            }

            prod        = _mm_mul_sd(still_p4, vaj);
            icf4        = _mm_mul_sd(ccf, rinv4);
            icf6        = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four, ccf), dccf), rinv6);

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_mul_sd(prod_ai, icf4));

            gpi           = _mm_add_sd(gpi, _mm_mul_sd(prod, icf4) );

            _mm_store_pd(dadx, _mm_mul_pd(prod, icf6));
            dadx += 2;
            _mm_store_pd(dadx, _mm_mul_pd(prod_ai, icf6));
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(gpi, work+ii);
    }

    /* Sum up the polarization energy from other nodes */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    /* Compute the radii */
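    /* Still radius: R_i = 0.5*ONE_4PI_EPS0 / |gpol_i + sum_i|, i.e. the
     * factor computed above divided by the magnitude of the total
     * polarization term; invsqrta caches 1/sqrt(R_i) for the force kernels.
     */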
    for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
    {
        if (born->use[i] != 0)
        {
            gpi_ai           = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/
            gpi2             = gpi_ai * gpi_ai;
            born->bRad[i]    = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i]  = gmx_invsqrt(born->bRad[i]);
        }
    }

    /* Extra (local) communication required for DD */
    if (DOMAINDECOMP(cr))
    {
        dd_atom_spread_real(cr->dd, born->bRad);
        dd_atom_spread_real(cr->dd, fr->invsqrta);
    }

    return 0;
}

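/* Compute Born radii with the HCT or OBC method (selected by gb_algorithm),
 * using SSE2 double-precision intrinsics that process two j atoms per
 * iteration. The pairwise descreening derivatives are streamed into fr->dadx
 * for calc_gb_chainrule_sse2_double(). Returns 0.
 */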
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
                                double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int           i, ai, k, n, ii, ii3, is3, nj0, nj1, at0, at1, offset;
    int           jnrA, jnrB;
    int           j3A, j3B;
    double        shX, shY, shZ;
    double        rr, rr_inv, rr_inv2, sum_tmp, sum, sum2, sum3, gbr;
    double        sum_ai2, sum_ai3, tsum, tchain, doffset;
    double       *obc_param;
    double       *gb_radius;
    double       *work;
    int        *  jjnr;
    double       *dadx;
    double       *shiftvec;
    double        min_rad, rad;

    __m128d       ix, iy, iz, jx, jy, jz;
    __m128d       dx, dy, dz, t1, t2, t3, t4;
    __m128d       rsq, rinv, r;
    __m128d       rai, rai_inv, raj, raj_inv, rai_inv2, sk, sk2, lij, dlij, duij;
    __m128d       uij, lij2, uij2, lij3, uij3, diff2;
    __m128d       lij_inv, sk2_inv, prod, log_term, tmp, tmp_sum;
    __m128d       sum_ai, tmp_ai, sk_ai, sk_aj, sk2_ai, sk2_aj, sk2_rinv;
    __m128d       dadx1, dadx2;
    __m128d       logterm;
    __m128d       mask;
    __m128d       obc_mask1, obc_mask2, obc_mask3;

    __m128d       oneeighth   = _mm_set1_pd(0.125);
    __m128d       onefourth   = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx        = _mm_setzero_pd();
    jy        = _mm_setzero_pd();
    jz        = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for (i = 0; i < born->nr; i++)
    {
        work[i] = 0;
    }

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai, sk_ai);

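        /* Inner loop over pairs of j atoms, two per iteration. Each pair
         * evaluates the analytic HCT/OBC descreening integral twice (j on i
         * and i on j) between the lower and upper bounds lij and uij; the
         * i contribution is accumulated in sum_ai, the j contribution in
         * work[], and the radial derivatives are streamed into dadx.
         */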
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA, gb_radius+jnrB, raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA, obc_param+jnrB, sk_aj);

            dx    = _mm_sub_pd(ix, jx);
            dy    = _mm_sub_pd(iy, jy);
            dz    = _mm_sub_pd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_pd(rsq, rinv);

            /* Compute raj_inv for the two j atoms */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_pd(r, sk_aj);
            t2            = _mm_sub_pd(r, sk_aj);
            t3            = _mm_sub_pd(sk_aj, r);
            obc_mask1     = _mm_cmplt_pd(rai, t1);
            obc_mask2     = _mm_cmplt_pd(rai, t2);
            obc_mask3     = _mm_cmplt_pd(rai, t3);

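            /* obc_mask1: j descreens i at all (rai < r+sk_aj);
             * obc_mask2: i lies fully outside j's descreening sphere
             *            (rai < r-sk_aj), so the lower bound is 1/(r-sk_aj),
             *            otherwise 1/rai is used;
             * obc_mask3: i is engulfed by j's sphere (rai < sk_aj-r), which
             *            adds the 2*(1/rai - lij) correction term below.
             */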
            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_pd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_pd(sk2_aj, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            sum_ai        = _mm_add_pd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_pd(r, sk_ai);
            t2            = _mm_sub_pd(r, sk_ai);
            t3            = _mm_sub_pd(sk_ai, r);
            obc_mask1     = _mm_cmplt_pd(raj, t1);
            obc_mask2     = _mm_cmplt_pd(raj, t2);
            obc_mask3     = _mm_cmplt_pd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_pd(uij, uij);
            uij3          = _mm_mul_pd(uij2, uij);
            lij2          = _mm_mul_pd(lij, lij);
            lij3          = _mm_mul_pd(lij2, lij);

            diff2         = _mm_sub_pd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_pd(sk2_ai, rinv);
            prod          = _mm_mul_pd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_pd(uij, lij_inv));

            t1            = _mm_sub_pd(lij, uij);
            t2            = _mm_mul_pd(diff2,
                                       _mm_sub_pd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_pd(half, _mm_mul_pd(rinv, logterm));
            t1            = _mm_add_pd(t1, _mm_add_pd(t2, t3));
            t4            = _mm_mul_pd(two, _mm_sub_pd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_pd(half, _mm_add_pd(t1, t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA, work+jnrB, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_pd(_mm_mul_pd(half, lij2),
                                       _mm_mul_pd(prod, lij3));
            t1            = _mm_sub_pd(t1,
                                       _mm_mul_pd(onefourth,
                                                  _mm_add_pd(_mm_mul_pd(lij, rinv),
                                                             _mm_mul_pd(lij3, r))));
            t2            = _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(uij, rinv),
                                                  _mm_mul_pd(uij3, r)));
            t2            = _mm_sub_pd(t2,
                                       _mm_add_pd(_mm_mul_pd(half, uij2),
                                                  _mm_mul_pd(prod, uij3)));
            t3            = _mm_mul_pd(_mm_mul_pd(onefourth, logterm),
                                       _mm_mul_pd(rinv, rinv));
            t3            = _mm_sub_pd(t3,
                                       _mm_mul_pd(_mm_mul_pd(diff2, oneeighth),
                                                  _mm_add_pd(one,
                                                             _mm_mul_pd(sk2_rinv, rinv))));
            t1            = _mm_mul_pd(rinv,
                                       _mm_add_pd(_mm_mul_pd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        } /* end normal inner loop */

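        /* Epilogue: handle a remaining odd j atom with single-lane (_sd)
         * operations, mirroring the paired loop above.
         */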
        if (k < nj1)
        {
            jnrA        = jjnr[k];

            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA, raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA, sk_aj);

            dx    = _mm_sub_sd(ix, jx);
            dy    = _mm_sub_sd(iy, jy);
            dz    = _mm_sub_sd(iz, jz);

            rsq         = gmx_mm_calc_rsq_pd(dx, dy, dz);

            rinv        = gmx_mm_invsqrt_pd(rsq);
            r           = _mm_mul_sd(rsq, rinv);

            /* Compute raj_inv for the single remaining j atom */
            raj_inv     = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1            = _mm_add_sd(r, sk_aj);
            t2            = _mm_sub_sd(r, sk_aj);
            t3            = _mm_sub_sd(sk_aj, r);
            obc_mask1     = _mm_cmplt_sd(rai, t1);
            obc_mask2     = _mm_cmplt_sd(rai, t2);
            obc_mask3     = _mm_cmplt_sd(rai, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(_mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                      _mm_andnot_pd(obc_mask2, rai_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_aj        = _mm_mul_sd(sk_aj, sk_aj);
            sk2_rinv      = _mm_mul_sd(sk2_aj, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_pd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(rai_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            sum_ai        = _mm_add_sd(sum_ai, _mm_and_pd(t1, obc_mask1) );

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_pd(t2, t3)));

            dadx1         = _mm_and_pd(t1, obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1            = _mm_add_sd(r, sk_ai);
            t2            = _mm_sub_sd(r, sk_ai);
            t3            = _mm_sub_sd(sk_ai, r);
            obc_mask1     = _mm_cmplt_sd(raj, t1);
            obc_mask2     = _mm_cmplt_sd(raj, t2);
            obc_mask3     = _mm_cmplt_sd(raj, t3);

            uij           = gmx_mm_inv_pd(t1);
            lij           = _mm_or_pd(   _mm_and_pd(obc_mask2, gmx_mm_inv_pd(t2)),
                                         _mm_andnot_pd(obc_mask2, raj_inv));
            dlij          = _mm_and_pd(one, obc_mask2);
            uij2          = _mm_mul_sd(uij, uij);
            uij3          = _mm_mul_sd(uij2, uij);
            lij2          = _mm_mul_sd(lij, lij);
            lij3          = _mm_mul_sd(lij2, lij);

            diff2         = _mm_sub_sd(uij2, lij2);
            lij_inv       = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv      = _mm_mul_sd(sk2_ai, rinv);
            prod          = _mm_mul_sd(onefourth, sk2_rinv);

            logterm       = gmx_mm_log_pd(_mm_mul_sd(uij, lij_inv));

            t1            = _mm_sub_sd(lij, uij);
            t2            = _mm_mul_sd(diff2,
                                       _mm_sub_sd(_mm_mul_sd(onefourth, r),
                                                  prod));
            t3            = _mm_mul_sd(half, _mm_mul_sd(rinv, logterm));
            t1            = _mm_add_sd(t1, _mm_add_sd(t2, t3));
            t4            = _mm_mul_sd(two, _mm_sub_sd(raj_inv, lij));
            t4            = _mm_and_pd(t4, obc_mask3);
            t1            = _mm_mul_sd(half, _mm_add_sd(t1, t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA, _mm_and_pd(t1, obc_mask1));

            t1            = _mm_add_sd(_mm_mul_sd(half, lij2),
                                       _mm_mul_sd(prod, lij3));
            t1            = _mm_sub_sd(t1,
                                       _mm_mul_sd(onefourth,
                                                  _mm_add_sd(_mm_mul_sd(lij, rinv),
                                                             _mm_mul_sd(lij3, r))));
            t2            = _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(uij, rinv),
                                                  _mm_mul_sd(uij3, r)));
            t2            = _mm_sub_sd(t2,
                                       _mm_add_sd(_mm_mul_sd(half, uij2),
                                                  _mm_mul_sd(prod, uij3)));
            t3            = _mm_mul_sd(_mm_mul_sd(onefourth, logterm),
                                       _mm_mul_sd(rinv, rinv));
            t3            = _mm_sub_sd(t3,
                                       _mm_mul_sd(_mm_mul_sd(diff2, oneeighth),
                                                  _mm_add_sd(one,
                                                             _mm_mul_sd(sk2_rinv, rinv))));
            t1            = _mm_mul_sd(rinv,
                                       _mm_add_sd(_mm_mul_sd(dlij, t1),
                                                  _mm_add_sd(t2, t3)));

            dadx2         = _mm_and_pd(t1, obc_mask1);

            _mm_store_pd(dadx, dadx1);
            dadx += 2;
            _mm_store_pd(dadx, dadx2);
            dadx += 2;
        }
        gmx_mm_update_1pot_pd(sum_ai, work+ii);

    }

    /* Parallel summations */
    if (PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if (DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    if (gb_algorithm == egbHCT)
    {
        /* HCT */
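        /* HCT radius: 1/R_i = 1/(rho_i - doffset) - I_i, where I_i is the
         * accumulated descreening sum in work[]; the result is clamped so
         * that R_i never drops below the input radius rho_i.
         */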
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
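        /* OBC radius: 1/R_i = 1/(rho_i - doffset)
         *                     - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rho_i,
         * with psi = I_i*(rho_i - doffset). drobc[] stores the derivative
         * factor of the tanh term used later in the chain rule.
         */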
        for (i = 0; i < fr->natoms_force; i++) /* PELA born->nr */
        {
            if (born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum          = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain         = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }
        /* Extra (local) communication required for DD */
        if (DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}

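/* Apply the chain-rule contribution of the Born radii to the forces: combine
 * the dadx derivative factors stored by the radius kernels above with dvda
 * (the derivative of the GB energy with respect to the Born radius) and
 * accumulate the resulting pair forces into f and fshift, using SSE2
 * double-precision intrinsics. Returns 0.
 */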
int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int           i, k, n, ii, jnr, ii3, is3, nj0, nj1, n0, n1;
    int           jnrA, jnrB;
    int           j3A, j3B;
    int        *  jjnr;

    double        rbi, shX, shY, shZ;
    double       *rb;

    __m128d       ix, iy, iz;
    __m128d       jx, jy, jz;
    __m128d       fix, fiy, fiz;
    __m128d       dx, dy, dz;
    __m128d       tx, ty, tz;

    __m128d       rbai, rbaj, f_gb, f_gb_ai;
    __m128d       xmm1, xmm2, xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb     = born->work;

    jjnr   = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, sse style */
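    /* rb[i] collects R_i^2 * dvda[i], with the algorithm-specific extra
     * factors (2/ONE_4PI_EPS0 for Still, drobc for OBC), so that the pair
     * force below is simply dadx * rb.
     */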
    n0 = 0;
    n1 = natoms;

    if (gb_algorithm == egbSTILL)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if (gb_algorithm == egbHCT)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if (gb_algorithm == egbOBC)
    {
        for (i = n0; i < n1; i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for (i = 0; i < nl->nri; i++)
    {
        ii     = nl->iinr[i];
        ii3    = ii*3;
        is3    = 3*nl->shift[i];
        shX    = shiftvec[is3];
        shY    = shiftvec[is3+1];
        shZ    = shiftvec[is3+2];
        nj0    = nl->jindex[i];
        nj1    = nl->jindex[i+1];

        ix     = _mm_set1_pd(shX+x[ii3+0]);
        iy     = _mm_set1_pd(shY+x[ii3+1]);
        iz     = _mm_set1_pd(shZ+x[ii3+2]);

        rbai   = _mm_load1_pd(rb+ii);
        fix    = _mm_setzero_pd();
        fiy    = _mm_setzero_pd();
        fiz    = _mm_setzero_pd();

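        /* Inner loop over pairs of j atoms: read back the two dadx terms
         * stored per pair, form the scalar force dadx_ij*rb_i + dadx_ji*rb_j,
         * project it onto the distance vector, and accumulate it on atom i
         * while subtracting it from the j atoms.
         */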
        for (k = nj0; k < nj1-1; k += 2)
        {
            jnrA        = jjnr[k];
            jnrB        = jjnr[k+1];

            j3A         = 3*jnrA;
            j3B         = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A, x+j3B, jx, jy, jz);

            dx          = _mm_sub_pd(ix, jx);
            dy          = _mm_sub_pd(iy, jy);
            dz          = _mm_sub_pd(iz, jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA, rb+jnrB, rbaj);

            /* load chain rule terms for the two j atoms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb, rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai, rbaj);
            f_gb    = _mm_add_pd(f_gb, f_gb_ai);

            tx     = _mm_mul_pd(f_gb, dx);
            ty     = _mm_mul_pd(f_gb, dy);
            tz     = _mm_mul_pd(f_gb, dz);

            fix    = _mm_add_pd(fix, tx);
            fiy    = _mm_add_pd(fiy, ty);
            fiz    = _mm_add_pd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A, f+j3B, tx, ty, tz);
        }

        /* deal with odd elements */
        if (k < nj1)
        {
            jnrA        = jjnr[k];
            j3A         = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A, jx, jy, jz);

            dx          = _mm_sub_sd(ix, jx);
            dy          = _mm_sub_sd(iy, jy);
            dz          = _mm_sub_sd(iz, jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA, rbaj);

            /* load chain rule terms */
            f_gb        = _mm_load_pd(dadx);
            dadx       += 2;
            f_gb_ai     = _mm_load_pd(dadx);
            dadx       += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb, rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai, rbaj);
            f_gb    = _mm_add_sd(f_gb, f_gb_ai);

            tx     = _mm_mul_sd(f_gb, dx);
            ty     = _mm_mul_sd(f_gb, dy);
            tz     = _mm_mul_sd(f_gb, dz);

            fix    = _mm_add_sd(fix, tx);
            fiy    = _mm_add_sd(fiy, ty);
            fiz    = _mm_add_sd(fiz, tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A, tx, ty, tz);
        }

        /* fix/fiy/fiz now contain two partial force terms that should both be
         * added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix, &fiy, &fiz, f+ii3, fshift+is3);
    }

    return 0;
}

#else
/* keep compiler happy */
int genborn_sse2_dummy;

#endif /* SSE2 intrinsics available */