2 * This source code is part of
6 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7 * Copyright (c) 2001-2009, The GROMACS Development Team
9 * Gromacs is a library for molecular simulation and trajectory analysis,
10 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11 * a full list of developers and information, check out http://www.gromacs.org
13 * This program is free software; you can redistribute it and/or modify it under
14 * the terms of the GNU Lesser General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option) any
17 * As a special exception, you may use this file as part of a free software
18 * library without restriction. Specifically, if other files instantiate
19 * templates or use macros or inline functions from this file, or you compile
20 * this file and link it with other files to produce an executable, this
21 * file does not by itself cause the resulting executable to be covered by
22 * the GNU Lesser General Public License.
24 * In plain-speak: do not worry about classes/macros/templates either - only
25 * changes to the library have to be LGPL, not an application linking with it.
27 * To help fund GROMACS development, we humbly ask that you cite
28 * the papers people have written on it - you can find them on the website!
35 #include "types/simple.h"
44 #include "genborn_allvsall.h"
47 #if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) )
49 #include <gmx_sse2_double.h>
67 int ** prologue_mask_gb;
80 gmx_allvsallgb2_data_t;
/* Return the largest j-offset above atom i that atom i interacts with in
 * the half-of-all-pairs all-vs-all scheme (each pair is computed once, by
 * the lower-indexed atom, so roughly natoms/2 neighbors per atom).
 * NOTE(review): interior lines of this routine are elided in this view;
 * the visible branches split on odd atom counts vs. multiples of four,
 * and the visible assignments all use natoms/2-1. The exact per-branch
 * logic between the visible lines cannot be confirmed from here.
 */
84 calc_maxoffset(int i,int natoms)
88 if ((natoms % 2) == 1)
90 /* Odd number of atoms, easy */
93 else if ((natoms % 4) == 0)
95 /* Multiple of four is hard */
104 maxoffset=natoms/2-1;
115 maxoffset=natoms/2-1;
128 maxoffset=natoms/2-1;
/* Build the per-i-atom index and mask data that let the SIMD all-vs-all
 * GB kernels stream over j atoms without testing exclusions per pair.
 * For each i atom, jindex_gb stores four entries: [4i]=prologue start,
 * [4i+1]=prologue end (=main start), [4i+2]=epilogue start (=main end),
 * [4i+3]=epilogue end. The prologue carries an explicit exclusion mask
 * (GB 1-2/1-3/1-4 bonded pairs, read from ilist[F_GB12/13/14]); the
 * epilogue carries only the maxoffset cutoff mask. Masks are stored as
 * pairs of 32-bit words (0xFFFFFFFF/0) so one double-precision SSE lane
 * can be masked with a single load.
 * NOTE(review): many interior lines (loop closings, some k computations)
 * are elided in this view; comments below describe only visible lines.
 */
136 setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t * aadata,
147 int ni0,ni1,nj0,nj1,nj;
151 int firstinteraction;
155 /* This routine can appear to be a bit complex, but it is mostly book-keeping.
156 * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
157 * whether they should interact or not.
159 * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
160 * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
161 * we create a jindex array with three elements per i atom: the starting point, the point to
162 * which we need to check exclusions, and the end point.
163 * This way we only have to allocate a short exclusion mask per i atom.
/* Round the local atom range [start,end) outward to whole UNROLLI tiles. */
166 ni0 = (start/UNROLLI)*UNROLLI;
167 ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;
169 /* Set the interaction mask to only enable the i atoms we want to include */
/* snew() over-allocates; the pointer is then rounded up to 16-byte alignment
 * so SSE aligned loads are legal. Same idiom is used for every mask below. */
170 snew(pi,2*(natoms+UNROLLI+2*SIMD_WIDTH));
171 aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
172 for(i=0;i<natoms+UNROLLI;i++)
/* Two identical 32-bit words per atom: one full double-precision lane. */
174 aadata->imask[2*i] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
175 aadata->imask[2*i+1] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
178 /* Allocate memory for our modified jindex array */
179 snew(aadata->jindex_gb,4*(natoms+UNROLLI));
180 for(i=0;i<4*(natoms+UNROLLI);i++)
182 aadata->jindex_gb[i] = 0;
185 /* Create the exclusion masks for the prologue part */
186 snew(aadata->prologue_mask_gb,natoms+UNROLLI); /* list of pointers */
188 /* First zero everything to avoid uninitialized data */
189 for(i=0;i<natoms+UNROLLI;i++)
191 aadata->prologue_mask_gb[i] = NULL;
194 /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
195 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
197 max_excl_offset = -1;
199 /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
200 imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
202 /* Which atom is the first we (might) interact with? */
203 imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
204 for(i=ibase;i<imax;i++)
206 /* Before exclusions, which atom is the first we (might) interact with? */
207 firstinteraction = i+1;
208 max_offset = calc_maxoffset(i,natoms);
/* Scan GB 1-2 (bond) interactions; iatoms stores triplets, the two atom
 * indices are at j+1 and j+2. Elided lines presumably advance
 * 'firstinteraction' past excluded neighbors — TODO confirm. */
212 for(j=0;j<ilist[F_GB12].nr;j+=3)
214 a1 = ilist[F_GB12].iatoms[j+1];
215 a2 = ilist[F_GB12].iatoms[j+2];
230 if(k==firstinteraction)
/* Same scan for GB 1-3 (angle) pairs. */
238 for(j=0;j<ilist[F_GB13].nr;j+=3)
240 a1 = ilist[F_GB13].iatoms[j+1];
241 a2 = ilist[F_GB13].iatoms[j+2];
256 if(k==firstinteraction)
/* Same scan for GB 1-4 (dihedral) pairs. */
264 for(j=0;j<ilist[F_GB14].nr;j+=3)
266 a1 = ilist[F_GB14].iatoms[j+1];
267 a2 = ilist[F_GB14].iatoms[j+2];
281 if(k==firstinteraction)
287 imin = (firstinteraction < imin) ? firstinteraction : imin;
289 /* round down to j unrolling factor */
290 imin = (imin/UNROLLJ)*UNROLLJ;
/* Second pass over the same tuplet: find the largest exclusion offset k
 * (relative to i) over all GB 1-2/1-3/1-4 pairs, wrapping around the
 * periodic half-list where k+natoms <= max_offset. */
292 for(i=ibase;i<imax;i++)
294 max_offset = calc_maxoffset(i,natoms);
298 for(j=0;j<ilist[F_GB12].nr;j+=3)
300 a1 = ilist[F_GB12].iatoms[j+1];
301 a2 = ilist[F_GB12].iatoms[j+2];
328 if( k+natoms <= max_offset )
332 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
337 for(j=0;j<ilist[F_GB13].nr;j+=3)
339 a1 = ilist[F_GB13].iatoms[j+1];
340 a2 = ilist[F_GB13].iatoms[j+2];
367 if( k+natoms <= max_offset )
371 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
376 for(j=0;j<ilist[F_GB14].nr;j+=3)
378 a1 = ilist[F_GB14].iatoms[j+1];
379 a2 = ilist[F_GB14].iatoms[j+2];
406 if( k+natoms <= max_offset )
410 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
415 /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
417 /* round up to j unrolling factor */
418 max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
420 /* Set all the prologue masks length to this value (even for i>end) */
421 for(i=ibase;i<ibase+UNROLLI;i++)
423 aadata->jindex_gb[4*i] = imin;
424 aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
428 /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
429 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
431 for(i=ibase;i<ibase+UNROLLI;i++)
433 nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
434 imin = aadata->jindex_gb[4*i];
436 /* Allocate aligned memory */
437 snew(pi,2*(nj+2*SIMD_WIDTH));
438 aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
440 max_offset = calc_maxoffset(i,natoms);
442 /* Include interactions i+1 <= j < i+maxoffset */
447 if( (j>i) && (j<=i+max_offset) )
449 aadata->prologue_mask_gb[i][2*k] = 0xFFFFFFFF;
450 aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
454 aadata->prologue_mask_gb[i][2*k] = 0;
455 aadata->prologue_mask_gb[i][2*k+1] = 0;
459 /* Clear out the explicit exclusions */
464 for(j=0;j<ilist[F_GB12].nr;j+=3)
466 a1 = ilist[F_GB12].iatoms[j+1];
467 a2 = ilist[F_GB12].iatoms[j+2];
488 if( k+natoms <= max_offset )
496 aadata->prologue_mask_gb[i][2*k] = 0;
497 aadata->prologue_mask_gb[i][2*k+1] = 0;
503 for(j=0;j<ilist[F_GB13].nr;j+=3)
505 a1 = ilist[F_GB13].iatoms[j+1];
506 a2 = ilist[F_GB13].iatoms[j+2];
527 if( k+natoms <= max_offset )
535 aadata->prologue_mask_gb[i][2*k] = 0;
536 aadata->prologue_mask_gb[i][2*k+1] = 0;
542 for(j=0;j<ilist[F_GB14].nr;j+=3)
544 a1 = ilist[F_GB14].iatoms[j+1];
545 a2 = ilist[F_GB14].iatoms[j+2];
566 if( k+natoms <= max_offset )
574 aadata->prologue_mask_gb[i][2*k] = 0;
575 aadata->prologue_mask_gb[i][2*k+1] = 0;
583 /* Construct the epilogue mask - this just contains the check for maxoffset */
584 snew(aadata->epilogue_mask,natoms+UNROLLI);
586 /* First zero everything to avoid uninitialized data */
587 for(i=0;i<natoms+UNROLLI;i++)
/* Default: zero-length epilogue (start == end == prologue end). */
589 aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
590 aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
591 aadata->epilogue_mask[i] = NULL;
594 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
596 /* Find the lowest index for which we need to use the epilogue */
598 max_offset = calc_maxoffset(imin,natoms);
600 imin = imin + 1 + max_offset;
602 /* Find largest index for which we need to use the epilogue */
603 imax = ibase + UNROLLI-1;
604 imax = (imax < end) ? imax : end;
606 max_offset = calc_maxoffset(imax,natoms);
607 imax = imax + 1 + max_offset + UNROLLJ - 1;
609 for(i=ibase;i<ibase+UNROLLI;i++)
611 /* Start of epilogue - round down to j tile limit */
612 aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
613 /* Make sure we dont overlap - for small systems everything is done in the prologue */
614 aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
615 /* Round upwards to j tile limit */
616 aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
617 /* Make sure we dont have a negative range for the epilogue */
618 aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
622 /* And fill it with data... */
624 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
626 for(i=ibase;i<ibase+UNROLLI;i++)
629 nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
631 /* Allocate aligned memory */
632 snew(pi,2*(nj+2*SIMD_WIDTH));
633 aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
635 max_offset = calc_maxoffset(i,natoms);
/* Epilogue mask enables j only while it is within i's maxoffset range. */
639 j = aadata->jindex_gb[4*i+2] + k;
640 aadata->epilogue_mask[i][2*k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
641 aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
/* Allocate and initialize the all-vs-all GB work structure (*p_aadata):
 * 16-byte-aligned coordinate/force/radius/work arrays sized 2*natoms plus
 * SIMD padding, per-atom GB radii (minus radius_offset) and the
 * algorithm-specific work parameter (Still: born->vsolv, OBC: born->param),
 * each duplicated into [natoms, 2*natoms) so the wrapped half-list can be
 * addressed without modulo. Finishes by building exclusion masks/indices
 * via setup_gb_exclusions_and_indices().
 * NOTE(review): several lines (snew of aadata itself, some parameters,
 * loop braces) are elided in this view; only visible lines are described.
 */
649 genborn_allvsall_setup(gmx_allvsallgb2_data_t ** p_aadata,
650 gmx_localtop_t * top,
651 gmx_genborn_t * born,
653 double radius_offset,
661 gmx_allvsallgb2_data_t *aadata;
664 natoms = mdatoms->nr;
/* Each array is over-allocated with snew() and the raw pointer rounded up
 * to the next 16-byte boundary so _mm_load_pd/_mm_store_pd stay aligned. */
669 snew(p,2*natoms+2*SIMD_WIDTH);
670 aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
671 snew(p,2*natoms+2*SIMD_WIDTH);
672 aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
673 snew(p,2*natoms+2*SIMD_WIDTH);
674 aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
675 snew(p,2*natoms+2*SIMD_WIDTH);
676 aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
677 snew(p,2*natoms+2*SIMD_WIDTH);
678 aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
679 snew(p,2*natoms+2*SIMD_WIDTH);
680 aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
682 snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
683 aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));
685 snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
686 aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));
688 snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
689 aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15)));
691 for(i=0;i<mdatoms->nr;i++)
/* Per-atom GB radius from the atom type table, shifted by radius_offset
 * (the OBC dielectric offset is passed here by the OBC caller). */
693 aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
694 if(gb_algorithm==egbSTILL)
696 aadata->workparam[i] = born->vsolv[i];
698 else if(gb_algorithm==egbOBC)
700 aadata->workparam[i] = born->param[i];
702 aadata->work[i] = 0.0;
/* Mirror the first natoms entries so wrapped j indices need no modulo. */
704 for(i=0;i<mdatoms->nr;i++)
706 aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
707 aadata->workparam[natoms+i] = aadata->workparam[i];
708 aadata->work[natoms+i] = aadata->work[i];
711 for(i=0;i<2*natoms+SIMD_WIDTH;i++)
713 aadata->x_align[i] = 0.0;
714 aadata->y_align[i] = 0.0;
715 aadata->z_align[i] = 0.0;
716 aadata->fx_align[i] = 0.0;
717 aadata->fy_align[i] = 0.0;
718 aadata->fz_align[i] = 0.0;
721 setup_gb_exclusions_and_indices(aadata,top->idef.il,mdatoms->start,mdatoms->start+mdatoms->homenr,mdatoms->nr,
722 bInclude12,bInclude13,bInclude14);
727 * This routine apparently hits a compiler bug visual studio has had 'forever'.
728 * It is present both in VS2005 and VS2008, and the only way around it is to
729 * decrease optimization. We do that with a pragma, and only for MSVC, so it
730 * will not hurt any of the well-behaving and supported compilers out there.
731 * MS: Fix your compiler, it sucks like a black hole!
734 #pragma optimize("t",off)
/* SSE2 double-precision all-vs-all computation of Still GB polarization
 * radii. Processes UNROLLI (=2) i atoms per outer iteration, 2 j atoms per
 * SIMD lane, in three phases per i pair driven by jindex_gb:
 *   prologue  [nj0,nj1): applies per-pair exclusion masks,
 *   main      [nj1,nj2): no exclusions, only the i-atom ownership mask,
 *   epilogue  [nj2,nj3): applies the maxoffset (half-list) cutoff mask.
 * Each phase accumulates the Still pairwise term ccf/r^4 into gpi (i side)
 * and work[] (j side), and stores the r^6 chain-rule terms into dadx for
 * the later force pass. Finishes by folding the wrapped tail of work[]
 * back onto [0,natoms), an optional parallel reduction, and converting
 * gpi sums into Born radii and invsqrta.
 * NOTE(review): this view elides many lines (parameter list tail, nj0..nj3
 * loads, dadx increments, x_align fill, loop braces). Comments describe
 * only what the visible lines establish.
 */
738 genborn_allvsall_calc_still_radii_sse2_double(t_forcerec * fr,
740 gmx_genborn_t * born,
741 gmx_localtop_t * top,
746 gmx_allvsallgb2_data_t *aadata;
762 double irsq,idr4,idr6;
763 double raj,rvdw,ratio;
764 double vaj,ccf,dccf,theta,cosq;
765 double term,prod,icf4,icf6,gpi2,factor,sinq;
776 __m128d ix_SSE0,iy_SSE0,iz_SSE0;
777 __m128d ix_SSE1,iy_SSE1,iz_SSE1;
778 __m128d gpi_SSE0,rai_SSE0,prod_ai_SSE0;
779 __m128d gpi_SSE1,rai_SSE1,prod_ai_SSE1;
780 __m128d imask_SSE0,jmask_SSE0;
781 __m128d imask_SSE1,jmask_SSE1;
782 __m128d jx_SSE,jy_SSE,jz_SSE;
783 __m128d dx_SSE0,dy_SSE0,dz_SSE0;
784 __m128d dx_SSE1,dy_SSE1,dz_SSE1;
785 __m128d rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
786 __m128d rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
787 __m128d raj_SSE,vaj_SSE,prod_SSE;
788 __m128d rvdw_SSE0,ratio_SSE0;
789 __m128d rvdw_SSE1,ratio_SSE1;
790 __m128d theta_SSE0,sinq_SSE0,cosq_SSE0,term_SSE0;
791 __m128d theta_SSE1,sinq_SSE1,cosq_SSE1,term_SSE1;
792 __m128d ccf_SSE0,dccf_SSE0;
793 __m128d ccf_SSE1,dccf_SSE1;
794 __m128d icf4_SSE0,icf6_SSE0;
795 __m128d icf4_SSE1,icf6_SSE1;
796 __m128d half_SSE,one_SSE,two_SSE,four_SSE;
797 __m128d still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
799 natoms = mdatoms->nr;
800 ni0 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
801 ni1 = mdatoms->start+mdatoms->homenr;
/* Lazily create the work structure on first call and cache it in
 * fr (via paadata) for reuse on subsequent steps. */
805 aadata = *((gmx_allvsallgb2_data_t **)paadata);
810 genborn_allvsall_setup(&aadata,top,born,mdatoms,0.0,
811 egbSTILL,FALSE,FALSE,TRUE);
812 *((gmx_allvsallgb2_data_t **)paadata) = aadata;
815 x_align = aadata->x_align;
816 y_align = aadata->y_align;
817 z_align = aadata->z_align;
819 gb_radius = aadata->gb_radius;
820 vsolv = aadata->workparam;
822 jindex = aadata->jindex_gb;
/* Broadcast the Still model constants into SIMD registers. */
825 still_p4_SSE = _mm_set1_pd(STILL_P4);
826 still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
827 still_pip5_SSE = _mm_set1_pd(STILL_PIP5);
828 half_SSE = _mm_set1_pd(0.5);
829 one_SSE = _mm_set1_pd(1.0);
830 two_SSE = _mm_set1_pd(2.0);
831 four_SSE = _mm_set1_pd(4.0);
833 /* This will be summed, so it has to extend to natoms + buffer */
834 for(i=0;i<natoms+1+natoms/2;i++)
/* Copy coordinates into the aligned (and wrapped) work arrays;
 * the x_align assignment line is elided in this view. */
839 for(i=ni0;i<ni1+1+natoms/2;i++)
843 y_align[i] = x[3*k+1];
844 z_align[i] = x[3*k+2];
848 for(i=ni0; i<ni1; i+=UNROLLI)
850 /* We assume shifts are NOT used for all-vs-all interactions */
851 /* Load i atom data */
852 ix_SSE0 = _mm_load1_pd(x_align+i);
853 iy_SSE0 = _mm_load1_pd(y_align+i);
854 iz_SSE0 = _mm_load1_pd(z_align+i);
855 ix_SSE1 = _mm_load1_pd(x_align+i+1);
856 iy_SSE1 = _mm_load1_pd(y_align+i+1);
857 iz_SSE1 = _mm_load1_pd(z_align+i+1);
859 gpi_SSE0 = _mm_setzero_pd();
860 gpi_SSE1 = _mm_setzero_pd();
862 rai_SSE0 = _mm_load1_pd(gb_radius+i);
863 rai_SSE1 = _mm_load1_pd(gb_radius+i+1);
/* P4 * solvation volume of atom i: the j-side accumulation factor. */
865 prod_ai_SSE0 = _mm_set1_pd(STILL_P4*vsolv[i]);
866 prod_ai_SSE1 = _mm_set1_pd(STILL_P4*vsolv[i+1]);
868 /* Load limits for loop over neighbors */
874 pmask0 = aadata->prologue_mask_gb[i];
875 pmask1 = aadata->prologue_mask_gb[i+1];
876 emask0 = aadata->epilogue_mask[i];
877 emask1 = aadata->epilogue_mask[i+1];
/* imask selects only locally-owned i atoms (see setup routine). */
879 imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
880 imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
882 /* Prologue part, including exclusion mask */
883 for(j=nj0; j<nj1; j+=UNROLLJ)
885 jmask_SSE0 = _mm_load_pd((double *)pmask0);
886 jmask_SSE1 = _mm_load_pd((double *)pmask1);
890 /* load j atom coordinates */
891 jx_SSE = _mm_load_pd(x_align+j);
892 jy_SSE = _mm_load_pd(y_align+j);
893 jz_SSE = _mm_load_pd(z_align+j);
895 /* Calculate distance */
896 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
897 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
898 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
899 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
900 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
901 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
903 /* rsq = dx*dx+dy*dy+dz*dz */
904 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
905 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
/* Combine the per-pair exclusion mask with the i-ownership mask. */
908 jmask_SSE0 = _mm_and_pd(jmask_SSE0,imask_SSE0);
909 jmask_SSE1 = _mm_and_pd(jmask_SSE1,imask_SSE1);
911 /* Calculate 1/r and 1/r2 */
912 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
913 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
/* Zeroing rinv via the mask zeroes every term derived from it below,
 * so masked-out pairs contribute nothing. */
916 rinv_SSE0 = _mm_and_pd(rinv_SSE0,jmask_SSE0);
917 rinv_SSE1 = _mm_and_pd(rinv_SSE1,jmask_SSE1);
919 irsq_SSE0 = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
920 irsq_SSE1 = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
921 idr4_SSE0 = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
922 idr4_SSE1 = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
923 idr6_SSE0 = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
924 idr6_SSE1 = _mm_mul_pd(idr4_SSE1,irsq_SSE1);
926 raj_SSE = _mm_load_pd(gb_radius+j);
927 vaj_SSE = _mm_load_pd(vsolv+j);
929 rvdw_SSE0 = _mm_add_pd(rai_SSE0,raj_SSE);
930 rvdw_SSE1 = _mm_add_pd(rai_SSE1,raj_SSE);
/* ratio = r^2/(rai+raj)^2, clamped so theta stays within the Still
 * close-contact correction range. */
932 ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
933 ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));
935 ratio_SSE0 = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
936 ratio_SSE1 = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
937 theta_SSE0 = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
938 theta_SSE1 = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
939 gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
940 gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
/* ccf = (0.5*(1-cos(theta)))^2; dccf is its derivative factor. */
941 term_SSE0 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
942 term_SSE1 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
943 ccf_SSE0 = _mm_mul_pd(term_SSE0,term_SSE0);
944 ccf_SSE1 = _mm_mul_pd(term_SSE1,term_SSE1);
945 dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
946 _mm_mul_pd(sinq_SSE0,theta_SSE0));
947 dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
948 _mm_mul_pd(sinq_SSE1,theta_SSE1));
950 prod_SSE = _mm_mul_pd(still_p4_SSE,vaj_SSE);
951 icf4_SSE0 = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
952 icf4_SSE1 = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
953 icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
954 icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
/* j-side accumulation: both i lanes contribute to work[j..j+1]. */
956 _mm_store_pd(work+j , _mm_add_pd(_mm_load_pd(work+j),
957 _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
958 _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));
961 gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
962 gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));
964 /* Save ai->aj and aj->ai chain rule terms */
/* dadx pointer increments between these stores are elided in this view. */
965 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
967 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));
970 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
972 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
976 /* Main part, no exclusions */
977 for(j=nj1; j<nj2; j+=UNROLLJ)
980 /* load j atom coordinates */
981 jx_SSE = _mm_load_pd(x_align+j);
982 jy_SSE = _mm_load_pd(y_align+j);
983 jz_SSE = _mm_load_pd(z_align+j);
985 /* Calculate distance */
986 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
987 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
988 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
989 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
990 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
991 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
993 /* rsq = dx*dx+dy*dy+dz*dz */
994 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
995 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
997 /* Calculate 1/r and 1/r2 */
998 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
999 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
/* Only the i-ownership mask is needed here: no exclusions in range. */
1002 rinv_SSE0 = _mm_and_pd(rinv_SSE0,imask_SSE0);
1003 rinv_SSE1 = _mm_and_pd(rinv_SSE1,imask_SSE1);
1005 irsq_SSE0 = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
1006 irsq_SSE1 = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
1007 idr4_SSE0 = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
1008 idr4_SSE1 = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
1009 idr6_SSE0 = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
1010 idr6_SSE1 = _mm_mul_pd(idr4_SSE1,irsq_SSE1);
1012 raj_SSE = _mm_load_pd(gb_radius+j);
1014 rvdw_SSE0 = _mm_add_pd(rai_SSE0,raj_SSE);
1015 rvdw_SSE1 = _mm_add_pd(rai_SSE1,raj_SSE);
1016 vaj_SSE = _mm_load_pd(vsolv+j);
1018 ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
1019 ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));
1021 ratio_SSE0 = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
1022 ratio_SSE1 = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
1023 theta_SSE0 = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
1024 theta_SSE1 = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
1025 gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
1026 gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
1027 term_SSE0 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
1028 term_SSE1 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
1029 ccf_SSE0 = _mm_mul_pd(term_SSE0,term_SSE0);
1030 ccf_SSE1 = _mm_mul_pd(term_SSE1,term_SSE1);
1031 dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
1032 _mm_mul_pd(sinq_SSE0,theta_SSE0));
1033 dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
1034 _mm_mul_pd(sinq_SSE1,theta_SSE1));
1036 prod_SSE = _mm_mul_pd(still_p4_SSE,vaj_SSE );
1037 icf4_SSE0 = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
1038 icf4_SSE1 = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
1039 icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1040 icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1042 _mm_store_pd(work+j , _mm_add_pd(_mm_load_pd(work+j),
1043 _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
1044 _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));
1046 gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
1047 gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));
1049 /* Save ai->aj and aj->ai chain rule terms */
1050 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
1052 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));
1055 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
1057 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
1060 /* Epilogue part, including exclusion mask */
1061 for(j=nj2; j<nj3; j+=UNROLLJ)
1063 jmask_SSE0 = _mm_load_pd((double *)emask0);
1064 jmask_SSE1 = _mm_load_pd((double *)emask1);
1065 emask0 += 2*UNROLLJ;
1066 emask1 += 2*UNROLLJ;
1068 /* load j atom coordinates */
1069 jx_SSE = _mm_load_pd(x_align+j);
1070 jy_SSE = _mm_load_pd(y_align+j);
1071 jz_SSE = _mm_load_pd(z_align+j);
1073 /* Calculate distance */
1074 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
1075 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
1076 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
1077 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
1078 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
1079 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
1081 /* rsq = dx*dx+dy*dy+dz*dz */
1082 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
1083 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
1086 jmask_SSE0 = _mm_and_pd(jmask_SSE0,imask_SSE0);
1087 jmask_SSE1 = _mm_and_pd(jmask_SSE1,imask_SSE1);
1089 /* Calculate 1/r and 1/r2 */
1090 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1091 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1094 rinv_SSE0 = _mm_and_pd(rinv_SSE0,jmask_SSE0);
1095 rinv_SSE1 = _mm_and_pd(rinv_SSE1,jmask_SSE1);
1097 irsq_SSE0 = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
1098 irsq_SSE1 = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
1099 idr4_SSE0 = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
1100 idr4_SSE1 = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
1101 idr6_SSE0 = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
1102 idr6_SSE1 = _mm_mul_pd(idr4_SSE1,irsq_SSE1);
1104 raj_SSE = _mm_load_pd(gb_radius+j);
1105 vaj_SSE = _mm_load_pd(vsolv+j);
1107 rvdw_SSE0 = _mm_add_pd(rai_SSE0,raj_SSE);
1108 rvdw_SSE1 = _mm_add_pd(rai_SSE1,raj_SSE);
1110 ratio_SSE0 = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
1111 ratio_SSE1 = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));
1113 ratio_SSE0 = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
1114 ratio_SSE1 = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
1115 theta_SSE0 = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
1116 theta_SSE1 = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
1117 gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
1118 gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
1119 term_SSE0 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
1120 term_SSE1 = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
1121 ccf_SSE0 = _mm_mul_pd(term_SSE0,term_SSE0);
1122 ccf_SSE1 = _mm_mul_pd(term_SSE1,term_SSE1);
1123 dccf_SSE0 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
1124 _mm_mul_pd(sinq_SSE0,theta_SSE0));
1125 dccf_SSE1 = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
1126 _mm_mul_pd(sinq_SSE1,theta_SSE1));
1128 prod_SSE = _mm_mul_pd(still_p4_SSE,vaj_SSE);
1129 icf4_SSE0 = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
1130 icf4_SSE1 = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
1131 icf6_SSE0 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1132 icf6_SSE1 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1134 _mm_store_pd(work+j , _mm_add_pd(_mm_load_pd(work+j),
1135 _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
1136 _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));
1138 gpi_SSE0 = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
1139 gpi_SSE1 = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));
1141 /* Save ai->aj and aj->ai chain rule terms */
1142 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
1144 _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));
1147 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
1149 _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
/* Transpose the two i-lane accumulators and reduce both lanes into
 * work[i..i+1] alongside the j-side contributions. */
1152 GMX_MM_TRANSPOSE2_PD(gpi_SSE0,gpi_SSE1);
1153 gpi_SSE0 = _mm_add_pd(gpi_SSE0,gpi_SSE1);
1154 _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
1157 /* In case we have written anything beyond natoms, move it back.
1158 * Never mind that we leave stuff above natoms; that will not
1159 * be accessed later in the routine.
1160 * In principle this should be a move rather than sum, but this
1161 * way we dont have to worry about even/odd offsets...
1163 for(i=natoms;i<ni1+1+natoms/2;i++)
1165 work[i-natoms] += work[i];
1168 /* Parallel summations */
1171 gmx_sum(natoms,work,cr);
1174 factor = 0.5 * ONE_4PI_EPS0;
1175 /* Calculate the radii - should we do all atoms, or just our local ones? */
1176 for(i=0;i<natoms;i++)
1178 if(born->use[i] != 0)
/* gpi = self term + accumulated pair sum; the gpi2 line is elided here.
 * bRad = factor/sqrt(gpi2); invsqrta caches 1/sqrt(bRad) for the
 * screened-Coulomb GB energy kernels. */
1180 gpi = born->gpol[i]+work[i];
1182 born->bRad[i] = factor*gmx_invsqrt(gpi2);
1183 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1189 /* Reinstate MSVC optimization */
1191 #pragma optimize("",on)
1196 genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec * fr,
1197 t_mdatoms * mdatoms,
1198 gmx_genborn_t * born,
1200 gmx_localtop_t * top,
1205 gmx_allvsallgb2_data_t *aadata;
1208 int nj0,nj1,nj2,nj3;
1226 double rai,rai_inv,rai_inv2,sum_ai,sum_ai2,sum_ai3,tsum,tchain;
1228 __m128d ix_SSE0,iy_SSE0,iz_SSE0;
1229 __m128d ix_SSE1,iy_SSE1,iz_SSE1;
1230 __m128d gpi_SSE0,rai_SSE0,prod_ai_SSE0;
1231 __m128d gpi_SSE1,rai_SSE1,prod_ai_SSE1;
1232 __m128d imask_SSE0,jmask_SSE0;
1233 __m128d imask_SSE1,jmask_SSE1;
1234 __m128d jx_SSE,jy_SSE,jz_SSE;
1235 __m128d dx_SSE0,dy_SSE0,dz_SSE0;
1236 __m128d dx_SSE1,dy_SSE1,dz_SSE1;
1237 __m128d rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
1238 __m128d rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
1239 __m128d raj_SSE,raj_inv_SSE,sk_aj_SSE,sk2_aj_SSE;
1240 __m128d ccf_SSE0,dccf_SSE0,prod_SSE0;
1241 __m128d ccf_SSE1,dccf_SSE1,prod_SSE1;
1242 __m128d icf4_SSE0,icf6_SSE0;
1243 __m128d icf4_SSE1,icf6_SSE1;
1244 __m128d oneeighth_SSE,onefourth_SSE,half_SSE,one_SSE,two_SSE,four_SSE;
1245 __m128d still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
1246 __m128d rai_inv_SSE0;
1247 __m128d rai_inv_SSE1;
1248 __m128d sk_ai_SSE0,sk2_ai_SSE0,sum_ai_SSE0;
1249 __m128d sk_ai_SSE1,sk2_ai_SSE1,sum_ai_SSE1;
1250 __m128d lij_inv_SSE0,sk2_rinv_SSE0;
1251 __m128d lij_inv_SSE1,sk2_rinv_SSE1;
1254 __m128d t1_SSE0,t2_SSE0,t3_SSE0,t4_SSE0;
1255 __m128d t1_SSE1,t2_SSE1,t3_SSE1,t4_SSE1;
1256 __m128d obc_mask1_SSE0,obc_mask2_SSE0,obc_mask3_SSE0;
1257 __m128d obc_mask1_SSE1,obc_mask2_SSE1,obc_mask3_SSE1;
1258 __m128d uij_SSE0,uij2_SSE0,uij3_SSE0;
1259 __m128d uij_SSE1,uij2_SSE1,uij3_SSE1;
1260 __m128d lij_SSE0,lij2_SSE0,lij3_SSE0;
1261 __m128d lij_SSE1,lij2_SSE1,lij3_SSE1;
1262 __m128d dlij_SSE0,diff2_SSE0,logterm_SSE0;
1263 __m128d dlij_SSE1,diff2_SSE1,logterm_SSE1;
1264 __m128d doffset_SSE,tmpSSE;
1266 natoms = mdatoms->nr;
1267 ni0 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
1268 ni1 = mdatoms->start+mdatoms->homenr;
1272 aadata = *((gmx_allvsallgb2_data_t **)paadata);
1277 genborn_allvsall_setup(&aadata,top,born,mdatoms,born->gb_doffset,
1278 egbOBC,TRUE,TRUE,TRUE);
1279 *((gmx_allvsallgb2_data_t **)paadata) = aadata;
1282 x_align = aadata->x_align;
1283 y_align = aadata->y_align;
1284 z_align = aadata->z_align;
1286 gb_radius = aadata->gb_radius;
1287 work = aadata->work;
1288 jindex = aadata->jindex_gb;
1290 obc_param = aadata->workparam;
1292 oneeighth_SSE = _mm_set1_pd(0.125);
1293 onefourth_SSE = _mm_set1_pd(0.25);
1294 half_SSE = _mm_set1_pd(0.5);
1295 one_SSE = _mm_set1_pd(1.0);
1296 two_SSE = _mm_set1_pd(2.0);
1297 four_SSE = _mm_set1_pd(4.0);
1298 doffset_SSE = _mm_set1_pd(born->gb_doffset);
1300 for(i=0;i<natoms;i++)
1302 x_align[i] = x[3*i];
1303 y_align[i] = x[3*i+1];
1304 z_align[i] = x[3*i+2];
1308 for(i=0;i<natoms/2+1;i++)
1310 x_align[natoms+i] = x_align[i];
1311 y_align[natoms+i] = y_align[i];
1312 z_align[natoms+i] = z_align[i];
1315 for(i=0;i<natoms+natoms/2+1;i++)
1320 for(i=ni0; i<ni1; i+= UNROLLI)
1322 /* We assume shifts are NOT used for all-vs-all interactions */
1324 /* Load i atom data */
1325 ix_SSE0 = _mm_load1_pd(x_align+i);
1326 iy_SSE0 = _mm_load1_pd(y_align+i);
1327 iz_SSE0 = _mm_load1_pd(z_align+i);
1328 ix_SSE1 = _mm_load1_pd(x_align+i+1);
1329 iy_SSE1 = _mm_load1_pd(y_align+i+1);
1330 iz_SSE1 = _mm_load1_pd(z_align+i+1);
1332 rai_SSE0 = _mm_load1_pd(gb_radius+i);
1333 rai_SSE1 = _mm_load1_pd(gb_radius+i+1);
1334 rai_inv_SSE0 = gmx_mm_inv_pd(rai_SSE0);
1335 rai_inv_SSE1 = gmx_mm_inv_pd(rai_SSE1);
1337 sk_ai_SSE0 = _mm_load1_pd(obc_param+i);
1338 sk_ai_SSE1 = _mm_load1_pd(obc_param+i+1);
1339 sk2_ai_SSE0 = _mm_mul_pd(sk_ai_SSE0,sk_ai_SSE0);
1340 sk2_ai_SSE1 = _mm_mul_pd(sk_ai_SSE1,sk_ai_SSE1);
1342 sum_ai_SSE0 = _mm_setzero_pd();
1343 sum_ai_SSE1 = _mm_setzero_pd();
1345 /* Load limits for loop over neighbors */
1347 nj1 = jindex[4*i+1];
1348 nj2 = jindex[4*i+2];
1349 nj3 = jindex[4*i+3];
1351 pmask0 = aadata->prologue_mask_gb[i];
1352 pmask1 = aadata->prologue_mask_gb[i+1];
1353 emask0 = aadata->epilogue_mask[i];
1354 emask1 = aadata->epilogue_mask[i+1];
1356 imask_SSE0 = _mm_load1_pd((double *)(aadata->imask+2*i));
1357 imask_SSE1 = _mm_load1_pd((double *)(aadata->imask+2*i+2));
1359 /* Prologue part, including exclusion mask */
1360 for(j=nj0; j<nj1; j+=UNROLLJ)
1362 jmask_SSE0 = _mm_load_pd((double *)pmask0);
1363 jmask_SSE1 = _mm_load_pd((double *)pmask1);
1364 pmask0 += 2*UNROLLJ;
1365 pmask1 += 2*UNROLLJ;
1367 /* load j atom coordinates */
1368 jx_SSE = _mm_load_pd(x_align+j);
1369 jy_SSE = _mm_load_pd(y_align+j);
1370 jz_SSE = _mm_load_pd(z_align+j);
1372 /* Calculate distance */
1373 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
1374 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
1375 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
1376 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
1377 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
1378 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
1380 /* rsq = dx*dx+dy*dy+dz*dz */
1381 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
1382 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
1385 jmask_SSE0 = _mm_and_pd(jmask_SSE0,imask_SSE0);
1386 jmask_SSE1 = _mm_and_pd(jmask_SSE1,imask_SSE1);
1388 /* Calculate 1/r and 1/r2 */
1389 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1390 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1393 rinv_SSE0 = _mm_and_pd(rinv_SSE0,jmask_SSE0);
1394 rinv_SSE1 = _mm_and_pd(rinv_SSE1,jmask_SSE1);
1396 dr_SSE0 = _mm_mul_pd(rsq_SSE0,rinv_SSE0);
1397 dr_SSE1 = _mm_mul_pd(rsq_SSE1,rinv_SSE1);
1399 sk_aj_SSE = _mm_load_pd(obc_param+j);
1400 raj_SSE = _mm_load_pd(gb_radius+j);
1401 raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
1403 /* Evaluate influence of atom aj -> ai */
1404 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_aj_SSE);
1405 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_aj_SSE);
1406 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_aj_SSE);
1407 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_aj_SSE);
1408 t3_SSE0 = _mm_sub_pd(sk_aj_SSE,dr_SSE0);
1409 t3_SSE1 = _mm_sub_pd(sk_aj_SSE,dr_SSE1);
1411 obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1412 obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1413 obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1414 obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1415 obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1416 obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1417 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
1418 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
1420 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1421 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1422 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1423 _mm_andnot_pd(obc_mask2_SSE0,rai_inv_SSE0));
1424 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1425 _mm_andnot_pd(obc_mask2_SSE1,rai_inv_SSE1));
1426 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1427 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1429 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1430 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1431 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1432 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1433 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1434 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1435 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1436 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1438 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1439 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1440 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1441 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1442 sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE,sk_aj_SSE);
1443 sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE0);
1444 sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE1);
1445 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1446 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1448 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1449 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1451 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
1452 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
1453 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1454 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1456 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1457 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1460 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1461 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1462 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1463 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1464 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE0,lij_SSE0));
1465 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE1,lij_SSE1));
1466 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1467 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1468 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1469 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1471 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1472 sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1474 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1475 _mm_mul_pd(prod_SSE0,lij3_SSE0));
1476 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1477 _mm_mul_pd(prod_SSE1,lij3_SSE1));
1478 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1479 _mm_mul_pd(onefourth_SSE,
1480 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1481 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1482 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1483 _mm_mul_pd(onefourth_SSE,
1484 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1485 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1487 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1488 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1489 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1490 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1491 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1492 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1493 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1494 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1495 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1496 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1497 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1498 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1499 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1500 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1501 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1502 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1503 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1504 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1506 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1507 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1508 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1510 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1512 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1513 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1514 _mm_add_pd(t2_SSE0,t3_SSE0)));
1515 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1516 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1517 _mm_add_pd(t2_SSE1,t3_SSE1)));
1519 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1521 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1524 /* Evaluate influence of atom ai -> aj */
1525 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_ai_SSE0);
1526 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_ai_SSE1);
1527 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_ai_SSE0);
1528 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_ai_SSE1);
1529 t3_SSE0 = _mm_sub_pd(sk_ai_SSE0,dr_SSE0);
1530 t3_SSE1 = _mm_sub_pd(sk_ai_SSE1,dr_SSE1);
1532 obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1533 obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1534 obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1535 obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1536 obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1537 obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1538 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
1539 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
1541 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1542 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1543 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1544 _mm_andnot_pd(obc_mask2_SSE0,raj_inv_SSE));
1545 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1546 _mm_andnot_pd(obc_mask2_SSE1,raj_inv_SSE));
1547 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1548 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1550 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1551 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1552 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1553 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1554 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1555 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1556 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1557 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1559 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1560 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1561 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1562 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1563 sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0,rinv_SSE0);
1564 sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1,rinv_SSE1);
1565 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1566 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1568 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1569 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1570 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
1571 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
1572 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1573 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1575 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1576 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1578 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1579 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1580 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1581 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1582 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE0));
1583 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE1));
1584 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1585 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1586 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1587 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1589 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1590 _mm_add_pd(_mm_and_pd(t1_SSE0,obc_mask1_SSE0),
1591 _mm_and_pd(t1_SSE1,obc_mask1_SSE1))));
1593 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1594 _mm_mul_pd(prod_SSE0,lij3_SSE0));
1595 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1596 _mm_mul_pd(prod_SSE1,lij3_SSE1));
1597 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1598 _mm_mul_pd(onefourth_SSE,
1599 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1600 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1601 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1602 _mm_mul_pd(onefourth_SSE,
1603 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1604 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1605 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1606 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1607 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1608 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1609 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1610 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1611 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1612 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1613 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1614 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1615 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1616 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1618 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1619 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1620 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1621 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1623 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1624 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1626 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1627 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1628 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1630 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1633 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1634 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1635 _mm_add_pd(t2_SSE0,t3_SSE0)));
1636 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1637 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1638 _mm_add_pd(t2_SSE1,t3_SSE1)));
1640 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1642 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1646 /* Main part, no exclusions */
1647 for(j=nj1; j<nj2; j+=UNROLLJ)
1649 /* load j atom coordinates */
1650 jx_SSE = _mm_load_pd(x_align+j);
1651 jy_SSE = _mm_load_pd(y_align+j);
1652 jz_SSE = _mm_load_pd(z_align+j);
1654 /* Calculate distance */
1655 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
1656 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
1657 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
1658 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
1659 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
1660 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
1662 /* rsq = dx*dx+dy*dy+dz*dz */
1663 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
1664 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
1666 /* Calculate 1/r and 1/r2 */
1667 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1668 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1671 rinv_SSE0 = _mm_and_pd(rinv_SSE0,imask_SSE0);
1672 rinv_SSE1 = _mm_and_pd(rinv_SSE1,imask_SSE1);
1674 dr_SSE0 = _mm_mul_pd(rsq_SSE0,rinv_SSE0);
1675 dr_SSE1 = _mm_mul_pd(rsq_SSE1,rinv_SSE1);
1677 sk_aj_SSE = _mm_load_pd(obc_param+j);
1678 raj_SSE = _mm_load_pd(gb_radius+j);
1680 raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
1682 /* Evaluate influence of atom aj -> ai */
1683 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_aj_SSE);
1684 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_aj_SSE);
1685 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_aj_SSE);
1686 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_aj_SSE);
1687 t3_SSE0 = _mm_sub_pd(sk_aj_SSE,dr_SSE0);
1688 t3_SSE1 = _mm_sub_pd(sk_aj_SSE,dr_SSE1);
1690 obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1691 obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1692 obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1693 obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1694 obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1695 obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1696 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,imask_SSE0);
1697 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,imask_SSE1);
1699 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1700 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1701 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1702 _mm_andnot_pd(obc_mask2_SSE0,rai_inv_SSE0));
1703 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1704 _mm_andnot_pd(obc_mask2_SSE1,rai_inv_SSE1));
1705 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1706 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1708 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1709 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1710 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1711 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1712 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1713 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1714 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1715 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1717 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1718 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1719 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1720 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1721 sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE,sk_aj_SSE);
1722 sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE0);
1723 sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE1);
1724 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1725 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1727 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1728 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1730 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
1731 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
1732 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1733 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1735 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1736 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1739 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1740 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1741 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1742 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1743 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE0,lij_SSE0));
1744 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE1,lij_SSE1));
1745 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1746 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1747 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1748 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1750 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1751 sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1753 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1754 _mm_mul_pd(prod_SSE0,lij3_SSE0));
1755 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1756 _mm_mul_pd(prod_SSE1,lij3_SSE1));
1758 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1759 _mm_mul_pd(onefourth_SSE,
1760 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1761 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1762 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1763 _mm_mul_pd(onefourth_SSE,
1764 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1765 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1767 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1768 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1769 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1770 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1771 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1772 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1773 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1774 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1775 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1776 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1777 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1778 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1779 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1780 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1781 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1782 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1783 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1784 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1786 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1787 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1788 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1790 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1792 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1793 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1794 _mm_add_pd(t2_SSE0,t3_SSE0)));
1795 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1796 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1797 _mm_add_pd(t2_SSE1,t3_SSE1)));
1799 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1801 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1804 /* Evaluate influence of atom ai -> aj */
1805 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_ai_SSE0);
1806 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_ai_SSE1);
1807 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_ai_SSE0);
1808 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_ai_SSE1);
1809 t3_SSE0 = _mm_sub_pd(sk_ai_SSE0,dr_SSE0);
1810 t3_SSE1 = _mm_sub_pd(sk_ai_SSE1,dr_SSE1);
1812 obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1813 obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1814 obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1815 obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1816 obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1817 obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1818 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,imask_SSE0);
1819 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,imask_SSE1);
1821 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1822 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1823 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1824 _mm_andnot_pd(obc_mask2_SSE0,raj_inv_SSE));
1825 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1826 _mm_andnot_pd(obc_mask2_SSE1,raj_inv_SSE));
1827 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1828 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1830 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1831 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1832 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1833 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1834 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
1835 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
1836 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1837 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1839 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1840 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1841 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
1842 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
1843 sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0,rinv_SSE0);
1844 sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1,rinv_SSE1);
1845 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1846 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1848 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1849 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1850 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
1851 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
1852 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
1853 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1855 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
1856 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1858 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1859 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1860 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1861 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1862 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE0));
1863 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE1));
1864 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1865 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1866 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1867 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1869 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1870 _mm_add_pd(_mm_and_pd(t1_SSE0,obc_mask1_SSE0),
1871 _mm_and_pd(t1_SSE1,obc_mask1_SSE1))));
1873 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1874 _mm_mul_pd(prod_SSE0,lij3_SSE0));
1875 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1876 _mm_mul_pd(prod_SSE1,lij3_SSE1));
1877 t1_SSE0 = _mm_sub_pd(t1_SSE0,
1878 _mm_mul_pd(onefourth_SSE,
1879 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1880 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1881 t1_SSE1 = _mm_sub_pd(t1_SSE1,
1882 _mm_mul_pd(onefourth_SSE,
1883 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1884 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1885 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
1886 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1887 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1888 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
1889 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1890 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1891 t2_SSE0 = _mm_sub_pd(t2_SSE0,
1892 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1893 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1894 t2_SSE1 = _mm_sub_pd(t2_SSE1,
1895 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1896 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1898 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1899 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1900 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1901 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1903 t3_SSE0 = _mm_sub_pd(t3_SSE0,
1904 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1906 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1907 t3_SSE1 = _mm_sub_pd(t3_SSE1,
1908 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1910 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1912 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
1913 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1914 _mm_add_pd(t2_SSE0,t3_SSE0)));
1915 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
1916 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1917 _mm_add_pd(t2_SSE1,t3_SSE1)));
1919 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1921 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1925 /* Epilogue part, including exclusion mask */
1926 for(j=nj2; j<nj3; j+=UNROLLJ)
1928 jmask_SSE0 = _mm_load_pd((double *)emask0);
1929 jmask_SSE1 = _mm_load_pd((double *)emask1);
1930 emask0 += 2*UNROLLJ;
1931 emask1 += 2*UNROLLJ;
1933 /* load j atom coordinates */
1934 jx_SSE = _mm_load_pd(x_align+j);
1935 jy_SSE = _mm_load_pd(y_align+j);
1936 jz_SSE = _mm_load_pd(z_align+j);
1938 /* Calculate distance */
1939 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
1940 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
1941 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
1942 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
1943 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
1944 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
1946 /* rsq = dx*dx+dy*dy+dz*dz */
1947 rsq_SSE0 = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
1948 rsq_SSE1 = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
1951 jmask_SSE0 = _mm_and_pd(jmask_SSE0,imask_SSE0);
1952 jmask_SSE1 = _mm_and_pd(jmask_SSE1,imask_SSE1);
1954 /* Calculate 1/r and 1/r2 */
1955 rinv_SSE0 = gmx_mm_invsqrt_pd(rsq_SSE0);
1956 rinv_SSE1 = gmx_mm_invsqrt_pd(rsq_SSE1);
1959 rinv_SSE0 = _mm_and_pd(rinv_SSE0,jmask_SSE0);
1960 rinv_SSE1 = _mm_and_pd(rinv_SSE1,jmask_SSE1);
1962 dr_SSE0 = _mm_mul_pd(rsq_SSE0,rinv_SSE0);
1963 dr_SSE1 = _mm_mul_pd(rsq_SSE1,rinv_SSE1);
1965 sk_aj_SSE = _mm_load_pd(obc_param+j);
1966 raj_SSE = _mm_load_pd(gb_radius+j);
1968 raj_inv_SSE = gmx_mm_inv_pd(raj_SSE);
1970 /* Evaluate influence of atom aj -> ai */
1971 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_aj_SSE);
1972 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_aj_SSE);
1973 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_aj_SSE);
1974 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_aj_SSE);
1975 t3_SSE0 = _mm_sub_pd(sk_aj_SSE,dr_SSE0);
1976 t3_SSE1 = _mm_sub_pd(sk_aj_SSE,dr_SSE1);
1978 obc_mask1_SSE0 = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1979 obc_mask1_SSE1 = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1980 obc_mask2_SSE0 = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1981 obc_mask2_SSE1 = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1982 obc_mask3_SSE0 = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1983 obc_mask3_SSE1 = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1984 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
1985 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
1987 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
1988 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
1989 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1990 _mm_andnot_pd(obc_mask2_SSE0,rai_inv_SSE0));
1991 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1992 _mm_andnot_pd(obc_mask2_SSE1,rai_inv_SSE1));
1994 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1995 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1997 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
1998 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
1999 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
2000 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
2001 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
2002 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
2003 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
2004 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
2006 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
2007 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
2008 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
2009 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
2010 sk2_aj_SSE = _mm_mul_pd(sk_aj_SSE,sk_aj_SSE);
2011 sk2_rinv_SSE0 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE0);
2012 sk2_rinv_SSE1 = _mm_mul_pd(sk2_aj_SSE,rinv_SSE1);
2013 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
2014 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
2016 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
2017 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
2019 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
2020 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
2021 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
2022 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
2024 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
2025 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
2028 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
2029 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
2030 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
2031 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
2032 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE0,lij_SSE0));
2033 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE1,lij_SSE1));
2034 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
2035 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
2036 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
2037 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
2039 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
2040 sum_ai_SSE1 = _mm_add_pd(sum_ai_SSE1,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
2042 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
2043 _mm_mul_pd(prod_SSE0,lij3_SSE0));
2044 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
2045 _mm_mul_pd(prod_SSE1,lij3_SSE1));
2046 t1_SSE0 = _mm_sub_pd(t1_SSE0,
2047 _mm_mul_pd(onefourth_SSE,
2048 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
2049 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
2050 t1_SSE1 = _mm_sub_pd(t1_SSE1,
2051 _mm_mul_pd(onefourth_SSE,
2052 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
2053 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
2055 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
2056 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
2057 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
2058 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
2059 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
2060 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
2061 t2_SSE0 = _mm_sub_pd(t2_SSE0,
2062 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
2063 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
2064 t2_SSE1 = _mm_sub_pd(t2_SSE1,
2065 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
2066 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
2067 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
2068 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
2069 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
2070 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
2071 t3_SSE0 = _mm_sub_pd(t3_SSE0,
2072 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
2074 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
2075 t3_SSE1 = _mm_sub_pd(t3_SSE1,
2076 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
2078 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
2080 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
2081 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
2082 _mm_add_pd(t2_SSE0,t3_SSE0)));
2083 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
2084 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
2085 _mm_add_pd(t2_SSE1,t3_SSE1)));
2087 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
2089 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
2092 /* Evaluate influence of atom ai -> aj */
2093 t1_SSE0 = _mm_add_pd(dr_SSE0,sk_ai_SSE0);
2094 t1_SSE1 = _mm_add_pd(dr_SSE1,sk_ai_SSE1);
2095 t2_SSE0 = _mm_sub_pd(dr_SSE0,sk_ai_SSE0);
2096 t2_SSE1 = _mm_sub_pd(dr_SSE1,sk_ai_SSE1);
2097 t3_SSE0 = _mm_sub_pd(sk_ai_SSE0,dr_SSE0);
2098 t3_SSE1 = _mm_sub_pd(sk_ai_SSE1,dr_SSE1);
2100 obc_mask1_SSE0 = _mm_cmplt_pd(raj_SSE, t1_SSE0);
2101 obc_mask1_SSE1 = _mm_cmplt_pd(raj_SSE, t1_SSE1);
2102 obc_mask2_SSE0 = _mm_cmplt_pd(raj_SSE, t2_SSE0);
2103 obc_mask2_SSE1 = _mm_cmplt_pd(raj_SSE, t2_SSE1);
2104 obc_mask3_SSE0 = _mm_cmplt_pd(raj_SSE, t3_SSE0);
2105 obc_mask3_SSE1 = _mm_cmplt_pd(raj_SSE, t3_SSE1);
2106 obc_mask1_SSE0 = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
2107 obc_mask1_SSE1 = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
2109 uij_SSE0 = gmx_mm_inv_pd(t1_SSE0);
2110 uij_SSE1 = gmx_mm_inv_pd(t1_SSE1);
2111 lij_SSE0 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
2112 _mm_andnot_pd(obc_mask2_SSE0,raj_inv_SSE));
2113 lij_SSE1 = _mm_or_pd( _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
2114 _mm_andnot_pd(obc_mask2_SSE1,raj_inv_SSE));
2116 dlij_SSE0 = _mm_and_pd(one_SSE,obc_mask2_SSE0);
2117 dlij_SSE1 = _mm_and_pd(one_SSE,obc_mask2_SSE1);
2119 uij2_SSE0 = _mm_mul_pd(uij_SSE0, uij_SSE0);
2120 uij2_SSE1 = _mm_mul_pd(uij_SSE1, uij_SSE1);
2121 uij3_SSE0 = _mm_mul_pd(uij2_SSE0,uij_SSE0);
2122 uij3_SSE1 = _mm_mul_pd(uij2_SSE1,uij_SSE1);
2123 lij2_SSE0 = _mm_mul_pd(lij_SSE0, lij_SSE0);
2124 lij2_SSE1 = _mm_mul_pd(lij_SSE1, lij_SSE1);
2125 lij3_SSE0 = _mm_mul_pd(lij2_SSE0,lij_SSE0);
2126 lij3_SSE1 = _mm_mul_pd(lij2_SSE1,lij_SSE1);
2128 diff2_SSE0 = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
2129 diff2_SSE1 = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
2130 lij_inv_SSE0 = gmx_mm_invsqrt_pd(lij2_SSE0);
2131 lij_inv_SSE1 = gmx_mm_invsqrt_pd(lij2_SSE1);
2132 sk2_rinv_SSE0 = _mm_mul_pd(sk2_ai_SSE0,rinv_SSE0);
2133 sk2_rinv_SSE1 = _mm_mul_pd(sk2_ai_SSE1,rinv_SSE1);
2134 prod_SSE0 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
2135 prod_SSE1 = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
2137 logterm_SSE0 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
2138 logterm_SSE1 = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
2139 t1_SSE0 = _mm_sub_pd(lij_SSE0,uij_SSE0);
2140 t1_SSE1 = _mm_sub_pd(lij_SSE1,uij_SSE1);
2141 t2_SSE0 = _mm_mul_pd(diff2_SSE0,
2142 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
2144 t2_SSE1 = _mm_mul_pd(diff2_SSE1,
2145 _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
2147 t3_SSE0 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
2148 t3_SSE1 = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
2149 t1_SSE0 = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
2150 t1_SSE1 = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
2151 t4_SSE0 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE0));
2152 t4_SSE1 = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE1));
2153 t4_SSE0 = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
2154 t4_SSE1 = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
2155 t1_SSE0 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
2156 t1_SSE1 = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
2158 _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
2159 _mm_add_pd(_mm_and_pd(t1_SSE0,obc_mask1_SSE0),
2160 _mm_and_pd(t1_SSE1,obc_mask1_SSE1))));
2162 t1_SSE0 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
2163 _mm_mul_pd(prod_SSE0,lij3_SSE0));
2164 t1_SSE1 = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
2165 _mm_mul_pd(prod_SSE1,lij3_SSE1));
2167 t1_SSE0 = _mm_sub_pd(t1_SSE0,
2168 _mm_mul_pd(onefourth_SSE,
2169 _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
2170 _mm_mul_pd(lij3_SSE0,dr_SSE0))));
2171 t1_SSE1 = _mm_sub_pd(t1_SSE1,
2172 _mm_mul_pd(onefourth_SSE,
2173 _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
2174 _mm_mul_pd(lij3_SSE1,dr_SSE1))));
2175 t2_SSE0 = _mm_mul_pd(onefourth_SSE,
2176 _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
2177 _mm_mul_pd(uij3_SSE0,dr_SSE0)));
2178 t2_SSE1 = _mm_mul_pd(onefourth_SSE,
2179 _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
2180 _mm_mul_pd(uij3_SSE1,dr_SSE1)));
2181 t2_SSE0 = _mm_sub_pd(t2_SSE0,
2182 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
2183 _mm_mul_pd(prod_SSE0,uij3_SSE0)));
2184 t2_SSE1 = _mm_sub_pd(t2_SSE1,
2185 _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
2186 _mm_mul_pd(prod_SSE1,uij3_SSE1)));
2188 t3_SSE0 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
2189 _mm_mul_pd(rinv_SSE0,rinv_SSE0));
2190 t3_SSE1 = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
2191 _mm_mul_pd(rinv_SSE1,rinv_SSE1));
2193 t3_SSE0 = _mm_sub_pd(t3_SSE0,
2194 _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
2196 _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
2197 t3_SSE1 = _mm_sub_pd(t3_SSE1,
2198 _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
2200 _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
2202 t1_SSE0 = _mm_mul_pd(rinv_SSE0,
2203 _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
2204 _mm_add_pd(t2_SSE0,t3_SSE0)));
2205 t1_SSE1 = _mm_mul_pd(rinv_SSE1,
2206 _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
2207 _mm_add_pd(t2_SSE1,t3_SSE1)));
2209 _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
2211 _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
2214 GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0,sum_ai_SSE1);
2215 sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0,sum_ai_SSE1);
2216 _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
2220 for(i=0;i<natoms/2+1;i++)
2222 work[i] += work[natoms+i];
2225 /* Parallel summations */
2229 gmx_sum(natoms,work, cr);
2232 if(gb_algorithm==egbHCT)
2235 for(i=0;i<natoms;i++)
2237 if(born->use[i] != 0)
2239 rai = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
2240 sum_ai = 1.0/rai - work[i];
2241 min_rad = rai + born->gb_doffset;
2244 born->bRad[i] = rad > min_rad ? rad : min_rad;
2245 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
2254 /* Calculate the radii */
2255 for(i=0;i<natoms;i++)
2258 if(born->use[i] != 0)
2260 rai = top->atomtypes.gb_radius[mdatoms->typeA[i]];
2262 rai = rai-born->gb_doffset;
2264 sum_ai = rai * work[i];
2265 sum_ai2 = sum_ai * sum_ai;
2266 sum_ai3 = sum_ai2 * sum_ai;
2268 tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
2269 born->bRad[i] = rai_inv - tsum*rai_inv2;
2270 born->bRad[i] = 1.0 / born->bRad[i];
2272 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
2274 tchain = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
2275 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
/*
 * Chain-rule part of the all-vs-all Generalized Born force calculation,
 * SSE2 double precision. Processes two i-atoms per outer iteration
 * (the _SSE0/_SSE1 register pairs) against packed pairs of j-atoms.
 *
 * The stored Born-radius derivatives (dadx) are combined with the
 * algorithm-specific dvda prefactor (rb[]) and the i-j displacement to
 * produce Cartesian forces, accumulated into aligned work arrays and
 * finally reduced into f[].
 *
 * NOTE(review): this excerpt elides several original lines (the return
 * type, "dadx += 2;" increments between the four dadx loads, the
 * nj0/nj1/nj2 loads, and closing braces). Comments below describe only
 * what is visible here; confirm against the full source.
 */
2291 genborn_allvsall_calc_chainrule_sse2_double(t_forcerec * fr,
2292 t_mdatoms * mdatoms,
2293 gmx_genborn_t * born,
2299 gmx_allvsallgb2_data_t *aadata;
2302 int nj0,nj1,nj2,nj3;
2315 double rbai,rbaj,fgb,fgb_ai,rbi;
2326 __m128d jmask_SSE0,jmask_SSE1;
2327 __m128d ix_SSE0,iy_SSE0,iz_SSE0;
2328 __m128d ix_SSE1,iy_SSE1,iz_SSE1;
2329 __m128d fix_SSE0,fiy_SSE0,fiz_SSE0;
2330 __m128d fix_SSE1,fiy_SSE1,fiz_SSE1;
2331 __m128d rbai_SSE0,rbai_SSE1;
2332 __m128d imask_SSE0,imask_SSE1;
2333 __m128d jx_SSE,jy_SSE,jz_SSE,rbaj_SSE;
2334 __m128d dx_SSE0,dy_SSE0,dz_SSE0;
2335 __m128d dx_SSE1,dy_SSE1,dz_SSE1;
2336 __m128d fgb_SSE0,fgb_ai_SSE0;
2337 __m128d fgb_SSE1,fgb_ai_SSE1;
2338 __m128d tx_SSE0,ty_SSE0,tz_SSE0;
2339 __m128d tx_SSE1,ty_SSE1,tz_SSE1;
2340 __m128d t1,t2,tmpSSE;
2342 natoms = mdatoms->nr;
/* Round the local-atom start down to a SIMD boundary so loads stay aligned */
2343 ni0 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
2344 ni1 = mdatoms->start+mdatoms->homenr;
2346 aadata = (gmx_allvsallgb2_data_t *)paadata;
2348 x_align = aadata->x_align;
2349 y_align = aadata->y_align;
2350 z_align = aadata->z_align;
2351 fx_align = aadata->fx_align;
2352 fy_align = aadata->fy_align;
2353 fz_align = aadata->fz_align;
2355 jindex = aadata->jindex_gb;
2361 /* Loop to get the proper form for the Born radius term */
2362 if(gb_algorithm==egbSTILL)
2364 for(i=0;i<natoms;i++)
2366 rbi = born->bRad[i];
/* Still: rb = 2*b^2*dvda / (4*pi*eps0) */
2367 rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
2370 else if(gb_algorithm==egbHCT)
2372 for(i=0;i<natoms;i++)
2374 rbi = born->bRad[i];
/* HCT: rb = b^2*dvda */
2375 rb[i] = rbi * rbi * fr->dvda[i];
2378 else if(gb_algorithm==egbOBC)
2380 for(idx=0;idx<natoms;idx++)
2382 rbi = born->bRad[idx];
/* OBC: rb = b^2*drobc*dvda (extra chain-rule factor from the tanh fit) */
2383 rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
/* NOTE(review): the bodies of these two setup loops are elided in this
 * excerpt; presumably they clear the 2*natoms-long force work arrays and
 * copy/replicate per-atom data into the aligned buffers -- confirm. */
2387 for(i=0;i<2*natoms;i++)
2395 for(i=0;i<natoms;i++)
/* Outer loop: two i-atoms per iteration, each broadcast into its own register set */
2400 for(i=ni0; i<ni1; i+=UNROLLI)
2402 /* We assume shifts are NOT used for all-vs-all interactions */
2404 /* Load i atom data */
2405 ix_SSE0 = _mm_load1_pd(x_align+i);
2406 iy_SSE0 = _mm_load1_pd(y_align+i);
2407 iz_SSE0 = _mm_load1_pd(z_align+i);
2408 ix_SSE1 = _mm_load1_pd(x_align+i+1);
2409 iy_SSE1 = _mm_load1_pd(y_align+i+1);
2410 iz_SSE1 = _mm_load1_pd(z_align+i+1);
2412 fix_SSE0 = _mm_setzero_pd();
2413 fiy_SSE0 = _mm_setzero_pd();
2414 fiz_SSE0 = _mm_setzero_pd();
2415 fix_SSE1 = _mm_setzero_pd();
2416 fiy_SSE1 = _mm_setzero_pd();
2417 fiz_SSE1 = _mm_setzero_pd();
/* Broadcast the precomputed Born-radius prefactors for the two i atoms */
2419 rbai_SSE0 = _mm_load1_pd(rb+i);
2420 rbai_SSE1 = _mm_load1_pd(rb+i+1);
2422 /* Load limits for loop over neighbors */
2424 nj3 = jindex[4*i+3];
2426 /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
2427 for(j=nj0; j<nj3; j+=UNROLLJ)
2429 /* load j atom coordinates */
2430 jx_SSE = _mm_load_pd(x_align+j);
2431 jy_SSE = _mm_load_pd(y_align+j);
2432 jz_SSE = _mm_load_pd(z_align+j);
2434 /* Calculate distance */
2435 dx_SSE0 = _mm_sub_pd(ix_SSE0,jx_SSE);
2436 dy_SSE0 = _mm_sub_pd(iy_SSE0,jy_SSE);
2437 dz_SSE0 = _mm_sub_pd(iz_SSE0,jz_SSE);
2438 dx_SSE1 = _mm_sub_pd(ix_SSE1,jx_SSE);
2439 dy_SSE1 = _mm_sub_pd(iy_SSE1,jy_SSE);
2440 dz_SSE1 = _mm_sub_pd(iz_SSE1,jz_SSE);
2442 rbaj_SSE = _mm_load_pd(rb+j);
/* Scalar chain-rule force: rb[i]*dadx(i->j) plus rb[j]*dadx(j->i).
 * NOTE(review): the "dadx += 2;" increments between these four loads
 * are elided in this excerpt; each load consumes the next stored pair. */
2444 fgb_SSE0 = _mm_mul_pd(rbai_SSE0,_mm_load_pd(dadx));
2446 fgb_SSE1 = _mm_mul_pd(rbai_SSE1,_mm_load_pd(dadx));
2449 fgb_ai_SSE0 = _mm_mul_pd(rbaj_SSE,_mm_load_pd(dadx));
2451 fgb_ai_SSE1 = _mm_mul_pd(rbaj_SSE,_mm_load_pd(dadx));
2454 /* Total force between ai and aj is the sum of ai->aj and aj->ai */
2455 fgb_SSE0 = _mm_add_pd(fgb_SSE0,fgb_ai_SSE0);
2456 fgb_SSE1 = _mm_add_pd(fgb_SSE1,fgb_ai_SSE1);
2458 /* Calculate temporary vectorial force */
2459 tx_SSE0 = _mm_mul_pd(fgb_SSE0,dx_SSE0);
2460 ty_SSE0 = _mm_mul_pd(fgb_SSE0,dy_SSE0);
2461 tz_SSE0 = _mm_mul_pd(fgb_SSE0,dz_SSE0);
2462 tx_SSE1 = _mm_mul_pd(fgb_SSE1,dx_SSE1);
2463 ty_SSE1 = _mm_mul_pd(fgb_SSE1,dy_SSE1);
2464 tz_SSE1 = _mm_mul_pd(fgb_SSE1,dz_SSE1);
2466 /* Increment i atom force */
2467 fix_SSE0 = _mm_add_pd(fix_SSE0,tx_SSE0);
2468 fiy_SSE0 = _mm_add_pd(fiy_SSE0,ty_SSE0);
2469 fiz_SSE0 = _mm_add_pd(fiz_SSE0,tz_SSE0);
2470 fix_SSE1 = _mm_add_pd(fix_SSE1,tx_SSE1);
2471 fiy_SSE1 = _mm_add_pd(fiy_SSE1,ty_SSE1);
2472 fiz_SSE1 = _mm_add_pd(fiz_SSE1,tz_SSE1);
2474 /* Decrement j atom force */
2475 _mm_store_pd(fx_align+j,
2476 _mm_sub_pd( _mm_load_pd(fx_align+j) , _mm_add_pd(tx_SSE0,tx_SSE1) ));
2477 _mm_store_pd(fy_align+j,
2478 _mm_sub_pd( _mm_load_pd(fy_align+j) , _mm_add_pd(ty_SSE0,ty_SSE1) ));
2479 _mm_store_pd(fz_align+j,
2480 _mm_sub_pd( _mm_load_pd(fz_align+j) , _mm_add_pd(tz_SSE0,tz_SSE1) ));
2483 /* Add i forces to mem */
/* Transpose the two per-lane accumulators so one vector add + store
 * updates both i and i+1 in a single aligned operation */
2484 GMX_MM_TRANSPOSE2_PD(fix_SSE0,fix_SSE1);
2485 fix_SSE0 = _mm_add_pd(fix_SSE0,fix_SSE1);
2486 _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));
2488 GMX_MM_TRANSPOSE2_PD(fiy_SSE0,fiy_SSE1);
2489 fiy_SSE0 = _mm_add_pd(fiy_SSE0,fiy_SSE1);
2490 _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));
2492 GMX_MM_TRANSPOSE2_PD(fiz_SSE0,fiz_SSE1);
2493 fiz_SSE0 = _mm_add_pd(fiz_SSE0,fiz_SSE1);
2494 _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
/* Reduce the two halves of the work arrays (entries i and natoms+i hold
 * the two periodic images of the all-vs-all accumulation) into f[] */
2497 for(i=0;i<natoms;i++)
2499 f[3*i] += fx_align[i] + fx_align[natoms+i];
2500 f[3*i+1] += fy_align[i] + fy_align[natoms+i];
2501 f[3*i+2] += fz_align[i] + fz_align[natoms+i];
2508 /* Dummy declaration for builds without SSE2/double support: ISO C
2508  * requires a translation unit to contain at least one declaration,
2508  * so this keeps the file non-empty when the kernels above are
2508  * compiled out by the preprocessor guard. */
2509 int genborn_allvsall_sse2_double_dummy;