2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2012, by the GROMACS development team, led by
6 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
7 * others, as listed in the AUTHORS file in the top-level source
8 * directory and at http://www.gromacs.org.
10 * GROMACS is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public License
12 * as published by the Free Software Foundation; either version 2.1
13 * of the License, or (at your option) any later version.
15 * GROMACS is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with GROMACS; if not, see
22 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25 * If you want to redistribute modifications to GROMACS, please
26 * consider that scientific software is very special. Version
27 * control is crucial - bugs must be traceable. We will be happy to
28 * consider code for inclusion in the official distribution, but
29 * derived work must not be called official GROMACS. Details are found
30 * in the README & COPYING files - if they are missing, get the
31 * official version at http://www.gromacs.org.
33 * To help us fund GROMACS development, we humbly ask that you cite
34 * the research papers on the package. Check out http://www.gromacs.org.
41 #include "types/simple.h"
50 #include "genborn_allvsall.h"
52 #if 0 && defined (GMX_X86_SSE2)
54 #include <gmx_sse2_single.h>
72 int ** prologue_mask_gb;
85 gmx_allvsallgb2_data_t;
/* NOTE(review): this file is a sampled/truncated excerpt - original lines
 * are missing between the visible ones (see the jumps in the embedded line
 * numbers), so only the visible statements are annotated below.
 *
 * calc_maxoffset: computes the largest j-offset that atom i interacts with
 * in the all-vs-all half-interaction scheme, so every i/j pair is handled
 * exactly once. The visible branches split on the parity of natoms (odd vs
 * multiple of four); presumably a further branch covers the remaining even
 * case - TODO confirm against the complete source.
 */
89 calc_maxoffset(int i,int natoms)
93     if ((natoms % 2) == 1)
95         /* Odd number of atoms, easy */
98     else if ((natoms % 4) == 0)
100         /* Multiple of four is hard */
/* All assignments visible in this excerpt use the same half-range value;
 * the surrounding (missing) conditions decide which atoms receive it. */
109             maxoffset=natoms/2-1;
120             maxoffset=natoms/2-1;
133             maxoffset=natoms/2-1;
/* NOTE(review): sampled excerpt - lines are missing between the visible
 * statements (braces, some loop bodies, the k-loop headers around the
 * exclusion tests). Comments below describe only what is visible.
 *
 * setup_gb_exclusions_and_indices: builds, for each i atom, a 4-entry
 * jindex_gb record [prologue start, prologue end, epilogue start,
 * epilogue end] plus 16-byte-aligned 0/0xFFFFFFFF interaction masks
 * (imask, prologue_mask_gb, epilogue_mask). This lets the all-vs-all GB
 * kernels stream j atoms in UNROLLJ-wide SIMD tiles and apply GB 1-2/1-3/1-4
 * exclusions (ilist[F_GB12/F_GB13/F_GB14]) via bitwise AND instead of
 * branching.
 */
141 setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t * aadata,
152 int ni0,ni1,nj0,nj1,nj;
156 int firstinteraction;
160 /* This routine can appear to be a bit complex, but it is mostly book-keeping.
161 * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
162 * whether they should interact or not.
164 * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
165 * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
166 * we create a jindex array with three elements per i atom: the starting point, the point to
167 * which we need to check exclusions, and the end point.
168 * This way we only have to allocate a short exclusion mask per i atom.
/* Round the local i range [start,end) out to UNROLLI-aligned tile limits */
171 ni0 = (start/UNROLLI)*UNROLLI;
172 ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;
174 /* Set the interaction mask to only enable the i atoms we want to include */
/* snew() over-allocates; the raw pointer is then rounded up to a 16-byte
 * boundary so the masks can be used with aligned SSE loads */
175 snew(pi,natoms+UNROLLI+2*SIMD_WIDTH);
176 aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
177 for(i=0;i<natoms+UNROLLI;i++)
179 aadata->imask[i] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
182 /* Allocate memory for our modified jindex array */
/* Four jindex entries per atom, zero-initialized before being filled below */
183 snew(aadata->jindex_gb,4*(natoms+UNROLLI));
184 for(i=0;i<4*(natoms+UNROLLI);i++)
186 aadata->jindex_gb[i] = 0;
189 /* Create the exclusion masks for the prologue part */
190 snew(aadata->prologue_mask_gb,natoms+UNROLLI); /* list of pointers */
192 /* First zero everything to avoid uninitialized data */
193 for(i=0;i<natoms+UNROLLI;i++)
195 aadata->prologue_mask_gb[i] = NULL;
198 /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
199 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
201 max_excl_offset = -1;
203 /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
204 imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
206 /* Which atom is the first we (might) interact with? */
207 imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
208 for(i=ibase;i<imax;i++)
210 /* Before exclusions, which atom is the first we (might) interact with? */
211 firstinteraction = i+1;
212 max_offset = calc_maxoffset(i,natoms);
/* Scan each GB bonded list (stride 3: type,a1,a2) to adjust
 * firstinteraction past excluded neighbors; the k computation and the
 * comparison's surrounding logic are not visible in this excerpt. */
216 for(j=0;j<ilist[F_GB12].nr;j+=3)
218 a1 = ilist[F_GB12].iatoms[j+1];
219 a2 = ilist[F_GB12].iatoms[j+2];
234 if(k==firstinteraction)
242 for(j=0;j<ilist[F_GB13].nr;j+=3)
244 a1 = ilist[F_GB13].iatoms[j+1];
245 a2 = ilist[F_GB13].iatoms[j+2];
260 if(k==firstinteraction)
268 for(j=0;j<ilist[F_GB14].nr;j+=3)
270 a1 = ilist[F_GB14].iatoms[j+1];
271 a2 = ilist[F_GB14].iatoms[j+2];
285 if(k==firstinteraction)
/* Track the smallest starting j over the UNROLLI-tuplet of i atoms */
291 imin = (firstinteraction < imin) ? firstinteraction : imin;
293 /* round down to j unrolling factor */
294 imin = (imin/UNROLLJ)*UNROLLJ;
/* Second pass over the same i atoms: find the largest exclusion offset
 * (k+natoms <= max_offset presumably handles the wrap-around of the
 * half-interaction range - TODO confirm) */
296 for(i=ibase;i<imax;i++)
298 max_offset = calc_maxoffset(i,natoms);
302 for(j=0;j<ilist[F_GB12].nr;j+=3)
304 a1 = ilist[F_GB12].iatoms[j+1];
305 a2 = ilist[F_GB12].iatoms[j+2];
332 if( k+natoms <= max_offset )
336 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
341 for(j=0;j<ilist[F_GB13].nr;j+=3)
343 a1 = ilist[F_GB13].iatoms[j+1];
344 a2 = ilist[F_GB13].iatoms[j+2];
371 if( k+natoms <= max_offset )
375 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
380 for(j=0;j<ilist[F_GB14].nr;j+=3)
382 a1 = ilist[F_GB14].iatoms[j+1];
383 a2 = ilist[F_GB14].iatoms[j+2];
410 if( k+natoms <= max_offset )
414 max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
419 /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
421 /* round up to j unrolling factor */
422 max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
424 /* Set all the prologue masks length to this value (even for i>end) */
425 for(i=ibase;i<ibase+UNROLLI;i++)
427 aadata->jindex_gb[4*i] = imin;
428 aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
432 /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
433 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
435 for(i=ibase;i<ibase+UNROLLI;i++)
437 nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
438 imin = aadata->jindex_gb[4*i];
440 /* Allocate aligned memory */
441 snew(pi,nj+2*SIMD_WIDTH);
442 aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
444 max_offset = calc_maxoffset(i,natoms);
446 /* Include interactions i+1 <= j < i+maxoffset */
/* Default prologue mask: all-ones inside the interacting window, zero
 * outside (the enclosing k-loop header is not visible in this excerpt) */
451 if( (j>i) && (j<=i+max_offset) )
453 aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
457 aadata->prologue_mask_gb[i][k] = 0;
461 /* Clear out the explicit exclusions */
466 for(j=0;j<ilist[F_GB12].nr;j+=3)
468 a1 = ilist[F_GB12].iatoms[j+1];
469 a2 = ilist[F_GB12].iatoms[j+2];
490 if( k+natoms <= max_offset )
498 aadata->prologue_mask_gb[i][k] = 0;
504 for(j=0;j<ilist[F_GB13].nr;j+=3)
506 a1 = ilist[F_GB13].iatoms[j+1];
507 a2 = ilist[F_GB13].iatoms[j+2];
528 if( k+natoms <= max_offset )
536 aadata->prologue_mask_gb[i][k] = 0;
542 for(j=0;j<ilist[F_GB14].nr;j+=3)
544 a1 = ilist[F_GB14].iatoms[j+1];
545 a2 = ilist[F_GB14].iatoms[j+2];
566 if( k+natoms <= max_offset )
574 aadata->prologue_mask_gb[i][k] = 0;
582 /* Construct the epilogue mask - this just contains the check for maxoffset */
583 snew(aadata->epilogue_mask,natoms+UNROLLI);
585 /* First zero everything to avoid uninitialized data */
/* Default: epilogue range is empty (start == end == prologue end) */
586 for(i=0;i<natoms+UNROLLI;i++)
588 aadata->jindex_gb[4*i+2] = aadata->jindex_gb[4*i+1];
589 aadata->jindex_gb[4*i+3] = aadata->jindex_gb[4*i+1];
590 aadata->epilogue_mask[i] = NULL;
593 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
595 /* Find the lowest index for which we need to use the epilogue */
/* NOTE(review): the assignment initializing imin before this call is not
 * visible in the excerpt (presumably imin = ibase) - confirm */
597 max_offset = calc_maxoffset(imin,natoms);
599 imin = imin + 1 + max_offset;
601 /* Find largest index for which we need to use the epilogue */
602 imax = ibase + UNROLLI-1;
603 imax = (imax < end) ? imax : end;
605 max_offset = calc_maxoffset(imax,natoms);
606 imax = imax + 1 + max_offset + UNROLLJ - 1;
608 for(i=ibase;i<ibase+UNROLLI;i++)
610 /* Start of epilogue - round down to j tile limit */
611 aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
612 /* Make sure we dont overlap - for small systems everything is done in the prologue */
613 aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
614 /* Round upwards to j tile limit */
615 aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
616 /* Make sure we dont have a negative range for the epilogue */
617 aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
621 /* And fill it with data... */
623 for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
625 for(i=ibase;i<ibase+UNROLLI;i++)
628 nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
630 /* Allocate aligned memory */
631 snew(pi,nj+2*SIMD_WIDTH);
632 aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
634 max_offset = calc_maxoffset(i,natoms);
/* Epilogue mask only encodes the maxoffset cutoff - no exclusions here */
638 j = aadata->jindex_gb[4*i+2] + k;
639 aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
/* NOTE(review): sampled excerpt - lines are missing between the visible
 * statements (e.g. the allocation of aadata itself, which must occur
 * before the field assignments below, is not visible - confirm against
 * the full source).
 *
 * genborn_allvsall_setup: allocates and initializes the all-vs-all GB
 * work structure: 16-byte-aligned coordinate/force arrays
 * (x/y/z/fx/fy/fz_align) and per-atom parameter arrays (gb_radius,
 * workparam, work). Parameter arrays are duplicated over
 * [natoms, 2*natoms) so j-indexing in the kernels can run past natoms
 * without wrapping. Finally delegates mask/index construction to
 * setup_gb_exclusions_and_indices().
 */
647 genborn_allvsall_setup(gmx_allvsallgb2_data_t ** p_aadata,
648 gmx_localtop_t * top,
649 gmx_genborn_t * born,
659 gmx_allvsallgb2_data_t *aadata;
662 natoms = mdatoms->nr;
/* Each array is over-allocated via snew() and the pointer rounded up to a
 * 16-byte boundary for aligned SSE access; the raw snew pointers are not
 * retained separately in the visible code */
667 snew(p,2*natoms+2*SIMD_WIDTH);
668 aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
669 snew(p,2*natoms+2*SIMD_WIDTH);
670 aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
671 snew(p,2*natoms+2*SIMD_WIDTH);
672 aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
673 snew(p,2*natoms+2*SIMD_WIDTH);
674 aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
675 snew(p,2*natoms+2*SIMD_WIDTH);
676 aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
677 snew(p,2*natoms+2*SIMD_WIDTH);
678 aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
680 snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
681 aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));
683 snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
684 aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));
686 snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
687 aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));
/* Fill per-atom data: GB radius (shifted by radius_offset) and the
 * algorithm-specific parameter (Still: solvation volume, OBC: param) */
689 for(i=0;i<mdatoms->nr;i++)
691 aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
692 if(gb_algorithm==egbSTILL)
694 aadata->workparam[i] = born->vsolv[i];
696 else if(gb_algorithm==egbOBC)
698 aadata->workparam[i] = born->param[i];
700 aadata->work[i] = 0.0;
/* Duplicate the parameter arrays into the second half for wrap-free
 * j-indexing in the SIMD loops */
702 for(i=0;i<mdatoms->nr;i++)
704 aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
705 aadata->workparam[natoms+i] = aadata->workparam[i];
706 aadata->work[natoms+i] = aadata->work[i];
/* Zero coordinate/force buffers including the SIMD padding region */
709 for(i=0;i<2*natoms+SIMD_WIDTH;i++)
711 aadata->x_align[i] = 0.0;
712 aadata->y_align[i] = 0.0;
713 aadata->z_align[i] = 0.0;
714 aadata->fx_align[i] = 0.0;
715 aadata->fy_align[i] = 0.0;
716 aadata->fz_align[i] = 0.0;
719 setup_gb_exclusions_and_indices(aadata,top->idef.il,mdatoms->start,mdatoms->start+mdatoms->homenr,mdatoms->nr,
720 bInclude12,bInclude13,bInclude14);
725 genborn_allvsall_calc_still_radii_sse2_single(t_forcerec * fr,
727 gmx_genborn_t * born,
728 gmx_localtop_t * top,
733 gmx_allvsallgb2_data_t *aadata;
755 real vaj,ccf,dccf,theta,cosq;
756 real term,prod,icf4,icf6,gpi2,factor,sinq;
767 __m128 ix_SSE0,iy_SSE0,iz_SSE0;
768 __m128 ix_SSE1,iy_SSE1,iz_SSE1;
769 __m128 ix_SSE2,iy_SSE2,iz_SSE2;
770 __m128 ix_SSE3,iy_SSE3,iz_SSE3;
771 __m128 gpi_SSE0,rai_SSE0,prod_ai_SSE0;
772 __m128 gpi_SSE1,rai_SSE1,prod_ai_SSE1;
773 __m128 gpi_SSE2,rai_SSE2,prod_ai_SSE2;
774 __m128 gpi_SSE3,rai_SSE3,prod_ai_SSE3;
775 __m128 imask_SSE0,jmask_SSE0;
776 __m128 imask_SSE1,jmask_SSE1;
777 __m128 imask_SSE2,jmask_SSE2;
778 __m128 imask_SSE3,jmask_SSE3;
779 __m128 jx_SSE,jy_SSE,jz_SSE;
780 __m128 dx_SSE0,dy_SSE0,dz_SSE0;
781 __m128 dx_SSE1,dy_SSE1,dz_SSE1;
782 __m128 dx_SSE2,dy_SSE2,dz_SSE2;
783 __m128 dx_SSE3,dy_SSE3,dz_SSE3;
784 __m128 rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
785 __m128 rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
786 __m128 rsq_SSE2,rinv_SSE2,irsq_SSE2,idr4_SSE2,idr6_SSE2;
787 __m128 rsq_SSE3,rinv_SSE3,irsq_SSE3,idr4_SSE3,idr6_SSE3;
788 __m128 raj_SSE,vaj_SSE,prod_SSE;
789 __m128 rvdw_SSE0,ratio_SSE0;
790 __m128 rvdw_SSE1,ratio_SSE1;
791 __m128 rvdw_SSE2,ratio_SSE2;
792 __m128 rvdw_SSE3,ratio_SSE3;
793 __m128 theta_SSE0,sinq_SSE0,cosq_SSE0,term_SSE0;
794 __m128 theta_SSE1,sinq_SSE1,cosq_SSE1,term_SSE1;
795 __m128 theta_SSE2,sinq_SSE2,cosq_SSE2,term_SSE2;
796 __m128 theta_SSE3,sinq_SSE3,cosq_SSE3,term_SSE3;
797 __m128 ccf_SSE0,dccf_SSE0;
798 __m128 ccf_SSE1,dccf_SSE1;
799 __m128 ccf_SSE2,dccf_SSE2;
800 __m128 ccf_SSE3,dccf_SSE3;
801 __m128 icf4_SSE0,icf6_SSE0;
802 __m128 icf4_SSE1,icf6_SSE1;
803 __m128 icf4_SSE2,icf6_SSE2;
804 __m128 icf4_SSE3,icf6_SSE3;
805 __m128 half_SSE,one_SSE,two_SSE,four_SSE;
806 __m128 still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
808 natoms = mdatoms->nr;
809 ni0 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
810 ni1 = mdatoms->start+mdatoms->homenr;
814 aadata = *((gmx_allvsallgb2_data_t **)paadata);
819 genborn_allvsall_setup(&aadata,top,born,mdatoms,0.0,
820 egbSTILL,FALSE,FALSE,TRUE);
821 *((gmx_allvsallgb2_data_t **)paadata) = aadata;
824 x_align = aadata->x_align;
825 y_align = aadata->y_align;
826 z_align = aadata->z_align;
828 gb_radius = aadata->gb_radius;
829 vsolv = aadata->workparam;
831 jindex = aadata->jindex_gb;
834 still_p4_SSE = _mm_set1_ps(STILL_P4);
835 still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
836 still_pip5_SSE = _mm_set1_ps(STILL_PIP5);
837 half_SSE = _mm_set1_ps(0.5);
838 one_SSE = _mm_set1_ps(1.0);
839 two_SSE = _mm_set1_ps(2.0);
840 four_SSE = _mm_set1_ps(4.0);
842 /* This will be summed, so it has to extend to natoms + buffer */
843 for(i=0;i<natoms+1+natoms/2;i++)
848 for(i=ni0;i<ni1+1+natoms/2;i++)
852 y_align[i] = x[3*k+1];
853 z_align[i] = x[3*k+2];
858 for(i=ni0; i<ni1; i+=UNROLLI)
860 /* We assume shifts are NOT used for all-vs-all interactions */
862 /* Load i atom data */
863 ix_SSE0 = _mm_load1_ps(x_align+i);
864 iy_SSE0 = _mm_load1_ps(y_align+i);
865 iz_SSE0 = _mm_load1_ps(z_align+i);
866 ix_SSE1 = _mm_load1_ps(x_align+i+1);
867 iy_SSE1 = _mm_load1_ps(y_align+i+1);
868 iz_SSE1 = _mm_load1_ps(z_align+i+1);
869 ix_SSE2 = _mm_load1_ps(x_align+i+2);
870 iy_SSE2 = _mm_load1_ps(y_align+i+2);
871 iz_SSE2 = _mm_load1_ps(z_align+i+2);
872 ix_SSE3 = _mm_load1_ps(x_align+i+3);
873 iy_SSE3 = _mm_load1_ps(y_align+i+3);
874 iz_SSE3 = _mm_load1_ps(z_align+i+3);
876 gpi_SSE0 = _mm_setzero_ps();
877 gpi_SSE1 = _mm_setzero_ps();
878 gpi_SSE2 = _mm_setzero_ps();
879 gpi_SSE3 = _mm_setzero_ps();
881 rai_SSE0 = _mm_load1_ps(gb_radius+i);
882 rai_SSE1 = _mm_load1_ps(gb_radius+i+1);
883 rai_SSE2 = _mm_load1_ps(gb_radius+i+2);
884 rai_SSE3 = _mm_load1_ps(gb_radius+i+3);
886 prod_ai_SSE0 = _mm_set1_ps(STILL_P4*vsolv[i]);
887 prod_ai_SSE1 = _mm_set1_ps(STILL_P4*vsolv[i+1]);
888 prod_ai_SSE2 = _mm_set1_ps(STILL_P4*vsolv[i+2]);
889 prod_ai_SSE3 = _mm_set1_ps(STILL_P4*vsolv[i+3]);
891 /* Load limits for loop over neighbors */
897 pmask0 = aadata->prologue_mask_gb[i];
898 pmask1 = aadata->prologue_mask_gb[i+1];
899 pmask2 = aadata->prologue_mask_gb[i+2];
900 pmask3 = aadata->prologue_mask_gb[i+3];
901 emask0 = aadata->epilogue_mask[i];
902 emask1 = aadata->epilogue_mask[i+1];
903 emask2 = aadata->epilogue_mask[i+2];
904 emask3 = aadata->epilogue_mask[i+3];
906 imask_SSE0 = _mm_load1_ps((real *)(aadata->imask+i));
907 imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1));
908 imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2));
909 imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3));
911 /* Prologue part, including exclusion mask */
912 for(j=nj0; j<nj1; j+=UNROLLJ)
914 jmask_SSE0 = _mm_load_ps((real *)pmask0);
915 jmask_SSE1 = _mm_load_ps((real *)pmask1);
916 jmask_SSE2 = _mm_load_ps((real *)pmask2);
917 jmask_SSE3 = _mm_load_ps((real *)pmask3);
923 /* load j atom coordinates */
924 jx_SSE = _mm_load_ps(x_align+j);
925 jy_SSE = _mm_load_ps(y_align+j);
926 jz_SSE = _mm_load_ps(z_align+j);
928 /* Calculate distance */
929 dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
930 dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
931 dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
932 dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
933 dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
934 dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
935 dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
936 dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
937 dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
938 dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
939 dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
940 dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);
942 /* rsq = dx*dx+dy*dy+dz*dz */
943 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
944 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
945 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
946 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
949 jmask_SSE0 = _mm_and_ps(jmask_SSE0,imask_SSE0);
950 jmask_SSE1 = _mm_and_ps(jmask_SSE1,imask_SSE1);
951 jmask_SSE2 = _mm_and_ps(jmask_SSE2,imask_SSE2);
952 jmask_SSE3 = _mm_and_ps(jmask_SSE3,imask_SSE3);
954 /* Calculate 1/r and 1/r2 */
955 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
956 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
957 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
958 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
961 rinv_SSE0 = _mm_and_ps(rinv_SSE0,jmask_SSE0);
962 rinv_SSE1 = _mm_and_ps(rinv_SSE1,jmask_SSE1);
963 rinv_SSE2 = _mm_and_ps(rinv_SSE2,jmask_SSE2);
964 rinv_SSE3 = _mm_and_ps(rinv_SSE3,jmask_SSE3);
966 irsq_SSE0 = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
967 irsq_SSE1 = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
968 irsq_SSE2 = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
969 irsq_SSE3 = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
970 idr4_SSE0 = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
971 idr4_SSE1 = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
972 idr4_SSE2 = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
973 idr4_SSE3 = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
974 idr6_SSE0 = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
975 idr6_SSE1 = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
976 idr6_SSE2 = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
977 idr6_SSE3 = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
979 raj_SSE = _mm_load_ps(gb_radius+j);
980 vaj_SSE = _mm_load_ps(vsolv+j);
982 rvdw_SSE0 = _mm_add_ps(rai_SSE0,raj_SSE);
983 rvdw_SSE1 = _mm_add_ps(rai_SSE1,raj_SSE);
984 rvdw_SSE2 = _mm_add_ps(rai_SSE2,raj_SSE);
985 rvdw_SSE3 = _mm_add_ps(rai_SSE3,raj_SSE);
987 ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
988 ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
989 ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
990 ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
992 ratio_SSE0 = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
993 ratio_SSE1 = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
994 ratio_SSE2 = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
995 ratio_SSE3 = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
996 theta_SSE0 = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
997 theta_SSE1 = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
998 theta_SSE2 = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
999 theta_SSE3 = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
1000 gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
1001 gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
1002 gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);
1003 gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);
1004 term_SSE0 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
1005 term_SSE1 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
1006 term_SSE2 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
1007 term_SSE3 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
1008 ccf_SSE0 = _mm_mul_ps(term_SSE0,term_SSE0);
1009 ccf_SSE1 = _mm_mul_ps(term_SSE1,term_SSE1);
1010 ccf_SSE2 = _mm_mul_ps(term_SSE2,term_SSE2);
1011 ccf_SSE3 = _mm_mul_ps(term_SSE3,term_SSE3);
1012 dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
1013 _mm_mul_ps(sinq_SSE0,theta_SSE0));
1014 dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
1015 _mm_mul_ps(sinq_SSE1,theta_SSE1));
1016 dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
1017 _mm_mul_ps(sinq_SSE2,theta_SSE2));
1018 dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
1019 _mm_mul_ps(sinq_SSE3,theta_SSE3));
1021 prod_SSE = _mm_mul_ps(still_p4_SSE,vaj_SSE);
1022 icf4_SSE0 = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
1023 icf4_SSE1 = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
1024 icf4_SSE2 = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
1025 icf4_SSE3 = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
1026 icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1027 icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1028 icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
1029 icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
1031 _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
1032 gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
1033 _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
1034 _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
1035 _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
1037 gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
1038 gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
1039 gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
1040 gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
1042 /* Save ai->aj and aj->ai chain rule terms */
1043 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
1045 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
1047 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
1049 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
1052 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
1054 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
1056 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
1058 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
1062 /* Main part, no exclusions */
1063 for(j=nj1; j<nj2; j+=UNROLLJ)
1065 /* load j atom coordinates */
1066 jx_SSE = _mm_load_ps(x_align+j);
1067 jy_SSE = _mm_load_ps(y_align+j);
1068 jz_SSE = _mm_load_ps(z_align+j);
1070 /* Calculate distance */
1071 dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
1072 dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
1073 dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
1074 dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
1075 dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
1076 dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
1077 dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
1078 dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
1079 dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
1080 dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
1081 dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
1082 dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);
1084 /* rsq = dx*dx+dy*dy+dz*dz */
1085 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
1086 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
1087 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
1088 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
1090 /* Calculate 1/r and 1/r2 */
1091 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
1092 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
1093 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
1094 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
1097 rinv_SSE0 = _mm_and_ps(rinv_SSE0,imask_SSE0);
1098 rinv_SSE1 = _mm_and_ps(rinv_SSE1,imask_SSE1);
1099 rinv_SSE2 = _mm_and_ps(rinv_SSE2,imask_SSE2);
1100 rinv_SSE3 = _mm_and_ps(rinv_SSE3,imask_SSE3);
1102 irsq_SSE0 = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
1103 irsq_SSE1 = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
1104 irsq_SSE2 = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
1105 irsq_SSE3 = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
1106 idr4_SSE0 = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
1107 idr4_SSE1 = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
1108 idr4_SSE2 = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
1109 idr4_SSE3 = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
1110 idr6_SSE0 = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
1111 idr6_SSE1 = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
1112 idr6_SSE2 = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
1113 idr6_SSE3 = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
1115 raj_SSE = _mm_load_ps(gb_radius+j);
1117 rvdw_SSE0 = _mm_add_ps(rai_SSE0,raj_SSE);
1118 rvdw_SSE1 = _mm_add_ps(rai_SSE1,raj_SSE);
1119 rvdw_SSE2 = _mm_add_ps(rai_SSE2,raj_SSE);
1120 rvdw_SSE3 = _mm_add_ps(rai_SSE3,raj_SSE);
1121 vaj_SSE = _mm_load_ps(vsolv+j);
1123 ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
1124 ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
1125 ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
1126 ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
1128 ratio_SSE0 = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
1129 ratio_SSE1 = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
1130 ratio_SSE2 = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
1131 ratio_SSE3 = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
1132 theta_SSE0 = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
1133 theta_SSE1 = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
1134 theta_SSE2 = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
1135 theta_SSE3 = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
1136 gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
1137 gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
1138 gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);
1139 gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);
1140 term_SSE0 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
1141 term_SSE1 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
1142 term_SSE2 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
1143 term_SSE3 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
1144 ccf_SSE0 = _mm_mul_ps(term_SSE0,term_SSE0);
1145 ccf_SSE1 = _mm_mul_ps(term_SSE1,term_SSE1);
1146 ccf_SSE2 = _mm_mul_ps(term_SSE2,term_SSE2);
1147 ccf_SSE3 = _mm_mul_ps(term_SSE3,term_SSE3);
1148 dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
1149 _mm_mul_ps(sinq_SSE0,theta_SSE0));
1150 dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
1151 _mm_mul_ps(sinq_SSE1,theta_SSE1));
1152 dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
1153 _mm_mul_ps(sinq_SSE2,theta_SSE2));
1154 dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
1155 _mm_mul_ps(sinq_SSE3,theta_SSE3));
1157 prod_SSE = _mm_mul_ps(still_p4_SSE,vaj_SSE );
1158 icf4_SSE0 = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
1159 icf4_SSE1 = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
1160 icf4_SSE2 = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
1161 icf4_SSE3 = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
1162 icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1163 icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1164 icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
1165 icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
1167 _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
1168 gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
1169 _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
1170 _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
1171 _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
1173 gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
1174 gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
1175 gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
1176 gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
1178 /* Save ai->aj and aj->ai chain rule terms */
1179 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
1181 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
1183 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
1185 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
1188 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
1190 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
1192 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
1194 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
1197 /* Epilogue part, including exclusion mask */
1198 for(j=nj2; j<nj3; j+=UNROLLJ)
1200 jmask_SSE0 = _mm_load_ps((real *)emask0);
1201 jmask_SSE1 = _mm_load_ps((real *)emask1);
1202 jmask_SSE2 = _mm_load_ps((real *)emask2);
1203 jmask_SSE3 = _mm_load_ps((real *)emask3);
1209 /* load j atom coordinates */
1210 jx_SSE = _mm_load_ps(x_align+j);
1211 jy_SSE = _mm_load_ps(y_align+j);
1212 jz_SSE = _mm_load_ps(z_align+j);
1214 /* Calculate distance */
1215 dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
1216 dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
1217 dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
1218 dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
1219 dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
1220 dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
1221 dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
1222 dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
1223 dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
1224 dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
1225 dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
1226 dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);
1228 /* rsq = dx*dx+dy*dy+dz*dz */
1229 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
1230 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
1231 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
1232 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
1235 jmask_SSE0 = _mm_and_ps(jmask_SSE0,imask_SSE0);
1236 jmask_SSE1 = _mm_and_ps(jmask_SSE1,imask_SSE1);
1237 jmask_SSE2 = _mm_and_ps(jmask_SSE2,imask_SSE2);
1238 jmask_SSE3 = _mm_and_ps(jmask_SSE3,imask_SSE3);
1240 /* Calculate 1/r and 1/r2 */
1241 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
1242 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
1243 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
1244 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
1247 rinv_SSE0 = _mm_and_ps(rinv_SSE0,jmask_SSE0);
1248 rinv_SSE1 = _mm_and_ps(rinv_SSE1,jmask_SSE1);
1249 rinv_SSE2 = _mm_and_ps(rinv_SSE2,jmask_SSE2);
1250 rinv_SSE3 = _mm_and_ps(rinv_SSE3,jmask_SSE3);
1252 irsq_SSE0 = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
1253 irsq_SSE1 = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
1254 irsq_SSE2 = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
1255 irsq_SSE3 = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
1256 idr4_SSE0 = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
1257 idr4_SSE1 = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
1258 idr4_SSE2 = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
1259 idr4_SSE3 = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
1260 idr6_SSE0 = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
1261 idr6_SSE1 = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
1262 idr6_SSE2 = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
1263 idr6_SSE3 = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
1265 raj_SSE = _mm_load_ps(gb_radius+j);
1266 vaj_SSE = _mm_load_ps(vsolv+j);
1268 rvdw_SSE0 = _mm_add_ps(rai_SSE0,raj_SSE);
1269 rvdw_SSE1 = _mm_add_ps(rai_SSE1,raj_SSE);
1270 rvdw_SSE2 = _mm_add_ps(rai_SSE2,raj_SSE);
1271 rvdw_SSE3 = _mm_add_ps(rai_SSE3,raj_SSE);
1273 ratio_SSE0 = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
1274 ratio_SSE1 = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
1275 ratio_SSE2 = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
1276 ratio_SSE3 = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
1278 ratio_SSE0 = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
1279 ratio_SSE1 = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
1280 ratio_SSE2 = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
1281 ratio_SSE3 = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
1282 theta_SSE0 = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
1283 theta_SSE1 = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
1284 theta_SSE2 = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
1285 theta_SSE3 = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
1286 gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
1287 gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
1288 gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);
1289 gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);
1290 term_SSE0 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
1291 term_SSE1 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
1292 term_SSE2 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
1293 term_SSE3 = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
1294 ccf_SSE0 = _mm_mul_ps(term_SSE0,term_SSE0);
1295 ccf_SSE1 = _mm_mul_ps(term_SSE1,term_SSE1);
1296 ccf_SSE2 = _mm_mul_ps(term_SSE2,term_SSE2);
1297 ccf_SSE3 = _mm_mul_ps(term_SSE3,term_SSE3);
1298 dccf_SSE0 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
1299 _mm_mul_ps(sinq_SSE0,theta_SSE0));
1300 dccf_SSE1 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
1301 _mm_mul_ps(sinq_SSE1,theta_SSE1));
1302 dccf_SSE2 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
1303 _mm_mul_ps(sinq_SSE2,theta_SSE2));
1304 dccf_SSE3 = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
1305 _mm_mul_ps(sinq_SSE3,theta_SSE3));
1307 prod_SSE = _mm_mul_ps(still_p4_SSE,vaj_SSE);
1308 icf4_SSE0 = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
1309 icf4_SSE1 = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
1310 icf4_SSE2 = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
1311 icf4_SSE3 = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
1312 icf6_SSE0 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1313 icf6_SSE1 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1314 icf6_SSE2 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
1315 icf6_SSE3 = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
1317 _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
1318 gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
1319 _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
1320 _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
1321 _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
1323 gpi_SSE0 = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
1324 gpi_SSE1 = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
1325 gpi_SSE2 = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
1326 gpi_SSE3 = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
1328 /* Save ai->aj and aj->ai chain rule terms */
1329 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
1331 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
1333 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
1335 _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
1338 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
1340 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
1342 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
1344 _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
1347 _MM_TRANSPOSE4_PS(gpi_SSE0,gpi_SSE1,gpi_SSE2,gpi_SSE3);
1348 gpi_SSE0 = _mm_add_ps(gpi_SSE0,gpi_SSE1);
1349 gpi_SSE2 = _mm_add_ps(gpi_SSE2,gpi_SSE3);
1350 gpi_SSE0 = _mm_add_ps(gpi_SSE0,gpi_SSE2);
1351 _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
1354 /* In case we have written anything beyond natoms, move it back.
1355 * Never mind that we leave stuff above natoms; that will not
1356 * be accessed later in the routine.
1357 * In principle this should be a move rather than sum, but this
1358 * way we dont have to worry about even/odd offsets...
1360 for(i=natoms;i<ni1+1+natoms/2;i++)
1362 work[i-natoms] += work[i];
1365 /* Parallel summations */
1368 gmx_sum(natoms,work,cr);
1371 factor = 0.5 * ONE_4PI_EPS0;
1372 /* Calculate the radii - should we do all atoms, or just our local ones? */
1373 for(i=0;i<natoms;i++)
1375 if(born->use[i] != 0)
1377 gpi = born->gpol[i]+work[i];
1379 born->bRad[i] = factor*gmx_invsqrt(gpi2);
1380 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1390 genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec * fr,
1391 t_mdatoms * mdatoms,
1392 gmx_genborn_t * born,
1394 gmx_localtop_t * top,
1399 gmx_allvsallgb2_data_t *aadata;
1402 int nj0,nj1,nj2,nj3;
1424 real rai,rai_inv,rai_inv2,sum_ai,sum_ai2,sum_ai3,tsum,tchain;
1426 __m128 ix_SSE0,iy_SSE0,iz_SSE0;
1427 __m128 ix_SSE1,iy_SSE1,iz_SSE1;
1428 __m128 ix_SSE2,iy_SSE2,iz_SSE2;
1429 __m128 ix_SSE3,iy_SSE3,iz_SSE3;
1430 __m128 gpi_SSE0,rai_SSE0,prod_ai_SSE0;
1431 __m128 gpi_SSE1,rai_SSE1,prod_ai_SSE1;
1432 __m128 gpi_SSE2,rai_SSE2,prod_ai_SSE2;
1433 __m128 gpi_SSE3,rai_SSE3,prod_ai_SSE3;
1434 __m128 imask_SSE0,jmask_SSE0;
1435 __m128 imask_SSE1,jmask_SSE1;
1436 __m128 imask_SSE2,jmask_SSE2;
1437 __m128 imask_SSE3,jmask_SSE3;
1438 __m128 jx_SSE,jy_SSE,jz_SSE;
1439 __m128 dx_SSE0,dy_SSE0,dz_SSE0;
1440 __m128 dx_SSE1,dy_SSE1,dz_SSE1;
1441 __m128 dx_SSE2,dy_SSE2,dz_SSE2;
1442 __m128 dx_SSE3,dy_SSE3,dz_SSE3;
1443 __m128 rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
1444 __m128 rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
1445 __m128 rsq_SSE2,rinv_SSE2,irsq_SSE2,idr4_SSE2,idr6_SSE2;
1446 __m128 rsq_SSE3,rinv_SSE3,irsq_SSE3,idr4_SSE3,idr6_SSE3;
1447 __m128 raj_SSE,raj_inv_SSE,sk_aj_SSE,sk2_aj_SSE;
1448 __m128 ccf_SSE0,dccf_SSE0,prod_SSE0;
1449 __m128 ccf_SSE1,dccf_SSE1,prod_SSE1;
1450 __m128 ccf_SSE2,dccf_SSE2,prod_SSE2;
1451 __m128 ccf_SSE3,dccf_SSE3,prod_SSE3;
1452 __m128 icf4_SSE0,icf6_SSE0;
1453 __m128 icf4_SSE1,icf6_SSE1;
1454 __m128 icf4_SSE2,icf6_SSE2;
1455 __m128 icf4_SSE3,icf6_SSE3;
1456 __m128 oneeighth_SSE,onefourth_SSE,half_SSE,one_SSE,two_SSE,four_SSE;
1457 __m128 still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
1458 __m128 rai_inv_SSE0;
1459 __m128 rai_inv_SSE1;
1460 __m128 rai_inv_SSE2;
1461 __m128 rai_inv_SSE3;
1462 __m128 sk_ai_SSE0,sk2_ai_SSE0,sum_ai_SSE0;
1463 __m128 sk_ai_SSE1,sk2_ai_SSE1,sum_ai_SSE1;
1464 __m128 sk_ai_SSE2,sk2_ai_SSE2,sum_ai_SSE2;
1465 __m128 sk_ai_SSE3,sk2_ai_SSE3,sum_ai_SSE3;
1466 __m128 lij_inv_SSE0,sk2_rinv_SSE0;
1467 __m128 lij_inv_SSE1,sk2_rinv_SSE1;
1468 __m128 lij_inv_SSE2,sk2_rinv_SSE2;
1469 __m128 lij_inv_SSE3,sk2_rinv_SSE3;
1474 __m128 t1_SSE0,t2_SSE0,t3_SSE0,t4_SSE0;
1475 __m128 t1_SSE1,t2_SSE1,t3_SSE1,t4_SSE1;
1476 __m128 t1_SSE2,t2_SSE2,t3_SSE2,t4_SSE2;
1477 __m128 t1_SSE3,t2_SSE3,t3_SSE3,t4_SSE3;
1478 __m128 obc_mask1_SSE0,obc_mask2_SSE0,obc_mask3_SSE0;
1479 __m128 obc_mask1_SSE1,obc_mask2_SSE1,obc_mask3_SSE1;
1480 __m128 obc_mask1_SSE2,obc_mask2_SSE2,obc_mask3_SSE2;
1481 __m128 obc_mask1_SSE3,obc_mask2_SSE3,obc_mask3_SSE3;
1482 __m128 uij_SSE0,uij2_SSE0,uij3_SSE0;
1483 __m128 uij_SSE1,uij2_SSE1,uij3_SSE1;
1484 __m128 uij_SSE2,uij2_SSE2,uij3_SSE2;
1485 __m128 uij_SSE3,uij2_SSE3,uij3_SSE3;
1486 __m128 lij_SSE0,lij2_SSE0,lij3_SSE0;
1487 __m128 lij_SSE1,lij2_SSE1,lij3_SSE1;
1488 __m128 lij_SSE2,lij2_SSE2,lij3_SSE2;
1489 __m128 lij_SSE3,lij2_SSE3,lij3_SSE3;
1490 __m128 dlij_SSE0,diff2_SSE0,logterm_SSE0;
1491 __m128 dlij_SSE1,diff2_SSE1,logterm_SSE1;
1492 __m128 dlij_SSE2,diff2_SSE2,logterm_SSE2;
1493 __m128 dlij_SSE3,diff2_SSE3,logterm_SSE3;
1496 natoms = mdatoms->nr;
1497 ni0 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
1498 ni1 = mdatoms->start+mdatoms->homenr;
1502 aadata = *((gmx_allvsallgb2_data_t **)paadata);
1507 genborn_allvsall_setup(&aadata,top,born,mdatoms,born->gb_doffset,
1508 egbOBC,TRUE,TRUE,TRUE);
1509 *((gmx_allvsallgb2_data_t **)paadata) = aadata;
1512 x_align = aadata->x_align;
1513 y_align = aadata->y_align;
1514 z_align = aadata->z_align;
1516 gb_radius = aadata->gb_radius;
1517 work = aadata->work;
1518 jindex = aadata->jindex_gb;
1520 obc_param = aadata->workparam;
1522 oneeighth_SSE = _mm_set1_ps(0.125);
1523 onefourth_SSE = _mm_set1_ps(0.25);
1524 half_SSE = _mm_set1_ps(0.5);
1525 one_SSE = _mm_set1_ps(1.0);
1526 two_SSE = _mm_set1_ps(2.0);
1527 four_SSE = _mm_set1_ps(4.0);
1528 doffset_SSE = _mm_set1_ps(born->gb_doffset);
1530 for(i=0;i<natoms;i++)
1532 x_align[i] = x[3*i];
1533 y_align[i] = x[3*i+1];
1534 z_align[i] = x[3*i+2];
1538 for(i=0;i<natoms/2+1;i++)
1540 x_align[natoms+i] = x_align[i];
1541 y_align[natoms+i] = y_align[i];
1542 z_align[natoms+i] = z_align[i];
1545 for(i=0;i<natoms+natoms/2+1;i++)
1550 for(i=ni0; i<ni1; i+= UNROLLI)
1552 /* We assume shifts are NOT used for all-vs-all interactions */
1554 /* Load i atom data */
1555 ix_SSE0 = _mm_load1_ps(x_align+i);
1556 iy_SSE0 = _mm_load1_ps(y_align+i);
1557 iz_SSE0 = _mm_load1_ps(z_align+i);
1558 ix_SSE1 = _mm_load1_ps(x_align+i+1);
1559 iy_SSE1 = _mm_load1_ps(y_align+i+1);
1560 iz_SSE1 = _mm_load1_ps(z_align+i+1);
1561 ix_SSE2 = _mm_load1_ps(x_align+i+2);
1562 iy_SSE2 = _mm_load1_ps(y_align+i+2);
1563 iz_SSE2 = _mm_load1_ps(z_align+i+2);
1564 ix_SSE3 = _mm_load1_ps(x_align+i+3);
1565 iy_SSE3 = _mm_load1_ps(y_align+i+3);
1566 iz_SSE3 = _mm_load1_ps(z_align+i+3);
1568 rai_SSE0 = _mm_load1_ps(gb_radius+i);
1569 rai_SSE1 = _mm_load1_ps(gb_radius+i+1);
1570 rai_SSE2 = _mm_load1_ps(gb_radius+i+2);
1571 rai_SSE3 = _mm_load1_ps(gb_radius+i+3);
1572 rai_inv_SSE0 = gmx_mm_inv_ps(rai_SSE0);
1573 rai_inv_SSE1 = gmx_mm_inv_ps(rai_SSE1);
1574 rai_inv_SSE2 = gmx_mm_inv_ps(rai_SSE2);
1575 rai_inv_SSE3 = gmx_mm_inv_ps(rai_SSE3);
1577 sk_ai_SSE0 = _mm_load1_ps(obc_param+i);
1578 sk_ai_SSE1 = _mm_load1_ps(obc_param+i+1);
1579 sk_ai_SSE2 = _mm_load1_ps(obc_param+i+2);
1580 sk_ai_SSE3 = _mm_load1_ps(obc_param+i+3);
1581 sk2_ai_SSE0 = _mm_mul_ps(sk_ai_SSE0,sk_ai_SSE0);
1582 sk2_ai_SSE1 = _mm_mul_ps(sk_ai_SSE1,sk_ai_SSE1);
1583 sk2_ai_SSE2 = _mm_mul_ps(sk_ai_SSE2,sk_ai_SSE2);
1584 sk2_ai_SSE3 = _mm_mul_ps(sk_ai_SSE3,sk_ai_SSE3);
1586 sum_ai_SSE0 = _mm_setzero_ps();
1587 sum_ai_SSE1 = _mm_setzero_ps();
1588 sum_ai_SSE2 = _mm_setzero_ps();
1589 sum_ai_SSE3 = _mm_setzero_ps();
1591 /* Load limits for loop over neighbors */
1593 nj1 = jindex[4*i+1];
1594 nj2 = jindex[4*i+2];
1595 nj3 = jindex[4*i+3];
1597 pmask0 = aadata->prologue_mask_gb[i];
1598 pmask1 = aadata->prologue_mask_gb[i+1];
1599 pmask2 = aadata->prologue_mask_gb[i+2];
1600 pmask3 = aadata->prologue_mask_gb[i+3];
1601 emask0 = aadata->epilogue_mask[i];
1602 emask1 = aadata->epilogue_mask[i+1];
1603 emask2 = aadata->epilogue_mask[i+2];
1604 emask3 = aadata->epilogue_mask[i+3];
1606 imask_SSE0 = _mm_load1_ps((real *)(aadata->imask+i));
1607 imask_SSE1 = _mm_load1_ps((real *)(aadata->imask+i+1));
1608 imask_SSE2 = _mm_load1_ps((real *)(aadata->imask+i+2));
1609 imask_SSE3 = _mm_load1_ps((real *)(aadata->imask+i+3));
1611 /* Prologue part, including exclusion mask */
1612 for(j=nj0; j<nj1; j+=UNROLLJ)
1614 jmask_SSE0 = _mm_load_ps((real *)pmask0);
1615 jmask_SSE1 = _mm_load_ps((real *)pmask1);
1616 jmask_SSE2 = _mm_load_ps((real *)pmask2);
1617 jmask_SSE3 = _mm_load_ps((real *)pmask3);
1623 /* load j atom coordinates */
1624 jx_SSE = _mm_load_ps(x_align+j);
1625 jy_SSE = _mm_load_ps(y_align+j);
1626 jz_SSE = _mm_load_ps(z_align+j);
1628 /* Calculate distance */
1629 dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
1630 dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
1631 dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
1632 dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
1633 dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
1634 dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
1635 dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
1636 dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
1637 dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
1638 dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
1639 dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
1640 dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);
1642 /* rsq = dx*dx+dy*dy+dz*dz */
1643 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
1644 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
1645 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
1646 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
1649 jmask_SSE0 = _mm_and_ps(jmask_SSE0,imask_SSE0);
1650 jmask_SSE1 = _mm_and_ps(jmask_SSE1,imask_SSE1);
1651 jmask_SSE2 = _mm_and_ps(jmask_SSE2,imask_SSE2);
1652 jmask_SSE3 = _mm_and_ps(jmask_SSE3,imask_SSE3);
1654 /* Calculate 1/r and 1/r2 */
1655 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
1656 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
1657 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
1658 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
1661 rinv_SSE0 = _mm_and_ps(rinv_SSE0,jmask_SSE0);
1662 rinv_SSE1 = _mm_and_ps(rinv_SSE1,jmask_SSE1);
1663 rinv_SSE2 = _mm_and_ps(rinv_SSE2,jmask_SSE2);
1664 rinv_SSE3 = _mm_and_ps(rinv_SSE3,jmask_SSE3);
1666 dr_SSE0 = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
1667 dr_SSE1 = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
1668 dr_SSE2 = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
1669 dr_SSE3 = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
1671 sk_aj_SSE = _mm_load_ps(obc_param+j);
1672 raj_SSE = _mm_load_ps(gb_radius+j);
1673 raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
1675 /* Evaluate influence of atom aj -> ai */
1676 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_aj_SSE);
1677 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_aj_SSE);
1678 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_aj_SSE);
1679 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_aj_SSE);
1680 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
1681 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
1682 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
1683 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
1684 t3_SSE0 = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
1685 t3_SSE1 = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
1686 t3_SSE2 = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
1687 t3_SSE3 = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
1689 obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
1690 obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
1691 obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
1692 obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
1693 obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
1694 obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
1695 obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
1696 obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
1697 obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
1698 obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
1699 obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
1700 obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
1701 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
1702 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
1703 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
1704 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
1706 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
1707 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
1708 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
1709 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
1710 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
1711 _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
1712 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
1713 _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
1714 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
1715 _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
1716 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
1717 _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
1718 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
1719 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
1720 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
1721 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
1723 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
1724 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
1725 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
1726 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
1727 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
1728 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
1729 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
1730 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
1731 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
1732 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
1733 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
1734 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
1735 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
1736 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
1737 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
1738 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
1740 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
1741 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
1742 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
1743 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
1744 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
1745 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
1746 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
1747 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
1748 sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
1749 sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
1750 sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
1751 sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
1752 sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
1753 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
1754 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
1755 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
1756 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
1758 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
1759 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
1760 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
1761 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
1763 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
1764 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
1765 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
1766 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
1767 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
1768 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
1770 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
1771 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
1773 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
1774 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
1776 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
1777 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
1780 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
1781 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
1782 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
1783 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
1784 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
1785 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
1786 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
1787 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
1788 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
1789 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
1790 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
1791 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
1792 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
1793 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
1794 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
1795 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
1796 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
1797 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
1798 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
1799 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
1801 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
1802 sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
1803 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
1804 sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
1806 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
1807 _mm_mul_ps(prod_SSE0,lij3_SSE0));
1808 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
1809 _mm_mul_ps(prod_SSE1,lij3_SSE1));
1810 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
1811 _mm_mul_ps(prod_SSE2,lij3_SSE2));
1812 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
1813 _mm_mul_ps(prod_SSE3,lij3_SSE3));
1814 t1_SSE0 = _mm_sub_ps(t1_SSE0,
1815 _mm_mul_ps(onefourth_SSE,
1816 _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
1817 _mm_mul_ps(lij3_SSE0,dr_SSE0))));
1818 t1_SSE1 = _mm_sub_ps(t1_SSE1,
1819 _mm_mul_ps(onefourth_SSE,
1820 _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
1821 _mm_mul_ps(lij3_SSE1,dr_SSE1))));
1822 t1_SSE2 = _mm_sub_ps(t1_SSE2,
1823 _mm_mul_ps(onefourth_SSE,
1824 _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
1825 _mm_mul_ps(lij3_SSE2,dr_SSE2))));
1826 t1_SSE3 = _mm_sub_ps(t1_SSE3,
1827 _mm_mul_ps(onefourth_SSE,
1828 _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
1829 _mm_mul_ps(lij3_SSE3,dr_SSE3))));
1831 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
1832 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
1833 _mm_mul_ps(uij3_SSE0,dr_SSE0)));
1834 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
1835 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
1836 _mm_mul_ps(uij3_SSE1,dr_SSE1)));
1837 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
1838 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
1839 _mm_mul_ps(uij3_SSE2,dr_SSE2)));
1840 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
1841 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
1842 _mm_mul_ps(uij3_SSE3,dr_SSE3)));
1843 t2_SSE0 = _mm_sub_ps(t2_SSE0,
1844 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
1845 _mm_mul_ps(prod_SSE0,uij3_SSE0)));
1846 t2_SSE1 = _mm_sub_ps(t2_SSE1,
1847 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
1848 _mm_mul_ps(prod_SSE1,uij3_SSE1)));
1849 t2_SSE2 = _mm_sub_ps(t2_SSE2,
1850 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
1851 _mm_mul_ps(prod_SSE2,uij3_SSE2)));
1852 t2_SSE3 = _mm_sub_ps(t2_SSE3,
1853 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
1854 _mm_mul_ps(prod_SSE3,uij3_SSE3)));
1855 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
1856 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
1857 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
1858 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
1859 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
1860 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
1861 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
1862 _mm_mul_ps(rinv_SSE3,rinv_SSE3));
1863 t3_SSE0 = _mm_sub_ps(t3_SSE0,
1864 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
1866 _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
1867 t3_SSE1 = _mm_sub_ps(t3_SSE1,
1868 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
1870 _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
1871 t3_SSE2 = _mm_sub_ps(t3_SSE2,
1872 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
1874 _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
1875 t3_SSE3 = _mm_sub_ps(t3_SSE3,
1876 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
1878 _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
1880 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
1881 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
1882 _mm_add_ps(t2_SSE0,t3_SSE0)));
1883 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
1884 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
1885 _mm_add_ps(t2_SSE1,t3_SSE1)));
1886 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
1887 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
1888 _mm_add_ps(t2_SSE2,t3_SSE2)));
1889 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
1890 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
1891 _mm_add_ps(t2_SSE3,t3_SSE3)));
1893 _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
1895 _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
1897 _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
1899 _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
1902 /* Evaluate influence of atom ai -> aj */
1903 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
1904 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
1905 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
1906 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
1907 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
1908 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
1909 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
1910 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
1911 t3_SSE0 = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
1912 t3_SSE1 = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
1913 t3_SSE2 = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
1914 t3_SSE3 = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
1916 obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
1917 obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
1918 obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
1919 obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
1920 obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
1921 obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
1922 obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
1923 obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
1924 obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
1925 obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
1926 obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
1927 obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
1928 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
1929 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
1930 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
1931 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
1933 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
1934 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
1935 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
1936 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
1937 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
1938 _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
1939 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
1940 _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
1941 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
1942 _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
1943 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
1944 _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
1945 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
1946 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
1947 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
1948 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
1950 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
1951 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
1952 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
1953 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
1954 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
1955 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
1956 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
1957 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
1958 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
1959 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
1960 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
1961 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
1962 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
1963 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
1964 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
1965 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
1967 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
1968 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
1969 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
1970 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
1971 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
1972 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
1973 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
1974 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
1975 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
1976 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
1977 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
1978 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
1979 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
1980 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
1981 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
1982 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
1984 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
1985 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
1986 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
1987 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
1988 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
1989 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
1990 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
1991 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
1992 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
1993 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
1995 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
1996 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
1998 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
1999 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2001 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
2002 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2004 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2005 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2006 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2007 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2008 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2009 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2010 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2011 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2012 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
2013 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
2014 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
2015 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
2016 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2017 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2018 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2019 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2020 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2021 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2022 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2023 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2025 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2026 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
2027 _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
2028 _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
2029 _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
2031 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2032 _mm_mul_ps(prod_SSE0,lij3_SSE0));
2033 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2034 _mm_mul_ps(prod_SSE1,lij3_SSE1));
2035 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2036 _mm_mul_ps(prod_SSE2,lij3_SSE2));
2037 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2038 _mm_mul_ps(prod_SSE3,lij3_SSE3));
2039 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2040 _mm_mul_ps(onefourth_SSE,
2041 _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2042 _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2043 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2044 _mm_mul_ps(onefourth_SSE,
2045 _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2046 _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2047 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2048 _mm_mul_ps(onefourth_SSE,
2049 _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2050 _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2051 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2052 _mm_mul_ps(onefourth_SSE,
2053 _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2054 _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2055 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2056 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2057 _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2058 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2059 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2060 _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2061 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2062 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2063 _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2064 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2065 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2066 _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2067 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2068 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2069 _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2070 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2071 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2072 _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2073 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2074 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2075 _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2076 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2077 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2078 _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2080 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2081 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2082 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2083 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2084 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2085 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2086 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2087 _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2089 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2090 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2092 _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2093 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2094 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2096 _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2097 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2098 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2100 _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2101 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2102 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2104 _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2107 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2108 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2109 _mm_add_ps(t2_SSE0,t3_SSE0)));
2110 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2111 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2112 _mm_add_ps(t2_SSE1,t3_SSE1)));
2113 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2114 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2115 _mm_add_ps(t2_SSE2,t3_SSE2)));
2116 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2117 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2118 _mm_add_ps(t2_SSE3,t3_SSE3)));
2120 _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2122 _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2124 _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2126 _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2131 /* Main part, no exclusions */
2132 for(j=nj1; j<nj2; j+=UNROLLJ)
2134 /* load j atom coordinates */
2135 jx_SSE = _mm_load_ps(x_align+j);
2136 jy_SSE = _mm_load_ps(y_align+j);
2137 jz_SSE = _mm_load_ps(z_align+j);
2139 /* Calculate distance */
2140 dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
2141 dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
2142 dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
2143 dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
2144 dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
2145 dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
2146 dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
2147 dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
2148 dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
2149 dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
2150 dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
2151 dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);
2153 /* rsq = dx*dx+dy*dy+dz*dz */
2154 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2155 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2156 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2157 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2159 /* Calculate 1/r and 1/r2 */
2160 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
2161 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
2162 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
2163 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
2166 rinv_SSE0 = _mm_and_ps(rinv_SSE0,imask_SSE0);
2167 rinv_SSE1 = _mm_and_ps(rinv_SSE1,imask_SSE1);
2168 rinv_SSE2 = _mm_and_ps(rinv_SSE2,imask_SSE2);
2169 rinv_SSE3 = _mm_and_ps(rinv_SSE3,imask_SSE3);
2171 dr_SSE0 = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
2172 dr_SSE1 = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
2173 dr_SSE2 = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
2174 dr_SSE3 = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
2176 sk_aj_SSE = _mm_load_ps(obc_param+j);
2177 raj_SSE = _mm_load_ps(gb_radius+j);
2179 raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
2181 /* Evaluate influence of atom aj -> ai */
2182 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_aj_SSE);
2183 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_aj_SSE);
2184 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_aj_SSE);
2185 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_aj_SSE);
2186 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
2187 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
2188 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
2189 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
2190 t3_SSE0 = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
2191 t3_SSE1 = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
2192 t3_SSE2 = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
2193 t3_SSE3 = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
2195 obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2196 obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2197 obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2198 obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2199 obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2200 obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2201 obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2202 obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2203 obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2204 obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2205 obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2206 obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2207 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,imask_SSE0);
2208 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,imask_SSE1);
2209 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,imask_SSE2);
2210 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,imask_SSE3);
2212 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2213 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2214 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2215 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2216 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2217 _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
2218 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2219 _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
2220 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2221 _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
2222 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2223 _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
2224 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2225 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2226 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2227 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2229 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2230 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2231 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2232 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2233 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2234 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2235 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2236 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2237 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2238 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2239 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2240 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2241 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2242 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2243 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2244 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2246 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2247 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2248 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2249 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2250 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2251 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2252 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2253 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2254 sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
2255 sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
2256 sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
2257 sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
2258 sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
2259 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2260 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2261 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2262 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2264 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2265 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2266 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2267 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2269 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
2270 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
2271 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
2272 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
2273 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
2274 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2276 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
2277 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2279 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
2280 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2282 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
2283 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2286 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2287 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2288 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2289 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2290 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2291 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2292 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2293 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2294 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
2295 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
2296 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
2297 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
2298 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2299 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2300 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2301 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2302 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2303 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2304 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2305 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2307 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2308 sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2309 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2310 sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2312 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2313 _mm_mul_ps(prod_SSE0,lij3_SSE0));
2314 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2315 _mm_mul_ps(prod_SSE1,lij3_SSE1));
2316 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2317 _mm_mul_ps(prod_SSE2,lij3_SSE2));
2318 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2319 _mm_mul_ps(prod_SSE3,lij3_SSE3));
2320 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2321 _mm_mul_ps(onefourth_SSE,
2322 _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2323 _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2324 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2325 _mm_mul_ps(onefourth_SSE,
2326 _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2327 _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2328 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2329 _mm_mul_ps(onefourth_SSE,
2330 _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2331 _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2332 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2333 _mm_mul_ps(onefourth_SSE,
2334 _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2335 _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2337 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2338 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2339 _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2340 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2341 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2342 _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2343 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2344 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2345 _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2346 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2347 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2348 _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2349 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2350 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2351 _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2352 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2353 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2354 _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2355 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2356 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2357 _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2358 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2359 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2360 _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2361 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2362 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2363 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2364 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2365 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2366 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2367 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2368 _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2369 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2370 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2372 _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2373 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2374 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2376 _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2377 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2378 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2380 _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2381 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2382 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2384 _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2386 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2387 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2388 _mm_add_ps(t2_SSE0,t3_SSE0)));
2389 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2390 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2391 _mm_add_ps(t2_SSE1,t3_SSE1)));
2392 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2393 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2394 _mm_add_ps(t2_SSE2,t3_SSE2)));
2395 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2396 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2397 _mm_add_ps(t2_SSE3,t3_SSE3)));
2399 _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2401 _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2403 _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2405 _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2408 /* Evaluate influence of atom ai -> aj */
2409 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
2410 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
2411 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
2412 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
2413 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
2414 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
2415 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
2416 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
2417 t3_SSE0 = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
2418 t3_SSE1 = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
2419 t3_SSE2 = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
2420 t3_SSE3 = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
2422 obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2423 obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2424 obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2425 obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2426 obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2427 obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2428 obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2429 obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2430 obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2431 obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2432 obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2433 obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2434 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,imask_SSE0);
2435 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,imask_SSE1);
2436 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,imask_SSE2);
2437 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,imask_SSE3);
2439 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2440 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2441 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2442 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2443 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2444 _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
2445 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2446 _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
2447 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2448 _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
2449 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2450 _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
2451 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2452 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2453 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2454 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2456 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2457 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2458 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2459 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2460 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2461 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2462 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2463 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2464 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2465 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2466 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2467 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2468 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2469 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2470 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2471 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2473 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2474 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2475 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2476 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2477 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2478 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2479 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2480 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2481 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
2482 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
2483 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
2484 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
2485 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2486 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2487 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2488 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2490 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2491 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2492 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2493 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2494 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
2495 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
2496 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
2497 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
2498 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
2499 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2501 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
2502 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2504 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
2505 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2507 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
2508 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2510 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2511 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2512 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2513 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2514 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2515 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2516 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2517 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2518 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
2519 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
2520 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
2521 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
2522 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2523 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2524 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2525 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2526 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2527 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2528 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2529 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2531 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2532 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
2533 _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
2534 _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
2535 _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
2537 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2538 _mm_mul_ps(prod_SSE0,lij3_SSE0));
2539 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2540 _mm_mul_ps(prod_SSE1,lij3_SSE1));
2541 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2542 _mm_mul_ps(prod_SSE2,lij3_SSE2));
2543 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2544 _mm_mul_ps(prod_SSE3,lij3_SSE3));
2545 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2546 _mm_mul_ps(onefourth_SSE,
2547 _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2548 _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2549 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2550 _mm_mul_ps(onefourth_SSE,
2551 _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2552 _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2553 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2554 _mm_mul_ps(onefourth_SSE,
2555 _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2556 _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2557 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2558 _mm_mul_ps(onefourth_SSE,
2559 _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2560 _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2561 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2562 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2563 _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2564 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2565 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2566 _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2567 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2568 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2569 _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2570 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2571 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2572 _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2573 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2574 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2575 _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2576 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2577 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2578 _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2579 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2580 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2581 _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2582 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2583 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2584 _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2586 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2587 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2588 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2589 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2590 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2591 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2592 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2593 _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2595 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2596 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2598 _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2599 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2600 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2602 _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2603 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2604 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2606 _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2607 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2608 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2610 _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2612 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2613 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2614 _mm_add_ps(t2_SSE0,t3_SSE0)));
2615 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2616 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2617 _mm_add_ps(t2_SSE1,t3_SSE1)));
2618 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2619 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2620 _mm_add_ps(t2_SSE2,t3_SSE2)));
2621 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2622 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2623 _mm_add_ps(t2_SSE3,t3_SSE3)));
2625 _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2627 _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2629 _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2631 _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2635 /* Epilogue part, including exclusion mask */
2636 for(j=nj2; j<nj3; j+=UNROLLJ)
2638 jmask_SSE0 = _mm_load_ps((real *)emask0);
2639 jmask_SSE1 = _mm_load_ps((real *)emask1);
2640 jmask_SSE2 = _mm_load_ps((real *)emask2);
2641 jmask_SSE3 = _mm_load_ps((real *)emask3);
2647 /* load j atom coordinates */
2648 jx_SSE = _mm_load_ps(x_align+j);
2649 jy_SSE = _mm_load_ps(y_align+j);
2650 jz_SSE = _mm_load_ps(z_align+j);
2652 /* Calculate distance */
2653 dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
2654 dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
2655 dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
2656 dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
2657 dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
2658 dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
2659 dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
2660 dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
2661 dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
2662 dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
2663 dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
2664 dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);
2666 /* rsq = dx*dx+dy*dy+dz*dz */
2667 rsq_SSE0 = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2668 rsq_SSE1 = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2669 rsq_SSE2 = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2670 rsq_SSE3 = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2673 jmask_SSE0 = _mm_and_ps(jmask_SSE0,imask_SSE0);
2674 jmask_SSE1 = _mm_and_ps(jmask_SSE1,imask_SSE1);
2675 jmask_SSE2 = _mm_and_ps(jmask_SSE2,imask_SSE2);
2676 jmask_SSE3 = _mm_and_ps(jmask_SSE3,imask_SSE3);
2678 /* Calculate 1/r and 1/r2 */
2679 rinv_SSE0 = gmx_mm_invsqrt_ps(rsq_SSE0);
2680 rinv_SSE1 = gmx_mm_invsqrt_ps(rsq_SSE1);
2681 rinv_SSE2 = gmx_mm_invsqrt_ps(rsq_SSE2);
2682 rinv_SSE3 = gmx_mm_invsqrt_ps(rsq_SSE3);
2685 rinv_SSE0 = _mm_and_ps(rinv_SSE0,jmask_SSE0);
2686 rinv_SSE1 = _mm_and_ps(rinv_SSE1,jmask_SSE1);
2687 rinv_SSE2 = _mm_and_ps(rinv_SSE2,jmask_SSE2);
2688 rinv_SSE3 = _mm_and_ps(rinv_SSE3,jmask_SSE3);
2690 dr_SSE0 = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
2691 dr_SSE1 = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
2692 dr_SSE2 = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
2693 dr_SSE3 = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
2695 sk_aj_SSE = _mm_load_ps(obc_param+j);
2696 raj_SSE = _mm_load_ps(gb_radius+j);
2698 raj_inv_SSE = gmx_mm_inv_ps(raj_SSE);
2700 /* Evaluate influence of atom aj -> ai */
2701 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_aj_SSE);
2702 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_aj_SSE);
2703 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_aj_SSE);
2704 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_aj_SSE);
2705 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
2706 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
2707 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
2708 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
2709 t3_SSE0 = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
2710 t3_SSE1 = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
2711 t3_SSE2 = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
2712 t3_SSE3 = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
2714 obc_mask1_SSE0 = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2715 obc_mask1_SSE1 = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2716 obc_mask1_SSE2 = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2717 obc_mask1_SSE3 = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2718 obc_mask2_SSE0 = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2719 obc_mask2_SSE1 = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2720 obc_mask2_SSE2 = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2721 obc_mask2_SSE3 = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2722 obc_mask3_SSE0 = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2723 obc_mask3_SSE1 = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2724 obc_mask3_SSE2 = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2725 obc_mask3_SSE3 = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2726 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
2727 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
2728 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
2729 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
2731 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2732 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2733 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2734 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2735 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2736 _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
2737 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2738 _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
2739 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2740 _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
2741 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2742 _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
2743 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2744 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2745 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2746 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2748 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2749 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2750 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2751 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2752 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2753 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2754 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2755 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2756 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2757 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2758 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2759 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2760 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2761 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2762 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2763 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2765 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2766 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2767 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2768 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2769 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2770 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2771 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2772 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
2773 sk2_aj_SSE = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
2774 sk2_rinv_SSE0 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
2775 sk2_rinv_SSE1 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
2776 sk2_rinv_SSE2 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
2777 sk2_rinv_SSE3 = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
2778 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2779 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2780 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2781 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2783 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2784 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2785 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2786 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2788 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
2789 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
2790 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
2791 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
2792 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
2793 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2795 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
2796 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2798 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
2799 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2801 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
2802 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2805 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2806 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2807 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2808 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2809 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2810 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2811 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2812 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2813 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
2814 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
2815 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
2816 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
2817 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2818 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2819 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2820 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2821 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2822 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2823 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2824 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2826 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2827 sum_ai_SSE1 = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2828 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2829 sum_ai_SSE3 = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2831 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2832 _mm_mul_ps(prod_SSE0,lij3_SSE0));
2833 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2834 _mm_mul_ps(prod_SSE1,lij3_SSE1));
2835 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2836 _mm_mul_ps(prod_SSE2,lij3_SSE2));
2837 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2838 _mm_mul_ps(prod_SSE3,lij3_SSE3));
2839 t1_SSE0 = _mm_sub_ps(t1_SSE0,
2840 _mm_mul_ps(onefourth_SSE,
2841 _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2842 _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2843 t1_SSE1 = _mm_sub_ps(t1_SSE1,
2844 _mm_mul_ps(onefourth_SSE,
2845 _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2846 _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2847 t1_SSE2 = _mm_sub_ps(t1_SSE2,
2848 _mm_mul_ps(onefourth_SSE,
2849 _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2850 _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2851 t1_SSE3 = _mm_sub_ps(t1_SSE3,
2852 _mm_mul_ps(onefourth_SSE,
2853 _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2854 _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2856 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
2857 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2858 _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2859 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
2860 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2861 _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2862 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
2863 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2864 _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2865 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
2866 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2867 _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2868 t2_SSE0 = _mm_sub_ps(t2_SSE0,
2869 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2870 _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2871 t2_SSE1 = _mm_sub_ps(t2_SSE1,
2872 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2873 _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2874 t2_SSE2 = _mm_sub_ps(t2_SSE2,
2875 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2876 _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2877 t2_SSE3 = _mm_sub_ps(t2_SSE3,
2878 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2879 _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2880 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2881 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2882 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2883 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2884 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2885 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2886 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2887 _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2888 t3_SSE0 = _mm_sub_ps(t3_SSE0,
2889 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2891 _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2892 t3_SSE1 = _mm_sub_ps(t3_SSE1,
2893 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2895 _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2896 t3_SSE2 = _mm_sub_ps(t3_SSE2,
2897 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2899 _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2900 t3_SSE3 = _mm_sub_ps(t3_SSE3,
2901 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2903 _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2905 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
2906 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2907 _mm_add_ps(t2_SSE0,t3_SSE0)));
2908 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
2909 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2910 _mm_add_ps(t2_SSE1,t3_SSE1)));
2911 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
2912 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2913 _mm_add_ps(t2_SSE2,t3_SSE2)));
2914 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
2915 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2916 _mm_add_ps(t2_SSE3,t3_SSE3)));
2918 _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2920 _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2922 _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2924 _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2927 /* Evaluate influence of atom ai -> aj */
2928 t1_SSE0 = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
2929 t1_SSE1 = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
2930 t1_SSE2 = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
2931 t1_SSE3 = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
2932 t2_SSE0 = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
2933 t2_SSE1 = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
2934 t2_SSE2 = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
2935 t2_SSE3 = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
2936 t3_SSE0 = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
2937 t3_SSE1 = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
2938 t3_SSE2 = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
2939 t3_SSE3 = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
2941 obc_mask1_SSE0 = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2942 obc_mask1_SSE1 = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2943 obc_mask1_SSE2 = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2944 obc_mask1_SSE3 = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2945 obc_mask2_SSE0 = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2946 obc_mask2_SSE1 = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2947 obc_mask2_SSE2 = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2948 obc_mask2_SSE3 = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2949 obc_mask3_SSE0 = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2950 obc_mask3_SSE1 = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2951 obc_mask3_SSE2 = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2952 obc_mask3_SSE3 = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2953 obc_mask1_SSE0 = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
2954 obc_mask1_SSE1 = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
2955 obc_mask1_SSE2 = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
2956 obc_mask1_SSE3 = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
2958 uij_SSE0 = gmx_mm_inv_ps(t1_SSE0);
2959 uij_SSE1 = gmx_mm_inv_ps(t1_SSE1);
2960 uij_SSE2 = gmx_mm_inv_ps(t1_SSE2);
2961 uij_SSE3 = gmx_mm_inv_ps(t1_SSE3);
2962 lij_SSE0 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2963 _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
2964 lij_SSE1 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2965 _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
2966 lij_SSE2 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2967 _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
2968 lij_SSE3 = _mm_or_ps( _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2969 _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
2970 dlij_SSE0 = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2971 dlij_SSE1 = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2972 dlij_SSE2 = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2973 dlij_SSE3 = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2975 uij2_SSE0 = _mm_mul_ps(uij_SSE0, uij_SSE0);
2976 uij2_SSE1 = _mm_mul_ps(uij_SSE1, uij_SSE1);
2977 uij2_SSE2 = _mm_mul_ps(uij_SSE2, uij_SSE2);
2978 uij2_SSE3 = _mm_mul_ps(uij_SSE3, uij_SSE3);
2979 uij3_SSE0 = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2980 uij3_SSE1 = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2981 uij3_SSE2 = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2982 uij3_SSE3 = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2983 lij2_SSE0 = _mm_mul_ps(lij_SSE0, lij_SSE0);
2984 lij2_SSE1 = _mm_mul_ps(lij_SSE1, lij_SSE1);
2985 lij2_SSE2 = _mm_mul_ps(lij_SSE2, lij_SSE2);
2986 lij2_SSE3 = _mm_mul_ps(lij_SSE3, lij_SSE3);
2987 lij3_SSE0 = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2988 lij3_SSE1 = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2989 lij3_SSE2 = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2990 lij3_SSE3 = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2992 diff2_SSE0 = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2993 diff2_SSE1 = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2994 diff2_SSE2 = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2995 diff2_SSE3 = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2996 lij_inv_SSE0 = gmx_mm_invsqrt_ps(lij2_SSE0);
2997 lij_inv_SSE1 = gmx_mm_invsqrt_ps(lij2_SSE1);
2998 lij_inv_SSE2 = gmx_mm_invsqrt_ps(lij2_SSE2);
2999 lij_inv_SSE3 = gmx_mm_invsqrt_ps(lij2_SSE3);
3000 sk2_rinv_SSE0 = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
3001 sk2_rinv_SSE1 = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
3002 sk2_rinv_SSE2 = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
3003 sk2_rinv_SSE3 = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
3004 prod_SSE0 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
3005 prod_SSE1 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
3006 prod_SSE2 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
3007 prod_SSE3 = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
3009 logterm_SSE0 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
3010 logterm_SSE1 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
3011 logterm_SSE2 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
3012 logterm_SSE3 = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
3013 t1_SSE0 = _mm_sub_ps(lij_SSE0,uij_SSE0);
3014 t1_SSE1 = _mm_sub_ps(lij_SSE1,uij_SSE1);
3015 t1_SSE2 = _mm_sub_ps(lij_SSE2,uij_SSE2);
3016 t1_SSE3 = _mm_sub_ps(lij_SSE3,uij_SSE3);
3017 t2_SSE0 = _mm_mul_ps(diff2_SSE0,
3018 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
3020 t2_SSE1 = _mm_mul_ps(diff2_SSE1,
3021 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
3023 t2_SSE2 = _mm_mul_ps(diff2_SSE2,
3024 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
3026 t2_SSE3 = _mm_mul_ps(diff2_SSE3,
3027 _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
3029 t3_SSE0 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
3030 t3_SSE1 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
3031 t3_SSE2 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
3032 t3_SSE3 = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
3033 t1_SSE0 = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
3034 t1_SSE1 = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
3035 t1_SSE2 = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
3036 t1_SSE3 = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
3037 t4_SSE0 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
3038 t4_SSE1 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
3039 t4_SSE2 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
3040 t4_SSE3 = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
3041 t4_SSE0 = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
3042 t4_SSE1 = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
3043 t4_SSE2 = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
3044 t4_SSE3 = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
3045 t1_SSE0 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
3046 t1_SSE1 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
3047 t1_SSE2 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
3048 t1_SSE3 = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
3050 _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
3051 gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
3052 _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
3053 _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
3054 _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
3056 t1_SSE0 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
3057 _mm_mul_ps(prod_SSE0,lij3_SSE0));
3058 t1_SSE1 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
3059 _mm_mul_ps(prod_SSE1,lij3_SSE1));
3060 t1_SSE2 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
3061 _mm_mul_ps(prod_SSE2,lij3_SSE2));
3062 t1_SSE3 = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
3063 _mm_mul_ps(prod_SSE3,lij3_SSE3));
3064 t1_SSE0 = _mm_sub_ps(t1_SSE0,
3065 _mm_mul_ps(onefourth_SSE,
3066 _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
3067 _mm_mul_ps(lij3_SSE0,dr_SSE0))));
3068 t1_SSE1 = _mm_sub_ps(t1_SSE1,
3069 _mm_mul_ps(onefourth_SSE,
3070 _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
3071 _mm_mul_ps(lij3_SSE1,dr_SSE1))));
3072 t1_SSE2 = _mm_sub_ps(t1_SSE2,
3073 _mm_mul_ps(onefourth_SSE,
3074 _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
3075 _mm_mul_ps(lij3_SSE2,dr_SSE2))));
3076 t1_SSE3 = _mm_sub_ps(t1_SSE3,
3077 _mm_mul_ps(onefourth_SSE,
3078 _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
3079 _mm_mul_ps(lij3_SSE3,dr_SSE3))));
3080 t2_SSE0 = _mm_mul_ps(onefourth_SSE,
3081 _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
3082 _mm_mul_ps(uij3_SSE0,dr_SSE0)));
3083 t2_SSE1 = _mm_mul_ps(onefourth_SSE,
3084 _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
3085 _mm_mul_ps(uij3_SSE1,dr_SSE1)));
3086 t2_SSE2 = _mm_mul_ps(onefourth_SSE,
3087 _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
3088 _mm_mul_ps(uij3_SSE2,dr_SSE2)));
3089 t2_SSE3 = _mm_mul_ps(onefourth_SSE,
3090 _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
3091 _mm_mul_ps(uij3_SSE3,dr_SSE3)));
3092 t2_SSE0 = _mm_sub_ps(t2_SSE0,
3093 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
3094 _mm_mul_ps(prod_SSE0,uij3_SSE0)));
3095 t2_SSE1 = _mm_sub_ps(t2_SSE1,
3096 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
3097 _mm_mul_ps(prod_SSE1,uij3_SSE1)));
3098 t2_SSE2 = _mm_sub_ps(t2_SSE2,
3099 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
3100 _mm_mul_ps(prod_SSE2,uij3_SSE2)));
3101 t2_SSE3 = _mm_sub_ps(t2_SSE3,
3102 _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
3103 _mm_mul_ps(prod_SSE3,uij3_SSE3)));
3105 t3_SSE0 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
3106 _mm_mul_ps(rinv_SSE0,rinv_SSE0));
3107 t3_SSE1 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
3108 _mm_mul_ps(rinv_SSE1,rinv_SSE1));
3109 t3_SSE2 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
3110 _mm_mul_ps(rinv_SSE2,rinv_SSE2));
3111 t3_SSE3 = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
3112 _mm_mul_ps(rinv_SSE3,rinv_SSE3));
3114 t3_SSE0 = _mm_sub_ps(t3_SSE0,
3115 _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
3117 _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
3118 t3_SSE1 = _mm_sub_ps(t3_SSE1,
3119 _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
3121 _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
3122 t3_SSE2 = _mm_sub_ps(t3_SSE2,
3123 _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
3125 _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
3126 t3_SSE3 = _mm_sub_ps(t3_SSE3,
3127 _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
3129 _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
3132 t1_SSE0 = _mm_mul_ps(rinv_SSE0,
3133 _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
3134 _mm_add_ps(t2_SSE0,t3_SSE0)));
3135 t1_SSE1 = _mm_mul_ps(rinv_SSE1,
3136 _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
3137 _mm_add_ps(t2_SSE1,t3_SSE1)));
3138 t1_SSE2 = _mm_mul_ps(rinv_SSE2,
3139 _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
3140 _mm_add_ps(t2_SSE2,t3_SSE2)));
3141 t1_SSE3 = _mm_mul_ps(rinv_SSE3,
3142 _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
3143 _mm_add_ps(t2_SSE3,t3_SSE3)));
3145 _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
3147 _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
3149 _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
3151 _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
3154 _MM_TRANSPOSE4_PS(sum_ai_SSE0,sum_ai_SSE1,sum_ai_SSE2,sum_ai_SSE3);
3155 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,sum_ai_SSE1);
3156 sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2,sum_ai_SSE3);
3157 sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,sum_ai_SSE2);
3158 _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
3162 for(i=0;i<natoms/2+1;i++)
3164 work[i] += work[natoms+i];
3167 /* Parallel summations */
3171 gmx_sum(natoms,work, cr);
3174 if(gb_algorithm==egbHCT)
3177 for(i=0;i<natoms;i++)
3179 if(born->use[i] != 0)
3181 rai = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
3182 sum_ai = 1.0/rai - work[i];
3183 min_rad = rai + born->gb_doffset;
3186 born->bRad[i] = rad > min_rad ? rad : min_rad;
3187 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3196 /* Calculate the radii */
3197 for(i=0;i<natoms;i++)
3200 if(born->use[i] != 0)
3202 rai = top->atomtypes.gb_radius[mdatoms->typeA[i]];
3204 rai = rai-born->gb_doffset;
3206 sum_ai = rai * work[i];
3207 sum_ai2 = sum_ai * sum_ai;
3208 sum_ai3 = sum_ai2 * sum_ai;
3210 tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
3211 born->bRad[i] = rai_inv - tsum*rai_inv2;
3212 born->bRad[i] = 1.0 / born->bRad[i];
3214 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
3216 tchain = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
3217 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
3233 genborn_allvsall_calc_chainrule_sse2_single(t_forcerec * fr,
3234 t_mdatoms * mdatoms,
3235 gmx_genborn_t * born,
3241 gmx_allvsallgb2_data_t *aadata;
3244 int nj0,nj1,nj2,nj3;
3257 real rbai,rbaj,fgb,fgb_ai,rbi;
3268 __m128 jmask_SSE0,jmask_SSE1,jmask_SSE2,jmask_SSE3;
3269 __m128 ix_SSE0,iy_SSE0,iz_SSE0;
3270 __m128 ix_SSE1,iy_SSE1,iz_SSE1;
3271 __m128 ix_SSE2,iy_SSE2,iz_SSE2;
3272 __m128 ix_SSE3,iy_SSE3,iz_SSE3;
3273 __m128 fix_SSE0,fiy_SSE0,fiz_SSE0;
3274 __m128 fix_SSE1,fiy_SSE1,fiz_SSE1;
3275 __m128 fix_SSE2,fiy_SSE2,fiz_SSE2;
3276 __m128 fix_SSE3,fiy_SSE3,fiz_SSE3;
3277 __m128 rbai_SSE0,rbai_SSE1,rbai_SSE2,rbai_SSE3;
3278 __m128 imask_SSE0,imask_SSE1,imask_SSE2,imask_SSE3;
3279 __m128 jx_SSE,jy_SSE,jz_SSE,rbaj_SSE;
3280 __m128 dx_SSE0,dy_SSE0,dz_SSE0;
3281 __m128 dx_SSE1,dy_SSE1,dz_SSE1;
3282 __m128 dx_SSE2,dy_SSE2,dz_SSE2;
3283 __m128 dx_SSE3,dy_SSE3,dz_SSE3;
3284 __m128 fgb_SSE0,fgb_ai_SSE0;
3285 __m128 fgb_SSE1,fgb_ai_SSE1;
3286 __m128 fgb_SSE2,fgb_ai_SSE2;
3287 __m128 fgb_SSE3,fgb_ai_SSE3;
3288 __m128 tx_SSE0,ty_SSE0,tz_SSE0;
3289 __m128 tx_SSE1,ty_SSE1,tz_SSE1;
3290 __m128 tx_SSE2,ty_SSE2,tz_SSE2;
3291 __m128 tx_SSE3,ty_SSE3,tz_SSE3;
3294 natoms = mdatoms->nr;
3295 ni0 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
3296 ni1 = mdatoms->start+mdatoms->homenr;
3299 aadata = (gmx_allvsallgb2_data_t *)paadata;
3301 x_align = aadata->x_align;
3302 y_align = aadata->y_align;
3303 z_align = aadata->z_align;
3304 fx_align = aadata->fx_align;
3305 fy_align = aadata->fy_align;
3306 fz_align = aadata->fz_align;
3308 jindex = aadata->jindex_gb;
3314 /* Loop to get the proper form for the Born radius term */
3315 if(gb_algorithm==egbSTILL)
3317 for(i=0;i<natoms;i++)
3319 rbi = born->bRad[i];
3320 rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
3323 else if(gb_algorithm==egbHCT)
3325 for(i=0;i<natoms;i++)
3327 rbi = born->bRad[i];
3328 rb[i] = rbi * rbi * fr->dvda[i];
3331 else if(gb_algorithm==egbOBC)
3333 for(idx=0;idx<natoms;idx++)
3335 rbi = born->bRad[idx];
3336 rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
3340 for(i=0;i<2*natoms;i++)
3348 for(i=0;i<natoms;i++)
3353 for(i=ni0; i<ni1; i+=UNROLLI)
3355 /* We assume shifts are NOT used for all-vs-all interactions */
3357 /* Load i atom data */
3358 ix_SSE0 = _mm_load1_ps(x_align+i);
3359 iy_SSE0 = _mm_load1_ps(y_align+i);
3360 iz_SSE0 = _mm_load1_ps(z_align+i);
3361 ix_SSE1 = _mm_load1_ps(x_align+i+1);
3362 iy_SSE1 = _mm_load1_ps(y_align+i+1);
3363 iz_SSE1 = _mm_load1_ps(z_align+i+1);
3364 ix_SSE2 = _mm_load1_ps(x_align+i+2);
3365 iy_SSE2 = _mm_load1_ps(y_align+i+2);
3366 iz_SSE2 = _mm_load1_ps(z_align+i+2);
3367 ix_SSE3 = _mm_load1_ps(x_align+i+3);
3368 iy_SSE3 = _mm_load1_ps(y_align+i+3);
3369 iz_SSE3 = _mm_load1_ps(z_align+i+3);
3371 fix_SSE0 = _mm_setzero_ps();
3372 fiy_SSE0 = _mm_setzero_ps();
3373 fiz_SSE0 = _mm_setzero_ps();
3374 fix_SSE1 = _mm_setzero_ps();
3375 fiy_SSE1 = _mm_setzero_ps();
3376 fiz_SSE1 = _mm_setzero_ps();
3377 fix_SSE2 = _mm_setzero_ps();
3378 fiy_SSE2 = _mm_setzero_ps();
3379 fiz_SSE2 = _mm_setzero_ps();
3380 fix_SSE3 = _mm_setzero_ps();
3381 fiy_SSE3 = _mm_setzero_ps();
3382 fiz_SSE3 = _mm_setzero_ps();
3384 rbai_SSE0 = _mm_load1_ps(rb+i);
3385 rbai_SSE1 = _mm_load1_ps(rb+i+1);
3386 rbai_SSE2 = _mm_load1_ps(rb+i+2);
3387 rbai_SSE3 = _mm_load1_ps(rb+i+3);
3389 /* Load limits for loop over neighbors */
3391 nj3 = jindex[4*i+3];
3393 /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
3394 for(j=nj0; j<nj3; j+=UNROLLJ)
3396 /* load j atom coordinates */
3397 jx_SSE = _mm_load_ps(x_align+j);
3398 jy_SSE = _mm_load_ps(y_align+j);
3399 jz_SSE = _mm_load_ps(z_align+j);
3401 /* Calculate distance */
3402 dx_SSE0 = _mm_sub_ps(ix_SSE0,jx_SSE);
3403 dy_SSE0 = _mm_sub_ps(iy_SSE0,jy_SSE);
3404 dz_SSE0 = _mm_sub_ps(iz_SSE0,jz_SSE);
3405 dx_SSE1 = _mm_sub_ps(ix_SSE1,jx_SSE);
3406 dy_SSE1 = _mm_sub_ps(iy_SSE1,jy_SSE);
3407 dz_SSE1 = _mm_sub_ps(iz_SSE1,jz_SSE);
3408 dx_SSE2 = _mm_sub_ps(ix_SSE2,jx_SSE);
3409 dy_SSE2 = _mm_sub_ps(iy_SSE2,jy_SSE);
3410 dz_SSE2 = _mm_sub_ps(iz_SSE2,jz_SSE);
3411 dx_SSE3 = _mm_sub_ps(ix_SSE3,jx_SSE);
3412 dy_SSE3 = _mm_sub_ps(iy_SSE3,jy_SSE);
3413 dz_SSE3 = _mm_sub_ps(iz_SSE3,jz_SSE);
3415 rbaj_SSE = _mm_load_ps(rb+j);
3417 fgb_SSE0 = _mm_mul_ps(rbai_SSE0,_mm_load_ps(dadx));
3419 fgb_SSE1 = _mm_mul_ps(rbai_SSE1,_mm_load_ps(dadx));
3421 fgb_SSE2 = _mm_mul_ps(rbai_SSE2,_mm_load_ps(dadx));
3423 fgb_SSE3 = _mm_mul_ps(rbai_SSE3,_mm_load_ps(dadx));
3426 fgb_ai_SSE0 = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3428 fgb_ai_SSE1 = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3430 fgb_ai_SSE2 = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3432 fgb_ai_SSE3 = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3435 /* Total force between ai and aj is the sum of ai->aj and aj->ai */
3436 fgb_SSE0 = _mm_add_ps(fgb_SSE0,fgb_ai_SSE0);
3437 fgb_SSE1 = _mm_add_ps(fgb_SSE1,fgb_ai_SSE1);
3438 fgb_SSE2 = _mm_add_ps(fgb_SSE2,fgb_ai_SSE2);
3439 fgb_SSE3 = _mm_add_ps(fgb_SSE3,fgb_ai_SSE3);
3441 /* Calculate temporary vectorial force */
3442 tx_SSE0 = _mm_mul_ps(fgb_SSE0,dx_SSE0);
3443 ty_SSE0 = _mm_mul_ps(fgb_SSE0,dy_SSE0);
3444 tz_SSE0 = _mm_mul_ps(fgb_SSE0,dz_SSE0);
3445 tx_SSE1 = _mm_mul_ps(fgb_SSE1,dx_SSE1);
3446 ty_SSE1 = _mm_mul_ps(fgb_SSE1,dy_SSE1);
3447 tz_SSE1 = _mm_mul_ps(fgb_SSE1,dz_SSE1);
3448 tx_SSE2 = _mm_mul_ps(fgb_SSE2,dx_SSE2);
3449 ty_SSE2 = _mm_mul_ps(fgb_SSE2,dy_SSE2);
3450 tz_SSE2 = _mm_mul_ps(fgb_SSE2,dz_SSE2);
3451 tx_SSE3 = _mm_mul_ps(fgb_SSE3,dx_SSE3);
3452 ty_SSE3 = _mm_mul_ps(fgb_SSE3,dy_SSE3);
3453 tz_SSE3 = _mm_mul_ps(fgb_SSE3,dz_SSE3);
3455 /* Increment i atom force */
3456 fix_SSE0 = _mm_add_ps(fix_SSE0,tx_SSE0);
3457 fiy_SSE0 = _mm_add_ps(fiy_SSE0,ty_SSE0);
3458 fiz_SSE0 = _mm_add_ps(fiz_SSE0,tz_SSE0);
3459 fix_SSE1 = _mm_add_ps(fix_SSE1,tx_SSE1);
3460 fiy_SSE1 = _mm_add_ps(fiy_SSE1,ty_SSE1);
3461 fiz_SSE1 = _mm_add_ps(fiz_SSE1,tz_SSE1);
3462 fix_SSE2 = _mm_add_ps(fix_SSE2,tx_SSE2);
3463 fiy_SSE2 = _mm_add_ps(fiy_SSE2,ty_SSE2);
3464 fiz_SSE2 = _mm_add_ps(fiz_SSE2,tz_SSE2);
3465 fix_SSE3 = _mm_add_ps(fix_SSE3,tx_SSE3);
3466 fiy_SSE3 = _mm_add_ps(fiy_SSE3,ty_SSE3);
3467 fiz_SSE3 = _mm_add_ps(fiz_SSE3,tz_SSE3);
3469 /* Decrement j atom force */
3470 _mm_store_ps(fx_align+j,
3471 _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
3472 _mm_store_ps(fy_align+j,
3473 _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
3474 _mm_store_ps(fz_align+j,
3475 _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
3477 /* Add i forces to mem and shifted force list */
3478 _MM_TRANSPOSE4_PS(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3);
3479 fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE1);
3480 fix_SSE2 = _mm_add_ps(fix_SSE2,fix_SSE3);
3481 fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE2);
3482 _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));
3484 _MM_TRANSPOSE4_PS(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3);
3485 fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE1);
3486 fiy_SSE2 = _mm_add_ps(fiy_SSE2,fiy_SSE3);
3487 fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE2);
3488 _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));
3490 _MM_TRANSPOSE4_PS(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3);
3491 fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE1);
3492 fiz_SSE2 = _mm_add_ps(fiz_SSE2,fiz_SSE3);
3493 fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE2);
3494 _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
3497 for(i=0;i<natoms;i++)
3499 f[3*i] += fx_align[i] + fx_align[natoms+i];
3500 f[3*i+1] += fy_align[i] + fy_align[natoms+i];
3501 f[3*i+2] += fz_align[i] + fz_align[natoms+i];
3508 /* Dummy variable for the non-SSE build: ISO C forbids an empty
     * translation unit, so when GMX_X86_SSE2 is not defined (or this
     * code path is compiled out) we emit one harmless global to keep
     * the compiler happy. It is never read or written. */
3509 int genborn_allvsall_sse2_single_dummy;