Update copyright statements and change license to LGPL
[alexxy/gromacs.git] / src / mdlib / genborn_allvsall_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2012, by the GROMACS development team, led by
6  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
7  * others, as listed in the AUTHORS file in the top-level source
8  * directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36 #ifdef HAVE_CONFIG_H
37 #include <config.h>
38 #endif
39
40 #include <math.h>
41 #include "types/simple.h"
42
43 #include "vec.h"
44 #include "smalloc.h"
45
46 #include "partdec.h"
47 #include "network.h"
48 #include "physics.h"
49 #include "genborn.h"
50 #include "genborn_allvsall.h"
51
52 #if 0 && defined (GMX_X86_SSE2)
53
54 #include <gmx_sse2_single.h>
55
56
57 #define SIMD_WIDTH 4
58 #define UNROLLI    4
59 #define UNROLLJ    4
60
61
62
63
64
65
66
67
68
69 typedef struct 
70 {
71     int *      jindex_gb;
72     int **     prologue_mask_gb;
73     int **     epilogue_mask;
74     int *      imask;
75     real *     gb_radius;
76     real *     workparam;
77     real *     work;
78     real *     x_align;
79     real *     y_align;
80     real *     z_align;
81     real *     fx_align;
82         real *     fy_align;
83         real *     fz_align;        
84
85 gmx_allvsallgb2_data_t;
86
87
/* Return the number of j atoms above i that atom i interacts with in the
 * all-vs-all triangle scheme. For even atom counts the offsets alternate
 * between natoms/2 and natoms/2-1 so every pair is visited exactly once.
 */
static int
calc_maxoffset(int i,int natoms)
{
    int half = natoms/2;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms: every atom gets the same offset. */
        return half;
    }

    if ((natoms % 4) == 0)
    {
        /* Multiple of four: parity rule flips between the two halves. */
        if (i < half)
        {
            return ((i % 2) == 0) ? half : half-1;
        }
        return ((i % 2) == 1) ? half : half-1;
    }

    /* natoms even but natoms/2 odd: plain parity rule. */
    return ((i % 2) == 0) ? half : half-1;
}
139
140 static void
141 setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
142                                 t_ilist *                  ilist,
143                                 int                        start,
144                                 int                        end,
145                                 int                        natoms,
146                                 gmx_bool                       bInclude12,
147                                 gmx_bool                       bInclude13,
148                                 gmx_bool                       bInclude14)
149 {
150     int i,j,k,tp;
151     int a1,a2;
152     int ni0,ni1,nj0,nj1,nj;
153     int imin,imax,iexcl;
154     int max_offset;
155     int max_excl_offset;
156     int firstinteraction;
157     int ibase;
158     int  *pi;
159     
160     /* This routine can appear to be a bit complex, but it is mostly book-keeping.
161      * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
162      * whether they should interact or not. 
163      *
164      * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
165      * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
166      * we create a jindex array with three elements per i atom: the starting point, the point to
167      * which we need to check exclusions, and the end point.
168      * This way we only have to allocate a short exclusion mask per i atom.
169      */
170     
171     ni0 = (start/UNROLLI)*UNROLLI;
172     ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;
173     
174     /* Set the interaction mask to only enable the i atoms we want to include */
175     snew(pi,natoms+UNROLLI+2*SIMD_WIDTH);
176     aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
177     for(i=0;i<natoms+UNROLLI;i++)
178     {
179         aadata->imask[i] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
180     }
181     
182     /* Allocate memory for our modified jindex array */
183     snew(aadata->jindex_gb,4*(natoms+UNROLLI));
184     for(i=0;i<4*(natoms+UNROLLI);i++)
185     {
186         aadata->jindex_gb[i] = 0;
187     }
188     
189     /* Create the exclusion masks for the prologue part */
190         snew(aadata->prologue_mask_gb,natoms+UNROLLI); /* list of pointers */
191         
192     /* First zero everything to avoid uninitialized data */
193     for(i=0;i<natoms+UNROLLI;i++)
194     {
195         aadata->prologue_mask_gb[i] = NULL;
196     }
197     
198     /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
199     for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
200         {
201         max_excl_offset = -1;
202         
203         /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
204         imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
205         
206         /* Which atom is the first we (might) interact with? */
207         imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
208         for(i=ibase;i<imax;i++)
209         {
210             /* Before exclusions, which atom is the first we (might) interact with? */
211             firstinteraction = i+1;
212             max_offset = calc_maxoffset(i,natoms);
213
214             if(!bInclude12)
215             {
216                 for(j=0;j<ilist[F_GB12].nr;j+=3)
217                 {
218                     a1 = ilist[F_GB12].iatoms[j+1];
219                     a2 = ilist[F_GB12].iatoms[j+2];
220
221                     if(a1==i)
222                     {
223                         k = a2;
224                     }
225                     else if(a2==i)
226                     {
227                         k = a1;
228                     }
229                     else 
230                     {
231                         continue;
232                     }
233
234                     if(k==firstinteraction)
235                     {
236                         firstinteraction++;
237                     }
238                 }
239             }
240             if(!bInclude13)
241             {
242                 for(j=0;j<ilist[F_GB13].nr;j+=3)
243                 {
244                     a1 = ilist[F_GB13].iatoms[j+1];
245                     a2 = ilist[F_GB13].iatoms[j+2];
246                     
247                     if(a1==i)
248                     {
249                         k = a2;
250                     }
251                     else if(a2==i)
252                     {
253                         k = a1;
254                     }
255                     else 
256                     {
257                         continue;
258                     }
259                     
260                     if(k==firstinteraction)
261                     {
262                         firstinteraction++;
263                     }
264                 }
265             }
266             if(!bInclude14)
267             {
268                 for(j=0;j<ilist[F_GB14].nr;j+=3)
269                 {
270                     a1 = ilist[F_GB14].iatoms[j+1];
271                     a2 = ilist[F_GB14].iatoms[j+2];
272                     if(a1==i)
273                     {
274                         k = a2;
275                     }
276                     else if(a2==i)
277                     {
278                         k = a1;
279                     }
280                     else 
281                     {
282                         continue;
283                     }
284                     
285                     if(k==firstinteraction)
286                     {
287                         firstinteraction++;
288                     }
289                 }
290             }
291             imin = (firstinteraction < imin) ? firstinteraction : imin;
292         }
293         /* round down to j unrolling factor */
294         imin = (imin/UNROLLJ)*UNROLLJ;
295
296         for(i=ibase;i<imax;i++)
297         {
298             max_offset = calc_maxoffset(i,natoms);
299             
300             if(!bInclude12)
301             {
302                 for(j=0;j<ilist[F_GB12].nr;j+=3)
303                 {
304                     a1 = ilist[F_GB12].iatoms[j+1];
305                     a2 = ilist[F_GB12].iatoms[j+2];
306                     
307                     if(a1==i)
308                     {
309                         k = a2;
310                     }
311                     else if(a2==i)
312                     {
313                         k = a1;
314                     }
315                     else 
316                     {
317                         continue;
318                     }
319                     
320                     if(k<imin)
321                     {
322                         k += natoms;
323                     }
324                     
325                     if(k>i+max_offset)
326                     {
327                         continue;
328                     }
329                     
330                     k = k - imin;
331                     
332                     if( k+natoms <= max_offset )
333                     {
334                         k+=natoms;
335                     }
336                     max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
337                 }
338             }
339             if(!bInclude13)
340             {
341                 for(j=0;j<ilist[F_GB13].nr;j+=3)
342                 {
343                     a1 = ilist[F_GB13].iatoms[j+1];
344                     a2 = ilist[F_GB13].iatoms[j+2];
345                     
346                     if(a1==i)
347                     {
348                         k = a2;
349                     }
350                     else if(a2==i)
351                     {
352                         k = a1;
353                     }
354                     else 
355                     {
356                         continue;
357                     }
358                     
359                     if(k<imin)
360                     {
361                         k += natoms;
362                     }
363                     
364                     if(k>i+max_offset)
365                     {
366                         continue;
367                     }
368                     
369                     k = k - imin;
370                     
371                     if( k+natoms <= max_offset )
372                     {
373                         k+=natoms;
374                     }
375                     max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
376                 }
377             }
378             if(!bInclude14)
379             {
380                 for(j=0;j<ilist[F_GB14].nr;j+=3)
381                 {
382                     a1 = ilist[F_GB14].iatoms[j+1];
383                     a2 = ilist[F_GB14].iatoms[j+2];
384
385                     if(a1==i)
386                     {
387                         k = a2;
388                     }
389                     else if(a2==i)
390                     {
391                         k = a1;
392                     }
393                     else 
394                     {
395                         continue;
396                     }
397                     
398                     if(k<imin)
399                     {
400                         k += natoms;
401                     }
402                     
403                     if(k>i+max_offset)
404                     {
405                         continue;
406                     }
407                     
408                     k = k - imin;
409                     
410                     if( k+natoms <= max_offset )
411                     {
412                         k+=natoms;
413                     }
414                     max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
415                 }
416             }
417         }
418
419         /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
420         max_excl_offset++;
421         /* round up to j unrolling factor */
422         max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
423         
424         /* Set all the prologue masks length to this value (even for i>end) */
425         for(i=ibase;i<ibase+UNROLLI;i++)
426         {
427             aadata->jindex_gb[4*i]   = imin;
428             aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
429         }        
430     }
431      
432     /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
433     for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
434     {      
435         for(i=ibase;i<ibase+UNROLLI;i++)
436         {
437             nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
438             imin = aadata->jindex_gb[4*i];
439             
440             /* Allocate aligned memory */
441             snew(pi,nj+2*SIMD_WIDTH);
442             aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
443             
444             max_offset = calc_maxoffset(i,natoms);
445             
446             /* Include interactions i+1 <= j < i+maxoffset */
447             for(k=0;k<nj;k++)
448             {
449                 j = imin + k;
450                 
451                 if( (j>i) && (j<=i+max_offset) )
452                 {
453                     aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
454                 }
455                 else
456                 {
457                     aadata->prologue_mask_gb[i][k] = 0;
458                 }
459             }
460             
461             /* Clear out the explicit exclusions */
462             if(i<end)
463             {
464                 if(!bInclude12)
465                 {
466                     for(j=0;j<ilist[F_GB12].nr;j+=3)
467                     {
468                         a1 = ilist[F_GB12].iatoms[j+1];
469                         a2 = ilist[F_GB12].iatoms[j+2];
470                         
471                         if(a1==i)
472                         {
473                             k = a2;
474                         }
475                         else if(a2==i)
476                         {
477                             k = a1;
478                         }
479                         else 
480                         {
481                             continue;
482                         }
483                         
484                         if(k>i+max_offset)
485                         {
486                             continue;
487                         }
488                         k = k-i;
489                         
490                         if( k+natoms <= max_offset )
491                         {
492                             k+=natoms;
493                         }
494                         
495                         k = k+i-imin;
496                         if(k>=0)
497                         {                        
498                             aadata->prologue_mask_gb[i][k] = 0;
499                         }
500                     }
501                 }
502                 if(!bInclude13)
503                 {
504                     for(j=0;j<ilist[F_GB13].nr;j+=3)
505                     {
506                         a1 = ilist[F_GB13].iatoms[j+1];
507                         a2 = ilist[F_GB13].iatoms[j+2];
508  
509                         if(a1==i)
510                         {
511                             k = a2;
512                         }
513                         else if(a2==i)
514                         {
515                             k = a1;
516                         }
517                         else 
518                         {
519                             continue;
520                         }
521                         
522                         if(k>i+max_offset)
523                         {
524                             continue;
525                         }
526                         k = k-i;
527                         
528                         if( k+natoms <= max_offset )
529                         {
530                             k+=natoms;
531                         }
532                         
533                         k = k+i-imin;
534                         if(k>=0)
535                         {                        
536                             aadata->prologue_mask_gb[i][k] = 0;
537                         }
538                     }
539                 }
540                 if(!bInclude14)
541                 {
542                     for(j=0;j<ilist[F_GB14].nr;j+=3)
543                     {
544                         a1 = ilist[F_GB14].iatoms[j+1];
545                         a2 = ilist[F_GB14].iatoms[j+2];
546
547                         if(a1==i)
548                         {
549                             k = a2;
550                         }
551                         else if(a2==i)
552                         {
553                             k = a1;
554                         }
555                         else 
556                         {
557                             continue;
558                         }
559                         
560                         if(k>i+max_offset)
561                         {
562                             continue;
563                         }
564                         k = k-i;
565                         
566                         if( k+natoms <= max_offset )
567                         {
568                             k+=natoms;
569                         }
570                         
571                         k = k+i-imin;
572                         if(k>=0)
573                         {                        
574                             aadata->prologue_mask_gb[i][k] = 0;
575                         }
576                     }
577                 }
578             }
579         }
580     }
581     
582     /* Construct the epilogue mask - this just contains the check for maxoffset */
583     snew(aadata->epilogue_mask,natoms+UNROLLI);
584     
585     /* First zero everything to avoid uninitialized data */
586     for(i=0;i<natoms+UNROLLI;i++)
587     {
588         aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
589         aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
590         aadata->epilogue_mask[i] = NULL;
591     }
592     
593     for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
594     {      
595         /* Find the lowest index for which we need to use the epilogue */
596         imin = ibase;
597         max_offset = calc_maxoffset(imin,natoms);
598
599         imin = imin + 1 + max_offset;
600         
601         /* Find largest index for which we need to use the epilogue */
602         imax = ibase + UNROLLI-1;
603         imax = (imax < end) ? imax : end; 
604         
605         max_offset = calc_maxoffset(imax,natoms);
606         imax = imax + 1 + max_offset + UNROLLJ - 1;
607         
608         for(i=ibase;i<ibase+UNROLLI;i++)
609         {
610             /* Start of epilogue - round down to j tile limit */
611             aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
612             /* Make sure we dont overlap - for small systems everything is done in the prologue */
613             aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
614             /* Round upwards to j tile limit */
615             aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
616             /* Make sure we dont have a negative range for the epilogue */
617             aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
618         }
619     }
620     
621     /* And fill it with data... */
622     
623     for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
624     {
625         for(i=ibase;i<ibase+UNROLLI;i++)
626         {
627             
628             nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
629             
630             /* Allocate aligned memory */
631             snew(pi,nj+2*SIMD_WIDTH);
632             aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
633             
634             max_offset = calc_maxoffset(i,natoms);
635             
636             for(k=0;k<nj;k++)
637             {
638                 j = aadata->jindex_gb[4*i+2] + k;
639                 aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
640             }
641         }
642     }
643 }
644
645
646 static void
647 genborn_allvsall_setup(gmx_allvsallgb2_data_t **  p_aadata,
648                        gmx_localtop_t *           top,
649                        gmx_genborn_t *            born,
650                        t_mdatoms *                mdatoms,
651                        real                       radius_offset,
652                        int                        gb_algorithm,
653                        gmx_bool                       bInclude12,
654                        gmx_bool                       bInclude13,
655                        gmx_bool                       bInclude14)
656 {
657         int i,j,idx;
658     int natoms;
659         gmx_allvsallgb2_data_t *aadata;
660     real *p;
661     
662     natoms = mdatoms->nr;
663     
664         snew(aadata,1);
665         *p_aadata = aadata;
666     
667         snew(p,2*natoms+2*SIMD_WIDTH);
668         aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
669         snew(p,2*natoms+2*SIMD_WIDTH);
670         aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
671         snew(p,2*natoms+2*SIMD_WIDTH);
672         aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
673         snew(p,2*natoms+2*SIMD_WIDTH);
674         aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
675         snew(p,2*natoms+2*SIMD_WIDTH);
676         aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
677         snew(p,2*natoms+2*SIMD_WIDTH);
678         aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
679
680     snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
681     aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));
682         
683     snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
684     aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));
685     
686     snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
687     aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));
688     
689     for(i=0;i<mdatoms->nr;i++)
690     {
691         aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
692         if(gb_algorithm==egbSTILL)
693         {
694             aadata->workparam[i] = born->vsolv[i];
695         }
696         else if(gb_algorithm==egbOBC)
697         {
698             aadata->workparam[i] = born->param[i];
699         }
700         aadata->work[i]      = 0.0;
701     }
702     for(i=0;i<mdatoms->nr;i++)
703     {
704         aadata->gb_radius[natoms+i] = aadata->gb_radius[i]; 
705         aadata->workparam[natoms+i] = aadata->workparam[i];
706         aadata->work[natoms+i]      = aadata->work[i];
707     }
708     
709     for(i=0;i<2*natoms+SIMD_WIDTH;i++)
710         {
711                 aadata->x_align[i] = 0.0;
712                 aadata->y_align[i] = 0.0;
713                 aadata->z_align[i] = 0.0;
714                 aadata->fx_align[i] = 0.0;
715                 aadata->fy_align[i] = 0.0;
716                 aadata->fz_align[i] = 0.0;
717         }
718     
719     setup_gb_exclusions_and_indices(aadata,top->idef.il,mdatoms->start,mdatoms->start+mdatoms->homenr,mdatoms->nr,
720                                     bInclude12,bInclude13,bInclude14);
721 }
722
723
724 int
725 genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *           fr,
726                                               t_mdatoms *            mdatoms,
727                                               gmx_genborn_t *        born,
728                                               gmx_localtop_t *       top,
729                                               real *                 x,
730                                               t_commrec *            cr,
731                                               void *                 paadata)
732 {
733         gmx_allvsallgb2_data_t *aadata;
734         int        natoms;
735         int        ni0,ni1;
736         int        nj0,nj1,nj2,nj3;
737         int        i,j,k,n;
738     int *      mask;
739     int *      pmask0;
740     int *      pmask1;
741     int *      pmask2;
742     int *      pmask3;
743     int *      emask0;
744     int *      emask1;
745     int *      emask2;
746     int *      emask3;
747     real       ix,iy,iz;
748     real       jx,jy,jz;
749     real       dx,dy,dz;
750     real       rsq,rinv;
751     real       gpi,rai,vai;
752     real       prod_ai;
753     real       irsq,idr4,idr6;
754     real       raj,rvdw,ratio;
755     real       vaj,ccf,dccf,theta,cosq;
756     real       term,prod,icf4,icf6,gpi2,factor,sinq;
757     real *     gb_radius;
758     real *     vsolv;
759     real *     work;
760     real       tmpsum[4];
761     real *     x_align;
762     real *     y_align;
763     real *     z_align;
764     int *      jindex;
765     real *     dadx;
766     
767     __m128     ix_SSE0,iy_SSE0,iz_SSE0;
768     __m128     ix_SSE1,iy_SSE1,iz_SSE1;
769     __m128     ix_SSE2,iy_SSE2,iz_SSE2;
770     __m128     ix_SSE3,iy_SSE3,iz_SSE3;
771     __m128     gpi_SSE0,rai_SSE0,prod_ai_SSE0;
772     __m128     gpi_SSE1,rai_SSE1,prod_ai_SSE1;
773     __m128     gpi_SSE2,rai_SSE2,prod_ai_SSE2;
774     __m128     gpi_SSE3,rai_SSE3,prod_ai_SSE3;
775     __m128     imask_SSE0,jmask_SSE0;
776     __m128     imask_SSE1,jmask_SSE1;
777     __m128     imask_SSE2,jmask_SSE2;
778     __m128     imask_SSE3,jmask_SSE3;
779     __m128     jx_SSE,jy_SSE,jz_SSE;
780     __m128     dx_SSE0,dy_SSE0,dz_SSE0;
781     __m128     dx_SSE1,dy_SSE1,dz_SSE1;
782     __m128     dx_SSE2,dy_SSE2,dz_SSE2;
783     __m128     dx_SSE3,dy_SSE3,dz_SSE3;
784     __m128     rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
785     __m128     rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
786     __m128     rsq_SSE2,rinv_SSE2,irsq_SSE2,idr4_SSE2,idr6_SSE2;
787     __m128     rsq_SSE3,rinv_SSE3,irsq_SSE3,idr4_SSE3,idr6_SSE3;
788     __m128     raj_SSE,vaj_SSE,prod_SSE;
789     __m128     rvdw_SSE0,ratio_SSE0;
790     __m128     rvdw_SSE1,ratio_SSE1;
791     __m128     rvdw_SSE2,ratio_SSE2;
792     __m128     rvdw_SSE3,ratio_SSE3;
793     __m128     theta_SSE0,sinq_SSE0,cosq_SSE0,term_SSE0;
794     __m128     theta_SSE1,sinq_SSE1,cosq_SSE1,term_SSE1;
795     __m128     theta_SSE2,sinq_SSE2,cosq_SSE2,term_SSE2;
796     __m128     theta_SSE3,sinq_SSE3,cosq_SSE3,term_SSE3;
797     __m128     ccf_SSE0,dccf_SSE0;
798     __m128     ccf_SSE1,dccf_SSE1;
799     __m128     ccf_SSE2,dccf_SSE2;
800     __m128     ccf_SSE3,dccf_SSE3;
801     __m128     icf4_SSE0,icf6_SSE0;
802     __m128     icf4_SSE1,icf6_SSE1;
803     __m128     icf4_SSE2,icf6_SSE2;
804     __m128     icf4_SSE3,icf6_SSE3;
805     __m128     half_SSE,one_SSE,two_SSE,four_SSE;
806     __m128     still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
807     
808     natoms              = mdatoms->nr;
809         ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
810         ni1                 = mdatoms->start+mdatoms->homenr;
811
812     n = 0;
813     
814     aadata = *((gmx_allvsallgb2_data_t **)paadata);
815
816     
817         if(aadata==NULL)
818         {
819                 genborn_allvsall_setup(&aadata,top,born,mdatoms,0.0,
820                                egbSTILL,FALSE,FALSE,TRUE);
821         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
822         }
823         
824     x_align = aadata->x_align;
825         y_align = aadata->y_align;
826         z_align = aadata->z_align;
827
828     gb_radius = aadata->gb_radius;
829     vsolv     = aadata->workparam;
830     work      = aadata->work;
831     jindex    = aadata->jindex_gb;
832     dadx      = fr->dadx;
833     
834     still_p4_SSE    = _mm_set1_ps(STILL_P4);
835     still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
836     still_pip5_SSE  = _mm_set1_ps(STILL_PIP5);
837     half_SSE        = _mm_set1_ps(0.5);
838     one_SSE         = _mm_set1_ps(1.0);
839     two_SSE         = _mm_set1_ps(2.0);
840     four_SSE        = _mm_set1_ps(4.0);
841     
842     /* This will be summed, so it has to extend to natoms + buffer */
843     for(i=0;i<natoms+1+natoms/2;i++)
844     {
845         work[i] = 0;
846     }
847     
848         for(i=ni0;i<ni1+1+natoms/2;i++)
849         {
850         k = i%natoms;
851                 x_align[i]  = x[3*k];
852                 y_align[i]  = x[3*k+1];
853                 z_align[i]  = x[3*k+2];
854         work[i]     = 0;
855     }        
856     
857     
858         for(i=ni0; i<ni1; i+=UNROLLI)
859         {
860                 /* We assume shifts are NOT used for all-vs-all interactions */
861
862                 /* Load i atom data */
863                 ix_SSE0          = _mm_load1_ps(x_align+i);
864                 iy_SSE0          = _mm_load1_ps(y_align+i);
865                 iz_SSE0          = _mm_load1_ps(z_align+i);
866                 ix_SSE1          = _mm_load1_ps(x_align+i+1);
867                 iy_SSE1          = _mm_load1_ps(y_align+i+1);
868                 iz_SSE1          = _mm_load1_ps(z_align+i+1);
869                 ix_SSE2          = _mm_load1_ps(x_align+i+2);
870                 iy_SSE2          = _mm_load1_ps(y_align+i+2);
871                 iz_SSE2          = _mm_load1_ps(z_align+i+2);
872                 ix_SSE3          = _mm_load1_ps(x_align+i+3);
873                 iy_SSE3          = _mm_load1_ps(y_align+i+3);
874                 iz_SSE3          = _mm_load1_ps(z_align+i+3);
875         
876         gpi_SSE0         = _mm_setzero_ps();
877         gpi_SSE1         = _mm_setzero_ps();
878         gpi_SSE2         = _mm_setzero_ps();
879         gpi_SSE3         = _mm_setzero_ps();
880
881         rai_SSE0         = _mm_load1_ps(gb_radius+i);
882         rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
883         rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
884         rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
885         
886         prod_ai_SSE0     = _mm_set1_ps(STILL_P4*vsolv[i]);
887         prod_ai_SSE1     = _mm_set1_ps(STILL_P4*vsolv[i+1]);
888         prod_ai_SSE2     = _mm_set1_ps(STILL_P4*vsolv[i+2]);
889         prod_ai_SSE3     = _mm_set1_ps(STILL_P4*vsolv[i+3]);
890         
891                 /* Load limits for loop over neighbors */
892                 nj0              = jindex[4*i];
893                 nj1              = jindex[4*i+1];
894                 nj2              = jindex[4*i+2];
895                 nj3              = jindex[4*i+3];
896                         
897         pmask0           = aadata->prologue_mask_gb[i];
898         pmask1           = aadata->prologue_mask_gb[i+1];
899         pmask2           = aadata->prologue_mask_gb[i+2];
900         pmask3           = aadata->prologue_mask_gb[i+3];
901         emask0           = aadata->epilogue_mask[i]; 
902         emask1           = aadata->epilogue_mask[i+1]; 
903         emask2           = aadata->epilogue_mask[i+2]; 
904         emask3           = aadata->epilogue_mask[i+3]; 
905
906         imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
907         imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
908         imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
909         imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
910         
911         /* Prologue part, including exclusion mask */
912         for(j=nj0; j<nj1; j+=UNROLLJ)
913         {          
914             jmask_SSE0 = _mm_load_ps((real *)pmask0);
915             jmask_SSE1 = _mm_load_ps((real *)pmask1);
916             jmask_SSE2 = _mm_load_ps((real *)pmask2);
917             jmask_SSE3 = _mm_load_ps((real *)pmask3);
918             pmask0 += UNROLLJ;
919             pmask1 += UNROLLJ;
920             pmask2 += UNROLLJ;
921             pmask3 += UNROLLJ;
922             
923             /* load j atom coordinates */
924             jx_SSE            = _mm_load_ps(x_align+j);
925             jy_SSE            = _mm_load_ps(y_align+j);
926             jz_SSE            = _mm_load_ps(z_align+j);
927             
928             /* Calculate distance */
929             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
930             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
931             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
932             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
933             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
934             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
935             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
936             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
937             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
938             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
939             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
940             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
941             
942             /* rsq = dx*dx+dy*dy+dz*dz */
943             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
944             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
945             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
946             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
947             
948             /* Combine masks */
949             jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
950             jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
951             jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
952             jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
953             
954             /* Calculate 1/r and 1/r2 */
955             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
956             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
957             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
958             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
959             
960             /* Apply mask */
961             rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
962             rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
963             rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
964             rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
965             
966             irsq_SSE0          = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
967             irsq_SSE1          = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
968             irsq_SSE2          = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
969             irsq_SSE3          = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
970             idr4_SSE0          = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
971             idr4_SSE1          = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
972             idr4_SSE2          = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
973             idr4_SSE3          = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
974             idr6_SSE0          = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
975             idr6_SSE1          = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
976             idr6_SSE2          = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
977             idr6_SSE3          = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
978             
979             raj_SSE            = _mm_load_ps(gb_radius+j);
980             vaj_SSE            = _mm_load_ps(vsolv+j);
981             
982             rvdw_SSE0          = _mm_add_ps(rai_SSE0,raj_SSE);
983             rvdw_SSE1          = _mm_add_ps(rai_SSE1,raj_SSE);
984             rvdw_SSE2          = _mm_add_ps(rai_SSE2,raj_SSE);
985             rvdw_SSE3          = _mm_add_ps(rai_SSE3,raj_SSE);
986             
987             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
988             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
989             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
990             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
991
992             ratio_SSE0         = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
993             ratio_SSE1         = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
994             ratio_SSE2         = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
995             ratio_SSE3         = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
996             theta_SSE0         = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
997             theta_SSE1         = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
998             theta_SSE2         = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
999             theta_SSE3         = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
1000             gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);            
1001             gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);            
1002             gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);            
1003             gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);            
1004             term_SSE0          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
1005             term_SSE1          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
1006             term_SSE2          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
1007             term_SSE3          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
1008             ccf_SSE0           = _mm_mul_ps(term_SSE0,term_SSE0);
1009             ccf_SSE1           = _mm_mul_ps(term_SSE1,term_SSE1);
1010             ccf_SSE2           = _mm_mul_ps(term_SSE2,term_SSE2);
1011             ccf_SSE3           = _mm_mul_ps(term_SSE3,term_SSE3);
1012             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
1013                                             _mm_mul_ps(sinq_SSE0,theta_SSE0));
1014             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
1015                                             _mm_mul_ps(sinq_SSE1,theta_SSE1));
1016             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
1017                                             _mm_mul_ps(sinq_SSE2,theta_SSE2));
1018             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
1019                                             _mm_mul_ps(sinq_SSE3,theta_SSE3));
1020
1021             prod_SSE           = _mm_mul_ps(still_p4_SSE,vaj_SSE);
1022             icf4_SSE0          = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
1023             icf4_SSE1          = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
1024             icf4_SSE2          = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
1025             icf4_SSE3          = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
1026             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1027             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1028             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
1029             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
1030
1031             _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
1032                                              gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
1033                                                          _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
1034                                                          _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
1035                                                          _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
1036             
1037             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
1038             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
1039             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
1040             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
1041
1042             /* Save ai->aj and aj->ai chain rule terms */
1043             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
1044             dadx+=4;
1045             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
1046             dadx+=4;
1047             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
1048             dadx+=4;
1049             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
1050             dadx+=4;            
1051             
1052             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
1053             dadx+=4;
1054             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
1055             dadx+=4;
1056             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
1057             dadx+=4;
1058             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
1059             dadx+=4;
1060         }
1061                                  
1062         /* Main part, no exclusions */
1063         for(j=nj1; j<nj2; j+=UNROLLJ)
1064         {               
1065             /* load j atom coordinates */
1066             jx_SSE            = _mm_load_ps(x_align+j);
1067             jy_SSE            = _mm_load_ps(y_align+j);
1068             jz_SSE            = _mm_load_ps(z_align+j);
1069             
1070             /* Calculate distance */
1071             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
1072             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
1073             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
1074             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
1075             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
1076             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
1077             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
1078             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
1079             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
1080             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
1081             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
1082             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
1083             
1084             /* rsq = dx*dx+dy*dy+dz*dz */
1085             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
1086             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
1087             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
1088             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
1089             
1090             /* Calculate 1/r and 1/r2 */
1091             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1092             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1093             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1094             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1095             
1096             /* Apply mask */
1097             rinv_SSE0          = _mm_and_ps(rinv_SSE0,imask_SSE0);
1098             rinv_SSE1          = _mm_and_ps(rinv_SSE1,imask_SSE1);
1099             rinv_SSE2          = _mm_and_ps(rinv_SSE2,imask_SSE2);
1100             rinv_SSE3          = _mm_and_ps(rinv_SSE3,imask_SSE3);
1101             
1102             irsq_SSE0          = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
1103             irsq_SSE1          = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
1104             irsq_SSE2          = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
1105             irsq_SSE3          = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
1106             idr4_SSE0          = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
1107             idr4_SSE1          = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
1108             idr4_SSE2          = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
1109             idr4_SSE3          = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
1110             idr6_SSE0          = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
1111             idr6_SSE1          = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
1112             idr6_SSE2          = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
1113             idr6_SSE3          = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
1114             
1115             raj_SSE            = _mm_load_ps(gb_radius+j);
1116             
1117             rvdw_SSE0          = _mm_add_ps(rai_SSE0,raj_SSE);
1118             rvdw_SSE1          = _mm_add_ps(rai_SSE1,raj_SSE);
1119             rvdw_SSE2          = _mm_add_ps(rai_SSE2,raj_SSE);
1120             rvdw_SSE3          = _mm_add_ps(rai_SSE3,raj_SSE);
1121             vaj_SSE            = _mm_load_ps(vsolv+j);
1122             
1123             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
1124             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
1125             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
1126             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
1127             
1128             ratio_SSE0         = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
1129             ratio_SSE1         = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
1130             ratio_SSE2         = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
1131             ratio_SSE3         = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
1132             theta_SSE0         = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
1133             theta_SSE1         = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
1134             theta_SSE2         = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
1135             theta_SSE3         = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
1136             gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);            
1137             gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);            
1138             gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);            
1139             gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);            
1140             term_SSE0          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
1141             term_SSE1          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
1142             term_SSE2          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
1143             term_SSE3          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
1144             ccf_SSE0           = _mm_mul_ps(term_SSE0,term_SSE0);
1145             ccf_SSE1           = _mm_mul_ps(term_SSE1,term_SSE1);
1146             ccf_SSE2           = _mm_mul_ps(term_SSE2,term_SSE2);
1147             ccf_SSE3           = _mm_mul_ps(term_SSE3,term_SSE3);
1148             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
1149                                             _mm_mul_ps(sinq_SSE0,theta_SSE0));
1150             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
1151                                             _mm_mul_ps(sinq_SSE1,theta_SSE1));
1152             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
1153                                             _mm_mul_ps(sinq_SSE2,theta_SSE2));
1154             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
1155                                             _mm_mul_ps(sinq_SSE3,theta_SSE3));
1156             
1157             prod_SSE           = _mm_mul_ps(still_p4_SSE,vaj_SSE );
1158             icf4_SSE0          = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
1159             icf4_SSE1          = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
1160             icf4_SSE2          = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
1161             icf4_SSE3          = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
1162             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1163             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1164             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
1165             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
1166             
1167             _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
1168                                              gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
1169                                                          _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
1170                                                          _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
1171                                                          _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
1172             
1173             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
1174             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
1175             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
1176             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
1177             
1178             /* Save ai->aj and aj->ai chain rule terms */
1179             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
1180             dadx+=4;
1181             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
1182             dadx+=4;
1183             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
1184             dadx+=4;
1185             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
1186             dadx+=4;            
1187             
1188             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
1189             dadx+=4;
1190             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
1191             dadx+=4;
1192             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
1193             dadx+=4;
1194             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
1195             dadx+=4;
1196         }
1197         /* Epilogue part, including exclusion mask */
1198         for(j=nj2; j<nj3; j+=UNROLLJ)
1199         {                     
1200             jmask_SSE0 = _mm_load_ps((real *)emask0);
1201             jmask_SSE1 = _mm_load_ps((real *)emask1);
1202             jmask_SSE2 = _mm_load_ps((real *)emask2);
1203             jmask_SSE3 = _mm_load_ps((real *)emask3);
1204             emask0 += UNROLLJ;
1205             emask1 += UNROLLJ;
1206             emask2 += UNROLLJ;
1207             emask3 += UNROLLJ;
1208             
1209             /* load j atom coordinates */
1210             jx_SSE            = _mm_load_ps(x_align+j);
1211             jy_SSE            = _mm_load_ps(y_align+j);
1212             jz_SSE            = _mm_load_ps(z_align+j);
1213             
1214             /* Calculate distance */
1215             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
1216             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
1217             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
1218             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
1219             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
1220             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
1221             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
1222             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
1223             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
1224             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
1225             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
1226             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
1227             
1228             /* rsq = dx*dx+dy*dy+dz*dz */
1229             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
1230             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
1231             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
1232             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
1233             
1234             /* Combine masks */
1235             jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
1236             jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
1237             jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
1238             jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
1239             
1240             /* Calculate 1/r and 1/r2 */
1241             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1242             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1243             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1244             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1245             
1246             /* Apply mask */
1247             rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
1248             rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
1249             rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
1250             rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
1251             
1252             irsq_SSE0          = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
1253             irsq_SSE1          = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
1254             irsq_SSE2          = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
1255             irsq_SSE3          = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
1256             idr4_SSE0          = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
1257             idr4_SSE1          = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
1258             idr4_SSE2          = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
1259             idr4_SSE3          = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
1260             idr6_SSE0          = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
1261             idr6_SSE1          = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
1262             idr6_SSE2          = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
1263             idr6_SSE3          = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
1264             
1265             raj_SSE            = _mm_load_ps(gb_radius+j);
1266             vaj_SSE            = _mm_load_ps(vsolv+j);
1267             
1268             rvdw_SSE0          = _mm_add_ps(rai_SSE0,raj_SSE);
1269             rvdw_SSE1          = _mm_add_ps(rai_SSE1,raj_SSE);
1270             rvdw_SSE2          = _mm_add_ps(rai_SSE2,raj_SSE);
1271             rvdw_SSE3          = _mm_add_ps(rai_SSE3,raj_SSE);
1272             
1273             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
1274             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
1275             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
1276             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
1277             
1278             ratio_SSE0         = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
1279             ratio_SSE1         = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
1280             ratio_SSE2         = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
1281             ratio_SSE3         = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
1282             theta_SSE0         = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
1283             theta_SSE1         = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
1284             theta_SSE2         = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
1285             theta_SSE3         = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
1286             gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);            
1287             gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);            
1288             gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);            
1289             gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);            
1290             term_SSE0          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
1291             term_SSE1          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
1292             term_SSE2          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
1293             term_SSE3          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
1294             ccf_SSE0           = _mm_mul_ps(term_SSE0,term_SSE0);
1295             ccf_SSE1           = _mm_mul_ps(term_SSE1,term_SSE1);
1296             ccf_SSE2           = _mm_mul_ps(term_SSE2,term_SSE2);
1297             ccf_SSE3           = _mm_mul_ps(term_SSE3,term_SSE3);
1298             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
1299                                             _mm_mul_ps(sinq_SSE0,theta_SSE0));
1300             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
1301                                             _mm_mul_ps(sinq_SSE1,theta_SSE1));
1302             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
1303                                             _mm_mul_ps(sinq_SSE2,theta_SSE2));
1304             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
1305                                             _mm_mul_ps(sinq_SSE3,theta_SSE3));
1306             
1307             prod_SSE           = _mm_mul_ps(still_p4_SSE,vaj_SSE);
1308             icf4_SSE0          = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
1309             icf4_SSE1          = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
1310             icf4_SSE2          = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
1311             icf4_SSE3          = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
1312             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1313             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1314             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
1315             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
1316             
1317             _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
1318                                              gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
1319                                                          _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
1320                                                          _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
1321                                                          _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
1322             
1323             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
1324             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
1325             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
1326             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
1327               
1328             /* Save ai->aj and aj->ai chain rule terms */
1329             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
1330             dadx+=4;
1331             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
1332             dadx+=4;
1333             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
1334             dadx+=4;
1335             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
1336             dadx+=4;            
1337             
1338             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
1339             dadx+=4;
1340             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
1341             dadx+=4;
1342             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
1343             dadx+=4;
1344             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
1345             dadx+=4;
1346         }
1347         _MM_TRANSPOSE4_PS(gpi_SSE0,gpi_SSE1,gpi_SSE2,gpi_SSE3);
1348         gpi_SSE0 = _mm_add_ps(gpi_SSE0,gpi_SSE1);
1349         gpi_SSE2 = _mm_add_ps(gpi_SSE2,gpi_SSE3);
1350         gpi_SSE0 = _mm_add_ps(gpi_SSE0,gpi_SSE2);
1351         _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
1352         }    
1353     
1354     /* In case we have written anything beyond natoms, move it back.
1355      * Never mind that we leave stuff above natoms; that will not
1356      * be accessed later in the routine.
1357      * In principle this should be a move rather than sum, but this
1358     * way we don't have to worry about even/odd offsets...
1359      */
1360     for(i=natoms;i<ni1+1+natoms/2;i++)
1361     {
1362         work[i-natoms] += work[i];
1363     }
1364     
1365     /* Parallel summations */
1366         if(PARTDECOMP(cr))
1367         {
1368                 gmx_sum(natoms,work,cr);
1369         }
1370         
1371     factor  = 0.5 * ONE_4PI_EPS0;
1372         /* Calculate the radii - should we do all atoms, or just our local ones? */
1373         for(i=0;i<natoms;i++)
1374         {
1375                 if(born->use[i] != 0)
1376                 {
1377                         gpi  = born->gpol[i]+work[i];
1378                         gpi2 = gpi * gpi;
1379                         born->bRad[i]   = factor*gmx_invsqrt(gpi2);
1380                         fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1381                 }
1382         }
1383         
1384         return 0;
1385 }
1386
1387
1388
1389 int
1390 genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
1391                                                 t_mdatoms *            mdatoms,
1392                                                 gmx_genborn_t *        born,
1393                                                 int                    gb_algorithm,
1394                                                 gmx_localtop_t *       top,
1395                                                 real *                 x,
1396                                                 t_commrec *            cr,
1397                                                 void *                 paadata)
1398 {
1399         gmx_allvsallgb2_data_t *aadata;
1400         int        natoms;
1401         int        ni0,ni1;
1402         int        nj0,nj1,nj2,nj3;
1403         int        i,j,k,n;
1404     int *      mask;
1405     int *      pmask0;
1406     int *      pmask1;
1407     int *      pmask2;
1408     int *      pmask3;
1409     int *      emask0;
1410     int *      emask1;
1411     int *      emask2;
1412     int *      emask3;
1413     real *     gb_radius;
1414     real *     vsolv;
1415     real *     work;
1416     real       tmpsum[4];
1417     real *     x_align;
1418     real *     y_align;
1419     real *     z_align;
1420     int *      jindex;
1421     real *     dadx;
1422     real *     obc_param;
1423     real       rad,min_rad;
1424     real rai,rai_inv,rai_inv2,sum_ai,sum_ai2,sum_ai3,tsum,tchain;
1425
1426     __m128     ix_SSE0,iy_SSE0,iz_SSE0;
1427     __m128     ix_SSE1,iy_SSE1,iz_SSE1;
1428     __m128     ix_SSE2,iy_SSE2,iz_SSE2;
1429     __m128     ix_SSE3,iy_SSE3,iz_SSE3;
1430     __m128     gpi_SSE0,rai_SSE0,prod_ai_SSE0;
1431     __m128     gpi_SSE1,rai_SSE1,prod_ai_SSE1;
1432     __m128     gpi_SSE2,rai_SSE2,prod_ai_SSE2;
1433     __m128     gpi_SSE3,rai_SSE3,prod_ai_SSE3;
1434     __m128     imask_SSE0,jmask_SSE0;
1435     __m128     imask_SSE1,jmask_SSE1;
1436     __m128     imask_SSE2,jmask_SSE2;
1437     __m128     imask_SSE3,jmask_SSE3;
1438     __m128     jx_SSE,jy_SSE,jz_SSE;
1439     __m128     dx_SSE0,dy_SSE0,dz_SSE0;
1440     __m128     dx_SSE1,dy_SSE1,dz_SSE1;
1441     __m128     dx_SSE2,dy_SSE2,dz_SSE2;
1442     __m128     dx_SSE3,dy_SSE3,dz_SSE3;
1443     __m128     rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
1444     __m128     rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
1445     __m128     rsq_SSE2,rinv_SSE2,irsq_SSE2,idr4_SSE2,idr6_SSE2;
1446     __m128     rsq_SSE3,rinv_SSE3,irsq_SSE3,idr4_SSE3,idr6_SSE3;
1447     __m128     raj_SSE,raj_inv_SSE,sk_aj_SSE,sk2_aj_SSE;
1448     __m128     ccf_SSE0,dccf_SSE0,prod_SSE0;
1449     __m128     ccf_SSE1,dccf_SSE1,prod_SSE1;
1450     __m128     ccf_SSE2,dccf_SSE2,prod_SSE2;
1451     __m128     ccf_SSE3,dccf_SSE3,prod_SSE3;
1452     __m128     icf4_SSE0,icf6_SSE0;
1453     __m128     icf4_SSE1,icf6_SSE1;
1454     __m128     icf4_SSE2,icf6_SSE2;
1455     __m128     icf4_SSE3,icf6_SSE3;
1456     __m128     oneeighth_SSE,onefourth_SSE,half_SSE,one_SSE,two_SSE,four_SSE;
1457     __m128     still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
1458     __m128     rai_inv_SSE0;
1459     __m128     rai_inv_SSE1;
1460     __m128     rai_inv_SSE2;
1461     __m128     rai_inv_SSE3;
1462     __m128     sk_ai_SSE0,sk2_ai_SSE0,sum_ai_SSE0;
1463     __m128     sk_ai_SSE1,sk2_ai_SSE1,sum_ai_SSE1;
1464     __m128     sk_ai_SSE2,sk2_ai_SSE2,sum_ai_SSE2;
1465     __m128     sk_ai_SSE3,sk2_ai_SSE3,sum_ai_SSE3;
1466     __m128     lij_inv_SSE0,sk2_rinv_SSE0;
1467     __m128     lij_inv_SSE1,sk2_rinv_SSE1;
1468     __m128     lij_inv_SSE2,sk2_rinv_SSE2;
1469     __m128     lij_inv_SSE3,sk2_rinv_SSE3;
1470     __m128     dr_SSE0;
1471     __m128     dr_SSE1;
1472     __m128     dr_SSE2;
1473     __m128     dr_SSE3;
1474     __m128     t1_SSE0,t2_SSE0,t3_SSE0,t4_SSE0;
1475     __m128     t1_SSE1,t2_SSE1,t3_SSE1,t4_SSE1;
1476     __m128     t1_SSE2,t2_SSE2,t3_SSE2,t4_SSE2;
1477     __m128     t1_SSE3,t2_SSE3,t3_SSE3,t4_SSE3;
1478     __m128     obc_mask1_SSE0,obc_mask2_SSE0,obc_mask3_SSE0;
1479     __m128     obc_mask1_SSE1,obc_mask2_SSE1,obc_mask3_SSE1;
1480     __m128     obc_mask1_SSE2,obc_mask2_SSE2,obc_mask3_SSE2;
1481     __m128     obc_mask1_SSE3,obc_mask2_SSE3,obc_mask3_SSE3;
1482     __m128     uij_SSE0,uij2_SSE0,uij3_SSE0;
1483     __m128     uij_SSE1,uij2_SSE1,uij3_SSE1;
1484     __m128     uij_SSE2,uij2_SSE2,uij3_SSE2;
1485     __m128     uij_SSE3,uij2_SSE3,uij3_SSE3;
1486     __m128     lij_SSE0,lij2_SSE0,lij3_SSE0;
1487     __m128     lij_SSE1,lij2_SSE1,lij3_SSE1;
1488     __m128     lij_SSE2,lij2_SSE2,lij3_SSE2;
1489     __m128     lij_SSE3,lij2_SSE3,lij3_SSE3;
1490     __m128     dlij_SSE0,diff2_SSE0,logterm_SSE0;
1491     __m128     dlij_SSE1,diff2_SSE1,logterm_SSE1;
1492     __m128     dlij_SSE2,diff2_SSE2,logterm_SSE2;
1493     __m128     dlij_SSE3,diff2_SSE3,logterm_SSE3;
1494     __m128     doffset_SSE;
1495     
1496     natoms              = mdatoms->nr;
1497         ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
1498         ni1                 = mdatoms->start+mdatoms->homenr;
1499     
1500     n = 0;
1501     
1502     aadata = *((gmx_allvsallgb2_data_t **)paadata);
1503     
1504     
1505         if(aadata==NULL)
1506         {
1507                 genborn_allvsall_setup(&aadata,top,born,mdatoms,born->gb_doffset,
1508                                egbOBC,TRUE,TRUE,TRUE);
1509         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
1510         }
1511     
1512     x_align = aadata->x_align;
1513         y_align = aadata->y_align;
1514         z_align = aadata->z_align;
1515     
1516     gb_radius = aadata->gb_radius;
1517     work      = aadata->work;
1518     jindex    = aadata->jindex_gb;
1519     dadx      = fr->dadx;
1520     obc_param = aadata->workparam;
1521     
1522     oneeighth_SSE   = _mm_set1_ps(0.125);
1523     onefourth_SSE   = _mm_set1_ps(0.25);
1524     half_SSE        = _mm_set1_ps(0.5);
1525     one_SSE         = _mm_set1_ps(1.0);
1526     two_SSE         = _mm_set1_ps(2.0);
1527     four_SSE        = _mm_set1_ps(4.0);
1528     doffset_SSE     = _mm_set1_ps(born->gb_doffset);
1529     
1530     for(i=0;i<natoms;i++)
1531         {
1532                 x_align[i]  = x[3*i];
1533                 y_align[i]  = x[3*i+1];
1534                 z_align[i]  = x[3*i+2];
1535         }
1536     
1537     /* Copy again */
1538         for(i=0;i<natoms/2+1;i++)
1539         {
1540                 x_align[natoms+i]  = x_align[i];
1541                 y_align[natoms+i]  = y_align[i];
1542                 z_align[natoms+i]  = z_align[i];
1543     }        
1544     
1545     for(i=0;i<natoms+natoms/2+1;i++)
1546     {
1547         work[i] = 0;
1548     }
1549         
1550         for(i=ni0; i<ni1; i+= UNROLLI)
1551         {
1552                 /* We assume shifts are NOT used for all-vs-all interactions */
1553         
1554                 /* Load i atom data */
1555                 ix_SSE0          = _mm_load1_ps(x_align+i);
1556                 iy_SSE0          = _mm_load1_ps(y_align+i);
1557                 iz_SSE0          = _mm_load1_ps(z_align+i);
1558                 ix_SSE1          = _mm_load1_ps(x_align+i+1);
1559                 iy_SSE1          = _mm_load1_ps(y_align+i+1);
1560                 iz_SSE1          = _mm_load1_ps(z_align+i+1);
1561                 ix_SSE2          = _mm_load1_ps(x_align+i+2);
1562                 iy_SSE2          = _mm_load1_ps(y_align+i+2);
1563                 iz_SSE2          = _mm_load1_ps(z_align+i+2);
1564                 ix_SSE3          = _mm_load1_ps(x_align+i+3);
1565                 iy_SSE3          = _mm_load1_ps(y_align+i+3);
1566                 iz_SSE3          = _mm_load1_ps(z_align+i+3);
1567         
1568         rai_SSE0         = _mm_load1_ps(gb_radius+i);
1569         rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
1570         rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
1571         rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
1572         rai_inv_SSE0     = gmx_mm_inv_ps(rai_SSE0);
1573         rai_inv_SSE1     = gmx_mm_inv_ps(rai_SSE1);
1574         rai_inv_SSE2     = gmx_mm_inv_ps(rai_SSE2);
1575         rai_inv_SSE3     = gmx_mm_inv_ps(rai_SSE3);
1576         
1577         sk_ai_SSE0       = _mm_load1_ps(obc_param+i);
1578         sk_ai_SSE1       = _mm_load1_ps(obc_param+i+1);
1579         sk_ai_SSE2       = _mm_load1_ps(obc_param+i+2);
1580         sk_ai_SSE3       = _mm_load1_ps(obc_param+i+3);
1581         sk2_ai_SSE0      = _mm_mul_ps(sk_ai_SSE0,sk_ai_SSE0);
1582         sk2_ai_SSE1      = _mm_mul_ps(sk_ai_SSE1,sk_ai_SSE1);
1583         sk2_ai_SSE2      = _mm_mul_ps(sk_ai_SSE2,sk_ai_SSE2);
1584         sk2_ai_SSE3      = _mm_mul_ps(sk_ai_SSE3,sk_ai_SSE3);
1585         
1586         sum_ai_SSE0      = _mm_setzero_ps();
1587         sum_ai_SSE1      = _mm_setzero_ps();
1588         sum_ai_SSE2      = _mm_setzero_ps();
1589         sum_ai_SSE3      = _mm_setzero_ps();
1590         
1591                 /* Load limits for loop over neighbors */
1592                 nj0              = jindex[4*i];
1593                 nj1              = jindex[4*i+1];
1594                 nj2              = jindex[4*i+2];
1595                 nj3              = jindex[4*i+3];
1596         
1597         pmask0           = aadata->prologue_mask_gb[i];
1598         pmask1           = aadata->prologue_mask_gb[i+1];
1599         pmask2           = aadata->prologue_mask_gb[i+2];
1600         pmask3           = aadata->prologue_mask_gb[i+3];
1601         emask0           = aadata->epilogue_mask[i]; 
1602         emask1           = aadata->epilogue_mask[i+1]; 
1603         emask2           = aadata->epilogue_mask[i+2]; 
1604         emask3           = aadata->epilogue_mask[i+3]; 
1605         
1606         imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
1607         imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
1608         imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
1609         imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
1610         
1611         /* Prologue part, including exclusion mask */
1612         for(j=nj0; j<nj1; j+=UNROLLJ)
1613         {          
1614             jmask_SSE0 = _mm_load_ps((real *)pmask0);
1615             jmask_SSE1 = _mm_load_ps((real *)pmask1);
1616             jmask_SSE2 = _mm_load_ps((real *)pmask2);
1617             jmask_SSE3 = _mm_load_ps((real *)pmask3);
1618             pmask0 += UNROLLJ;
1619             pmask1 += UNROLLJ;
1620             pmask2 += UNROLLJ;
1621             pmask3 += UNROLLJ;
1622             
1623             /* load j atom coordinates */
1624             jx_SSE            = _mm_load_ps(x_align+j);
1625             jy_SSE            = _mm_load_ps(y_align+j);
1626             jz_SSE            = _mm_load_ps(z_align+j);
1627             
1628             /* Calculate distance */
1629             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
1630             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
1631             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
1632             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
1633             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
1634             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
1635             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
1636             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
1637             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
1638             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
1639             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
1640             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
1641             
1642             /* rsq = dx*dx+dy*dy+dz*dz */
1643             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
1644             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
1645             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
1646             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
1647             
1648             /* Combine masks */
1649             jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
1650             jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
1651             jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
1652             jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
1653             
1654             /* Calculate 1/r and 1/r2 */
1655             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1656             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1657             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1658             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1659             
1660             /* Apply mask */
1661             rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
1662             rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
1663             rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
1664             rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
1665             
1666             dr_SSE0            = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
1667             dr_SSE1            = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
1668             dr_SSE2            = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
1669             dr_SSE3            = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
1670
1671             sk_aj_SSE          = _mm_load_ps(obc_param+j);
1672             raj_SSE            = _mm_load_ps(gb_radius+j);
1673             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
1674             
1675             /* Evaluate influence of atom aj -> ai */
1676             t1_SSE0            = _mm_add_ps(dr_SSE0,sk_aj_SSE);
1677             t1_SSE1            = _mm_add_ps(dr_SSE1,sk_aj_SSE);
1678             t1_SSE2            = _mm_add_ps(dr_SSE2,sk_aj_SSE);
1679             t1_SSE3            = _mm_add_ps(dr_SSE3,sk_aj_SSE);
1680             t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
1681             t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
1682             t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
1683             t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
1684             t3_SSE0            = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
1685             t3_SSE1            = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
1686             t3_SSE2            = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
1687             t3_SSE3            = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
1688
1689             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
1690             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
1691             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
1692             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
1693             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
1694             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
1695             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
1696             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
1697             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
1698             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
1699             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
1700             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
1701             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
1702             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
1703             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
1704             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
1705
1706             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
1707             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
1708             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
1709             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
1710             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
1711                                            _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
1712             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
1713                                            _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
1714             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
1715                                            _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
1716             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
1717                                            _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
1718             dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
1719             dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
1720             dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
1721             dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
1722             
1723             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
1724             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
1725             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
1726             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
1727             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
1728             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
1729             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
1730             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
1731             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
1732             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
1733             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
1734             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
1735             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
1736             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
1737             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
1738             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
1739             
1740             diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
1741             diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
1742             diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
1743             diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
1744             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
1745             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
1746             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
1747             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
1748             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
1749             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
1750             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
1751             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
1752             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
1753             prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
1754             prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
1755             prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
1756             prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
1757             
1758             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
1759             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
1760             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
1761             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
1762
1763             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
1764             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
1765             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
1766             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
1767             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
1768                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
1769                                                        prod_SSE0));
1770             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
1771                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
1772                                                        prod_SSE1));
1773             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
1774                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
1775                                                        prod_SSE2));
1776             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
1777                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
1778                                                        prod_SSE3));
1779             
1780             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
1781             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
1782             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
1783             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
1784             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
1785             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
1786             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
1787             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
1788             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
1789             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
1790             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
1791             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
1792             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
1793             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
1794             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
1795             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
1796             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
1797             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
1798             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
1799             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
1800             
1801             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
1802             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
1803             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
1804             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
1805             
1806             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
1807                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
1808             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
1809                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
1810             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
1811                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
1812             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
1813                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
1814             t1_SSE0            = _mm_sub_ps(t1_SSE0,
1815                                             _mm_mul_ps(onefourth_SSE,
1816                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
1817                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
1818             t1_SSE1            = _mm_sub_ps(t1_SSE1,
1819                                             _mm_mul_ps(onefourth_SSE,
1820                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
1821                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
1822             t1_SSE2            = _mm_sub_ps(t1_SSE2,
1823                                             _mm_mul_ps(onefourth_SSE,
1824                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
1825                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
1826             t1_SSE3            = _mm_sub_ps(t1_SSE3,
1827                                             _mm_mul_ps(onefourth_SSE,
1828                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
1829                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
1830
1831             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
1832                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
1833                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
1834             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
1835                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
1836                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
1837             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
1838                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
1839                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
1840             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
1841                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
1842                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
1843             t2_SSE0            = _mm_sub_ps(t2_SSE0,
1844                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
1845                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
1846             t2_SSE1            = _mm_sub_ps(t2_SSE1,
1847                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
1848                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
1849             t2_SSE2            = _mm_sub_ps(t2_SSE2,
1850                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
1851                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
1852             t2_SSE3            = _mm_sub_ps(t2_SSE3,
1853                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
1854                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
1855             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
1856                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
1857             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
1858                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
1859             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
1860                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
1861             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
1862                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
1863             t3_SSE0            = _mm_sub_ps(t3_SSE0,
1864                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
1865                                                        _mm_add_ps(one_SSE,
1866                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
1867             t3_SSE1            = _mm_sub_ps(t3_SSE1,
1868                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
1869                                                        _mm_add_ps(one_SSE,
1870                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
1871             t3_SSE2            = _mm_sub_ps(t3_SSE2,
1872                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
1873                                                        _mm_add_ps(one_SSE,
1874                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
1875             t3_SSE3            = _mm_sub_ps(t3_SSE3,
1876                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
1877                                                        _mm_add_ps(one_SSE,
1878                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
1879             
1880             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
1881                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
1882                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
1883             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
1884                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
1885                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
1886             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
1887                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
1888                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
1889             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
1890                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
1891                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
1892             
1893             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
1894             dadx += 4;
1895             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
1896             dadx += 4;
1897             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
1898             dadx += 4;
1899             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
1900             dadx += 4;
1901             
            /* Evaluate influence of atom ai -> aj */
            /* t1 = r + sk_ai, t2 = r - sk_ai, t3 = sk_ai - r, where sk_ai is
             * atom i's scaled (descreening) radius loaded earlier in this
             * function — NOTE(review): confirm against the sk_ai setup above. */
            t1_SSE0            = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
            t1_SSE1            = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
            t1_SSE2            = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
            t1_SSE3            = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
            t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
            t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
            t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
            t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);

            /* Pair classification masks (all-ones lanes where true):
             *   obc_mask1: raj < r + sk   -> pair contributes at all
             *   obc_mask2: raj < r - sk   -> atom j entirely outside i's sphere
             *   obc_mask3: raj < sk - r   -> atom j engulfed by i's sphere
             * (NOTE(review): interpretation per the HCT/OBC pairwise
             * descreening terms — confirm.) */
            obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
            obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
            obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
            obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
            obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
            obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
            obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
            obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
            obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
            /* Zero lanes excluded by the per-pair j interaction mask
             * (this is the exclusion-handling loop). */
            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
1932             
            /* uij = 1/(r + sk). */
            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
            /* lij = 1/(r - sk) where atom j lies fully outside (obc_mask2),
             * otherwise 1/raj; branch-free blend via and/andnot on the mask. */
            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
                                           _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
                                           _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
                                           _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
                                           _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
            /* dlij = 1.0 in lanes where lij depends on r (obc_mask2), 0.0
             * elsewhere; used as the d(lij)/dr switch in the derivative part. */
            dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
            dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
            dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
1949             
1950             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
1951             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
1952             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
1953             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
1954             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
1955             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
1956             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
1957             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
1958             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
1959             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
1960             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
1961             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
1962             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
1963             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
1964             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
1965             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
1966             
            /* diff2 = uij^2 - lij^2. */
            diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
            diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
            diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
            diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
            /* lij_inv = 1/|lij|, via rsqrt of lij^2 (lij is positive here). */
            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
            /* sk2_rinv = sk_ai^2 / r and prod = sk_ai^2 / (4 r).
             * (sk2_ai presumably precomputed as sk_ai squared earlier in this
             * function — NOTE(review): confirm.) */
            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
            prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
            prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
            prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
1983             
1984             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
1985             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
1986             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
1987             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
1988             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
1989             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
1990             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
1991             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
1992             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
1993                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
1994                                                        prod_SSE0));
1995             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
1996                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
1997                                                        prod_SSE1));
1998             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
1999                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2000                                                        prod_SSE2));
2001             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2002                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2003                                                        prod_SSE3));
2004             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2005             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2006             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2007             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2008             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2009             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2010             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2011             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2012             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
2013             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
2014             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
2015             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
2016             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2017             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2018             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2019             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2020             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2021             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2022             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2023             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2024             
            /* Horizontally sum the four i-rows' masked pair terms and
             * accumulate them into atom j's slot of the work array
             * (masked-out lanes contribute exactly zero). */
            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
                                                        _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
                                                        _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
                                                        _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
2030             
2031             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2032                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
2033             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2034                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
2035             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2036                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
2037             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2038                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
2039             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2040                                             _mm_mul_ps(onefourth_SSE,
2041                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2042                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2043             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2044                                             _mm_mul_ps(onefourth_SSE,
2045                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2046                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2047             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2048                                             _mm_mul_ps(onefourth_SSE,
2049                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2050                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2051             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2052                                             _mm_mul_ps(onefourth_SSE,
2053                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2054                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2055             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2056                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2057                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2058             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2059                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2060                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2061             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2062                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2063                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2064             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2065                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2066                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2067             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2068                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2069                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2070             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2071                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2072                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2073             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2074                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2075                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2076             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2077                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2078                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2079             
2080             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2081                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2082             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2083                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2084             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2085                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2086             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2087                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2088             
2089             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2090                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2091                                                        _mm_add_ps(one_SSE,
2092                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2093             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2094                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2095                                                        _mm_add_ps(one_SSE,
2096                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2097             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2098                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2099                                                        _mm_add_ps(one_SSE,
2100                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2101             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2102                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2103                                                        _mm_add_ps(one_SSE,
2104                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2105             
2106             
2107             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2108                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2109                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
2110             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2111                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2112                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
2113             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2114                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2115                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
2116             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2117                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2118                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
2119
2120             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2121             dadx += 4;
2122             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2123             dadx += 4;
2124             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2125             dadx += 4;
2126             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2127             dadx += 4;
2128             
2129         }
2130         
        /* Main part, no exclusions */
        /* All remaining j atoms interact fully with the four unrolled i atoms,
         * so the per-pair exclusion mask (jmask) used above is not needed. */
        for(j=nj1; j<nj2; j+=UNROLLJ)
        {
            /* load j atom coordinates (from SSE-aligned work arrays) */
            jx_SSE            = _mm_load_ps(x_align+j);
            jy_SSE            = _mm_load_ps(y_align+j);
            jz_SSE            = _mm_load_ps(z_align+j);
2138             
2139             /* Calculate distance */
2140             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
2141             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
2142             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
2143             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
2144             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
2145             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
2146             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
2147             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
2148             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
2149             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
2150             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
2151             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
2152             
2153             /* rsq = dx*dx+dy*dy+dz*dz */
2154             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2155             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2156             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2157             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2158             
2159             /* Calculate 1/r and 1/r2 */
2160             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
2161             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
2162             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
2163             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
2164             
            /* Apply mask */
            /* imask zeroes lanes for pairs that must not interact (set up
             * before this fragment — NOTE(review): presumably i==j and
             * duplicate-triangle lanes; confirm against the mask setup). */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0,imask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1,imask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2,imask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3,imask_SSE3);

            /* dr = r, computed as rsq * (1/r); masked lanes yield 0. */
            dr_SSE0            = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
            dr_SSE1            = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
            dr_SSE2            = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
            dr_SSE3            = _mm_mul_ps(rsq_SSE3,rinv_SSE3);

            /* Per-j-atom parameters: sk_aj from obc_param (scaled/descreening
             * radius) and raj from gb_radius (GB radius), plus 1/raj. */
            sk_aj_SSE          = _mm_load_ps(obc_param+j);
            raj_SSE            = _mm_load_ps(gb_radius+j);

            raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
2180             
2181             /* Evaluate influence of atom aj -> ai */
2182             t1_SSE0            = _mm_add_ps(dr_SSE0,sk_aj_SSE);
2183             t1_SSE1            = _mm_add_ps(dr_SSE1,sk_aj_SSE);
2184             t1_SSE2            = _mm_add_ps(dr_SSE2,sk_aj_SSE);
2185             t1_SSE3            = _mm_add_ps(dr_SSE3,sk_aj_SSE);
2186             t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
2187             t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
2188             t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
2189             t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
2190             t3_SSE0            = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
2191             t3_SSE1            = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
2192             t3_SSE2            = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
2193             t3_SSE3            = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
2194             
2195             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2196             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2197             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2198             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2199             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2200             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2201             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2202             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2203             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2204             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2205             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2206             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2207             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,imask_SSE0);
2208             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,imask_SSE1);
2209             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,imask_SSE2);
2210             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,imask_SSE3);
2211             
2212             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2213             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2214             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2215             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2216             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2217                                            _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
2218             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2219                                            _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
2220             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2221                                            _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
2222             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2223                                            _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
2224             dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2225             dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2226             dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2227             dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2228             
2229             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2230             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2231             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2232             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2233             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2234             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2235             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2236             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2237             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2238             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2239             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2240             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2241             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2242             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2243             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2244             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2245             
2246             diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2247             diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2248             diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2249             diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2250             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2251             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2252             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2253             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2254             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
2255             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
2256             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
2257             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
2258             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
2259             prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2260             prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2261             prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2262             prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2263             
2264             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2265             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2266             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2267             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2268             
2269             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
2270             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
2271             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
2272             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
2273             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2274                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2275                                                        prod_SSE0));
2276             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2277                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2278                                                        prod_SSE1));
2279             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2280                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2281                                                        prod_SSE2));
2282             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2283                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2284                                                        prod_SSE3));
2285             
2286             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2287             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2288             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2289             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2290             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2291             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2292             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2293             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2294             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
2295             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
2296             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
2297             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
2298             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2299             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2300             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2301             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2302             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2303             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2304             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2305             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2306             
            /* Accumulate the masked pair term into each i atom's running sum
             * (per-lane partial sums; reduced after the j loop). */
            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2311             
2312             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2313                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
2314             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2315                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
2316             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2317                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
2318             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2319                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
2320             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2321                                             _mm_mul_ps(onefourth_SSE,
2322                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2323                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2324             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2325                                             _mm_mul_ps(onefourth_SSE,
2326                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2327                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2328             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2329                                             _mm_mul_ps(onefourth_SSE,
2330                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2331                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2332             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2333                                             _mm_mul_ps(onefourth_SSE,
2334                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2335                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2336             
2337             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2338                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2339                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2340             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2341                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2342                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2343             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2344                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2345                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2346             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2347                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2348                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2349             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2350                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2351                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2352             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2353                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2354                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2355             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2356                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2357                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2358             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2359                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2360                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2361             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2362                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2363             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2364                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2365             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2366                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2367             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2368                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2369             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2370                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2371                                                        _mm_add_ps(one_SSE,
2372                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2373             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2374                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2375                                                        _mm_add_ps(one_SSE,
2376                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2377             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2378                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2379                                                        _mm_add_ps(one_SSE,
2380                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2381             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2382                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2383                                                        _mm_add_ps(one_SSE,
2384                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2385             
2386             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2387                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2388                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
2389             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2390                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2391                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
2392             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2393                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2394                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
2395             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2396                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2397                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
2398             
2399             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2400             dadx += 4;
2401             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2402             dadx += 4;
2403             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2404             dadx += 4;
2405             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2406             dadx += 4;
2407             
2408             /* Evaluate influence of atom ai -> aj */
2409             t1_SSE0            = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
2410             t1_SSE1            = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
2411             t1_SSE2            = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
2412             t1_SSE3            = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
2413             t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
2414             t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
2415             t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
2416             t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
2417             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
2418             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
2419             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
2420             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
2421             
2422             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2423             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2424             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2425             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2426             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2427             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2428             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2429             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2430             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2431             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2432             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2433             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2434             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,imask_SSE0);
2435             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,imask_SSE1);
2436             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,imask_SSE2);
2437             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,imask_SSE3);
2438             
2439             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2440             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2441             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2442             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2443             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2444                                            _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
2445             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2446                                            _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
2447             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2448                                            _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
2449             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2450                                            _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
2451             dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2452             dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2453             dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2454             dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2455             
2456             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2457             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2458             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2459             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2460             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2461             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2462             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2463             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2464             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2465             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2466             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2467             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2468             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2469             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2470             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2471             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2472             
2473             diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2474             diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2475             diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2476             diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2477             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2478             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2479             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2480             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2481             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
2482             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
2483             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
2484             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
2485             prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2486             prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2487             prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2488             prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2489             
2490             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2491             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2492             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2493             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2494             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
2495             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
2496             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
2497             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
2498             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2499                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2500                                                        prod_SSE0));
2501             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2502                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2503                                                        prod_SSE1));
2504             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2505                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2506                                                        prod_SSE2));
2507             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2508                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2509                                                        prod_SSE3));
2510             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2511             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2512             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2513             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2514             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2515             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2516             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2517             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2518             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
2519             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
2520             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
2521             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
2522             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2523             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2524             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2525             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2526             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2527             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2528             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2529             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2530             
2531             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2532                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
2533                                                         _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
2534                                                         _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
2535                                                         _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
2536             
2537             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2538                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
2539             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2540                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
2541             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2542                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
2543             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2544                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
2545             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2546                                             _mm_mul_ps(onefourth_SSE,
2547                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2548                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2549             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2550                                             _mm_mul_ps(onefourth_SSE,
2551                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2552                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2553             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2554                                             _mm_mul_ps(onefourth_SSE,
2555                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2556                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2557             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2558                                             _mm_mul_ps(onefourth_SSE,
2559                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2560                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2561             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2562                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2563                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2564             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2565                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2566                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2567             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2568                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2569                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2570             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2571                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2572                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2573             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2574                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2575                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2576             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2577                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2578                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2579             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2580                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2581                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2582             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2583                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2584                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2585             
2586             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2587                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2588             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2589                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2590             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2591                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2592             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2593                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2594             
2595             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2596                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2597                                                        _mm_add_ps(one_SSE,
2598                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2599             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2600                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2601                                                        _mm_add_ps(one_SSE,
2602                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2603             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2604                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2605                                                        _mm_add_ps(one_SSE,
2606                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2607             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2608                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2609                                                        _mm_add_ps(one_SSE,
2610                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2611             
2612             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2613                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2614                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
2615             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2616                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2617                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
2618             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2619                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2620                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
2621             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2622                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2623                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
2624             
2625             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2626             dadx += 4;
2627             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2628             dadx += 4;
2629             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2630             dadx += 4;
2631             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2632             dadx += 4;
2633         }
2634
2635         /* Epilogue part, including exclusion mask */
2636         for(j=nj2; j<nj3; j+=UNROLLJ)
2637         {                      
2638             jmask_SSE0 = _mm_load_ps((real *)emask0);
2639             jmask_SSE1 = _mm_load_ps((real *)emask1);
2640             jmask_SSE2 = _mm_load_ps((real *)emask2);
2641             jmask_SSE3 = _mm_load_ps((real *)emask3);
2642             emask0 += UNROLLJ;
2643             emask1 += UNROLLJ;
2644             emask2 += UNROLLJ;
2645             emask3 += UNROLLJ;
2646             
2647             /* load j atom coordinates */
2648             jx_SSE            = _mm_load_ps(x_align+j);
2649             jy_SSE            = _mm_load_ps(y_align+j);
2650             jz_SSE            = _mm_load_ps(z_align+j);
2651             
2652             /* Calculate distance */
2653             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
2654             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
2655             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
2656             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
2657             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
2658             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
2659             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
2660             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
2661             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
2662             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
2663             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
2664             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
2665             
2666             /* rsq = dx*dx+dy*dy+dz*dz */
2667             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2668             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2669             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2670             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2671             
2672             /* Combine masks */
2673             jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
2674             jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
2675             jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
2676             jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
2677             
2678             /* Calculate 1/r and 1/r2 */
2679             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
2680             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
2681             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
2682             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
2683             
2684             /* Apply mask */
2685             rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
2686             rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
2687             rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
2688             rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
2689             
2690             dr_SSE0            = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
2691             dr_SSE1            = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
2692             dr_SSE2            = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
2693             dr_SSE3            = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
2694             
2695             sk_aj_SSE          = _mm_load_ps(obc_param+j);
2696             raj_SSE            = _mm_load_ps(gb_radius+j);
2697
2698             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
2699             
2700             /* Evaluate influence of atom aj -> ai */
2701             t1_SSE0            = _mm_add_ps(dr_SSE0,sk_aj_SSE);
2702             t1_SSE1            = _mm_add_ps(dr_SSE1,sk_aj_SSE);
2703             t1_SSE2            = _mm_add_ps(dr_SSE2,sk_aj_SSE);
2704             t1_SSE3            = _mm_add_ps(dr_SSE3,sk_aj_SSE);
2705             t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
2706             t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
2707             t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
2708             t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
2709             t3_SSE0            = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
2710             t3_SSE1            = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
2711             t3_SSE2            = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
2712             t3_SSE3            = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
2713             
2714             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2715             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2716             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2717             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2718             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2719             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2720             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2721             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2722             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2723             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2724             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2725             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2726             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
2727             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
2728             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
2729             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
2730             
2731             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2732             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2733             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2734             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2735             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2736                                            _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
2737             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2738                                            _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
2739             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2740                                            _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
2741             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2742                                            _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
2743             dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2744             dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2745             dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2746             dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2747             
2748             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2749             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2750             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2751             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2752             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2753             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2754             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2755             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2756             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2757             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2758             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2759             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2760             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2761             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2762             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2763             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2764             
2765             diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2766             diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2767             diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2768             diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2769             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2770             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2771             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2772             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2773             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
2774             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
2775             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
2776             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
2777             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
2778             prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2779             prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2780             prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2781             prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2782             
2783             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2784             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2785             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2786             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2787             
2788             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
2789             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
2790             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
2791             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
2792             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2793                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2794                                                        prod_SSE0));
2795             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2796                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2797                                                        prod_SSE1));
2798             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2799                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2800                                                        prod_SSE2));
2801             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2802                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2803                                                        prod_SSE3));
2804             
2805             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2806             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2807             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2808             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2809             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2810             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2811             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2812             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2813             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
2814             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
2815             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
2816             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
2817             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2818             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2819             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2820             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2821             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2822             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2823             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2824             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2825             
2826             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2827             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2828             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2829             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2830             
2831             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2832                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
2833             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2834                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
2835             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2836                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
2837             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2838                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
2839             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2840                                             _mm_mul_ps(onefourth_SSE,
2841                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2842                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2843             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2844                                             _mm_mul_ps(onefourth_SSE,
2845                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2846                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2847             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2848                                             _mm_mul_ps(onefourth_SSE,
2849                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2850                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2851             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2852                                             _mm_mul_ps(onefourth_SSE,
2853                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2854                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2855             
2856             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2857                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2858                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2859             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2860                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2861                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2862             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2863                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2864                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2865             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2866                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2867                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2868             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2869                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2870                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2871             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2872                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2873                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2874             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2875                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2876                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2877             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2878                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2879                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2880             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2881                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2882             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2883                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2884             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2885                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2886             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2887                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2888             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2889                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2890                                                        _mm_add_ps(one_SSE,
2891                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2892             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2893                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2894                                                        _mm_add_ps(one_SSE,
2895                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2896             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2897                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2898                                                        _mm_add_ps(one_SSE,
2899                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2900             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2901                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2902                                                        _mm_add_ps(one_SSE,
2903                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2904             
2905             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2906                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2907                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
2908             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2909                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2910                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
2911             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2912                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2913                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
2914             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2915                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2916                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
2917             
2918             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2919             dadx += 4;
2920             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2921             dadx += 4;
2922             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2923             dadx += 4;
2924             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2925             dadx += 4;
2926             
2927             /* Evaluate influence of atom ai -> aj */
2928             t1_SSE0            = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
2929             t1_SSE1            = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
2930             t1_SSE2            = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
2931             t1_SSE3            = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
2932             t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
2933             t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
2934             t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
2935             t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
2936             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
2937             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
2938             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
2939             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
2940             
2941             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2942             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2943             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2944             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2945             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2946             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2947             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2948             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2949             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2950             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2951             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2952             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2953             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
2954             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
2955             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
2956             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
2957             
2958             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2959             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2960             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2961             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2962             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2963                                            _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
2964             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2965                                            _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
2966             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2967                                            _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
2968             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2969                                            _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
2970             dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2971             dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2972             dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2973             dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2974             
2975             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2976             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2977             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2978             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2979             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2980             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2981             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2982             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2983             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2984             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2985             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2986             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2987             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2988             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2989             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2990             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2991             
2992             diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2993             diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2994             diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2995             diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2996             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2997             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2998             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2999             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
3000             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
3001             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
3002             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
3003             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
3004             prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
3005             prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
3006             prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
3007             prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
3008             
3009             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
3010             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
3011             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
3012             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
3013             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
3014             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
3015             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
3016             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
3017             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
3018                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
3019                                                        prod_SSE0));
3020             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
3021                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
3022                                                        prod_SSE1));
3023             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
3024                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
3025                                                        prod_SSE2));
3026             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
3027                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
3028                                                        prod_SSE3));
3029             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
3030             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
3031             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
3032             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
3033             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
3034             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
3035             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
3036             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
3037             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
3038             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
3039             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
3040             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
3041             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
3042             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
3043             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
3044             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
3045             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
3046             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
3047             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
3048             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
3049             
3050             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
3051                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
3052                                                         _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
3053                                                         _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
3054                                                         _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
3055             
3056             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
3057                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
3058             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
3059                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
3060             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
3061                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
3062             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
3063                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
3064             t1_SSE0            = _mm_sub_ps(t1_SSE0,
3065                                             _mm_mul_ps(onefourth_SSE,
3066                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
3067                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
3068             t1_SSE1            = _mm_sub_ps(t1_SSE1,
3069                                             _mm_mul_ps(onefourth_SSE,
3070                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
3071                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
3072             t1_SSE2            = _mm_sub_ps(t1_SSE2,
3073                                             _mm_mul_ps(onefourth_SSE,
3074                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
3075                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
3076             t1_SSE3            = _mm_sub_ps(t1_SSE3,
3077                                             _mm_mul_ps(onefourth_SSE,
3078                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
3079                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
3080             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
3081                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
3082                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
3083             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
3084                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
3085                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
3086             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
3087                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
3088                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
3089             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
3090                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
3091                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
3092             t2_SSE0            = _mm_sub_ps(t2_SSE0,
3093                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
3094                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
3095             t2_SSE1            = _mm_sub_ps(t2_SSE1,
3096                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
3097                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
3098             t2_SSE2            = _mm_sub_ps(t2_SSE2,
3099                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
3100                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
3101             t2_SSE3            = _mm_sub_ps(t2_SSE3,
3102                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
3103                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
3104             
3105             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
3106                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
3107             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
3108                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
3109             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
3110                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
3111             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
3112                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
3113             
3114             t3_SSE0            = _mm_sub_ps(t3_SSE0,
3115                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
3116                                                        _mm_add_ps(one_SSE,
3117                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
3118             t3_SSE1            = _mm_sub_ps(t3_SSE1,
3119                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
3120                                                        _mm_add_ps(one_SSE,
3121                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
3122             t3_SSE2            = _mm_sub_ps(t3_SSE2,
3123                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
3124                                                        _mm_add_ps(one_SSE,
3125                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
3126             t3_SSE3            = _mm_sub_ps(t3_SSE3,
3127                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
3128                                                        _mm_add_ps(one_SSE,
3129                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
3130             
3131             
3132             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
3133                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
3134                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
3135             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
3136                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
3137                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
3138             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
3139                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
3140                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
3141             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
3142                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
3143                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
3144             
3145             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
3146             dadx += 4;
3147             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
3148             dadx += 4;
3149             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
3150             dadx += 4;
3151             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
3152             dadx += 4;
3153         }
3154         _MM_TRANSPOSE4_PS(sum_ai_SSE0,sum_ai_SSE1,sum_ai_SSE2,sum_ai_SSE3);
3155         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,sum_ai_SSE1);
3156         sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2,sum_ai_SSE3);
3157         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,sum_ai_SSE2);
3158         _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
3159         }    
3160     
3161     
3162     for(i=0;i<natoms/2+1;i++)
3163     {
3164         work[i] += work[natoms+i];
3165     }
3166     
3167     /* Parallel summations */
3168     
3169         if(PARTDECOMP(cr))
3170         {
3171                 gmx_sum(natoms,work, cr);
3172         }
3173         
3174     if(gb_algorithm==egbHCT)
3175     {
3176         /* HCT */
3177         for(i=0;i<natoms;i++)
3178         {
3179             if(born->use[i] != 0)
3180             {
3181                 rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset; 
3182                 sum_ai  = 1.0/rai - work[i];
3183                 min_rad = rai + born->gb_doffset;
3184                 rad     = 1.0/sum_ai; 
3185                 
3186                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
3187                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3188             }
3189         }
3190         
3191     }
3192     else
3193     {
3194         /* OBC */
3195         
3196         /* Calculate the radii */
3197         for(i=0;i<natoms;i++)
3198         {
3199             
3200             if(born->use[i] != 0)
3201             {
3202                 rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
3203                 rai_inv2   = 1.0/rai;
3204                 rai        = rai-born->gb_doffset; 
3205                 rai_inv    = 1.0/rai;
3206                 sum_ai     = rai * work[i];
3207                 sum_ai2    = sum_ai  * sum_ai;
3208                 sum_ai3    = sum_ai2 * sum_ai;
3209                 
3210                 tsum    = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
3211                 born->bRad[i] = rai_inv - tsum*rai_inv2;
3212                 born->bRad[i] = 1.0 / born->bRad[i];
3213                 
3214                 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
3215                 
3216                 tchain  = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
3217                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
3218             }
3219         }
3220     }
3221         
3222         return 0;
3223 }
3224
3225
3226
3227
3228
3229
3230
3231
3232 int
3233 genborn_allvsall_calc_chainrule_sse2_single(t_forcerec *           fr,
3234                                             t_mdatoms *            mdatoms,
3235                                             gmx_genborn_t *        born,
3236                                             real *                 x,
3237                                             real *                 f,
3238                                             int                    gb_algorithm,
3239                                             void *                 paadata)
3240 {
3241         gmx_allvsallgb2_data_t *aadata;
3242         int        natoms;
3243         int        ni0,ni1;
3244         int        nj0,nj1,nj2,nj3;
3245         int        i,j,k,n;
3246     int        idx;
3247     int *      mask;
3248     int *      pmask0;
3249     int *      emask0;
3250     int *      jindex;
3251
3252     real       ix,iy,iz;
3253     real       fix,fiy,fiz;
3254     real       jx,jy,jz;
3255     real       dx,dy,dz;
3256     real       tx,ty,tz;
3257     real       rbai,rbaj,fgb,fgb_ai,rbi;
3258     real *     rb;
3259     real *     dadx;
3260     real *     x_align;
3261     real *     y_align;
3262     real *     z_align;
3263     real *     fx_align;
3264     real *     fy_align;
3265     real *     fz_align;
3266     real       tmpsum[4];
3267     
3268     __m128     jmask_SSE0,jmask_SSE1,jmask_SSE2,jmask_SSE3;
3269     __m128     ix_SSE0,iy_SSE0,iz_SSE0;
3270     __m128     ix_SSE1,iy_SSE1,iz_SSE1;
3271     __m128     ix_SSE2,iy_SSE2,iz_SSE2;
3272     __m128     ix_SSE3,iy_SSE3,iz_SSE3;
3273     __m128     fix_SSE0,fiy_SSE0,fiz_SSE0;
3274     __m128     fix_SSE1,fiy_SSE1,fiz_SSE1;
3275     __m128     fix_SSE2,fiy_SSE2,fiz_SSE2;
3276     __m128     fix_SSE3,fiy_SSE3,fiz_SSE3;
3277     __m128     rbai_SSE0,rbai_SSE1,rbai_SSE2,rbai_SSE3;
3278     __m128     imask_SSE0,imask_SSE1,imask_SSE2,imask_SSE3;
3279     __m128     jx_SSE,jy_SSE,jz_SSE,rbaj_SSE;
3280     __m128     dx_SSE0,dy_SSE0,dz_SSE0;
3281     __m128     dx_SSE1,dy_SSE1,dz_SSE1;
3282     __m128     dx_SSE2,dy_SSE2,dz_SSE2;
3283     __m128     dx_SSE3,dy_SSE3,dz_SSE3;
3284     __m128     fgb_SSE0,fgb_ai_SSE0;
3285     __m128     fgb_SSE1,fgb_ai_SSE1;
3286     __m128     fgb_SSE2,fgb_ai_SSE2;
3287     __m128     fgb_SSE3,fgb_ai_SSE3;
3288     __m128     tx_SSE0,ty_SSE0,tz_SSE0;
3289     __m128     tx_SSE1,ty_SSE1,tz_SSE1;
3290     __m128     tx_SSE2,ty_SSE2,tz_SSE2;
3291     __m128     tx_SSE3,ty_SSE3,tz_SSE3;
3292     __m128     t1,t2;
3293     
3294     natoms              = mdatoms->nr;
3295         ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
3296         ni1                 = mdatoms->start+mdatoms->homenr;
3297     dadx                = fr->dadx;
3298     
3299     aadata = (gmx_allvsallgb2_data_t *)paadata;
3300
3301     x_align = aadata->x_align;
3302         y_align = aadata->y_align;
3303         z_align = aadata->z_align;
3304     fx_align = aadata->fx_align;
3305         fy_align = aadata->fy_align;
3306         fz_align = aadata->fz_align;
3307     
3308     jindex    = aadata->jindex_gb;
3309     dadx      = fr->dadx;
3310
3311     n = 0;
3312     rb = aadata->work;
3313     
3314         /* Loop to get the proper form for the Born radius term */
3315         if(gb_algorithm==egbSTILL) 
3316         {
3317                 for(i=0;i<natoms;i++)
3318                 {
3319                         rbi   = born->bRad[i];
3320                         rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
3321                 }
3322         }
3323         else if(gb_algorithm==egbHCT) 
3324         {
3325                 for(i=0;i<natoms;i++)
3326                 {
3327                         rbi   = born->bRad[i];
3328                         rb[i] = rbi * rbi * fr->dvda[i];
3329                 }
3330         }
3331         else if(gb_algorithm==egbOBC) 
3332         {
3333                 for(idx=0;idx<natoms;idx++)
3334                 {
3335                         rbi   = born->bRad[idx];
3336                         rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
3337                 }
3338         }
3339     
3340     for(i=0;i<2*natoms;i++)
3341         {
3342                 fx_align[i]       = 0;
3343                 fy_align[i]       = 0;
3344                 fz_align[i]       = 0;
3345     }        
3346     
3347     
3348     for(i=0;i<natoms;i++)
3349     {
3350         rb[i+natoms]=rb[i];
3351     }
3352
3353     for(i=ni0; i<ni1; i+=UNROLLI)
3354         {
3355                 /* We assume shifts are NOT used for all-vs-all interactions */
3356                 
3357                 /* Load i atom data */
3358                 ix_SSE0          = _mm_load1_ps(x_align+i);
3359                 iy_SSE0          = _mm_load1_ps(y_align+i);
3360                 iz_SSE0          = _mm_load1_ps(z_align+i);
3361                 ix_SSE1          = _mm_load1_ps(x_align+i+1);
3362                 iy_SSE1          = _mm_load1_ps(y_align+i+1);
3363                 iz_SSE1          = _mm_load1_ps(z_align+i+1);
3364                 ix_SSE2          = _mm_load1_ps(x_align+i+2);
3365                 iy_SSE2          = _mm_load1_ps(y_align+i+2);
3366                 iz_SSE2          = _mm_load1_ps(z_align+i+2);
3367                 ix_SSE3          = _mm_load1_ps(x_align+i+3);
3368                 iy_SSE3          = _mm_load1_ps(y_align+i+3);
3369                 iz_SSE3          = _mm_load1_ps(z_align+i+3);
3370         
3371                 fix_SSE0         = _mm_setzero_ps();
3372                 fiy_SSE0         = _mm_setzero_ps();
3373                 fiz_SSE0         = _mm_setzero_ps();
3374                 fix_SSE1         = _mm_setzero_ps();
3375                 fiy_SSE1         = _mm_setzero_ps();
3376                 fiz_SSE1         = _mm_setzero_ps();
3377                 fix_SSE2         = _mm_setzero_ps();
3378                 fiy_SSE2         = _mm_setzero_ps();
3379                 fiz_SSE2         = _mm_setzero_ps();
3380                 fix_SSE3         = _mm_setzero_ps();
3381                 fiy_SSE3         = _mm_setzero_ps();
3382                 fiz_SSE3         = _mm_setzero_ps();
3383         
3384         rbai_SSE0        = _mm_load1_ps(rb+i);
3385         rbai_SSE1        = _mm_load1_ps(rb+i+1);
3386         rbai_SSE2        = _mm_load1_ps(rb+i+2);
3387         rbai_SSE3        = _mm_load1_ps(rb+i+3);
3388         
3389                 /* Load limits for loop over neighbors */
3390                 nj0              = jindex[4*i];
3391                 nj3              = jindex[4*i+3];
3392
3393         /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
3394         for(j=nj0; j<nj3; j+=UNROLLJ)
3395         {          
3396             /* load j atom coordinates */
3397             jx_SSE           = _mm_load_ps(x_align+j);
3398             jy_SSE           = _mm_load_ps(y_align+j);
3399             jz_SSE           = _mm_load_ps(z_align+j);
3400             
3401             /* Calculate distance */
3402             dx_SSE0          = _mm_sub_ps(ix_SSE0,jx_SSE);
3403             dy_SSE0          = _mm_sub_ps(iy_SSE0,jy_SSE);
3404             dz_SSE0          = _mm_sub_ps(iz_SSE0,jz_SSE);
3405             dx_SSE1          = _mm_sub_ps(ix_SSE1,jx_SSE);
3406             dy_SSE1          = _mm_sub_ps(iy_SSE1,jy_SSE);
3407             dz_SSE1          = _mm_sub_ps(iz_SSE1,jz_SSE);
3408             dx_SSE2          = _mm_sub_ps(ix_SSE2,jx_SSE);
3409             dy_SSE2          = _mm_sub_ps(iy_SSE2,jy_SSE);
3410             dz_SSE2          = _mm_sub_ps(iz_SSE2,jz_SSE);
3411             dx_SSE3          = _mm_sub_ps(ix_SSE3,jx_SSE);
3412             dy_SSE3          = _mm_sub_ps(iy_SSE3,jy_SSE);
3413             dz_SSE3          = _mm_sub_ps(iz_SSE3,jz_SSE);
3414             
3415             rbaj_SSE         = _mm_load_ps(rb+j);
3416             
3417             fgb_SSE0         = _mm_mul_ps(rbai_SSE0,_mm_load_ps(dadx));            
3418             dadx += 4;
3419             fgb_SSE1         = _mm_mul_ps(rbai_SSE1,_mm_load_ps(dadx));
3420             dadx += 4;
3421             fgb_SSE2         = _mm_mul_ps(rbai_SSE2,_mm_load_ps(dadx));
3422             dadx += 4;
3423             fgb_SSE3         = _mm_mul_ps(rbai_SSE3,_mm_load_ps(dadx));
3424             dadx += 4;
3425                         
3426             fgb_ai_SSE0      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3427             dadx +=4;
3428             fgb_ai_SSE1      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3429             dadx +=4;
3430             fgb_ai_SSE2      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3431             dadx +=4;
3432             fgb_ai_SSE3      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3433             dadx +=4;
3434             
3435             /* Total force between ai and aj is the sum of ai->aj and aj->ai */
3436             fgb_SSE0         = _mm_add_ps(fgb_SSE0,fgb_ai_SSE0);
3437             fgb_SSE1         = _mm_add_ps(fgb_SSE1,fgb_ai_SSE1);
3438             fgb_SSE2         = _mm_add_ps(fgb_SSE2,fgb_ai_SSE2);
3439             fgb_SSE3         = _mm_add_ps(fgb_SSE3,fgb_ai_SSE3);
3440                             
3441             /* Calculate temporary vectorial force */
3442             tx_SSE0            = _mm_mul_ps(fgb_SSE0,dx_SSE0);
3443             ty_SSE0            = _mm_mul_ps(fgb_SSE0,dy_SSE0);
3444             tz_SSE0            = _mm_mul_ps(fgb_SSE0,dz_SSE0);
3445             tx_SSE1            = _mm_mul_ps(fgb_SSE1,dx_SSE1);
3446             ty_SSE1            = _mm_mul_ps(fgb_SSE1,dy_SSE1);
3447             tz_SSE1            = _mm_mul_ps(fgb_SSE1,dz_SSE1);
3448             tx_SSE2            = _mm_mul_ps(fgb_SSE2,dx_SSE2);
3449             ty_SSE2            = _mm_mul_ps(fgb_SSE2,dy_SSE2);
3450             tz_SSE2            = _mm_mul_ps(fgb_SSE2,dz_SSE2);
3451             tx_SSE3            = _mm_mul_ps(fgb_SSE3,dx_SSE3);
3452             ty_SSE3            = _mm_mul_ps(fgb_SSE3,dy_SSE3);
3453             tz_SSE3            = _mm_mul_ps(fgb_SSE3,dz_SSE3);
3454             
3455             /* Increment i atom force */
3456             fix_SSE0          = _mm_add_ps(fix_SSE0,tx_SSE0);
3457             fiy_SSE0          = _mm_add_ps(fiy_SSE0,ty_SSE0);
3458             fiz_SSE0          = _mm_add_ps(fiz_SSE0,tz_SSE0);
3459             fix_SSE1          = _mm_add_ps(fix_SSE1,tx_SSE1);
3460             fiy_SSE1          = _mm_add_ps(fiy_SSE1,ty_SSE1);
3461             fiz_SSE1          = _mm_add_ps(fiz_SSE1,tz_SSE1);
3462             fix_SSE2          = _mm_add_ps(fix_SSE2,tx_SSE2);
3463             fiy_SSE2          = _mm_add_ps(fiy_SSE2,ty_SSE2);
3464             fiz_SSE2          = _mm_add_ps(fiz_SSE2,tz_SSE2);
3465             fix_SSE3          = _mm_add_ps(fix_SSE3,tx_SSE3);
3466             fiy_SSE3          = _mm_add_ps(fiy_SSE3,ty_SSE3);
3467             fiz_SSE3          = _mm_add_ps(fiz_SSE3,tz_SSE3);
3468             
3469             /* Decrement j atom force */
3470             _mm_store_ps(fx_align+j,
3471                          _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
3472             _mm_store_ps(fy_align+j,
3473                          _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
3474             _mm_store_ps(fz_align+j,
3475                          _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
3476         }
3477                 /* Add i forces to mem and shifted force list */
3478         _MM_TRANSPOSE4_PS(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3);
3479         fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE1);
3480         fix_SSE2 = _mm_add_ps(fix_SSE2,fix_SSE3);
3481         fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE2);
3482         _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));
3483         
3484         _MM_TRANSPOSE4_PS(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3);
3485         fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE1);
3486         fiy_SSE2 = _mm_add_ps(fiy_SSE2,fiy_SSE3);
3487         fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE2);
3488         _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));
3489         
3490         _MM_TRANSPOSE4_PS(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3);
3491         fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE1);
3492         fiz_SSE2 = _mm_add_ps(fiz_SSE2,fiz_SSE3);
3493         fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE2);
3494         _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));                
3495         }    
3496         
3497     for(i=0;i<natoms;i++)
3498         {
3499                 f[3*i]       += fx_align[i] + fx_align[natoms+i];
3500                 f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
3501                 f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
3502     }
3503     
3504         return 0;
3505 }
3506
3507 #else
/* ISO C requires a translation unit to contain at least one declaration,
 * and some toolchains warn or error on an effectively empty object file.
 * This dummy symbol keeps the file non-empty when the SSE2 single-precision
 * kernels above are compiled out.
 */
int genborn_allvsall_sse2_single_dummy;
3510
3511
3512 #endif
3513