Merge remote-tracking branch 'origin/release-4-6'
[alexxy/gromacs.git] / src / gromacs / mdlib / genborn_allvsall_sse2_single.c
1 /*
2  *                This source code is part of
3  *
4  *                 G   R   O   M   A   C   S
5  *
6  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
7  * Copyright (c) 2001-2009, The GROMACS Development Team
8  *
9  * Gromacs is a library for molecular simulation and trajectory analysis,
10  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
11  * a full list of developers and information, check out http://www.gromacs.org
12  *
13  * This program is free software; you can redistribute it and/or modify it under 
14  * the terms of the GNU Lesser General Public License as published by the Free 
15  * Software Foundation; either version 2 of the License, or (at your option) any 
16  * later version.
17  * As a special exception, you may use this file as part of a free software
18  * library without restriction.  Specifically, if other files instantiate
19  * templates or use macros or inline functions from this file, or you compile
20  * this file and link it with other files to produce an executable, this
21  * file does not by itself cause the resulting executable to be covered by
22  * the GNU Lesser General Public License.  
23  *
24  * In plain-speak: do not worry about classes/macros/templates either - only
25  * changes to the library have to be LGPL, not an application linking with it.
26  *
27  * To help fund GROMACS development, we humbly ask that you cite
28  * the papers people have written on it - you can find them on the website!
29  */
30 #ifdef HAVE_CONFIG_H
31 #include <config.h>
32 #endif
33
34 #include <math.h>
35 #include "types/simple.h"
36
37 #include "vec.h"
38 #include "smalloc.h"
39
40 #include "partdec.h"
41 #include "network.h"
42 #include "physics.h"
43 #include "genborn.h"
44 #include "genborn_allvsall.h"
45
46 #if ( (defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_SSE2)) && !defined(GMX_DOUBLE) )
47
48 #include <gmx_sse2_single.h>
49
50
51 #define SIMD_WIDTH 4
52 #define UNROLLI    4
53 #define UNROLLJ    4
54
55
56
57
58
59
60
61
62
63 typedef struct 
64 {
65     int *      jindex_gb;
66     int **     prologue_mask_gb;
67     int **     epilogue_mask;
68     int *      imask;
69     real *     gb_radius;
70     real *     workparam;
71     real *     work;
72     real *     x_align;
73     real *     y_align;
74     real *     z_align;
75     real *     fx_align;
76         real *     fy_align;
77         real *     fz_align;        
78
79 gmx_allvsallgb2_data_t;
80
81
/* Return the largest j-offset atom i interacts with in the all-vs-all
 * scheme, where each pair is computed exactly once by letting every atom
 * interact with roughly the natoms/2 atoms following it (wrapping around).
 *
 * For odd natoms every atom gets natoms/2 partners. For even natoms the
 * pairs exactly opposite each other must be computed by only one of the
 * two atoms, so half the atoms get natoms/2 and the other half
 * natoms/2-1; which half depends on whether natoms is a multiple of 4.
 */
static int
calc_maxoffset(int i,int natoms)
{
    int half = natoms/2;
    int ieven;

    if ((natoms % 2) == 1)
    {
        /* Odd total: no "opposite" atom exists, everyone gets half */
        return half;
    }

    ieven = ((i % 2) == 0);

    if ((natoms % 4) == 0)
    {
        /* Multiple of four: the long range goes to even atoms in the
         * lower half and odd atoms in the upper half.
         */
        if (i < half)
        {
            return ieven ? half : half-1;
        }
        return ieven ? half-1 : half;
    }

    /* natoms even but natoms/2 odd: even atoms take the longer range */
    return ieven ? half : half-1;
}
133
/* Build the j-loop index ranges and exclusion masks for the all-vs-all
 * GB kernels.
 *
 * For each i atom we store four j indices in aadata->jindex_gb
 * ([4*i]..[4*i+3]): start of the masked prologue, end of prologue /
 * start of the unmasked main loop, start of the masked epilogue, and end.
 * The prologue masks remove excluded 1-2/1-3/1-4 pairs (when the
 * corresponding bInclude* flag is FALSE) plus j<=i pairs; the epilogue
 * masks implement the calc_maxoffset() cutoff so each pair is computed
 * exactly once.
 *
 * aadata  - work structure to fill (masks/indices allocated here)
 * ilist   - interaction lists; only F_GB12/F_GB13/F_GB14 entries are read
 * start   - first local i atom (inclusive)
 * end     - last local i atom (exclusive)
 * natoms  - global number of atoms
 * bInclude12/13/14 - if FALSE, mask out the corresponding bonded pairs
 */
static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
                                t_ilist *                  ilist,
                                int                        start,
                                int                        end,
                                int                        natoms,
                                gmx_bool                       bInclude12,
                                gmx_bool                       bInclude13,
                                gmx_bool                       bInclude14)
{
    int i,j,k,tp;
    int a1,a2;
    int ni0,ni1,nj0,nj1,nj;
    int imin,imax,iexcl;
    int max_offset;
    int max_excl_offset;
    int firstinteraction;
    int ibase;
    int  *pi;
    
    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not. 
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
     * we create a jindex array with three elements per i atom: the starting point, the point to
     * which we need to check exclusions, and the end point.
     * This way we only have to allocate a short exclusion mask per i atom.
     */
    
    /* Round the local i range outward to UNROLLI boundaries */
    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;
    
    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi,natoms+UNROLLI+2*SIMD_WIDTH);
    /* Round the allocation up to a 16-byte boundary for aligned SSE loads */
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->imask[i] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
    }
    
    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb,4*(natoms+UNROLLI));
    for(i=0;i<4*(natoms+UNROLLI);i++)
    {
        aadata->jindex_gb[i] = 0;
    }
    
    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb,natoms+UNROLLI); /* list of pointers */
    
    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }
    
    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        max_excl_offset = -1;
        
        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;
        
        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
        for(i=ibase;i<imax;i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset = calc_maxoffset(i,natoms);

            /* GB 1-2 exclusions: interaction list entries are triplets
             * (type, atom1, atom2), hence the stride of 3 and the +1/+2.
             * If the nominal first interaction partner is excluded, the
             * real first partner shifts up by one.
             */
            if(!bInclude12)
            {
                for(j=0;j<ilist[F_GB12].nr;j+=3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else 
                    {
                        continue;
                    }

                    if(k==firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            /* Same scan over the GB 1-3 list */
            if(!bInclude13)
            {
                for(j=0;j<ilist[F_GB13].nr;j+=3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];
                    
                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else 
                    {
                        continue;
                    }
                    
                    if(k==firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            /* Same scan over the GB 1-4 list */
            if(!bInclude14)
            {
                for(j=0;j<ilist[F_GB14].nr;j+=3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];
                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else 
                    {
                        continue;
                    }
                    
                    if(k==firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            imin = (firstinteraction < imin) ? firstinteraction : imin;
        }
        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;

        /* Second pass: find the largest offset (relative to imin) at which
         * any of the four i atoms still has an exclusion; that determines
         * how long the masked prologue must be for this i tuplet.
         */
        for(i=ibase;i<imax;i++)
        {
            max_offset = calc_maxoffset(i,natoms);
            
            if(!bInclude12)
            {
                for(j=0;j<ilist[F_GB12].nr;j+=3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];
                    
                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else 
                    {
                        continue;
                    }
                    
                    /* Map the excluded atom into the wrapped j range
                     * [imin, imin+natoms) before comparing offsets.
                     */
                    if(k<imin)
                    {
                        k += natoms;
                    }
                    
                    if(k>i+max_offset)
                    {
                        continue;
                    }
                    
                    k = k - imin;
                    
                    if( k+natoms <= max_offset )
                    {
                        k+=natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if(!bInclude13)
            {
                for(j=0;j<ilist[F_GB13].nr;j+=3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];
                    
                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else 
                    {
                        continue;
                    }
                    
                    if(k<imin)
                    {
                        k += natoms;
                    }
                    
                    if(k>i+max_offset)
                    {
                        continue;
                    }
                    
                    k = k - imin;
                    
                    if( k+natoms <= max_offset )
                    {
                        k+=natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if(!bInclude14)
            {
                for(j=0;j<ilist[F_GB14].nr;j+=3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else 
                    {
                        continue;
                    }
                    
                    if(k<imin)
                    {
                        k += natoms;
                    }
                    
                    if(k>i+max_offset)
                    {
                        continue;
                    }
                    
                    k = k - imin;
                    
                    if( k+natoms <= max_offset )
                    {
                        k+=natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
        }

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;
        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;
        
        /* Set all the prologue masks length to this value (even for i>end) */
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }        
    }
     
    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {      
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];
            
            /* Allocate aligned memory */
            snew(pi,nj+2*SIMD_WIDTH);
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
            
            max_offset = calc_maxoffset(i,natoms);
            
            /* Include interactions i+1 <= j < i+maxoffset */
            for(k=0;k<nj;k++)
            {
                j = imin + k;
                
                if( (j>i) && (j<=i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][k] = 0;
                }
            }
            
            /* Clear out the explicit exclusions */
            if(i<end)
            {
                if(!bInclude12)
                {
                    for(j=0;j<ilist[F_GB12].nr;j+=3)
                    {
                        a1 = ilist[F_GB12].iatoms[j+1];
                        a2 = ilist[F_GB12].iatoms[j+2];
                        
                        if(a1==i)
                        {
                            k = a2;
                        }
                        else if(a2==i)
                        {
                            k = a1;
                        }
                        else 
                        {
                            continue;
                        }
                        
                        if(k>i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;
                        
                        /* Wrap the offset if the pair is handled from the
                         * other side of the periodic i/j split.
                         */
                        if( k+natoms <= max_offset )
                        {
                            k+=natoms;
                        }
                        
                        /* Convert back to an index into this prologue mask */
                        k = k+i-imin;
                        if(k>=0)
                        {                        
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
                if(!bInclude13)
                {
                    for(j=0;j<ilist[F_GB13].nr;j+=3)
                    {
                        a1 = ilist[F_GB13].iatoms[j+1];
                        a2 = ilist[F_GB13].iatoms[j+2];
 
                        if(a1==i)
                        {
                            k = a2;
                        }
                        else if(a2==i)
                        {
                            k = a1;
                        }
                        else 
                        {
                            continue;
                        }
                        
                        if(k>i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;
                        
                        if( k+natoms <= max_offset )
                        {
                            k+=natoms;
                        }
                        
                        k = k+i-imin;
                        if(k>=0)
                        {                        
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
                if(!bInclude14)
                {
                    for(j=0;j<ilist[F_GB14].nr;j+=3)
                    {
                        a1 = ilist[F_GB14].iatoms[j+1];
                        a2 = ilist[F_GB14].iatoms[j+2];

                        if(a1==i)
                        {
                            k = a2;
                        }
                        else if(a2==i)
                        {
                            k = a1;
                        }
                        else 
                        {
                            continue;
                        }
                        
                        if(k>i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;
                        
                        if( k+natoms <= max_offset )
                        {
                            k+=natoms;
                        }
                        
                        k = k+i-imin;
                        if(k>=0)
                        {                        
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
            }
        }
    }
    
    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask,natoms+UNROLLI);
    
    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i] = NULL;
    }
    
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {      
        /* Find the lowest index for which we need to use the epilogue */
        imin = ibase;
        max_offset = calc_maxoffset(imin,natoms);

        imin = imin + 1 + max_offset;
        
        /* Find largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end; 
        
        max_offset = calc_maxoffset(imax,natoms);
        imax = imax + 1 + max_offset + UNROLLJ - 1;
        
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we dont overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we dont have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    }
    
    /* And fill it with data... */
    
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            
            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];
            
            /* Allocate aligned memory */
            snew(pi,nj+2*SIMD_WIDTH);
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
            
            max_offset = calc_maxoffset(i,natoms);
            
            /* Epilogue mask only enforces the maxoffset cutoff; explicit
             * exclusions were all handled in the prologue range.
             */
            for(k=0;k<nj;k++)
            {
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}
638
639
/* Allocate and initialize the all-vs-all GB work structure.
 *
 * Allocates 16-byte-aligned coordinate/force/parameter buffers sized for
 * 2*natoms (the data is duplicated in the upper half so the wrapped
 * all-vs-all j loop never needs a modulo), fills in per-atom GB radii and
 * the algorithm-specific work parameter, and finally builds the exclusion
 * masks/indices via setup_gb_exclusions_and_indices().
 *
 * p_aadata      - receives the newly allocated work structure
 * top           - topology; atomtypes.gb_radius and idef.il are read
 * born          - GB data; vsolv (Still) or param (OBC) is copied
 * mdatoms       - atom data; nr, start, homenr, typeA are read
 * radius_offset - subtracted from each atom's GB radius
 * gb_algorithm  - egbSTILL or egbOBC; selects which parameter to copy
 * bInclude12/13/14 - forwarded to the exclusion setup
 *
 * NOTE(review): for gb_algorithm values other than STILL/OBC, workparam
 * is left uninitialized by the loop below — presumably callers only pass
 * those two algorithms; verify against callers.
 */
static void
genborn_allvsall_setup(gmx_allvsallgb2_data_t **  p_aadata,
                       gmx_localtop_t *           top,
                       gmx_genborn_t *            born,
                       t_mdatoms *                mdatoms,
                       real                       radius_offset,
                       int                        gb_algorithm,
                       gmx_bool                       bInclude12,
                       gmx_bool                       bInclude13,
                       gmx_bool                       bInclude14)
{
    int i,j,idx;
    int natoms;
    gmx_allvsallgb2_data_t *aadata;
    real *p;
    
    natoms = mdatoms->nr;
    
    snew(aadata,1);
    *p_aadata = aadata;
    
    /* Each buffer is over-allocated by 2*SIMD_WIDTH reals so the pointer
     * can be rounded up to a 16-byte boundary for aligned SSE access.
     */
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    
    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    
    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    
    /* Per-atom parameters: offset GB radius plus the algorithm-specific
     * work parameter (Still: solvation volume, OBC: scaled parameter).
     */
    for(i=0;i<mdatoms->nr;i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if(gb_algorithm==egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if(gb_algorithm==egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i]      = 0.0;
    }
    /* Duplicate the parameters into [natoms, 2*natoms) so wrapped j
     * indices can be used without a modulo in the kernels.
     */
    for(i=0;i<mdatoms->nr;i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i]; 
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }
    
    /* Zero the aligned coordinate/force buffers, including padding */
    for(i=0;i<2*natoms+SIMD_WIDTH;i++)
    {
        aadata->x_align[i] = 0.0;
        aadata->y_align[i] = 0.0;
        aadata->z_align[i] = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }
    
    /* Build exclusion masks and j-loop index ranges for the local atoms */
    setup_gb_exclusions_and_indices(aadata,top->idef.il,mdatoms->start,mdatoms->start+mdatoms->homenr,mdatoms->nr,
                                    bInclude12,bInclude13,bInclude14);
}
716
717
718 int
719 genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *           fr,
720                                               t_mdatoms *            mdatoms,
721                                               gmx_genborn_t *        born,
722                                               gmx_localtop_t *       top,
723                                               real *                 x,
724                                               t_commrec *            cr,
725                                               void *                 paadata)
726 {
727         gmx_allvsallgb2_data_t *aadata;
728         int        natoms;
729         int        ni0,ni1;
730         int        nj0,nj1,nj2,nj3;
731         int        i,j,k,n;
732     int *      mask;
733     int *      pmask0;
734     int *      pmask1;
735     int *      pmask2;
736     int *      pmask3;
737     int *      emask0;
738     int *      emask1;
739     int *      emask2;
740     int *      emask3;
741     real       ix,iy,iz;
742     real       jx,jy,jz;
743     real       dx,dy,dz;
744     real       rsq,rinv;
745     real       gpi,rai,vai;
746     real       prod_ai;
747     real       irsq,idr4,idr6;
748     real       raj,rvdw,ratio;
749     real       vaj,ccf,dccf,theta,cosq;
750     real       term,prod,icf4,icf6,gpi2,factor,sinq;
751     real *     gb_radius;
752     real *     vsolv;
753     real *     work;
754     real       tmpsum[4];
755     real *     x_align;
756     real *     y_align;
757     real *     z_align;
758     int *      jindex;
759     real *     dadx;
760     
761     __m128     ix_SSE0,iy_SSE0,iz_SSE0;
762     __m128     ix_SSE1,iy_SSE1,iz_SSE1;
763     __m128     ix_SSE2,iy_SSE2,iz_SSE2;
764     __m128     ix_SSE3,iy_SSE3,iz_SSE3;
765     __m128     gpi_SSE0,rai_SSE0,prod_ai_SSE0;
766     __m128     gpi_SSE1,rai_SSE1,prod_ai_SSE1;
767     __m128     gpi_SSE2,rai_SSE2,prod_ai_SSE2;
768     __m128     gpi_SSE3,rai_SSE3,prod_ai_SSE3;
769     __m128     imask_SSE0,jmask_SSE0;
770     __m128     imask_SSE1,jmask_SSE1;
771     __m128     imask_SSE2,jmask_SSE2;
772     __m128     imask_SSE3,jmask_SSE3;
773     __m128     jx_SSE,jy_SSE,jz_SSE;
774     __m128     dx_SSE0,dy_SSE0,dz_SSE0;
775     __m128     dx_SSE1,dy_SSE1,dz_SSE1;
776     __m128     dx_SSE2,dy_SSE2,dz_SSE2;
777     __m128     dx_SSE3,dy_SSE3,dz_SSE3;
778     __m128     rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
779     __m128     rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
780     __m128     rsq_SSE2,rinv_SSE2,irsq_SSE2,idr4_SSE2,idr6_SSE2;
781     __m128     rsq_SSE3,rinv_SSE3,irsq_SSE3,idr4_SSE3,idr6_SSE3;
782     __m128     raj_SSE,vaj_SSE,prod_SSE;
783     __m128     rvdw_SSE0,ratio_SSE0;
784     __m128     rvdw_SSE1,ratio_SSE1;
785     __m128     rvdw_SSE2,ratio_SSE2;
786     __m128     rvdw_SSE3,ratio_SSE3;
787     __m128     theta_SSE0,sinq_SSE0,cosq_SSE0,term_SSE0;
788     __m128     theta_SSE1,sinq_SSE1,cosq_SSE1,term_SSE1;
789     __m128     theta_SSE2,sinq_SSE2,cosq_SSE2,term_SSE2;
790     __m128     theta_SSE3,sinq_SSE3,cosq_SSE3,term_SSE3;
791     __m128     ccf_SSE0,dccf_SSE0;
792     __m128     ccf_SSE1,dccf_SSE1;
793     __m128     ccf_SSE2,dccf_SSE2;
794     __m128     ccf_SSE3,dccf_SSE3;
795     __m128     icf4_SSE0,icf6_SSE0;
796     __m128     icf4_SSE1,icf6_SSE1;
797     __m128     icf4_SSE2,icf6_SSE2;
798     __m128     icf4_SSE3,icf6_SSE3;
799     __m128     half_SSE,one_SSE,two_SSE,four_SSE;
800     __m128     still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
801     
802     natoms              = mdatoms->nr;
803         ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
804         ni1                 = mdatoms->start+mdatoms->homenr;
805
806     n = 0;
807     
808     aadata = *((gmx_allvsallgb2_data_t **)paadata);
809
810     
811         if(aadata==NULL)
812         {
813                 genborn_allvsall_setup(&aadata,top,born,mdatoms,0.0,
814                                egbSTILL,FALSE,FALSE,TRUE);
815         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
816         }
817         
818     x_align = aadata->x_align;
819         y_align = aadata->y_align;
820         z_align = aadata->z_align;
821
822     gb_radius = aadata->gb_radius;
823     vsolv     = aadata->workparam;
824     work      = aadata->work;
825     jindex    = aadata->jindex_gb;
826     dadx      = fr->dadx;
827     
828     still_p4_SSE    = _mm_set1_ps(STILL_P4);
829     still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
830     still_pip5_SSE  = _mm_set1_ps(STILL_PIP5);
831     half_SSE        = _mm_set1_ps(0.5);
832     one_SSE         = _mm_set1_ps(1.0);
833     two_SSE         = _mm_set1_ps(2.0);
834     four_SSE        = _mm_set1_ps(4.0);
835     
836     /* This will be summed, so it has to extend to natoms + buffer */
837     for(i=0;i<natoms+1+natoms/2;i++)
838     {
839         work[i] = 0;
840     }
841     
842         for(i=ni0;i<ni1+1+natoms/2;i++)
843         {
844         k = i%natoms;
845                 x_align[i]  = x[3*k];
846                 y_align[i]  = x[3*k+1];
847                 z_align[i]  = x[3*k+2];
848         work[i]     = 0;
849     }        
850     
851     
852         for(i=ni0; i<ni1; i+=UNROLLI)
853         {
854                 /* We assume shifts are NOT used for all-vs-all interactions */
855
856                 /* Load i atom data */
857                 ix_SSE0          = _mm_load1_ps(x_align+i);
858                 iy_SSE0          = _mm_load1_ps(y_align+i);
859                 iz_SSE0          = _mm_load1_ps(z_align+i);
860                 ix_SSE1          = _mm_load1_ps(x_align+i+1);
861                 iy_SSE1          = _mm_load1_ps(y_align+i+1);
862                 iz_SSE1          = _mm_load1_ps(z_align+i+1);
863                 ix_SSE2          = _mm_load1_ps(x_align+i+2);
864                 iy_SSE2          = _mm_load1_ps(y_align+i+2);
865                 iz_SSE2          = _mm_load1_ps(z_align+i+2);
866                 ix_SSE3          = _mm_load1_ps(x_align+i+3);
867                 iy_SSE3          = _mm_load1_ps(y_align+i+3);
868                 iz_SSE3          = _mm_load1_ps(z_align+i+3);
869         
870         gpi_SSE0         = _mm_setzero_ps();
871         gpi_SSE1         = _mm_setzero_ps();
872         gpi_SSE2         = _mm_setzero_ps();
873         gpi_SSE3         = _mm_setzero_ps();
874
875         rai_SSE0         = _mm_load1_ps(gb_radius+i);
876         rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
877         rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
878         rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
879         
880         prod_ai_SSE0     = _mm_set1_ps(STILL_P4*vsolv[i]);
881         prod_ai_SSE1     = _mm_set1_ps(STILL_P4*vsolv[i+1]);
882         prod_ai_SSE2     = _mm_set1_ps(STILL_P4*vsolv[i+2]);
883         prod_ai_SSE3     = _mm_set1_ps(STILL_P4*vsolv[i+3]);
884         
885                 /* Load limits for loop over neighbors */
886                 nj0              = jindex[4*i];
887                 nj1              = jindex[4*i+1];
888                 nj2              = jindex[4*i+2];
889                 nj3              = jindex[4*i+3];
890                         
891         pmask0           = aadata->prologue_mask_gb[i];
892         pmask1           = aadata->prologue_mask_gb[i+1];
893         pmask2           = aadata->prologue_mask_gb[i+2];
894         pmask3           = aadata->prologue_mask_gb[i+3];
895         emask0           = aadata->epilogue_mask[i]; 
896         emask1           = aadata->epilogue_mask[i+1]; 
897         emask2           = aadata->epilogue_mask[i+2]; 
898         emask3           = aadata->epilogue_mask[i+3]; 
899
900         imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
901         imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
902         imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
903         imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
904         
905         /* Prologue part, including exclusion mask */
906         for(j=nj0; j<nj1; j+=UNROLLJ)
907         {          
908             jmask_SSE0 = _mm_load_ps((real *)pmask0);
909             jmask_SSE1 = _mm_load_ps((real *)pmask1);
910             jmask_SSE2 = _mm_load_ps((real *)pmask2);
911             jmask_SSE3 = _mm_load_ps((real *)pmask3);
912             pmask0 += UNROLLJ;
913             pmask1 += UNROLLJ;
914             pmask2 += UNROLLJ;
915             pmask3 += UNROLLJ;
916             
917             /* load j atom coordinates */
918             jx_SSE            = _mm_load_ps(x_align+j);
919             jy_SSE            = _mm_load_ps(y_align+j);
920             jz_SSE            = _mm_load_ps(z_align+j);
921             
922             /* Calculate distance */
923             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
924             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
925             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
926             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
927             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
928             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
929             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
930             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
931             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
932             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
933             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
934             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
935             
936             /* rsq = dx*dx+dy*dy+dz*dz */
937             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
938             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
939             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
940             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
941             
942             /* Combine masks */
943             jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
944             jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
945             jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
946             jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
947             
948             /* Calculate 1/r and 1/r2 */
949             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
950             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
951             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
952             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
953             
954             /* Apply mask */
955             rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
956             rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
957             rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
958             rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
959             
960             irsq_SSE0          = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
961             irsq_SSE1          = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
962             irsq_SSE2          = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
963             irsq_SSE3          = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
964             idr4_SSE0          = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
965             idr4_SSE1          = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
966             idr4_SSE2          = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
967             idr4_SSE3          = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
968             idr6_SSE0          = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
969             idr6_SSE1          = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
970             idr6_SSE2          = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
971             idr6_SSE3          = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
972             
973             raj_SSE            = _mm_load_ps(gb_radius+j);
974             vaj_SSE            = _mm_load_ps(vsolv+j);
975             
976             rvdw_SSE0          = _mm_add_ps(rai_SSE0,raj_SSE);
977             rvdw_SSE1          = _mm_add_ps(rai_SSE1,raj_SSE);
978             rvdw_SSE2          = _mm_add_ps(rai_SSE2,raj_SSE);
979             rvdw_SSE3          = _mm_add_ps(rai_SSE3,raj_SSE);
980             
981             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
982             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
983             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
984             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
985
986             ratio_SSE0         = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
987             ratio_SSE1         = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
988             ratio_SSE2         = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
989             ratio_SSE3         = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
990             theta_SSE0         = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
991             theta_SSE1         = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
992             theta_SSE2         = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
993             theta_SSE3         = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
994             gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);            
995             gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);            
996             gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);            
997             gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);            
998             term_SSE0          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
999             term_SSE1          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
1000             term_SSE2          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
1001             term_SSE3          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
1002             ccf_SSE0           = _mm_mul_ps(term_SSE0,term_SSE0);
1003             ccf_SSE1           = _mm_mul_ps(term_SSE1,term_SSE1);
1004             ccf_SSE2           = _mm_mul_ps(term_SSE2,term_SSE2);
1005             ccf_SSE3           = _mm_mul_ps(term_SSE3,term_SSE3);
1006             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
1007                                             _mm_mul_ps(sinq_SSE0,theta_SSE0));
1008             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
1009                                             _mm_mul_ps(sinq_SSE1,theta_SSE1));
1010             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
1011                                             _mm_mul_ps(sinq_SSE2,theta_SSE2));
1012             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
1013                                             _mm_mul_ps(sinq_SSE3,theta_SSE3));
1014
1015             prod_SSE           = _mm_mul_ps(still_p4_SSE,vaj_SSE);
1016             icf4_SSE0          = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
1017             icf4_SSE1          = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
1018             icf4_SSE2          = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
1019             icf4_SSE3          = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
1020             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1021             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1022             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
1023             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
1024
1025             _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
1026                                              gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
1027                                                          _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
1028                                                          _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
1029                                                          _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
1030             
1031             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
1032             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
1033             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
1034             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
1035
1036             /* Save ai->aj and aj->ai chain rule terms */
1037             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
1038             dadx+=4;
1039             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
1040             dadx+=4;
1041             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
1042             dadx+=4;
1043             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
1044             dadx+=4;            
1045             
1046             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
1047             dadx+=4;
1048             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
1049             dadx+=4;
1050             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
1051             dadx+=4;
1052             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
1053             dadx+=4;
1054         }
1055                                  
1056         /* Main part, no exclusions */
1057         for(j=nj1; j<nj2; j+=UNROLLJ)
1058         {               
1059             /* load j atom coordinates */
1060             jx_SSE            = _mm_load_ps(x_align+j);
1061             jy_SSE            = _mm_load_ps(y_align+j);
1062             jz_SSE            = _mm_load_ps(z_align+j);
1063             
1064             /* Calculate distance */
1065             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
1066             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
1067             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
1068             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
1069             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
1070             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
1071             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
1072             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
1073             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
1074             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
1075             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
1076             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
1077             
1078             /* rsq = dx*dx+dy*dy+dz*dz */
1079             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
1080             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
1081             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
1082             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
1083             
1084             /* Calculate 1/r and 1/r2 */
1085             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1086             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1087             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1088             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1089             
1090             /* Apply mask */
1091             rinv_SSE0          = _mm_and_ps(rinv_SSE0,imask_SSE0);
1092             rinv_SSE1          = _mm_and_ps(rinv_SSE1,imask_SSE1);
1093             rinv_SSE2          = _mm_and_ps(rinv_SSE2,imask_SSE2);
1094             rinv_SSE3          = _mm_and_ps(rinv_SSE3,imask_SSE3);
1095             
1096             irsq_SSE0          = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
1097             irsq_SSE1          = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
1098             irsq_SSE2          = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
1099             irsq_SSE3          = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
1100             idr4_SSE0          = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
1101             idr4_SSE1          = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
1102             idr4_SSE2          = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
1103             idr4_SSE3          = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
1104             idr6_SSE0          = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
1105             idr6_SSE1          = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
1106             idr6_SSE2          = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
1107             idr6_SSE3          = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
1108             
1109             raj_SSE            = _mm_load_ps(gb_radius+j);
1110             
1111             rvdw_SSE0          = _mm_add_ps(rai_SSE0,raj_SSE);
1112             rvdw_SSE1          = _mm_add_ps(rai_SSE1,raj_SSE);
1113             rvdw_SSE2          = _mm_add_ps(rai_SSE2,raj_SSE);
1114             rvdw_SSE3          = _mm_add_ps(rai_SSE3,raj_SSE);
1115             vaj_SSE            = _mm_load_ps(vsolv+j);
1116             
1117             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
1118             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
1119             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
1120             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
1121             
1122             ratio_SSE0         = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
1123             ratio_SSE1         = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
1124             ratio_SSE2         = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
1125             ratio_SSE3         = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
1126             theta_SSE0         = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
1127             theta_SSE1         = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
1128             theta_SSE2         = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
1129             theta_SSE3         = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
1130             gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);            
1131             gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);            
1132             gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);            
1133             gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);            
1134             term_SSE0          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
1135             term_SSE1          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
1136             term_SSE2          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
1137             term_SSE3          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
1138             ccf_SSE0           = _mm_mul_ps(term_SSE0,term_SSE0);
1139             ccf_SSE1           = _mm_mul_ps(term_SSE1,term_SSE1);
1140             ccf_SSE2           = _mm_mul_ps(term_SSE2,term_SSE2);
1141             ccf_SSE3           = _mm_mul_ps(term_SSE3,term_SSE3);
1142             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
1143                                             _mm_mul_ps(sinq_SSE0,theta_SSE0));
1144             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
1145                                             _mm_mul_ps(sinq_SSE1,theta_SSE1));
1146             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
1147                                             _mm_mul_ps(sinq_SSE2,theta_SSE2));
1148             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
1149                                             _mm_mul_ps(sinq_SSE3,theta_SSE3));
1150             
1151             prod_SSE           = _mm_mul_ps(still_p4_SSE,vaj_SSE );
1152             icf4_SSE0          = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
1153             icf4_SSE1          = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
1154             icf4_SSE2          = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
1155             icf4_SSE3          = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
1156             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1157             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1158             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
1159             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
1160             
1161             _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
1162                                              gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
1163                                                          _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
1164                                                          _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
1165                                                          _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
1166             
1167             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
1168             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
1169             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
1170             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
1171             
1172             /* Save ai->aj and aj->ai chain rule terms */
1173             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
1174             dadx+=4;
1175             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
1176             dadx+=4;
1177             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
1178             dadx+=4;
1179             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
1180             dadx+=4;            
1181             
1182             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
1183             dadx+=4;
1184             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
1185             dadx+=4;
1186             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
1187             dadx+=4;
1188             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
1189             dadx+=4;
1190         }
1191         /* Epilogue part, including exclusion mask */
1192         for(j=nj2; j<nj3; j+=UNROLLJ)
1193         {                     
1194             jmask_SSE0 = _mm_load_ps((real *)emask0);
1195             jmask_SSE1 = _mm_load_ps((real *)emask1);
1196             jmask_SSE2 = _mm_load_ps((real *)emask2);
1197             jmask_SSE3 = _mm_load_ps((real *)emask3);
1198             emask0 += UNROLLJ;
1199             emask1 += UNROLLJ;
1200             emask2 += UNROLLJ;
1201             emask3 += UNROLLJ;
1202             
1203             /* load j atom coordinates */
1204             jx_SSE            = _mm_load_ps(x_align+j);
1205             jy_SSE            = _mm_load_ps(y_align+j);
1206             jz_SSE            = _mm_load_ps(z_align+j);
1207             
1208             /* Calculate distance */
1209             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
1210             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
1211             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
1212             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
1213             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
1214             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
1215             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
1216             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
1217             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
1218             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
1219             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
1220             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
1221             
1222             /* rsq = dx*dx+dy*dy+dz*dz */
1223             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
1224             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
1225             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
1226             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
1227             
1228             /* Combine masks */
1229             jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
1230             jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
1231             jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
1232             jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
1233             
1234             /* Calculate 1/r and 1/r2 */
1235             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1236             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1237             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1238             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1239             
1240             /* Apply mask */
1241             rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
1242             rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
1243             rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
1244             rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
1245             
1246             irsq_SSE0          = _mm_mul_ps(rinv_SSE0,rinv_SSE0);
1247             irsq_SSE1          = _mm_mul_ps(rinv_SSE1,rinv_SSE1);
1248             irsq_SSE2          = _mm_mul_ps(rinv_SSE2,rinv_SSE2);
1249             irsq_SSE3          = _mm_mul_ps(rinv_SSE3,rinv_SSE3);
1250             idr4_SSE0          = _mm_mul_ps(irsq_SSE0,irsq_SSE0);
1251             idr4_SSE1          = _mm_mul_ps(irsq_SSE1,irsq_SSE1);
1252             idr4_SSE2          = _mm_mul_ps(irsq_SSE2,irsq_SSE2);
1253             idr4_SSE3          = _mm_mul_ps(irsq_SSE3,irsq_SSE3);
1254             idr6_SSE0          = _mm_mul_ps(idr4_SSE0,irsq_SSE0);
1255             idr6_SSE1          = _mm_mul_ps(idr4_SSE1,irsq_SSE1);
1256             idr6_SSE2          = _mm_mul_ps(idr4_SSE2,irsq_SSE2);
1257             idr6_SSE3          = _mm_mul_ps(idr4_SSE3,irsq_SSE3);
1258             
1259             raj_SSE            = _mm_load_ps(gb_radius+j);
1260             vaj_SSE            = _mm_load_ps(vsolv+j);
1261             
1262             rvdw_SSE0          = _mm_add_ps(rai_SSE0,raj_SSE);
1263             rvdw_SSE1          = _mm_add_ps(rai_SSE1,raj_SSE);
1264             rvdw_SSE2          = _mm_add_ps(rai_SSE2,raj_SSE);
1265             rvdw_SSE3          = _mm_add_ps(rai_SSE3,raj_SSE);
1266             
1267             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0,rvdw_SSE0)));
1268             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1,rvdw_SSE1)));
1269             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2,rvdw_SSE2)));
1270             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3,rvdw_SSE3)));
1271             
1272             ratio_SSE0         = _mm_min_ps(ratio_SSE0,still_p5inv_SSE);
1273             ratio_SSE1         = _mm_min_ps(ratio_SSE1,still_p5inv_SSE);
1274             ratio_SSE2         = _mm_min_ps(ratio_SSE2,still_p5inv_SSE);
1275             ratio_SSE3         = _mm_min_ps(ratio_SSE3,still_p5inv_SSE);
1276             theta_SSE0         = _mm_mul_ps(ratio_SSE0,still_pip5_SSE);
1277             theta_SSE1         = _mm_mul_ps(ratio_SSE1,still_pip5_SSE);
1278             theta_SSE2         = _mm_mul_ps(ratio_SSE2,still_pip5_SSE);
1279             theta_SSE3         = _mm_mul_ps(ratio_SSE3,still_pip5_SSE);
1280             gmx_mm_sincos_ps(theta_SSE0,&sinq_SSE0,&cosq_SSE0);            
1281             gmx_mm_sincos_ps(theta_SSE1,&sinq_SSE1,&cosq_SSE1);            
1282             gmx_mm_sincos_ps(theta_SSE2,&sinq_SSE2,&cosq_SSE2);            
1283             gmx_mm_sincos_ps(theta_SSE3,&sinq_SSE3,&cosq_SSE3);            
1284             term_SSE0          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE0));
1285             term_SSE1          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE1));
1286             term_SSE2          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE2));
1287             term_SSE3          = _mm_mul_ps(half_SSE,_mm_sub_ps(one_SSE,cosq_SSE3));
1288             ccf_SSE0           = _mm_mul_ps(term_SSE0,term_SSE0);
1289             ccf_SSE1           = _mm_mul_ps(term_SSE1,term_SSE1);
1290             ccf_SSE2           = _mm_mul_ps(term_SSE2,term_SSE2);
1291             ccf_SSE3           = _mm_mul_ps(term_SSE3,term_SSE3);
1292             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE0),
1293                                             _mm_mul_ps(sinq_SSE0,theta_SSE0));
1294             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE1),
1295                                             _mm_mul_ps(sinq_SSE1,theta_SSE1));
1296             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE2),
1297                                             _mm_mul_ps(sinq_SSE2,theta_SSE2));
1298             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE,term_SSE3),
1299                                             _mm_mul_ps(sinq_SSE3,theta_SSE3));
1300             
1301             prod_SSE           = _mm_mul_ps(still_p4_SSE,vaj_SSE);
1302             icf4_SSE0          = _mm_mul_ps(ccf_SSE0,idr4_SSE0);
1303             icf4_SSE1          = _mm_mul_ps(ccf_SSE1,idr4_SSE1);
1304             icf4_SSE2          = _mm_mul_ps(ccf_SSE2,idr4_SSE2);
1305             icf4_SSE3          = _mm_mul_ps(ccf_SSE3,idr4_SSE3);
1306             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
1307             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);
1308             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE2),dccf_SSE2), idr6_SSE2);
1309             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE,ccf_SSE3),dccf_SSE3), idr6_SSE3);
1310             
1311             _mm_store_ps(work+j , _mm_add_ps(_mm_load_ps(work+j),
1312                                              gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0,icf4_SSE0),
1313                                                          _mm_mul_ps(prod_ai_SSE1,icf4_SSE1),
1314                                                          _mm_mul_ps(prod_ai_SSE2,icf4_SSE2),
1315                                                          _mm_mul_ps(prod_ai_SSE3,icf4_SSE3))));
1316             
1317             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE,icf4_SSE0));
1318             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE,icf4_SSE1));
1319             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE,icf4_SSE2));
1320             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE,icf4_SSE3));
1321               
1322             /* Save ai->aj and aj->ai chain rule terms */
1323             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE0));
1324             dadx+=4;
1325             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE1));
1326             dadx+=4;
1327             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE2));
1328             dadx+=4;
1329             _mm_store_ps(dadx,_mm_mul_ps(prod_SSE,icf6_SSE3));
1330             dadx+=4;            
1331             
1332             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE0,icf6_SSE0));
1333             dadx+=4;
1334             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE1,icf6_SSE1));
1335             dadx+=4;
1336             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE2,icf6_SSE2));
1337             dadx+=4;
1338             _mm_store_ps(dadx,_mm_mul_ps(prod_ai_SSE3,icf6_SSE3));
1339             dadx+=4;
1340         }
1341         _MM_TRANSPOSE4_PS(gpi_SSE0,gpi_SSE1,gpi_SSE2,gpi_SSE3);
1342         gpi_SSE0 = _mm_add_ps(gpi_SSE0,gpi_SSE1);
1343         gpi_SSE2 = _mm_add_ps(gpi_SSE2,gpi_SSE3);
1344         gpi_SSE0 = _mm_add_ps(gpi_SSE0,gpi_SSE2);
1345         _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
1346         }    
1347     
1348     /* In case we have written anything beyond natoms, move it back.
1349      * Never mind that we leave stuff above natoms; that will not
1350      * be accessed later in the routine.
1351      * In principle this should be a move rather than sum, but this
1352      * way we dont have to worry about even/odd offsets...
1353      */
1354     for(i=natoms;i<ni1+1+natoms/2;i++)
1355     {
1356         work[i-natoms] += work[i];
1357     }
1358     
1359     /* Parallel summations */
1360         if(PARTDECOMP(cr))
1361         {
1362                 gmx_sum(natoms,work,cr);
1363         }
1364         
1365     factor  = 0.5 * ONE_4PI_EPS0;
1366         /* Calculate the radii - should we do all atoms, or just our local ones? */
1367         for(i=0;i<natoms;i++)
1368         {
1369                 if(born->use[i] != 0)
1370                 {
1371                         gpi  = born->gpol[i]+work[i];
1372                         gpi2 = gpi * gpi;
1373                         born->bRad[i]   = factor*gmx_invsqrt(gpi2);
1374                         fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1375                 }
1376         }
1377         
1378         return 0;
1379 }
1380
1381
1382
1383 int
1384 genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
1385                                                 t_mdatoms *            mdatoms,
1386                                                 gmx_genborn_t *        born,
1387                                                 int                    gb_algorithm,
1388                                                 gmx_localtop_t *       top,
1389                                                 real *                 x,
1390                                                 t_commrec *            cr,
1391                                                 void *                 paadata)
1392 {
1393         gmx_allvsallgb2_data_t *aadata;
1394         int        natoms;
1395         int        ni0,ni1;
1396         int        nj0,nj1,nj2,nj3;
1397         int        i,j,k,n;
1398     int *      mask;
1399     int *      pmask0;
1400     int *      pmask1;
1401     int *      pmask2;
1402     int *      pmask3;
1403     int *      emask0;
1404     int *      emask1;
1405     int *      emask2;
1406     int *      emask3;
1407     real *     gb_radius;
1408     real *     vsolv;
1409     real *     work;
1410     real       tmpsum[4];
1411     real *     x_align;
1412     real *     y_align;
1413     real *     z_align;
1414     int *      jindex;
1415     real *     dadx;
1416     real *     obc_param;
1417     real       rad,min_rad;
1418     real rai,rai_inv,rai_inv2,sum_ai,sum_ai2,sum_ai3,tsum,tchain;
1419
1420     __m128     ix_SSE0,iy_SSE0,iz_SSE0;
1421     __m128     ix_SSE1,iy_SSE1,iz_SSE1;
1422     __m128     ix_SSE2,iy_SSE2,iz_SSE2;
1423     __m128     ix_SSE3,iy_SSE3,iz_SSE3;
1424     __m128     gpi_SSE0,rai_SSE0,prod_ai_SSE0;
1425     __m128     gpi_SSE1,rai_SSE1,prod_ai_SSE1;
1426     __m128     gpi_SSE2,rai_SSE2,prod_ai_SSE2;
1427     __m128     gpi_SSE3,rai_SSE3,prod_ai_SSE3;
1428     __m128     imask_SSE0,jmask_SSE0;
1429     __m128     imask_SSE1,jmask_SSE1;
1430     __m128     imask_SSE2,jmask_SSE2;
1431     __m128     imask_SSE3,jmask_SSE3;
1432     __m128     jx_SSE,jy_SSE,jz_SSE;
1433     __m128     dx_SSE0,dy_SSE0,dz_SSE0;
1434     __m128     dx_SSE1,dy_SSE1,dz_SSE1;
1435     __m128     dx_SSE2,dy_SSE2,dz_SSE2;
1436     __m128     dx_SSE3,dy_SSE3,dz_SSE3;
1437     __m128     rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
1438     __m128     rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
1439     __m128     rsq_SSE2,rinv_SSE2,irsq_SSE2,idr4_SSE2,idr6_SSE2;
1440     __m128     rsq_SSE3,rinv_SSE3,irsq_SSE3,idr4_SSE3,idr6_SSE3;
1441     __m128     raj_SSE,raj_inv_SSE,sk_aj_SSE,sk2_aj_SSE;
1442     __m128     ccf_SSE0,dccf_SSE0,prod_SSE0;
1443     __m128     ccf_SSE1,dccf_SSE1,prod_SSE1;
1444     __m128     ccf_SSE2,dccf_SSE2,prod_SSE2;
1445     __m128     ccf_SSE3,dccf_SSE3,prod_SSE3;
1446     __m128     icf4_SSE0,icf6_SSE0;
1447     __m128     icf4_SSE1,icf6_SSE1;
1448     __m128     icf4_SSE2,icf6_SSE2;
1449     __m128     icf4_SSE3,icf6_SSE3;
1450     __m128     oneeighth_SSE,onefourth_SSE,half_SSE,one_SSE,two_SSE,four_SSE;
1451     __m128     still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
1452     __m128     rai_inv_SSE0;
1453     __m128     rai_inv_SSE1;
1454     __m128     rai_inv_SSE2;
1455     __m128     rai_inv_SSE3;
1456     __m128     sk_ai_SSE0,sk2_ai_SSE0,sum_ai_SSE0;
1457     __m128     sk_ai_SSE1,sk2_ai_SSE1,sum_ai_SSE1;
1458     __m128     sk_ai_SSE2,sk2_ai_SSE2,sum_ai_SSE2;
1459     __m128     sk_ai_SSE3,sk2_ai_SSE3,sum_ai_SSE3;
1460     __m128     lij_inv_SSE0,sk2_rinv_SSE0;
1461     __m128     lij_inv_SSE1,sk2_rinv_SSE1;
1462     __m128     lij_inv_SSE2,sk2_rinv_SSE2;
1463     __m128     lij_inv_SSE3,sk2_rinv_SSE3;
1464     __m128     dr_SSE0;
1465     __m128     dr_SSE1;
1466     __m128     dr_SSE2;
1467     __m128     dr_SSE3;
1468     __m128     t1_SSE0,t2_SSE0,t3_SSE0,t4_SSE0;
1469     __m128     t1_SSE1,t2_SSE1,t3_SSE1,t4_SSE1;
1470     __m128     t1_SSE2,t2_SSE2,t3_SSE2,t4_SSE2;
1471     __m128     t1_SSE3,t2_SSE3,t3_SSE3,t4_SSE3;
1472     __m128     obc_mask1_SSE0,obc_mask2_SSE0,obc_mask3_SSE0;
1473     __m128     obc_mask1_SSE1,obc_mask2_SSE1,obc_mask3_SSE1;
1474     __m128     obc_mask1_SSE2,obc_mask2_SSE2,obc_mask3_SSE2;
1475     __m128     obc_mask1_SSE3,obc_mask2_SSE3,obc_mask3_SSE3;
1476     __m128     uij_SSE0,uij2_SSE0,uij3_SSE0;
1477     __m128     uij_SSE1,uij2_SSE1,uij3_SSE1;
1478     __m128     uij_SSE2,uij2_SSE2,uij3_SSE2;
1479     __m128     uij_SSE3,uij2_SSE3,uij3_SSE3;
1480     __m128     lij_SSE0,lij2_SSE0,lij3_SSE0;
1481     __m128     lij_SSE1,lij2_SSE1,lij3_SSE1;
1482     __m128     lij_SSE2,lij2_SSE2,lij3_SSE2;
1483     __m128     lij_SSE3,lij2_SSE3,lij3_SSE3;
1484     __m128     dlij_SSE0,diff2_SSE0,logterm_SSE0;
1485     __m128     dlij_SSE1,diff2_SSE1,logterm_SSE1;
1486     __m128     dlij_SSE2,diff2_SSE2,logterm_SSE2;
1487     __m128     dlij_SSE3,diff2_SSE3,logterm_SSE3;
1488     __m128     doffset_SSE;
1489     
1490     natoms              = mdatoms->nr;
1491         ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
1492         ni1                 = mdatoms->start+mdatoms->homenr;
1493     
1494     n = 0;
1495     
1496     aadata = *((gmx_allvsallgb2_data_t **)paadata);
1497     
1498     
1499         if(aadata==NULL)
1500         {
1501                 genborn_allvsall_setup(&aadata,top,born,mdatoms,born->gb_doffset,
1502                                egbOBC,TRUE,TRUE,TRUE);
1503         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
1504         }
1505     
1506     x_align = aadata->x_align;
1507         y_align = aadata->y_align;
1508         z_align = aadata->z_align;
1509     
1510     gb_radius = aadata->gb_radius;
1511     work      = aadata->work;
1512     jindex    = aadata->jindex_gb;
1513     dadx      = fr->dadx;
1514     obc_param = aadata->workparam;
1515     
1516     oneeighth_SSE   = _mm_set1_ps(0.125);
1517     onefourth_SSE   = _mm_set1_ps(0.25);
1518     half_SSE        = _mm_set1_ps(0.5);
1519     one_SSE         = _mm_set1_ps(1.0);
1520     two_SSE         = _mm_set1_ps(2.0);
1521     four_SSE        = _mm_set1_ps(4.0);
1522     doffset_SSE     = _mm_set1_ps(born->gb_doffset);
1523     
1524     for(i=0;i<natoms;i++)
1525         {
1526                 x_align[i]  = x[3*i];
1527                 y_align[i]  = x[3*i+1];
1528                 z_align[i]  = x[3*i+2];
1529         }
1530     
1531     /* Copy again */
1532         for(i=0;i<natoms/2+1;i++)
1533         {
1534                 x_align[natoms+i]  = x_align[i];
1535                 y_align[natoms+i]  = y_align[i];
1536                 z_align[natoms+i]  = z_align[i];
1537     }        
1538     
1539     for(i=0;i<natoms+natoms/2+1;i++)
1540     {
1541         work[i] = 0;
1542     }
1543         
1544         for(i=ni0; i<ni1; i+= UNROLLI)
1545         {
1546                 /* We assume shifts are NOT used for all-vs-all interactions */
1547         
1548                 /* Load i atom data */
1549                 ix_SSE0          = _mm_load1_ps(x_align+i);
1550                 iy_SSE0          = _mm_load1_ps(y_align+i);
1551                 iz_SSE0          = _mm_load1_ps(z_align+i);
1552                 ix_SSE1          = _mm_load1_ps(x_align+i+1);
1553                 iy_SSE1          = _mm_load1_ps(y_align+i+1);
1554                 iz_SSE1          = _mm_load1_ps(z_align+i+1);
1555                 ix_SSE2          = _mm_load1_ps(x_align+i+2);
1556                 iy_SSE2          = _mm_load1_ps(y_align+i+2);
1557                 iz_SSE2          = _mm_load1_ps(z_align+i+2);
1558                 ix_SSE3          = _mm_load1_ps(x_align+i+3);
1559                 iy_SSE3          = _mm_load1_ps(y_align+i+3);
1560                 iz_SSE3          = _mm_load1_ps(z_align+i+3);
1561         
1562         rai_SSE0         = _mm_load1_ps(gb_radius+i);
1563         rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
1564         rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
1565         rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
1566         rai_inv_SSE0     = gmx_mm_inv_ps(rai_SSE0);
1567         rai_inv_SSE1     = gmx_mm_inv_ps(rai_SSE1);
1568         rai_inv_SSE2     = gmx_mm_inv_ps(rai_SSE2);
1569         rai_inv_SSE3     = gmx_mm_inv_ps(rai_SSE3);
1570         
1571         sk_ai_SSE0       = _mm_load1_ps(obc_param+i);
1572         sk_ai_SSE1       = _mm_load1_ps(obc_param+i+1);
1573         sk_ai_SSE2       = _mm_load1_ps(obc_param+i+2);
1574         sk_ai_SSE3       = _mm_load1_ps(obc_param+i+3);
1575         sk2_ai_SSE0      = _mm_mul_ps(sk_ai_SSE0,sk_ai_SSE0);
1576         sk2_ai_SSE1      = _mm_mul_ps(sk_ai_SSE1,sk_ai_SSE1);
1577         sk2_ai_SSE2      = _mm_mul_ps(sk_ai_SSE2,sk_ai_SSE2);
1578         sk2_ai_SSE3      = _mm_mul_ps(sk_ai_SSE3,sk_ai_SSE3);
1579         
1580         sum_ai_SSE0      = _mm_setzero_ps();
1581         sum_ai_SSE1      = _mm_setzero_ps();
1582         sum_ai_SSE2      = _mm_setzero_ps();
1583         sum_ai_SSE3      = _mm_setzero_ps();
1584         
1585                 /* Load limits for loop over neighbors */
1586                 nj0              = jindex[4*i];
1587                 nj1              = jindex[4*i+1];
1588                 nj2              = jindex[4*i+2];
1589                 nj3              = jindex[4*i+3];
1590         
1591         pmask0           = aadata->prologue_mask_gb[i];
1592         pmask1           = aadata->prologue_mask_gb[i+1];
1593         pmask2           = aadata->prologue_mask_gb[i+2];
1594         pmask3           = aadata->prologue_mask_gb[i+3];
1595         emask0           = aadata->epilogue_mask[i]; 
1596         emask1           = aadata->epilogue_mask[i+1]; 
1597         emask2           = aadata->epilogue_mask[i+2]; 
1598         emask3           = aadata->epilogue_mask[i+3]; 
1599         
1600         imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
1601         imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
1602         imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
1603         imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
1604         
1605         /* Prologue part, including exclusion mask */
1606         for(j=nj0; j<nj1; j+=UNROLLJ)
1607         {          
1608             jmask_SSE0 = _mm_load_ps((real *)pmask0);
1609             jmask_SSE1 = _mm_load_ps((real *)pmask1);
1610             jmask_SSE2 = _mm_load_ps((real *)pmask2);
1611             jmask_SSE3 = _mm_load_ps((real *)pmask3);
1612             pmask0 += UNROLLJ;
1613             pmask1 += UNROLLJ;
1614             pmask2 += UNROLLJ;
1615             pmask3 += UNROLLJ;
1616             
1617             /* load j atom coordinates */
1618             jx_SSE            = _mm_load_ps(x_align+j);
1619             jy_SSE            = _mm_load_ps(y_align+j);
1620             jz_SSE            = _mm_load_ps(z_align+j);
1621             
1622             /* Calculate distance */
1623             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
1624             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
1625             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
1626             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
1627             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
1628             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
1629             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
1630             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
1631             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
1632             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
1633             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
1634             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
1635             
1636             /* rsq = dx*dx+dy*dy+dz*dz */
1637             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
1638             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
1639             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
1640             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
1641             
1642             /* Combine masks */
1643             jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
1644             jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
1645             jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
1646             jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
1647             
1648             /* Calculate 1/r and 1/r2 */
1649             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1650             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1651             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1652             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1653             
1654             /* Apply mask */
1655             rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
1656             rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
1657             rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
1658             rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
1659             
1660             dr_SSE0            = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
1661             dr_SSE1            = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
1662             dr_SSE2            = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
1663             dr_SSE3            = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
1664
1665             sk_aj_SSE          = _mm_load_ps(obc_param+j);
1666             raj_SSE            = _mm_load_ps(gb_radius+j);
1667             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
1668             
1669             /* Evaluate influence of atom aj -> ai */
1670             t1_SSE0            = _mm_add_ps(dr_SSE0,sk_aj_SSE);
1671             t1_SSE1            = _mm_add_ps(dr_SSE1,sk_aj_SSE);
1672             t1_SSE2            = _mm_add_ps(dr_SSE2,sk_aj_SSE);
1673             t1_SSE3            = _mm_add_ps(dr_SSE3,sk_aj_SSE);
1674             t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
1675             t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
1676             t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
1677             t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
1678             t3_SSE0            = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
1679             t3_SSE1            = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
1680             t3_SSE2            = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
1681             t3_SSE3            = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
1682
1683             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
1684             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
1685             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
1686             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
1687             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
1688             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
1689             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
1690             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
1691             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
1692             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
1693             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
1694             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
1695             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
1696             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
1697             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
1698             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
1699
1700             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
1701             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
1702             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
1703             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
1704             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
1705                                            _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
1706             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
1707                                            _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
1708             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
1709                                            _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
1710             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
1711                                            _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
1712             dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
1713             dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
1714             dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
1715             dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
1716             
1717             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
1718             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
1719             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
1720             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
1721             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
1722             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
1723             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
1724             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
1725             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
1726             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
1727             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
1728             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
1729             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
1730             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
1731             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
1732             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
1733             
1734             diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
1735             diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
1736             diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
1737             diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
1738             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
1739             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
1740             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
1741             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
1742             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
1743             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
1744             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
1745             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
1746             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
1747             prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
1748             prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
1749             prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
1750             prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
1751             
1752             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
1753             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
1754             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
1755             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
1756
1757             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
1758             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
1759             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
1760             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
1761             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
1762                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
1763                                                        prod_SSE0));
1764             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
1765                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
1766                                                        prod_SSE1));
1767             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
1768                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
1769                                                        prod_SSE2));
1770             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
1771                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
1772                                                        prod_SSE3));
1773             
1774             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
1775             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
1776             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
1777             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
1778             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
1779             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
1780             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
1781             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
1782             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
1783             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
1784             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
1785             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
1786             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
1787             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
1788             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
1789             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
1790             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
1791             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
1792             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
1793             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
1794             
1795             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
1796             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
1797             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
1798             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
1799             
1800             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
1801                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
1802             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
1803                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
1804             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
1805                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
1806             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
1807                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
1808             t1_SSE0            = _mm_sub_ps(t1_SSE0,
1809                                             _mm_mul_ps(onefourth_SSE,
1810                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
1811                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
1812             t1_SSE1            = _mm_sub_ps(t1_SSE1,
1813                                             _mm_mul_ps(onefourth_SSE,
1814                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
1815                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
1816             t1_SSE2            = _mm_sub_ps(t1_SSE2,
1817                                             _mm_mul_ps(onefourth_SSE,
1818                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
1819                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
1820             t1_SSE3            = _mm_sub_ps(t1_SSE3,
1821                                             _mm_mul_ps(onefourth_SSE,
1822                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
1823                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
1824
1825             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
1826                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
1827                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
1828             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
1829                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
1830                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
1831             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
1832                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
1833                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
1834             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
1835                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
1836                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
1837             t2_SSE0            = _mm_sub_ps(t2_SSE0,
1838                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
1839                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
1840             t2_SSE1            = _mm_sub_ps(t2_SSE1,
1841                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
1842                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
1843             t2_SSE2            = _mm_sub_ps(t2_SSE2,
1844                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
1845                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
1846             t2_SSE3            = _mm_sub_ps(t2_SSE3,
1847                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
1848                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
1849             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
1850                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
1851             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
1852                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
1853             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
1854                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
1855             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
1856                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
1857             t3_SSE0            = _mm_sub_ps(t3_SSE0,
1858                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
1859                                                        _mm_add_ps(one_SSE,
1860                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
1861             t3_SSE1            = _mm_sub_ps(t3_SSE1,
1862                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
1863                                                        _mm_add_ps(one_SSE,
1864                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
1865             t3_SSE2            = _mm_sub_ps(t3_SSE2,
1866                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
1867                                                        _mm_add_ps(one_SSE,
1868                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
1869             t3_SSE3            = _mm_sub_ps(t3_SSE3,
1870                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
1871                                                        _mm_add_ps(one_SSE,
1872                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
1873             
1874             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
1875                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
1876                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
1877             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
1878                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
1879                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
1880             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
1881                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
1882                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
1883             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
1884                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
1885                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
1886             
1887             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
1888             dadx += 4;
1889             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
1890             dadx += 4;
1891             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
1892             dadx += 4;
1893             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
1894             dadx += 4;
1895             
1896             /* Evaluate influence of atom ai -> aj */
1897             t1_SSE0            = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
1898             t1_SSE1            = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
1899             t1_SSE2            = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
1900             t1_SSE3            = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
1901             t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
1902             t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
1903             t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
1904             t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
1905             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
1906             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
1907             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
1908             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
1909             
1910             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
1911             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
1912             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
1913             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
1914             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
1915             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
1916             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
1917             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
1918             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
1919             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
1920             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
1921             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
1922             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
1923             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
1924             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
1925             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
1926             
1927             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
1928             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
1929             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
1930             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
1931             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
1932                                            _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
1933             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
1934                                            _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
1935             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
1936                                            _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
1937             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
1938                                            _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
1939             dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
1940             dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
1941             dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
1942             dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
1943             
1944             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
1945             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
1946             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
1947             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
1948             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
1949             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
1950             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
1951             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
1952             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
1953             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
1954             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
1955             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
1956             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
1957             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
1958             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
1959             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
1960             
1961             diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
1962             diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
1963             diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
1964             diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
1965             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
1966             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
1967             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
1968             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
1969             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
1970             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
1971             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
1972             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
1973             prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
1974             prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
1975             prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
1976             prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
1977             
1978             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
1979             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
1980             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
1981             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
1982             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
1983             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
1984             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
1985             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
1986             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
1987                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
1988                                                        prod_SSE0));
1989             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
1990                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
1991                                                        prod_SSE1));
1992             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
1993                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
1994                                                        prod_SSE2));
1995             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
1996                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
1997                                                        prod_SSE3));
1998             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
1999             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2000             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2001             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2002             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2003             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2004             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2005             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2006             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
2007             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
2008             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
2009             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
2010             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2011             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2012             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2013             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2014             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2015             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2016             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2017             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2018             
2019             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2020                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
2021                                                         _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
2022                                                         _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
2023                                                         _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
2024             
2025             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2026                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
2027             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2028                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
2029             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2030                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
2031             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2032                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
2033             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2034                                             _mm_mul_ps(onefourth_SSE,
2035                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2036                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2037             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2038                                             _mm_mul_ps(onefourth_SSE,
2039                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2040                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2041             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2042                                             _mm_mul_ps(onefourth_SSE,
2043                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2044                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2045             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2046                                             _mm_mul_ps(onefourth_SSE,
2047                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2048                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2049             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2050                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2051                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2052             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2053                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2054                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2055             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2056                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2057                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2058             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2059                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2060                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2061             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2062                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2063                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2064             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2065                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2066                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2067             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2068                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2069                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2070             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2071                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2072                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2073             
2074             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2075                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2076             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2077                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2078             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2079                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2080             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2081                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2082             
2083             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2084                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2085                                                        _mm_add_ps(one_SSE,
2086                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2087             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2088                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2089                                                        _mm_add_ps(one_SSE,
2090                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2091             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2092                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2093                                                        _mm_add_ps(one_SSE,
2094                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2095             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2096                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2097                                                        _mm_add_ps(one_SSE,
2098                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2099             
2100             
2101             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2102                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2103                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
2104             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2105                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2106                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
2107             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2108                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2109                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
2110             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2111                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2112                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
2113
2114             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2115             dadx += 4;
2116             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2117             dadx += 4;
2118             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2119             dadx += 4;
2120             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2121             dadx += 4;
2122             
2123         }
2124         
2125         /* Main part, no exclusions */
2126         for(j=nj1; j<nj2; j+=UNROLLJ)
2127         {                      
2128             /* load j atom coordinates */
2129             jx_SSE            = _mm_load_ps(x_align+j);
2130             jy_SSE            = _mm_load_ps(y_align+j);
2131             jz_SSE            = _mm_load_ps(z_align+j);
2132             
2133             /* Calculate distance */
2134             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
2135             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
2136             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
2137             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
2138             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
2139             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
2140             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
2141             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
2142             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
2143             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
2144             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
2145             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
2146             
2147             /* rsq = dx*dx+dy*dy+dz*dz */
2148             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2149             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2150             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2151             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2152             
2153             /* Calculate 1/r and 1/r2 */
2154             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
2155             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
2156             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
2157             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
2158             
2159             /* Apply mask */
2160             rinv_SSE0          = _mm_and_ps(rinv_SSE0,imask_SSE0);
2161             rinv_SSE1          = _mm_and_ps(rinv_SSE1,imask_SSE1);
2162             rinv_SSE2          = _mm_and_ps(rinv_SSE2,imask_SSE2);
2163             rinv_SSE3          = _mm_and_ps(rinv_SSE3,imask_SSE3);
2164             
2165             dr_SSE0            = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
2166             dr_SSE1            = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
2167             dr_SSE2            = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
2168             dr_SSE3            = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
2169             
2170             sk_aj_SSE          = _mm_load_ps(obc_param+j);
2171             raj_SSE            = _mm_load_ps(gb_radius+j);
2172
2173             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
2174             
2175             /* Evaluate influence of atom aj -> ai */
2176             t1_SSE0            = _mm_add_ps(dr_SSE0,sk_aj_SSE);
2177             t1_SSE1            = _mm_add_ps(dr_SSE1,sk_aj_SSE);
2178             t1_SSE2            = _mm_add_ps(dr_SSE2,sk_aj_SSE);
2179             t1_SSE3            = _mm_add_ps(dr_SSE3,sk_aj_SSE);
2180             t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
2181             t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
2182             t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
2183             t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
2184             t3_SSE0            = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
2185             t3_SSE1            = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
2186             t3_SSE2            = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
2187             t3_SSE3            = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
2188             
2189             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2190             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2191             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2192             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2193             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2194             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2195             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2196             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2197             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2198             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2199             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2200             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2201             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,imask_SSE0);
2202             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,imask_SSE1);
2203             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,imask_SSE2);
2204             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,imask_SSE3);
2205             
2206             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2207             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2208             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2209             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2210             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2211                                            _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
2212             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2213                                            _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
2214             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2215                                            _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
2216             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2217                                            _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
2218             dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2219             dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2220             dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2221             dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2222             
2223             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2224             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2225             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2226             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2227             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2228             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2229             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2230             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2231             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2232             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2233             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2234             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2235             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2236             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2237             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2238             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2239             
2240             diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2241             diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2242             diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2243             diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2244             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2245             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2246             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2247             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2248             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
2249             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
2250             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
2251             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
2252             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
2253             prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2254             prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2255             prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2256             prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2257             
2258             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2259             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2260             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2261             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2262             
2263             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
2264             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
2265             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
2266             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
2267             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2268                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2269                                                        prod_SSE0));
2270             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2271                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2272                                                        prod_SSE1));
2273             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2274                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2275                                                        prod_SSE2));
2276             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2277                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2278                                                        prod_SSE3));
2279             
2280             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2281             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2282             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2283             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2284             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2285             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2286             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2287             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2288             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
2289             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
2290             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
2291             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
2292             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2293             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2294             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2295             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2296             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2297             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2298             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2299             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2300             
2301             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2302             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2303             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2304             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2305             
2306             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2307                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
2308             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2309                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
2310             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2311                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
2312             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2313                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
2314             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2315                                             _mm_mul_ps(onefourth_SSE,
2316                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2317                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2318             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2319                                             _mm_mul_ps(onefourth_SSE,
2320                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2321                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2322             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2323                                             _mm_mul_ps(onefourth_SSE,
2324                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2325                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2326             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2327                                             _mm_mul_ps(onefourth_SSE,
2328                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2329                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2330             
2331             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2332                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2333                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2334             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2335                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2336                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2337             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
                 /* NOTE(review): tail of a 4-way-unrolled SIMD inner loop; the
                  * enclosing function header and the setup of uij*/lij*/prod*/
                  * dlij*/obc_mask1* registers lie above this excerpt. Suffixes
                  * 0-3 are the four unrolled i-atom SSE registers. */
2338                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2339                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2340             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2341                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2342                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
                 /* t2 -= uij^2/2 + prod*uij^3 (remaining upper-bound terms) */
2343             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2344                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2345                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2346             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2347                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2348                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2349             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2350                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2351                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2352             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2353                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2354                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
                 /* t3 = 0.25*logterm*rinv^2, then subtract
                  * diff2/8 * (1 + sk2_rinv*rinv)  (sk2_rinv set above excerpt) */
2355             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2356                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2357             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2358                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2359             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2360                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2361             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2362                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2363             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2364                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2365                                                        _mm_add_ps(one_SSE,
2366                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2367             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2368                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2369                                                        _mm_add_ps(one_SSE,
2370                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2371             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2372                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2373                                                        _mm_add_ps(one_SSE,
2374                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2375             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2376                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2377                                                        _mm_add_ps(one_SSE,
2378                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2379             
                 /* Final per-pair radial derivative: rinv * (dlij*t1 + t2 + t3) */
2380             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2381                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2382                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
2383             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2384                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2385                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
2386             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2387                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2388                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
2389             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2390                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2391                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
2392             
                 /* Stream the four derivative vectors out to dadx (4 floats per
                  * store); lanes where obc_mask1 is false are zeroed by the AND. */
2393             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2394             dadx += 4;
2395             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2396             dadx += 4;
2397             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2398             dadx += 4;
2399             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2400             dadx += 4;
2401             
2402             /* Evaluate influence of atom ai -> aj */
                 /* Symmetric half of the pair term: the four unrolled i atoms now
                  * descreen the j atoms, using the i screening radii sk_ai and the
                  * j Born radius raj. Structure mirrors the aj->ai half above. */
2403             t1_SSE0            = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
2404             t1_SSE1            = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
2405             t1_SSE2            = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
2406             t1_SSE3            = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
2407             t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
2408             t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
2409             t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
2410             t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
2411             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
2412             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
2413             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
2414             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
2415             
                 /* obc_mask1: raj < dr+sk_ai  (pair contributes at all)
                  * obc_mask2: raj < dr-sk_ai  (select 1/(dr-sk) vs 1/raj for lij)
                  * obc_mask3: raj < sk_ai-dr  (j buried inside i; extra t4 term) */
2416             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2417             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2418             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2419             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2420             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2421             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2422             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2423             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2424             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2425             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2426             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2427             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
                 /* imask* (set above this excerpt) additionally zeroes lanes not
                  * valid for this unrolled tile. */
2428             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,imask_SSE0);
2429             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,imask_SSE1);
2430             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,imask_SSE2);
2431             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,imask_SSE3);
2432             
                 /* uij = 1/(dr+sk_ai); lij = 1/(dr-sk_ai) where obc_mask2 holds,
                  * else 1/raj (blend via AND/ANDNOT/OR). dlij is 1.0 only where
                  * obc_mask2 holds (lij actually depends on r there). */
2433             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2434             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2435             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2436             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2437             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2438                                            _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
2439             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2440                                            _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
2441             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2442                                            _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
2443             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2444                                            _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
2445             dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2446             dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2447             dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2448             dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2449             
                 /* Powers of the integration bounds used repeatedly below */
2450             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2451             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2452             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2453             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2454             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2455             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2456             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2457             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2458             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2459             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2460             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2461             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2462             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2463             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2464             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2465             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2466             
2467             diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2468             diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2469             diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2470             diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2471             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2472             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2473             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2474             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
                 /* NOTE(review): sk2_ai_SSE* (squared i screening radii) are
                  * prepared above this excerpt. prod = sk_ai^2/(4r). */
2475             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
2476             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
2477             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
2478             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
2479             prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2480             prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2481             prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2482             prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2483             
                 /* logterm = log(uij/lij); t1 accumulates the pair integral:
                  * t1 = lij - uij + diff2*(dr/4 - prod) + 0.5*rinv*logterm,
                  * plus t4 = 2*(1/raj - lij) only where j is buried (mask3),
                  * and the whole sum is halved. */
2484             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2485             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2486             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2487             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2488             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
2489             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
2490             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
2491             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
2492             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2493                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2494                                                        prod_SSE0));
2495             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2496                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2497                                                        prod_SSE1));
2498             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2499                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2500                                                        prod_SSE2));
2501             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2502                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2503                                                        prod_SSE3));
2504             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2505             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2506             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2507             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2508             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2509             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2510             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2511             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2512             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
2513             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
2514             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
2515             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
2516             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2517             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2518             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2519             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2520             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2521             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2522             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2523             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2524             
                 /* Read-modify-write of the j-atom Born sums in work[j..j+3]:
                  * gmx_mm_sum4_ps (helper defined elsewhere, presumably an
                  * elementwise sum of the four vectors) combines the masked
                  * contributions of the four i atoms. */
2525             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2526                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
2527                                                         _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
2528                                                         _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
2529                                                         _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
2530             
                 /* Radial derivative of the ai->aj term, same term-by-term
                  * structure as the aj->ai half: t1 (lower bound), t2 (upper
                  * bound), t3 (log + diff2 correction), then
                  * rinv*(dlij*t1 + t2 + t3), masked and streamed to dadx. */
2531             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2532                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
2533             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2534                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
2535             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2536                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
2537             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2538                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
2539             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2540                                             _mm_mul_ps(onefourth_SSE,
2541                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2542                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
2543             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2544                                             _mm_mul_ps(onefourth_SSE,
2545                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2546                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2547             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2548                                             _mm_mul_ps(onefourth_SSE,
2549                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2550                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2551             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2552                                             _mm_mul_ps(onefourth_SSE,
2553                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2554                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2555             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2556                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2557                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2558             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2559                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2560                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2561             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2562                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2563                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2564             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2565                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2566                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2567             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2568                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2569                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2570             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2571                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2572                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2573             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2574                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2575                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2576             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2577                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2578                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2579             
2580             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2581                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2582             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2583                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2584             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2585                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2586             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2587                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2588             
2589             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2590                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2591                                                        _mm_add_ps(one_SSE,
2592                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2593             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2594                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2595                                                        _mm_add_ps(one_SSE,
2596                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2597             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2598                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2599                                                        _mm_add_ps(one_SSE,
2600                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2601             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2602                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2603                                                        _mm_add_ps(one_SSE,
2604                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2605             
2606             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2607                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2608                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
2609             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2610                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2611                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
2612             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2613                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2614                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
2615             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2616                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2617                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
2618             
2619             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2620             dadx += 4;
2621             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2622             dadx += 4;
2623             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2624             dadx += 4;
2625             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2626             dadx += 4;
2627         }
2628
2629         /* Epilogue part, including exclusion mask */
             /* Final j-tiles (nj2..nj3) may contain excluded i-j pairs, so the
              * per-i exclusion masks emask0..emask3 are loaded and combined with
              * the tile validity masks before anything is accumulated. The loop
              * body is otherwise identical to the main loop above. */
2630         for(j=nj2; j<nj3; j+=UNROLLJ)
2631         {                      
2632             jmask_SSE0 = _mm_load_ps((real *)emask0);
2633             jmask_SSE1 = _mm_load_ps((real *)emask1);
2634             jmask_SSE2 = _mm_load_ps((real *)emask2);
2635             jmask_SSE3 = _mm_load_ps((real *)emask3);
2636             emask0 += UNROLLJ;
2637             emask1 += UNROLLJ;
2638             emask2 += UNROLLJ;
2639             emask3 += UNROLLJ;
2640             
2641             /* load j atom coordinates */
2642             jx_SSE            = _mm_load_ps(x_align+j);
2643             jy_SSE            = _mm_load_ps(y_align+j);
2644             jz_SSE            = _mm_load_ps(z_align+j);
2645             
2646             /* Calculate distance */
2647             dx_SSE0            = _mm_sub_ps(ix_SSE0,jx_SSE);
2648             dy_SSE0            = _mm_sub_ps(iy_SSE0,jy_SSE);
2649             dz_SSE0            = _mm_sub_ps(iz_SSE0,jz_SSE);
2650             dx_SSE1            = _mm_sub_ps(ix_SSE1,jx_SSE);
2651             dy_SSE1            = _mm_sub_ps(iy_SSE1,jy_SSE);
2652             dz_SSE1            = _mm_sub_ps(iz_SSE1,jz_SSE);
2653             dx_SSE2            = _mm_sub_ps(ix_SSE2,jx_SSE);
2654             dy_SSE2            = _mm_sub_ps(iy_SSE2,jy_SSE);
2655             dz_SSE2            = _mm_sub_ps(iz_SSE2,jz_SSE);
2656             dx_SSE3            = _mm_sub_ps(ix_SSE3,jx_SSE);
2657             dy_SSE3            = _mm_sub_ps(iy_SSE3,jy_SSE);
2658             dz_SSE3            = _mm_sub_ps(iz_SSE3,jz_SSE);
2659             
2660             /* rsq = dx*dx+dy*dy+dz*dz */
2661             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
2662             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
2663             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
2664             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
2665             
2666             /* Combine masks */
2667             jmask_SSE0         = _mm_and_ps(jmask_SSE0,imask_SSE0);
2668             jmask_SSE1         = _mm_and_ps(jmask_SSE1,imask_SSE1);
2669             jmask_SSE2         = _mm_and_ps(jmask_SSE2,imask_SSE2);
2670             jmask_SSE3         = _mm_and_ps(jmask_SSE3,imask_SSE3);
2671             
2672             /* Calculate 1/r and 1/r2 */
2673             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
2674             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
2675             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
2676             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
2677             
2678             /* Apply mask */
                 /* Zeroing rinv for masked-out lanes makes dr = rsq*rinv = 0
                  * there, so excluded pairs drop out of everything downstream. */
2679             rinv_SSE0          = _mm_and_ps(rinv_SSE0,jmask_SSE0);
2680             rinv_SSE1          = _mm_and_ps(rinv_SSE1,jmask_SSE1);
2681             rinv_SSE2          = _mm_and_ps(rinv_SSE2,jmask_SSE2);
2682             rinv_SSE3          = _mm_and_ps(rinv_SSE3,jmask_SSE3);
2683             
2684             dr_SSE0            = _mm_mul_ps(rsq_SSE0,rinv_SSE0);
2685             dr_SSE1            = _mm_mul_ps(rsq_SSE1,rinv_SSE1);
2686             dr_SSE2            = _mm_mul_ps(rsq_SSE2,rinv_SSE2);
2687             dr_SSE3            = _mm_mul_ps(rsq_SSE3,rinv_SSE3);
2688             
                 /* Per-j parameters: sk_aj from obc_param[], Born radius raj
                  * from gb_radius[] (both aligned, filled above this excerpt). */
2689             sk_aj_SSE          = _mm_load_ps(obc_param+j);
2690             raj_SSE            = _mm_load_ps(gb_radius+j);
2691
2692             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
2693             
2694             /* Evaluate influence of atom aj -> ai */
                 /* Same three interval tests as the main loop, but against the
                  * i Born radii rai_SSE0..3 and combined with the exclusion-aware
                  * jmask instead of the plain tile mask. */
2695             t1_SSE0            = _mm_add_ps(dr_SSE0,sk_aj_SSE);
2696             t1_SSE1            = _mm_add_ps(dr_SSE1,sk_aj_SSE);
2697             t1_SSE2            = _mm_add_ps(dr_SSE2,sk_aj_SSE);
2698             t1_SSE3            = _mm_add_ps(dr_SSE3,sk_aj_SSE);
2699             t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_aj_SSE);
2700             t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_aj_SSE);
2701             t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_aj_SSE);
2702             t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_aj_SSE);
2703             t3_SSE0            = _mm_sub_ps(sk_aj_SSE,dr_SSE0);
2704             t3_SSE1            = _mm_sub_ps(sk_aj_SSE,dr_SSE1);
2705             t3_SSE2            = _mm_sub_ps(sk_aj_SSE,dr_SSE2);
2706             t3_SSE3            = _mm_sub_ps(sk_aj_SSE,dr_SSE3);
2707             
2708             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
2709             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
2710             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
2711             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
2712             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
2713             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
2714             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
2715             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
2716             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
2717             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
2718             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
2719             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
2720             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
2721             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
2722             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
2723             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
2724             
                 /* uij = 1/(dr+sk_aj); lij blended between 1/(dr-sk_aj) and
                  * 1/rai per lane; dlij = 1.0 where lij depends on r. */
2725             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2726             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2727             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2728             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2729             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2730                                            _mm_andnot_ps(obc_mask2_SSE0,rai_inv_SSE0));
2731             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2732                                            _mm_andnot_ps(obc_mask2_SSE1,rai_inv_SSE1));
2733             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2734                                            _mm_andnot_ps(obc_mask2_SSE2,rai_inv_SSE2));
2735             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2736                                            _mm_andnot_ps(obc_mask2_SSE3,rai_inv_SSE3));
2737             dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2738             dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2739             dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2740             dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2741             
2742             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2743             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2744             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2745             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2746             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2747             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2748             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2749             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2750             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2751             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2752             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2753             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2754             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2755             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2756             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2757             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2758             
2759             diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2760             diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2761             diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2762             diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2763             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2764             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2765             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2766             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2767             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE,sk_aj_SSE);
2768             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE0);
2769             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE1);
2770             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE2);
2771             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE,rinv_SSE3);
2772             prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2773             prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
2774             prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
2775             prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
2776             
2777             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
2778             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
2779             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
2780             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
2781             
                 /* Pair integral: t1 = lij - uij + diff2*(dr/4 - prod)
                  * + 0.5*rinv*logterm (+ buried-atom correction t4), halved. */
2782             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
2783             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
2784             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
2785             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
2786             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
2787                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
2788                                                        prod_SSE0));
2789             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
2790                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
2791                                                        prod_SSE1));
2792             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2793                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
2794                                                        prod_SSE2));
2795             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2796                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
2797                                                        prod_SSE3));
2798             
2799             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
2800             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
2801             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
2802             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
2803             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
2804             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
2805             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
2806             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
2807             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE0,lij_SSE0));
2808             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE1,lij_SSE1));
2809             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE2,lij_SSE2));
2810             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(rai_inv_SSE3,lij_SSE3));
2811             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
2812             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
2813             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
2814             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
2815             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
2816             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
2817             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
2818             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
2819             
                 /* Accumulate masked contributions into the per-i Born sums */
2820             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2821             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2822             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2823             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2824             
                 /* Start of the radial-derivative computation for this half */
2825             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
2826                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
2827             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
2828                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
2829             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
2830                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
2831             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
2832                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
2833             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2834                                             _mm_mul_ps(onefourth_SSE,
2835                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
2836                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
                 /* NOTE(review): loop body continues beyond this excerpt
                  * (t1_SSE1..3, t2/t3 derivative terms, dadx stores). */
2837             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2838                                             _mm_mul_ps(onefourth_SSE,
2839                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
2840                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
2841             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2842                                             _mm_mul_ps(onefourth_SSE,
2843                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
2844                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
2845             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2846                                             _mm_mul_ps(onefourth_SSE,
2847                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
2848                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
2849             
2850             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2851                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
2852                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
2853             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2854                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
2855                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
2856             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2857                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
2858                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
2859             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2860                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
2861                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
2862             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2863                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
2864                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
2865             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2866                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
2867                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
2868             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2869                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
2870                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
2871             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2872                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
2873                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
2874             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
2875                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
2876             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
2877                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
2878             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
2879                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
2880             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
2881                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
2882             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2883                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
2884                                                        _mm_add_ps(one_SSE,
2885                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
2886             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2887                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
2888                                                        _mm_add_ps(one_SSE,
2889                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
2890             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2891                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
2892                                                        _mm_add_ps(one_SSE,
2893                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
2894             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2895                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
2896                                                        _mm_add_ps(one_SSE,
2897                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
2898             
2899             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2900                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
2901                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
2902             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2903                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
2904                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
2905             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2906                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
2907                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
2908             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2909                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
2910                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
2911             
2912             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
2913             dadx += 4;
2914             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
2915             dadx += 4;
2916             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
2917             dadx += 4;
2918             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
2919             dadx += 4;
2920             
2921             /* Evaluate influence of atom ai -> aj */
2922             t1_SSE0            = _mm_add_ps(dr_SSE0,sk_ai_SSE0);
2923             t1_SSE1            = _mm_add_ps(dr_SSE1,sk_ai_SSE1);
2924             t1_SSE2            = _mm_add_ps(dr_SSE2,sk_ai_SSE2);
2925             t1_SSE3            = _mm_add_ps(dr_SSE3,sk_ai_SSE3);
2926             t2_SSE0            = _mm_sub_ps(dr_SSE0,sk_ai_SSE0);
2927             t2_SSE1            = _mm_sub_ps(dr_SSE1,sk_ai_SSE1);
2928             t2_SSE2            = _mm_sub_ps(dr_SSE2,sk_ai_SSE2);
2929             t2_SSE3            = _mm_sub_ps(dr_SSE3,sk_ai_SSE3);
2930             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0,dr_SSE0);
2931             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1,dr_SSE1);
2932             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2,dr_SSE2);
2933             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3,dr_SSE3);
2934             
2935             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
2936             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
2937             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
2938             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
2939             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
2940             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
2941             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
2942             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
2943             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
2944             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
2945             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
2946             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
2947             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0,jmask_SSE0);
2948             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1,jmask_SSE1);
2949             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2,jmask_SSE2);
2950             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3,jmask_SSE3);
2951             
2952             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
2953             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
2954             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
2955             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
2956             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0,gmx_mm_inv_ps(t2_SSE0)),
2957                                            _mm_andnot_ps(obc_mask2_SSE0,raj_inv_SSE));
2958             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1,gmx_mm_inv_ps(t2_SSE1)),
2959                                            _mm_andnot_ps(obc_mask2_SSE1,raj_inv_SSE));
2960             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2,gmx_mm_inv_ps(t2_SSE2)),
2961                                            _mm_andnot_ps(obc_mask2_SSE2,raj_inv_SSE));
2962             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3,gmx_mm_inv_ps(t2_SSE3)),
2963                                            _mm_andnot_ps(obc_mask2_SSE3,raj_inv_SSE));
2964             dlij_SSE0          = _mm_and_ps(one_SSE,obc_mask2_SSE0);
2965             dlij_SSE1          = _mm_and_ps(one_SSE,obc_mask2_SSE1);
2966             dlij_SSE2          = _mm_and_ps(one_SSE,obc_mask2_SSE2);
2967             dlij_SSE3          = _mm_and_ps(one_SSE,obc_mask2_SSE3);
2968             
2969             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
2970             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
2971             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
2972             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
2973             uij3_SSE0          = _mm_mul_ps(uij2_SSE0,uij_SSE0);
2974             uij3_SSE1          = _mm_mul_ps(uij2_SSE1,uij_SSE1);
2975             uij3_SSE2          = _mm_mul_ps(uij2_SSE2,uij_SSE2);
2976             uij3_SSE3          = _mm_mul_ps(uij2_SSE3,uij_SSE3);
2977             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
2978             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
2979             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
2980             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
2981             lij3_SSE0          = _mm_mul_ps(lij2_SSE0,lij_SSE0);
2982             lij3_SSE1          = _mm_mul_ps(lij2_SSE1,lij_SSE1);
2983             lij3_SSE2          = _mm_mul_ps(lij2_SSE2,lij_SSE2);
2984             lij3_SSE3          = _mm_mul_ps(lij2_SSE3,lij_SSE3);
2985             
2986             diff2_SSE0         = _mm_sub_ps(uij2_SSE0,lij2_SSE0);
2987             diff2_SSE1         = _mm_sub_ps(uij2_SSE1,lij2_SSE1);
2988             diff2_SSE2         = _mm_sub_ps(uij2_SSE2,lij2_SSE2);
2989             diff2_SSE3         = _mm_sub_ps(uij2_SSE3,lij2_SSE3);
2990             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2991             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2992             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
2993             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
2994             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0,rinv_SSE0);
2995             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1,rinv_SSE1);
2996             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2,rinv_SSE2);
2997             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3,rinv_SSE3);
2998             prod_SSE0          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE0);
2999             prod_SSE1          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE1);
3000             prod_SSE2          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE2);
3001             prod_SSE3          = _mm_mul_ps(onefourth_SSE,sk2_rinv_SSE3);
3002             
3003             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0,lij_inv_SSE0));
3004             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1,lij_inv_SSE1));
3005             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2,lij_inv_SSE2));
3006             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3,lij_inv_SSE3));
3007             t1_SSE0            = _mm_sub_ps(lij_SSE0,uij_SSE0);
3008             t1_SSE1            = _mm_sub_ps(lij_SSE1,uij_SSE1);
3009             t1_SSE2            = _mm_sub_ps(lij_SSE2,uij_SSE2);
3010             t1_SSE3            = _mm_sub_ps(lij_SSE3,uij_SSE3);
3011             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
3012                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE0),
3013                                                        prod_SSE0));
3014             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
3015                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE1),
3016                                                        prod_SSE1));
3017             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
3018                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE2),
3019                                                        prod_SSE2));
3020             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
3021                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE,dr_SSE3),
3022                                                        prod_SSE3));
3023             t3_SSE0            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE0,logterm_SSE0));
3024             t3_SSE1            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE1,logterm_SSE1));
3025             t3_SSE2            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE2,logterm_SSE2));
3026             t3_SSE3            = _mm_mul_ps(half_SSE,_mm_mul_ps(rinv_SSE3,logterm_SSE3));
3027             t1_SSE0            = _mm_add_ps(t1_SSE0,_mm_add_ps(t2_SSE0,t3_SSE0));
3028             t1_SSE1            = _mm_add_ps(t1_SSE1,_mm_add_ps(t2_SSE1,t3_SSE1));
3029             t1_SSE2            = _mm_add_ps(t1_SSE2,_mm_add_ps(t2_SSE2,t3_SSE2));
3030             t1_SSE3            = _mm_add_ps(t1_SSE3,_mm_add_ps(t2_SSE3,t3_SSE3));
3031             t4_SSE0            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE0));
3032             t4_SSE1            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE1));
3033             t4_SSE2            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE2));
3034             t4_SSE3            = _mm_mul_ps(two_SSE,_mm_sub_ps(raj_inv_SSE,lij_SSE3));
3035             t4_SSE0            = _mm_and_ps(t4_SSE0,obc_mask3_SSE0);
3036             t4_SSE1            = _mm_and_ps(t4_SSE1,obc_mask3_SSE1);
3037             t4_SSE2            = _mm_and_ps(t4_SSE2,obc_mask3_SSE2);
3038             t4_SSE3            = _mm_and_ps(t4_SSE3,obc_mask3_SSE3);
3039             t1_SSE0            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE0,t4_SSE0));
3040             t1_SSE1            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE1,t4_SSE1));
3041             t1_SSE2            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE2,t4_SSE2));
3042             t1_SSE3            = _mm_mul_ps(half_SSE,_mm_add_ps(t1_SSE3,t4_SSE3));
3043             
3044             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
3045                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0,obc_mask1_SSE0),
3046                                                         _mm_and_ps(t1_SSE1,obc_mask1_SSE1),
3047                                                         _mm_and_ps(t1_SSE2,obc_mask1_SSE2),
3048                                                         _mm_and_ps(t1_SSE3,obc_mask1_SSE3))));
3049             
3050             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE0),
3051                                             _mm_mul_ps(prod_SSE0,lij3_SSE0));
3052             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE1),
3053                                             _mm_mul_ps(prod_SSE1,lij3_SSE1));
3054             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE2),
3055                                             _mm_mul_ps(prod_SSE2,lij3_SSE2));
3056             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE,lij2_SSE3),
3057                                             _mm_mul_ps(prod_SSE3,lij3_SSE3));
3058             t1_SSE0            = _mm_sub_ps(t1_SSE0,
3059                                             _mm_mul_ps(onefourth_SSE,
3060                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0,rinv_SSE0),
3061                                                                   _mm_mul_ps(lij3_SSE0,dr_SSE0))));
3062             t1_SSE1            = _mm_sub_ps(t1_SSE1,
3063                                             _mm_mul_ps(onefourth_SSE,
3064                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1,rinv_SSE1),
3065                                                                   _mm_mul_ps(lij3_SSE1,dr_SSE1))));
3066             t1_SSE2            = _mm_sub_ps(t1_SSE2,
3067                                             _mm_mul_ps(onefourth_SSE,
3068                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2,rinv_SSE2),
3069                                                                   _mm_mul_ps(lij3_SSE2,dr_SSE2))));
3070             t1_SSE3            = _mm_sub_ps(t1_SSE3,
3071                                             _mm_mul_ps(onefourth_SSE,
3072                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3,rinv_SSE3),
3073                                                                   _mm_mul_ps(lij3_SSE3,dr_SSE3))));
3074             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
3075                                             _mm_add_ps(_mm_mul_ps(uij_SSE0,rinv_SSE0),
3076                                                        _mm_mul_ps(uij3_SSE0,dr_SSE0)));
3077             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
3078                                             _mm_add_ps(_mm_mul_ps(uij_SSE1,rinv_SSE1),
3079                                                        _mm_mul_ps(uij3_SSE1,dr_SSE1)));
3080             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
3081                                             _mm_add_ps(_mm_mul_ps(uij_SSE2,rinv_SSE2),
3082                                                        _mm_mul_ps(uij3_SSE2,dr_SSE2)));
3083             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
3084                                             _mm_add_ps(_mm_mul_ps(uij_SSE3,rinv_SSE3),
3085                                                        _mm_mul_ps(uij3_SSE3,dr_SSE3)));
3086             t2_SSE0            = _mm_sub_ps(t2_SSE0,
3087                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE0),
3088                                                        _mm_mul_ps(prod_SSE0,uij3_SSE0)));
3089             t2_SSE1            = _mm_sub_ps(t2_SSE1,
3090                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE1),
3091                                                        _mm_mul_ps(prod_SSE1,uij3_SSE1)));
3092             t2_SSE2            = _mm_sub_ps(t2_SSE2,
3093                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE2),
3094                                                        _mm_mul_ps(prod_SSE2,uij3_SSE2)));
3095             t2_SSE3            = _mm_sub_ps(t2_SSE3,
3096                                             _mm_add_ps(_mm_mul_ps(half_SSE,uij2_SSE3),
3097                                                        _mm_mul_ps(prod_SSE3,uij3_SSE3)));
3098             
3099             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE0),
3100                                             _mm_mul_ps(rinv_SSE0,rinv_SSE0));
3101             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE1),
3102                                             _mm_mul_ps(rinv_SSE1,rinv_SSE1));
3103             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE2),
3104                                             _mm_mul_ps(rinv_SSE2,rinv_SSE2));
3105             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE,logterm_SSE3),
3106                                             _mm_mul_ps(rinv_SSE3,rinv_SSE3));
3107             
3108             t3_SSE0            = _mm_sub_ps(t3_SSE0,
3109                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0,oneeighth_SSE),
3110                                                        _mm_add_ps(one_SSE,
3111                                                                   _mm_mul_ps(sk2_rinv_SSE0,rinv_SSE0))));
3112             t3_SSE1            = _mm_sub_ps(t3_SSE1,
3113                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1,oneeighth_SSE),
3114                                                        _mm_add_ps(one_SSE,
3115                                                                   _mm_mul_ps(sk2_rinv_SSE1,rinv_SSE1))));
3116             t3_SSE2            = _mm_sub_ps(t3_SSE2,
3117                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2,oneeighth_SSE),
3118                                                        _mm_add_ps(one_SSE,
3119                                                                   _mm_mul_ps(sk2_rinv_SSE2,rinv_SSE2))));
3120             t3_SSE3            = _mm_sub_ps(t3_SSE3,
3121                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3,oneeighth_SSE),
3122                                                        _mm_add_ps(one_SSE,
3123                                                                   _mm_mul_ps(sk2_rinv_SSE3,rinv_SSE3))));
3124             
3125             
3126             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
3127                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0,t1_SSE0),
3128                                                        _mm_add_ps(t2_SSE0,t3_SSE0)));
3129             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
3130                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1,t1_SSE1),
3131                                                        _mm_add_ps(t2_SSE1,t3_SSE1)));
3132             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
3133                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2,t1_SSE2),
3134                                                        _mm_add_ps(t2_SSE2,t3_SSE2)));
3135             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
3136                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3,t1_SSE3),
3137                                                        _mm_add_ps(t2_SSE3,t3_SSE3)));
3138             
3139             _mm_store_ps(dadx,_mm_and_ps(t1_SSE0,obc_mask1_SSE0));
3140             dadx += 4;
3141             _mm_store_ps(dadx,_mm_and_ps(t1_SSE1,obc_mask1_SSE1));
3142             dadx += 4;
3143             _mm_store_ps(dadx,_mm_and_ps(t1_SSE2,obc_mask1_SSE2));
3144             dadx += 4;
3145             _mm_store_ps(dadx,_mm_and_ps(t1_SSE3,obc_mask1_SSE3));
3146             dadx += 4;
3147         }
3148         _MM_TRANSPOSE4_PS(sum_ai_SSE0,sum_ai_SSE1,sum_ai_SSE2,sum_ai_SSE3);
3149         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,sum_ai_SSE1);
3150         sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2,sum_ai_SSE3);
3151         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0,sum_ai_SSE2);
3152         _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
3153         }    
3154     
3155     
3156     for(i=0;i<natoms/2+1;i++)
3157     {
3158         work[i] += work[natoms+i];
3159     }
3160     
3161     /* Parallel summations */
3162     
3163         if(PARTDECOMP(cr))
3164         {
3165                 gmx_sum(natoms,work, cr);
3166         }
3167         
3168     if(gb_algorithm==egbHCT)
3169     {
3170         /* HCT */
3171         for(i=0;i<natoms;i++)
3172         {
3173             if(born->use[i] != 0)
3174             {
3175                 rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset; 
3176                 sum_ai  = 1.0/rai - work[i];
3177                 min_rad = rai + born->gb_doffset;
3178                 rad     = 1.0/sum_ai; 
3179                 
3180                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
3181                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3182             }
3183         }
3184         
3185     }
3186     else
3187     {
3188         /* OBC */
3189         
3190         /* Calculate the radii */
3191         for(i=0;i<natoms;i++)
3192         {
3193             
3194             if(born->use[i] != 0)
3195             {
3196                 rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
3197                 rai_inv2   = 1.0/rai;
3198                 rai        = rai-born->gb_doffset; 
3199                 rai_inv    = 1.0/rai;
3200                 sum_ai     = rai * work[i];
3201                 sum_ai2    = sum_ai  * sum_ai;
3202                 sum_ai3    = sum_ai2 * sum_ai;
3203                 
3204                 tsum    = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
3205                 born->bRad[i] = rai_inv - tsum*rai_inv2;
3206                 born->bRad[i] = 1.0 / born->bRad[i];
3207                 
3208                 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
3209                 
3210                 tchain  = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
3211                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
3212             }
3213         }
3214     }
3215         
3216         return 0;
3217 }
3218
3219
3220
3221
3222
3223
3224
3225
3226 int
3227 genborn_allvsall_calc_chainrule_sse2_single(t_forcerec *           fr,
3228                                             t_mdatoms *            mdatoms,
3229                                             gmx_genborn_t *        born,
3230                                             real *                 x,
3231                                             real *                 f,
3232                                             int                    gb_algorithm,
3233                                             void *                 paadata)
3234 {
3235         gmx_allvsallgb2_data_t *aadata;
3236         int        natoms;
3237         int        ni0,ni1;
3238         int        nj0,nj1,nj2,nj3;
3239         int        i,j,k,n;
3240     int        idx;
3241     int *      mask;
3242     int *      pmask0;
3243     int *      emask0;
3244     int *      jindex;
3245
3246     real       ix,iy,iz;
3247     real       fix,fiy,fiz;
3248     real       jx,jy,jz;
3249     real       dx,dy,dz;
3250     real       tx,ty,tz;
3251     real       rbai,rbaj,fgb,fgb_ai,rbi;
3252     real *     rb;
3253     real *     dadx;
3254     real *     x_align;
3255     real *     y_align;
3256     real *     z_align;
3257     real *     fx_align;
3258     real *     fy_align;
3259     real *     fz_align;
3260     real       tmpsum[4];
3261     
3262     __m128     jmask_SSE0,jmask_SSE1,jmask_SSE2,jmask_SSE3;
3263     __m128     ix_SSE0,iy_SSE0,iz_SSE0;
3264     __m128     ix_SSE1,iy_SSE1,iz_SSE1;
3265     __m128     ix_SSE2,iy_SSE2,iz_SSE2;
3266     __m128     ix_SSE3,iy_SSE3,iz_SSE3;
3267     __m128     fix_SSE0,fiy_SSE0,fiz_SSE0;
3268     __m128     fix_SSE1,fiy_SSE1,fiz_SSE1;
3269     __m128     fix_SSE2,fiy_SSE2,fiz_SSE2;
3270     __m128     fix_SSE3,fiy_SSE3,fiz_SSE3;
3271     __m128     rbai_SSE0,rbai_SSE1,rbai_SSE2,rbai_SSE3;
3272     __m128     imask_SSE0,imask_SSE1,imask_SSE2,imask_SSE3;
3273     __m128     jx_SSE,jy_SSE,jz_SSE,rbaj_SSE;
3274     __m128     dx_SSE0,dy_SSE0,dz_SSE0;
3275     __m128     dx_SSE1,dy_SSE1,dz_SSE1;
3276     __m128     dx_SSE2,dy_SSE2,dz_SSE2;
3277     __m128     dx_SSE3,dy_SSE3,dz_SSE3;
3278     __m128     fgb_SSE0,fgb_ai_SSE0;
3279     __m128     fgb_SSE1,fgb_ai_SSE1;
3280     __m128     fgb_SSE2,fgb_ai_SSE2;
3281     __m128     fgb_SSE3,fgb_ai_SSE3;
3282     __m128     tx_SSE0,ty_SSE0,tz_SSE0;
3283     __m128     tx_SSE1,ty_SSE1,tz_SSE1;
3284     __m128     tx_SSE2,ty_SSE2,tz_SSE2;
3285     __m128     tx_SSE3,ty_SSE3,tz_SSE3;
3286     __m128     t1,t2;
3287     
3288     natoms              = mdatoms->nr;
3289         ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
3290         ni1                 = mdatoms->start+mdatoms->homenr;
3291     dadx                = fr->dadx;
3292     
3293     aadata = (gmx_allvsallgb2_data_t *)paadata;
3294
3295     x_align = aadata->x_align;
3296         y_align = aadata->y_align;
3297         z_align = aadata->z_align;
3298     fx_align = aadata->fx_align;
3299         fy_align = aadata->fy_align;
3300         fz_align = aadata->fz_align;
3301     
3302     jindex    = aadata->jindex_gb;
3303     dadx      = fr->dadx;
3304
3305     n = 0;
3306     rb = aadata->work;
3307     
3308         /* Loop to get the proper form for the Born radius term */
3309         if(gb_algorithm==egbSTILL) 
3310         {
3311                 for(i=0;i<natoms;i++)
3312                 {
3313                         rbi   = born->bRad[i];
3314                         rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
3315                 }
3316         }
3317         else if(gb_algorithm==egbHCT) 
3318         {
3319                 for(i=0;i<natoms;i++)
3320                 {
3321                         rbi   = born->bRad[i];
3322                         rb[i] = rbi * rbi * fr->dvda[i];
3323                 }
3324         }
3325         else if(gb_algorithm==egbOBC) 
3326         {
3327                 for(idx=0;idx<natoms;idx++)
3328                 {
3329                         rbi   = born->bRad[idx];
3330                         rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
3331                 }
3332         }
3333     
3334     for(i=0;i<2*natoms;i++)
3335         {
3336                 fx_align[i]       = 0;
3337                 fy_align[i]       = 0;
3338                 fz_align[i]       = 0;
3339     }        
3340     
3341     
3342     for(i=0;i<natoms;i++)
3343     {
3344         rb[i+natoms]=rb[i];
3345     }
3346
3347     for(i=ni0; i<ni1; i+=UNROLLI)
3348         {
3349                 /* We assume shifts are NOT used for all-vs-all interactions */
3350                 
3351                 /* Load i atom data */
3352                 ix_SSE0          = _mm_load1_ps(x_align+i);
3353                 iy_SSE0          = _mm_load1_ps(y_align+i);
3354                 iz_SSE0          = _mm_load1_ps(z_align+i);
3355                 ix_SSE1          = _mm_load1_ps(x_align+i+1);
3356                 iy_SSE1          = _mm_load1_ps(y_align+i+1);
3357                 iz_SSE1          = _mm_load1_ps(z_align+i+1);
3358                 ix_SSE2          = _mm_load1_ps(x_align+i+2);
3359                 iy_SSE2          = _mm_load1_ps(y_align+i+2);
3360                 iz_SSE2          = _mm_load1_ps(z_align+i+2);
3361                 ix_SSE3          = _mm_load1_ps(x_align+i+3);
3362                 iy_SSE3          = _mm_load1_ps(y_align+i+3);
3363                 iz_SSE3          = _mm_load1_ps(z_align+i+3);
3364         
3365                 fix_SSE0         = _mm_setzero_ps();
3366                 fiy_SSE0         = _mm_setzero_ps();
3367                 fiz_SSE0         = _mm_setzero_ps();
3368                 fix_SSE1         = _mm_setzero_ps();
3369                 fiy_SSE1         = _mm_setzero_ps();
3370                 fiz_SSE1         = _mm_setzero_ps();
3371                 fix_SSE2         = _mm_setzero_ps();
3372                 fiy_SSE2         = _mm_setzero_ps();
3373                 fiz_SSE2         = _mm_setzero_ps();
3374                 fix_SSE3         = _mm_setzero_ps();
3375                 fiy_SSE3         = _mm_setzero_ps();
3376                 fiz_SSE3         = _mm_setzero_ps();
3377         
3378         rbai_SSE0        = _mm_load1_ps(rb+i);
3379         rbai_SSE1        = _mm_load1_ps(rb+i+1);
3380         rbai_SSE2        = _mm_load1_ps(rb+i+2);
3381         rbai_SSE3        = _mm_load1_ps(rb+i+3);
3382         
3383                 /* Load limits for loop over neighbors */
3384                 nj0              = jindex[4*i];
3385                 nj3              = jindex[4*i+3];
3386
3387         /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
3388         for(j=nj0; j<nj3; j+=UNROLLJ)
3389         {          
3390             /* load j atom coordinates */
3391             jx_SSE           = _mm_load_ps(x_align+j);
3392             jy_SSE           = _mm_load_ps(y_align+j);
3393             jz_SSE           = _mm_load_ps(z_align+j);
3394             
3395             /* Calculate distance */
3396             dx_SSE0          = _mm_sub_ps(ix_SSE0,jx_SSE);
3397             dy_SSE0          = _mm_sub_ps(iy_SSE0,jy_SSE);
3398             dz_SSE0          = _mm_sub_ps(iz_SSE0,jz_SSE);
3399             dx_SSE1          = _mm_sub_ps(ix_SSE1,jx_SSE);
3400             dy_SSE1          = _mm_sub_ps(iy_SSE1,jy_SSE);
3401             dz_SSE1          = _mm_sub_ps(iz_SSE1,jz_SSE);
3402             dx_SSE2          = _mm_sub_ps(ix_SSE2,jx_SSE);
3403             dy_SSE2          = _mm_sub_ps(iy_SSE2,jy_SSE);
3404             dz_SSE2          = _mm_sub_ps(iz_SSE2,jz_SSE);
3405             dx_SSE3          = _mm_sub_ps(ix_SSE3,jx_SSE);
3406             dy_SSE3          = _mm_sub_ps(iy_SSE3,jy_SSE);
3407             dz_SSE3          = _mm_sub_ps(iz_SSE3,jz_SSE);
3408             
3409             rbaj_SSE         = _mm_load_ps(rb+j);
3410             
3411             fgb_SSE0         = _mm_mul_ps(rbai_SSE0,_mm_load_ps(dadx));            
3412             dadx += 4;
3413             fgb_SSE1         = _mm_mul_ps(rbai_SSE1,_mm_load_ps(dadx));
3414             dadx += 4;
3415             fgb_SSE2         = _mm_mul_ps(rbai_SSE2,_mm_load_ps(dadx));
3416             dadx += 4;
3417             fgb_SSE3         = _mm_mul_ps(rbai_SSE3,_mm_load_ps(dadx));
3418             dadx += 4;
3419                         
3420             fgb_ai_SSE0      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3421             dadx +=4;
3422             fgb_ai_SSE1      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3423             dadx +=4;
3424             fgb_ai_SSE2      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3425             dadx +=4;
3426             fgb_ai_SSE3      = _mm_mul_ps(rbaj_SSE,_mm_load_ps(dadx));
3427             dadx +=4;
3428             
3429             /* Total force between ai and aj is the sum of ai->aj and aj->ai */
3430             fgb_SSE0         = _mm_add_ps(fgb_SSE0,fgb_ai_SSE0);
3431             fgb_SSE1         = _mm_add_ps(fgb_SSE1,fgb_ai_SSE1);
3432             fgb_SSE2         = _mm_add_ps(fgb_SSE2,fgb_ai_SSE2);
3433             fgb_SSE3         = _mm_add_ps(fgb_SSE3,fgb_ai_SSE3);
3434                             
3435             /* Calculate temporary vectorial force */
3436             tx_SSE0            = _mm_mul_ps(fgb_SSE0,dx_SSE0);
3437             ty_SSE0            = _mm_mul_ps(fgb_SSE0,dy_SSE0);
3438             tz_SSE0            = _mm_mul_ps(fgb_SSE0,dz_SSE0);
3439             tx_SSE1            = _mm_mul_ps(fgb_SSE1,dx_SSE1);
3440             ty_SSE1            = _mm_mul_ps(fgb_SSE1,dy_SSE1);
3441             tz_SSE1            = _mm_mul_ps(fgb_SSE1,dz_SSE1);
3442             tx_SSE2            = _mm_mul_ps(fgb_SSE2,dx_SSE2);
3443             ty_SSE2            = _mm_mul_ps(fgb_SSE2,dy_SSE2);
3444             tz_SSE2            = _mm_mul_ps(fgb_SSE2,dz_SSE2);
3445             tx_SSE3            = _mm_mul_ps(fgb_SSE3,dx_SSE3);
3446             ty_SSE3            = _mm_mul_ps(fgb_SSE3,dy_SSE3);
3447             tz_SSE3            = _mm_mul_ps(fgb_SSE3,dz_SSE3);
3448             
3449             /* Increment i atom force */
3450             fix_SSE0          = _mm_add_ps(fix_SSE0,tx_SSE0);
3451             fiy_SSE0          = _mm_add_ps(fiy_SSE0,ty_SSE0);
3452             fiz_SSE0          = _mm_add_ps(fiz_SSE0,tz_SSE0);
3453             fix_SSE1          = _mm_add_ps(fix_SSE1,tx_SSE1);
3454             fiy_SSE1          = _mm_add_ps(fiy_SSE1,ty_SSE1);
3455             fiz_SSE1          = _mm_add_ps(fiz_SSE1,tz_SSE1);
3456             fix_SSE2          = _mm_add_ps(fix_SSE2,tx_SSE2);
3457             fiy_SSE2          = _mm_add_ps(fiy_SSE2,ty_SSE2);
3458             fiz_SSE2          = _mm_add_ps(fiz_SSE2,tz_SSE2);
3459             fix_SSE3          = _mm_add_ps(fix_SSE3,tx_SSE3);
3460             fiy_SSE3          = _mm_add_ps(fiy_SSE3,ty_SSE3);
3461             fiz_SSE3          = _mm_add_ps(fiz_SSE3,tz_SSE3);
3462             
3463             /* Decrement j atom force */
3464             _mm_store_ps(fx_align+j,
3465                          _mm_sub_ps( _mm_load_ps(fx_align+j) , gmx_mm_sum4_ps(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
3466             _mm_store_ps(fy_align+j,
3467                          _mm_sub_ps( _mm_load_ps(fy_align+j) , gmx_mm_sum4_ps(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
3468             _mm_store_ps(fz_align+j,
3469                          _mm_sub_ps( _mm_load_ps(fz_align+j) , gmx_mm_sum4_ps(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
3470         }
3471                 /* Add i forces to mem and shifted force list */
3472         _MM_TRANSPOSE4_PS(fix_SSE0,fix_SSE1,fix_SSE2,fix_SSE3);
3473         fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE1);
3474         fix_SSE2 = _mm_add_ps(fix_SSE2,fix_SSE3);
3475         fix_SSE0 = _mm_add_ps(fix_SSE0,fix_SSE2);
3476         _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));
3477         
3478         _MM_TRANSPOSE4_PS(fiy_SSE0,fiy_SSE1,fiy_SSE2,fiy_SSE3);
3479         fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE1);
3480         fiy_SSE2 = _mm_add_ps(fiy_SSE2,fiy_SSE3);
3481         fiy_SSE0 = _mm_add_ps(fiy_SSE0,fiy_SSE2);
3482         _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));
3483         
3484         _MM_TRANSPOSE4_PS(fiz_SSE0,fiz_SSE1,fiz_SSE2,fiz_SSE3);
3485         fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE1);
3486         fiz_SSE2 = _mm_add_ps(fiz_SSE2,fiz_SSE3);
3487         fiz_SSE0 = _mm_add_ps(fiz_SSE0,fiz_SSE2);
3488         _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));                
3489         }    
3490         
3491     for(i=0;i<natoms;i++)
3492         {
3493                 f[3*i]       += fx_align[i] + fx_align[natoms+i];
3494                 f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
3495                 f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
3496     }
3497     
3498         return 0;
3499 }
3500
#else
/* Dummy variable emitted when SSE support is not compiled in.
 * ISO C forbids an empty translation unit, so this placeholder keeps
 * the file valid when the entire SSE2 kernel above is preprocessed away
 * (i.e. when neither GMX_IA32_SSE, GMX_X86_64_SSE nor GMX_SSE2 is
 * defined, or when GMX_DOUBLE is defined — see the guard in the header).
 */
int genborn_allvsall_sse2_single_dummy;


#endif

3507