/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2012, by the GROMACS development team, led by
 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 * others, as listed in the AUTHORS file in the top-level source
 * directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include "types/simple.h"

#include "vec.h"
#include "smalloc.h"

#include "partdec.h"
#include "network.h"
#include "physics.h"
#include "genborn.h"
#include "genborn_allvsall.h"


#if 0 && defined (GMX_X86_SSE2)

#include <gmx_sse2_double.h>


#define SIMD_WIDTH 2
#define UNROLLI    2
#define UNROLLJ    2
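
/* Editorial note: each __m128d register holds SIMD_WIDTH = 2 doubles, and
 * both the i and j loops below are unrolled by that same width
 * (UNROLLI/UNROLLJ).
 */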


typedef struct
{
    int *      jindex_gb;
    int **     prologue_mask_gb;
    int **     epilogue_mask;
    int *      imask;
    double *     gb_radius;
    double *     workparam;
    double *     work;
    double *     x_align;
    double *     y_align;
    double *     z_align;
    double *     fx_align;
    double *     fy_align;
    double *     fz_align;
}
gmx_allvsallgb2_data_t;


static int
calc_maxoffset(int i,int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            if ((i % 2) == 0)
            {
                maxoffset=natoms/2;
            }
            else
            {
                maxoffset=natoms/2-1;
            }
        }
        else
        {
            if ((i % 2) == 1)
            {
                maxoffset=natoms/2;
            }
            else
            {
                maxoffset=natoms/2-1;
            }
        }
    }
    else
    {
        /* natoms is even, but natoms/2 is odd */
        if ((i % 2) == 0)
        {
            maxoffset=natoms/2;
        }
        else
        {
            maxoffset=natoms/2-1;
        }
    }

    return maxoffset;
}
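
/* Illustrative note (not in the original source): the half-shell scheme
 * above makes each unordered pair appear exactly once. For example, assuming
 * natoms = 6 (so natoms/2 = 3 is odd), even i get maxoffset = 3 and odd i
 * get maxoffset = 2, giving 3+2+3+2+3+2 = 15 = 6*5/2 pair interactions,
 * i.e. the full upper triangle with no double counting.
 */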

static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t *   aadata,
                                t_ilist *                  ilist,
                                int                        start,
                                int                        end,
                                int                        natoms,
                                gmx_bool                   bInclude12,
                                gmx_bool                   bInclude13,
                                gmx_bool                   bInclude14)
{
    int i,j,k,tp;
    int a1,a2;
    int ni0,ni1,nj0,nj1,nj;
    int imin,imax,iexcl;
    int max_offset;
    int max_excl_offset;
    int firstinteraction;
    int ibase;
    int  *pi;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 1 if the interaction
     * should be present, otherwise 0. Since exclusions typically only occur when i & j are close,
     * we create a jindex array with four elements per i atom: the starting point, the end of the
     * range where we need to check exclusions (the prologue), the start of the epilogue, and the
     * end point. This way we only have to allocate a short exclusion mask per i atom.
     */
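    /* Hypothetical example of the resulting layout (numbers invented for
     * illustration): if the first possible neighbor of an i tile is atom 10,
     * exclusions extend through atom 15, and the epilogue covers atoms 40-43,
     * the entries would be jindex_gb[4*i..4*i+3] = {10, 16, 40, 44}: a masked
     * prologue over [10,16), an unmasked main loop over [16,40), and a masked
     * epilogue over [40,44), all rounded to multiples of UNROLLJ.
     */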

    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi,2*(natoms+UNROLLI+2*SIMD_WIDTH));
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
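    /* Descriptive note: (p + 16) & ~15 rounds the raw pointer up to the next
     * 16-byte boundary, as required by the aligned SSE2 loads/stores used
     * below; the allocations are padded so this shift stays in bounds.
     */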
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->imask[2*i]   = (i>=start && i<end) ? 0xFFFFFFFF : 0;
        aadata->imask[2*i+1] = (i>=start && i<end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb,4*(natoms+UNROLLI));
    for(i=0;i<4*(natoms+UNROLLI);i++)
    {
        aadata->jindex_gb[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb,natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }
    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next UNROLLI atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be overwritten by one of 'firstinteraction' */
        for(i=ibase;i<imax;i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset = calc_maxoffset(i,natoms);

            if(!bInclude12)
            {
                for(j=0;j<ilist[F_GB12].nr;j+=3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if(k==firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if(!bInclude13)
            {
                for(j=0;j<ilist[F_GB13].nr;j+=3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if(k==firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if(!bInclude14)
            {
                for(j=0;j<ilist[F_GB14].nr;j+=3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];
                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if(k==firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            imin = (firstinteraction < imin) ? firstinteraction : imin;
        }
        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;

        for(i=ibase;i<imax;i++)
        {
            max_offset = calc_maxoffset(i,natoms);

            if(!bInclude12)
            {
                for(j=0;j<ilist[F_GB12].nr;j+=3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if(k<imin)
                    {
                        k += natoms;
                    }

                    if(k>i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if( k+natoms <= max_offset )
                    {
                        k+=natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if(!bInclude13)
            {
                for(j=0;j<ilist[F_GB13].nr;j+=3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if(k<imin)
                    {
                        k += natoms;
                    }

                    if(k>i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if( k+natoms <= max_offset )
                    {
                        k+=natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if(!bInclude14)
            {
                for(j=0;j<ilist[F_GB14].nr;j+=3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];

                    if(a1==i)
                    {
                        k = a2;
                    }
                    else if(a2==i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if(k<imin)
                    {
                        k += natoms;
                    }

                    if(k>i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if( k+natoms <= max_offset )
                    {
                        k+=natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
        }

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;
        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;

        /* Set all the prologue mask lengths to this value (even for i >= end) */
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }
    }

    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            nj = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi,2*(nj+2*SIMD_WIDTH));
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i,natoms);

            /* Include interactions i+1 <= j <= i+maxoffset */
            for(k=0;k<nj;k++)
            {
                j = imin + k;

                if( (j>i) && (j<=i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0xFFFFFFFF;
                    aadata->prologue_mask_gb[i][2*k+1] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][2*k]   = 0;
                    aadata->prologue_mask_gb[i][2*k+1] = 0;
                }
            }

            /* Clear out the explicit exclusions */
            if(i<end)
            {
                if(!bInclude12)
                {
                    for(j=0;j<ilist[F_GB12].nr;j+=3)
                    {
                        a1 = ilist[F_GB12].iatoms[j+1];
                        a2 = ilist[F_GB12].iatoms[j+2];

                        if(a1==i)
                        {
                            k = a2;
                        }
                        else if(a2==i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if(k>i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if( k+natoms <= max_offset )
                        {
                            k+=natoms;
                        }

                        k = k+i-imin;
                        if(k>=0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
                if(!bInclude13)
                {
                    for(j=0;j<ilist[F_GB13].nr;j+=3)
                    {
                        a1 = ilist[F_GB13].iatoms[j+1];
                        a2 = ilist[F_GB13].iatoms[j+2];

                        if(a1==i)
                        {
                            k = a2;
                        }
                        else if(a2==i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if(k>i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if( k+natoms <= max_offset )
                        {
                            k+=natoms;
                        }

                        k = k+i-imin;
                        if(k>=0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
                if(!bInclude14)
                {
                    for(j=0;j<ilist[F_GB14].nr;j+=3)
                    {
                        a1 = ilist[F_GB14].iatoms[j+1];
                        a2 = ilist[F_GB14].iatoms[j+2];

                        if(a1==i)
                        {
                            k = a2;
                        }
                        else if(a2==i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if(k>i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if( k+natoms <= max_offset )
                        {
                            k+=natoms;
                        }

                        k = k+i-imin;
                        if(k>=0)
                        {
                            aadata->prologue_mask_gb[i][2*k]   = 0;
                            aadata->prologue_mask_gb[i][2*k+1] = 0;
                        }
                    }
                }
            }
        }
    }

    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask,natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for(i=0;i<natoms+UNROLLI;i++)
    {
        aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i] = NULL;
    }

    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin = ibase;
        max_offset = calc_maxoffset(imin,natoms);

        imin = imin + 1 + max_offset;

        /* Find the largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax,natoms);
        imax = imax + 1 + max_offset + UNROLLJ - 1;

        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    }

    /* And fill it with data... */

    for(ibase=ni0;ibase<ni1;ibase+=UNROLLI)
    {
        for(i=ibase;i<ibase+UNROLLI;i++)
        {
            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi,2*(nj+2*SIMD_WIDTH));
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i,natoms);

            for(k=0;k<nj;k++)
            {
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][2*k]   = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
                aadata->epilogue_mask[i][2*k+1] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}


static void
genborn_allvsall_setup(gmx_allvsallgb2_data_t **  p_aadata,
                       gmx_localtop_t *           top,
                       gmx_genborn_t *            born,
                       t_mdatoms *                mdatoms,
                       double                     radius_offset,
                       int                        gb_algorithm,
                       gmx_bool                   bInclude12,
                       gmx_bool                   bInclude13,
                       gmx_bool                   bInclude14)
{
    int i,j,idx;
    int natoms;
    gmx_allvsallgb2_data_t *aadata;
    double *p;

    natoms = mdatoms->nr;

    snew(aadata,1);
    *p_aadata = aadata;

    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p,2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p,2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (double *) (((size_t) p + 16) & (~((size_t) 15)));

    for(i=0;i<mdatoms->nr;i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if(gb_algorithm==egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if(gb_algorithm==egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i]      = 0.0;
    }
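    /* Descriptive note: replicate the per-atom parameters into the upper
     * half of the arrays so that wrapped j indices (j >= natoms) used by the
     * all-vs-all loops can be loaded directly, without a modulo operation.
     */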
    for(i=0;i<mdatoms->nr;i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }

    for(i=0;i<2*natoms+SIMD_WIDTH;i++)
    {
        aadata->x_align[i] = 0.0;
        aadata->y_align[i] = 0.0;
        aadata->z_align[i] = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    setup_gb_exclusions_and_indices(aadata,top->idef.il,mdatoms->start,mdatoms->start+mdatoms->homenr,mdatoms->nr,
                                    bInclude12,bInclude13,bInclude14);
}


/*
 * This routine apparently hits a compiler bug that Visual Studio has had
 * 'forever'. It is present in both VS2005 and VS2008, and the only way
 * around it is to decrease optimization. We do that with a pragma, and only
 * for MSVC, so it will not hurt any of the well-behaved and supported
 * compilers out there.
 * MS: Fix your compiler, it sucks like a black hole!
 */
#ifdef _MSC_VER
#pragma optimize("t",off)
#endif

int
genborn_allvsall_calc_still_radii_sse2_double(t_forcerec *           fr,
                                              t_mdatoms *            mdatoms,
                                              gmx_genborn_t *        born,
                                              gmx_localtop_t *       top,
                                              double *               x,
                                              t_commrec *            cr,
                                              void *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int        natoms;
    int        ni0,ni1;
    int        nj0,nj1,nj2,nj3;
    int        i,j,k,n;
    int *      mask;
    int *      pmask0;
    int *      pmask1;
    int *      emask0;
    int *      emask1;
    double     ix,iy,iz;
    double     jx,jy,jz;
    double     dx,dy,dz;
    double     rsq,rinv;
    double     gpi,rai,vai;
    double     prod_ai;
    double     irsq,idr4,idr6;
    double     raj,rvdw,ratio;
    double     vaj,ccf,dccf,theta,cosq;
    double     term,prod,icf4,icf6,gpi2,factor,sinq;
    double *   gb_radius;
    double *   vsolv;
    double *   work;
    double     tmpsum[2];
    double *   x_align;
    double *   y_align;
    double *   z_align;
    int *      jindex;
    double *   dadx;

    __m128d    ix_SSE0,iy_SSE0,iz_SSE0;
    __m128d    ix_SSE1,iy_SSE1,iz_SSE1;
    __m128d    gpi_SSE0,rai_SSE0,prod_ai_SSE0;
    __m128d    gpi_SSE1,rai_SSE1,prod_ai_SSE1;
    __m128d    imask_SSE0,jmask_SSE0;
    __m128d    imask_SSE1,jmask_SSE1;
    __m128d    jx_SSE,jy_SSE,jz_SSE;
    __m128d    dx_SSE0,dy_SSE0,dz_SSE0;
    __m128d    dx_SSE1,dy_SSE1,dz_SSE1;
    __m128d    rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
    __m128d    rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
    __m128d    raj_SSE,vaj_SSE,prod_SSE;
    __m128d    rvdw_SSE0,ratio_SSE0;
    __m128d    rvdw_SSE1,ratio_SSE1;
    __m128d    theta_SSE0,sinq_SSE0,cosq_SSE0,term_SSE0;
    __m128d    theta_SSE1,sinq_SSE1,cosq_SSE1,term_SSE1;
    __m128d    ccf_SSE0,dccf_SSE0;
    __m128d    ccf_SSE1,dccf_SSE1;
    __m128d    icf4_SSE0,icf6_SSE0;
    __m128d    icf4_SSE1,icf6_SSE1;
    __m128d    half_SSE,one_SSE,two_SSE,four_SSE;
    __m128d    still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;

    natoms              = mdatoms->nr;
    ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1                 = mdatoms->start+mdatoms->homenr;

    n = 0;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if(aadata==NULL)
    {
        genborn_allvsall_setup(&aadata,top,born,mdatoms,0.0,
                               egbSTILL,FALSE,FALSE,TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    still_p4_SSE    = _mm_set1_pd(STILL_P4);
    still_p5inv_SSE = _mm_set1_pd(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_pd(STILL_PIP5);
    half_SSE        = _mm_set1_pd(0.5);
    one_SSE         = _mm_set1_pd(1.0);
    two_SSE         = _mm_set1_pd(2.0);
    four_SSE        = _mm_set1_pd(4.0);

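    /* Descriptive note on the Still pairwise term computed in the loops
     * below: with ratio = r^2/(rai+raj)^2 capped at 1/P5 and
     * theta = ratio*STILL_PIP5 (pi times the Still P5 parameter), the
     * close-contact function is ccf = (0.5*(1-cos(theta)))^2, and its
     * derivative factor is dccf = 2*term*sin(theta)*theta, both evaluated
     * via gmx_mm_sincos_pd().
     */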
    /* This will be summed, so it has to extend to natoms + buffer */
    for(i=0;i<natoms+1+natoms/2;i++)
    {
        work[i] = 0;
    }

    for(i=ni0;i<ni1+1+natoms/2;i++)
    {
        k = i%natoms;
        x_align[i]  = x[3*k];
        y_align[i]  = x[3*k+1];
        z_align[i]  = x[3*k+2];
        work[i]     = 0;
    }

    for(i=ni0; i<ni1; i+=UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */
        /* Load i atom data */
        ix_SSE0          = _mm_load1_pd(x_align+i);
        iy_SSE0          = _mm_load1_pd(y_align+i);
        iz_SSE0          = _mm_load1_pd(z_align+i);
        ix_SSE1          = _mm_load1_pd(x_align+i+1);
        iy_SSE1          = _mm_load1_pd(y_align+i+1);
        iz_SSE1          = _mm_load1_pd(z_align+i+1);

        gpi_SSE0         = _mm_setzero_pd();
        gpi_SSE1         = _mm_setzero_pd();

        rai_SSE0         = _mm_load1_pd(gb_radius+i);
        rai_SSE1         = _mm_load1_pd(gb_radius+i+1);

        prod_ai_SSE0     = _mm_set1_pd(STILL_P4*vsolv[i]);
        prod_ai_SSE1     = _mm_set1_pd(STILL_P4*vsolv[i+1]);

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj1              = jindex[4*i+1];
        nj2              = jindex[4*i+2];
        nj3              = jindex[4*i+3];

        pmask0           = aadata->prologue_mask_gb[i];
        pmask1           = aadata->prologue_mask_gb[i+1];
        emask0           = aadata->epilogue_mask[i];
        emask1           = aadata->epilogue_mask[i+1];

        imask_SSE0       = _mm_load1_pd((double *)(aadata->imask+2*i));
        imask_SSE1       = _mm_load1_pd((double *)(aadata->imask+2*i+2));

        /* Prologue part, including exclusion mask */
        for(j=nj0; j<nj1; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)pmask0);
            jmask_SSE1 = _mm_load_pd((double *)pmask1);
            pmask0 += 2*UNROLLJ;
            pmask1 += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE            = _mm_load_pd(x_align+j);
            jy_SSE            = _mm_load_pd(y_align+j);
            jz_SSE            = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_pd(ix_SSE0,jx_SSE);
            dy_SSE0            = _mm_sub_pd(iy_SSE0,jy_SSE);
            dz_SSE0            = _mm_sub_pd(iz_SSE0,jz_SSE);
            dx_SSE1            = _mm_sub_pd(ix_SSE1,jx_SSE);
            dy_SSE1            = _mm_sub_pd(iy_SSE1,jy_SSE);
            dz_SSE1            = _mm_sub_pd(iz_SSE1,jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);

            /* Combine masks */
            jmask_SSE0         = _mm_and_pd(jmask_SSE0,imask_SSE0);
            jmask_SSE1         = _mm_and_pd(jmask_SSE1,imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0          = _mm_and_pd(rinv_SSE0,jmask_SSE0);
            rinv_SSE1          = _mm_and_pd(rinv_SSE1,jmask_SSE1);

            irsq_SSE0          = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
            irsq_SSE1          = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
            idr4_SSE0          = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
            idr4_SSE1          = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
            idr6_SSE0          = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
            idr6_SSE1          = _mm_mul_pd(idr4_SSE1,irsq_SSE1);

            raj_SSE            = _mm_load_pd(gb_radius+j);
            vaj_SSE            = _mm_load_pd(vsolv+j);

            rvdw_SSE0          = _mm_add_pd(rai_SSE0,raj_SSE);
            rvdw_SSE1          = _mm_add_pd(rai_SSE1,raj_SSE);

            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));

            ratio_SSE0         = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
            ratio_SSE1         = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
            theta_SSE0         = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
            theta_SSE1         = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
            term_SSE0          = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
            term_SSE1          = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
            ccf_SSE0           = _mm_mul_pd(term_SSE0,term_SSE0);
            ccf_SSE1           = _mm_mul_pd(term_SSE1,term_SSE1);
            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
                                            _mm_mul_pd(sinq_SSE0,theta_SSE0));
            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
                                            _mm_mul_pd(sinq_SSE1,theta_SSE1));

            prod_SSE           = _mm_mul_pd(still_p4_SSE,vaj_SSE);
            icf4_SSE0          = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
            icf4_SSE1          = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));

            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));
            dadx+=2;

            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
            dadx+=2;
        }

        /* Main part, no exclusions */
        for(j=nj1; j<nj2; j+=UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE            = _mm_load_pd(x_align+j);
            jy_SSE            = _mm_load_pd(y_align+j);
            jz_SSE            = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_pd(ix_SSE0,jx_SSE);
            dy_SSE0            = _mm_sub_pd(iy_SSE0,jy_SSE);
            dz_SSE0            = _mm_sub_pd(iz_SSE0,jz_SSE);
            dx_SSE1            = _mm_sub_pd(ix_SSE1,jx_SSE);
            dy_SSE1            = _mm_sub_pd(iy_SSE1,jy_SSE);
            dz_SSE1            = _mm_sub_pd(iz_SSE1,jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0          = _mm_and_pd(rinv_SSE0,imask_SSE0);
            rinv_SSE1          = _mm_and_pd(rinv_SSE1,imask_SSE1);

            irsq_SSE0          = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
            irsq_SSE1          = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
            idr4_SSE0          = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
            idr4_SSE1          = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
            idr6_SSE0          = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
            idr6_SSE1          = _mm_mul_pd(idr4_SSE1,irsq_SSE1);

            raj_SSE            = _mm_load_pd(gb_radius+j);

            rvdw_SSE0          = _mm_add_pd(rai_SSE0,raj_SSE);
            rvdw_SSE1          = _mm_add_pd(rai_SSE1,raj_SSE);
            vaj_SSE            = _mm_load_pd(vsolv+j);

            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));

            ratio_SSE0         = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
            ratio_SSE1         = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
            theta_SSE0         = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
            theta_SSE1         = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
            term_SSE0          = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
            term_SSE1          = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
            ccf_SSE0           = _mm_mul_pd(term_SSE0,term_SSE0);
            ccf_SSE1           = _mm_mul_pd(term_SSE1,term_SSE1);
            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
                                            _mm_mul_pd(sinq_SSE0,theta_SSE0));
            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
                                            _mm_mul_pd(sinq_SSE1,theta_SSE1));

            prod_SSE           = _mm_mul_pd(still_p4_SSE,vaj_SSE);
            icf4_SSE0          = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
            icf4_SSE1          = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));

            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));
            dadx+=2;

            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
            dadx+=2;
        }
        /* Epilogue part, including exclusion mask */
        for(j=nj2; j<nj3; j+=UNROLLJ)
        {
            jmask_SSE0 = _mm_load_pd((double *)emask0);
            jmask_SSE1 = _mm_load_pd((double *)emask1);
            emask0 += 2*UNROLLJ;
            emask1 += 2*UNROLLJ;

            /* load j atom coordinates */
            jx_SSE            = _mm_load_pd(x_align+j);
            jy_SSE            = _mm_load_pd(y_align+j);
            jz_SSE            = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_pd(ix_SSE0,jx_SSE);
            dy_SSE0            = _mm_sub_pd(iy_SSE0,jy_SSE);
            dz_SSE0            = _mm_sub_pd(iz_SSE0,jz_SSE);
            dx_SSE1            = _mm_sub_pd(ix_SSE1,jx_SSE);
            dy_SSE1            = _mm_sub_pd(iy_SSE1,jy_SSE);
            dz_SSE1            = _mm_sub_pd(iz_SSE1,jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);

            /* Combine masks */
            jmask_SSE0         = _mm_and_pd(jmask_SSE0,imask_SSE0);
            jmask_SSE1         = _mm_and_pd(jmask_SSE1,imask_SSE1);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);

            /* Apply mask */
            rinv_SSE0          = _mm_and_pd(rinv_SSE0,jmask_SSE0);
            rinv_SSE1          = _mm_and_pd(rinv_SSE1,jmask_SSE1);

            irsq_SSE0          = _mm_mul_pd(rinv_SSE0,rinv_SSE0);
            irsq_SSE1          = _mm_mul_pd(rinv_SSE1,rinv_SSE1);
            idr4_SSE0          = _mm_mul_pd(irsq_SSE0,irsq_SSE0);
            idr4_SSE1          = _mm_mul_pd(irsq_SSE1,irsq_SSE1);
            idr6_SSE0          = _mm_mul_pd(idr4_SSE0,irsq_SSE0);
            idr6_SSE1          = _mm_mul_pd(idr4_SSE1,irsq_SSE1);

            raj_SSE            = _mm_load_pd(gb_radius+j);
            vaj_SSE            = _mm_load_pd(vsolv+j);

            rvdw_SSE0          = _mm_add_pd(rai_SSE0,raj_SSE);
            rvdw_SSE1          = _mm_add_pd(rai_SSE1,raj_SSE);

            ratio_SSE0         = _mm_mul_pd(rsq_SSE0, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE0,rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_pd(rsq_SSE1, gmx_mm_inv_pd( _mm_mul_pd(rvdw_SSE1,rvdw_SSE1)));

            ratio_SSE0         = _mm_min_pd(ratio_SSE0,still_p5inv_SSE);
            ratio_SSE1         = _mm_min_pd(ratio_SSE1,still_p5inv_SSE);
            theta_SSE0         = _mm_mul_pd(ratio_SSE0,still_pip5_SSE);
            theta_SSE1         = _mm_mul_pd(ratio_SSE1,still_pip5_SSE);
            gmx_mm_sincos_pd(theta_SSE0,&sinq_SSE0,&cosq_SSE0);
            gmx_mm_sincos_pd(theta_SSE1,&sinq_SSE1,&cosq_SSE1);
            term_SSE0          = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE0));
            term_SSE1          = _mm_mul_pd(half_SSE,_mm_sub_pd(one_SSE,cosq_SSE1));
            ccf_SSE0           = _mm_mul_pd(term_SSE0,term_SSE0);
            ccf_SSE1           = _mm_mul_pd(term_SSE1,term_SSE1);
            dccf_SSE0          = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE0),
                                            _mm_mul_pd(sinq_SSE0,theta_SSE0));
            dccf_SSE1          = _mm_mul_pd(_mm_mul_pd(two_SSE,term_SSE1),
                                            _mm_mul_pd(sinq_SSE1,theta_SSE1));

            prod_SSE           = _mm_mul_pd(still_p4_SSE,vaj_SSE);
            icf4_SSE0          = _mm_mul_pd(ccf_SSE0,idr4_SSE0);
            icf4_SSE1          = _mm_mul_pd(ccf_SSE1,idr4_SSE1);
            icf6_SSE0          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE0),dccf_SSE0), idr6_SSE0);
            icf6_SSE1          = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four_SSE,ccf_SSE1),dccf_SSE1), idr6_SSE1);

            _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
                                            _mm_add_pd(_mm_mul_pd(prod_ai_SSE0,icf4_SSE0),
                                                       _mm_mul_pd(prod_ai_SSE1,icf4_SSE1))));

            gpi_SSE0           = _mm_add_pd(gpi_SSE0, _mm_mul_pd(prod_SSE,icf4_SSE0));
            gpi_SSE1           = _mm_add_pd(gpi_SSE1, _mm_mul_pd(prod_SSE,icf4_SSE1));

            /* Save ai->aj and aj->ai chain rule terms */
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE0));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_SSE,icf6_SSE1));
            dadx+=2;

            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE0,icf6_SSE0));
            dadx+=2;
            _mm_store_pd(dadx,_mm_mul_pd(prod_ai_SSE1,icf6_SSE1));
            dadx+=2;
        }
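        /* Descriptive note: reduce the two per-i accumulators. After the 2x2
         * transpose, adding the two registers leaves {sum(gpi_SSE0),
         * sum(gpi_SSE1)}, i.e. the Born sums for atoms i and i+1, which are
         * accumulated into work[i] and work[i+1].
         */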
        GMX_MM_TRANSPOSE2_PD(gpi_SSE0,gpi_SSE1);
        gpi_SSE0 = _mm_add_pd(gpi_SSE0,gpi_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(gpi_SSE0, _mm_load_pd(work+i)));
    }

    /* In case we have written anything beyond natoms, move it back.
     * Never mind that we leave stuff above natoms; that will not
     * be accessed later in the routine.
     * In principle this should be a move rather than a sum, but this
     * way we don't have to worry about even/odd offsets...
     */
    for(i=natoms;i<ni1+1+natoms/2;i++)
    {
        work[i-natoms] += work[i];
    }

    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms,work,cr);
    }

    factor  = 0.5 * ONE_4PI_EPS0;
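    /* Descriptive note: gmx_invsqrt(gpi*gpi) evaluates 1/|gpi|, so the
     * assignment below is effectively bRad = factor/|gpi|, written this way
     * to reuse the fast inverse-square-root routine.
     */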
    /* Calculate the radii - should we do all atoms, or just our local ones? */
    for(i=0;i<natoms;i++)
    {
        if(born->use[i] != 0)
        {
            gpi  = born->gpol[i]+work[i];
            gpi2 = gpi * gpi;
            born->bRad[i]   = factor*gmx_invsqrt(gpi2);
            fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
        }
    }

    return 0;
}
/* Reinstate MSVC optimization */
#ifdef _MSC_VER
#pragma optimize("",on)
#endif
1199
1200
1201 int
1202 genborn_allvsall_calc_hct_obc_radii_sse2_double(t_forcerec *           fr,
1203                                                 t_mdatoms *            mdatoms,
1204                                                 gmx_genborn_t *        born,
1205                                                 int                    gb_algorithm,
1206                                                 gmx_localtop_t *       top,
1207                                                 double *                 x,
1208                                                 t_commrec *            cr,
1209                                                 void *                 paadata)
1210 {
1211         gmx_allvsallgb2_data_t *aadata;
1212         int        natoms;
1213         int        ni0,ni1;
1214         int        nj0,nj1,nj2,nj3;
1215         int        i,j,k,n;
1216     int *      mask;
1217     int *      pmask0;
1218     int *      pmask1;
1219     int *      emask0;
1220     int *      emask1;
1221     double *     gb_radius;
1222     double *     vsolv;
1223     double *     work;
1224     double       tmpsum[2];
1225     double *     x_align;
1226     double *     y_align;
1227     double *     z_align;
1228     int *      jindex;
1229     double *     dadx;
1230     double *     obc_param;
1231     double       rad,min_rad;
1232     double rai,rai_inv,rai_inv2,sum_ai,sum_ai2,sum_ai3,tsum,tchain;
1233
1234     __m128d    ix_SSE0,iy_SSE0,iz_SSE0;
1235     __m128d    ix_SSE1,iy_SSE1,iz_SSE1;
1236     __m128d    gpi_SSE0,rai_SSE0,prod_ai_SSE0;
1237     __m128d    gpi_SSE1,rai_SSE1,prod_ai_SSE1;
1238     __m128d    imask_SSE0,jmask_SSE0;
1239     __m128d    imask_SSE1,jmask_SSE1;
1240     __m128d    jx_SSE,jy_SSE,jz_SSE;
1241     __m128d    dx_SSE0,dy_SSE0,dz_SSE0;
1242     __m128d    dx_SSE1,dy_SSE1,dz_SSE1;
1243     __m128d    rsq_SSE0,rinv_SSE0,irsq_SSE0,idr4_SSE0,idr6_SSE0;
1244     __m128d    rsq_SSE1,rinv_SSE1,irsq_SSE1,idr4_SSE1,idr6_SSE1;
1245     __m128d    raj_SSE,raj_inv_SSE,sk_aj_SSE,sk2_aj_SSE;
1246     __m128d    ccf_SSE0,dccf_SSE0,prod_SSE0;
1247     __m128d    ccf_SSE1,dccf_SSE1,prod_SSE1;
1248     __m128d    icf4_SSE0,icf6_SSE0;
1249     __m128d    icf4_SSE1,icf6_SSE1;
1250     __m128d    oneeighth_SSE,onefourth_SSE,half_SSE,one_SSE,two_SSE,four_SSE;
1251     __m128d    still_p4_SSE,still_p5inv_SSE,still_pip5_SSE;
1252     __m128d    rai_inv_SSE0;
1253     __m128d    rai_inv_SSE1;
1254     __m128d    sk_ai_SSE0,sk2_ai_SSE0,sum_ai_SSE0;
1255     __m128d    sk_ai_SSE1,sk2_ai_SSE1,sum_ai_SSE1;
1256     __m128d    lij_inv_SSE0,sk2_rinv_SSE0;
1257     __m128d    lij_inv_SSE1,sk2_rinv_SSE1;
1258     __m128d    dr_SSE0;
1259     __m128d    dr_SSE1;
1260     __m128d    t1_SSE0,t2_SSE0,t3_SSE0,t4_SSE0;
1261     __m128d    t1_SSE1,t2_SSE1,t3_SSE1,t4_SSE1;
1262     __m128d    obc_mask1_SSE0,obc_mask2_SSE0,obc_mask3_SSE0;
1263     __m128d    obc_mask1_SSE1,obc_mask2_SSE1,obc_mask3_SSE1;
1264     __m128d    uij_SSE0,uij2_SSE0,uij3_SSE0;
1265     __m128d    uij_SSE1,uij2_SSE1,uij3_SSE1;
1266     __m128d    lij_SSE0,lij2_SSE0,lij3_SSE0;
1267     __m128d    lij_SSE1,lij2_SSE1,lij3_SSE1;
1268     __m128d    dlij_SSE0,diff2_SSE0,logterm_SSE0;
1269     __m128d    dlij_SSE1,diff2_SSE1,logterm_SSE1;
1270     __m128d    doffset_SSE,tmpSSE;
1271     
1272     natoms              = mdatoms->nr;
1273         ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
1274         ni1                 = mdatoms->start+mdatoms->homenr;
1275     
1276     n = 0;
1277     
1278     aadata = *((gmx_allvsallgb2_data_t **)paadata);
1279     
1280     
1281         if(aadata==NULL)
1282         {
1283                 genborn_allvsall_setup(&aadata,top,born,mdatoms,born->gb_doffset,
1284                                egbOBC,TRUE,TRUE,TRUE);
1285         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
1286         }
1287     
1288     x_align = aadata->x_align;
1289         y_align = aadata->y_align;
1290         z_align = aadata->z_align;
1291     
1292     gb_radius = aadata->gb_radius;
1293     work      = aadata->work;
1294     jindex    = aadata->jindex_gb;
1295     dadx      = fr->dadx;
1296     obc_param = aadata->workparam;
1297     
1298     oneeighth_SSE   = _mm_set1_pd(0.125);
1299     onefourth_SSE   = _mm_set1_pd(0.25);
1300     half_SSE        = _mm_set1_pd(0.5);
1301     one_SSE         = _mm_set1_pd(1.0);
1302     two_SSE         = _mm_set1_pd(2.0);
1303     four_SSE        = _mm_set1_pd(4.0);
1304     doffset_SSE     = _mm_set1_pd(born->gb_doffset);
1305     
1306     for(i=0;i<natoms;i++)
1307         {
1308                 x_align[i]  = x[3*i];
1309                 y_align[i]  = x[3*i+1];
1310                 z_align[i]  = x[3*i+2];
1311         }
1312     
1313     /* Copy again */
1314         for(i=0;i<natoms/2+1;i++)
1315         {
1316                 x_align[natoms+i]  = x_align[i];
1317                 y_align[natoms+i]  = y_align[i];
1318                 z_align[natoms+i]  = z_align[i];
1319     }        
1320     
1321     for(i=0;i<natoms+natoms/2+1;i++)
1322     {
1323         work[i] = 0;
1324     }
1325         
1326         for(i=ni0; i<ni1; i+= UNROLLI)
1327         {
1328                 /* We assume shifts are NOT used for all-vs-all interactions */
1329         
1330                 /* Load i atom data */
1331                 ix_SSE0          = _mm_load1_pd(x_align+i);
1332                 iy_SSE0          = _mm_load1_pd(y_align+i);
1333                 iz_SSE0          = _mm_load1_pd(z_align+i);
1334                 ix_SSE1          = _mm_load1_pd(x_align+i+1);
1335                 iy_SSE1          = _mm_load1_pd(y_align+i+1);
1336                 iz_SSE1          = _mm_load1_pd(z_align+i+1);
1337         
1338         rai_SSE0         = _mm_load1_pd(gb_radius+i);
1339         rai_SSE1         = _mm_load1_pd(gb_radius+i+1);
1340         rai_inv_SSE0     = gmx_mm_inv_pd(rai_SSE0);
1341         rai_inv_SSE1     = gmx_mm_inv_pd(rai_SSE1);
1342         
1343         sk_ai_SSE0       = _mm_load1_pd(obc_param+i);
1344         sk_ai_SSE1       = _mm_load1_pd(obc_param+i+1);
1345         sk2_ai_SSE0      = _mm_mul_pd(sk_ai_SSE0,sk_ai_SSE0);
1346         sk2_ai_SSE1      = _mm_mul_pd(sk_ai_SSE1,sk_ai_SSE1);
1347         
1348         sum_ai_SSE0      = _mm_setzero_pd();
1349         sum_ai_SSE1      = _mm_setzero_pd();
1350         
1351                 /* Load limits for loop over neighbors */
1352                 nj0              = jindex[4*i];
1353                 nj1              = jindex[4*i+1];
1354                 nj2              = jindex[4*i+2];
1355                 nj3              = jindex[4*i+3];
1356         
1357         pmask0           = aadata->prologue_mask_gb[i];
1358         pmask1           = aadata->prologue_mask_gb[i+1];
1359         emask0           = aadata->epilogue_mask[i]; 
1360         emask1           = aadata->epilogue_mask[i+1]; 
1361         
1362         imask_SSE0        = _mm_load1_pd((double *)(aadata->imask+2*i));
1363         imask_SSE1        = _mm_load1_pd((double *)(aadata->imask+2*i+2));
1364         
1365         /* Prologue part, including exclusion mask */
1366         for(j=nj0; j<nj1; j+=UNROLLJ)
1367         {          
1368             jmask_SSE0 = _mm_load_pd((double *)pmask0);
1369             jmask_SSE1 = _mm_load_pd((double *)pmask1);
1370             pmask0 += 2*UNROLLJ;
1371             pmask1 += 2*UNROLLJ;
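            /* The prologue masks encode the topology exclusions (and the
             * diagonal handling) for the first stretch of j atoms; they are
             * combined with the i-atom mask below.
             */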
1372             
1373             /* load j atom coordinates */
1374             jx_SSE            = _mm_load_pd(x_align+j);
1375             jy_SSE            = _mm_load_pd(y_align+j);
1376             jz_SSE            = _mm_load_pd(z_align+j);
1377             
1378             /* Calculate distance */
1379             dx_SSE0            = _mm_sub_pd(ix_SSE0,jx_SSE);
1380             dy_SSE0            = _mm_sub_pd(iy_SSE0,jy_SSE);
1381             dz_SSE0            = _mm_sub_pd(iz_SSE0,jz_SSE);
1382             dx_SSE1            = _mm_sub_pd(ix_SSE1,jx_SSE);
1383             dy_SSE1            = _mm_sub_pd(iy_SSE1,jy_SSE);
1384             dz_SSE1            = _mm_sub_pd(iz_SSE1,jz_SSE);
1385             
1386             /* rsq = dx*dx+dy*dy+dz*dz */
1387             rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
1388             rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
1389             
1390             /* Combine masks */
1391             jmask_SSE0         = _mm_and_pd(jmask_SSE0,imask_SSE0);
1392             jmask_SSE1         = _mm_and_pd(jmask_SSE1,imask_SSE1);
1393             
1394             /* Calculate 1/r and 1/r2 */
1395             rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
1396             rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
1397             
1398             /* Apply mask */
1399             rinv_SSE0          = _mm_and_pd(rinv_SSE0,jmask_SSE0);
1400             rinv_SSE1          = _mm_and_pd(rinv_SSE1,jmask_SSE1);
1401             
1402             dr_SSE0            = _mm_mul_pd(rsq_SSE0,rinv_SSE0);
1403             dr_SSE1            = _mm_mul_pd(rsq_SSE1,rinv_SSE1);
1404
1405             sk_aj_SSE          = _mm_load_pd(obc_param+j);
1406             raj_SSE            = _mm_load_pd(gb_radius+j);
1407             raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
1408             
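            /* Pairwise descreening integral (HCT/OBC). With L and U the lower
             * and upper integration limits, lij = 1/L and uij = 1/U, each pair
             * contributes
             *   0.5*( 1/L - 1/U + r/4*(1/U^2 - 1/L^2)
             *         + 1/(2r)*log(L/U) + sk^2/(4r)*(1/L^2 - 1/U^2) )
             * to the Born sum. The obc_mask* comparisons select the piecewise
             * branches of the integral: no contribution, partial overlap, or
             * the atom engulfed by the descreening sphere (the t4 correction).
             */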
1409             /* Evaluate influence of atom aj -> ai */
1410             t1_SSE0            = _mm_add_pd(dr_SSE0,sk_aj_SSE);
1411             t1_SSE1            = _mm_add_pd(dr_SSE1,sk_aj_SSE);
1412             t2_SSE0            = _mm_sub_pd(dr_SSE0,sk_aj_SSE);
1413             t2_SSE1            = _mm_sub_pd(dr_SSE1,sk_aj_SSE);
1414             t3_SSE0            = _mm_sub_pd(sk_aj_SSE,dr_SSE0);
1415             t3_SSE1            = _mm_sub_pd(sk_aj_SSE,dr_SSE1);
1416
1417             obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1418             obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1419             obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1420             obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1421             obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1422             obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1423             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
1424             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
1425
1426             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1427             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1428             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1429                                            _mm_andnot_pd(obc_mask2_SSE0,rai_inv_SSE0));
1430             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1431                                            _mm_andnot_pd(obc_mask2_SSE1,rai_inv_SSE1));
1432             dlij_SSE0          = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1433             dlij_SSE1          = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1434             
1435             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1436             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1437             uij3_SSE0          = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1438             uij3_SSE1          = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1439             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1440             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1441             lij3_SSE0          = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1442             lij3_SSE1          = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1443             
1444             diff2_SSE0         = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1445             diff2_SSE1         = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1446             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1447             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1448             sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE,sk_aj_SSE);
1449             sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE,rinv_SSE0);
1450             sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE,rinv_SSE1);
1451             prod_SSE0          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1452             prod_SSE1          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1453             
1454             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1455             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1456
1457             t1_SSE0            = _mm_sub_pd(lij_SSE0,uij_SSE0);
1458             t1_SSE1            = _mm_sub_pd(lij_SSE1,uij_SSE1);
1459             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1460                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1461                                                        prod_SSE0));
1462             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1463                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1464                                                        prod_SSE1));
1465             
1466             t3_SSE0            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1467             t3_SSE1            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1468             t1_SSE0            = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1469             t1_SSE1            = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1470             t4_SSE0            = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE0,lij_SSE0));
1471             t4_SSE1            = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE1,lij_SSE1));
1472             t4_SSE0            = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1473             t4_SSE1            = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1474             t1_SSE0            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1475             t1_SSE1            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1476             
1477             sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1478             sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1479             
1480             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1481                                             _mm_mul_pd(prod_SSE0,lij3_SSE0));
1482             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1483                                             _mm_mul_pd(prod_SSE1,lij3_SSE1));
1484             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1485                                             _mm_mul_pd(onefourth_SSE,
1486                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1487                                                                   _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1488             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1489                                             _mm_mul_pd(onefourth_SSE,
1490                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1491                                                                   _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1492
1493             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1494                                             _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1495                                                        _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1496             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1497                                             _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1498                                                        _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1499             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1500                                             _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1501                                                        _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1502             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1503                                             _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1504                                                        _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1505             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1506                                             _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1507             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1508                                             _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1509             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1510                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1511                                                        _mm_add_pd(one_SSE,
1512                                                                   _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1513             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1514                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1515                                                        _mm_add_pd(one_SSE,
1516                                                                   _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1517             
1518             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1519                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1520                                                        _mm_add_pd(t2_SSE0,t3_SSE0)));
1521             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1522                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1523                                                        _mm_add_pd(t2_SSE1,t3_SSE1)));
1524             
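            /* Store d(sum)/dr for this pair; these terms are used later in
             * the chain rule when forces are evaluated from the Born radii.
             */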
1525             _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1526             dadx += 2;
1527             _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1528             dadx += 2;
1529             
1530             /* Evaluate influence of atom ai -> aj */
1531             t1_SSE0            = _mm_add_pd(dr_SSE0,sk_ai_SSE0);
1532             t1_SSE1            = _mm_add_pd(dr_SSE1,sk_ai_SSE1);
1533             t2_SSE0            = _mm_sub_pd(dr_SSE0,sk_ai_SSE0);
1534             t2_SSE1            = _mm_sub_pd(dr_SSE1,sk_ai_SSE1);
1535             t3_SSE0            = _mm_sub_pd(sk_ai_SSE0,dr_SSE0);
1536             t3_SSE1            = _mm_sub_pd(sk_ai_SSE1,dr_SSE1);
1537             
1538             obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1539             obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1540             obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1541             obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1542             obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1543             obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1544             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
1545             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
1546             
1547             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1548             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1549             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1550                                            _mm_andnot_pd(obc_mask2_SSE0,raj_inv_SSE));
1551             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1552                                            _mm_andnot_pd(obc_mask2_SSE1,raj_inv_SSE));
1553             dlij_SSE0          = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1554             dlij_SSE1          = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1555             
1556             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1557             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1558             uij3_SSE0          = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1559             uij3_SSE1          = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1560             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1561             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1562             lij3_SSE0          = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1563             lij3_SSE1          = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1564             
1565             diff2_SSE0         = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1566             diff2_SSE1         = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1567             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1568             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1569             sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0,rinv_SSE0);
1570             sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1,rinv_SSE1);
1571             prod_SSE0          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1572             prod_SSE1          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1573             
1574             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1575             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1576             t1_SSE0            = _mm_sub_pd(lij_SSE0,uij_SSE0);
1577             t1_SSE1            = _mm_sub_pd(lij_SSE1,uij_SSE1);
1578             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1579                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1580                                                        prod_SSE0));
1581             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1582                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1583                                                        prod_SSE1));
1584             t3_SSE0            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1585             t3_SSE1            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1586             t1_SSE0            = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1587             t1_SSE1            = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1588             t4_SSE0            = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE0));
1589             t4_SSE1            = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE1));
1590             t4_SSE0            = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1591             t4_SSE1            = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1592             t1_SSE0            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1593             t1_SSE1            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1594             
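            /* Accumulate the symmetric ai -> aj contributions for both SIMD
             * lanes directly into the work array at the j positions.
             */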
1595             _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1596                                             _mm_add_pd(_mm_and_pd(t1_SSE0,obc_mask1_SSE0),
1597                                                        _mm_and_pd(t1_SSE1,obc_mask1_SSE1))));
1598             
1599             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1600                                             _mm_mul_pd(prod_SSE0,lij3_SSE0));
1601             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1602                                             _mm_mul_pd(prod_SSE1,lij3_SSE1));
1603             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1604                                             _mm_mul_pd(onefourth_SSE,
1605                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1606                                                                   _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1607             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1608                                             _mm_mul_pd(onefourth_SSE,
1609                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1610                                                                   _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1611             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1612                                             _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1613                                                        _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1614             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1615                                             _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1616                                                        _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1617             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1618                                             _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1619                                                        _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1620             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1621                                             _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1622                                                        _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1623             
1624             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1625                                             _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1626             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1627                                             _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1628             
1629             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1630                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1631                                                        _mm_add_pd(one_SSE,
1632                                                                   _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1633             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1634                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1635                                                        _mm_add_pd(one_SSE,
1636                                                                   _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1637             
1638             
1639             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1640                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1641                                                        _mm_add_pd(t2_SSE0,t3_SSE0)));
1642             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1643                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1644                                                        _mm_add_pd(t2_SSE1,t3_SSE1)));
1645
1646             _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1647             dadx += 2;
1648             _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1649             dadx += 2;            
1650         }
1651         
1652         /* Main part, no exclusions */
1653         for(j=nj1; j<nj2; j+=UNROLLJ)
1654         {                      
1655             /* load j atom coordinates */
1656             jx_SSE            = _mm_load_pd(x_align+j);
1657             jy_SSE            = _mm_load_pd(y_align+j);
1658             jz_SSE            = _mm_load_pd(z_align+j);
1659             
1660             /* Calculate distance */
1661             dx_SSE0            = _mm_sub_pd(ix_SSE0,jx_SSE);
1662             dy_SSE0            = _mm_sub_pd(iy_SSE0,jy_SSE);
1663             dz_SSE0            = _mm_sub_pd(iz_SSE0,jz_SSE);
1664             dx_SSE1            = _mm_sub_pd(ix_SSE1,jx_SSE);
1665             dy_SSE1            = _mm_sub_pd(iy_SSE1,jy_SSE);
1666             dz_SSE1            = _mm_sub_pd(iz_SSE1,jz_SSE);
1667             
1668             /* rsq = dx*dx+dy*dy+dz*dz */
1669             rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
1670             rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
1671             
1672             /* Calculate 1/r and 1/r2 */
1673             rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
1674             rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
1675             
1676             /* Apply mask */
1677             rinv_SSE0          = _mm_and_pd(rinv_SSE0,imask_SSE0);
1678             rinv_SSE1          = _mm_and_pd(rinv_SSE1,imask_SSE1);
1679             
1680             dr_SSE0            = _mm_mul_pd(rsq_SSE0,rinv_SSE0);
1681             dr_SSE1            = _mm_mul_pd(rsq_SSE1,rinv_SSE1);
1682             
1683             sk_aj_SSE          = _mm_load_pd(obc_param+j);
1684             raj_SSE            = _mm_load_pd(gb_radius+j);
1685
1686             raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
1687             
1688             /* Evaluate influence of atom aj -> ai */
1689             t1_SSE0            = _mm_add_pd(dr_SSE0,sk_aj_SSE);
1690             t1_SSE1            = _mm_add_pd(dr_SSE1,sk_aj_SSE);
1691             t2_SSE0            = _mm_sub_pd(dr_SSE0,sk_aj_SSE);
1692             t2_SSE1            = _mm_sub_pd(dr_SSE1,sk_aj_SSE);
1693             t3_SSE0            = _mm_sub_pd(sk_aj_SSE,dr_SSE0);
1694             t3_SSE1            = _mm_sub_pd(sk_aj_SSE,dr_SSE1);
1695             
1696             obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1697             obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1698             obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1699             obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1700             obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1701             obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1702             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0,imask_SSE0);
1703             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1,imask_SSE1);
1704             
1705             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1706             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1707             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1708                                            _mm_andnot_pd(obc_mask2_SSE0,rai_inv_SSE0));
1709             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1710                                            _mm_andnot_pd(obc_mask2_SSE1,rai_inv_SSE1));
1711             dlij_SSE0          = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1712             dlij_SSE1          = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1713             
1714             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1715             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1716             uij3_SSE0          = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1717             uij3_SSE1          = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1718             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1719             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1720             lij3_SSE0          = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1721             lij3_SSE1          = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1722             
1723             diff2_SSE0         = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1724             diff2_SSE1         = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1725             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1726             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1727             sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE,sk_aj_SSE);
1728             sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE,rinv_SSE0);
1729             sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE,rinv_SSE1);
1730             prod_SSE0          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1731             prod_SSE1          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1732             
1733             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1734             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1735             
1736             t1_SSE0            = _mm_sub_pd(lij_SSE0,uij_SSE0);
1737             t1_SSE1            = _mm_sub_pd(lij_SSE1,uij_SSE1);
1738             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1739                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1740                                                        prod_SSE0));
1741             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1742                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1743                                                        prod_SSE1));
1744              
1745             t3_SSE0            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1746             t3_SSE1            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1747             t1_SSE0            = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1748             t1_SSE1            = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1749             t4_SSE0            = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE0,lij_SSE0));
1750             t4_SSE1            = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE1,lij_SSE1));
1751             t4_SSE0            = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1752             t4_SSE1            = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1753             t1_SSE0            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1754             t1_SSE1            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1755             
1756             sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1757             sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1758             
1759             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1760                                             _mm_mul_pd(prod_SSE0,lij3_SSE0));
1761             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1762                                             _mm_mul_pd(prod_SSE1,lij3_SSE1));
1763             
1764             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1765                                             _mm_mul_pd(onefourth_SSE,
1766                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1767                                                                   _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1768             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1769                                             _mm_mul_pd(onefourth_SSE,
1770                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1771                                                                   _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1772             
1773             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1774                                             _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1775                                                        _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1776             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1777                                             _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1778                                                        _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1779             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1780                                             _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1781                                                        _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1782             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1783                                             _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1784                                                        _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1785             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1786                                             _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1787             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1788                                             _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1789             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1790                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1791                                                        _mm_add_pd(one_SSE,
1792                                                                   _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1793             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1794                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1795                                                        _mm_add_pd(one_SSE,
1796                                                                   _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1797             
1798             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1799                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1800                                                        _mm_add_pd(t2_SSE0,t3_SSE0)));
1801             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1802                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1803                                                        _mm_add_pd(t2_SSE1,t3_SSE1)));
1804             
1805             _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1806             dadx += 2;
1807             _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1808             dadx += 2;
1809             
1810             /* Evaluate influence of atom ai -> aj */
1811             t1_SSE0            = _mm_add_pd(dr_SSE0,sk_ai_SSE0);
1812             t1_SSE1            = _mm_add_pd(dr_SSE1,sk_ai_SSE1);
1813             t2_SSE0            = _mm_sub_pd(dr_SSE0,sk_ai_SSE0);
1814             t2_SSE1            = _mm_sub_pd(dr_SSE1,sk_ai_SSE1);
1815             t3_SSE0            = _mm_sub_pd(sk_ai_SSE0,dr_SSE0);
1816             t3_SSE1            = _mm_sub_pd(sk_ai_SSE1,dr_SSE1);
1817             
1818             obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
1819             obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
1820             obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
1821             obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
1822             obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
1823             obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
1824             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0,imask_SSE0);
1825             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1,imask_SSE1);
1826             
1827             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1828             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1829             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1830                                            _mm_andnot_pd(obc_mask2_SSE0,raj_inv_SSE));
1831             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1832                                            _mm_andnot_pd(obc_mask2_SSE1,raj_inv_SSE));
1833             dlij_SSE0          = _mm_and_pd(one_SSE,obc_mask2_SSE0);
1834             dlij_SSE1          = _mm_and_pd(one_SSE,obc_mask2_SSE1);
1835             
1836             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
1837             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
1838             uij3_SSE0          = _mm_mul_pd(uij2_SSE0,uij_SSE0);
1839             uij3_SSE1          = _mm_mul_pd(uij2_SSE1,uij_SSE1);
1840             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
1841             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
1842             lij3_SSE0          = _mm_mul_pd(lij2_SSE0,lij_SSE0);
1843             lij3_SSE1          = _mm_mul_pd(lij2_SSE1,lij_SSE1);
1844             
1845             diff2_SSE0         = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
1846             diff2_SSE1         = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
1847             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
1848             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
1849             sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0,rinv_SSE0);
1850             sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1,rinv_SSE1);
1851             prod_SSE0          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
1852             prod_SSE1          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
1853             
1854             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
1855             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
1856             t1_SSE0            = _mm_sub_pd(lij_SSE0,uij_SSE0);
1857             t1_SSE1            = _mm_sub_pd(lij_SSE1,uij_SSE1);
1858             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
1859                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
1860                                                        prod_SSE0));
1861             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
1862                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
1863                                                        prod_SSE1));
1864             t3_SSE0            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
1865             t3_SSE1            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
1866             t1_SSE0            = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
1867             t1_SSE1            = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
1868             t4_SSE0            = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE0));
1869             t4_SSE1            = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE1));
1870             t4_SSE0            = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
1871             t4_SSE1            = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
1872             t1_SSE0            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
1873             t1_SSE1            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
1874              
1875             _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
1876                                             _mm_add_pd(_mm_and_pd(t1_SSE0,obc_mask1_SSE0),
1877                                                        _mm_and_pd(t1_SSE1,obc_mask1_SSE1))));
1878                         
1879             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
1880                                             _mm_mul_pd(prod_SSE0,lij3_SSE0));
1881             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
1882                                             _mm_mul_pd(prod_SSE1,lij3_SSE1));
1883             t1_SSE0            = _mm_sub_pd(t1_SSE0,
1884                                             _mm_mul_pd(onefourth_SSE,
1885                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
1886                                                                   _mm_mul_pd(lij3_SSE0,dr_SSE0))));
1887             t1_SSE1            = _mm_sub_pd(t1_SSE1,
1888                                             _mm_mul_pd(onefourth_SSE,
1889                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
1890                                                                   _mm_mul_pd(lij3_SSE1,dr_SSE1))));
1891             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
1892                                             _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
1893                                                        _mm_mul_pd(uij3_SSE0,dr_SSE0)));
1894             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
1895                                             _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
1896                                                        _mm_mul_pd(uij3_SSE1,dr_SSE1)));
1897             t2_SSE0            = _mm_sub_pd(t2_SSE0,
1898                                             _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
1899                                                        _mm_mul_pd(prod_SSE0,uij3_SSE0)));
1900             t2_SSE1            = _mm_sub_pd(t2_SSE1,
1901                                             _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
1902                                                        _mm_mul_pd(prod_SSE1,uij3_SSE1)));
1903             
1904             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
1905                                             _mm_mul_pd(rinv_SSE0,rinv_SSE0));
1906             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
1907                                             _mm_mul_pd(rinv_SSE1,rinv_SSE1));
1908             
1909             t3_SSE0            = _mm_sub_pd(t3_SSE0,
1910                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
1911                                                        _mm_add_pd(one_SSE,
1912                                                                   _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
1913             t3_SSE1            = _mm_sub_pd(t3_SSE1,
1914                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
1915                                                        _mm_add_pd(one_SSE,
1916                                                                   _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
1917             
1918             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
1919                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
1920                                                        _mm_add_pd(t2_SSE0,t3_SSE0)));
1921             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
1922                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
1923                                                        _mm_add_pd(t2_SSE1,t3_SSE1)));
1924             
1925             _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
1926             dadx += 2;
1927             _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
1928             dadx += 2;
1929         }
1930
1931         /* Epilogue part, including exclusion mask */
1932         for(j=nj2; j<nj3; j+=UNROLLJ)
1933         {                      
1934             jmask_SSE0 = _mm_load_pd((double *)emask0);
1935             jmask_SSE1 = _mm_load_pd((double *)emask1);
1936             emask0 += 2*UNROLLJ;
1937             emask1 += 2*UNROLLJ; 
1938             
1939             /* load j atom coordinates */
1940             jx_SSE            = _mm_load_pd(x_align+j);
1941             jy_SSE            = _mm_load_pd(y_align+j);
1942             jz_SSE            = _mm_load_pd(z_align+j);
1943             
1944             /* Calculate distance */
1945             dx_SSE0            = _mm_sub_pd(ix_SSE0,jx_SSE);
1946             dy_SSE0            = _mm_sub_pd(iy_SSE0,jy_SSE);
1947             dz_SSE0            = _mm_sub_pd(iz_SSE0,jz_SSE);
1948             dx_SSE1            = _mm_sub_pd(ix_SSE1,jx_SSE);
1949             dy_SSE1            = _mm_sub_pd(iy_SSE1,jy_SSE);
1950             dz_SSE1            = _mm_sub_pd(iz_SSE1,jz_SSE);
1951             
1952             /* rsq = dx*dx+dy*dy+dz*dz */
1953             rsq_SSE0           = gmx_mm_calc_rsq_pd(dx_SSE0,dy_SSE0,dz_SSE0);
1954             rsq_SSE1           = gmx_mm_calc_rsq_pd(dx_SSE1,dy_SSE1,dz_SSE1);
1955             
1956             /* Combine masks */
1957             jmask_SSE0         = _mm_and_pd(jmask_SSE0,imask_SSE0);
1958             jmask_SSE1         = _mm_and_pd(jmask_SSE1,imask_SSE1);
1959             
1960             /* Calculate 1/r and 1/r2 */
1961             rinv_SSE0          = gmx_mm_invsqrt_pd(rsq_SSE0);
1962             rinv_SSE1          = gmx_mm_invsqrt_pd(rsq_SSE1);
1963             
1964             /* Apply mask */
1965             rinv_SSE0          = _mm_and_pd(rinv_SSE0,jmask_SSE0);
1966             rinv_SSE1          = _mm_and_pd(rinv_SSE1,jmask_SSE1);
1967             
1968             dr_SSE0            = _mm_mul_pd(rsq_SSE0,rinv_SSE0);
1969             dr_SSE1            = _mm_mul_pd(rsq_SSE1,rinv_SSE1);
1970             
1971             sk_aj_SSE          = _mm_load_pd(obc_param+j);
1972             raj_SSE            = _mm_load_pd(gb_radius+j);
1973
1974             raj_inv_SSE        = gmx_mm_inv_pd(raj_SSE);
1975             
1976             /* Evaluate influence of atom aj -> ai */
1977             t1_SSE0            = _mm_add_pd(dr_SSE0,sk_aj_SSE);
1978             t1_SSE1            = _mm_add_pd(dr_SSE1,sk_aj_SSE);
1979             t2_SSE0            = _mm_sub_pd(dr_SSE0,sk_aj_SSE);
1980             t2_SSE1            = _mm_sub_pd(dr_SSE1,sk_aj_SSE);
1981             t3_SSE0            = _mm_sub_pd(sk_aj_SSE,dr_SSE0);
1982             t3_SSE1            = _mm_sub_pd(sk_aj_SSE,dr_SSE1);
1983             
1984             obc_mask1_SSE0     = _mm_cmplt_pd(rai_SSE0, t1_SSE0);
1985             obc_mask1_SSE1     = _mm_cmplt_pd(rai_SSE1, t1_SSE1);
1986             obc_mask2_SSE0     = _mm_cmplt_pd(rai_SSE0, t2_SSE0);
1987             obc_mask2_SSE1     = _mm_cmplt_pd(rai_SSE1, t2_SSE1);
1988             obc_mask3_SSE0     = _mm_cmplt_pd(rai_SSE0, t3_SSE0);
1989             obc_mask3_SSE1     = _mm_cmplt_pd(rai_SSE1, t3_SSE1);
1990             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
1991             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
1992             
1993             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
1994             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
1995             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
1996                                            _mm_andnot_pd(obc_mask2_SSE0,rai_inv_SSE0));
1997             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
1998                                            _mm_andnot_pd(obc_mask2_SSE1,rai_inv_SSE1));
1999             
2000             dlij_SSE0          = _mm_and_pd(one_SSE,obc_mask2_SSE0);
2001             dlij_SSE1          = _mm_and_pd(one_SSE,obc_mask2_SSE1);
2002             
2003             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
2004             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
2005             uij3_SSE0          = _mm_mul_pd(uij2_SSE0,uij_SSE0);
2006             uij3_SSE1          = _mm_mul_pd(uij2_SSE1,uij_SSE1);
2007             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
2008             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
2009             lij3_SSE0          = _mm_mul_pd(lij2_SSE0,lij_SSE0);
2010             lij3_SSE1          = _mm_mul_pd(lij2_SSE1,lij_SSE1);
2011             
2012             diff2_SSE0         = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
2013             diff2_SSE1         = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
2014             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
2015             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
2016             sk2_aj_SSE         = _mm_mul_pd(sk_aj_SSE,sk_aj_SSE);
2017             sk2_rinv_SSE0      = _mm_mul_pd(sk2_aj_SSE,rinv_SSE0);
2018             sk2_rinv_SSE1      = _mm_mul_pd(sk2_aj_SSE,rinv_SSE1);
2019             prod_SSE0          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
2020             prod_SSE1          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
2021             
2022             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
2023             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
2024             
2025             t1_SSE0            = _mm_sub_pd(lij_SSE0,uij_SSE0);
2026             t1_SSE1            = _mm_sub_pd(lij_SSE1,uij_SSE1);
2027             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
2028                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
2029                                                        prod_SSE0));
2030             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
2031                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
2032                                                        prod_SSE1));
2033             
2034             t3_SSE0            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
2035             t3_SSE1            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
2036             t1_SSE0            = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
2037             t1_SSE1            = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
2038             t4_SSE0            = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE0,lij_SSE0));
2039             t4_SSE1            = _mm_mul_pd(two_SSE,_mm_sub_pd(rai_inv_SSE1,lij_SSE1));
2040             t4_SSE0            = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
2041             t4_SSE1            = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
2042             t1_SSE0            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
2043             t1_SSE1            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
2044             
2045             sum_ai_SSE0        = _mm_add_pd(sum_ai_SSE0,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
2046             sum_ai_SSE1        = _mm_add_pd(sum_ai_SSE1,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
2047             
2048             t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
2049                                             _mm_mul_pd(prod_SSE0,lij3_SSE0));
2050             t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
2051                                             _mm_mul_pd(prod_SSE1,lij3_SSE1));
2052             t1_SSE0            = _mm_sub_pd(t1_SSE0,
2053                                             _mm_mul_pd(onefourth_SSE,
2054                                                        _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
2055                                                                   _mm_mul_pd(lij3_SSE0,dr_SSE0))));
2056             t1_SSE1            = _mm_sub_pd(t1_SSE1,
2057                                             _mm_mul_pd(onefourth_SSE,
2058                                                        _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
2059                                                                   _mm_mul_pd(lij3_SSE1,dr_SSE1))));
2060             
2061             t2_SSE0            = _mm_mul_pd(onefourth_SSE,
2062                                             _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
2063                                                        _mm_mul_pd(uij3_SSE0,dr_SSE0)));
2064             t2_SSE1            = _mm_mul_pd(onefourth_SSE,
2065                                             _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
2066                                                        _mm_mul_pd(uij3_SSE1,dr_SSE1)));
2067             t2_SSE0            = _mm_sub_pd(t2_SSE0,
2068                                             _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
2069                                                        _mm_mul_pd(prod_SSE0,uij3_SSE0)));
2070             t2_SSE1            = _mm_sub_pd(t2_SSE1,
2071                                             _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
2072                                                        _mm_mul_pd(prod_SSE1,uij3_SSE1)));
2073             t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
2074                                             _mm_mul_pd(rinv_SSE0,rinv_SSE0));
2075             t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
2076                                             _mm_mul_pd(rinv_SSE1,rinv_SSE1));
2077             t3_SSE0            = _mm_sub_pd(t3_SSE0,
2078                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
2079                                                        _mm_add_pd(one_SSE,
2080                                                                   _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
2081             t3_SSE1            = _mm_sub_pd(t3_SSE1,
2082                                             _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
2083                                                        _mm_add_pd(one_SSE,
2084                                                                   _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));
2085             
2086             t1_SSE0            = _mm_mul_pd(rinv_SSE0,
2087                                             _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
2088                                                        _mm_add_pd(t2_SSE0,t3_SSE0)));
2089             t1_SSE1            = _mm_mul_pd(rinv_SSE1,
2090                                             _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
2091                                                        _mm_add_pd(t2_SSE1,t3_SSE1)));
2092             
2093             _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
2094             dadx += 2;
2095             _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
2096             dadx += 2;
2097             
2098             /* Evaluate influence of atom ai -> aj */
2099             t1_SSE0            = _mm_add_pd(dr_SSE0,sk_ai_SSE0);
2100             t1_SSE1            = _mm_add_pd(dr_SSE1,sk_ai_SSE1);
2101             t2_SSE0            = _mm_sub_pd(dr_SSE0,sk_ai_SSE0);
2102             t2_SSE1            = _mm_sub_pd(dr_SSE1,sk_ai_SSE1);
2103             t3_SSE0            = _mm_sub_pd(sk_ai_SSE0,dr_SSE0);
2104             t3_SSE1            = _mm_sub_pd(sk_ai_SSE1,dr_SSE1);
2105             
2106             obc_mask1_SSE0     = _mm_cmplt_pd(raj_SSE, t1_SSE0);
2107             obc_mask1_SSE1     = _mm_cmplt_pd(raj_SSE, t1_SSE1);
2108             obc_mask2_SSE0     = _mm_cmplt_pd(raj_SSE, t2_SSE0);
2109             obc_mask2_SSE1     = _mm_cmplt_pd(raj_SSE, t2_SSE1);
2110             obc_mask3_SSE0     = _mm_cmplt_pd(raj_SSE, t3_SSE0);
2111             obc_mask3_SSE1     = _mm_cmplt_pd(raj_SSE, t3_SSE1);
2112             obc_mask1_SSE0     = _mm_and_pd(obc_mask1_SSE0,jmask_SSE0);
2113             obc_mask1_SSE1     = _mm_and_pd(obc_mask1_SSE1,jmask_SSE1);
2114             
2115             uij_SSE0           = gmx_mm_inv_pd(t1_SSE0);
2116             uij_SSE1           = gmx_mm_inv_pd(t1_SSE1);
2117             lij_SSE0           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE0,gmx_mm_inv_pd(t2_SSE0)),
2118                                            _mm_andnot_pd(obc_mask2_SSE0,raj_inv_SSE));
2119             lij_SSE1           = _mm_or_pd(   _mm_and_pd(obc_mask2_SSE1,gmx_mm_inv_pd(t2_SSE1)),
2120                                            _mm_andnot_pd(obc_mask2_SSE1,raj_inv_SSE));
2121  
2122             dlij_SSE0          = _mm_and_pd(one_SSE,obc_mask2_SSE0);
2123             dlij_SSE1          = _mm_and_pd(one_SSE,obc_mask2_SSE1);
2124             
2125             uij2_SSE0          = _mm_mul_pd(uij_SSE0, uij_SSE0);
2126             uij2_SSE1          = _mm_mul_pd(uij_SSE1, uij_SSE1);
2127             uij3_SSE0          = _mm_mul_pd(uij2_SSE0,uij_SSE0);
2128             uij3_SSE1          = _mm_mul_pd(uij2_SSE1,uij_SSE1);
2129             lij2_SSE0          = _mm_mul_pd(lij_SSE0, lij_SSE0);
2130             lij2_SSE1          = _mm_mul_pd(lij_SSE1, lij_SSE1);
2131             lij3_SSE0          = _mm_mul_pd(lij2_SSE0,lij_SSE0);
2132             lij3_SSE1          = _mm_mul_pd(lij2_SSE1,lij_SSE1);
2133             
2134             diff2_SSE0         = _mm_sub_pd(uij2_SSE0,lij2_SSE0);
2135             diff2_SSE1         = _mm_sub_pd(uij2_SSE1,lij2_SSE1);
2136             lij_inv_SSE0       = gmx_mm_invsqrt_pd(lij2_SSE0);
2137             lij_inv_SSE1       = gmx_mm_invsqrt_pd(lij2_SSE1);
2138             sk2_rinv_SSE0      = _mm_mul_pd(sk2_ai_SSE0,rinv_SSE0);
2139             sk2_rinv_SSE1      = _mm_mul_pd(sk2_ai_SSE1,rinv_SSE1);
2140             prod_SSE0          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE0);
2141             prod_SSE1          = _mm_mul_pd(onefourth_SSE,sk2_rinv_SSE1);
2142             
2143             logterm_SSE0       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE0,lij_inv_SSE0));
2144             logterm_SSE1       = gmx_mm_log_pd(_mm_mul_pd(uij_SSE1,lij_inv_SSE1));
2145             t1_SSE0            = _mm_sub_pd(lij_SSE0,uij_SSE0);
2146             t1_SSE1            = _mm_sub_pd(lij_SSE1,uij_SSE1);
2147             t2_SSE0            = _mm_mul_pd(diff2_SSE0,
2148                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE0),
2149                                                        prod_SSE0));
2150             t2_SSE1            = _mm_mul_pd(diff2_SSE1,
2151                                             _mm_sub_pd(_mm_mul_pd(onefourth_SSE,dr_SSE1),
2152                                                        prod_SSE1));
2153             t3_SSE0            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE0,logterm_SSE0));
2154             t3_SSE1            = _mm_mul_pd(half_SSE,_mm_mul_pd(rinv_SSE1,logterm_SSE1));
2155             t1_SSE0            = _mm_add_pd(t1_SSE0,_mm_add_pd(t2_SSE0,t3_SSE0));
2156             t1_SSE1            = _mm_add_pd(t1_SSE1,_mm_add_pd(t2_SSE1,t3_SSE1));
2157             t4_SSE0            = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE0));
2158             t4_SSE1            = _mm_mul_pd(two_SSE,_mm_sub_pd(raj_inv_SSE,lij_SSE1));
2159             t4_SSE0            = _mm_and_pd(t4_SSE0,obc_mask3_SSE0);
2160             t4_SSE1            = _mm_and_pd(t4_SSE1,obc_mask3_SSE1);
2161             t1_SSE0            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE0,t4_SSE0));
2162             t1_SSE1            = _mm_mul_pd(half_SSE,_mm_add_pd(t1_SSE1,t4_SSE1));
2163             
2164             _mm_store_pd(work+j, _mm_add_pd(_mm_load_pd(work+j),
2165                                             _mm_add_pd(_mm_and_pd(t1_SSE0,obc_mask1_SSE0),
2166                                                        _mm_and_pd(t1_SSE1,obc_mask1_SSE1))));
2167                         
            t1_SSE0            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE0),
                                            _mm_mul_pd(prod_SSE0,lij3_SSE0));
            t1_SSE1            = _mm_add_pd(_mm_mul_pd(half_SSE,lij2_SSE1),
                                            _mm_mul_pd(prod_SSE1,lij3_SSE1));

            t1_SSE0            = _mm_sub_pd(t1_SSE0,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE0,rinv_SSE0),
                                                                  _mm_mul_pd(lij3_SSE0,dr_SSE0))));
            t1_SSE1            = _mm_sub_pd(t1_SSE1,
                                            _mm_mul_pd(onefourth_SSE,
                                                       _mm_add_pd(_mm_mul_pd(lij_SSE1,rinv_SSE1),
                                                                  _mm_mul_pd(lij3_SSE1,dr_SSE1))));
            t2_SSE0            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE0,rinv_SSE0),
                                                       _mm_mul_pd(uij3_SSE0,dr_SSE0)));
            t2_SSE1            = _mm_mul_pd(onefourth_SSE,
                                            _mm_add_pd(_mm_mul_pd(uij_SSE1,rinv_SSE1),
                                                       _mm_mul_pd(uij3_SSE1,dr_SSE1)));
            t2_SSE0            = _mm_sub_pd(t2_SSE0,
                                            _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE0),
                                                       _mm_mul_pd(prod_SSE0,uij3_SSE0)));
            t2_SSE1            = _mm_sub_pd(t2_SSE1,
                                            _mm_add_pd(_mm_mul_pd(half_SSE,uij2_SSE1),
                                                       _mm_mul_pd(prod_SSE1,uij3_SSE1)));

            t3_SSE0            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE0),
                                            _mm_mul_pd(rinv_SSE0,rinv_SSE0));
            t3_SSE1            = _mm_mul_pd(_mm_mul_pd(onefourth_SSE,logterm_SSE1),
                                            _mm_mul_pd(rinv_SSE1,rinv_SSE1));

            t3_SSE0            = _mm_sub_pd(t3_SSE0,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE0,oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE0,rinv_SSE0))));
            t3_SSE1            = _mm_sub_pd(t3_SSE1,
                                            _mm_mul_pd(_mm_mul_pd(diff2_SSE1,oneeighth_SSE),
                                                       _mm_add_pd(one_SSE,
                                                                  _mm_mul_pd(sk2_rinv_SSE1,rinv_SSE1))));

            t1_SSE0            = _mm_mul_pd(rinv_SSE0,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE0,t1_SSE0),
                                                       _mm_add_pd(t2_SSE0,t3_SSE0)));
            t1_SSE1            = _mm_mul_pd(rinv_SSE1,
                                            _mm_add_pd(_mm_mul_pd(dlij_SSE1,t1_SSE1),
                                                       _mm_add_pd(t2_SSE1,t3_SSE1)));

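            /* Stream the masked derivatives to the dadx buffer. The
             * chain-rule kernel below reloads dadx strictly in storage
             * order, so the two kernels must stay in lockstep.
             */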
            _mm_store_pd(dadx,_mm_and_pd(t1_SSE0,obc_mask1_SSE0));
            dadx += 2;
            _mm_store_pd(dadx,_mm_and_pd(t1_SSE1,obc_mask1_SSE1));
            dadx += 2;
        }
        GMX_MM_TRANSPOSE2_PD(sum_ai_SSE0,sum_ai_SSE1);
        sum_ai_SSE0 = _mm_add_pd(sum_ai_SSE0,sum_ai_SSE1);
        _mm_store_pd(work+i, _mm_add_pd(sum_ai_SSE0, _mm_load_pd(work+i)));
    }

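    /* The work array is padded to 2*natoms so the unrolled j loops can
     * run past the end without masking; fold the duplicated upper half
     * back onto the real atoms before the radii are computed.
     */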
    for(i=0;i<natoms/2+1;i++)
    {
        work[i] += work[natoms+i];
    }

    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms,work, cr);
    }

    if(gb_algorithm==egbHCT)
    {
        /* HCT */
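        /* The HCT inverse Born radius is 1/b_i = 1/a'_i - sum_i, with
         * a'_i the offset-corrected radius; b_i is clamped from below
         * at the uncorrected radius to keep it physical.
         */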
        for(i=0;i<natoms;i++)
        {
            if(born->use[i] != 0)
            {
                rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
                sum_ai  = 1.0/rai - work[i];
                min_rad = rai + born->gb_doffset;
                rad     = 1.0/sum_ai;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }
    }
    else
    {
        /* OBC */

        /* Calculate the radii */
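        /* OBC rescaling (Onufriev-Bashford-Case): with psi = a'_i*sum_i
         * and a'_i = a_i - doffset,
         *
         *   1/b_i = 1/a'_i - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/a_i
         *
         * drobc stores the derivative of the rescaling, needed by the
         * chain-rule kernel below.
         */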
        for(i=0;i<natoms;i++)
        {
            if(born->use[i] != 0)
            {
                rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
                rai_inv2   = 1.0/rai;
                rai        = rai-born->gb_doffset;
                rai_inv    = 1.0/rai;
                sum_ai     = rai * work[i];
                sum_ai2    = sum_ai  * sum_ai;
                sum_ai3    = sum_ai2 * sum_ai;

                tsum    = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
                born->bRad[i] = rai_inv - tsum*rai_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain  = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
            }
        }
    }

    return 0;
}
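
/*
 * For orientation, a minimal scalar sketch of the chain-rule step that
 * the SSE2 kernel below implements. This helper is purely illustrative
 * (hypothetical name, never called): dadx caches d(born sum)/dr per
 * pair, rb holds the algorithm-dependent prefactor set up in the
 * kernel, and the pair force is the sum of the i->j and j->i terms
 * directed along the connecting vector. For clarity the derivatives are
 * shown as dense natoms x natoms arrays, unlike the interleaved buffer
 * the real kernels share.
 */
static void
genborn_chainrule_scalar_sketch(int natoms, const double x[], const double rb[],
                                const double dadx_ij[], const double dadx_ji[],
                                double f[])
{
    int    i,j;
    double dx,dy,dz,fgb;

    for(i=0;i<natoms;i++)
    {
        for(j=i+1;j<natoms;j++)
        {
            dx  = x[3*i]   - x[3*j];
            dy  = x[3*i+1] - x[3*j+1];
            dz  = x[3*i+2] - x[3*j+2];
            /* Total scalar force: contribution of r_ij to b_i plus to b_j */
            fgb = rb[i]*dadx_ij[i*natoms+j] + rb[j]*dadx_ji[i*natoms+j];
            /* Newton's third law: increment i, decrement j */
            f[3*i]   += fgb*dx;
            f[3*i+1] += fgb*dy;
            f[3*i+2] += fgb*dz;
            f[3*j]   -= fgb*dx;
            f[3*j+1] -= fgb*dy;
            f[3*j+2] -= fgb*dz;
        }
    }
}
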
int
genborn_allvsall_calc_chainrule_sse2_double(t_forcerec *           fr,
                                            t_mdatoms *            mdatoms,
                                            gmx_genborn_t *        born,
                                            double *               x,
                                            double *               f,
                                            int                    gb_algorithm,
                                            void *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int        natoms;
    int        ni0,ni1;
    int        nj0,nj1,nj2,nj3;
    int        i,j,k,n;
    int        idx;
    int *      mask;
    int *      pmask0;
    int *      emask0;
    int *      jindex;

    double     ix,iy,iz;
    double     fix,fiy,fiz;
    double     jx,jy,jz;
    double     dx,dy,dz;
    double     tx,ty,tz;
    double     rbai,rbaj,fgb,fgb_ai,rbi;
    double *   rb;
    double *   dadx;
    double *   x_align;
    double *   y_align;
    double *   z_align;
    double *   fx_align;
    double *   fy_align;
    double *   fz_align;
    double     tmpsum[2];

    __m128d    jmask_SSE0,jmask_SSE1;
    __m128d    ix_SSE0,iy_SSE0,iz_SSE0;
    __m128d    ix_SSE1,iy_SSE1,iz_SSE1;
    __m128d    fix_SSE0,fiy_SSE0,fiz_SSE0;
    __m128d    fix_SSE1,fiy_SSE1,fiz_SSE1;
    __m128d    rbai_SSE0,rbai_SSE1;
    __m128d    imask_SSE0,imask_SSE1;
    __m128d    jx_SSE,jy_SSE,jz_SSE,rbaj_SSE;
    __m128d    dx_SSE0,dy_SSE0,dz_SSE0;
    __m128d    dx_SSE1,dy_SSE1,dz_SSE1;
    __m128d    fgb_SSE0,fgb_ai_SSE0;
    __m128d    fgb_SSE1,fgb_ai_SSE1;
    __m128d    tx_SSE0,ty_SSE0,tz_SSE0;
    __m128d    tx_SSE1,ty_SSE1,tz_SSE1;
    __m128d    t1,t2,tmpSSE;

    natoms              = mdatoms->nr;
    ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1                 = mdatoms->start+mdatoms->homenr;

    aadata = (gmx_allvsallgb2_data_t *)paadata;

    x_align  = aadata->x_align;
    y_align  = aadata->y_align;
    z_align  = aadata->z_align;
    fx_align = aadata->fx_align;
    fy_align = aadata->fy_align;
    fz_align = aadata->fz_align;

    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    n  = 0;
    rb = aadata->work;

    /* Loop to get the proper form for the Born radius term */
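    /* The b_i^2 factor in all three branches comes from chaining
     * dV/db_i with the cached derivatives of the inverse-radius sums
     * (db_i/dsum_i ~ -b_i^2). Still carries an extra 2/ONE_4PI_EPS0
     * from its energy expression, and OBC picks up the drobc factor
     * from the tanh rescaling computed above.
     */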
    if(gb_algorithm==egbSTILL)
    {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if(gb_algorithm==egbHCT)
    {
        for(i=0;i<natoms;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * fr->dvda[i];
        }
    }
    else if(gb_algorithm==egbOBC)
    {
        for(idx=0;idx<natoms;idx++)
        {
            rbi     = born->bRad[idx];
            rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
        }
    }

    for(i=0;i<2*natoms;i++)
    {
        fx_align[i] = 0;
        fy_align[i] = 0;
        fz_align[i] = 0;
    }

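    /* Mirror rb just like the coordinate arrays: the unrolled j loop
     * below runs over a wrapped index range that may exceed natoms, so
     * every buffer it reads needs a second periodic copy.
     */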
    for(i=0;i<natoms;i++)
    {
        rb[i+natoms] = rb[i];
    }

    for(i=ni0; i<ni1; i+=UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0          = _mm_load1_pd(x_align+i);
        iy_SSE0          = _mm_load1_pd(y_align+i);
        iz_SSE0          = _mm_load1_pd(z_align+i);
        ix_SSE1          = _mm_load1_pd(x_align+i+1);
        iy_SSE1          = _mm_load1_pd(y_align+i+1);
        iz_SSE1          = _mm_load1_pd(z_align+i+1);

        fix_SSE0         = _mm_setzero_pd();
        fiy_SSE0         = _mm_setzero_pd();
        fiz_SSE0         = _mm_setzero_pd();
        fix_SSE1         = _mm_setzero_pd();
        fiy_SSE1         = _mm_setzero_pd();
        fiz_SSE1         = _mm_setzero_pd();

        rbai_SSE0        = _mm_load1_pd(rb+i);
        rbai_SSE1        = _mm_load1_pd(rb+i+1);

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj3              = jindex[4*i+3];

        /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
        for(j=nj0; j<nj3; j+=UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE           = _mm_load_pd(x_align+j);
            jy_SSE           = _mm_load_pd(y_align+j);
            jz_SSE           = _mm_load_pd(z_align+j);

            /* Calculate distance */
            dx_SSE0          = _mm_sub_pd(ix_SSE0,jx_SSE);
            dy_SSE0          = _mm_sub_pd(iy_SSE0,jy_SSE);
            dz_SSE0          = _mm_sub_pd(iz_SSE0,jz_SSE);
            dx_SSE1          = _mm_sub_pd(ix_SSE1,jx_SSE);
            dy_SSE1          = _mm_sub_pd(iy_SSE1,jy_SSE);
            dz_SSE1          = _mm_sub_pd(iz_SSE1,jz_SSE);

            rbaj_SSE         = _mm_load_pd(rb+j);

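            /* dadx is consumed in exactly the order the radius kernel
             * stored it: first the i-side derivatives for the two i
             * atoms (scaled by rb[i]), then the matching j-side
             * derivatives (scaled by rb[j]).
             */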
            fgb_SSE0         = _mm_mul_pd(rbai_SSE0,_mm_load_pd(dadx));
            dadx += 2;
            fgb_SSE1         = _mm_mul_pd(rbai_SSE1,_mm_load_pd(dadx));
            dadx += 2;

            fgb_ai_SSE0      = _mm_mul_pd(rbaj_SSE,_mm_load_pd(dadx));
            dadx += 2;
            fgb_ai_SSE1      = _mm_mul_pd(rbaj_SSE,_mm_load_pd(dadx));
            dadx += 2;

            /* Total force between ai and aj is the sum of ai->aj and aj->ai */
            fgb_SSE0         = _mm_add_pd(fgb_SSE0,fgb_ai_SSE0);
            fgb_SSE1         = _mm_add_pd(fgb_SSE1,fgb_ai_SSE1);

            /* Calculate temporary vectorial force */
            tx_SSE0          = _mm_mul_pd(fgb_SSE0,dx_SSE0);
            ty_SSE0          = _mm_mul_pd(fgb_SSE0,dy_SSE0);
            tz_SSE0          = _mm_mul_pd(fgb_SSE0,dz_SSE0);
            tx_SSE1          = _mm_mul_pd(fgb_SSE1,dx_SSE1);
            ty_SSE1          = _mm_mul_pd(fgb_SSE1,dy_SSE1);
            tz_SSE1          = _mm_mul_pd(fgb_SSE1,dz_SSE1);

            /* Increment i atom force */
            fix_SSE0         = _mm_add_pd(fix_SSE0,tx_SSE0);
            fiy_SSE0         = _mm_add_pd(fiy_SSE0,ty_SSE0);
            fiz_SSE0         = _mm_add_pd(fiz_SSE0,tz_SSE0);
            fix_SSE1         = _mm_add_pd(fix_SSE1,tx_SSE1);
            fiy_SSE1         = _mm_add_pd(fiy_SSE1,ty_SSE1);
            fiz_SSE1         = _mm_add_pd(fiz_SSE1,tz_SSE1);

            /* Decrement j atom force */
            _mm_store_pd(fx_align+j,
                         _mm_sub_pd( _mm_load_pd(fx_align+j) , _mm_add_pd(tx_SSE0,tx_SSE1) ));
            _mm_store_pd(fy_align+j,
                         _mm_sub_pd( _mm_load_pd(fy_align+j) , _mm_add_pd(ty_SSE0,ty_SSE1) ));
            _mm_store_pd(fz_align+j,
                         _mm_sub_pd( _mm_load_pd(fz_align+j) , _mm_add_pd(tz_SSE0,tz_SSE1) ));
        }

        /* Add i forces to mem */
        GMX_MM_TRANSPOSE2_PD(fix_SSE0,fix_SSE1);
        fix_SSE0 = _mm_add_pd(fix_SSE0,fix_SSE1);
        _mm_store_pd(fx_align+i, _mm_add_pd(fix_SSE0, _mm_load_pd(fx_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiy_SSE0,fiy_SSE1);
        fiy_SSE0 = _mm_add_pd(fiy_SSE0,fiy_SSE1);
        _mm_store_pd(fy_align+i, _mm_add_pd(fiy_SSE0, _mm_load_pd(fy_align+i)));

        GMX_MM_TRANSPOSE2_PD(fiz_SSE0,fiz_SSE1);
        fiz_SSE0 = _mm_add_pd(fiz_SSE0,fiz_SSE1);
        _mm_store_pd(fz_align+i, _mm_add_pd(fiz_SSE0, _mm_load_pd(fz_align+i)));
    }

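    /* Fold the two mirrored halves of the padded force buffers back
     * into the real force array.
     */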
    for(i=0;i<natoms;i++)
    {
        f[3*i]   += fx_align[i] + fx_align[natoms+i];
        f[3*i+1] += fy_align[i] + fy_align[natoms+i];
        f[3*i+2] += fz_align[i] + fz_align[natoms+i];
    }

    return 0;
}

#else
/* dummy variable when not using SSE */
int genborn_allvsall_sse2_double_dummy;

#endif