/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2009, The GROMACS Development Team.
 * Copyright (c) 2012, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>
#include "types/simple.h"

#include "vec.h"
#include "smalloc.h"

#include "partdec.h"
#include "network.h"
#include "physics.h"
#include "genborn.h"
#include "genborn_allvsall.h"

#if 0 && defined (GMX_X86_SSE2)

#include <gmx_sse2_single.h>


#define SIMD_WIDTH 4
#define UNROLLI    4
#define UNROLLJ    4
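
/* The kernels below process UNROLLI i atoms against UNROLLJ j atoms per
 * inner iteration; with 4-wide single-precision SSE, all three constants are 4.
 */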
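
/* Working data for the all-vs-all GB kernels. jindex_gb holds four j-loop
 * limits per i atom (start of the masked prologue, start of the unmasked
 * main part, start and end of the masked epilogue); prologue_mask_gb,
 * epilogue_mask and imask flag which interactions are active. The remaining
 * arrays are padded, 16-byte-aligned copies of per-atom data, duplicated so
 * that wrapped j indices can be read linearly.
 */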
typedef struct
{
    int *      jindex_gb;
    int **     prologue_mask_gb;
    int **     epilogue_mask;
    int *      imask;
    real *     gb_radius;
    real *     workparam;
    real *     work;
    real *     x_align;
    real *     y_align;
    real *     z_align;
    real *     fx_align;
    real *     fy_align;
    real *     fz_align;
}
gmx_allvsallgb2_data_t;

static int
calc_maxoffset(int i, int natoms)
{
    int maxoffset;

    if ((natoms % 2) == 1)
    {
        /* Odd number of atoms, easy */
        maxoffset = natoms/2;
    }
    else if ((natoms % 4) == 0)
    {
        /* Multiple of four is hard */
        if (i < natoms/2)
        {
            if ((i % 2) == 0)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
        else
        {
            if ((i % 2) == 1)
            {
                maxoffset = natoms/2;
            }
            else
            {
                maxoffset = natoms/2-1;
            }
        }
    }
    else
    {
        /* natoms is even but not a multiple of four, so natoms/2 is odd */
        if ((i % 2) == 0)
        {
            maxoffset = natoms/2;
        }
        else
        {
            maxoffset = natoms/2-1;
        }
    }

    return maxoffset;
}
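
/* Example of the half-shell scheme encoded above: each atom i interacts with
 * the maxoffset atoms that follow it cyclically, so every pair is computed
 * exactly once. For natoms = 8 (a multiple of four), maxoffset is 4, 3, 4, 3
 * for i = 0..3 and 3, 4, 3, 4 for i = 4..7, giving 28 = 8*7/2 pairs in total.
 */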

static void
setup_gb_exclusions_and_indices(gmx_allvsallgb2_data_t     *   aadata,
                                t_ilist     *                  ilist,
                                int                            start,
                                int                            end,
                                int                            natoms,
                                gmx_bool                       bInclude12,
                                gmx_bool                       bInclude13,
                                gmx_bool                       bInclude14)
{
    int   i, j, k, tp;
    int   a1, a2;
    int   ni0, ni1, nj0, nj1, nj;
    int   imin, imax, iexcl;
    int   max_offset;
    int   max_excl_offset;
    int   firstinteraction;
    int   ibase;
    int  *pi;

    /* This routine can appear to be a bit complex, but it is mostly book-keeping.
     * To enable the fast all-vs-all kernel we need to be able to stream through all coordinates,
     * whether they should interact or not.
     *
     * To avoid looping over the exclusions, we create a simple mask that is 0xFFFFFFFF if the
     * interaction should be present, otherwise 0. Since exclusions typically only occur when
     * i & j are close, we create a jindex array with four elements per i atom: the start of the
     * masked prologue, the start of the unmasked main part, and the start and end of the masked
     * epilogue. This way we only have to allocate a short exclusion mask per i atom.
     */
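    /* Illustrative (hypothetical) layout: with jindex_gb[4*i..4*i+3] =
     * {8, 16, 100, 104}, the kernel runs the masked prologue over j = 8..15,
     * the unmasked main loop over j = 16..99, and the masked epilogue over
     * j = 100..103. All limits are multiples of UNROLLJ.
     */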

    ni0 = (start/UNROLLI)*UNROLLI;
    ni1 = ((end+UNROLLI-1)/UNROLLI)*UNROLLI;

    /* Set the interaction mask to only enable the i atoms we want to include */
    snew(pi, natoms+UNROLLI+2*SIMD_WIDTH);
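    /* snew() does not guarantee SSE alignment, so we over-allocate and round
     * the pointer up to the next 16-byte boundary; the same idiom is used for
     * every aligned array below.
     */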
    aadata->imask = (int *) (((size_t) pi + 16) & (~((size_t) 15)));
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->imask[i] = (i >= start && i < end) ? 0xFFFFFFFF : 0;
    }

    /* Allocate memory for our modified jindex array */
    snew(aadata->jindex_gb, 4*(natoms+UNROLLI));
    for (i = 0; i < 4*(natoms+UNROLLI); i++)
    {
        aadata->jindex_gb[i] = 0;
    }

    /* Create the exclusion masks for the prologue part */
    snew(aadata->prologue_mask_gb, natoms+UNROLLI); /* list of pointers */

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->prologue_mask_gb[i] = NULL;
    }

    /* Calculate the largest exclusion range we need for each UNROLLI-tuplet of i atoms. */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        max_excl_offset = -1;

        /* First find maxoffset for the next 4 atoms (or fewer if we are close to end) */
        imax = ((ibase+UNROLLI) < end) ? (ibase+UNROLLI) : end;

        /* Which atom is the first we (might) interact with? */
        imin = natoms; /* Guaranteed to be lowered by a 'firstinteraction' value below */
        for (i = ibase; i < imax; i++)
        {
            /* Before exclusions, which atom is the first we (might) interact with? */
            firstinteraction = i+1;
            max_offset       = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];
                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k == firstinteraction)
                    {
                        firstinteraction++;
                    }
                }
            }
            imin = (firstinteraction < imin) ? firstinteraction : imin;
        }
        /* round down to j unrolling factor */
        imin = (imin/UNROLLJ)*UNROLLJ;

        for (i = ibase; i < imax; i++)
        {
            max_offset = calc_maxoffset(i, natoms);

            if (!bInclude12)
            {
                for (j = 0; j < ilist[F_GB12].nr; j += 3)
                {
                    a1 = ilist[F_GB12].iatoms[j+1];
                    a2 = ilist[F_GB12].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude13)
            {
                for (j = 0; j < ilist[F_GB13].nr; j += 3)
                {
                    a1 = ilist[F_GB13].iatoms[j+1];
                    a2 = ilist[F_GB13].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
            if (!bInclude14)
            {
                for (j = 0; j < ilist[F_GB14].nr; j += 3)
                {
                    a1 = ilist[F_GB14].iatoms[j+1];
                    a2 = ilist[F_GB14].iatoms[j+2];

                    if (a1 == i)
                    {
                        k = a2;
                    }
                    else if (a2 == i)
                    {
                        k = a1;
                    }
                    else
                    {
                        continue;
                    }

                    if (k < imin)
                    {
                        k += natoms;
                    }

                    if (k > i+max_offset)
                    {
                        continue;
                    }

                    k = k - imin;

                    if (k+natoms <= max_offset)
                    {
                        k += natoms;
                    }
                    max_excl_offset = (k > max_excl_offset) ? k : max_excl_offset;
                }
            }
        }

        /* The offset specifies the last atom to be excluded, so add one unit to get an upper loop limit */
        max_excl_offset++;
        /* round up to j unrolling factor */
        max_excl_offset = (max_excl_offset/UNROLLJ+1)*UNROLLJ;

        /* Set the length of all the prologue masks to this value (even for i > end) */
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            aadata->jindex_gb[4*i]   = imin;
            aadata->jindex_gb[4*i+1] = imin+max_excl_offset;
        }
    }

    /* Now the hard part, loop over it all again to calculate the actual contents of the prologue masks */
    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            nj   = aadata->jindex_gb[4*i+1] - aadata->jindex_gb[4*i];
            imin = aadata->jindex_gb[4*i];

            /* Allocate aligned memory */
            snew(pi, nj+2*SIMD_WIDTH);
            aadata->prologue_mask_gb[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            /* Include interactions i+1 <= j < i+maxoffset */
            for (k = 0; k < nj; k++)
            {
                j = imin + k;

                if ( (j > i) && (j <= i+max_offset) )
                {
                    aadata->prologue_mask_gb[i][k] = 0xFFFFFFFF;
                }
                else
                {
                    aadata->prologue_mask_gb[i][k] = 0;
                }
            }

            /* Clear out the explicit exclusions */
            if (i < end)
            {
                if (!bInclude12)
                {
                    for (j = 0; j < ilist[F_GB12].nr; j += 3)
                    {
                        a1 = ilist[F_GB12].iatoms[j+1];
                        a2 = ilist[F_GB12].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
                if (!bInclude13)
                {
                    for (j = 0; j < ilist[F_GB13].nr; j += 3)
                    {
                        a1 = ilist[F_GB13].iatoms[j+1];
                        a2 = ilist[F_GB13].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
                if (!bInclude14)
                {
                    for (j = 0; j < ilist[F_GB14].nr; j += 3)
                    {
                        a1 = ilist[F_GB14].iatoms[j+1];
                        a2 = ilist[F_GB14].iatoms[j+2];

                        if (a1 == i)
                        {
                            k = a2;
                        }
                        else if (a2 == i)
                        {
                            k = a1;
                        }
                        else
                        {
                            continue;
                        }

                        if (k > i+max_offset)
                        {
                            continue;
                        }
                        k = k-i;

                        if (k+natoms <= max_offset)
                        {
                            k += natoms;
                        }

                        k = k+i-imin;
                        if (k >= 0)
                        {
                            aadata->prologue_mask_gb[i][k] = 0;
                        }
                    }
                }
            }
        }
    }

    /* Construct the epilogue mask - this just contains the check for maxoffset */
    snew(aadata->epilogue_mask, natoms+UNROLLI);

    /* First zero everything to avoid uninitialized data */
    for (i = 0; i < natoms+UNROLLI; i++)
    {
        aadata->jindex_gb[4*i+2]    = aadata->jindex_gb[4*i+1];
        aadata->jindex_gb[4*i+3]    = aadata->jindex_gb[4*i+1];
        aadata->epilogue_mask[i]    = NULL;
    }

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        /* Find the lowest index for which we need to use the epilogue */
        imin       = ibase;
        max_offset = calc_maxoffset(imin, natoms);

        imin = imin + 1 + max_offset;

        /* Find largest index for which we need to use the epilogue */
        imax = ibase + UNROLLI-1;
        imax = (imax < end) ? imax : end;

        max_offset = calc_maxoffset(imax, natoms);
        imax       = imax + 1 + max_offset + UNROLLJ - 1;

        for (i = ibase; i < ibase+UNROLLI; i++)
        {
            /* Start of epilogue - round down to j tile limit */
            aadata->jindex_gb[4*i+2] = (imin/UNROLLJ)*UNROLLJ;
            /* Make sure we don't overlap - for small systems everything is done in the prologue */
            aadata->jindex_gb[4*i+2] = (aadata->jindex_gb[4*i+1] > aadata->jindex_gb[4*i+2]) ? aadata->jindex_gb[4*i+1] : aadata->jindex_gb[4*i+2];
            /* Round upwards to j tile limit */
            aadata->jindex_gb[4*i+3] = (imax/UNROLLJ)*UNROLLJ;
            /* Make sure we don't have a negative range for the epilogue */
            aadata->jindex_gb[4*i+3] = (aadata->jindex_gb[4*i+2] > aadata->jindex_gb[4*i+3]) ? aadata->jindex_gb[4*i+2] : aadata->jindex_gb[4*i+3];
        }
    }

    /* And fill it with data... */

    for (ibase = ni0; ibase < ni1; ibase += UNROLLI)
    {
        for (i = ibase; i < ibase+UNROLLI; i++)
        {

            nj = aadata->jindex_gb[4*i+3] - aadata->jindex_gb[4*i+2];

            /* Allocate aligned memory */
            snew(pi, nj+2*SIMD_WIDTH);
            aadata->epilogue_mask[i] = (int *) (((size_t) pi + 16) & (~((size_t) 15)));

            max_offset = calc_maxoffset(i, natoms);

            for (k = 0; k < nj; k++)
            {
                j = aadata->jindex_gb[4*i+2] + k;
                aadata->epilogue_mask[i][k] = (j <= i+max_offset) ? 0xFFFFFFFF : 0;
            }
        }
    }
}


static void
genborn_allvsall_setup(gmx_allvsallgb2_data_t     **  p_aadata,
                       gmx_localtop_t     *           top,
                       gmx_genborn_t     *            born,
                       t_mdatoms     *                mdatoms,
                       real                           radius_offset,
                       int                            gb_algorithm,
                       gmx_bool                       bInclude12,
                       gmx_bool                       bInclude13,
                       gmx_bool                       bInclude14)
{
    int                     i, j, idx;
    int                     natoms;
    gmx_allvsallgb2_data_t *aadata;
    real                   *p;

    natoms = mdatoms->nr;

    snew(aadata, 1);
    *p_aadata = aadata;

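    /* Allocate padded coordinate/force/parameter arrays and align each to a
     * 16-byte boundary; the factor of two leaves room for the duplicated
     * copies that are filled in further down.
     */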
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->x_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->y_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->z_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fx_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fy_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));
    snew(p, 2*natoms+2*SIMD_WIDTH);
    aadata->fz_align = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->gb_radius = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->workparam = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    snew(p, 2*natoms+UNROLLJ+SIMD_WIDTH);
    aadata->work = (real *) (((size_t) p + 16) & (~((size_t) 15)));

    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[i] = top->atomtypes.gb_radius[mdatoms->typeA[i]] - radius_offset;
        if (gb_algorithm == egbSTILL)
        {
            aadata->workparam[i] = born->vsolv[i];
        }
        else if (gb_algorithm == egbOBC)
        {
            aadata->workparam[i] = born->param[i];
        }
        aadata->work[i]      = 0.0;
    }
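    /* Duplicate the per-atom parameters into [natoms, 2*natoms) so that a
     * j index that runs past natoms can be read linearly, without a modulo.
     */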
    for (i = 0; i < mdatoms->nr; i++)
    {
        aadata->gb_radius[natoms+i] = aadata->gb_radius[i];
        aadata->workparam[natoms+i] = aadata->workparam[i];
        aadata->work[natoms+i]      = aadata->work[i];
    }

    for (i = 0; i < 2*natoms+SIMD_WIDTH; i++)
    {
        aadata->x_align[i]  = 0.0;
        aadata->y_align[i]  = 0.0;
        aadata->z_align[i]  = 0.0;
        aadata->fx_align[i] = 0.0;
        aadata->fy_align[i] = 0.0;
        aadata->fz_align[i] = 0.0;
    }

    setup_gb_exclusions_and_indices(aadata, top->idef.il, mdatoms->start, mdatoms->start+mdatoms->homenr, mdatoms->nr,
                                    bInclude12, bInclude13, bInclude14);
}


int
genborn_allvsall_calc_still_radii_sse2_single(t_forcerec *           fr,
                                              t_mdatoms *            mdatoms,
                                              gmx_genborn_t *        born,
                                              gmx_localtop_t *       top,
                                              real *                 x,
                                              t_commrec *            cr,
                                              void *                 paadata)
{
    gmx_allvsallgb2_data_t *aadata;
    int                     natoms;
    int                     ni0, ni1;
    int                     nj0, nj1, nj2, nj3;
    int                     i, j, k, n;
    int              *      mask;
    int              *      pmask0;
    int              *      pmask1;
    int              *      pmask2;
    int              *      pmask3;
    int              *      emask0;
    int              *      emask1;
    int              *      emask2;
    int              *      emask3;
    real                    ix, iy, iz;
    real                    jx, jy, jz;
    real                    dx, dy, dz;
    real                    rsq, rinv;
    real                    gpi, rai, vai;
    real                    prod_ai;
    real                    irsq, idr4, idr6;
    real                    raj, rvdw, ratio;
    real                    vaj, ccf, dccf, theta, cosq;
    real                    term, prod, icf4, icf6, gpi2, factor, sinq;
    real              *     gb_radius;
    real              *     vsolv;
    real              *     work;
    real                    tmpsum[4];
    real              *     x_align;
    real              *     y_align;
    real              *     z_align;
    int              *      jindex;
    real              *     dadx;

    __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
    __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
    __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
    __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
    __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
    __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
    __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
    __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
    __m128                  imask_SSE0, jmask_SSE0;
    __m128                  imask_SSE1, jmask_SSE1;
    __m128                  imask_SSE2, jmask_SSE2;
    __m128                  imask_SSE3, jmask_SSE3;
    __m128                  jx_SSE, jy_SSE, jz_SSE;
    __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
    __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
    __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
    __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
    __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
    __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
    __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
    __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
    __m128                  raj_SSE, vaj_SSE, prod_SSE;
    __m128                  rvdw_SSE0, ratio_SSE0;
    __m128                  rvdw_SSE1, ratio_SSE1;
    __m128                  rvdw_SSE2, ratio_SSE2;
    __m128                  rvdw_SSE3, ratio_SSE3;
    __m128                  theta_SSE0, sinq_SSE0, cosq_SSE0, term_SSE0;
    __m128                  theta_SSE1, sinq_SSE1, cosq_SSE1, term_SSE1;
    __m128                  theta_SSE2, sinq_SSE2, cosq_SSE2, term_SSE2;
    __m128                  theta_SSE3, sinq_SSE3, cosq_SSE3, term_SSE3;
    __m128                  ccf_SSE0, dccf_SSE0;
    __m128                  ccf_SSE1, dccf_SSE1;
    __m128                  ccf_SSE2, dccf_SSE2;
    __m128                  ccf_SSE3, dccf_SSE3;
    __m128                  icf4_SSE0, icf6_SSE0;
    __m128                  icf4_SSE1, icf6_SSE1;
    __m128                  icf4_SSE2, icf6_SSE2;
    __m128                  icf4_SSE3, icf6_SSE3;
    __m128                  half_SSE, one_SSE, two_SSE, four_SSE;
    __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;

    natoms              = mdatoms->nr;
    ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
    ni1                 = mdatoms->start+mdatoms->homenr;

    n = 0;

    aadata = *((gmx_allvsallgb2_data_t **)paadata);

    if (aadata == NULL)
    {
        genborn_allvsall_setup(&aadata, top, born, mdatoms, 0.0,
                               egbSTILL, FALSE, FALSE, TRUE);
        *((gmx_allvsallgb2_data_t **)paadata) = aadata;
    }

    x_align = aadata->x_align;
    y_align = aadata->y_align;
    z_align = aadata->z_align;

    gb_radius = aadata->gb_radius;
    vsolv     = aadata->workparam;
    work      = aadata->work;
    jindex    = aadata->jindex_gb;
    dadx      = fr->dadx;

    still_p4_SSE    = _mm_set1_ps(STILL_P4);
    still_p5inv_SSE = _mm_set1_ps(STILL_P5INV);
    still_pip5_SSE  = _mm_set1_ps(STILL_PIP5);
    half_SSE        = _mm_set1_ps(0.5);
    one_SSE         = _mm_set1_ps(1.0);
    two_SSE         = _mm_set1_ps(2.0);
    four_SSE        = _mm_set1_ps(4.0);

    /* This will be summed, so it has to extend to natoms + buffer */
    for (i = 0; i < natoms+1+natoms/2; i++)
    {
        work[i] = 0;
    }

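    /* Copy coordinates into the aligned buffers, wrapping the source index
     * past natoms so the upper half duplicates the lower half.
     */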
    for (i = ni0; i < ni1+1+natoms/2; i++)
    {
        k           = i%natoms;
        x_align[i]  = x[3*k];
        y_align[i]  = x[3*k+1];
        z_align[i]  = x[3*k+2];
        work[i]     = 0;
    }

    for (i = ni0; i < ni1; i += UNROLLI)
    {
        /* We assume shifts are NOT used for all-vs-all interactions */

        /* Load i atom data */
        ix_SSE0          = _mm_load1_ps(x_align+i);
        iy_SSE0          = _mm_load1_ps(y_align+i);
        iz_SSE0          = _mm_load1_ps(z_align+i);
        ix_SSE1          = _mm_load1_ps(x_align+i+1);
        iy_SSE1          = _mm_load1_ps(y_align+i+1);
        iz_SSE1          = _mm_load1_ps(z_align+i+1);
        ix_SSE2          = _mm_load1_ps(x_align+i+2);
        iy_SSE2          = _mm_load1_ps(y_align+i+2);
        iz_SSE2          = _mm_load1_ps(z_align+i+2);
        ix_SSE3          = _mm_load1_ps(x_align+i+3);
        iy_SSE3          = _mm_load1_ps(y_align+i+3);
        iz_SSE3          = _mm_load1_ps(z_align+i+3);

        gpi_SSE0         = _mm_setzero_ps();
        gpi_SSE1         = _mm_setzero_ps();
        gpi_SSE2         = _mm_setzero_ps();
        gpi_SSE3         = _mm_setzero_ps();

        rai_SSE0         = _mm_load1_ps(gb_radius+i);
        rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
        rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
        rai_SSE3         = _mm_load1_ps(gb_radius+i+3);

        prod_ai_SSE0     = _mm_set1_ps(STILL_P4*vsolv[i]);
        prod_ai_SSE1     = _mm_set1_ps(STILL_P4*vsolv[i+1]);
        prod_ai_SSE2     = _mm_set1_ps(STILL_P4*vsolv[i+2]);
        prod_ai_SSE3     = _mm_set1_ps(STILL_P4*vsolv[i+3]);

        /* Load limits for loop over neighbors */
        nj0              = jindex[4*i];
        nj1              = jindex[4*i+1];
        nj2              = jindex[4*i+2];
        nj3              = jindex[4*i+3];

        pmask0           = aadata->prologue_mask_gb[i];
        pmask1           = aadata->prologue_mask_gb[i+1];
        pmask2           = aadata->prologue_mask_gb[i+2];
        pmask3           = aadata->prologue_mask_gb[i+3];
        emask0           = aadata->epilogue_mask[i];
        emask1           = aadata->epilogue_mask[i+1];
        emask2           = aadata->epilogue_mask[i+2];
        emask3           = aadata->epilogue_mask[i+3];

        imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
        imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
        imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
        imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));

        /* Prologue part, including exclusion mask */
        for (j = nj0; j < nj1; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)pmask0);
            jmask_SSE1 = _mm_load_ps((real *)pmask1);
            jmask_SSE2 = _mm_load_ps((real *)pmask2);
            jmask_SSE3 = _mm_load_ps((real *)pmask3);
            pmask0    += UNROLLJ;
            pmask1    += UNROLLJ;
            pmask2    += UNROLLJ;
            pmask3    += UNROLLJ;

            /* load j atom coordinates */
            jx_SSE            = _mm_load_ps(x_align+j);
            jy_SSE            = _mm_load_ps(y_align+j);
            jz_SSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Combine masks */
            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r and 1/r2 */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
            irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
            irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
            irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
            idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
            idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
            idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
            idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
            idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
            idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
            idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
            idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);

            raj_SSE            = _mm_load_ps(gb_radius+j);
            vaj_SSE            = _mm_load_ps(vsolv+j);

            rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
            rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
            rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
            rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);

            ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
            ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
            ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
            ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));

            ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
            ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
            ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
            ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
            theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
            theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
            theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
            theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
            gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
            gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
            gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
            gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
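            /* Still close-contact function: theta = STILL_PIP5*ratio is capped
             * at pi, since ratio is clamped to STILL_P5INV above. Then
             * ccf = ((1 - cos(theta))/2)^2 and the chain-rule factor is
             * dccf = theta*sin(theta)*(1 - cos(theta)).
             */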
            term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
            term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
            term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
            term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
            ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
            ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
            ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
            ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
            dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
                                            _mm_mul_ps(sinq_SSE0, theta_SSE0));
            dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
                                            _mm_mul_ps(sinq_SSE1, theta_SSE1));
            dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
                                            _mm_mul_ps(sinq_SSE2, theta_SSE2));
            dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
                                            _mm_mul_ps(sinq_SSE3, theta_SSE3));

            prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
            icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
            icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
            icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
            icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
            icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
            icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
            icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
            icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);

            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
                                                           _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
                                                           _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
                                                           _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));

            gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
            gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
            gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
            gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
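            /* gpi accumulates the Still sum P4*vsolv_j*ccf/r^4 over j for each
             * i atom, while work[j] collects the symmetric contributions to
             * the j atoms.
             */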
1042
1043             /* Save ai->aj and aj->ai chain rule terms */
1044             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
1045             dadx += 4;
1046             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
1047             dadx += 4;
1048             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
1049             dadx += 4;
1050             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
1051             dadx += 4;
1052
1053             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
1054             dadx += 4;
1055             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
1056             dadx += 4;
1057             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
1058             dadx += 4;
1059             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
1060             dadx += 4;
1061         }
1062
1063         /* Main part, no exclusions */
1064         for (j = nj1; j < nj2; j += UNROLLJ)
1065         {
1066             /* load j atom coordinates */
1067             jx_SSE            = _mm_load_ps(x_align+j);
1068             jy_SSE            = _mm_load_ps(y_align+j);
1069             jz_SSE            = _mm_load_ps(z_align+j);
1070
1071             /* Calculate distance */
1072             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
1073             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
1074             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
1075             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
1076             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
1077             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
1078             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
1079             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
1080             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
1081             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
1082             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
1083             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
1084
1085             /* rsq = dx*dx+dy*dy+dz*dz */
1086             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
1087             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
1088             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
1089             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
1090
1091             /* Calculate 1/r and 1/r2 */
1092             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1093             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1094             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1095             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1096
1097             /* Apply mask */
1098             rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
1099             rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
1100             rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
1101             rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);
1102
1103             irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
1104             irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
1105             irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
1106             irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
1107             idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
1108             idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
1109             idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
1110             idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
1111             idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
1112             idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
1113             idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
1114             idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
1115
1116             raj_SSE            = _mm_load_ps(gb_radius+j);
1117
1118             rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
1119             rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
1120             rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
1121             rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
1122             vaj_SSE            = _mm_load_ps(vsolv+j);
1123
1124             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
1125             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
1126             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
1127             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
1128
1129             ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
1130             ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
1131             ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
1132             ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
1133             theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
1134             theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
1135             theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
1136             theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
1137             gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
1138             gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
1139             gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
1140             gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
1141             term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
1142             term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
1143             term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
1144             term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
1145             ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
1146             ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
1147             ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
1148             ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
1149             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
1150                                             _mm_mul_ps(sinq_SSE0, theta_SSE0));
1151             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
1152                                             _mm_mul_ps(sinq_SSE1, theta_SSE1));
1153             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
1154                                             _mm_mul_ps(sinq_SSE2, theta_SSE2));
1155             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
1156                                             _mm_mul_ps(sinq_SSE3, theta_SSE3));
1157
1158             prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE );
1159             icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
1160             icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
1161             icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
1162             icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
1163             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1164             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1165             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
1166             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
1167
1168             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
1169                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
1170                                                            _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
1171                                                            _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
1172                                                            _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
1173
1174             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
1175             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
1176             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
1177             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
1178
1179             /* Save ai->aj and aj->ai chain rule terms */
1180             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
1181             dadx += 4;
1182             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
1183             dadx += 4;
1184             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
1185             dadx += 4;
1186             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
1187             dadx += 4;
1188
1189             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
1190             dadx += 4;
1191             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
1192             dadx += 4;
1193             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
1194             dadx += 4;
1195             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
1196             dadx += 4;
1197         }
1198         /* Epilogue part, including exclusion mask */
1199         for (j = nj2; j < nj3; j += UNROLLJ)
1200         {
1201             jmask_SSE0 = _mm_load_ps((real *)emask0);
1202             jmask_SSE1 = _mm_load_ps((real *)emask1);
1203             jmask_SSE2 = _mm_load_ps((real *)emask2);
1204             jmask_SSE3 = _mm_load_ps((real *)emask3);
1205             emask0    += UNROLLJ;
1206             emask1    += UNROLLJ;
1207             emask2    += UNROLLJ;
1208             emask3    += UNROLLJ;
1209
1210             /* load j atom coordinates */
1211             jx_SSE            = _mm_load_ps(x_align+j);
1212             jy_SSE            = _mm_load_ps(y_align+j);
1213             jz_SSE            = _mm_load_ps(z_align+j);
1214
1215             /* Calculate distance */
1216             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
1217             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
1218             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
1219             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
1220             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
1221             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
1222             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
1223             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
1224             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
1225             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
1226             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
1227             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
1228
1229             /* rsq = dx*dx+dy*dy+dz*dz */
1230             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
1231             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
1232             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
1233             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
1234
1235             /* Combine masks */
1236             jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
1237             jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
1238             jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
1239             jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
1240
1241             /* Calculate 1/r and 1/r2 */
1242             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1243             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1244             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1245             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1246
1247             /* Apply mask */
1248             rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
1249             rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
1250             rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
1251             rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
1252
1253             irsq_SSE0          = _mm_mul_ps(rinv_SSE0, rinv_SSE0);
1254             irsq_SSE1          = _mm_mul_ps(rinv_SSE1, rinv_SSE1);
1255             irsq_SSE2          = _mm_mul_ps(rinv_SSE2, rinv_SSE2);
1256             irsq_SSE3          = _mm_mul_ps(rinv_SSE3, rinv_SSE3);
1257             idr4_SSE0          = _mm_mul_ps(irsq_SSE0, irsq_SSE0);
1258             idr4_SSE1          = _mm_mul_ps(irsq_SSE1, irsq_SSE1);
1259             idr4_SSE2          = _mm_mul_ps(irsq_SSE2, irsq_SSE2);
1260             idr4_SSE3          = _mm_mul_ps(irsq_SSE3, irsq_SSE3);
1261             idr6_SSE0          = _mm_mul_ps(idr4_SSE0, irsq_SSE0);
1262             idr6_SSE1          = _mm_mul_ps(idr4_SSE1, irsq_SSE1);
1263             idr6_SSE2          = _mm_mul_ps(idr4_SSE2, irsq_SSE2);
1264             idr6_SSE3          = _mm_mul_ps(idr4_SSE3, irsq_SSE3);
1265
1266             raj_SSE            = _mm_load_ps(gb_radius+j);
1267             vaj_SSE            = _mm_load_ps(vsolv+j);
1268
1269             rvdw_SSE0          = _mm_add_ps(rai_SSE0, raj_SSE);
1270             rvdw_SSE1          = _mm_add_ps(rai_SSE1, raj_SSE);
1271             rvdw_SSE2          = _mm_add_ps(rai_SSE2, raj_SSE);
1272             rvdw_SSE3          = _mm_add_ps(rai_SSE3, raj_SSE);
1273
1274             ratio_SSE0         = _mm_mul_ps(rsq_SSE0, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE0, rvdw_SSE0)));
1275             ratio_SSE1         = _mm_mul_ps(rsq_SSE1, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE1, rvdw_SSE1)));
1276             ratio_SSE2         = _mm_mul_ps(rsq_SSE2, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE2, rvdw_SSE2)));
1277             ratio_SSE3         = _mm_mul_ps(rsq_SSE3, gmx_mm_inv_ps( _mm_mul_ps(rvdw_SSE3, rvdw_SSE3)));
1278
1279             ratio_SSE0         = _mm_min_ps(ratio_SSE0, still_p5inv_SSE);
1280             ratio_SSE1         = _mm_min_ps(ratio_SSE1, still_p5inv_SSE);
1281             ratio_SSE2         = _mm_min_ps(ratio_SSE2, still_p5inv_SSE);
1282             ratio_SSE3         = _mm_min_ps(ratio_SSE3, still_p5inv_SSE);
1283             theta_SSE0         = _mm_mul_ps(ratio_SSE0, still_pip5_SSE);
1284             theta_SSE1         = _mm_mul_ps(ratio_SSE1, still_pip5_SSE);
1285             theta_SSE2         = _mm_mul_ps(ratio_SSE2, still_pip5_SSE);
1286             theta_SSE3         = _mm_mul_ps(ratio_SSE3, still_pip5_SSE);
1287             gmx_mm_sincos_ps(theta_SSE0, &sinq_SSE0, &cosq_SSE0);
1288             gmx_mm_sincos_ps(theta_SSE1, &sinq_SSE1, &cosq_SSE1);
1289             gmx_mm_sincos_ps(theta_SSE2, &sinq_SSE2, &cosq_SSE2);
1290             gmx_mm_sincos_ps(theta_SSE3, &sinq_SSE3, &cosq_SSE3);
1291             term_SSE0          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE0));
1292             term_SSE1          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE1));
1293             term_SSE2          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE2));
1294             term_SSE3          = _mm_mul_ps(half_SSE, _mm_sub_ps(one_SSE, cosq_SSE3));
1295             ccf_SSE0           = _mm_mul_ps(term_SSE0, term_SSE0);
1296             ccf_SSE1           = _mm_mul_ps(term_SSE1, term_SSE1);
1297             ccf_SSE2           = _mm_mul_ps(term_SSE2, term_SSE2);
1298             ccf_SSE3           = _mm_mul_ps(term_SSE3, term_SSE3);
1299             dccf_SSE0          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE0),
1300                                             _mm_mul_ps(sinq_SSE0, theta_SSE0));
1301             dccf_SSE1          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE1),
1302                                             _mm_mul_ps(sinq_SSE1, theta_SSE1));
1303             dccf_SSE2          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE2),
1304                                             _mm_mul_ps(sinq_SSE2, theta_SSE2));
1305             dccf_SSE3          = _mm_mul_ps(_mm_mul_ps(two_SSE, term_SSE3),
1306                                             _mm_mul_ps(sinq_SSE3, theta_SSE3));
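            /* Scalar form of the Still close-contact function computed above
             * (a reference sketch, not the compiled path). With
             * theta = still_pip5 * min(r^2/(rai+raj)^2, still_p5inv):
             *
             *   term = (1 - cos(theta))/2
             *   ccf  = term^2
             *   dccf = 2 * term * sin(theta) * theta
             *
             * Since theta is proportional to r^2, dccf works out to
             * r * d(ccf)/dr, the factor folded into icf6 below.
             */
#if 0
            /* hypothetical scalar reference, assuming <math.h> */
            {
                float term = 0.5f*(1.0f - cosf(theta));
                float ccf  = term*term;
                float dccf = 2.0f*term*sinf(theta)*theta;  /* = r*d(ccf)/dr */
            }
#endif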
1307
1308             prod_SSE           = _mm_mul_ps(still_p4_SSE, vaj_SSE);
1309             icf4_SSE0          = _mm_mul_ps(ccf_SSE0, idr4_SSE0);
1310             icf4_SSE1          = _mm_mul_ps(ccf_SSE1, idr4_SSE1);
1311             icf4_SSE2          = _mm_mul_ps(ccf_SSE2, idr4_SSE2);
1312             icf4_SSE3          = _mm_mul_ps(ccf_SSE3, idr4_SSE3);
1313             icf6_SSE0          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE0), dccf_SSE0), idr6_SSE0);
1314             icf6_SSE1          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE1), dccf_SSE1), idr6_SSE1);
1315             icf6_SSE2          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE2), dccf_SSE2), idr6_SSE2);
1316             icf6_SSE3          = _mm_mul_ps( _mm_sub_ps( _mm_mul_ps(four_SSE, ccf_SSE3), dccf_SSE3), idr6_SSE3);
1317
1318             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
1319                                             gmx_mm_sum4_ps(_mm_mul_ps(prod_ai_SSE0, icf4_SSE0),
1320                                                            _mm_mul_ps(prod_ai_SSE1, icf4_SSE1),
1321                                                            _mm_mul_ps(prod_ai_SSE2, icf4_SSE2),
1322                                                            _mm_mul_ps(prod_ai_SSE3, icf4_SSE3))));
1323
1324             gpi_SSE0           = _mm_add_ps(gpi_SSE0, _mm_mul_ps(prod_SSE, icf4_SSE0));
1325             gpi_SSE1           = _mm_add_ps(gpi_SSE1, _mm_mul_ps(prod_SSE, icf4_SSE1));
1326             gpi_SSE2           = _mm_add_ps(gpi_SSE2, _mm_mul_ps(prod_SSE, icf4_SSE2));
1327             gpi_SSE3           = _mm_add_ps(gpi_SSE3, _mm_mul_ps(prod_SSE, icf4_SSE3));
1328
1329             /* Save ai->aj and aj->ai chain rule terms */
1330             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE0));
1331             dadx += 4;
1332             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE1));
1333             dadx += 4;
1334             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE2));
1335             dadx += 4;
1336             _mm_store_ps(dadx, _mm_mul_ps(prod_SSE, icf6_SSE3));
1337             dadx += 4;
1338
1339             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE0, icf6_SSE0));
1340             dadx += 4;
1341             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE1, icf6_SSE1));
1342             dadx += 4;
1343             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE2, icf6_SSE2));
1344             dadx += 4;
1345             _mm_store_ps(dadx, _mm_mul_ps(prod_ai_SSE3, icf6_SSE3));
1346             dadx += 4;
1347         }
1348         _MM_TRANSPOSE4_PS(gpi_SSE0, gpi_SSE1, gpi_SSE2, gpi_SSE3);
1349         gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE1);
1350         gpi_SSE2 = _mm_add_ps(gpi_SSE2, gpi_SSE3);
1351         gpi_SSE0 = _mm_add_ps(gpi_SSE0, gpi_SSE2);
1352         _mm_store_ps(work+i, _mm_add_ps(gpi_SSE0, _mm_load_ps(work+i)));
1353     }
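    /* A sketch of the transpose-based horizontal reduction used above: each
     * of the four gpi accumulators holds four j-lane partial sums for one
     * i atom; the transpose regroups them so that lane k of every register
     * belongs to atom i+k, after which two vector adds yield all four
     * horizontal sums at once. sum4_across_lanes is a hypothetical helper
     * name; the intrinsics are standard SSE from <xmmintrin.h>.
     */
#if 0
    static __m128
    sum4_across_lanes(__m128 s0, __m128 s1, __m128 s2, __m128 s3)
    {
        _MM_TRANSPOSE4_PS(s0, s1, s2, s3); /* lane k now holds partials for atom i+k */
        s0 = _mm_add_ps(s0, s1);
        s2 = _mm_add_ps(s2, s3);
        return _mm_add_ps(s0, s2);         /* element k = full sum for atom i+k */
    }
#endif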
1354
1355     /* In case we have written anything beyond natoms, fold it back
1356      * into the real atom range. Never mind the leftovers above natoms;
1357      * they are not accessed again later in the routine.
1358      * In principle this should be a move rather than a sum, but this
1359      * way we don't have to worry about even/odd offsets...
1360      */
1361     for (i = natoms; i < ni1+1+natoms/2; i++)
1362     {
1363         work[i-natoms] += work[i];
1364     }
1365
1366     /* Parallel summations */
1367     if (PARTDECOMP(cr))
1368     {
1369         gmx_sum(natoms, work, cr);
1370     }
1371
1372     factor  = 0.5 * ONE_4PI_EPS0;
1373     /* Calculate the radii - should we do all atoms, or just our local ones? */
1374     for (i = 0; i < natoms; i++)
1375     {
1376         if (born->use[i] != 0)
1377         {
1378             gpi             = born->gpol[i]+work[i];
1379             gpi2            = gpi * gpi;
1380             born->bRad[i]   = factor*gmx_invsqrt(gpi2);
1381             fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1382         }
1383     }
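    /* Scalar equivalent of the radius update above (a sketch, assuming
     * gpi > 0 so that gmx_invsqrt(gpi*gpi) reduces to 1/gpi): the Born
     * radius follows the Still expression R_i = (ONE_4PI_EPS0/2)/gpi,
     * and invsqrta caches 1/sqrt(R_i) for the polarization kernel.
     */
#if 0
    gpi             = born->gpol[i] + work[i];
    born->bRad[i]   = factor/gpi;
    fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
#endif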
1384
1385     return 0;
1386 }
1387
1388
1389
1390 int
1391 genborn_allvsall_calc_hct_obc_radii_sse2_single(t_forcerec *           fr,
1392                                                 t_mdatoms *            mdatoms,
1393                                                 gmx_genborn_t *        born,
1394                                                 int                    gb_algorithm,
1395                                                 gmx_localtop_t *       top,
1396                                                 real *                 x,
1397                                                 t_commrec *            cr,
1398                                                 void *                 paadata)
1399 {
1400     gmx_allvsallgb2_data_t *aadata;
1401     int                     natoms;
1402     int                     ni0, ni1;
1403     int                     nj0, nj1, nj2, nj3;
1404     int                     i, j, k, n;
1405     int              *      mask;
1406     int              *      pmask0;
1407     int              *      pmask1;
1408     int              *      pmask2;
1409     int              *      pmask3;
1410     int              *      emask0;
1411     int              *      emask1;
1412     int              *      emask2;
1413     int              *      emask3;
1414     real              *     gb_radius;
1415     real              *     vsolv;
1416     real              *     work;
1417     real                    tmpsum[4];
1418     real              *     x_align;
1419     real              *     y_align;
1420     real              *     z_align;
1421     int              *      jindex;
1422     real              *     dadx;
1423     real              *     obc_param;
1424     real                    rad, min_rad;
1425     real                    rai, rai_inv, rai_inv2, sum_ai, sum_ai2, sum_ai3, tsum, tchain;
1426
1427     __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
1428     __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
1429     __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
1430     __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
1431     __m128                  gpi_SSE0, rai_SSE0, prod_ai_SSE0;
1432     __m128                  gpi_SSE1, rai_SSE1, prod_ai_SSE1;
1433     __m128                  gpi_SSE2, rai_SSE2, prod_ai_SSE2;
1434     __m128                  gpi_SSE3, rai_SSE3, prod_ai_SSE3;
1435     __m128                  imask_SSE0, jmask_SSE0;
1436     __m128                  imask_SSE1, jmask_SSE1;
1437     __m128                  imask_SSE2, jmask_SSE2;
1438     __m128                  imask_SSE3, jmask_SSE3;
1439     __m128                  jx_SSE, jy_SSE, jz_SSE;
1440     __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
1441     __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
1442     __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
1443     __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
1444     __m128                  rsq_SSE0, rinv_SSE0, irsq_SSE0, idr4_SSE0, idr6_SSE0;
1445     __m128                  rsq_SSE1, rinv_SSE1, irsq_SSE1, idr4_SSE1, idr6_SSE1;
1446     __m128                  rsq_SSE2, rinv_SSE2, irsq_SSE2, idr4_SSE2, idr6_SSE2;
1447     __m128                  rsq_SSE3, rinv_SSE3, irsq_SSE3, idr4_SSE3, idr6_SSE3;
1448     __m128                  raj_SSE, raj_inv_SSE, sk_aj_SSE, sk2_aj_SSE;
1449     __m128                  ccf_SSE0, dccf_SSE0, prod_SSE0;
1450     __m128                  ccf_SSE1, dccf_SSE1, prod_SSE1;
1451     __m128                  ccf_SSE2, dccf_SSE2, prod_SSE2;
1452     __m128                  ccf_SSE3, dccf_SSE3, prod_SSE3;
1453     __m128                  icf4_SSE0, icf6_SSE0;
1454     __m128                  icf4_SSE1, icf6_SSE1;
1455     __m128                  icf4_SSE2, icf6_SSE2;
1456     __m128                  icf4_SSE3, icf6_SSE3;
1457     __m128                  oneeighth_SSE, onefourth_SSE, half_SSE, one_SSE, two_SSE, four_SSE;
1458     __m128                  still_p4_SSE, still_p5inv_SSE, still_pip5_SSE;
1459     __m128                  rai_inv_SSE0;
1460     __m128                  rai_inv_SSE1;
1461     __m128                  rai_inv_SSE2;
1462     __m128                  rai_inv_SSE3;
1463     __m128                  sk_ai_SSE0, sk2_ai_SSE0, sum_ai_SSE0;
1464     __m128                  sk_ai_SSE1, sk2_ai_SSE1, sum_ai_SSE1;
1465     __m128                  sk_ai_SSE2, sk2_ai_SSE2, sum_ai_SSE2;
1466     __m128                  sk_ai_SSE3, sk2_ai_SSE3, sum_ai_SSE3;
1467     __m128                  lij_inv_SSE0, sk2_rinv_SSE0;
1468     __m128                  lij_inv_SSE1, sk2_rinv_SSE1;
1469     __m128                  lij_inv_SSE2, sk2_rinv_SSE2;
1470     __m128                  lij_inv_SSE3, sk2_rinv_SSE3;
1471     __m128                  dr_SSE0;
1472     __m128                  dr_SSE1;
1473     __m128                  dr_SSE2;
1474     __m128                  dr_SSE3;
1475     __m128                  t1_SSE0, t2_SSE0, t3_SSE0, t4_SSE0;
1476     __m128                  t1_SSE1, t2_SSE1, t3_SSE1, t4_SSE1;
1477     __m128                  t1_SSE2, t2_SSE2, t3_SSE2, t4_SSE2;
1478     __m128                  t1_SSE3, t2_SSE3, t3_SSE3, t4_SSE3;
1479     __m128                  obc_mask1_SSE0, obc_mask2_SSE0, obc_mask3_SSE0;
1480     __m128                  obc_mask1_SSE1, obc_mask2_SSE1, obc_mask3_SSE1;
1481     __m128                  obc_mask1_SSE2, obc_mask2_SSE2, obc_mask3_SSE2;
1482     __m128                  obc_mask1_SSE3, obc_mask2_SSE3, obc_mask3_SSE3;
1483     __m128                  uij_SSE0, uij2_SSE0, uij3_SSE0;
1484     __m128                  uij_SSE1, uij2_SSE1, uij3_SSE1;
1485     __m128                  uij_SSE2, uij2_SSE2, uij3_SSE2;
1486     __m128                  uij_SSE3, uij2_SSE3, uij3_SSE3;
1487     __m128                  lij_SSE0, lij2_SSE0, lij3_SSE0;
1488     __m128                  lij_SSE1, lij2_SSE1, lij3_SSE1;
1489     __m128                  lij_SSE2, lij2_SSE2, lij3_SSE2;
1490     __m128                  lij_SSE3, lij2_SSE3, lij3_SSE3;
1491     __m128                  dlij_SSE0, diff2_SSE0, logterm_SSE0;
1492     __m128                  dlij_SSE1, diff2_SSE1, logterm_SSE1;
1493     __m128                  dlij_SSE2, diff2_SSE2, logterm_SSE2;
1494     __m128                  dlij_SSE3, diff2_SSE3, logterm_SSE3;
1495     __m128                  doffset_SSE;
1496
1497     natoms              = mdatoms->nr;
1498     ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
1499     ni1                 = mdatoms->start+mdatoms->homenr;
1500
1501     n = 0;
1502
1503     aadata = *((gmx_allvsallgb2_data_t **)paadata);
1504
1505
1506     if (aadata == NULL)
1507     {
1508         genborn_allvsall_setup(&aadata, top, born, mdatoms, born->gb_doffset,
1509                                egbOBC, TRUE, TRUE, TRUE);
1510         *((gmx_allvsallgb2_data_t **)paadata) = aadata;
1511     }
1512
1513     x_align = aadata->x_align;
1514     y_align = aadata->y_align;
1515     z_align = aadata->z_align;
1516
1517     gb_radius = aadata->gb_radius;
1518     work      = aadata->work;
1519     jindex    = aadata->jindex_gb;
1520     dadx      = fr->dadx;
1521     obc_param = aadata->workparam;
1522
1523     oneeighth_SSE   = _mm_set1_ps(0.125);
1524     onefourth_SSE   = _mm_set1_ps(0.25);
1525     half_SSE        = _mm_set1_ps(0.5);
1526     one_SSE         = _mm_set1_ps(1.0);
1527     two_SSE         = _mm_set1_ps(2.0);
1528     four_SSE        = _mm_set1_ps(4.0);
1529     doffset_SSE     = _mm_set1_ps(born->gb_doffset);
1530
1531     for (i = 0; i < natoms; i++)
1532     {
1533         x_align[i]  = x[3*i];
1534         y_align[i]  = x[3*i+1];
1535         z_align[i]  = x[3*i+2];
1536     }
1537
1538     /* Replicate the first natoms/2+1 entries past natoms for the wrap-around j loop */
1539     for (i = 0; i < natoms/2+1; i++)
1540     {
1541         x_align[natoms+i]  = x_align[i];
1542         y_align[natoms+i]  = y_align[i];
1543         z_align[natoms+i]  = z_align[i];
1544     }
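    /* The two loops above build the SoA layout this kernel relies on:
     * coordinates are deinterleaved into aligned x/y/z arrays so four
     * j atoms load with one _mm_load_ps each, and the first natoms/2+1
     * entries are replicated past natoms so the triangular all-vs-all
     * j loop can run past the end without wrap-around branches. A
     * minimal sketch of the resulting access pattern:
     */
#if 0
    jx_SSE = _mm_load_ps(x_align+j);   /* x of atoms j..j+3; j may exceed natoms */
    jy_SSE = _mm_load_ps(y_align+j);
    jz_SSE = _mm_load_ps(z_align+j);
#endif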
1545
1546     for (i = 0; i < natoms+natoms/2+1; i++)
1547     {
1548         work[i] = 0;
1549     }
1550
1551     for (i = ni0; i < ni1; i += UNROLLI)
1552     {
1553         /* We assume shifts are NOT used for all-vs-all interactions */
1554
1555         /* Load i atom data */
1556         ix_SSE0          = _mm_load1_ps(x_align+i);
1557         iy_SSE0          = _mm_load1_ps(y_align+i);
1558         iz_SSE0          = _mm_load1_ps(z_align+i);
1559         ix_SSE1          = _mm_load1_ps(x_align+i+1);
1560         iy_SSE1          = _mm_load1_ps(y_align+i+1);
1561         iz_SSE1          = _mm_load1_ps(z_align+i+1);
1562         ix_SSE2          = _mm_load1_ps(x_align+i+2);
1563         iy_SSE2          = _mm_load1_ps(y_align+i+2);
1564         iz_SSE2          = _mm_load1_ps(z_align+i+2);
1565         ix_SSE3          = _mm_load1_ps(x_align+i+3);
1566         iy_SSE3          = _mm_load1_ps(y_align+i+3);
1567         iz_SSE3          = _mm_load1_ps(z_align+i+3);
1568
1569         rai_SSE0         = _mm_load1_ps(gb_radius+i);
1570         rai_SSE1         = _mm_load1_ps(gb_radius+i+1);
1571         rai_SSE2         = _mm_load1_ps(gb_radius+i+2);
1572         rai_SSE3         = _mm_load1_ps(gb_radius+i+3);
1573         rai_inv_SSE0     = gmx_mm_inv_ps(rai_SSE0);
1574         rai_inv_SSE1     = gmx_mm_inv_ps(rai_SSE1);
1575         rai_inv_SSE2     = gmx_mm_inv_ps(rai_SSE2);
1576         rai_inv_SSE3     = gmx_mm_inv_ps(rai_SSE3);
1577
1578         sk_ai_SSE0       = _mm_load1_ps(obc_param+i);
1579         sk_ai_SSE1       = _mm_load1_ps(obc_param+i+1);
1580         sk_ai_SSE2       = _mm_load1_ps(obc_param+i+2);
1581         sk_ai_SSE3       = _mm_load1_ps(obc_param+i+3);
1582         sk2_ai_SSE0      = _mm_mul_ps(sk_ai_SSE0, sk_ai_SSE0);
1583         sk2_ai_SSE1      = _mm_mul_ps(sk_ai_SSE1, sk_ai_SSE1);
1584         sk2_ai_SSE2      = _mm_mul_ps(sk_ai_SSE2, sk_ai_SSE2);
1585         sk2_ai_SSE3      = _mm_mul_ps(sk_ai_SSE3, sk_ai_SSE3);
1586
1587         sum_ai_SSE0      = _mm_setzero_ps();
1588         sum_ai_SSE1      = _mm_setzero_ps();
1589         sum_ai_SSE2      = _mm_setzero_ps();
1590         sum_ai_SSE3      = _mm_setzero_ps();
1591
1592         /* Load limits for loop over neighbors */
1593         nj0              = jindex[4*i];
1594         nj1              = jindex[4*i+1];
1595         nj2              = jindex[4*i+2];
1596         nj3              = jindex[4*i+3];
1597
1598         pmask0           = aadata->prologue_mask_gb[i];
1599         pmask1           = aadata->prologue_mask_gb[i+1];
1600         pmask2           = aadata->prologue_mask_gb[i+2];
1601         pmask3           = aadata->prologue_mask_gb[i+3];
1602         emask0           = aadata->epilogue_mask[i];
1603         emask1           = aadata->epilogue_mask[i+1];
1604         emask2           = aadata->epilogue_mask[i+2];
1605         emask3           = aadata->epilogue_mask[i+3];
1606
1607         imask_SSE0        = _mm_load1_ps((real *)(aadata->imask+i));
1608         imask_SSE1        = _mm_load1_ps((real *)(aadata->imask+i+1));
1609         imask_SSE2        = _mm_load1_ps((real *)(aadata->imask+i+2));
1610         imask_SSE3        = _mm_load1_ps((real *)(aadata->imask+i+3));
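        /* The i/j masks are stored as 32-bit integer patterns (0 or
         * 0xFFFFFFFF) and reinterpreted as float lanes, so a single
         * bitwise AND zeroes out excluded or out-of-range interactions.
         * A sketch of the idiom, assuming mask_bits points at such a
         * pattern:
         */
#if 0
        __m128 m = _mm_load1_ps((real *)mask_bits); /* broadcast the bit pattern */
        rinv     = _mm_and_ps(rinv, m);             /* masked lanes become 0.0f  */
#endif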
1611
1612         /* Prologue part, including exclusion mask */
1613         for (j = nj0; j < nj1; j += UNROLLJ)
1614         {
1615             jmask_SSE0 = _mm_load_ps((real *)pmask0);
1616             jmask_SSE1 = _mm_load_ps((real *)pmask1);
1617             jmask_SSE2 = _mm_load_ps((real *)pmask2);
1618             jmask_SSE3 = _mm_load_ps((real *)pmask3);
1619             pmask0    += UNROLLJ;
1620             pmask1    += UNROLLJ;
1621             pmask2    += UNROLLJ;
1622             pmask3    += UNROLLJ;
1623
1624             /* load j atom coordinates */
1625             jx_SSE            = _mm_load_ps(x_align+j);
1626             jy_SSE            = _mm_load_ps(y_align+j);
1627             jz_SSE            = _mm_load_ps(z_align+j);
1628
1629             /* Calculate distance */
1630             dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
1631             dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
1632             dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
1633             dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
1634             dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
1635             dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
1636             dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
1637             dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
1638             dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
1639             dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
1640             dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
1641             dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);
1642
1643             /* rsq = dx*dx+dy*dy+dz*dz */
1644             rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
1645             rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
1646             rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
1647             rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
1648
1649             /* Combine masks */
1650             jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
1651             jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
1652             jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
1653             jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);
1654
1655             /* Calculate 1/r and 1/r2 */
1656             rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
1657             rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
1658             rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
1659             rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);
1660
1661             /* Apply mask */
1662             rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
1663             rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
1664             rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
1665             rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);
1666
1667             dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
1668             dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
1669             dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
1670             dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);
1671
1672             sk_aj_SSE          = _mm_load_ps(obc_param+j);
1673             raj_SSE            = _mm_load_ps(gb_radius+j);
1674             raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);
1675
1676             /* Evaluate influence of atom aj -> ai */
1677             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
1678             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
1679             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
1680             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
1681             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
1682             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
1683             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
1684             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
1685             t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
1686             t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
1687             t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
1688             t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);
1689
1690             obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
1691             obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
1692             obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
1693             obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
1694             obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
1695             obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
1696             obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
1697             obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
1698             obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
1699             obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
1700             obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
1701             obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
1702             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
1703             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
1704             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
1705             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
1706
1707             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
1708             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
1709             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
1710             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
1711             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
1712                                               _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
1713             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
1714                                               _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
1715             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
1716                                               _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
1717             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
1718                                               _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
1719             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
1720             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
1721             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
1722             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
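            /* SSE2 has no blendv instruction, so the selection
             * "mask ? 1/t2 : 1/rai" used for lij above is built from
             * three bitwise ops; the dlij lines use the same trick to
             * pick 1.0 or 0.0. A sketch of the pattern (sse2_select_ps
             * is a hypothetical helper name):
             */
#if 0
            static __m128
            sse2_select_ps(__m128 mask, __m128 a, __m128 b)
            {
                return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
            }
#endif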
1723
1724             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
1725             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
1726             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
1727             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
1728             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
1729             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
1730             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
1731             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
1732             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
1733             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
1734             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
1735             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
1736             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
1737             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
1738             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
1739             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
1740
1741             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
1742             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
1743             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
1744             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
1745             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
1746             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
1747             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
1748             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
1749             sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
1750             sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
1751             sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
1752             sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
1753             sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
1754             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
1755             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
1756             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
1757             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
1758
1759             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
1760             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
1761             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
1762             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
1763
1764             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
1765             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
1766             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
1767             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
1768             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
1769                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
1770                                                        prod_SSE0));
1771             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
1772                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
1773                                                        prod_SSE1));
1774             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
1775                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
1776                                                        prod_SSE2));
1777             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
1778                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
1779                                                        prod_SSE3));
1780
1781             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
1782             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
1783             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
1784             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
1785             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
1786             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
1787             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
1788             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
1789             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
1790             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
1791             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
1792             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
1793             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
1794             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
1795             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
1796             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
1797             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
1798             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
1799             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
1800             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
1801
1802             sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
1803             sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
1804             sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
1805             sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
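            /* In scalar form, the contribution accumulated above is the
             * HCT descreening integral (writing L = 1/lij, U = 1/uij,
             * r = dr and sk = sk_aj):
             *
             *   I = 1/2 * [ 1/L - 1/U
             *               + (1/U^2 - 1/L^2) * (r/4 - sk^2/(4r))
             *               + 1/(2r) * log(L/U)
             *               + (rai < sk - r ? 2*(1/rai - 1/L) : 0) ]
             *
             * with the whole term gated by obc_mask1, i.e. rai < r + sk.
             */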
1806
1807             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
1808                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
1809             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
1810                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
1811             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
1812                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
1813             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
1814                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
1815             t1_SSE0            = _mm_sub_ps(t1_SSE0,
1816                                             _mm_mul_ps(onefourth_SSE,
1817                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
1818                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
1819             t1_SSE1            = _mm_sub_ps(t1_SSE1,
1820                                             _mm_mul_ps(onefourth_SSE,
1821                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
1822                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
1823             t1_SSE2            = _mm_sub_ps(t1_SSE2,
1824                                             _mm_mul_ps(onefourth_SSE,
1825                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
1826                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
1827             t1_SSE3            = _mm_sub_ps(t1_SSE3,
1828                                             _mm_mul_ps(onefourth_SSE,
1829                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
1830                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
1831
1832             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
1833                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
1834                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
1835             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
1836                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
1837                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
1838             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
1839                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
1840                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
1841             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
1842                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
1843                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
1844             t2_SSE0            = _mm_sub_ps(t2_SSE0,
1845                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
1846                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
1847             t2_SSE1            = _mm_sub_ps(t2_SSE1,
1848                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
1849                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
1850             t2_SSE2            = _mm_sub_ps(t2_SSE2,
1851                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
1852                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
1853             t2_SSE3            = _mm_sub_ps(t2_SSE3,
1854                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
1855                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
1856             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
1857                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
1858             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
1859                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
1860             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
1861                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
1862             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
1863                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
1864             t3_SSE0            = _mm_sub_ps(t3_SSE0,
1865                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
1866                                                        _mm_add_ps(one_SSE,
1867                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
1868             t3_SSE1            = _mm_sub_ps(t3_SSE1,
1869                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
1870                                                        _mm_add_ps(one_SSE,
1871                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
1872             t3_SSE2            = _mm_sub_ps(t3_SSE2,
1873                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
1874                                                        _mm_add_ps(one_SSE,
1875                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
1876             t3_SSE3            = _mm_sub_ps(t3_SSE3,
1877                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
1878                                                        _mm_add_ps(one_SSE,
1879                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
1880
1881             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
1882                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
1883                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
1884             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
1885                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
1886                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
1887             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
1888                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
1889                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
1890             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
1891                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
1892                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
1893
1894             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
1895             dadx += 4;
1896             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
1897             dadx += 4;
1898             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
1899             dadx += 4;
1900             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
1901             dadx += 4;
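            /* The chain-rule terms dA_i/dr are streamed to dadx in the
             * same UNROLLI x UNROLLJ order in which the loops visit the
             * pairs, so the force kernel that later consumes fr->dadx
             * can walk the buffer linearly with a matching loop
             * structure (an assumption about the consumer, which is not
             * shown in this routine).
             */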
1902
1903             /* Evaluate influence of atom ai -> aj */
1904             t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
1905             t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
1906             t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
1907             t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
1908             t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
1909             t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
1910             t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
1911             t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
1912             t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
1913             t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
1914             t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
1915             t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);
1916
1917             obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
1918             obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
1919             obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
1920             obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
1921             obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
1922             obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
1923             obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
1924             obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
1925             obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
1926             obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
1927             obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
1928             obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
1929             obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
1930             obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
1931             obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
1932             obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);
1933
1934             uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
1935             uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
1936             uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
1937             uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
1938             lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
1939                                               _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
1940             lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
1941                                               _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
1942             lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
1943                                               _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
1944             lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
1945                                               _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
1946             dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
1947             dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
1948             dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
1949             dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);
1950
1951             uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
1952             uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
1953             uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
1954             uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
1955             uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
1956             uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
1957             uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
1958             uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
1959             lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
1960             lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
1961             lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
1962             lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
1963             lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
1964             lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
1965             lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
1966             lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);
1967
1968             diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
1969             diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
1970             diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
1971             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
1972             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
1973             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
1974             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
1975             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
1976             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
1977             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
1978             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
1979             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
1980             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
1981             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
1982             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
1983             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
1984
1985             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
1986             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
1987             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
1988             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
1989             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
1990             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
1991             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
1992             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
1993             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
1994                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
1995                                                        prod_SSE0));
1996             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
1997                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
1998                                                        prod_SSE1));
1999             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
2000                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
2001                                                        prod_SSE2));
2002             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
2003                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
2004                                                        prod_SSE3));
2005             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
2006             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
2007             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
2008             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
2009             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
2010             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
2011             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
2012             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
2013             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
2014             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
2015             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
2016             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
2017             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
2018             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
2019             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
2020             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
2021             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
2022             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
2023             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
2024             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
2025
2026             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
2027                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
2028                                                            _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
2029                                                            _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
2030                                                            _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
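            /* gmx_mm_sum4_ps above is assumed to be a plain four-way
             * vector add, so the j-atom work array receives the combined
             * ai->aj contribution of all four i atoms in a single store.
             * A sketch of that assumption:
             */
#if 0
            static __m128
            sum4_ps(__m128 a, __m128 b, __m128 c, __m128 d)
            {
                return _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d));
            }
#endif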
2031
2032             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
2033                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
2034             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
2035                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
2036             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
2037                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
2038             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
2039                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
2040             t1_SSE0            = _mm_sub_ps(t1_SSE0,
2041                                             _mm_mul_ps(onefourth_SSE,
2042                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
2043                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
2044             t1_SSE1            = _mm_sub_ps(t1_SSE1,
2045                                             _mm_mul_ps(onefourth_SSE,
2046                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
2047                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
2048             t1_SSE2            = _mm_sub_ps(t1_SSE2,
2049                                             _mm_mul_ps(onefourth_SSE,
2050                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
2051                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
2052             t1_SSE3            = _mm_sub_ps(t1_SSE3,
2053                                             _mm_mul_ps(onefourth_SSE,
2054                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
2055                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
2056             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
2057                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
2058                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
2059             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
2060                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
2061                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
2062             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
2063                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
2064                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
2065             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
2066                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
2067                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
2068             t2_SSE0            = _mm_sub_ps(t2_SSE0,
2069                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
2070                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
2071             t2_SSE1            = _mm_sub_ps(t2_SSE1,
2072                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
2073                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
2074             t2_SSE2            = _mm_sub_ps(t2_SSE2,
2075                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
2076                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
2077             t2_SSE3            = _mm_sub_ps(t2_SSE3,
2078                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
2079                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
2080
2081             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
2082                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
2083             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
2084                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
2085             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
2086                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
2087             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
2088                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
2089
2090             t3_SSE0            = _mm_sub_ps(t3_SSE0,
2091                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
2092                                                        _mm_add_ps(one_SSE,
2093                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
2094             t3_SSE1            = _mm_sub_ps(t3_SSE1,
2095                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
2096                                                        _mm_add_ps(one_SSE,
2097                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
2098             t3_SSE2            = _mm_sub_ps(t3_SSE2,
2099                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
2100                                                        _mm_add_ps(one_SSE,
2101                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
2102             t3_SSE3            = _mm_sub_ps(t3_SSE3,
2103                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
2104                                                        _mm_add_ps(one_SSE,
2105                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
2106
2107
2108             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
2109                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
2110                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
2111             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
2112                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
2113                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
2114             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
2115                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
2116                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
2117             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
2118                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
2119                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
2120
2121             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
2122             dadx += 4;
2123             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
2124             dadx += 4;
2125             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
2126             dadx += 4;
2127             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
2128             dadx += 4;
2129
2130         }
2131
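        /* The j loop is split into three phases: a prologue where
         * per-pair exclusion masks apply (above), a main part with no
         * exclusions where only the broadcast i-atom range mask is
         * needed (below), and an epilogue using the emask arrays for
         * the final partial SIMD width. Skipping the per-iteration
         * mask loads in the main part is what makes the all-vs-all
         * kernel fast.
         */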
2132         /* Main part, no exclusions */
        for (j = nj1; j < nj2; j += UNROLLJ)
        {
            /* load j atom coordinates */
            jx_SSE            = _mm_load_ps(x_align+j);
            jy_SSE            = _mm_load_ps(y_align+j);
            jz_SSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Calculate 1/r */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0, imask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1, imask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2, imask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3, imask_SSE3);

            dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
            dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
            dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
            dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);

            sk_aj_SSE          = _mm_load_ps(obc_param+j);
            raj_SSE            = _mm_load_ps(gb_radius+j);

            raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
            t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
            t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
            t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
            t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);

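            /* Classify the pair geometry from t1 = r+sk, t2 = r-sk and
             * t3 = sk-r: obc_mask1 selects pairs where j descreens i at all
             * (rai < r+sk), obc_mask2 pairs where atom i lies entirely
             * outside the screening sphere (rai < r-sk), and obc_mask3 pairs
             * where atom i lies completely inside it (rai < sk-r).
             */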
            obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
            obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
            obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
            obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
            obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
            obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
            obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
            obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
            obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);

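            /* uij = 1/(r+sk) and lij = 1/(r-sk) are the inverse upper/lower
             * limits of the pairwise descreening integral; where obc_mask2 is
             * false the lower limit is clamped to 1/rai instead. dlij is 1
             * exactly where lij depends on r, so the lij-derivative term can
             * be switched off further down.
             */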
            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                              _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                              _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                              _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                              _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);

            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);

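            /* prod = sk^2/(4r); the log term ln(uij/lij) below is evaluated
             * with the SIMD approximation gmx_mm_log_ps.
             */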
            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));

            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                       prod_SSE1));
            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                       prod_SSE2));
            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                       prod_SSE3));

            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));

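            /* t1 is now the complete pairwise descreening term, including the
             * 2*(1/rai - lij) correction (t4) that only applies when atom i
             * sits inside the screening sphere of j; accumulate it, masked
             * per pair, into the Born-radius sum of each i row.
             */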
            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));

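            /* The rest of this block is the analytical derivative of the pair
             * term with respect to r, stored in dadx for the chain-rule pass
             * that later converts dE/d(Born radius) into Cartesian forces.
             */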
            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
            t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
            t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
            t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));

            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
            t2_SSE0            = _mm_sub_ps(t2_SSE0,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_ps(t2_SSE1,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
            t2_SSE2            = _mm_sub_ps(t2_SSE2,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
            t2_SSE3            = _mm_sub_ps(t2_SSE3,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0            = _mm_sub_ps(t3_SSE0,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_ps(t3_SSE1,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2            = _mm_sub_ps(t3_SSE2,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3            = _mm_sub_ps(t3_SSE3,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));

            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));

            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;

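            /* The descreening is symmetric, so the same expressions are now
             * evaluated with the roles of i and j swapped: sk_ai/raj replace
             * sk_aj/rai. These j-atom Born terms cannot stay in registers for
             * the whole i loop, so they are accumulated into the work array.
             */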
            /* Evaluate influence of atom ai -> aj */
            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);

            obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
            obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
            obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
            obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
            obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
            obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
            obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
            obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
            obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, imask_SSE0);
            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, imask_SSE1);
            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, imask_SSE2);
            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, imask_SSE3);

            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                              _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                              _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                              _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                              _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);

            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
            sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
            sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
            sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);

            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                       prod_SSE1));
            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                       prod_SSE2));
            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                       prod_SSE3));
            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));

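            /* Reduce the four i-row terms with an element-wise add
             * (gmx_mm_sum4_ps) and accumulate them into the per-j-atom
             * work buffer, again masked per pair.
             */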
            _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
                                            gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
                                                           _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
                                                           _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
                                                           _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));

            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
            t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
            t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
            t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));
            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
            t2_SSE0            = _mm_sub_ps(t2_SSE0,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_ps(t2_SSE1,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
            t2_SSE2            = _mm_sub_ps(t2_SSE2,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
            t2_SSE3            = _mm_sub_ps(t2_SSE3,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));

            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));

            t3_SSE0            = _mm_sub_ps(t3_SSE0,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_ps(t3_SSE1,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2            = _mm_sub_ps(t3_SSE2,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3            = _mm_sub_ps(t3_SSE3,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));

            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));

            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;
        }

        /* Epilogue part, including exclusion mask */
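        /* Same computation as the main loop, but each of the four i rows
         * loads its own exclusion mask (emask0..emask3) and combines it with
         * imask, so excluded pairs contribute neither to the Born sums nor
         * to the stored dadx derivatives.
         */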
        for (j = nj2; j < nj3; j += UNROLLJ)
        {
            jmask_SSE0 = _mm_load_ps((real *)emask0);
            jmask_SSE1 = _mm_load_ps((real *)emask1);
            jmask_SSE2 = _mm_load_ps((real *)emask2);
            jmask_SSE3 = _mm_load_ps((real *)emask3);
            emask0    += UNROLLJ;
            emask1    += UNROLLJ;
            emask2    += UNROLLJ;
            emask3    += UNROLLJ;

            /* load j atom coordinates */
            jx_SSE            = _mm_load_ps(x_align+j);
            jy_SSE            = _mm_load_ps(y_align+j);
            jz_SSE            = _mm_load_ps(z_align+j);

            /* Calculate distance */
            dx_SSE0            = _mm_sub_ps(ix_SSE0, jx_SSE);
            dy_SSE0            = _mm_sub_ps(iy_SSE0, jy_SSE);
            dz_SSE0            = _mm_sub_ps(iz_SSE0, jz_SSE);
            dx_SSE1            = _mm_sub_ps(ix_SSE1, jx_SSE);
            dy_SSE1            = _mm_sub_ps(iy_SSE1, jy_SSE);
            dz_SSE1            = _mm_sub_ps(iz_SSE1, jz_SSE);
            dx_SSE2            = _mm_sub_ps(ix_SSE2, jx_SSE);
            dy_SSE2            = _mm_sub_ps(iy_SSE2, jy_SSE);
            dz_SSE2            = _mm_sub_ps(iz_SSE2, jz_SSE);
            dx_SSE3            = _mm_sub_ps(ix_SSE3, jx_SSE);
            dy_SSE3            = _mm_sub_ps(iy_SSE3, jy_SSE);
            dz_SSE3            = _mm_sub_ps(iz_SSE3, jz_SSE);

            /* rsq = dx*dx+dy*dy+dz*dz */
            rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
            rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
            rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
            rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);

            /* Combine masks */
            jmask_SSE0         = _mm_and_ps(jmask_SSE0, imask_SSE0);
            jmask_SSE1         = _mm_and_ps(jmask_SSE1, imask_SSE1);
            jmask_SSE2         = _mm_and_ps(jmask_SSE2, imask_SSE2);
            jmask_SSE3         = _mm_and_ps(jmask_SSE3, imask_SSE3);

            /* Calculate 1/r */
            rinv_SSE0          = gmx_mm_invsqrt_ps(rsq_SSE0);
            rinv_SSE1          = gmx_mm_invsqrt_ps(rsq_SSE1);
            rinv_SSE2          = gmx_mm_invsqrt_ps(rsq_SSE2);
            rinv_SSE3          = gmx_mm_invsqrt_ps(rsq_SSE3);

            /* Apply mask */
            rinv_SSE0          = _mm_and_ps(rinv_SSE0, jmask_SSE0);
            rinv_SSE1          = _mm_and_ps(rinv_SSE1, jmask_SSE1);
            rinv_SSE2          = _mm_and_ps(rinv_SSE2, jmask_SSE2);
            rinv_SSE3          = _mm_and_ps(rinv_SSE3, jmask_SSE3);

            dr_SSE0            = _mm_mul_ps(rsq_SSE0, rinv_SSE0);
            dr_SSE1            = _mm_mul_ps(rsq_SSE1, rinv_SSE1);
            dr_SSE2            = _mm_mul_ps(rsq_SSE2, rinv_SSE2);
            dr_SSE3            = _mm_mul_ps(rsq_SSE3, rinv_SSE3);

            sk_aj_SSE          = _mm_load_ps(obc_param+j);
            raj_SSE            = _mm_load_ps(gb_radius+j);

            raj_inv_SSE        = gmx_mm_inv_ps(raj_SSE);

            /* Evaluate influence of atom aj -> ai */
            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_aj_SSE);
            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_aj_SSE);
            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_aj_SSE);
            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_aj_SSE);
            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_aj_SSE);
            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_aj_SSE);
            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_aj_SSE);
            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_aj_SSE);
            t3_SSE0            = _mm_sub_ps(sk_aj_SSE, dr_SSE0);
            t3_SSE1            = _mm_sub_ps(sk_aj_SSE, dr_SSE1);
            t3_SSE2            = _mm_sub_ps(sk_aj_SSE, dr_SSE2);
            t3_SSE3            = _mm_sub_ps(sk_aj_SSE, dr_SSE3);

            obc_mask1_SSE0     = _mm_cmplt_ps(rai_SSE0, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_ps(rai_SSE1, t1_SSE1);
            obc_mask1_SSE2     = _mm_cmplt_ps(rai_SSE2, t1_SSE2);
            obc_mask1_SSE3     = _mm_cmplt_ps(rai_SSE3, t1_SSE3);
            obc_mask2_SSE0     = _mm_cmplt_ps(rai_SSE0, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_ps(rai_SSE1, t2_SSE1);
            obc_mask2_SSE2     = _mm_cmplt_ps(rai_SSE2, t2_SSE2);
            obc_mask2_SSE3     = _mm_cmplt_ps(rai_SSE3, t2_SSE3);
            obc_mask3_SSE0     = _mm_cmplt_ps(rai_SSE0, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_ps(rai_SSE1, t3_SSE1);
            obc_mask3_SSE2     = _mm_cmplt_ps(rai_SSE2, t3_SSE2);
            obc_mask3_SSE3     = _mm_cmplt_ps(rai_SSE3, t3_SSE3);
            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);

            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                              _mm_andnot_ps(obc_mask2_SSE0, rai_inv_SSE0));
            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                              _mm_andnot_ps(obc_mask2_SSE1, rai_inv_SSE1));
            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                              _mm_andnot_ps(obc_mask2_SSE2, rai_inv_SSE2));
            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                              _mm_andnot_ps(obc_mask2_SSE3, rai_inv_SSE3));
            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);

            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
            diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
            lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
            lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
            lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
            lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
            sk2_aj_SSE         = _mm_mul_ps(sk_aj_SSE, sk_aj_SSE);
            sk2_rinv_SSE0      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE0);
            sk2_rinv_SSE1      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE1);
            sk2_rinv_SSE2      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE2);
            sk2_rinv_SSE3      = _mm_mul_ps(sk2_aj_SSE, rinv_SSE3);
            prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
            prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
            prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
            prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);

            logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
            logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
            logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
            logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));

            t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
            t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
            t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
            t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
            t2_SSE0            = _mm_mul_ps(diff2_SSE0,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
                                                       prod_SSE0));
            t2_SSE1            = _mm_mul_ps(diff2_SSE1,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
                                                       prod_SSE1));
            t2_SSE2            = _mm_mul_ps(diff2_SSE2,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
                                                       prod_SSE2));
            t2_SSE3            = _mm_mul_ps(diff2_SSE3,
                                            _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
                                                       prod_SSE3));

            t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
            t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
            t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
            t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
            t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
            t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
            t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
            t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
            t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE0, lij_SSE0));
            t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE1, lij_SSE1));
            t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE2, lij_SSE2));
            t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(rai_inv_SSE3, lij_SSE3));
            t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
            t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
            t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
            t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
            t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
            t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
            t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
            t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));

            sum_ai_SSE0        = _mm_add_ps(sum_ai_SSE0, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            sum_ai_SSE1        = _mm_add_ps(sum_ai_SSE1, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            sum_ai_SSE2        = _mm_add_ps(sum_ai_SSE2, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            sum_ai_SSE3        = _mm_add_ps(sum_ai_SSE3, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));

            t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
                                            _mm_mul_ps(prod_SSE0, lij3_SSE0));
            t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
                                            _mm_mul_ps(prod_SSE1, lij3_SSE1));
            t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
                                            _mm_mul_ps(prod_SSE2, lij3_SSE2));
            t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
                                            _mm_mul_ps(prod_SSE3, lij3_SSE3));
            t1_SSE0            = _mm_sub_ps(t1_SSE0,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
                                                                  _mm_mul_ps(lij3_SSE0, dr_SSE0))));
            t1_SSE1            = _mm_sub_ps(t1_SSE1,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
                                                                  _mm_mul_ps(lij3_SSE1, dr_SSE1))));
            t1_SSE2            = _mm_sub_ps(t1_SSE2,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
                                                                  _mm_mul_ps(lij3_SSE2, dr_SSE2))));
            t1_SSE3            = _mm_sub_ps(t1_SSE3,
                                            _mm_mul_ps(onefourth_SSE,
                                                       _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
                                                                  _mm_mul_ps(lij3_SSE3, dr_SSE3))));

            t2_SSE0            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
                                                       _mm_mul_ps(uij3_SSE0, dr_SSE0)));
            t2_SSE1            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
                                                       _mm_mul_ps(uij3_SSE1, dr_SSE1)));
            t2_SSE2            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
                                                       _mm_mul_ps(uij3_SSE2, dr_SSE2)));
            t2_SSE3            = _mm_mul_ps(onefourth_SSE,
                                            _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
                                                       _mm_mul_ps(uij3_SSE3, dr_SSE3)));
            t2_SSE0            = _mm_sub_ps(t2_SSE0,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
                                                       _mm_mul_ps(prod_SSE0, uij3_SSE0)));
            t2_SSE1            = _mm_sub_ps(t2_SSE1,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
                                                       _mm_mul_ps(prod_SSE1, uij3_SSE1)));
            t2_SSE2            = _mm_sub_ps(t2_SSE2,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
                                                       _mm_mul_ps(prod_SSE2, uij3_SSE2)));
            t2_SSE3            = _mm_sub_ps(t2_SSE3,
                                            _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
                                                       _mm_mul_ps(prod_SSE3, uij3_SSE3)));
            t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
                                            _mm_mul_ps(rinv_SSE0, rinv_SSE0));
            t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
                                            _mm_mul_ps(rinv_SSE1, rinv_SSE1));
            t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
                                            _mm_mul_ps(rinv_SSE2, rinv_SSE2));
            t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
                                            _mm_mul_ps(rinv_SSE3, rinv_SSE3));
            t3_SSE0            = _mm_sub_ps(t3_SSE0,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
            t3_SSE1            = _mm_sub_ps(t3_SSE1,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
            t3_SSE2            = _mm_sub_ps(t3_SSE2,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
            t3_SSE3            = _mm_sub_ps(t3_SSE3,
                                            _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
                                                       _mm_add_ps(one_SSE,
                                                                  _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));

            t1_SSE0            = _mm_mul_ps(rinv_SSE0,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
                                                       _mm_add_ps(t2_SSE0, t3_SSE0)));
            t1_SSE1            = _mm_mul_ps(rinv_SSE1,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
                                                       _mm_add_ps(t2_SSE1, t3_SSE1)));
            t1_SSE2            = _mm_mul_ps(rinv_SSE2,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
                                                       _mm_add_ps(t2_SSE2, t3_SSE2)));
            t1_SSE3            = _mm_mul_ps(rinv_SSE3,
                                            _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
                                                       _mm_add_ps(t2_SSE3, t3_SSE3)));

            _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
            dadx += 4;
            _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
            dadx += 4;

            /* Evaluate influence of atom ai -> aj */
            t1_SSE0            = _mm_add_ps(dr_SSE0, sk_ai_SSE0);
            t1_SSE1            = _mm_add_ps(dr_SSE1, sk_ai_SSE1);
            t1_SSE2            = _mm_add_ps(dr_SSE2, sk_ai_SSE2);
            t1_SSE3            = _mm_add_ps(dr_SSE3, sk_ai_SSE3);
            t2_SSE0            = _mm_sub_ps(dr_SSE0, sk_ai_SSE0);
            t2_SSE1            = _mm_sub_ps(dr_SSE1, sk_ai_SSE1);
            t2_SSE2            = _mm_sub_ps(dr_SSE2, sk_ai_SSE2);
            t2_SSE3            = _mm_sub_ps(dr_SSE3, sk_ai_SSE3);
            t3_SSE0            = _mm_sub_ps(sk_ai_SSE0, dr_SSE0);
            t3_SSE1            = _mm_sub_ps(sk_ai_SSE1, dr_SSE1);
            t3_SSE2            = _mm_sub_ps(sk_ai_SSE2, dr_SSE2);
            t3_SSE3            = _mm_sub_ps(sk_ai_SSE3, dr_SSE3);

            obc_mask1_SSE0     = _mm_cmplt_ps(raj_SSE, t1_SSE0);
            obc_mask1_SSE1     = _mm_cmplt_ps(raj_SSE, t1_SSE1);
            obc_mask1_SSE2     = _mm_cmplt_ps(raj_SSE, t1_SSE2);
            obc_mask1_SSE3     = _mm_cmplt_ps(raj_SSE, t1_SSE3);
            obc_mask2_SSE0     = _mm_cmplt_ps(raj_SSE, t2_SSE0);
            obc_mask2_SSE1     = _mm_cmplt_ps(raj_SSE, t2_SSE1);
            obc_mask2_SSE2     = _mm_cmplt_ps(raj_SSE, t2_SSE2);
            obc_mask2_SSE3     = _mm_cmplt_ps(raj_SSE, t2_SSE3);
            obc_mask3_SSE0     = _mm_cmplt_ps(raj_SSE, t3_SSE0);
            obc_mask3_SSE1     = _mm_cmplt_ps(raj_SSE, t3_SSE1);
            obc_mask3_SSE2     = _mm_cmplt_ps(raj_SSE, t3_SSE2);
            obc_mask3_SSE3     = _mm_cmplt_ps(raj_SSE, t3_SSE3);
            obc_mask1_SSE0     = _mm_and_ps(obc_mask1_SSE0, jmask_SSE0);
            obc_mask1_SSE1     = _mm_and_ps(obc_mask1_SSE1, jmask_SSE1);
            obc_mask1_SSE2     = _mm_and_ps(obc_mask1_SSE2, jmask_SSE2);
            obc_mask1_SSE3     = _mm_and_ps(obc_mask1_SSE3, jmask_SSE3);

            uij_SSE0           = gmx_mm_inv_ps(t1_SSE0);
            uij_SSE1           = gmx_mm_inv_ps(t1_SSE1);
            uij_SSE2           = gmx_mm_inv_ps(t1_SSE2);
            uij_SSE3           = gmx_mm_inv_ps(t1_SSE3);
            lij_SSE0           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE0, gmx_mm_inv_ps(t2_SSE0)),
                                              _mm_andnot_ps(obc_mask2_SSE0, raj_inv_SSE));
            lij_SSE1           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE1, gmx_mm_inv_ps(t2_SSE1)),
                                              _mm_andnot_ps(obc_mask2_SSE1, raj_inv_SSE));
            lij_SSE2           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE2, gmx_mm_inv_ps(t2_SSE2)),
                                              _mm_andnot_ps(obc_mask2_SSE2, raj_inv_SSE));
            lij_SSE3           = _mm_or_ps(   _mm_and_ps(obc_mask2_SSE3, gmx_mm_inv_ps(t2_SSE3)),
                                              _mm_andnot_ps(obc_mask2_SSE3, raj_inv_SSE));
            dlij_SSE0          = _mm_and_ps(one_SSE, obc_mask2_SSE0);
            dlij_SSE1          = _mm_and_ps(one_SSE, obc_mask2_SSE1);
            dlij_SSE2          = _mm_and_ps(one_SSE, obc_mask2_SSE2);
            dlij_SSE3          = _mm_and_ps(one_SSE, obc_mask2_SSE3);

            uij2_SSE0          = _mm_mul_ps(uij_SSE0, uij_SSE0);
            uij2_SSE1          = _mm_mul_ps(uij_SSE1, uij_SSE1);
            uij2_SSE2          = _mm_mul_ps(uij_SSE2, uij_SSE2);
            uij2_SSE3          = _mm_mul_ps(uij_SSE3, uij_SSE3);
            uij3_SSE0          = _mm_mul_ps(uij2_SSE0, uij_SSE0);
            uij3_SSE1          = _mm_mul_ps(uij2_SSE1, uij_SSE1);
            uij3_SSE2          = _mm_mul_ps(uij2_SSE2, uij_SSE2);
            uij3_SSE3          = _mm_mul_ps(uij2_SSE3, uij_SSE3);
            lij2_SSE0          = _mm_mul_ps(lij_SSE0, lij_SSE0);
            lij2_SSE1          = _mm_mul_ps(lij_SSE1, lij_SSE1);
            lij2_SSE2          = _mm_mul_ps(lij_SSE2, lij_SSE2);
            lij2_SSE3          = _mm_mul_ps(lij_SSE3, lij_SSE3);
            lij3_SSE0          = _mm_mul_ps(lij2_SSE0, lij_SSE0);
            lij3_SSE1          = _mm_mul_ps(lij2_SSE1, lij_SSE1);
            lij3_SSE2          = _mm_mul_ps(lij2_SSE2, lij_SSE2);
            lij3_SSE3          = _mm_mul_ps(lij2_SSE3, lij_SSE3);

            diff2_SSE0         = _mm_sub_ps(uij2_SSE0, lij2_SSE0);
            diff2_SSE1         = _mm_sub_ps(uij2_SSE1, lij2_SSE1);
            diff2_SSE2         = _mm_sub_ps(uij2_SSE2, lij2_SSE2);
2996             diff2_SSE3         = _mm_sub_ps(uij2_SSE3, lij2_SSE3);
2997             lij_inv_SSE0       = gmx_mm_invsqrt_ps(lij2_SSE0);
2998             lij_inv_SSE1       = gmx_mm_invsqrt_ps(lij2_SSE1);
2999             lij_inv_SSE2       = gmx_mm_invsqrt_ps(lij2_SSE2);
3000             lij_inv_SSE3       = gmx_mm_invsqrt_ps(lij2_SSE3);
3001             sk2_rinv_SSE0      = _mm_mul_ps(sk2_ai_SSE0, rinv_SSE0);
3002             sk2_rinv_SSE1      = _mm_mul_ps(sk2_ai_SSE1, rinv_SSE1);
3003             sk2_rinv_SSE2      = _mm_mul_ps(sk2_ai_SSE2, rinv_SSE2);
3004             sk2_rinv_SSE3      = _mm_mul_ps(sk2_ai_SSE3, rinv_SSE3);
3005             prod_SSE0          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE0);
3006             prod_SSE1          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE1);
3007             prod_SSE2          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE2);
3008             prod_SSE3          = _mm_mul_ps(onefourth_SSE, sk2_rinv_SSE3);
3009
3010             logterm_SSE0       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE0, lij_inv_SSE0));
3011             logterm_SSE1       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE1, lij_inv_SSE1));
3012             logterm_SSE2       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE2, lij_inv_SSE2));
3013             logterm_SSE3       = gmx_mm_log_ps(_mm_mul_ps(uij_SSE3, lij_inv_SSE3));
3014             t1_SSE0            = _mm_sub_ps(lij_SSE0, uij_SSE0);
3015             t1_SSE1            = _mm_sub_ps(lij_SSE1, uij_SSE1);
3016             t1_SSE2            = _mm_sub_ps(lij_SSE2, uij_SSE2);
3017             t1_SSE3            = _mm_sub_ps(lij_SSE3, uij_SSE3);
3018             t2_SSE0            = _mm_mul_ps(diff2_SSE0,
3019                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE0),
3020                                                        prod_SSE0));
3021             t2_SSE1            = _mm_mul_ps(diff2_SSE1,
3022                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE1),
3023                                                        prod_SSE1));
3024             t2_SSE2            = _mm_mul_ps(diff2_SSE2,
3025                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE2),
3026                                                        prod_SSE2));
3027             t2_SSE3            = _mm_mul_ps(diff2_SSE3,
3028                                             _mm_sub_ps(_mm_mul_ps(onefourth_SSE, dr_SSE3),
3029                                                        prod_SSE3));
3030             t3_SSE0            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE0, logterm_SSE0));
3031             t3_SSE1            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE1, logterm_SSE1));
3032             t3_SSE2            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE2, logterm_SSE2));
3033             t3_SSE3            = _mm_mul_ps(half_SSE, _mm_mul_ps(rinv_SSE3, logterm_SSE3));
3034             t1_SSE0            = _mm_add_ps(t1_SSE0, _mm_add_ps(t2_SSE0, t3_SSE0));
3035             t1_SSE1            = _mm_add_ps(t1_SSE1, _mm_add_ps(t2_SSE1, t3_SSE1));
3036             t1_SSE2            = _mm_add_ps(t1_SSE2, _mm_add_ps(t2_SSE2, t3_SSE2));
3037             t1_SSE3            = _mm_add_ps(t1_SSE3, _mm_add_ps(t2_SSE3, t3_SSE3));
3038             t4_SSE0            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE0));
3039             t4_SSE1            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE1));
3040             t4_SSE2            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE2));
3041             t4_SSE3            = _mm_mul_ps(two_SSE, _mm_sub_ps(raj_inv_SSE, lij_SSE3));
3042             t4_SSE0            = _mm_and_ps(t4_SSE0, obc_mask3_SSE0);
3043             t4_SSE1            = _mm_and_ps(t4_SSE1, obc_mask3_SSE1);
3044             t4_SSE2            = _mm_and_ps(t4_SSE2, obc_mask3_SSE2);
3045             t4_SSE3            = _mm_and_ps(t4_SSE3, obc_mask3_SSE3);
3046             t1_SSE0            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE0, t4_SSE0));
3047             t1_SSE1            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE1, t4_SSE1));
3048             t1_SSE2            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE2, t4_SSE2));
3049             t1_SSE3            = _mm_mul_ps(half_SSE, _mm_add_ps(t1_SSE3, t4_SSE3));
3050
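            /* Accumulate atom i's (masked) descreening contribution to the
             * Born sums of the four j atoms; a scalar sketch of this term
             * follows after this kernel. */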
3051             _mm_store_ps(work+j, _mm_add_ps(_mm_load_ps(work+j),
3052                                             gmx_mm_sum4_ps(_mm_and_ps(t1_SSE0, obc_mask1_SSE0),
3053                                                            _mm_and_ps(t1_SSE1, obc_mask1_SSE1),
3054                                                            _mm_and_ps(t1_SSE2, obc_mask1_SSE2),
3055                                                            _mm_and_ps(t1_SSE3, obc_mask1_SSE3))));
3056
3057             t1_SSE0            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE0),
3058                                             _mm_mul_ps(prod_SSE0, lij3_SSE0));
3059             t1_SSE1            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE1),
3060                                             _mm_mul_ps(prod_SSE1, lij3_SSE1));
3061             t1_SSE2            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE2),
3062                                             _mm_mul_ps(prod_SSE2, lij3_SSE2));
3063             t1_SSE3            = _mm_add_ps(_mm_mul_ps(half_SSE, lij2_SSE3),
3064                                             _mm_mul_ps(prod_SSE3, lij3_SSE3));
3065             t1_SSE0            = _mm_sub_ps(t1_SSE0,
3066                                             _mm_mul_ps(onefourth_SSE,
3067                                                        _mm_add_ps(_mm_mul_ps(lij_SSE0, rinv_SSE0),
3068                                                                   _mm_mul_ps(lij3_SSE0, dr_SSE0))));
3069             t1_SSE1            = _mm_sub_ps(t1_SSE1,
3070                                             _mm_mul_ps(onefourth_SSE,
3071                                                        _mm_add_ps(_mm_mul_ps(lij_SSE1, rinv_SSE1),
3072                                                                   _mm_mul_ps(lij3_SSE1, dr_SSE1))));
3073             t1_SSE2            = _mm_sub_ps(t1_SSE2,
3074                                             _mm_mul_ps(onefourth_SSE,
3075                                                        _mm_add_ps(_mm_mul_ps(lij_SSE2, rinv_SSE2),
3076                                                                   _mm_mul_ps(lij3_SSE2, dr_SSE2))));
3077             t1_SSE3            = _mm_sub_ps(t1_SSE3,
3078                                             _mm_mul_ps(onefourth_SSE,
3079                                                        _mm_add_ps(_mm_mul_ps(lij_SSE3, rinv_SSE3),
3080                                                                   _mm_mul_ps(lij3_SSE3, dr_SSE3))));
3081             t2_SSE0            = _mm_mul_ps(onefourth_SSE,
3082                                             _mm_add_ps(_mm_mul_ps(uij_SSE0, rinv_SSE0),
3083                                                        _mm_mul_ps(uij3_SSE0, dr_SSE0)));
3084             t2_SSE1            = _mm_mul_ps(onefourth_SSE,
3085                                             _mm_add_ps(_mm_mul_ps(uij_SSE1, rinv_SSE1),
3086                                                        _mm_mul_ps(uij3_SSE1, dr_SSE1)));
3087             t2_SSE2            = _mm_mul_ps(onefourth_SSE,
3088                                             _mm_add_ps(_mm_mul_ps(uij_SSE2, rinv_SSE2),
3089                                                        _mm_mul_ps(uij3_SSE2, dr_SSE2)));
3090             t2_SSE3            = _mm_mul_ps(onefourth_SSE,
3091                                             _mm_add_ps(_mm_mul_ps(uij_SSE3, rinv_SSE3),
3092                                                        _mm_mul_ps(uij3_SSE3, dr_SSE3)));
3093             t2_SSE0            = _mm_sub_ps(t2_SSE0,
3094                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE0),
3095                                                        _mm_mul_ps(prod_SSE0, uij3_SSE0)));
3096             t2_SSE1            = _mm_sub_ps(t2_SSE1,
3097                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE1),
3098                                                        _mm_mul_ps(prod_SSE1, uij3_SSE1)));
3099             t2_SSE2            = _mm_sub_ps(t2_SSE2,
3100                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE2),
3101                                                        _mm_mul_ps(prod_SSE2, uij3_SSE2)));
3102             t2_SSE3            = _mm_sub_ps(t2_SSE3,
3103                                             _mm_add_ps(_mm_mul_ps(half_SSE, uij2_SSE3),
3104                                                        _mm_mul_ps(prod_SSE3, uij3_SSE3)));
3105
3106             t3_SSE0            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE0),
3107                                             _mm_mul_ps(rinv_SSE0, rinv_SSE0));
3108             t3_SSE1            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE1),
3109                                             _mm_mul_ps(rinv_SSE1, rinv_SSE1));
3110             t3_SSE2            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE2),
3111                                             _mm_mul_ps(rinv_SSE2, rinv_SSE2));
3112             t3_SSE3            = _mm_mul_ps(_mm_mul_ps(onefourth_SSE, logterm_SSE3),
3113                                             _mm_mul_ps(rinv_SSE3, rinv_SSE3));
3114
3115             t3_SSE0            = _mm_sub_ps(t3_SSE0,
3116                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE0, oneeighth_SSE),
3117                                                        _mm_add_ps(one_SSE,
3118                                                                   _mm_mul_ps(sk2_rinv_SSE0, rinv_SSE0))));
3119             t3_SSE1            = _mm_sub_ps(t3_SSE1,
3120                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE1, oneeighth_SSE),
3121                                                        _mm_add_ps(one_SSE,
3122                                                                   _mm_mul_ps(sk2_rinv_SSE1, rinv_SSE1))));
3123             t3_SSE2            = _mm_sub_ps(t3_SSE2,
3124                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE2, oneeighth_SSE),
3125                                                        _mm_add_ps(one_SSE,
3126                                                                   _mm_mul_ps(sk2_rinv_SSE2, rinv_SSE2))));
3127             t3_SSE3            = _mm_sub_ps(t3_SSE3,
3128                                             _mm_mul_ps(_mm_mul_ps(diff2_SSE3, oneeighth_SSE),
3129                                                        _mm_add_ps(one_SSE,
3130                                                                   _mm_mul_ps(sk2_rinv_SSE3, rinv_SSE3))));
3131
3132
3133             t1_SSE0            = _mm_mul_ps(rinv_SSE0,
3134                                             _mm_add_ps(_mm_mul_ps(dlij_SSE0, t1_SSE0),
3135                                                        _mm_add_ps(t2_SSE0, t3_SSE0)));
3136             t1_SSE1            = _mm_mul_ps(rinv_SSE1,
3137                                             _mm_add_ps(_mm_mul_ps(dlij_SSE1, t1_SSE1),
3138                                                        _mm_add_ps(t2_SSE1, t3_SSE1)));
3139             t1_SSE2            = _mm_mul_ps(rinv_SSE2,
3140                                             _mm_add_ps(_mm_mul_ps(dlij_SSE2, t1_SSE2),
3141                                                        _mm_add_ps(t2_SSE2, t3_SSE2)));
3142             t1_SSE3            = _mm_mul_ps(rinv_SSE3,
3143                                             _mm_add_ps(_mm_mul_ps(dlij_SSE3, t1_SSE3),
3144                                                        _mm_add_ps(t2_SSE3, t3_SSE3)));
3145
3146             _mm_store_ps(dadx, _mm_and_ps(t1_SSE0, obc_mask1_SSE0));
3147             dadx += 4;
3148             _mm_store_ps(dadx, _mm_and_ps(t1_SSE1, obc_mask1_SSE1));
3149             dadx += 4;
3150             _mm_store_ps(dadx, _mm_and_ps(t1_SSE2, obc_mask1_SSE2));
3151             dadx += 4;
3152             _mm_store_ps(dadx, _mm_and_ps(t1_SSE3, obc_mask1_SSE3));
3153             dadx += 4;
3154         }
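        /* Transpose so each element holds the complete descreening sum of
         * one i atom, then reduce the four vectors and add into work[]. */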
3155         _MM_TRANSPOSE4_PS(sum_ai_SSE0, sum_ai_SSE1, sum_ai_SSE2, sum_ai_SSE3);
3156         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE1);
3157         sum_ai_SSE2 = _mm_add_ps(sum_ai_SSE2, sum_ai_SSE3);
3158         sum_ai_SSE0 = _mm_add_ps(sum_ai_SSE0, sum_ai_SSE2);
3159         _mm_store_ps(work+i, _mm_add_ps(sum_ai_SSE0, _mm_load_ps(work+i)));
3160     }
3161
3162
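    /* The j loop stores into a mirrored upper half of work[] so that no
     * masked stores are needed; with the symmetric all-vs-all scheme the
     * wrapped j indices stay within natoms/2 of the end, so only the
     * first natoms/2+1 mirrored entries can be nonzero. */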
3163     for (i = 0; i < natoms/2+1; i++)
3164     {
3165         work[i] += work[natoms+i];
3166     }
3167
3168     /* Sum the work array over all nodes when running with particle decomposition */
3169
3170     if (PARTDECOMP(cr))
3171     {
3172         gmx_sum(natoms, work, cr);
3173     }
3174
3175     if (gb_algorithm == egbHCT)
3176     {
3177             /* HCT: invert (1/rai - sum) and clamp from below by the vdW radius */
3178         for (i = 0; i < natoms; i++)
3179         {
3180             if (born->use[i] != 0)
3181             {
3182                 rai     = top->atomtypes.gb_radius[mdatoms->typeA[i]]-born->gb_doffset;
3183                 sum_ai  = 1.0/rai - work[i];
3184                 min_rad = rai + born->gb_doffset;
3185                 rad     = 1.0/sum_ai;
3186
3187                 born->bRad[i]   = rad > min_rad ? rad : min_rad;
3188                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3189             }
3190         }
3191
3192     }
3193     else
3194     {
3195         /* OBC */
3196
3197         /* Calculate the radii */
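        /*
         * OBC (model II) rescaling: with psi = (rai - doffset)*work[i],
         *   1/R_i = 1/(rai - doffset) - tanh(alpha*psi - beta*psi^2 + gamma*psi^3)/rai.
         * drobc[i] caches the tanh-term derivative needed by the chain
         * rule in the force kernel.
         */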
3198         for (i = 0; i < natoms; i++)
3199         {
3200
3201             if (born->use[i] != 0)
3202             {
3203                 rai        = top->atomtypes.gb_radius[mdatoms->typeA[i]];
3204                 rai_inv2   = 1.0/rai;
3205                 rai        = rai-born->gb_doffset;
3206                 rai_inv    = 1.0/rai;
3207                 sum_ai     = rai * work[i];
3208                 sum_ai2    = sum_ai  * sum_ai;
3209                 sum_ai3    = sum_ai2 * sum_ai;
3210
3211                 tsum          = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
3212                 born->bRad[i] = rai_inv - tsum*rai_inv2;
3213                 born->bRad[i] = 1.0 / born->bRad[i];
3214
3215                 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
3216
3217                 tchain         = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
3218                 born->drobc[i] = (1.0-tsum*tsum)*tchain*rai_inv2;
3219             }
3220         }
3221     }
3222
3223     return 0;
3224 }
3225
3226
3227
3228
3229
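/*
 * Illustrative scalar sketch of the pairwise HCT/OBC descreening term that
 * the SIMD loops above evaluate four pairs at a time. The helper name and
 * its free-standing form are hypothetical (the production code keeps all
 * intermediates in registers); like the rest of this file it sits behind
 * an #if 0 guard and is never compiled.
 */
#if 0
static real
hct_pair_descreen(real r, real raj, real sk)
{
    real rinv = 1.0/r;
    real uij, lij, sum;

    if (raj >= r + sk)
    {
        return 0.0;    /* the descreening sphere does not reach atom j */
    }
    uij = 1.0/(r + sk);
    lij = (raj < r - sk) ? 1.0/(r - sk) : 1.0/raj;

    sum = lij - uij
        + 0.25*(uij*uij - lij*lij)*(r - sk*sk*rinv)   /* polynomial terms */
        + 0.5*rinv*log(uij/lij);                      /* logarithmic term */

    if (raj < sk - r)
    {
        sum += 2.0*(1.0/raj - lij);                   /* atom j engulfed */
    }
    return 0.5*sum;
}
#endif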
3230
3231
3232
3233 int
3234 genborn_allvsall_calc_chainrule_sse2_single(t_forcerec *           fr,
3235                                             t_mdatoms *            mdatoms,
3236                                             gmx_genborn_t *        born,
3237                                             real *                 x,
3238                                             real *                 f,
3239                                             int                    gb_algorithm,
3240                                             void *                 paadata)
3241 {
3242     gmx_allvsallgb2_data_t *aadata;
3243     int                     natoms;
3244     int                     ni0, ni1;
3245     int                     nj0, nj1, nj2, nj3;
3246     int                     i, j, k, n;
3247     int                     idx;
3248     int              *      mask;
3249     int              *      pmask0;
3250     int              *      emask0;
3251     int              *      jindex;
3252
3253     real                    ix, iy, iz;
3254     real                    fix, fiy, fiz;
3255     real                    jx, jy, jz;
3256     real                    dx, dy, dz;
3257     real                    tx, ty, tz;
3258     real                    rbai, rbaj, fgb, fgb_ai, rbi;
3259     real              *     rb;
3260     real              *     dadx;
3261     real              *     x_align;
3262     real              *     y_align;
3263     real              *     z_align;
3264     real              *     fx_align;
3265     real              *     fy_align;
3266     real              *     fz_align;
3267     real                    tmpsum[4];
3268
3269     __m128                  jmask_SSE0, jmask_SSE1, jmask_SSE2, jmask_SSE3;
3270     __m128                  ix_SSE0, iy_SSE0, iz_SSE0;
3271     __m128                  ix_SSE1, iy_SSE1, iz_SSE1;
3272     __m128                  ix_SSE2, iy_SSE2, iz_SSE2;
3273     __m128                  ix_SSE3, iy_SSE3, iz_SSE3;
3274     __m128                  fix_SSE0, fiy_SSE0, fiz_SSE0;
3275     __m128                  fix_SSE1, fiy_SSE1, fiz_SSE1;
3276     __m128                  fix_SSE2, fiy_SSE2, fiz_SSE2;
3277     __m128                  fix_SSE3, fiy_SSE3, fiz_SSE3;
3278     __m128                  rbai_SSE0, rbai_SSE1, rbai_SSE2, rbai_SSE3;
3279     __m128                  imask_SSE0, imask_SSE1, imask_SSE2, imask_SSE3;
3280     __m128                  jx_SSE, jy_SSE, jz_SSE, rbaj_SSE;
3281     __m128                  dx_SSE0, dy_SSE0, dz_SSE0;
3282     __m128                  dx_SSE1, dy_SSE1, dz_SSE1;
3283     __m128                  dx_SSE2, dy_SSE2, dz_SSE2;
3284     __m128                  dx_SSE3, dy_SSE3, dz_SSE3;
3285     __m128                  fgb_SSE0, fgb_ai_SSE0;
3286     __m128                  fgb_SSE1, fgb_ai_SSE1;
3287     __m128                  fgb_SSE2, fgb_ai_SSE2;
3288     __m128                  fgb_SSE3, fgb_ai_SSE3;
3289     __m128                  tx_SSE0, ty_SSE0, tz_SSE0;
3290     __m128                  tx_SSE1, ty_SSE1, tz_SSE1;
3291     __m128                  tx_SSE2, ty_SSE2, tz_SSE2;
3292     __m128                  tx_SSE3, ty_SSE3, tz_SSE3;
3293     __m128                  t1, t2;
3294
3295     natoms              = mdatoms->nr;
3296     ni0                 = (mdatoms->start/SIMD_WIDTH)*SIMD_WIDTH;
3297     ni1                 = mdatoms->start+mdatoms->homenr;
3298     dadx                = fr->dadx;
3299
3300     aadata = (gmx_allvsallgb2_data_t *)paadata;
3301
3302     x_align  = aadata->x_align;
3303     y_align  = aadata->y_align;
3304     z_align  = aadata->z_align;
3305     fx_align = aadata->fx_align;
3306     fy_align = aadata->fy_align;
3307     fz_align = aadata->fz_align;
3308
3309     jindex    = aadata->jindex_gb;
3311
3312     n  = 0;
3313     rb = aadata->work;
3314
3315     /* Convert dV/d(born radius) (dvda) into the chain-rule prefactor rb[] required by each GB model */
3316     if (gb_algorithm == egbSTILL)
3317     {
3318         for (i = 0; i < natoms; i++)
3319         {
3320             rbi   = born->bRad[i];
3321             rb[i] = (2 * rbi * rbi * fr->dvda[i])/ONE_4PI_EPS0;
3322         }
3323     }
3324     else if (gb_algorithm == egbHCT)
3325     {
3326         for (i = 0; i < natoms; i++)
3327         {
3328             rbi   = born->bRad[i];
3329             rb[i] = rbi * rbi * fr->dvda[i];
3330         }
3331     }
3332     else if (gb_algorithm == egbOBC)
3333     {
3334         for (idx = 0; idx < natoms; idx++)
3335         {
3336             rbi     = born->bRad[idx];
3337             rb[idx] = rbi * rbi * born->drobc[idx] * fr->dvda[idx];
3338         }
3339     }
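    /* rb[] now holds the model-specific chain-rule prefactor that the
     * stored dadx derivatives are multiplied with below:
     * Still: 2*b^2*dvda/ONE_4PI_EPS0, HCT: b^2*dvda, OBC: b^2*drobc*dvda. */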
3340
3341     for (i = 0; i < 2*natoms; i++)
3342     {
3343         fx_align[i]       = 0;
3344         fy_align[i]       = 0;
3345         fz_align[i]       = 0;
3346     }
3347
3348
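    /* Mirror rb[] into the upper half so unmasked SIMD loads at wrapped
     * j indices (natoms <= j < 2*natoms) read valid per-atom values. */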
3349     for (i = 0; i < natoms; i++)
3350     {
3351         rb[i+natoms] = rb[i];
3352     }
3353
3354     for (i = ni0; i < ni1; i += UNROLLI)
3355     {
3356         /* We assume shifts are NOT used for all-vs-all interactions */
3357
3358         /* Load i atom data */
3359         ix_SSE0          = _mm_load1_ps(x_align+i);
3360         iy_SSE0          = _mm_load1_ps(y_align+i);
3361         iz_SSE0          = _mm_load1_ps(z_align+i);
3362         ix_SSE1          = _mm_load1_ps(x_align+i+1);
3363         iy_SSE1          = _mm_load1_ps(y_align+i+1);
3364         iz_SSE1          = _mm_load1_ps(z_align+i+1);
3365         ix_SSE2          = _mm_load1_ps(x_align+i+2);
3366         iy_SSE2          = _mm_load1_ps(y_align+i+2);
3367         iz_SSE2          = _mm_load1_ps(z_align+i+2);
3368         ix_SSE3          = _mm_load1_ps(x_align+i+3);
3369         iy_SSE3          = _mm_load1_ps(y_align+i+3);
3370         iz_SSE3          = _mm_load1_ps(z_align+i+3);
3371
3372         fix_SSE0         = _mm_setzero_ps();
3373         fiy_SSE0         = _mm_setzero_ps();
3374         fiz_SSE0         = _mm_setzero_ps();
3375         fix_SSE1         = _mm_setzero_ps();
3376         fiy_SSE1         = _mm_setzero_ps();
3377         fiz_SSE1         = _mm_setzero_ps();
3378         fix_SSE2         = _mm_setzero_ps();
3379         fiy_SSE2         = _mm_setzero_ps();
3380         fiz_SSE2         = _mm_setzero_ps();
3381         fix_SSE3         = _mm_setzero_ps();
3382         fiy_SSE3         = _mm_setzero_ps();
3383         fiz_SSE3         = _mm_setzero_ps();
3384
3385         rbai_SSE0        = _mm_load1_ps(rb+i);
3386         rbai_SSE1        = _mm_load1_ps(rb+i+1);
3387         rbai_SSE2        = _mm_load1_ps(rb+i+2);
3388         rbai_SSE3        = _mm_load1_ps(rb+i+3);
3389
3390         /* Load limits for loop over neighbors */
3391         nj0              = jindex[4*i];
3392         nj3              = jindex[4*i+3];
3393
3394         /* No masks necessary, since the stored chain rule derivatives will be zero in those cases! */
3395         for (j = nj0; j < nj3; j += UNROLLJ)
3396         {
3397             /* load j atom coordinates */
3398             jx_SSE           = _mm_load_ps(x_align+j);
3399             jy_SSE           = _mm_load_ps(y_align+j);
3400             jz_SSE           = _mm_load_ps(z_align+j);
3401
3402             /* Calculate distance */
3403             dx_SSE0          = _mm_sub_ps(ix_SSE0, jx_SSE);
3404             dy_SSE0          = _mm_sub_ps(iy_SSE0, jy_SSE);
3405             dz_SSE0          = _mm_sub_ps(iz_SSE0, jz_SSE);
3406             dx_SSE1          = _mm_sub_ps(ix_SSE1, jx_SSE);
3407             dy_SSE1          = _mm_sub_ps(iy_SSE1, jy_SSE);
3408             dz_SSE1          = _mm_sub_ps(iz_SSE1, jz_SSE);
3409             dx_SSE2          = _mm_sub_ps(ix_SSE2, jx_SSE);
3410             dy_SSE2          = _mm_sub_ps(iy_SSE2, jy_SSE);
3411             dz_SSE2          = _mm_sub_ps(iz_SSE2, jz_SSE);
3412             dx_SSE3          = _mm_sub_ps(ix_SSE3, jx_SSE);
3413             dy_SSE3          = _mm_sub_ps(iy_SSE3, jy_SSE);
3414             dz_SSE3          = _mm_sub_ps(iz_SSE3, jz_SSE);
3415
3416             rbaj_SSE         = _mm_load_ps(rb+j);
3417
3418             fgb_SSE0         = _mm_mul_ps(rbai_SSE0, _mm_load_ps(dadx));
3419             dadx            += 4;
3420             fgb_SSE1         = _mm_mul_ps(rbai_SSE1, _mm_load_ps(dadx));
3421             dadx            += 4;
3422             fgb_SSE2         = _mm_mul_ps(rbai_SSE2, _mm_load_ps(dadx));
3423             dadx            += 4;
3424             fgb_SSE3         = _mm_mul_ps(rbai_SSE3, _mm_load_ps(dadx));
3425             dadx            += 4;
3426
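            /* The second set of 16 floats holds d(R_j)/dr from the
             * "ai -> aj" half of the radii kernel; scale these by the
             * j atoms' chain-rule prefactors. */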
3427             fgb_ai_SSE0      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3428             dadx            += 4;
3429             fgb_ai_SSE1      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3430             dadx            += 4;
3431             fgb_ai_SSE2      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3432             dadx            += 4;
3433             fgb_ai_SSE3      = _mm_mul_ps(rbaj_SSE, _mm_load_ps(dadx));
3434             dadx            += 4;
3435
3436             /* Total force between ai and aj is the sum of ai->aj and aj->ai */
3437             fgb_SSE0         = _mm_add_ps(fgb_SSE0, fgb_ai_SSE0);
3438             fgb_SSE1         = _mm_add_ps(fgb_SSE1, fgb_ai_SSE1);
3439             fgb_SSE2         = _mm_add_ps(fgb_SSE2, fgb_ai_SSE2);
3440             fgb_SSE3         = _mm_add_ps(fgb_SSE3, fgb_ai_SSE3);
3441
3442             /* Calculate temporary vectorial force */
3443             tx_SSE0            = _mm_mul_ps(fgb_SSE0, dx_SSE0);
3444             ty_SSE0            = _mm_mul_ps(fgb_SSE0, dy_SSE0);
3445             tz_SSE0            = _mm_mul_ps(fgb_SSE0, dz_SSE0);
3446             tx_SSE1            = _mm_mul_ps(fgb_SSE1, dx_SSE1);
3447             ty_SSE1            = _mm_mul_ps(fgb_SSE1, dy_SSE1);
3448             tz_SSE1            = _mm_mul_ps(fgb_SSE1, dz_SSE1);
3449             tx_SSE2            = _mm_mul_ps(fgb_SSE2, dx_SSE2);
3450             ty_SSE2            = _mm_mul_ps(fgb_SSE2, dy_SSE2);
3451             tz_SSE2            = _mm_mul_ps(fgb_SSE2, dz_SSE2);
3452             tx_SSE3            = _mm_mul_ps(fgb_SSE3, dx_SSE3);
3453             ty_SSE3            = _mm_mul_ps(fgb_SSE3, dy_SSE3);
3454             tz_SSE3            = _mm_mul_ps(fgb_SSE3, dz_SSE3);
3455
3456             /* Increment i atom force */
3457             fix_SSE0          = _mm_add_ps(fix_SSE0, tx_SSE0);
3458             fiy_SSE0          = _mm_add_ps(fiy_SSE0, ty_SSE0);
3459             fiz_SSE0          = _mm_add_ps(fiz_SSE0, tz_SSE0);
3460             fix_SSE1          = _mm_add_ps(fix_SSE1, tx_SSE1);
3461             fiy_SSE1          = _mm_add_ps(fiy_SSE1, ty_SSE1);
3462             fiz_SSE1          = _mm_add_ps(fiz_SSE1, tz_SSE1);
3463             fix_SSE2          = _mm_add_ps(fix_SSE2, tx_SSE2);
3464             fiy_SSE2          = _mm_add_ps(fiy_SSE2, ty_SSE2);
3465             fiz_SSE2          = _mm_add_ps(fiz_SSE2, tz_SSE2);
3466             fix_SSE3          = _mm_add_ps(fix_SSE3, tx_SSE3);
3467             fiy_SSE3          = _mm_add_ps(fiy_SSE3, ty_SSE3);
3468             fiz_SSE3          = _mm_add_ps(fiz_SSE3, tz_SSE3);
3469
3470             /* Decrement j atom force */
3471             _mm_store_ps(fx_align+j,
3472                          _mm_sub_ps( _mm_load_ps(fx_align+j), gmx_mm_sum4_ps(tx_SSE0, tx_SSE1, tx_SSE2, tx_SSE3) ));
3473             _mm_store_ps(fy_align+j,
3474                          _mm_sub_ps( _mm_load_ps(fy_align+j), gmx_mm_sum4_ps(ty_SSE0, ty_SSE1, ty_SSE2, ty_SSE3) ));
3475             _mm_store_ps(fz_align+j,
3476                          _mm_sub_ps( _mm_load_ps(fz_align+j), gmx_mm_sum4_ps(tz_SSE0, tz_SSE1, tz_SSE2, tz_SSE3) ));
3477         }
3478         /* Reduce and add i-atom forces to memory; no shift forces are needed for all-vs-all kernels */
3479         _MM_TRANSPOSE4_PS(fix_SSE0, fix_SSE1, fix_SSE2, fix_SSE3);
3480         fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE1);
3481         fix_SSE2 = _mm_add_ps(fix_SSE2, fix_SSE3);
3482         fix_SSE0 = _mm_add_ps(fix_SSE0, fix_SSE2);
3483         _mm_store_ps(fx_align+i, _mm_add_ps(fix_SSE0, _mm_load_ps(fx_align+i)));
3484
3485         _MM_TRANSPOSE4_PS(fiy_SSE0, fiy_SSE1, fiy_SSE2, fiy_SSE3);
3486         fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE1);
3487         fiy_SSE2 = _mm_add_ps(fiy_SSE2, fiy_SSE3);
3488         fiy_SSE0 = _mm_add_ps(fiy_SSE0, fiy_SSE2);
3489         _mm_store_ps(fy_align+i, _mm_add_ps(fiy_SSE0, _mm_load_ps(fy_align+i)));
3490
3491         _MM_TRANSPOSE4_PS(fiz_SSE0, fiz_SSE1, fiz_SSE2, fiz_SSE3);
3492         fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE1);
3493         fiz_SSE2 = _mm_add_ps(fiz_SSE2, fiz_SSE3);
3494         fiz_SSE0 = _mm_add_ps(fiz_SSE0, fiz_SSE2);
3495         _mm_store_ps(fz_align+i, _mm_add_ps(fiz_SSE0, _mm_load_ps(fz_align+i)));
3496     }
3497
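    /* Fold the mirrored upper halves of the aligned force buffers back
     * onto the real atoms and accumulate into the interleaved array f. */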
3498     for (i = 0; i < natoms; i++)
3499     {
3500         f[3*i]       += fx_align[i] + fx_align[natoms+i];
3501         f[3*i+1]     += fy_align[i] + fy_align[natoms+i];
3502         f[3*i+2]     += fz_align[i] + fz_align[natoms+i];
3503     }
3504
3505     return 0;
3506 }
3507
3508 #else
3509 /* Dummy variable to keep the translation unit non-empty when SSE2 single precision is not used */
3510 int genborn_allvsall_sse2_single_dummy;
3511
3512
3513 #endif