1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
9  * Copyright (c) 2001-2009, The GROMACS Development Team
10  *
11  * Gromacs is a library for molecular simulation and trajectory analysis,
12  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
13  * a full list of developers and information, check out http://www.gromacs.org
14  *
15  * This program is free software; you can redistribute it and/or modify it under 
16  * the terms of the GNU Lesser General Public License as published by the Free 
17  * Software Foundation; either version 2 of the License, or (at your option) any 
18  * later version.
19  * As a special exception, you may use this file as part of a free software
20  * library without restriction.  Specifically, if other files instantiate
21  * templates or use macros or inline functions from this file, or you compile
22  * this file and link it with other files to produce an executable, this
23  * file does not by itself cause the resulting executable to be covered by
24  * the GNU Lesser General Public License.  
25  *
26  * In plain-speak: do not worry about classes/macros/templates either - only
27  * changes to the library have to be LGPL, not an application linking with it.
28  *
29  * To help fund GROMACS development, we humbly ask that you cite
30  * the papers people have written on it - you can find them on the website!
31  */
32
33 /* This is the innermost loop contents for the n vs n atom
34  * x86 SIMD kernels (SSE2/SSE4.1/AVX, single and double precision).
35  */
36
37
38 /* When calculating RF or Ewald interactions we calculate the electrostatic
39  * forces on excluded atom pairs here in the non-bonded loops.
40  * But when energies and/or the virial are required we calculate them
41  * separately afterwards, as it is then easier to separate the energy
42  * and virial contributions.
43  */
44 #if defined CHECK_EXCLS && defined CALC_COULOMB
45 #define EXCL_FORCES
46 #endif
47
48 #if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_X86_SSE4_1 && !defined COUNT_PAIRS && !(defined __GNUC__ && (defined CALC_COUL_TAB || (defined CALC_COUL_RF && defined GMX_MM128_HERE)))
49 /* Without exclusions and energies we only need to mask the cut-off;
50  * this is faster with blendv (only available with SSE4.1 and later).
51  * With gcc and PME or RF in 128-bit, blendv is slower;
52  * tested with gcc 4.6.2, 4.6.3 and 4.7.1.
53  */
54 #define CUTOFF_BLENDV
55 #endif
56
57         {
58             int        cj,aj,ajx,ajy,ajz;
59
60 #ifdef ENERGY_GROUPS
61             int        egps_j;
62             int        egp_jj[UNROLLJ>>1];
63             int        jj;
64 #endif
65
66 #ifdef CHECK_EXCLS
67             /* Interaction (non-exclusion) mask of all 1's or 0's */
68             gmx_mm_pr  int_SSE0;
69             gmx_mm_pr  int_SSE1;
70             gmx_mm_pr  int_SSE2;
71             gmx_mm_pr  int_SSE3;
72 #endif
73
74             gmx_mm_pr  jxSSE,jySSE,jzSSE;
75             gmx_mm_pr  dx_SSE0,dy_SSE0,dz_SSE0;
76             gmx_mm_pr  dx_SSE1,dy_SSE1,dz_SSE1;
77             gmx_mm_pr  dx_SSE2,dy_SSE2,dz_SSE2;
78             gmx_mm_pr  dx_SSE3,dy_SSE3,dz_SSE3;
79             gmx_mm_pr  tx_SSE0,ty_SSE0,tz_SSE0;
80             gmx_mm_pr  tx_SSE1,ty_SSE1,tz_SSE1;
81             gmx_mm_pr  tx_SSE2,ty_SSE2,tz_SSE2;
82             gmx_mm_pr  tx_SSE3,ty_SSE3,tz_SSE3;
83             gmx_mm_pr  rsq_SSE0,rinv_SSE0,rinvsq_SSE0;
84             gmx_mm_pr  rsq_SSE1,rinv_SSE1,rinvsq_SSE1;
85             gmx_mm_pr  rsq_SSE2,rinv_SSE2,rinvsq_SSE2;
86             gmx_mm_pr  rsq_SSE3,rinv_SSE3,rinvsq_SSE3;
87 #ifndef CUTOFF_BLENDV
88             /* wco: within cut-off, mask of all 1's or 0's */
89             gmx_mm_pr  wco_SSE0;
90             gmx_mm_pr  wco_SSE1;
91             gmx_mm_pr  wco_SSE2;
92             gmx_mm_pr  wco_SSE3;
93 #endif
94 #ifdef VDW_CUTOFF_CHECK
95             gmx_mm_pr  wco_vdw_SSE0;
96             gmx_mm_pr  wco_vdw_SSE1;
97 #ifndef HALF_LJ
98             gmx_mm_pr  wco_vdw_SSE2;
99             gmx_mm_pr  wco_vdw_SSE3;
100 #endif
101 #endif
102 #ifdef CALC_COULOMB
103 #ifdef CHECK_EXCLS
104             /* 1/r masked with the interaction mask */
105             gmx_mm_pr  rinv_ex_SSE0;
106             gmx_mm_pr  rinv_ex_SSE1;
107             gmx_mm_pr  rinv_ex_SSE2;
108             gmx_mm_pr  rinv_ex_SSE3;
109 #endif
110             gmx_mm_pr  jq_SSE;
111             gmx_mm_pr  qq_SSE0;
112             gmx_mm_pr  qq_SSE1;
113             gmx_mm_pr  qq_SSE2;
114             gmx_mm_pr  qq_SSE3;
115 #ifdef CALC_COUL_TAB
116             /* The force (PME mesh force) we need to subtract from 1/r^2 */
117             gmx_mm_pr  fsub_SSE0;
118             gmx_mm_pr  fsub_SSE1;
119             gmx_mm_pr  fsub_SSE2;
120             gmx_mm_pr  fsub_SSE3;
121 #endif
122             /* frcoul: the Coulomb force times r, e.g. qq*(1/r^2 - fsub)*r for tables */
123             gmx_mm_pr  frcoul_SSE0;
124             gmx_mm_pr  frcoul_SSE1;
125             gmx_mm_pr  frcoul_SSE2;
126             gmx_mm_pr  frcoul_SSE3;
127 #ifdef CALC_COUL_TAB
128             /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
129             gmx_mm_pr  r_SSE0,rs_SSE0,rf_SSE0,frac_SSE0;
130             gmx_mm_pr  r_SSE1,rs_SSE1,rf_SSE1,frac_SSE1;
131             gmx_mm_pr  r_SSE2,rs_SSE2,rf_SSE2,frac_SSE2;
132             gmx_mm_pr  r_SSE3,rs_SSE3,rf_SSE3,frac_SSE3;
133             /* Table index: rs converted to an int */ 
134 #if !(defined GMX_MM256_HERE && defined GMX_DOUBLE)
135             gmx_epi32  ti_SSE0,ti_SSE1,ti_SSE2,ti_SSE3;
136 #else
137             __m128i    ti_SSE0,ti_SSE1,ti_SSE2,ti_SSE3;
138 #endif
139             /* Linear force table values */
140             gmx_mm_pr  ctab0_SSE0,ctab1_SSE0;
141             gmx_mm_pr  ctab0_SSE1,ctab1_SSE1;
142             gmx_mm_pr  ctab0_SSE2,ctab1_SSE2;
143             gmx_mm_pr  ctab0_SSE3,ctab1_SSE3;
144 #ifdef CALC_ENERGIES
145             /* Quadratic energy table value */
146             gmx_mm_pr  ctabv_SSE0;
147             gmx_mm_pr  ctabv_SSE1;
148             gmx_mm_pr  ctabv_SSE2;
149             gmx_mm_pr  ctabv_SSE3;
150             /* The potential (PME mesh) we need to subtract from 1/r */
151             gmx_mm_pr  vc_sub_SSE0;
152             gmx_mm_pr  vc_sub_SSE1;
153             gmx_mm_pr  vc_sub_SSE2;
154             gmx_mm_pr  vc_sub_SSE3;
155 #endif
156 #endif
157 #ifdef CALC_ENERGIES
158             /* Electrostatic potential */
159             gmx_mm_pr  vcoul_SSE0;
160             gmx_mm_pr  vcoul_SSE1;
161             gmx_mm_pr  vcoul_SSE2;
162             gmx_mm_pr  vcoul_SSE3;
163 #endif
164 #endif
165             /* The force times 1/r */
166             gmx_mm_pr  fscal_SSE0;
167             gmx_mm_pr  fscal_SSE1;
168             gmx_mm_pr  fscal_SSE2;
169             gmx_mm_pr  fscal_SSE3;
170
171 #ifdef CALC_LJ
172 #ifdef LJ_COMB_LB
173             /* LJ sigma_j/2 and sqrt(epsilon_j) */
174             gmx_mm_pr  hsig_j_SSE,seps_j_SSE;
175             /* LJ sigma_ij and epsilon_ij */
176             gmx_mm_pr  sig_SSE0,eps_SSE0;
177             gmx_mm_pr  sig_SSE1,eps_SSE1;
178 #ifndef HALF_LJ
179             gmx_mm_pr  sig_SSE2,eps_SSE2;
180             gmx_mm_pr  sig_SSE3,eps_SSE3;
181 #endif
182 #ifdef CALC_ENERGIES
183             gmx_mm_pr  sig2_SSE0,sig6_SSE0;
184             gmx_mm_pr  sig2_SSE1,sig6_SSE1;
185 #ifndef HALF_LJ
186             gmx_mm_pr  sig2_SSE2,sig6_SSE2;
187             gmx_mm_pr  sig2_SSE3,sig6_SSE3;
188 #endif
189 #endif /* CALC_ENERGIES */
190 #endif /* LJ_COMB_LB */
191
192 #ifdef LJ_COMB_GEOM
193             gmx_mm_pr  c6s_j_SSE,c12s_j_SSE;
194 #endif
195
196 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB
197             /* Index for loading LJ parameters, complicated when interleaving */
198             int         aj2;
199 #endif
200
201 #ifndef FIX_LJ_C
202             /* LJ C6 and C12 parameters, used with geometric comb. rule */
203             gmx_mm_pr  c6_SSE0,c12_SSE0;
204             gmx_mm_pr  c6_SSE1,c12_SSE1;
205 #ifndef HALF_LJ
206             gmx_mm_pr  c6_SSE2,c12_SSE2;
207             gmx_mm_pr  c6_SSE3,c12_SSE3;
208 #endif
209 #endif
210
211             /* Intermediate variables for LJ calculation */
212 #ifndef LJ_COMB_LB
213             gmx_mm_pr  rinvsix_SSE0;
214             gmx_mm_pr  rinvsix_SSE1;
215 #ifndef HALF_LJ
216             gmx_mm_pr  rinvsix_SSE2;
217             gmx_mm_pr  rinvsix_SSE3;
218 #endif
219 #endif
220 #ifdef LJ_COMB_LB
221             gmx_mm_pr  sir_SSE0,sir2_SSE0,sir6_SSE0;
222             gmx_mm_pr  sir_SSE1,sir2_SSE1,sir6_SSE1;
223 #ifndef HALF_LJ
224             gmx_mm_pr  sir_SSE2,sir2_SSE2,sir6_SSE2;
225             gmx_mm_pr  sir_SSE3,sir2_SSE3,sir6_SSE3;
226 #endif
227 #endif
228
229             gmx_mm_pr  FrLJ6_SSE0,FrLJ12_SSE0;
230             gmx_mm_pr  FrLJ6_SSE1,FrLJ12_SSE1;
231 #ifndef HALF_LJ
232             gmx_mm_pr  FrLJ6_SSE2,FrLJ12_SSE2;
233             gmx_mm_pr  FrLJ6_SSE3,FrLJ12_SSE3;
234 #endif
235 #ifdef CALC_ENERGIES
236             gmx_mm_pr  VLJ6_SSE0,VLJ12_SSE0,VLJ_SSE0;
237             gmx_mm_pr  VLJ6_SSE1,VLJ12_SSE1,VLJ_SSE1;
238 #ifndef HALF_LJ
239             gmx_mm_pr  VLJ6_SSE2,VLJ12_SSE2,VLJ_SSE2;
240             gmx_mm_pr  VLJ6_SSE3,VLJ12_SSE3,VLJ_SSE3;
241 #endif
242 #endif
243 #endif /* CALC_LJ */
244
245             /* j-cluster index */
246             cj            = l_cj[cjind].cj;
247
248             /* Atom indices (of the first atom in the cluster) */
249             aj            = cj*UNROLLJ;
250 #if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
251 #if UNROLLJ == STRIDE
252             aj2           = aj*2;
253 #else
254             aj2           = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
255 #endif
256 #endif
257 #if UNROLLJ == STRIDE
258             ajx           = aj*DIM;
259 #else
260             ajx           = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
261 #endif
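            /* The coordinates are assumed to be stored as STRIDE x values,
             * then STRIDE y, then STRIDE z per chunk. When UNROLLJ != STRIDE,
             * two j-clusters share one chunk: cj>>1 selects the chunk and
             * (cj & 1)*UNROLLJ the half within it.
             */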
262             ajy           = ajx + STRIDE;
263             ajz           = ajy + STRIDE;
264
265 #ifdef CHECK_EXCLS
266 #ifndef GMX_MM256_HERE
267             {
268                 /* Load integer interaction mask */
269                 __m128i mask_int = _mm_set1_epi32(l_cj[cjind].excl);
270
271                 /* There is no not-equal SSE instruction, so we need a not here */
272                 int_SSE0  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask0),zeroi_SSE));
273                 int_SSE1  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask1),zeroi_SSE));
274                 int_SSE2  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask2),zeroi_SSE));
275                 int_SSE3  = gmx_mm_castsi128_pr(_mm_cmpeq_epi32(_mm_andnot_si128(mask_int,mask3),zeroi_SSE));
276             }
277 #else
278             {
279 #ifndef GMX_DOUBLE
280                 /* Load integer interaction mask */
281                 /* With AVX there are no integer operations, so cast to real */
282                 gmx_mm_pr mask_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl));
283                 /* We can't compare all 4*8=32 float bits: shift the mask */
284                 gmx_mm_pr masksh_pr = gmx_mm_castsi256_pr(_mm256_set1_epi32(l_cj[cjind].excl>>(2*UNROLLJ)));
285                 /* Intel Compiler version 12.1.3 20120130 is buggy: use cast.
286                  * With gcc we don't need the cast, but it's faster.
287                  */
288 #define cast_cvt(x)  _mm256_cvtepi32_ps(_mm256_castps_si256(x))
289                 int_SSE0  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask0)),zero_SSE);
290                 int_SSE1  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(mask_pr,mask1)),zero_SSE);
291                 int_SSE2  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr,mask0)),zero_SSE);
292                 int_SSE3  = gmx_cmpneq_pr(cast_cvt(gmx_and_pr(masksh_pr,mask1)),zero_SSE);
293 #undef cast_cvt
294 #else
295                 /* Load integer interaction mask */
296                 /* With AVX there are no integer operations,
297                  * and there is no int to double conversion, so cast to float
298                  */
299                 __m256 mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(l_cj[cjind].excl));
300 #define cast_cvt(x)  _mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castps_si256(x)))
301                 int_SSE0  = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask0)),zero_SSE);
302                 int_SSE1  = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask1)),zero_SSE);
303                 int_SSE2  = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask2)),zero_SSE);
304                 int_SSE3  = gmx_cmpneq_pr(cast_cvt(_mm256_and_ps(mask_ps,mask3)),zero_SSE);
305 #undef cast_cvt
306 #endif
307             }
308 #endif
309 #endif
310             /* load j atom coordinates */
311             jxSSE         = gmx_load_pr(x+ajx);
312             jySSE         = gmx_load_pr(x+ajy);
313             jzSSE         = gmx_load_pr(x+ajz);
314             
315             /* Calculate distance */
316             dx_SSE0       = gmx_sub_pr(ix_SSE0,jxSSE);
317             dy_SSE0       = gmx_sub_pr(iy_SSE0,jySSE);
318             dz_SSE0       = gmx_sub_pr(iz_SSE0,jzSSE);
319             dx_SSE1       = gmx_sub_pr(ix_SSE1,jxSSE);
320             dy_SSE1       = gmx_sub_pr(iy_SSE1,jySSE);
321             dz_SSE1       = gmx_sub_pr(iz_SSE1,jzSSE);
322             dx_SSE2       = gmx_sub_pr(ix_SSE2,jxSSE);
323             dy_SSE2       = gmx_sub_pr(iy_SSE2,jySSE);
324             dz_SSE2       = gmx_sub_pr(iz_SSE2,jzSSE);
325             dx_SSE3       = gmx_sub_pr(ix_SSE3,jxSSE);
326             dy_SSE3       = gmx_sub_pr(iy_SSE3,jySSE);
327             dz_SSE3       = gmx_sub_pr(iz_SSE3,jzSSE);
328             
329             /* rsq = dx*dx+dy*dy+dz*dz */
330             rsq_SSE0      = gmx_calc_rsq_pr(dx_SSE0,dy_SSE0,dz_SSE0);
331             rsq_SSE1      = gmx_calc_rsq_pr(dx_SSE1,dy_SSE1,dz_SSE1);
332             rsq_SSE2      = gmx_calc_rsq_pr(dx_SSE2,dy_SSE2,dz_SSE2);
333             rsq_SSE3      = gmx_calc_rsq_pr(dx_SSE3,dy_SSE3,dz_SSE3);
334
335 #ifndef CUTOFF_BLENDV
336             wco_SSE0      = gmx_cmplt_pr(rsq_SSE0,rc2_SSE);
337             wco_SSE1      = gmx_cmplt_pr(rsq_SSE1,rc2_SSE);
338             wco_SSE2      = gmx_cmplt_pr(rsq_SSE2,rc2_SSE);
339             wco_SSE3      = gmx_cmplt_pr(rsq_SSE3,rc2_SSE);
340 #endif
341
342 #ifdef CHECK_EXCLS
343 #ifdef EXCL_FORCES
344             /* Only remove the (sub-)diagonal to avoid double counting */
345 #if UNROLLJ == UNROLLI
346             if (cj == ci_sh)
347             {
348                 wco_SSE0  = gmx_and_pr(wco_SSE0,diag_SSE0);
349                 wco_SSE1  = gmx_and_pr(wco_SSE1,diag_SSE1);
350                 wco_SSE2  = gmx_and_pr(wco_SSE2,diag_SSE2);
351                 wco_SSE3  = gmx_and_pr(wco_SSE3,diag_SSE3);
352             }
353 #else
354 #if UNROLLJ < UNROLLI
355             if (cj == ci_sh*2)
356             {
357                 wco_SSE0  = gmx_and_pr(wco_SSE0,diag0_SSE0);
358                 wco_SSE1  = gmx_and_pr(wco_SSE1,diag0_SSE1);
359                 wco_SSE2  = gmx_and_pr(wco_SSE2,diag0_SSE2);
360                 wco_SSE3  = gmx_and_pr(wco_SSE3,diag0_SSE3);
361             }
362             if (cj == ci_sh*2 + 1)
363             { 
364                 wco_SSE0  = gmx_and_pr(wco_SSE0,diag1_SSE0);
365                 wco_SSE1  = gmx_and_pr(wco_SSE1,diag1_SSE1);
366                 wco_SSE2  = gmx_and_pr(wco_SSE2,diag1_SSE2);
367                 wco_SSE3  = gmx_and_pr(wco_SSE3,diag1_SSE3);
368             }
369 #else
370             if (cj*2 == ci_sh)
371             {
372                 wco_SSE0  = gmx_and_pr(wco_SSE0,diag0_SSE0);
373                 wco_SSE1  = gmx_and_pr(wco_SSE1,diag0_SSE1);
374                 wco_SSE2  = gmx_and_pr(wco_SSE2,diag0_SSE2);
375                 wco_SSE3  = gmx_and_pr(wco_SSE3,diag0_SSE3);
376             }
377             else if (cj*2 + 1 == ci_sh)
378             {
379                 wco_SSE0  = gmx_and_pr(wco_SSE0,diag1_SSE0);
380                 wco_SSE1  = gmx_and_pr(wco_SSE1,diag1_SSE1);
381                 wco_SSE2  = gmx_and_pr(wco_SSE2,diag1_SSE2);
382                 wco_SSE3  = gmx_and_pr(wco_SSE3,diag1_SSE3);
383             }
384 #endif
385 #endif
386 #else /* EXCL_FORCES */
387             /* Remove all excluded atom pairs from the list */
388             wco_SSE0      = gmx_and_pr(wco_SSE0,int_SSE0);
389             wco_SSE1      = gmx_and_pr(wco_SSE1,int_SSE1);
390             wco_SSE2      = gmx_and_pr(wco_SSE2,int_SSE2);
391             wco_SSE3      = gmx_and_pr(wco_SSE3,int_SSE3);
392 #endif
393 #endif
394
395 #ifdef COUNT_PAIRS
396             {
397                 int i,j;
398                 real tmp[UNROLLJ];
399                 for(i=0; i<UNROLLI; i++)
400                 {
401                     gmx_storeu_pr(tmp,i==0 ? wco_SSE0 : (i==1 ? wco_SSE1 : (i==2 ? wco_SSE2 : wco_SSE3)));
402                     for(j=0; j<UNROLLJ; j++)
403                     {
404                         if (tmp[j] != 0)
405                         {
406                             npair++;
407                         }
408                     }
409                 }
410             }
411 #endif
412
413 #ifdef CHECK_EXCLS
414             /* For excluded pairs add a small number to avoid r^-6 = NaN */
415             rsq_SSE0      = gmx_add_pr(rsq_SSE0,gmx_andnot_pr(int_SSE0,avoid_sing_SSE));
416             rsq_SSE1      = gmx_add_pr(rsq_SSE1,gmx_andnot_pr(int_SSE1,avoid_sing_SSE));
417             rsq_SSE2      = gmx_add_pr(rsq_SSE2,gmx_andnot_pr(int_SSE2,avoid_sing_SSE));
418             rsq_SSE3      = gmx_add_pr(rsq_SSE3,gmx_andnot_pr(int_SSE3,avoid_sing_SSE));
419 #endif
420
421             /* Calculate 1/r */
422 #ifndef GMX_DOUBLE
423             rinv_SSE0     = gmx_invsqrt_pr(rsq_SSE0);
424             rinv_SSE1     = gmx_invsqrt_pr(rsq_SSE1);
425             rinv_SSE2     = gmx_invsqrt_pr(rsq_SSE2);
426             rinv_SSE3     = gmx_invsqrt_pr(rsq_SSE3);
427 #else
428             GMX_MM_INVSQRT2_PD(rsq_SSE0,rsq_SSE1,rinv_SSE0,rinv_SSE1);
429             GMX_MM_INVSQRT2_PD(rsq_SSE2,rsq_SSE3,rinv_SSE2,rinv_SSE3);
430 #endif
431
432 #ifdef CALC_COULOMB
433             /* Load parameters for j atom */
434             jq_SSE        = gmx_load_pr(q+aj);
435             qq_SSE0       = gmx_mul_pr(iq_SSE0,jq_SSE);
436             qq_SSE1       = gmx_mul_pr(iq_SSE1,jq_SSE);
437             qq_SSE2       = gmx_mul_pr(iq_SSE2,jq_SSE);
438             qq_SSE3       = gmx_mul_pr(iq_SSE3,jq_SSE);
439 #endif
440
441 #ifdef CALC_LJ
442
443 #if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
444             load_lj_pair_params(nbfp0,type,aj,c6_SSE0,c12_SSE0);
445             load_lj_pair_params(nbfp1,type,aj,c6_SSE1,c12_SSE1);
446 #ifndef HALF_LJ
447             load_lj_pair_params(nbfp2,type,aj,c6_SSE2,c12_SSE2);
448             load_lj_pair_params(nbfp3,type,aj,c6_SSE3,c12_SSE3);
449 #endif
450 #endif /* not defined any LJ rule */
451
452 #ifdef LJ_COMB_GEOM
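            /* Geometric combination rule: C6_ij = sqrt(C6_i*C6_j) and likewise
             * for C12. The per-atom c6s/c12s values are assumed to hold the
             * square roots, so each pair parameter is a single multiply.
             */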
453             c6s_j_SSE     = gmx_load_pr(ljc+aj2+0);
454             c12s_j_SSE    = gmx_load_pr(ljc+aj2+STRIDE);
455             c6_SSE0       = gmx_mul_pr(c6s_SSE0 ,c6s_j_SSE );
456             c6_SSE1       = gmx_mul_pr(c6s_SSE1 ,c6s_j_SSE );
457 #ifndef HALF_LJ
458             c6_SSE2       = gmx_mul_pr(c6s_SSE2 ,c6s_j_SSE );
459             c6_SSE3       = gmx_mul_pr(c6s_SSE3 ,c6s_j_SSE );
460 #endif
461             c12_SSE0      = gmx_mul_pr(c12s_SSE0,c12s_j_SSE);
462             c12_SSE1      = gmx_mul_pr(c12s_SSE1,c12s_j_SSE);
463 #ifndef HALF_LJ
464             c12_SSE2      = gmx_mul_pr(c12s_SSE2,c12s_j_SSE);
465             c12_SSE3      = gmx_mul_pr(c12s_SSE3,c12s_j_SSE);
466 #endif
467 #endif /* LJ_COMB_GEOM */
468
469 #ifdef LJ_COMB_LB
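            /* Lorentz-Berthelot combination rule: sigma_ij = (sigma_i + sigma_j)/2
             * (hence the half sigmas) and epsilon_ij = sqrt(epsilon_i*epsilon_j)
             * (hence the sqrt epsilons).
             */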
470             hsig_j_SSE    = gmx_load_pr(ljc+aj2+0);
471             seps_j_SSE    = gmx_load_pr(ljc+aj2+STRIDE);
472
473             sig_SSE0      = gmx_add_pr(hsig_i_SSE0,hsig_j_SSE);
474             sig_SSE1      = gmx_add_pr(hsig_i_SSE1,hsig_j_SSE);
475             eps_SSE0      = gmx_mul_pr(seps_i_SSE0,seps_j_SSE);
476             eps_SSE1      = gmx_mul_pr(seps_i_SSE1,seps_j_SSE);
477 #ifndef HALF_LJ
478             sig_SSE2      = gmx_add_pr(hsig_i_SSE2,hsig_j_SSE);
479             sig_SSE3      = gmx_add_pr(hsig_i_SSE3,hsig_j_SSE);
480             eps_SSE2      = gmx_mul_pr(seps_i_SSE2,seps_j_SSE);
481             eps_SSE3      = gmx_mul_pr(seps_i_SSE3,seps_j_SSE);
482 #endif
483 #endif /* LJ_COMB_LB */
484
485 #endif /* CALC_LJ */
486
487 #ifndef CUTOFF_BLENDV
488             rinv_SSE0     = gmx_and_pr(rinv_SSE0,wco_SSE0);
489             rinv_SSE1     = gmx_and_pr(rinv_SSE1,wco_SSE1);
490             rinv_SSE2     = gmx_and_pr(rinv_SSE2,wco_SSE2);
491             rinv_SSE3     = gmx_and_pr(rinv_SSE3,wco_SSE3);
492 #else
493             /* We only need to mask for the cut-off: blendv is faster */
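            /* gmx_blendv_pr(a,b,c) is assumed to select b where the sign bit of c
             * is set; rc2 - rsq is negative beyond the cut-off, so 1/r is zeroed there.
             */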
494             rinv_SSE0     = gmx_blendv_pr(rinv_SSE0,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE0));
495             rinv_SSE1     = gmx_blendv_pr(rinv_SSE1,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE1));
496             rinv_SSE2     = gmx_blendv_pr(rinv_SSE2,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE2));
497             rinv_SSE3     = gmx_blendv_pr(rinv_SSE3,zero_SSE,gmx_sub_pr(rc2_SSE,rsq_SSE3));
498 #endif
499
500             rinvsq_SSE0   = gmx_mul_pr(rinv_SSE0,rinv_SSE0);
501             rinvsq_SSE1   = gmx_mul_pr(rinv_SSE1,rinv_SSE1);
502             rinvsq_SSE2   = gmx_mul_pr(rinv_SSE2,rinv_SSE2);
503             rinvsq_SSE3   = gmx_mul_pr(rinv_SSE3,rinv_SSE3);
504
505 #ifdef CALC_COULOMB
506             /* Note that here we calculate force*r, not the usual force/r.
507              * This avoids having to mask the reaction-field contribution,
508              * as frcoul is later multiplied by rinvsq, which has already been
509              * masked with the cut-off check.
510              */
511
512 #ifdef EXCL_FORCES
513             /* Only add 1/r for non-excluded atom pairs */
514             rinv_ex_SSE0  = gmx_and_pr(rinv_SSE0,int_SSE0);
515             rinv_ex_SSE1  = gmx_and_pr(rinv_SSE1,int_SSE1);
516             rinv_ex_SSE2  = gmx_and_pr(rinv_SSE2,int_SSE2);
517             rinv_ex_SSE3  = gmx_and_pr(rinv_SSE3,int_SSE3);
518 #else
519             /* No exclusion forces, we always need 1/r */
520 #define     rinv_ex_SSE0    rinv_SSE0
521 #define     rinv_ex_SSE1    rinv_SSE1
522 #define     rinv_ex_SSE2    rinv_SSE2
523 #define     rinv_ex_SSE3    rinv_SSE3
524 #endif
525
526 #ifdef CALC_COUL_RF
527             /* Electrostatic interactions */
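            /* A sketch of the reaction-field expressions, assuming the usual
             * constants mrc_3 = -2*k_rf, hrc_3 = k_rf and moh_rc = -c_rf:
             *   F(r)*r = qq*(1/r - 2*k_rf*r^2)
             *   V(r)   = qq*(1/r + k_rf*r^2 - c_rf)
             */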
528             frcoul_SSE0   = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_mul_pr(rsq_SSE0,mrc_3_SSE)));
529             frcoul_SSE1   = gmx_mul_pr(qq_SSE1,gmx_add_pr(rinv_ex_SSE1,gmx_mul_pr(rsq_SSE1,mrc_3_SSE)));
530             frcoul_SSE2   = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_mul_pr(rsq_SSE2,mrc_3_SSE)));
531             frcoul_SSE3   = gmx_mul_pr(qq_SSE3,gmx_add_pr(rinv_ex_SSE3,gmx_mul_pr(rsq_SSE3,mrc_3_SSE)));
532
533 #ifdef CALC_ENERGIES
534             vcoul_SSE0    = gmx_mul_pr(qq_SSE0,gmx_add_pr(rinv_ex_SSE0,gmx_add_pr(gmx_mul_pr(rsq_SSE0,hrc_3_SSE),moh_rc_SSE)));
535             vcoul_SSE1    = gmx_mul_pr(qq_SSE1,gmx_add_pr(rinv_ex_SSE1,gmx_add_pr(gmx_mul_pr(rsq_SSE1,hrc_3_SSE),moh_rc_SSE)));
536             vcoul_SSE2    = gmx_mul_pr(qq_SSE2,gmx_add_pr(rinv_ex_SSE2,gmx_add_pr(gmx_mul_pr(rsq_SSE2,hrc_3_SSE),moh_rc_SSE)));
537             vcoul_SSE3    = gmx_mul_pr(qq_SSE3,gmx_add_pr(rinv_ex_SSE3,gmx_add_pr(gmx_mul_pr(rsq_SSE3,hrc_3_SSE),moh_rc_SSE)));
538 #endif
539 #endif
540
541 #ifdef CALC_COUL_TAB
542             /* Electrostatic interactions */
543             r_SSE0        = gmx_mul_pr(rsq_SSE0,rinv_SSE0);
544             r_SSE1        = gmx_mul_pr(rsq_SSE1,rinv_SSE1);
545             r_SSE2        = gmx_mul_pr(rsq_SSE2,rinv_SSE2);
546             r_SSE3        = gmx_mul_pr(rsq_SSE3,rinv_SSE3);
547             /* Convert r to scaled table units */
548             rs_SSE0       = gmx_mul_pr(r_SSE0,invtsp_SSE);
549             rs_SSE1       = gmx_mul_pr(r_SSE1,invtsp_SSE);
550             rs_SSE2       = gmx_mul_pr(r_SSE2,invtsp_SSE);
551             rs_SSE3       = gmx_mul_pr(r_SSE3,invtsp_SSE);
552             /* Truncate scaled r to an int */
553             ti_SSE0       = gmx_cvttpr_epi32(rs_SSE0);
554             ti_SSE1       = gmx_cvttpr_epi32(rs_SSE1);
555             ti_SSE2       = gmx_cvttpr_epi32(rs_SSE2);
556             ti_SSE3       = gmx_cvttpr_epi32(rs_SSE3);
557 #ifdef GMX_X86_SSE4_1
558             /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
559             rf_SSE0       = gmx_floor_pr(rs_SSE0);
560             rf_SSE1       = gmx_floor_pr(rs_SSE1);
561             rf_SSE2       = gmx_floor_pr(rs_SSE2);
562             rf_SSE3       = gmx_floor_pr(rs_SSE3);
563 #else
564             rf_SSE0       = gmx_cvtepi32_pr(ti_SSE0);
565             rf_SSE1       = gmx_cvtepi32_pr(ti_SSE1);
566             rf_SSE2       = gmx_cvtepi32_pr(ti_SSE2);
567             rf_SSE3       = gmx_cvtepi32_pr(ti_SSE3);
568 #endif
569             frac_SSE0     = gmx_sub_pr(rs_SSE0,rf_SSE0);
570             frac_SSE1     = gmx_sub_pr(rs_SSE1,rf_SSE1);
571             frac_SSE2     = gmx_sub_pr(rs_SSE2,rf_SSE2);
572             frac_SSE3     = gmx_sub_pr(rs_SSE3,rf_SSE3);
573
574             /* Load and interpolate table forces and possibly energies.
575              * Force and energy can be combined in one table, stride 4: FDV0
576  * or in two separate tables with stride 1: F and V.
577  * Currently single precision uses FDV0, double precision uses F and V.
578              */
579 #ifndef CALC_ENERGIES
580             load_table_f(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0);
581             load_table_f(tab_coul_F,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1);
582             load_table_f(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2);
583             load_table_f(tab_coul_F,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3);
584 #else
585 #ifdef TAB_FDV0
586             load_table_f_v(tab_coul_F,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
587             load_table_f_v(tab_coul_F,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1,ctabv_SSE1);
588             load_table_f_v(tab_coul_F,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
589             load_table_f_v(tab_coul_F,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3,ctabv_SSE3);
590 #else
591             load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE0,ti0,ctab0_SSE0,ctab1_SSE0,ctabv_SSE0);
592             load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE1,ti1,ctab0_SSE1,ctab1_SSE1,ctabv_SSE1);
593             load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE2,ti2,ctab0_SSE2,ctab1_SSE2,ctabv_SSE2);
594             load_table_f_v(tab_coul_F,tab_coul_V,ti_SSE3,ti3,ctab0_SSE3,ctab1_SSE3,ctabv_SSE3);
595 #endif
596 #endif
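            /* Linear interpolation of the tabulated force, assuming invtsp = 1/h
             * with h the table spacing and ctab1 the stored difference F[i+1]-F[i]
             * (the D of FDV0):  fsub ~= F[i] + frac*(F[i+1] - F[i]).
             * The energy below then follows by integrating this linear force,
             * assuming mhalfsp = -h/2:  vc_sub ~= V[i] - (h/2)*frac*(F[i] + fsub).
             */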
597             fsub_SSE0     = gmx_add_pr(ctab0_SSE0,gmx_mul_pr(frac_SSE0,ctab1_SSE0));
598             fsub_SSE1     = gmx_add_pr(ctab0_SSE1,gmx_mul_pr(frac_SSE1,ctab1_SSE1));
599             fsub_SSE2     = gmx_add_pr(ctab0_SSE2,gmx_mul_pr(frac_SSE2,ctab1_SSE2));
600             fsub_SSE3     = gmx_add_pr(ctab0_SSE3,gmx_mul_pr(frac_SSE3,ctab1_SSE3));
601             frcoul_SSE0   = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,gmx_mul_pr(fsub_SSE0,r_SSE0)));
602             frcoul_SSE1   = gmx_mul_pr(qq_SSE1,gmx_sub_pr(rinv_ex_SSE1,gmx_mul_pr(fsub_SSE1,r_SSE1)));
603             frcoul_SSE2   = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,gmx_mul_pr(fsub_SSE2,r_SSE2)));
604             frcoul_SSE3   = gmx_mul_pr(qq_SSE3,gmx_sub_pr(rinv_ex_SSE3,gmx_mul_pr(fsub_SSE3,r_SSE3)));
605
606 #ifdef CALC_ENERGIES
607             vc_sub_SSE0   = gmx_add_pr(ctabv_SSE0,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE0),gmx_add_pr(ctab0_SSE0,fsub_SSE0)));
608             vc_sub_SSE1   = gmx_add_pr(ctabv_SSE1,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE1),gmx_add_pr(ctab0_SSE1,fsub_SSE1)));
609             vc_sub_SSE2   = gmx_add_pr(ctabv_SSE2,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE2),gmx_add_pr(ctab0_SSE2,fsub_SSE2)));
610             vc_sub_SSE3   = gmx_add_pr(ctabv_SSE3,gmx_mul_pr(gmx_mul_pr(mhalfsp_SSE,frac_SSE3),gmx_add_pr(ctab0_SSE3,fsub_SSE3)));
611
612 #ifndef NO_SHIFT_EWALD
613             /* Add Ewald potential shift to vc_sub for convenience */
614 #ifdef CHECK_EXCLS
615             vc_sub_SSE0   = gmx_add_pr(vc_sub_SSE0,gmx_and_pr(sh_ewald_SSE,int_SSE0));
616             vc_sub_SSE1   = gmx_add_pr(vc_sub_SSE1,gmx_and_pr(sh_ewald_SSE,int_SSE1));
617             vc_sub_SSE2   = gmx_add_pr(vc_sub_SSE2,gmx_and_pr(sh_ewald_SSE,int_SSE2));
618             vc_sub_SSE3   = gmx_add_pr(vc_sub_SSE3,gmx_and_pr(sh_ewald_SSE,int_SSE3));
619 #else
620             vc_sub_SSE0   = gmx_add_pr(vc_sub_SSE0,sh_ewald_SSE);
621             vc_sub_SSE1   = gmx_add_pr(vc_sub_SSE1,sh_ewald_SSE);
622             vc_sub_SSE2   = gmx_add_pr(vc_sub_SSE2,sh_ewald_SSE);
623             vc_sub_SSE3   = gmx_add_pr(vc_sub_SSE3,sh_ewald_SSE);
624 #endif
625 #endif
626             
627             vcoul_SSE0    = gmx_mul_pr(qq_SSE0,gmx_sub_pr(rinv_ex_SSE0,vc_sub_SSE0));
628             vcoul_SSE1    = gmx_mul_pr(qq_SSE1,gmx_sub_pr(rinv_ex_SSE1,vc_sub_SSE1));
629             vcoul_SSE2    = gmx_mul_pr(qq_SSE2,gmx_sub_pr(rinv_ex_SSE2,vc_sub_SSE2));
630             vcoul_SSE3    = gmx_mul_pr(qq_SSE3,gmx_sub_pr(rinv_ex_SSE3,vc_sub_SSE3));
631
632 #endif
633 #endif
634
635 #ifdef CALC_ENERGIES
636             /* Mask energy for cut-off and diagonal */
637             vcoul_SSE0    = gmx_and_pr(vcoul_SSE0,wco_SSE0);
638             vcoul_SSE1    = gmx_and_pr(vcoul_SSE1,wco_SSE1);
639             vcoul_SSE2    = gmx_and_pr(vcoul_SSE2,wco_SSE2);
640             vcoul_SSE3    = gmx_and_pr(vcoul_SSE3,wco_SSE3);
641 #endif
642
643 #endif /* CALC_COULOMB */
644
645 #ifdef CALC_LJ
646             /* Lennard-Jones interaction */
647
648 #ifdef VDW_CUTOFF_CHECK
649             wco_vdw_SSE0  = gmx_cmplt_pr(rsq_SSE0,rcvdw2_SSE);
650             wco_vdw_SSE1  = gmx_cmplt_pr(rsq_SSE1,rcvdw2_SSE);
651 #ifndef HALF_LJ
652             wco_vdw_SSE2  = gmx_cmplt_pr(rsq_SSE2,rcvdw2_SSE);
653             wco_vdw_SSE3  = gmx_cmplt_pr(rsq_SSE3,rcvdw2_SSE);
654 #endif
655 #else
656             /* Same cut-off for Coulomb and VdW, reuse the registers */
657 #define     wco_vdw_SSE0    wco_SSE0
658 #define     wco_vdw_SSE1    wco_SSE1
659 #define     wco_vdw_SSE2    wco_SSE2
660 #define     wco_vdw_SSE3    wco_SSE3
661 #endif
662
663 #ifndef LJ_COMB_LB
664             rinvsix_SSE0  = gmx_mul_pr(rinvsq_SSE0,gmx_mul_pr(rinvsq_SSE0,rinvsq_SSE0));
665             rinvsix_SSE1  = gmx_mul_pr(rinvsq_SSE1,gmx_mul_pr(rinvsq_SSE1,rinvsq_SSE1));
666 #ifdef EXCL_FORCES
667             rinvsix_SSE0  = gmx_and_pr(rinvsix_SSE0,int_SSE0);
668             rinvsix_SSE1  = gmx_and_pr(rinvsix_SSE1,int_SSE1);
669 #endif
670 #ifndef HALF_LJ
671             rinvsix_SSE2  = gmx_mul_pr(rinvsq_SSE2,gmx_mul_pr(rinvsq_SSE2,rinvsq_SSE2));
672             rinvsix_SSE3  = gmx_mul_pr(rinvsq_SSE3,gmx_mul_pr(rinvsq_SSE3,rinvsq_SSE3));
673 #ifdef EXCL_FORCES
674             rinvsix_SSE2  = gmx_and_pr(rinvsix_SSE2,int_SSE2);
675             rinvsix_SSE3  = gmx_and_pr(rinvsix_SSE3,int_SSE3);
676 #endif
677 #endif
678 #ifdef VDW_CUTOFF_CHECK
679             rinvsix_SSE0  = gmx_and_pr(rinvsix_SSE0,wco_vdw_SSE0);
680             rinvsix_SSE1  = gmx_and_pr(rinvsix_SSE1,wco_vdw_SSE1);
681 #ifndef HALF_LJ
682             rinvsix_SSE2  = gmx_and_pr(rinvsix_SSE2,wco_vdw_SSE2);
683             rinvsix_SSE3  = gmx_and_pr(rinvsix_SSE3,wco_vdw_SSE3);
684 #endif
685 #endif
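            /* A sketch of the force terms, assuming the 6 and 12 prefactors are
             * already folded into the stored c6/c12 (c6 = 6*C6, c12 = 12*C12):
             *   FrLJ12 - FrLJ6 = 12*C12/r^12 - 6*C6/r^6 = F(r)*r,
             * which the rinvsq factor in fscal below turns into F/r.
             */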
686             FrLJ6_SSE0    = gmx_mul_pr(c6_SSE0,rinvsix_SSE0);
687             FrLJ6_SSE1    = gmx_mul_pr(c6_SSE1,rinvsix_SSE1);
688 #ifndef HALF_LJ
689             FrLJ6_SSE2    = gmx_mul_pr(c6_SSE2,rinvsix_SSE2);
690             FrLJ6_SSE3    = gmx_mul_pr(c6_SSE3,rinvsix_SSE3);
691 #endif
692             FrLJ12_SSE0   = gmx_mul_pr(c12_SSE0,gmx_mul_pr(rinvsix_SSE0,rinvsix_SSE0));
693             FrLJ12_SSE1   = gmx_mul_pr(c12_SSE1,gmx_mul_pr(rinvsix_SSE1,rinvsix_SSE1));
694 #ifndef HALF_LJ
695             FrLJ12_SSE2   = gmx_mul_pr(c12_SSE2,gmx_mul_pr(rinvsix_SSE2,rinvsix_SSE2));
696             FrLJ12_SSE3   = gmx_mul_pr(c12_SSE3,gmx_mul_pr(rinvsix_SSE3,rinvsix_SSE3));
697 #endif
698 #endif /* not LJ_COMB_LB */
699
700 #ifdef LJ_COMB_LB
701             sir_SSE0      = gmx_mul_pr(sig_SSE0,rinv_SSE0);
702             sir_SSE1      = gmx_mul_pr(sig_SSE1,rinv_SSE1);
703 #ifndef HALF_LJ
704             sir_SSE2      = gmx_mul_pr(sig_SSE2,rinv_SSE2);
705             sir_SSE3      = gmx_mul_pr(sig_SSE3,rinv_SSE3);
706 #endif
707             sir2_SSE0     = gmx_mul_pr(sir_SSE0,sir_SSE0);
708             sir2_SSE1     = gmx_mul_pr(sir_SSE1,sir_SSE1);
709 #ifndef HALF_LJ
710             sir2_SSE2     = gmx_mul_pr(sir_SSE2,sir_SSE2);
711             sir2_SSE3     = gmx_mul_pr(sir_SSE3,sir_SSE3);
712 #endif
713             sir6_SSE0     = gmx_mul_pr(sir2_SSE0,gmx_mul_pr(sir2_SSE0,sir2_SSE0));
714             sir6_SSE1     = gmx_mul_pr(sir2_SSE1,gmx_mul_pr(sir2_SSE1,sir2_SSE1));
715 #ifdef EXCL_FORCES
716             sir6_SSE0     = gmx_and_pr(sir6_SSE0,int_SSE0);
717             sir6_SSE1     = gmx_and_pr(sir6_SSE1,int_SSE1);
718 #endif
719 #ifndef HALF_LJ
720             sir6_SSE2     = gmx_mul_pr(sir2_SSE2,gmx_mul_pr(sir2_SSE2,sir2_SSE2));
721             sir6_SSE3     = gmx_mul_pr(sir2_SSE3,gmx_mul_pr(sir2_SSE3,sir2_SSE3));
722 #ifdef EXCL_FORCES
723             sir6_SSE2     = gmx_and_pr(sir6_SSE2,int_SSE2);
724             sir6_SSE3     = gmx_and_pr(sir6_SSE3,int_SSE3);
725 #endif
726 #endif
727 #ifdef VDW_CUTOFF_CHECK
728             sir6_SSE0     = gmx_and_pr(sir6_SSE0,wco_vdw_SSE0);
729             sir6_SSE1     = gmx_and_pr(sir6_SSE1,wco_vdw_SSE1);
730 #ifndef HALF_LJ
731             sir6_SSE2     = gmx_and_pr(sir6_SSE2,wco_vdw_SSE2);
732             sir6_SSE3     = gmx_and_pr(sir6_SSE3,wco_vdw_SSE3);
733 #endif
734 #endif
735             FrLJ6_SSE0    = gmx_mul_pr(eps_SSE0,sir6_SSE0);
736             FrLJ6_SSE1    = gmx_mul_pr(eps_SSE1,sir6_SSE1);
737 #ifndef HALF_LJ
738             FrLJ6_SSE2    = gmx_mul_pr(eps_SSE2,sir6_SSE2);
739             FrLJ6_SSE3    = gmx_mul_pr(eps_SSE3,sir6_SSE3);
740 #endif
741             FrLJ12_SSE0   = gmx_mul_pr(FrLJ6_SSE0,sir6_SSE0);
742             FrLJ12_SSE1   = gmx_mul_pr(FrLJ6_SSE1,sir6_SSE1);
743 #ifndef HALF_LJ
744             FrLJ12_SSE2   = gmx_mul_pr(FrLJ6_SSE2,sir6_SSE2);
745             FrLJ12_SSE3   = gmx_mul_pr(FrLJ6_SSE3,sir6_SSE3);
746 #endif
747 #if defined CALC_ENERGIES
748             /* We need C6 and C12 to calculate the LJ potential shift */
749             sig2_SSE0     = gmx_mul_pr(sig_SSE0,sig_SSE0);
750             sig2_SSE1     = gmx_mul_pr(sig_SSE1,sig_SSE1);
751 #ifndef HALF_LJ
752             sig2_SSE2     = gmx_mul_pr(sig_SSE2,sig_SSE2);
753             sig2_SSE3     = gmx_mul_pr(sig_SSE3,sig_SSE3);
754 #endif
755             sig6_SSE0     = gmx_mul_pr(sig2_SSE0,gmx_mul_pr(sig2_SSE0,sig2_SSE0));
756             sig6_SSE1     = gmx_mul_pr(sig2_SSE1,gmx_mul_pr(sig2_SSE1,sig2_SSE1));
757 #ifndef HALF_LJ
758             sig6_SSE2     = gmx_mul_pr(sig2_SSE2,gmx_mul_pr(sig2_SSE2,sig2_SSE2));
759             sig6_SSE3     = gmx_mul_pr(sig2_SSE3,gmx_mul_pr(sig2_SSE3,sig2_SSE3));
760 #endif
761             c6_SSE0       = gmx_mul_pr(eps_SSE0,sig6_SSE0);
762             c6_SSE1       = gmx_mul_pr(eps_SSE1,sig6_SSE1);
763 #ifndef HALF_LJ
764             c6_SSE2       = gmx_mul_pr(eps_SSE2,sig6_SSE2);
765             c6_SSE3       = gmx_mul_pr(eps_SSE3,sig6_SSE3);
766 #endif
767             c12_SSE0      = gmx_mul_pr(c6_SSE0,sig6_SSE0);
768             c12_SSE1      = gmx_mul_pr(c6_SSE1,sig6_SSE1);
769 #ifndef HALF_LJ
770             c12_SSE2      = gmx_mul_pr(c6_SSE2,sig6_SSE2);
771             c12_SSE3      = gmx_mul_pr(c6_SSE3,sig6_SSE3);
772 #endif
773 #endif
774 #endif /* LJ_COMB_LB */
775
776 #endif /* CALC_LJ */
777             
778 #ifdef CALC_ENERGIES
779 #ifdef ENERGY_GROUPS
780             /* Extract the group pair index per j pair */
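            /* nbat->energrp is assumed to pack egps_jshift-bit energy-group indices
             * for the atoms of a j-cluster into one int; each is converted to an
             * offset (times egps_jstride) into the per-i-group energy buffers.
             */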
781 #if UNROLLJ == 2
782             egps_j        = nbat->energrp[cj>>1];
783             egp_jj[0]     = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
784 #else
785             egps_j        = nbat->energrp[cj];
786             for(jj=0; jj<(UNROLLJ>>1); jj++)
787             {
788                 egp_jj[jj]  = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
789             }
790 #endif
791 #endif
792
793 #ifdef CALC_COULOMB
794 #ifndef ENERGY_GROUPS
795             vctotSSE      = gmx_add_pr(vctotSSE, gmx_sum4_pr(vcoul_SSE0,vcoul_SSE1,vcoul_SSE2,vcoul_SSE3));
796 #else
797             add_ener_grp(vcoul_SSE0,vctp[0],egp_jj);
798             add_ener_grp(vcoul_SSE1,vctp[1],egp_jj);
799             add_ener_grp(vcoul_SSE2,vctp[2],egp_jj);
800             add_ener_grp(vcoul_SSE3,vctp[3],egp_jj);
801 #endif
802 #endif
803
804 #ifdef CALC_LJ
805             /* Calculate the LJ energies */
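            /* A sketch, under the same assumption that c6 = 6*C6 and c12 = 12*C12,
             * with sh_invrc6 = 1/rc^6 and sh_invrc12 = 1/rc^12:
             *   VLJ = C12*(1/r^12 - 1/rc^12) - C6*(1/r^6 - 1/rc^6),
             * i.e. the cut-off-shifted Lennard-Jones potential.
             */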
806             VLJ6_SSE0     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE0,gmx_mul_pr(c6_SSE0,sh_invrc6_SSE)));
807             VLJ6_SSE1     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE1,gmx_mul_pr(c6_SSE1,sh_invrc6_SSE)));
808 #ifndef HALF_LJ
809             VLJ6_SSE2     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE2,gmx_mul_pr(c6_SSE2,sh_invrc6_SSE)));
810             VLJ6_SSE3     = gmx_mul_pr(sixthSSE,gmx_sub_pr(FrLJ6_SSE3,gmx_mul_pr(c6_SSE3,sh_invrc6_SSE)));
811 #endif
812             VLJ12_SSE0    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE0,gmx_mul_pr(c12_SSE0,sh_invrc12_SSE)));
813             VLJ12_SSE1    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE1,gmx_mul_pr(c12_SSE1,sh_invrc12_SSE)));
814 #ifndef HALF_LJ
815             VLJ12_SSE2    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE2,gmx_mul_pr(c12_SSE2,sh_invrc12_SSE)));
816             VLJ12_SSE3    = gmx_mul_pr(twelvethSSE,gmx_sub_pr(FrLJ12_SSE3,gmx_mul_pr(c12_SSE3,sh_invrc12_SSE)));
817 #endif
818
819             VLJ_SSE0      = gmx_sub_pr(VLJ12_SSE0,VLJ6_SSE0);
820             VLJ_SSE1      = gmx_sub_pr(VLJ12_SSE1,VLJ6_SSE1);
821 #ifndef HALF_LJ
822             VLJ_SSE2      = gmx_sub_pr(VLJ12_SSE2,VLJ6_SSE2);
823             VLJ_SSE3      = gmx_sub_pr(VLJ12_SSE3,VLJ6_SSE3);
824 #endif
825             /* The potential shift should be removed for pairs beyond cut-off */
826             VLJ_SSE0      = gmx_and_pr(VLJ_SSE0,wco_vdw_SSE0);
827             VLJ_SSE1      = gmx_and_pr(VLJ_SSE1,wco_vdw_SSE1);
828 #ifndef HALF_LJ
829             VLJ_SSE2      = gmx_and_pr(VLJ_SSE2,wco_vdw_SSE2);
830             VLJ_SSE3      = gmx_and_pr(VLJ_SSE3,wco_vdw_SSE3);
831 #endif
832 #ifdef CHECK_EXCLS
833             /* The potential shift should be removed for excluded pairs */
834             VLJ_SSE0      = gmx_and_pr(VLJ_SSE0,int_SSE0);
835             VLJ_SSE1      = gmx_and_pr(VLJ_SSE1,int_SSE1);
836 #ifndef HALF_LJ
837             VLJ_SSE2      = gmx_and_pr(VLJ_SSE2,int_SSE2);
838             VLJ_SSE3      = gmx_and_pr(VLJ_SSE3,int_SSE3);
839 #endif
840 #endif
841 #ifndef ENERGY_GROUPS
842             VvdwtotSSE    = gmx_add_pr(VvdwtotSSE,
843 #ifndef HALF_LJ
844                                        gmx_sum4_pr(VLJ_SSE0,VLJ_SSE1,VLJ_SSE2,VLJ_SSE3)
845 #else
846                                        gmx_add_pr(VLJ_SSE0,VLJ_SSE1)
847 #endif
848                                       );
849 #else
850             add_ener_grp(VLJ_SSE0,vvdwtp[0],egp_jj);
851             add_ener_grp(VLJ_SSE1,vvdwtp[1],egp_jj);
852 #ifndef HALF_LJ
853             add_ener_grp(VLJ_SSE2,vvdwtp[2],egp_jj);
854             add_ener_grp(VLJ_SSE3,vvdwtp[3],egp_jj);
855 #endif
856 #endif
857 #endif /* CALC_LJ */
858 #endif /* CALC_ENERGIES */
859
860 #ifdef CALC_LJ
861             fscal_SSE0    = gmx_mul_pr(rinvsq_SSE0,
862 #ifdef CALC_COULOMB
863                                                    gmx_add_pr(frcoul_SSE0,
864 #else
865                                                    (
866 #endif
867                                                     gmx_sub_pr(FrLJ12_SSE0,FrLJ6_SSE0)));
868             fscal_SSE1    = gmx_mul_pr(rinvsq_SSE1,
869 #ifdef CALC_COULOMB
870                                                    gmx_add_pr(frcoul_SSE1,
871 #else
872                                                    (
873 #endif
874                                                     gmx_sub_pr(FrLJ12_SSE1,FrLJ6_SSE1)));
875 #else
876             fscal_SSE0    = gmx_mul_pr(rinvsq_SSE0,frcoul_SSE0);
877             fscal_SSE1    = gmx_mul_pr(rinvsq_SSE1,frcoul_SSE1);
878 #endif /* CALC_LJ */
879 #if defined CALC_LJ && !defined HALF_LJ
880             fscal_SSE2    = gmx_mul_pr(rinvsq_SSE2,
881 #ifdef CALC_COULOMB
882                                                    gmx_add_pr(frcoul_SSE2,
883 #else
884                                                    (
885 #endif
886                                                     gmx_sub_pr(FrLJ12_SSE2,FrLJ6_SSE2)));
887             fscal_SSE3    = gmx_mul_pr(rinvsq_SSE3,
888 #ifdef CALC_COULOMB
889                                                    gmx_add_pr(frcoul_SSE3,
890 #else
891                                                    (
892 #endif
893                                                     gmx_sub_pr(FrLJ12_SSE3,FrLJ6_SSE3)));
894 #else
895             /* Atoms 2 and 3 don't have LJ, so only add Coulomb forces */
896             fscal_SSE2    = gmx_mul_pr(rinvsq_SSE2,frcoul_SSE2);
897             fscal_SSE3    = gmx_mul_pr(rinvsq_SSE3,frcoul_SSE3);
898 #endif
899             
900             /* Calculate temporary vectorial force */
901             tx_SSE0       = gmx_mul_pr(fscal_SSE0,dx_SSE0);
902             tx_SSE1       = gmx_mul_pr(fscal_SSE1,dx_SSE1);
903             tx_SSE2       = gmx_mul_pr(fscal_SSE2,dx_SSE2);
904             tx_SSE3       = gmx_mul_pr(fscal_SSE3,dx_SSE3);
905             ty_SSE0       = gmx_mul_pr(fscal_SSE0,dy_SSE0);
906             ty_SSE1       = gmx_mul_pr(fscal_SSE1,dy_SSE1);
907             ty_SSE2       = gmx_mul_pr(fscal_SSE2,dy_SSE2);
908             ty_SSE3       = gmx_mul_pr(fscal_SSE3,dy_SSE3);
909             tz_SSE0       = gmx_mul_pr(fscal_SSE0,dz_SSE0);
910             tz_SSE1       = gmx_mul_pr(fscal_SSE1,dz_SSE1);
911             tz_SSE2       = gmx_mul_pr(fscal_SSE2,dz_SSE2);
912             tz_SSE3       = gmx_mul_pr(fscal_SSE3,dz_SSE3);
913             
914             /* Increment i atom force */
915             fix_SSE0      = gmx_add_pr(fix_SSE0,tx_SSE0);
916             fix_SSE1      = gmx_add_pr(fix_SSE1,tx_SSE1);
917             fix_SSE2      = gmx_add_pr(fix_SSE2,tx_SSE2);
918             fix_SSE3      = gmx_add_pr(fix_SSE3,tx_SSE3);
919             fiy_SSE0      = gmx_add_pr(fiy_SSE0,ty_SSE0);
920             fiy_SSE1      = gmx_add_pr(fiy_SSE1,ty_SSE1);
921             fiy_SSE2      = gmx_add_pr(fiy_SSE2,ty_SSE2);
922             fiy_SSE3      = gmx_add_pr(fiy_SSE3,ty_SSE3);
923             fiz_SSE0      = gmx_add_pr(fiz_SSE0,tz_SSE0);
924             fiz_SSE1      = gmx_add_pr(fiz_SSE1,tz_SSE1);
925             fiz_SSE2      = gmx_add_pr(fiz_SSE2,tz_SSE2);
926             fiz_SSE3      = gmx_add_pr(fiz_SSE3,tz_SSE3);
927             
928             /* Decrement j atom force */
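            /* gmx_sum4_pr adds the contributions of the four i atoms, so each
             * j force component is updated with a single SIMD load and store.
             */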
929             gmx_store_pr(f+ajx,
930                          gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_SSE0,tx_SSE1,tx_SSE2,tx_SSE3) ));
931             gmx_store_pr(f+ajy,
932                          gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_SSE0,ty_SSE1,ty_SSE2,ty_SSE3) ));
933             gmx_store_pr(f+ajz,
934                          gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_SSE0,tz_SSE1,tz_SSE2,tz_SSE3) ));
935         }
936
937 #undef  rinv_ex_SSE0
938 #undef  rinv_ex_SSE1
939 #undef  rinv_ex_SSE2
940 #undef  rinv_ex_SSE3
941
942 #undef  wco_vdw_SSE0
943 #undef  wco_vdw_SSE1
944 #undef  wco_vdw_SSE2
945 #undef  wco_vdw_SSE3
946
947 #undef  CUTOFF_BLENDV
948
949 #undef  EXCL_FORCES