added Verlet scheme and NxN non-bonded functionality
[alexxy/gromacs.git] / src / gmxlib / bondfree.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  *                This source code is part of
5  * 
6  *                 G   R   O   M   A   C   S
7  * 
8  *          GROningen MAchine for Chemical Simulations
9  * 
10  *                        VERSION 3.2.0
11  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13  * Copyright (c) 2001-2004, The GROMACS development team,
14  * check out http://www.gromacs.org for more information.
15
16  * This program is free software; you can redistribute it and/or
17  * modify it under the terms of the GNU General Public License
18  * as published by the Free Software Foundation; either version 2
19  * of the License, or (at your option) any later version.
20  * 
21  * If you want to redistribute modifications, please consider that
22  * scientific software is very special. Version control is crucial -
23  * bugs must be traceable. We will be happy to consider code for
24  * inclusion in the official distribution, but derived work must not
25  * be called official GROMACS. Details are found in the README & COPYING
26  * files - if they are missing, get the official version at www.gromacs.org.
27  * 
28  * To help us fund GROMACS development, we humbly ask that you cite
29  * the papers on the package - you can find them in the top README file.
30  * 
31  * For more info, check our website at http://www.gromacs.org
32  * 
33  * And Hey:
34  * GROningen Mixture of Alchemy and Childrens' Stories
35  */
36 #ifdef HAVE_CONFIG_H
37 #include <config.h>
38 #endif
39
40 #include <math.h>
41 #include "physics.h"
42 #include "vec.h"
43 #include "maths.h"
44 #include "txtdump.h"
45 #include "bondf.h"
46 #include "smalloc.h"
47 #include "pbc.h"
48 #include "ns.h"
49 #include "macros.h"
50 #include "names.h"
51 #include "gmx_fatal.h"
52 #include "mshift.h"
53 #include "main.h"
54 #include "disre.h"
55 #include "orires.h"
56 #include "force.h"
57 #include "nonbonded.h"
58
59 #if !defined GMX_DOUBLE && defined GMX_X86_SSE2
60 #include "gmx_x86_simd_single.h"
61 #define SSE_PROPER_DIHEDRALS
62 #endif
63
64 /* Find a better place for this? */
65 const int cmap_coeff_matrix[] = {
66 1, 0, -3,  2, 0, 0,  0,  0, -3,  0,  9, -6,  2,  0, -6,  4 ,
67 0, 0,  0,  0, 0, 0,  0,  0,  3,  0, -9,  6, -2,  0,  6, -4,
68 0, 0,  0,  0, 0, 0,  0,  0,  0,  0,  9, -6,  0,  0, -6,  4 ,
69 0, 0,  3, -2, 0, 0,  0,  0,  0,  0, -9,  6,  0,  0,  6, -4,
70 0, 0,  0,  0, 1, 0, -3,  2, -2,  0,  6, -4,  1,  0, -3,  2 ,
71 0, 0,  0,  0, 0, 0,  0,  0, -1,  0,  3, -2,  1,  0, -3,  2 ,
72 0, 0,  0,  0, 0, 0,  0,  0,  0,  0, -3,  2,  0,  0,  3, -2,
73 0, 0,  0,  0, 0, 0,  3, -2,  0,  0, -6,  4,  0,  0,  3, -2,
74 0, 1, -2,  1, 0, 0,  0,  0,  0, -3,  6, -3,  0,  2, -4,  2 ,
75 0, 0,  0,  0, 0, 0,  0,  0,  0,  3, -6,  3,  0, -2,  4, -2,
76 0, 0,  0,  0, 0, 0,  0,  0,  0,  0, -3,  3,  0,  0,  2, -2,
77 0, 0, -1,  1, 0, 0,  0,  0,  0,  0,  3, -3,  0,  0, -2,  2 ,
78 0, 0,  0,  0, 0, 1, -2,  1,  0, -2,  4, -2,  0,  1, -2,  1,
79 0, 0,  0,  0, 0, 0,  0,  0,  0, -1,  2, -1,  0,  1, -2,  1,
80 0, 0,  0,  0, 0, 0,  0,  0,  0,  0,  1, -1,  0,  0, -1,  1,
81 0, 0,  0,  0, 0, 0, -1,  1,  0,  0,  2, -2,  0,  0, -1,  1
82 };
83
84
85
86 int glatnr(int *global_atom_index,int i)
87 {
88     int atnr;
89
90     if (global_atom_index == NULL) {
91         atnr = i + 1;
92     } else {
93         atnr = global_atom_index[i] + 1;
94     }
95
96     return atnr;
97 }
98
99 static int pbc_rvec_sub(const t_pbc *pbc,const rvec xi,const rvec xj,rvec dx)
100 {
101   if (pbc) {
102     return pbc_dx_aiuc(pbc,xi,xj,dx);
103   }
104   else {
105     rvec_sub(xi,xj,dx);
106     return CENTRAL;
107   }
108 }
109
110 /*
111  * Morse potential bond by Frank Everdij
112  *
113  * Three parameters needed:
114  *
115  * b0 = equilibrium distance in nm
116  * be = beta in nm^-1 (actually, it's nu_e*Sqrt(2*pi*pi*mu/D_e))
117  * cb = well depth in kJ/mol
118  *
119  * Note: the potential is referenced to be +cb at infinite separation
120  *       and zero at the equilibrium distance!
121  */
122
123 real morse_bonds(int nbonds,
124                  const t_iatom forceatoms[],const t_iparams forceparams[],
125                  const rvec x[],rvec f[],rvec fshift[],
126                  const t_pbc *pbc,const t_graph *g,
127                  real lambda,real *dvdlambda,
128                  const t_mdatoms *md,t_fcdata *fcd,
129                  int *global_atom_index)
130 {
131   const real one=1.0;
132   const real two=2.0;
133   real  dr,dr2,temp,omtemp,cbomtemp,fbond,vbond,fij,vtot;
134   real  b0,be,cb,b0A,beA,cbA,b0B,beB,cbB,L1;
135   rvec  dx;
136   int   i,m,ki,type,ai,aj;
137   ivec  dt;
138
139   vtot = 0.0;
140   for(i=0; (i<nbonds); ) {
141     type = forceatoms[i++];
142     ai   = forceatoms[i++];
143     aj   = forceatoms[i++];
144     
145     b0A   = forceparams[type].morse.b0A;
146     beA   = forceparams[type].morse.betaA;
147     cbA   = forceparams[type].morse.cbA;
148
149     b0B   = forceparams[type].morse.b0B;
150     beB   = forceparams[type].morse.betaB;
151     cbB   = forceparams[type].morse.cbB;
152
153     L1 = one-lambda;                      /* 1 */
154     b0 = L1*b0A + lambda*b0B;             /* 3 */
155     be = L1*beA + lambda*beB;             /* 3 */
156     cb = L1*cbA + lambda*cbB;             /* 3 */
157
158     ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);            /*   3          */
159     dr2  = iprod(dx,dx);                            /*   5          */
160     dr   = dr2*gmx_invsqrt(dr2);                        /*  10          */
161     temp = exp(-be*(dr-b0));                        /*  12          */
162     
163     if (temp == one)
164     {
165         /* bonds are constrainted. This may _not_ include bond constraints if they are lambda dependent */
166         *dvdlambda += cbB-cbA;
167         continue;
168     }
169
170     omtemp   = one-temp;                               /*   1          */
171     cbomtemp = cb*omtemp;                              /*   1          */
172     vbond    = cbomtemp*omtemp;                        /*   1          */
173     fbond    = -two*be*temp*cbomtemp*gmx_invsqrt(dr2); /*   9          */
174     vtot     += vbond;                                 /*   1          */
175
176     *dvdlambda += (cbB - cbA) * omtemp * omtemp - (2-2*omtemp)*omtemp * cb * ((b0B-b0A)*be - (beB-beA)*(dr-b0)); /* 15 */
177     
178     if (g) {
179       ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
180       ki = IVEC2IS(dt);
181     }
182
183     for (m=0; (m<DIM); m++) {                          /*  15          */
184       fij=fbond*dx[m];
185       f[ai][m]+=fij;
186       f[aj][m]-=fij;
187       fshift[ki][m]+=fij;
188       fshift[CENTRAL][m]-=fij;
189     }
190   }                                           /*  83 TOTAL    */
191   return vtot;
192 }
193
194 real cubic_bonds(int nbonds,
195                  const t_iatom forceatoms[],const t_iparams forceparams[],
196                  const rvec x[],rvec f[],rvec fshift[],
197                  const t_pbc *pbc,const t_graph *g,
198                  real lambda,real *dvdlambda,
199                  const t_mdatoms *md,t_fcdata *fcd,
200                  int *global_atom_index)
201 {
202   const real three = 3.0;
203   const real two   = 2.0;
204   real  kb,b0,kcub;
205   real  dr,dr2,dist,kdist,kdist2,fbond,vbond,fij,vtot;
206   rvec  dx;
207   int   i,m,ki,type,ai,aj;
208   ivec  dt;
209
210   vtot = 0.0;
211   for(i=0; (i<nbonds); ) {
212     type = forceatoms[i++];
213     ai   = forceatoms[i++];
214     aj   = forceatoms[i++];
215     
216     b0   = forceparams[type].cubic.b0;
217     kb   = forceparams[type].cubic.kb;
218     kcub = forceparams[type].cubic.kcub;
219
220     ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);                /*   3          */
221     dr2  = iprod(dx,dx);                                /*   5          */
222     
223     if (dr2 == 0.0)
224       continue;
225       
226     dr         = dr2*gmx_invsqrt(dr2);                      /*  10          */
227     dist       = dr-b0;
228     kdist      = kb*dist;
229     kdist2     = kdist*dist;
230     
231     vbond      = kdist2 + kcub*kdist2*dist;
232     fbond      = -(two*kdist + three*kdist2*kcub)/dr;
233
234     vtot      += vbond;       /* 21 */
235     
236     if (g) {
237       ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
238       ki=IVEC2IS(dt);
239     }
240     for (m=0; (m<DIM); m++) {                          /*  15          */
241       fij=fbond*dx[m];
242       f[ai][m]+=fij;
243       f[aj][m]-=fij;
244       fshift[ki][m]+=fij;
245       fshift[CENTRAL][m]-=fij;
246     }
247   }                                           /*  54 TOTAL    */
248   return vtot;
249 }
250
251 real FENE_bonds(int nbonds,
252                 const t_iatom forceatoms[],const t_iparams forceparams[],
253                 const rvec x[],rvec f[],rvec fshift[],
254                 const t_pbc *pbc,const t_graph *g,
255                 real lambda,real *dvdlambda,
256                 const t_mdatoms *md,t_fcdata *fcd,
257                 int *global_atom_index)
258 {
259   const real half=0.5;
260   const real one=1.0;
261   real  bm,kb;
262   real  dr,dr2,bm2,omdr2obm2,fbond,vbond,fij,vtot;
263   rvec  dx;
264   int   i,m,ki,type,ai,aj;
265   ivec  dt;
266
267   vtot = 0.0;
268   for(i=0; (i<nbonds); ) {
269     type = forceatoms[i++];
270     ai   = forceatoms[i++];
271     aj   = forceatoms[i++];
272     
273     bm   = forceparams[type].fene.bm;
274     kb   = forceparams[type].fene.kb;
275
276     ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);            /*   3          */
277     dr2  = iprod(dx,dx);                                /*   5          */
278     
279     if (dr2 == 0.0)
280       continue;
281
282     bm2 = bm*bm;
283
284     if (dr2 >= bm2)
285       gmx_fatal(FARGS,
286                 "r^2 (%f) >= bm^2 (%f) in FENE bond between atoms %d and %d",
287                 dr2,bm2,
288                 glatnr(global_atom_index,ai),
289                 glatnr(global_atom_index,aj));
290       
291     omdr2obm2  = one - dr2/bm2;
292     
293     vbond      = -half*kb*bm2*log(omdr2obm2);
294     fbond      = -kb/omdr2obm2;
295
296     vtot      += vbond;       /* 35 */
297     
298     if (g) {
299       ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
300       ki=IVEC2IS(dt);
301     }
302     for (m=0; (m<DIM); m++) {                          /*  15          */
303       fij=fbond*dx[m];
304       f[ai][m]+=fij;
305       f[aj][m]-=fij;
306       fshift[ki][m]+=fij;
307       fshift[CENTRAL][m]-=fij;
308     }
309   }                                           /*  58 TOTAL    */
310   return vtot;
311 }
312
313 real harmonic(real kA,real kB,real xA,real xB,real x,real lambda,
314               real *V,real *F)
315 {
316   const real half=0.5;
317   real  L1,kk,x0,dx,dx2;
318   real  v,f,dvdlambda;
319   
320   L1    = 1.0-lambda;
321   kk    = L1*kA+lambda*kB;
322   x0    = L1*xA+lambda*xB;
323
324   dx    = x-x0;
325   dx2   = dx*dx;
326
327   f     = -kk*dx;
328   v     = half*kk*dx2;
329   dvdlambda  = half*(kB-kA)*dx2 + (xA-xB)*kk*dx;
330
331   *F    = f;
332   *V    = v;
333
334   return dvdlambda;
335
336   /* That was 19 flops */
337 }
338
339
340 real bonds(int nbonds,
341            const t_iatom forceatoms[],const t_iparams forceparams[],
342            const rvec x[],rvec f[],rvec fshift[],
343            const t_pbc *pbc,const t_graph *g,
344            real lambda,real *dvdlambda,
345            const t_mdatoms *md,t_fcdata *fcd,
346            int *global_atom_index)
347 {
348   int  i,m,ki,ai,aj,type;
349   real dr,dr2,fbond,vbond,fij,vtot;
350   rvec dx;
351   ivec dt;
352
353   vtot = 0.0;
354   for(i=0; (i<nbonds); ) {
355     type = forceatoms[i++];
356     ai   = forceatoms[i++];
357     aj   = forceatoms[i++];
358   
359     ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);    /*   3          */
360     dr2  = iprod(dx,dx);                        /*   5          */
361     dr   = dr2*gmx_invsqrt(dr2);                        /*  10          */
362
363     *dvdlambda += harmonic(forceparams[type].harmonic.krA,
364                            forceparams[type].harmonic.krB,
365                            forceparams[type].harmonic.rA,
366                            forceparams[type].harmonic.rB,
367                            dr,lambda,&vbond,&fbond);  /*  19  */
368
369     if (dr2 == 0.0)
370       continue;
371
372     
373     vtot  += vbond;/* 1*/
374     fbond *= gmx_invsqrt(dr2);                  /*   6          */
375 #ifdef DEBUG
376     if (debug)
377       fprintf(debug,"BONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
378               dr,vbond,fbond);
379 #endif
380     if (g) {
381       ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
382       ki=IVEC2IS(dt);
383     }
384     for (m=0; (m<DIM); m++) {                   /*  15          */
385       fij=fbond*dx[m];
386       f[ai][m]+=fij;
387       f[aj][m]-=fij;
388       fshift[ki][m]+=fij;
389       fshift[CENTRAL][m]-=fij;
390     }
391   }                                     /* 59 TOTAL     */
392   return vtot;
393 }
394
395 real restraint_bonds(int nbonds,
396                      const t_iatom forceatoms[],const t_iparams forceparams[],
397                      const rvec x[],rvec f[],rvec fshift[],
398                      const t_pbc *pbc,const t_graph *g,
399                      real lambda,real *dvdlambda,
400                      const t_mdatoms *md,t_fcdata *fcd,
401                      int *global_atom_index)
402 {
403     int  i,m,ki,ai,aj,type;
404     real dr,dr2,fbond,vbond,fij,vtot;
405     real L1;
406     real low,dlow,up1,dup1,up2,dup2,k,dk;
407     real drh,drh2;
408     rvec dx;
409     ivec dt;
410
411     L1   = 1.0 - lambda;
412
413     vtot = 0.0;
414     for(i=0; (i<nbonds); )
415     {
416         type = forceatoms[i++];
417         ai   = forceatoms[i++];
418         aj   = forceatoms[i++];
419         
420         ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);        /*   3          */
421         dr2  = iprod(dx,dx);                            /*   5          */
422         dr   = dr2*gmx_invsqrt(dr2);                    /*  10          */
423
424         low  = L1*forceparams[type].restraint.lowA + lambda*forceparams[type].restraint.lowB;
425         dlow =   -forceparams[type].restraint.lowA +        forceparams[type].restraint.lowB;
426         up1  = L1*forceparams[type].restraint.up1A + lambda*forceparams[type].restraint.up1B;
427         dup1 =   -forceparams[type].restraint.up1A +        forceparams[type].restraint.up1B;
428         up2  = L1*forceparams[type].restraint.up2A + lambda*forceparams[type].restraint.up2B;
429         dup2 =   -forceparams[type].restraint.up2A +        forceparams[type].restraint.up2B;
430         k    = L1*forceparams[type].restraint.kA   + lambda*forceparams[type].restraint.kB;
431         dk   =   -forceparams[type].restraint.kA   +        forceparams[type].restraint.kB;
432         /* 24 */
433
434         if (dr < low)
435         {
436             drh   = dr - low;
437             drh2  = drh*drh;
438             vbond = 0.5*k*drh2;
439             fbond = -k*drh;
440             *dvdlambda += 0.5*dk*drh2 - k*dlow*drh;
441         } /* 11 */
442         else if (dr <= up1)
443         {
444             vbond = 0;
445             fbond = 0;
446         }
447         else if (dr <= up2)
448         {
449             drh   = dr - up1;
450             drh2  = drh*drh;
451             vbond = 0.5*k*drh2;
452             fbond = -k*drh;
453             *dvdlambda += 0.5*dk*drh2 - k*dup1*drh;
454         } /* 11 */
455         else
456         {
457             drh   = dr - up2;
458             vbond = k*(up2 - up1)*(0.5*(up2 - up1) + drh);
459             fbond = -k*(up2 - up1);
460             *dvdlambda += dk*(up2 - up1)*(0.5*(up2 - up1) + drh)
461                 + k*(dup2 - dup1)*(up2 - up1 + drh)
462                 - k*(up2 - up1)*dup2;
463         }
464    
465         if (dr2 == 0.0)
466             continue;
467         
468         vtot  += vbond;/* 1*/
469         fbond *= gmx_invsqrt(dr2);                      /*   6          */
470 #ifdef DEBUG
471         if (debug)
472             fprintf(debug,"BONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
473                     dr,vbond,fbond);
474 #endif
475         if (g) {
476             ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
477             ki=IVEC2IS(dt);
478         }
479         for (m=0; (m<DIM); m++) {                       /*  15          */
480             fij=fbond*dx[m];
481             f[ai][m]+=fij;
482             f[aj][m]-=fij;
483             fshift[ki][m]+=fij;
484             fshift[CENTRAL][m]-=fij;
485         }
486     }                                   /* 59 TOTAL     */
487
488     return vtot;
489 }
490
491 real polarize(int nbonds,
492               const t_iatom forceatoms[],const t_iparams forceparams[],
493               const rvec x[],rvec f[],rvec fshift[],
494               const t_pbc *pbc,const t_graph *g,
495               real lambda,real *dvdlambda,
496               const t_mdatoms *md,t_fcdata *fcd,
497               int *global_atom_index)
498 {
499   int  i,m,ki,ai,aj,type;
500   real dr,dr2,fbond,vbond,fij,vtot,ksh;
501   rvec dx;
502   ivec dt;
503
504   vtot = 0.0;
505   for(i=0; (i<nbonds); ) {
506     type = forceatoms[i++];
507     ai   = forceatoms[i++];
508     aj   = forceatoms[i++];
509     ksh  = sqr(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].polarize.alpha;
510     if (debug)
511       fprintf(debug,"POL: local ai = %d aj = %d ksh = %.3f\n",ai,aj,ksh);
512   
513     ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);    /*   3          */
514     dr2  = iprod(dx,dx);                        /*   5          */
515     dr   = dr2*gmx_invsqrt(dr2);                        /*  10          */
516
517     *dvdlambda += harmonic(ksh,ksh,0,0,dr,lambda,&vbond,&fbond);  /*  19  */
518
519     if (dr2 == 0.0)
520       continue;
521     
522     vtot  += vbond;/* 1*/
523     fbond *= gmx_invsqrt(dr2);                  /*   6          */
524
525     if (g) {
526       ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
527       ki=IVEC2IS(dt);
528     }
529     for (m=0; (m<DIM); m++) {                   /*  15          */
530       fij=fbond*dx[m];
531       f[ai][m]+=fij;
532       f[aj][m]-=fij;
533       fshift[ki][m]+=fij;
534       fshift[CENTRAL][m]-=fij;
535     }
536   }                                     /* 59 TOTAL     */
537   return vtot;
538 }
539
540 real anharm_polarize(int nbonds,
541                      const t_iatom forceatoms[],const t_iparams forceparams[],
542                      const rvec x[],rvec f[],rvec fshift[],
543                      const t_pbc *pbc,const t_graph *g,
544                      real lambda,real *dvdlambda,
545                      const t_mdatoms *md,t_fcdata *fcd,
546                      int *global_atom_index)
547 {
548   int  i,m,ki,ai,aj,type;
549   real dr,dr2,fbond,vbond,fij,vtot,ksh,khyp,drcut,ddr,ddr3;
550   rvec dx;
551   ivec dt;
552
553   vtot = 0.0;
554   for(i=0; (i<nbonds); ) {
555     type  = forceatoms[i++];
556     ai    = forceatoms[i++];
557     aj    = forceatoms[i++];
558     ksh   = sqr(md->chargeA[aj])*ONE_4PI_EPS0/forceparams[type].anharm_polarize.alpha; /* 7*/
559     khyp  = forceparams[type].anharm_polarize.khyp;
560     drcut = forceparams[type].anharm_polarize.drcut;
561     if (debug)
562       fprintf(debug,"POL: local ai = %d aj = %d ksh = %.3f\n",ai,aj,ksh);
563   
564     ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);    /*   3          */
565     dr2  = iprod(dx,dx);                        /*   5          */
566     dr   = dr2*gmx_invsqrt(dr2);                        /*  10          */
567
568     *dvdlambda += harmonic(ksh,ksh,0,0,dr,lambda,&vbond,&fbond);  /*  19  */
569
570     if (dr2 == 0.0)
571       continue;
572     
573     if (dr > drcut) {
574         ddr    = dr-drcut;
575         ddr3   = ddr*ddr*ddr;
576         vbond += khyp*ddr*ddr3;
577         fbond -= 4*khyp*ddr3;
578     }
579     fbond *= gmx_invsqrt(dr2);                  /*   6          */
580     vtot  += vbond;/* 1*/
581
582     if (g) {
583       ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
584       ki=IVEC2IS(dt);
585     }
586     for (m=0; (m<DIM); m++) {                   /*  15          */
587       fij=fbond*dx[m];
588       f[ai][m]+=fij;
589       f[aj][m]-=fij;
590       fshift[ki][m]+=fij;
591       fshift[CENTRAL][m]-=fij;
592     }
593   }                                     /* 72 TOTAL     */
594   return vtot;
595 }
596
597 real water_pol(int nbonds,
598                const t_iatom forceatoms[],const t_iparams forceparams[],
599                const rvec x[],rvec f[],rvec fshift[],
600                const t_pbc *pbc,const t_graph *g,
601                real lambda,real *dvdlambda,
602                const t_mdatoms *md,t_fcdata *fcd,
603                int *global_atom_index)
604 {
605   /* This routine implements anisotropic polarizibility for water, through
606    * a shell connected to a dummy with spring constant that differ in the
607    * three spatial dimensions in the molecular frame.
608    */
609   int  i,m,aO,aH1,aH2,aD,aS,type,type0;
610   rvec dOH1,dOH2,dHH,dOD,dDS,nW,kk,dx,kdx,proj;
611 #ifdef DEBUG
612   rvec df;
613 #endif
614   real vtot,fij,r_HH,r_OD,r_nW,tx,ty,tz,qS;
615
616   vtot = 0.0;
617   if (nbonds > 0) {
618     type0  = forceatoms[0];
619     aS     = forceatoms[5];
620     qS     = md->chargeA[aS];
621     kk[XX] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_x;
622     kk[YY] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_y;
623     kk[ZZ] = sqr(qS)*ONE_4PI_EPS0/forceparams[type0].wpol.al_z;
624     r_HH   = 1.0/forceparams[type0].wpol.rHH;
625     r_OD   = 1.0/forceparams[type0].wpol.rOD;
626     if (debug) {
627       fprintf(debug,"WPOL: qS  = %10.5f aS = %5d\n",qS,aS);
628       fprintf(debug,"WPOL: kk  = %10.3f        %10.3f        %10.3f\n",
629               kk[XX],kk[YY],kk[ZZ]);
630       fprintf(debug,"WPOL: rOH = %10.3f  rHH = %10.3f  rOD = %10.3f\n",
631               forceparams[type0].wpol.rOH,
632               forceparams[type0].wpol.rHH,
633               forceparams[type0].wpol.rOD);
634     }
635     for(i=0; (i<nbonds); i+=6) {
636       type = forceatoms[i];
637       if (type != type0)
638         gmx_fatal(FARGS,"Sorry, type = %d, type0 = %d, file = %s, line = %d",
639                     type,type0,__FILE__,__LINE__);
640       aO   = forceatoms[i+1];
641       aH1  = forceatoms[i+2];
642       aH2  = forceatoms[i+3];
643       aD   = forceatoms[i+4];
644       aS   = forceatoms[i+5];
645       
646       /* Compute vectors describing the water frame */
647       rvec_sub(x[aH1],x[aO], dOH1);
648       rvec_sub(x[aH2],x[aO], dOH2);
649       rvec_sub(x[aH2],x[aH1],dHH);
650       rvec_sub(x[aD], x[aO], dOD);
651       rvec_sub(x[aS], x[aD], dDS);
652       cprod(dOH1,dOH2,nW);
653       
654       /* Compute inverse length of normal vector 
655        * (this one could be precomputed, but I'm too lazy now)
656        */
657       r_nW = gmx_invsqrt(iprod(nW,nW));
658       /* This is for precision, but does not make a big difference,
659        * it can go later.
660        */
661       r_OD = gmx_invsqrt(iprod(dOD,dOD)); 
662       
663       /* Normalize the vectors in the water frame */
664       svmul(r_nW,nW,nW);
665       svmul(r_HH,dHH,dHH);
666       svmul(r_OD,dOD,dOD);
667       
668       /* Compute displacement of shell along components of the vector */
669       dx[ZZ] = iprod(dDS,dOD);
670       /* Compute projection on the XY plane: dDS - dx[ZZ]*dOD */
671       for(m=0; (m<DIM); m++)
672         proj[m] = dDS[m]-dx[ZZ]*dOD[m];
673       
674       /*dx[XX] = iprod(dDS,nW);
675         dx[YY] = iprod(dDS,dHH);*/
676       dx[XX] = iprod(proj,nW);
677       for(m=0; (m<DIM); m++)
678         proj[m] -= dx[XX]*nW[m];
679       dx[YY] = iprod(proj,dHH);
680       /*#define DEBUG*/
681 #ifdef DEBUG
682       if (debug) {
683         fprintf(debug,"WPOL: dx2=%10g  dy2=%10g  dz2=%10g  sum=%10g  dDS^2=%10g\n",
684                 sqr(dx[XX]),sqr(dx[YY]),sqr(dx[ZZ]),iprod(dx,dx),iprod(dDS,dDS));
685         fprintf(debug,"WPOL: dHH=(%10g,%10g,%10g)\n",dHH[XX],dHH[YY],dHH[ZZ]);
686         fprintf(debug,"WPOL: dOD=(%10g,%10g,%10g), 1/r_OD = %10g\n",
687                 dOD[XX],dOD[YY],dOD[ZZ],1/r_OD);
688         fprintf(debug,"WPOL: nW =(%10g,%10g,%10g), 1/r_nW = %10g\n",
689                 nW[XX],nW[YY],nW[ZZ],1/r_nW);
690         fprintf(debug,"WPOL: dx  =%10g, dy  =%10g, dz  =%10g\n",
691                 dx[XX],dx[YY],dx[ZZ]);
692         fprintf(debug,"WPOL: dDSx=%10g, dDSy=%10g, dDSz=%10g\n",
693                 dDS[XX],dDS[YY],dDS[ZZ]);
694       }
695 #endif
696       /* Now compute the forces and energy */
697       kdx[XX] = kk[XX]*dx[XX];
698       kdx[YY] = kk[YY]*dx[YY];
699       kdx[ZZ] = kk[ZZ]*dx[ZZ];
700       vtot   += iprod(dx,kdx);
701       for(m=0; (m<DIM); m++) {
702         /* This is a tensor operation but written out for speed */
703         tx        =  nW[m]*kdx[XX];
704         ty        = dHH[m]*kdx[YY];
705         tz        = dOD[m]*kdx[ZZ];
706         fij       = -tx-ty-tz;
707 #ifdef DEBUG
708         df[m] = fij;
709 #endif
710         f[aS][m] += fij;
711         f[aD][m] -= fij;
712       }
713 #ifdef DEBUG
714       if (debug) {
715         fprintf(debug,"WPOL: vwpol=%g\n",0.5*iprod(dx,kdx));
716         fprintf(debug,"WPOL: df = (%10g, %10g, %10g)\n",df[XX],df[YY],df[ZZ]);
717       }
718 #endif
719     }   
720   }
721   return 0.5*vtot;
722 }
723
724 static real do_1_thole(const rvec xi,const rvec xj,rvec fi,rvec fj,
725                        const t_pbc *pbc,real qq,
726                        rvec fshift[],real afac)
727 {
728   rvec r12;
729   real r12sq,r12_1,r12n,r12bar,v0,v1,fscal,ebar,fff;
730   int  m,t;
731     
732   t      = pbc_rvec_sub(pbc,xi,xj,r12); /*  3 */
733   
734   r12sq  = iprod(r12,r12);              /*  5 */
735   r12_1  = gmx_invsqrt(r12sq);              /*  5 */
736   r12bar = afac/r12_1;                  /*  5 */
737   v0     = qq*ONE_4PI_EPS0*r12_1;       /*  2 */
738   ebar   = exp(-r12bar);                /*  5 */
739   v1     = (1-(1+0.5*r12bar)*ebar);     /*  4 */
740   fscal  = ((v0*r12_1)*v1 - v0*0.5*afac*ebar*(r12bar+1))*r12_1; /* 9 */
741   if (debug)
742     fprintf(debug,"THOLE: v0 = %.3f v1 = %.3f r12= % .3f r12bar = %.3f fscal = %.3f  ebar = %.3f\n",v0,v1,1/r12_1,r12bar,fscal,ebar);
743   
744   for(m=0; (m<DIM); m++) {
745     fff    = fscal*r12[m];
746     fi[m] += fff;
747     fj[m] -= fff;             
748     fshift[t][m]       += fff;
749     fshift[CENTRAL][m] -= fff;
750   } /* 15 */
751   
752   return v0*v1; /* 1 */
753   /* 54 */
754 }
755
756 real thole_pol(int nbonds,
757                const t_iatom forceatoms[],const t_iparams forceparams[],
758                const rvec x[],rvec f[],rvec fshift[],
759                const t_pbc *pbc,const t_graph *g,
760                real lambda,real *dvdlambda,
761                const t_mdatoms *md,t_fcdata *fcd,
762                int *global_atom_index)
763 {
764   /* Interaction between two pairs of particles with opposite charge */
765   int i,type,a1,da1,a2,da2;
766   real q1,q2,qq,a,al1,al2,afac;
767   real V=0;
768   
769   for(i=0; (i<nbonds); ) {
770     type  = forceatoms[i++];
771     a1    = forceatoms[i++];
772     da1   = forceatoms[i++];
773     a2    = forceatoms[i++];
774     da2   = forceatoms[i++];
775     q1    = md->chargeA[da1];
776     q2    = md->chargeA[da2];
777     a     = forceparams[type].thole.a;
778     al1   = forceparams[type].thole.alpha1;
779     al2   = forceparams[type].thole.alpha2;
780     qq    = q1*q2;
781     afac  = a*pow(al1*al2,-1.0/6.0);
782     V += do_1_thole(x[a1], x[a2], f[a1], f[a2], pbc, qq,fshift,afac);
783     V += do_1_thole(x[da1],x[a2], f[da1],f[a2], pbc,-qq,fshift,afac);
784     V += do_1_thole(x[a1], x[da2],f[a1], f[da2],pbc,-qq,fshift,afac);
785     V += do_1_thole(x[da1],x[da2],f[da1],f[da2],pbc, qq,fshift,afac);
786   }
787   /* 290 flops */
788   return V;
789 }
790
791 real bond_angle(const rvec xi,const rvec xj,const rvec xk,const t_pbc *pbc,
792                 rvec r_ij,rvec r_kj,real *costh,
793                 int *t1,int *t2)
794 /* Return value is the angle between the bonds i-j and j-k */
795 {
796   /* 41 FLOPS */
797   real th;
798   
799   *t1 = pbc_rvec_sub(pbc,xi,xj,r_ij);                   /*  3           */
800   *t2 = pbc_rvec_sub(pbc,xk,xj,r_kj);                   /*  3           */
801
802   *costh=cos_angle(r_ij,r_kj);          /* 25           */
803   th=acos(*costh);                      /* 10           */
804                                         /* 41 TOTAL     */
805   return th;
806 }
807
808 real angles(int nbonds,
809             const t_iatom forceatoms[],const t_iparams forceparams[],
810             const rvec x[],rvec f[],rvec fshift[],
811             const t_pbc *pbc,const t_graph *g,
812             real lambda,real *dvdlambda,
813             const t_mdatoms *md,t_fcdata *fcd,
814             int *global_atom_index)
815 {
816     int  i,ai,aj,ak,t1,t2,type;
817     rvec r_ij,r_kj;
818     real cos_theta,cos_theta2,theta,dVdt,va,vtot;
819     ivec jt,dt_ij,dt_kj;
820
821     vtot = 0.0;
822     for(i=0; i<nbonds; )
823     {
824         type = forceatoms[i++];
825         ai   = forceatoms[i++];
826         aj   = forceatoms[i++];
827         ak   = forceatoms[i++];
828
829         theta  = bond_angle(x[ai],x[aj],x[ak],pbc,
830                             r_ij,r_kj,&cos_theta,&t1,&t2);      /*  41          */
831   
832         *dvdlambda += harmonic(forceparams[type].harmonic.krA,
833                                forceparams[type].harmonic.krB,
834                                forceparams[type].harmonic.rA*DEG2RAD,
835                                forceparams[type].harmonic.rB*DEG2RAD,
836                                theta,lambda,&va,&dVdt);  /*  21  */
837         vtot += va;
838
839         cos_theta2 = sqr(cos_theta);
840         if (cos_theta2 < 1)
841         {
842             int  m;
843             real st,sth;
844             real cik,cii,ckk;
845             real nrkj2,nrij2;
846             real nrkj_1,nrij_1;
847             rvec f_i,f_j,f_k;
848
849             st  = dVdt*gmx_invsqrt(1 - cos_theta2);     /*  12          */
850             sth = st*cos_theta;                 /*   1          */
851 #ifdef DEBUG
852             if (debug)
853                 fprintf(debug,"ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
854                         theta*RAD2DEG,va,dVdt);
855 #endif
856             nrij2 = iprod(r_ij,r_ij);                   /*   5          */
857             nrkj2 = iprod(r_kj,r_kj);                   /*   5          */
858
859             nrij_1 = gmx_invsqrt(nrij2);                /*  10          */
860             nrkj_1 = gmx_invsqrt(nrkj2);                /*  10          */
861
862             cik = st*nrij_1*nrkj_1;                     /*   2          */
863             cii = sth*nrij_1*nrij_1;                    /*   2          */
864             ckk = sth*nrkj_1*nrkj_1;                    /*   2          */
865       
866             for (m=0; m<DIM; m++)
867             {                   /*  39          */
868                 f_i[m]    = -(cik*r_kj[m] - cii*r_ij[m]);
869                 f_k[m]    = -(cik*r_ij[m] - ckk*r_kj[m]);
870                 f_j[m]    = -f_i[m] - f_k[m];
871                 f[ai][m] += f_i[m];
872                 f[aj][m] += f_j[m];
873                 f[ak][m] += f_k[m];
874             }
875             if (g != NULL)
876             {
877                 copy_ivec(SHIFT_IVEC(g,aj),jt);
878
879                 ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
880                 ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
881                 t1 = IVEC2IS(dt_ij);
882                 t2 = IVEC2IS(dt_kj);
883             }
884             rvec_inc(fshift[t1],f_i);
885             rvec_inc(fshift[CENTRAL],f_j);
886             rvec_inc(fshift[t2],f_k);
887         }                                           /* 161 TOTAL        */
888     }
889
890     return vtot;
891 }
892
893 real linear_angles(int nbonds,
894                    const t_iatom forceatoms[],const t_iparams forceparams[],
895                    const rvec x[],rvec f[],rvec fshift[],
896                    const t_pbc *pbc,const t_graph *g,
897                    real lambda,real *dvdlambda,
898                    const t_mdatoms *md,t_fcdata *fcd,
899                    int *global_atom_index)
900 {
901   int  i,m,ai,aj,ak,t1,t2,type;
902   rvec f_i,f_j,f_k;
903   real L1,kA,kB,aA,aB,dr,dr2,va,vtot,a,b,klin;
904   ivec jt,dt_ij,dt_kj;
905   rvec r_ij,r_kj,r_ik,dx;
906     
907   L1   = 1-lambda;
908   vtot = 0.0;
909   for(i=0; (i<nbonds); ) {
910     type = forceatoms[i++];
911     ai   = forceatoms[i++];
912     aj   = forceatoms[i++];
913     ak   = forceatoms[i++];
914     
915     kA = forceparams[type].linangle.klinA;
916     kB = forceparams[type].linangle.klinB;
917     klin = L1*kA + lambda*kB;
918     
919     aA   = forceparams[type].linangle.aA;
920     aB   = forceparams[type].linangle.aB;
921     a    = L1*aA+lambda*aB;
922     b    = 1-a;
923     
924     t1 = pbc_rvec_sub(pbc,x[ai],x[aj],r_ij);
925     t2 = pbc_rvec_sub(pbc,x[ak],x[aj],r_kj);
926     rvec_sub(r_ij,r_kj,r_ik);
927     
928     dr2 = 0;
929     for(m=0; (m<DIM); m++) 
930     {
931         dr     = - a * r_ij[m] - b * r_kj[m];
932         dr2   += dr*dr;
933         dx[m]  = dr;
934         f_i[m] = a*klin*dr;
935         f_k[m] = b*klin*dr;
936         f_j[m] = -(f_i[m]+f_k[m]);
937         f[ai][m] += f_i[m];
938         f[aj][m] += f_j[m];
939         f[ak][m] += f_k[m];
940     }
941     va    = 0.5*klin*dr2;
942     *dvdlambda += 0.5*(kB-kA)*dr2 + klin*(aB-aA)*iprod(dx,r_ik); 
943             
944     vtot += va;
945     
946     if (g) {
947         copy_ivec(SHIFT_IVEC(g,aj),jt);
948       
949         ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
950         ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
951         t1=IVEC2IS(dt_ij);
952         t2=IVEC2IS(dt_kj);
953     }
954     rvec_inc(fshift[t1],f_i);
955     rvec_inc(fshift[CENTRAL],f_j);
956     rvec_inc(fshift[t2],f_k);
957   }                                           /* 57 TOTAL       */
958   return vtot;
959 }
960
961 real urey_bradley(int nbonds,
962                   const t_iatom forceatoms[],const t_iparams forceparams[],
963                   const rvec x[],rvec f[],rvec fshift[],
964                   const t_pbc *pbc,const t_graph *g,
965                   real lambda,real *dvdlambda,
966                   const t_mdatoms *md,t_fcdata *fcd,
967                   int *global_atom_index)
968 {
969   int  i,m,ai,aj,ak,t1,t2,type,ki;
970   rvec r_ij,r_kj,r_ik;
971   real cos_theta,cos_theta2,theta;
972   real dVdt,va,vtot,dr,dr2,vbond,fbond,fik;
973   real kthA,th0A,kUBA,r13A,kthB,th0B,kUBB,r13B;
974   ivec jt,dt_ij,dt_kj,dt_ik;
975   
976   vtot = 0.0;
977   for(i=0; (i<nbonds); ) {
978     type = forceatoms[i++];
979     ai   = forceatoms[i++];
980     aj   = forceatoms[i++];
981     ak   = forceatoms[i++];
982     th0A  = forceparams[type].u_b.thetaA*DEG2RAD;
983     kthA  = forceparams[type].u_b.kthetaA;
984     r13A  = forceparams[type].u_b.r13A;
985     kUBA  = forceparams[type].u_b.kUBA;
986     th0B  = forceparams[type].u_b.thetaB*DEG2RAD;
987     kthB  = forceparams[type].u_b.kthetaB;
988     r13B  = forceparams[type].u_b.r13B;
989     kUBB  = forceparams[type].u_b.kUBB;
990     
991     theta  = bond_angle(x[ai],x[aj],x[ak],pbc,
992                         r_ij,r_kj,&cos_theta,&t1,&t2);  /*  41          */
993   
994     *dvdlambda += harmonic(kthA,kthB,th0A,th0B,theta,lambda,&va,&dVdt);  /*  21  */
995     vtot += va;
996     
997     ki   = pbc_rvec_sub(pbc,x[ai],x[ak],r_ik);  /*   3          */
998     dr2  = iprod(r_ik,r_ik);                    /*   5          */
999     dr   = dr2*gmx_invsqrt(dr2);                        /*  10          */
1000
1001     *dvdlambda += harmonic(kUBA,kUBB,r13A,r13B,dr,lambda,&vbond,&fbond); /*  19  */
1002
1003     cos_theta2 = sqr(cos_theta);                /*   1          */
1004     if (cos_theta2 < 1) {
1005       real st,sth;
1006       real cik,cii,ckk;
1007       real nrkj2,nrij2;
1008       rvec f_i,f_j,f_k;
1009       
1010       st  = dVdt*gmx_invsqrt(1 - cos_theta2);   /*  12          */
1011       sth = st*cos_theta;                       /*   1          */
1012 #ifdef DEBUG
1013       if (debug)
1014         fprintf(debug,"ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
1015                 theta*RAD2DEG,va,dVdt);
1016 #endif
1017       nrkj2=iprod(r_kj,r_kj);                   /*   5          */
1018       nrij2=iprod(r_ij,r_ij);
1019       
1020       cik=st*gmx_invsqrt(nrkj2*nrij2);          /*  12          */ 
1021       cii=sth/nrij2;                            /*  10          */
1022       ckk=sth/nrkj2;                            /*  10          */
1023       
1024       for (m=0; (m<DIM); m++) {                 /*  39          */
1025         f_i[m]=-(cik*r_kj[m]-cii*r_ij[m]);
1026         f_k[m]=-(cik*r_ij[m]-ckk*r_kj[m]);
1027         f_j[m]=-f_i[m]-f_k[m];
1028         f[ai][m]+=f_i[m];
1029         f[aj][m]+=f_j[m];
1030         f[ak][m]+=f_k[m];
1031       }
1032       if (g) {
1033         copy_ivec(SHIFT_IVEC(g,aj),jt);
1034       
1035         ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
1036         ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
1037         t1=IVEC2IS(dt_ij);
1038         t2=IVEC2IS(dt_kj);
1039       }
1040       rvec_inc(fshift[t1],f_i);
1041       rvec_inc(fshift[CENTRAL],f_j);
1042       rvec_inc(fshift[t2],f_k);
1043     }                                           /* 161 TOTAL    */
1044     /* Time for the bond calculations */
1045     if (dr2 == 0.0)
1046       continue;
1047
1048     vtot  += vbond;  /* 1*/
1049     fbond *= gmx_invsqrt(dr2);                  /*   6          */
1050     
1051     if (g) {
1052       ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,ak),dt_ik);
1053       ki=IVEC2IS(dt_ik);
1054     }
1055     for (m=0; (m<DIM); m++) {                   /*  15          */
1056       fik=fbond*r_ik[m];
1057       f[ai][m]+=fik;
1058       f[ak][m]-=fik;
1059       fshift[ki][m]+=fik;
1060       fshift[CENTRAL][m]-=fik;
1061     }
1062   }
1063   return vtot;
1064 }
1065
1066 real quartic_angles(int nbonds,
1067                     const t_iatom forceatoms[],const t_iparams forceparams[],
1068                     const rvec x[],rvec f[],rvec fshift[],
1069                     const t_pbc *pbc,const t_graph *g,
1070                     real lambda,real *dvdlambda,
1071                     const t_mdatoms *md,t_fcdata *fcd,
1072                     int *global_atom_index)
1073 {
1074   int  i,j,ai,aj,ak,t1,t2,type;
1075   rvec r_ij,r_kj;
1076   real cos_theta,cos_theta2,theta,dt,dVdt,va,dtp,c,vtot;
1077   ivec jt,dt_ij,dt_kj;
1078   
1079   vtot = 0.0;
1080   for(i=0; (i<nbonds); ) {
1081     type = forceatoms[i++];
1082     ai   = forceatoms[i++];
1083     aj   = forceatoms[i++];
1084     ak   = forceatoms[i++];
1085
1086     theta  = bond_angle(x[ai],x[aj],x[ak],pbc,
1087                         r_ij,r_kj,&cos_theta,&t1,&t2);  /*  41          */
1088
1089     dt = theta - forceparams[type].qangle.theta*DEG2RAD; /* 2          */
1090
1091     dVdt = 0;
1092     va = forceparams[type].qangle.c[0];
1093     dtp = 1.0;
1094     for(j=1; j<=4; j++) {
1095       c = forceparams[type].qangle.c[j];
1096       dVdt -= j*c*dtp;
1097       dtp *= dt;
1098       va += c*dtp;
1099     }
1100     /* 20 */
1101
1102     vtot += va;
1103     
1104     cos_theta2 = sqr(cos_theta);                /*   1          */
1105     if (cos_theta2 < 1) {
1106       int  m;
1107       real st,sth;
1108       real cik,cii,ckk;
1109       real nrkj2,nrij2;
1110       rvec f_i,f_j,f_k;
1111       
1112       st  = dVdt*gmx_invsqrt(1 - cos_theta2);           /*  12          */
1113       sth = st*cos_theta;                       /*   1          */
1114 #ifdef DEBUG
1115       if (debug)
1116         fprintf(debug,"ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
1117                 theta*RAD2DEG,va,dVdt);
1118 #endif
1119       nrkj2=iprod(r_kj,r_kj);                   /*   5          */
1120       nrij2=iprod(r_ij,r_ij);
1121       
1122       cik=st*gmx_invsqrt(nrkj2*nrij2);          /*  12          */ 
1123       cii=sth/nrij2;                            /*  10          */
1124       ckk=sth/nrkj2;                            /*  10          */
1125       
1126       for (m=0; (m<DIM); m++) {                 /*  39          */
1127         f_i[m]=-(cik*r_kj[m]-cii*r_ij[m]);
1128         f_k[m]=-(cik*r_ij[m]-ckk*r_kj[m]);
1129         f_j[m]=-f_i[m]-f_k[m];
1130         f[ai][m]+=f_i[m];
1131         f[aj][m]+=f_j[m];
1132         f[ak][m]+=f_k[m];
1133       }
1134       if (g) {
1135         copy_ivec(SHIFT_IVEC(g,aj),jt);
1136       
1137         ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
1138         ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
1139         t1=IVEC2IS(dt_ij);
1140         t2=IVEC2IS(dt_kj);
1141       }
1142       rvec_inc(fshift[t1],f_i);
1143       rvec_inc(fshift[CENTRAL],f_j);
1144       rvec_inc(fshift[t2],f_k);
1145     }                                           /* 153 TOTAL    */
1146   }
1147   return vtot;
1148 }
1149
1150 real dih_angle(const rvec xi,const rvec xj,const rvec xk,const rvec xl,
1151                const t_pbc *pbc,
1152                rvec r_ij,rvec r_kj,rvec r_kl,rvec m,rvec n,
1153                real *sign,int *t1,int *t2,int *t3)
1154 {
1155   real ipr,phi;
1156
1157   *t1 = pbc_rvec_sub(pbc,xi,xj,r_ij);                   /*  3           */
1158   *t2 = pbc_rvec_sub(pbc,xk,xj,r_kj);                   /*  3           */
1159   *t3 = pbc_rvec_sub(pbc,xk,xl,r_kl);                   /*  3           */
1160
1161   cprod(r_ij,r_kj,m);                   /*  9           */
1162   cprod(r_kj,r_kl,n);                   /*  9           */
1163   phi=gmx_angle(m,n);                   /* 49 (assuming 25 for atan2) */
1164   ipr=iprod(r_ij,n);                    /*  5           */
1165   (*sign)=(ipr<0.0)?-1.0:1.0;
1166   phi=(*sign)*phi;                      /*  1           */
1167                                         /* 82 TOTAL     */
1168   return phi;
1169 }
1170
1171
1172 #ifdef SSE_PROPER_DIHEDRALS
1173
1174 /* x86 SIMD inner-product of 4 float vectors */
1175 #define GMX_MM_IPROD_PS(ax,ay,az,bx,by,bz)                 \
1176     _mm_add_ps(_mm_add_ps(_mm_mul_ps(ax,bx),_mm_mul_ps(ay,by)),_mm_mul_ps(az,bz))
1177
1178 /* x86 SIMD norm^2 of 4 float vectors */
1179 #define GMX_MM_NORM2_PS(ax,ay,az) GMX_MM_IPROD_PS(ax,ay,az,ax,ay,az)
1180
1181 /* x86 SIMD cross-product of 4 float vectors */
1182 #define GMX_MM_CPROD_PS(ax,ay,az,bx,by,bz,cx,cy,cz)        \
1183 {                                                          \
1184     cx = _mm_sub_ps(_mm_mul_ps(ay,bz),_mm_mul_ps(az,by));  \
1185     cy = _mm_sub_ps(_mm_mul_ps(az,bx),_mm_mul_ps(ax,bz));  \
1186     cz = _mm_sub_ps(_mm_mul_ps(ax,by),_mm_mul_ps(ay,bx));  \
1187 }
1188
1189 /* load 4 rvec's into 3 x86 SIMD float registers */
1190 #define load_rvec4(r0,r1,r2,r3,rx_SSE,ry_SSE,rz_SSE)          \
1191 {                                                             \
1192     __m128 tmp;                                               \
1193     rx_SSE = _mm_load_ps(r0);                                 \
1194     ry_SSE = _mm_load_ps(r1);                                 \
1195     rz_SSE = _mm_load_ps(r2);                                 \
1196     tmp    = _mm_load_ps(r3);                                 \
1197     _MM_TRANSPOSE4_PS(rx_SSE,ry_SSE,rz_SSE,tmp);              \
1198 }
1199
1200 #define store_rvec4(rx_SSE,ry_SSE,rz_SSE,r0,r1,r2,r3)         \
1201 {                                                             \
1202     __m128 tmp=_mm_setzero_ps();                              \
1203     _MM_TRANSPOSE4_PS(rx_SSE,ry_SSE,rz_SSE,tmp);              \
1204     _mm_store_ps(r0,rx_SSE);                                  \
1205     _mm_store_ps(r1,ry_SSE);                                  \
1206     _mm_store_ps(r2,rz_SSE);                                  \
1207     _mm_store_ps(r3,tmp   );                                  \
1208 }
1209
1210 /* An rvec in a structure which can be allocated 16-byte aligned */
1211 typedef struct {
1212     rvec  v;
1213     float f;
1214 } rvec_sse_t;
1215
1216 /* As dih_angle above, but calculates 4 dihedral angles at once using SSE,
1217  * also calculates the pre-factor required for the dihedral force update.
1218  * Note that bv and buf should be 16-byte aligned.
1219  */
1220 static void
1221 dih_angle_sse(const rvec *x,
1222               int ai[4],int aj[4],int ak[4],int al[4],
1223               const t_pbc *pbc,
1224               int t1[4],int t2[4],int t3[4],
1225               rvec_sse_t *bv,
1226               real *buf)
1227 {
1228     int s;
1229     __m128 rijx_SSE,rijy_SSE,rijz_SSE;
1230     __m128 rkjx_SSE,rkjy_SSE,rkjz_SSE;
1231     __m128 rklx_SSE,rkly_SSE,rklz_SSE;
1232     __m128 mx_SSE,my_SSE,mz_SSE;
1233     __m128 nx_SSE,ny_SSE,nz_SSE;
1234     __m128 cx_SSE,cy_SSE,cz_SSE;
1235     __m128 cn_SSE;
1236     __m128 s_SSE;
1237     __m128 phi_SSE;
1238     __m128 ipr_SSE;
1239     int signs;
1240     __m128 iprm_SSE,iprn_SSE;
1241     __m128 nrkj2_SSE,nrkj_1_SSE,nrkj_2_SSE,nrkj_SSE;
1242     __m128 nrkj_m2_SSE,nrkj_n2_SSE;
1243     __m128 p_SSE,q_SSE;
1244     __m128 fmin_SSE=_mm_set1_ps(GMX_FLOAT_MIN);
1245
1246     for(s=0; s<4; s++)
1247     {
1248         t1[s] = pbc_rvec_sub(pbc,x[ai[s]],x[aj[s]],bv[0+s].v);
1249         t2[s] = pbc_rvec_sub(pbc,x[ak[s]],x[aj[s]],bv[4+s].v);
1250         t3[s] = pbc_rvec_sub(pbc,x[ak[s]],x[al[s]],bv[8+s].v);
1251     }
1252
1253     load_rvec4(bv[0].v,bv[1].v,bv[2].v,bv[3].v,rijx_SSE,rijy_SSE,rijz_SSE);
1254     load_rvec4(bv[4].v,bv[5].v,bv[6].v,bv[7].v,rkjx_SSE,rkjy_SSE,rkjz_SSE);
1255     load_rvec4(bv[8].v,bv[9].v,bv[10].v,bv[11].v,rklx_SSE,rkly_SSE,rklz_SSE);
1256
1257     GMX_MM_CPROD_PS(rijx_SSE,rijy_SSE,rijz_SSE,
1258                     rkjx_SSE,rkjy_SSE,rkjz_SSE,
1259                     mx_SSE,my_SSE,mz_SSE);
1260
1261     GMX_MM_CPROD_PS(rkjx_SSE,rkjy_SSE,rkjz_SSE,
1262                     rklx_SSE,rkly_SSE,rklz_SSE,
1263                     nx_SSE,ny_SSE,nz_SSE);
1264
1265     GMX_MM_CPROD_PS(mx_SSE,my_SSE,mz_SSE,
1266                     nx_SSE,ny_SSE,nz_SSE,
1267                     cx_SSE,cy_SSE,cz_SSE);
1268
1269     cn_SSE = gmx_mm_sqrt_ps(GMX_MM_NORM2_PS(cx_SSE,cy_SSE,cz_SSE));
1270     
1271     s_SSE = GMX_MM_IPROD_PS(mx_SSE,my_SSE,mz_SSE,nx_SSE,ny_SSE,nz_SSE);
1272
1273     phi_SSE = gmx_mm_atan2_ps(cn_SSE,s_SSE);
1274     _mm_store_ps(buf+16,phi_SSE);
1275
1276     ipr_SSE = GMX_MM_IPROD_PS(rijx_SSE,rijy_SSE,rijz_SSE,
1277                               nx_SSE,ny_SSE,nz_SSE);
1278
1279     signs = _mm_movemask_ps(ipr_SSE);
1280     
1281     for(s=0; s<4; s++)
1282     {
1283         if (signs & (1<<s))
1284         {
1285             buf[16+s] = -buf[16+s];
1286         }
1287     }
1288
1289     iprm_SSE    = GMX_MM_NORM2_PS(mx_SSE,my_SSE,mz_SSE);
1290     iprn_SSE    = GMX_MM_NORM2_PS(nx_SSE,ny_SSE,nz_SSE);
1291
1292     /* store_rvec4 messes with the input, don't use it after this! */
1293     store_rvec4(mx_SSE,my_SSE,mz_SSE,bv[0].v,bv[1].v,bv[2].v,bv[3].v);
1294     store_rvec4(nx_SSE,ny_SSE,nz_SSE,bv[4].v,bv[5].v,bv[6].v,bv[7].v);
1295
1296     nrkj2_SSE   = GMX_MM_NORM2_PS(rkjx_SSE,rkjy_SSE,rkjz_SSE);
1297
1298     /* Avoid division by zero. When zero, the result is multiplied by 0
1299      * anyhow, so the 3 max below do not affect the final result.
1300      */
1301     nrkj2_SSE   = _mm_max_ps(nrkj2_SSE,fmin_SSE);
1302     nrkj_1_SSE  = gmx_mm_invsqrt_ps(nrkj2_SSE);
1303     nrkj_2_SSE  = _mm_mul_ps(nrkj_1_SSE,nrkj_1_SSE);
1304     nrkj_SSE    = _mm_mul_ps(nrkj2_SSE,nrkj_1_SSE);
1305
1306     iprm_SSE    = _mm_max_ps(iprm_SSE,fmin_SSE);
1307     iprn_SSE    = _mm_max_ps(iprn_SSE,fmin_SSE);
1308     nrkj_m2_SSE = _mm_mul_ps(nrkj_SSE,gmx_mm_inv_ps(iprm_SSE));
1309     nrkj_n2_SSE = _mm_mul_ps(nrkj_SSE,gmx_mm_inv_ps(iprn_SSE));
1310
1311     _mm_store_ps(buf+0,nrkj_m2_SSE);
1312     _mm_store_ps(buf+4,nrkj_n2_SSE);
1313
1314     p_SSE       = GMX_MM_IPROD_PS(rijx_SSE,rijy_SSE,rijz_SSE,
1315                                   rkjx_SSE,rkjy_SSE,rkjz_SSE);
1316     p_SSE       = _mm_mul_ps(p_SSE,nrkj_2_SSE);
1317
1318     q_SSE       = GMX_MM_IPROD_PS(rklx_SSE,rkly_SSE,rklz_SSE,
1319                                   rkjx_SSE,rkjy_SSE,rkjz_SSE);
1320     q_SSE       = _mm_mul_ps(q_SSE,nrkj_2_SSE);
1321
1322     _mm_store_ps(buf+8 ,p_SSE);
1323     _mm_store_ps(buf+12,q_SSE);
1324 }
1325
1326 #endif /* SSE_PROPER_DIHEDRALS */
1327
1328
1329 void do_dih_fup(int i,int j,int k,int l,real ddphi,
1330                 rvec r_ij,rvec r_kj,rvec r_kl,
1331                 rvec m,rvec n,rvec f[],rvec fshift[],
1332                 const t_pbc *pbc,const t_graph *g,
1333                 const rvec x[],int t1,int t2,int t3)
1334 {
1335   /* 143 FLOPS */
1336   rvec f_i,f_j,f_k,f_l;
1337   rvec uvec,vvec,svec,dx_jl;
1338   real iprm,iprn,nrkj,nrkj2,nrkj_1,nrkj_2;
1339   real a,b,p,q,toler;
1340   ivec jt,dt_ij,dt_kj,dt_lj;  
1341   
1342   iprm  = iprod(m,m);           /*  5   */
1343   iprn  = iprod(n,n);           /*  5   */
1344   nrkj2 = iprod(r_kj,r_kj);     /*  5   */
1345   toler = nrkj2*GMX_REAL_EPS;
1346   if ((iprm > toler) && (iprn > toler)) {
1347     nrkj_1 = gmx_invsqrt(nrkj2);        /* 10   */
1348     nrkj_2 = nrkj_1*nrkj_1;     /*  1   */
1349     nrkj  = nrkj2*nrkj_1;       /*  1   */
1350     a     = -ddphi*nrkj/iprm;   /* 11   */
1351     svmul(a,m,f_i);             /*  3   */
1352     b     = ddphi*nrkj/iprn;    /* 11   */
1353     svmul(b,n,f_l);             /*  3   */
1354     p     = iprod(r_ij,r_kj);   /*  5   */
1355     p    *= nrkj_2;             /*  1   */
1356     q     = iprod(r_kl,r_kj);   /*  5   */
1357     q    *= nrkj_2;             /*  1   */
1358     svmul(p,f_i,uvec);          /*  3   */
1359     svmul(q,f_l,vvec);          /*  3   */
1360     rvec_sub(uvec,vvec,svec);   /*  3   */
1361     rvec_sub(f_i,svec,f_j);     /*  3   */
1362     rvec_add(f_l,svec,f_k);     /*  3   */
1363     rvec_inc(f[i],f_i);         /*  3   */
1364     rvec_dec(f[j],f_j);         /*  3   */
1365     rvec_dec(f[k],f_k);         /*  3   */
1366     rvec_inc(f[l],f_l);         /*  3   */
1367     
1368     if (g) {
1369       copy_ivec(SHIFT_IVEC(g,j),jt);
1370       ivec_sub(SHIFT_IVEC(g,i),jt,dt_ij);
1371       ivec_sub(SHIFT_IVEC(g,k),jt,dt_kj);
1372       ivec_sub(SHIFT_IVEC(g,l),jt,dt_lj);
1373       t1=IVEC2IS(dt_ij);
1374       t2=IVEC2IS(dt_kj);
1375       t3=IVEC2IS(dt_lj);
1376     } else if (pbc) {
1377       t3 = pbc_rvec_sub(pbc,x[l],x[j],dx_jl);
1378     } else {
1379       t3 = CENTRAL;
1380     }
1381     
1382     rvec_inc(fshift[t1],f_i);
1383     rvec_dec(fshift[CENTRAL],f_j);
1384     rvec_dec(fshift[t2],f_k);
1385     rvec_inc(fshift[t3],f_l);
1386   }
1387   /* 112 TOTAL  */
1388 }
1389
1390 /* As do_dih_fup above, but without shift forces */
1391 static void
1392 do_dih_fup_noshiftf(int i,int j,int k,int l,real ddphi,
1393                     rvec r_ij,rvec r_kj,rvec r_kl,
1394                     rvec m,rvec n,rvec f[])
1395 {
1396   rvec f_i,f_j,f_k,f_l;
1397   rvec uvec,vvec,svec,dx_jl;
1398   real iprm,iprn,nrkj,nrkj2,nrkj_1,nrkj_2;
1399   real a,b,p,q,toler;
1400   ivec jt,dt_ij,dt_kj,dt_lj;  
1401   
1402   iprm  = iprod(m,m);           /*  5   */
1403   iprn  = iprod(n,n);           /*  5   */
1404   nrkj2 = iprod(r_kj,r_kj);     /*  5   */
1405   toler = nrkj2*GMX_REAL_EPS;
1406   if ((iprm > toler) && (iprn > toler)) {
1407     nrkj_1 = gmx_invsqrt(nrkj2);        /* 10   */
1408     nrkj_2 = nrkj_1*nrkj_1;     /*  1   */
1409     nrkj  = nrkj2*nrkj_1;       /*  1   */
1410     a     = -ddphi*nrkj/iprm;   /* 11   */
1411     svmul(a,m,f_i);             /*  3   */
1412     b     = ddphi*nrkj/iprn;    /* 11   */
1413     svmul(b,n,f_l);             /*  3   */
1414     p     = iprod(r_ij,r_kj);   /*  5   */
1415     p    *= nrkj_2;             /*  1   */
1416     q     = iprod(r_kl,r_kj);   /*  5   */
1417     q    *= nrkj_2;             /*  1   */
1418     svmul(p,f_i,uvec);          /*  3   */
1419     svmul(q,f_l,vvec);          /*  3   */
1420     rvec_sub(uvec,vvec,svec);   /*  3   */
1421     rvec_sub(f_i,svec,f_j);     /*  3   */
1422     rvec_add(f_l,svec,f_k);     /*  3   */
1423     rvec_inc(f[i],f_i);         /*  3   */
1424     rvec_dec(f[j],f_j);         /*  3   */
1425     rvec_dec(f[k],f_k);         /*  3   */
1426     rvec_inc(f[l],f_l);         /*  3   */
1427   }
1428 }
1429
1430 /* As do_dih_fup_noshiftf above, but with pre-calculated pre-factors */
1431 static void
1432 do_dih_fup_noshiftf_precalc(int i,int j,int k,int l,real ddphi,
1433                             real nrkj_m2,real nrkj_n2,
1434                             real p,real q,
1435                             rvec m,rvec n,rvec f[])
1436 {
1437     rvec f_i,f_j,f_k,f_l;
1438     rvec uvec,vvec,svec,dx_jl;
1439     real a,b,toler;
1440     ivec jt,dt_ij,dt_kj,dt_lj;  
1441   
1442     a = -ddphi*nrkj_m2;
1443     svmul(a,m,f_i);
1444     b =  ddphi*nrkj_n2;
1445     svmul(b,n,f_l);
1446     svmul(p,f_i,uvec);
1447     svmul(q,f_l,vvec);
1448     rvec_sub(uvec,vvec,svec);
1449     rvec_sub(f_i,svec,f_j);
1450     rvec_add(f_l,svec,f_k);
1451     rvec_inc(f[i],f_i);
1452     rvec_dec(f[j],f_j);
1453     rvec_dec(f[k],f_k);
1454     rvec_inc(f[l],f_l);
1455 }
1456
1457
1458 real dopdihs(real cpA,real cpB,real phiA,real phiB,int mult,
1459              real phi,real lambda,real *V,real *F)
1460 {
1461   real v,dvdlambda,mdphi,v1,sdphi,ddphi;
1462   real L1   = 1.0 - lambda;
1463   real ph0  = (L1*phiA + lambda*phiB)*DEG2RAD;
1464   real dph0 = (phiB - phiA)*DEG2RAD;
1465   real cp   = L1*cpA + lambda*cpB;
1466   
1467   mdphi =  mult*phi - ph0;
1468   sdphi = sin(mdphi);
1469   ddphi = -cp*mult*sdphi;
1470   v1    = 1.0 + cos(mdphi);
1471   v     = cp*v1;
1472   
1473   dvdlambda  = (cpB - cpA)*v1 + cp*dph0*sdphi;
1474   
1475   *V = v;
1476   *F = ddphi;
1477   
1478   return dvdlambda;
1479   
1480   /* That was 40 flops */
1481 }
1482
1483 static void
1484 dopdihs_noener(real cpA,real cpB,real phiA,real phiB,int mult,
1485                real phi,real lambda,real *F)
1486 {
1487   real mdphi,sdphi,ddphi;
1488   real L1   = 1.0 - lambda;
1489   real ph0  = (L1*phiA + lambda*phiB)*DEG2RAD;
1490   real cp   = L1*cpA + lambda*cpB;
1491   
1492   mdphi = mult*phi - ph0;
1493   sdphi = sin(mdphi);
1494   ddphi = -cp*mult*sdphi;
1495   
1496   *F = ddphi;
1497   
1498   /* That was 20 flops */
1499 }
1500
1501 static void
1502 dopdihs_mdphi(real cpA,real cpB,real phiA,real phiB,int mult,
1503               real phi,real lambda,real *cp,real *mdphi)
1504 {
1505     real L1   = 1.0 - lambda;
1506     real ph0  = (L1*phiA + lambda*phiB)*DEG2RAD;
1507
1508     *cp    = L1*cpA + lambda*cpB;
1509
1510     *mdphi = mult*phi - ph0;
1511 }
1512
1513 static real dopdihs_min(real cpA,real cpB,real phiA,real phiB,int mult,
1514                         real phi,real lambda,real *V,real *F)
1515      /* similar to dopdihs, except for a minus sign  *
1516       * and a different treatment of mult/phi0       */
1517 {
1518   real v,dvdlambda,mdphi,v1,sdphi,ddphi;
1519   real L1   = 1.0 - lambda;
1520   real ph0  = (L1*phiA + lambda*phiB)*DEG2RAD;
1521   real dph0 = (phiB - phiA)*DEG2RAD;
1522   real cp   = L1*cpA + lambda*cpB;
1523   
1524   mdphi = mult*(phi-ph0);
1525   sdphi = sin(mdphi);
1526   ddphi = cp*mult*sdphi;
1527   v1    = 1.0-cos(mdphi);
1528   v     = cp*v1;
1529   
1530   dvdlambda  = (cpB-cpA)*v1 + cp*dph0*sdphi;
1531   
1532   *V = v;
1533   *F = ddphi;
1534   
1535   return dvdlambda;
1536   
1537   /* That was 40 flops */
1538 }
1539
1540 real pdihs(int nbonds,
1541            const t_iatom forceatoms[],const t_iparams forceparams[],
1542            const rvec x[],rvec f[],rvec fshift[],
1543            const t_pbc *pbc,const t_graph *g,
1544            real lambda,real *dvdlambda,
1545            const t_mdatoms *md,t_fcdata *fcd,
1546            int *global_atom_index)
1547 {
1548   int  i,type,ai,aj,ak,al;
1549   int  t1,t2,t3;
1550   rvec r_ij,r_kj,r_kl,m,n;
1551   real phi,sign,ddphi,vpd,vtot;
1552
1553   vtot = 0.0;
1554
1555   for(i=0; (i<nbonds); ) {
1556     type = forceatoms[i++];
1557     ai   = forceatoms[i++];
1558     aj   = forceatoms[i++];
1559     ak   = forceatoms[i++];
1560     al   = forceatoms[i++];
1561     
1562     phi=dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
1563                   &sign,&t1,&t2,&t3);                   /*  84          */
1564     *dvdlambda += dopdihs(forceparams[type].pdihs.cpA,
1565                           forceparams[type].pdihs.cpB,
1566                           forceparams[type].pdihs.phiA,
1567                           forceparams[type].pdihs.phiB,
1568                           forceparams[type].pdihs.mult,
1569                           phi,lambda,&vpd,&ddphi);
1570
1571     vtot += vpd;
1572     do_dih_fup(ai,aj,ak,al,ddphi,r_ij,r_kj,r_kl,m,n,
1573                f,fshift,pbc,g,x,t1,t2,t3);                      /* 112          */
1574
1575 #ifdef DEBUG
1576     fprintf(debug,"pdih: (%d,%d,%d,%d) phi=%g\n",
1577             ai,aj,ak,al,phi);
1578 #endif
1579   } /* 223 TOTAL        */
1580
1581   return vtot;
1582 }
1583
1584 void make_dp_periodic(real *dp)  /* 1 flop? */
1585 {
1586     /* dp cannot be outside (-pi,pi) */
1587     if (*dp >= M_PI)
1588     {
1589         *dp -= 2*M_PI;
1590     }
1591     else if (*dp < -M_PI)
1592     {
1593         *dp += 2*M_PI;
1594     }
1595     return;
1596 }
1597
1598 /* As pdihs above, but without calculating energies and shift forces */
1599 static void
1600 pdihs_noener(int nbonds,
1601              const t_iatom forceatoms[],const t_iparams forceparams[],
1602              const rvec x[],rvec f[],
1603              const t_pbc *pbc,const t_graph *g,
1604              real lambda,
1605              const t_mdatoms *md,t_fcdata *fcd,
1606              int *global_atom_index)
1607 {
1608     int  i,type,ai,aj,ak,al;
1609     int  t1,t2,t3;
1610     rvec r_ij,r_kj,r_kl,m,n;
1611     real phi,sign,ddphi_tot,ddphi;
1612
1613     for(i=0; (i<nbonds); )
1614     {
1615         ai   = forceatoms[i+1];
1616         aj   = forceatoms[i+2];
1617         ak   = forceatoms[i+3];
1618         al   = forceatoms[i+4];
1619
1620         phi = dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
1621                         &sign,&t1,&t2,&t3);
1622
1623         ddphi_tot = 0;
1624
1625         /* Loop over dihedrals working on the same atoms,
1626          * so we avoid recalculating angles and force distributions.
1627          */
1628         do
1629         {
1630             type = forceatoms[i];
1631             dopdihs_noener(forceparams[type].pdihs.cpA,
1632                            forceparams[type].pdihs.cpB,
1633                            forceparams[type].pdihs.phiA,
1634                            forceparams[type].pdihs.phiB,
1635                            forceparams[type].pdihs.mult,
1636                            phi,lambda,&ddphi);
1637             ddphi_tot += ddphi;
1638
1639             i += 5;
1640         }
1641         while(i < nbonds &&
1642               forceatoms[i+1] == ai &&
1643               forceatoms[i+2] == aj &&
1644               forceatoms[i+3] == ak &&
1645               forceatoms[i+4] == al);
1646
1647         do_dih_fup_noshiftf(ai,aj,ak,al,ddphi_tot,r_ij,r_kj,r_kl,m,n,f);
1648     }
1649 }
1650
1651
1652 #ifdef SSE_PROPER_DIHEDRALS
1653
1654 /* As pdihs_noner above, but using SSE to calculate 4 dihedrals at once */
1655 static void
1656 pdihs_noener_sse(int nbonds,
1657                  const t_iatom forceatoms[],const t_iparams forceparams[],
1658                  const rvec x[],rvec f[],
1659                  const t_pbc *pbc,const t_graph *g,
1660                  real lambda,
1661                  const t_mdatoms *md,t_fcdata *fcd,
1662                  int *global_atom_index)
1663 {
1664     int  i,i4,s;
1665     int  type,ai[4],aj[4],ak[4],al[4];
1666     int  t1[4],t2[4],t3[4];
1667     int  mult[4];
1668     real cp[4],mdphi[4];
1669     real ddphi;
1670     rvec_sse_t rs_array[13],*rs;
1671     real buf_array[24],*buf;
1672     __m128 mdphi_SSE,sin_SSE,cos_SSE;
1673
1674     /* Ensure 16-byte alignment */
1675     rs  = (rvec_sse_t *)(((size_t)(rs_array +1)) & (~((size_t)15)));
1676     buf =      (float *)(((size_t)(buf_array+3)) & (~((size_t)15)));
1677
1678     for(i=0; (i<nbonds); i+=20)
1679     {
1680         /* Collect atoms quadruplets for 4 dihedrals */
1681         i4 = i;
1682         for(s=0; s<4; s++)
1683         {
1684             ai[s] = forceatoms[i4+1];
1685             aj[s] = forceatoms[i4+2];
1686             ak[s] = forceatoms[i4+3];
1687             al[s] = forceatoms[i4+4];
1688             /* At the end fill the arrays with identical entries */
1689             if (i4 + 5 < nbonds)
1690             {
1691                 i4 += 5;
1692             }
1693         }
1694
1695         /* Caclulate 4 dihedral angles at once */
1696         dih_angle_sse(x,ai,aj,ak,al,pbc,t1,t2,t3,rs,buf);
1697
1698         i4 = i;
1699         for(s=0; s<4; s++)
1700         {
1701             if (i4 < nbonds)
1702             {
1703                 /* Calculate the coefficient and angle deviation */
1704                 type = forceatoms[i4];
1705                 dopdihs_mdphi(forceparams[type].pdihs.cpA,
1706                               forceparams[type].pdihs.cpB,
1707                               forceparams[type].pdihs.phiA,
1708                               forceparams[type].pdihs.phiB,
1709                               forceparams[type].pdihs.mult,
1710                               buf[16+s],lambda,&cp[s],&buf[16+s]);
1711                 mult[s] = forceparams[type].pdihs.mult;
1712             }
1713             else
1714             {
1715                 buf[16+s] = 0;
1716             }
1717             i4 += 5;
1718         }
1719
1720         /* Calculate 4 sines at once */
1721         mdphi_SSE = _mm_load_ps(buf+16);
1722         gmx_mm_sincos_ps(mdphi_SSE,&sin_SSE,&cos_SSE);
1723         _mm_store_ps(buf+16,sin_SSE);
1724
1725         i4 = i;
1726         s = 0;
1727         do
1728         {
1729             ddphi = -cp[s]*mult[s]*buf[16+s];
1730
1731             do_dih_fup_noshiftf_precalc(ai[s],aj[s],ak[s],al[s],ddphi,
1732                                         buf[ 0+s],buf[ 4+s],
1733                                         buf[ 8+s],buf[12+s],
1734                                         rs[0+s].v,rs[4+s].v,
1735                                         f);
1736             s++;
1737             i4 += 5;
1738         }
1739         while (s < 4 && i4 < nbonds);
1740     }
1741 }
1742
1743 #endif /* SSE_PROPER_DIHEDRALS */
1744
1745
1746 real idihs(int nbonds,
1747            const t_iatom forceatoms[],const t_iparams forceparams[],
1748            const rvec x[],rvec f[],rvec fshift[],
1749            const t_pbc *pbc,const t_graph *g,
1750            real lambda,real *dvdlambda,
1751            const t_mdatoms *md,t_fcdata *fcd,
1752            int *global_atom_index)
1753 {
1754   int  i,type,ai,aj,ak,al;
1755   int  t1,t2,t3;
1756   real phi,phi0,dphi0,ddphi,sign,vtot;
1757   rvec r_ij,r_kj,r_kl,m,n;
1758   real L1,kk,dp,dp2,kA,kB,pA,pB,dvdl_term;
1759
1760   L1 = 1.0-lambda;
1761   dvdl_term = 0;
1762   vtot = 0.0;
1763   for(i=0; (i<nbonds); ) {
1764     type = forceatoms[i++];
1765     ai   = forceatoms[i++];
1766     aj   = forceatoms[i++];
1767     ak   = forceatoms[i++];
1768     al   = forceatoms[i++];
1769     
1770     phi=dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
1771                   &sign,&t1,&t2,&t3);                   /*  84          */
1772     
1773     /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
1774      * force changes if we just apply a normal harmonic.
1775      * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
1776      * This means we will never have the periodicity problem, unless
1777      * the dihedral is Pi away from phiO, which is very unlikely due to
1778      * the potential.
1779      */
1780     kA = forceparams[type].harmonic.krA;
1781     kB = forceparams[type].harmonic.krB;
1782     pA = forceparams[type].harmonic.rA;
1783     pB = forceparams[type].harmonic.rB;
1784
1785     kk    = L1*kA + lambda*kB;
1786     phi0  = (L1*pA + lambda*pB)*DEG2RAD;
1787     dphi0 = (pB - pA)*DEG2RAD;
1788
1789     dp = phi-phi0;  
1790
1791     make_dp_periodic(&dp);
1792     
1793     dp2 = dp*dp;
1794
1795     vtot += 0.5*kk*dp2;
1796     ddphi = -kk*dp;
1797     
1798     dvdl_term += 0.5*(kB - kA)*dp2 - kk*dphi0*dp;
1799
1800     do_dih_fup(ai,aj,ak,al,(real)(-ddphi),r_ij,r_kj,r_kl,m,n,
1801                f,fshift,pbc,g,x,t1,t2,t3);                      /* 112          */
1802     /* 218 TOTAL        */
1803 #ifdef DEBUG
1804     if (debug)
1805       fprintf(debug,"idih: (%d,%d,%d,%d) phi=%g\n",
1806               ai,aj,ak,al,phi);
1807 #endif
1808   }
1809   
1810   *dvdlambda += dvdl_term;
1811   return vtot;
1812 }
1813
1814
1815 real posres(int nbonds,
1816             const t_iatom forceatoms[],const t_iparams forceparams[],
1817             const rvec x[],rvec f[],rvec vir_diag,
1818             t_pbc *pbc,
1819             real lambda,real *dvdlambda,
1820             int refcoord_scaling,int ePBC,rvec comA,rvec comB)
1821 {
1822     int  i,ai,m,d,type,ki,npbcdim=0;
1823     const t_iparams *pr;
1824     real L1;
1825     real vtot,kk,fm;
1826     real posA,posB,ref=0;
1827     rvec comA_sc,comB_sc,rdist,dpdl,pos,dx;
1828     gmx_bool bForceValid = TRUE;
1829
1830     if ((f==NULL) || (vir_diag==NULL)) {  /* should both be null together! */
1831         bForceValid = FALSE;
1832     }
1833
1834     npbcdim = ePBC2npbcdim(ePBC);
1835
1836     if (refcoord_scaling == erscCOM)
1837     {
1838         clear_rvec(comA_sc);
1839         clear_rvec(comB_sc);
1840         for(m=0; m<npbcdim; m++)
1841         {
1842             for(d=m; d<npbcdim; d++)
1843             {
1844                 comA_sc[m] += comA[d]*pbc->box[d][m];
1845                 comB_sc[m] += comB[d]*pbc->box[d][m];
1846             }
1847         }
1848     }
1849
1850     L1 = 1.0 - lambda;
1851
1852     vtot = 0.0;
1853     for(i=0; (i<nbonds); )
1854     {
1855         type = forceatoms[i++];
1856         ai   = forceatoms[i++];
1857         pr   = &forceparams[type];
1858         
1859         for(m=0; m<DIM; m++)
1860         {
1861             posA = forceparams[type].posres.pos0A[m];
1862             posB = forceparams[type].posres.pos0B[m];
1863             if (m < npbcdim)
1864             {
1865                 switch (refcoord_scaling)
1866                 {
1867                 case erscNO:
1868                     ref      = 0;
1869                     rdist[m] = L1*posA + lambda*posB;
1870                     dpdl[m]  = posB - posA;
1871                     break;
1872                 case erscALL:
1873                     /* Box relative coordinates are stored for dimensions with pbc */
1874                     posA *= pbc->box[m][m];
1875                     posB *= pbc->box[m][m];
1876                     for(d=m+1; d<npbcdim; d++)
1877                     {
1878                         posA += forceparams[type].posres.pos0A[d]*pbc->box[d][m];
1879                         posB += forceparams[type].posres.pos0B[d]*pbc->box[d][m];
1880                     }
1881                     ref      = L1*posA + lambda*posB;
1882                     rdist[m] = 0;
1883                     dpdl[m]  = posB - posA;
1884                     break;
1885                 case erscCOM:
1886                     ref      = L1*comA_sc[m] + lambda*comB_sc[m];
1887                     rdist[m] = L1*posA       + lambda*posB;
1888                     dpdl[m]  = comB_sc[m] - comA_sc[m] + posB - posA;
1889                     break;
1890                 default:
1891                     gmx_fatal(FARGS, "No such scaling method implemented");
1892                 }
1893             }
1894             else
1895             {
1896                 ref      = L1*posA + lambda*posB;
1897                 rdist[m] = 0;
1898                 dpdl[m]  = posB - posA;
1899             }
1900
1901             /* We do pbc_dx with ref+rdist,
1902              * since with only ref we can be up to half a box vector wrong.
1903              */
1904             pos[m] = ref + rdist[m];
1905         }
1906
1907         if (pbc)
1908         {
1909             pbc_dx(pbc,x[ai],pos,dx);
1910         }
1911         else
1912         {
1913             rvec_sub(x[ai],pos,dx);
1914         }
1915
1916         for (m=0; (m<DIM); m++)
1917         {
1918             kk          = L1*pr->posres.fcA[m] + lambda*pr->posres.fcB[m];
1919             fm          = -kk*dx[m];
1920             vtot       += 0.5*kk*dx[m]*dx[m];
1921             *dvdlambda +=
1922                 0.5*(pr->posres.fcB[m] - pr->posres.fcA[m])*dx[m]*dx[m]
1923                 -fm*dpdl[m];
1924
1925             /* Here we correct for the pbc_dx which included rdist */
1926             if (bForceValid) {
1927                 f[ai][m]   += fm;
1928                 vir_diag[m] -= 0.5*(dx[m] + rdist[m])*fm;
1929             }
1930         }
1931     }
1932
1933     return vtot;
1934 }
1935
1936 static real low_angres(int nbonds,
1937                        const t_iatom forceatoms[],const t_iparams forceparams[],
1938                        const rvec x[],rvec f[],rvec fshift[],
1939                        const t_pbc *pbc,const t_graph *g,
1940                        real lambda,real *dvdlambda,
1941                        gmx_bool bZAxis)
1942 {
1943   int  i,m,type,ai,aj,ak,al;
1944   int  t1,t2;
1945   real phi,cos_phi,cos_phi2,vid,vtot,dVdphi;
1946   rvec r_ij,r_kl,f_i,f_k={0,0,0};
1947   real st,sth,nrij2,nrkl2,c,cij,ckl;
1948
1949   ivec dt;  
1950   t2 = 0; /* avoid warning with gcc-3.3. It is never used uninitialized */
1951
1952   vtot = 0.0;
1953   ak=al=0; /* to avoid warnings */
1954   for(i=0; i<nbonds; ) {
1955     type = forceatoms[i++];
1956     ai   = forceatoms[i++];
1957     aj   = forceatoms[i++];
1958     t1   = pbc_rvec_sub(pbc,x[aj],x[ai],r_ij);                  /*  3           */
1959     if (!bZAxis) {      
1960       ak   = forceatoms[i++];
1961       al   = forceatoms[i++];
1962       t2   = pbc_rvec_sub(pbc,x[al],x[ak],r_kl);           /*  3                */
1963     } else {
1964       r_kl[XX] = 0;
1965       r_kl[YY] = 0;
1966       r_kl[ZZ] = 1;
1967     }
1968
1969     cos_phi = cos_angle(r_ij,r_kl);             /* 25           */
1970     phi     = acos(cos_phi);                    /* 10           */
1971
1972     *dvdlambda += dopdihs_min(forceparams[type].pdihs.cpA,
1973                               forceparams[type].pdihs.cpB,
1974                               forceparams[type].pdihs.phiA,
1975                               forceparams[type].pdihs.phiB,
1976                               forceparams[type].pdihs.mult,
1977                               phi,lambda,&vid,&dVdphi); /*  40  */
1978     
1979     vtot += vid;
1980
1981     cos_phi2 = sqr(cos_phi);                    /*   1          */
1982     if (cos_phi2 < 1) {
1983       st  = -dVdphi*gmx_invsqrt(1 - cos_phi2);      /*  12              */
1984       sth = st*cos_phi;                         /*   1          */
1985       nrij2 = iprod(r_ij,r_ij);                 /*   5          */
1986       nrkl2 = iprod(r_kl,r_kl);                 /*   5          */
1987       
1988       c   = st*gmx_invsqrt(nrij2*nrkl2);                /*  11          */ 
1989       cij = sth/nrij2;                          /*  10          */
1990       ckl = sth/nrkl2;                          /*  10          */
1991       
1992       for (m=0; m<DIM; m++) {                   /*  18+18       */
1993         f_i[m] = (c*r_kl[m]-cij*r_ij[m]);
1994         f[ai][m] += f_i[m];
1995         f[aj][m] -= f_i[m];
1996         if (!bZAxis) {
1997           f_k[m] = (c*r_ij[m]-ckl*r_kl[m]);
1998           f[ak][m] += f_k[m];
1999           f[al][m] -= f_k[m];
2000         }
2001       }
2002       
2003       if (g) {
2004         ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
2005         t1=IVEC2IS(dt);
2006       }
2007       rvec_inc(fshift[t1],f_i);
2008       rvec_dec(fshift[CENTRAL],f_i);
2009       if (!bZAxis) {
2010         if (g) {
2011           ivec_sub(SHIFT_IVEC(g,ak),SHIFT_IVEC(g,al),dt);
2012           t2=IVEC2IS(dt);
2013         }
2014         rvec_inc(fshift[t2],f_k);
2015         rvec_dec(fshift[CENTRAL],f_k);
2016       }
2017     }
2018   }
2019
2020   return vtot;  /*  184 / 157 (bZAxis)  total  */
2021 }
2022
2023 real angres(int nbonds,
2024             const t_iatom forceatoms[],const t_iparams forceparams[],
2025             const rvec x[],rvec f[],rvec fshift[],
2026             const t_pbc *pbc,const t_graph *g,
2027             real lambda,real *dvdlambda,
2028             const t_mdatoms *md,t_fcdata *fcd,
2029             int *global_atom_index)
2030 {
2031   return low_angres(nbonds,forceatoms,forceparams,x,f,fshift,pbc,g,
2032                     lambda,dvdlambda,FALSE);
2033 }
2034
2035 real angresz(int nbonds,
2036              const t_iatom forceatoms[],const t_iparams forceparams[],
2037              const rvec x[],rvec f[],rvec fshift[],
2038              const t_pbc *pbc,const t_graph *g,
2039              real lambda,real *dvdlambda,
2040              const t_mdatoms *md,t_fcdata *fcd,
2041              int *global_atom_index)
2042 {
2043   return low_angres(nbonds,forceatoms,forceparams,x,f,fshift,pbc,g,
2044                     lambda,dvdlambda,TRUE);
2045 }
2046
2047 real dihres(int nbonds,
2048             const t_iatom forceatoms[],const t_iparams forceparams[],
2049             const rvec x[],rvec f[],rvec fshift[],
2050             const t_pbc *pbc,const t_graph *g,
2051             real lambda,real *dvdlambda,
2052             const t_mdatoms *md,t_fcdata *fcd,
2053             int *global_atom_index)
2054 {
2055     real vtot = 0;
2056     int  ai,aj,ak,al,i,k,type,t1,t2,t3;
2057     real phi0A,phi0B,dphiA,dphiB,kfacA,kfacB,phi0,dphi,kfac;
2058     real phi,ddphi,ddp,ddp2,dp,sign,d2r,fc,L1;
2059     rvec r_ij,r_kj,r_kl,m,n;
2060
2061     L1 = 1.0-lambda;
2062
2063     d2r = DEG2RAD;
2064     k   = 0;
2065
2066     for (i=0; (i<nbonds); )
2067     {
2068         type = forceatoms[i++];
2069         ai   = forceatoms[i++];
2070         aj   = forceatoms[i++];
2071         ak   = forceatoms[i++];
2072         al   = forceatoms[i++];
2073
2074         phi0A  = forceparams[type].dihres.phiA*d2r;
2075         dphiA  = forceparams[type].dihres.dphiA*d2r;
2076         kfacA  = forceparams[type].dihres.kfacA;
2077
2078         phi0B  = forceparams[type].dihres.phiB*d2r;
2079         dphiB  = forceparams[type].dihres.dphiB*d2r;
2080         kfacB  = forceparams[type].dihres.kfacB;
2081
2082         phi0  = L1*phi0A + lambda*phi0B;
2083         dphi  = L1*dphiA + lambda*dphiB;
2084         kfac = L1*kfacA + lambda*kfacB;
2085
2086         phi = dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
2087                         &sign,&t1,&t2,&t3);
2088         /* 84 flops */
2089
2090         if (debug)
2091         {
2092             fprintf(debug,"dihres[%d]: %d %d %d %d : phi=%f, dphi=%f, kfac=%f\n",
2093                     k++,ai,aj,ak,al,phi0,dphi,kfac);
2094         }
2095         /* phi can jump if phi0 is close to Pi/-Pi, which will cause huge
2096          * force changes if we just apply a normal harmonic.
2097          * Instead, we first calculate phi-phi0 and take it modulo (-Pi,Pi).
2098          * This means we will never have the periodicity problem, unless
2099          * the dihedral is Pi away from phiO, which is very unlikely due to
2100          * the potential.
2101          */
2102         dp = phi-phi0;
2103         make_dp_periodic(&dp);
2104
2105         if (dp > dphi)
2106         {
2107             ddp = dp-dphi;
2108         }
2109         else if (dp < -dphi)
2110         {
2111             ddp = dp+dphi;
2112         }
2113         else
2114         {
2115             ddp = 0;
2116         }
2117
2118         if (ddp != 0.0)
2119         {
2120             ddp2 = ddp*ddp;
2121             vtot += 0.5*kfac*ddp2;
2122             ddphi = kfac*ddp;
2123
2124             *dvdlambda += 0.5*(kfacB - kfacA)*ddp2;
2125             /* lambda dependence from changing restraint distances */
2126             if (ddp > 0)
2127             {
2128                 *dvdlambda -= kfac*ddp*((dphiB - dphiA)+(phi0B - phi0A));
2129             }
2130             else if (ddp < 0)
2131             {
2132                 *dvdlambda += kfac*ddp*((dphiB - dphiA)-(phi0B - phi0A));
2133             }
2134             do_dih_fup(ai,aj,ak,al,ddphi,r_ij,r_kj,r_kl,m,n,
2135                        f,fshift,pbc,g,x,t1,t2,t3);              /* 112          */
2136         }
2137     }
2138     return vtot;
2139 }
2140
2141
2142 real unimplemented(int nbonds,
2143                    const t_iatom forceatoms[],const t_iparams forceparams[],
2144                    const rvec x[],rvec f[],rvec fshift[],
2145                    const t_pbc *pbc,const t_graph *g,
2146                    real lambda,real *dvdlambda,
2147                    const t_mdatoms *md,t_fcdata *fcd,
2148                    int *global_atom_index)
2149 {
2150   gmx_impl("*** you are using a not implemented function");
2151
2152   return 0.0; /* To make the compiler happy */
2153 }
2154
2155 real rbdihs(int nbonds,
2156             const t_iatom forceatoms[],const t_iparams forceparams[],
2157             const rvec x[],rvec f[],rvec fshift[],
2158             const t_pbc *pbc,const t_graph *g,
2159             real lambda,real *dvdlambda,
2160             const t_mdatoms *md,t_fcdata *fcd,
2161             int *global_atom_index)
2162 {
2163   const real c0=0.0,c1=1.0,c2=2.0,c3=3.0,c4=4.0,c5=5.0;
2164   int  type,ai,aj,ak,al,i,j;
2165   int  t1,t2,t3;
2166   rvec r_ij,r_kj,r_kl,m,n;
2167   real parmA[NR_RBDIHS];
2168   real parmB[NR_RBDIHS];
2169   real parm[NR_RBDIHS];
2170   real cos_phi,phi,rbp,rbpBA;
2171   real v,sign,ddphi,sin_phi;
2172   real cosfac,vtot;
2173   real L1   = 1.0-lambda;
2174   real dvdl_term=0;
2175
2176   vtot = 0.0;
2177   for(i=0; (i<nbonds); ) {
2178     type = forceatoms[i++];
2179     ai   = forceatoms[i++];
2180     aj   = forceatoms[i++];
2181     ak   = forceatoms[i++];
2182     al   = forceatoms[i++];
2183
2184       phi=dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
2185                     &sign,&t1,&t2,&t3);                 /*  84          */
2186
2187     /* Change to polymer convention */
2188     if (phi < c0)
2189       phi += M_PI;
2190     else
2191       phi -= M_PI;                      /*   1          */
2192       
2193     cos_phi = cos(phi);         
2194     /* Beware of accuracy loss, cannot use 1-sqrt(cos^2) ! */
2195     sin_phi = sin(phi);
2196
2197     for(j=0; (j<NR_RBDIHS); j++) {
2198       parmA[j] = forceparams[type].rbdihs.rbcA[j];
2199       parmB[j] = forceparams[type].rbdihs.rbcB[j];
2200       parm[j]  = L1*parmA[j]+lambda*parmB[j];
2201     }
2202     /* Calculate cosine powers */
2203     /* Calculate the energy */
2204     /* Calculate the derivative */
2205
2206     v       = parm[0];
2207     dvdl_term   += (parmB[0]-parmA[0]);
2208     ddphi   = c0;
2209     cosfac  = c1;
2210     
2211     rbp     = parm[1];
2212     rbpBA   = parmB[1]-parmA[1];
2213     ddphi  += rbp*cosfac;
2214     cosfac *= cos_phi;
2215     v      += cosfac*rbp;
2216     dvdl_term   += cosfac*rbpBA;
2217     rbp     = parm[2];
2218     rbpBA   = parmB[2]-parmA[2];    
2219     ddphi  += c2*rbp*cosfac;
2220     cosfac *= cos_phi;
2221     v      += cosfac*rbp;
2222     dvdl_term   += cosfac*rbpBA;
2223     rbp     = parm[3];
2224     rbpBA   = parmB[3]-parmA[3];
2225     ddphi  += c3*rbp*cosfac;
2226     cosfac *= cos_phi;
2227     v      += cosfac*rbp;
2228     dvdl_term   += cosfac*rbpBA;
2229     rbp     = parm[4];
2230     rbpBA   = parmB[4]-parmA[4];
2231     ddphi  += c4*rbp*cosfac;
2232     cosfac *= cos_phi;
2233     v      += cosfac*rbp;
2234     dvdl_term   += cosfac*rbpBA;
2235     rbp     = parm[5];
2236     rbpBA   = parmB[5]-parmA[5];
2237     ddphi  += c5*rbp*cosfac;
2238     cosfac *= cos_phi;
2239     v      += cosfac*rbp;
2240     dvdl_term   += cosfac*rbpBA;
2241    
2242     ddphi = -ddphi*sin_phi;                             /*  11          */
2243     
2244     do_dih_fup(ai,aj,ak,al,ddphi,r_ij,r_kj,r_kl,m,n,
2245                f,fshift,pbc,g,x,t1,t2,t3);              /* 112          */
2246     vtot += v;
2247   }  
2248   *dvdlambda += dvdl_term;
2249
2250   return vtot;
2251 }
2252
2253 int cmap_setup_grid_index(int ip, int grid_spacing, int *ipm1, int *ipp1, int *ipp2)
2254 {
2255         int im1, ip1, ip2;
2256         
2257         if(ip<0)
2258         {
2259                 ip = ip + grid_spacing - 1;
2260         }
2261         else if(ip > grid_spacing)
2262         {
2263                 ip = ip - grid_spacing - 1;
2264         }
2265         
2266         im1 = ip - 1;
2267         ip1 = ip + 1;
2268         ip2 = ip + 2;
2269         
2270         if(ip == 0)
2271         {
2272                 im1 = grid_spacing - 1;
2273         }
2274         else if(ip == grid_spacing-2)
2275         {
2276                 ip2 = 0;
2277         }
2278         else if(ip == grid_spacing-1)
2279         {
2280                 ip1 = 0;
2281                 ip2 = 1;
2282         }
2283         
2284         *ipm1 = im1;
2285         *ipp1 = ip1;
2286         *ipp2 = ip2;
2287         
2288         return ip;
2289         
2290 }
2291
2292 real cmap_dihs(int nbonds,
2293                            const t_iatom forceatoms[],const t_iparams forceparams[],
2294                const gmx_cmap_t *cmap_grid,
2295                            const rvec x[],rvec f[],rvec fshift[],
2296                            const t_pbc *pbc,const t_graph *g,
2297                            real lambda,real *dvdlambda,
2298                            const t_mdatoms *md,t_fcdata *fcd,
2299                            int *global_atom_index)
2300 {
2301         int i,j,k,n,idx;
2302         int ai,aj,ak,al,am;
2303         int a1i,a1j,a1k,a1l,a2i,a2j,a2k,a2l;
2304         int type,cmapA;
2305         int t11,t21,t31,t12,t22,t32;
2306         int iphi1,ip1m1,ip1p1,ip1p2;
2307         int iphi2,ip2m1,ip2p1,ip2p2;
2308         int l1,l2,l3,l4;
2309         int pos1,pos2,pos3,pos4,tmp;
2310         
2311         real ty[4],ty1[4],ty2[4],ty12[4],tc[16],tx[16];
2312         real phi1,psi1,cos_phi1,sin_phi1,sign1,xphi1;
2313         real phi2,psi2,cos_phi2,sin_phi2,sign2,xphi2;
2314         real dx,xx,tt,tu,e,df1,df2,ddf1,ddf2,ddf12,vtot;
2315         real ra21,rb21,rg21,rg1,rgr1,ra2r1,rb2r1,rabr1;
2316         real ra22,rb22,rg22,rg2,rgr2,ra2r2,rb2r2,rabr2;
2317         real fg1,hg1,fga1,hgb1,gaa1,gbb1;
2318         real fg2,hg2,fga2,hgb2,gaa2,gbb2;
2319         real fac;
2320         
2321         rvec r1_ij, r1_kj, r1_kl,m1,n1;
2322         rvec r2_ij, r2_kj, r2_kl,m2,n2;
2323         rvec f1_i,f1_j,f1_k,f1_l;
2324         rvec f2_i,f2_j,f2_k,f2_l;
2325         rvec a1,b1,a2,b2;
2326         rvec f1,g1,h1,f2,g2,h2;
2327         rvec dtf1,dtg1,dth1,dtf2,dtg2,dth2;
2328         ivec jt1,dt1_ij,dt1_kj,dt1_lj;
2329         ivec jt2,dt2_ij,dt2_kj,dt2_lj;
2330
2331     const real *cmapd;
2332
2333         int loop_index[4][4] = {
2334                 {0,4,8,12},
2335                 {1,5,9,13},
2336                 {2,6,10,14},
2337                 {3,7,11,15}
2338         };
2339         
2340         /* Total CMAP energy */
2341         vtot = 0;
2342         
2343         for(n=0;n<nbonds; )
2344         {
2345                 /* Five atoms are involved in the two torsions */
2346                 type   = forceatoms[n++];
2347                 ai     = forceatoms[n++];
2348                 aj     = forceatoms[n++];
2349                 ak     = forceatoms[n++];
2350                 al     = forceatoms[n++];
2351                 am     = forceatoms[n++];
2352                 
2353                 /* Which CMAP type is this */
2354                 cmapA = forceparams[type].cmap.cmapA;
2355         cmapd = cmap_grid->cmapdata[cmapA].cmap;
2356
2357                 /* First torsion */
2358                 a1i   = ai;
2359                 a1j   = aj;
2360                 a1k   = ak;
2361                 a1l   = al;
2362                 
2363                 phi1  = dih_angle(x[a1i], x[a1j], x[a1k], x[a1l], pbc, r1_ij, r1_kj, r1_kl, m1, n1,
2364                                                    &sign1, &t11, &t21, &t31); /* 84 */
2365                 
2366         cos_phi1 = cos(phi1);
2367         
2368                 a1[0] = r1_ij[1]*r1_kj[2]-r1_ij[2]*r1_kj[1];
2369                 a1[1] = r1_ij[2]*r1_kj[0]-r1_ij[0]*r1_kj[2];
2370                 a1[2] = r1_ij[0]*r1_kj[1]-r1_ij[1]*r1_kj[0]; /* 9 */
2371                 
2372                 b1[0] = r1_kl[1]*r1_kj[2]-r1_kl[2]*r1_kj[1];
2373                 b1[1] = r1_kl[2]*r1_kj[0]-r1_kl[0]*r1_kj[2];
2374                 b1[2] = r1_kl[0]*r1_kj[1]-r1_kl[1]*r1_kj[0]; /* 9 */
2375                 
2376                 tmp = pbc_rvec_sub(pbc,x[a1l],x[a1k],h1);
2377                 
2378                 ra21  = iprod(a1,a1);       /* 5 */
2379                 rb21  = iprod(b1,b1);       /* 5 */
2380                 rg21  = iprod(r1_kj,r1_kj); /* 5 */
2381                 rg1   = sqrt(rg21);
2382                 
2383                 rgr1  = 1.0/rg1;
2384                 ra2r1 = 1.0/ra21;
2385                 rb2r1 = 1.0/rb21;
2386                 rabr1 = sqrt(ra2r1*rb2r1);
2387                 
2388                 sin_phi1 = rg1 * rabr1 * iprod(a1,h1) * (-1);
2389                 
2390                 if(cos_phi1 < -0.5 || cos_phi1 > 0.5)
2391                 {
2392                         phi1 = asin(sin_phi1);
2393                         
2394                         if(cos_phi1 < 0)
2395                         {
2396                                 if(phi1 > 0)
2397                                 {
2398                                         phi1 = M_PI - phi1;
2399                                 }
2400                                 else
2401                                 {
2402                                         phi1 = -M_PI - phi1;
2403                                 }
2404                         }
2405                 }
2406                 else
2407                 {
2408                         phi1 = acos(cos_phi1);
2409                         
2410                         if(sin_phi1 < 0)
2411                         {
2412                                 phi1 = -phi1;
2413                         }
2414                 }
2415                 
2416                 xphi1 = phi1 + M_PI; /* 1 */
2417                 
2418                 /* Second torsion */
2419                 a2i   = aj;
2420                 a2j   = ak;
2421                 a2k   = al;
2422                 a2l   = am;
2423                 
2424                 phi2  = dih_angle(x[a2i], x[a2j], x[a2k], x[a2l], pbc, r2_ij, r2_kj, r2_kl, m2, n2,
2425                                                   &sign2, &t12, &t22, &t32); /* 84 */
2426                 
2427         cos_phi2 = cos(phi2);
2428
2429                 a2[0] = r2_ij[1]*r2_kj[2]-r2_ij[2]*r2_kj[1];
2430                 a2[1] = r2_ij[2]*r2_kj[0]-r2_ij[0]*r2_kj[2];
2431                 a2[2] = r2_ij[0]*r2_kj[1]-r2_ij[1]*r2_kj[0]; /* 9 */
2432                 
2433                 b2[0] = r2_kl[1]*r2_kj[2]-r2_kl[2]*r2_kj[1];
2434                 b2[1] = r2_kl[2]*r2_kj[0]-r2_kl[0]*r2_kj[2];
2435                 b2[2] = r2_kl[0]*r2_kj[1]-r2_kl[1]*r2_kj[0]; /* 9 */
2436                 
2437                 tmp = pbc_rvec_sub(pbc,x[a2l],x[a2k],h2);
2438                 
2439                 ra22  = iprod(a2,a2);         /* 5 */
2440                 rb22  = iprod(b2,b2);         /* 5 */
2441                 rg22  = iprod(r2_kj,r2_kj);   /* 5 */
2442                 rg2   = sqrt(rg22);
2443                 
2444                 rgr2  = 1.0/rg2;
2445                 ra2r2 = 1.0/ra22;
2446                 rb2r2 = 1.0/rb22;
2447                 rabr2 = sqrt(ra2r2*rb2r2);
2448                 
2449                 sin_phi2 = rg2 * rabr2 * iprod(a2,h2) * (-1);
2450                 
2451                 if(cos_phi2 < -0.5 || cos_phi2 > 0.5)
2452                 {
2453                         phi2 = asin(sin_phi2);
2454                         
2455                         if(cos_phi2 < 0)
2456                         {
2457                                 if(phi2 > 0)
2458                                 {
2459                                         phi2 = M_PI - phi2;
2460                                 }
2461                                 else
2462                                 {
2463                                         phi2 = -M_PI - phi2;
2464                                 }
2465                         }
2466                 }
2467                 else
2468                 {
2469                         phi2 = acos(cos_phi2);
2470                         
2471                         if(sin_phi2 < 0)
2472                         {
2473                                 phi2 = -phi2;
2474                         }
2475                 }
2476                 
2477                 xphi2 = phi2 + M_PI; /* 1 */
2478                 
2479                 /* Range mangling */
2480                 if(xphi1<0)
2481                 {
2482                         xphi1 = xphi1 + 2*M_PI;
2483                 }
2484                 else if(xphi1>=2*M_PI)
2485                 {
2486                         xphi1 = xphi1 - 2*M_PI;
2487                 }
2488                 
2489                 if(xphi2<0)
2490                 {
2491                         xphi2 = xphi2 + 2*M_PI;
2492                 }
2493                 else if(xphi2>=2*M_PI)
2494                 {
2495                         xphi2 = xphi2 - 2*M_PI;
2496                 }
2497                 
2498                 /* Number of grid points */
2499                 dx = 2*M_PI / cmap_grid->grid_spacing;
2500                 
2501                 /* Where on the grid are we */
2502                 iphi1 = (int)(xphi1/dx);
2503                 iphi2 = (int)(xphi2/dx);
2504                 
2505                 iphi1 = cmap_setup_grid_index(iphi1, cmap_grid->grid_spacing, &ip1m1,&ip1p1,&ip1p2);
2506                 iphi2 = cmap_setup_grid_index(iphi2, cmap_grid->grid_spacing, &ip2m1,&ip2p1,&ip2p2);
2507                 
2508                 pos1    = iphi1*cmap_grid->grid_spacing+iphi2;
2509                 pos2    = ip1p1*cmap_grid->grid_spacing+iphi2;
2510                 pos3    = ip1p1*cmap_grid->grid_spacing+ip2p1;
2511                 pos4    = iphi1*cmap_grid->grid_spacing+ip2p1;
2512
2513                 ty[0]   = cmapd[pos1*4];
2514                 ty[1]   = cmapd[pos2*4];
2515                 ty[2]   = cmapd[pos3*4];
2516                 ty[3]   = cmapd[pos4*4];
2517                 
2518                 ty1[0]   = cmapd[pos1*4+1];
2519                 ty1[1]   = cmapd[pos2*4+1];
2520                 ty1[2]   = cmapd[pos3*4+1];
2521                 ty1[3]   = cmapd[pos4*4+1];
2522                 
2523                 ty2[0]   = cmapd[pos1*4+2];
2524                 ty2[1]   = cmapd[pos2*4+2];
2525                 ty2[2]   = cmapd[pos3*4+2];
2526                 ty2[3]   = cmapd[pos4*4+2];
2527                 
2528                 ty12[0]   = cmapd[pos1*4+3];
2529                 ty12[1]   = cmapd[pos2*4+3];
2530                 ty12[2]   = cmapd[pos3*4+3];
2531                 ty12[3]   = cmapd[pos4*4+3];
2532                 
2533                 /* Switch to degrees */
2534                 dx = 360.0 / cmap_grid->grid_spacing;
2535                 xphi1 = xphi1 * RAD2DEG;
2536                 xphi2 = xphi2 * RAD2DEG; 
2537                 
2538                 for(i=0;i<4;i++) /* 16 */
2539                 {
2540                         tx[i] = ty[i];
2541                         tx[i+4] = ty1[i]*dx;
2542                         tx[i+8] = ty2[i]*dx;
2543                         tx[i+12] = ty12[i]*dx*dx;
2544                 }
2545                 
2546                 idx=0;
2547                 for(i=0;i<4;i++) /* 1056 */
2548                 {
2549                         for(j=0;j<4;j++)
2550                         {
2551                                 xx = 0;
2552                                 for(k=0;k<16;k++)
2553                                 {
2554                                         xx = xx + cmap_coeff_matrix[k*16+idx]*tx[k];
2555                                 }
2556                                 
2557                                 idx++;
2558                                 tc[i*4+j]=xx;
2559                         }
2560                 }
2561                 
2562                 tt    = (xphi1-iphi1*dx)/dx;
2563                 tu    = (xphi2-iphi2*dx)/dx;
2564                 
2565                 e     = 0;
2566                 df1   = 0;
2567                 df2   = 0;
2568                 ddf1  = 0;
2569                 ddf2  = 0;
2570                 ddf12 = 0;
2571                 
2572                 for(i=3;i>=0;i--)
2573                 {
2574                         l1 = loop_index[i][3];
2575                         l2 = loop_index[i][2];
2576                         l3 = loop_index[i][1];
2577                         
2578                         e     = tt * e    + ((tc[i*4+3]*tu+tc[i*4+2])*tu + tc[i*4+1])*tu+tc[i*4];
2579                         df1   = tu * df1  + (3.0*tc[l1]*tt+2.0*tc[l2])*tt+tc[l3];
2580                         df2   = tt * df2  + (3.0*tc[i*4+3]*tu+2.0*tc[i*4+2])*tu+tc[i*4+1];
2581                         ddf1  = tu * ddf1 + 2.0*3.0*tc[l1]*tt+2.0*tc[l2];
2582                         ddf2  = tt * ddf2 + 2.0*3.0*tc[4*i+3]*tu+2.0*tc[4*i+2];
2583                 }
2584                 
2585                 ddf12 = tc[5] + 2.0*tc[9]*tt + 3.0*tc[13]*tt*tt + 2.0*tu*(tc[6]+2.0*tc[10]*tt+3.0*tc[14]*tt*tt) +
2586                 3.0*tu*tu*(tc[7]+2.0*tc[11]*tt+3.0*tc[15]*tt*tt);
2587                 
2588                 fac     = RAD2DEG/dx;
2589                 df1     = df1   * fac;
2590                 df2     = df2   * fac;
2591                 ddf1    = ddf1  * fac * fac;
2592                 ddf2    = ddf2  * fac * fac;
2593                 ddf12   = ddf12 * fac * fac;
2594                 
2595                 /* CMAP energy */
2596                 vtot += e;
2597                 
2598                 /* Do forces - first torsion */
2599                 fg1       = iprod(r1_ij,r1_kj);
2600                 hg1       = iprod(r1_kl,r1_kj);
2601                 fga1      = fg1*ra2r1*rgr1;
2602                 hgb1      = hg1*rb2r1*rgr1;
2603                 gaa1      = -ra2r1*rg1;
2604                 gbb1      = rb2r1*rg1;
2605                 
2606                 for(i=0;i<DIM;i++)
2607                 {
2608                         dtf1[i]   = gaa1 * a1[i];
2609                         dtg1[i]   = fga1 * a1[i] - hgb1 * b1[i];
2610                         dth1[i]   = gbb1 * b1[i];
2611                         
2612                         f1[i]     = df1  * dtf1[i];
2613                         g1[i]     = df1  * dtg1[i];
2614                         h1[i]     = df1  * dth1[i];
2615                         
2616                         f1_i[i]   =  f1[i];
2617                         f1_j[i]   = -f1[i] - g1[i];
2618                         f1_k[i]   =  h1[i] + g1[i];
2619                         f1_l[i]   = -h1[i];
2620                         
2621                         f[a1i][i] = f[a1i][i] + f1_i[i];
2622                         f[a1j][i] = f[a1j][i] + f1_j[i]; /* - f1[i] - g1[i] */                                                            
2623                         f[a1k][i] = f[a1k][i] + f1_k[i]; /* h1[i] + g1[i] */                                                            
2624                         f[a1l][i] = f[a1l][i] + f1_l[i]; /* h1[i] */                                                                       
2625                 }
2626                 
2627                 /* Do forces - second torsion */
2628                 fg2       = iprod(r2_ij,r2_kj);
2629                 hg2       = iprod(r2_kl,r2_kj);
2630                 fga2      = fg2*ra2r2*rgr2;
2631                 hgb2      = hg2*rb2r2*rgr2;
2632                 gaa2      = -ra2r2*rg2;
2633                 gbb2      = rb2r2*rg2;
2634                 
2635                 for(i=0;i<DIM;i++)
2636                 {
2637                         dtf2[i]   = gaa2 * a2[i];
2638                         dtg2[i]   = fga2 * a2[i] - hgb2 * b2[i];
2639                         dth2[i]   = gbb2 * b2[i];
2640                         
2641                         f2[i]     = df2  * dtf2[i];
2642                         g2[i]     = df2  * dtg2[i];
2643                         h2[i]     = df2  * dth2[i];
2644                         
2645                         f2_i[i]   =  f2[i];
2646                         f2_j[i]   = -f2[i] - g2[i];
2647                         f2_k[i]   =  h2[i] + g2[i];
2648                         f2_l[i]   = -h2[i];
2649                         
2650                         f[a2i][i] = f[a2i][i] + f2_i[i]; /* f2[i] */                                                                        
2651                         f[a2j][i] = f[a2j][i] + f2_j[i]; /* - f2[i] - g2[i] */                                                              
2652                         f[a2k][i] = f[a2k][i] + f2_k[i]; /* h2[i] + g2[i] */                            
2653                         f[a2l][i] = f[a2l][i] + f2_l[i]; /* - h2[i] */                                                                      
2654                 }
2655                 
2656                 /* Shift forces */
2657                 if(g)
2658                 {
2659                         copy_ivec(SHIFT_IVEC(g,a1j), jt1);
2660                         ivec_sub(SHIFT_IVEC(g,a1i),  jt1,dt1_ij);
2661                         ivec_sub(SHIFT_IVEC(g,a1k),  jt1,dt1_kj);
2662                         ivec_sub(SHIFT_IVEC(g,a1l),  jt1,dt1_lj);
2663                         t11 = IVEC2IS(dt1_ij);
2664                         t21 = IVEC2IS(dt1_kj);
2665                         t31 = IVEC2IS(dt1_lj);
2666                         
2667                         copy_ivec(SHIFT_IVEC(g,a2j), jt2);
2668                         ivec_sub(SHIFT_IVEC(g,a2i),  jt2,dt2_ij);
2669                         ivec_sub(SHIFT_IVEC(g,a2k),  jt2,dt2_kj);
2670                         ivec_sub(SHIFT_IVEC(g,a2l),  jt2,dt2_lj);
2671                         t12 = IVEC2IS(dt2_ij);
2672                         t22 = IVEC2IS(dt2_kj);
2673                         t32 = IVEC2IS(dt2_lj);
2674                 }
2675                 else if(pbc)
2676                 {
2677                         t31 = pbc_rvec_sub(pbc,x[a1l],x[a1j],h1);
2678                         t32 = pbc_rvec_sub(pbc,x[a2l],x[a2j],h2);
2679                 }
2680                 else
2681                 {
2682                         t31 = CENTRAL;
2683                         t32 = CENTRAL;
2684                 }
2685                 
2686                 rvec_inc(fshift[t11],f1_i);
2687                 rvec_inc(fshift[CENTRAL],f1_j);
2688                 rvec_inc(fshift[t21],f1_k);
2689                 rvec_inc(fshift[t31],f1_l);
2690                 
2691                 rvec_inc(fshift[t21],f2_i);
2692                 rvec_inc(fshift[CENTRAL],f2_j);
2693                 rvec_inc(fshift[t22],f2_k);
2694                 rvec_inc(fshift[t32],f2_l);
2695         }       
2696         return vtot;
2697 }
2698
2699
2700
2701 /***********************************************************
2702  *
2703  *   G R O M O S  9 6   F U N C T I O N S
2704  *
2705  ***********************************************************/
2706 real g96harmonic(real kA,real kB,real xA,real xB,real x,real lambda,
2707                  real *V,real *F)
2708 {
2709   const real half=0.5;
2710   real  L1,kk,x0,dx,dx2;
2711   real  v,f,dvdlambda;
2712   
2713   L1    = 1.0-lambda;
2714   kk    = L1*kA+lambda*kB;
2715   x0    = L1*xA+lambda*xB;
2716   
2717   dx    = x-x0;
2718   dx2   = dx*dx;
2719   
2720   f     = -kk*dx;
2721   v     = half*kk*dx2;
2722   dvdlambda  = half*(kB-kA)*dx2 + (xA-xB)*kk*dx;
2723   
2724   *F    = f;
2725   *V    = v;
2726   
2727   return dvdlambda;
2728   
2729   /* That was 21 flops */
2730 }
2731
2732 real g96bonds(int nbonds,
2733               const t_iatom forceatoms[],const t_iparams forceparams[],
2734               const rvec x[],rvec f[],rvec fshift[],
2735               const t_pbc *pbc,const t_graph *g,
2736               real lambda,real *dvdlambda,
2737               const t_mdatoms *md,t_fcdata *fcd,
2738               int *global_atom_index)
2739 {
2740   int  i,m,ki,ai,aj,type;
2741   real dr2,fbond,vbond,fij,vtot;
2742   rvec dx;
2743   ivec dt;
2744   
2745   vtot = 0.0;
2746   for(i=0; (i<nbonds); ) {
2747     type = forceatoms[i++];
2748     ai   = forceatoms[i++];
2749     aj   = forceatoms[i++];
2750   
2751     ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);            /*   3          */
2752     dr2  = iprod(dx,dx);                                /*   5          */
2753       
2754     *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
2755                               forceparams[type].harmonic.krB,
2756                               forceparams[type].harmonic.rA,
2757                               forceparams[type].harmonic.rB,
2758                               dr2,lambda,&vbond,&fbond);
2759
2760     vtot  += 0.5*vbond;                             /* 1*/
2761 #ifdef DEBUG
2762     if (debug)
2763       fprintf(debug,"G96-BONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
2764               sqrt(dr2),vbond,fbond);
2765 #endif
2766    
2767     if (g) {
2768       ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
2769       ki=IVEC2IS(dt);
2770     }
2771     for (m=0; (m<DIM); m++) {                   /*  15          */
2772       fij=fbond*dx[m];
2773       f[ai][m]+=fij;
2774       f[aj][m]-=fij;
2775       fshift[ki][m]+=fij;
2776       fshift[CENTRAL][m]-=fij;
2777     }
2778   }                                     /* 44 TOTAL     */
2779   return vtot;
2780 }
2781
2782 real g96bond_angle(const rvec xi,const rvec xj,const rvec xk,const t_pbc *pbc,
2783                    rvec r_ij,rvec r_kj,
2784                    int *t1,int *t2)
2785 /* Return value is the angle between the bonds i-j and j-k */
2786 {
2787   real costh;
2788   
2789   *t1 = pbc_rvec_sub(pbc,xi,xj,r_ij);                   /*  3           */
2790   *t2 = pbc_rvec_sub(pbc,xk,xj,r_kj);                   /*  3           */
2791
2792   costh=cos_angle(r_ij,r_kj);           /* 25           */
2793                                         /* 41 TOTAL     */
2794   return costh;
2795 }
2796
2797 real g96angles(int nbonds,
2798                const t_iatom forceatoms[],const t_iparams forceparams[],
2799                const rvec x[],rvec f[],rvec fshift[],
2800                const t_pbc *pbc,const t_graph *g,
2801                real lambda,real *dvdlambda,
2802                const t_mdatoms *md,t_fcdata *fcd,
2803                int *global_atom_index)
2804 {
2805   int  i,ai,aj,ak,type,m,t1,t2;
2806   rvec r_ij,r_kj;
2807   real cos_theta,dVdt,va,vtot;
2808   real rij_1,rij_2,rkj_1,rkj_2,rijrkj_1;
2809   rvec f_i,f_j,f_k;
2810   ivec jt,dt_ij,dt_kj;
2811   
2812   vtot = 0.0;
2813   for(i=0; (i<nbonds); ) {
2814     type = forceatoms[i++];
2815     ai   = forceatoms[i++];
2816     aj   = forceatoms[i++];
2817     ak   = forceatoms[i++];
2818     
2819     cos_theta  = g96bond_angle(x[ai],x[aj],x[ak],pbc,r_ij,r_kj,&t1,&t2);
2820
2821     *dvdlambda += g96harmonic(forceparams[type].harmonic.krA,
2822                               forceparams[type].harmonic.krB,
2823                               forceparams[type].harmonic.rA,
2824                               forceparams[type].harmonic.rB,
2825                               cos_theta,lambda,&va,&dVdt);
2826     vtot    += va;
2827     
2828     rij_1    = gmx_invsqrt(iprod(r_ij,r_ij));
2829     rkj_1    = gmx_invsqrt(iprod(r_kj,r_kj));
2830     rij_2    = rij_1*rij_1;
2831     rkj_2    = rkj_1*rkj_1;
2832     rijrkj_1 = rij_1*rkj_1;                     /* 23 */
2833     
2834 #ifdef DEBUG
2835     if (debug)
2836       fprintf(debug,"G96ANGLES: costheta = %10g  vth = %10g  dV/dct = %10g\n",
2837               cos_theta,va,dVdt);
2838 #endif
2839     for (m=0; (m<DIM); m++) {                   /*  42  */
2840       f_i[m]=dVdt*(r_kj[m]*rijrkj_1 - r_ij[m]*rij_2*cos_theta);
2841       f_k[m]=dVdt*(r_ij[m]*rijrkj_1 - r_kj[m]*rkj_2*cos_theta);
2842       f_j[m]=-f_i[m]-f_k[m];
2843       f[ai][m]+=f_i[m];
2844       f[aj][m]+=f_j[m];
2845       f[ak][m]+=f_k[m];
2846     }
2847     
2848     if (g) {
2849       copy_ivec(SHIFT_IVEC(g,aj),jt);
2850       
2851       ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
2852       ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
2853       t1=IVEC2IS(dt_ij);
2854       t2=IVEC2IS(dt_kj);
2855     }      
2856     rvec_inc(fshift[t1],f_i);
2857     rvec_inc(fshift[CENTRAL],f_j);
2858     rvec_inc(fshift[t2],f_k);               /* 9 */
2859     /* 163 TOTAL        */
2860   }
2861   return vtot;
2862 }
2863
2864 real cross_bond_bond(int nbonds,
2865                      const t_iatom forceatoms[],const t_iparams forceparams[],
2866                      const rvec x[],rvec f[],rvec fshift[],
2867                      const t_pbc *pbc,const t_graph *g,
2868                      real lambda,real *dvdlambda,
2869                      const t_mdatoms *md,t_fcdata *fcd,
2870                      int *global_atom_index)
2871 {
2872   /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
2873    * pp. 842-847
2874    */
2875   int  i,ai,aj,ak,type,m,t1,t2;
2876   rvec r_ij,r_kj;
2877   real vtot,vrr,s1,s2,r1,r2,r1e,r2e,krr;
2878   rvec f_i,f_j,f_k;
2879   ivec jt,dt_ij,dt_kj;
2880   
2881   vtot = 0.0;
2882   for(i=0; (i<nbonds); ) {
2883     type = forceatoms[i++];
2884     ai   = forceatoms[i++];
2885     aj   = forceatoms[i++];
2886     ak   = forceatoms[i++];
2887     r1e  = forceparams[type].cross_bb.r1e;
2888     r2e  = forceparams[type].cross_bb.r2e;
2889     krr  = forceparams[type].cross_bb.krr;
2890     
2891     /* Compute distance vectors ... */
2892     t1 = pbc_rvec_sub(pbc,x[ai],x[aj],r_ij);
2893     t2 = pbc_rvec_sub(pbc,x[ak],x[aj],r_kj);
2894     
2895     /* ... and their lengths */
2896     r1 = norm(r_ij);
2897     r2 = norm(r_kj);
2898     
2899     /* Deviations from ideality */
2900     s1 = r1-r1e;
2901     s2 = r2-r2e;
2902     
2903     /* Energy (can be negative!) */
2904     vrr   = krr*s1*s2;
2905     vtot += vrr;
2906     
2907     /* Forces */
2908     svmul(-krr*s2/r1,r_ij,f_i);
2909     svmul(-krr*s1/r2,r_kj,f_k);
2910     
2911     for (m=0; (m<DIM); m++) {                   /*  12  */
2912       f_j[m]    = -f_i[m] - f_k[m];
2913       f[ai][m] += f_i[m];
2914       f[aj][m] += f_j[m];
2915       f[ak][m] += f_k[m];
2916     }
2917     
2918     /* Virial stuff */
2919     if (g) {
2920       copy_ivec(SHIFT_IVEC(g,aj),jt);
2921       
2922       ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
2923       ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
2924       t1=IVEC2IS(dt_ij);
2925       t2=IVEC2IS(dt_kj);
2926     }      
2927     rvec_inc(fshift[t1],f_i);
2928     rvec_inc(fshift[CENTRAL],f_j);
2929     rvec_inc(fshift[t2],f_k);               /* 9 */
2930     /* 163 TOTAL        */
2931   }
2932   return vtot;
2933 }
2934
2935 real cross_bond_angle(int nbonds,
2936                       const t_iatom forceatoms[],const t_iparams forceparams[],
2937                       const rvec x[],rvec f[],rvec fshift[],
2938                       const t_pbc *pbc,const t_graph *g,
2939                       real lambda,real *dvdlambda,
2940                       const t_mdatoms *md,t_fcdata *fcd,
2941                       int *global_atom_index)
2942 {
2943   /* Potential from Lawrence and Skimmer, Chem. Phys. Lett. 372 (2003)
2944    * pp. 842-847
2945    */
2946   int  i,ai,aj,ak,type,m,t1,t2,t3;
2947   rvec r_ij,r_kj,r_ik;
2948   real vtot,vrt,s1,s2,s3,r1,r2,r3,r1e,r2e,r3e,krt,k1,k2,k3;
2949   rvec f_i,f_j,f_k;
2950   ivec jt,dt_ij,dt_kj;
2951   
2952   vtot = 0.0;
2953   for(i=0; (i<nbonds); ) {
2954     type = forceatoms[i++];
2955     ai   = forceatoms[i++];
2956     aj   = forceatoms[i++];
2957     ak   = forceatoms[i++];
2958     r1e  = forceparams[type].cross_ba.r1e;
2959     r2e  = forceparams[type].cross_ba.r2e;
2960     r3e  = forceparams[type].cross_ba.r3e;
2961     krt  = forceparams[type].cross_ba.krt;
2962     
2963     /* Compute distance vectors ... */
2964     t1 = pbc_rvec_sub(pbc,x[ai],x[aj],r_ij);
2965     t2 = pbc_rvec_sub(pbc,x[ak],x[aj],r_kj);
2966     t3 = pbc_rvec_sub(pbc,x[ai],x[ak],r_ik);
2967     
2968     /* ... and their lengths */
2969     r1 = norm(r_ij);
2970     r2 = norm(r_kj);
2971     r3 = norm(r_ik);
2972     
2973     /* Deviations from ideality */
2974     s1 = r1-r1e;
2975     s2 = r2-r2e;
2976     s3 = r3-r3e;
2977     
2978     /* Energy (can be negative!) */
2979     vrt   = krt*s3*(s1+s2);
2980     vtot += vrt;
2981     
2982     /* Forces */
2983     k1 = -krt*(s3/r1);
2984     k2 = -krt*(s3/r2);
2985     k3 = -krt*(s1+s2)/r3;
2986     for(m=0; (m<DIM); m++) {
2987       f_i[m] = k1*r_ij[m] + k3*r_ik[m];
2988       f_k[m] = k2*r_kj[m] - k3*r_ik[m];
2989       f_j[m] = -f_i[m] - f_k[m];
2990     }
2991     
2992     for (m=0; (m<DIM); m++) {                   /*  12  */
2993       f[ai][m] += f_i[m];
2994       f[aj][m] += f_j[m];
2995       f[ak][m] += f_k[m];
2996     }
2997     
2998     /* Virial stuff */
2999     if (g) {
3000       copy_ivec(SHIFT_IVEC(g,aj),jt);
3001       
3002       ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
3003       ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
3004       t1=IVEC2IS(dt_ij);
3005       t2=IVEC2IS(dt_kj);
3006     }      
3007     rvec_inc(fshift[t1],f_i);
3008     rvec_inc(fshift[CENTRAL],f_j);
3009     rvec_inc(fshift[t2],f_k);               /* 9 */
3010     /* 163 TOTAL        */
3011   }
3012   return vtot;
3013 }
3014
3015 static real bonded_tab(const char *type,int table_nr,
3016                        const bondedtable_t *table,real kA,real kB,real r,
3017                        real lambda,real *V,real *F)
3018 {
3019   real k,tabscale,*VFtab,rt,eps,eps2,Yt,Ft,Geps,Heps2,Fp,VV,FF;
3020   int  n0,nnn;
3021   real v,f,dvdlambda;
3022
3023   k = (1.0 - lambda)*kA + lambda*kB;
3024
3025   tabscale = table->scale;
3026   VFtab    = table->tab;
3027   
3028   rt    = r*tabscale;
3029   n0    = rt;
3030   if (n0 >= table->n) {
3031     gmx_fatal(FARGS,"A tabulated %s interaction table number %d is out of the table range: r %f, between table indices %d and %d, table length %d",
3032               type,table_nr,r,n0,n0+1,table->n);
3033   }
3034   eps   = rt - n0;
3035   eps2  = eps*eps;
3036   nnn   = 4*n0;
3037   Yt    = VFtab[nnn];
3038   Ft    = VFtab[nnn+1];
3039   Geps  = VFtab[nnn+2]*eps;
3040   Heps2 = VFtab[nnn+3]*eps2;
3041   Fp    = Ft + Geps + Heps2;
3042   VV    = Yt + Fp*eps;
3043   FF    = Fp + Geps + 2.0*Heps2;
3044   
3045   *F    = -k*FF*tabscale;
3046   *V    = k*VV;
3047   dvdlambda  = (kB - kA)*VV;
3048   
3049   return dvdlambda;
3050   
3051   /* That was 22 flops */
3052 }
3053
3054 real tab_bonds(int nbonds,
3055                const t_iatom forceatoms[],const t_iparams forceparams[],
3056                const rvec x[],rvec f[],rvec fshift[],
3057                const t_pbc *pbc,const t_graph *g,
3058                real lambda,real *dvdlambda,
3059                const t_mdatoms *md,t_fcdata *fcd,
3060                int *global_atom_index)
3061 {
3062   int  i,m,ki,ai,aj,type,table;
3063   real dr,dr2,fbond,vbond,fij,vtot;
3064   rvec dx;
3065   ivec dt;
3066
3067   vtot = 0.0;
3068   for(i=0; (i<nbonds); ) {
3069     type = forceatoms[i++];
3070     ai   = forceatoms[i++];
3071     aj   = forceatoms[i++];
3072   
3073     ki   = pbc_rvec_sub(pbc,x[ai],x[aj],dx);    /*   3          */
3074     dr2  = iprod(dx,dx);                        /*   5          */
3075     dr   = dr2*gmx_invsqrt(dr2);                        /*  10          */
3076
3077     table = forceparams[type].tab.table;
3078
3079     *dvdlambda += bonded_tab("bond",table,
3080                              &fcd->bondtab[table],
3081                              forceparams[type].tab.kA,
3082                              forceparams[type].tab.kB,
3083                              dr,lambda,&vbond,&fbond);  /*  22 */
3084
3085     if (dr2 == 0.0)
3086       continue;
3087
3088     
3089     vtot  += vbond;/* 1*/
3090     fbond *= gmx_invsqrt(dr2);                  /*   6          */
3091 #ifdef DEBUG
3092     if (debug)
3093       fprintf(debug,"TABBONDS: dr = %10g  vbond = %10g  fbond = %10g\n",
3094               dr,vbond,fbond);
3095 #endif
3096     if (g) {
3097       ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
3098       ki=IVEC2IS(dt);
3099     }
3100     for (m=0; (m<DIM); m++) {                   /*  15          */
3101       fij=fbond*dx[m];
3102       f[ai][m]+=fij;
3103       f[aj][m]-=fij;
3104       fshift[ki][m]+=fij;
3105       fshift[CENTRAL][m]-=fij;
3106     }
3107   }                                     /* 62 TOTAL     */
3108   return vtot;
3109 }
3110
3111 real tab_angles(int nbonds,
3112                 const t_iatom forceatoms[],const t_iparams forceparams[],
3113                 const rvec x[],rvec f[],rvec fshift[],
3114                 const t_pbc *pbc,const t_graph *g,
3115                 real lambda,real *dvdlambda,
3116                 const t_mdatoms *md,t_fcdata *fcd,
3117                 int *global_atom_index)
3118 {
3119   int  i,ai,aj,ak,t1,t2,type,table;
3120   rvec r_ij,r_kj;
3121   real cos_theta,cos_theta2,theta,dVdt,va,vtot;
3122   ivec jt,dt_ij,dt_kj;
3123   
3124   vtot = 0.0;
3125   for(i=0; (i<nbonds); ) {
3126     type = forceatoms[i++];
3127     ai   = forceatoms[i++];
3128     aj   = forceatoms[i++];
3129     ak   = forceatoms[i++];
3130     
3131     theta  = bond_angle(x[ai],x[aj],x[ak],pbc,
3132                         r_ij,r_kj,&cos_theta,&t1,&t2);  /*  41          */
3133
3134     table = forceparams[type].tab.table;
3135   
3136     *dvdlambda += bonded_tab("angle",table,
3137                              &fcd->angletab[table],
3138                              forceparams[type].tab.kA,
3139                              forceparams[type].tab.kB,
3140                              theta,lambda,&va,&dVdt);  /*  22  */
3141     vtot += va;
3142     
3143     cos_theta2 = sqr(cos_theta);                /*   1          */
3144     if (cos_theta2 < 1) {
3145       int  m;
3146       real snt,st,sth;
3147       real cik,cii,ckk;
3148       real nrkj2,nrij2;
3149       rvec f_i,f_j,f_k;
3150       
3151       st  = dVdt*gmx_invsqrt(1 - cos_theta2);   /*  12          */
3152       sth = st*cos_theta;                       /*   1          */
3153 #ifdef DEBUG
3154       if (debug)
3155         fprintf(debug,"ANGLES: theta = %10g  vth = %10g  dV/dtheta = %10g\n",
3156                 theta*RAD2DEG,va,dVdt);
3157 #endif
3158       nrkj2=iprod(r_kj,r_kj);                   /*   5          */
3159       nrij2=iprod(r_ij,r_ij);
3160       
3161       cik=st*gmx_invsqrt(nrkj2*nrij2);          /*  12          */ 
3162       cii=sth/nrij2;                            /*  10          */
3163       ckk=sth/nrkj2;                            /*  10          */
3164       
3165       for (m=0; (m<DIM); m++) {                 /*  39          */
3166         f_i[m]=-(cik*r_kj[m]-cii*r_ij[m]);
3167         f_k[m]=-(cik*r_ij[m]-ckk*r_kj[m]);
3168         f_j[m]=-f_i[m]-f_k[m];
3169         f[ai][m]+=f_i[m];
3170         f[aj][m]+=f_j[m];
3171         f[ak][m]+=f_k[m];
3172       }
3173       if (g) {
3174         copy_ivec(SHIFT_IVEC(g,aj),jt);
3175       
3176         ivec_sub(SHIFT_IVEC(g,ai),jt,dt_ij);
3177         ivec_sub(SHIFT_IVEC(g,ak),jt,dt_kj);
3178         t1=IVEC2IS(dt_ij);
3179         t2=IVEC2IS(dt_kj);
3180       }
3181       rvec_inc(fshift[t1],f_i);
3182       rvec_inc(fshift[CENTRAL],f_j);
3183       rvec_inc(fshift[t2],f_k);
3184     }                                           /* 169 TOTAL    */
3185   }
3186   return vtot;
3187 }
3188
3189 real tab_dihs(int nbonds,
3190               const t_iatom forceatoms[],const t_iparams forceparams[],
3191               const rvec x[],rvec f[],rvec fshift[],
3192               const t_pbc *pbc,const t_graph *g,
3193               real lambda,real *dvdlambda,
3194               const t_mdatoms *md,t_fcdata *fcd,
3195               int *global_atom_index)
3196 {
3197   int  i,type,ai,aj,ak,al,table;
3198   int  t1,t2,t3;
3199   rvec r_ij,r_kj,r_kl,m,n;
3200   real phi,sign,ddphi,vpd,vtot;
3201
3202   vtot = 0.0;
3203   for(i=0; (i<nbonds); ) {
3204     type = forceatoms[i++];
3205     ai   = forceatoms[i++];
3206     aj   = forceatoms[i++];
3207     ak   = forceatoms[i++];
3208     al   = forceatoms[i++];
3209     
3210     phi=dih_angle(x[ai],x[aj],x[ak],x[al],pbc,r_ij,r_kj,r_kl,m,n,
3211                   &sign,&t1,&t2,&t3);                   /*  84  */
3212
3213     table = forceparams[type].tab.table;
3214
3215     /* Hopefully phi+M_PI never results in values < 0 */
3216     *dvdlambda += bonded_tab("dihedral",table,
3217                              &fcd->dihtab[table],
3218                              forceparams[type].tab.kA,
3219                              forceparams[type].tab.kB,
3220                              phi+M_PI,lambda,&vpd,&ddphi);
3221                        
3222     vtot += vpd;
3223     do_dih_fup(ai,aj,ak,al,-ddphi,r_ij,r_kj,r_kl,m,n,
3224                f,fshift,pbc,g,x,t1,t2,t3);                      /* 112  */
3225
3226 #ifdef DEBUG
3227     fprintf(debug,"pdih: (%d,%d,%d,%d) phi=%g\n",
3228             ai,aj,ak,al,phi);
3229 #endif
3230   } /* 227 TOTAL        */
3231
3232   return vtot;
3233 }
3234
3235 static unsigned
3236 calc_bonded_reduction_mask(const t_idef *idef,
3237                            int shift,
3238                            int t,int nt)
3239 {
3240     unsigned mask;
3241     int ftype,nb,nat1,nb0,nb1,i,a;
3242
3243     mask = 0;
3244
3245     for(ftype=0; ftype<F_NRE; ftype++)
3246     {
3247         if (interaction_function[ftype].flags & IF_BOND &&
3248             !(ftype == F_CONNBONDS || ftype == F_POSRES) &&
3249             (ftype<F_GB12 || ftype>F_GB14))
3250         {
3251             nb = idef->il[ftype].nr;
3252             if (nb > 0)
3253             {
3254                 nat1 = interaction_function[ftype].nratoms + 1;
3255
3256                 /* Divide this interaction equally over the threads.
3257                  * This is not stored: should match division in calc_bonds.
3258                  */
3259                 nb0 = (((nb/nat1)* t   )/nt)*nat1;
3260                 nb1 = (((nb/nat1)*(t+1))/nt)*nat1;
3261
3262                 for(i=nb0; i<nb1; i+=nat1)
3263                 {
3264                     for(a=1; a<nat1; a++)
3265                     {
3266                         mask |= (1U << (idef->il[ftype].iatoms[i+a]>>shift));
3267                     }
3268                 }
3269             }
3270         }
3271     }
3272
3273     return mask;
3274 }
3275
3276 void init_bonded_thread_force_reduction(t_forcerec *fr,
3277                                         const t_idef *idef)
3278 {
3279 #define MAX_BLOCK_BITS 32
3280     int t;
3281     int ctot,c,b;
3282
3283     if (fr->nthreads <= 1)
3284     {
3285         fr->red_nblock = 0;
3286
3287         return;
3288     }
3289
3290     /* We divide the force array in a maximum of 32 blocks.
3291      * Minimum force block reduction size is 2^6=64.
3292      */
3293     fr->red_ashift = 6;
3294     while (fr->natoms_force > (int)(MAX_BLOCK_BITS*(1U<<fr->red_ashift)))
3295     {
3296         fr->red_ashift++;
3297     }
3298     if (debug)
3299     {
3300         fprintf(debug,"bonded force buffer block atom shift %d bits\n",
3301                 fr->red_ashift);
3302     }
3303
3304     /* Determine to which blocks each thread's bonded force calculation
3305      * contributes. Store this is a mask for each thread.
3306      */
3307 #pragma omp parallel for num_threads(fr->nthreads) schedule(static)
3308     for(t=1; t<fr->nthreads; t++)
3309     {
3310         fr->f_t[t].red_mask =
3311             calc_bonded_reduction_mask(idef,fr->red_ashift,t,fr->nthreads);
3312     }
3313
3314     /* Determine the maximum number of blocks we need to reduce over */
3315     fr->red_nblock = 0;
3316     ctot = 0;
3317     for(t=0; t<fr->nthreads; t++)
3318     {
3319         c = 0;
3320         for(b=0; b<MAX_BLOCK_BITS; b++)
3321         {
3322             if (fr->f_t[t].red_mask & (1U<<b))
3323             {
3324                 fr->red_nblock = max(fr->red_nblock,b+1);
3325                 c++;
3326             }
3327         }
3328         if (debug)
3329         {
3330             fprintf(debug,"thread %d flags %x count %d\n",
3331                     t,fr->f_t[t].red_mask,c);
3332         }
3333         ctot += c;
3334     }
3335     if (debug)
3336     {
3337         fprintf(debug,"Number of blocks to reduce: %d of size %d\n",
3338                 fr->red_nblock,1<<fr->red_ashift);
3339         fprintf(debug,"Reduction density %.2f density/#thread %.2f\n",
3340                 ctot*(1<<fr->red_ashift)/(double)fr->natoms_force,
3341                 ctot*(1<<fr->red_ashift)/(double)(fr->natoms_force*fr->nthreads));
3342     }
3343 }
3344
3345 static void zero_thread_forces(f_thread_t *f_t,int n,
3346                                int nblock,int blocksize)
3347 {
3348     int b,a0,a1,a,i,j;
3349
3350     if (n > f_t->f_nalloc)
3351     {
3352         f_t->f_nalloc = over_alloc_large(n);
3353         srenew(f_t->f,f_t->f_nalloc);
3354     }
3355
3356     if (f_t->red_mask != 0)
3357     {
3358         for(b=0; b<nblock; b++)
3359         {
3360             if (f_t->red_mask && (1U<<b))
3361             {
3362                 a0 = b*blocksize;
3363                 a1 = min((b+1)*blocksize,n);
3364                 for(a=a0; a<a1; a++)
3365                 {
3366                     clear_rvec(f_t->f[a]);
3367                 }
3368             }
3369         }
3370     }
3371     for(i=0; i<SHIFTS; i++)
3372     {
3373         clear_rvec(f_t->fshift[i]);
3374     }
3375     for(i=0; i<F_NRE; i++)
3376     {
3377         f_t->ener[i] = 0;
3378     }
3379     for(i=0; i<egNR; i++)
3380     {
3381         for(j=0; j<f_t->grpp.nener; j++)
3382         {
3383             f_t->grpp.ener[i][j] = 0;
3384         }
3385     }
3386     for(i=0; i<efptNR; i++)
3387     {
3388         f_t->dvdl[i] = 0;
3389     }
3390 }
3391
3392 static void reduce_thread_force_buffer(int n,rvec *f,
3393                                        int nthreads,f_thread_t *f_t,
3394                                        int nblock,int block_size)
3395 {
3396     /* The max thread number is arbitrary,
3397      * we used a fixed number to avoid memory management.
3398      * Using more than 16 threads is probably never useful performance wise.
3399      */
3400 #define MAX_BONDED_THREADS 256
3401     int b;
3402
3403     if (nthreads > MAX_BONDED_THREADS)
3404     {
3405         gmx_fatal(FARGS,"Can not reduce bonded forces on more than %d threads",
3406                   MAX_BONDED_THREADS);
3407     }
3408
3409     /* This reduction can run on any number of threads,
3410      * independently of nthreads.
3411      */
3412 #pragma omp parallel for num_threads(nthreads) schedule(static)
3413     for(b=0; b<nblock; b++)
3414     {
3415         rvec *fp[MAX_BONDED_THREADS];
3416         int nfb,ft,fb;
3417         int a0,a1,a;
3418
3419         /* Determine which threads contribute to this block */
3420         nfb = 0;
3421         for(ft=1; ft<nthreads; ft++)
3422         {
3423             if (f_t[ft].red_mask & (1U<<b))
3424             {
3425                 fp[nfb++] = f_t[ft].f;
3426             }
3427         }
3428         if (nfb > 0)
3429         {
3430             /* Reduce force buffers for threads that contribute */
3431             a0 =  b   *block_size;
3432             a1 = (b+1)*block_size;
3433             a1 = min(a1,n);
3434             for(a=a0; a<a1; a++)
3435             {
3436                 for(fb=0; fb<nfb; fb++)
3437                 {
3438                     rvec_inc(f[a],fp[fb][a]);
3439                 }
3440             }
3441         }
3442     }
3443 }
3444
3445 static void reduce_thread_forces(int n,rvec *f,rvec *fshift,
3446                                  real *ener,gmx_grppairener_t *grpp,real *dvdl,
3447                                  int nthreads,f_thread_t *f_t,
3448                                  int nblock,int block_size,
3449                                  gmx_bool bCalcEnerVir,
3450                                  gmx_bool bDHDL)
3451 {
3452     if (nblock > 0)
3453     {
3454         /* Reduce the bonded force buffer */
3455         reduce_thread_force_buffer(n,f,nthreads,f_t,nblock,block_size);
3456     }
3457
3458     /* When necessary, reduce energy and virial using one thread only */
3459     if (bCalcEnerVir)
3460     {
3461         int t,i,j;
3462
3463         for(i=0; i<SHIFTS; i++)
3464         {
3465             for(t=1; t<nthreads; t++)
3466             {
3467                 rvec_inc(fshift[i],f_t[t].fshift[i]);
3468             }
3469         }
3470         for(i=0; i<F_NRE; i++)
3471         {
3472             for(t=1; t<nthreads; t++)
3473             {
3474                 ener[i] += f_t[t].ener[i];
3475             }
3476         }
3477         for(i=0; i<egNR; i++)
3478         {
3479             for(j=0; j<f_t[1].grpp.nener; j++)
3480             {
3481                 for(t=1; t<nthreads; t++)
3482                 {
3483                     
3484                     grpp->ener[i][j] += f_t[t].grpp.ener[i][j];
3485                 }
3486             }
3487         }
3488         if (bDHDL)
3489         {
3490             for(i=0; i<efptNR; i++)
3491             {
3492                 
3493                 for(t=1; t<nthreads; t++)
3494                 {
3495                     dvdl[i] += f_t[t].dvdl[i];
3496                 }
3497             }
3498         }
3499     }
3500 }
3501
3502 static real calc_one_bond(FILE *fplog,int thread,
3503                           int ftype,const t_idef *idef,
3504                           rvec x[], rvec f[], rvec fshift[],
3505                           t_forcerec *fr,
3506                           const t_pbc *pbc,const t_graph *g,
3507                           gmx_enerdata_t *enerd, gmx_grppairener_t *grpp,
3508                           t_nrnb *nrnb,
3509                           real *lambda, real *dvdl,
3510                           const t_mdatoms *md,t_fcdata *fcd,
3511                           gmx_bool bCalcEnerVir,
3512                           int *global_atom_index, gmx_bool bPrintSepPot)
3513 {
3514     int ind,nat1,nbonds,efptFTYPE;
3515     real v=0;
3516     t_iatom *iatoms;
3517     int nb0,nbn;
3518
3519     if (IS_RESTRAINT_TYPE(ftype))
3520     {
3521         efptFTYPE = efptRESTRAINT;
3522     }
3523     else
3524     {
3525         efptFTYPE = efptBONDED;
3526     }
3527
3528     if (interaction_function[ftype].flags & IF_BOND &&
3529         !(ftype == F_CONNBONDS || ftype == F_POSRES))
3530     {
3531         ind  = interaction_function[ftype].nrnb_ind;
3532         nat1 = interaction_function[ftype].nratoms + 1;
3533         nbonds    = idef->il[ftype].nr/nat1;
3534         iatoms    = idef->il[ftype].iatoms;
3535
3536         nb0 = ((nbonds* thread   )/(fr->nthreads))*nat1;
3537         nbn = ((nbonds*(thread+1))/(fr->nthreads))*nat1 - nb0;
3538
3539         if (!IS_LISTED_LJ_C(ftype))
3540         {
3541             if(ftype==F_CMAP)
3542             {
3543                 v = cmap_dihs(nbn,iatoms+nb0,
3544                               idef->iparams,&idef->cmap_grid,
3545                               (const rvec*)x,f,fshift,
3546                               pbc,g,lambda[efptFTYPE],&(dvdl[efptFTYPE]),
3547                               md,fcd,global_atom_index);
3548             }
3549             else if (ftype == F_PDIHS &&
3550                      !bCalcEnerVir && fr->efep==efepNO)
3551             {
3552                 /* No energies, shift forces, dvdl */
3553 #ifndef SSE_PROPER_DIHEDRALS
3554                 pdihs_noener
3555 #else
3556                 pdihs_noener_sse
3557 #endif
3558                     (nbn,idef->il[ftype].iatoms+nb0,
3559                      idef->iparams,
3560                      (const rvec*)x,f,
3561                      pbc,g,lambda[efptFTYPE],md,fcd,
3562                      global_atom_index);
3563                 v = 0;
3564                 dvdl[efptFTYPE] = 0;
3565             }
3566             else
3567             {
3568                 v = interaction_function[ftype].ifunc(nbn,iatoms+nb0,
3569                                                       idef->iparams,
3570                                                       (const rvec*)x,f,fshift,
3571                                                       pbc,g,lambda[efptFTYPE],&(dvdl[efptFTYPE]),
3572                                                       md,fcd,global_atom_index);
3573             }
3574             enerd->dvdl_nonlin[efptFTYPE] += dvdl[efptFTYPE];
3575             if (bPrintSepPot)
3576             {
3577                 fprintf(fplog,"  %-23s #%4d  V %12.5e  dVdl %12.5e\n",
3578                         interaction_function[ftype].longname,
3579                         nbonds/nat1,v,lambda[efptFTYPE]);
3580             }
3581         }
3582         else
3583         {
3584             v = do_listed_vdw_q(ftype,nbn,iatoms+nb0,
3585                                 idef->iparams,
3586                                 (const rvec*)x,f,fshift,
3587                                 pbc,g,lambda,dvdl,
3588                                 md,fr,grpp,global_atom_index);
3589             enerd->dvdl_nonlin[efptCOUL] += dvdl[efptCOUL];
3590             enerd->dvdl_nonlin[efptVDW] += dvdl[efptVDW];
3591             
3592             if (bPrintSepPot)
3593             {
3594                 fprintf(fplog,"  %-5s + %-15s #%4d                  dVdl %12.5e\n",
3595                         interaction_function[ftype].longname,
3596                         interaction_function[F_LJ14].longname,nbonds/nat1,dvdl[efptVDW]);
3597                 fprintf(fplog,"  %-5s + %-15s #%4d                  dVdl %12.5e\n",
3598                         interaction_function[ftype].longname,
3599                         interaction_function[F_COUL14].longname,nbonds/nat1,dvdl[efptCOUL]);
3600             }
3601         }
3602         if (ind != -1 && thread == 0)
3603         {
3604             inc_nrnb(nrnb,ind,nbonds);
3605         }
3606     }
3607
3608     return v;
3609 }
3610
3611 /* WARNING!  THIS FUNCTION MUST EXACTLY TRACK THE calc
3612    function, or horrible things will happen when doing free energy
3613    calculations!  In a good coding world, this would not be a
3614    different function, but for speed reasons, it needs to be made a
3615    separate function.  TODO for 5.0 - figure out a way to reorganize
3616    to reduce duplication.
3617 */
3618
3619 static real calc_one_bond_foreign(FILE *fplog,int ftype, const t_idef *idef,
3620                                   rvec x[], rvec f[], t_forcerec *fr,
3621                                   const t_pbc *pbc,const t_graph *g,
3622                                   gmx_enerdata_t *enerd, t_nrnb *nrnb,
3623                                   real *lambda, real *dvdl,
3624                                   const t_mdatoms *md,t_fcdata *fcd,
3625                                   int *global_atom_index, gmx_bool bPrintSepPot)
3626 {
3627     int ind,nat1,nbonds,efptFTYPE,nbonds_np;
3628     real v=0;
3629     t_iatom *iatoms;
3630
3631     if (IS_RESTRAINT_TYPE(ftype))
3632     {
3633         efptFTYPE = efptRESTRAINT;
3634     }
3635     else
3636     {
3637         efptFTYPE = efptBONDED;
3638     }
3639
3640     if (ftype<F_GB12 || ftype>F_GB14)
3641     {
3642         if (interaction_function[ftype].flags & IF_BOND &&
3643             !(ftype == F_CONNBONDS || ftype == F_POSRES))
3644         {
3645             ind  = interaction_function[ftype].nrnb_ind;
3646             nat1 = interaction_function[ftype].nratoms+1;
3647             nbonds_np = idef->il[ftype].nr_nonperturbed;
3648             nbonds    = idef->il[ftype].nr - nbonds_np;
3649             iatoms    = idef->il[ftype].iatoms + nbonds_np;
3650             if (nbonds > 0)
3651             {
3652                 if (!IS_LISTED_LJ_C(ftype))
3653                 {
3654                     if(ftype==F_CMAP)
3655                     {
3656                         v = cmap_dihs(nbonds,iatoms,
3657                                       idef->iparams,&idef->cmap_grid,
3658                                       (const rvec*)x,f,fr->fshift,
3659                                       pbc,g,lambda[efptFTYPE],&(dvdl[efptFTYPE]),md,fcd,
3660                                       global_atom_index);
3661                     }
3662                     else
3663                     {
3664                         v =         interaction_function[ftype].ifunc(nbonds,iatoms,
3665                                                                   idef->iparams,
3666                                                                   (const rvec*)x,f,fr->fshift,
3667                                                                   pbc,g,lambda[efptFTYPE],&dvdl[efptFTYPE],
3668                                                                   md,fcd,global_atom_index);
3669                     }
3670                 }
3671                 else
3672                 {
3673                     v = do_listed_vdw_q(ftype,nbonds,iatoms,
3674                                         idef->iparams,
3675                                         (const rvec*)x,f,fr->fshift,
3676                                         pbc,g,lambda,dvdl,
3677                                         md,fr,&enerd->grpp,global_atom_index);
3678                 }
3679                 if (ind != -1)
3680                 {
3681                     inc_nrnb(nrnb,ind,nbonds/nat1);
3682                 }
3683             }
3684         }
3685     }
3686     return v;
3687 }
3688
3689 void calc_bonds(FILE *fplog,const gmx_multisim_t *ms,
3690                 const t_idef *idef,
3691                 rvec x[],history_t *hist,
3692                 rvec f[],t_forcerec *fr,
3693                 const t_pbc *pbc,const t_graph *g,
3694                 gmx_enerdata_t *enerd,t_nrnb *nrnb,
3695                 real *lambda,
3696                 const t_mdatoms *md,
3697                 t_fcdata *fcd,int *global_atom_index,
3698                 t_atomtypes *atype, gmx_genborn_t *born,
3699                 int force_flags,
3700                 gmx_bool bPrintSepPot,gmx_large_int_t step)
3701 {
3702     gmx_bool bCalcEnerVir;
3703     int    i;
3704     real   v,dvdl[efptNR],dvdl_dum[efptNR]; /* The dummy array is to have a place to store the dhdl at other values
3705                                                of lambda, which will be thrown away in the end*/
3706     const  t_pbc *pbc_null;
3707     char   buf[22];
3708     int    thread;
3709
3710     bCalcEnerVir = (force_flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY));
3711
3712     for (i=0;i<efptNR;i++)
3713     {
3714         dvdl[i] = 0.0;
3715     }
3716     if (fr->bMolPBC)
3717     {
3718         pbc_null = pbc;
3719     }
3720     else
3721     {
3722         pbc_null = NULL;
3723     }
3724     if (bPrintSepPot)
3725     {
3726         fprintf(fplog,"Step %s: bonded V and dVdl for this node\n",
3727                 gmx_step_str(step,buf));
3728     }
3729
3730 #ifdef DEBUG
3731     if (g && debug)
3732     {
3733         p_graph(debug,"Bondage is fun",g);
3734     }
3735 #endif
3736
3737     /* Do pre force calculation stuff which might require communication */
3738     if (idef->il[F_ORIRES].nr)
3739     {
3740         enerd->term[F_ORIRESDEV] =
3741             calc_orires_dev(ms,idef->il[F_ORIRES].nr,
3742                             idef->il[F_ORIRES].iatoms,
3743                             idef->iparams,md,(const rvec*)x,
3744                             pbc_null,fcd,hist);
3745     }
3746     if (idef->il[F_DISRES].nr)
3747     {
3748         calc_disres_R_6(ms,idef->il[F_DISRES].nr,
3749                         idef->il[F_DISRES].iatoms,
3750                         idef->iparams,(const rvec*)x,pbc_null,
3751                         fcd,hist);
3752     }
3753
3754 #pragma omp parallel for num_threads(fr->nthreads) schedule(static)
3755     for(thread=0; thread<fr->nthreads; thread++)
3756     {
3757         int    ftype,nbonds,ind,nat1;
3758         real   *epot,v;
3759         /* thread stuff */
3760         rvec   *ft,*fshift;
3761         real   *dvdlt;
3762         gmx_grppairener_t *grpp;
3763         int    nb0,nbn;
3764
3765         if (thread == 0)
3766         {
3767             ft     = f;
3768             fshift = fr->fshift;
3769             epot   = enerd->term;
3770             grpp   = &enerd->grpp;
3771             dvdlt  = dvdl;
3772         }
3773         else
3774         {
3775             zero_thread_forces(&fr->f_t[thread],fr->natoms_force,
3776                                fr->red_nblock,1<<fr->red_ashift);
3777
3778             ft     = fr->f_t[thread].f;
3779             fshift = fr->f_t[thread].fshift;
3780             epot   = fr->f_t[thread].ener;
3781             grpp   = &fr->f_t[thread].grpp;
3782             dvdlt  = fr->f_t[thread].dvdl;
3783         }
3784         /* Loop over all bonded force types to calculate the bonded forces */
3785         for(ftype=0; (ftype<F_NRE); ftype++)
3786         {
3787             if (idef->il[ftype].nr > 0 &&
3788                 (interaction_function[ftype].flags & IF_BOND) &&
3789                 (ftype < F_GB12 || ftype > F_GB14) &&
3790                 !(ftype == F_CONNBONDS || ftype == F_POSRES))
3791             {
3792                 v = calc_one_bond(fplog,thread,ftype,idef,x, 
3793                                   ft,fshift,fr,pbc_null,g,enerd,grpp,
3794                                   nrnb,lambda,dvdlt,
3795                                   md,fcd,bCalcEnerVir,
3796                                   global_atom_index,bPrintSepPot);
3797                 epot[ftype]        += v;
3798             }
3799         }
3800     }
3801     if (fr->nthreads > 1)
3802     {
3803         reduce_thread_forces(fr->natoms_force,f,fr->fshift,
3804                              enerd->term,&enerd->grpp,dvdl,
3805                              fr->nthreads,fr->f_t,
3806                              fr->red_nblock,1<<fr->red_ashift,
3807                              bCalcEnerVir,
3808                              force_flags & GMX_FORCE_DHDL);
3809     }
3810
3811     /* Copy the sum of violations for the distance restraints from fcd */
3812     if (fcd)
3813     {
3814         enerd->term[F_DISRESVIOL] = fcd->disres.sumviol;
3815
3816     }
3817 }
3818
3819 void calc_bonds_lambda(FILE *fplog,
3820                        const t_idef *idef,
3821                        rvec x[],
3822                        t_forcerec *fr,
3823                        const t_pbc *pbc,const t_graph *g,
3824                        gmx_enerdata_t *enerd,t_nrnb *nrnb,
3825                        real *lambda,
3826                        const t_mdatoms *md,
3827                        t_fcdata *fcd,
3828                        int *global_atom_index)
3829 {
3830     int    i,ftype,nbonds_np,nbonds,ind,nat;
3831     real   v,dr,dr2,*epot;
3832     real   dvdl_dum[efptNR];
3833     rvec   *f,*fshift_orig;
3834     const  t_pbc *pbc_null;
3835     t_iatom *iatom_fe;
3836
3837     if (fr->bMolPBC)
3838     {
3839         pbc_null = pbc;
3840     }
3841     else
3842     {
3843         pbc_null = NULL;
3844     }
3845
3846     epot = enerd->term;
3847
3848     snew(f,fr->natoms_force);
3849     /* We want to preserve the fshift array in forcerec */
3850     fshift_orig = fr->fshift;
3851     snew(fr->fshift,SHIFTS);
3852
3853     /* Loop over all bonded force types to calculate the bonded forces */
3854     for(ftype=0; (ftype<F_NRE); ftype++) 
3855     {
3856         v = calc_one_bond_foreign(fplog,ftype,idef,x, 
3857                                   f,fr,pbc_null,g,enerd,nrnb,lambda,dvdl_dum,
3858                                   md,fcd,global_atom_index,FALSE);
3859         epot[ftype] += v;
3860     }
3861
3862     sfree(fr->fshift);
3863     fr->fshift = fshift_orig;
3864     sfree(f);
3865 }