Initial commit of generalized born electrostatics. Gromacs now supports the
authorlindahl <lindahl>
Fri, 30 Jan 2009 03:51:09 +0000 (03:51 +0000)
committerlindahl <lindahl>
Fri, 30 Jan 2009 03:51:09 +0000 (03:51 +0000)
Still, HCT, and OBC generalized born algorithms, and the ace approximation of surface area.
Merge from separate GB tree, so while normal Gromacs code should be fine there might be a
couple of new GB bugs due to recent changes in CVS.

61 files changed:
admin/Makefile.in
configure.ac
include/Makefile.am
include/bondf.h
include/force.h
include/genborn.h [new file with mode: 0644]
include/gpp_atomtype.h
include/grompp.h
include/mdrun.h
include/names.h
include/nonbonded.h
include/shellfc.h
include/types/atoms.h
include/types/enums.h
include/types/forcerec.h
include/types/idef.h
include/types/ifunc.h
include/types/inputrec.h
include/types/nrnb.h
share/html/online/mdp_opt.html
src/Makefile.in
src/gmxlib/Makefile.in
src/gmxlib/bondfree.c
src/gmxlib/ifunc.c
src/gmxlib/names.c
src/gmxlib/nonbonded/nb_kernel/mknb_declarations.c
src/gmxlib/nonbonded/nb_kernel/mknb_innerloop.c
src/gmxlib/nonbonded/nb_kernel/mknb_interactions.c
src/gmxlib/nonbonded/nb_kernel/mknb_outerloop.c
src/gmxlib/nonbonded/nb_kernel_ia32_sse/Makefile.am
src/gmxlib/nonbonded/nb_kernel_x86_64_sse/Makefile.am
src/gmxlib/nonbonded/nonbonded.c
src/gmxlib/nrnb.c
src/gmxlib/tpxio.c
src/gmxlib/txtdump.c
src/gmxlib/typedefs.c
src/kernel/convparm.c
src/kernel/gpp_atomtype.c
src/kernel/grompp.c
src/kernel/md.c
src/kernel/nm2type.c
src/kernel/readir.c
src/kernel/resall.c
src/kernel/topdirs.c
src/kernel/topio.c
src/kernel/topio.h
src/kernel/toppush.c
src/kernel/toppush.h
src/kernel/toputil.c
src/kernel/tpbcmp.c
src/mdlib/Makefile.am
src/mdlib/domdec.c
src/mdlib/force.c
src/mdlib/genborn.c [new file with mode: 0644]
src/mdlib/genborn_sse.c [new file with mode: 0644]
src/mdlib/genborn_sse.h [new file with mode: 0644]
src/mdlib/minimize.c
src/mdlib/ns.c
src/mdlib/shellfc.c
src/mdlib/sim_util.c
src/mdlib/tables.c

index ad7ee3a1adda24f6c5390c28773e3a0a12da417b..6b52b304f128afd58e5398843f4c52b7325c1872 100644 (file)
@@ -190,9 +190,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
              exit 1;; \
          esac; \
        done; \
-       echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu  admin/Makefile'; \
+       echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign  admin/Makefile'; \
        cd $(top_srcdir) && \
-         $(AUTOMAKE) --gnu  admin/Makefile
+         $(AUTOMAKE) --foreign  admin/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
        @case '$?' in \
index 946f67857d8a197099191c5c490e6c80b9510e6f..0cfdbee4694be4e830c051cb3623cc3803fd9953 100644 (file)
@@ -3,7 +3,7 @@
 #######################################################################
  
 AC_PREREQ(2.50)
-AC_INIT(gromacs, 4.0.99_development_20090120, [gmx-users@gromacs.org])
+AC_INIT(gromacs, 4.0.99_development_20090122, [gmx-users@gromacs.org])
 AC_CONFIG_SRCDIR(src/gmxlib/3dview.c)
 AC_CONFIG_AUX_DIR(config)
 AC_CANONICAL_HOST
index 480b6a397130ae966981176e787f0944146856a3..fe2789ec9f6e2f505cee689a8606f362233df6e7 100644 (file)
@@ -31,6 +31,7 @@ fftgrid.h \
 filenm.h \
 force.h \
 futil.h \
+genborn.h \
 gbutil.h \
 gmx_fatal.h \
 gmx_ana.h \
index 5ba892ceef6c62073b297089d53a39e33bb3dcab..7969be2539c7de2437bb6cd76d6dc2fd35b6f850 100644 (file)
@@ -50,6 +50,7 @@ extern "C" {
 #include "typedefs.h"
 #include "nrnb.h"
 #include "pbc.h"
+#include "genborn.h"
 
 extern int glatnr(int *global_atom_index,int i);
 /* Returns the global topology atom number belonging to local atom index i.
@@ -66,6 +67,7 @@ extern void calc_bonds(FILE *fplog,const gmx_multisim_t *ms,
                        gmx_enerdata_t *enerd,t_nrnb *nrnb,real lambda,
                       const t_mdatoms *md,
                       t_fcdata *fcd,int *ddgatindex,
+                      t_atomtypes *atype, gmx_genborn_t *born,
                       bool bPrintSepPot,int step);
 /* 
  * The function calc_bonds() calculates all bonded force interactions.
index 10cd02c24c56b1fee09a0281de1d6d50466e192c..6b738282e03ffa820a0aa1f2e9c487cf02ea8e2f 100644 (file)
@@ -47,6 +47,7 @@
 #include "network.h"
 #include "tgroup.h"
 #include "vsite.h"
+#include "genborn.h"
 
 static char *sepdvdlformat="  %-30s V %12.5e  dVdl %12.5e\n";
 
@@ -162,6 +163,7 @@ extern void do_force(FILE *log,t_commrec *cr,
                     t_inputrec *inputrec,
                     int step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
                     gmx_localtop_t *top,
+                    gmx_mtop_t *mtop,
                     gmx_groups_t *groups,
                     matrix box,rvec x[],history_t *hist,
                     rvec f[],rvec buf[],
@@ -171,6 +173,7 @@ extern void do_force(FILE *log,t_commrec *cr,
                     real lambda,t_graph *graph,
                     t_forcerec *fr,gmx_vsite_t *vsite,rvec mu_tot,
                     double t,FILE *field,gmx_edsam_t ed,
+                    gmx_genborn_t *born, bool bBornRadii,
                     int flags);
 /* Communicate coordinates (if parallel).
  * Do neighbor searching (if necessary).
@@ -216,6 +219,10 @@ extern void do_force_lowlevel(FILE         *fplog,
                              rvec         f[],    
                              gmx_enerdata_t *enerd,
                              t_fcdata     *fcd,
+                             gmx_mtop_t   *mtop,
+                             gmx_genborn_t *born,
+                             t_atomtypes  *atype,
+                             bool         bBornRadii,
                              matrix       box,
                              real         lambda,
                              t_graph      *graph,
diff --git a/include/genborn.h b/include/genborn.h
new file mode 100644 (file)
index 0000000..9e31b6f
--- /dev/null
@@ -0,0 +1,139 @@
+/*
+ * $Id$
+ * 
+ *                This source code is part of
+ * 
+ *                 G   R   O   M   A   C   S
+ * 
+ *          GROningen MAchine for Chemical Simulations
+ * 
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2008, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * 
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ * 
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ * 
+ * For more info, check our website at http://www.gromacs.org
+ * 
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+
+#ifndef _genborn_h
+#define _genborn_h
+
+#include "typedefs.h"
+#include "grompp.h"
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+
+typedef struct
+{
+  int nbonds;
+  int bond[10];
+} bonds_t;
+
+typedef struct
+{
+  real length[20000];
+  real angle[20000];
+} bl_t;
+
+/* Struct to hold all the information for GB 
+ * All these things are currently allocated in md.c
+ */
+typedef struct
+{
+       int nr;                  /* number of atoms, length of arrays below */
+       int n12;                 /* number of 1-2 (bond) interactions       */
+       int n13;                 /* number of 1-3 (angle) terms             */
+       int n14;                 /* number of 1-4 (torsion) terms           */
+       real  *gpol;             /* Atomic polarisation energies */
+       real  *bRad;             /* Atomic Born radii */
+       real  *vsolv;            /* Atomic solvation volumes */
+       int *vs;                 /* Array for vsites-exclusions */     
+       int nvs;                 /* Length of vs array         */
+       real es;                 /* Solvation energy and derivatives */
+       real *asurf;             /* Atomic surface area */
+       rvec *dasurf;            /* Surface area derivatives */
+       real as;                 /* Total surface area */
+       real *S_hct;             /* Overlap factors for HCT method */
+       real *drobc;             /* Parameters for OBC chain rule calculation */
+       real *param;             /* Precomputed factor raj*atype->S_hct for HCT/OBC */
+       real *log_table;         /* Table for logarithm lookup */
+       
+       real obc_alpha;          /* OBC parameters */
+       real obc_beta;           /* OBC parameters */
+       real obc_gamma;          /* OBC parameters */
+       real gb_doffset;         /* Dielectric offset for Still/HCT/OBC */
+       
+       real *work;              /* used for parallel summation */
+} 
+gmx_genborn_t;
+
+
+/* Initialise GB stuff */
+int init_gb(gmx_genborn_t **p_born,t_commrec *cr, t_forcerec *fr, t_inputrec *ir,
+                       gmx_mtop_t *mtop, rvec x[], real rgbradii, int gb_algorithm);
+
+
+/* Born radii calculations, both with and without SSE acceleration */
+int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir,int natoms, int nrfa, gmx_mtop_t *mtop,
+                               const t_atomtypes *atype, rvec x[], rvec f[],t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md);
+
+
+
+/* Bonded GB interactions */                                                           
+real gb_bonds_tab(int nbonds, real *x, real *f, real *charge, real *p_gbtabscale,
+                                 real *invsqrta, real *dvda, real *GBtab, const t_iatom forceatoms[],
+                                 real epsilon_r, real facel);
+
+
+
+void gb_pd_send(t_commrec *cr, real *send_data, int nr);
+
+
+/* Functions for setting up the F_GB list in grompp */
+int 
+init_gb_plist(t_params *p_list);
+
+int 
+convert_gb_params(t_idef *idef, t_functype ftype, int start, t_params *gb_plist, gmx_genborn_t *born);
+
+int 
+generate_gb_topology(gmx_mtop_t *mtop, t_params *plist, t_params *gb_plist, gmx_genborn_t *born);
+                                                
+
+
+
+/* Functions for calculating adjustments due to ie chain rule terms */
+real 
+calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_mtop_t *mtop, const t_atomtypes *atype, int nr, 
+                          rvec x[], rvec f[], t_forcerec *fr,const t_iatom forceatoms[],int gb_algorithm, bool bRad);
+
+
+int
+gb_nblist_siev(t_commrec *cr, int natoms, int gb_algorithm, real gbcut, rvec x[], t_forcerec *fr, t_ilist *il, int n14);
+
+
+#endif /* _genborn_h */
+
index 7b4a553261cd099d17d6b4f80231ce9f8684e782..c6282e2a10e0f78c3bbac4bcd364d494aaf58f17 100644 (file)
@@ -64,6 +64,8 @@ extern real get_atomtype_qB(int nt,t_atomtype at);
 extern real get_atomtype_radius(int nt,t_atomtype at);
 extern real get_atomtype_vol(int nt,t_atomtype at);
 extern real get_atomtype_surftens(int nt,t_atomtype at);
+extern real get_atomtype_gb_radius(int nt,t_atomtype at);
+extern real get_atomtype_S_hct(int nt,t_atomtype at);
 extern int get_atomtype_ptype(int nt,t_atomtype at);
 extern int get_atomtype_batype(int nt,t_atomtype at);
 extern int get_atomtype_atomnumber(int nt,t_atomtype at);
@@ -82,14 +84,21 @@ extern void done_atomtype(t_atomtype *at);
 extern int set_atomtype(int nt,t_atomtype at,t_symtab *tab,
                        t_atom *a,char *name,t_param *nb,
                        int bondatomtype,
-                       real radius,real vol,real surftens,int atomnumber);
+                       real radius,real vol,real surftens,int atomnumber,
+                       real gb_radius, real S_hct);
 /* Set the values of an existing atom type nt. Returns nt on success or
    NOTSET on error. */ 
 
+int
+set_atomtype_gbparam(t_atomtype at, int i,
+                    real radius,real vol,real surftens,
+                    real gb_radius, real S_hct);
+
 extern int add_atomtype(t_atomtype at,t_symtab *tab,
                        t_atom *a,char *name,t_param *nb,
                        int bondatomtype,
-                       real radius,real vol,real surftens,real atomnumber);
+                       real radius,real vol,real surftens,real atomnumber,
+                       real gb_radius, real S_hct);
 /* Add a complete new atom type to an existing atomtype structure. Returns
    the number of the atom type. */
 
index e5daa0c5fe79519a669dcec1318c05c4c116b19e..9323eebd692927942f337daa80b337869f4a688d 100644 (file)
@@ -122,6 +122,8 @@ typedef enum {
   d_angletypes,
   d_dihedraltypes,
   d_nonbond_params,
+  d_implicit_genborn_params,
+  d_implicit_surface_params,
   d_moleculetype,
   d_atoms,
   d_vsites2,
@@ -161,6 +163,8 @@ static char *ds[d_maxdir+1] = {
   "angletypes",
   "dihedraltypes",
   "nonbond_params",
+  "implicit_genborn_params",
+  "implicit_surface_params",
   /* All the directives above can not appear after moleculetype */
   "moleculetype",
   "atoms",
index 20178f7e9cfeb7069c60f0f6d681089bd48e23c3..5a56aa6c03f4bb5b73ff708fb1e2fb1664901219 100644 (file)
@@ -55,6 +55,7 @@
 #include "vsite.h"
 #include "pull.h"
 #include "update.h"
+#include "genborn.h"
 
 #define MD_GLAS         (1<<1)
 #define MD_POLARISE     (1<<2)
@@ -90,6 +91,7 @@ typedef time_t gmx_integrator_t(FILE *log,t_commrec *cr,
                                t_nrnb *nrnb,gmx_wallcycle_t wcycle,
                                gmx_edsam_t ed, 
                                t_forcerec *fr,
+                               gmx_genborn_t *born,
                                int repl_ex_nst,int repl_ex_seed,
                                real cpt_period,real max_hours,
                                unsigned long Flags,
index 2d085d168444f03813cfccc98457aa1947911ec2..50523c8fc58733b075ec9025b0ef7c74a01b5507 100644 (file)
@@ -76,6 +76,7 @@ extern const char *ecm_names[ecmNR+1];
 extern const char *eann_names[eannNR+1];
 extern const char *egb_names[egbNR+1];
 extern const char *eis_names[eisNR+1];
+extern const char *esa_names[esaNR+1];
 extern const char *ewt_names[ewtNR+1];
 extern const char *epull_names[epullNR+1];
 extern const char *epullg_names[epullgNR+1];
@@ -110,6 +111,7 @@ extern const char *eQMMMscheme_names[eQMMMschemeNR+1];
 #define ECOM(e)        ENUM_NAME(e,ecmNR,ecm_names)
 #define EANNEAL(e)      ENUM_NAME(e,eannNR,eann_names)
 #define EGBALGORITHM(e) ENUM_NAME(e,egbNR,egb_names)
+#define ESAALGORITHM(e) ENUM_NAME(e,esaNR,esa_names)
 #define EIMPLICITSOL(e) ENUM_NAME(e,eisNR,eis_names)
 #define EWALLTYPE(e)   ENUM_NAME(e,ewtNR,ewt_names)
 #define EPULLTYPE(e)   ENUM_NAME(e,epullNR,epull_names)
index f81cb31e5eeba5652183df2ebd71873acb3cd6ac..41a4d89279c217ba51cc6211d423e4c422d89424 100644 (file)
@@ -41,6 +41,7 @@
 #include "pbc.h"
 #include "network.h"
 #include "tgroup.h"
+#include "genborn.h"
 
 void
 gmx_setup_kernels(FILE *fplog);
index 3fbeddc39b249970bd013058bcad99db01d9fd13..7cbdad2d3d4ee9fe4152e531934f117dca43a4ef 100644 (file)
@@ -38,7 +38,8 @@
 #endif
 
 #include "typedefs.h"
+#include "genborn.h"
+
 /* Initialization function, also predicts the initial shell postions.
  * If x!=NULL, the shells are predict for the global coordinates x.
  */
@@ -54,7 +55,9 @@ extern void make_local_shells(t_commrec *cr,t_mdatoms *md,
 extern int relax_shell_flexcon(FILE *log,t_commrec *cr,bool bVerbose,
                               int mdstep,t_inputrec *inputrec,
                               bool bDoNS,bool bStopCM,
-                              gmx_localtop_t *top,gmx_constr_t constr,
+                              gmx_localtop_t *top,
+                              gmx_mtop_t *mtop,
+                              gmx_constr_t constr,
                               gmx_enerdata_t *enerd,t_fcdata *fcd,
                               t_state *state,rvec f[],
                               rvec buf[],tensor force_vir,
@@ -64,6 +67,7 @@ extern int relax_shell_flexcon(FILE *log,t_commrec *cr,bool bVerbose,
                               gmx_groups_t *groups,
                               gmx_shellfc_t shfc,
                               t_forcerec *fr,
+                              gmx_genborn_t *born, bool bBornRadii,
                               double t,rvec mu_tot,
                               int natoms,bool *bConverged,
                               gmx_vsite_t *vsite,
index 144f88c890654e4810e74ca111889ffc363ee735..e8ddf281cd9bbce71ee4f8ba3ed74ed1a16bff33 100644 (file)
@@ -88,10 +88,13 @@ typedef struct {
 } t_atoms;
 
 typedef struct {
-  int           nr;              /* number of atomtypes                     */
+  int           nr;             /* number of atomtypes                     */
   real         *radius;         /* GBSA radius for each atomtype        */
   real         *vol;            /* GBSA efective volume for each atomtype   */
   real         *surftens;       /* implicit solvent surftens for each atomtype */
+  real         *gb_radius;      /* GB radius for each atom type */
+  real         *S_hct;          /* Overlap factors for HCT/OBC GB models */
+
   int          *atomnumber;     /* Atomic number, used for QM/MM */
 } t_atomtypes;
 
index 857d87a499ea3675747c09b552e6d936be29b586..b4f283ca7c31f328805e53f508273c5b2ea2b7ab 100644 (file)
@@ -60,9 +60,14 @@ enum {
   erscNO, erscALL, erscCOM, erscNR
 };
 
+/*
+ * eelNOTUSED1 used to be GB, but to enable generalized born with different
+ * forms of electrostatics (RF, switch, etc.) in the future it is now selected
+ * separately (through the implicit_solvent option).
+ */
 enum {
   eelCUT,     eelRF,     eelGRF,   eelPME,  eelEWALD,  eelPPPM, 
-  eelPOISSON, eelSWITCH, eelSHIFT, eelUSER, eelGB, eelRF_NEC, eelENCADSHIFT, 
+  eelPOISSON, eelSWITCH, eelSHIFT, eelUSER, eelGB_NOTUSED, eelRF_NEC, eelENCADSHIFT, 
   eelPMEUSER, eelPMESWITCH, eelPMEUSERSWITCH, eelRF_ZERO, eelNR
 };
 
@@ -164,7 +169,11 @@ enum {
 
 /* Algorithms for calculating GB radii */
 enum { 
-  egbSTILL, egbOBC, egbNR 
+  egbSTILL, egbHCT, egbOBC, egbNR 
+};
+
+enum {
+  esaNO, esaAPPROX, esaSTILL, esaNR
 };
 
 /* Wall types */
index ba3c7a8b0f7b1549328f8141ff03b5bf16924206..63dbc751ce4750a129d627a2f2ec184cd3e14b9a 100644 (file)
@@ -237,13 +237,40 @@ typedef struct {
   /* xmdrun flexible constraints */
   real fc_stepsize;
 
+  /* Generalized born implicit solvent */
+  bool bGB;
   /* Generalized born stuff */
+  /* Table data for GB */
+  t_forcetable gbtab;
   /* VdW radius for each atomtype (dim is thus ntype) */
   real *atype_radius;
   /* Effective radius (derived from effective volume) for each type */
   real *atype_vol;
   /* Implicit solvent - surface tension for each atomtype */
   real *atype_surftens;
+  /* Implicit solvent - radius for GB calculation */
+  real *atype_gb_radius;
+  /* Implicit solvent - overlap for HCT model */
+  real *atype_S_hct;
+
+  /* Table scale for GB */
+  real gbtabscale;
+  /* Table range for GB */
+  real gbtabr;
+  /* GB neighborlists (the sr list will contain for each atom all other atoms                                            
+   * (for use in the SA calculation) and the lr list will contain                                                  
+   * for each atom all atoms 1-4 or greater (for use in the GB calculation)                                        
+   */
+  t_nblist gblist_sr;
+  t_nblist gblist_lr;
+  t_nblist gblist;
+
+  /* Inverse square root of the Born radii for implicit solvent */
+  real *invsqrta;
+  /* Derivatives of the potential with respect to the Born radii */
+  real *dvda;
+  /* Derivatives of the Born radii with respect to coordinates */
+  real *dadx;
 
   /* If > 0 signals Test Particle Insertion,
    * the value is the number of atoms of the molecule to insert
index e5363eeff5a4288f165f049aa00bae7a9f4dfc4b..feeeb79cbfce3e897812169d7b978d2b2f000822 100644 (file)
@@ -74,6 +74,7 @@ enum {
   F_IDIHS, 
   F_PIDIHS, 
   F_TABDIHS,
+  F_GB,
   F_LJ14,
   F_COUL14,
   F_LJC14_Q,
@@ -169,6 +170,7 @@ typedef union
   struct {real phi,dphi,kfac;int label,power;             } dihres;  
   struct {int  ex,power,label; real c,obs,kfac;           } orires;
   struct {int  table;real kA;real kB;                     } tab;
+  struct {real c6A,c12A,c6B,c12B,sar,st,pi,gbr,bmlt;      } gb;
   struct {real buf[MAXFORCEPARAM];                       } generic; /* Conversion */
 } t_iparams;
 
index a836af55dcacd9d8ff7a0d9a010a582ae2cbc1a8..b925524a8e39c8a9948b4ad6ce4cc04fa79296a4 100644 (file)
@@ -105,6 +105,12 @@ typedef struct
 #define IS_CHEMBOND(ftype) (interaction_function[(ftype)].nratoms==2 && interaction_function[(ftype)].flags & IF_CHEMBOND)
 /* IS_CHEMBOND tells if function type ftype represents a chemical bond */
 
+/* IS_ANGLE tells if a function type ftype represents an angle 
+ * Per Larsson, 2007-11-06
+ */
+#define IS_ANGLE(ftype) (interaction_function[(ftype)].nratoms==3 && interaction_function[(ftype)].flags & IF_ATYPE)
+#define IS_VSITE(ftype) (interaction_function[(ftype)].flags & IF_VSITE)
+
 #define IS_TABULATED(ftype) (interaction_function[(ftype)].flags & IF_TABULATED)
 
 extern const t_interaction_function interaction_function[F_NRE];
index cb4e49494d83d963ff0f04af80428550ec6c8eaa..5a19452600e88f2634348f4f32b9a9a6f543e78f 100644 (file)
@@ -190,13 +190,15 @@ typedef struct {
   real gb_obc_alpha;    /* 1st scaling factor for Bashford-Case GB      */
   real gb_obc_beta;     /* 2nd scaling factor for Bashford-Case GB      */
   real gb_obc_gamma;    /* 3rd scaling factor for Bashford-Case GB      */
-  real sa_surface_tension; /* Energy factor for SA part of GBSA */
+  real gb_dielectric_offset; /* Dielectric offset for Still/HCT/OBC     */
+  real sa_algorithm;    /* Algorithm for SA part of GBSA                */
+  int  sa_surface_tension; /* Energy factor for SA part of GBSA */
   int  vdwtype;         /* Type of Van der Waals treatment              */
   real rvdw_switch;     /* Van der Waals switch range start (nm)        */
-  real rvdw;               /* Van der Waals cutoff (nm)                        */
+  real rvdw;               /* Van der Waals cutoff (nm)                */
   int  eDispCorr;       /* Perform Long range dispersion corrections    */
   real tabext;          /* Extension of the table beyond the cut-off,   *
-                                    * as well as the table length for 1-4 interac. */
+                        * as well as the table length for 1-4 interac. */
   real shake_tol;      /* tolerance for shake                          */
   int  efep;                   /* free energy interpolation no/yes             */
   double init_lambda;  /* initial value for perturbation variable      */
index 41749109f1151d3e70876004a9bb56fe5b2ef8a2..aa1e557834f5725145b0041a357ea5264791afbf 100644 (file)
@@ -105,7 +105,7 @@ enum
     eNR_SHAKE_RIJ,            eNR_CONSTR_VIR,           eNR_SETTLE,
     eNR_VSITE2,               eNR_VSITE3,               eNR_VSITE3FD,
     eNR_VSITE3FAD,            eNR_VSITE3OUT,            eNR_VSITE4FD,
-    eNR_VSITE4FDN,            eNR_VSITEN,
+    eNR_VSITE4FDN,            eNR_VSITEN,               eNR_GB,
     eNRNB
 };
 
index 5b6bb8651c2ca83851773d73b380bfbbbfa7f0f5..c0f7720bdd35b187b919ad716c2ad69e15662573 100644 (file)
@@ -194,6 +194,9 @@ The distribution of insertion energies is written to the file specified with
 the <tt>-tpid</tt> option of <tt>mdrun</tt>.
 No trajectory or energy file is written.
 Parallel tpi gives identical results to single node tpi.
+For charged molecules, using PME with a fine grid is most accurate
+and also efficient, since the potential in the system only needs
+to be calculated once per frame.
 </dd>
 <dt><b>tpic</b></dt>
 <dd> Test particle insertion into a predefined cavity location.
index 7b2587b8d40d94d0f8a1afee0353b59e03e43295..4dcead5d43b16dbed569483746e465e15b5badfd 100644 (file)
@@ -205,9 +205,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
              exit 1;; \
          esac; \
        done; \
-       echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu  src/Makefile'; \
+       echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign  src/Makefile'; \
        cd $(top_srcdir) && \
-         $(AUTOMAKE) --gnu  src/Makefile
+         $(AUTOMAKE) --foreign  src/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
        @case '$?' in \
index 4a8e07decf074263b295097ef3f4191813d1857a..2ce15c3f6e2a6ed92141fd8d1f49dd56d7c2856f 100644 (file)
@@ -305,9 +305,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
              exit 1;; \
          esac; \
        done; \
-       echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu  src/gmxlib/Makefile'; \
+       echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign  src/gmxlib/Makefile'; \
        cd $(top_srcdir) && \
-         $(AUTOMAKE) --gnu  src/gmxlib/Makefile
+         $(AUTOMAKE) --foreign  src/gmxlib/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
        @case '$?' in \
index 6e0848ecca790a568e60f9a082815b301077f8eb..634bec670db1e678216f021976d9237a8e329fee 100644 (file)
@@ -1982,6 +1982,7 @@ void calc_bonds(FILE *fplog,const gmx_multisim_t *ms,
                real lambda,
                const t_mdatoms *md,
                t_fcdata *fcd,int *global_atom_index,
+               t_atomtypes *atype, gmx_genborn_t *born,
                bool bPrintSepPot,int step)
 {
   int    ftype,nbonds,ind,nat;
index 99f7976a143fac034e30b85a1e2293ffc332a5e1..d1338e7da9c7725664638755bda781ed9a01aa50 100644 (file)
@@ -44,6 +44,8 @@
 #include "disre.h"
 #include "dihre.h"
 #include "orires.h"
+#include "genborn.h"
+
 
 #define  def_bonded(str,lstr,nra,nrpa,nrpb,ind,func)\
    {str,lstr,(nra),(nrpa),(nrpb),IF_BOND,                        (ind),(func)}
@@ -109,6 +111,7 @@ const t_interaction_function interaction_function[F_NRE]=
   def_bonded  ("IDIHS",    "Improper Dih.",   4, 2, 2,  eNR_IMPROPER,idihs        ),
   def_bonded  ("PIDIHS",   "Improper Dih.",   4, 3, 3,  eNR_PROPER, pdihs         ),
   def_bondedt ("TABDIHS", "Tab. Dih.",        4, 2, 2,  eNR_TABDIHS, tab_dihs     ),
+  def_bonded  ("GB",  "Generalized Born",     2, 2, 2,  eNR_GB,     unimplemented ),
   def_bondedz ("LJ14",     "LJ-14",           2, 2, 2,  eNR_NB14,   unimplemented ),
   def_nofc    ("COUL14",   "Coulomb-14"                                           ),
   def_bondedz ("LJC14_Q",  "LJC-14 q",        2, 5, 0,  eNR_NB14,   unimplemented ),
index 582df4dcea8d260ebda6b4c2d6acd345debd9229..00077e8909b4a4d89dafc6f1f1122f8bb0ac5831 100644 (file)
@@ -162,7 +162,11 @@ const char *eis_names[eisNR+1] = {
 };
 
 const char *egb_names[egbNR+1] = {
-  "Still", "OBC", NULL
+  "Still", "HCT", "OBC", NULL
+};
+
+const char *esa_names[esaNR+1] = {
+  "No", "Ace-approximation", "Still", NULL
 };
 
 const char *ewt_names[ewtNR+1] = {
index 2e2f48aaeb4bc5481b9ed06b354497250f03b1b8..16c02b702009769f1a0aebdf4279762a311facbd 100644 (file)
@@ -228,9 +228,9 @@ mknb_declare_variables()
        if(mknb_func.coul==MKNB_COUL_RF)
                mknb_declare_real("krsq");
        if(mknb_func.coul==MKNB_COUL_GB) {
-               mknb_declare_real("isai,isaj,isaprod,gbscale");
+               mknb_declare_real("isai,isaj,isaprod,gbscale,vgb");
                if(mknb_func.do_force) 
-                       mknb_declare_real("dvdasum,dvdatmp,dvdaj");
+                       mknb_declare_real("dvdasum,dvdatmp,dvdaj,fgb");
        }
     
        if(mknb_func.vdw==MKNB_VDW_BHAM) 
index ec7f7bf38e8e85ba3357d746b3d2368d57c04088..55d02784d6d19db61a055a1539aef2625d13f001 100644 (file)
@@ -210,8 +210,16 @@ mknb_load_inner_parameters(int i,int j)
                                /* Generalized born: load 1/sqrt(a) */
                                mknb_assign("isaj",mknb_array("invsqrta","jnr"));
                                mknb_assign("isaprod","isai*isaj");
+                               
+                               /* GB-PL */
+                               mknb_assign("qq","iq*%s",mknb_array("charge","jnr"));
+                               mknb_assign("vcoul","qq*rinv%d%d",i,j);
+                               
+                               if(mknb_func.do_force)
+                                       mknb_assign("fscal","vcoul*rinv%d%d",i,j);
+                               
                                /* Save a flop by multiplying qq with isa1*isa2 already here */
-                               mknb_assign("qq","isaprod*iq*%s",mknb_array("charge","jnr"));
+                               mknb_assign("qq","isaprod*(-qq)");
                                mknb_assign("gbscale","isaprod*gbtabscale");
                                nflops+=4;
                        } else {
index fd082bb95e920358b6eba23f4667a6a674a16104..c921a6f81426f39314d7a800a14e4e31424b2f61 100644 (file)
@@ -198,19 +198,20 @@ mknb_coul_gb(char *rsq, char *rinv)
   nflops += mknb_calc_gbtable_index("r");
  
   nflops += mknb_read_table("GBtab");
-  mknb_assign("vcoul","qq*VV");
+  mknb_assign("vgb","qq*VV");
   nflops++;
   if(mknb_func.do_force) {
     mknb_assign("fijC","qq*FF*gbscale");
-    mknb_assign("dvdatmp","vcoul+fijC*r");
-    mknb_assign("dvdasum","dvdasum - dvdatmp");
-    nflops+=5;
+    mknb_assign("dvdatmp","-0.5*(vgb+fijC*r)");
+    mknb_assign("dvdasum","dvdasum + dvdatmp");
+    nflops+=6;
     /* fs_minusrinv is empty */
-    sprintf(fs_minus_rinv,"fijC");
+    sprintf(fs_minus_rinv,"fijC-fscal");
     /* Update j atom dvda */
-    mknb_assign(mknb_array("dvda","jnr"),"dvdaj-dvdatmp");
+    mknb_assign(mknb_array("dvda","jnr"),"dvdaj+dvdatmp*isaj*isaj");
   }
-  mknb_assign("vctot","vctot + vcoul");
+  /* This will only give thw Coulomb part back to the total potential */
+  mknb_assign("vctot","vctot + vcoul"); 
   nflops++;
   
   return nflops;
@@ -445,17 +446,18 @@ mknb_calculate_interaction(char *rsq, char *rinv)
     }
     strcpy(fs_minus_rinv,tmp);
   }
+  
   if(strlen(fs_rinvsq)>0 && strlen(fs_minus_rinv)>0) {
-    mknb_assign("fscal","(%s)*rinvsq-(%s)*%s",fs_rinvsq,fs_minus_rinv,rinv);
-    nflops += 3;
+         mknb_assign("fscal","(%s)*rinvsq-(%s)*%s",fs_rinvsq,fs_minus_rinv,rinv);
+         nflops += 3;
   } else if(strlen(fs_rinvsq)>0) {
-    mknb_assign("fscal","(%s)*rinvsq",fs_rinvsq);
-    nflops++;
+         mknb_assign("fscal","(%s)*rinvsq",fs_rinvsq);
+         nflops++;
   } else if(strlen(fs_minus_rinv)>0) {
-    mknb_assign("fscal","-(%s)*%s",fs_minus_rinv,rinv);
-    nflops += 2;
+         mknb_assign("fscal","-(%s)*%s",fs_minus_rinv,rinv);
+         nflops += 2;
   }
-  
+       
   return nflops;
 }
 
index 9b4c3e181ce0551e56fb44c98465284ea07b3c2d..5f3321ead72e0ac45795bfc2092eb9b1ce413696 100644 (file)
@@ -233,7 +233,7 @@ mknb_update_outer_potential()
         */
        if(mknb_func.coul==MKNB_COUL_GB && mknb_func.do_force) {
                mknb_assign(mknb_array("dvda","ii"),
-                                       "%s + dvdasum",
+                                       "%s + dvdasum*isai*isai",
                                        mknb_array("dvda","ii"));
                nflops++;
        }
index ca63759e7ea8eef45ad0c2a686d3bcf771307370..80b57c239580c1e57a6ac2d2a0f32d0151782067 100644 (file)
@@ -51,9 +51,9 @@ libnb_kernel_ia32_sse_la_SOURCES = \
        nb_kernel332_ia32_sse.s         nb_kernel332_ia32_sse.h         \
        nb_kernel333_ia32_sse.s         nb_kernel333_ia32_sse.h         \
        nb_kernel334_ia32_sse.s         nb_kernel334_ia32_sse.h         \
-       nb_kernel400_ia32_sse.s         nb_kernel400_ia32_sse.h         \
-       nb_kernel410_ia32_sse.s         nb_kernel410_ia32_sse.h         \
-       nb_kernel430_ia32_sse.s         nb_kernel430_ia32_sse.h         \
+       nb_kernel400_ia32_sse.c         nb_kernel400_ia32_sse.h         \
+       nb_kernel410_ia32_sse.c         nb_kernel410_ia32_sse.h         \
+       nb_kernel430_ia32_sse.c         nb_kernel430_ia32_sse.h         \
        nb_kernel_ia32_sse_test_asm.s   nb_kernel_ia32_sse_test_asm.h   \
        nb_kernel_ia32_sse.c            nb_kernel_ia32_sse.h    
 
index f6b487de388dde2d9837c82c8ca84a54dd064044..f7b989b3442dfd0a662d9f12b401c05cd83314b6 100644 (file)
@@ -51,9 +51,9 @@ libnb_kernel_x86_64_sse_la_SOURCES = \
        nb_kernel332_x86_64_sse.s       nb_kernel332_x86_64_sse.h       \
        nb_kernel333_x86_64_sse.s       nb_kernel333_x86_64_sse.h       \
        nb_kernel334_x86_64_sse.s       nb_kernel334_x86_64_sse.h       \
-       nb_kernel400_x86_64_sse.s       nb_kernel400_x86_64_sse.h       \
-       nb_kernel410_x86_64_sse.s       nb_kernel410_x86_64_sse.h       \
-       nb_kernel430_x86_64_sse.s       nb_kernel430_x86_64_sse.h       \
+       nb_kernel400_x86_64_sse.c       nb_kernel400_x86_64_sse.h       \
+       nb_kernel410_x86_64_sse.c       nb_kernel410_x86_64_sse.h       \
+       nb_kernel430_x86_64_sse.c       nb_kernel430_x86_64_sse.h       \
        nb_kernel_x86_64_sse_test_asm.s nb_kernel_x86_64_sse_test_asm.h \
        nb_kernel_x86_64_sse.c          nb_kernel_x86_64_sse.h  
 
index 829211d9d413d68cf3d831a25fcef94dc2620cf4..afd5bd23c5cfebd221f6fbb6f367dcdb8c37dc0c 100644 (file)
@@ -437,7 +437,7 @@ void do_nonbonded(t_commrec *cr,t_forcerec *fr,
                                              f[0],
                                              fshift,
                                              egcoul,
-                                                 egnb,
+                                             egnb,
                                              nblists->tab.scale,
                                              tabledata,
                                              &outeriter,
@@ -495,10 +495,10 @@ void do_nonbonded(t_commrec *cr,t_forcerec *fr,
                                       egnb,
                                       &(nblists->tab.scale),
                                       tabledata,
-                                      NULL,
-                                      NULL,
-                                      NULL,
-                                      NULL,
+                                      fr->invsqrta,
+                                      fr->dvda,
+                                      &(fr->gbtabscale),
+                                      fr->gbtab.tab,
                                       &nthreads,
                                       &(nlist->count),
                                       nlist->mtx,
index 689088783da7471eb074c54597d102490dfbafc9..b74403bd1244f08b64736411324d89b934a15e64 100644 (file)
@@ -246,7 +246,8 @@ static const t_nrnb_data nbdata[eNRNB] = {
     { "Virtual Site 3out",             87  },
     { "Virtual Site 4fd",              110 }, 
     { "Virtual Site 4fdn",             254 }, 
-    { "Virtual Site N",                 15 } 
+    { "Virtual Site N",                 15 },
+    { "Mixed Generalized Born stuff",   10 } 
 };
 
 
index 4ab9d902bc484b79cd0d57ec6319378ceefb4f3f..39409aaf06cd4191c42288fec73bc4a900b8c39a 100644 (file)
@@ -62,7 +62,7 @@
 #include "mtop_util.h"
 
 /* This number should be increased whenever the file format changes! */
-static const int tpx_version = 59;
+static const int tpx_version = 60;
 
 /* This number should only be increased when you edit the TOPOLOGY section
  * of the tpx format. This way we can maintain forward compatibility too
@@ -73,7 +73,7 @@ static const int tpx_version = 59;
  * to the end of the tpx file, so we can just skip it if we only
  * want the topology.
  */
-static const int tpx_generation = 17;
+static const int tpx_generation = 18;
 
 /* This number should be the most recent backwards incompatible version 
  * I.e., if this number is 9, we cannot read tpx version 9 with this code.
@@ -131,6 +131,7 @@ static const t_ftupd ftupd[] = {
   { 26, F_FOURDIHS          },
   { 26, F_PIDIHS            },
   { 43, F_TABDIHS           },
+  { 60, F_GB                }, 
   { 41, F_LJC14_Q           },
   { 41, F_LJC_PAIRS_NB      },
   { 32, F_BHAM_LR           },
@@ -386,6 +387,16 @@ static void do_inputrec(t_inputrec *ir,bool bRead, int file_version,
                do_real(ir->gb_obc_alpha);
                do_real(ir->gb_obc_beta);
                do_real(ir->gb_obc_gamma);
+               if(file_version>=60)
+               {
+                       do_real(ir->gb_dielectric_offset);
+                       do_int(ir->sa_algorithm);
+               }
+               else
+               {
+                       ir->gb_dielectric_offset = 0.09;
+                       ir->sa_algorithm = esaAPPROX;
+               }
                do_real(ir->sa_surface_tension);
        }
        else
@@ -987,6 +998,17 @@ void do_iparams(t_functype ftype,t_iparams *iparams,bool bRead, int file_version
     do_int (iparams->vsiten.n);
     do_real(iparams->vsiten.a);
     break;
+  case F_GB:
+    do_real(iparams->gb.c6A);  
+       do_real(iparams->gb.c12A);      
+       do_real(iparams->gb.c6B);       
+       do_real(iparams->gb.c12B);      
+       do_real(iparams->gb.sar);       
+       do_real(iparams->gb.st);
+       do_real(iparams->gb.pi);
+       do_real(iparams->gb.gbr);
+       do_real(iparams->gb.bmlt);
+       break;
   default:
     gmx_fatal(FARGS,"unknown function type %d (%s) in %s line %d",
                
@@ -1326,6 +1348,8 @@ static void do_atomtypes(t_atomtypes *atomtypes,bool bRead,
       snew(atomtypes->vol,j);
       snew(atomtypes->surftens,j);
       snew(atomtypes->atomnumber,j);
+      snew(atomtypes->gb_radius,j);
+      snew(atomtypes->S_hct,j);
     }
     ndo_real(atomtypes->radius,j,bDum);
     ndo_real(atomtypes->vol,j,bDum);
@@ -1334,6 +1358,11 @@ static void do_atomtypes(t_atomtypes *atomtypes,bool bRead,
     {
         ndo_int(atomtypes->atomnumber,j,bDum);
     }
+       if(file_version >= 60)
+       {
+               ndo_real(atomtypes->gb_radius,j,bDum);
+               ndo_real(atomtypes->S_hct,j,bDum);
+       }
   } else {
     /* File versions prior to 26 cannot do GBSA, 
      * so they dont use this structure 
@@ -1343,6 +1372,8 @@ static void do_atomtypes(t_atomtypes *atomtypes,bool bRead,
     atomtypes->vol = NULL;
     atomtypes->surftens = NULL;
     atomtypes->atomnumber = NULL;
+    atomtypes->gb_radius = NULL;
+    atomtypes->S_hct = NULL;
   }  
 }
 
index 606147aa68bd9018ef7acdb5f2016c4f5d69d0c4..b45282b1aaa3db6d78aea9701a0904befa2487d7 100644 (file)
@@ -587,6 +587,8 @@ void pr_inputrec(FILE *fp,int indent,const char *title,t_inputrec *ir,
     PR("gb_obc_alpha",ir->gb_obc_alpha);
     PR("gb_obc_beta",ir->gb_obc_beta);
     PR("gb_obc_gamma",ir->gb_obc_gamma);
+    PR("gb_dielectric_offset",ir->gb_dielectric_offset);
+    PS("sa_algorithm",ESAALGORITHM(ir->gb_algorithm));
     PR("sa_surface_tension",ir->sa_surface_tension);
          
     PS("DispCorr",EDISPCORR(ir->eDispCorr));
@@ -856,6 +858,9 @@ void pr_iparams(FILE *fp,t_functype ftype,t_iparams *iparams)
   case F_VSITEN:
     fprintf(fp,"n=%2d, a=%15.8e\n",iparams->vsiten.n,iparams->vsiten.a);
     break;
+  case F_GB:
+    fprintf(fp, "c6A=%15.8e, c12A=%15.8e, c6B=%15.8e, c12B=%15.8e, sar=%15.8e, st=%15.8e, pi=%15.8e, gbr=%15.8e, bmlt=%15.8e\n",iparams->gb.c6A,iparams->gb.c12A,iparams->gb.c6B,iparams->gb.c12B,iparams->gb.sar,iparams->gb.st,iparams->gb.pi,iparams->gb.gbr,iparams->gb.bmlt);
+    break;               
   default:
     gmx_fatal(FARGS,"unknown function type %d (%s) in %s line %d",
              ftype,interaction_function[ftype].name,__FILE__,__LINE__);
@@ -1220,10 +1225,11 @@ void pr_atomtypes(FILE *fp,int indent,const char *title,t_atomtypes *atomtypes,
     indent=pr_title(fp,indent,title);
     for(i=0;i<atomtypes->nr;i++) {
       pr_indent(fp,indent);
-         fprintf(fp,
-              "atomtype[%3d]={radius=%12.5e, volume=%12.5e, surftens=%12.5e, atomnumber=%4d)}\n",
-              bShowNumbers?i:-1,atomtypes->radius[i],atomtypes->vol[i],
-              atomtypes->surftens[i],atomtypes->atomnumber[i]);
+               fprintf(fp,
+                               "atomtype[%3d]={radius=%12.5e, volume=%12.5e, gb_radius=%12.5e, surftens=%12.5e, atomnumber=%4d, S_hct=%12.5e)}\n",
+                               bShowNumbers?i:-1,atomtypes->radius[i],atomtypes->vol[i],
+                               atomtypes->gb_radius[i],
+                               atomtypes->surftens[i],atomtypes->atomnumber[i],atomtypes->S_hct[i]);
     }
   }
 }
index 4994239d36c913bda6bf8e18dae7b48c1dd98c45..b86de790deb5c1788c07e22a174840d3574cdcf7 100644 (file)
@@ -102,6 +102,8 @@ void init_atomtypes(t_atomtypes *at)
   at->radius = NULL;
   at->vol = NULL;
   at->atomnumber = NULL;
+  at->gb_radius = NULL;
+  at->S_hct = NULL;
 }
 
 void init_groups(gmx_groups_t *groups)
@@ -231,6 +233,16 @@ void done_atom (t_atoms *at)
   sfree(at->atomname);
 }
 
+void done_atomtypes(t_atomtypes *atype)
+{
+  atype->nr = 0;
+  sfree(atype->radius);
+  sfree(atype->vol);
+  sfree(atype->surftens);
+  sfree(atype->gb_radius);
+  sfree(atype->S_hct);
+}
+
 void done_moltype(gmx_moltype_t *molt)
 {
   int f;
@@ -284,6 +296,10 @@ void done_top(t_topology *top)
   int i;
   
   done_atom (&(top->atoms));
+
+  /* For GB */
+  done_atomtypes(&(top->atomtypes));
+
   done_symtab(&(top->symtab));
   done_block(&(top->cgs));
   done_block(&(top->mols));
index 348fa7ea536a9ee97a0ab0b6431a885c8fa5cc1e..973bb126274741bd691389ab091da19709dd5648 100644 (file)
@@ -321,6 +321,12 @@ static void assign_param(t_functype ftype,t_iparams *new,
     new->vsiten.n = round_check(old[0],1,ftype,"number of atoms");
     new->vsiten.a = old[1];
     break;
+    /* 
+    case F_GB:
+      new->gb.st=old[0];
+      printf("new->gb.st=%g\n",new->gb.st);
+    break;
+    */
   default:
     gmx_fatal(FARGS,"unknown function type %d in %s line %d",
                ftype,__FILE__,__LINE__);
index cfbfcd395bb1917803710e1cf0c42a0dc53967e1..bac7eb188776f9cdde7f96272bfe6e684fe8f425 100644 (file)
@@ -60,6 +60,8 @@ typedef struct {
   real          *radius;        /* Radius for GBSA stuff                */
   real          *vol;           /* Effective volume for GBSA            */
   real          *surftens;      /* Surface tension with water, for GBSA */
+  real          *gb_radius;     /* Radius for Still model               */
+  real          *S_hct;         /* Overlap factor for HCT model         */
   int           *atomnumber;    /* Atomic number, used for QM/MM        */
 } gpp_atomtype;
 
@@ -193,6 +195,26 @@ real get_atomtype_surftens(int nt,t_atomtype at)
   return ga->surftens[nt];
 }
 
+real get_atomtype_gb_radius(int nt,t_atomtype at)
+{
+  gpp_atomtype *ga = (gpp_atomtype *) at;
+
+  if ((nt < 0) || (nt >= ga->nr))
+    return NOTSET;
+
+  return ga->gb_radius[nt];
+}
+
+real get_atomtype_S_hct(int nt,t_atomtype at)
+{
+  gpp_atomtype *ga = (gpp_atomtype *) at;
+
+  if ((nt < 0) || (nt >= ga->nr))
+    return NOTSET;
+
+  return ga->S_hct[nt];
+}
+
 real get_atomtype_nbparam(int nt,int param,t_atomtype at)
 {
   gpp_atomtype *ga = (gpp_atomtype *) at;
@@ -209,14 +231,47 @@ t_atomtype init_atomtype(void)
   gpp_atomtype *ga;
   
   snew(ga,1);
+
+  ga->nr           = 0;
+  ga->atom         = NULL;
+  ga->atomname     = NULL;
+  ga->nb           = NULL;
+  ga->bondatomtype = NULL;
+  ga->radius       = NULL;
+  ga->vol          = NULL;
+  ga->surftens     = NULL;
+  ga->atomnumber   = NULL;
+  ga->gb_radius    = NULL;
+  ga->S_hct        = NULL;
   
   return (t_atomtype ) ga;
 }
 
+int
+set_atomtype_gbparam(t_atomtype at, int i,
+                     real radius,real vol,real surftens,
+                     real gb_radius, real S_hct)
+{
+  gpp_atomtype *ga = (gpp_atomtype *) at;
+
+  if ( (i < 0) || (i>= ga->nr))
+    return NOTSET;
+
+  ga->radius[i]    = radius;
+  ga->vol[i]       = vol;
+  ga->surftens[i]  = surftens;
+  ga->gb_radius[i] = gb_radius;
+  ga->S_hct[i]     = S_hct;
+
+  return i;
+}
+
+
 int set_atomtype(int nt,t_atomtype at,t_symtab *tab,
                 t_atom *a,char *name,t_param *nb,
                 int bondatomtype,
-                real radius,real vol,real surftens,int atomnumber)
+                real radius,real vol,real surftens,int atomnumber,
+                 real gb_radius, real S_hct)
 {
   gpp_atomtype *ga = (gpp_atomtype *) at;
   
@@ -231,14 +286,17 @@ int set_atomtype(int nt,t_atomtype at,t_symtab *tab,
   ga->vol[nt] = vol;
   ga->surftens[nt] = surftens;
   ga->atomnumber[nt] = atomnumber;
-  
+  ga->gb_radius[nt] = gb_radius;
+  ga->S_hct[nt]     = S_hct;
+
   return nt;
 }
 
 int add_atomtype(t_atomtype at,t_symtab *tab,
                 t_atom *a,char *name,t_param *nb,
                 int bondatomtype,
-                real radius,real vol,real surftens,real atomnumber)
+                real radius,real vol,real surftens,real atomnumber,
+                real gb_radius, real S_hct)
 {
   gpp_atomtype *ga = (gpp_atomtype *) at;
   
@@ -251,9 +309,11 @@ int add_atomtype(t_atomtype at,t_symtab *tab,
   srenew(ga->vol,ga->nr);
   srenew(ga->surftens,ga->nr);
   srenew(ga->atomnumber,ga->nr);
-  
+  srenew(ga->gb_radius,ga->nr);
+  srenew(ga->S_hct,ga->nr);
+
   return set_atomtype(ga->nr-1,at,tab,a,name,nb,bondatomtype,radius,
-                     vol,surftens,atomnumber);
+                     vol,surftens,atomnumber,gb_radius,S_hct);
 }
 
 void print_at (FILE * out, t_atomtype at)
@@ -285,6 +345,8 @@ void done_atomtype(t_atomtype *at)
   sfree(ga->bondatomtype);
   sfree(ga->radius);
   sfree(ga->vol);
+  sfree(ga->gb_radius);
+  sfree(ga->S_hct);
   sfree(ga->surftens);
   sfree(ga->atomnumber);
   ga->nr = 0;
@@ -328,7 +390,9 @@ static int search_atomtypes(t_atomtype at,int *n,int typelist[],
        (get_atomtype_radius(tli,at) == get_atomtype_radius(thistype,at)) &&
        (get_atomtype_vol(tli,at) == get_atomtype_vol(thistype,at)) &&
        (get_atomtype_surftens(tli,at) == get_atomtype_surftens(thistype,at)) &&
-       (get_atomtype_atomnumber(tli,at) == get_atomtype_atomnumber(thistype,at));
+       (get_atomtype_atomnumber(tli,at) == get_atomtype_atomnumber(thistype,at)) &&
+        (get_atomtype_gb_radius(tli,at) == get_atomtype_gb_radius(thistype,at)) &&
+        (get_atomtype_S_hct(tli,at) == get_atomtype_S_hct(thistype,at));
     }
     if (bFound)
     {
@@ -362,6 +426,8 @@ void renum_atype(t_params plist[],gmx_mtop_t *mtop,
   real     *new_radius;
   real     *new_vol;
   real     *new_surftens;
+  real     *new_gb_radius;
+  real     *new_S_hct;
   int      *new_atomnumber;
   
   ntype = get_atomtype_ntypes(at);
@@ -415,6 +481,8 @@ void renum_atype(t_params plist[],gmx_mtop_t *mtop,
   snew(new_vol,nat);
   snew(new_surftens,nat);
   snew(new_atomnumber,nat);  
+  snew(new_gb_radius,nat);
+  snew(new_S_hct,nat);
 
   /* We now have a list of unique atomtypes in typelist */
 
@@ -442,6 +510,8 @@ void renum_atype(t_params plist[],gmx_mtop_t *mtop,
     new_vol[i]        = get_atomtype_vol(mi,at);
     new_surftens[i]   = get_atomtype_surftens(mi,at);
     new_atomnumber[i] = get_atomtype_atomnumber(mi,at);
+    new_gb_radius[i]  = get_atomtype_gb_radius(mi,at);
+    new_S_hct[i]      = get_atomtype_S_hct(mi,at);
   }
   
   for(i=0; (i<nat*nat); i++) {
@@ -455,12 +525,16 @@ void renum_atype(t_params plist[],gmx_mtop_t *mtop,
   sfree(ga->vol);
   sfree(ga->surftens);
   sfree(ga->atomnumber);
+  sfree(ga->gb_radius);
+  sfree(ga->S_hct);
   
   ga->radius     = new_radius;
   ga->vol        = new_vol;
   ga->surftens   = new_surftens;
   ga->atomnumber = new_atomnumber;
-  
+  ga->gb_radius  = new_gb_radius;
+  ga->S_hct      = new_S_hct;
+
   ga->nr=nat;
 
   sfree(nbsnew);
@@ -479,13 +553,16 @@ void copy_atomtype_atomtypes(t_atomtype atype,t_atomtypes *atomtypes)
   snew(atomtypes->vol,ntype);
   snew(atomtypes->surftens,ntype);
   snew(atomtypes->atomnumber,ntype);
-
+  snew(atomtypes->gb_radius,ntype);
+  snew(atomtypes->S_hct,ntype);
 
   for(i=0; i<ntype; i++) {
     atomtypes->radius[i]     = ga->radius[i];
     atomtypes->vol[i]        = ga->vol[i];
     atomtypes->surftens[i]   = ga->surftens[i];
     atomtypes->atomnumber[i] = ga->atomnumber[i];
+    atomtypes->gb_radius[i]  = ga->gb_radius[i];
+    atomtypes->S_hct[i]      = ga->S_hct[i];
   }
 }
 
index 4587e86dd25d0a7597ce55287317303f1f36cd86..79ad51f662d73efae9c24f8709061a1cace9d0a2 100644 (file)
@@ -78,6 +78,7 @@
 #include "gpp_atomtype.h"
 #include "gpp_tomorse.h"
 #include "mtop_util.h"
+#include "genborn.h"
 
 static int rm_interactions(int ifunc,int nrmols,t_molinfo mols[])
 {
@@ -286,14 +287,19 @@ new_status(char *topfile,char *topppfile,char *confin,
   t_atoms     *confat;
   int         mb,mbs,i,nrmols,nmismatch;
   char        buf[STRLEN];
+  bool        bGB=FALSE;
 
   init_mtop(sys);
+
+  /* Set boolean for GB */
+  if(ir->implicit_solvent)
+    bGB=TRUE;
   
   /* TOPOLOGY processing */
   sys->name = do_top(bVerbose,topfile,topppfile,opts,bZero,&(sys->symtab),
                     plist,comb,reppow,fudgeQQ,
                     atype,&nrmols,&molinfo,ir,
-                    &nmolblock,&molblock);
+                    &nmolblock,&molblock,bGB);
   
   sys->nmolblock = 0;
   snew(sys->molblock,nmolblock);
@@ -774,9 +780,11 @@ int main (int argc, char *argv[])
   char         fn[STRLEN],fnB[STRLEN],*mdparin;
   int          nerror,ntype;
   bool         bNeedVel,bGenVel;
-  bool         have_radius,have_vol,have_surftens;
+  bool         have_radius,have_vol,have_surftens,have_gb_radius,have_S_hct;
   bool         have_atomnumber;
-  
+  t_params     *gb_plist = NULL;
+  gmx_genborn_t *born = NULL;
+
   t_filenm fnm[] = {
     { efMDP, NULL,  NULL,        ffOPTRD },
     { efMDP, "-po", "mdout",     ffWRITE },
@@ -877,18 +885,35 @@ int main (int argc, char *argv[])
     }
   }
 
-  /* If we are doing GBSA, check that we got the parameters we need */
-  have_radius=have_vol=have_surftens=TRUE;
-  ntype = get_atomtype_ntypes(atype);
-  for(i=0; (i<ntype); i++) {
-    have_radius   = have_radius && (get_atomtype_radius(i,atype) > 0);
-    have_vol      = have_vol && (get_atomtype_vol(i,atype) > 0);
-    have_surftens = have_surftens && (get_atomtype_surftens(i,atype) >= 0);
+  /* If we are doing GBSA, check that we got the parameters we need                                                            
+   * This checking is to see if there are GBSA paratmeters for all                                                             
+   * atoms in the force field. To go around this for testing purposes                                                          
+   * comment out the nerror++ counter temporarliy                                                                              
+   */
+  have_radius=have_vol=have_surftens=have_gb_radius=have_S_hct=TRUE;
+  for(i=0;i<get_atomtype_ntypes(atype);i++) {
+    have_radius=have_radius       && (get_atomtype_radius(i,atype) > 0);
+    have_vol=have_vol             && (get_atomtype_vol(i,atype) > 0);
+    have_surftens=have_surftens   && (get_atomtype_surftens(i,atype) > 0);
+    have_gb_radius=have_gb_radius && (get_atomtype_gb_radius(i,atype) > 0);
+    have_S_hct=have_S_hct         && (get_atomtype_S_hct(i,atype) > 0);
+  }
+  if(!have_radius && ir->implicit_solvent==eisGBSA) {
+    fprintf(stderr,"Can't do GB electrostatics; the forcefield is missing values for\n"
+           "atomtype radii, or they might be zero\n.");
+    /* nerror++; */
+  }
+  /*
+  if(!have_surftens && ir->implicit_solvent!=eisNO) {
+    fprintf(stderr,"Can't do implicit solvent; the forcefield is missing values\n"
+           " for atomtype surface tension\n.");
+    nerror++;                                                                                                                
   }
+  */
   
   /* If we are doing QM/MM, check that we got the atom numbers */
   have_atomnumber = TRUE;
-  for (i=0; i<ntype; i++) {
+  for (i=0; i<get_atomtype_ntypes(atype); i++) {
     have_atomnumber = have_atomnumber && (get_atomtype_atomnumber(i,atype) >= 0);
   }
   if (!have_atomnumber && ir->bQMMM)
@@ -930,6 +955,16 @@ int main (int argc, char *argv[])
               ir->refcoord_scaling,ir->ePBC,
               ir->posres_com,ir->posres_comB);
   }
+
+  if(ir->implicit_solvent)
+  {
+      printf("Constructing Generalized Born topology...\n");
+      snew(gb_plist,1);
+      init_gb_plist(gb_plist);
+      snew(born,1);
+      generate_gb_topology(sys,plist,gb_plist,born);
+  }
+
   
   nvsite = 0;
   /* set parameters for virtual site construction (not for vsiten) */
@@ -959,11 +994,22 @@ int main (int argc, char *argv[])
 
   if (bVerbose) 
     fprintf(stderr,"converting bonded parameters...\n");
+       
+  ntype = get_atomtype_ntypes(atype);
   convert_params(ntype, plist, mi, comb, reppow, fudgeQQ, sys);
   
   if (debug)
     pr_symtab(debug,0,"After convert_params",&sys->symtab);
 
+  /* Convert GB parameters to idef */
+  if(ir->implicit_solvent)
+  {
+         gmx_localtop_t *localtop;
+         localtop = gmx_mtop_generate_local_top(sys,ir);
+         
+         convert_gb_params(&localtop->idef, F_GB, localtop->idef.ntypes, gb_plist,born);
+  }
+       
   /* set ptype to VSite for virtual sites */
   for(mt=0; mt<sys->nmoltype; mt++) {
     set_vsites_ptype(FALSE,&sys->moltype[mt]);
index 6e2496f8c3684ba5aab6923bd8f6e208c3211afb..1c7ab07310a6df8c99023c0240db6f4711ae8d8c 100644 (file)
@@ -82,6 +82,7 @@
 #include "mvdata.h"
 #include "checkpoint.h"
 #include "mtop_util.h"
+#include "genborn.h"
 
 #ifdef GMX_MPI
 #include <mpi.h>
@@ -142,7 +143,8 @@ void mdrunner(FILE *fplog,t_commrec *cr,int nfile,t_filenm fnm[],
   bool       bReadRNG,bReadEkin;
   int        list;
   int        nsteps_done;
-       
+  gmx_genborn_t *born;
+
   snew(inputrec,1);
   snew(mtop,1);
        
@@ -307,6 +309,10 @@ void mdrunner(FILE *fplog,t_commrec *cr,int nfile,t_filenm fnm[],
                  opt2fn("-tableb",nfile,fnm),FALSE,pforce);
     fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
     
+    /* Initialise GB-stuff */
+    if(fr->bGB)
+      init_gb(&born,cr,fr,inputrec,mtop,state->x,inputrec->rgbradii,inputrec->gb_algorithm);
+
     /* Initialize QM-MM */
     if(fr->bQMMM)
       init_QMMMrec(cr,box,mtop,inputrec,fr);
@@ -385,6 +391,7 @@ void mdrunner(FILE *fplog,t_commrec *cr,int nfile,t_filenm fnm[],
        gmx_fatal(FARGS,"Error %d initializing PME",status);
     }
   }
+
   
   if (integrator[inputrec->eI].func == do_md) {
     /* Turn on signal handling on all nodes */
@@ -429,6 +436,7 @@ void mdrunner(FILE *fplog,t_commrec *cr,int nfile,t_filenm fnm[],
                                            nstepout,inputrec,mtop,
                                            fcd,state,f,buf,
                                            mdatoms,nrnb,wcycle,ed,fr,
+                                           born,
                                            repl_ex_nst,repl_ex_seed,
                                            cpt_period,max_hours,
                                            Flags,
@@ -478,7 +486,7 @@ time_t do_md(FILE *fplog,t_commrec *cr,int nfile,t_filenm fnm[],
             t_state *state_global,rvec f[],
             rvec buf[],t_mdatoms *mdatoms,
             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
-            gmx_edsam_t ed,t_forcerec *fr,
+            gmx_edsam_t ed,t_forcerec *fr, gmx_genborn_t *born,
             int repl_ex_nst,int repl_ex_seed,
             real cpt_period,real max_hours,
             unsigned long Flags,
@@ -492,7 +500,7 @@ time_t do_md(FILE *fplog,t_commrec *cr,int nfile,t_filenm fnm[],
   double     t,t0,lam0;
   bool       bGStatEveryStep,bGStat,bCalcPres;
   bool       bNS,bSimAnn,bStopCM,bRerunMD,bNotLastFrame=FALSE,
-             bFirstStep,bStateFromTPX,bLastStep;
+             bFirstStep,bStateFromTPX,bLastStep,bBornRadii;
   bool       bNEMD,do_ene,do_log,do_verbose,bRerunWarnNoV=TRUE,
                 bForceUpdate=FALSE,bX,bV,bF,bXTC,bCPT;
   bool       bMasterState;
@@ -1034,6 +1042,13 @@ time_t do_md(FILE *fplog,t_commrec *cr,int nfile,t_filenm fnm[],
       bLastStep = TRUE;
     }
 
+    /* Determine whether or not to update the Born radii if doing GB */
+    bBornRadii=bFirstStep;
+    if(ir->implicit_solvent && (step % ir->nstgbradii==0))
+    {
+       bBornRadii=TRUE;
+    }
+
     do_log = do_per_step(step,ir->nstlog) || bFirstStep || bLastStep;
     do_verbose = bVerbose && (step % stepout == 0 || bFirstStep || bLastStep);
 
@@ -1129,10 +1144,10 @@ time_t do_md(FILE *fplog,t_commrec *cr,int nfile,t_filenm fnm[],
     if (shellfc) {
       /* Now is the time to relax the shells */
       count=relax_shell_flexcon(fplog,cr,bVerbose,bFFscan ? step+1 : step,
-                               ir,bNS,bStopCM,top,constr,enerd,fcd,
+                               ir,bNS,bStopCM,top,top_global,constr,enerd,fcd,
                                state,f,buf,force_vir,mdatoms,
                                nrnb,wcycle,graph,groups,
-                               shellfc,fr,t,mu_tot,
+                               shellfc,fr,born,bBornRadii,t,mu_tot,
                                state->natoms,&bConverged,vsite,
                                fp_field);
       tcount+=count;
@@ -1145,11 +1160,11 @@ time_t do_md(FILE *fplog,t_commrec *cr,int nfile,t_filenm fnm[],
        * This is parallellized as well, and does communication too. 
        * Check comments in sim_util.c
        */
-      do_force(fplog,cr,ir,step,nrnb,wcycle,top,groups,
+      do_force(fplog,cr,ir,step,nrnb,wcycle,top,top_global,groups,
               state->box,state->x,&state->hist,
               f,buf,force_vir,mdatoms,enerd,fcd,
               state->lambda,graph,
-              fr,vsite,mu_tot,t,fp_field,ed,
+              fr,vsite,mu_tot,t,fp_field,ed,born,bBornRadii,
               GMX_FORCE_STATECHANGED | (bNS ? GMX_FORCE_NS : 0) |
               GMX_FORCE_ALLFORCES | (bCalcPres ? GMX_FORCE_VIRIAL : 0));
     }
index 94493b6ec56b2b01d8d4cb31f2f77119918570e0..cfef7b0472546d7f6684c6865cd9e94486a35efd 100644 (file)
@@ -264,7 +264,7 @@ int nm2type(int nnm,t_nm2type nm2t[],t_symtab *tab,t_atoms *atoms,
        atoms->atom[i].qB = alpha;
        atoms->atom[i].m = atoms->atom[i].mB = mm;
        k = add_atomtype(atype,tab,&(atoms->atom[i]),type,param,
-                        atoms->atom[i].type,0,0,0,atomnr);
+                        atoms->atom[i].type,0,0,0,atomnr,0,0);
       }
       atoms->atom[i].type  = k;
       atoms->atom[i].typeB = k;
index 7daadd6b4f43a0dbcbdb71022b6a28bb4088c71f..22dbf7d7d406e16b9648878ba1f03504f3e403b9 100644 (file)
@@ -406,6 +406,21 @@ void check_ir(t_inputrec *ir, t_gromppopts *opts,int *nerror)
            ir->sc_power);
     CHECK(ir->sc_alpha!=0 && ir->sc_power!=1 && ir->sc_power!=2);
   }
+
+  if(ir->coulombtype==eelGB_NOTUSED)
+  {
+    ir->coulombtype==eelCUT;
+    ir->implicit_solvent=eisGBSA;
+    fprintf(stderr,"Note: Old option for generalized born electrostatics given:\n"
+           "Changing coulombtype from \"generalized-born\" to \"cut-off\" and instead\n"
+           "setting implicit_solvent value to \"GBSA\" in input section.\n");
+  }
+
+  if(ir->implicit_solvent==eisGBSA)
+  {
+      sprintf(err_buf,"With GBSA implicit solvent, rgbradii must be equal to rlist.");
+      CHECK(ir->rgbradii != ir->rlist);
+  }
 }
 
 static int str_nelem(char *str,int maxptr,char *ptr[])
@@ -648,6 +663,8 @@ void get_ir(char *mdparin,char *mdparout,
   RTYPE ("gb_obc_alpha", ir->gb_obc_alpha, 1.0);
   RTYPE ("gb_obc_beta", ir->gb_obc_beta, 0.8);
   RTYPE ("gb_obc_gamma", ir->gb_obc_gamma, 4.85);      
+  RTYPE ("gb_dielectric_offset", ir->gb_dielectric_offset, 0.09);
+  EETYPE("sa_algorithm", ir->sa_algorithm, esa_names, nerror, TRUE);
   CTYPE ("Surface tension (kJ/mol/nm^2) for the SA (nonpolar surface) part of GBSA");
   CTYPE ("The default value (2.092) corresponds to 0.005 kcal/mol/Angstrom^2.");
   RTYPE ("sa_surface_tension", ir->sa_surface_tension, 2.092);
@@ -1836,7 +1853,7 @@ void triple_check(char *mdparin,t_inputrec *ir,gmx_mtop_t *sys,int *nerror)
       sprintf(err_buf,
              "You are using a plain Coulomb cut-off, which might produce artifacts.\n"
              "You might want to consider using %s electrostatics.\n",
-             EI_TPI(ir->eI) ? EELTYPE(eelRF) : EELTYPE(eelPME));
+             EELTYPE(eelPME));
       warning_note(err_buf);
     }
   }
index d128c2d13b4db04cef9599786cf5744dd6cc33f2..d73a15be4356edf9f6beaa786c242f4e880d4891 100644 (file)
@@ -78,7 +78,7 @@ t_atomtype read_atype(char *adb,t_symtab *tab)
     
     if ((buf != NULL) && (sscanf(buf,"%s%lf",name,&m) == 2)) {
       a->m = m;
-      add_atomtype(at,tab,a,name,nb,0,0,0,0,0);
+      add_atomtype(at,tab,a,name,nb, 0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 );
       fprintf(stderr,"\rAtomtype %d",nratt+1);
     }
   }
index 082afecd4ed99ebcef250980d50087fdf1fbb3b8..2ca351fbdbbc3037f710dfb8123b310a53853b13 100644 (file)
@@ -255,6 +255,8 @@ void DS_Init(DirStack **DS)
     set_nec(&(necessary[d_angletypes]),d_atomtypes,d_none);
     set_nec(&(necessary[d_dihedraltypes]),d_atomtypes,d_none);
     set_nec(&(necessary[d_nonbond_params]),d_atomtypes,d_none);
+    set_nec(&(necessary[d_implicit_genborn_params]),d_atomtypes,d_none);
+    set_nec(&(necessary[d_implicit_surface_params]),d_atomtypes,d_none);
     set_nec(&(necessary[d_moleculetype]),d_atomtypes,d_none);
     set_nec(&(necessary[d_atoms]),d_moleculetype,d_none);
     set_nec(&(necessary[d_vsites2]),d_atoms,d_none);
index 3ad069beb351dadb9a11fb5fe2dd14f3540e0dc6..0cb5f03d1da4b9d86b9758c896c4e89842ac302b 100644 (file)
@@ -299,6 +299,7 @@ static char **read_topol(char *infile,char *outfile,
                         int         *nmolblock,
                         gmx_molblock_t **molblock,
                         bool        bFEP,
+                        bool        bGB,
                         bool        bZero,
                         bool        bVerbose)
 {
@@ -493,6 +494,15 @@ static char **read_topol(char *infile,char *outfile,
              push_molt(symtab,bi0,pline);
              break;
            */
+
+         case d_implicit_genborn_params:
+           push_gb_params(atype,pline);
+           break;
+
+         case d_implicit_surface_params:
+           gmx_fatal(FARGS,"Implicit surface directive not supported yet.");
+           break;
+
          case d_moleculetype: {
            if (!bReadMolType) {
              int ntype;
@@ -515,7 +525,7 @@ static char **read_topol(char *infile,char *outfile,
                fprintf(stderr,"Generated %d of the %d 1-4 parameter combinations\n",ncombs-ncopy,ncombs);
                free_nbparam(pair,ntype);
              }
-             /* Copy GBSA parameters to atomtype array */
+             /* Copy GBSA parameters to atomtype array? */
              
              bReadMolType = TRUE;
            }
@@ -677,7 +687,8 @@ char **do_top(bool         bVerbose,
              t_molinfo    **molinfo,
              t_inputrec   *ir,
              int          *nmolblock,
-             gmx_molblock_t **molblock)
+             gmx_molblock_t **molblock,
+             bool          bGB)
 {
   /* Tmpfile might contain a long path */
   char *tmpfile;
@@ -693,7 +704,7 @@ char **do_top(bool         bVerbose,
                   symtab,atype,nrmols,molinfo,
                   plist,combination_rule,repulsion_power,
                   opts,fudgeQQ,nmolblock,molblock,
-                  ir->efep!=efepNO,bZero,bVerbose);
+                  ir->efep!=efepNO,bGB,bZero,bVerbose);
   if ((*combination_rule != eCOMB_GEOMETRIC) && 
       (ir->vdwtype == evdwUSER)) {
     warning("Using sigma/epsilon based combination rules with"
index 9de92b4ce5ed232efe92d6e53fb0e8f1234d1144..6ba999744cee03777d31af7d49ba3480c39a9b50 100644 (file)
@@ -60,7 +60,8 @@ extern char **do_top(bool         bVerbose,
                     t_molinfo    **molinfo,
                     t_inputrec   *ir,
                     int          *nmolblock,
-                    gmx_molblock_t **molblock);
+                    gmx_molblock_t **molblock,
+                    bool          bGB);
 
 
 /* This routine expects sys->molt[m].ilist to be of size F_NRE and ordered. */
index a04e562d532d90854ecd7cb3ce5225d05d42002a..3db2eb193caff971c1d3e513216cc3601c0497af 100644 (file)
@@ -191,7 +191,7 @@ void push_at (t_symtab *symtab, t_atomtype at, t_bond_atomtype bat,
   char   type[STRLEN],btype[STRLEN],ptype[STRLEN];
   double m,q;
   double c[MAXFORCEPARAM];
-  double radius,vol,surftens;
+  double radius,vol,surftens,gb_radius,S_hct;
   char   tmpfield[12][100]; /* Max 12 fields of width 100 */
   char   errbuf[256];
   t_atom  *atom;
@@ -261,11 +261,13 @@ void push_at (t_symtab *symtab, t_atomtype at, t_bond_atomtype bat,
   }
   
   /* optional fields */
-  surftens = -1;
-  vol      =  0;
-  radius   =  0;
-  atomnr   = -1;
-  
+  surftens  = -1;
+  vol       =  0;
+  radius    =  0;
+  gb_radius =  0;
+  atomnr    = -1;
+  S_hct     =  0;
+       
   switch (nb_funct) {
       
   case F_LJ:
@@ -275,9 +277,9 @@ void push_at (t_symtab *symtab, t_atomtype at, t_bond_atomtype bat,
     {
         if ( have_bonded_type )
         {
-            nread = sscanf(line,"%s%s%d%lf%lf%s%lf%lf%lf%lf%lf",
+            nread = sscanf(line,"%s%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf",
                            type,btype,&atomnr,&m,&q,ptype,&c[0],&c[1],
-                           &radius,&vol,&surftens);
+                           &radius,&vol,&surftens,&gb_radius);
             if(nread < 8)
             {
                 too_few();
@@ -287,9 +289,9 @@ void push_at (t_symtab *symtab, t_atomtype at, t_bond_atomtype bat,
         else
         {
             /* have_atomic_number && !have_bonded_type */
-            nread = sscanf(line,"%s%d%lf%lf%s%lf%lf%lf%lf%lf",
+            nread = sscanf(line,"%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf",
                            type,&atomnr,&m,&q,ptype,&c[0],&c[1],
-                           &radius,&vol,&surftens);
+                           &radius,&vol,&surftens,&gb_radius);
             if(nread < 7)
             {
                 too_few();
@@ -302,9 +304,9 @@ void push_at (t_symtab *symtab, t_atomtype at, t_bond_atomtype bat,
         if ( have_bonded_type )
         {
             /* !have_atomic_number && have_bonded_type */
-            nread = sscanf(line,"%s%s%lf%lf%s%lf%lf%lf%lf%lf",
+            nread = sscanf(line,"%s%s%lf%lf%s%lf%lf%lf%lf%lf%lf",
                            type,btype,&m,&q,ptype,&c[0],&c[1],
-                           &radius,&vol,&surftens);
+                           &radius,&vol,&surftens,&gb_radius);
             if(nread < 7)
             {
                 too_few();
@@ -314,9 +316,9 @@ void push_at (t_symtab *symtab, t_atomtype at, t_bond_atomtype bat,
         else
         {
             /* !have_atomic_number && !have_bonded_type */
-            nread = sscanf(line,"%s%lf%lf%s%lf%lf%lf%lf%lf",
+            nread = sscanf(line,"%s%lf%lf%s%lf%lf%lf%lf%lf%lf",
                            type,&m,&q,ptype,&c[0],&c[1],
-                           &radius,&vol,&surftens);
+                           &radius,&vol,&surftens,&gb_radius);
             if(nread < 6)
             {
                 too_few();
@@ -344,9 +346,9 @@ void push_at (t_symtab *symtab, t_atomtype at, t_bond_atomtype bat,
       {
           if ( have_bonded_type )
           {
-              nread = sscanf(line,"%s%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf",
+              nread = sscanf(line,"%s%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf%lf",
                              type,btype,&atomnr,&m,&q,ptype,&c[0],&c[1],&c[2],
-                             &radius,&vol,&surftens);
+                             &radius,&vol,&surftens,&gb_radius);
               if(nread < 9)
               {
                   too_few();
@@ -356,9 +358,9 @@ void push_at (t_symtab *symtab, t_atomtype at, t_bond_atomtype bat,
           else
           {
               /* have_atomic_number && !have_bonded_type */
-              nread = sscanf(line,"%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf",
+              nread = sscanf(line,"%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf%lf",
                              type,&atomnr,&m,&q,ptype,&c[0],&c[1],&c[2],
-                             &radius,&vol,&surftens);
+                             &radius,&vol,&surftens,&gb_radius);
               if(nread < 8)
               {
                   too_few();
@@ -371,9 +373,9 @@ void push_at (t_symtab *symtab, t_atomtype at, t_bond_atomtype bat,
           if ( have_bonded_type )
           {
               /* !have_atomic_number && have_bonded_type */
-              nread = sscanf(line,"%s%s%lf%lf%s%lf%lf%lf%lf%lf%lf",
+              nread = sscanf(line,"%s%s%lf%lf%s%lf%lf%lf%lf%lf%lf%lf",
                              type,btype,&m,&q,ptype,&c[0],&c[1],&c[2],
-                             &radius,&vol,&surftens);
+                             &radius,&vol,&surftens,&gb_radius);
               if(nread < 8)
               {
                   too_few();
@@ -383,9 +385,9 @@ void push_at (t_symtab *symtab, t_atomtype at, t_bond_atomtype bat,
           else
           {
               /* !have_atomic_number && !have_bonded_type */
-              nread = sscanf(line,"%s%lf%lf%s%lf%lf%lf%lf%lf%lf",
+              nread = sscanf(line,"%s%lf%lf%s%lf%lf%lf%lf%lf%lf%lf",
                              type,&m,&q,ptype,&c[0],&c[1],&c[2],
-                             &radius,&vol,&surftens);
+                             &radius,&vol,&surftens,&gb_radius);
               if(nread < 7)
               {
                   too_few();
@@ -446,12 +448,12 @@ void push_at (t_symtab *symtab, t_atomtype at, t_bond_atomtype bat,
     sprintf(errbuf,"Overriding atomtype %s",type);
     warning(errbuf);
     if ((nr = set_atomtype(nr,at,symtab,atom,type,param,batype_nr,
-                          radius,vol,surftens,atomnr)) == NOTSET)
+                          radius,vol,surftens,atomnr,gb_radius,S_hct)) == NOTSET)
       gmx_fatal(FARGS,"Replacing atomtype %s failed",type);
   }
   else if ((nr = add_atomtype(at,symtab,atom,type,param,
                              batype_nr,radius,vol,
-                             surftens,atomnr)) == NOTSET)
+                             surftens,atomnr,gb_radius,S_hct)) == NOTSET)
     gmx_fatal(FARGS,"Adding atomtype %s failed",type);
   else {  
     /* Add space in the non-bonded parameters matrix */
@@ -861,6 +863,45 @@ void push_nbt(directive d,t_nbparam **nbt,t_atomtype atype,
     nbp->c[i] = cr[i];
 }
 
+void 
+push_gb_params (t_atomtype at, char *line)
+{
+       int nfield;
+       int i,n,k,found,gfound;
+       double radius,vol,surftens,gb_radius,S_hct;
+       char atypename[STRLEN];
+       char errbuf[STRLEN];
+       
+       if( (nfield = sscanf(line,"%s%lf%lf%lf%lf%lf",atypename,&radius,&vol,&surftens,&gb_radius,&S_hct)) != 6)
+    {
+               sprintf(errbuf,"Too few gb parameters for type %s\n",atypename);
+               warning(errbuf);
+    }
+       
+       /* Search for atomtype */
+       //printf("gb params for atomtype '%s'\n",atypename);
+       found = 0;
+       gfound = -1;
+       for(i=0;i<get_atomtype_ntypes(at) && !found;i++)
+        {
+         if(gmx_strncasecmp(atypename,get_atomtype_name(i,at),STRLEN-1)==0)
+               {
+                       //printf("Found matching atomtype in topology: %s\n",get_atomtype_name(i,at));
+                       found = i;
+                       gfound = i;
+                       //printf("found=%d\n",found);
+               }
+    }
+       
+       if(gfound==-1)
+    {
+               printf("Couldn't find topology match for atomtype %s\n",atypename);
+               abort();
+    }
+
+       set_atomtype_gbparam(at,found,radius,vol,surftens,gb_radius,S_hct);
+}
+
 static void push_atom_now(t_symtab *symtab,t_atoms *at,int atomnr,
                          int atomicnumber,
                          int type,char *ctype,int ptype,
@@ -1773,7 +1814,7 @@ int add_atomtype_decoupled(t_symtab *symtab,t_atomtype at,
   for (i=0; (i<MAXFORCEPARAM); i++)
     param.c[i] = 0.0;
 
-  nr = add_atomtype(at,symtab,&atom,"decoupled",&param,-1,0.0,0.0,0.0,0);
+  nr = add_atomtype(at,symtab,&atom,"decoupled",&param,-1,0.0,0.0,0.0,0,0,0);
 
   /* Add space in the non-bonded parameters matrix */
   realloc_nb_params(at,nbparam,pair);
index 47908effc09ac75aed4fd433cb256787d9ee96cc..f07039c3f5f720e10107d2bfd6f35b77b037170e 100644 (file)
@@ -66,6 +66,10 @@ extern void push_dihedraltype(directive d,t_params bt[],
 extern void push_nbt(directive d,t_nbparam **nbt,t_atomtype atype,
                     char *plines,int nb_funct);
 
+void
+push_gb_params(t_atomtype atype,
+              char       *line);
+
 extern void push_atom(t_symtab   *symtab, 
                      t_block    *cgs,
                      t_atoms    *at,
index b3ab8443ceb0a031269e38b5df7678b97e7b903c..3af2b1c70091c5323d15b34570cfe800466a06f6 100644 (file)
@@ -431,7 +431,7 @@ void print_bondeds(FILE *out,int natoms,directive d,
   for (i=0; (i < natoms); i++) {
     char buf[12];
     sprintf(buf,"%4d",(i+1));
-    add_atomtype(atype,&stab,a,buf,param,0,0,0,0,0);
+    add_atomtype(atype,&stab,a,buf,param,0,0,0,0,0,0,0);
   }
   print_bt(out,d,atype,ftype,fsubtype,plist,TRUE);
     
index 122fab36987d068775c92b1a282772922842ffa6..228227bd37b462a64a4e8f76aaf9b62d41b711a6 100644 (file)
@@ -463,18 +463,19 @@ static void cmp_inputrec(FILE *fp,t_inputrec *ir1,t_inputrec *ir2,real ftol)
   cmp_real(fp,"inputrec->epsilon_r",-1,ir1->epsilon_r,ir2->epsilon_r,ftol);
   cmp_real(fp,"inputrec->epsilon_rf",-1,ir1->epsilon_rf,ir2->epsilon_rf,ftol);
   cmp_real(fp,"inputrec->tabext",-1,ir1->tabext,ir2->tabext,ftol);
+  cmp_int(fp,"inputrec->implicit_solvent",-1,ir1->implicit_solvent,ir2->implicit_solvent);
   cmp_int(fp,"inputrec->gb_algorithm",-1,ir1->gb_algorithm,ir2->gb_algorithm);
-  cmp_real(fp,"inputrec->gb_epsilon_solvent",-1,ir1->gb_epsilon_solvent,ir2->gb_epsilon_solvent,ftol);
   cmp_int(fp,"inputrec->nstgbradii",-1,ir1->nstgbradii,ir2->nstgbradii);
   cmp_real(fp,"inputrec->rgbradii",-1,ir1->rgbradii,ir2->rgbradii,ftol);
   cmp_real(fp,"inputrec->gb_saltconc",-1,ir1->gb_saltconc,ir2->gb_saltconc,ftol);
-  cmp_int(fp,"inputrec->implicit_solvent",-1,ir1->implicit_solvent,ir2->implicit_solvent);
+  cmp_real(fp,"inputrec->gb_epsilon_solvent",-1,ir1->gb_epsilon_solvent,ir2->gb_epsilon_solvent,ftol);
   cmp_real(fp,"inputrec->gb_obc_alpha",-1,ir1->gb_obc_alpha,ir2->gb_obc_alpha,ftol);
   cmp_real(fp,"inputrec->gb_obc_beta",-1,ir1->gb_obc_beta,ir2->gb_obc_beta,ftol);
   cmp_real(fp,"inputrec->gb_obc_gamma",-1,ir1->gb_obc_gamma,ir2->gb_obc_gamma,ftol);
-  cmp_real(fp,"inputrec->sa_surface_tension",-1,ir1->sa_surface_tension,ir2->sa_surface_tension,ftol);
-       
-       
+  cmp_real(fp,"inputrec->gb_dielectric_offset",-1,ir1->gb_dielectric_offset,ir2->gb_dielectric_offset,ftol);
+  cmp_int(fp,"inputrec->sa_algorithm",-1,ir1->sa_algorithm,ir2->sa_algorithm);
+  cmp_real(fp,"inputrec->sa_surface_tension",-1,ir1->sa_surface_tension,ir2->sa_surface_tension,ftol); 
+
   cmp_int(fp,"inputrec->eDispCorr",-1,ir1->eDispCorr,ir2->eDispCorr);
   cmp_real(fp,"inputrec->shake_tol",-1,ir1->shake_tol,ir2->shake_tol,ftol);
   cmp_int(fp,"inputrec->efep",-1,ir1->efep,ir2->efep);
index 2b428d2a46d13c183fdce8108d7185142b1bcb6d..cb81e70beb266a3b436f2a7e2129af3edec2e92e 100644 (file)
@@ -19,7 +19,8 @@ libmd@LIBSUFFIX@_la_SOURCES = \
        force.c         ghat.c          init.c          \
        mdatom.c        mdebin.c        minimize.c      \
        mvxvf.c         ns.c            nsgrid.c        \
-       perf_est.c      \
+       perf_est.c      genborn.c                       \
+       genborn_sse.c   genborn_sse.h                   \
        pme.c           pme_pp.c        pppm.c          \
        partdec.c       pull.c          pullutil.c      \
        rf_util.c       shakef.c        sim_util.c      \
index b5a5b62e0303cdeadcaf82163143e8f9e07522e7..eb21ef3a9a5a802dd6b9fbcc56d48085c31e3df0 100644 (file)
@@ -7582,7 +7582,7 @@ void dd_partition_system(FILE            *fplog,
     int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
     bool bCheckDLB,bTurnOnDLB,bLogLoad,bRedist,bSortCG;
     ivec ncells_old,np;
-    
+       
     dd = cr->dd;
     comm = dd->comm;
 
@@ -7730,8 +7730,7 @@ void dd_partition_system(FILE            *fplog,
     {
         /* Initialize the ns grid */
         copy_ivec(fr->ns.grid->n,ncells_old);
-        grid_first(fplog,fr->ns.grid,dd,fr->ePBC,state_local->box,fr->rlistlong,
-                   dd->ncg_home);
+        grid_first(fplog,fr->ns.grid,dd,fr->ePBC,state_local->box,fr->rlistlong,dd->ncg_home);
         if (!bMasterState &&
             fr->ns.grid->n[XX] == ncells_old[XX] &&
             fr->ns.grid->n[YY] == ncells_old[YY] &&
index 7cb3593e3742a2a89f99149bc5ea7c1e04e2cb9e..b6859255bd8eb6543ac693df510be71e401f8ec3 100644 (file)
@@ -1287,13 +1287,18 @@ void init_forcerec(FILE *fp,
     {
         set_bham_b_max(fp,fr,mtop);
     }
-    
+
+    fr->bGB = (ir->implicit_solvent == eisGBSA);
+
     /* Copy the GBSA data (radius, volume and surftens for each
      * atomtype) from the topology atomtype section to forcerec.
      */
     snew(fr->atype_radius,fr->ntype);
     snew(fr->atype_vol,fr->ntype);
     snew(fr->atype_surftens,fr->ntype);
+    snew(fr->atype_gb_radius,fr->ntype);
+    snew(fr->atype_S_hct,fr->ntype);
+
     if (mtop->atomtypes.nr > 0)
     {
         for(i=0;i<fr->ntype;i++)
@@ -1302,8 +1307,12 @@ void init_forcerec(FILE *fp,
             fr->atype_vol[i] = mtop->atomtypes.vol[i];
         for(i=0;i<fr->ntype;i++)
             fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
+        for(i=0;i<fr->ntype;i++)
+            fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
+        for(i=0;i<fr->ntype;i++)
+            fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
     }    
-    
+
     /* Set the charge scaling */
     if (fr->epsilon_r != 0)
         fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
@@ -1551,8 +1560,13 @@ void do_force_lowlevel(FILE       *fplog,   int        step,
                        rvec       f[],
                        gmx_enerdata_t *enerd,
                        t_fcdata   *fcd,
+                       gmx_mtop_t *mtop,
+                       gmx_genborn_t *born,
+                       t_atomtypes *atype,
+                       bool       bBornRadii,
                        matrix     box,
-                       real       lambda,   t_graph    *graph,
+                       real       lambda,  
+                       t_graph    *graph,
                        t_blocka   *excl,    
                        rvec       mu_tot[],
                        int        flags,
@@ -1565,6 +1579,8 @@ void do_force_lowlevel(FILE       *fplog,   int        step,
     rvec    box_size;
     real    dvdlambda,Vsr,Vlr,Vcorr=0,vdip,vcharge;
     t_pbc   pbc;
+    real    dvdgb;
+
 #ifdef GMX_MPI
     double  t0=0.0,t1,t2,t3; /* time measurement for coarse load balancing */
 #endif
@@ -1619,6 +1635,11 @@ void do_force_lowlevel(FILE       *fplog,   int        step,
         enerd->term[F_DVDL] += dvdlambda;
     }
     
+       /* Calculate the Born radii */
+       if (ir->implicit_solvent && bBornRadii) 
+               calc_gb_rad(cr,fr,ir,md->nr, idef->il[F_GB].nr,
+                                       mtop,atype,x,f,&(fr->gblist),born,md);
+               
     where();
     do_nonbonded(cr,fr,x,f,md,
                  fr->bBHAM ?
@@ -1629,6 +1650,13 @@ void do_force_lowlevel(FILE       *fplog,   int        step,
                  flags & GMX_FORCE_FORCES);
     where();
     
+       /* If we are doing GB, calculate bonded forces and apply corrections 
+        * to the solvation derivatives */
+       if (ir->implicit_solvent)  {
+               dvdgb = calc_gb_forces(cr,md,born,mtop,atype,idef->il[F_GB].nr,x,f,fr,idef->il[F_GB].iatoms,ir->gb_algorithm, bBornRadii);
+               enerd->term[F_GB]+=dvdgb;       
+       }
+               
 #ifdef GMX_MPI
     if (TAKETIME)
     {
@@ -1697,7 +1725,7 @@ void do_force_lowlevel(FILE       *fplog,   int        step,
         GMX_MPE_LOG(ev_calc_bonds_start);
         calc_bonds(fplog,cr->ms,
                    idef,x,hist,f,fr,&pbc,graph,enerd,nrnb,lambda,md,fcd,
-                   DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL,
+                   DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born,
                    fr->bSepDVDL && do_per_step(step,ir->nstlog),step);
         debug_gmx();
         GMX_MPE_LOG(ev_calc_bonds_finish);
diff --git a/src/mdlib/genborn.c b/src/mdlib/genborn.c
new file mode 100644 (file)
index 0000000..3dc493e
--- /dev/null
@@ -0,0 +1,2271 @@
+/*
+ * $Id$
+ * 
+ *                This source code is part of
+ * 
+ *                 G   R   O   M   A   C   S
+ * 
+ *          GROningen MAchine for Chemical Simulations
+ * 
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2008, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * 
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ * 
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ * 
+ * For more info, check our website at http://www.gromacs.org
+ * 
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "genborn.h"
+#include "vec.h"
+#include "grompp.h"
+#include "pdbio.h"
+#include "names.h"
+#include "physics.h"
+#include "partdec.h"
+#include "network.h"
+#include "gmx_fatal.h"
+#include "mtop_util.h"
+
+#ifdef GMX_MPI
+#include "mpi.h"
+#endif
+
+#if ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) )
+/* x86 SSE intrinsics implementations of selected generalized born routines */
+#include "genborn_sse.h"
+#endif
+
+/* Still parameters - make sure to edit in genborn_sse.c too if you change these! */
+#define STILL_P1  0.073*0.1              /* length        */
+#define STILL_P2  0.921*0.1*CAL2JOULE    /* energy*length */
+#define STILL_P3  6.211*0.1*CAL2JOULE    /* energy*length */
+#define STILL_P4  15.236*0.1*CAL2JOULE
+#define STILL_P5  1.254 
+
+#define STILL_P5INV (1.0/STILL_P5)
+#define STILL_PIP5  (M_PI*STILL_P5)
+
+
+
+int init_gb_nblist(int natoms, t_nblist *nl)
+{
+       nl->maxnri      = natoms*4;
+       nl->maxnrj      = 0;
+       nl->maxlen      = 0;
+       nl->nri         = natoms;
+       nl->nrj         = 0;
+       nl->iinr        = NULL;
+       nl->gid         = NULL;
+       nl->shift       = NULL;
+       nl->jindex      = NULL;
+       /*nl->nltype      = nltype;*/
+       
+       srenew(nl->iinr,   nl->maxnri);
+       srenew(nl->gid,    nl->maxnri);
+       srenew(nl->shift,  nl->maxnri);
+       srenew(nl->jindex, nl->maxnri+1);
+       
+       nl->jindex[0] = 0;
+       
+       return 0;
+}
+
+int print_nblist(int natoms, t_nblist *nl)
+{
+       int i,k,ai,aj,nj0,nj1;
+       
+       printf("genborn.c: print_nblist, natoms=%d\n",natoms); 
+       for(i=0;i<natoms;i++)
+       {
+               ai=nl->iinr[i];
+               nj0=nl->jindex[i];
+               nj1=nl->jindex[i+1];
+               /*printf("ai=%d, nj0=%d, nj1=%d\n",ai,nj0,nj1);*/
+               for(k=nj0;k<nj1;k++)
+               {       
+                       aj=nl->jjnr[k];
+                       printf("ai=%d, aj=%d\n",ai,aj);
+               }
+       }
+       
+       return 0;       
+}
+
+void fill_log_table(const int n, real *table)
+{
+       float numlog;
+       int i;
+       int *const exp_ptr=((int*)&numlog);
+       int x = *exp_ptr;
+       
+       x=0x3F800000;
+       *exp_ptr = x;
+       
+       int incr = 1 << (23-n);
+       int p=pow(2,n);
+       
+       for(i=0;i<p;++i)
+       {
+               table[i]=log2(numlog);
+               x+=incr;
+               *exp_ptr=x;
+       }
+}
+
+
+real table_log(float val, const real *table, const int n)
+{
+       int *const exp_ptr = ((int*)&val);
+       int x              = *exp_ptr;
+       const int log_2    = ((x>>23) & 255) - 127;
+       x &= 0x7FFFFF;
+       x = x >> (23-n);
+       val = table[x];
+       return ((val+log_2)*0.69314718);  
+}
+
+void gb_pd_send(t_commrec *cr, real *send_data, int nr)
+{
+#ifdef GMX_MPI 
+       int i,cur;
+       int *index,*sendc,*disp;
+       
+       snew(sendc,cr->nnodes);
+       snew(disp,cr->nnodes);
+       
+       index = pd_index(cr);
+       cur   = cr->nodeid;
+       
+       /* Setup count/index arrays */
+       for(i=0;i<cr->nnodes;i++)
+       {
+               sendc[i]  = index[i+1]-index[i];
+               disp[i]   = index[i];   
+       }
+       
+       /* Do communication */
+       MPI_Gatherv(send_data+index[cur],sendc[cur],GMX_MPI_REAL,send_data,sendc,disp,GMX_MPI_REAL,0,cr->mpi_comm_mygroup);
+       MPI_Bcast(send_data,nr,GMX_MPI_REAL,0,cr->mpi_comm_mygroup);
+       
+#endif
+}
+
+
+int init_gb_plist(t_params *p_list)
+{
+       p_list->nr    = 0;
+    p_list->param = NULL;
+       
+       return 0;
+}
+
+
+static void assign_gb_param(t_functype ftype,t_iparams *new,
+                                                       real old[MAXFORCEPARAM],int comb,real reppow)
+{
+       int  i,j;
+       
+  /* Set to zero */
+  for(j=0; (j<MAXFORCEPARAM); j++)
+    new->generic.buf[j]=0.0;
+       
+       switch (ftype) {
+               case F_GB:
+                       new->gb.c6A=old[0];
+                       new->gb.c12A=old[1];
+                       new->gb.c6B=old[2];
+                       new->gb.c12B=old[3];
+                       new->gb.sar=old[4];
+                       new->gb.st=old[5];
+                       new->gb.pi=old[6];
+                       new->gb.gbr=old[7];
+                       new->gb.bmlt=old[8];
+                       break;
+               default:
+                       gmx_fatal(FARGS,"unknown function type %d in %s line %d",
+                                                               ftype,__FILE__,__LINE__);               
+       }
+}
+
+static void 
+append_gb_interaction(t_ilist *ilist,
+                                         int type,int nral,atom_id a[MAXATOMLIST])
+{
+  int i,where1;
+  
+  where1     = ilist->nr;
+  ilist->nr += nral+1;
+       
+  ilist->iatoms[where1++]=type;
+  for (i=0; (i<nral); i++) 
+  {
+    ilist->iatoms[where1++]=a[i];
+  }
+}
+
+
+static int 
+enter_gb_params(t_idef *idef, t_functype ftype,
+                               real forceparams[MAXFORCEPARAM],int comb,real reppow,
+                               int start,bool bAppend)
+{
+  t_iparams new;
+  int       type;
+       
+       assign_gb_param(ftype,&new,forceparams,comb,reppow);
+  if (!bAppend) {
+               for (type=start; (type<idef->ntypes); type++) {
+      if (idef->functype[type]==ftype) {
+                                       if (memcmp(&new,&idef->iparams[type],(size_t)sizeof(new)) == 0)
+                                       return type;
+      }
+    }
+  }
+  else
+    type=idef->ntypes;
+  if (debug)
+    fprintf(debug,"copying new to idef->iparams[%d] (ntypes=%d)\n",
+                                               type,idef->ntypes);
+  memcpy(&idef->iparams[type],&new,(size_t)sizeof(new));
+  
+  idef->ntypes++;
+  idef->functype[type]=ftype;
+       
+  return type;
+}
+
+int init_gb_still(t_commrec *cr, t_forcerec  *fr, t_atomtypes *atype, t_idef *idef, t_atoms *atoms, gmx_genborn_t *born,int natoms)
+{
+       
+       int i,j,i1,i2,k,m,nbond,nang,ia,ib,ic,id,nb,idx,idx2,at;
+       int iam,ibm;
+       int at0,at1;
+       real length,angle;
+       real r,ri,rj,ri2,ri3,rj2,r2,r3,r4,rk,ratio,term,h,doffset,electric;
+       real p1,p2,p3,factor,cosine,rab,rbc;
+       
+       real vsol[natoms];
+       real gp[natoms];
+       
+       if(PAR(cr))
+       {
+               pd_at_range(cr,&at0,&at1);
+       
+               for(i=0;i<natoms;i++)
+                       vsol[i]=gp[i]=0;
+       }
+       else
+       {
+               at0=0;
+               at1=natoms;
+       }
+       
+       doffset = born->gb_doffset;
+       
+       for(i=0;i<natoms;i++)
+               born->gpol[i]=born->vsolv[i]=0;         
+       
+       /* Compute atomic solvation volumes for Still method */
+       for(i=0;i<natoms;i++)
+       {       
+               ri=atype->gb_radius[atoms->atom[i].type];
+               r3=ri*ri*ri;
+               born->vsolv[i]=(4*M_PI/3)*r3;
+       }
+       
+       for(j=0;j<born->n12*3;j+=3)
+       {
+               m=idef->il[F_GB].iatoms[j];
+               ia=idef->il[F_GB].iatoms[j+1];
+               ib=idef->il[F_GB].iatoms[j+2];
+               
+               r=1.01*idef->iparams[m].gb.st;
+               
+               ri   = atype->gb_radius[atoms->atom[ia].type];
+               rj   = atype->gb_radius[atoms->atom[ib].type];
+               
+               ri2  = ri*ri;
+               ri3  = ri2*ri;
+               rj2  = rj*rj;
+               
+               ratio  = (rj2-ri2-r*r)/(2*ri*r);
+               h      = ri*(1+ratio);
+               term   = (M_PI/3.0)*h*h*(3.0*ri-h);
+
+               if(PAR(cr))
+               {
+                       vsol[ia]+=term;
+               }
+               else
+               {
+                       born->vsolv[ia] -= term;
+               }
+               
+               ratio  = (ri2-rj2-r*r)/(2*rj*r);
+               h      = rj*(1+ratio);
+               term   = (M_PI/3.0)*h*h*(3.0*rj-h);
+               
+               if(PAR(cr))
+               {
+                       vsol[ib]+=term;
+               }
+               else
+               {
+                       born->vsolv[ib] -= term;
+               }               
+       }
+       
+       if(PAR(cr))
+       {
+               gmx_sum(natoms,vsol,cr);
+               for(i=0;i<natoms;i++)
+                       born->vsolv[i]=born->vsolv[i]-vsol[i];
+       }
+       
+       /* Get the self-, 1-2 and 1-3 polarization for analytical Still method */
+       /* Self */
+       for(j=0;j<natoms;j++)
+       {
+               if(born->vs[j]==1)
+                       born->gpol[j]=-0.5*ONE_4PI_EPS0/(atype->gb_radius[atoms->atom[j].type]-doffset+STILL_P1);
+       }
+       
+       /* 1-2 */
+       for(j=0;j<born->n12*3;j+=3)
+       {
+               m=idef->il[F_GB].iatoms[j];
+               ia=idef->il[F_GB].iatoms[j+1];
+               ib=idef->il[F_GB].iatoms[j+2];
+               
+               r=idef->iparams[m].gb.st;
+               
+               r4=r*r*r*r;
+
+               if(PAR(cr))
+               {
+                       gp[ia]+=STILL_P2*born->vsolv[ib]/r4;
+                       gp[ib]+=STILL_P2*born->vsolv[ia]/r4;
+               }
+               else
+               {
+                       born->gpol[ia]=born->gpol[ia]+STILL_P2*born->vsolv[ib]/r4;
+                       born->gpol[ib]=born->gpol[ib]+STILL_P2*born->vsolv[ia]/r4;
+               }
+       }
+       
+       /* 1-3 */
+       for(j=born->n12*3;j<born->n12*3+born->n13*3;j+=3)
+       {
+               m=idef->il[F_GB].iatoms[j];
+               ia=idef->il[F_GB].iatoms[j+1];
+               ib=idef->il[F_GB].iatoms[j+2];
+               
+               r=idef->iparams[m].gb.st;
+               r4=r*r*r*r;
+
+               if(PAR(cr))
+               {
+                       gp[ia]+=STILL_P3*born->vsolv[ib]/r4;
+                       gp[ib]+=STILL_P3*born->vsolv[ia]/r4;
+               }
+               else
+               {
+                       born->gpol[ia]=born->gpol[ia]+STILL_P3*born->vsolv[ib]/r4;
+                       born->gpol[ib]=born->gpol[ib]+STILL_P3*born->vsolv[ia]/r4;
+               }               
+       }
+
+       if(PAR(cr))
+       {
+               gmx_sum(natoms,gp,cr);
+               for(i=0;i<natoms;i++)
+                       born->gpol[i]=born->gpol[i]+gp[i];
+       }       
+       /*
+        real vsum, gsum;
+        vsum=0; gsum=0;
+        
+        for(i=0;i<natoms;i++)
+     {
+        
+        printf("final_init: id=%d, %s: v=%g, v_t=%g, g=%15.15f, g_t=%15.15f\n",
+        cr->nodeid,
+        *(atoms->atomname[i]),
+        born->vsolv[i],
+        born->vsolv[i]*1000,   
+        born->gpol[i],
+        born->gpol[i]/CAL2JOULE);
+        
+        vsum=vsum+(born->vsolv[i]*1000);
+        gsum=gsum+(born->gpol[i]/CAL2JOULE);
+     }
+        
+        printf("SUM: Vtot=%15.15f, Gtot=%15.15f\n",vsum,gsum);
+        */
+       /*exit(1);*/
+       
+       return 0;
+}
+
+
+
+#define LOG_TABLE_ACCURACY 15 /* Accuracy of the table logarithm */
+
+
+/* Initialize all GB datastructs and compute polarization energies */
+int init_gb(gmx_genborn_t **p_born,t_commrec *cr, t_forcerec *fr, t_inputrec *ir,
+                       gmx_mtop_t *mtop, rvec x[], real rgbradii, int gb_algorithm)
+{
+       int i,j,m,ai,aj,jj,nalloc;
+       double rai,sk,p;
+       gmx_genborn_t *born;
+       int natoms;
+       t_atoms atoms;
+       gmx_localtop_t *localtop;
+       real doffset;
+
+       natoms = mtop->natoms;
+       atoms  = gmx_mtop_global_atoms(mtop);
+       localtop = gmx_mtop_generate_local_top(mtop,ir);
+       
+       snew(born,1);
+        *p_born = born;
+
+       snew(born->S_hct,natoms);
+       snew(born->drobc,natoms);
+
+       snew(fr->invsqrta,natoms);
+       snew(fr->dvda,natoms);
+       snew(fr->dadx,natoms*natoms);
+
+       snew(born->gpol,natoms);
+       snew(born->vsolv,natoms);
+       snew(born->bRad,natoms);
+
+       /* snew(born->asurf,natoms); */
+       /* snew(born->dasurf,natoms); */
+
+       snew(born->vs,natoms);
+       snew(born->param, natoms);
+       
+       nalloc=0;
+       
+       for(i=0;i<natoms;i++)
+               nalloc+=i;
+
+       init_gb_nblist(natoms, &(fr->gblist));
+       
+       snew(fr->gblist.jjnr,nalloc*2);
+       
+       born->n12=0;
+       born->n13=0;
+       born->n14=0;
+       
+       /* Do the Vsites exclusions (if any) */
+       for(i=0;i<natoms;i++)
+       {
+               jj = atoms.atom[i].type;
+               born->vs[i]=1;                                                                                                                                                                                  
+                                                                                                       
+           if(C6(fr->nbfp,fr->ntype,jj,jj)==0 && C12(fr->nbfp,fr->ntype,jj,jj)==0 && atoms.atom[i].q==0)
+                       born->vs[i]=0;
+       }
+       
+       for(i=0;i<F_NRE;i++)
+       {
+               if(IS_ANGLE(i))
+               {
+                       born->n13+=localtop->idef.il[i].nr/(1+NRAL(i));
+               }
+               
+               if(IS_CHEMBOND(i))
+               {
+                       switch(i)
+                       {
+                               case F_BONDS:
+                               case F_CONNBONDS:
+                               case F_CONSTR:
+                               
+                                       for(j=0;j<localtop->idef.il[i].nr;)
+                                       {
+                                               m=localtop->idef.il[i].iatoms[j++];
+                                               ai=localtop->idef.il[i].iatoms[j++];
+                                               aj=localtop->idef.il[i].iatoms[j++];
+                                               
+                                               if(born->vs[ai]==1 && born->vs[aj]==1)
+                                               {
+                                                       born->n12++;
+                                               }
+                                       }
+                               
+                               break;
+                       }
+               }
+       }
+       
+       born->n14=localtop->idef.il[F_LJ14].nr/(1+NRAL(F_LJ14));
+       
+       /* If Still model, initialise the polarisation energies */
+       if(gb_algorithm==egbSTILL)      
+          init_gb_still(cr, fr,&(mtop->atomtypes), &(localtop->idef), &atoms, born, natoms);   
+       
+       /* Copy algorithm parameters from inputrecord to local structure */
+       born->obc_alpha  = ir->gb_obc_alpha;
+       born->obc_beta   = ir->gb_obc_beta;
+       born->obc_gamma  = ir->gb_obc_gamma;
+       born->gb_doffset = ir->gb_dielectric_offset;
+       
+       doffset = born->gb_doffset;
+       
+       /* If HCT/OBC,  precalculate the sk*atype->S_hct factors */
+       if(gb_algorithm==egbHCT || gb_algorithm==egbOBC)
+       {
+               for(i=0;i<natoms;i++)
+               {       
+                       if(born->vs[i]==1)
+                       {
+                               rai            = mtop->atomtypes.gb_radius[atoms.atom[i].type]-doffset; 
+                               sk             = rai * mtop->atomtypes.S_hct[atoms.atom[i].type];
+                               born->param[i] = sk;
+                       }
+                       else
+                       {
+                               born->param[i] = 0;
+                       }
+               }
+       }
+       
+       /* Init the logarithm table */
+       p=pow(2,LOG_TABLE_ACCURACY);
+       snew(born->log_table, p);
+       
+       fill_log_table(LOG_TABLE_ACCURACY, born->log_table);
+
+       if(PAR(cr))
+       {
+               snew(born->work,natoms);
+       }
+       else
+       {
+               born->work = NULL;
+       }
+       
+       return 0;
+}
+
+int generate_gb_topology(gmx_mtop_t *mtop, t_params *plist, t_params *gb_plist, gmx_genborn_t *born)
+{
+       int i,j,k,type,m,a1,a2,a3,a4,idx,nral,maxtypes,start,comb;
+       int n12,n13,n14;
+       double p1,p2,p3,cosine,r2,rab,rbc;
+       int natoms;
+       bl_t *bl;
+       bonds_t *bonds;
+       t_atoms atoms;
+       real doffset;
+       
+       
+       doffset = born->gb_doffset;
+       natoms = mtop->natoms;
+       bl=(bl_t *) malloc(sizeof(bl_t)*natoms);
+       snew(bonds,natoms);
+       atoms  = gmx_mtop_global_atoms(mtop);
+       
+       /* To keep the compiler happy */
+       rab=rbc=0;
+       
+       for(i=0;i<F_NRE;i++)
+       {
+               if(plist[i].nr>0)
+               {
+                       gb_plist->nr+=plist[i].nr;
+               }
+       }
+
+       snew(gb_plist->param,gb_plist->nr);
+
+       p1=STILL_P1;
+       p2=STILL_P2;
+       p3=STILL_P3;
+       
+       idx=0;
+       n12=0;
+       n13=0;
+       n14=0;
+       
+       for(i=0;i<F_NRE;i++)
+       {
+               if(IS_CHEMBOND(i))
+               {
+                       switch(i)
+                       {
+                               case F_BONDS:
+                               case F_CONNBONDS:
+                               case F_CONSTR:
+                               
+                                       for(j=0;j<plist[i].nr; j++)
+                                       {
+                                               a1=plist[i].param[j].a[0];
+                                               a2=plist[i].param[j].a[1];
+                                       
+                                               if(atoms.atom[a1].q!=0 && atoms.atom[a2].q!=0)
+                                               {
+                                                       bl[a1].length[a2]=plist[i].param[j].c[0];
+                                                       bl[a2].length[a1]=plist[i].param[j].c[0];
+                                                       
+                                                       bonds[a1].bond[bonds[a1].nbonds]=a2;
+                                                       bonds[a1].nbonds++;
+                                                       bonds[a2].bond[bonds[a2].nbonds]=a1;
+                                                       bonds[a2].nbonds++;
+               
+                                                       gb_plist->param[idx].a[0]=a1;
+                                                       gb_plist->param[idx].a[1]=a2;
+               
+                                                   /* LJ parameters */
+                                                       gb_plist->param[idx].c[0]=-1;
+                                                       gb_plist->param[idx].c[1]=-1;
+                                                       gb_plist->param[idx].c[2]=-1;
+                                                       gb_plist->param[idx].c[3]=-1;
+               
+                                                       /* GBSA parameters */
+                                                       gb_plist->param[idx].c[4]=mtop->atomtypes.radius[atoms.atom[a1].type]+mtop->atomtypes.radius[atoms.atom[a2].type];      
+                                                       gb_plist->param[idx].c[5]=plist[i].param[j].c[0];
+                                                       gb_plist->param[idx].c[6]=p2;
+                                                       gb_plist->param[idx].c[7]=mtop->atomtypes.gb_radius[atoms.atom[a1].type]+mtop->atomtypes.gb_radius[atoms.atom[a2].type];
+                                                       gb_plist->param[idx].c[8]=0.8875;
+                                                       n12++;
+                                                       idx++;
+                                               }
+                                       }
+                                       break;
+                               
+                               case F_G96BONDS:
+                               case F_MORSE:
+                               case F_CUBICBONDS:
+                               case F_HARMONIC:
+                               case F_FENEBONDS:
+                               case F_TABBONDS:
+                               case F_TABBONDSNC:
+                               case F_POLARIZATION:
+                               case F_VSITE2:
+                               case F_VSITE3:
+                               case F_VSITE3FD:
+                               case F_VSITE3FAD:
+                               case F_VSITE3OUT:
+                               case F_VSITE4FD:
+                               case F_VSITE4FDN:
+                                       break;
+                                               
+                               
+                               default:
+                                       gmx_fatal(FARGS,"generate_gb_topology, F_BONDS");
+
+                       }
+               }
+       }
+       
+       for(i=0;i<F_NRE;i++)
+       {
+               if(IS_ANGLE(i))
+               {
+                       switch(i)
+                       {
+                               case F_ANGLES:
+                               
+                                       for(j=0;j<plist[i].nr; j++)
+                                       {
+                                               a1=plist[i].param[j].a[0];
+                                               a2=plist[i].param[j].a[1];
+                                               a3=plist[i].param[j].a[2];      
+                                               
+                                               gb_plist->param[idx].a[0]=a1;
+                                               gb_plist->param[idx].a[1]=a3;
+               
+                                               /* LJ parameters */     
+                                               gb_plist->param[idx].c[0]=-1;
+                                               gb_plist->param[idx].c[1]=-1;
+                                               gb_plist->param[idx].c[2]=-1;
+                                               gb_plist->param[idx].c[3]=-1;
+               
+                                               /* GBSA parameters */
+                                               gb_plist->param[idx].c[4]=mtop->atomtypes.radius[atoms.atom[a1].type]+mtop->atomtypes.radius[atoms.atom[a3].type];      
+                                               
+                                                for(k=0;k<bonds[a2].nbonds;k++)
+                                                {
+                                                       if(bonds[a2].bond[k]==a1)
+                                                       {
+                                                               rab=bl[a2].length[a1];
+                                                       }
+                   
+                                                       else if(bonds[a2].bond[k]==a3)
+                                                       {
+                                                               rbc=bl[a2].length[a3];
+                                                       }
+                   
+                                               }
+               
+                                               cosine=cos(plist[i].param[j].c[0]/RAD2DEG);
+                                               r2=rab*rab+rbc*rbc-(2*rab*rbc*cosine);
+                                               gb_plist->param[idx].c[5]=sqrt(r2);
+                                               gb_plist->param[idx].c[6]=p3;
+                                               gb_plist->param[idx].c[7]=mtop->atomtypes.gb_radius[atoms.atom[a1].type]+mtop->atomtypes.gb_radius[atoms.atom[a3].type];
+                                               gb_plist->param[idx].c[8]=0.3516;
+                                               
+                                               n13++;
+                                               idx++;
+                                       }
+                                       break;
+                                       
+                               case F_G96ANGLES:
+                               case F_CONSTR:
+                               case F_UREY_BRADLEY:
+                               case F_QUARTIC_ANGLES:
+                               case F_TABANGLES:
+                                       break;
+                               
+                               default:
+                                       gmx_fatal(FARGS,"generate_gb_topology, F_ANGLES");
+
+                       }
+               }
+       }
+       
+       for(i=0;i<plist[F_LJ14].nr; i++)
+       {
+               a1=plist[F_LJ14].param[i].a[0];
+               a2=plist[F_LJ14].param[i].a[1];
+                               
+               gb_plist->param[idx].a[0]=a1;
+               gb_plist->param[idx].a[1]=a2;
+                               
+               gb_plist->param[idx].c[0]=-1;
+               gb_plist->param[idx].c[1]=-1;
+               gb_plist->param[idx].c[2]=-1;
+               gb_plist->param[idx].c[3]=-1;
+               
+               /* GBSA parameters */
+               gb_plist->param[idx].c[4]=mtop->atomtypes.radius[atoms.atom[a1].type]+mtop->atomtypes.radius[atoms.atom[a2].type];      
+               gb_plist->param[idx].c[5]=-1;
+               gb_plist->param[idx].c[6]=p3;
+               gb_plist->param[idx].c[7]=mtop->atomtypes.gb_radius[atoms.atom[a1].type]+mtop->atomtypes.gb_radius[atoms.atom[a2].type];
+               gb_plist->param[idx].c[8]=0.3516;
+               idx++;
+               n14++;
+       }
+
+       gb_plist->nr=n12+n13+n14;
+       born->n12=n12;
+       born->n13=n13;
+       born->n14=n14;
+       
+       return 0;
+       
+}
+
+int convert_gb_params(t_idef *idef, t_functype ftype, int start, t_params *gb_plist, gmx_genborn_t *born)
+{
+       int k,nral,maxtypes,comb,type;
+       real reppow;
+
+       nral=NRAL(F_GB);
+
+       /* pl->nr is the number of gb interactions, so we need to allocate nr*3 elements in iatoms */
+       snew(idef->il[F_GB].iatoms,gb_plist->nr*3);
+
+       maxtypes=idef->ntypes;
+       comb=3;
+       reppow=12;
+       
+       for(k=0;k<gb_plist->nr;k++)
+       {
+                       if(maxtypes<=idef->ntypes)
+                       {
+                               maxtypes+=1000;
+                               srenew(idef->functype,maxtypes);
+                               srenew(idef->iparams,maxtypes);
+                       }
+               
+               type=enter_gb_params(idef,F_GB,gb_plist->param[k].c,comb,reppow,start,0);
+               append_gb_interaction(&idef->il[F_GB],type,NRAL(F_GB),gb_plist->param[k].a);
+       
+       }
+       
+       printf("# %10s:   %d\n","GB-12",born->n12);
+       printf("# %10s:   %d\n","GB-13",born->n13);
+       printf("# %10s:   %d\n","GB-14",born->n14);
+       
+       return 0;
+
+}
+
+
+
+int calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_mtop_t *mtop,
+                                         const t_atomtypes *atype, rvec x[], t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
+{      
+       int i,k,n,nj0,nj1,ai,aj,type;
+       real gpi,dr,dr2,dr4,idr4,rvdw,ratio,ccf,theta,term,rai,raj;
+       real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11;
+       real rinv,idr2,idr6,vaj,dccf,cosq,sinq,prod,gpi2;
+       real factor;
+
+       factor=0.5*ONE_4PI_EPS0;
+       
+       n=0;
+
+       for(i=0;i<natoms;i++)
+               born->bRad[i]=fr->invsqrta[i]=1;
+       
+       for(i=0;i<nl->nri;i++ )
+       {
+               ai  = i;
+               
+               nj0 = nl->jindex[ai];                   
+               nj1 = nl->jindex[ai+1];
+               
+               gpi = born->gpol[ai];
+               rai = mtop->atomtypes.gb_radius[md->typeA[ai]];
+               
+               ix1 = x[ai][0];
+               iy1 = x[ai][1];
+               iz1 = x[ai][2];
+                       
+               for(k=nj0;k<nj1;k++)
+               {
+                       aj    = nl->jjnr[k];
+                       
+                       jx1   = x[aj][0];
+                       jy1   = x[aj][1];
+                       jz1   = x[aj][2];
+                       
+                       dx11  = ix1-jx1;
+                       dy11  = iy1-jy1;
+                       dz11  = iz1-jz1;
+                       
+                       dr2   = dx11*dx11+dy11*dy11+dz11*dz11; 
+                       rinv  = invsqrt(dr2);
+                       idr2  = rinv*rinv;
+                       idr4  = idr2*idr2;
+                       idr6  = idr4*idr2;
+                       
+                       raj = mtop->atomtypes.gb_radius[md->typeA[aj]];
+
+                       rvdw  = rai + raj;
+                       
+                       ratio = dr2 / (rvdw * rvdw);
+                       vaj   = born->vsolv[aj];
+       
+                       if(ratio>STILL_P5INV) {
+                               ccf=1.0;
+                               dccf=0.0;
+                       }
+                       else
+                       {
+                               theta = ratio*STILL_PIP5;
+                               cosq  = cos(theta);
+                               term  = 0.5*(1.0-cosq);
+                               ccf   = term*term;
+                               sinq  = 1.0 - cosq*cosq;
+                               dccf  = 2.0*term*sinq*invsqrt(sinq)*STILL_PIP5*ratio;
+                       }
+               
+                       prod          = STILL_P4*vaj;
+                       gpi           = gpi+prod*ccf*idr4;
+                       fr->dadx[n++] = prod*(4*ccf-dccf)*idr6;
+                       
+               }
+               
+               gpi2 = gpi * gpi;
+               born->bRad[ai] = factor*invsqrt(gpi2);
+               fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+
+       }
+       
+       return 0;
+}
+       
+
+static int 
+calc_gb_rad_hct(t_commrec *cr,t_forcerec *fr,int natoms, gmx_mtop_t *mtop,
+                                       const t_atomtypes *atype, rvec x[], t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
+{
+       int i,k,n,ai,aj,nj0,nj1,dum;
+       real rai,raj,gpi,dr2,dr,sk,sk2,lij,uij,diff2,tmp,sum_ai;
+       real rad,min_rad,rinv,rai_inv;
+       real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11;
+       real lij2, uij2, lij3, uij3, t1,t2,t3;
+       real lij_inv,dlij,duij,sk2_rinv,prod,log_term;
+       rvec dx;
+       real doffset;
+       real *sum_tmp;
+
+       doffset = born->gb_doffset;
+       sum_tmp = born->work;
+       
+       /* Keep the compiler happy */
+       n=0;
+       prod=0;
+       
+       for(i=0;i<natoms;i++)
+               born->bRad[i]=fr->invsqrta[i]=1;
+       
+       for(i=0;i<nl->nri;i++)
+       {
+               ai = nl->iinr[i];
+                       
+               nj0 = nl->jindex[ai];                   
+               nj1 = nl->jindex[ai+1];
+               
+               rai     = mtop->atomtypes.gb_radius[md->typeA[ai]]-doffset; 
+               sum_ai  = 1.0/rai;
+               rai_inv = sum_ai;
+               
+               ix1 = x[ai][0];
+               iy1 = x[ai][1];
+               iz1 = x[ai][2];
+               
+               if(PAR(cr))
+               {
+                       /* Only have the master node do this, since we only want one value at one time */
+                       if(MASTER(cr))
+                               sum_tmp[ai]=sum_ai;
+                       else
+                               sum_tmp[ai]=0;
+               }
+               
+               for(k=nj0;k<nj1;k++)
+               {
+                       aj    = nl->jjnr[k];
+                               
+                       jx1   = x[aj][0];
+                       jy1   = x[aj][1];
+                       jz1   = x[aj][2];
+                       
+                       dx11  = ix1 - jx1;
+                       dy11  = iy1 - jy1;
+                       dz11  = iz1 - jz1;
+                       
+                       dr2   = dx11*dx11+dy11*dy11+dz11*dz11;
+                       rinv  = invsqrt(dr2);
+                       dr    = rinv*dr2;
+                       
+                       sk    = born->param[aj];
+                                       
+                       if(rai < dr+sk)
+                       {
+                               lij     = 1.0/(dr-sk);
+                               dlij    = 1.0;
+                               
+                               if(rai>dr-sk) {
+                                       lij  = rai_inv;
+                                       dlij = 0.0;
+                               }
+                               
+                               lij2     = lij*lij;
+                               lij3     = lij2*lij;
+                               
+                               uij      = 1.0/(dr+sk);
+                               uij2     = uij*uij;
+                               uij3     = uij2*uij;
+                               
+                               diff2    = uij2-lij2;
+                               
+                               lij_inv  = invsqrt(lij2);
+                               sk2      = sk*sk;
+                               sk2_rinv = sk2*rinv;
+                               prod     = 0.25*sk2_rinv;
+                               
+                               /* log_term = table_log(uij*lij_inv,born->log_table,LOG_TABLE_ACCURACY); */
+                               log_term = log(uij*lij_inv);
+                               
+                               tmp      = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term + prod*(-diff2);
+                                                               
+                               if(rai<sk-dr)
+                                       tmp = tmp + 2.0 * (rai_inv-lij);
+                                       
+                               duij    = 1.0;
+                               t1      = 0.5*lij2 + prod*lij3 - 0.25*(lij*rinv+lij3*dr);
+                               t2      = -0.5*uij2 - 0.25*sk2_rinv*uij3 + 0.25*(uij*rinv+uij3*dr);
+                               t3      = 0.125*(1.0+sk2_rinv*rinv)*(-diff2)+0.25*log_term*rinv*rinv;
+       
+                               fr->dadx[n++] = (dlij*t1+duij*t2+t3)*rinv; /* rb2 is moved to chainrule */
+
+                               if(PAR(cr))
+                               {
+                                       sum_tmp[ai] -= 0.5*tmp;
+                               }
+                               else
+                               {
+                                       sum_ai -= 0.5*tmp;
+                               }
+                       }
+               }
+       
+               if(!PAR(cr))
+               {
+                       min_rad = rai + doffset;
+                       rad=1.0/sum_ai; 
+                       
+                       born->bRad[ai]=rad > min_rad ? rad : min_rad;
+                       fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+               }
+
+       }
+
+       if(PAR(cr))
+       {
+               gmx_sum(natoms,sum_tmp,cr);
+
+               /* Calculate the Born radii so that they are available on all nodes */
+               for(i=0;i<natoms;i++)
+               {
+                       ai      = i;
+                       min_rad = mtop->atomtypes.gb_radius[md->typeA[ai]]; 
+                       rad     = 1.0/sum_tmp[ai];
+                       
+                       born->bRad[ai]=rad > min_rad ? rad : min_rad;
+                       fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+               }
+       }
+
+       return 0;
+}
+
+static int 
+calc_gb_rad_obc(t_commrec *cr, t_forcerec *fr, int natoms, gmx_mtop_t *mtop,
+                                       const t_atomtypes *atype, rvec x[], t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
+{
+       int i,k,ai,aj,nj0,nj1,n;
+       real rai,raj,gpi,dr2,dr,sk,sk2,lij,uij,diff2,tmp,sum_ai;
+       real rad, min_rad,sum_ai2,sum_ai3,tsum,tchain,rinv,rai_inv,lij_inv,rai_inv2;
+       real log_term,prod,sk2_rinv;
+       real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11;
+       real lij2,uij2,lij3,uij3,dlij,duij,t1,t2,t3,tmp2;
+       real doffset;
+       real *sum_tmp;
+
+       /* Keep the compiler happy */
+       n=0;
+       prod=0;
+       
+       doffset = born->gb_doffset;
+       sum_tmp = born->work;
+       
+       for(i=0;i<natoms;i++) {
+               born->bRad[i]=fr->invsqrta[i]=1;
+       }
+               
+       for(i=0;i<nl->nri;i++)
+       {
+               ai  = nl->iinr[i];
+               
+               nj0 = nl->jindex[ai];
+               nj1 = nl->jindex[ai+1];
+               
+               rai      = mtop->atomtypes.gb_radius[md->typeA[ai]]-doffset;
+               sum_ai   = 0;
+               rai_inv  = 1.0/rai;
+               rai_inv2 = 1.0/mtop->atomtypes.gb_radius[md->typeA[ai]];
+               
+               ix1 = x[ai][0];
+               iy1 = x[ai][1];
+               iz1 = x[ai][2];
+               
+               if(PAR(cr))
+                       sum_tmp[ai]=0;
+
+               for(k=nj0;k<nj1;k++)
+               {
+                       aj    = nl->jjnr[k];
+       
+                       jx1   = x[aj][0];
+                       jy1   = x[aj][1];
+                       jz1   = x[aj][2];
+                       
+                       dx11  = ix1 - jx1;
+                       dy11  = iy1 - jy1;
+                       dz11  = iz1 - jz1;
+                       
+                       dr2   = dx11*dx11+dy11*dy11+dz11*dz11;
+                       rinv  = invsqrt(dr2);
+                       dr    = dr2*rinv;
+               
+                       /* sk is precalculated in init_gb() */
+                       sk    = born->param[aj];
+                       
+                       if(rai < dr+sk)
+                       {
+                               lij       = 1.0/(dr-sk);
+                               dlij      = 1.0;
+                                                               
+                               if(rai>dr-sk) {
+                                       lij  = rai_inv;
+                                       dlij = 0.0;
+                               }
+                               
+                               uij      = 1.0/(dr+sk);
+                               lij2     = lij  * lij;
+                               lij3     = lij2 * lij;
+                               uij2     = uij  * uij;
+                               uij3     = uij2 * uij;
+                               
+                               diff2    = uij2-lij2;
+                               
+                               lij_inv  = invsqrt(lij2);
+                               sk2      = sk*sk;
+                               sk2_rinv = sk2*rinv;    
+                               prod     = 0.25*sk2_rinv;
+                               
+                               log_term = log(uij*lij_inv);
+                               /* log_term = table_log(uij*lij_inv,born->log_table,LOG_TABLE_ACCURACY); */
+                               tmp      = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term + prod*(-diff2);
+                               
+                               if(rai < sk-dr)
+                                       tmp = tmp + 2.0 * (rai_inv-lij);
+                                       
+                               duij    = 1.0;
+                               t1      = 0.5*lij2 + prod*lij3 - 0.25*(lij*rinv+lij3*dr);
+                               t2      = -0.5*uij2 - 0.25*sk2_rinv*uij3 + 0.25*(uij*rinv+uij3*dr);
+                               t3      = 0.125*(1.0+sk2_rinv*rinv)*(-diff2)+0.25*log_term*rinv*rinv;
+       
+                               fr->dadx[n++] = (dlij*t1+duij*t2+t3)*rinv; /* rb2 is moved to chainrule */
+                               
+                               sum_ai += 0.5*tmp;
+                       
+                               if(PAR(cr))
+                                       sum_tmp[ai] += 0.5*tmp;
+
+                       }
+               }       
+
+               if(!PAR(cr))
+               {
+                       sum_ai  = rai     * sum_ai;
+                       sum_ai2 = sum_ai  * sum_ai;
+                       sum_ai3 = sum_ai2 * sum_ai;
+                       
+                       tsum    = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
+                       born->bRad[ai] = rai_inv - tsum*rai_inv2;
+                       born->bRad[ai] = 1.0 / born->bRad[ai];
+                       
+                       fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+                       
+                       tchain  = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
+                       born->drobc[ai] = (1.0-tsum*tsum)*tchain*rai_inv2;
+               }
+       }
+
+       if(PAR(cr))
+       {
+               gmx_sum(natoms,sum_tmp,cr);
+
+               for(i=0;i<natoms;i++)
+               {
+                       ai      = i;
+                       rai = mtop->atomtypes.gb_radius[md->typeA[ai]];
+                       rai_inv = 1.0/rai;
+                       
+                       sum_ai  = sum_tmp[ai];
+                       sum_ai  = rai     * sum_ai;
+                       sum_ai2 = sum_ai  * sum_ai;
+                       sum_ai3 = sum_ai2 * sum_ai;
+                       
+                       tsum    = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
+                       born->bRad[ai] = rai_inv - tsum/mtop->atomtypes.gb_radius[md->typeA[ai]];
+                       
+                       born->bRad[ai] = 1.0 / born->bRad[ai];
+                       fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+                       
+                       tchain  = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
+                       born->drobc[ai] = (1.0-tsum*tsum)*tchain/mtop->atomtypes.gb_radius[md->typeA[ai]];
+               }
+       }
+       
+       return 0;
+}
+
+
+
+int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir,int natoms, int nrfa, gmx_mtop_t *mtop,
+                               const t_atomtypes *atype, rvec x[], rvec f[],t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
+{
+#if ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) )
+       /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
+       switch(ir->gb_algorithm)
+       {
+               case egbSTILL:
+                       calc_gb_rad_still_sse(cr,fr,md->nr,mtop, atype, x[0], nl, born, md); 
+                       break;
+               case egbHCT:
+                       gmx_fatal(FARGS, "HCT algorithm not supported with sse");
+                       /* calc_gb_rad_hct_sse(cr,fr,md->nr, forceatoms, forceparams,mtop,atype,x,nl,born,md); */
+                       break;
+               case egbOBC:
+                       calc_gb_rad_obc_sse(cr,fr,md->nr,mtop,atype,x[0],nl,born,md); 
+                       break;
+                       
+               default:
+                       gmx_fatal(FARGS, "Unknown sse-enabled algorithm for Born radii calculation: %d",ir->gb_algorithm);
+       }
+       
+#else
+       
+       /* Switch for determining which algorithm to use for Born radii calculation */
+       switch(ir->gb_algorithm)
+       {
+               case egbSTILL:
+                       calc_gb_rad_still(cr,fr,md->nr,mtop,atype,x,nl,born,md); 
+                       break;
+               case egbHCT:
+                       calc_gb_rad_hct(cr,fr,md->nr,mtop,atype,x,nl,born,md); 
+                       break;
+               case egbOBC:
+                       calc_gb_rad_obc(cr,fr,md->nr,mtop,atype,x,nl,born,md); 
+                       break;
+                       
+               default:
+                       gmx_fatal(FARGS, "Unknown algorithm for Born radii calculation: %d",ir->gb_algorithm);
+       }
+       
+#endif
+       
+       return 0;               
+}
+
+
+
+real gb_bonds_tab(int nbonds, real *x, real *f, real *charge, real *p_gbtabscale,
+                                 real *invsqrta, real *dvda, real *GBtab, const t_iatom forceatoms[],
+                                 real epsilon_r, real facel)
+{
+       int i, n0,nnn,type,ai,aj,ai3,aj3;
+       real isai,isaj;
+       real r,rsq11,ix1,iy1,iz1,jx1,jy1,jz1;
+       real dx11,dy11,dz11,rinv11,iq,facel2;
+       real isaprod,qq,gbscale,gbtabscale,Y,F,Geps,Heps2,Fp,VV,FF,rt,eps,eps2;
+       real vgb,fgb,vcoul,fijC,dvdatmp,fscal,tx,ty,tz,dvdaj;
+       real vctot;     
+
+       gbtabscale=*p_gbtabscale;
+       vctot = 0.0;
+       
+       for(i=0;i<nbonds; )
+       {
+               type          = forceatoms[i++];
+               ai            = forceatoms[i++];
+               aj            = forceatoms[i++];
+               ai3           = ai*3;
+               aj3           = aj*3; 
+           isai          = invsqrta[ai];
+               ix1           = x[ai3+0];
+               iy1           = x[ai3+1];
+               iz1           = x[ai3+2];
+               iq            = (-1)*facel*charge[ai];
+               jx1           = x[aj3+0];
+               jy1           = x[aj3+1];
+               jz1           = x[aj3+2];
+               dx11          = ix1 - jx1;
+               dy11          = iy1 - jy1;
+               dz11          = iz1 - jz1;
+               rsq11         = dx11*dx11+dy11*dy11+dz11*dz11;
+               rinv11        = invsqrt(rsq11);
+               isaj          = invsqrta[aj];
+               isaprod       = isai*isaj;
+               qq            = isaprod*iq*charge[aj];
+               gbscale       = isaprod*gbtabscale;
+               r             = rsq11*rinv11;
+               rt            = r*gbscale;
+               n0            = rt;
+               eps           = rt-n0;
+               eps2          = eps*eps;
+               nnn           = 4*n0;
+               Y             = GBtab[nnn];
+               F             = GBtab[nnn+1];
+               Geps          = eps*GBtab[nnn+2];
+               Heps2         = eps2*GBtab[nnn+3];
+               Fp            = F+Geps+Heps2;
+               VV            = Y+eps*Fp;
+               FF            = Fp+Geps+2.0*Heps2;
+               vgb           = qq*VV;
+               fijC          = qq*FF*gbscale;
+               dvdatmp       = -(vgb+fijC*r)*0.5;
+               dvda[aj]      = dvda[aj] + dvdatmp*isaj*isaj;
+               dvda[ai]      = dvda[ai] + dvdatmp*isai*isai;
+               vctot         = vctot + vgb;
+               fgb           = -(fijC)*rinv11;
+               tx            = fgb*dx11;
+               ty            = fgb*dy11;
+               tz            = fgb*dz11;
+       
+               f[aj3+0]      = f[aj3+0] - tx;
+               f[aj3+1]      = f[aj3+1] - ty;
+               f[aj3+2]      = f[aj3+2] - tz;
+               
+               f[ai3+0]      = f[ai3+0] + tx;
+               f[ai3+1]      = f[ai3+1] + ty;
+               f[ai3+2]      = f[ai3+2] + tz;
+       }
+       
+       return vctot;
+}
+
+
+real calc_gb_selfcorrections(t_commrec *cr, int natoms, 
+                            real *charge, gmx_genborn_t *born, real *dvda, t_mdatoms *md, double facel)
+{      
+       int i,ai,at0,at1;
+       real rai,e,derb,q,q2,fi,rai_inv,vtot;
+
+       if(PAR(cr))
+       {
+               pd_at_range(cr,&at0,&at1);
+       }
+       else
+       {
+               at0=0;
+               at1=natoms;
+       }
+                       
+       vtot=0.0;
+       
+       /* Apply self corrections */    
+       for(i=at0;i<at1;i++)
+       {
+               if(born->vs[i]==1)
+               {
+                       ai       = i;
+                       rai      = born->bRad[ai];
+                       rai_inv  = 1.0/rai;
+                       q        = charge[ai];
+                       q2       = q*q;
+                       fi       = facel*q2;
+                       e        = fi*rai_inv;
+                       derb     = 0.5*e*rai_inv*rai_inv;
+                       dvda[ai] += derb*rai;
+                       vtot     -= 0.5*e;
+               }
+       }
+       
+   return vtot;        
+       
+}
+
+real calc_gb_nonpolar(t_commrec *cr, t_forcerec *fr,int natoms,gmx_genborn_t *born, gmx_mtop_t *mtop, 
+                                         const t_atomtypes *atype, real *dvda,int gb_algorithm, t_mdatoms *md)
+{
+       int ai,i,at0,at1;
+       real e,es,rai,rbi,term,probe,tmp,factor;
+       real rbi_inv,rbi_inv2;
+       
+       /* To keep the compiler happy */
+       factor=0;
+       
+       if(PAR(cr))
+       {
+               pd_at_range(cr,&at0,&at1);
+       }
+       else
+       {
+               at0=0;
+               at1=natoms;
+       }
+       
+       /* The surface area factor is 0.0049 for Still model, 0.0054 for HCT/OBC */
+       if(gb_algorithm==egbSTILL)
+               factor=0.0049*100*CAL2JOULE;
+               
+       if(gb_algorithm==egbHCT || gb_algorithm==egbOBC)
+               factor=0.0054*100*CAL2JOULE;    
+       
+       es    = 0;
+       probe = 0.14;
+       term  = M_PI*4;
+
+       for(i=at0;i<at1;i++)
+       {
+               if(born->vs[i]==1)
+               {
+                       ai        = i;
+                       rai               = mtop->atomtypes.gb_radius[md->typeA[ai]];
+                       rbi_inv   = fr->invsqrta[ai];
+                       rbi_inv2  = rbi_inv * rbi_inv;
+                       tmp       = (rai*rbi_inv2)*(rai*rbi_inv2);
+                       tmp       = tmp*tmp*tmp;
+                       e         = factor*term*(rai+probe)*(rai+probe)*tmp;
+                       dvda[ai]  = dvda[ai] - 6*e*rbi_inv2;    
+                       es        = es + e;
+               }
+       }       
+
+       return es;
+}
+
+real calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_mtop_t *mtop, const t_atomtypes *atype, int nr, 
+                    rvec x[], rvec f[], t_forcerec *fr, const t_iatom forceatoms[], int gb_algorithm, bool bRad)
+{
+       real v=0;
+       int i;
+       
+       /* Do a simple ACE type approximation for the non-polar solvation */
+       v += calc_gb_nonpolar(cr, fr,md->nr, born, mtop, atype, fr->dvda, gb_algorithm,md);
+
+       /* Calculate the bonded GB-interactions */
+       v += gb_bonds_tab(nr,x[0],f[0],md->chargeA,&(fr->gbtabscale),
+                                         fr->invsqrta,fr->dvda,fr->gbtab.tab,forceatoms,fr->epsilon_r, fr->epsfac);
+                                         
+       /* Calculate self corrections to the GB energies - currently only A state used! (FIXME) */
+       v += calc_gb_selfcorrections(cr,md->nr,md->chargeA, born, fr->dvda, md, fr->epsfac);            
+
+       if(PAR(cr))
+       {
+        /* Sum dvda */
+               gmx_sum(md->nr,fr->dvda, cr);
+       }
+
+#if ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) )
+       /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
+       calc_gb_chainrule_sse(md->nr, &(fr->gblist), fr->dadx, fr->dvda, x[0], f[0], gb_algorithm, born);       
+#else
+       /* Calculate the forces due to chain rule terms with non sse code */
+       calc_gb_chainrule(md->nr, &(fr->gblist), x, f, fr->dvda, fr->dadx, gb_algorithm, born); 
+#endif 
+
+       return v;
+
+}
+
+
+real calc_gb_chainrule(int natoms, t_nblist *nl, rvec x[], rvec t[], real *dvda, real *dadx, 
+                                          int gb_algorithm, gmx_genborn_t *born)
+{      
+       int i,k,n,ai,aj,nj0,nj1;
+       real fgb,fij,rb2,rbi,fix1,fiy1,fiz1;
+       real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11,rsq11;
+       real rinv11,tx,ty,tz,rbai;
+       real rb[natoms];
+       rvec dx;
+
+       n=0;            
+       
+       /* Loop to get the proper form for the Born radius term */
+       if(gb_algorithm==egbSTILL) {
+               for(i=0;i<natoms;i++)
+               {
+                       rbi   = born->bRad[i];
+                       rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
+               }
+       }
+               
+       if(gb_algorithm==egbHCT) {
+               for(i=0;i<natoms;i++)
+               {
+                       rbi   = born->bRad[i];
+                       rb[i] = rbi * rbi * dvda[i];
+               }
+       }
+        
+       if(gb_algorithm==egbOBC) {
+               for(i=0;i<natoms;i++)
+               {
+                       rbi   = born->bRad[i];
+                       rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
+               }
+       }
+               
+       for(i=0;i<nl->nri;i++)
+       {
+               ai   = nl->iinr[i];
+               nj0      = nl->jindex[ai];
+               nj1  = nl->jindex[ai+1];
+               
+               ix1  = x[ai][0];
+               iy1  = x[ai][1];
+               iz1  = x[ai][2];
+               
+               fix1 = 0;
+               fiy1 = 0;
+               fiz1 = 0;
+               
+               rbai = rb[ai];
+                               
+               for(k=nj0;k<nj1;k++)
+               {
+                       aj = nl->jjnr[k];
+               
+                       jx1     = x[aj][0];
+                       jy1     = x[aj][1];
+                       jz1     = x[aj][2];
+                               
+                       dx11    = ix1 - jx1;
+                       dy11    = iy1 - jy1;
+                       dz11    = iz1 - jz1;
+                               
+                       fgb     = rbai*dadx[n++]; 
+               
+                       tx      = fgb * dx11;
+                       ty      = fgb * dy11;
+                       tz      = fgb * dz11;
+                               
+                       fix1    = fix1 + tx;
+                       fiy1    = fiy1 + ty;
+                       fiz1    = fiz1 + tz;
+                               
+                       /* Update force on atom aj */
+                       t[aj][0] = t[aj][0] - tx;
+                       t[aj][1] = t[aj][1] - ty;
+                       t[aj][2] = t[aj][2] - tz;
+               }
+               
+               /* Update force on atom ai */
+               t[ai][0] = t[ai][0] + fix1;
+               t[ai][1] = t[ai][1] + fiy1;
+               t[ai][2] = t[ai][2] + fiz1;
+               
+       }
+       
+
+       return 0;       
+}
+
+/* Skriv om den h�r rutinen s� att den f�r samma format som Charmm SASA
+ * Antagligen kan man d� ber�kna derivatan i samma loop som ytan,
+ * vilket ï¿½r snabbare (g�rs inte i Charmm, men det borde g� :-) )
+ */
+int calc_surfStill(t_inputrec *ir,
+                 t_idef     *idef,
+                 t_atoms    *atoms,
+                 rvec       x[],
+                       rvec       f[],                                          
+                       gmx_genborn_t     *born,
+                 t_atomtypes *atype,
+                       double     *faction,
+                       int        natoms,
+                       t_nblist   *nl,
+                       t_iparams  forceparams[],
+                       t_iatom    forceatoms[],
+                       int        nbonds)
+{
+  int i,j,n,ia,ib,ic;
+  
+  real pc[3],radp,probd,radnp,s,prob,bmlt;
+  real dx,dy,dz,d,rni,dist,bee,ri,rn,asurf,t1ij,t2ij,bij,bji,dbijp;
+  real dbjip,tip,tmp,tij,tji,t3ij,t4ij,t5ij,t3ji,t4ji,t5ji,dbij,dbji;
+  real dpf,dpx,dpy,dpz,pi,pn,si,sn;
+  real aprob[natoms];
+
+  int factor=1;
+  
+
+  /*bonds_t *bonds,*bonds13;*/
+  
+  /*snew(bonds,natoms);*/
+  /*snew(bonds13,natoms);*/
+   
+  /* Zero out the forces to compare only surface area contribution */
+  /*
+  printf("Zeroing out forces before surface area..\n");
+   for(i=0;i<natoms;i++)
+    {
+      
+               if(!(is_hydrogen(*atoms->atomname[i])))
+               {
+                       printf("x=%g, s=%g, p=%g, r=%g\n",
+                               x[i][0],
+                               atype->vol[atoms->atom[i].type],
+                               atype->surftens[atoms->atom[i].type],                   
+                               factor*atype->radius[atoms->atom[i].type]);
+       
+               }
+      printf("faction[i]=%g\n",faction[i*3]);
+      faction[i*3]=0;
+      faction[i*3+1]=0;
+      faction[i*3+2]=0;
+                       
+                       f[i][0]=f[i][1]=f[i][2]=0;
+    }
+   exit(1);
+       */
+  /* Radius of probe */
+  radp=0.14;
+  probd=2*radp;
+       
+       /*********************************************************
+        *********************************************************
+        *********************************************************
+        *********************************************************
+        *********************************************************
+        Begin SA calculation Gromacs-style
+        *********************************************************
+        *********************************************************
+        *********************************************************
+        *********************************************************
+        *********************************************************/
+       
+       int k,type,ai,aj,nj0,nj1;
+       real dr2,sar,rai,raj,fij;
+       rvec dxx;
+       
+       /* First set up the individual areas */
+       for(n=0;n<natoms;n++)
+       {       
+               rn=atype->radius[atoms->atom[n].type];
+               born->asurf[n]=(4*M_PI*(rn+radp)*(rn+radp));
+       }
+       
+       /* Then loop over the bonded interactions */
+       for(i=0;i<nbonds; )
+       {
+               type = forceatoms[i++];
+               ai   = forceatoms[i++];
+               aj   = forceatoms[i++];
+               
+               if(!is_hydrogen(*(atoms->atomname[ai])) &&
+                        !is_hydrogen(*(atoms->atomname[aj])))
+               {
+               
+                       /*printf("genborn.c: xi=%g, xj=%g\n",factor*x[ai][0],factor*x[aj][0]);  */
+               
+                       rvec_sub(x[ai],x[aj],dxx);      
+                       dr2  = iprod(dxx,dxx);  
+                       
+                       sar  = forceparams[type].gb.sar;
+                       bmlt = forceparams[type].gb.bmlt;
+                       rni  = sar+probd;
+                       
+                       rn   = atype->radius[atoms->atom[ai].type];
+                       ri   = atype->radius[atoms->atom[aj].type];
+                       pn   = atype->surftens[atoms->atom[ai].type];
+                       pi   = atype->surftens[atoms->atom[aj].type];                   
+                       /*sn   = atype->vol[atoms->atom[ai].type]; */
+                       /*si   = atype->vol[atoms->atom[aj].type]; */
+                       
+                       /*rni  = rn + ri +probd; */
+                       /*printf("genborn.c: rn=%g, ri=%g\n",ri,rn); */
+                       if(dr2<rni*rni)
+                       {
+                               /*printf("d=%g, s=%g\n",dr2,rni*rni);*/
+                               dist = dr2*invsqrt(dr2);
+                               t1ij = M_PI*(rni-dist);
+                               t2ij = (rn-ri)/dist;
+                               bij  = (rn+radp)*t1ij*(1-t2ij);
+                               bji  = (ri+radp)*t1ij*(1+t2ij);
+                               tij  = pn*bmlt*bij/(4*M_PI*(rn+radp)*(rn+radp));
+                               tji  = pi*bmlt*bji/(4*M_PI*(ri+radp)*(ri+radp));
+                               
+                               born->asurf[ai] = born->asurf[ai]*(1-tij);
+                               born->asurf[aj] = born->asurf[aj]*(1-tji);
+                       }
+               }
+       }
+       
+       /* Now loop over interactions >= 1-4 */
+       bmlt=0.3516;
+       
+       /*printf("NONBONDED INTERACTIONS\n");*/
+       
+       for(i=0;i<natoms;i++)
+       {
+               ai    = i;
+               nj0   = nl->jindex[ai];
+               nj1   = nl->jindex[ai+1];
+               
+               rai   = factor*atype->radius[atoms->atom[ai].type];
+               pn    = atype->surftens[atoms->atom[ai].type];
+               /*sn    = atype->vol[atoms->atom[ai].type];*/
+               
+               for(k=nj0;k<nj1;k++)
+               {
+                       aj  = nl->jjnr[k];
+                       
+                       if(!is_hydrogen(*(atoms->atomname[ai])) &&
+                                       !is_hydrogen(*(atoms->atomname[aj])))
+                       {
+                               raj = factor*atype->radius[atoms->atom[aj].type];
+                               pi  = atype->surftens[atoms->atom[aj].type];
+                               /*si  = atype->vol[atoms->atom[aj].type];*/
+                               rvec_sub(x[ai],x[aj],dxx);
+                       
+                               dr2 = factor*factor*iprod(dxx,dxx);
+                               rni = rai + raj + probd;
+                               /*printf("genborn.c: rn=%g, ri=%g, sar=%g, dr2=%g\n",rai,raj,sar,dr2);  */
+                               /*printf("genborn.c: xi=%g, xj=%g\n",factor*x[ai][0],factor*x[aj][0]); */
+                               
+                               if(dr2<rni*rni)
+                               {
+                                       /*printf("d=%g, s=%g\n",dr2,rni*rni);   */
+                                       dist = dr2*invsqrt(dr2);
+                                       t1ij = M_PI*(rni-dist);
+                                       t2ij = (rai-raj)/dist;
+                                       bij  = (rai+radp)*t1ij*(1-t2ij);
+                                       bji  = (raj+radp)*t1ij*(1+t2ij);
+                                       tij  = pn*bmlt*bij/(4*M_PI*(rai+radp)*(rai+radp));
+                                       tji  = pi*bmlt*bji/(4*M_PI*(raj+radp)*(raj+radp));
+                                                                       
+                                       born->asurf[ai]=born->asurf[ai]*(1-tij);
+                                       born->asurf[aj]=born->asurf[aj]*(1-tji);
+                               }
+                       }
+               }
+       }
+       /*
+       printf("AFTER BOTH AREA CALCULATIONS\n");
+       n=0;
+       for(i=0;i<natoms;i++)
+       {
+      if(!is_hydrogen(*atoms->atomname[i]))
+                       {       
+                               printf("%d, Still=%g, gromacs=%g\n",n,born->asurf[i], born->asurf[i]);
+                               
+                               born->as=born->as+born->asurf[i];
+                               born->as=born->as+born->asurf[i];
+                       }
+               n++;
+       }
+       */
+       /*printf("Total Still area=%g, Total new area=%g\n",born->as, born->as);*/
+        /*printf("nbonds=%d\n",nbonds);*/
+       /* Start to calculate the forces */
+       for(i=0;i<nbonds; )
+       {
+               type = forceatoms[i++];
+               ai   = forceatoms[i++];
+               aj   = forceatoms[i++];
+               
+               if(!is_hydrogen(*(atoms->atomname[ai])) &&
+                        !is_hydrogen(*(atoms->atomname[aj])))
+               {
+                       rvec_sub(x[ai],x[aj],dxx);      
+                       
+                       dr2  = factor*factor*iprod(dxx,dxx);    
+               
+                       sar  = factor*forceparams[type].gb.sar;
+                       bmlt = forceparams[type].gb.bmlt;
+                       rni  = sar+probd;
+                       
+                       rn   = factor*atype->radius[atoms->atom[ai].type];
+                       ri   = factor*atype->radius[atoms->atom[aj].type];
+                       pn   = atype->surftens[atoms->atom[ai].type];
+                       pi   = atype->surftens[atoms->atom[aj].type];                   
+                       sn   = atype->vol[atoms->atom[ai].type];
+                       si   = atype->vol[atoms->atom[aj].type];
+                       
+                       if(dr2<rni*rni)
+                       {
+                               dist = dr2*invsqrt(dr2);
+                               t1ij = M_PI*(rni-dist);
+                               t2ij = (rn-ri)/dist;
+                               bij  = (rn+radp)*t1ij*(1-t2ij);
+                               bji  = (ri+radp)*t1ij*(1+t2ij);
+                               
+                               dbij = M_PI*(rn+radp)*(dr2-(rni*(rn-ri)));
+                               dbji = M_PI*(ri+radp)*(dr2+(rni*(rn-ri)));
+                               
+                               t3ij = sn*born->asurf[ai]*dbij;
+                               t4ij = (4*M_PI*(rn+radp)*(rn+radp))/(pn*bmlt)-bij;
+                               t5ij = t3ij/t4ij;
+                               
+                               t3ji = si*born->asurf[aj]*dbji;
+                               t4ji = (4*M_PI*(ri+radp)*(ri+radp))/(pi*bmlt)-bji;
+                               t5ji = t3ji/t4ji;
+                               
+                               dpf  = (t5ij+t5ji)/(dr2*dist);
+                               /*printf("deriv_cut: ai=%d, xi=%g aj=%d, xj=%g\n",ai,x[ai][0], aj,x[aj][0]);*/
+                               for(k=0;k<DIM;k++)
+                               {
+                                       fij = factor*(-dpf)*dxx[k];
+                                       f[ai][k]+=fij;
+                                       f[aj][k]-=fij;
+                               }
+                       }
+               }
+       }
+       
+       /* Now calculate forces for all interactions >= 1-4 */
+       bmlt = 0.3516;
+       
+       for(i=0;i<natoms;i++)
+       {
+               ai  = i;
+               nj0 = nl->jindex[ai];
+               nj1 = nl->jindex[ai+1];
+               
+               rai   = factor*atype->radius[atoms->atom[ai].type];
+               pn    = atype->surftens[atoms->atom[ai].type];
+               sn    = atype->vol[atoms->atom[ai].type];
+               
+               for(k=nj0;k<nj1;k++)
+               {
+                       aj = nl->jjnr[k];
+                       
+                       if(!is_hydrogen(*(atoms->atomname[ai])) &&
+                                !is_hydrogen(*(atoms->atomname[aj])))
+                       {
+                               raj = factor*atype->radius[atoms->atom[aj].type];
+                               pi  = atype->surftens[atoms->atom[aj].type];
+                               si  = atype->vol[atoms->atom[aj].type];
+                               
+                               rvec_sub(x[ai],x[aj],dxx);
+                               
+                               dr2 = factor*factor*iprod(dxx,dxx);
+                               rni = rai + raj + probd;
+                               
+                               if(dr2<rni*rni)
+                               {
+                                       dist = dr2*invsqrt(dr2);
+                                       t1ij = M_PI*(rni-dist);
+                                       t2ij = (rai-raj)/dist;
+                                       bij  = (rai+radp)*t1ij*(1-t2ij);
+                                       bji  = (raj+radp)*t1ij*(1+t2ij);
+                                       
+                                       dbij = M_PI*(rai+radp)*(dr2-(rni*(rai-raj)));
+                                       dbji = M_PI*(raj+radp)*(dr2+(rni*(rai-raj)));
+                                       
+                                       t3ij = sn*born->asurf[ai]*dbij;
+                                       t4ij = (4*M_PI*(rai+radp)*(rai+radp))/(pn*bmlt)-bij;
+                                       t5ij = t3ij/t4ij;
+                                       
+                                       t3ji = si*born->asurf[aj]*dbji;
+                                       t4ji = (4*M_PI*(raj+radp)*(raj+radp))/(pi*bmlt)-bji;
+                                       t5ji = t3ji/t4ji;
+                                       
+                                       dpf  = (t5ij+t5ji)/(dr2*dist);
+                                       /*printf("deriv_cut: ai=%d, xi=%g aj=%d, xj=%g\n",ai,x[ai][0], aj,x[aj][0]);*/
+                                       for(n=0;n<DIM;n++)
+                                       {
+                                               fij = factor*(-dpf)*dxx[n];
+                                               f[ai][n]+=fij;
+                                               f[aj][n]-=fij;
+                                       }
+                               }
+                       }
+               }
+       }
+       /*
+       printf("AFTER BOTH FORCES CALCULATIONS\n");
+       n=0;
+       for(i=0;i<natoms;i++)
+       {
+               if(!is_hydrogen(*atoms->atomname[i]))
+               {       
+                       printf("%d, gx=%g, gy=%g, gz=%g\n",
+                                                n,
+                                                faction[i*3], 
+                                                faction[i*3+1],
+                                                faction[i*3+2],
+                                                f[i][0],
+                                                f[i][1],
+                                                f[i][2]);
+               }
+               n++;
+       }
+       */
+  return 0;
+}
+
+int calc_surfBrooks(t_inputrec *ir,
+                   t_idef     *idef,
+                   t_atoms    *atoms,
+                   rvec       x[],
+                   gmx_genborn_t     *born,
+                   t_atomtypes *atype,
+                   double      *faction,
+                   int natoms)
+{
+  int i,j,k;
+  real xi,yi,zi,dx,dy,dz,ri,rj;
+  real rho,rho2,rho6,r2,r,aij,aijsum,daij;
+  real kappa,sassum,tx,ty,tz,fix1,fiy1,fiz1;
+
+  real ck[5];
+  real Aij[natoms];
+  real sasi[natoms];
+
+  /* Brooks parameter for cutoff between atom pairs
+   * Increasing kappa will increase the number of atom pairs
+   * included in the calculation, which will also slow the calculation
+   */
+  kappa=0;
+
+  sassum=0;
+
+  /* Hydrogen atoms are included in via the ck parameters for the
+   * heavy atoms
+   */
+  for(i=0;i<natoms;i++)
+    {
+      /*if(strncmp(*atoms->atomname[i],"H",1)!=0) */
+      /*{ */
+         xi=x[i][0];
+         yi=x[i][1];
+         zi=x[i][2];
+         
+         fix1=0;
+         fiy1=0;
+         fiz1=0;
+
+         ri=atype->radius[atoms->atom[i].type];
+         aijsum=0;
+         
+         for(j=0;j<natoms;j++)
+           {
+             /*if(strncmp(*atoms->atomname[j],"H",1)!=0 && i!=j) */
+             if(i!=j)
+             {
+                 dx=xi-x[j][0];
+                 dy=yi-x[j][1];
+                 dz=zi-x[j][2];
+                 
+                 r2=dx*dx+dy*dy+dz*dz;
+                 r=sqrt(r2);
+                 rj=atype->radius[atoms->atom[j].type];
+                 
+                 rho=ri+rj+kappa;
+                 rho2=rho*rho;
+                 rho6=rho2*rho2*rho2;
+
+                   /* Cutoff test */
+                   if(r<=rho)
+                     {
+                       aij=pow((rho2-r2)*(rho2+2*r2),2)/rho6;
+                       daij=((4*r*(rho2-r2)*(rho2-r2))/rho6)-((4*r*(rho2-r2)*(2*r2+rho2))/rho6);
+                       tx=daij*dx;
+                       ty=daij*dy;
+                       tz=daij*dz;
+                       
+                       fix1=fix1+tx;
+                       fiy1=fiy1+ty;
+                       fiz1=fiz1+tz;
+
+                       faction[j*3]=faction[j*3]-tx;
+                       faction[j*3+1]=faction[j*3+1]-ty;
+                       faction[j*3+2]=faction[j*3+2]-tz;
+                       
+                       aijsum=aijsum+aij;
+                       printf("xi=%g, xj=%g, fscal=%g\n",xi,x[j][0],daij);
+                     }
+               }
+           }
+         
+         faction[i*3]=faction[i*3]+fix1;
+         faction[i*3+1]=faction[i*3+1]+fiy1;
+         faction[i*3+2]=faction[i*3+2]+fiz1;
+         
+         /* Calculate surface area coefficient */
+         Aij[i]=pow(aijsum,1/4);
+         
+         for(k=0;k<5;k++)
+           {
+             sasi[i]=sasi[i]+ck[k]*(pow(Aij[i],k));
+           }
+         
+         /* Increase total surface area */
+         sassum=sassum+sasi[i];
+         
+         /*}*/
+    }
+
+  printf("Brooks total surface area is: %g\n", sassum);
+
+
+  return 0;
+}
+
+
+/* This will set up a really simple neighborlist for GB calculations
+ * so that each atom will have enervy other atom in its list.
+ * We don't worry about things liks load balancing etc ...
+ */
+ int do_gb_neighborlist(t_forcerec *fr, int natoms,t_atoms *atoms, t_ilist *il, int nbonds, int n12n13)
+ {
+   int i,j,k,m,ai,aj,ak,an,idx=0,idx_sr,vs_idx;
+   int nalloc=0;
+   int skip[natoms];
+   bonds_t *bonds,*bonds13;
+        
+   /* Calculate the number of elements in the jjnr array 
+       * For the gblist_sr, this will be an exact allocation, but for
+       * gblist_lr, this will be a few elements to much
+       */
+   fr->gblist_sr.count=0;
+   fr->gblist_sr.nri=natoms;
+   fr->gblist_sr.maxnri=natoms;
+        
+   fr->gblist_lr.count=0;
+   fr->gblist_lr.nri=natoms;
+   fr->gblist_lr.maxnri=natoms;
+        
+   for(i=0;i<natoms;i++)
+     {
+               fr->gblist_sr.iinr[i]=i;
+               fr->gblist_sr.gid[i]=0;
+               fr->gblist_sr.shift[i]=0;
+                        
+               fr->gblist_lr.iinr[i]=i;
+               fr->gblist_lr.gid[i]=0;
+               fr->gblist_lr.shift[i]=0;
+     }
+
+   fr->gblist_sr.enlist=0;
+   fr->gblist_sr.maxlen=natoms;
+   fr->gblist_lr.enlist=0;
+   fr->gblist_lr.maxlen=natoms;
+        
+   /* Start the lr list */
+       idx=0;
+       idx_sr=0;
+                        
+       for(i=0;i<natoms;i++)
+        skip[i]=-1;
+                
+       snew(bonds,natoms); 
+       snew(bonds13,natoms);
+        
+       for(i=0;i<nbonds; )
+       {
+               m=il->iatoms[i++];
+               ai=il->iatoms[i++];
+               aj=il->iatoms[i++];
+                                
+               bonds[ai].bond[bonds[ai].nbonds]=aj;
+               bonds[ai].nbonds++;
+               bonds[aj].bond[bonds[aj].nbonds]=ai;
+               bonds[aj].nbonds++;
+        }
+                
+       for(i=nbonds;i<n12n13; )
+        {
+                m=il->iatoms[i++];
+                ai=il->iatoms[i++];
+                aj=il->iatoms[i++];
+                
+                bonds13[ai].bond[bonds13[ai].nbonds]=aj;
+                bonds13[ai].nbonds++;
+                bonds13[aj].bond[bonds13[aj].nbonds]=ai;
+                bonds13[aj].nbonds++;
+        }
+                
+       for(i=0;i<natoms;i++)
+        {
+                skip[i]=i;
+                                       
+                for(k=0;k<bonds[i].nbonds;k++)
+                       skip[bonds[i].bond[k]]=i;
+                
+                for(k=0;k<bonds13[i].nbonds;k++)
+                       skip[bonds13[i].bond[k]]=i;
+                       
+                fr->gblist_lr.jindex[i]=idx;   
+                fr->gblist_sr.jindex[i]=idx_sr;
+                
+                for(k=i+1;k<natoms;k++)
+                {
+                       if(skip[k]!=i)
+                       {
+                               fr->gblist_lr.jjnr[idx++]=k;
+                       }
+                }
+                
+               for(k=0;k<natoms;k++)
+                {
+                       if(skip[k]!=i)
+                       {
+                               fr->gblist_sr.jjnr[idx_sr++]=k;
+                       }
+                }
+                
+        }
+       
+        fr->gblist_lr.jindex[i]=idx;
+        fr->gblist_sr.jindex[i]=idx_sr;
+   
+        sfree(bonds);
+        sfree(bonds13);
+          
+        return 0;
+ }
+
+int gb_nblist_siev(t_commrec *cr, int natoms, int gb_algorithm, real gbcut, rvec x[], t_forcerec *fr, t_ilist *il, int n14)
+{
+       int i,l,ii,j,k,n,nj0,nj1,ai,aj,idx,ii_idx,nalloc,at0,at1;
+       double dr2,gbcut2;
+       rvec  dxx;
+       t_nblist *nblist;
+
+       int count[natoms];
+       int **atoms;
+       
+       memset(count,0,sizeof(int)*natoms);
+       atoms=(int **) malloc(sizeof(int *)*natoms);
+       
+       if(PAR(cr))
+       {
+               pd_at_range(cr,&at0,&at1); 
+       }
+       else
+       {
+               at0=0;
+               at1=natoms;
+       }
+       
+       int found;
+       
+       for(i=0;i<natoms;i++)
+               atoms[i]=(int *) malloc(sizeof(int)*natoms);
+
+       if(gb_algorithm==egbHCT || gb_algorithm==egbOBC)
+       {
+               /* Loop over 1-2, 1-3 and 1-4 interactions */
+               for(k=0;k<il->nr;k+=3)
+               {
+                       ai=il->iatoms[k+1];
+                       aj=il->iatoms[k+2];
+                       
+                       found=0;
+                       
+                       /* So that we do not add the same bond twice. This happens with some constraints between 1-3 atoms
+                        * that are in the bond-list but should not be in the GB nb-list */
+                       for(i=0;i<count[ai];i++)
+                       {
+                               if(atoms[ai][i]==aj)
+                                       found=1;
+                       }       
+                        
+                       /* When doing HCT or OBC, we need to add all interactions to the nb-list twice 
+                        * since the loop for calculating the Born-radii runs over all vs all atoms */  
+                       if(found==0)
+                       {
+                               atoms[ai][count[ai]]=aj;
+                               count[ai]++;
+                       
+                               atoms[aj][count[aj]]=ai;
+                               count[aj]++;
+                       }
+               }
+       }
+
+       if(gb_algorithm==egbSTILL)
+       {
+               /* Loop over 1-4 interactions */
+               for(k=n14;k<il->nr;k+=3)
+               {
+                       ai=il->iatoms[k+1];
+                       aj=il->iatoms[k+2];
+                       
+                       found=0;
+                       
+                       for(i=0;i<count[ai];i++)
+                       {
+                               if(atoms[ai][i]==aj)
+                                       found=1;
+                       }       
+                        
+                       /* Also for Still, we need to add (1-4) interactions twice */
+                       atoms[ai][count[ai]]=aj;
+                       count[ai]++;
+                       
+                       atoms[aj][count[aj]]=ai;
+                       count[aj]++;
+                       
+               }
+       }
+                       
+       /* Loop over the VDWQQ and VDW nblists to set up the nonbonded part of the GB list */
+       for(n=0; (n<fr->nnblists); n++)
+       {
+               for(i=0; (i<eNL_NR); i++)
+               {
+                       nblist=&(fr->nblists[n].nlist_sr[i]);
+                       
+                       if(nblist->nri>0 && (i==eNL_VDWQQ || i==eNL_QQ))
+                       {
+                               for(j=0;j<nblist->nri;j++)
+                               {
+                                       ai = nblist->iinr[j];
+                       
+                                       nj0=nblist->jindex[j];
+                                       nj1=nblist->jindex[j+1];
+                               
+                                       for(k=nj0;k<nj1;k++)
+                                       {
+                                               aj=nblist->jjnr[k];
+                                               
+                                               if(ai>aj)
+                                               {
+                                                       atoms[aj][count[aj]]=ai;
+                                                       count[aj]++;
+                                                       
+                                                       /* We need to add all interactions to the nb-list twice 
+                                                        * since the loop for calculating the Born-radii runs over all vs all atoms 
+                                                        */
+                                                       atoms[ai][count[ai]]=aj;
+                                                       count[ai]++;
+                                               }
+                                               else
+                                               {
+                                                       atoms[ai][count[ai]]=aj;
+                                                       count[ai]++;
+                                                       
+                                                       atoms[aj][count[aj]]=ai;
+                                                       count[aj]++;
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+               
+       idx=0;
+       ii_idx=0;
+       
+       for(i=0;i<natoms;i++)
+       {
+               fr->gblist.iinr[ii_idx]=i;
+       
+               for(k=0;k<count[i];k++)
+               {
+                       fr->gblist.jjnr[idx++]=atoms[i][k];
+               }
+               
+               fr->gblist.jindex[ii_idx+1]=idx;
+               ii_idx++;
+       }
+       
+       fr->gblist.nrj=idx;
+       
+       for(i=0;i<natoms;i++)
+               free(atoms[i]);
+       
+       free(atoms);
+       
+       return 0;
+}
+
diff --git a/src/mdlib/genborn_sse.c b/src/mdlib/genborn_sse.c
new file mode 100644 (file)
index 0000000..9637976
--- /dev/null
@@ -0,0 +1,2537 @@
+/*
+ * $Id$
+ * 
+ *                This source code is part of
+ * 
+ *                 G   R   O   M   A   C   S
+ * 
+ *          GROningen MAchine for Chemical Simulations
+ * 
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2008, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * 
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ * 
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ * 
+ * For more info, check our website at http://www.gromacs.org
+ * 
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "genborn.h"
+#include "vec.h"
+#include "grompp.h"
+#include "pdbio.h"
+#include "names.h"
+#include "physics.h"
+#include "partdec.h"
+#include "network.h"
+#include "gmx_fatal.h"
+#include "mtop_util.h"
+#include "genborn.h"
+
+#ifdef GMX_MPI
+#include "mpi.h"
+#endif
+
+/* Only compile this file if SSE intrinsics are available */
+#if ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) )
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+
+/* SIMD (SSE1+MMX indeed) implementation of sin, cos, exp and log
+ Inspired by Intel Approximate Math library, and based on the
+ corresponding algorithms of the cephes math library
+ */
+
+/* Copyright (C) 2007  Julien Pommier
+ This software is provided 'as-is', without any express or implied
+ warranty.  In no event will the authors be held liable for any damages
+ arising from the use of this software.
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+ (this is the zlib license)
+ */
+
+#ifdef _MSC_VER /* visual c++ */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END 
+#else /* gcc or icc */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+#define _PS_CONST(Name, Val)                                            \
+static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PI32_CONST(Name, Val)                                            \
+static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val)                                 \
+static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+
+
+/* Still parameters - make sure to edit in genborn.c too if you change these! */
+#define STILL_P1  0.073*0.1              /* length        */
+#define STILL_P2  0.921*0.1*CAL2JOULE    /* energy*length */
+#define STILL_P3  6.211*0.1*CAL2JOULE    /* energy*length */
+#define STILL_P4  15.236*0.1*CAL2JOULE
+#define STILL_P5  1.254 
+
+#define STILL_P5INV (1.0/STILL_P5)
+#define STILL_PIP5  (M_PI*STILL_P5)
+
+
+
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+/* the smallest non denormalized float number */
+_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
+_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
+
+_PS_CONST_TYPE(sign_mask, int, 0x80000000);
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f);
+
+_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
+_PS_CONST(cephes_log_p0, 7.0376836292E-2);
+_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
+_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
+_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
+_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
+_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
+_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS_CONST(cephes_log_q1, -2.12194440e-4);
+_PS_CONST(cephes_log_q2, 0.693359375);
+
+_PS_CONST(minus_cephes_DP1, -0.78515625);
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005);
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); /* 4 / M_PI */
+
+_PS_CONST(exp_hi,      88.3762626647949f);
+_PS_CONST(exp_lo,      -88.3762626647949f);
+
+_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
+_PS_CONST(cephes_exp_C1, 0.693359375);
+_PS_CONST(cephes_exp_C2, -2.12194440e-4);
+
+_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+
+#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
+xmm_mm_union u; u.xmm = xmm_;                   \
+mm0_ = u.mm[0];                                 \
+mm1_ = u.mm[1];                                 \
+}
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
+xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
+}
+
+typedef
+union 
+{
+       __m128 sse;
+       float  f[4];
+} my_sse_t;
+
+
+
+typedef union xmm_mm_union {
+       __m128 xmm;
+       __m64 mm[2];
+} xmm_mm_union;
+
+
+void sincos_ps(__m128 x, __m128 *s, __m128 *c) {
+       __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+       __m64 mm0, mm1, mm2, mm3, mm4, mm5;
+       sign_bit_sin = x;
+       /* take the absolute value */
+       x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
+       /* extract the sign bit (upper one) */
+       sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);
+       
+       /* scale by 4/Pi */
+       y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
+    
+       /* store the integer part of y in mm0:mm1 */
+       xmm3 = _mm_movehl_ps(xmm3, y);
+       mm2 = _mm_cvttps_pi32(y);
+       mm3 = _mm_cvttps_pi32(xmm3);
+       
+       /* j=(j+1) & (~1) (see the cephes sources) */
+       mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
+       mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
+       mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
+       mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
+       
+       y = _mm_cvtpi32x2_ps(mm2, mm3);
+       
+       mm4 = mm2;
+       mm5 = mm3;
+       
+       /* get the swap sign flag for the sine */
+       mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
+       mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
+       mm0 = _mm_slli_pi32(mm0, 29);
+       mm1 = _mm_slli_pi32(mm1, 29);
+       __m128 swap_sign_bit_sin;
+       COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+       
+       /* get the polynom selection mask for the sine */
+       
+       mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
+       mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
+       mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+       mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+       __m128 poly_mask;
+       COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+       
+       /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+       xmm1 = *(__m128*)_ps_minus_cephes_DP1;
+       xmm2 = *(__m128*)_ps_minus_cephes_DP2;
+       xmm3 = *(__m128*)_ps_minus_cephes_DP3;
+       xmm1 = _mm_mul_ps(y, xmm1);
+       xmm2 = _mm_mul_ps(y, xmm2);
+       xmm3 = _mm_mul_ps(y, xmm3);
+       x = _mm_add_ps(x, xmm1);
+       x = _mm_add_ps(x, xmm2);
+       x = _mm_add_ps(x, xmm3);
+       
+       
+       /* get the sign flag for the cosine */
+       
+       mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
+       mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
+       mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
+       mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
+       mm4 = _mm_slli_pi32(mm4, 29);
+       mm5 = _mm_slli_pi32(mm5, 29);
+       __m128 sign_bit_cos;
+       COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+       
+       sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+       
+       
+       /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+       __m128 z = _mm_mul_ps(x,x);
+       y = *(__m128*)_ps_coscof_p0;
+       
+       y = _mm_mul_ps(y, z);
+       y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
+       y = _mm_mul_ps(y, z);
+       y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
+       y = _mm_mul_ps(y, z);
+       y = _mm_mul_ps(y, z);
+       __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
+       y = _mm_sub_ps(y, tmp);
+       y = _mm_add_ps(y, *(__m128*)_ps_1);
+       
+       /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+       
+       __m128 y2 = *(__m128*)_ps_sincof_p0;
+       y2 = _mm_mul_ps(y2, z);
+       y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
+       y2 = _mm_mul_ps(y2, z);
+       y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
+       y2 = _mm_mul_ps(y2, z);
+       y2 = _mm_mul_ps(y2, x);
+       y2 = _mm_add_ps(y2, x);
+       
+       /* select the correct result from the two polynoms */  
+       xmm3 = poly_mask;
+       __m128 ysin2 = _mm_and_ps(xmm3, y2);
+       __m128 ysin1 = _mm_andnot_ps(xmm3, y);
+       y2 = _mm_sub_ps(y2,ysin2);
+       y = _mm_sub_ps(y, ysin1);
+       
+       xmm1 = _mm_add_ps(ysin1,ysin2);
+       xmm2 = _mm_add_ps(y,y2);
+       
+       /* update the sign */
+       *s = _mm_xor_ps(xmm1, sign_bit_sin);
+       *c = _mm_xor_ps(xmm2, sign_bit_cos);
+       _mm_empty(); /* good-bye mmx */
+}
+
+
+__m128 log_ps(__m128 x) {
+       __m64 mm0, mm1;
+       __m128 one = *(__m128*)_ps_1;
+       
+       __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
+       
+       x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos);  /* cut off denormalized stuff */
+       
+       
+       /* part 1: x = frexpf(x, &e); */
+       COPY_XMM_TO_MM(x, mm0, mm1);
+       mm0 = _mm_srli_pi32(mm0, 23);
+       mm1 = _mm_srli_pi32(mm1, 23);
+       /* keep only the fractional part */
+       x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask);
+       x = _mm_or_ps(x, *(__m128*)_ps_0p5);
+       
+       /* now e=mm0:mm1 contain the really base-2 exponent */
+       mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f);
+       
+       
+       mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f);
+       
+       __m128 e = _mm_cvtpi32x2_ps(mm0, mm1);
+       e = _mm_add_ps(e, one);
+       
+       /* part2: 
+     if( x < SQRTHF ) {
+        e -= 1;
+        x = x + x - 1.0;
+     } else { x = x - 1.0; }
+        */
+       __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF);
+       
+       __m128 tmp = _mm_and_ps(x, mask);
+       x = _mm_sub_ps(x, one);
+       e = _mm_sub_ps(e, _mm_and_ps(one, mask));
+       x = _mm_add_ps(x, tmp);
+       
+       
+       __m128 z = _mm_mul_ps(x,x);
+       
+       __m128 y = *(__m128*)_ps_cephes_log_p0;
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1);
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2);
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3);
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4);
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5);
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6);
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7);
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8);
+       y = _mm_mul_ps(y, x);
+       
+       y = _mm_mul_ps(y, z);
+       
+       
+       tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1);
+       y = _mm_add_ps(y, tmp);
+       
+       
+       tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
+       y = _mm_sub_ps(y, tmp);
+       
+       tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2);
+       x = _mm_add_ps(x, y);
+       x = _mm_add_ps(x, tmp);
+       x = _mm_or_ps(x, invalid_mask); /* negative arg will be NAN */
+       _mm_empty();
+       return x;
+}
+
+__m128 exp_ps(__m128 x) {
+       __m128 tmp = _mm_setzero_ps(), fx;
+       __m64 mm0, mm1;
+       __m128 one = *(__m128*)_ps_1;
+       
+       x = _mm_min_ps(x, *(__m128*)_ps_exp_hi);
+       x = _mm_max_ps(x, *(__m128*)_ps_exp_lo);
+       
+       /* express exp(x) as exp(g + n*log(2)) */
+       fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF);
+       fx = _mm_add_ps(fx, *(__m128*)_ps_0p5);
+       
+       /* how to perform a floorf with SSE: just below */
+       /* step 1 : cast to int */
+       tmp = _mm_movehl_ps(tmp, fx);
+       mm0 = _mm_cvttps_pi32(fx);
+       mm1 = _mm_cvttps_pi32(tmp);
+       /* step 2 : cast back to float */
+       tmp = _mm_cvtpi32x2_ps(mm0, mm1);
+       /* if greater, substract 1 */
+       __m128 mask = _mm_cmpgt_ps(tmp, fx);    
+       mask = _mm_and_ps(mask, one);
+       fx = _mm_sub_ps(tmp, mask);
+       
+       tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1);
+       __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2);
+       x = _mm_sub_ps(x, tmp);
+       x = _mm_sub_ps(x, z);
+       
+       z = _mm_mul_ps(x,x);
+       
+       __m128 y = *(__m128*)_ps_cephes_exp_p0;
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1);
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2);
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3);
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4);
+       y = _mm_mul_ps(y, x);
+       y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5);
+       y = _mm_mul_ps(y, z);
+       y = _mm_add_ps(y, x);
+       y = _mm_add_ps(y, one);
+       
+       /* build 2^n */
+       z = _mm_movehl_ps(z, fx);
+       mm0 = _mm_cvttps_pi32(fx);
+       mm1 = _mm_cvttps_pi32(z);
+       mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f);
+       mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f);
+       mm0 = _mm_slli_pi32(mm0, 23); 
+       mm1 = _mm_slli_pi32(mm1, 23);
+       
+       __m128 pow2n; 
+       COPY_MM_TO_XMM(mm0, mm1, pow2n);
+       
+       y = _mm_mul_ps(y, pow2n);
+       _mm_empty();
+       return y;
+}
+
+
+int 
+calc_gb_rad_still_sse(t_commrec *cr, t_forcerec *fr,int natoms, gmx_mtop_t *mtop,
+                                         const t_atomtypes *atype, real *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
+{
+
+       int i,k,n,ai,ai3,aj1,aj2,aj3,aj4,nj0,nj1,offset;
+       int aj13,aj23,aj33,aj43;
+       int at0,at1;
+       real *sum_gpi;
+
+       real gpi_ai,gpi2,gpi_tmp;
+       real factor;
+       
+       __m128 ix,iy,iz;
+       __m128 jx,jy,jz;
+       __m128 dx,dy,dz;
+       __m128 t1,t2,t3;
+       __m128 rsq11,rinv,rinv2,rinv4,rinv6;
+       __m128 ratio,gpi,rai,raj,vaj,rvdw,mask_cmp;
+       __m128 ccf,dccf,theta,cosq,term,sinq,res,prod;
+       __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8; 
+       
+       __m128i mask, maski;
+       
+       const __m128 half  = {0.5f , 0.5f , 0.5f , 0.5f };
+       const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
+       const __m128 one   = {1.0f,  1.0f , 1.0f , 1.0f };
+       const __m128 two   = {2.0f , 2.0f , 2.0f,  2.0f };
+       const __m128 zero  = {0.0f , 0.0f , 0.0f , 0.0f };
+       const __m128 four  = {4.0f , 4.0f , 4.0f , 4.0f };
+       
+       const __m128 p5inv  = {STILL_P5INV, STILL_P5INV, STILL_P5INV, STILL_P5INV};
+       const __m128 pip5   = {STILL_PIP5,  STILL_PIP5,  STILL_PIP5,  STILL_PIP5};
+       const __m128 p4     = {STILL_P4,    STILL_P4,    STILL_P4,    STILL_P4};
+               
+       if(PAR(cr))
+       {
+               pd_at_range(cr,&at0,&at1);
+       }
+       else
+       {
+               at0=0;
+               at1=natoms;
+       }
+       
+       sum_gpi = born->work;   
+       factor = 0.5 * ONE_4PI_EPS0;
+       
+       /* keep the compiler happy */
+       t1 = _mm_setzero_ps();
+       t2 = _mm_setzero_ps();
+       t3 = _mm_setzero_ps();
+       
+       aj1  = aj2  = aj3  = aj4  = 0;
+       aj13 = aj23 = aj33 = aj43 = 0;
+       n = 0;
+       
+       for(i=0;i<natoms;i++)
+               born->bRad[i]=fr->invsqrta[i]=1;
+       
+       for(i=0;i<nl->nri;i++)
+       {
+               ai     = i;
+               ai3        = ai*3;
+               
+               nj0    = nl->jindex[ai];
+               nj1    = nl->jindex[ai+1];
+               
+               offset = (nj1-nj0)%4;
+               
+               /* Polarization energy for atom ai */
+               gpi_ai = born->gpol[ai];
+               gpi    = _mm_set1_ps(0.0);
+               
+               /* Load particle ai coordinates */
+               ix     = _mm_set1_ps(x[ai3]);
+               iy     = _mm_set1_ps(x[ai3+1]);
+               iz     = _mm_set1_ps(x[ai3+2]);
+               
+               /* Load particle ai gb_radius */
+               rai = _mm_set1_ps(mtop->atomtypes.gb_radius[md->typeA[ai]]);
+               
+               if(PAR(cr))
+                       sum_gpi[ai] = 0;
+               
+               for(k=nj0;k<nj1-offset;k+=4)
+               {
+                       aj1 = nl->jjnr[k];       
+                       aj2 = nl->jjnr[k+1];
+                       aj3 = nl->jjnr[k+2];
+                       aj4 = nl->jjnr[k+3];
+                       
+                       aj13 = aj1 * 3; 
+                       aj23 = aj2 * 3;
+                       aj33 = aj3 * 3;
+                       aj43 = aj4 * 3;
+                       
+                       /* Load particle aj1-4 and transpose */
+                       xmm1 = _mm_loadu_ps(x+aj13);
+                       xmm2 = _mm_loadu_ps(x+aj23);
+                       xmm3 = _mm_loadu_ps(x+aj33);
+                       xmm4 = _mm_loadu_ps(x+aj43);
+                       
+                       xmm5 = _mm_unpacklo_ps(xmm1, xmm2); 
+                       xmm6 = _mm_unpacklo_ps(xmm3, xmm4);
+                       xmm7 = _mm_unpackhi_ps(xmm1, xmm2);
+                       xmm8 = _mm_unpackhi_ps(xmm3, xmm4);
+                       
+                       jx = _mm_movelh_ps(xmm5, xmm6);
+                       jy = _mm_unpackhi_ps(xmm5, xmm6);
+                       jy = _mm_shuffle_ps(jy,jy,_MM_SHUFFLE(3,1,2,0));
+                       jz = _mm_movelh_ps(xmm7, xmm8);
+                       
+                       dx    = _mm_sub_ps(ix, jx);
+                       dy    = _mm_sub_ps(iy, jy);
+                       dz    = _mm_sub_ps(iz, jz);
+                       
+                       t1    = _mm_mul_ps(dx,dx);
+                       t2    = _mm_mul_ps(dy,dy);
+                       t3    = _mm_mul_ps(dz,dz);
+                       
+                       rsq11 = _mm_add_ps(t1,t2);
+                       rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare */
+                       
+                       /* Perform reciprocal square root lookup, 12 bits accuracy */
+                       t1        = _mm_rsqrt_ps(rsq11);   /* t1=lookup, r2=x */
+                       /* Newton-Rhapson iteration */
+                       t2        = _mm_mul_ps(t1,t1); /* lu*lu */
+                       t3        = _mm_mul_ps(rsq11,t2);  /* x*lu*lu */
+                       t3        = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+                       t3        = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+                       rinv      = _mm_mul_ps(half,t3); /* result for all four particles */
+                       
+                       rinv2     = _mm_mul_ps(rinv,rinv);
+                       rinv4     = _mm_mul_ps(rinv2,rinv2);
+                       rinv6     = _mm_mul_ps(rinv2,rinv4);
+                        
+                       xmm1 = _mm_load_ss(born->vsolv+aj1); /*see comment at invsqrta*/
+                       xmm2 = _mm_load_ss(born->vsolv+aj2); 
+                       xmm3 = _mm_load_ss(born->vsolv+aj3); 
+                       xmm4 = _mm_load_ss(born->vsolv+aj4);
+                       
+                       xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /*j1 j1 j2 j2*/
+                       xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /*j3 j3 j4 j4*/
+                       vaj  = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+                       
+                       raj       = _mm_set_ps(mtop->atomtypes.gb_radius[md->typeA[aj4]],
+                                                                  mtop->atomtypes.gb_radius[md->typeA[aj3]],
+                                                                  mtop->atomtypes.gb_radius[md->typeA[aj2]],
+                                                                  mtop->atomtypes.gb_radius[md->typeA[aj1]]);
+                                                                                          
+                       rvdw      = _mm_add_ps(rai,raj); 
+                       rvdw      = _mm_mul_ps(rvdw,rvdw);
+                       ratio     = _mm_div_ps(rsq11,rvdw); /*ratio = dr2/(rvdw*rvdw)*/
+                                               
+                       mask_cmp  = _mm_cmpgt_ps(ratio,p5inv); /*if ratio>p5inv */
+                       
+                       switch(_mm_movemask_ps(mask_cmp))
+                       {
+                               case 0xF:
+                                       ccf  = one;
+                                       dccf = zero;
+                                       break;
+                               default:
+                       
+                                       theta     = _mm_mul_ps(ratio,pip5);
+                                       sincos_ps(theta,&sinq,&cosq); /* sine and cosine        */                      
+                                       term      = _mm_sub_ps(one,cosq); /*1-cosq*/
+                                       term      = _mm_mul_ps(half,term); /*0.5*(1.0-cosq)*/
+                                       ccf           = _mm_mul_ps(term,term); /* term*term */
+                                       dccf      = _mm_mul_ps(two,term); /* 2 * term */
+                                       dccf      = _mm_mul_ps(dccf,sinq); /* 2*term*sinq */
+                                       dccf      = _mm_mul_ps(dccf,pip5); /*2*term*sinq*pip5 */
+                                       dccf      = _mm_mul_ps(dccf,ratio); /*dccf = 2*term*sinq*PIP5*ratio*/
+                       
+                                       ccf           = (mask_cmp & one)  | _mm_andnot_ps(mask_cmp,ccf); /*conditional as a mask*/
+                                       dccf      = (mask_cmp & zero) | _mm_andnot_ps(mask_cmp,dccf);
+                                       
+                       }
+                       
+                       prod      = _mm_mul_ps(p4,vaj); 
+                       xmm2      = _mm_mul_ps(ccf,rinv4);
+                       xmm2      = _mm_mul_ps(xmm2,prod); /*prod*ccf*idr4*/
+                       gpi               = _mm_add_ps(gpi,xmm2); /* gpi = gpi + prod*ccf*idr4  */
+                       
+                       /* Chain rule terms */
+                       ccf       = _mm_mul_ps(four,ccf);
+                       xmm3      = _mm_sub_ps(ccf,dccf);
+                       xmm3      = _mm_mul_ps(xmm3,rinv6);
+                       xmm1      = _mm_mul_ps(xmm3,prod);
+                       
+                       _mm_storeu_ps(fr->dadx+n, xmm1);
+                       
+                       n = n + 4;
+               }
+               
+               /* deal with odd elements*/
+               if(offset!=0)
+               {
+                       aj1=aj2=aj3=aj4=0;
+                       
+                       if(offset==1)
+                       {
+                               aj1 = nl->jjnr[k];       /*jnr1-4*/
+                               aj13 = aj1 * 3; /*Replace jnr with j3*/
+                               
+                               xmm1 = _mm_loadu_ps(x+aj13);
+                               
+                               xmm6 = xmm1; /* x1 - - - */
+                               xmm4 = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(2,3,0,1)); /* y1 - - - */
+                               xmm5 = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(3,0,1,2)); /* z1 - - - */
+                               
+                               raj       = _mm_set_ps(0.0f, 0.0f, 0.0f, mtop->atomtypes.gb_radius[md->typeA[aj1]]); 
+                               vaj       = _mm_set_ps(0.0f, 0.0f, 0.0f, born->vsolv[aj1]);                                
+                                                                  
+                               mask = _mm_set_epi32(0,0,0,0xffffffff);
+
+                       }
+                       else if(offset==2)
+                       {
+                               aj1 = nl->jjnr[k];       
+                               aj2 = nl->jjnr[k+1];
+                               
+                               aj13 = aj1 * 3; 
+                               aj23 = aj2 * 3;
+                               
+                               xmm1 = _mm_loadu_ps(x+aj13);
+                               xmm2 = _mm_loadu_ps(x+aj23);
+                                                       
+                               xmm6 = _mm_unpacklo_ps(xmm1, xmm2); /*x1,x2,y1,y2 , done x */
+                               xmm4 = _mm_shuffle_ps( xmm6, xmm6, _MM_SHUFFLE(3,2,3,2)); /* y1, y2, y1, y2, done y */
+                               xmm5 = _mm_unpackhi_ps(xmm1, xmm2); /* z1, z2, z1, z2 , done z */
+               
+                               raj       = _mm_set_ps(0.0f, 0.0f, mtop->atomtypes.gb_radius[md->typeA[aj2]],mtop->atomtypes.gb_radius[md->typeA[aj1]]); 
+                               vaj       = _mm_set_ps(0.0f, 0.0f, born->vsolv[aj2], born->vsolv[aj1]);         
+                               
+                               mask = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+
+                       }
+                       else
+                       {
+                               aj1 = nl->jjnr[k];       
+                               aj2 = nl->jjnr[k+1];
+                               aj3 = nl->jjnr[k+2];
+                                               
+                               aj13 = aj1 * 3; 
+                               aj23 = aj2 * 3;
+                               aj33 = aj3 * 3;
+                               
+                               xmm1 = _mm_loadu_ps(x+aj13);
+                               xmm2 = _mm_loadu_ps(x+aj23);
+                               xmm3 = _mm_loadu_ps(x+aj33);
+                                                                                       
+                               xmm5 = _mm_unpacklo_ps(xmm1, xmm2); /* x1, x2, y1, y2 */
+                               xmm6 = _mm_movelh_ps(xmm5, xmm3); /* x1, x2, x3, y3, done x */
+
+                               xmm4 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(3,2,3,2)); /*y1, y2, x2, x1 */
+                               xmm4 = _mm_shuffle_ps(xmm4, xmm3, _MM_SHUFFLE(0,1,1,0)); /*y1, y2, y3, x3, done y*/
+                               
+                               xmm5 = _mm_unpackhi_ps(xmm1, xmm2); /* z1, z2, -, - */
+                               xmm5 = _mm_shuffle_ps(xmm5, xmm3, _MM_SHUFFLE(3,2,1,0)); /*z1, z2, z3, x4, done z */
+                               
+                               raj       = _mm_set_ps(0.0f, 
+                                                                          mtop->atomtypes.gb_radius[md->typeA[aj3]],
+                                                                          mtop->atomtypes.gb_radius[md->typeA[aj2]],
+                                                                          mtop->atomtypes.gb_radius[md->typeA[aj1]]);
+                                                                                        
+                               vaj       = _mm_set_ps(0.0f, 
+                                                      born->vsolv[aj3], 
+                                                      born->vsolv[aj2], 
+                                                      born->vsolv[aj1]);       
+                                                                                                               
+                               mask = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+                       }
+                       
+                       jx = _mm_and_ps( (__m128) mask, xmm6);
+                       jy = _mm_and_ps( (__m128) mask, xmm4);
+                       jz = _mm_and_ps( (__m128) mask, xmm5);
+               
+                       dx    = _mm_sub_ps(ix, jx);
+                       dy    = _mm_sub_ps(iy, jy);
+                       dz    = _mm_sub_ps(iz, jz);
+                       
+                       t1    = _mm_mul_ps(dx,dx);
+                       t2    = _mm_mul_ps(dy,dy);
+                       t3    = _mm_mul_ps(dz,dz);
+                       
+                       rsq11 = _mm_add_ps(t1,t2);
+                       rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare*/
+                       
+                       /* Perform reciprocal square root lookup, 12 bits accuracy */
+                       t1        = _mm_rsqrt_ps(rsq11);   /* t1=lookup, r2=x */
+                       /* Newton-Rhapson iteration */
+                       t2        = _mm_mul_ps(t1,t1); /* lu*lu */
+                       t3        = _mm_mul_ps(rsq11,t2);  /* x*lu*lu */
+                       t3        = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+                       t3        = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+                       rinv      = _mm_mul_ps(half,t3); /* result for all four particles */
+                       
+                       rinv2     = _mm_mul_ps(rinv,rinv);
+                       rinv4     = _mm_mul_ps(rinv2,rinv2);
+                       rinv6     = _mm_mul_ps(rinv2,rinv4);
+                       
+                       rvdw      = _mm_add_ps(rai,raj); 
+                       rvdw      = _mm_mul_ps(rvdw,rvdw);
+                       ratio     = _mm_div_ps(rsq11,rvdw); /*ratio = dr2/(rvdw*rvdw)*/
+                                               
+                       mask_cmp  = _mm_cmpgt_ps(ratio,p5inv); /*if ratio>p5inv*/
+                       
+                       switch(_mm_movemask_ps(mask_cmp))
+                       {
+                               case 0xF:
+                                       ccf  = one;
+                                       dccf = zero;
+                                       break;
+                               default:
+                       
+                                       theta     = _mm_mul_ps(ratio,pip5);
+                                       sincos_ps(theta,&sinq,&cosq); 
+                                       term      = _mm_sub_ps(one,cosq);
+                                       term      = _mm_mul_ps(half,term);
+                                       ccf           = _mm_mul_ps(term,term);
+                                       dccf      = _mm_mul_ps(two,term);
+                                       dccf      = _mm_mul_ps(dccf,sinq);
+                                       dccf      = _mm_mul_ps(dccf,pip5);
+                                       dccf      = _mm_mul_ps(dccf,theta); 
+                       
+                                       ccf           = (mask_cmp & one)  | _mm_andnot_ps(mask_cmp,ccf); /*conditional as a mask*/
+                                       dccf      = (mask_cmp & zero) | _mm_andnot_ps(mask_cmp,dccf);
+                       }
+               
+                       prod      = _mm_mul_ps(p4,vaj); 
+                       xmm2      = _mm_mul_ps(ccf,rinv4);
+                       xmm2      = _mm_mul_ps(xmm2,prod); /* prod*ccf*idr4*/
+                       xmm2      = _mm_and_ps( (__m128) mask, xmm2);
+                       gpi       = _mm_add_ps(gpi,xmm2);  /*gpi = gpi + prod*ccf*idr4 */
+                       
+                       /* Chain rule terms */
+                       ccf       = _mm_mul_ps(four,ccf);
+                       xmm3      = _mm_sub_ps(ccf,dccf);
+                       xmm3      = _mm_mul_ps(xmm3,rinv6);
+                       xmm1      = _mm_mul_ps(xmm3,prod);
+                       xmm1      = _mm_and_ps( (__m128) mask, xmm1);
+                       
+                       _mm_storeu_ps(fr->dadx+n, xmm1); 
+               
+                       n = n + offset;
+                       
+
+               } 
+               
+               /* gpi now contains four partial terms that need to be added to particle ai gpi*/
+               xmm2  = _mm_movehl_ps(xmm2,gpi);
+               gpi   = _mm_add_ps(gpi,xmm2);
+               xmm2  = _mm_shuffle_ps(gpi,gpi,_MM_SHUFFLE(1,1,1,1));
+               gpi   = _mm_add_ss(gpi,xmm2);
+               
+               _mm_store_ss(&gpi_tmp,gpi);
+
+               if(PAR(cr))
+               {
+                       sum_gpi[ai]=gpi_tmp;
+               }
+               else
+               {
+                       gpi_ai = gpi_ai + gpi_tmp; /* add gpi to the initial pol energy gpi_ai*/
+                       gpi2   = gpi_ai * gpi_ai;
+               
+                       born->bRad[ai]=factor*invsqrt(gpi2);
+                       fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+               }
+               
+       }
+
+       if(PAR(cr))
+       {
+               gmx_sum(natoms,sum_gpi,cr);
+
+               for(i=at0;i<at1;i++)
+               {
+                       ai = i;
+                       gpi_ai = born->gpol[ai];
+                       gpi_ai = gpi_ai + sum_gpi[ai];
+                       gpi2   = gpi_ai*gpi_ai;
+                       
+                       born->bRad[ai]=factor*invsqrt(gpi2);
+                       fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+               }
+               
+               /* Communicate Born radii*/
+               gb_pd_send(cr,born->bRad,md->nr);
+               gb_pd_send(cr,fr->invsqrta,md->nr);
+       }               
+
+       return 0;
+       
+
+}
+
+int 
+calc_gb_rad_hct_sse(t_commrec *cr, t_forcerec *fr, int natoms, gmx_mtop_t *mtop, const t_atomtypes *atype, real *x,
+                                       t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
+{
+       int i,k,n,ai,ai3,aj1,aj2,aj3,aj4,nj0,nj1,at0,at1;
+       int aj13,aj23,aj33,aj43;
+       int offset;
+       real ri,rr,sum,sum_tmp,sum_ai,min_rad,rad;
+       real doffset;
+       real *sum_mpi;
+
+       __m128 ix,iy,iz,jx,jy,jz;
+       __m128 dx,dy,dz,t1,t2,t3;
+       __m128 rsq11,rinv,r,rai;
+       __m128 rai_inv,sk,sk2,lij,dlij,duij;
+       __m128 uij,lij2,uij2,lij3,uij3,diff2;
+       __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
+       __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
+       __m128 mask_cmp,mask_cmp2,mask_cmp3;
+       
+       __m128i maski;
+       
+       const __m128 neg   = {-1.0f , -1.0f , -1.0f , -1.0f };
+       const __m128 zero  = {0.0f , 0.0f , 0.0f , 0.0f };
+       const __m128 eigth = {0.125f , 0.125f , 0.125f , 0.125f };
+       const __m128 qrtr  = {0.25f , 0.25f , 0.25f , 0.25f };
+       const __m128 half  = {0.5f , 0.5f , 0.5f , 0.5f };
+       const __m128 one   = {1.0f , 1.0f , 1.0f , 1.0f };
+       const __m128 two   = {2.0f , 2.0f , 2.0f , 2.0f };
+       const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
+
+       if(PAR(cr))
+       {
+               pd_at_range(cr,&at0,&at1);
+       }
+       else
+       {
+               at0=0;
+               at1=natoms;
+       }
+       
+       doffset = born->gb_doffset;
+       sum_mpi = born->work;
+       
+       for(i=0;i<natoms;i++)
+               born->bRad[i]=fr->invsqrta[i]=1;
+       
+       for(i=0;i<nl->nri;i++)
+       {
+               ai  = nl->iinr[i];
+               ai3 = ai*3;
+               
+               nj0 = nl->jindex[ai];
+               nj1 = nl->jindex[ai+1];
+               
+               offset = (nj1-nj0)%4;
+               
+               /* Load rai*/
+               rr  = mtop->atomtypes.gb_radius[md->typeA[ai]]-doffset;
+               rai = _mm_load1_ps(&rr);
+               
+               /* Load cumulative sum*/
+               sum = 1.0/rr;
+               rai_inv = _mm_load1_ps(&sum);
+
+               /* Load ai coordinates*/
+               ix = _mm_load1_ps(x+ai3);
+               iy = _mm_load1_ps(x+ai3+1);
+               iz = _mm_load1_ps(x+ai3+2);
+
+               if(PAR(cr))
+               {
+                       /* Only have the master node do this, since we only want one value at one time */
+                       if(MASTER(cr))
+                               sum_mpi[ai]=sum_ai;
+                       else
+                               sum_mpi[ai]=0;
+               }
+       
+               for(k=nj0;k<nj1-offset;k+=4)
+               {
+                       aj1 = nl->jjnr[k];      
+                       aj2 = nl->jjnr[k+1];
+                       aj3 = nl->jjnr[k+2];
+                       aj4 = nl->jjnr[k+3];
+                       
+                       aj13 = aj1 * 3; 
+                       aj23 = aj2 * 3;
+                       aj33 = aj3 * 3;
+                       aj43 = aj4 * 3;
+                       
+                       /* Load particle aj1-4 and transpose*/
+                       xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
+                       xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
+                       xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33));
+                       xmm4 = _mm_loadh_pi(xmm4,(__m64 *) (x+aj43));
+                       
+                       xmm5    = _mm_load1_ps(x+aj13+2);  
+                       xmm6    = _mm_load1_ps(x+aj23+2); 
+                       xmm7    = _mm_load1_ps(x+aj33+2); 
+                       xmm8    = _mm_load1_ps(x+aj43+2);
+                                               
+                       xmm5    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+                       xmm6    = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
+                       jz      = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
+                       
+                       xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+                       xmm2    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
+                       jx      = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
+                       jy      = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
+                                               
+                       dx    = _mm_sub_ps(ix, jx);
+                       dy    = _mm_sub_ps(iy, jy);
+                       dz    = _mm_sub_ps(iz, jz);
+                       
+                       t1    = _mm_mul_ps(dx,dx);
+                       t2    = _mm_mul_ps(dy,dy);
+                       t3    = _mm_mul_ps(dz,dz);
+                       
+                       rsq11 = _mm_add_ps(t1,t2);
+                       rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare*/
+                       
+                       /* Perform reciprocal square root lookup, 8 bits accuracy */
+                       t1        = _mm_rsqrt_ps(rsq11);   /* t1=lookup, r2=x */
+                       /* Newton-Rhapson iteration to get 12 bits correct*/
+                       t2        = _mm_mul_ps(t1,t1); /* lu*lu */
+                       t3        = _mm_mul_ps(rsq11,t2);  /* x*lu*lu */
+                       t3        = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+                       t3        = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+                       rinv      = _mm_mul_ps(half,t3); /* result for all four particles */
+                       
+                       r         = _mm_mul_ps(rinv,rsq11);
+                       
+                       xmm1 = _mm_load_ss(born->param+aj1); 
+                       xmm2 = _mm_load_ss(born->param+aj2);  
+                       xmm3 = _mm_load_ss(born->param+aj3); 
+                       xmm4 = _mm_load_ss(born->param+aj4);
+                       
+                       xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /*j1 j1 j2 j2*/
+                       xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /*j3 j3 j4 j4*/
+                       sk   = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+                       
+                       /* conditional mask for rai<dr+sk*/
+                       xmm1      = _mm_add_ps(r,sk); 
+                       mask_cmp  = _mm_cmplt_ps(rai,xmm1);
+                       
+                       /* conditional for rai>dr-sk, ends with mask_cmp2*/
+                       xmm2      = _mm_sub_ps(r,sk); /*xmm2 = dr-sk*/
+                       
+                       xmm3      = _mm_rcp_ps(xmm2); /*1.0/(dr-sk), 12 bits accuracy*/
+                       t1        = _mm_mul_ps(xmm3,xmm2);
+                       t1        = _mm_sub_ps(two,t1);
+                       xmm3      = _mm_mul_ps(t1,xmm3);
+                       
+                       xmm4      = rai_inv;
+                       xmm5      = zero; 
+                                                       
+                       mask_cmp2 = _mm_cmpgt_ps(rai,xmm2); /*rai>dr-sk */
+                       lij     = (mask_cmp2 & xmm4)  | _mm_andnot_ps(mask_cmp2, xmm3);
+                       dlij    = (mask_cmp2 & xmm5)  | _mm_andnot_ps(mask_cmp2, one);
+                       
+                       uij             = _mm_rcp_ps(xmm1); /* better approximation than just _mm_rcp_ps, which is just 12 bits*/
+                       t1      = _mm_mul_ps(uij,xmm1);
+                       t1      = _mm_sub_ps(two,t1);
+                       uij     = _mm_mul_ps(t1,uij);
+                       
+                       lij2    = _mm_mul_ps(lij,lij); 
+                       lij3    = _mm_mul_ps(lij2,lij);
+                       uij2    = _mm_mul_ps(uij,uij);
+                       uij3    = _mm_mul_ps(uij2,uij);         
+                                       
+                       diff2   = _mm_sub_ps(uij2,lij2);
+                       
+                       /* Perform reciprocal square root lookup, 12 bits accuracy */
+                       t1        = _mm_rsqrt_ps(lij2);   /* t1=lookup, r2=x */
+                       /* Newton-Rhapson iteration */
+                       t2        = _mm_mul_ps(t1,t1); /* lu*lu */
+                       t3        = _mm_mul_ps(lij2,t2);  /* x*lu*lu */
+                       t3        = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+                       t3        = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+                       lij_inv   = _mm_mul_ps(half,t3); /* result for all four particles */
+                       
+                       sk2     = _mm_mul_ps(sk,sk);
+                       sk2_inv = _mm_mul_ps(sk2,rinv);
+                       prod    = _mm_mul_ps(qrtr,sk2_inv);
+                               
+                       log_term = _mm_mul_ps(uij,lij_inv);
+                       log_term = log_ps(log_term);
+                       
+                       xmm1    = _mm_sub_ps(lij,uij);
+                       xmm2    = _mm_mul_ps(qrtr,r); /* 0.25*dr */
+                       xmm2    = _mm_mul_ps(xmm2,diff2); /* 0.25*dr*prod */
+                       xmm1    = _mm_add_ps(xmm1,xmm2); /* lij-uij + 0.25*dr*diff2 */
+                       xmm2    = _mm_mul_ps(half,rinv); /* 0.5*rinv */
+                       xmm2    = _mm_mul_ps(xmm2,log_term); /* 0.5*rinv*log_term */
+                       xmm1    = _mm_add_ps(xmm1,xmm2); /* lij-uij+0.25*dr*diff2+0.5*rinv*log_term */
+                       xmm8    = _mm_mul_ps(neg,diff2); /* (-1)*diff2 */
+                       xmm2    = _mm_mul_ps(xmm8,prod); /* (-1)*diff2*prod */
+                       tmp     = _mm_add_ps(xmm1,xmm2); /* done tmp-term */
+                       
+                       /* contitional for rai<sk-dr */
+                       xmm3    = _mm_sub_ps(sk,r);
+                       mask_cmp3 = _mm_cmplt_ps(rai,xmm3); /* rai<sk-dr */
+                       
+                       xmm4    = _mm_sub_ps(rai_inv,lij);
+                       xmm4    = _mm_mul_ps(two,xmm4);
+                       xmm4    = _mm_add_ps(tmp,xmm4);
+                                       
+                       tmp   = (mask_cmp3 & xmm4) | _mm_andnot_ps(mask_cmp3,tmp); /* xmm1 will now contain four tmp values */
+                                       
+                       /* the tmp will now contain four partial values, that not all are to be used. Which */
+                       /* ones are governed by the mask_cmp mask. */
+                       tmp     = _mm_mul_ps(half,tmp); /* 0.5*tmp */
+                       tmp     = (mask_cmp & tmp) | _mm_andnot_ps(mask_cmp, zero);
+                       tmp_sum = _mm_add_ps(tmp_sum,tmp);
+               
+                       duij   = one;
+                                       
+                       xmm2   = _mm_mul_ps(half,lij2); 
+                       xmm3   = _mm_mul_ps(prod,lij3); 
+                       xmm2   = _mm_add_ps(xmm2,xmm3); 
+                       xmm3   = _mm_mul_ps(lij,rinv); 
+                       xmm4   = _mm_mul_ps(lij3,r); 
+                       xmm3   = _mm_add_ps(xmm3,xmm4); 
+                       xmm3   = _mm_mul_ps(qrtr,xmm3); 
+                       t1     = _mm_sub_ps(xmm2,xmm3); 
+                                       
+                       xmm2   = _mm_mul_ps(half,uij2);
+                       xmm2   = _mm_mul_ps(neg,xmm2); 
+                       xmm3   = _mm_mul_ps(qrtr,sk2_inv);
+                       xmm3   = _mm_mul_ps(xmm3,uij3); 
+                       xmm2   = _mm_sub_ps(xmm2,xmm3); 
+                       xmm3   = _mm_mul_ps(uij,rinv); 
+                       xmm4   = _mm_mul_ps(uij3,r); 
+                       xmm3   = _mm_add_ps(xmm3,xmm4); 
+                       xmm3   = _mm_mul_ps(qrtr,xmm3); 
+                       t2     = _mm_add_ps(xmm2,xmm3); 
+                                       
+                       xmm2   = _mm_mul_ps(sk2_inv,rinv);
+                       xmm2   = _mm_add_ps(one,xmm2); /*1+sk2_rinv*rinv */
+                       xmm2   = _mm_mul_ps(eigth,xmm2); /*0.125*(1+sk2_rinv*rinv) */
+                       xmm2   = _mm_mul_ps(xmm2,xmm8); /*0.125*(1+sk2_rinv*rinv)*(-diff2) */
+                       xmm3   = _mm_mul_ps(log_term, rinv); /*log_term*rinv */
+                       xmm3   = _mm_mul_ps(xmm3,rinv); /*log_term*rinv*rinv */
+                       xmm3   = _mm_mul_ps(qrtr,xmm3); /*0.25*log_term*rinv*rinv */
+                       t3     = _mm_add_ps(xmm2,xmm3); /* done t3 */
+                       
+                       /* chain rule terms */
+                       xmm2   = _mm_mul_ps(dlij,t1); /*dlij*t1 */
+                       xmm3   = _mm_mul_ps(duij,t2); /*duij*t2 */
+                       xmm2   = _mm_add_ps(xmm2,xmm3);/*dlij*t1+duij*t2 */
+                       xmm2   = _mm_add_ps(xmm2,t3); 
+                       xmm2   = _mm_mul_ps(xmm2,rinv);
+                                       
+                       _mm_storeu_ps(fr->dadx+n,xmm2);
+                       
+                       n      =  n + 4;
+                                       
+               } 
+               
+               if(offset!=0)
+               {
+                       aj1=aj2=aj3=aj4=0;
+                       
+                       if(offset==1)
+                       {
+                               aj1   = nl->jjnr[k];
+                               aj13  = aj1 * 3;
+                               
+                               xmm1  = _mm_loadl_pi(xmm1,(__m64 *) (x+aj13));
+                               xmm5  = _mm_load1_ps(x+aj13+2);
+                               
+                               xmm6  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
+                               xmm4  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+                               
+                               sk    = _mm_load1_ps(born->param+aj1);
+                                               
+                               maski = _mm_set_epi32(0,0,0,0xffffffff);
+                       }
+                       else if(offset==2)
+                       {
+                               aj1   = nl->jjnr[k];
+                               aj2   = nl->jjnr[k+1];
+                               
+                               aj13  = aj1 * 3;
+                               aj23  = aj2 * 3;
+
+                               xmm1  = _mm_loadh_pi(xmm1, (__m64 *) (x+aj13));
+                               xmm2  = _mm_loadh_pi(xmm2, (__m64 *) (x+aj23));
+                               
+                               xmm5  = _mm_load1_ps(x+aj13+2);
+                               xmm6  = _mm_load1_ps(x+aj23+2);
+                               
+                               xmm5  = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+                               xmm5  = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));
+                               
+                               xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+                               xmm6  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+                               xmm4  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
+                               
+                               xmm1 = _mm_load1_ps(born->param+aj1);
+                               xmm2 = _mm_load1_ps(born->param+aj2);
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+                               sk   = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+                               
+                               maski = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+                       }
+                       else
+                       {
+                               aj1   = nl->jjnr[k];
+                               aj2   = nl->jjnr[k+1];
+                               aj3   = nl->jjnr[k+2];
+                               
+                               aj13  = aj1 * 3;
+                               aj23  = aj2 * 3;
+                               aj33  = aj3 * 3;
+                               
+                               xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13)); 
+                               xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23)); 
+                               xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33)); 
+                               
+                               xmm5 = _mm_load1_ps(x+aj13+2); 
+                               xmm6 = _mm_load1_ps(x+aj23+2); 
+                               xmm7 = _mm_load1_ps(x+aj33+2); 
+                                                                                       
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));                                         
+                               
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2));
+                               xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2));
+                               
+                               xmm6 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0)); 
+                               xmm4 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1));
+                       
+                               xmm1 = _mm_load1_ps(born->param+aj1);
+                               xmm2 = _mm_load1_ps(born->param+aj2);
+                               xmm3 = _mm_load1_ps(born->param+aj3);
+                               
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /*j1 j1 j2 j2*/
+                               xmm3 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0)); /*j3 j3 j3 j3*/
+                               sk   = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+                               
+                               
+                               maski = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+                       }
+                       
+                       jx = _mm_and_ps( (__m128) maski, xmm6);
+                       jy = _mm_and_ps( (__m128) maski, xmm4);
+                       jz = _mm_and_ps( (__m128) maski, xmm5);
+                       
+                       sk = _mm_and_ps ( (__m128) maski, sk);
+                       
+                       dx    = _mm_sub_ps(ix, jx);
+                       dy    = _mm_sub_ps(iy, jy);
+                       dz    = _mm_sub_ps(iz, jz);
+                       
+                       t1    = _mm_mul_ps(dx,dx);
+                       t2    = _mm_mul_ps(dy,dy);
+                       t3    = _mm_mul_ps(dz,dz);
+                       
+                       rsq11 = _mm_add_ps(t1,t2);
+                       rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare*/
+                       
+                       /* Perform reciprocal square root lookup, 12 bits accuracy */
+                       t1        = _mm_rsqrt_ps(rsq11);   /* t1=lookup, r2=x */
+                       /* Newton-Rhapson iteration */
+                       t2        = _mm_mul_ps(t1,t1); /* lu*lu */
+                       t3        = _mm_mul_ps(rsq11,t2);  /* x*lu*lu */
+                       t3        = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+                       t3        = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+                       rinv      = _mm_mul_ps(half,t3); /* result for all four particles */
+                       
+                       r         = _mm_mul_ps(rinv,rsq11);
+                       
+                       xmm1      = _mm_add_ps(r,sk); /*dr+sk           */                         
+                       
+                       /* conditional mask for rai<dr+sk */
+                       mask_cmp  = _mm_cmplt_ps(rai,xmm1);
+                       
+                       /* conditional for rai>dr-sk, ends with mask_cmp2 */
+                       xmm2      = _mm_sub_ps(r,sk); /*xmm2 = dr-sk */
+                       
+                       xmm3      = _mm_rcp_ps(xmm2); /*1.0/(dr-sk)*/
+                       t1        = _mm_mul_ps(xmm3,xmm2);
+                       t1        = _mm_sub_ps(two,t1);
+                       xmm3      = _mm_mul_ps(t1,xmm3);
+                                               
+                       xmm4      = rai_inv;
+                       xmm5      = zero; 
+                                                       
+                       mask_cmp2 = _mm_cmpgt_ps(rai,xmm2); /*rai>dr-sk */
+                       lij     = (mask_cmp2 & xmm4)  | _mm_andnot_ps(mask_cmp2, xmm3);
+                       dlij    = (mask_cmp2 & xmm5)  | _mm_andnot_ps(mask_cmp2, one);
+       
+                       uij             = _mm_rcp_ps(xmm1);
+                       t1      = _mm_mul_ps(uij,xmm1);
+                       t1      = _mm_sub_ps(two,t1);
+                       uij     = _mm_mul_ps(t1,uij);
+                       
+                       lij2    = _mm_mul_ps(lij,lij); 
+                       lij3    = _mm_mul_ps(lij2,lij);
+                       uij2    = _mm_mul_ps(uij,uij);
+                       uij3    = _mm_mul_ps(uij2,uij);         
+                                       
+                       diff2   = _mm_sub_ps(uij2,lij2);
+                       
+                       t1        = _mm_rsqrt_ps(lij2);   /* t1=lookup, r2=x */
+                       /* Newton-Rhapson iteration */
+                       t2        = _mm_mul_ps(t1,t1); /* lu*lu */
+                       t3        = _mm_mul_ps(lij2,t2);  /* x*lu*lu */
+                       t3        = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+                       t3        = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+                       lij_inv   = _mm_mul_ps(half,t3); /* result for all four particles */
+
+                       sk2     = _mm_mul_ps(sk,sk);
+                       sk2_inv = _mm_mul_ps(sk2,rinv);
+                       prod    = _mm_mul_ps(qrtr,sk2_inv);
+                               
+                       log_term = _mm_mul_ps(uij,lij_inv);
+                       log_term = log_ps(log_term);
+                                                       
+                       xmm1    = _mm_sub_ps(lij,uij);
+                       xmm2    = _mm_mul_ps(qrtr,r); /* 0.25*dr */
+                       xmm2    = _mm_mul_ps(xmm2,diff2); /*0.25*dr*prod */
+                       xmm1    = _mm_add_ps(xmm1,xmm2); /*lij-uij + 0.25*dr*diff2 */
+                       xmm2    = _mm_mul_ps(half,rinv); /* 0.5*rinv */
+                       xmm2    = _mm_mul_ps(xmm2,log_term); /*0.5*rinv*log_term */
+                       xmm1    = _mm_add_ps(xmm1,xmm2); /*lij-uij+0.25*dr*diff2+0.5*rinv*log_term */
+                       xmm8    = _mm_mul_ps(neg,diff2); /*(-1)*diff2 */
+                       xmm2    = _mm_mul_ps(xmm8,prod); /*(-1)*diff2*prod */
+                       tmp     = _mm_add_ps(xmm1,xmm2); /* done tmp-term */
+                       
+                       /* contitional for rai<sk-dr */                                 
+                       xmm3    = _mm_sub_ps(sk,r);
+                       mask_cmp3 = _mm_cmplt_ps(rai,xmm3); /*rai<sk-dr*/
+                       
+                       xmm4    = _mm_sub_ps(rai_inv,lij);
+                       xmm4    = _mm_mul_ps(two,xmm4);
+                       xmm4    = _mm_add_ps(xmm1,xmm4);
+                                       
+                       tmp    = (mask_cmp3 & xmm4) | _mm_andnot_ps(mask_cmp3,tmp); /* xmm1 will now contain four tmp values*/
+                               
+                       /* tmp will now contain four partial values, that not all are to be used. Which */
+                       /* ones are governed by the mask_cmp mask.*/ 
+                       tmp     = _mm_mul_ps(half,tmp); /*0.5*tmp*/
+                       tmp     = (mask_cmp & tmp) | _mm_andnot_ps(mask_cmp, zero);
+                       tmp_sum = _mm_add_ps(tmp_sum,tmp);
+                       
+                       duij   = one;
+                                       
+                       /* start t1 */
+                       xmm2   = _mm_mul_ps(half,lij2); /*0.5*lij2 */
+                       xmm3   = _mm_mul_ps(prod,lij3); /*prod*lij3;*/
+                       xmm2   = _mm_add_ps(xmm2,xmm3); /*0.5*lij2+prod*lij3 */
+                       xmm3   = _mm_mul_ps(lij,rinv); /*lij*rinv */
+                       xmm4   = _mm_mul_ps(lij3,r); /*lij3*dr; */
+                       xmm3   = _mm_add_ps(xmm3,xmm4); /*lij*rinv+lij3*dr */
+                       xmm3   = _mm_mul_ps(qrtr,xmm3); /*0.25*(lij*rinv+lij3*dr) */
+                       t1     = _mm_sub_ps(xmm2,xmm3); /* done t1 */
+               
+                       /* start t2 */
+                       xmm2   = _mm_mul_ps(half,uij2); /*0.5*uij2 */
+                       xmm2   = _mm_mul_ps(neg,xmm2); /*(-1)*0.5*lij2 */
+                       xmm3   = _mm_mul_ps(qrtr,sk2_inv); /*0.25*sk2_rinv */
+                       xmm3   = _mm_mul_ps(xmm3,uij3); /*0.25*sk2_rinv*uij3 */
+                       xmm2   = _mm_sub_ps(xmm2,xmm3); /*(-1)*0.5*lij2-0.25*sk2_rinv*uij3 */
+                       xmm3   = _mm_mul_ps(uij,rinv); /*uij*rinv */
+                       xmm4   = _mm_mul_ps(uij3,r); /*uij3*dr; */
+                       xmm3   = _mm_add_ps(xmm3,xmm4); /*uij*rinv+uij*dr */
+                       xmm3   = _mm_mul_ps(qrtr,xmm3); /*0.25*(uij*rinv+uij*dr) */
+                       t2     = _mm_add_ps(xmm2,xmm3); /* done t2 */
+                                       
+                       /* start t3 */
+                       xmm2   = _mm_mul_ps(sk2_inv,rinv);
+                       xmm2   = _mm_add_ps(one,xmm2); /*1+sk2_rinv*rinv; */
+                       xmm2   = _mm_mul_ps(eigth,xmm2); /*0.125*(1+sk2_rinv*rinv) */
+                       xmm2   = _mm_mul_ps(xmm2,xmm8); /*0.125*(1+sk2_rinv*rinv)*(-diff2) */
+                       xmm3   = _mm_mul_ps(log_term, rinv); /*log_term*rinv */
+                       xmm3   = _mm_mul_ps(xmm3,rinv); /*log_term*rinv*rinv */
+                       xmm3   = _mm_mul_ps(qrtr,xmm3); /*0.25*log_term*rinv*rinv */
+                       t3     = _mm_add_ps(xmm2,xmm3); /* done t3 */
+                                                
+                       /* chain rule terms */
+                       xmm2   = _mm_mul_ps(dlij,t1); /*dlij*t1 */
+                       xmm3   = _mm_mul_ps(duij,t2); /*duij*t2 */
+                       xmm2   = _mm_add_ps(xmm2,xmm3);/*dlij*t1+duij*t2 */
+                       xmm2   = _mm_add_ps(xmm2,t3); /*everyhting * t3 */
+                       xmm2   = _mm_mul_ps(xmm2,rinv); /*everything * t3 *rinv */
+                       
+                       xmm2   = _mm_and_ps( (__m128) maski,xmm2);
+                       _mm_storeu_ps(fr->dadx+n,xmm2); /* store excess elements, but since we are only advancing n by
+                                                                                        * offset, this will be corrected by the "main" loop */
+                       
+                       n      =  n + offset;
+                       
+               } /* end offset */
+
+               /* the tmp array will contain partial values that need to be added together */
+               tmp     = _mm_movehl_ps(tmp,tmp_sum);
+               tmp_sum = _mm_sub_ps(tmp_sum,tmp);
+               tmp     = _mm_shuffle_ps(tmp_sum,tmp_sum,_MM_SHUFFLE(1,1,1,1));
+               tmp_sum = _mm_sub_ss(tmp_sum,tmp);
+               
+               _mm_store_ss(&sum_tmp,tmp_sum);
+
+               if(PAR(cr))
+               {
+                       sum_mpi[ai]=sum_tmp;
+               }
+               else
+               {
+                       sum_ai=sum_tmp;
+               
+                       min_rad = rr + doffset;
+                       rad=1.0/sum_ai; 
+                       
+                       born->bRad[ai]=rad > min_rad ? rad : min_rad;
+                       fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+               }
+       }
+
+       if(PAR(cr))
+       {
+               gmx_sum(natoms,sum_mpi,cr);
+
+               for(i=at0;i<at1;i++)
+               {
+                       ai      = i;
+                       min_rad = mtop->atomtypes.gb_radius[md->typeA[ai]]; 
+                       rad     = 1.0/sum_mpi[ai];
+                       
+                       born->bRad[ai]=rad > min_rad ? rad : min_rad;
+                       fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+               }
+               
+               gb_pd_send(cr,born->bRad,md->nr);
+               gb_pd_send(cr,fr->invsqrta,md->nr);
+       }               
+
+
+       return 0;
+}
+
+
+__m128 log2_ps(__m128 x)
+{
+       __m128i exp    = _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000);
+       __m128i one    = _mm_set_epi32(0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000); 
+       __m128i off    = _mm_set_epi32(0x3FBF8000, 0x3FBF8000, 0x3FBF8000, 0x3FBF8000); 
+       __m128i mant   = _mm_set_epi32(0x00F7FFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF);
+       __m128i sign   = _mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
+       __m128i base   = _mm_set_epi32(0x43800000, 0x43800000, 0x43800000, 0x43800000);
+       __m128i loge   = _mm_set_epi32(0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218);
+       
+       __m128i D5     = _mm_set_epi32(0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5);
+       __m128i D4     = _mm_set_epi32(0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD); 
+       __m128i D3     = _mm_set_epi32(0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9);
+       __m128i D2     = _mm_set_epi32(0x4026537B, 0x4026537B, 0x4026537B, 0x4026537B);
+       __m128i D1     = _mm_set_epi32(0xC054bFAD, 0xC054bFAD, 0xC054bFAD, 0xC054bFAD); 
+       __m128i D0     = _mm_set_epi32(0x4047691A, 0x4047691A, 0x4047691A, 0x4047691A);
+       
+       __m128  xmm0,xmm1,xmm2;
+       
+       xmm0  = x;
+       xmm1  = xmm0;
+       
+       xmm1  = _mm_and_ps(xmm1, (__m128)exp);
+       xmm1  =  (__m128 ) _mm_srli_epi32( (__m128i) xmm1,8);
+       
+       xmm1  = _mm_or_ps(xmm1, (__m128) one);
+       xmm1  = _mm_sub_ps(xmm1, (__m128) off);
+       xmm1  = _mm_mul_ps(xmm1, (__m128) base);        
+       xmm0  = _mm_and_ps(xmm0, (__m128) mant);
+       xmm0  = _mm_or_ps(xmm0, (__m128) one);
+       
+       xmm2  = _mm_mul_ps(xmm0, (__m128) D5);
+       xmm2  = _mm_add_ps(xmm2, (__m128) D4);
+       xmm2  = _mm_mul_ps(xmm2,xmm0);
+       xmm2  = _mm_add_ps(xmm2, (__m128) D3);
+       xmm2  = _mm_mul_ps(xmm2,xmm0);
+       xmm2  = _mm_add_ps(xmm2, (__m128) D2);
+       xmm2  = _mm_mul_ps(xmm2,xmm0);
+       xmm2  = _mm_add_ps(xmm2, (__m128) D1);
+       xmm2  = _mm_mul_ps(xmm2,xmm0);
+       xmm2  = _mm_add_ps(xmm2, (__m128) D0);
+       xmm0  = _mm_sub_ps(xmm0, (__m128) one);
+       xmm0  = _mm_mul_ps(xmm0,xmm2);
+       xmm1  = _mm_add_ps(xmm1,xmm0);
+       
+       x     = xmm1;
+       x     = _mm_mul_ps(x,(__m128) loge);
+       
+    return x;
+}
+
+
+int 
+calc_gb_rad_obc_sse(t_commrec *cr, t_forcerec * fr, int natoms, gmx_mtop_t *mtop,
+                                       const t_atomtypes *atype, real *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
+{
+       int i,k,n,ai,ai3,aj1,aj2,aj3,aj4,nj0,nj1,at0,at1;
+       int aj13,aj23,aj33,aj43;
+       int offset;
+       real ri;
+       real doffset;
+       real rr,rr_inv,sum_tmp,sum_ai,gbr;
+       real sum_ai2, sum_ai3,tsum,tchain;
+       real z = 0;
+       real *sum_mpi;
+
+       if(PAR(cr))
+       {
+               pd_at_range(cr,&at0,&at1);
+       }
+       else
+       {
+               at0=0;
+               at1=natoms;
+       }
+       
+       doffset = born->gb_doffset;
+       sum_mpi = born->work;
+       
+       __m128 ix,iy,iz,jx,jy,jz;
+       __m128 dx,dy,dz,t1,t2,t3;
+       __m128 rsq11,rinv,r;
+       __m128 rai,rai_inv,rai_inv2,sk,sk2,lij,dlij,duij;
+       __m128 uij,lij2,uij2,lij3,uij3,diff2;
+       __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
+       __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8; 
+       __m128 mask_cmp,mask_cmp2,mask_cmp3;
+       
+       __m128i maski;
+       
+       const __m128 neg   = {-1.0f , -1.0f , -1.0f , -1.0f };
+       const __m128 zero  = {0.0f , 0.0f , 0.0f , 0.0f };
+       const __m128 eigth = {0.125f , 0.125f , 0.125f , 0.125f };
+       const __m128 qrtr  = {0.25f , 0.25f , 0.25f , 0.25f };
+       const __m128 half  = {0.5f , 0.5f , 0.5f , 0.5f };
+       const __m128 one   = {1.0f , 1.0f , 1.0f , 1.0f };
+       const __m128 two   = {2.0f , 2.0f , 2.0f , 2.0f };
+       const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
+       
+       float apa[4];   
+
+       for(i=0;i<natoms;i++) {
+               born->bRad[i]=fr->invsqrta[i]=1;
+               born->drobc[i]=0;
+       }
+       
+       /* keep the compiler happy */
+       t1 = _mm_setzero_ps();
+       t2 = _mm_setzero_ps();
+       t3 = _mm_setzero_ps();
+       
+       aj1=aj2=aj3=aj4=0;
+       aj13=aj23=aj33=aj43=0;
+       n=0;
+       
+       for(i=0;i<nl->nri;i++)
+       {
+               ai       = nl->iinr[i];
+               ai3      = ai*3;
+               
+               nj0      = nl->jindex[ai];
+               nj1      = nl->jindex[ai+1];
+               
+               offset   = (nj1-nj0)%4;
+               
+               rr       = mtop->atomtypes.gb_radius[md->typeA[ai]];
+               ri       = rr-doffset;
+               rai      = _mm_load1_ps(&ri);
+               
+               ri       = 1.0/ri;
+               rai_inv  = _mm_load1_ps(&ri);
+               
+               ix               = _mm_load1_ps(x+ai3);
+               iy               = _mm_load1_ps(x+ai3+1);
+               iz           = _mm_load1_ps(x+ai3+2);
+               
+               sum_ai   = 0;
+               tmp_sum = _mm_load1_ps(&z);
+               
+               if(PAR(cr))
+               {
+                       sum_mpi[ai] = 0;
+               }
+               
+               for(k=nj0;k<nj1-offset;k+=4)
+               {
+                       aj1 = nl->jjnr[k];       
+                       aj2 = nl->jjnr[k+1];
+                       aj3 = nl->jjnr[k+2];
+                       aj4 = nl->jjnr[k+3];
+                       
+                       aj13 = aj1 * 3; 
+                       aj23 = aj2 * 3;
+                       aj33 = aj3 * 3;
+                       aj43 = aj4 * 3;
+                       
+                       /* Load particle aj1-4 and transpose */
+                       xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
+                       xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
+                       xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33));
+                       xmm4 = _mm_loadh_pi(xmm4,(__m64 *) (x+aj43));
+                       
+                       xmm5    = _mm_load1_ps(x+aj13+2);  
+                       xmm6    = _mm_load1_ps(x+aj23+2); 
+                       xmm7    = _mm_load1_ps(x+aj33+2); 
+                       xmm8    = _mm_load1_ps(x+aj43+2);
+                                               
+                       xmm5    = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+                       xmm6    = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
+                       jz      = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
+                       
+                       xmm1    = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+                       xmm2    = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
+                       jx      = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
+                       jy      = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
+                                               
+                       dx    = _mm_sub_ps(ix, jx);
+                       dy    = _mm_sub_ps(iy, jy);
+                       dz    = _mm_sub_ps(iz, jz);
+                       
+                       t1    = _mm_mul_ps(dx,dx);
+                       t2    = _mm_mul_ps(dy,dy);
+                       t3    = _mm_mul_ps(dz,dz);
+                       
+                       rsq11 = _mm_add_ps(t1,t2);
+                       rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare */
+                       
+                       /* Perform reciprocal square root lookup, 8 bits accuracy */
+                       t1        = _mm_rsqrt_ps(rsq11);   /* t1=lookup, r2=x */
+                       /* Newton-Rhapson iteration to get 12 bits correct*/
+                       t2        = _mm_mul_ps(t1,t1); /* lu*lu */
+                       t3        = _mm_mul_ps(rsq11,t2);  /* x*lu*lu */
+                       t3        = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+                       t3        = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+                       rinv      = _mm_mul_ps(half,t3); /* result for all four particles */
+                       
+                       r         = _mm_mul_ps(rinv,rsq11);
+                       
+                       xmm1 = _mm_load_ss(born->param+aj1); 
+                       xmm2 = _mm_load_ss(born->param+aj2); 
+                       xmm3 = _mm_load_ss(born->param+aj3); 
+                       xmm4 = _mm_load_ss(born->param+aj4);
+                       
+                       xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /*j1 j1 j2 j2*/
+                       xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /*j3 j3 j4 j4*/
+                       sk = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+                                                  
+                       xmm1      = _mm_add_ps(r,sk); /*dr+sk*/                                    
+                       
+                       /* conditional mask for rai<dr+sk */
+                       mask_cmp  = _mm_cmplt_ps(rai,xmm1);
+                       
+                       /* conditional for rai>dr-sk, ends with mask_cmp2 */
+                       xmm2      = _mm_sub_ps(r,sk); /*xmm2 = dr-sk */
+                       
+                       xmm3      = _mm_rcp_ps(xmm2); /*1.0/(dr-sk), 8 bits accuracy */
+                       t1        = _mm_mul_ps(xmm3,xmm2);
+                       t1        = _mm_sub_ps(two,t1);
+                       xmm3      = _mm_mul_ps(t1,xmm3);
+                       
+                       xmm4      = rai_inv;
+                       xmm5      = zero; 
+                                                       
+                       mask_cmp2 = _mm_cmpgt_ps(rai,xmm2); /*rai>dr-sk */
+                       lij     = (mask_cmp2 & xmm4)  | _mm_andnot_ps(mask_cmp2, xmm3);
+                       dlij    = (mask_cmp2 & xmm5)  | _mm_andnot_ps(mask_cmp2, one);
+                       
+                       uij             = _mm_rcp_ps(xmm1); 
+                       t1      = _mm_mul_ps(uij,xmm1);
+                       t1      = _mm_sub_ps(two,t1);
+                       uij     = _mm_mul_ps(t1,uij);
+                       
+                       lij2    = _mm_mul_ps(lij,lij); 
+                       lij3    = _mm_mul_ps(lij2,lij);
+                       uij2    = _mm_mul_ps(uij,uij);
+                       uij3    = _mm_mul_ps(uij2,uij);         
+                                       
+                       diff2   = _mm_sub_ps(uij2,lij2);
+                       
+                       /* Perform reciprocal square root lookup, 12 bits accuracy */
+                       t1        = _mm_rsqrt_ps(lij2);   /* t1=lookup, r2=x */
+                       /* Newton-Rhapson iteration */
+                       t2        = _mm_mul_ps(t1,t1); /* lu*lu */
+                       t3        = _mm_mul_ps(lij2,t2);  /* x*lu*lu */
+                       t3        = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+                       t3        = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+                       lij_inv   = _mm_mul_ps(half,t3); /* result for all four particles */
+                       
+                       sk2     = _mm_mul_ps(sk,sk);
+                       sk2_inv = _mm_mul_ps(sk2,rinv);
+                       prod    = _mm_mul_ps(qrtr,sk2_inv);
+                       
+                       log_term = _mm_mul_ps(uij,lij_inv);
+                       log_term = log2_ps(log_term);
+               
+                       xmm1    = _mm_sub_ps(lij,uij);
+                       xmm2    = _mm_mul_ps(qrtr,r); /* 0.25*dr */
+                       xmm2    = _mm_mul_ps(xmm2,diff2); /*0.25*dr*prod */
+                       xmm1    = _mm_add_ps(xmm1,xmm2); /*lij-uij + 0.25*dr*diff2 */
+                       xmm2    = _mm_mul_ps(half,rinv); /* 0.5*rinv */
+                       xmm2    = _mm_mul_ps(xmm2,log_term); /*0.5*rinv*log_term */
+                       xmm1    = _mm_add_ps(xmm1,xmm2); /*lij-uij+0.25*dr*diff2+0.5*rinv*log_term */
+                       xmm8    = _mm_mul_ps(neg,diff2); /*(-1)*diff2 */
+                       xmm2    = _mm_mul_ps(xmm8,prod); /*(-1)*diff2*prod */
+                       tmp     = _mm_add_ps(xmm1,xmm2); /* done tmp-term */
+                       
+                       /* contitional for rai<sk-dr */
+                       xmm3    = _mm_sub_ps(sk,r);
+                       mask_cmp3 = _mm_cmplt_ps(rai,xmm3); /*rai<sk-dr*/
+                       
+                       
+                       xmm4    = _mm_sub_ps(rai_inv,lij);
+                       xmm4    = _mm_mul_ps(two,xmm4);
+                       xmm4    = _mm_add_ps(tmp,xmm4);
+                                       
+                       tmp   = (mask_cmp3 & xmm4) | _mm_andnot_ps(mask_cmp3,tmp); /* xmm1 will now contain four tmp values */
+                                       
+                       /* the tmp will now contain four partial values, that not all are to be used. Which
+                       * ones are governed by the mask_cmp mask. 
+                       */
+                       tmp     = _mm_mul_ps(half,tmp); /*0.5*tmp*/
+                       tmp     = (mask_cmp & tmp) | _mm_andnot_ps(mask_cmp, zero);
+                       tmp_sum = _mm_add_ps(tmp_sum,tmp);
+               
+                       duij   = one;
+                                       
+                       /* start t1 */
+                       xmm2   = _mm_mul_ps(half,lij2); /*0.5*lij2 */
+                       xmm3   = _mm_mul_ps(prod,lij3); /*prod*lij3; */
+                       xmm2   = _mm_add_ps(xmm2,xmm3); /*0.5*lij2+prod*lij3 */
+                       xmm3   = _mm_mul_ps(lij,rinv); /*lij*rinv */
+                       xmm4   = _mm_mul_ps(lij3,r); /*lij3*dr;*/
+                       xmm3   = _mm_add_ps(xmm3,xmm4); /*lij*rinv+lij3*dr */
+                       xmm3   = _mm_mul_ps(qrtr,xmm3); /*0.25*(lij*rinv+lij3*dr) */
+                       t1     = _mm_sub_ps(xmm2,xmm3); /* done t1 */
+                                       
+                       /* start t2 */
+                       xmm2   = _mm_mul_ps(half,uij2); /*0.5*uij2 */
+                       xmm2   = _mm_mul_ps(neg,xmm2); /*(-1)*0.5*uij2 */
+                       xmm3   = _mm_mul_ps(qrtr,sk2_inv); /*0.25*sk2_rinv */
+                       xmm3   = _mm_mul_ps(xmm3,uij3); /*0.25*sk2_rinv*uij3 */
+                       xmm2   = _mm_sub_ps(xmm2,xmm3); /*(-1)*0.5*lij2-0.25*sk2_rinv*uij3 */
+                       xmm3   = _mm_mul_ps(uij,rinv); /*uij*rinv */
+                       xmm4   = _mm_mul_ps(uij3,r); /*uij3*dr; */
+                       xmm3   = _mm_add_ps(xmm3,xmm4); /*uij*rinv+uij*dr */
+                       xmm3   = _mm_mul_ps(qrtr,xmm3); /*0.25*(uij*rinv+uij*dr) */
+                       t2     = _mm_add_ps(xmm2,xmm3); /* done t2 */
+                                       
+                       /* start t3 */
+                       xmm2   = _mm_mul_ps(sk2_inv,rinv);
+                       xmm2   = _mm_add_ps(one,xmm2); /*1+sk2_rinv*rinv; */
+                       xmm2   = _mm_mul_ps(eigth,xmm2); /*0.125*(1+sk2_rinv*rinv) */
+                       xmm2   = _mm_mul_ps(xmm2,xmm8); /*0.125*(1+sk2_rinv*rinv)*(-diff2) */
+                       xmm3   = _mm_mul_ps(log_term, rinv); /*log_term*rinv */
+                       xmm3   = _mm_mul_ps(xmm3,rinv); /*log_term*rinv*rinv */
+                       xmm3   = _mm_mul_ps(qrtr,xmm3); /*0.25*log_term*rinv*rinv */
+                       t3     = _mm_add_ps(xmm2,xmm3); /* done t3 */
+                       
+                       /* chain rule terms  */
+                       xmm2   = _mm_mul_ps(dlij,t1); /*dlij*t1*/
+                       xmm3   = _mm_mul_ps(duij,t2); /*duij*t2 */
+                       xmm2   = _mm_add_ps(xmm2,xmm3);/*dlij*t1+duij*t2 */
+                       xmm2   = _mm_add_ps(xmm2,t3); 
+                       xmm2   = _mm_mul_ps(xmm2,rinv);
+                                       
+                       _mm_storeu_ps(fr->dadx+n,xmm2);
+                       
+                       n      =  n + 4;
+                                       
+               } /* end normal inner loop */
+               
+               /* deal with offset elements */
+               if(offset!=0)
+               {
+                       aj1=aj2=aj3=aj4=0;
+                       
+                       if(offset==1)
+                       {
+                               aj1   = nl->jjnr[k];
+                               aj13  = aj1 * 3;
+                               
+                               xmm1  = _mm_loadl_pi(xmm1,(__m64 *) (x+aj13));
+                               xmm5  = _mm_load1_ps(x+aj13+2);
+                               
+                               xmm6  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
+                               xmm4  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+                               
+                               sk    = _mm_load1_ps(born->param+aj1);
+                                               
+                               maski = _mm_set_epi32(0,0,0,0xffffffff);
+                       }
+                       else if(offset==2)
+                       {
+                               aj1   = nl->jjnr[k];
+                               aj2   = nl->jjnr[k+1];
+                               
+                               aj13  = aj1 * 3;
+                               aj23  = aj2 * 3;
+
+                               xmm1  = _mm_loadh_pi(xmm1, (__m64 *) (x+aj13));
+                               xmm2  = _mm_loadh_pi(xmm2, (__m64 *) (x+aj23));
+                               
+                               xmm5  = _mm_load1_ps(x+aj13+2);
+                               xmm6  = _mm_load1_ps(x+aj23+2);
+                               
+                               xmm5  = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+                               xmm5  = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));
+                               
+                               xmm1  = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+                               xmm6  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+                               xmm4  = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
+                               
+                               xmm1 = _mm_load1_ps(born->param+aj1);
+                               xmm2 = _mm_load1_ps(born->param+aj2);
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+                               sk   = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+                               
+                               maski = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+                       }
+                       else
+                       {
+                               aj1   = nl->jjnr[k];
+                               aj2   = nl->jjnr[k+1];
+                               aj3   = nl->jjnr[k+2];
+                               
+                               aj13  = aj1 * 3;
+                               aj23  = aj2 * 3;
+                               aj33  = aj3 * 3;
+                               
+                               xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13)); 
+                               xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23)); 
+                               xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33)); 
+                               
+                               xmm5 = _mm_load1_ps(x+aj13+2); 
+                               xmm6 = _mm_load1_ps(x+aj23+2); 
+                               xmm7 = _mm_load1_ps(x+aj33+2); 
+                                                                                       
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));                                         
+                               
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2));
+                               xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2));
+                               
+                               xmm6 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0)); 
+                               xmm4 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1));
+                       
+                               xmm1 = _mm_load1_ps(born->param+aj1);
+                               xmm2 = _mm_load1_ps(born->param+aj2);
+                               xmm3 = _mm_load1_ps(born->param+aj3);
+                               
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /*j1 j1 j2 j2*/
+                               xmm3 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0)); /*j3 j3 j3 j3*/
+                               sk   = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+                               
+                               
+                               maski = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+                       }
+                       
+                       jx = _mm_and_ps( (__m128) maski, xmm6);
+                       jy = _mm_and_ps( (__m128) maski, xmm4);
+                       jz = _mm_and_ps( (__m128) maski, xmm5);
+                       
+                       sk = _mm_and_ps ( (__m128) maski, sk);
+                       
+                       dx    = _mm_sub_ps(ix, jx);
+                       dy    = _mm_sub_ps(iy, jy);
+                       dz    = _mm_sub_ps(iz, jz);
+                       
+                       t1    = _mm_mul_ps(dx,dx);
+                       t2    = _mm_mul_ps(dy,dy);
+                       t3    = _mm_mul_ps(dz,dz);
+                       
+                       rsq11 = _mm_add_ps(t1,t2);
+                       rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare*/
+                       
+                       /* Perform reciprocal square root lookup, 12 bits accuracy */
+                       t1        = _mm_rsqrt_ps(rsq11);   /* t1=lookup, r2=x */
+                       /* Newton-Rhapson iteration */
+                       t2        = _mm_mul_ps(t1,t1); /* lu*lu */
+                       t3        = _mm_mul_ps(rsq11,t2);  /* x*lu*lu */
+                       t3        = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+                       t3        = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+                       rinv      = _mm_mul_ps(half,t3); /* result for all four particles */
+                       
+                       r         = _mm_mul_ps(rinv,rsq11);
+                       
+                       xmm1      = _mm_add_ps(r,sk); /*dr+sk*/                                    
+                       
+                       /* conditional mask for rai<dr+sk */
+                       mask_cmp  = _mm_cmplt_ps(rai,xmm1);
+                       
+                       /* conditional for rai>dr-sk, ends with mask_cmp2 */
+                       xmm2      = _mm_sub_ps(r,sk); /*xmm2 = dr-sk*/
+                       
+                       xmm3      = _mm_rcp_ps(xmm2); /*1.0/(dr-sk)*/
+                       t1        = _mm_mul_ps(xmm3,xmm2);
+                       t1        = _mm_sub_ps(two,t1);
+                       xmm3      = _mm_mul_ps(t1,xmm3);
+                                               
+                       xmm4      = rai_inv;
+                       xmm5      = zero; 
+                                                       
+                       mask_cmp2 = _mm_cmpgt_ps(rai,xmm2); /*rai>dr-sk */
+                       lij     = (mask_cmp2 & xmm4)  | _mm_andnot_ps(mask_cmp2, xmm3);
+                       dlij    = (mask_cmp2 & xmm5)  | _mm_andnot_ps(mask_cmp2, one);
+       
+                       uij             = _mm_rcp_ps(xmm1);
+                       t1      = _mm_mul_ps(uij,xmm1);
+                       t1      = _mm_sub_ps(two,t1);
+                       uij     = _mm_mul_ps(t1,uij);
+                       
+                       lij2    = _mm_mul_ps(lij,lij); 
+                       lij3    = _mm_mul_ps(lij2,lij);
+                       uij2    = _mm_mul_ps(uij,uij);
+                       uij3    = _mm_mul_ps(uij2,uij);         
+                                       
+                       diff2   = _mm_sub_ps(uij2,lij2);
+                       
+                       t1        = _mm_rsqrt_ps(lij2);   /* t1=lookup, r2=x */
+                       /* Newton-Rhapson iteration */
+                       t2        = _mm_mul_ps(t1,t1); /* lu*lu */
+                       t3        = _mm_mul_ps(lij2,t2);  /* x*lu*lu */
+                       t3        = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+                       t3        = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+                       lij_inv   = _mm_mul_ps(half,t3); /* result for all four particles */
+
+                       sk2     = _mm_mul_ps(sk,sk);
+                       sk2_inv = _mm_mul_ps(sk2,rinv);
+                       prod    = _mm_mul_ps(qrtr,sk2_inv);
+               
+                       log_term = _mm_mul_ps(uij,lij_inv);
+                       log_term = log2_ps(log_term);
+                                                                                                                                                                       
+                       xmm1    = _mm_sub_ps(lij,uij);
+                       xmm2    = _mm_mul_ps(qrtr,r); 
+                       xmm2    = _mm_mul_ps(xmm2,diff2); 
+                       xmm1    = _mm_add_ps(xmm1,xmm2); 
+                       xmm2    = _mm_mul_ps(half,rinv); 
+                       xmm2    = _mm_mul_ps(xmm2,log_term);
+                       xmm1    = _mm_add_ps(xmm1,xmm2); 
+                       xmm8    = _mm_mul_ps(neg,diff2); 
+                       xmm2    = _mm_mul_ps(xmm8,prod); 
+                       tmp     = _mm_add_ps(xmm1,xmm2); 
+                       
+                       /* contitional for rai<sk-dr    */                              
+                       xmm3    = _mm_sub_ps(sk,r);
+                       mask_cmp3 = _mm_cmplt_ps(rai,xmm3); /*rai<sk-dr */
+                       
+                       xmm4    = _mm_sub_ps(rai_inv,lij);
+                       xmm4    = _mm_mul_ps(two,xmm4);
+                       xmm4    = _mm_add_ps(xmm1,xmm4);
+                                       
+                       tmp    = (mask_cmp3 & xmm4) | _mm_andnot_ps(mask_cmp3,tmp); /* xmm1 will now contain four tmp values */
+                               
+                       /* tmp will now contain four partial values, that not all are to be used. Which
+                       * ones are governed by the mask_cmp mask. 
+                       */
+                       tmp     = _mm_mul_ps(half,tmp); 
+                       tmp     = (mask_cmp & tmp) | _mm_andnot_ps(mask_cmp, zero);
+                       tmp_sum = _mm_add_ps(tmp_sum,tmp);
+                       
+                       duij   = one;
+                                       
+                       /* start t1 */
+                       xmm2   = _mm_mul_ps(half,lij2);
+                       xmm3   = _mm_mul_ps(prod,lij3);
+                       xmm2   = _mm_add_ps(xmm2,xmm3);
+                       xmm3   = _mm_mul_ps(lij,rinv); 
+                       xmm4   = _mm_mul_ps(lij3,r); 
+                       xmm3   = _mm_add_ps(xmm3,xmm4); 
+                       xmm3   = _mm_mul_ps(qrtr,xmm3); 
+                       t1     = _mm_sub_ps(xmm2,xmm3); 
+               
+                       /* start t2 */
+                       xmm2   = _mm_mul_ps(half,uij2); 
+                       xmm2   = _mm_mul_ps(neg,xmm2); 
+                       xmm3   = _mm_mul_ps(qrtr,sk2_inv);
+                       xmm3   = _mm_mul_ps(xmm3,uij3); 
+                       xmm2   = _mm_sub_ps(xmm2,xmm3); 
+                       xmm3   = _mm_mul_ps(uij,rinv); 
+                       xmm4   = _mm_mul_ps(uij3,r); 
+                       xmm3   = _mm_add_ps(xmm3,xmm4);
+                       xmm3   = _mm_mul_ps(qrtr,xmm3);
+                       t2     = _mm_add_ps(xmm2,xmm3);
+                                       
+                       /* start t3 */
+                       xmm2   = _mm_mul_ps(sk2_inv,rinv);
+                       xmm2   = _mm_add_ps(one,xmm2); 
+                       xmm2   = _mm_mul_ps(eigth,xmm2); 
+                       xmm2   = _mm_mul_ps(xmm2,xmm8); 
+                       xmm3   = _mm_mul_ps(log_term, rinv);
+                       xmm3   = _mm_mul_ps(xmm3,rinv); 
+                       xmm3   = _mm_mul_ps(qrtr,xmm3); 
+                       t3     = _mm_add_ps(xmm2,xmm3);
+                                                
+                       /* chain rule terms */
+                       xmm2   = _mm_mul_ps(dlij,t1); 
+                       xmm3   = _mm_mul_ps(duij,t2); 
+                       xmm2   = _mm_add_ps(xmm2,xmm3);
+                       xmm2   = _mm_add_ps(xmm2,t3); 
+                       xmm2   = _mm_mul_ps(xmm2,rinv);
+                       
+                       xmm2   = _mm_and_ps( (__m128) maski,xmm2);
+                       _mm_storeu_ps(fr->dadx+n,xmm2); /* store excess elements, but since we are only advancing n by
+                                                                                       * offset, this will be corrected by the "main" loop
+                                                                                               */
+                       
+                       n      =  n + offset;
+                       
+               } /* end offset */
+               
+       
+               /* the tmp array will contain partial values that need to be added together*/
+               tmp     = _mm_movehl_ps(tmp,tmp_sum);
+               tmp_sum = _mm_add_ps(tmp_sum,tmp);
+               tmp     = _mm_shuffle_ps(tmp_sum,tmp_sum,_MM_SHUFFLE(1,1,1,1));
+               tmp_sum = _mm_add_ss(tmp_sum,tmp);
+               
+               _mm_store_ss(&sum_tmp,tmp_sum);
+
+               if(PAR(cr))
+               {
+                       sum_mpi[ai]=sum_tmp;
+               }
+               else
+               {
+                       sum_ai=sum_tmp;
+                       
+                       /* calculate the born radii */
+                       sum_ai  = (rr-doffset) * sum_ai;
+                       sum_ai2 = sum_ai       * sum_ai;
+                       sum_ai3 = sum_ai2      * sum_ai;
+                       
+                       tsum    = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
+                       born->bRad[ai] = ri - tsum/rr;
+                       born->bRad[ai] = 1.0/(born->bRad[ai]);
+                       
+                       fr->invsqrta[ai] = invsqrt(born->bRad[ai]);
+                       tchain = (rr-doffset)*(born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
+                       born->drobc[ai] = (1.0 - tsum*tsum)*tchain*(1.0/rr);
+               }
+
+       }
+
+       if(PAR(cr))
+       {
+               gmx_sum(natoms,sum_mpi,cr);
+               for(i=at0;i<at1;i++)
+               {
+                       ai      = i;
+                       rr      = mtop->atomtypes.gb_radius[md->typeA[ai]];
+                       rr_inv  = 1.0/rr;
+                       
+                       sum_ai  = sum_mpi[ai];
+                       sum_ai  = (rr-doffset) * sum_ai;
+                       sum_ai2 = sum_ai       * sum_ai;
+                       sum_ai3 = sum_ai2      * sum_ai;
+                       
+                       tsum    = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
+                       gbr     = 1.0/(rr-doffset) - tsum*rr_inv;
+                       
+                       born->bRad[ai] = 1.0 / gbr;
+                       fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+                       
+                       tchain  = (rr-doffset) * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
+                       born->drobc[ai] = (1.0-tsum*tsum)*tchain*rr_inv;
+               }
+               
+               gb_pd_send(cr,born->bRad,md->nr);
+               gb_pd_send(cr,fr->invsqrta,md->nr);
+               gb_pd_send(cr,born->drobc,md->nr);
+       }
+       
+       return 0;
+}
+
+
+
+real calc_gb_chainrule_sse(int natoms, t_nblist *nl, real *dadx, real *dvda, real *xd, real *f, int gb_algorithm, gmx_genborn_t *born)                                         
+{
+       int    i,k,n,ai,aj,ai3,nj0,nj1,offset;
+       int        aj1,aj2,aj3,aj4; 
+       
+       real   rbi;
+       real   rb[natoms+4];
+       
+       __m128 ix,iy,iz;
+       __m128 jx,jy,jz;
+       __m128 fix,fiy,fiz;
+       __m128 dx,dy,dz;
+       __m128 t1,t2,t3;
+       __m128 dva,dax,fgb;
+       __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
+       
+       __m128i mask   = _mm_set_epi32(0, 0xffffffff,0xffffffff,0xffffffff);
+       __m128i maski  = _mm_set_epi32(0, 0xffffffff,0xffffffff,0xffffffff);
+       
+       const __m128 two = {2.0f , 2.0f , 2.0f , 2.0f };
+       real z = 0;
+                       
+       /* Loop to get the proper form for the Born radius term, sse style */
+       offset=natoms%4;
+       
+       if(offset!=0)
+       {
+               if(offset==1)
+                       mask = _mm_set_epi32(0,0,0,0xffffffff);
+               else if(offset==2)
+                       mask = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+               else
+                       mask = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+       }
+       
+       if(gb_algorithm==egbSTILL) {
+               
+               xmm3 = _mm_set1_ps(ONE_4PI_EPS0);
+       
+               for(i=0;i<natoms-offset;i+=4)
+               {
+                       xmm1 = _mm_loadu_ps(born->bRad+i);
+                       xmm1 = _mm_mul_ps(xmm1,xmm1);
+                       xmm1 = _mm_mul_ps(xmm1,two); /*2 * rbi * rbi*/
+                       
+                       xmm2 = _mm_loadu_ps(dvda+i);
+                       
+                       xmm1 = _mm_mul_ps(xmm1,xmm2); /* 2 * rbi * rbi * dvda[i] */
+                       xmm1 = _mm_div_ps(xmm1,xmm3); /* (2 * rbi * rbi * dvda[i]) / ONE_4PI_EPS0 */
+               
+                       _mm_storeu_ps(rb+i, xmm1); /* store to memory */
+               }
+               
+               /* with the offset element, the mask stores excess elements to zero. This could cause problems
+               * when something gets allocated right after rb (solution: allocate three positions bigger)      
+               */
+               if(offset!=0)
+               {
+                       xmm1 = _mm_loadu_ps(born->bRad+i);
+                       xmm1 = _mm_mul_ps(xmm1,xmm1);
+                       xmm1 = _mm_mul_ps(xmm1,two); 
+                       
+                       xmm2 = _mm_loadu_ps(dvda+i);
+                       
+                       xmm1 = _mm_mul_ps(xmm1,xmm2); 
+                       xmm1 = _mm_div_ps(xmm1,xmm3); 
+                       xmm1 = _mm_and_ps( (__m128) mask, xmm1);
+               
+                       _mm_storeu_ps(rb+i, xmm1); 
+               }
+       }
+               
+       else if(gb_algorithm==egbHCT) {
+               for(i=0;i<natoms-offset;i+=4)
+               {
+                       xmm1 = _mm_loadu_ps(born->bRad+i);
+                       xmm1 = _mm_mul_ps(xmm1, xmm1); /* rbi*rbi */
+                        
+                       xmm2 = _mm_loadu_ps(dvda+i);
+                       
+                       xmm1 = _mm_mul_ps(xmm1,xmm2); /* rbi*rbi*dvda[i] */
+                       
+                       _mm_storeu_ps(rb+i, xmm1); 
+               }
+               
+               if(offset!=0)
+               {
+                       xmm1 = _mm_loadu_ps(born->bRad+i);
+                       xmm1 = _mm_mul_ps(xmm1, xmm1); /* rbi*rbi */
+                        
+                       xmm2 = _mm_loadu_ps(dvda+i);
+                       
+                       xmm1 = _mm_mul_ps(xmm1,xmm2); /* rbi*rbi*dvda[i] */
+                       xmm1 = _mm_and_ps( (__m128) mask, xmm1);
+                       
+                       _mm_storeu_ps(rb+i, xmm1);
+               }
+       }
+        
+       else if(gb_algorithm==egbOBC) {
+               for(i=0;i<natoms-offset;i+=4)
+               {
+                       xmm1 = _mm_loadu_ps(born->bRad+i);
+                       xmm1 = _mm_mul_ps(xmm1, xmm1); /* rbi*rbi */
+                        
+                       xmm2 = _mm_loadu_ps(dvda+i);
+                       xmm1 = _mm_mul_ps(xmm1,xmm2); /* rbi*rbi*dvda[i] */
+                       xmm2 = _mm_loadu_ps(born->drobc+i);
+                       xmm1 = _mm_mul_ps(xmm1, xmm2); /*rbi*rbi*dvda[i]*born->drobc[i] */
+                                               
+                       _mm_storeu_ps(rb+i, xmm1);
+               }
+       
+               if(offset!=0)
+               {
+                       xmm1 = _mm_loadu_ps(born->bRad+i);
+                       xmm1 = _mm_mul_ps(xmm1, xmm1); /* rbi*rbi */
+                        
+                       xmm2 = _mm_loadu_ps(dvda+i);
+                       xmm1 = _mm_mul_ps(xmm1,xmm2); /* rbi*rbi*dvda[i] */
+                       xmm2 = _mm_loadu_ps(born->drobc+i);
+                       xmm1 = _mm_mul_ps(xmm1, xmm2); /*rbi*rbi*dvda[i]*born->drobc[i] */
+                       xmm1 = _mm_and_ps( (__m128) mask, xmm1);                                
+                                                                                               
+                       _mm_storeu_ps(rb+i, xmm1);
+               }
+       }
+               
+       /* Keep the compiler happy */
+       t1 = _mm_setzero_ps();
+       t2 = _mm_setzero_ps();
+       t3 = _mm_setzero_ps();
+               
+       aj1 = aj2 = aj3 = aj4 = 0;
+       n=0;
+       
+       for(i=0;i<nl->nri;i++)
+       {
+               ai     = nl->iinr[i];
+               ai3        = ai*3;
+               
+               nj0    = nl->jindex[ai];
+               nj1    = nl->jindex[ai+1];
+               
+               offset = (nj1-nj0)%4;
+               
+               /* Load particle ai coordinates */
+               ix  = _mm_load1_ps(xd+ai3);
+               iy  = _mm_load1_ps(xd+ai3+1);
+               iz  = _mm_load1_ps(xd+ai3+2);
+               
+               /* Load particle ai dvda */
+               dva = _mm_load1_ps(rb+ai);
+                       
+               fix = _mm_setzero_ps();
+               fiy = _mm_setzero_ps();
+               fiz = _mm_setzero_ps(); 
+                               
+               /* Inner loop for all particles where n%4==0 */
+               for(k=nj0;k<nj1-offset;k+=4)
+               {
+                       aj1 = nl->jjnr[k];      
+                       aj2 = nl->jjnr[k+1];
+                       aj3 = nl->jjnr[k+2];
+                       aj4 = nl->jjnr[k+3];
+                       
+                       aj1 = aj1 * 3; 
+                       aj2 = aj2 * 3;
+                       aj3 = aj3 * 3;
+                       aj4 = aj4 * 3;
+                       
+                       /* Load j1-4 coordinates, first x and y */
+                       xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (xd+aj1)); /*x1 y1 - - */
+                       xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (xd+aj2)); /*x2 y2 - - */
+                       xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (xd+aj3)); /*x3 y3 - - */
+                       xmm4 = _mm_loadh_pi(xmm4,(__m64 *) (xd+aj4)); /*x4 y4 - - */
+                       
+                       /* ... then z */
+                       xmm5 = _mm_load1_ps(xd+aj1+2); /* z1 z1 z1 z1 */
+                       xmm6 = _mm_load1_ps(xd+aj2+2); /* z2 z2 z2 z2 */
+                       xmm7 = _mm_load1_ps(xd+aj3+2); /* z3 z3 z3 z3 */
+                       xmm8 = _mm_load1_ps(xd+aj4+2); /* z4 z4 z4 z4 */
+                       
+                       /* transpose */
+                       xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0)); /* z1 z1 z2 z2 */
+                       xmm6 = _mm_shuffle_ps(xmm7,xmm8, _MM_SHUFFLE(0,0,0,0)); /* z3 z3 z4 z4 */
+                       jz   = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(2,0,2,0)); /* z1 z2 z3 z4 */
+                       
+                       xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2)); /* x1 y1 x2 y2 */
+                       xmm2 = _mm_shuffle_ps(xmm3,xmm4, _MM_SHUFFLE(3,2,3,2)); /* x3 y3 x4 y4 */
+                       
+                       jx   = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0)); /* x1 x2 x3 x4 */
+                       jy   = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1)); /* y1 y2 y3 y4 */
+                       
+                       /* load chain rule terms for j1-4 */
+                       dax = _mm_loadu_ps(dadx+n);
+                       n   = n + 4;
+                       
+                       /* distances i -> j1-4 */
+                       dx   = _mm_sub_ps(ix, jx);
+                       dy   = _mm_sub_ps(iy, jy);
+                       dz   = _mm_sub_ps(iz, jz);
+                       
+                       /* calculate scalar force */
+                       fgb  = _mm_mul_ps(dva,dax); 
+               
+                       /* calculate partial force terms */
+                       t1   = _mm_mul_ps(fgb,dx); /* fx1, fx2, fx3, fx4 */
+                       t2   = _mm_mul_ps(fgb,dy); /* fy1, fy2, fy3, fy4  */
+                       t3   = _mm_mul_ps(fgb,dz); /* fz1, fz2, fz3, fz4 */
+               
+                       /* update the i force */
+                       fix       = _mm_add_ps(fix,t1);
+                       fiy       = _mm_add_ps(fiy,t2);
+                       fiz       = _mm_add_ps(fiz,t3); 
+                       
+                       /* accumulate the aj1-4 fx and fy forces from memory */
+                       xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (f+aj1)); /*fx1 fy1 - - */
+                       xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (f+aj2)); /*fx2 fy2 - - */
+                       xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (f+aj3)); /*fx3 fy3 - - */
+                       xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (f+aj4)); /*fx4 fy4 - - */
+                       
+                       xmm5 = _mm_load1_ps(f+aj1+2); /* fz1 fz1 fz1 fz1 */
+                       xmm6 = _mm_load1_ps(f+aj2+2); /* fz2 fz2 fz2 fz2 */
+                       xmm7 = _mm_load1_ps(f+aj3+2); /* fz3 fz3 fz3 fz3 */
+                       xmm8 = _mm_load1_ps(f+aj4+2); /* fz4 fz4 fz4 fz4 */
+                       
+                       /* transpose forces */
+                       xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0)); /* fz1 fz1 fz2 fz2 */
+                       xmm6 = _mm_shuffle_ps(xmm7,xmm8, _MM_SHUFFLE(0,0,0,0)); /* fz3 fz3 fz4 fz4 */
+                       xmm7 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(2,0,2,0)); /* fz1 fz2 fz3 fz4 */
+                       
+                       xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2)); /*fx1 fy1 fx2 fy2 */
+                       xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2)); /*fx2 fy3 fx4 fy4 */
+                       
+                       xmm5 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0)); /*fx1 fx2 fx3 fx4 */
+                       xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1)); /*fy1 fy2 fy3 fy4 */
+                       
+                       /* subtract partial forces */
+                       xmm5 = _mm_sub_ps(xmm5, t1); /*fx1 fx2 fx3 fx4 */
+                       xmm6 = _mm_sub_ps(xmm6, t2); /*fy1 fy2 fy3 fy4 */
+                       xmm7 = _mm_sub_ps(xmm7, t3); /*fz1 fz2 fz3 fz4 */
+       
+                       /* transpose back fx's and fy's */
+                       xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /*fx1 fx2 fy1 fy2 */ 
+                       xmm2 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /*fx3 fx4 fy3 fy4 */
+                       xmm1 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0)); /*fx1 fy1 fx2 fy2 */
+                       xmm2 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0)); /*fx3 fy3 fx4 fy4 */
+                       
+                       /* store the force, first fx's and fy's */
+                       _mm_storel_pi( (__m64 *) (f+aj1), xmm1);
+                       _mm_storeh_pi( (__m64 *) (f+aj2), xmm1);
+                       _mm_storel_pi( (__m64 *) (f+aj3), xmm2);
+                       _mm_storeh_pi( (__m64 *) (f+aj4), xmm2);
+                       
+                       /* now do fz�s*/
+                       _mm_store_ss(f+aj1+2,xmm7); /*fz1 */
+                       xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+                       _mm_store_ss(f+aj2+2,xmm7); /*fz2 */
+                       xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+                       _mm_store_ss(f+aj3+2,xmm7); /*fz4 */
+                       xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1)); 
+                       _mm_store_ss(f+aj4+2,xmm7); /*fz3 */
+               }
+               
+               /*deal with odd elements */
+               if(offset!=0) {
+               
+                       aj1 = aj2 = aj3 = aj4 = 0;
+                       
+                       if(offset==1)
+                       {
+                               aj1 = nl->jjnr[k];       
+                               aj1 = aj1 * 3; 
+                               
+                               xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (xd+aj1)); /*x1 y1 */
+                               xmm5 = _mm_load1_ps(xd+aj1+2); /*z1 z1 z1 z1*/
+                                                               
+                               xmm6 = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(0,0,0,0)); /*x1 - - - */ 
+                               xmm4 = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(1,1,1,1)); /*y1 - - - */
+                               
+                               mask = _mm_set_epi32(0,0,0,0xffffffff);
+                       }
+                       else if(offset==2)
+                       {
+                               aj1 = nl->jjnr[k];       
+                               aj2 = nl->jjnr[k+1];
+                                                       
+                               aj1 = aj1 * 3; 
+                               aj2 = aj2 * 3;
+                               
+                               xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (xd+aj1)); /* x1 y1 - - */
+                               xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (xd+aj2)); /* x2 y2 - - */
+                               
+                               xmm5 = _mm_load1_ps(xd+aj1+2); /*z1 z1 z1 z1*/
+                               xmm6 = _mm_load1_ps(xd+aj2+2); /*z2 z2 z2 z2*/
+                               
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0)); /*z1 z1 z2 z2 */
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0)); /*z1 z2 z1 z2 */
+                               
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2)); /*x1 y1 x2 y2 */
+                               xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0)); /*x1 x2 x1 x2 */
+                               xmm4 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1)); /*y1 y2 y1 y2 */ 
+                                                                               
+                               mask = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+                       }
+                       else
+                       {
+                               aj1 = nl->jjnr[k];       
+                               aj2 = nl->jjnr[k+1];
+                               aj3 = nl->jjnr[k+2];
+                                                               
+                               aj1 = aj1 * 3; 
+                               aj2 = aj2 * 3;
+                               aj3 = aj3 * 3;
+                               
+                               xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (xd+aj1)); /*x1 y1 - - */
+                               xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (xd+aj2)); /*x2 y2 - - */
+                               xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (xd+aj3)); /*x3 y3 - - */
+                               
+                               xmm5 = _mm_load1_ps(xd+aj1+2); /* z1 z1 z1 z1 */
+                               xmm6 = _mm_load1_ps(xd+aj2+2); /* z2 z2 z2 z2 */
+                               xmm7 = _mm_load1_ps(xd+aj3+2); /* z3 z3 z3 z3 */
+                                                                                       
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0)); /* z1 z1 z2 z2 */
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1)); /* z1 z2 z3 z3 */                                               
+                               
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2)); /* x1 y1 x2 y2 */
+                               xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2)); /* x3 y3 x3 y3 */
+                               
+                               xmm6 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0)); /* x1 x2 x3 x3 */
+                               xmm4 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1)); /* y1 y2 y3 y3 */
+                               
+                               mask = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+                       }
+                                               
+                       jx = _mm_and_ps( (__m128) mask, xmm6);
+                       jy = _mm_and_ps( (__m128) mask, xmm4);
+                       jz = _mm_and_ps( (__m128) mask, xmm5);
+                       
+                       dax  = _mm_loadu_ps(dadx+n);
+                       n   = n + offset;
+                       dax = _mm_and_ps( (__m128) mask, dax); 
+                       
+                       dx   = _mm_sub_ps(ix, jx);
+                       dy   = _mm_sub_ps(iy, jy);
+                       dz   = _mm_sub_ps(iz, jz);
+                       
+                       fgb  = _mm_mul_ps(dva,dax); 
+               
+                       t1   = _mm_mul_ps(fgb,dx); /* fx1, fx2, fx3, fx4 */
+                       t2   = _mm_mul_ps(fgb,dy); /* fy1, fy2, fy3, fy4 */
+                       t3   = _mm_mul_ps(fgb,dz); /* fz1, fz2, fz3, fz4 */
+                               
+                       if(offset==1) {
+                               xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (f+aj1)); /* fx1 fy1 */
+                               xmm7 = _mm_load1_ps(f+aj1+2); /* fz1 fz1 fz1 fz1 */
+                               
+                               xmm5 = _mm_shuffle_ps(xmm1,xmm1, _MM_SHUFFLE(0,0,0,0)); /* fx1 - - - */
+                               xmm6 = _mm_shuffle_ps(xmm1,xmm1, _MM_SHUFFLE(1,1,1,1)); /* fy1 - - - */
+                               
+                               xmm5 = _mm_sub_ps(xmm5,t1);
+                               xmm6 = _mm_sub_ps(xmm6,t2);
+                               xmm7 = _mm_sub_ps(xmm7,t3);
+                                                               
+                               _mm_store_ss(f+aj1 , xmm5);
+                               _mm_store_ss(f+aj1+1,xmm6);
+                               _mm_store_ss(f+aj1+2,xmm7);
+                       }
+                       else if(offset==2) {
+                               xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (f+aj1)); /*fx1 fy1 - -*/ 
+                               xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (f+aj2)); /*fx2 fy2 - - */
+                               
+                               xmm5 = _mm_load1_ps(f+aj1+2); /* fz1 fz1 fz1 fz1 */
+                               xmm6 = _mm_load1_ps(f+aj2+2); /* fz2 fz2 fz2 fz2 */
+                               
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0)); /*fz1 fz1 fz2 fz2*/
+                               xmm7 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0)); /*fz1 fz2 fz1 fz2 */
+                               
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2)); /*x1 y1 x2 y2 */
+                               xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0)); /*x1 x2 x1 x2 */
+                               xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1)); /*y1 y2 y1 y2 */
+                               
+                               xmm5 = _mm_sub_ps(xmm5, t1);
+                               xmm6 = _mm_sub_ps(xmm6, t2);
+                               xmm7 = _mm_sub_ps(xmm7, t3);
+                               
+                               xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /*fx1 fx2 fy1 fy2*/
+                               xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0)); /*fx1 fy1 fx2 fy2*/
+                               
+                               _mm_storel_pi( (__m64 *) (f+aj1), xmm5);
+                               _mm_storeh_pi( (__m64 *) (f+aj2), xmm5);
+                               
+                               _mm_store_ss(f+aj1+2,xmm7);
+                               xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+                               _mm_store_ss(f+aj2+2,xmm7);
+                       }
+                       else {
+                               xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (f+aj1)); /*fx1 fy1 - - */
+                               xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (f+aj2)); /*fx2 fy2 - - */
+                               xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (f+aj3)); /*fx3 fy3 - - */
+                               
+                               xmm5 = _mm_load1_ps(f+aj1+2); /* fz1 fz1 fz1 fz1 */
+                               xmm6 = _mm_load1_ps(f+aj2+2); /* fz2 fz2 fz2 fz2 */
+                               xmm7 = _mm_load1_ps(f+aj3+2); /* fz3 fz3 fz3 fz3 */
+                               
+                               xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0)); /* fz1 fz1 fz2 fz2 */
+                               xmm6 = _mm_shuffle_ps(xmm7,xmm7, _MM_SHUFFLE(0,0,0,0)); /* fz3 fz3 fz3 fz3 */
+                               xmm7 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(2,0,2,0)); /* fz1 fz2 fz3 fz4 */
+                               
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2)); /*fx1 fy1 fx2 fy2 */
+                               xmm2 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(3,2,3,2)); /*fx2 fy3 fx3 fy3 */
+                       
+                               xmm5 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0)); /*fx1 fx2 fx3 fx3 */
+                               xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1)); /*fy1 fy2 fy3 fy3  */
+                               
+                               xmm5 = _mm_sub_ps(xmm5, t1);
+                               xmm6 = _mm_sub_ps(xmm6, t2);
+                               xmm7 = _mm_sub_ps(xmm7, t3);
+                               
+                               xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /*fx1 fx2 fy1 fy2 */
+                               xmm2 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /*fx3 fx3 fy3 fy3 */
+                               xmm1 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0)); /*fx1 fy1 fx2 fy2 */
+                               xmm2 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0)); /*fx3 fy3 fx3 fy3 */
+                       
+                               _mm_storel_pi( (__m64 *) (f+aj1), xmm1);
+                               _mm_storeh_pi( (__m64 *) (f+aj2), xmm1);
+                               _mm_storel_pi( (__m64 *) (f+aj3), xmm2);
+                               
+                               _mm_store_ss(f+aj1+2,xmm7); /*fz1*/
+                               xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+                               _mm_store_ss(f+aj2+2,xmm7); /*fz2*/
+                               xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+                               _mm_store_ss(f+aj3+2,xmm7); /*fz3*/
+
+                       }
+                       
+                       t1 = _mm_and_ps( (__m128) mask, t1);
+                       t2 = _mm_and_ps( (__m128) mask, t2);
+                       t3 = _mm_and_ps( (__m128) mask, t3);
+
+                       fix = _mm_add_ps(fix,t1);
+                       fiy = _mm_add_ps(fiy,t2);
+                       fiz = _mm_add_ps(fiz,t3);       
+                       
+               } /*end offset!=0*/
+        
+               /* fix/fiy/fiz now contain four partial force terms, that all should be
+               * added to the i particle forces. 
+               */
+               t1 = _mm_movehl_ps(t1,fix);
+               t2 = _mm_movehl_ps(t2,fiy);
+               t3 = _mm_movehl_ps(t3,fiz);
+       
+               fix = _mm_add_ps(fix,t1);
+               fiy = _mm_add_ps(fiy,t2);
+               fiz = _mm_add_ps(fiz,t3);
+               
+               t1 = _mm_shuffle_ps( fix, fix, _MM_SHUFFLE(1,1,1,1) );
+               t2 = _mm_shuffle_ps( fiy, fiy, _MM_SHUFFLE(1,1,1,1) );
+               t3 = _mm_shuffle_ps( fiz, fiz, _MM_SHUFFLE(1,1,1,1) );
+               
+               fix = _mm_add_ss(fix,t1); /*fx - - - */
+               fiy = _mm_add_ss(fiy,t2); /*fy - - -  */
+               fiz = _mm_add_ss(fiz,t3); /*fz - - -  */
+               
+               xmm2 = _mm_unpacklo_ps(fix,fiy); /*fx, fy, - - */
+               xmm2 = _mm_movelh_ps(xmm2,fiz);
+               xmm2 = _mm_and_ps( (__m128) maski, xmm2);
+               
+               /* load i force from memory */
+               xmm4 = _mm_loadl_pi(xmm4,(__m64 *) (f+ai3)); /*fx fy - -  */
+               xmm5 = _mm_load1_ps(f+ai3+2); /* fz fz fz fz */
+               xmm4 = _mm_shuffle_ps(xmm4,xmm5,_MM_SHUFFLE(3,2,1,0)); /*fx fy fz fz*/
+               
+               /* add to i force */
+               xmm4 = _mm_add_ps(xmm4,xmm2);
+               
+               /* store i force to memory */
+               _mm_storel_pi( (__m64 *) (f+ai3),xmm4); /* fx fy - - */
+               xmm4 = _mm_shuffle_ps(xmm4,xmm4,_MM_SHUFFLE(2,2,2,2)); /* only the third term is correct for fz */
+               _mm_store_ss(f+ai3+2,xmm4); /*fz*/
+       }       
+
+       return 0;       
+}
+
+
+
+
+
+
+#else
+/* keep compiler happy */
+int genborn_sse_dummy;
+
+#endif /* SSE intrinsics available */
diff --git a/src/mdlib/genborn_sse.h b/src/mdlib/genborn_sse.h
new file mode 100644 (file)
index 0000000..469c6b0
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ * $Id$
+ * 
+ *                This source code is part of
+ * 
+ *                 G   R   O   M   A   C   S
+ * 
+ *          GROningen MAchine for Chemical Simulations
+ * 
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2008, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * 
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ * 
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ * 
+ * For more info, check our website at http://www.gromacs.org
+ * 
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+
+#ifndef _genborn_sse_h
+#define _genborn_sse_h
+
+#include "typedefs.h"
+#include "grompp.h"
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+
+real 
+calc_gb_chainrule_sse(int natoms, t_nblist *nl, real *dadx, real *dvda, real *xd, real *f, int gb_algorithm, gmx_genborn_t *born);                                             
+
+
+int 
+calc_gb_rad_still_sse(t_commrec *cr, t_forcerec *fr,int natoms, gmx_mtop_t *mtop,
+                                         const t_atomtypes *atype, real *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md);
+
+int 
+calc_gb_rad_hct_sse(t_commrec *cr, t_forcerec *fr, int natoms, gmx_mtop_t *mtop, const t_atomtypes *atype, real *x,
+                                       t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md);
+
+int 
+calc_gb_rad_obc_sse(t_commrec *cr, t_forcerec * fr, int natoms, gmx_mtop_t *mtop,
+                                       const t_atomtypes *atype, real *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md);
+
+
+
+
+
+#endif /* _genborn_sse_h */
index 55c41e0483794ab335589689ea88e8e571ce5a97..45634de2540ca7abf7acec80166c0d98db0fe939 100644 (file)
@@ -535,7 +535,7 @@ static void evaluate_energy(FILE *fplog,bool bVerbose,t_commrec *cr,
                            gmx_vsite_t *vsite,gmx_constr_t constr,
                            t_fcdata *fcd,
                            t_graph *graph,t_mdatoms *mdatoms,
-                           t_forcerec *fr, rvec mu_tot,
+                           t_forcerec *fr, gmx_genborn_t *born,rvec mu_tot,
                            gmx_enerdata_t *enerd,tensor vir,tensor pres,
                            int count,bool bFirst)
 {
@@ -585,10 +585,10 @@ static void evaluate_energy(FILE *fplog,bool bVerbose,t_commrec *cr,
    * We do not unshift, so molecules are always whole in congrad.c
    */
   do_force(fplog,cr,inputrec,
-          count,nrnb,wcycle,top,&top_global->groups,
+          count,nrnb,wcycle,top,top_global,&top_global->groups,
           ems->s.box,ems->s.x,&ems->s.hist,
           ems->f,*buf,force_vir,mdatoms,enerd,fcd,
-          ems->s.lambda,graph,fr,vsite,mu_tot,t,NULL,NULL,
+          ems->s.lambda,graph,fr,vsite,mu_tot,t,NULL,NULL,born,FALSE,
           GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | GMX_FORCE_VIRIAL |
           (bNS ? GMX_FORCE_NS : 0));
 
@@ -757,7 +757,7 @@ time_t do_cg(FILE *fplog,t_commrec *cr,
             rvec buf[],t_mdatoms *mdatoms,
             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
             gmx_edsam_t ed,
-            t_forcerec *fr,
+            t_forcerec *fr,gmx_genborn_t *born,
             int repl_ex_nst,int repl_ex_seed,
             real cpt_period,real max_hours,
             unsigned long Flags,
@@ -820,7 +820,7 @@ time_t do_cg(FILE *fplog,t_commrec *cr,
   evaluate_energy(fplog,bVerbose,cr,
                  state_global,top_global,s_min,&buf,top,
                  inputrec,nrnb,wcycle,
-                 vsite,constr,fcd,graph,mdatoms,fr,
+                 vsite,constr,fcd,graph,mdatoms,fr,born,
                  mu_tot,enerd,vir,pres,-1,TRUE);
   where();
 
@@ -973,7 +973,7 @@ time_t do_cg(FILE *fplog,t_commrec *cr,
     evaluate_energy(fplog,bVerbose,cr,
                    state_global,top_global,s_c,&buf,top,
                    inputrec,nrnb,wcycle,
-                   vsite,constr,fcd,graph,mdatoms,fr,
+                   vsite,constr,fcd,graph,mdatoms,fr,born,
                    mu_tot,enerd,vir,pres,-1,FALSE);
     
     /* Calc derivative along line */
@@ -1060,7 +1060,7 @@ time_t do_cg(FILE *fplog,t_commrec *cr,
        evaluate_energy(fplog,bVerbose,cr,
                        state_global,top_global,s_b,&buf,top,
                        inputrec,nrnb,wcycle,
-                       vsite,constr,fcd,graph,mdatoms,fr,
+                       vsite,constr,fcd,graph,mdatoms,fr,born,
                        mu_tot,enerd,vir,pres,-1,FALSE);
        
        /* p does not change within a step, but since the domain decomposition
@@ -1283,7 +1283,7 @@ time_t do_lbfgs(FILE *fplog,t_commrec *cr,
                rvec buf[],t_mdatoms *mdatoms,
                t_nrnb *nrnb,gmx_wallcycle_t wcycle,
                gmx_edsam_t ed,
-               t_forcerec *fr,
+               t_forcerec *fr,gmx_genborn_t *born,
                int repl_ex_nst,int repl_ex_seed,
                real cpt_period,real max_hours,
                unsigned long Flags,
@@ -1408,7 +1408,7 @@ time_t do_lbfgs(FILE *fplog,t_commrec *cr,
   evaluate_energy(fplog,bVerbose,cr,
                  state,top_global,&ems,&buf,top,
                  inputrec,nrnb,wcycle,
-                 vsite,constr,fcd,graph,mdatoms,fr,
+                 vsite,constr,fcd,graph,mdatoms,fr,born,
                  mu_tot,enerd,vir,pres,-1,TRUE);
   where();
        
@@ -1562,7 +1562,7 @@ time_t do_lbfgs(FILE *fplog,t_commrec *cr,
     evaluate_energy(fplog,bVerbose,cr,
                    state,top_global,&ems,&buf,top,
                    inputrec,nrnb,wcycle,
-                   vsite,constr,fcd,graph,mdatoms,fr,
+                   vsite,constr,fcd,graph,mdatoms,fr,born,
                    mu_tot,enerd,vir,pres,step,FALSE);
     EpotC = ems.epot;
     
@@ -1640,7 +1640,7 @@ time_t do_lbfgs(FILE *fplog,t_commrec *cr,
        evaluate_energy(fplog,bVerbose,cr,
                        state,top_global,&ems,&buf,top,
                        inputrec,nrnb,wcycle,
-                       vsite,constr,fcd,graph,mdatoms,fr,
+                       vsite,constr,fcd,graph,mdatoms,fr,born,
                        mu_tot,enerd,vir,pres,step,FALSE);
        EpotB = ems.epot;
        
@@ -1919,7 +1919,7 @@ time_t do_steep(FILE *fplog,t_commrec *cr,
                rvec buf[],t_mdatoms *mdatoms,
                t_nrnb *nrnb,gmx_wallcycle_t wcycle,
                gmx_edsam_t ed,
-               t_forcerec *fr,
+               t_forcerec *fr,gmx_genborn_t *born,
                int repl_ex_nst,int repl_ex_seed,
                real cpt_period,real max_hours,
                unsigned long Flags,
@@ -1994,7 +1994,7 @@ time_t do_steep(FILE *fplog,t_commrec *cr,
     evaluate_energy(fplog,bVerbose,cr,
                    state_global,top_global,s_try,&buf,top,
                    inputrec,nrnb,wcycle,
-                   vsite,constr,fcd,graph,mdatoms,fr,
+                   vsite,constr,fcd,graph,mdatoms,fr,born,
                    mu_tot,enerd,vir,pres,count,count==0);
     
     if (MASTER(cr))
@@ -2131,7 +2131,7 @@ time_t do_nm(FILE *fplog,t_commrec *cr,
             rvec buf[],t_mdatoms *mdatoms,
             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
             gmx_edsam_t ed,
-            t_forcerec *fr,
+            t_forcerec *fr,gmx_genborn_t *born,
             int repl_ex_nst,int repl_ex_seed,
             real cpt_period,real max_hours,
             unsigned long Flags,
@@ -2242,7 +2242,7 @@ time_t do_nm(FILE *fplog,t_commrec *cr,
     evaluate_energy(fplog,bVerbose,cr,
                    state_global,top_global,state_work,&buf,top,
                    inputrec,nrnb,wcycle,
-                   vsite,constr,fcd,graph,mdatoms,fr,
+                   vsite,constr,fcd,graph,mdatoms,fr,born,
                    mu_tot,enerd,vir,pres,count,count==0);
     count++;
 
@@ -2283,7 +2283,7 @@ time_t do_nm(FILE *fplog,t_commrec *cr,
          evaluate_energy(fplog,bVerbose,cr,
                          state_global,top_global,state_work,&buf,top,
                          inputrec,nrnb,wcycle,
-                         vsite,constr,fcd,graph,mdatoms,fr,
+                         vsite,constr,fcd,graph,mdatoms,fr,born,
                          mu_tot,enerd,vir,pres,count,count==0);
          count++;
                        
@@ -2297,7 +2297,7 @@ time_t do_nm(FILE *fplog,t_commrec *cr,
          evaluate_energy(fplog,bVerbose,cr,
                          state_global,top_global,state_work,&buf,top,
                          inputrec,nrnb,wcycle,
-                         vsite,constr,fcd,graph,mdatoms,fr,
+                         vsite,constr,fcd,graph,mdatoms,fr,born,
                          mu_tot,enerd,vir,pres,count,count==0);
          count++;
          
@@ -2395,7 +2395,7 @@ time_t do_tpi(FILE *fplog,t_commrec *cr,
              rvec buf[],t_mdatoms *mdatoms,
              t_nrnb *nrnb,gmx_wallcycle_t wcycle,
              gmx_edsam_t ed,
-             t_forcerec *fr,
+             t_forcerec *fr,gmx_genborn_t *born,
              int repl_ex_nst,int repl_ex_seed,
              real cpt_period,real max_hours,
              unsigned long Flags,
@@ -2782,10 +2782,10 @@ time_t do_tpi(FILE *fplog,t_commrec *cr,
        /* Make do_force do a single node fore calculation */
        cr->nnodes = 1;
        do_force(fplog,cr,inputrec,
-                step,nrnb,wcycle,top,&top_global->groups,
+                step,nrnb,wcycle,top,top_global,&top_global->groups,
                 rerun_fr.box,state->x,&state->hist,
                 f,buf,force_vir,mdatoms,enerd,fcd,
-                lambda,NULL,fr,NULL,mu_tot,t,NULL,NULL,
+                lambda,NULL,fr,NULL,mu_tot,t,NULL,NULL,born,FALSE,
                 GMX_FORCE_NONBONDED |
                 (bNS ? GMX_FORCE_NS : 0) |
                 (bStateChanged ? GMX_FORCE_STATECHANGED : 0)); 
index a8136eda200df4895dbd8a1f58d9cb2aba056bb2..dcd287513deca80ec4f70620a55d2a02b73e831a 100644 (file)
@@ -278,7 +278,12 @@ void init_neighbor_list(FILE *log,t_forcerec *fr,int homenr)
    }  
 
    /* Determine the values for icoul/ivdw. */
-   if (fr->bcoultab)
+   /* Start with GB */
+   if(fr->bGB)
+   {
+       icoul=4;
+   }
+   else if (fr->bcoultab)
    {
        icoul = 3;
    }
@@ -355,7 +360,7 @@ void init_neighbor_list(FILE *log,t_forcerec *fr,int homenr)
        init_nblist(&fr->QMMMlist_sr,&fr->QMMMlist_lr,
                    maxsr,maxlr,0,icoul,FALSE,enlistATOM_ATOM);
    }
-   
+
 }
 
  static void reset_nblist(t_nblist *nl)
@@ -2358,6 +2363,7 @@ void init_ns(FILE *fplog,const t_commrec *cr,
     }
 }
 
+                        
 int search_neighbours(FILE *log,t_forcerec *fr,
                       rvec x[],matrix box,
                       gmx_localtop_t *top,
@@ -2379,7 +2385,7 @@ int search_neighbours(FILE *log,t_forcerec *fr,
     int      cg_start,cg_end,start,end;
     gmx_ns_t *ns;
     t_grid   *grid;
-    
+       
     ns = &fr->ns;
 
     /* Set some local variables */
@@ -2416,6 +2422,7 @@ int search_neighbours(FILE *log,t_forcerec *fr,
     
     if (bGrid && bFillGrid)
     {
+               
         grid = ns->grid;
         bFilledHome = (DOMAINDECOMP(cr) && dd_filled_nsgrid_home(cr->dd));
         if (!bFilledHome)
index 9b8f8af0bb383db13c8ee42d0485c475108c7e3f..d1d72156a79437518c073187dc00f2b0206f26e5 100644 (file)
@@ -746,7 +746,9 @@ static void init_adir(FILE *log,
 int relax_shell_flexcon(FILE *fplog,t_commrec *cr,bool bVerbose,
                        int mdstep,t_inputrec *inputrec,
                        bool bDoNS,bool bStopCM,
-                       gmx_localtop_t *top,gmx_constr_t constr,
+                       gmx_localtop_t *top,
+                       gmx_mtop_t* mtop,
+                       gmx_constr_t constr,
                        gmx_enerdata_t *enerd,t_fcdata *fcd,
                        t_state *state,rvec f[],
                        rvec buf[],tensor force_vir,
@@ -756,6 +758,7 @@ int relax_shell_flexcon(FILE *fplog,t_commrec *cr,bool bVerbose,
                        gmx_groups_t *groups,
                        struct gmx_shellfc *shfc,
                        t_forcerec *fr,
+                       gmx_genborn_t *born, bool bBornRadii,
                        double t,rvec mu_tot,
                        int natoms,bool *bConverged,
                        gmx_vsite_t *vsite,
@@ -863,11 +866,11 @@ int relax_shell_flexcon(FILE *fplog,t_commrec *cr,bool bVerbose,
   if (gmx_debug_at) {
     pr_rvecs(debug,0,"x b4 do_force",state->x + start,homenr);
   }
-  do_force(fplog,cr,inputrec,mdstep,nrnb,wcycle,top,groups,
+  do_force(fplog,cr,inputrec,mdstep,nrnb,wcycle,top,mtop,groups,
           state->box,state->x,&state->hist,
           force[Min],buf,force_vir,md,enerd,fcd,
           state->lambda,graph,
-          fr,vsite,mu_tot,t,fp_field,NULL,
+          fr,vsite,mu_tot,t,fp_field,NULL,born,bBornRadii,
           GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES |
           (bDoNS ? GMX_FORCE_NS : 0) | GMX_FORCE_VIRIAL);
 
@@ -948,10 +951,10 @@ int relax_shell_flexcon(FILE *fplog,t_commrec *cr,bool bVerbose,
     }
     /* Try the new positions */
     do_force(fplog,cr,inputrec,1,nrnb,wcycle,
-            top,groups,state->box,pos[Try],&state->hist,
+            top,mtop,groups,state->box,pos[Try],&state->hist,
             force[Try],buf,force_vir,
             md,enerd,fcd,state->lambda,graph,
-            fr,vsite,mu_tot,t,fp_field,NULL,
+            fr,vsite,mu_tot,t,fp_field,NULL,born,bBornRadii,
             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | GMX_FORCE_VIRIAL);
     
     if (gmx_debug_at) {
index 2352cad7fdb3d49e28ce7dcbc1f875a1464ed709..3970b9f20de7d26fbd7535a0e165f64499f859f4 100644 (file)
@@ -81,6 +81,7 @@
 #include "domdec.h"
 #include "partdec.h"
 #include "gmx_wallcycle.h"
+#include "genborn.h"
 
 #ifdef GMX_MPI
 #include "mpi.h"
@@ -328,6 +329,7 @@ void do_force(FILE *fplog,t_commrec *cr,
              t_inputrec *inputrec,
              int step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
              gmx_localtop_t *top,
+             gmx_mtop_t *mtop,
              gmx_groups_t *groups,
              matrix box,rvec x[],history_t *hist,
              rvec f[],rvec buf[],
@@ -337,6 +339,7 @@ void do_force(FILE *fplog,t_commrec *cr,
              real lambda,t_graph *graph,
              t_forcerec *fr,gmx_vsite_t *vsite,rvec mu_tot,
              double t,FILE *field,gmx_edsam_t ed,
+             gmx_genborn_t *born, bool bBornRadii,
              int flags)
 {
   static rvec box_size;
@@ -489,6 +492,9 @@ void do_force(FILE *fplog,t_commrec *cr,
     wallcycle_stop(wcycle,ewcNS);
   }
   
+  if(inputrec->implicit_solvent && bNS) 
+         gb_nblist_siev(cr,mdatoms->nr,inputrec->gb_algorithm, inputrec->rlist,x,fr,&top->idef.il[F_GB], (born->n12+born->n13)*3);
+
   if (DOMAINDECOMP(cr)) {
     if (!(cr->duty & DUTY_PME)) {
       wallcycle_start(wcycle,ewcPPDURINGPME);
@@ -564,7 +570,9 @@ void do_force(FILE *fplog,t_commrec *cr,
   /* Compute the bonded and non-bonded forces */    
   do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
                    cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
-                   x,hist,f,enerd,fcd,box,lambda,graph,&(top->excls),mu_tot_AB,
+                   x,hist,f,enerd,fcd,mtop,born,
+                   &(top->atomtypes),bBornRadii,box,
+                   lambda,graph,&(top->excls),mu_tot_AB,
                    flags,&cycles_force);
   GMX_BARRIER(cr->mpi_comm_mygroup);
 
index b0a5432bc252f913e3a420a822db1662f0730a38..e0fb6fcb6d75c61dfa7e7f23637754ef0da81f72 100644 (file)
@@ -944,6 +944,159 @@ t_forcetable make_tables(FILE *out,const t_forcerec *fr,
   return table;
 }
 
+t_forcetable make_gb_table(FILE *out,const t_forcerec *fr,
+                                                  const char *fn,
+                                                  real rtab)
+{
+       const char *fns[3] = { "gbctab.xvg", "gbdtab.xvg", "gbrtab.xvg" };
+       const char *fns14[3] = { "gbctab14.xvg", "gbdtab14.xvg", "gbrtab14.xvg" };
+       FILE        *fp;
+       t_tabledata *td;
+       bool        bReadTab,bGenTab;
+       real        x0,y0,yp;
+       int         i,j,k,nx,nx0,tabsel[etiNR];
+       void *      p_tmp;
+       double      r,r2,Vtab,Ftab,expterm;
+       
+       t_forcetable table;
+       
+       double abs_error_r, abs_error_r2;
+       double rel_error_r, rel_error_r2;
+       double rel_error_r_old=0, rel_error_r2_old=0;
+       double x0_r_error, x0_r2_error;
+       
+       
+       /* Only set a Coulomb table for GB */
+       //tabsel[0]=etabGB;
+       //tabsel[1]=-1;
+       //tabsel[2]=-1;
+       
+       /* Set the table dimensions for GB, not really necessary to
+        * use etiNR (since we only have one table, but ...) 
+        */
+       snew(td,1);
+       table.r         = fr->gbtabr;
+       table.scale     = fr->gbtabscale;
+       table.scale_exp = 0;
+       table.n         = table.scale*table.r;
+       nx0             = 0;
+       nx              = table.scale*table.r;
+       
+       /* Check whether we have to read or generate 
+        * We will always generate a table, so remove the read code
+        * (Compare with original make_table function
+        */
+       bReadTab = FALSE;
+       bGenTab  = TRUE;
+       
+       /* Each table type (e.g. coul,lj6,lj12) requires four 
+        * numbers per datapoint. For performance reasons we want
+        * the table data to be aligned to 16-byte. This is accomplished
+        * by allocating 16 bytes extra to a temporary pointer, and then
+        * calculating an aligned pointer. This new pointer must not be
+        * used in a free() call, but thankfully we're sloppy enough not
+        * to do this :-)
+        */
+       
+       /* 4 fp entries per table point, nx+1 points, and 16 bytes extra to align it. */
+       p_tmp = malloc(4*(nx+1)*sizeof(real)+16);
+       
+       /* align it - size_t has the same same as a pointer */
+       table.tab = (real *) (((size_t) p_tmp + 16) & (~((size_t) 15)));  
+       
+       init_table(out,nx,nx0,table.scale,&(td[0]),!bReadTab);
+       
+       /* Local implementation so we don't have to use the etabGB
+        * enum above, which will cause problems later when
+        * making the other tables (right now even though we are using
+        * GB, the normal Coulomb tables will be created, but this
+        * will cause a problem since fr->eeltype==etabGB which will not
+        * be defined in fill_table and set_table_type
+        */
+       //fp=fopen("test.xvg","w");
+       for(i=nx0;i<nx;i++)
+    {
+               Vtab    = 0.0;
+               Ftab    = 0.0;
+               r       = td->x[i];
+               r2      = r*r;
+               expterm = exp(-0.25*r2);
+               
+               Vtab = 1/sqrt(r2+expterm);
+               Ftab = (r-0.25*r*expterm)/((r2+expterm)*sqrt(r2+expterm));
+               
+               /* Convert to single precision when we store to mem */
+               td->v[i]  = Vtab;
+               td->f[i]  = Ftab;
+               
+    }
+       
+       copy2table(table.n,0,4,td[0].x,td[0].v,td[0].f,table.tab);
+       
+       if(bDebugMode())
+    {
+               fp=xvgropen(fns[0],fns[0],"r","V");
+               /* plot the output 5 times denser than the table data */
+               //for(i=5*nx0;i<5*table.n;i++) 
+               for(i=nx0;i<table.n;i++)
+               {
+                       //x0=i*table.r/(5*table.n);
+                       x0=i*table.r/table.n;
+                       evaluate_table(table.tab,0,4,table.scale,x0,&y0,&yp);
+                       fprintf(fp,"%15.10e  %15.10e  %15.10e\n",x0,y0,yp);
+                       
+               }
+               ffclose(fp);
+    }
+       
+       /*
+        for(i=100*nx0;i<99.81*table.n;i++)
+        {
+        r = i*table.r/(100*table.n);
+        r2      = r*r;
+        expterm = exp(-0.25*r2);
+        
+        Vtab = 1/sqrt(r2+expterm);
+        Ftab = (r-0.25*r*expterm)/((r2+expterm)*sqrt(r2+expterm));
+        
+        
+        evaluate_table(table.tab,0,4,table.scale,r,&y0,&yp);
+        printf("gb: i=%d, x0=%g, y0=%15.15f, Vtab=%15.15f, yp=%15.15f, Ftab=%15.15f\n",i,r, y0, Vtab, yp, Ftab);
+        
+        abs_error_r=fabs(y0-Vtab);
+        abs_error_r2=fabs(yp-(-1)*Ftab);
+        
+        rel_error_r=abs_error_r/y0;
+        rel_error_r2=fabs(abs_error_r2/yp);
+        
+        //printf("rel_error=%15.15f, rel_error_r2=%15.15f\n",rel_error_r, rel_error_r2);
+        //printf("abs_error=%15.15f, abs_error_r2=%15.15f\n",abs_error_r, abs_error_r2);
+        
+        if(rel_error_r>rel_error_r_old)
+        {
+        rel_error_r_old=rel_error_r;
+        x0_r_error=x0;
+        }
+        
+        if(rel_error_r2>rel_error_r2_old)
+        {
+        rel_error_r2_old=rel_error_r2;
+        x0_r2_error=x0;        
+        }
+        }
+        
+        printf("gb: MAX REL ERROR IN R=%15.15f, MAX REL ERROR IN R2=%15.15f\n",rel_error_r_old, rel_error_r2_old);
+        printf("gb: XO_R=%g, X0_R2=%g\n",x0_r_error, x0_r2_error);
+        
+        exit(1); */
+       done_tabledata(&(td[0]));
+       sfree(td);
+       
+       return table;
+       
+       
+}
+
 bondedtable_t make_bonded_table(FILE *fplog,char *fn,int angle)
 {
   t_tabledata td;
@@ -972,3 +1125,5 @@ bondedtable_t make_bonded_table(FILE *fplog,char *fn,int angle)
 
   return tab;
 }
+
+