Still, HCT, and OBC generalized born algorithms, and the ace approximation of surface area.
Merge from separate GB tree, so while normal Gromacs code should be fine there might be a
couple of new GB bugs due to recent changes in CVS.
exit 1;; \
esac; \
done; \
- echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu admin/Makefile'; \
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign admin/Makefile'; \
cd $(top_srcdir) && \
- $(AUTOMAKE) --gnu admin/Makefile
+ $(AUTOMAKE) --foreign admin/Makefile
.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
#######################################################################
AC_PREREQ(2.50)
-AC_INIT(gromacs, 4.0.99_development_20090120, [gmx-users@gromacs.org])
+AC_INIT(gromacs, 4.0.99_development_20090122, [gmx-users@gromacs.org])
AC_CONFIG_SRCDIR(src/gmxlib/3dview.c)
AC_CONFIG_AUX_DIR(config)
AC_CANONICAL_HOST
filenm.h \
force.h \
futil.h \
+genborn.h \
gbutil.h \
gmx_fatal.h \
gmx_ana.h \
#include "typedefs.h"
#include "nrnb.h"
#include "pbc.h"
+#include "genborn.h"
extern int glatnr(int *global_atom_index,int i);
/* Returns the global topology atom number belonging to local atom index i.
gmx_enerdata_t *enerd,t_nrnb *nrnb,real lambda,
const t_mdatoms *md,
t_fcdata *fcd,int *ddgatindex,
+ t_atomtypes *atype, gmx_genborn_t *born,
bool bPrintSepPot,int step);
/*
* The function calc_bonds() calculates all bonded force interactions.
#include "network.h"
#include "tgroup.h"
#include "vsite.h"
+#include "genborn.h"
static char *sepdvdlformat=" %-30s V %12.5e dVdl %12.5e\n";
t_inputrec *inputrec,
int step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
gmx_localtop_t *top,
+ gmx_mtop_t *mtop,
gmx_groups_t *groups,
matrix box,rvec x[],history_t *hist,
rvec f[],rvec buf[],
real lambda,t_graph *graph,
t_forcerec *fr,gmx_vsite_t *vsite,rvec mu_tot,
double t,FILE *field,gmx_edsam_t ed,
+ gmx_genborn_t *born, bool bBornRadii,
int flags);
/* Communicate coordinates (if parallel).
* Do neighbor searching (if necessary).
rvec f[],
gmx_enerdata_t *enerd,
t_fcdata *fcd,
+ gmx_mtop_t *mtop,
+ gmx_genborn_t *born,
+ t_atomtypes *atype,
+ bool bBornRadii,
matrix box,
real lambda,
t_graph *graph,
--- /dev/null
+/*
+ * $Id$
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2008, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+
+#ifndef _genborn_h
+#define _genborn_h
+
+#include "typedefs.h"
+#include "grompp.h"
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+
+typedef struct
+{
+ int nbonds;
+ int bond[10];
+} bonds_t;
+
+typedef struct
+{
+ real length[20000];
+ real angle[20000];
+} bl_t;
+
+/* Struct to hold all the information for GB
+ * All these things are currently allocated in md.c
+ */
+typedef struct
+{
+ int nr; /* number of atoms, length of arrays below */
+ int n12; /* number of 1-2 (bond) interactions */
+ int n13; /* number of 1-3 (angle) terms */
+ int n14; /* number of 1-4 (torsion) terms */
+ real *gpol; /* Atomic polarisation energies */
+ real *bRad; /* Atomic Born radii */
+ real *vsolv; /* Atomic solvation volumes */
+ int *vs; /* Array for vsites-exclusions */
+ int nvs; /* Length of vs array */
+ real es; /* Solvation energy and derivatives */
+ real *asurf; /* Atomic surface area */
+ rvec *dasurf; /* Surface area derivatives */
+ real as; /* Total surface area */
+ real *S_hct; /* Overlap factors for HCT method */
+ real *drobc; /* Parameters for OBC chain rule calculation */
+ real *param; /* Precomputed factor raj*atype->S_hct for HCT/OBC */
+ real *log_table; /* Table for logarithm lookup */
+
+ real obc_alpha; /* OBC parameters */
+ real obc_beta; /* OBC parameters */
+ real obc_gamma; /* OBC parameters */
+ real gb_doffset; /* Dielectric offset for Still/HCT/OBC */
+
+ real *work; /* used for parallel summation */
+}
+gmx_genborn_t;
+
+
+/* Initialise GB stuff */
+int init_gb(gmx_genborn_t **p_born,t_commrec *cr, t_forcerec *fr, t_inputrec *ir,
+ gmx_mtop_t *mtop, rvec x[], real rgbradii, int gb_algorithm);
+
+
+/* Born radii calculations, both with and without SSE acceleration */
+int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir,int natoms, int nrfa, gmx_mtop_t *mtop,
+ const t_atomtypes *atype, rvec x[], rvec f[],t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md);
+
+
+
+/* Bonded GB interactions */
+real gb_bonds_tab(int nbonds, real *x, real *f, real *charge, real *p_gbtabscale,
+ real *invsqrta, real *dvda, real *GBtab, const t_iatom forceatoms[],
+ real epsilon_r, real facel);
+
+
+
+void gb_pd_send(t_commrec *cr, real *send_data, int nr);
+
+
+/* Functions for setting up the F_GB list in grompp */
+int
+init_gb_plist(t_params *p_list);
+
+int
+convert_gb_params(t_idef *idef, t_functype ftype, int start, t_params *gb_plist, gmx_genborn_t *born);
+
+int
+generate_gb_topology(gmx_mtop_t *mtop, t_params *plist, t_params *gb_plist, gmx_genborn_t *born);
+
+
+
+
+/* Functions for calculating adjustments due to ie chain rule terms */
+real
+calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_mtop_t *mtop, const t_atomtypes *atype, int nr,
+ rvec x[], rvec f[], t_forcerec *fr,const t_iatom forceatoms[],int gb_algorithm, bool bRad);
+
+
+int
+gb_nblist_siev(t_commrec *cr, int natoms, int gb_algorithm, real gbcut, rvec x[], t_forcerec *fr, t_ilist *il, int n14);
+
+
+#endif /* _genborn_h */
+
extern real get_atomtype_radius(int nt,t_atomtype at);
extern real get_atomtype_vol(int nt,t_atomtype at);
extern real get_atomtype_surftens(int nt,t_atomtype at);
+extern real get_atomtype_gb_radius(int nt,t_atomtype at);
+extern real get_atomtype_S_hct(int nt,t_atomtype at);
extern int get_atomtype_ptype(int nt,t_atomtype at);
extern int get_atomtype_batype(int nt,t_atomtype at);
extern int get_atomtype_atomnumber(int nt,t_atomtype at);
extern int set_atomtype(int nt,t_atomtype at,t_symtab *tab,
t_atom *a,char *name,t_param *nb,
int bondatomtype,
- real radius,real vol,real surftens,int atomnumber);
+ real radius,real vol,real surftens,int atomnumber,
+ real gb_radius, real S_hct);
/* Set the values of an existing atom type nt. Returns nt on success or
NOTSET on error. */
+int
+set_atomtype_gbparam(t_atomtype at, int i,
+ real radius,real vol,real surftens,
+ real gb_radius, real S_hct);
+
extern int add_atomtype(t_atomtype at,t_symtab *tab,
t_atom *a,char *name,t_param *nb,
int bondatomtype,
- real radius,real vol,real surftens,real atomnumber);
+ real radius,real vol,real surftens,real atomnumber,
+ real gb_radius, real S_hct);
/* Add a complete new atom type to an existing atomtype structure. Returns
the number of the atom type. */
d_angletypes,
d_dihedraltypes,
d_nonbond_params,
+ d_implicit_genborn_params,
+ d_implicit_surface_params,
d_moleculetype,
d_atoms,
d_vsites2,
"angletypes",
"dihedraltypes",
"nonbond_params",
+ "implicit_genborn_params",
+ "implicit_surface_params",
/* All the directives above can not appear after moleculetype */
"moleculetype",
"atoms",
#include "vsite.h"
#include "pull.h"
#include "update.h"
+#include "genborn.h"
#define MD_GLAS (1<<1)
#define MD_POLARISE (1<<2)
t_nrnb *nrnb,gmx_wallcycle_t wcycle,
gmx_edsam_t ed,
t_forcerec *fr,
+ gmx_genborn_t *born,
int repl_ex_nst,int repl_ex_seed,
real cpt_period,real max_hours,
unsigned long Flags,
extern const char *eann_names[eannNR+1];
extern const char *egb_names[egbNR+1];
extern const char *eis_names[eisNR+1];
+extern const char *esa_names[esaNR+1];
extern const char *ewt_names[ewtNR+1];
extern const char *epull_names[epullNR+1];
extern const char *epullg_names[epullgNR+1];
#define ECOM(e) ENUM_NAME(e,ecmNR,ecm_names)
#define EANNEAL(e) ENUM_NAME(e,eannNR,eann_names)
#define EGBALGORITHM(e) ENUM_NAME(e,egbNR,egb_names)
+#define ESAALGORITHM(e) ENUM_NAME(e,esaNR,esa_names)
#define EIMPLICITSOL(e) ENUM_NAME(e,eisNR,eis_names)
#define EWALLTYPE(e) ENUM_NAME(e,ewtNR,ewt_names)
#define EPULLTYPE(e) ENUM_NAME(e,epullNR,epull_names)
#include "pbc.h"
#include "network.h"
#include "tgroup.h"
+#include "genborn.h"
void
gmx_setup_kernels(FILE *fplog);
#endif
#include "typedefs.h"
-
+#include "genborn.h"
+
/* Initialization function, also predicts the initial shell postions.
* If x!=NULL, the shells are predict for the global coordinates x.
*/
extern int relax_shell_flexcon(FILE *log,t_commrec *cr,bool bVerbose,
int mdstep,t_inputrec *inputrec,
bool bDoNS,bool bStopCM,
- gmx_localtop_t *top,gmx_constr_t constr,
+ gmx_localtop_t *top,
+ gmx_mtop_t *mtop,
+ gmx_constr_t constr,
gmx_enerdata_t *enerd,t_fcdata *fcd,
t_state *state,rvec f[],
rvec buf[],tensor force_vir,
gmx_groups_t *groups,
gmx_shellfc_t shfc,
t_forcerec *fr,
+ gmx_genborn_t *born, bool bBornRadii,
double t,rvec mu_tot,
int natoms,bool *bConverged,
gmx_vsite_t *vsite,
} t_atoms;
typedef struct {
- int nr; /* number of atomtypes */
+ int nr; /* number of atomtypes */
real *radius; /* GBSA radius for each atomtype */
real *vol; /* GBSA efective volume for each atomtype */
real *surftens; /* implicit solvent surftens for each atomtype */
+ real *gb_radius; /* GB radius for each atom type */
+ real *S_hct; /* Overlap factors for HCT/OBC GB models */
+
int *atomnumber; /* Atomic number, used for QM/MM */
} t_atomtypes;
erscNO, erscALL, erscCOM, erscNR
};
+/*
+ * eelNOTUSED1 used to be GB, but to enable generalized born with different
+ * forms of electrostatics (RF, switch, etc.) in the future it is now selected
+ * separately (through the implicit_solvent option).
+ */
enum {
eelCUT, eelRF, eelGRF, eelPME, eelEWALD, eelPPPM,
- eelPOISSON, eelSWITCH, eelSHIFT, eelUSER, eelGB, eelRF_NEC, eelENCADSHIFT,
+ eelPOISSON, eelSWITCH, eelSHIFT, eelUSER, eelGB_NOTUSED, eelRF_NEC, eelENCADSHIFT,
eelPMEUSER, eelPMESWITCH, eelPMEUSERSWITCH, eelRF_ZERO, eelNR
};
/* Algorithms for calculating GB radii */
enum {
- egbSTILL, egbOBC, egbNR
+ egbSTILL, egbHCT, egbOBC, egbNR
+};
+
+enum {
+ esaNO, esaAPPROX, esaSTILL, esaNR
};
/* Wall types */
/* xmdrun flexible constraints */
real fc_stepsize;
+ /* Generalized born implicit solvent */
+ bool bGB;
/* Generalized born stuff */
+ /* Table data for GB */
+ t_forcetable gbtab;
/* VdW radius for each atomtype (dim is thus ntype) */
real *atype_radius;
/* Effective radius (derived from effective volume) for each type */
real *atype_vol;
/* Implicit solvent - surface tension for each atomtype */
real *atype_surftens;
+ /* Implicit solvent - radius for GB calculation */
+ real *atype_gb_radius;
+ /* Implicit solvent - overlap for HCT model */
+ real *atype_S_hct;
+
+ /* Table scale for GB */
+ real gbtabscale;
+ /* Table range for GB */
+ real gbtabr;
+ /* GB neighborlists (the sr list will contain for each atom all other atoms
+ * (for use in the SA calculation) and the lr list will contain
+ * for each atom all atoms 1-4 or greater (for use in the GB calculation)
+ */
+ t_nblist gblist_sr;
+ t_nblist gblist_lr;
+ t_nblist gblist;
+
+ /* Inverse square root of the Born radii for implicit solvent */
+ real *invsqrta;
+ /* Derivatives of the potential with respect to the Born radii */
+ real *dvda;
+ /* Derivatives of the Born radii with respect to coordinates */
+ real *dadx;
/* If > 0 signals Test Particle Insertion,
* the value is the number of atoms of the molecule to insert
F_IDIHS,
F_PIDIHS,
F_TABDIHS,
+ F_GB,
F_LJ14,
F_COUL14,
F_LJC14_Q,
struct {real phi,dphi,kfac;int label,power; } dihres;
struct {int ex,power,label; real c,obs,kfac; } orires;
struct {int table;real kA;real kB; } tab;
+ struct {real c6A,c12A,c6B,c12B,sar,st,pi,gbr,bmlt; } gb;
struct {real buf[MAXFORCEPARAM]; } generic; /* Conversion */
} t_iparams;
#define IS_CHEMBOND(ftype) (interaction_function[(ftype)].nratoms==2 && interaction_function[(ftype)].flags & IF_CHEMBOND)
/* IS_CHEMBOND tells if function type ftype represents a chemical bond */
+/* IS_ANGLE tells if a function type ftype represents an angle
+ * Per Larsson, 2007-11-06
+ */
+#define IS_ANGLE(ftype) (interaction_function[(ftype)].nratoms==3 && interaction_function[(ftype)].flags & IF_ATYPE)
+#define IS_VSITE(ftype) (interaction_function[(ftype)].flags & IF_VSITE)
+
#define IS_TABULATED(ftype) (interaction_function[(ftype)].flags & IF_TABULATED)
extern const t_interaction_function interaction_function[F_NRE];
real gb_obc_alpha; /* 1st scaling factor for Bashford-Case GB */
real gb_obc_beta; /* 2nd scaling factor for Bashford-Case GB */
real gb_obc_gamma; /* 3rd scaling factor for Bashford-Case GB */
- real sa_surface_tension; /* Energy factor for SA part of GBSA */
+ real gb_dielectric_offset; /* Dielectric offset for Still/HCT/OBC */
+ real sa_algorithm; /* Algorithm for SA part of GBSA */
+ int sa_surface_tension; /* Energy factor for SA part of GBSA */
int vdwtype; /* Type of Van der Waals treatment */
real rvdw_switch; /* Van der Waals switch range start (nm) */
- real rvdw; /* Van der Waals cutoff (nm) */
+ real rvdw; /* Van der Waals cutoff (nm) */
int eDispCorr; /* Perform Long range dispersion corrections */
real tabext; /* Extension of the table beyond the cut-off, *
- * as well as the table length for 1-4 interac. */
+ * as well as the table length for 1-4 interac. */
real shake_tol; /* tolerance for shake */
int efep; /* free energy interpolation no/yes */
double init_lambda; /* initial value for perturbation variable */
eNR_SHAKE_RIJ, eNR_CONSTR_VIR, eNR_SETTLE,
eNR_VSITE2, eNR_VSITE3, eNR_VSITE3FD,
eNR_VSITE3FAD, eNR_VSITE3OUT, eNR_VSITE4FD,
- eNR_VSITE4FDN, eNR_VSITEN,
+ eNR_VSITE4FDN, eNR_VSITEN, eNR_GB,
eNRNB
};
the <tt>-tpid</tt> option of <tt>mdrun</tt>.
No trajectory or energy file is written.
Parallel tpi gives identical results to single node tpi.
+For charged molecules, using PME with a fine grid is most accurate
+and also efficient, since the potential in the system only needs
+to be calculated once per frame.
</dd>
<dt><b>tpic</b></dt>
<dd> Test particle insertion into a predefined cavity location.
exit 1;; \
esac; \
done; \
- echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/Makefile'; \
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/Makefile'; \
cd $(top_srcdir) && \
- $(AUTOMAKE) --gnu src/Makefile
+ $(AUTOMAKE) --foreign src/Makefile
.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
exit 1;; \
esac; \
done; \
- echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu src/gmxlib/Makefile'; \
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/gmxlib/Makefile'; \
cd $(top_srcdir) && \
- $(AUTOMAKE) --gnu src/gmxlib/Makefile
+ $(AUTOMAKE) --foreign src/gmxlib/Makefile
.PRECIOUS: Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
@case '$?' in \
real lambda,
const t_mdatoms *md,
t_fcdata *fcd,int *global_atom_index,
+ t_atomtypes *atype, gmx_genborn_t *born,
bool bPrintSepPot,int step)
{
int ftype,nbonds,ind,nat;
#include "disre.h"
#include "dihre.h"
#include "orires.h"
+#include "genborn.h"
+
#define def_bonded(str,lstr,nra,nrpa,nrpb,ind,func)\
{str,lstr,(nra),(nrpa),(nrpb),IF_BOND, (ind),(func)}
def_bonded ("IDIHS", "Improper Dih.", 4, 2, 2, eNR_IMPROPER,idihs ),
def_bonded ("PIDIHS", "Improper Dih.", 4, 3, 3, eNR_PROPER, pdihs ),
def_bondedt ("TABDIHS", "Tab. Dih.", 4, 2, 2, eNR_TABDIHS, tab_dihs ),
+ def_bonded ("GB", "Generalized Born", 2, 2, 2, eNR_GB, unimplemented ),
def_bondedz ("LJ14", "LJ-14", 2, 2, 2, eNR_NB14, unimplemented ),
def_nofc ("COUL14", "Coulomb-14" ),
def_bondedz ("LJC14_Q", "LJC-14 q", 2, 5, 0, eNR_NB14, unimplemented ),
};
const char *egb_names[egbNR+1] = {
- "Still", "OBC", NULL
+ "Still", "HCT", "OBC", NULL
+};
+
+const char *esa_names[esaNR+1] = {
+ "No", "Ace-approximation", "Still", NULL
};
const char *ewt_names[ewtNR+1] = {
if(mknb_func.coul==MKNB_COUL_RF)
mknb_declare_real("krsq");
if(mknb_func.coul==MKNB_COUL_GB) {
- mknb_declare_real("isai,isaj,isaprod,gbscale");
+ mknb_declare_real("isai,isaj,isaprod,gbscale,vgb");
if(mknb_func.do_force)
- mknb_declare_real("dvdasum,dvdatmp,dvdaj");
+ mknb_declare_real("dvdasum,dvdatmp,dvdaj,fgb");
}
if(mknb_func.vdw==MKNB_VDW_BHAM)
/* Generalized born: load 1/sqrt(a) */
mknb_assign("isaj",mknb_array("invsqrta","jnr"));
mknb_assign("isaprod","isai*isaj");
+
+ /* GB-PL */
+ mknb_assign("qq","iq*%s",mknb_array("charge","jnr"));
+ mknb_assign("vcoul","qq*rinv%d%d",i,j);
+
+ if(mknb_func.do_force)
+ mknb_assign("fscal","vcoul*rinv%d%d",i,j);
+
/* Save a flop by multiplying qq with isa1*isa2 already here */
- mknb_assign("qq","isaprod*iq*%s",mknb_array("charge","jnr"));
+ mknb_assign("qq","isaprod*(-qq)");
mknb_assign("gbscale","isaprod*gbtabscale");
nflops+=4;
} else {
nflops += mknb_calc_gbtable_index("r");
nflops += mknb_read_table("GBtab");
- mknb_assign("vcoul","qq*VV");
+ mknb_assign("vgb","qq*VV");
nflops++;
if(mknb_func.do_force) {
mknb_assign("fijC","qq*FF*gbscale");
- mknb_assign("dvdatmp","vcoul+fijC*r");
- mknb_assign("dvdasum","dvdasum - dvdatmp");
- nflops+=5;
+ mknb_assign("dvdatmp","-0.5*(vgb+fijC*r)");
+ mknb_assign("dvdasum","dvdasum + dvdatmp");
+ nflops+=6;
/* fs_minusrinv is empty */
- sprintf(fs_minus_rinv,"fijC");
+ sprintf(fs_minus_rinv,"fijC-fscal");
/* Update j atom dvda */
- mknb_assign(mknb_array("dvda","jnr"),"dvdaj-dvdatmp");
+ mknb_assign(mknb_array("dvda","jnr"),"dvdaj+dvdatmp*isaj*isaj");
}
- mknb_assign("vctot","vctot + vcoul");
+ /* This will only give thw Coulomb part back to the total potential */
+ mknb_assign("vctot","vctot + vcoul");
nflops++;
return nflops;
}
strcpy(fs_minus_rinv,tmp);
}
+
if(strlen(fs_rinvsq)>0 && strlen(fs_minus_rinv)>0) {
- mknb_assign("fscal","(%s)*rinvsq-(%s)*%s",fs_rinvsq,fs_minus_rinv,rinv);
- nflops += 3;
+ mknb_assign("fscal","(%s)*rinvsq-(%s)*%s",fs_rinvsq,fs_minus_rinv,rinv);
+ nflops += 3;
} else if(strlen(fs_rinvsq)>0) {
- mknb_assign("fscal","(%s)*rinvsq",fs_rinvsq);
- nflops++;
+ mknb_assign("fscal","(%s)*rinvsq",fs_rinvsq);
+ nflops++;
} else if(strlen(fs_minus_rinv)>0) {
- mknb_assign("fscal","-(%s)*%s",fs_minus_rinv,rinv);
- nflops += 2;
+ mknb_assign("fscal","-(%s)*%s",fs_minus_rinv,rinv);
+ nflops += 2;
}
-
+
return nflops;
}
*/
if(mknb_func.coul==MKNB_COUL_GB && mknb_func.do_force) {
mknb_assign(mknb_array("dvda","ii"),
- "%s + dvdasum",
+ "%s + dvdasum*isai*isai",
mknb_array("dvda","ii"));
nflops++;
}
nb_kernel332_ia32_sse.s nb_kernel332_ia32_sse.h \
nb_kernel333_ia32_sse.s nb_kernel333_ia32_sse.h \
nb_kernel334_ia32_sse.s nb_kernel334_ia32_sse.h \
- nb_kernel400_ia32_sse.s nb_kernel400_ia32_sse.h \
- nb_kernel410_ia32_sse.s nb_kernel410_ia32_sse.h \
- nb_kernel430_ia32_sse.s nb_kernel430_ia32_sse.h \
+ nb_kernel400_ia32_sse.c nb_kernel400_ia32_sse.h \
+ nb_kernel410_ia32_sse.c nb_kernel410_ia32_sse.h \
+ nb_kernel430_ia32_sse.c nb_kernel430_ia32_sse.h \
nb_kernel_ia32_sse_test_asm.s nb_kernel_ia32_sse_test_asm.h \
nb_kernel_ia32_sse.c nb_kernel_ia32_sse.h
nb_kernel332_x86_64_sse.s nb_kernel332_x86_64_sse.h \
nb_kernel333_x86_64_sse.s nb_kernel333_x86_64_sse.h \
nb_kernel334_x86_64_sse.s nb_kernel334_x86_64_sse.h \
- nb_kernel400_x86_64_sse.s nb_kernel400_x86_64_sse.h \
- nb_kernel410_x86_64_sse.s nb_kernel410_x86_64_sse.h \
- nb_kernel430_x86_64_sse.s nb_kernel430_x86_64_sse.h \
+ nb_kernel400_x86_64_sse.c nb_kernel400_x86_64_sse.h \
+ nb_kernel410_x86_64_sse.c nb_kernel410_x86_64_sse.h \
+ nb_kernel430_x86_64_sse.c nb_kernel430_x86_64_sse.h \
nb_kernel_x86_64_sse_test_asm.s nb_kernel_x86_64_sse_test_asm.h \
nb_kernel_x86_64_sse.c nb_kernel_x86_64_sse.h
f[0],
fshift,
egcoul,
- egnb,
+ egnb,
nblists->tab.scale,
tabledata,
&outeriter,
egnb,
&(nblists->tab.scale),
tabledata,
- NULL,
- NULL,
- NULL,
- NULL,
+ fr->invsqrta,
+ fr->dvda,
+ &(fr->gbtabscale),
+ fr->gbtab.tab,
&nthreads,
&(nlist->count),
nlist->mtx,
{ "Virtual Site 3out", 87 },
{ "Virtual Site 4fd", 110 },
{ "Virtual Site 4fdn", 254 },
- { "Virtual Site N", 15 }
+ { "Virtual Site N", 15 },
+ { "Mixed Generalized Born stuff", 10 }
};
#include "mtop_util.h"
/* This number should be increased whenever the file format changes! */
-static const int tpx_version = 59;
+static const int tpx_version = 60;
/* This number should only be increased when you edit the TOPOLOGY section
* of the tpx format. This way we can maintain forward compatibility too
* to the end of the tpx file, so we can just skip it if we only
* want the topology.
*/
-static const int tpx_generation = 17;
+static const int tpx_generation = 18;
/* This number should be the most recent backwards incompatible version
* I.e., if this number is 9, we cannot read tpx version 9 with this code.
{ 26, F_FOURDIHS },
{ 26, F_PIDIHS },
{ 43, F_TABDIHS },
+ { 60, F_GB },
{ 41, F_LJC14_Q },
{ 41, F_LJC_PAIRS_NB },
{ 32, F_BHAM_LR },
do_real(ir->gb_obc_alpha);
do_real(ir->gb_obc_beta);
do_real(ir->gb_obc_gamma);
+ if(file_version>=60)
+ {
+ do_real(ir->gb_dielectric_offset);
+ do_int(ir->sa_algorithm);
+ }
+ else
+ {
+ ir->gb_dielectric_offset = 0.09;
+ ir->sa_algorithm = esaAPPROX;
+ }
do_real(ir->sa_surface_tension);
}
else
do_int (iparams->vsiten.n);
do_real(iparams->vsiten.a);
break;
+ case F_GB:
+ do_real(iparams->gb.c6A);
+ do_real(iparams->gb.c12A);
+ do_real(iparams->gb.c6B);
+ do_real(iparams->gb.c12B);
+ do_real(iparams->gb.sar);
+ do_real(iparams->gb.st);
+ do_real(iparams->gb.pi);
+ do_real(iparams->gb.gbr);
+ do_real(iparams->gb.bmlt);
+ break;
default:
gmx_fatal(FARGS,"unknown function type %d (%s) in %s line %d",
snew(atomtypes->vol,j);
snew(atomtypes->surftens,j);
snew(atomtypes->atomnumber,j);
+ snew(atomtypes->gb_radius,j);
+ snew(atomtypes->S_hct,j);
}
ndo_real(atomtypes->radius,j,bDum);
ndo_real(atomtypes->vol,j,bDum);
{
ndo_int(atomtypes->atomnumber,j,bDum);
}
+ if(file_version >= 60)
+ {
+ ndo_real(atomtypes->gb_radius,j,bDum);
+ ndo_real(atomtypes->S_hct,j,bDum);
+ }
} else {
/* File versions prior to 26 cannot do GBSA,
* so they dont use this structure
atomtypes->vol = NULL;
atomtypes->surftens = NULL;
atomtypes->atomnumber = NULL;
+ atomtypes->gb_radius = NULL;
+ atomtypes->S_hct = NULL;
}
}
PR("gb_obc_alpha",ir->gb_obc_alpha);
PR("gb_obc_beta",ir->gb_obc_beta);
PR("gb_obc_gamma",ir->gb_obc_gamma);
+ PR("gb_dielectric_offset",ir->gb_dielectric_offset);
+ PS("sa_algorithm",ESAALGORITHM(ir->gb_algorithm));
PR("sa_surface_tension",ir->sa_surface_tension);
PS("DispCorr",EDISPCORR(ir->eDispCorr));
case F_VSITEN:
fprintf(fp,"n=%2d, a=%15.8e\n",iparams->vsiten.n,iparams->vsiten.a);
break;
+ case F_GB:
+ fprintf(fp, "c6A=%15.8e, c12A=%15.8e, c6B=%15.8e, c12B=%15.8e, sar=%15.8e, st=%15.8e, pi=%15.8e, gbr=%15.8e, bmlt=%15.8e\n",iparams->gb.c6A,iparams->gb.c12A,iparams->gb.c6B,iparams->gb.c12B,iparams->gb.sar,iparams->gb.st,iparams->gb.pi,iparams->gb.gbr,iparams->gb.bmlt);
+ break;
default:
gmx_fatal(FARGS,"unknown function type %d (%s) in %s line %d",
ftype,interaction_function[ftype].name,__FILE__,__LINE__);
indent=pr_title(fp,indent,title);
for(i=0;i<atomtypes->nr;i++) {
pr_indent(fp,indent);
- fprintf(fp,
- "atomtype[%3d]={radius=%12.5e, volume=%12.5e, surftens=%12.5e, atomnumber=%4d)}\n",
- bShowNumbers?i:-1,atomtypes->radius[i],atomtypes->vol[i],
- atomtypes->surftens[i],atomtypes->atomnumber[i]);
+ fprintf(fp,
+ "atomtype[%3d]={radius=%12.5e, volume=%12.5e, gb_radius=%12.5e, surftens=%12.5e, atomnumber=%4d, S_hct=%12.5e)}\n",
+ bShowNumbers?i:-1,atomtypes->radius[i],atomtypes->vol[i],
+ atomtypes->gb_radius[i],
+ atomtypes->surftens[i],atomtypes->atomnumber[i],atomtypes->S_hct[i]);
}
}
}
at->radius = NULL;
at->vol = NULL;
at->atomnumber = NULL;
+ at->gb_radius = NULL;
+ at->S_hct = NULL;
}
void init_groups(gmx_groups_t *groups)
sfree(at->atomname);
}
+void done_atomtypes(t_atomtypes *atype)
+{
+ atype->nr = 0;
+ sfree(atype->radius);
+ sfree(atype->vol);
+ sfree(atype->surftens);
+ sfree(atype->gb_radius);
+ sfree(atype->S_hct);
+}
+
void done_moltype(gmx_moltype_t *molt)
{
int f;
int i;
done_atom (&(top->atoms));
+
+ /* For GB */
+ done_atomtypes(&(top->atomtypes));
+
done_symtab(&(top->symtab));
done_block(&(top->cgs));
done_block(&(top->mols));
new->vsiten.n = round_check(old[0],1,ftype,"number of atoms");
new->vsiten.a = old[1];
break;
+ /*
+ case F_GB:
+ new->gb.st=old[0];
+ printf("new->gb.st=%g\n",new->gb.st);
+ break;
+ */
default:
gmx_fatal(FARGS,"unknown function type %d in %s line %d",
ftype,__FILE__,__LINE__);
real *radius; /* Radius for GBSA stuff */
real *vol; /* Effective volume for GBSA */
real *surftens; /* Surface tension with water, for GBSA */
+ real *gb_radius; /* Radius for Still model */
+ real *S_hct; /* Overlap factor for HCT model */
int *atomnumber; /* Atomic number, used for QM/MM */
} gpp_atomtype;
return ga->surftens[nt];
}
+real get_atomtype_gb_radius(int nt,t_atomtype at)
+{
+ gpp_atomtype *ga = (gpp_atomtype *) at;
+
+ if ((nt < 0) || (nt >= ga->nr))
+ return NOTSET;
+
+ return ga->gb_radius[nt];
+}
+
+real get_atomtype_S_hct(int nt,t_atomtype at)
+{
+ gpp_atomtype *ga = (gpp_atomtype *) at;
+
+ if ((nt < 0) || (nt >= ga->nr))
+ return NOTSET;
+
+ return ga->S_hct[nt];
+}
+
real get_atomtype_nbparam(int nt,int param,t_atomtype at)
{
gpp_atomtype *ga = (gpp_atomtype *) at;
gpp_atomtype *ga;
snew(ga,1);
+
+ ga->nr = 0;
+ ga->atom = NULL;
+ ga->atomname = NULL;
+ ga->nb = NULL;
+ ga->bondatomtype = NULL;
+ ga->radius = NULL;
+ ga->vol = NULL;
+ ga->surftens = NULL;
+ ga->atomnumber = NULL;
+ ga->gb_radius = NULL;
+ ga->S_hct = NULL;
return (t_atomtype ) ga;
}
+int
+set_atomtype_gbparam(t_atomtype at, int i,
+ real radius,real vol,real surftens,
+ real gb_radius, real S_hct)
+{
+ gpp_atomtype *ga = (gpp_atomtype *) at;
+
+ if ( (i < 0) || (i>= ga->nr))
+ return NOTSET;
+
+ ga->radius[i] = radius;
+ ga->vol[i] = vol;
+ ga->surftens[i] = surftens;
+ ga->gb_radius[i] = gb_radius;
+ ga->S_hct[i] = S_hct;
+
+ return i;
+}
+
+
int set_atomtype(int nt,t_atomtype at,t_symtab *tab,
t_atom *a,char *name,t_param *nb,
int bondatomtype,
- real radius,real vol,real surftens,int atomnumber)
+ real radius,real vol,real surftens,int atomnumber,
+ real gb_radius, real S_hct)
{
gpp_atomtype *ga = (gpp_atomtype *) at;
ga->vol[nt] = vol;
ga->surftens[nt] = surftens;
ga->atomnumber[nt] = atomnumber;
-
+ ga->gb_radius[nt] = gb_radius;
+ ga->S_hct[nt] = S_hct;
+
return nt;
}
int add_atomtype(t_atomtype at,t_symtab *tab,
t_atom *a,char *name,t_param *nb,
int bondatomtype,
- real radius,real vol,real surftens,real atomnumber)
+ real radius,real vol,real surftens,real atomnumber,
+ real gb_radius, real S_hct)
{
gpp_atomtype *ga = (gpp_atomtype *) at;
srenew(ga->vol,ga->nr);
srenew(ga->surftens,ga->nr);
srenew(ga->atomnumber,ga->nr);
-
+ srenew(ga->gb_radius,ga->nr);
+ srenew(ga->S_hct,ga->nr);
+
return set_atomtype(ga->nr-1,at,tab,a,name,nb,bondatomtype,radius,
- vol,surftens,atomnumber);
+ vol,surftens,atomnumber,gb_radius,S_hct);
}
void print_at (FILE * out, t_atomtype at)
sfree(ga->bondatomtype);
sfree(ga->radius);
sfree(ga->vol);
+ sfree(ga->gb_radius);
+ sfree(ga->S_hct);
sfree(ga->surftens);
sfree(ga->atomnumber);
ga->nr = 0;
(get_atomtype_radius(tli,at) == get_atomtype_radius(thistype,at)) &&
(get_atomtype_vol(tli,at) == get_atomtype_vol(thistype,at)) &&
(get_atomtype_surftens(tli,at) == get_atomtype_surftens(thistype,at)) &&
- (get_atomtype_atomnumber(tli,at) == get_atomtype_atomnumber(thistype,at));
+ (get_atomtype_atomnumber(tli,at) == get_atomtype_atomnumber(thistype,at)) &&
+ (get_atomtype_gb_radius(tli,at) == get_atomtype_gb_radius(thistype,at)) &&
+ (get_atomtype_S_hct(tli,at) == get_atomtype_S_hct(thistype,at));
}
if (bFound)
{
real *new_radius;
real *new_vol;
real *new_surftens;
+ real *new_gb_radius;
+ real *new_S_hct;
int *new_atomnumber;
ntype = get_atomtype_ntypes(at);
snew(new_vol,nat);
snew(new_surftens,nat);
snew(new_atomnumber,nat);
+ snew(new_gb_radius,nat);
+ snew(new_S_hct,nat);
/* We now have a list of unique atomtypes in typelist */
new_vol[i] = get_atomtype_vol(mi,at);
new_surftens[i] = get_atomtype_surftens(mi,at);
new_atomnumber[i] = get_atomtype_atomnumber(mi,at);
+ new_gb_radius[i] = get_atomtype_gb_radius(mi,at);
+ new_S_hct[i] = get_atomtype_S_hct(mi,at);
}
for(i=0; (i<nat*nat); i++) {
sfree(ga->vol);
sfree(ga->surftens);
sfree(ga->atomnumber);
+ sfree(ga->gb_radius);
+ sfree(ga->S_hct);
ga->radius = new_radius;
ga->vol = new_vol;
ga->surftens = new_surftens;
ga->atomnumber = new_atomnumber;
-
+ ga->gb_radius = new_gb_radius;
+ ga->S_hct = new_S_hct;
+
ga->nr=nat;
sfree(nbsnew);
snew(atomtypes->vol,ntype);
snew(atomtypes->surftens,ntype);
snew(atomtypes->atomnumber,ntype);
-
+ snew(atomtypes->gb_radius,ntype);
+ snew(atomtypes->S_hct,ntype);
for(i=0; i<ntype; i++) {
atomtypes->radius[i] = ga->radius[i];
atomtypes->vol[i] = ga->vol[i];
atomtypes->surftens[i] = ga->surftens[i];
atomtypes->atomnumber[i] = ga->atomnumber[i];
+ atomtypes->gb_radius[i] = ga->gb_radius[i];
+ atomtypes->S_hct[i] = ga->S_hct[i];
}
}
#include "gpp_atomtype.h"
#include "gpp_tomorse.h"
#include "mtop_util.h"
+#include "genborn.h"
static int rm_interactions(int ifunc,int nrmols,t_molinfo mols[])
{
t_atoms *confat;
int mb,mbs,i,nrmols,nmismatch;
char buf[STRLEN];
+ bool bGB=FALSE;
init_mtop(sys);
+
+ /* Set boolean for GB */
+ if(ir->implicit_solvent)
+ bGB=TRUE;
/* TOPOLOGY processing */
sys->name = do_top(bVerbose,topfile,topppfile,opts,bZero,&(sys->symtab),
plist,comb,reppow,fudgeQQ,
atype,&nrmols,&molinfo,ir,
- &nmolblock,&molblock);
+ &nmolblock,&molblock,bGB);
sys->nmolblock = 0;
snew(sys->molblock,nmolblock);
char fn[STRLEN],fnB[STRLEN],*mdparin;
int nerror,ntype;
bool bNeedVel,bGenVel;
- bool have_radius,have_vol,have_surftens;
+ bool have_radius,have_vol,have_surftens,have_gb_radius,have_S_hct;
bool have_atomnumber;
-
+ t_params *gb_plist = NULL;
+ gmx_genborn_t *born = NULL;
+
t_filenm fnm[] = {
{ efMDP, NULL, NULL, ffOPTRD },
{ efMDP, "-po", "mdout", ffWRITE },
}
}
- /* If we are doing GBSA, check that we got the parameters we need */
- have_radius=have_vol=have_surftens=TRUE;
- ntype = get_atomtype_ntypes(atype);
- for(i=0; (i<ntype); i++) {
- have_radius = have_radius && (get_atomtype_radius(i,atype) > 0);
- have_vol = have_vol && (get_atomtype_vol(i,atype) > 0);
- have_surftens = have_surftens && (get_atomtype_surftens(i,atype) >= 0);
+ /* If we are doing GBSA, check that we got the parameters we need
+ * This checking is to see if there are GBSA paratmeters for all
+ * atoms in the force field. To go around this for testing purposes
+ * comment out the nerror++ counter temporarliy
+ */
+ have_radius=have_vol=have_surftens=have_gb_radius=have_S_hct=TRUE;
+ for(i=0;i<get_atomtype_ntypes(atype);i++) {
+ have_radius=have_radius && (get_atomtype_radius(i,atype) > 0);
+ have_vol=have_vol && (get_atomtype_vol(i,atype) > 0);
+ have_surftens=have_surftens && (get_atomtype_surftens(i,atype) > 0);
+ have_gb_radius=have_gb_radius && (get_atomtype_gb_radius(i,atype) > 0);
+ have_S_hct=have_S_hct && (get_atomtype_S_hct(i,atype) > 0);
+ }
+ if(!have_radius && ir->implicit_solvent==eisGBSA) {
+ fprintf(stderr,"Can't do GB electrostatics; the forcefield is missing values for\n"
+ "atomtype radii, or they might be zero\n.");
+ /* nerror++; */
+ }
+ /*
+ if(!have_surftens && ir->implicit_solvent!=eisNO) {
+ fprintf(stderr,"Can't do implicit solvent; the forcefield is missing values\n"
+ " for atomtype surface tension\n.");
+ nerror++;
}
+ */
/* If we are doing QM/MM, check that we got the atom numbers */
have_atomnumber = TRUE;
- for (i=0; i<ntype; i++) {
+ for (i=0; i<get_atomtype_ntypes(atype); i++) {
have_atomnumber = have_atomnumber && (get_atomtype_atomnumber(i,atype) >= 0);
}
if (!have_atomnumber && ir->bQMMM)
ir->refcoord_scaling,ir->ePBC,
ir->posres_com,ir->posres_comB);
}
+
+ if(ir->implicit_solvent)
+ {
+ printf("Constructing Generalized Born topology...\n");
+ snew(gb_plist,1);
+ init_gb_plist(gb_plist);
+ snew(born,1);
+ generate_gb_topology(sys,plist,gb_plist,born);
+ }
+
nvsite = 0;
/* set parameters for virtual site construction (not for vsiten) */
if (bVerbose)
fprintf(stderr,"converting bonded parameters...\n");
+
+ ntype = get_atomtype_ntypes(atype);
convert_params(ntype, plist, mi, comb, reppow, fudgeQQ, sys);
if (debug)
pr_symtab(debug,0,"After convert_params",&sys->symtab);
+ /* Convert GB parameters to idef */
+ if(ir->implicit_solvent)
+ {
+ gmx_localtop_t *localtop;
+ localtop = gmx_mtop_generate_local_top(sys,ir);
+
+ convert_gb_params(&localtop->idef, F_GB, localtop->idef.ntypes, gb_plist,born);
+ }
+
/* set ptype to VSite for virtual sites */
for(mt=0; mt<sys->nmoltype; mt++) {
set_vsites_ptype(FALSE,&sys->moltype[mt]);
#include "mvdata.h"
#include "checkpoint.h"
#include "mtop_util.h"
+#include "genborn.h"
#ifdef GMX_MPI
#include <mpi.h>
bool bReadRNG,bReadEkin;
int list;
int nsteps_done;
-
+ gmx_genborn_t *born;
+
snew(inputrec,1);
snew(mtop,1);
opt2fn("-tableb",nfile,fnm),FALSE,pforce);
fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
+ /* Initialise GB-stuff */
+ if(fr->bGB)
+ init_gb(&born,cr,fr,inputrec,mtop,state->x,inputrec->rgbradii,inputrec->gb_algorithm);
+
/* Initialize QM-MM */
if(fr->bQMMM)
init_QMMMrec(cr,box,mtop,inputrec,fr);
gmx_fatal(FARGS,"Error %d initializing PME",status);
}
}
+
if (integrator[inputrec->eI].func == do_md) {
/* Turn on signal handling on all nodes */
nstepout,inputrec,mtop,
fcd,state,f,buf,
mdatoms,nrnb,wcycle,ed,fr,
+ born,
repl_ex_nst,repl_ex_seed,
cpt_period,max_hours,
Flags,
t_state *state_global,rvec f[],
rvec buf[],t_mdatoms *mdatoms,
t_nrnb *nrnb,gmx_wallcycle_t wcycle,
- gmx_edsam_t ed,t_forcerec *fr,
+ gmx_edsam_t ed,t_forcerec *fr, gmx_genborn_t *born,
int repl_ex_nst,int repl_ex_seed,
real cpt_period,real max_hours,
unsigned long Flags,
double t,t0,lam0;
bool bGStatEveryStep,bGStat,bCalcPres;
bool bNS,bSimAnn,bStopCM,bRerunMD,bNotLastFrame=FALSE,
- bFirstStep,bStateFromTPX,bLastStep;
+ bFirstStep,bStateFromTPX,bLastStep,bBornRadii;
bool bNEMD,do_ene,do_log,do_verbose,bRerunWarnNoV=TRUE,
bForceUpdate=FALSE,bX,bV,bF,bXTC,bCPT;
bool bMasterState;
bLastStep = TRUE;
}
+ /* Determine whether or not to update the Born radii if doing GB */
+ bBornRadii=bFirstStep;
+ if(ir->implicit_solvent && (step % ir->nstgbradii==0))
+ {
+ bBornRadii=TRUE;
+ }
+
do_log = do_per_step(step,ir->nstlog) || bFirstStep || bLastStep;
do_verbose = bVerbose && (step % stepout == 0 || bFirstStep || bLastStep);
if (shellfc) {
/* Now is the time to relax the shells */
count=relax_shell_flexcon(fplog,cr,bVerbose,bFFscan ? step+1 : step,
- ir,bNS,bStopCM,top,constr,enerd,fcd,
+ ir,bNS,bStopCM,top,top_global,constr,enerd,fcd,
state,f,buf,force_vir,mdatoms,
nrnb,wcycle,graph,groups,
- shellfc,fr,t,mu_tot,
+ shellfc,fr,born,bBornRadii,t,mu_tot,
state->natoms,&bConverged,vsite,
fp_field);
tcount+=count;
* This is parallellized as well, and does communication too.
* Check comments in sim_util.c
*/
- do_force(fplog,cr,ir,step,nrnb,wcycle,top,groups,
+ do_force(fplog,cr,ir,step,nrnb,wcycle,top,top_global,groups,
state->box,state->x,&state->hist,
f,buf,force_vir,mdatoms,enerd,fcd,
state->lambda,graph,
- fr,vsite,mu_tot,t,fp_field,ed,
+ fr,vsite,mu_tot,t,fp_field,ed,born,bBornRadii,
GMX_FORCE_STATECHANGED | (bNS ? GMX_FORCE_NS : 0) |
GMX_FORCE_ALLFORCES | (bCalcPres ? GMX_FORCE_VIRIAL : 0));
}
atoms->atom[i].qB = alpha;
atoms->atom[i].m = atoms->atom[i].mB = mm;
k = add_atomtype(atype,tab,&(atoms->atom[i]),type,param,
- atoms->atom[i].type,0,0,0,atomnr);
+ atoms->atom[i].type,0,0,0,atomnr,0,0);
}
atoms->atom[i].type = k;
atoms->atom[i].typeB = k;
ir->sc_power);
CHECK(ir->sc_alpha!=0 && ir->sc_power!=1 && ir->sc_power!=2);
}
+
+ if(ir->coulombtype==eelGB_NOTUSED)
+ {
+ ir->coulombtype==eelCUT;
+ ir->implicit_solvent=eisGBSA;
+ fprintf(stderr,"Note: Old option for generalized born electrostatics given:\n"
+ "Changing coulombtype from \"generalized-born\" to \"cut-off\" and instead\n"
+ "setting implicit_solvent value to \"GBSA\" in input section.\n");
+ }
+
+ if(ir->implicit_solvent==eisGBSA)
+ {
+ sprintf(err_buf,"With GBSA implicit solvent, rgbradii must be equal to rlist.");
+ CHECK(ir->rgbradii != ir->rlist);
+ }
}
static int str_nelem(char *str,int maxptr,char *ptr[])
RTYPE ("gb_obc_alpha", ir->gb_obc_alpha, 1.0);
RTYPE ("gb_obc_beta", ir->gb_obc_beta, 0.8);
RTYPE ("gb_obc_gamma", ir->gb_obc_gamma, 4.85);
+ RTYPE ("gb_dielectric_offset", ir->gb_dielectric_offset, 0.09);
+ EETYPE("sa_algorithm", ir->sa_algorithm, esa_names, nerror, TRUE);
CTYPE ("Surface tension (kJ/mol/nm^2) for the SA (nonpolar surface) part of GBSA");
CTYPE ("The default value (2.092) corresponds to 0.005 kcal/mol/Angstrom^2.");
RTYPE ("sa_surface_tension", ir->sa_surface_tension, 2.092);
sprintf(err_buf,
"You are using a plain Coulomb cut-off, which might produce artifacts.\n"
"You might want to consider using %s electrostatics.\n",
- EI_TPI(ir->eI) ? EELTYPE(eelRF) : EELTYPE(eelPME));
+ EELTYPE(eelPME));
warning_note(err_buf);
}
}
if ((buf != NULL) && (sscanf(buf,"%s%lf",name,&m) == 2)) {
a->m = m;
- add_atomtype(at,tab,a,name,nb,0,0,0,0,0);
+ add_atomtype(at,tab,a,name,nb, 0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0 );
fprintf(stderr,"\rAtomtype %d",nratt+1);
}
}
set_nec(&(necessary[d_angletypes]),d_atomtypes,d_none);
set_nec(&(necessary[d_dihedraltypes]),d_atomtypes,d_none);
set_nec(&(necessary[d_nonbond_params]),d_atomtypes,d_none);
+ set_nec(&(necessary[d_implicit_genborn_params]),d_atomtypes,d_none);
+ set_nec(&(necessary[d_implicit_surface_params]),d_atomtypes,d_none);
set_nec(&(necessary[d_moleculetype]),d_atomtypes,d_none);
set_nec(&(necessary[d_atoms]),d_moleculetype,d_none);
set_nec(&(necessary[d_vsites2]),d_atoms,d_none);
int *nmolblock,
gmx_molblock_t **molblock,
bool bFEP,
+ bool bGB,
bool bZero,
bool bVerbose)
{
push_molt(symtab,bi0,pline);
break;
*/
+
+ case d_implicit_genborn_params:
+ push_gb_params(atype,pline);
+ break;
+
+ case d_implicit_surface_params:
+ gmx_fatal(FARGS,"Implicit surface directive not supported yet.");
+ break;
+
case d_moleculetype: {
if (!bReadMolType) {
int ntype;
fprintf(stderr,"Generated %d of the %d 1-4 parameter combinations\n",ncombs-ncopy,ncombs);
free_nbparam(pair,ntype);
}
- /* Copy GBSA parameters to atomtype array */
+ /* Copy GBSA parameters to atomtype array? */
bReadMolType = TRUE;
}
t_molinfo **molinfo,
t_inputrec *ir,
int *nmolblock,
- gmx_molblock_t **molblock)
+ gmx_molblock_t **molblock,
+ bool bGB)
{
/* Tmpfile might contain a long path */
char *tmpfile;
symtab,atype,nrmols,molinfo,
plist,combination_rule,repulsion_power,
opts,fudgeQQ,nmolblock,molblock,
- ir->efep!=efepNO,bZero,bVerbose);
+ ir->efep!=efepNO,bGB,bZero,bVerbose);
if ((*combination_rule != eCOMB_GEOMETRIC) &&
(ir->vdwtype == evdwUSER)) {
warning("Using sigma/epsilon based combination rules with"
t_molinfo **molinfo,
t_inputrec *ir,
int *nmolblock,
- gmx_molblock_t **molblock);
+ gmx_molblock_t **molblock,
+ bool bGB);
/* This routine expects sys->molt[m].ilist to be of size F_NRE and ordered. */
char type[STRLEN],btype[STRLEN],ptype[STRLEN];
double m,q;
double c[MAXFORCEPARAM];
- double radius,vol,surftens;
+ double radius,vol,surftens,gb_radius,S_hct;
char tmpfield[12][100]; /* Max 12 fields of width 100 */
char errbuf[256];
t_atom *atom;
}
/* optional fields */
- surftens = -1;
- vol = 0;
- radius = 0;
- atomnr = -1;
-
+ surftens = -1;
+ vol = 0;
+ radius = 0;
+ gb_radius = 0;
+ atomnr = -1;
+ S_hct = 0;
+
switch (nb_funct) {
case F_LJ:
{
if ( have_bonded_type )
{
- nread = sscanf(line,"%s%s%d%lf%lf%s%lf%lf%lf%lf%lf",
+ nread = sscanf(line,"%s%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf",
type,btype,&atomnr,&m,&q,ptype,&c[0],&c[1],
- &radius,&vol,&surftens);
+ &radius,&vol,&surftens,&gb_radius);
if(nread < 8)
{
too_few();
else
{
/* have_atomic_number && !have_bonded_type */
- nread = sscanf(line,"%s%d%lf%lf%s%lf%lf%lf%lf%lf",
+ nread = sscanf(line,"%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf",
type,&atomnr,&m,&q,ptype,&c[0],&c[1],
- &radius,&vol,&surftens);
+ &radius,&vol,&surftens,&gb_radius);
if(nread < 7)
{
too_few();
if ( have_bonded_type )
{
/* !have_atomic_number && have_bonded_type */
- nread = sscanf(line,"%s%s%lf%lf%s%lf%lf%lf%lf%lf",
+ nread = sscanf(line,"%s%s%lf%lf%s%lf%lf%lf%lf%lf%lf",
type,btype,&m,&q,ptype,&c[0],&c[1],
- &radius,&vol,&surftens);
+ &radius,&vol,&surftens,&gb_radius);
if(nread < 7)
{
too_few();
else
{
/* !have_atomic_number && !have_bonded_type */
- nread = sscanf(line,"%s%lf%lf%s%lf%lf%lf%lf%lf",
+ nread = sscanf(line,"%s%lf%lf%s%lf%lf%lf%lf%lf%lf",
type,&m,&q,ptype,&c[0],&c[1],
- &radius,&vol,&surftens);
+ &radius,&vol,&surftens,&gb_radius);
if(nread < 6)
{
too_few();
{
if ( have_bonded_type )
{
- nread = sscanf(line,"%s%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf",
+ nread = sscanf(line,"%s%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf%lf",
type,btype,&atomnr,&m,&q,ptype,&c[0],&c[1],&c[2],
- &radius,&vol,&surftens);
+ &radius,&vol,&surftens,&gb_radius);
if(nread < 9)
{
too_few();
else
{
/* have_atomic_number && !have_bonded_type */
- nread = sscanf(line,"%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf",
+ nread = sscanf(line,"%s%d%lf%lf%s%lf%lf%lf%lf%lf%lf%lf",
type,&atomnr,&m,&q,ptype,&c[0],&c[1],&c[2],
- &radius,&vol,&surftens);
+ &radius,&vol,&surftens,&gb_radius);
if(nread < 8)
{
too_few();
if ( have_bonded_type )
{
/* !have_atomic_number && have_bonded_type */
- nread = sscanf(line,"%s%s%lf%lf%s%lf%lf%lf%lf%lf%lf",
+ nread = sscanf(line,"%s%s%lf%lf%s%lf%lf%lf%lf%lf%lf%lf",
type,btype,&m,&q,ptype,&c[0],&c[1],&c[2],
- &radius,&vol,&surftens);
+ &radius,&vol,&surftens,&gb_radius);
if(nread < 8)
{
too_few();
else
{
/* !have_atomic_number && !have_bonded_type */
- nread = sscanf(line,"%s%lf%lf%s%lf%lf%lf%lf%lf%lf",
+ nread = sscanf(line,"%s%lf%lf%s%lf%lf%lf%lf%lf%lf%lf",
type,&m,&q,ptype,&c[0],&c[1],&c[2],
- &radius,&vol,&surftens);
+ &radius,&vol,&surftens,&gb_radius);
if(nread < 7)
{
too_few();
sprintf(errbuf,"Overriding atomtype %s",type);
warning(errbuf);
if ((nr = set_atomtype(nr,at,symtab,atom,type,param,batype_nr,
- radius,vol,surftens,atomnr)) == NOTSET)
+ radius,vol,surftens,atomnr,gb_radius,S_hct)) == NOTSET)
gmx_fatal(FARGS,"Replacing atomtype %s failed",type);
}
else if ((nr = add_atomtype(at,symtab,atom,type,param,
batype_nr,radius,vol,
- surftens,atomnr)) == NOTSET)
+ surftens,atomnr,gb_radius,S_hct)) == NOTSET)
gmx_fatal(FARGS,"Adding atomtype %s failed",type);
else {
/* Add space in the non-bonded parameters matrix */
nbp->c[i] = cr[i];
}
+void
+push_gb_params (t_atomtype at, char *line)
+{
+ int nfield;
+ int i,n,k,found,gfound;
+ double radius,vol,surftens,gb_radius,S_hct;
+ char atypename[STRLEN];
+ char errbuf[STRLEN];
+
+ if( (nfield = sscanf(line,"%s%lf%lf%lf%lf%lf",atypename,&radius,&vol,&surftens,&gb_radius,&S_hct)) != 6)
+ {
+ sprintf(errbuf,"Too few gb parameters for type %s\n",atypename);
+ warning(errbuf);
+ }
+
+ /* Search for atomtype */
+ //printf("gb params for atomtype '%s'\n",atypename);
+ found = 0;
+ gfound = -1;
+ for(i=0;i<get_atomtype_ntypes(at) && !found;i++)
+ {
+ if(gmx_strncasecmp(atypename,get_atomtype_name(i,at),STRLEN-1)==0)
+ {
+ //printf("Found matching atomtype in topology: %s\n",get_atomtype_name(i,at));
+ found = i;
+ gfound = i;
+ //printf("found=%d\n",found);
+ }
+ }
+
+ if(gfound==-1)
+ {
+ printf("Couldn't find topology match for atomtype %s\n",atypename);
+ abort();
+ }
+
+ set_atomtype_gbparam(at,found,radius,vol,surftens,gb_radius,S_hct);
+}
+
static void push_atom_now(t_symtab *symtab,t_atoms *at,int atomnr,
int atomicnumber,
int type,char *ctype,int ptype,
for (i=0; (i<MAXFORCEPARAM); i++)
param.c[i] = 0.0;
- nr = add_atomtype(at,symtab,&atom,"decoupled",¶m,-1,0.0,0.0,0.0,0);
+ nr = add_atomtype(at,symtab,&atom,"decoupled",¶m,-1,0.0,0.0,0.0,0,0,0);
/* Add space in the non-bonded parameters matrix */
realloc_nb_params(at,nbparam,pair);
extern void push_nbt(directive d,t_nbparam **nbt,t_atomtype atype,
char *plines,int nb_funct);
+void
+push_gb_params(t_atomtype atype,
+ char *line);
+
extern void push_atom(t_symtab *symtab,
t_block *cgs,
t_atoms *at,
for (i=0; (i < natoms); i++) {
char buf[12];
sprintf(buf,"%4d",(i+1));
- add_atomtype(atype,&stab,a,buf,param,0,0,0,0,0);
+ add_atomtype(atype,&stab,a,buf,param,0,0,0,0,0,0,0);
}
print_bt(out,d,atype,ftype,fsubtype,plist,TRUE);
cmp_real(fp,"inputrec->epsilon_r",-1,ir1->epsilon_r,ir2->epsilon_r,ftol);
cmp_real(fp,"inputrec->epsilon_rf",-1,ir1->epsilon_rf,ir2->epsilon_rf,ftol);
cmp_real(fp,"inputrec->tabext",-1,ir1->tabext,ir2->tabext,ftol);
+ cmp_int(fp,"inputrec->implicit_solvent",-1,ir1->implicit_solvent,ir2->implicit_solvent);
cmp_int(fp,"inputrec->gb_algorithm",-1,ir1->gb_algorithm,ir2->gb_algorithm);
- cmp_real(fp,"inputrec->gb_epsilon_solvent",-1,ir1->gb_epsilon_solvent,ir2->gb_epsilon_solvent,ftol);
cmp_int(fp,"inputrec->nstgbradii",-1,ir1->nstgbradii,ir2->nstgbradii);
cmp_real(fp,"inputrec->rgbradii",-1,ir1->rgbradii,ir2->rgbradii,ftol);
cmp_real(fp,"inputrec->gb_saltconc",-1,ir1->gb_saltconc,ir2->gb_saltconc,ftol);
- cmp_int(fp,"inputrec->implicit_solvent",-1,ir1->implicit_solvent,ir2->implicit_solvent);
+ cmp_real(fp,"inputrec->gb_epsilon_solvent",-1,ir1->gb_epsilon_solvent,ir2->gb_epsilon_solvent,ftol);
cmp_real(fp,"inputrec->gb_obc_alpha",-1,ir1->gb_obc_alpha,ir2->gb_obc_alpha,ftol);
cmp_real(fp,"inputrec->gb_obc_beta",-1,ir1->gb_obc_beta,ir2->gb_obc_beta,ftol);
cmp_real(fp,"inputrec->gb_obc_gamma",-1,ir1->gb_obc_gamma,ir2->gb_obc_gamma,ftol);
- cmp_real(fp,"inputrec->sa_surface_tension",-1,ir1->sa_surface_tension,ir2->sa_surface_tension,ftol);
-
-
+ cmp_real(fp,"inputrec->gb_dielectric_offset",-1,ir1->gb_dielectric_offset,ir2->gb_dielectric_offset,ftol);
+ cmp_int(fp,"inputrec->sa_algorithm",-1,ir1->sa_algorithm,ir2->sa_algorithm);
+ cmp_real(fp,"inputrec->sa_surface_tension",-1,ir1->sa_surface_tension,ir2->sa_surface_tension,ftol);
+
cmp_int(fp,"inputrec->eDispCorr",-1,ir1->eDispCorr,ir2->eDispCorr);
cmp_real(fp,"inputrec->shake_tol",-1,ir1->shake_tol,ir2->shake_tol,ftol);
cmp_int(fp,"inputrec->efep",-1,ir1->efep,ir2->efep);
force.c ghat.c init.c \
mdatom.c mdebin.c minimize.c \
mvxvf.c ns.c nsgrid.c \
- perf_est.c \
+ perf_est.c genborn.c \
+ genborn_sse.c genborn_sse.h \
pme.c pme_pp.c pppm.c \
partdec.c pull.c pullutil.c \
rf_util.c shakef.c sim_util.c \
int i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
bool bCheckDLB,bTurnOnDLB,bLogLoad,bRedist,bSortCG;
ivec ncells_old,np;
-
+
dd = cr->dd;
comm = dd->comm;
{
/* Initialize the ns grid */
copy_ivec(fr->ns.grid->n,ncells_old);
- grid_first(fplog,fr->ns.grid,dd,fr->ePBC,state_local->box,fr->rlistlong,
- dd->ncg_home);
+ grid_first(fplog,fr->ns.grid,dd,fr->ePBC,state_local->box,fr->rlistlong,dd->ncg_home);
if (!bMasterState &&
fr->ns.grid->n[XX] == ncells_old[XX] &&
fr->ns.grid->n[YY] == ncells_old[YY] &&
{
set_bham_b_max(fp,fr,mtop);
}
-
+
+ fr->bGB = (ir->implicit_solvent == eisGBSA);
+
/* Copy the GBSA data (radius, volume and surftens for each
* atomtype) from the topology atomtype section to forcerec.
*/
snew(fr->atype_radius,fr->ntype);
snew(fr->atype_vol,fr->ntype);
snew(fr->atype_surftens,fr->ntype);
+ snew(fr->atype_gb_radius,fr->ntype);
+ snew(fr->atype_S_hct,fr->ntype);
+
if (mtop->atomtypes.nr > 0)
{
for(i=0;i<fr->ntype;i++)
fr->atype_vol[i] = mtop->atomtypes.vol[i];
for(i=0;i<fr->ntype;i++)
fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
+ for(i=0;i<fr->ntype;i++)
+ fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
+ for(i=0;i<fr->ntype;i++)
+ fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
}
-
+
/* Set the charge scaling */
if (fr->epsilon_r != 0)
fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
rvec f[],
gmx_enerdata_t *enerd,
t_fcdata *fcd,
+ gmx_mtop_t *mtop,
+ gmx_genborn_t *born,
+ t_atomtypes *atype,
+ bool bBornRadii,
matrix box,
- real lambda, t_graph *graph,
+ real lambda,
+ t_graph *graph,
t_blocka *excl,
rvec mu_tot[],
int flags,
rvec box_size;
real dvdlambda,Vsr,Vlr,Vcorr=0,vdip,vcharge;
t_pbc pbc;
+ real dvdgb;
+
#ifdef GMX_MPI
double t0=0.0,t1,t2,t3; /* time measurement for coarse load balancing */
#endif
enerd->term[F_DVDL] += dvdlambda;
}
+ /* Calculate the Born radii */
+ if (ir->implicit_solvent && bBornRadii)
+ calc_gb_rad(cr,fr,ir,md->nr, idef->il[F_GB].nr,
+ mtop,atype,x,f,&(fr->gblist),born,md);
+
where();
do_nonbonded(cr,fr,x,f,md,
fr->bBHAM ?
flags & GMX_FORCE_FORCES);
where();
+ /* If we are doing GB, calculate bonded forces and apply corrections
+ * to the solvation derivatives */
+ if (ir->implicit_solvent) {
+ dvdgb = calc_gb_forces(cr,md,born,mtop,atype,idef->il[F_GB].nr,x,f,fr,idef->il[F_GB].iatoms,ir->gb_algorithm, bBornRadii);
+ enerd->term[F_GB]+=dvdgb;
+ }
+
#ifdef GMX_MPI
if (TAKETIME)
{
GMX_MPE_LOG(ev_calc_bonds_start);
calc_bonds(fplog,cr->ms,
idef,x,hist,f,fr,&pbc,graph,enerd,nrnb,lambda,md,fcd,
- DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL,
+ DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born,
fr->bSepDVDL && do_per_step(step,ir->nstlog),step);
debug_gmx();
GMX_MPE_LOG(ev_calc_bonds_finish);
--- /dev/null
+/*
+ * $Id$
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2008, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "genborn.h"
+#include "vec.h"
+#include "grompp.h"
+#include "pdbio.h"
+#include "names.h"
+#include "physics.h"
+#include "partdec.h"
+#include "network.h"
+#include "gmx_fatal.h"
+#include "mtop_util.h"
+
+#ifdef GMX_MPI
+#include "mpi.h"
+#endif
+
+#if ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) )
+/* x86 SSE intrinsics implementations of selected generalized born routines */
+#include "genborn_sse.h"
+#endif
+
+/* Still parameters - make sure to edit in genborn_sse.c too if you change these! */
+#define STILL_P1 0.073*0.1 /* length */
+#define STILL_P2 0.921*0.1*CAL2JOULE /* energy*length */
+#define STILL_P3 6.211*0.1*CAL2JOULE /* energy*length */
+#define STILL_P4 15.236*0.1*CAL2JOULE
+#define STILL_P5 1.254
+
+#define STILL_P5INV (1.0/STILL_P5)
+#define STILL_PIP5 (M_PI*STILL_P5)
+
+
+
+int init_gb_nblist(int natoms, t_nblist *nl)
+{
+ nl->maxnri = natoms*4;
+ nl->maxnrj = 0;
+ nl->maxlen = 0;
+ nl->nri = natoms;
+ nl->nrj = 0;
+ nl->iinr = NULL;
+ nl->gid = NULL;
+ nl->shift = NULL;
+ nl->jindex = NULL;
+ /*nl->nltype = nltype;*/
+
+ srenew(nl->iinr, nl->maxnri);
+ srenew(nl->gid, nl->maxnri);
+ srenew(nl->shift, nl->maxnri);
+ srenew(nl->jindex, nl->maxnri+1);
+
+ nl->jindex[0] = 0;
+
+ return 0;
+}
+
+int print_nblist(int natoms, t_nblist *nl)
+{
+ int i,k,ai,aj,nj0,nj1;
+
+ printf("genborn.c: print_nblist, natoms=%d\n",natoms);
+ for(i=0;i<natoms;i++)
+ {
+ ai=nl->iinr[i];
+ nj0=nl->jindex[i];
+ nj1=nl->jindex[i+1];
+ /*printf("ai=%d, nj0=%d, nj1=%d\n",ai,nj0,nj1);*/
+ for(k=nj0;k<nj1;k++)
+ {
+ aj=nl->jjnr[k];
+ printf("ai=%d, aj=%d\n",ai,aj);
+ }
+ }
+
+ return 0;
+}
+
+void fill_log_table(const int n, real *table)
+{
+ float numlog;
+ int i;
+ int *const exp_ptr=((int*)&numlog);
+ int x = *exp_ptr;
+
+ x=0x3F800000;
+ *exp_ptr = x;
+
+ int incr = 1 << (23-n);
+ int p=pow(2,n);
+
+ for(i=0;i<p;++i)
+ {
+ table[i]=log2(numlog);
+ x+=incr;
+ *exp_ptr=x;
+ }
+}
+
+
+real table_log(float val, const real *table, const int n)
+{
+ int *const exp_ptr = ((int*)&val);
+ int x = *exp_ptr;
+ const int log_2 = ((x>>23) & 255) - 127;
+ x &= 0x7FFFFF;
+ x = x >> (23-n);
+ val = table[x];
+ return ((val+log_2)*0.69314718);
+}
+
+void gb_pd_send(t_commrec *cr, real *send_data, int nr)
+{
+#ifdef GMX_MPI
+ int i,cur;
+ int *index,*sendc,*disp;
+
+ snew(sendc,cr->nnodes);
+ snew(disp,cr->nnodes);
+
+ index = pd_index(cr);
+ cur = cr->nodeid;
+
+ /* Setup count/index arrays */
+ for(i=0;i<cr->nnodes;i++)
+ {
+ sendc[i] = index[i+1]-index[i];
+ disp[i] = index[i];
+ }
+
+ /* Do communication */
+ MPI_Gatherv(send_data+index[cur],sendc[cur],GMX_MPI_REAL,send_data,sendc,disp,GMX_MPI_REAL,0,cr->mpi_comm_mygroup);
+ MPI_Bcast(send_data,nr,GMX_MPI_REAL,0,cr->mpi_comm_mygroup);
+
+#endif
+}
+
+
+int init_gb_plist(t_params *p_list)
+{
+ p_list->nr = 0;
+ p_list->param = NULL;
+
+ return 0;
+}
+
+
+static void assign_gb_param(t_functype ftype,t_iparams *new,
+ real old[MAXFORCEPARAM],int comb,real reppow)
+{
+ int i,j;
+
+ /* Set to zero */
+ for(j=0; (j<MAXFORCEPARAM); j++)
+ new->generic.buf[j]=0.0;
+
+ switch (ftype) {
+ case F_GB:
+ new->gb.c6A=old[0];
+ new->gb.c12A=old[1];
+ new->gb.c6B=old[2];
+ new->gb.c12B=old[3];
+ new->gb.sar=old[4];
+ new->gb.st=old[5];
+ new->gb.pi=old[6];
+ new->gb.gbr=old[7];
+ new->gb.bmlt=old[8];
+ break;
+ default:
+ gmx_fatal(FARGS,"unknown function type %d in %s line %d",
+ ftype,__FILE__,__LINE__);
+ }
+}
+
+static void
+append_gb_interaction(t_ilist *ilist,
+ int type,int nral,atom_id a[MAXATOMLIST])
+{
+ int i,where1;
+
+ where1 = ilist->nr;
+ ilist->nr += nral+1;
+
+ ilist->iatoms[where1++]=type;
+ for (i=0; (i<nral); i++)
+ {
+ ilist->iatoms[where1++]=a[i];
+ }
+}
+
+
+static int
+enter_gb_params(t_idef *idef, t_functype ftype,
+ real forceparams[MAXFORCEPARAM],int comb,real reppow,
+ int start,bool bAppend)
+{
+ t_iparams new;
+ int type;
+
+ assign_gb_param(ftype,&new,forceparams,comb,reppow);
+ if (!bAppend) {
+ for (type=start; (type<idef->ntypes); type++) {
+ if (idef->functype[type]==ftype) {
+ if (memcmp(&new,&idef->iparams[type],(size_t)sizeof(new)) == 0)
+ return type;
+ }
+ }
+ }
+ else
+ type=idef->ntypes;
+ if (debug)
+ fprintf(debug,"copying new to idef->iparams[%d] (ntypes=%d)\n",
+ type,idef->ntypes);
+ memcpy(&idef->iparams[type],&new,(size_t)sizeof(new));
+
+ idef->ntypes++;
+ idef->functype[type]=ftype;
+
+ return type;
+}
+
+int init_gb_still(t_commrec *cr, t_forcerec *fr, t_atomtypes *atype, t_idef *idef, t_atoms *atoms, gmx_genborn_t *born,int natoms)
+{
+
+ int i,j,i1,i2,k,m,nbond,nang,ia,ib,ic,id,nb,idx,idx2,at;
+ int iam,ibm;
+ int at0,at1;
+ real length,angle;
+ real r,ri,rj,ri2,ri3,rj2,r2,r3,r4,rk,ratio,term,h,doffset,electric;
+ real p1,p2,p3,factor,cosine,rab,rbc;
+
+ real vsol[natoms];
+ real gp[natoms];
+
+ if(PAR(cr))
+ {
+ pd_at_range(cr,&at0,&at1);
+
+ for(i=0;i<natoms;i++)
+ vsol[i]=gp[i]=0;
+ }
+ else
+ {
+ at0=0;
+ at1=natoms;
+ }
+
+ doffset = born->gb_doffset;
+
+ for(i=0;i<natoms;i++)
+ born->gpol[i]=born->vsolv[i]=0;
+
+ /* Compute atomic solvation volumes for Still method */
+ for(i=0;i<natoms;i++)
+ {
+ ri=atype->gb_radius[atoms->atom[i].type];
+ r3=ri*ri*ri;
+ born->vsolv[i]=(4*M_PI/3)*r3;
+ }
+
+ for(j=0;j<born->n12*3;j+=3)
+ {
+ m=idef->il[F_GB].iatoms[j];
+ ia=idef->il[F_GB].iatoms[j+1];
+ ib=idef->il[F_GB].iatoms[j+2];
+
+ r=1.01*idef->iparams[m].gb.st;
+
+ ri = atype->gb_radius[atoms->atom[ia].type];
+ rj = atype->gb_radius[atoms->atom[ib].type];
+
+ ri2 = ri*ri;
+ ri3 = ri2*ri;
+ rj2 = rj*rj;
+
+ ratio = (rj2-ri2-r*r)/(2*ri*r);
+ h = ri*(1+ratio);
+ term = (M_PI/3.0)*h*h*(3.0*ri-h);
+
+ if(PAR(cr))
+ {
+ vsol[ia]+=term;
+ }
+ else
+ {
+ born->vsolv[ia] -= term;
+ }
+
+ ratio = (ri2-rj2-r*r)/(2*rj*r);
+ h = rj*(1+ratio);
+ term = (M_PI/3.0)*h*h*(3.0*rj-h);
+
+ if(PAR(cr))
+ {
+ vsol[ib]+=term;
+ }
+ else
+ {
+ born->vsolv[ib] -= term;
+ }
+ }
+
+ if(PAR(cr))
+ {
+ gmx_sum(natoms,vsol,cr);
+ for(i=0;i<natoms;i++)
+ born->vsolv[i]=born->vsolv[i]-vsol[i];
+ }
+
+ /* Get the self-, 1-2 and 1-3 polarization for analytical Still method */
+ /* Self */
+ for(j=0;j<natoms;j++)
+ {
+ if(born->vs[j]==1)
+ born->gpol[j]=-0.5*ONE_4PI_EPS0/(atype->gb_radius[atoms->atom[j].type]-doffset+STILL_P1);
+ }
+
+ /* 1-2 */
+ for(j=0;j<born->n12*3;j+=3)
+ {
+ m=idef->il[F_GB].iatoms[j];
+ ia=idef->il[F_GB].iatoms[j+1];
+ ib=idef->il[F_GB].iatoms[j+2];
+
+ r=idef->iparams[m].gb.st;
+
+ r4=r*r*r*r;
+
+ if(PAR(cr))
+ {
+ gp[ia]+=STILL_P2*born->vsolv[ib]/r4;
+ gp[ib]+=STILL_P2*born->vsolv[ia]/r4;
+ }
+ else
+ {
+ born->gpol[ia]=born->gpol[ia]+STILL_P2*born->vsolv[ib]/r4;
+ born->gpol[ib]=born->gpol[ib]+STILL_P2*born->vsolv[ia]/r4;
+ }
+ }
+
+ /* 1-3 */
+ for(j=born->n12*3;j<born->n12*3+born->n13*3;j+=3)
+ {
+ m=idef->il[F_GB].iatoms[j];
+ ia=idef->il[F_GB].iatoms[j+1];
+ ib=idef->il[F_GB].iatoms[j+2];
+
+ r=idef->iparams[m].gb.st;
+ r4=r*r*r*r;
+
+ if(PAR(cr))
+ {
+ gp[ia]+=STILL_P3*born->vsolv[ib]/r4;
+ gp[ib]+=STILL_P3*born->vsolv[ia]/r4;
+ }
+ else
+ {
+ born->gpol[ia]=born->gpol[ia]+STILL_P3*born->vsolv[ib]/r4;
+ born->gpol[ib]=born->gpol[ib]+STILL_P3*born->vsolv[ia]/r4;
+ }
+ }
+
+ if(PAR(cr))
+ {
+ gmx_sum(natoms,gp,cr);
+ for(i=0;i<natoms;i++)
+ born->gpol[i]=born->gpol[i]+gp[i];
+ }
+ /*
+ real vsum, gsum;
+ vsum=0; gsum=0;
+
+ for(i=0;i<natoms;i++)
+ {
+
+ printf("final_init: id=%d, %s: v=%g, v_t=%g, g=%15.15f, g_t=%15.15f\n",
+ cr->nodeid,
+ *(atoms->atomname[i]),
+ born->vsolv[i],
+ born->vsolv[i]*1000,
+ born->gpol[i],
+ born->gpol[i]/CAL2JOULE);
+
+ vsum=vsum+(born->vsolv[i]*1000);
+ gsum=gsum+(born->gpol[i]/CAL2JOULE);
+ }
+
+ printf("SUM: Vtot=%15.15f, Gtot=%15.15f\n",vsum,gsum);
+ */
+ /*exit(1);*/
+
+ return 0;
+}
+
+
+
+#define LOG_TABLE_ACCURACY 15 /* Accuracy of the table logarithm */
+
+
+/* Initialize all GB datastructs and compute polarization energies */
+int init_gb(gmx_genborn_t **p_born,t_commrec *cr, t_forcerec *fr, t_inputrec *ir,
+ gmx_mtop_t *mtop, rvec x[], real rgbradii, int gb_algorithm)
+{
+ int i,j,m,ai,aj,jj,nalloc;
+ double rai,sk,p;
+ gmx_genborn_t *born;
+ int natoms;
+ t_atoms atoms;
+ gmx_localtop_t *localtop;
+ real doffset;
+
+ natoms = mtop->natoms;
+ atoms = gmx_mtop_global_atoms(mtop);
+ localtop = gmx_mtop_generate_local_top(mtop,ir);
+
+ snew(born,1);
+ *p_born = born;
+
+ snew(born->S_hct,natoms);
+ snew(born->drobc,natoms);
+
+ snew(fr->invsqrta,natoms);
+ snew(fr->dvda,natoms);
+ snew(fr->dadx,natoms*natoms);
+
+ snew(born->gpol,natoms);
+ snew(born->vsolv,natoms);
+ snew(born->bRad,natoms);
+
+ /* snew(born->asurf,natoms); */
+ /* snew(born->dasurf,natoms); */
+
+ snew(born->vs,natoms);
+ snew(born->param, natoms);
+
+ nalloc=0;
+
+ for(i=0;i<natoms;i++)
+ nalloc+=i;
+
+ init_gb_nblist(natoms, &(fr->gblist));
+
+ snew(fr->gblist.jjnr,nalloc*2);
+
+ born->n12=0;
+ born->n13=0;
+ born->n14=0;
+
+ /* Do the Vsites exclusions (if any) */
+ for(i=0;i<natoms;i++)
+ {
+ jj = atoms.atom[i].type;
+ born->vs[i]=1;
+
+ if(C6(fr->nbfp,fr->ntype,jj,jj)==0 && C12(fr->nbfp,fr->ntype,jj,jj)==0 && atoms.atom[i].q==0)
+ born->vs[i]=0;
+ }
+
+ for(i=0;i<F_NRE;i++)
+ {
+ if(IS_ANGLE(i))
+ {
+ born->n13+=localtop->idef.il[i].nr/(1+NRAL(i));
+ }
+
+ if(IS_CHEMBOND(i))
+ {
+ switch(i)
+ {
+ case F_BONDS:
+ case F_CONNBONDS:
+ case F_CONSTR:
+
+ for(j=0;j<localtop->idef.il[i].nr;)
+ {
+ m=localtop->idef.il[i].iatoms[j++];
+ ai=localtop->idef.il[i].iatoms[j++];
+ aj=localtop->idef.il[i].iatoms[j++];
+
+ if(born->vs[ai]==1 && born->vs[aj]==1)
+ {
+ born->n12++;
+ }
+ }
+
+ break;
+ }
+ }
+ }
+
+ born->n14=localtop->idef.il[F_LJ14].nr/(1+NRAL(F_LJ14));
+
+ /* If Still model, initialise the polarisation energies */
+ if(gb_algorithm==egbSTILL)
+ init_gb_still(cr, fr,&(mtop->atomtypes), &(localtop->idef), &atoms, born, natoms);
+
+ /* Copy algorithm parameters from inputrecord to local structure */
+ born->obc_alpha = ir->gb_obc_alpha;
+ born->obc_beta = ir->gb_obc_beta;
+ born->obc_gamma = ir->gb_obc_gamma;
+ born->gb_doffset = ir->gb_dielectric_offset;
+
+ doffset = born->gb_doffset;
+
+ /* If HCT/OBC, precalculate the sk*atype->S_hct factors */
+ if(gb_algorithm==egbHCT || gb_algorithm==egbOBC)
+ {
+ for(i=0;i<natoms;i++)
+ {
+ if(born->vs[i]==1)
+ {
+ rai = mtop->atomtypes.gb_radius[atoms.atom[i].type]-doffset;
+ sk = rai * mtop->atomtypes.S_hct[atoms.atom[i].type];
+ born->param[i] = sk;
+ }
+ else
+ {
+ born->param[i] = 0;
+ }
+ }
+ }
+
+ /* Init the logarithm table */
+ p=pow(2,LOG_TABLE_ACCURACY);
+ snew(born->log_table, p);
+
+ fill_log_table(LOG_TABLE_ACCURACY, born->log_table);
+
+ if(PAR(cr))
+ {
+ snew(born->work,natoms);
+ }
+ else
+ {
+ born->work = NULL;
+ }
+
+ return 0;
+}
+
+int generate_gb_topology(gmx_mtop_t *mtop, t_params *plist, t_params *gb_plist, gmx_genborn_t *born)
+{
+ int i,j,k,type,m,a1,a2,a3,a4,idx,nral,maxtypes,start,comb;
+ int n12,n13,n14;
+ double p1,p2,p3,cosine,r2,rab,rbc;
+ int natoms;
+ bl_t *bl;
+ bonds_t *bonds;
+ t_atoms atoms;
+ real doffset;
+
+
+ doffset = born->gb_doffset;
+ natoms = mtop->natoms;
+ bl=(bl_t *) malloc(sizeof(bl_t)*natoms);
+ snew(bonds,natoms);
+ atoms = gmx_mtop_global_atoms(mtop);
+
+ /* To keep the compiler happy */
+ rab=rbc=0;
+
+ for(i=0;i<F_NRE;i++)
+ {
+ if(plist[i].nr>0)
+ {
+ gb_plist->nr+=plist[i].nr;
+ }
+ }
+
+ snew(gb_plist->param,gb_plist->nr);
+
+ p1=STILL_P1;
+ p2=STILL_P2;
+ p3=STILL_P3;
+
+ idx=0;
+ n12=0;
+ n13=0;
+ n14=0;
+
+ for(i=0;i<F_NRE;i++)
+ {
+ if(IS_CHEMBOND(i))
+ {
+ switch(i)
+ {
+ case F_BONDS:
+ case F_CONNBONDS:
+ case F_CONSTR:
+
+ for(j=0;j<plist[i].nr; j++)
+ {
+ a1=plist[i].param[j].a[0];
+ a2=plist[i].param[j].a[1];
+
+ if(atoms.atom[a1].q!=0 && atoms.atom[a2].q!=0)
+ {
+ bl[a1].length[a2]=plist[i].param[j].c[0];
+ bl[a2].length[a1]=plist[i].param[j].c[0];
+
+ bonds[a1].bond[bonds[a1].nbonds]=a2;
+ bonds[a1].nbonds++;
+ bonds[a2].bond[bonds[a2].nbonds]=a1;
+ bonds[a2].nbonds++;
+
+ gb_plist->param[idx].a[0]=a1;
+ gb_plist->param[idx].a[1]=a2;
+
+ /* LJ parameters */
+ gb_plist->param[idx].c[0]=-1;
+ gb_plist->param[idx].c[1]=-1;
+ gb_plist->param[idx].c[2]=-1;
+ gb_plist->param[idx].c[3]=-1;
+
+ /* GBSA parameters */
+ gb_plist->param[idx].c[4]=mtop->atomtypes.radius[atoms.atom[a1].type]+mtop->atomtypes.radius[atoms.atom[a2].type];
+ gb_plist->param[idx].c[5]=plist[i].param[j].c[0];
+ gb_plist->param[idx].c[6]=p2;
+ gb_plist->param[idx].c[7]=mtop->atomtypes.gb_radius[atoms.atom[a1].type]+mtop->atomtypes.gb_radius[atoms.atom[a2].type];
+ gb_plist->param[idx].c[8]=0.8875;
+ n12++;
+ idx++;
+ }
+ }
+ break;
+
+ case F_G96BONDS:
+ case F_MORSE:
+ case F_CUBICBONDS:
+ case F_HARMONIC:
+ case F_FENEBONDS:
+ case F_TABBONDS:
+ case F_TABBONDSNC:
+ case F_POLARIZATION:
+ case F_VSITE2:
+ case F_VSITE3:
+ case F_VSITE3FD:
+ case F_VSITE3FAD:
+ case F_VSITE3OUT:
+ case F_VSITE4FD:
+ case F_VSITE4FDN:
+ break;
+
+
+ default:
+ gmx_fatal(FARGS,"generate_gb_topology, F_BONDS");
+
+ }
+ }
+ }
+
+ for(i=0;i<F_NRE;i++)
+ {
+ if(IS_ANGLE(i))
+ {
+ switch(i)
+ {
+ case F_ANGLES:
+
+ for(j=0;j<plist[i].nr; j++)
+ {
+ a1=plist[i].param[j].a[0];
+ a2=plist[i].param[j].a[1];
+ a3=plist[i].param[j].a[2];
+
+ gb_plist->param[idx].a[0]=a1;
+ gb_plist->param[idx].a[1]=a3;
+
+ /* LJ parameters */
+ gb_plist->param[idx].c[0]=-1;
+ gb_plist->param[idx].c[1]=-1;
+ gb_plist->param[idx].c[2]=-1;
+ gb_plist->param[idx].c[3]=-1;
+
+ /* GBSA parameters */
+ gb_plist->param[idx].c[4]=mtop->atomtypes.radius[atoms.atom[a1].type]+mtop->atomtypes.radius[atoms.atom[a3].type];
+
+ for(k=0;k<bonds[a2].nbonds;k++)
+ {
+ if(bonds[a2].bond[k]==a1)
+ {
+ rab=bl[a2].length[a1];
+ }
+
+ else if(bonds[a2].bond[k]==a3)
+ {
+ rbc=bl[a2].length[a3];
+ }
+
+ }
+
+ cosine=cos(plist[i].param[j].c[0]/RAD2DEG);
+ r2=rab*rab+rbc*rbc-(2*rab*rbc*cosine);
+ gb_plist->param[idx].c[5]=sqrt(r2);
+ gb_plist->param[idx].c[6]=p3;
+ gb_plist->param[idx].c[7]=mtop->atomtypes.gb_radius[atoms.atom[a1].type]+mtop->atomtypes.gb_radius[atoms.atom[a3].type];
+ gb_plist->param[idx].c[8]=0.3516;
+
+ n13++;
+ idx++;
+ }
+ break;
+
+ case F_G96ANGLES:
+ case F_CONSTR:
+ case F_UREY_BRADLEY:
+ case F_QUARTIC_ANGLES:
+ case F_TABANGLES:
+ break;
+
+ default:
+ gmx_fatal(FARGS,"generate_gb_topology, F_ANGLES");
+
+ }
+ }
+ }
+
+ for(i=0;i<plist[F_LJ14].nr; i++)
+ {
+ a1=plist[F_LJ14].param[i].a[0];
+ a2=plist[F_LJ14].param[i].a[1];
+
+ gb_plist->param[idx].a[0]=a1;
+ gb_plist->param[idx].a[1]=a2;
+
+ gb_plist->param[idx].c[0]=-1;
+ gb_plist->param[idx].c[1]=-1;
+ gb_plist->param[idx].c[2]=-1;
+ gb_plist->param[idx].c[3]=-1;
+
+ /* GBSA parameters */
+ gb_plist->param[idx].c[4]=mtop->atomtypes.radius[atoms.atom[a1].type]+mtop->atomtypes.radius[atoms.atom[a2].type];
+ gb_plist->param[idx].c[5]=-1;
+ gb_plist->param[idx].c[6]=p3;
+ gb_plist->param[idx].c[7]=mtop->atomtypes.gb_radius[atoms.atom[a1].type]+mtop->atomtypes.gb_radius[atoms.atom[a2].type];
+ gb_plist->param[idx].c[8]=0.3516;
+ idx++;
+ n14++;
+ }
+
+ gb_plist->nr=n12+n13+n14;
+ born->n12=n12;
+ born->n13=n13;
+ born->n14=n14;
+
+ return 0;
+
+}
+
+int convert_gb_params(t_idef *idef, t_functype ftype, int start, t_params *gb_plist, gmx_genborn_t *born)
+{
+ int k,nral,maxtypes,comb,type;
+ real reppow;
+
+ nral=NRAL(F_GB);
+
+ /* pl->nr is the number of gb interactions, so we need to allocate nr*3 elements in iatoms */
+ snew(idef->il[F_GB].iatoms,gb_plist->nr*3);
+
+ maxtypes=idef->ntypes;
+ comb=3;
+ reppow=12;
+
+ for(k=0;k<gb_plist->nr;k++)
+ {
+ if(maxtypes<=idef->ntypes)
+ {
+ maxtypes+=1000;
+ srenew(idef->functype,maxtypes);
+ srenew(idef->iparams,maxtypes);
+ }
+
+ type=enter_gb_params(idef,F_GB,gb_plist->param[k].c,comb,reppow,start,0);
+ append_gb_interaction(&idef->il[F_GB],type,NRAL(F_GB),gb_plist->param[k].a);
+
+ }
+
+ printf("# %10s: %d\n","GB-12",born->n12);
+ printf("# %10s: %d\n","GB-13",born->n13);
+ printf("# %10s: %d\n","GB-14",born->n14);
+
+ return 0;
+
+}
+
+
+
+int calc_gb_rad_still(t_commrec *cr, t_forcerec *fr,int natoms, gmx_mtop_t *mtop,
+ const t_atomtypes *atype, rvec x[], t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
+{
+ int i,k,n,nj0,nj1,ai,aj,type;
+ real gpi,dr,dr2,dr4,idr4,rvdw,ratio,ccf,theta,term,rai,raj;
+ real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11;
+ real rinv,idr2,idr6,vaj,dccf,cosq,sinq,prod,gpi2;
+ real factor;
+
+ factor=0.5*ONE_4PI_EPS0;
+
+ n=0;
+
+ for(i=0;i<natoms;i++)
+ born->bRad[i]=fr->invsqrta[i]=1;
+
+ for(i=0;i<nl->nri;i++ )
+ {
+ ai = i;
+
+ nj0 = nl->jindex[ai];
+ nj1 = nl->jindex[ai+1];
+
+ gpi = born->gpol[ai];
+ rai = mtop->atomtypes.gb_radius[md->typeA[ai]];
+
+ ix1 = x[ai][0];
+ iy1 = x[ai][1];
+ iz1 = x[ai][2];
+
+ for(k=nj0;k<nj1;k++)
+ {
+ aj = nl->jjnr[k];
+
+ jx1 = x[aj][0];
+ jy1 = x[aj][1];
+ jz1 = x[aj][2];
+
+ dx11 = ix1-jx1;
+ dy11 = iy1-jy1;
+ dz11 = iz1-jz1;
+
+ dr2 = dx11*dx11+dy11*dy11+dz11*dz11;
+ rinv = invsqrt(dr2);
+ idr2 = rinv*rinv;
+ idr4 = idr2*idr2;
+ idr6 = idr4*idr2;
+
+ raj = mtop->atomtypes.gb_radius[md->typeA[aj]];
+
+ rvdw = rai + raj;
+
+ ratio = dr2 / (rvdw * rvdw);
+ vaj = born->vsolv[aj];
+
+ if(ratio>STILL_P5INV) {
+ ccf=1.0;
+ dccf=0.0;
+ }
+ else
+ {
+ theta = ratio*STILL_PIP5;
+ cosq = cos(theta);
+ term = 0.5*(1.0-cosq);
+ ccf = term*term;
+ sinq = 1.0 - cosq*cosq;
+ dccf = 2.0*term*sinq*invsqrt(sinq)*STILL_PIP5*ratio;
+ }
+
+ prod = STILL_P4*vaj;
+ gpi = gpi+prod*ccf*idr4;
+ fr->dadx[n++] = prod*(4*ccf-dccf)*idr6;
+
+ }
+
+ gpi2 = gpi * gpi;
+ born->bRad[ai] = factor*invsqrt(gpi2);
+ fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+
+ }
+
+ return 0;
+}
+
+
+static int
+calc_gb_rad_hct(t_commrec *cr,t_forcerec *fr,int natoms, gmx_mtop_t *mtop,
+ const t_atomtypes *atype, rvec x[], t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
+{
+ int i,k,n,ai,aj,nj0,nj1,dum;
+ real rai,raj,gpi,dr2,dr,sk,sk2,lij,uij,diff2,tmp,sum_ai;
+ real rad,min_rad,rinv,rai_inv;
+ real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11;
+ real lij2, uij2, lij3, uij3, t1,t2,t3;
+ real lij_inv,dlij,duij,sk2_rinv,prod,log_term;
+ rvec dx;
+ real doffset;
+ real *sum_tmp;
+
+ doffset = born->gb_doffset;
+ sum_tmp = born->work;
+
+ /* Keep the compiler happy */
+ n=0;
+ prod=0;
+
+ for(i=0;i<natoms;i++)
+ born->bRad[i]=fr->invsqrta[i]=1;
+
+ for(i=0;i<nl->nri;i++)
+ {
+ ai = nl->iinr[i];
+
+ nj0 = nl->jindex[ai];
+ nj1 = nl->jindex[ai+1];
+
+ rai = mtop->atomtypes.gb_radius[md->typeA[ai]]-doffset;
+ sum_ai = 1.0/rai;
+ rai_inv = sum_ai;
+
+ ix1 = x[ai][0];
+ iy1 = x[ai][1];
+ iz1 = x[ai][2];
+
+ if(PAR(cr))
+ {
+ /* Only have the master node do this, since we only want one value at one time */
+ if(MASTER(cr))
+ sum_tmp[ai]=sum_ai;
+ else
+ sum_tmp[ai]=0;
+ }
+
+ for(k=nj0;k<nj1;k++)
+ {
+ aj = nl->jjnr[k];
+
+ jx1 = x[aj][0];
+ jy1 = x[aj][1];
+ jz1 = x[aj][2];
+
+ dx11 = ix1 - jx1;
+ dy11 = iy1 - jy1;
+ dz11 = iz1 - jz1;
+
+ dr2 = dx11*dx11+dy11*dy11+dz11*dz11;
+ rinv = invsqrt(dr2);
+ dr = rinv*dr2;
+
+ sk = born->param[aj];
+
+ if(rai < dr+sk)
+ {
+ lij = 1.0/(dr-sk);
+ dlij = 1.0;
+
+ if(rai>dr-sk) {
+ lij = rai_inv;
+ dlij = 0.0;
+ }
+
+ lij2 = lij*lij;
+ lij3 = lij2*lij;
+
+ uij = 1.0/(dr+sk);
+ uij2 = uij*uij;
+ uij3 = uij2*uij;
+
+ diff2 = uij2-lij2;
+
+ lij_inv = invsqrt(lij2);
+ sk2 = sk*sk;
+ sk2_rinv = sk2*rinv;
+ prod = 0.25*sk2_rinv;
+
+ /* log_term = table_log(uij*lij_inv,born->log_table,LOG_TABLE_ACCURACY); */
+ log_term = log(uij*lij_inv);
+
+ tmp = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term + prod*(-diff2);
+
+ if(rai<sk-dr)
+ tmp = tmp + 2.0 * (rai_inv-lij);
+
+ duij = 1.0;
+ t1 = 0.5*lij2 + prod*lij3 - 0.25*(lij*rinv+lij3*dr);
+ t2 = -0.5*uij2 - 0.25*sk2_rinv*uij3 + 0.25*(uij*rinv+uij3*dr);
+ t3 = 0.125*(1.0+sk2_rinv*rinv)*(-diff2)+0.25*log_term*rinv*rinv;
+
+ fr->dadx[n++] = (dlij*t1+duij*t2+t3)*rinv; /* rb2 is moved to chainrule */
+
+ if(PAR(cr))
+ {
+ sum_tmp[ai] -= 0.5*tmp;
+ }
+ else
+ {
+ sum_ai -= 0.5*tmp;
+ }
+ }
+ }
+
+ if(!PAR(cr))
+ {
+ min_rad = rai + doffset;
+ rad=1.0/sum_ai;
+
+ born->bRad[ai]=rad > min_rad ? rad : min_rad;
+ fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+ }
+
+ }
+
+ if(PAR(cr))
+ {
+ gmx_sum(natoms,sum_tmp,cr);
+
+ /* Calculate the Born radii so that they are available on all nodes */
+ for(i=0;i<natoms;i++)
+ {
+ ai = i;
+ min_rad = mtop->atomtypes.gb_radius[md->typeA[ai]];
+ rad = 1.0/sum_tmp[ai];
+
+ born->bRad[ai]=rad > min_rad ? rad : min_rad;
+ fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+ }
+ }
+
+ return 0;
+}
+
+static int
+calc_gb_rad_obc(t_commrec *cr, t_forcerec *fr, int natoms, gmx_mtop_t *mtop,
+ const t_atomtypes *atype, rvec x[], t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
+{
+ int i,k,ai,aj,nj0,nj1,n;
+ real rai,raj,gpi,dr2,dr,sk,sk2,lij,uij,diff2,tmp,sum_ai;
+ real rad, min_rad,sum_ai2,sum_ai3,tsum,tchain,rinv,rai_inv,lij_inv,rai_inv2;
+ real log_term,prod,sk2_rinv;
+ real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11;
+ real lij2,uij2,lij3,uij3,dlij,duij,t1,t2,t3,tmp2;
+ real doffset;
+ real *sum_tmp;
+
+ /* Keep the compiler happy */
+ n=0;
+ prod=0;
+
+ doffset = born->gb_doffset;
+ sum_tmp = born->work;
+
+ for(i=0;i<natoms;i++) {
+ born->bRad[i]=fr->invsqrta[i]=1;
+ }
+
+ for(i=0;i<nl->nri;i++)
+ {
+ ai = nl->iinr[i];
+
+ nj0 = nl->jindex[ai];
+ nj1 = nl->jindex[ai+1];
+
+ rai = mtop->atomtypes.gb_radius[md->typeA[ai]]-doffset;
+ sum_ai = 0;
+ rai_inv = 1.0/rai;
+ rai_inv2 = 1.0/mtop->atomtypes.gb_radius[md->typeA[ai]];
+
+ ix1 = x[ai][0];
+ iy1 = x[ai][1];
+ iz1 = x[ai][2];
+
+ if(PAR(cr))
+ sum_tmp[ai]=0;
+
+ for(k=nj0;k<nj1;k++)
+ {
+ aj = nl->jjnr[k];
+
+ jx1 = x[aj][0];
+ jy1 = x[aj][1];
+ jz1 = x[aj][2];
+
+ dx11 = ix1 - jx1;
+ dy11 = iy1 - jy1;
+ dz11 = iz1 - jz1;
+
+ dr2 = dx11*dx11+dy11*dy11+dz11*dz11;
+ rinv = invsqrt(dr2);
+ dr = dr2*rinv;
+
+ /* sk is precalculated in init_gb() */
+ sk = born->param[aj];
+
+ if(rai < dr+sk)
+ {
+ lij = 1.0/(dr-sk);
+ dlij = 1.0;
+
+ if(rai>dr-sk) {
+ lij = rai_inv;
+ dlij = 0.0;
+ }
+
+ uij = 1.0/(dr+sk);
+ lij2 = lij * lij;
+ lij3 = lij2 * lij;
+ uij2 = uij * uij;
+ uij3 = uij2 * uij;
+
+ diff2 = uij2-lij2;
+
+ lij_inv = invsqrt(lij2);
+ sk2 = sk*sk;
+ sk2_rinv = sk2*rinv;
+ prod = 0.25*sk2_rinv;
+
+ log_term = log(uij*lij_inv);
+ /* log_term = table_log(uij*lij_inv,born->log_table,LOG_TABLE_ACCURACY); */
+ tmp = lij-uij + 0.25*dr*diff2 + (0.5*rinv)*log_term + prod*(-diff2);
+
+ if(rai < sk-dr)
+ tmp = tmp + 2.0 * (rai_inv-lij);
+
+ duij = 1.0;
+ t1 = 0.5*lij2 + prod*lij3 - 0.25*(lij*rinv+lij3*dr);
+ t2 = -0.5*uij2 - 0.25*sk2_rinv*uij3 + 0.25*(uij*rinv+uij3*dr);
+ t3 = 0.125*(1.0+sk2_rinv*rinv)*(-diff2)+0.25*log_term*rinv*rinv;
+
+ fr->dadx[n++] = (dlij*t1+duij*t2+t3)*rinv; /* rb2 is moved to chainrule */
+
+ sum_ai += 0.5*tmp;
+
+ if(PAR(cr))
+ sum_tmp[ai] += 0.5*tmp;
+
+ }
+ }
+
+ if(!PAR(cr))
+ {
+ sum_ai = rai * sum_ai;
+ sum_ai2 = sum_ai * sum_ai;
+ sum_ai3 = sum_ai2 * sum_ai;
+
+ tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
+ born->bRad[ai] = rai_inv - tsum*rai_inv2;
+ born->bRad[ai] = 1.0 / born->bRad[ai];
+
+ fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+
+ tchain = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
+ born->drobc[ai] = (1.0-tsum*tsum)*tchain*rai_inv2;
+ }
+ }
+
+ if(PAR(cr))
+ {
+ gmx_sum(natoms,sum_tmp,cr);
+
+ for(i=0;i<natoms;i++)
+ {
+ ai = i;
+ rai = mtop->atomtypes.gb_radius[md->typeA[ai]];
+ rai_inv = 1.0/rai;
+
+ sum_ai = sum_tmp[ai];
+ sum_ai = rai * sum_ai;
+ sum_ai2 = sum_ai * sum_ai;
+ sum_ai3 = sum_ai2 * sum_ai;
+
+ tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
+ born->bRad[ai] = rai_inv - tsum/mtop->atomtypes.gb_radius[md->typeA[ai]];
+
+ born->bRad[ai] = 1.0 / born->bRad[ai];
+ fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+
+ tchain = rai * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
+ born->drobc[ai] = (1.0-tsum*tsum)*tchain/mtop->atomtypes.gb_radius[md->typeA[ai]];
+ }
+ }
+
+ return 0;
+}
+
+
+
+int calc_gb_rad(t_commrec *cr, t_forcerec *fr, t_inputrec *ir,int natoms, int nrfa, gmx_mtop_t *mtop,
+ const t_atomtypes *atype, rvec x[], rvec f[],t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
+{
+#if ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) )
+ /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
+ switch(ir->gb_algorithm)
+ {
+ case egbSTILL:
+ calc_gb_rad_still_sse(cr,fr,md->nr,mtop, atype, x[0], nl, born, md);
+ break;
+ case egbHCT:
+ gmx_fatal(FARGS, "HCT algorithm not supported with sse");
+ /* calc_gb_rad_hct_sse(cr,fr,md->nr, forceatoms, forceparams,mtop,atype,x,nl,born,md); */
+ break;
+ case egbOBC:
+ calc_gb_rad_obc_sse(cr,fr,md->nr,mtop,atype,x[0],nl,born,md);
+ break;
+
+ default:
+ gmx_fatal(FARGS, "Unknown sse-enabled algorithm for Born radii calculation: %d",ir->gb_algorithm);
+ }
+
+#else
+
+ /* Switch for determining which algorithm to use for Born radii calculation */
+ switch(ir->gb_algorithm)
+ {
+ case egbSTILL:
+ calc_gb_rad_still(cr,fr,md->nr,mtop,atype,x,nl,born,md);
+ break;
+ case egbHCT:
+ calc_gb_rad_hct(cr,fr,md->nr,mtop,atype,x,nl,born,md);
+ break;
+ case egbOBC:
+ calc_gb_rad_obc(cr,fr,md->nr,mtop,atype,x,nl,born,md);
+ break;
+
+ default:
+ gmx_fatal(FARGS, "Unknown algorithm for Born radii calculation: %d",ir->gb_algorithm);
+ }
+
+#endif
+
+ return 0;
+}
+
+
+
+real gb_bonds_tab(int nbonds, real *x, real *f, real *charge, real *p_gbtabscale,
+ real *invsqrta, real *dvda, real *GBtab, const t_iatom forceatoms[],
+ real epsilon_r, real facel)
+{
+ int i, n0,nnn,type,ai,aj,ai3,aj3;
+ real isai,isaj;
+ real r,rsq11,ix1,iy1,iz1,jx1,jy1,jz1;
+ real dx11,dy11,dz11,rinv11,iq,facel2;
+ real isaprod,qq,gbscale,gbtabscale,Y,F,Geps,Heps2,Fp,VV,FF,rt,eps,eps2;
+ real vgb,fgb,vcoul,fijC,dvdatmp,fscal,tx,ty,tz,dvdaj;
+ real vctot;
+
+ gbtabscale=*p_gbtabscale;
+ vctot = 0.0;
+
+ for(i=0;i<nbonds; )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+ ai3 = ai*3;
+ aj3 = aj*3;
+ isai = invsqrta[ai];
+ ix1 = x[ai3+0];
+ iy1 = x[ai3+1];
+ iz1 = x[ai3+2];
+ iq = (-1)*facel*charge[ai];
+ jx1 = x[aj3+0];
+ jy1 = x[aj3+1];
+ jz1 = x[aj3+2];
+ dx11 = ix1 - jx1;
+ dy11 = iy1 - jy1;
+ dz11 = iz1 - jz1;
+ rsq11 = dx11*dx11+dy11*dy11+dz11*dz11;
+ rinv11 = invsqrt(rsq11);
+ isaj = invsqrta[aj];
+ isaprod = isai*isaj;
+ qq = isaprod*iq*charge[aj];
+ gbscale = isaprod*gbtabscale;
+ r = rsq11*rinv11;
+ rt = r*gbscale;
+ n0 = rt;
+ eps = rt-n0;
+ eps2 = eps*eps;
+ nnn = 4*n0;
+ Y = GBtab[nnn];
+ F = GBtab[nnn+1];
+ Geps = eps*GBtab[nnn+2];
+ Heps2 = eps2*GBtab[nnn+3];
+ Fp = F+Geps+Heps2;
+ VV = Y+eps*Fp;
+ FF = Fp+Geps+2.0*Heps2;
+ vgb = qq*VV;
+ fijC = qq*FF*gbscale;
+ dvdatmp = -(vgb+fijC*r)*0.5;
+ dvda[aj] = dvda[aj] + dvdatmp*isaj*isaj;
+ dvda[ai] = dvda[ai] + dvdatmp*isai*isai;
+ vctot = vctot + vgb;
+ fgb = -(fijC)*rinv11;
+ tx = fgb*dx11;
+ ty = fgb*dy11;
+ tz = fgb*dz11;
+
+ f[aj3+0] = f[aj3+0] - tx;
+ f[aj3+1] = f[aj3+1] - ty;
+ f[aj3+2] = f[aj3+2] - tz;
+
+ f[ai3+0] = f[ai3+0] + tx;
+ f[ai3+1] = f[ai3+1] + ty;
+ f[ai3+2] = f[ai3+2] + tz;
+ }
+
+ return vctot;
+}
+
+
+real calc_gb_selfcorrections(t_commrec *cr, int natoms,
+ real *charge, gmx_genborn_t *born, real *dvda, t_mdatoms *md, double facel)
+{
+ int i,ai,at0,at1;
+ real rai,e,derb,q,q2,fi,rai_inv,vtot;
+
+ if(PAR(cr))
+ {
+ pd_at_range(cr,&at0,&at1);
+ }
+ else
+ {
+ at0=0;
+ at1=natoms;
+ }
+
+ vtot=0.0;
+
+ /* Apply self corrections */
+ for(i=at0;i<at1;i++)
+ {
+ if(born->vs[i]==1)
+ {
+ ai = i;
+ rai = born->bRad[ai];
+ rai_inv = 1.0/rai;
+ q = charge[ai];
+ q2 = q*q;
+ fi = facel*q2;
+ e = fi*rai_inv;
+ derb = 0.5*e*rai_inv*rai_inv;
+ dvda[ai] += derb*rai;
+ vtot -= 0.5*e;
+ }
+ }
+
+ return vtot;
+
+}
+
+real calc_gb_nonpolar(t_commrec *cr, t_forcerec *fr,int natoms,gmx_genborn_t *born, gmx_mtop_t *mtop,
+ const t_atomtypes *atype, real *dvda,int gb_algorithm, t_mdatoms *md)
+{
+ int ai,i,at0,at1;
+ real e,es,rai,rbi,term,probe,tmp,factor;
+ real rbi_inv,rbi_inv2;
+
+ /* To keep the compiler happy */
+ factor=0;
+
+ if(PAR(cr))
+ {
+ pd_at_range(cr,&at0,&at1);
+ }
+ else
+ {
+ at0=0;
+ at1=natoms;
+ }
+
+ /* The surface area factor is 0.0049 for Still model, 0.0054 for HCT/OBC */
+ if(gb_algorithm==egbSTILL)
+ factor=0.0049*100*CAL2JOULE;
+
+ if(gb_algorithm==egbHCT || gb_algorithm==egbOBC)
+ factor=0.0054*100*CAL2JOULE;
+
+ es = 0;
+ probe = 0.14;
+ term = M_PI*4;
+
+ for(i=at0;i<at1;i++)
+ {
+ if(born->vs[i]==1)
+ {
+ ai = i;
+ rai = mtop->atomtypes.gb_radius[md->typeA[ai]];
+ rbi_inv = fr->invsqrta[ai];
+ rbi_inv2 = rbi_inv * rbi_inv;
+ tmp = (rai*rbi_inv2)*(rai*rbi_inv2);
+ tmp = tmp*tmp*tmp;
+ e = factor*term*(rai+probe)*(rai+probe)*tmp;
+ dvda[ai] = dvda[ai] - 6*e*rbi_inv2;
+ es = es + e;
+ }
+ }
+
+ return es;
+}
+
+real calc_gb_forces(t_commrec *cr, t_mdatoms *md, gmx_genborn_t *born, gmx_mtop_t *mtop, const t_atomtypes *atype, int nr,
+ rvec x[], rvec f[], t_forcerec *fr, const t_iatom forceatoms[], int gb_algorithm, bool bRad)
+{
+ real v=0;
+ int i;
+
+ /* Do a simple ACE type approximation for the non-polar solvation */
+ v += calc_gb_nonpolar(cr, fr,md->nr, born, mtop, atype, fr->dvda, gb_algorithm,md);
+
+ /* Calculate the bonded GB-interactions */
+ v += gb_bonds_tab(nr,x[0],f[0],md->chargeA,&(fr->gbtabscale),
+ fr->invsqrta,fr->dvda,fr->gbtab.tab,forceatoms,fr->epsilon_r, fr->epsfac);
+
+ /* Calculate self corrections to the GB energies - currently only A state used! (FIXME) */
+ v += calc_gb_selfcorrections(cr,md->nr,md->chargeA, born, fr->dvda, md, fr->epsfac);
+
+ if(PAR(cr))
+ {
+ /* Sum dvda */
+ gmx_sum(md->nr,fr->dvda, cr);
+ }
+
+#if ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) )
+ /* x86 or x86-64 with GCC inline assembly and/or SSE intrinsics */
+ calc_gb_chainrule_sse(md->nr, &(fr->gblist), fr->dadx, fr->dvda, x[0], f[0], gb_algorithm, born);
+#else
+ /* Calculate the forces due to chain rule terms with non sse code */
+ calc_gb_chainrule(md->nr, &(fr->gblist), x, f, fr->dvda, fr->dadx, gb_algorithm, born);
+#endif
+
+ return v;
+
+}
+
+
+real calc_gb_chainrule(int natoms, t_nblist *nl, rvec x[], rvec t[], real *dvda, real *dadx,
+ int gb_algorithm, gmx_genborn_t *born)
+{
+ int i,k,n,ai,aj,nj0,nj1;
+ real fgb,fij,rb2,rbi,fix1,fiy1,fiz1;
+ real ix1,iy1,iz1,jx1,jy1,jz1,dx11,dy11,dz11,rsq11;
+ real rinv11,tx,ty,tz,rbai;
+ real rb[natoms];
+ rvec dx;
+
+ n=0;
+
+ /* Loop to get the proper form for the Born radius term */
+ if(gb_algorithm==egbSTILL) {
+ for(i=0;i<natoms;i++)
+ {
+ rbi = born->bRad[i];
+ rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
+ }
+ }
+
+ if(gb_algorithm==egbHCT) {
+ for(i=0;i<natoms;i++)
+ {
+ rbi = born->bRad[i];
+ rb[i] = rbi * rbi * dvda[i];
+ }
+ }
+
+ if(gb_algorithm==egbOBC) {
+ for(i=0;i<natoms;i++)
+ {
+ rbi = born->bRad[i];
+ rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
+ }
+ }
+
+ for(i=0;i<nl->nri;i++)
+ {
+ ai = nl->iinr[i];
+ nj0 = nl->jindex[ai];
+ nj1 = nl->jindex[ai+1];
+
+ ix1 = x[ai][0];
+ iy1 = x[ai][1];
+ iz1 = x[ai][2];
+
+ fix1 = 0;
+ fiy1 = 0;
+ fiz1 = 0;
+
+ rbai = rb[ai];
+
+ for(k=nj0;k<nj1;k++)
+ {
+ aj = nl->jjnr[k];
+
+ jx1 = x[aj][0];
+ jy1 = x[aj][1];
+ jz1 = x[aj][2];
+
+ dx11 = ix1 - jx1;
+ dy11 = iy1 - jy1;
+ dz11 = iz1 - jz1;
+
+ fgb = rbai*dadx[n++];
+
+ tx = fgb * dx11;
+ ty = fgb * dy11;
+ tz = fgb * dz11;
+
+ fix1 = fix1 + tx;
+ fiy1 = fiy1 + ty;
+ fiz1 = fiz1 + tz;
+
+ /* Update force on atom aj */
+ t[aj][0] = t[aj][0] - tx;
+ t[aj][1] = t[aj][1] - ty;
+ t[aj][2] = t[aj][2] - tz;
+ }
+
+ /* Update force on atom ai */
+ t[ai][0] = t[ai][0] + fix1;
+ t[ai][1] = t[ai][1] + fiy1;
+ t[ai][2] = t[ai][2] + fiz1;
+
+ }
+
+
+ return 0;
+}
+
+/* Skriv om den h�r rutinen s� att den f�r samma format som Charmm SASA
+ * Antagligen kan man d� ber�kna derivatan i samma loop som ytan,
+ * vilket �r snabbare (g�rs inte i Charmm, men det borde g� :-) )
+ */
+int calc_surfStill(t_inputrec *ir,
+ t_idef *idef,
+ t_atoms *atoms,
+ rvec x[],
+ rvec f[],
+ gmx_genborn_t *born,
+ t_atomtypes *atype,
+ double *faction,
+ int natoms,
+ t_nblist *nl,
+ t_iparams forceparams[],
+ t_iatom forceatoms[],
+ int nbonds)
+{
+ int i,j,n,ia,ib,ic;
+
+ real pc[3],radp,probd,radnp,s,prob,bmlt;
+ real dx,dy,dz,d,rni,dist,bee,ri,rn,asurf,t1ij,t2ij,bij,bji,dbijp;
+ real dbjip,tip,tmp,tij,tji,t3ij,t4ij,t5ij,t3ji,t4ji,t5ji,dbij,dbji;
+ real dpf,dpx,dpy,dpz,pi,pn,si,sn;
+ real aprob[natoms];
+
+ int factor=1;
+
+
+ /*bonds_t *bonds,*bonds13;*/
+
+ /*snew(bonds,natoms);*/
+ /*snew(bonds13,natoms);*/
+
+ /* Zero out the forces to compare only surface area contribution */
+ /*
+ printf("Zeroing out forces before surface area..\n");
+
+ for(i=0;i<natoms;i++)
+ {
+
+ if(!(is_hydrogen(*atoms->atomname[i])))
+ {
+ printf("x=%g, s=%g, p=%g, r=%g\n",
+ x[i][0],
+ atype->vol[atoms->atom[i].type],
+ atype->surftens[atoms->atom[i].type],
+ factor*atype->radius[atoms->atom[i].type]);
+
+ }
+
+ printf("faction[i]=%g\n",faction[i*3]);
+ faction[i*3]=0;
+ faction[i*3+1]=0;
+ faction[i*3+2]=0;
+
+ f[i][0]=f[i][1]=f[i][2]=0;
+ }
+ exit(1);
+ */
+ /* Radius of probe */
+ radp=0.14;
+ probd=2*radp;
+
+ /*********************************************************
+ *********************************************************
+ *********************************************************
+ *********************************************************
+ *********************************************************
+ Begin SA calculation Gromacs-style
+ *********************************************************
+ *********************************************************
+ *********************************************************
+ *********************************************************
+ *********************************************************/
+
+ int k,type,ai,aj,nj0,nj1;
+ real dr2,sar,rai,raj,fij;
+ rvec dxx;
+
+ /* First set up the individual areas */
+ for(n=0;n<natoms;n++)
+ {
+ rn=atype->radius[atoms->atom[n].type];
+ born->asurf[n]=(4*M_PI*(rn+radp)*(rn+radp));
+ }
+
+ /* Then loop over the bonded interactions */
+ for(i=0;i<nbonds; )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ if(!is_hydrogen(*(atoms->atomname[ai])) &&
+ !is_hydrogen(*(atoms->atomname[aj])))
+ {
+
+ /*printf("genborn.c: xi=%g, xj=%g\n",factor*x[ai][0],factor*x[aj][0]); */
+
+ rvec_sub(x[ai],x[aj],dxx);
+ dr2 = iprod(dxx,dxx);
+
+ sar = forceparams[type].gb.sar;
+ bmlt = forceparams[type].gb.bmlt;
+ rni = sar+probd;
+
+ rn = atype->radius[atoms->atom[ai].type];
+ ri = atype->radius[atoms->atom[aj].type];
+ pn = atype->surftens[atoms->atom[ai].type];
+ pi = atype->surftens[atoms->atom[aj].type];
+ /*sn = atype->vol[atoms->atom[ai].type]; */
+ /*si = atype->vol[atoms->atom[aj].type]; */
+
+ /*rni = rn + ri +probd; */
+ /*printf("genborn.c: rn=%g, ri=%g\n",ri,rn); */
+ if(dr2<rni*rni)
+ {
+ /*printf("d=%g, s=%g\n",dr2,rni*rni);*/
+ dist = dr2*invsqrt(dr2);
+ t1ij = M_PI*(rni-dist);
+ t2ij = (rn-ri)/dist;
+ bij = (rn+radp)*t1ij*(1-t2ij);
+ bji = (ri+radp)*t1ij*(1+t2ij);
+ tij = pn*bmlt*bij/(4*M_PI*(rn+radp)*(rn+radp));
+ tji = pi*bmlt*bji/(4*M_PI*(ri+radp)*(ri+radp));
+
+ born->asurf[ai] = born->asurf[ai]*(1-tij);
+ born->asurf[aj] = born->asurf[aj]*(1-tji);
+ }
+ }
+ }
+
+ /* Now loop over interactions >= 1-4 */
+ bmlt=0.3516;
+
+ /*printf("NONBONDED INTERACTIONS\n");*/
+
+ for(i=0;i<natoms;i++)
+ {
+ ai = i;
+ nj0 = nl->jindex[ai];
+ nj1 = nl->jindex[ai+1];
+
+ rai = factor*atype->radius[atoms->atom[ai].type];
+ pn = atype->surftens[atoms->atom[ai].type];
+ /*sn = atype->vol[atoms->atom[ai].type];*/
+
+ for(k=nj0;k<nj1;k++)
+ {
+ aj = nl->jjnr[k];
+
+ if(!is_hydrogen(*(atoms->atomname[ai])) &&
+ !is_hydrogen(*(atoms->atomname[aj])))
+ {
+ raj = factor*atype->radius[atoms->atom[aj].type];
+ pi = atype->surftens[atoms->atom[aj].type];
+ /*si = atype->vol[atoms->atom[aj].type];*/
+ rvec_sub(x[ai],x[aj],dxx);
+
+ dr2 = factor*factor*iprod(dxx,dxx);
+ rni = rai + raj + probd;
+ /*printf("genborn.c: rn=%g, ri=%g, sar=%g, dr2=%g\n",rai,raj,sar,dr2); */
+ /*printf("genborn.c: xi=%g, xj=%g\n",factor*x[ai][0],factor*x[aj][0]); */
+
+ if(dr2<rni*rni)
+ {
+ /*printf("d=%g, s=%g\n",dr2,rni*rni); */
+ dist = dr2*invsqrt(dr2);
+ t1ij = M_PI*(rni-dist);
+ t2ij = (rai-raj)/dist;
+ bij = (rai+radp)*t1ij*(1-t2ij);
+ bji = (raj+radp)*t1ij*(1+t2ij);
+ tij = pn*bmlt*bij/(4*M_PI*(rai+radp)*(rai+radp));
+ tji = pi*bmlt*bji/(4*M_PI*(raj+radp)*(raj+radp));
+
+ born->asurf[ai]=born->asurf[ai]*(1-tij);
+ born->asurf[aj]=born->asurf[aj]*(1-tji);
+ }
+ }
+ }
+ }
+ /*
+ printf("AFTER BOTH AREA CALCULATIONS\n");
+ n=0;
+ for(i=0;i<natoms;i++)
+ {
+ if(!is_hydrogen(*atoms->atomname[i]))
+ {
+ printf("%d, Still=%g, gromacs=%g\n",n,born->asurf[i], born->asurf[i]);
+
+ born->as=born->as+born->asurf[i];
+ born->as=born->as+born->asurf[i];
+ }
+ n++;
+ }
+ */
+ /*printf("Total Still area=%g, Total new area=%g\n",born->as, born->as);*/
+ /*printf("nbonds=%d\n",nbonds);*/
+ /* Start to calculate the forces */
+ for(i=0;i<nbonds; )
+ {
+ type = forceatoms[i++];
+ ai = forceatoms[i++];
+ aj = forceatoms[i++];
+
+ if(!is_hydrogen(*(atoms->atomname[ai])) &&
+ !is_hydrogen(*(atoms->atomname[aj])))
+ {
+ rvec_sub(x[ai],x[aj],dxx);
+
+ dr2 = factor*factor*iprod(dxx,dxx);
+
+ sar = factor*forceparams[type].gb.sar;
+ bmlt = forceparams[type].gb.bmlt;
+ rni = sar+probd;
+
+ rn = factor*atype->radius[atoms->atom[ai].type];
+ ri = factor*atype->radius[atoms->atom[aj].type];
+ pn = atype->surftens[atoms->atom[ai].type];
+ pi = atype->surftens[atoms->atom[aj].type];
+ sn = atype->vol[atoms->atom[ai].type];
+ si = atype->vol[atoms->atom[aj].type];
+
+ if(dr2<rni*rni)
+ {
+ dist = dr2*invsqrt(dr2);
+ t1ij = M_PI*(rni-dist);
+ t2ij = (rn-ri)/dist;
+ bij = (rn+radp)*t1ij*(1-t2ij);
+ bji = (ri+radp)*t1ij*(1+t2ij);
+
+ dbij = M_PI*(rn+radp)*(dr2-(rni*(rn-ri)));
+ dbji = M_PI*(ri+radp)*(dr2+(rni*(rn-ri)));
+
+ t3ij = sn*born->asurf[ai]*dbij;
+ t4ij = (4*M_PI*(rn+radp)*(rn+radp))/(pn*bmlt)-bij;
+ t5ij = t3ij/t4ij;
+
+ t3ji = si*born->asurf[aj]*dbji;
+ t4ji = (4*M_PI*(ri+radp)*(ri+radp))/(pi*bmlt)-bji;
+ t5ji = t3ji/t4ji;
+
+ dpf = (t5ij+t5ji)/(dr2*dist);
+ /*printf("deriv_cut: ai=%d, xi=%g aj=%d, xj=%g\n",ai,x[ai][0], aj,x[aj][0]);*/
+ for(k=0;k<DIM;k++)
+ {
+ fij = factor*(-dpf)*dxx[k];
+ f[ai][k]+=fij;
+ f[aj][k]-=fij;
+ }
+ }
+ }
+ }
+
+ /* Now calculate forces for all interactions >= 1-4 */
+ bmlt = 0.3516;
+
+ for(i=0;i<natoms;i++)
+ {
+ ai = i;
+ nj0 = nl->jindex[ai];
+ nj1 = nl->jindex[ai+1];
+
+ rai = factor*atype->radius[atoms->atom[ai].type];
+ pn = atype->surftens[atoms->atom[ai].type];
+ sn = atype->vol[atoms->atom[ai].type];
+
+ for(k=nj0;k<nj1;k++)
+ {
+ aj = nl->jjnr[k];
+
+ if(!is_hydrogen(*(atoms->atomname[ai])) &&
+ !is_hydrogen(*(atoms->atomname[aj])))
+ {
+ raj = factor*atype->radius[atoms->atom[aj].type];
+ pi = atype->surftens[atoms->atom[aj].type];
+ si = atype->vol[atoms->atom[aj].type];
+
+ rvec_sub(x[ai],x[aj],dxx);
+
+ dr2 = factor*factor*iprod(dxx,dxx);
+ rni = rai + raj + probd;
+
+ if(dr2<rni*rni)
+ {
+ dist = dr2*invsqrt(dr2);
+ t1ij = M_PI*(rni-dist);
+ t2ij = (rai-raj)/dist;
+ bij = (rai+radp)*t1ij*(1-t2ij);
+ bji = (raj+radp)*t1ij*(1+t2ij);
+
+ dbij = M_PI*(rai+radp)*(dr2-(rni*(rai-raj)));
+ dbji = M_PI*(raj+radp)*(dr2+(rni*(rai-raj)));
+
+ t3ij = sn*born->asurf[ai]*dbij;
+ t4ij = (4*M_PI*(rai+radp)*(rai+radp))/(pn*bmlt)-bij;
+ t5ij = t3ij/t4ij;
+
+ t3ji = si*born->asurf[aj]*dbji;
+ t4ji = (4*M_PI*(raj+radp)*(raj+radp))/(pi*bmlt)-bji;
+ t5ji = t3ji/t4ji;
+
+ dpf = (t5ij+t5ji)/(dr2*dist);
+ /*printf("deriv_cut: ai=%d, xi=%g aj=%d, xj=%g\n",ai,x[ai][0], aj,x[aj][0]);*/
+ for(n=0;n<DIM;n++)
+ {
+ fij = factor*(-dpf)*dxx[n];
+ f[ai][n]+=fij;
+ f[aj][n]-=fij;
+ }
+ }
+ }
+ }
+ }
+ /*
+ printf("AFTER BOTH FORCES CALCULATIONS\n");
+ n=0;
+ for(i=0;i<natoms;i++)
+ {
+ if(!is_hydrogen(*atoms->atomname[i]))
+ {
+ printf("%d, gx=%g, gy=%g, gz=%g\n",
+ n,
+ faction[i*3],
+ faction[i*3+1],
+ faction[i*3+2],
+ f[i][0],
+ f[i][1],
+ f[i][2]);
+ }
+ n++;
+ }
+ */
+ return 0;
+}
+
+int calc_surfBrooks(t_inputrec *ir,
+ t_idef *idef,
+ t_atoms *atoms,
+ rvec x[],
+ gmx_genborn_t *born,
+ t_atomtypes *atype,
+ double *faction,
+ int natoms)
+{
+ int i,j,k;
+ real xi,yi,zi,dx,dy,dz,ri,rj;
+ real rho,rho2,rho6,r2,r,aij,aijsum,daij;
+ real kappa,sassum,tx,ty,tz,fix1,fiy1,fiz1;
+
+ real ck[5];
+ real Aij[natoms];
+ real sasi[natoms];
+
+ /* Brooks parameter for cutoff between atom pairs
+ * Increasing kappa will increase the number of atom pairs
+ * included in the calculation, which will also slow the calculation
+ */
+ kappa=0;
+
+ sassum=0;
+
+ /* Hydrogen atoms are included in via the ck parameters for the
+ * heavy atoms
+ */
+ for(i=0;i<natoms;i++)
+ {
+ /*if(strncmp(*atoms->atomname[i],"H",1)!=0) */
+ /*{ */
+ xi=x[i][0];
+ yi=x[i][1];
+ zi=x[i][2];
+
+ fix1=0;
+ fiy1=0;
+ fiz1=0;
+
+ ri=atype->radius[atoms->atom[i].type];
+ aijsum=0;
+
+ for(j=0;j<natoms;j++)
+ {
+ /*if(strncmp(*atoms->atomname[j],"H",1)!=0 && i!=j) */
+ if(i!=j)
+ {
+ dx=xi-x[j][0];
+ dy=yi-x[j][1];
+ dz=zi-x[j][2];
+
+ r2=dx*dx+dy*dy+dz*dz;
+ r=sqrt(r2);
+ rj=atype->radius[atoms->atom[j].type];
+
+ rho=ri+rj+kappa;
+ rho2=rho*rho;
+ rho6=rho2*rho2*rho2;
+
+ /* Cutoff test */
+ if(r<=rho)
+ {
+ aij=pow((rho2-r2)*(rho2+2*r2),2)/rho6;
+ daij=((4*r*(rho2-r2)*(rho2-r2))/rho6)-((4*r*(rho2-r2)*(2*r2+rho2))/rho6);
+ tx=daij*dx;
+ ty=daij*dy;
+ tz=daij*dz;
+
+ fix1=fix1+tx;
+ fiy1=fiy1+ty;
+ fiz1=fiz1+tz;
+
+ faction[j*3]=faction[j*3]-tx;
+ faction[j*3+1]=faction[j*3+1]-ty;
+ faction[j*3+2]=faction[j*3+2]-tz;
+
+ aijsum=aijsum+aij;
+ printf("xi=%g, xj=%g, fscal=%g\n",xi,x[j][0],daij);
+ }
+ }
+ }
+
+ faction[i*3]=faction[i*3]+fix1;
+ faction[i*3+1]=faction[i*3+1]+fiy1;
+ faction[i*3+2]=faction[i*3+2]+fiz1;
+
+ /* Calculate surface area coefficient */
+ Aij[i]=pow(aijsum,1/4);
+
+ for(k=0;k<5;k++)
+ {
+ sasi[i]=sasi[i]+ck[k]*(pow(Aij[i],k));
+ }
+
+ /* Increase total surface area */
+ sassum=sassum+sasi[i];
+
+ /*}*/
+ }
+
+ printf("Brooks total surface area is: %g\n", sassum);
+
+
+ return 0;
+}
+
+
+/* This will set up a really simple neighborlist for GB calculations
+ * so that each atom will have enervy other atom in its list.
+ * We don't worry about things liks load balancing etc ...
+ */
+ int do_gb_neighborlist(t_forcerec *fr, int natoms,t_atoms *atoms, t_ilist *il, int nbonds, int n12n13)
+ {
+ int i,j,k,m,ai,aj,ak,an,idx=0,idx_sr,vs_idx;
+ int nalloc=0;
+ int skip[natoms];
+ bonds_t *bonds,*bonds13;
+
+ /* Calculate the number of elements in the jjnr array
+ * For the gblist_sr, this will be an exact allocation, but for
+ * gblist_lr, this will be a few elements to much
+ */
+ fr->gblist_sr.count=0;
+ fr->gblist_sr.nri=natoms;
+ fr->gblist_sr.maxnri=natoms;
+
+ fr->gblist_lr.count=0;
+ fr->gblist_lr.nri=natoms;
+ fr->gblist_lr.maxnri=natoms;
+
+ for(i=0;i<natoms;i++)
+ {
+ fr->gblist_sr.iinr[i]=i;
+ fr->gblist_sr.gid[i]=0;
+ fr->gblist_sr.shift[i]=0;
+
+ fr->gblist_lr.iinr[i]=i;
+ fr->gblist_lr.gid[i]=0;
+ fr->gblist_lr.shift[i]=0;
+ }
+
+ fr->gblist_sr.enlist=0;
+ fr->gblist_sr.maxlen=natoms;
+ fr->gblist_lr.enlist=0;
+ fr->gblist_lr.maxlen=natoms;
+
+ /* Start the lr list */
+ idx=0;
+ idx_sr=0;
+
+ for(i=0;i<natoms;i++)
+ skip[i]=-1;
+
+ snew(bonds,natoms);
+ snew(bonds13,natoms);
+
+ for(i=0;i<nbonds; )
+ {
+ m=il->iatoms[i++];
+ ai=il->iatoms[i++];
+ aj=il->iatoms[i++];
+
+ bonds[ai].bond[bonds[ai].nbonds]=aj;
+ bonds[ai].nbonds++;
+ bonds[aj].bond[bonds[aj].nbonds]=ai;
+ bonds[aj].nbonds++;
+ }
+
+ for(i=nbonds;i<n12n13; )
+ {
+ m=il->iatoms[i++];
+ ai=il->iatoms[i++];
+ aj=il->iatoms[i++];
+
+ bonds13[ai].bond[bonds13[ai].nbonds]=aj;
+ bonds13[ai].nbonds++;
+ bonds13[aj].bond[bonds13[aj].nbonds]=ai;
+ bonds13[aj].nbonds++;
+ }
+
+ for(i=0;i<natoms;i++)
+ {
+ skip[i]=i;
+
+ for(k=0;k<bonds[i].nbonds;k++)
+ skip[bonds[i].bond[k]]=i;
+
+ for(k=0;k<bonds13[i].nbonds;k++)
+ skip[bonds13[i].bond[k]]=i;
+
+ fr->gblist_lr.jindex[i]=idx;
+ fr->gblist_sr.jindex[i]=idx_sr;
+
+ for(k=i+1;k<natoms;k++)
+ {
+ if(skip[k]!=i)
+ {
+ fr->gblist_lr.jjnr[idx++]=k;
+ }
+ }
+
+ for(k=0;k<natoms;k++)
+ {
+ if(skip[k]!=i)
+ {
+ fr->gblist_sr.jjnr[idx_sr++]=k;
+ }
+ }
+
+ }
+
+ fr->gblist_lr.jindex[i]=idx;
+ fr->gblist_sr.jindex[i]=idx_sr;
+
+ sfree(bonds);
+ sfree(bonds13);
+
+ return 0;
+ }
+
+int gb_nblist_siev(t_commrec *cr, int natoms, int gb_algorithm, real gbcut, rvec x[], t_forcerec *fr, t_ilist *il, int n14)
+{
+ int i,l,ii,j,k,n,nj0,nj1,ai,aj,idx,ii_idx,nalloc,at0,at1;
+ double dr2,gbcut2;
+ rvec dxx;
+ t_nblist *nblist;
+
+ int count[natoms];
+ int **atoms;
+
+ memset(count,0,sizeof(int)*natoms);
+ atoms=(int **) malloc(sizeof(int *)*natoms);
+
+ if(PAR(cr))
+ {
+ pd_at_range(cr,&at0,&at1);
+ }
+ else
+ {
+ at0=0;
+ at1=natoms;
+ }
+
+ int found;
+
+ for(i=0;i<natoms;i++)
+ atoms[i]=(int *) malloc(sizeof(int)*natoms);
+
+ if(gb_algorithm==egbHCT || gb_algorithm==egbOBC)
+ {
+ /* Loop over 1-2, 1-3 and 1-4 interactions */
+ for(k=0;k<il->nr;k+=3)
+ {
+ ai=il->iatoms[k+1];
+ aj=il->iatoms[k+2];
+
+ found=0;
+
+ /* So that we do not add the same bond twice. This happens with some constraints between 1-3 atoms
+ * that are in the bond-list but should not be in the GB nb-list */
+ for(i=0;i<count[ai];i++)
+ {
+ if(atoms[ai][i]==aj)
+ found=1;
+ }
+
+ /* When doing HCT or OBC, we need to add all interactions to the nb-list twice
+ * since the loop for calculating the Born-radii runs over all vs all atoms */
+ if(found==0)
+ {
+ atoms[ai][count[ai]]=aj;
+ count[ai]++;
+
+ atoms[aj][count[aj]]=ai;
+ count[aj]++;
+ }
+ }
+ }
+
+ if(gb_algorithm==egbSTILL)
+ {
+ /* Loop over 1-4 interactions */
+ for(k=n14;k<il->nr;k+=3)
+ {
+ ai=il->iatoms[k+1];
+ aj=il->iatoms[k+2];
+
+ found=0;
+
+ for(i=0;i<count[ai];i++)
+ {
+ if(atoms[ai][i]==aj)
+ found=1;
+ }
+
+ /* Also for Still, we need to add (1-4) interactions twice */
+ atoms[ai][count[ai]]=aj;
+ count[ai]++;
+
+ atoms[aj][count[aj]]=ai;
+ count[aj]++;
+
+ }
+ }
+
+ /* Loop over the VDWQQ and VDW nblists to set up the nonbonded part of the GB list */
+ for(n=0; (n<fr->nnblists); n++)
+ {
+ for(i=0; (i<eNL_NR); i++)
+ {
+ nblist=&(fr->nblists[n].nlist_sr[i]);
+
+ if(nblist->nri>0 && (i==eNL_VDWQQ || i==eNL_QQ))
+ {
+ for(j=0;j<nblist->nri;j++)
+ {
+ ai = nblist->iinr[j];
+
+ nj0=nblist->jindex[j];
+ nj1=nblist->jindex[j+1];
+
+ for(k=nj0;k<nj1;k++)
+ {
+ aj=nblist->jjnr[k];
+
+ if(ai>aj)
+ {
+ atoms[aj][count[aj]]=ai;
+ count[aj]++;
+
+ /* We need to add all interactions to the nb-list twice
+ * since the loop for calculating the Born-radii runs over all vs all atoms
+ */
+ atoms[ai][count[ai]]=aj;
+ count[ai]++;
+ }
+ else
+ {
+ atoms[ai][count[ai]]=aj;
+ count[ai]++;
+
+ atoms[aj][count[aj]]=ai;
+ count[aj]++;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ idx=0;
+ ii_idx=0;
+
+ for(i=0;i<natoms;i++)
+ {
+ fr->gblist.iinr[ii_idx]=i;
+
+ for(k=0;k<count[i];k++)
+ {
+ fr->gblist.jjnr[idx++]=atoms[i][k];
+ }
+
+ fr->gblist.jindex[ii_idx+1]=idx;
+ ii_idx++;
+ }
+
+ fr->gblist.nrj=idx;
+
+ for(i=0;i<natoms;i++)
+ free(atoms[i]);
+
+ free(atoms);
+
+ return 0;
+}
+
--- /dev/null
+/*
+ * $Id$
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2008, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "genborn.h"
+#include "vec.h"
+#include "grompp.h"
+#include "pdbio.h"
+#include "names.h"
+#include "physics.h"
+#include "partdec.h"
+#include "network.h"
+#include "gmx_fatal.h"
+#include "mtop_util.h"
+#include "genborn.h"
+
+#ifdef GMX_MPI
+#include "mpi.h"
+#endif
+
+/* Only compile this file if SSE intrinsics are available */
+#if ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) )
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+
+/* SIMD (SSE1+MMX indeed) implementation of sin, cos, exp and log
+
+ Inspired by Intel Approximate Math library, and based on the
+ corresponding algorithms of the cephes math library
+ */
+
+/* Copyright (C) 2007 Julien Pommier
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ (this is the zlib license)
+ */
+
+#ifdef _MSC_VER /* visual c++ */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END
+#else /* gcc or icc */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+#define _PS_CONST(Name, Val) \
+static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PI32_CONST(Name, Val) \
+static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val) \
+static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+
+
+/* Still parameters - make sure to edit in genborn.c too if you change these! */
+#define STILL_P1 0.073*0.1 /* length */
+#define STILL_P2 0.921*0.1*CAL2JOULE /* energy*length */
+#define STILL_P3 6.211*0.1*CAL2JOULE /* energy*length */
+#define STILL_P4 15.236*0.1*CAL2JOULE
+#define STILL_P5 1.254
+
+#define STILL_P5INV (1.0/STILL_P5)
+#define STILL_PIP5 (M_PI*STILL_P5)
+
+
+
+_PS_CONST(1 , 1.0f);
+_PS_CONST(0p5, 0.5f);
+/* the smallest non denormalized float number */
+_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
+_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
+
+_PS_CONST_TYPE(sign_mask, int, 0x80000000);
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f);
+
+_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
+_PS_CONST(cephes_log_p0, 7.0376836292E-2);
+_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
+_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
+_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
+_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
+_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
+_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS_CONST(cephes_log_q1, -2.12194440e-4);
+_PS_CONST(cephes_log_q2, 0.693359375);
+
+_PS_CONST(minus_cephes_DP1, -0.78515625);
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);
+_PS_CONST(sincof_p1, 8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0, 2.443315711809948E-005);
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2, 4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); /* 4 / M_PI */
+
+_PS_CONST(exp_hi, 88.3762626647949f);
+_PS_CONST(exp_lo, -88.3762626647949f);
+
+_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
+_PS_CONST(cephes_exp_C1, 0.693359375);
+_PS_CONST(cephes_exp_C2, -2.12194440e-4);
+
+_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+
+#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
+xmm_mm_union u; u.xmm = xmm_; \
+mm0_ = u.mm[0]; \
+mm1_ = u.mm[1]; \
+}
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
+xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
+}
+
+typedef
+union
+{
+ __m128 sse;
+ float f[4];
+} my_sse_t;
+
+
+
+typedef union xmm_mm_union {
+ __m128 xmm;
+ __m64 mm[2];
+} xmm_mm_union;
+
+
+void sincos_ps(__m128 x, __m128 *s, __m128 *c) {
+ __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+ __m64 mm0, mm1, mm2, mm3, mm4, mm5;
+ sign_bit_sin = x;
+ /* take the absolute value */
+ x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
+ /* extract the sign bit (upper one) */
+ sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);
+
+ /* scale by 4/Pi */
+ y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
+
+ /* store the integer part of y in mm0:mm1 */
+ xmm3 = _mm_movehl_ps(xmm3, y);
+ mm2 = _mm_cvttps_pi32(y);
+ mm3 = _mm_cvttps_pi32(xmm3);
+
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
+ mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
+ mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
+ mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
+
+ y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+ mm4 = mm2;
+ mm5 = mm3;
+
+ /* get the swap sign flag for the sine */
+ mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
+ mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
+ mm0 = _mm_slli_pi32(mm0, 29);
+ mm1 = _mm_slli_pi32(mm1, 29);
+ __m128 swap_sign_bit_sin;
+ COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+ /* get the polynom selection mask for the sine */
+
+ mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
+ mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
+ mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+ mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+ __m128 poly_mask;
+ COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+
+ /* The magic pass: "Extended precision modular arithmetic"
+ x = ((x - y * DP1) - y * DP2) - y * DP3; */
+ xmm1 = *(__m128*)_ps_minus_cephes_DP1;
+ xmm2 = *(__m128*)_ps_minus_cephes_DP2;
+ xmm3 = *(__m128*)_ps_minus_cephes_DP3;
+ xmm1 = _mm_mul_ps(y, xmm1);
+ xmm2 = _mm_mul_ps(y, xmm2);
+ xmm3 = _mm_mul_ps(y, xmm3);
+ x = _mm_add_ps(x, xmm1);
+ x = _mm_add_ps(x, xmm2);
+ x = _mm_add_ps(x, xmm3);
+
+
+ /* get the sign flag for the cosine */
+
+ mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
+ mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
+ mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
+ mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
+ mm4 = _mm_slli_pi32(mm4, 29);
+ mm5 = _mm_slli_pi32(mm5, 29);
+ __m128 sign_bit_cos;
+ COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+
+ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+
+ /* Evaluate the first polynom (0 <= x <= Pi/4) */
+ __m128 z = _mm_mul_ps(x,x);
+ y = *(__m128*)_ps_coscof_p0;
+
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
+ y = _mm_mul_ps(y, z);
+ y = _mm_mul_ps(y, z);
+ __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
+ y = _mm_sub_ps(y, tmp);
+ y = _mm_add_ps(y, *(__m128*)_ps_1);
+
+ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
+
+ __m128 y2 = *(__m128*)_ps_sincof_p0;
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
+ y2 = _mm_mul_ps(y2, z);
+ y2 = _mm_mul_ps(y2, x);
+ y2 = _mm_add_ps(y2, x);
+
+ /* select the correct result from the two polynoms */
+ xmm3 = poly_mask;
+ __m128 ysin2 = _mm_and_ps(xmm3, y2);
+ __m128 ysin1 = _mm_andnot_ps(xmm3, y);
+ y2 = _mm_sub_ps(y2,ysin2);
+ y = _mm_sub_ps(y, ysin1);
+
+ xmm1 = _mm_add_ps(ysin1,ysin2);
+ xmm2 = _mm_add_ps(y,y2);
+
+ /* update the sign */
+ *s = _mm_xor_ps(xmm1, sign_bit_sin);
+ *c = _mm_xor_ps(xmm2, sign_bit_cos);
+ _mm_empty(); /* good-bye mmx */
+}
+
+
+__m128 log_ps(__m128 x) {
+ __m64 mm0, mm1;
+ __m128 one = *(__m128*)_ps_1;
+
+ __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
+
+ x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos); /* cut off denormalized stuff */
+
+
+ /* part 1: x = frexpf(x, &e); */
+ COPY_XMM_TO_MM(x, mm0, mm1);
+ mm0 = _mm_srli_pi32(mm0, 23);
+ mm1 = _mm_srli_pi32(mm1, 23);
+ /* keep only the fractional part */
+ x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask);
+ x = _mm_or_ps(x, *(__m128*)_ps_0p5);
+
+ /* now e=mm0:mm1 contain the really base-2 exponent */
+ mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f);
+
+
+ mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f);
+
+ __m128 e = _mm_cvtpi32x2_ps(mm0, mm1);
+ e = _mm_add_ps(e, one);
+
+ /* part2:
+ if( x < SQRTHF ) {
+ e -= 1;
+ x = x + x - 1.0;
+ } else { x = x - 1.0; }
+ */
+ __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF);
+
+ __m128 tmp = _mm_and_ps(x, mask);
+ x = _mm_sub_ps(x, one);
+ e = _mm_sub_ps(e, _mm_and_ps(one, mask));
+ x = _mm_add_ps(x, tmp);
+
+
+ __m128 z = _mm_mul_ps(x,x);
+
+ __m128 y = *(__m128*)_ps_cephes_log_p0;
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1);
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2);
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3);
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4);
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5);
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6);
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7);
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8);
+ y = _mm_mul_ps(y, x);
+
+ y = _mm_mul_ps(y, z);
+
+
+ tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1);
+ y = _mm_add_ps(y, tmp);
+
+
+ tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
+ y = _mm_sub_ps(y, tmp);
+
+ tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2);
+ x = _mm_add_ps(x, y);
+ x = _mm_add_ps(x, tmp);
+ x = _mm_or_ps(x, invalid_mask); /* negative arg will be NAN */
+ _mm_empty();
+ return x;
+}
+
+__m128 exp_ps(__m128 x) {
+ __m128 tmp = _mm_setzero_ps(), fx;
+ __m64 mm0, mm1;
+ __m128 one = *(__m128*)_ps_1;
+
+ x = _mm_min_ps(x, *(__m128*)_ps_exp_hi);
+ x = _mm_max_ps(x, *(__m128*)_ps_exp_lo);
+
+ /* express exp(x) as exp(g + n*log(2)) */
+ fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF);
+ fx = _mm_add_ps(fx, *(__m128*)_ps_0p5);
+
+ /* how to perform a floorf with SSE: just below */
+ /* step 1 : cast to int */
+ tmp = _mm_movehl_ps(tmp, fx);
+ mm0 = _mm_cvttps_pi32(fx);
+ mm1 = _mm_cvttps_pi32(tmp);
+ /* step 2 : cast back to float */
+ tmp = _mm_cvtpi32x2_ps(mm0, mm1);
+ /* if greater, substract 1 */
+ __m128 mask = _mm_cmpgt_ps(tmp, fx);
+ mask = _mm_and_ps(mask, one);
+ fx = _mm_sub_ps(tmp, mask);
+
+ tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1);
+ __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2);
+ x = _mm_sub_ps(x, tmp);
+ x = _mm_sub_ps(x, z);
+
+ z = _mm_mul_ps(x,x);
+
+ __m128 y = *(__m128*)_ps_cephes_exp_p0;
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1);
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2);
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3);
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4);
+ y = _mm_mul_ps(y, x);
+ y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5);
+ y = _mm_mul_ps(y, z);
+ y = _mm_add_ps(y, x);
+ y = _mm_add_ps(y, one);
+
+ /* build 2^n */
+ z = _mm_movehl_ps(z, fx);
+ mm0 = _mm_cvttps_pi32(fx);
+ mm1 = _mm_cvttps_pi32(z);
+ mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f);
+ mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f);
+ mm0 = _mm_slli_pi32(mm0, 23);
+ mm1 = _mm_slli_pi32(mm1, 23);
+
+ __m128 pow2n;
+ COPY_MM_TO_XMM(mm0, mm1, pow2n);
+
+ y = _mm_mul_ps(y, pow2n);
+ _mm_empty();
+ return y;
+}
+
+
+int
+calc_gb_rad_still_sse(t_commrec *cr, t_forcerec *fr,int natoms, gmx_mtop_t *mtop,
+ const t_atomtypes *atype, real *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
+{
+
+ int i,k,n,ai,ai3,aj1,aj2,aj3,aj4,nj0,nj1,offset;
+ int aj13,aj23,aj33,aj43;
+ int at0,at1;
+ real *sum_gpi;
+
+ real gpi_ai,gpi2,gpi_tmp;
+ real factor;
+
+ __m128 ix,iy,iz;
+ __m128 jx,jy,jz;
+ __m128 dx,dy,dz;
+ __m128 t1,t2,t3;
+ __m128 rsq11,rinv,rinv2,rinv4,rinv6;
+ __m128 ratio,gpi,rai,raj,vaj,rvdw,mask_cmp;
+ __m128 ccf,dccf,theta,cosq,term,sinq,res,prod;
+ __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
+
+ __m128i mask, maski;
+
+ const __m128 half = {0.5f , 0.5f , 0.5f , 0.5f };
+ const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
+ const __m128 one = {1.0f, 1.0f , 1.0f , 1.0f };
+ const __m128 two = {2.0f , 2.0f , 2.0f, 2.0f };
+ const __m128 zero = {0.0f , 0.0f , 0.0f , 0.0f };
+ const __m128 four = {4.0f , 4.0f , 4.0f , 4.0f };
+
+ const __m128 p5inv = {STILL_P5INV, STILL_P5INV, STILL_P5INV, STILL_P5INV};
+ const __m128 pip5 = {STILL_PIP5, STILL_PIP5, STILL_PIP5, STILL_PIP5};
+ const __m128 p4 = {STILL_P4, STILL_P4, STILL_P4, STILL_P4};
+
+ if(PAR(cr))
+ {
+ pd_at_range(cr,&at0,&at1);
+ }
+ else
+ {
+ at0=0;
+ at1=natoms;
+ }
+
+ sum_gpi = born->work;
+ factor = 0.5 * ONE_4PI_EPS0;
+
+ /* keep the compiler happy */
+ t1 = _mm_setzero_ps();
+ t2 = _mm_setzero_ps();
+ t3 = _mm_setzero_ps();
+
+ aj1 = aj2 = aj3 = aj4 = 0;
+ aj13 = aj23 = aj33 = aj43 = 0;
+ n = 0;
+
+ for(i=0;i<natoms;i++)
+ born->bRad[i]=fr->invsqrta[i]=1;
+
+ for(i=0;i<nl->nri;i++)
+ {
+ ai = i;
+ ai3 = ai*3;
+
+ nj0 = nl->jindex[ai];
+ nj1 = nl->jindex[ai+1];
+
+ offset = (nj1-nj0)%4;
+
+ /* Polarization energy for atom ai */
+ gpi_ai = born->gpol[ai];
+ gpi = _mm_set1_ps(0.0);
+
+ /* Load particle ai coordinates */
+ ix = _mm_set1_ps(x[ai3]);
+ iy = _mm_set1_ps(x[ai3+1]);
+ iz = _mm_set1_ps(x[ai3+2]);
+
+ /* Load particle ai gb_radius */
+ rai = _mm_set1_ps(mtop->atomtypes.gb_radius[md->typeA[ai]]);
+
+ if(PAR(cr))
+ sum_gpi[ai] = 0;
+
+ for(k=nj0;k<nj1-offset;k+=4)
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+ aj3 = nl->jjnr[k+2];
+ aj4 = nl->jjnr[k+3];
+
+ aj13 = aj1 * 3;
+ aj23 = aj2 * 3;
+ aj33 = aj3 * 3;
+ aj43 = aj4 * 3;
+
+ /* Load particle aj1-4 and transpose */
+ xmm1 = _mm_loadu_ps(x+aj13);
+ xmm2 = _mm_loadu_ps(x+aj23);
+ xmm3 = _mm_loadu_ps(x+aj33);
+ xmm4 = _mm_loadu_ps(x+aj43);
+
+ xmm5 = _mm_unpacklo_ps(xmm1, xmm2);
+ xmm6 = _mm_unpacklo_ps(xmm3, xmm4);
+ xmm7 = _mm_unpackhi_ps(xmm1, xmm2);
+ xmm8 = _mm_unpackhi_ps(xmm3, xmm4);
+
+ jx = _mm_movelh_ps(xmm5, xmm6);
+ jy = _mm_unpackhi_ps(xmm5, xmm6);
+ jy = _mm_shuffle_ps(jy,jy,_MM_SHUFFLE(3,1,2,0));
+ jz = _mm_movelh_ps(xmm7, xmm8);
+
+ dx = _mm_sub_ps(ix, jx);
+ dy = _mm_sub_ps(iy, jy);
+ dz = _mm_sub_ps(iz, jz);
+
+ t1 = _mm_mul_ps(dx,dx);
+ t2 = _mm_mul_ps(dy,dy);
+ t3 = _mm_mul_ps(dz,dz);
+
+ rsq11 = _mm_add_ps(t1,t2);
+ rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare */
+
+ /* Perform reciprocal square root lookup, 12 bits accuracy */
+ t1 = _mm_rsqrt_ps(rsq11); /* t1=lookup, r2=x */
+ /* Newton-Rhapson iteration */
+ t2 = _mm_mul_ps(t1,t1); /* lu*lu */
+ t3 = _mm_mul_ps(rsq11,t2); /* x*lu*lu */
+ t3 = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+ t3 = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+ rinv = _mm_mul_ps(half,t3); /* result for all four particles */
+
+ rinv2 = _mm_mul_ps(rinv,rinv);
+ rinv4 = _mm_mul_ps(rinv2,rinv2);
+ rinv6 = _mm_mul_ps(rinv2,rinv4);
+
+ xmm1 = _mm_load_ss(born->vsolv+aj1); /*see comment at invsqrta*/
+ xmm2 = _mm_load_ss(born->vsolv+aj2);
+ xmm3 = _mm_load_ss(born->vsolv+aj3);
+ xmm4 = _mm_load_ss(born->vsolv+aj4);
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /*j1 j1 j2 j2*/
+ xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /*j3 j3 j4 j4*/
+ vaj = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+
+ raj = _mm_set_ps(mtop->atomtypes.gb_radius[md->typeA[aj4]],
+ mtop->atomtypes.gb_radius[md->typeA[aj3]],
+ mtop->atomtypes.gb_radius[md->typeA[aj2]],
+ mtop->atomtypes.gb_radius[md->typeA[aj1]]);
+
+ rvdw = _mm_add_ps(rai,raj);
+ rvdw = _mm_mul_ps(rvdw,rvdw);
+ ratio = _mm_div_ps(rsq11,rvdw); /*ratio = dr2/(rvdw*rvdw)*/
+
+ mask_cmp = _mm_cmpgt_ps(ratio,p5inv); /*if ratio>p5inv */
+
+ switch(_mm_movemask_ps(mask_cmp))
+ {
+ case 0xF:
+ ccf = one;
+ dccf = zero;
+ break;
+ default:
+
+ theta = _mm_mul_ps(ratio,pip5);
+ sincos_ps(theta,&sinq,&cosq); /* sine and cosine */
+ term = _mm_sub_ps(one,cosq); /*1-cosq*/
+ term = _mm_mul_ps(half,term); /*0.5*(1.0-cosq)*/
+ ccf = _mm_mul_ps(term,term); /* term*term */
+ dccf = _mm_mul_ps(two,term); /* 2 * term */
+ dccf = _mm_mul_ps(dccf,sinq); /* 2*term*sinq */
+ dccf = _mm_mul_ps(dccf,pip5); /*2*term*sinq*pip5 */
+ dccf = _mm_mul_ps(dccf,ratio); /*dccf = 2*term*sinq*PIP5*ratio*/
+
+ ccf = (mask_cmp & one) | _mm_andnot_ps(mask_cmp,ccf); /*conditional as a mask*/
+ dccf = (mask_cmp & zero) | _mm_andnot_ps(mask_cmp,dccf);
+
+ }
+
+ prod = _mm_mul_ps(p4,vaj);
+ xmm2 = _mm_mul_ps(ccf,rinv4);
+ xmm2 = _mm_mul_ps(xmm2,prod); /*prod*ccf*idr4*/
+ gpi = _mm_add_ps(gpi,xmm2); /* gpi = gpi + prod*ccf*idr4 */
+
+ /* Chain rule terms */
+ ccf = _mm_mul_ps(four,ccf);
+ xmm3 = _mm_sub_ps(ccf,dccf);
+ xmm3 = _mm_mul_ps(xmm3,rinv6);
+ xmm1 = _mm_mul_ps(xmm3,prod);
+
+ _mm_storeu_ps(fr->dadx+n, xmm1);
+
+ n = n + 4;
+ }
+
+ /* deal with odd elements*/
+ if(offset!=0)
+ {
+ aj1=aj2=aj3=aj4=0;
+
+ if(offset==1)
+ {
+ aj1 = nl->jjnr[k]; /*jnr1-4*/
+ aj13 = aj1 * 3; /*Replace jnr with j3*/
+
+ xmm1 = _mm_loadu_ps(x+aj13);
+
+ xmm6 = xmm1; /* x1 - - - */
+ xmm4 = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(2,3,0,1)); /* y1 - - - */
+ xmm5 = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(3,0,1,2)); /* z1 - - - */
+
+ raj = _mm_set_ps(0.0f, 0.0f, 0.0f, mtop->atomtypes.gb_radius[md->typeA[aj1]]);
+ vaj = _mm_set_ps(0.0f, 0.0f, 0.0f, born->vsolv[aj1]);
+
+ mask = _mm_set_epi32(0,0,0,0xffffffff);
+
+ }
+ else if(offset==2)
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+
+ aj13 = aj1 * 3;
+ aj23 = aj2 * 3;
+
+ xmm1 = _mm_loadu_ps(x+aj13);
+ xmm2 = _mm_loadu_ps(x+aj23);
+
+ xmm6 = _mm_unpacklo_ps(xmm1, xmm2); /*x1,x2,y1,y2 , done x */
+ xmm4 = _mm_shuffle_ps( xmm6, xmm6, _MM_SHUFFLE(3,2,3,2)); /* y1, y2, y1, y2, done y */
+ xmm5 = _mm_unpackhi_ps(xmm1, xmm2); /* z1, z2, z1, z2 , done z */
+
+ raj = _mm_set_ps(0.0f, 0.0f, mtop->atomtypes.gb_radius[md->typeA[aj2]],mtop->atomtypes.gb_radius[md->typeA[aj1]]);
+ vaj = _mm_set_ps(0.0f, 0.0f, born->vsolv[aj2], born->vsolv[aj1]);
+
+ mask = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+
+ }
+ else
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+ aj3 = nl->jjnr[k+2];
+
+ aj13 = aj1 * 3;
+ aj23 = aj2 * 3;
+ aj33 = aj3 * 3;
+
+ xmm1 = _mm_loadu_ps(x+aj13);
+ xmm2 = _mm_loadu_ps(x+aj23);
+ xmm3 = _mm_loadu_ps(x+aj33);
+
+ xmm5 = _mm_unpacklo_ps(xmm1, xmm2); /* x1, x2, y1, y2 */
+ xmm6 = _mm_movelh_ps(xmm5, xmm3); /* x1, x2, x3, y3, done x */
+
+ xmm4 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(3,2,3,2)); /*y1, y2, x2, x1 */
+ xmm4 = _mm_shuffle_ps(xmm4, xmm3, _MM_SHUFFLE(0,1,1,0)); /*y1, y2, y3, x3, done y*/
+
+ xmm5 = _mm_unpackhi_ps(xmm1, xmm2); /* z1, z2, -, - */
+ xmm5 = _mm_shuffle_ps(xmm5, xmm3, _MM_SHUFFLE(3,2,1,0)); /*z1, z2, z3, x4, done z */
+
+ raj = _mm_set_ps(0.0f,
+ mtop->atomtypes.gb_radius[md->typeA[aj3]],
+ mtop->atomtypes.gb_radius[md->typeA[aj2]],
+ mtop->atomtypes.gb_radius[md->typeA[aj1]]);
+
+ vaj = _mm_set_ps(0.0f,
+ born->vsolv[aj3],
+ born->vsolv[aj2],
+ born->vsolv[aj1]);
+
+ mask = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+ }
+
+ jx = _mm_and_ps( (__m128) mask, xmm6);
+ jy = _mm_and_ps( (__m128) mask, xmm4);
+ jz = _mm_and_ps( (__m128) mask, xmm5);
+
+ dx = _mm_sub_ps(ix, jx);
+ dy = _mm_sub_ps(iy, jy);
+ dz = _mm_sub_ps(iz, jz);
+
+ t1 = _mm_mul_ps(dx,dx);
+ t2 = _mm_mul_ps(dy,dy);
+ t3 = _mm_mul_ps(dz,dz);
+
+ rsq11 = _mm_add_ps(t1,t2);
+ rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare*/
+
+ /* Perform reciprocal square root lookup, 12 bits accuracy */
+ t1 = _mm_rsqrt_ps(rsq11); /* t1=lookup, r2=x */
+ /* Newton-Rhapson iteration */
+ t2 = _mm_mul_ps(t1,t1); /* lu*lu */
+ t3 = _mm_mul_ps(rsq11,t2); /* x*lu*lu */
+ t3 = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+ t3 = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+ rinv = _mm_mul_ps(half,t3); /* result for all four particles */
+
+ rinv2 = _mm_mul_ps(rinv,rinv);
+ rinv4 = _mm_mul_ps(rinv2,rinv2);
+ rinv6 = _mm_mul_ps(rinv2,rinv4);
+
+ rvdw = _mm_add_ps(rai,raj);
+ rvdw = _mm_mul_ps(rvdw,rvdw);
+ ratio = _mm_div_ps(rsq11,rvdw); /*ratio = dr2/(rvdw*rvdw)*/
+
+ mask_cmp = _mm_cmpgt_ps(ratio,p5inv); /*if ratio>p5inv*/
+
+ switch(_mm_movemask_ps(mask_cmp))
+ {
+ case 0xF:
+ ccf = one;
+ dccf = zero;
+ break;
+ default:
+
+ theta = _mm_mul_ps(ratio,pip5);
+ sincos_ps(theta,&sinq,&cosq);
+ term = _mm_sub_ps(one,cosq);
+ term = _mm_mul_ps(half,term);
+ ccf = _mm_mul_ps(term,term);
+ dccf = _mm_mul_ps(two,term);
+ dccf = _mm_mul_ps(dccf,sinq);
+ dccf = _mm_mul_ps(dccf,pip5);
+ dccf = _mm_mul_ps(dccf,theta);
+
+ ccf = (mask_cmp & one) | _mm_andnot_ps(mask_cmp,ccf); /*conditional as a mask*/
+ dccf = (mask_cmp & zero) | _mm_andnot_ps(mask_cmp,dccf);
+ }
+
+ prod = _mm_mul_ps(p4,vaj);
+ xmm2 = _mm_mul_ps(ccf,rinv4);
+ xmm2 = _mm_mul_ps(xmm2,prod); /* prod*ccf*idr4*/
+ xmm2 = _mm_and_ps( (__m128) mask, xmm2);
+ gpi = _mm_add_ps(gpi,xmm2); /*gpi = gpi + prod*ccf*idr4 */
+
+ /* Chain rule terms */
+ ccf = _mm_mul_ps(four,ccf);
+ xmm3 = _mm_sub_ps(ccf,dccf);
+ xmm3 = _mm_mul_ps(xmm3,rinv6);
+ xmm1 = _mm_mul_ps(xmm3,prod);
+ xmm1 = _mm_and_ps( (__m128) mask, xmm1);
+
+ _mm_storeu_ps(fr->dadx+n, xmm1);
+
+ n = n + offset;
+
+
+ }
+
+ /* gpi now contains four partial terms that need to be added to particle ai gpi*/
+ xmm2 = _mm_movehl_ps(xmm2,gpi);
+ gpi = _mm_add_ps(gpi,xmm2);
+ xmm2 = _mm_shuffle_ps(gpi,gpi,_MM_SHUFFLE(1,1,1,1));
+ gpi = _mm_add_ss(gpi,xmm2);
+
+ _mm_store_ss(&gpi_tmp,gpi);
+
+ if(PAR(cr))
+ {
+ sum_gpi[ai]=gpi_tmp;
+ }
+ else
+ {
+ gpi_ai = gpi_ai + gpi_tmp; /* add gpi to the initial pol energy gpi_ai*/
+ gpi2 = gpi_ai * gpi_ai;
+
+ born->bRad[ai]=factor*invsqrt(gpi2);
+ fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+ }
+
+ }
+
+ if(PAR(cr))
+ {
+ gmx_sum(natoms,sum_gpi,cr);
+
+ for(i=at0;i<at1;i++)
+ {
+ ai = i;
+ gpi_ai = born->gpol[ai];
+ gpi_ai = gpi_ai + sum_gpi[ai];
+ gpi2 = gpi_ai*gpi_ai;
+
+ born->bRad[ai]=factor*invsqrt(gpi2);
+ fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+ }
+
+ /* Communicate Born radii*/
+ gb_pd_send(cr,born->bRad,md->nr);
+ gb_pd_send(cr,fr->invsqrta,md->nr);
+ }
+
+ return 0;
+
+
+}
+
+int
+calc_gb_rad_hct_sse(t_commrec *cr, t_forcerec *fr, int natoms, gmx_mtop_t *mtop, const t_atomtypes *atype, real *x,
+ t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
+{
+ int i,k,n,ai,ai3,aj1,aj2,aj3,aj4,nj0,nj1,at0,at1;
+ int aj13,aj23,aj33,aj43;
+ int offset;
+ real ri,rr,sum,sum_tmp,sum_ai,min_rad,rad;
+ real doffset;
+ real *sum_mpi;
+
+ __m128 ix,iy,iz,jx,jy,jz;
+ __m128 dx,dy,dz,t1,t2,t3;
+ __m128 rsq11,rinv,r,rai;
+ __m128 rai_inv,sk,sk2,lij,dlij,duij;
+ __m128 uij,lij2,uij2,lij3,uij3,diff2;
+ __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
+ __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
+ __m128 mask_cmp,mask_cmp2,mask_cmp3;
+
+ __m128i maski;
+
+ const __m128 neg = {-1.0f , -1.0f , -1.0f , -1.0f };
+ const __m128 zero = {0.0f , 0.0f , 0.0f , 0.0f };
+ const __m128 eigth = {0.125f , 0.125f , 0.125f , 0.125f };
+ const __m128 qrtr = {0.25f , 0.25f , 0.25f , 0.25f };
+ const __m128 half = {0.5f , 0.5f , 0.5f , 0.5f };
+ const __m128 one = {1.0f , 1.0f , 1.0f , 1.0f };
+ const __m128 two = {2.0f , 2.0f , 2.0f , 2.0f };
+ const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
+
+ if(PAR(cr))
+ {
+ pd_at_range(cr,&at0,&at1);
+ }
+ else
+ {
+ at0=0;
+ at1=natoms;
+ }
+
+ doffset = born->gb_doffset;
+ sum_mpi = born->work;
+
+ for(i=0;i<natoms;i++)
+ born->bRad[i]=fr->invsqrta[i]=1;
+
+ for(i=0;i<nl->nri;i++)
+ {
+ ai = nl->iinr[i];
+ ai3 = ai*3;
+
+ nj0 = nl->jindex[ai];
+ nj1 = nl->jindex[ai+1];
+
+ offset = (nj1-nj0)%4;
+
+ /* Load rai*/
+ rr = mtop->atomtypes.gb_radius[md->typeA[ai]]-doffset;
+ rai = _mm_load1_ps(&rr);
+
+ /* Load cumulative sum*/
+ sum = 1.0/rr;
+ rai_inv = _mm_load1_ps(&sum);
+
+ /* Load ai coordinates*/
+ ix = _mm_load1_ps(x+ai3);
+ iy = _mm_load1_ps(x+ai3+1);
+ iz = _mm_load1_ps(x+ai3+2);
+
+ if(PAR(cr))
+ {
+ /* Only have the master node do this, since we only want one value at one time */
+ if(MASTER(cr))
+ sum_mpi[ai]=sum_ai;
+ else
+ sum_mpi[ai]=0;
+ }
+
+ for(k=nj0;k<nj1-offset;k+=4)
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+ aj3 = nl->jjnr[k+2];
+ aj4 = nl->jjnr[k+3];
+
+ aj13 = aj1 * 3;
+ aj23 = aj2 * 3;
+ aj33 = aj3 * 3;
+ aj43 = aj4 * 3;
+
+ /* Load particle aj1-4 and transpose*/
+ xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
+ xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
+ xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33));
+ xmm4 = _mm_loadh_pi(xmm4,(__m64 *) (x+aj43));
+
+ xmm5 = _mm_load1_ps(x+aj13+2);
+ xmm6 = _mm_load1_ps(x+aj23+2);
+ xmm7 = _mm_load1_ps(x+aj33+2);
+ xmm8 = _mm_load1_ps(x+aj43+2);
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+ xmm6 = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
+ jz = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+ xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
+ jx = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
+ jy = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
+
+ dx = _mm_sub_ps(ix, jx);
+ dy = _mm_sub_ps(iy, jy);
+ dz = _mm_sub_ps(iz, jz);
+
+ t1 = _mm_mul_ps(dx,dx);
+ t2 = _mm_mul_ps(dy,dy);
+ t3 = _mm_mul_ps(dz,dz);
+
+ rsq11 = _mm_add_ps(t1,t2);
+ rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare*/
+
+ /* Perform reciprocal square root lookup, 8 bits accuracy */
+ t1 = _mm_rsqrt_ps(rsq11); /* t1=lookup, r2=x */
+ /* Newton-Rhapson iteration to get 12 bits correct*/
+ t2 = _mm_mul_ps(t1,t1); /* lu*lu */
+ t3 = _mm_mul_ps(rsq11,t2); /* x*lu*lu */
+ t3 = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+ t3 = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+ rinv = _mm_mul_ps(half,t3); /* result for all four particles */
+
+ r = _mm_mul_ps(rinv,rsq11);
+
+ xmm1 = _mm_load_ss(born->param+aj1);
+ xmm2 = _mm_load_ss(born->param+aj2);
+ xmm3 = _mm_load_ss(born->param+aj3);
+ xmm4 = _mm_load_ss(born->param+aj4);
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /*j1 j1 j2 j2*/
+ xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /*j3 j3 j4 j4*/
+ sk = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+
+ /* conditional mask for rai<dr+sk*/
+ xmm1 = _mm_add_ps(r,sk);
+ mask_cmp = _mm_cmplt_ps(rai,xmm1);
+
+ /* conditional for rai>dr-sk, ends with mask_cmp2*/
+ xmm2 = _mm_sub_ps(r,sk); /*xmm2 = dr-sk*/
+
+ xmm3 = _mm_rcp_ps(xmm2); /*1.0/(dr-sk), 12 bits accuracy*/
+ t1 = _mm_mul_ps(xmm3,xmm2);
+ t1 = _mm_sub_ps(two,t1);
+ xmm3 = _mm_mul_ps(t1,xmm3);
+
+ xmm4 = rai_inv;
+ xmm5 = zero;
+
+ mask_cmp2 = _mm_cmpgt_ps(rai,xmm2); /*rai>dr-sk */
+ lij = (mask_cmp2 & xmm4) | _mm_andnot_ps(mask_cmp2, xmm3);
+ dlij = (mask_cmp2 & xmm5) | _mm_andnot_ps(mask_cmp2, one);
+
+ uij = _mm_rcp_ps(xmm1); /* better approximation than just _mm_rcp_ps, which is just 12 bits*/
+ t1 = _mm_mul_ps(uij,xmm1);
+ t1 = _mm_sub_ps(two,t1);
+ uij = _mm_mul_ps(t1,uij);
+
+ lij2 = _mm_mul_ps(lij,lij);
+ lij3 = _mm_mul_ps(lij2,lij);
+ uij2 = _mm_mul_ps(uij,uij);
+ uij3 = _mm_mul_ps(uij2,uij);
+
+ diff2 = _mm_sub_ps(uij2,lij2);
+
+ /* Perform reciprocal square root lookup, 12 bits accuracy */
+ t1 = _mm_rsqrt_ps(lij2); /* t1=lookup, r2=x */
+ /* Newton-Rhapson iteration */
+ t2 = _mm_mul_ps(t1,t1); /* lu*lu */
+ t3 = _mm_mul_ps(lij2,t2); /* x*lu*lu */
+ t3 = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+ t3 = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+ lij_inv = _mm_mul_ps(half,t3); /* result for all four particles */
+
+ sk2 = _mm_mul_ps(sk,sk);
+ sk2_inv = _mm_mul_ps(sk2,rinv);
+ prod = _mm_mul_ps(qrtr,sk2_inv);
+
+ log_term = _mm_mul_ps(uij,lij_inv);
+ log_term = log_ps(log_term);
+
+ xmm1 = _mm_sub_ps(lij,uij);
+ xmm2 = _mm_mul_ps(qrtr,r); /* 0.25*dr */
+ xmm2 = _mm_mul_ps(xmm2,diff2); /* 0.25*dr*prod */
+ xmm1 = _mm_add_ps(xmm1,xmm2); /* lij-uij + 0.25*dr*diff2 */
+ xmm2 = _mm_mul_ps(half,rinv); /* 0.5*rinv */
+ xmm2 = _mm_mul_ps(xmm2,log_term); /* 0.5*rinv*log_term */
+ xmm1 = _mm_add_ps(xmm1,xmm2); /* lij-uij+0.25*dr*diff2+0.5*rinv*log_term */
+ xmm8 = _mm_mul_ps(neg,diff2); /* (-1)*diff2 */
+ xmm2 = _mm_mul_ps(xmm8,prod); /* (-1)*diff2*prod */
+ tmp = _mm_add_ps(xmm1,xmm2); /* done tmp-term */
+
+ /* contitional for rai<sk-dr */
+ xmm3 = _mm_sub_ps(sk,r);
+ mask_cmp3 = _mm_cmplt_ps(rai,xmm3); /* rai<sk-dr */
+
+ xmm4 = _mm_sub_ps(rai_inv,lij);
+ xmm4 = _mm_mul_ps(two,xmm4);
+ xmm4 = _mm_add_ps(tmp,xmm4);
+
+ tmp = (mask_cmp3 & xmm4) | _mm_andnot_ps(mask_cmp3,tmp); /* xmm1 will now contain four tmp values */
+
+ /* the tmp will now contain four partial values, that not all are to be used. Which */
+ /* ones are governed by the mask_cmp mask. */
+ tmp = _mm_mul_ps(half,tmp); /* 0.5*tmp */
+ tmp = (mask_cmp & tmp) | _mm_andnot_ps(mask_cmp, zero);
+ tmp_sum = _mm_add_ps(tmp_sum,tmp);
+
+ duij = one;
+
+ xmm2 = _mm_mul_ps(half,lij2);
+ xmm3 = _mm_mul_ps(prod,lij3);
+ xmm2 = _mm_add_ps(xmm2,xmm3);
+ xmm3 = _mm_mul_ps(lij,rinv);
+ xmm4 = _mm_mul_ps(lij3,r);
+ xmm3 = _mm_add_ps(xmm3,xmm4);
+ xmm3 = _mm_mul_ps(qrtr,xmm3);
+ t1 = _mm_sub_ps(xmm2,xmm3);
+
+ xmm2 = _mm_mul_ps(half,uij2);
+ xmm2 = _mm_mul_ps(neg,xmm2);
+ xmm3 = _mm_mul_ps(qrtr,sk2_inv);
+ xmm3 = _mm_mul_ps(xmm3,uij3);
+ xmm2 = _mm_sub_ps(xmm2,xmm3);
+ xmm3 = _mm_mul_ps(uij,rinv);
+ xmm4 = _mm_mul_ps(uij3,r);
+ xmm3 = _mm_add_ps(xmm3,xmm4);
+ xmm3 = _mm_mul_ps(qrtr,xmm3);
+ t2 = _mm_add_ps(xmm2,xmm3);
+
+ xmm2 = _mm_mul_ps(sk2_inv,rinv);
+ xmm2 = _mm_add_ps(one,xmm2); /*1+sk2_rinv*rinv */
+ xmm2 = _mm_mul_ps(eigth,xmm2); /*0.125*(1+sk2_rinv*rinv) */
+ xmm2 = _mm_mul_ps(xmm2,xmm8); /*0.125*(1+sk2_rinv*rinv)*(-diff2) */
+ xmm3 = _mm_mul_ps(log_term, rinv); /*log_term*rinv */
+ xmm3 = _mm_mul_ps(xmm3,rinv); /*log_term*rinv*rinv */
+ xmm3 = _mm_mul_ps(qrtr,xmm3); /*0.25*log_term*rinv*rinv */
+ t3 = _mm_add_ps(xmm2,xmm3); /* done t3 */
+
+ /* chain rule terms */
+ xmm2 = _mm_mul_ps(dlij,t1); /*dlij*t1 */
+ xmm3 = _mm_mul_ps(duij,t2); /*duij*t2 */
+ xmm2 = _mm_add_ps(xmm2,xmm3);/*dlij*t1+duij*t2 */
+ xmm2 = _mm_add_ps(xmm2,t3);
+ xmm2 = _mm_mul_ps(xmm2,rinv);
+
+ _mm_storeu_ps(fr->dadx+n,xmm2);
+
+ n = n + 4;
+
+ }
+
+ if(offset!=0)
+ {
+ aj1=aj2=aj3=aj4=0;
+
+ if(offset==1)
+ {
+ aj1 = nl->jjnr[k];
+ aj13 = aj1 * 3;
+
+ xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (x+aj13));
+ xmm5 = _mm_load1_ps(x+aj13+2);
+
+ xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
+ xmm4 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+
+ sk = _mm_load1_ps(born->param+aj1);
+
+ maski = _mm_set_epi32(0,0,0,0xffffffff);
+ }
+ else if(offset==2)
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+
+ aj13 = aj1 * 3;
+ aj23 = aj2 * 3;
+
+ xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (x+aj13));
+ xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (x+aj23));
+
+ xmm5 = _mm_load1_ps(x+aj13+2);
+ xmm6 = _mm_load1_ps(x+aj23+2);
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+ xmm5 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+ xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+ xmm4 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
+
+ xmm1 = _mm_load1_ps(born->param+aj1);
+ xmm2 = _mm_load1_ps(born->param+aj2);
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+ sk = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+
+ maski = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+ }
+ else
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+ aj3 = nl->jjnr[k+2];
+
+ aj13 = aj1 * 3;
+ aj23 = aj2 * 3;
+ aj33 = aj3 * 3;
+
+ xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
+ xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
+ xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33));
+
+ xmm5 = _mm_load1_ps(x+aj13+2);
+ xmm6 = _mm_load1_ps(x+aj23+2);
+ xmm7 = _mm_load1_ps(x+aj33+2);
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
+ xmm5 = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2));
+ xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2));
+
+ xmm6 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0));
+ xmm4 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1));
+
+ xmm1 = _mm_load1_ps(born->param+aj1);
+ xmm2 = _mm_load1_ps(born->param+aj2);
+ xmm3 = _mm_load1_ps(born->param+aj3);
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /*j1 j1 j2 j2*/
+ xmm3 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0)); /*j3 j3 j3 j3*/
+ sk = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+
+
+ maski = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+ }
+
+ jx = _mm_and_ps( (__m128) maski, xmm6);
+ jy = _mm_and_ps( (__m128) maski, xmm4);
+ jz = _mm_and_ps( (__m128) maski, xmm5);
+
+ sk = _mm_and_ps ( (__m128) maski, sk);
+
+ dx = _mm_sub_ps(ix, jx);
+ dy = _mm_sub_ps(iy, jy);
+ dz = _mm_sub_ps(iz, jz);
+
+ t1 = _mm_mul_ps(dx,dx);
+ t2 = _mm_mul_ps(dy,dy);
+ t3 = _mm_mul_ps(dz,dz);
+
+ rsq11 = _mm_add_ps(t1,t2);
+ rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare*/
+
+ /* Perform reciprocal square root lookup, 12 bits accuracy */
+ t1 = _mm_rsqrt_ps(rsq11); /* t1=lookup, r2=x */
+ /* Newton-Rhapson iteration */
+ t2 = _mm_mul_ps(t1,t1); /* lu*lu */
+ t3 = _mm_mul_ps(rsq11,t2); /* x*lu*lu */
+ t3 = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+ t3 = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+ rinv = _mm_mul_ps(half,t3); /* result for all four particles */
+
+ r = _mm_mul_ps(rinv,rsq11);
+
+ xmm1 = _mm_add_ps(r,sk); /*dr+sk */
+
+ /* conditional mask for rai<dr+sk */
+ mask_cmp = _mm_cmplt_ps(rai,xmm1);
+
+ /* conditional for rai>dr-sk, ends with mask_cmp2 */
+ xmm2 = _mm_sub_ps(r,sk); /*xmm2 = dr-sk */
+
+ xmm3 = _mm_rcp_ps(xmm2); /*1.0/(dr-sk)*/
+ t1 = _mm_mul_ps(xmm3,xmm2);
+ t1 = _mm_sub_ps(two,t1);
+ xmm3 = _mm_mul_ps(t1,xmm3);
+
+ xmm4 = rai_inv;
+ xmm5 = zero;
+
+ mask_cmp2 = _mm_cmpgt_ps(rai,xmm2); /*rai>dr-sk */
+ lij = (mask_cmp2 & xmm4) | _mm_andnot_ps(mask_cmp2, xmm3);
+ dlij = (mask_cmp2 & xmm5) | _mm_andnot_ps(mask_cmp2, one);
+
+ uij = _mm_rcp_ps(xmm1);
+ t1 = _mm_mul_ps(uij,xmm1);
+ t1 = _mm_sub_ps(two,t1);
+ uij = _mm_mul_ps(t1,uij);
+
+ lij2 = _mm_mul_ps(lij,lij);
+ lij3 = _mm_mul_ps(lij2,lij);
+ uij2 = _mm_mul_ps(uij,uij);
+ uij3 = _mm_mul_ps(uij2,uij);
+
+ diff2 = _mm_sub_ps(uij2,lij2);
+
+ t1 = _mm_rsqrt_ps(lij2); /* t1=lookup, r2=x */
+ /* Newton-Rhapson iteration */
+ t2 = _mm_mul_ps(t1,t1); /* lu*lu */
+ t3 = _mm_mul_ps(lij2,t2); /* x*lu*lu */
+ t3 = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+ t3 = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+ lij_inv = _mm_mul_ps(half,t3); /* result for all four particles */
+
+ sk2 = _mm_mul_ps(sk,sk);
+ sk2_inv = _mm_mul_ps(sk2,rinv);
+ prod = _mm_mul_ps(qrtr,sk2_inv);
+
+ log_term = _mm_mul_ps(uij,lij_inv);
+ log_term = log_ps(log_term);
+
+ xmm1 = _mm_sub_ps(lij,uij);
+ xmm2 = _mm_mul_ps(qrtr,r); /* 0.25*dr */
+ xmm2 = _mm_mul_ps(xmm2,diff2); /*0.25*dr*prod */
+ xmm1 = _mm_add_ps(xmm1,xmm2); /*lij-uij + 0.25*dr*diff2 */
+ xmm2 = _mm_mul_ps(half,rinv); /* 0.5*rinv */
+ xmm2 = _mm_mul_ps(xmm2,log_term); /*0.5*rinv*log_term */
+ xmm1 = _mm_add_ps(xmm1,xmm2); /*lij-uij+0.25*dr*diff2+0.5*rinv*log_term */
+ xmm8 = _mm_mul_ps(neg,diff2); /*(-1)*diff2 */
+ xmm2 = _mm_mul_ps(xmm8,prod); /*(-1)*diff2*prod */
+ tmp = _mm_add_ps(xmm1,xmm2); /* done tmp-term */
+
+ /* contitional for rai<sk-dr */
+ xmm3 = _mm_sub_ps(sk,r);
+ mask_cmp3 = _mm_cmplt_ps(rai,xmm3); /*rai<sk-dr*/
+
+ xmm4 = _mm_sub_ps(rai_inv,lij);
+ xmm4 = _mm_mul_ps(two,xmm4);
+ xmm4 = _mm_add_ps(xmm1,xmm4);
+
+ tmp = (mask_cmp3 & xmm4) | _mm_andnot_ps(mask_cmp3,tmp); /* xmm1 will now contain four tmp values*/
+
+ /* tmp will now contain four partial values, that not all are to be used. Which */
+ /* ones are governed by the mask_cmp mask.*/
+ tmp = _mm_mul_ps(half,tmp); /*0.5*tmp*/
+ tmp = (mask_cmp & tmp) | _mm_andnot_ps(mask_cmp, zero);
+ tmp_sum = _mm_add_ps(tmp_sum,tmp);
+
+ duij = one;
+
+ /* start t1 */
+ xmm2 = _mm_mul_ps(half,lij2); /*0.5*lij2 */
+ xmm3 = _mm_mul_ps(prod,lij3); /*prod*lij3;*/
+ xmm2 = _mm_add_ps(xmm2,xmm3); /*0.5*lij2+prod*lij3 */
+ xmm3 = _mm_mul_ps(lij,rinv); /*lij*rinv */
+ xmm4 = _mm_mul_ps(lij3,r); /*lij3*dr; */
+ xmm3 = _mm_add_ps(xmm3,xmm4); /*lij*rinv+lij3*dr */
+ xmm3 = _mm_mul_ps(qrtr,xmm3); /*0.25*(lij*rinv+lij3*dr) */
+ t1 = _mm_sub_ps(xmm2,xmm3); /* done t1 */
+
+ /* start t2 */
+ xmm2 = _mm_mul_ps(half,uij2); /*0.5*uij2 */
+ xmm2 = _mm_mul_ps(neg,xmm2); /*(-1)*0.5*lij2 */
+ xmm3 = _mm_mul_ps(qrtr,sk2_inv); /*0.25*sk2_rinv */
+ xmm3 = _mm_mul_ps(xmm3,uij3); /*0.25*sk2_rinv*uij3 */
+ xmm2 = _mm_sub_ps(xmm2,xmm3); /*(-1)*0.5*lij2-0.25*sk2_rinv*uij3 */
+ xmm3 = _mm_mul_ps(uij,rinv); /*uij*rinv */
+ xmm4 = _mm_mul_ps(uij3,r); /*uij3*dr; */
+ xmm3 = _mm_add_ps(xmm3,xmm4); /*uij*rinv+uij*dr */
+ xmm3 = _mm_mul_ps(qrtr,xmm3); /*0.25*(uij*rinv+uij*dr) */
+ t2 = _mm_add_ps(xmm2,xmm3); /* done t2 */
+
+ /* start t3 */
+ xmm2 = _mm_mul_ps(sk2_inv,rinv);
+ xmm2 = _mm_add_ps(one,xmm2); /*1+sk2_rinv*rinv; */
+ xmm2 = _mm_mul_ps(eigth,xmm2); /*0.125*(1+sk2_rinv*rinv) */
+ xmm2 = _mm_mul_ps(xmm2,xmm8); /*0.125*(1+sk2_rinv*rinv)*(-diff2) */
+ xmm3 = _mm_mul_ps(log_term, rinv); /*log_term*rinv */
+ xmm3 = _mm_mul_ps(xmm3,rinv); /*log_term*rinv*rinv */
+ xmm3 = _mm_mul_ps(qrtr,xmm3); /*0.25*log_term*rinv*rinv */
+ t3 = _mm_add_ps(xmm2,xmm3); /* done t3 */
+
+ /* chain rule terms */
+ xmm2 = _mm_mul_ps(dlij,t1); /*dlij*t1 */
+ xmm3 = _mm_mul_ps(duij,t2); /*duij*t2 */
+ xmm2 = _mm_add_ps(xmm2,xmm3);/*dlij*t1+duij*t2 */
+ xmm2 = _mm_add_ps(xmm2,t3); /*everyhting * t3 */
+ xmm2 = _mm_mul_ps(xmm2,rinv); /*everything * t3 *rinv */
+
+ xmm2 = _mm_and_ps( (__m128) maski,xmm2);
+ _mm_storeu_ps(fr->dadx+n,xmm2); /* store excess elements, but since we are only advancing n by
+ * offset, this will be corrected by the "main" loop */
+
+ n = n + offset;
+
+ } /* end offset */
+
+ /* the tmp array will contain partial values that need to be added together */
+ tmp = _mm_movehl_ps(tmp,tmp_sum);
+ tmp_sum = _mm_sub_ps(tmp_sum,tmp);
+ tmp = _mm_shuffle_ps(tmp_sum,tmp_sum,_MM_SHUFFLE(1,1,1,1));
+ tmp_sum = _mm_sub_ss(tmp_sum,tmp);
+
+ _mm_store_ss(&sum_tmp,tmp_sum);
+
+ if(PAR(cr))
+ {
+ sum_mpi[ai]=sum_tmp;
+ }
+ else
+ {
+ sum_ai=sum_tmp;
+
+ min_rad = rr + doffset;
+ rad=1.0/sum_ai;
+
+ born->bRad[ai]=rad > min_rad ? rad : min_rad;
+ fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+ }
+ }
+
+ if(PAR(cr))
+ {
+ gmx_sum(natoms,sum_mpi,cr);
+
+ for(i=at0;i<at1;i++)
+ {
+ ai = i;
+ min_rad = mtop->atomtypes.gb_radius[md->typeA[ai]];
+ rad = 1.0/sum_mpi[ai];
+
+ born->bRad[ai]=rad > min_rad ? rad : min_rad;
+ fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+ }
+
+ gb_pd_send(cr,born->bRad,md->nr);
+ gb_pd_send(cr,fr->invsqrta,md->nr);
+ }
+
+
+ return 0;
+}
+
+
+__m128 log2_ps(__m128 x)
+{
+ __m128i exp = _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000);
+ __m128i one = _mm_set_epi32(0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000);
+ __m128i off = _mm_set_epi32(0x3FBF8000, 0x3FBF8000, 0x3FBF8000, 0x3FBF8000);
+ __m128i mant = _mm_set_epi32(0x00F7FFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF);
+ __m128i sign = _mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
+ __m128i base = _mm_set_epi32(0x43800000, 0x43800000, 0x43800000, 0x43800000);
+ __m128i loge = _mm_set_epi32(0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218);
+
+ __m128i D5 = _mm_set_epi32(0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5);
+ __m128i D4 = _mm_set_epi32(0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD);
+ __m128i D3 = _mm_set_epi32(0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9);
+ __m128i D2 = _mm_set_epi32(0x4026537B, 0x4026537B, 0x4026537B, 0x4026537B);
+ __m128i D1 = _mm_set_epi32(0xC054bFAD, 0xC054bFAD, 0xC054bFAD, 0xC054bFAD);
+ __m128i D0 = _mm_set_epi32(0x4047691A, 0x4047691A, 0x4047691A, 0x4047691A);
+
+ __m128 xmm0,xmm1,xmm2;
+
+ xmm0 = x;
+ xmm1 = xmm0;
+
+ xmm1 = _mm_and_ps(xmm1, (__m128)exp);
+ xmm1 = (__m128 ) _mm_srli_epi32( (__m128i) xmm1,8);
+
+ xmm1 = _mm_or_ps(xmm1, (__m128) one);
+ xmm1 = _mm_sub_ps(xmm1, (__m128) off);
+ xmm1 = _mm_mul_ps(xmm1, (__m128) base);
+ xmm0 = _mm_and_ps(xmm0, (__m128) mant);
+ xmm0 = _mm_or_ps(xmm0, (__m128) one);
+
+ xmm2 = _mm_mul_ps(xmm0, (__m128) D5);
+ xmm2 = _mm_add_ps(xmm2, (__m128) D4);
+ xmm2 = _mm_mul_ps(xmm2,xmm0);
+ xmm2 = _mm_add_ps(xmm2, (__m128) D3);
+ xmm2 = _mm_mul_ps(xmm2,xmm0);
+ xmm2 = _mm_add_ps(xmm2, (__m128) D2);
+ xmm2 = _mm_mul_ps(xmm2,xmm0);
+ xmm2 = _mm_add_ps(xmm2, (__m128) D1);
+ xmm2 = _mm_mul_ps(xmm2,xmm0);
+ xmm2 = _mm_add_ps(xmm2, (__m128) D0);
+ xmm0 = _mm_sub_ps(xmm0, (__m128) one);
+ xmm0 = _mm_mul_ps(xmm0,xmm2);
+ xmm1 = _mm_add_ps(xmm1,xmm0);
+
+ x = xmm1;
+ x = _mm_mul_ps(x,(__m128) loge);
+
+ return x;
+}
+
+
+int
+calc_gb_rad_obc_sse(t_commrec *cr, t_forcerec * fr, int natoms, gmx_mtop_t *mtop,
+ const t_atomtypes *atype, real *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
+{
+ int i,k,n,ai,ai3,aj1,aj2,aj3,aj4,nj0,nj1,at0,at1;
+ int aj13,aj23,aj33,aj43;
+ int offset;
+ real ri;
+ real doffset;
+ real rr,rr_inv,sum_tmp,sum_ai,gbr;
+ real sum_ai2, sum_ai3,tsum,tchain;
+ real z = 0;
+ real *sum_mpi;
+
+ if(PAR(cr))
+ {
+ pd_at_range(cr,&at0,&at1);
+ }
+ else
+ {
+ at0=0;
+ at1=natoms;
+ }
+
+ doffset = born->gb_doffset;
+ sum_mpi = born->work;
+
+ __m128 ix,iy,iz,jx,jy,jz;
+ __m128 dx,dy,dz,t1,t2,t3;
+ __m128 rsq11,rinv,r;
+ __m128 rai,rai_inv,rai_inv2,sk,sk2,lij,dlij,duij;
+ __m128 uij,lij2,uij2,lij3,uij3,diff2;
+ __m128 lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
+ __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
+ __m128 mask_cmp,mask_cmp2,mask_cmp3;
+
+ __m128i maski;
+
+ const __m128 neg = {-1.0f , -1.0f , -1.0f , -1.0f };
+ const __m128 zero = {0.0f , 0.0f , 0.0f , 0.0f };
+ const __m128 eigth = {0.125f , 0.125f , 0.125f , 0.125f };
+ const __m128 qrtr = {0.25f , 0.25f , 0.25f , 0.25f };
+ const __m128 half = {0.5f , 0.5f , 0.5f , 0.5f };
+ const __m128 one = {1.0f , 1.0f , 1.0f , 1.0f };
+ const __m128 two = {2.0f , 2.0f , 2.0f , 2.0f };
+ const __m128 three = {3.0f , 3.0f , 3.0f , 3.0f };
+
+ float apa[4];
+
+ for(i=0;i<natoms;i++) {
+ born->bRad[i]=fr->invsqrta[i]=1;
+ born->drobc[i]=0;
+ }
+
+ /* keep the compiler happy */
+ t1 = _mm_setzero_ps();
+ t2 = _mm_setzero_ps();
+ t3 = _mm_setzero_ps();
+
+ aj1=aj2=aj3=aj4=0;
+ aj13=aj23=aj33=aj43=0;
+ n=0;
+
+ for(i=0;i<nl->nri;i++)
+ {
+ ai = nl->iinr[i];
+ ai3 = ai*3;
+
+ nj0 = nl->jindex[ai];
+ nj1 = nl->jindex[ai+1];
+
+ offset = (nj1-nj0)%4;
+
+ rr = mtop->atomtypes.gb_radius[md->typeA[ai]];
+ ri = rr-doffset;
+ rai = _mm_load1_ps(&ri);
+
+ ri = 1.0/ri;
+ rai_inv = _mm_load1_ps(&ri);
+
+ ix = _mm_load1_ps(x+ai3);
+ iy = _mm_load1_ps(x+ai3+1);
+ iz = _mm_load1_ps(x+ai3+2);
+
+ sum_ai = 0;
+ tmp_sum = _mm_load1_ps(&z);
+
+ if(PAR(cr))
+ {
+ sum_mpi[ai] = 0;
+ }
+
+ for(k=nj0;k<nj1-offset;k+=4)
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+ aj3 = nl->jjnr[k+2];
+ aj4 = nl->jjnr[k+3];
+
+ aj13 = aj1 * 3;
+ aj23 = aj2 * 3;
+ aj33 = aj3 * 3;
+ aj43 = aj4 * 3;
+
+ /* Load particle aj1-4 and transpose */
+ xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
+ xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
+ xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33));
+ xmm4 = _mm_loadh_pi(xmm4,(__m64 *) (x+aj43));
+
+ xmm5 = _mm_load1_ps(x+aj13+2);
+ xmm6 = _mm_load1_ps(x+aj23+2);
+ xmm7 = _mm_load1_ps(x+aj33+2);
+ xmm8 = _mm_load1_ps(x+aj43+2);
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+ xmm6 = _mm_shuffle_ps(xmm7,xmm8,_MM_SHUFFLE(0,0,0,0));
+ jz = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+ xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2));
+ jx = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0));
+ jy = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1));
+
+ dx = _mm_sub_ps(ix, jx);
+ dy = _mm_sub_ps(iy, jy);
+ dz = _mm_sub_ps(iz, jz);
+
+ t1 = _mm_mul_ps(dx,dx);
+ t2 = _mm_mul_ps(dy,dy);
+ t3 = _mm_mul_ps(dz,dz);
+
+ rsq11 = _mm_add_ps(t1,t2);
+ rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare */
+
+ /* Perform reciprocal square root lookup, 8 bits accuracy */
+ t1 = _mm_rsqrt_ps(rsq11); /* t1=lookup, r2=x */
+ /* Newton-Rhapson iteration to get 12 bits correct*/
+ t2 = _mm_mul_ps(t1,t1); /* lu*lu */
+ t3 = _mm_mul_ps(rsq11,t2); /* x*lu*lu */
+ t3 = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+ t3 = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+ rinv = _mm_mul_ps(half,t3); /* result for all four particles */
+
+ r = _mm_mul_ps(rinv,rsq11);
+
+ xmm1 = _mm_load_ss(born->param+aj1);
+ xmm2 = _mm_load_ss(born->param+aj2);
+ xmm3 = _mm_load_ss(born->param+aj3);
+ xmm4 = _mm_load_ss(born->param+aj4);
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /*j1 j1 j2 j2*/
+ xmm3 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(0,0,0,0)); /*j3 j3 j4 j4*/
+ sk = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_add_ps(r,sk); /*dr+sk*/
+
+ /* conditional mask for rai<dr+sk */
+ mask_cmp = _mm_cmplt_ps(rai,xmm1);
+
+ /* conditional for rai>dr-sk, ends with mask_cmp2 */
+ xmm2 = _mm_sub_ps(r,sk); /*xmm2 = dr-sk */
+
+ xmm3 = _mm_rcp_ps(xmm2); /*1.0/(dr-sk), 8 bits accuracy */
+ t1 = _mm_mul_ps(xmm3,xmm2);
+ t1 = _mm_sub_ps(two,t1);
+ xmm3 = _mm_mul_ps(t1,xmm3);
+
+ xmm4 = rai_inv;
+ xmm5 = zero;
+
+ mask_cmp2 = _mm_cmpgt_ps(rai,xmm2); /*rai>dr-sk */
+ lij = (mask_cmp2 & xmm4) | _mm_andnot_ps(mask_cmp2, xmm3);
+ dlij = (mask_cmp2 & xmm5) | _mm_andnot_ps(mask_cmp2, one);
+
+ uij = _mm_rcp_ps(xmm1);
+ t1 = _mm_mul_ps(uij,xmm1);
+ t1 = _mm_sub_ps(two,t1);
+ uij = _mm_mul_ps(t1,uij);
+
+ lij2 = _mm_mul_ps(lij,lij);
+ lij3 = _mm_mul_ps(lij2,lij);
+ uij2 = _mm_mul_ps(uij,uij);
+ uij3 = _mm_mul_ps(uij2,uij);
+
+ diff2 = _mm_sub_ps(uij2,lij2);
+
+ /* Perform reciprocal square root lookup, 12 bits accuracy */
+ t1 = _mm_rsqrt_ps(lij2); /* t1=lookup, r2=x */
+ /* Newton-Rhapson iteration */
+ t2 = _mm_mul_ps(t1,t1); /* lu*lu */
+ t3 = _mm_mul_ps(lij2,t2); /* x*lu*lu */
+ t3 = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+ t3 = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+ lij_inv = _mm_mul_ps(half,t3); /* result for all four particles */
+
+ sk2 = _mm_mul_ps(sk,sk);
+ sk2_inv = _mm_mul_ps(sk2,rinv);
+ prod = _mm_mul_ps(qrtr,sk2_inv);
+
+ log_term = _mm_mul_ps(uij,lij_inv);
+ log_term = log2_ps(log_term);
+
+ xmm1 = _mm_sub_ps(lij,uij);
+ xmm2 = _mm_mul_ps(qrtr,r); /* 0.25*dr */
+ xmm2 = _mm_mul_ps(xmm2,diff2); /*0.25*dr*prod */
+ xmm1 = _mm_add_ps(xmm1,xmm2); /*lij-uij + 0.25*dr*diff2 */
+ xmm2 = _mm_mul_ps(half,rinv); /* 0.5*rinv */
+ xmm2 = _mm_mul_ps(xmm2,log_term); /*0.5*rinv*log_term */
+ xmm1 = _mm_add_ps(xmm1,xmm2); /*lij-uij+0.25*dr*diff2+0.5*rinv*log_term */
+ xmm8 = _mm_mul_ps(neg,diff2); /*(-1)*diff2 */
+ xmm2 = _mm_mul_ps(xmm8,prod); /*(-1)*diff2*prod */
+ tmp = _mm_add_ps(xmm1,xmm2); /* done tmp-term */
+
+ /* contitional for rai<sk-dr */
+ xmm3 = _mm_sub_ps(sk,r);
+ mask_cmp3 = _mm_cmplt_ps(rai,xmm3); /*rai<sk-dr*/
+
+
+ xmm4 = _mm_sub_ps(rai_inv,lij);
+ xmm4 = _mm_mul_ps(two,xmm4);
+ xmm4 = _mm_add_ps(tmp,xmm4);
+
+ tmp = (mask_cmp3 & xmm4) | _mm_andnot_ps(mask_cmp3,tmp); /* xmm1 will now contain four tmp values */
+
+ /* the tmp will now contain four partial values, that not all are to be used. Which
+ * ones are governed by the mask_cmp mask.
+ */
+ tmp = _mm_mul_ps(half,tmp); /*0.5*tmp*/
+ tmp = (mask_cmp & tmp) | _mm_andnot_ps(mask_cmp, zero);
+ tmp_sum = _mm_add_ps(tmp_sum,tmp);
+
+ duij = one;
+
+ /* start t1 */
+ xmm2 = _mm_mul_ps(half,lij2); /*0.5*lij2 */
+ xmm3 = _mm_mul_ps(prod,lij3); /*prod*lij3; */
+ xmm2 = _mm_add_ps(xmm2,xmm3); /*0.5*lij2+prod*lij3 */
+ xmm3 = _mm_mul_ps(lij,rinv); /*lij*rinv */
+ xmm4 = _mm_mul_ps(lij3,r); /*lij3*dr;*/
+ xmm3 = _mm_add_ps(xmm3,xmm4); /*lij*rinv+lij3*dr */
+ xmm3 = _mm_mul_ps(qrtr,xmm3); /*0.25*(lij*rinv+lij3*dr) */
+ t1 = _mm_sub_ps(xmm2,xmm3); /* done t1 */
+
+ /* start t2 */
+ xmm2 = _mm_mul_ps(half,uij2); /*0.5*uij2 */
+ xmm2 = _mm_mul_ps(neg,xmm2); /*(-1)*0.5*uij2 */
+ xmm3 = _mm_mul_ps(qrtr,sk2_inv); /*0.25*sk2_rinv */
+ xmm3 = _mm_mul_ps(xmm3,uij3); /*0.25*sk2_rinv*uij3 */
+ xmm2 = _mm_sub_ps(xmm2,xmm3); /*(-1)*0.5*lij2-0.25*sk2_rinv*uij3 */
+ xmm3 = _mm_mul_ps(uij,rinv); /*uij*rinv */
+ xmm4 = _mm_mul_ps(uij3,r); /*uij3*dr; */
+ xmm3 = _mm_add_ps(xmm3,xmm4); /*uij*rinv+uij*dr */
+ xmm3 = _mm_mul_ps(qrtr,xmm3); /*0.25*(uij*rinv+uij*dr) */
+ t2 = _mm_add_ps(xmm2,xmm3); /* done t2 */
+
+ /* start t3 */
+ xmm2 = _mm_mul_ps(sk2_inv,rinv);
+ xmm2 = _mm_add_ps(one,xmm2); /*1+sk2_rinv*rinv; */
+ xmm2 = _mm_mul_ps(eigth,xmm2); /*0.125*(1+sk2_rinv*rinv) */
+ xmm2 = _mm_mul_ps(xmm2,xmm8); /*0.125*(1+sk2_rinv*rinv)*(-diff2) */
+ xmm3 = _mm_mul_ps(log_term, rinv); /*log_term*rinv */
+ xmm3 = _mm_mul_ps(xmm3,rinv); /*log_term*rinv*rinv */
+ xmm3 = _mm_mul_ps(qrtr,xmm3); /*0.25*log_term*rinv*rinv */
+ t3 = _mm_add_ps(xmm2,xmm3); /* done t3 */
+
+ /* chain rule terms */
+ xmm2 = _mm_mul_ps(dlij,t1); /*dlij*t1*/
+ xmm3 = _mm_mul_ps(duij,t2); /*duij*t2 */
+ xmm2 = _mm_add_ps(xmm2,xmm3);/*dlij*t1+duij*t2 */
+ xmm2 = _mm_add_ps(xmm2,t3);
+ xmm2 = _mm_mul_ps(xmm2,rinv);
+
+ _mm_storeu_ps(fr->dadx+n,xmm2);
+
+ n = n + 4;
+
+ } /* end normal inner loop */
+
+ /* deal with offset elements */
+ if(offset!=0)
+ {
+ aj1=aj2=aj3=aj4=0;
+
+ if(offset==1)
+ {
+ aj1 = nl->jjnr[k];
+ aj13 = aj1 * 3;
+
+ xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (x+aj13));
+ xmm5 = _mm_load1_ps(x+aj13+2);
+
+ xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(0,0,0,0));
+ xmm4 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));
+
+ sk = _mm_load1_ps(born->param+aj1);
+
+ maski = _mm_set_epi32(0,0,0,0xffffffff);
+ }
+ else if(offset==2)
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+
+ aj13 = aj1 * 3;
+ aj23 = aj2 * 3;
+
+ xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (x+aj13));
+ xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (x+aj23));
+
+ xmm5 = _mm_load1_ps(x+aj13+2);
+ xmm6 = _mm_load1_ps(x+aj23+2);
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0));
+ xmm5 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2));
+ xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+ xmm4 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1));
+
+ xmm1 = _mm_load1_ps(born->param+aj1);
+ xmm2 = _mm_load1_ps(born->param+aj2);
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0));
+ sk = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0));
+
+ maski = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+ }
+ else
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+ aj3 = nl->jjnr[k+2];
+
+ aj13 = aj1 * 3;
+ aj23 = aj2 * 3;
+ aj33 = aj3 * 3;
+
+ xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (x+aj13));
+ xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (x+aj23));
+ xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (x+aj33));
+
+ xmm5 = _mm_load1_ps(x+aj13+2);
+ xmm6 = _mm_load1_ps(x+aj23+2);
+ xmm7 = _mm_load1_ps(x+aj33+2);
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0));
+ xmm5 = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1));
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2));
+ xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2));
+
+ xmm6 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0));
+ xmm4 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1));
+
+ xmm1 = _mm_load1_ps(born->param+aj1);
+ xmm2 = _mm_load1_ps(born->param+aj2);
+ xmm3 = _mm_load1_ps(born->param+aj3);
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(0,0,0,0)); /*j1 j1 j2 j2*/
+ xmm3 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(0,0,0,0)); /*j3 j3 j3 j3*/
+ sk = _mm_shuffle_ps(xmm1,xmm3,_MM_SHUFFLE(2,0,2,0));
+
+
+ maski = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+ }
+
+ jx = _mm_and_ps( (__m128) maski, xmm6);
+ jy = _mm_and_ps( (__m128) maski, xmm4);
+ jz = _mm_and_ps( (__m128) maski, xmm5);
+
+ sk = _mm_and_ps ( (__m128) maski, sk);
+
+ dx = _mm_sub_ps(ix, jx);
+ dy = _mm_sub_ps(iy, jy);
+ dz = _mm_sub_ps(iz, jz);
+
+ t1 = _mm_mul_ps(dx,dx);
+ t2 = _mm_mul_ps(dy,dy);
+ t3 = _mm_mul_ps(dz,dz);
+
+ rsq11 = _mm_add_ps(t1,t2);
+ rsq11 = _mm_add_ps(rsq11,t3); /*rsq11=rsquare*/
+
+ /* Perform reciprocal square root lookup, 12 bits accuracy */
+ t1 = _mm_rsqrt_ps(rsq11); /* t1=lookup, r2=x */
+ /* Newton-Rhapson iteration */
+ t2 = _mm_mul_ps(t1,t1); /* lu*lu */
+ t3 = _mm_mul_ps(rsq11,t2); /* x*lu*lu */
+ t3 = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+ t3 = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+ rinv = _mm_mul_ps(half,t3); /* result for all four particles */
+
+ r = _mm_mul_ps(rinv,rsq11);
+
+ xmm1 = _mm_add_ps(r,sk); /*dr+sk*/
+
+ /* conditional mask for rai<dr+sk */
+ mask_cmp = _mm_cmplt_ps(rai,xmm1);
+
+ /* conditional for rai>dr-sk, ends with mask_cmp2 */
+ xmm2 = _mm_sub_ps(r,sk); /*xmm2 = dr-sk*/
+
+ xmm3 = _mm_rcp_ps(xmm2); /*1.0/(dr-sk)*/
+ t1 = _mm_mul_ps(xmm3,xmm2);
+ t1 = _mm_sub_ps(two,t1);
+ xmm3 = _mm_mul_ps(t1,xmm3);
+
+ xmm4 = rai_inv;
+ xmm5 = zero;
+
+ mask_cmp2 = _mm_cmpgt_ps(rai,xmm2); /*rai>dr-sk */
+ lij = (mask_cmp2 & xmm4) | _mm_andnot_ps(mask_cmp2, xmm3);
+ dlij = (mask_cmp2 & xmm5) | _mm_andnot_ps(mask_cmp2, one);
+
+ uij = _mm_rcp_ps(xmm1);
+ t1 = _mm_mul_ps(uij,xmm1);
+ t1 = _mm_sub_ps(two,t1);
+ uij = _mm_mul_ps(t1,uij);
+
+ lij2 = _mm_mul_ps(lij,lij);
+ lij3 = _mm_mul_ps(lij2,lij);
+ uij2 = _mm_mul_ps(uij,uij);
+ uij3 = _mm_mul_ps(uij2,uij);
+
+ diff2 = _mm_sub_ps(uij2,lij2);
+
+ t1 = _mm_rsqrt_ps(lij2); /* t1=lookup, r2=x */
+ /* Newton-Rhapson iteration */
+ t2 = _mm_mul_ps(t1,t1); /* lu*lu */
+ t3 = _mm_mul_ps(lij2,t2); /* x*lu*lu */
+ t3 = _mm_sub_ps(three,t3); /* 3.0-x*lu*lu */
+ t3 = _mm_mul_ps(t1,t3); /* lu*(3-x*lu*lu) */
+ lij_inv = _mm_mul_ps(half,t3); /* result for all four particles */
+
+ sk2 = _mm_mul_ps(sk,sk);
+ sk2_inv = _mm_mul_ps(sk2,rinv);
+ prod = _mm_mul_ps(qrtr,sk2_inv);
+
+ log_term = _mm_mul_ps(uij,lij_inv);
+ log_term = log2_ps(log_term);
+
+ xmm1 = _mm_sub_ps(lij,uij);
+ xmm2 = _mm_mul_ps(qrtr,r);
+ xmm2 = _mm_mul_ps(xmm2,diff2);
+ xmm1 = _mm_add_ps(xmm1,xmm2);
+ xmm2 = _mm_mul_ps(half,rinv);
+ xmm2 = _mm_mul_ps(xmm2,log_term);
+ xmm1 = _mm_add_ps(xmm1,xmm2);
+ xmm8 = _mm_mul_ps(neg,diff2);
+ xmm2 = _mm_mul_ps(xmm8,prod);
+ tmp = _mm_add_ps(xmm1,xmm2);
+
+ /* contitional for rai<sk-dr */
+ xmm3 = _mm_sub_ps(sk,r);
+ mask_cmp3 = _mm_cmplt_ps(rai,xmm3); /*rai<sk-dr */
+
+ xmm4 = _mm_sub_ps(rai_inv,lij);
+ xmm4 = _mm_mul_ps(two,xmm4);
+ xmm4 = _mm_add_ps(xmm1,xmm4);
+
+ tmp = (mask_cmp3 & xmm4) | _mm_andnot_ps(mask_cmp3,tmp); /* xmm1 will now contain four tmp values */
+
+ /* tmp will now contain four partial values, that not all are to be used. Which
+ * ones are governed by the mask_cmp mask.
+ */
+ tmp = _mm_mul_ps(half,tmp);
+ tmp = (mask_cmp & tmp) | _mm_andnot_ps(mask_cmp, zero);
+ tmp_sum = _mm_add_ps(tmp_sum,tmp);
+
+ duij = one;
+
+ /* start t1 */
+ xmm2 = _mm_mul_ps(half,lij2);
+ xmm3 = _mm_mul_ps(prod,lij3);
+ xmm2 = _mm_add_ps(xmm2,xmm3);
+ xmm3 = _mm_mul_ps(lij,rinv);
+ xmm4 = _mm_mul_ps(lij3,r);
+ xmm3 = _mm_add_ps(xmm3,xmm4);
+ xmm3 = _mm_mul_ps(qrtr,xmm3);
+ t1 = _mm_sub_ps(xmm2,xmm3);
+
+ /* start t2 */
+ xmm2 = _mm_mul_ps(half,uij2);
+ xmm2 = _mm_mul_ps(neg,xmm2);
+ xmm3 = _mm_mul_ps(qrtr,sk2_inv);
+ xmm3 = _mm_mul_ps(xmm3,uij3);
+ xmm2 = _mm_sub_ps(xmm2,xmm3);
+ xmm3 = _mm_mul_ps(uij,rinv);
+ xmm4 = _mm_mul_ps(uij3,r);
+ xmm3 = _mm_add_ps(xmm3,xmm4);
+ xmm3 = _mm_mul_ps(qrtr,xmm3);
+ t2 = _mm_add_ps(xmm2,xmm3);
+
+ /* start t3 */
+ xmm2 = _mm_mul_ps(sk2_inv,rinv);
+ xmm2 = _mm_add_ps(one,xmm2);
+ xmm2 = _mm_mul_ps(eigth,xmm2);
+ xmm2 = _mm_mul_ps(xmm2,xmm8);
+ xmm3 = _mm_mul_ps(log_term, rinv);
+ xmm3 = _mm_mul_ps(xmm3,rinv);
+ xmm3 = _mm_mul_ps(qrtr,xmm3);
+ t3 = _mm_add_ps(xmm2,xmm3);
+
+ /* chain rule terms */
+ xmm2 = _mm_mul_ps(dlij,t1);
+ xmm3 = _mm_mul_ps(duij,t2);
+ xmm2 = _mm_add_ps(xmm2,xmm3);
+ xmm2 = _mm_add_ps(xmm2,t3);
+ xmm2 = _mm_mul_ps(xmm2,rinv);
+
+ xmm2 = _mm_and_ps( (__m128) maski,xmm2);
+ _mm_storeu_ps(fr->dadx+n,xmm2); /* store excess elements, but since we are only advancing n by
+ * offset, this will be corrected by the "main" loop
+ */
+
+ n = n + offset;
+
+ } /* end offset */
+
+
+ /* the tmp array will contain partial values that need to be added together*/
+ tmp = _mm_movehl_ps(tmp,tmp_sum);
+ tmp_sum = _mm_add_ps(tmp_sum,tmp);
+ tmp = _mm_shuffle_ps(tmp_sum,tmp_sum,_MM_SHUFFLE(1,1,1,1));
+ tmp_sum = _mm_add_ss(tmp_sum,tmp);
+
+ _mm_store_ss(&sum_tmp,tmp_sum);
+
+ if(PAR(cr))
+ {
+ sum_mpi[ai]=sum_tmp;
+ }
+ else
+ {
+ sum_ai=sum_tmp;
+
+ /* calculate the born radii */
+ sum_ai = (rr-doffset) * sum_ai;
+ sum_ai2 = sum_ai * sum_ai;
+ sum_ai3 = sum_ai2 * sum_ai;
+
+ tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
+ born->bRad[ai] = ri - tsum/rr;
+ born->bRad[ai] = 1.0/(born->bRad[ai]);
+
+ fr->invsqrta[ai] = invsqrt(born->bRad[ai]);
+ tchain = (rr-doffset)*(born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
+ born->drobc[ai] = (1.0 - tsum*tsum)*tchain*(1.0/rr);
+ }
+
+ }
+
+ if(PAR(cr))
+ {
+ gmx_sum(natoms,sum_mpi,cr);
+ for(i=at0;i<at1;i++)
+ {
+ ai = i;
+ rr = mtop->atomtypes.gb_radius[md->typeA[ai]];
+ rr_inv = 1.0/rr;
+
+ sum_ai = sum_mpi[ai];
+ sum_ai = (rr-doffset) * sum_ai;
+ sum_ai2 = sum_ai * sum_ai;
+ sum_ai3 = sum_ai2 * sum_ai;
+
+ tsum = tanh(born->obc_alpha*sum_ai-born->obc_beta*sum_ai2+born->obc_gamma*sum_ai3);
+ gbr = 1.0/(rr-doffset) - tsum*rr_inv;
+
+ born->bRad[ai] = 1.0 / gbr;
+ fr->invsqrta[ai]=invsqrt(born->bRad[ai]);
+
+ tchain = (rr-doffset) * (born->obc_alpha-2*born->obc_beta*sum_ai+3*born->obc_gamma*sum_ai2);
+ born->drobc[ai] = (1.0-tsum*tsum)*tchain*rr_inv;
+ }
+
+ gb_pd_send(cr,born->bRad,md->nr);
+ gb_pd_send(cr,fr->invsqrta,md->nr);
+ gb_pd_send(cr,born->drobc,md->nr);
+ }
+
+ return 0;
+}
+
+
+
+real calc_gb_chainrule_sse(int natoms, t_nblist *nl, real *dadx, real *dvda, real *xd, real *f, int gb_algorithm, gmx_genborn_t *born)
+{
+ int i,k,n,ai,aj,ai3,nj0,nj1,offset;
+ int aj1,aj2,aj3,aj4;
+
+ real rbi;
+ real rb[natoms+4];
+
+ __m128 ix,iy,iz;
+ __m128 jx,jy,jz;
+ __m128 fix,fiy,fiz;
+ __m128 dx,dy,dz;
+ __m128 t1,t2,t3;
+ __m128 dva,dax,fgb;
+ __m128 xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
+
+ __m128i mask = _mm_set_epi32(0, 0xffffffff,0xffffffff,0xffffffff);
+ __m128i maski = _mm_set_epi32(0, 0xffffffff,0xffffffff,0xffffffff);
+
+ const __m128 two = {2.0f , 2.0f , 2.0f , 2.0f };
+ real z = 0;
+
+ /* Loop to get the proper form for the Born radius term, sse style */
+ offset=natoms%4;
+
+ if(offset!=0)
+ {
+ if(offset==1)
+ mask = _mm_set_epi32(0,0,0,0xffffffff);
+ else if(offset==2)
+ mask = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+ else
+ mask = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+ }
+
+ if(gb_algorithm==egbSTILL) {
+
+ xmm3 = _mm_set1_ps(ONE_4PI_EPS0);
+
+ for(i=0;i<natoms-offset;i+=4)
+ {
+ xmm1 = _mm_loadu_ps(born->bRad+i);
+ xmm1 = _mm_mul_ps(xmm1,xmm1);
+ xmm1 = _mm_mul_ps(xmm1,two); /*2 * rbi * rbi*/
+
+ xmm2 = _mm_loadu_ps(dvda+i);
+
+ xmm1 = _mm_mul_ps(xmm1,xmm2); /* 2 * rbi * rbi * dvda[i] */
+ xmm1 = _mm_div_ps(xmm1,xmm3); /* (2 * rbi * rbi * dvda[i]) / ONE_4PI_EPS0 */
+
+ _mm_storeu_ps(rb+i, xmm1); /* store to memory */
+ }
+
+ /* with the offset element, the mask stores excess elements to zero. This could cause problems
+ * when something gets allocated right after rb (solution: allocate three positions bigger)
+ */
+ if(offset!=0)
+ {
+ xmm1 = _mm_loadu_ps(born->bRad+i);
+ xmm1 = _mm_mul_ps(xmm1,xmm1);
+ xmm1 = _mm_mul_ps(xmm1,two);
+
+ xmm2 = _mm_loadu_ps(dvda+i);
+
+ xmm1 = _mm_mul_ps(xmm1,xmm2);
+ xmm1 = _mm_div_ps(xmm1,xmm3);
+ xmm1 = _mm_and_ps( (__m128) mask, xmm1);
+
+ _mm_storeu_ps(rb+i, xmm1);
+ }
+ }
+
+ else if(gb_algorithm==egbHCT) {
+ for(i=0;i<natoms-offset;i+=4)
+ {
+ xmm1 = _mm_loadu_ps(born->bRad+i);
+ xmm1 = _mm_mul_ps(xmm1, xmm1); /* rbi*rbi */
+
+ xmm2 = _mm_loadu_ps(dvda+i);
+
+ xmm1 = _mm_mul_ps(xmm1,xmm2); /* rbi*rbi*dvda[i] */
+
+ _mm_storeu_ps(rb+i, xmm1);
+ }
+
+ if(offset!=0)
+ {
+ xmm1 = _mm_loadu_ps(born->bRad+i);
+ xmm1 = _mm_mul_ps(xmm1, xmm1); /* rbi*rbi */
+
+ xmm2 = _mm_loadu_ps(dvda+i);
+
+ xmm1 = _mm_mul_ps(xmm1,xmm2); /* rbi*rbi*dvda[i] */
+ xmm1 = _mm_and_ps( (__m128) mask, xmm1);
+
+ _mm_storeu_ps(rb+i, xmm1);
+ }
+ }
+
+ else if(gb_algorithm==egbOBC) {
+ for(i=0;i<natoms-offset;i+=4)
+ {
+ xmm1 = _mm_loadu_ps(born->bRad+i);
+ xmm1 = _mm_mul_ps(xmm1, xmm1); /* rbi*rbi */
+
+ xmm2 = _mm_loadu_ps(dvda+i);
+ xmm1 = _mm_mul_ps(xmm1,xmm2); /* rbi*rbi*dvda[i] */
+ xmm2 = _mm_loadu_ps(born->drobc+i);
+ xmm1 = _mm_mul_ps(xmm1, xmm2); /*rbi*rbi*dvda[i]*born->drobc[i] */
+
+ _mm_storeu_ps(rb+i, xmm1);
+ }
+
+ if(offset!=0)
+ {
+ xmm1 = _mm_loadu_ps(born->bRad+i);
+ xmm1 = _mm_mul_ps(xmm1, xmm1); /* rbi*rbi */
+
+ xmm2 = _mm_loadu_ps(dvda+i);
+ xmm1 = _mm_mul_ps(xmm1,xmm2); /* rbi*rbi*dvda[i] */
+ xmm2 = _mm_loadu_ps(born->drobc+i);
+ xmm1 = _mm_mul_ps(xmm1, xmm2); /*rbi*rbi*dvda[i]*born->drobc[i] */
+ xmm1 = _mm_and_ps( (__m128) mask, xmm1);
+
+ _mm_storeu_ps(rb+i, xmm1);
+ }
+ }
+
+ /* Keep the compiler happy */
+ t1 = _mm_setzero_ps();
+ t2 = _mm_setzero_ps();
+ t3 = _mm_setzero_ps();
+
+ aj1 = aj2 = aj3 = aj4 = 0;
+ n=0;
+
+ for(i=0;i<nl->nri;i++)
+ {
+ ai = nl->iinr[i];
+ ai3 = ai*3;
+
+ nj0 = nl->jindex[ai];
+ nj1 = nl->jindex[ai+1];
+
+ offset = (nj1-nj0)%4;
+
+ /* Load particle ai coordinates */
+ ix = _mm_load1_ps(xd+ai3);
+ iy = _mm_load1_ps(xd+ai3+1);
+ iz = _mm_load1_ps(xd+ai3+2);
+
+ /* Load particle ai dvda */
+ dva = _mm_load1_ps(rb+ai);
+
+ fix = _mm_setzero_ps();
+ fiy = _mm_setzero_ps();
+ fiz = _mm_setzero_ps();
+
+ /* Inner loop for all particles where n%4==0 */
+ for(k=nj0;k<nj1-offset;k+=4)
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+ aj3 = nl->jjnr[k+2];
+ aj4 = nl->jjnr[k+3];
+
+ aj1 = aj1 * 3;
+ aj2 = aj2 * 3;
+ aj3 = aj3 * 3;
+ aj4 = aj4 * 3;
+
+ /* Load j1-4 coordinates, first x and y */
+ xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (xd+aj1)); /*x1 y1 - - */
+ xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (xd+aj2)); /*x2 y2 - - */
+ xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (xd+aj3)); /*x3 y3 - - */
+ xmm4 = _mm_loadh_pi(xmm4,(__m64 *) (xd+aj4)); /*x4 y4 - - */
+
+ /* ... then z */
+ xmm5 = _mm_load1_ps(xd+aj1+2); /* z1 z1 z1 z1 */
+ xmm6 = _mm_load1_ps(xd+aj2+2); /* z2 z2 z2 z2 */
+ xmm7 = _mm_load1_ps(xd+aj3+2); /* z3 z3 z3 z3 */
+ xmm8 = _mm_load1_ps(xd+aj4+2); /* z4 z4 z4 z4 */
+
+ /* transpose */
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0)); /* z1 z1 z2 z2 */
+ xmm6 = _mm_shuffle_ps(xmm7,xmm8, _MM_SHUFFLE(0,0,0,0)); /* z3 z3 z4 z4 */
+ jz = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(2,0,2,0)); /* z1 z2 z3 z4 */
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2)); /* x1 y1 x2 y2 */
+ xmm2 = _mm_shuffle_ps(xmm3,xmm4, _MM_SHUFFLE(3,2,3,2)); /* x3 y3 x4 y4 */
+
+ jx = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0)); /* x1 x2 x3 x4 */
+ jy = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1)); /* y1 y2 y3 y4 */
+
+ /* load chain rule terms for j1-4 */
+ dax = _mm_loadu_ps(dadx+n);
+ n = n + 4;
+
+ /* distances i -> j1-4 */
+ dx = _mm_sub_ps(ix, jx);
+ dy = _mm_sub_ps(iy, jy);
+ dz = _mm_sub_ps(iz, jz);
+
+ /* calculate scalar force */
+ fgb = _mm_mul_ps(dva,dax);
+
+ /* calculate partial force terms */
+ t1 = _mm_mul_ps(fgb,dx); /* fx1, fx2, fx3, fx4 */
+ t2 = _mm_mul_ps(fgb,dy); /* fy1, fy2, fy3, fy4 */
+ t3 = _mm_mul_ps(fgb,dz); /* fz1, fz2, fz3, fz4 */
+
+ /* update the i force */
+ fix = _mm_add_ps(fix,t1);
+ fiy = _mm_add_ps(fiy,t2);
+ fiz = _mm_add_ps(fiz,t3);
+
+ /* accumulate the aj1-4 fx and fy forces from memory */
+ xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (f+aj1)); /*fx1 fy1 - - */
+ xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (f+aj2)); /*fx2 fy2 - - */
+ xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (f+aj3)); /*fx3 fy3 - - */
+ xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (f+aj4)); /*fx4 fy4 - - */
+
+ xmm5 = _mm_load1_ps(f+aj1+2); /* fz1 fz1 fz1 fz1 */
+ xmm6 = _mm_load1_ps(f+aj2+2); /* fz2 fz2 fz2 fz2 */
+ xmm7 = _mm_load1_ps(f+aj3+2); /* fz3 fz3 fz3 fz3 */
+ xmm8 = _mm_load1_ps(f+aj4+2); /* fz4 fz4 fz4 fz4 */
+
+ /* transpose forces */
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0)); /* fz1 fz1 fz2 fz2 */
+ xmm6 = _mm_shuffle_ps(xmm7,xmm8, _MM_SHUFFLE(0,0,0,0)); /* fz3 fz3 fz4 fz4 */
+ xmm7 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(2,0,2,0)); /* fz1 fz2 fz3 fz4 */
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2)); /*fx1 fy1 fx2 fy2 */
+ xmm2 = _mm_shuffle_ps(xmm3,xmm4,_MM_SHUFFLE(3,2,3,2)); /*fx2 fy3 fx4 fy4 */
+
+ xmm5 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0)); /*fx1 fx2 fx3 fx4 */
+ xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1)); /*fy1 fy2 fy3 fy4 */
+
+ /* subtract partial forces */
+ xmm5 = _mm_sub_ps(xmm5, t1); /*fx1 fx2 fx3 fx4 */
+ xmm6 = _mm_sub_ps(xmm6, t2); /*fy1 fy2 fy3 fy4 */
+ xmm7 = _mm_sub_ps(xmm7, t3); /*fz1 fz2 fz3 fz4 */
+
+ /* transpose back fx's and fy's */
+ xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /*fx1 fx2 fy1 fy2 */
+ xmm2 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /*fx3 fx4 fy3 fy4 */
+ xmm1 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0)); /*fx1 fy1 fx2 fy2 */
+ xmm2 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0)); /*fx3 fy3 fx4 fy4 */
+
+ /* store the force, first fx's and fy's */
+ _mm_storel_pi( (__m64 *) (f+aj1), xmm1);
+ _mm_storeh_pi( (__m64 *) (f+aj2), xmm1);
+ _mm_storel_pi( (__m64 *) (f+aj3), xmm2);
+ _mm_storeh_pi( (__m64 *) (f+aj4), xmm2);
+
+ /* now do fz�s*/
+ _mm_store_ss(f+aj1+2,xmm7); /*fz1 */
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(f+aj2+2,xmm7); /*fz2 */
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(f+aj3+2,xmm7); /*fz4 */
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(f+aj4+2,xmm7); /*fz3 */
+ }
+
+ /*deal with odd elements */
+ if(offset!=0) {
+
+ aj1 = aj2 = aj3 = aj4 = 0;
+
+ if(offset==1)
+ {
+ aj1 = nl->jjnr[k];
+ aj1 = aj1 * 3;
+
+ xmm1 = _mm_loadl_pi(xmm1,(__m64 *) (xd+aj1)); /*x1 y1 */
+ xmm5 = _mm_load1_ps(xd+aj1+2); /*z1 z1 z1 z1*/
+
+ xmm6 = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(0,0,0,0)); /*x1 - - - */
+ xmm4 = _mm_shuffle_ps(xmm1, xmm1, _MM_SHUFFLE(1,1,1,1)); /*y1 - - - */
+
+ mask = _mm_set_epi32(0,0,0,0xffffffff);
+ }
+ else if(offset==2)
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+
+ aj1 = aj1 * 3;
+ aj2 = aj2 * 3;
+
+ xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (xd+aj1)); /* x1 y1 - - */
+ xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (xd+aj2)); /* x2 y2 - - */
+
+ xmm5 = _mm_load1_ps(xd+aj1+2); /*z1 z1 z1 z1*/
+ xmm6 = _mm_load1_ps(xd+aj2+2); /*z2 z2 z2 z2*/
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0)); /*z1 z1 z2 z2 */
+ xmm5 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0)); /*z1 z2 z1 z2 */
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2)); /*x1 y1 x2 y2 */
+ xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0)); /*x1 x2 x1 x2 */
+ xmm4 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1)); /*y1 y2 y1 y2 */
+
+ mask = _mm_set_epi32(0,0,0xffffffff,0xffffffff);
+ }
+ else
+ {
+ aj1 = nl->jjnr[k];
+ aj2 = nl->jjnr[k+1];
+ aj3 = nl->jjnr[k+2];
+
+ aj1 = aj1 * 3;
+ aj2 = aj2 * 3;
+ aj3 = aj3 * 3;
+
+ xmm1 = _mm_loadh_pi(xmm1,(__m64 *) (xd+aj1)); /*x1 y1 - - */
+ xmm2 = _mm_loadh_pi(xmm2,(__m64 *) (xd+aj2)); /*x2 y2 - - */
+ xmm3 = _mm_loadh_pi(xmm3,(__m64 *) (xd+aj3)); /*x3 y3 - - */
+
+ xmm5 = _mm_load1_ps(xd+aj1+2); /* z1 z1 z1 z1 */
+ xmm6 = _mm_load1_ps(xd+aj2+2); /* z2 z2 z2 z2 */
+ xmm7 = _mm_load1_ps(xd+aj3+2); /* z3 z3 z3 z3 */
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0)); /* z1 z1 z2 z2 */
+ xmm5 = _mm_shuffle_ps(xmm5,xmm7, _MM_SHUFFLE(3,1,3,1)); /* z1 z2 z3 z3 */
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,2,3,2)); /* x1 y1 x2 y2 */
+ xmm2 = _mm_shuffle_ps(xmm3,xmm3, _MM_SHUFFLE(3,2,3,2)); /* x3 y3 x3 y3 */
+
+ xmm6 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(2,0,2,0)); /* x1 x2 x3 x3 */
+ xmm4 = _mm_shuffle_ps(xmm1,xmm2, _MM_SHUFFLE(3,1,3,1)); /* y1 y2 y3 y3 */
+
+ mask = _mm_set_epi32(0,0xffffffff,0xffffffff,0xffffffff);
+ }
+
+ jx = _mm_and_ps( (__m128) mask, xmm6);
+ jy = _mm_and_ps( (__m128) mask, xmm4);
+ jz = _mm_and_ps( (__m128) mask, xmm5);
+
+ dax = _mm_loadu_ps(dadx+n);
+ n = n + offset;
+ dax = _mm_and_ps( (__m128) mask, dax);
+
+ dx = _mm_sub_ps(ix, jx);
+ dy = _mm_sub_ps(iy, jy);
+ dz = _mm_sub_ps(iz, jz);
+
+ fgb = _mm_mul_ps(dva,dax);
+
+ t1 = _mm_mul_ps(fgb,dx); /* fx1, fx2, fx3, fx4 */
+ t2 = _mm_mul_ps(fgb,dy); /* fy1, fy2, fy3, fy4 */
+ t3 = _mm_mul_ps(fgb,dz); /* fz1, fz2, fz3, fz4 */
+
+ if(offset==1) {
+ xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (f+aj1)); /* fx1 fy1 */
+ xmm7 = _mm_load1_ps(f+aj1+2); /* fz1 fz1 fz1 fz1 */
+
+ xmm5 = _mm_shuffle_ps(xmm1,xmm1, _MM_SHUFFLE(0,0,0,0)); /* fx1 - - - */
+ xmm6 = _mm_shuffle_ps(xmm1,xmm1, _MM_SHUFFLE(1,1,1,1)); /* fy1 - - - */
+
+ xmm5 = _mm_sub_ps(xmm5,t1);
+ xmm6 = _mm_sub_ps(xmm6,t2);
+ xmm7 = _mm_sub_ps(xmm7,t3);
+
+ _mm_store_ss(f+aj1 , xmm5);
+ _mm_store_ss(f+aj1+1,xmm6);
+ _mm_store_ss(f+aj1+2,xmm7);
+ }
+ else if(offset==2) {
+ xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (f+aj1)); /*fx1 fy1 - -*/
+ xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (f+aj2)); /*fx2 fy2 - - */
+
+ xmm5 = _mm_load1_ps(f+aj1+2); /* fz1 fz1 fz1 fz1 */
+ xmm6 = _mm_load1_ps(f+aj2+2); /* fz2 fz2 fz2 fz2 */
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(0,0,0,0)); /*fz1 fz1 fz2 fz2*/
+ xmm7 = _mm_shuffle_ps(xmm5,xmm5,_MM_SHUFFLE(2,0,2,0)); /*fz1 fz2 fz1 fz2 */
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2)); /*x1 y1 x2 y2 */
+ xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(2,0,2,0)); /*x1 x2 x1 x2 */
+ xmm6 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,3,1)); /*y1 y2 y1 y2 */
+
+ xmm5 = _mm_sub_ps(xmm5, t1);
+ xmm6 = _mm_sub_ps(xmm6, t2);
+ xmm7 = _mm_sub_ps(xmm7, t3);
+
+ xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /*fx1 fx2 fy1 fy2*/
+ xmm5 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0)); /*fx1 fy1 fx2 fy2*/
+
+ _mm_storel_pi( (__m64 *) (f+aj1), xmm5);
+ _mm_storeh_pi( (__m64 *) (f+aj2), xmm5);
+
+ _mm_store_ss(f+aj1+2,xmm7);
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(f+aj2+2,xmm7);
+ }
+ else {
+ xmm1 = _mm_loadh_pi(xmm1, (__m64 *) (f+aj1)); /*fx1 fy1 - - */
+ xmm2 = _mm_loadh_pi(xmm2, (__m64 *) (f+aj2)); /*fx2 fy2 - - */
+ xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (f+aj3)); /*fx3 fy3 - - */
+
+ xmm5 = _mm_load1_ps(f+aj1+2); /* fz1 fz1 fz1 fz1 */
+ xmm6 = _mm_load1_ps(f+aj2+2); /* fz2 fz2 fz2 fz2 */
+ xmm7 = _mm_load1_ps(f+aj3+2); /* fz3 fz3 fz3 fz3 */
+
+ xmm5 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(0,0,0,0)); /* fz1 fz1 fz2 fz2 */
+ xmm6 = _mm_shuffle_ps(xmm7,xmm7, _MM_SHUFFLE(0,0,0,0)); /* fz3 fz3 fz3 fz3 */
+ xmm7 = _mm_shuffle_ps(xmm5,xmm6, _MM_SHUFFLE(2,0,2,0)); /* fz1 fz2 fz3 fz4 */
+
+ xmm1 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,2,3,2)); /*fx1 fy1 fx2 fy2 */
+ xmm2 = _mm_shuffle_ps(xmm3,xmm3,_MM_SHUFFLE(3,2,3,2)); /*fx2 fy3 fx3 fy3 */
+
+ xmm5 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(2,0,2,0)); /*fx1 fx2 fx3 fx3 */
+ xmm6 = _mm_shuffle_ps(xmm1,xmm2,_MM_SHUFFLE(3,1,3,1)); /*fy1 fy2 fy3 fy3 */
+
+ xmm5 = _mm_sub_ps(xmm5, t1);
+ xmm6 = _mm_sub_ps(xmm6, t2);
+ xmm7 = _mm_sub_ps(xmm7, t3);
+
+ xmm1 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(1,0,1,0)); /*fx1 fx2 fy1 fy2 */
+ xmm2 = _mm_shuffle_ps(xmm5,xmm6,_MM_SHUFFLE(3,2,3,2)); /*fx3 fx3 fy3 fy3 */
+ xmm1 = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(3,1,2,0)); /*fx1 fy1 fx2 fy2 */
+ xmm2 = _mm_shuffle_ps(xmm2,xmm2,_MM_SHUFFLE(3,1,2,0)); /*fx3 fy3 fx3 fy3 */
+
+ _mm_storel_pi( (__m64 *) (f+aj1), xmm1);
+ _mm_storeh_pi( (__m64 *) (f+aj2), xmm1);
+ _mm_storel_pi( (__m64 *) (f+aj3), xmm2);
+
+ _mm_store_ss(f+aj1+2,xmm7); /*fz1*/
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(f+aj2+2,xmm7); /*fz2*/
+ xmm7 = _mm_shuffle_ps(xmm7,xmm7,_MM_SHUFFLE(0,3,2,1));
+ _mm_store_ss(f+aj3+2,xmm7); /*fz3*/
+
+ }
+
+ t1 = _mm_and_ps( (__m128) mask, t1);
+ t2 = _mm_and_ps( (__m128) mask, t2);
+ t3 = _mm_and_ps( (__m128) mask, t3);
+
+ fix = _mm_add_ps(fix,t1);
+ fiy = _mm_add_ps(fiy,t2);
+ fiz = _mm_add_ps(fiz,t3);
+
+ } /*end offset!=0*/
+
+ /* fix/fiy/fiz now contain four partial force terms, that all should be
+ * added to the i particle forces.
+ */
+ t1 = _mm_movehl_ps(t1,fix);
+ t2 = _mm_movehl_ps(t2,fiy);
+ t3 = _mm_movehl_ps(t3,fiz);
+
+ fix = _mm_add_ps(fix,t1);
+ fiy = _mm_add_ps(fiy,t2);
+ fiz = _mm_add_ps(fiz,t3);
+
+ t1 = _mm_shuffle_ps( fix, fix, _MM_SHUFFLE(1,1,1,1) );
+ t2 = _mm_shuffle_ps( fiy, fiy, _MM_SHUFFLE(1,1,1,1) );
+ t3 = _mm_shuffle_ps( fiz, fiz, _MM_SHUFFLE(1,1,1,1) );
+
+ fix = _mm_add_ss(fix,t1); /*fx - - - */
+ fiy = _mm_add_ss(fiy,t2); /*fy - - - */
+ fiz = _mm_add_ss(fiz,t3); /*fz - - - */
+
+ xmm2 = _mm_unpacklo_ps(fix,fiy); /*fx, fy, - - */
+ xmm2 = _mm_movelh_ps(xmm2,fiz);
+ xmm2 = _mm_and_ps( (__m128) maski, xmm2);
+
+ /* load i force from memory */
+ xmm4 = _mm_loadl_pi(xmm4,(__m64 *) (f+ai3)); /*fx fy - - */
+ xmm5 = _mm_load1_ps(f+ai3+2); /* fz fz fz fz */
+ xmm4 = _mm_shuffle_ps(xmm4,xmm5,_MM_SHUFFLE(3,2,1,0)); /*fx fy fz fz*/
+
+ /* add to i force */
+ xmm4 = _mm_add_ps(xmm4,xmm2);
+
+ /* store i force to memory */
+ _mm_storel_pi( (__m64 *) (f+ai3),xmm4); /* fx fy - - */
+ xmm4 = _mm_shuffle_ps(xmm4,xmm4,_MM_SHUFFLE(2,2,2,2)); /* only the third term is correct for fz */
+ _mm_store_ss(f+ai3+2,xmm4); /*fz*/
+ }
+
+ return 0;
+}
+
+
+
+
+
+
+#else
+/* keep compiler happy */
+int genborn_sse_dummy;
+
+#endif /* SSE intrinsics available */
--- /dev/null
+/*
+ * $Id$
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2008, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+
+
+#ifndef _genborn_sse_h
+#define _genborn_sse_h
+
+#include "typedefs.h"
+#include "grompp.h"
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+
+real
+calc_gb_chainrule_sse(int natoms, t_nblist *nl, real *dadx, real *dvda, real *xd, real *f, int gb_algorithm, gmx_genborn_t *born);
+
+
+int
+calc_gb_rad_still_sse(t_commrec *cr, t_forcerec *fr,int natoms, gmx_mtop_t *mtop,
+ const t_atomtypes *atype, real *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md);
+
+int
+calc_gb_rad_hct_sse(t_commrec *cr, t_forcerec *fr, int natoms, gmx_mtop_t *mtop, const t_atomtypes *atype, real *x,
+ t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md);
+
+int
+calc_gb_rad_obc_sse(t_commrec *cr, t_forcerec * fr, int natoms, gmx_mtop_t *mtop,
+ const t_atomtypes *atype, real *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md);
+
+
+
+
+
+#endif /* _genborn_sse_h */
gmx_vsite_t *vsite,gmx_constr_t constr,
t_fcdata *fcd,
t_graph *graph,t_mdatoms *mdatoms,
- t_forcerec *fr, rvec mu_tot,
+ t_forcerec *fr, gmx_genborn_t *born,rvec mu_tot,
gmx_enerdata_t *enerd,tensor vir,tensor pres,
int count,bool bFirst)
{
* We do not unshift, so molecules are always whole in congrad.c
*/
do_force(fplog,cr,inputrec,
- count,nrnb,wcycle,top,&top_global->groups,
+ count,nrnb,wcycle,top,top_global,&top_global->groups,
ems->s.box,ems->s.x,&ems->s.hist,
ems->f,*buf,force_vir,mdatoms,enerd,fcd,
- ems->s.lambda,graph,fr,vsite,mu_tot,t,NULL,NULL,
+ ems->s.lambda,graph,fr,vsite,mu_tot,t,NULL,NULL,born,FALSE,
GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | GMX_FORCE_VIRIAL |
(bNS ? GMX_FORCE_NS : 0));
rvec buf[],t_mdatoms *mdatoms,
t_nrnb *nrnb,gmx_wallcycle_t wcycle,
gmx_edsam_t ed,
- t_forcerec *fr,
+ t_forcerec *fr,gmx_genborn_t *born,
int repl_ex_nst,int repl_ex_seed,
real cpt_period,real max_hours,
unsigned long Flags,
evaluate_energy(fplog,bVerbose,cr,
state_global,top_global,s_min,&buf,top,
inputrec,nrnb,wcycle,
- vsite,constr,fcd,graph,mdatoms,fr,
+ vsite,constr,fcd,graph,mdatoms,fr,born,
mu_tot,enerd,vir,pres,-1,TRUE);
where();
evaluate_energy(fplog,bVerbose,cr,
state_global,top_global,s_c,&buf,top,
inputrec,nrnb,wcycle,
- vsite,constr,fcd,graph,mdatoms,fr,
+ vsite,constr,fcd,graph,mdatoms,fr,born,
mu_tot,enerd,vir,pres,-1,FALSE);
/* Calc derivative along line */
evaluate_energy(fplog,bVerbose,cr,
state_global,top_global,s_b,&buf,top,
inputrec,nrnb,wcycle,
- vsite,constr,fcd,graph,mdatoms,fr,
+ vsite,constr,fcd,graph,mdatoms,fr,born,
mu_tot,enerd,vir,pres,-1,FALSE);
/* p does not change within a step, but since the domain decomposition
rvec buf[],t_mdatoms *mdatoms,
t_nrnb *nrnb,gmx_wallcycle_t wcycle,
gmx_edsam_t ed,
- t_forcerec *fr,
+ t_forcerec *fr,gmx_genborn_t *born,
int repl_ex_nst,int repl_ex_seed,
real cpt_period,real max_hours,
unsigned long Flags,
evaluate_energy(fplog,bVerbose,cr,
state,top_global,&ems,&buf,top,
inputrec,nrnb,wcycle,
- vsite,constr,fcd,graph,mdatoms,fr,
+ vsite,constr,fcd,graph,mdatoms,fr,born,
mu_tot,enerd,vir,pres,-1,TRUE);
where();
evaluate_energy(fplog,bVerbose,cr,
state,top_global,&ems,&buf,top,
inputrec,nrnb,wcycle,
- vsite,constr,fcd,graph,mdatoms,fr,
+ vsite,constr,fcd,graph,mdatoms,fr,born,
mu_tot,enerd,vir,pres,step,FALSE);
EpotC = ems.epot;
evaluate_energy(fplog,bVerbose,cr,
state,top_global,&ems,&buf,top,
inputrec,nrnb,wcycle,
- vsite,constr,fcd,graph,mdatoms,fr,
+ vsite,constr,fcd,graph,mdatoms,fr,born,
mu_tot,enerd,vir,pres,step,FALSE);
EpotB = ems.epot;
rvec buf[],t_mdatoms *mdatoms,
t_nrnb *nrnb,gmx_wallcycle_t wcycle,
gmx_edsam_t ed,
- t_forcerec *fr,
+ t_forcerec *fr,gmx_genborn_t *born,
int repl_ex_nst,int repl_ex_seed,
real cpt_period,real max_hours,
unsigned long Flags,
evaluate_energy(fplog,bVerbose,cr,
state_global,top_global,s_try,&buf,top,
inputrec,nrnb,wcycle,
- vsite,constr,fcd,graph,mdatoms,fr,
+ vsite,constr,fcd,graph,mdatoms,fr,born,
mu_tot,enerd,vir,pres,count,count==0);
if (MASTER(cr))
rvec buf[],t_mdatoms *mdatoms,
t_nrnb *nrnb,gmx_wallcycle_t wcycle,
gmx_edsam_t ed,
- t_forcerec *fr,
+ t_forcerec *fr,gmx_genborn_t *born,
int repl_ex_nst,int repl_ex_seed,
real cpt_period,real max_hours,
unsigned long Flags,
evaluate_energy(fplog,bVerbose,cr,
state_global,top_global,state_work,&buf,top,
inputrec,nrnb,wcycle,
- vsite,constr,fcd,graph,mdatoms,fr,
+ vsite,constr,fcd,graph,mdatoms,fr,born,
mu_tot,enerd,vir,pres,count,count==0);
count++;
evaluate_energy(fplog,bVerbose,cr,
state_global,top_global,state_work,&buf,top,
inputrec,nrnb,wcycle,
- vsite,constr,fcd,graph,mdatoms,fr,
+ vsite,constr,fcd,graph,mdatoms,fr,born,
mu_tot,enerd,vir,pres,count,count==0);
count++;
evaluate_energy(fplog,bVerbose,cr,
state_global,top_global,state_work,&buf,top,
inputrec,nrnb,wcycle,
- vsite,constr,fcd,graph,mdatoms,fr,
+ vsite,constr,fcd,graph,mdatoms,fr,born,
mu_tot,enerd,vir,pres,count,count==0);
count++;
rvec buf[],t_mdatoms *mdatoms,
t_nrnb *nrnb,gmx_wallcycle_t wcycle,
gmx_edsam_t ed,
- t_forcerec *fr,
+ t_forcerec *fr,gmx_genborn_t *born,
int repl_ex_nst,int repl_ex_seed,
real cpt_period,real max_hours,
unsigned long Flags,
/* Make do_force do a single node fore calculation */
cr->nnodes = 1;
do_force(fplog,cr,inputrec,
- step,nrnb,wcycle,top,&top_global->groups,
+ step,nrnb,wcycle,top,top_global,&top_global->groups,
rerun_fr.box,state->x,&state->hist,
f,buf,force_vir,mdatoms,enerd,fcd,
- lambda,NULL,fr,NULL,mu_tot,t,NULL,NULL,
+ lambda,NULL,fr,NULL,mu_tot,t,NULL,NULL,born,FALSE,
GMX_FORCE_NONBONDED |
(bNS ? GMX_FORCE_NS : 0) |
(bStateChanged ? GMX_FORCE_STATECHANGED : 0));
}
/* Determine the values for icoul/ivdw. */
- if (fr->bcoultab)
+ /* Start with GB */
+ if(fr->bGB)
+ {
+ icoul=4;
+ }
+ else if (fr->bcoultab)
{
icoul = 3;
}
init_nblist(&fr->QMMMlist_sr,&fr->QMMMlist_lr,
maxsr,maxlr,0,icoul,FALSE,enlistATOM_ATOM);
}
-
+
}
static void reset_nblist(t_nblist *nl)
}
}
+
int search_neighbours(FILE *log,t_forcerec *fr,
rvec x[],matrix box,
gmx_localtop_t *top,
int cg_start,cg_end,start,end;
gmx_ns_t *ns;
t_grid *grid;
-
+
ns = &fr->ns;
/* Set some local variables */
if (bGrid && bFillGrid)
{
+
grid = ns->grid;
bFilledHome = (DOMAINDECOMP(cr) && dd_filled_nsgrid_home(cr->dd));
if (!bFilledHome)
int relax_shell_flexcon(FILE *fplog,t_commrec *cr,bool bVerbose,
int mdstep,t_inputrec *inputrec,
bool bDoNS,bool bStopCM,
- gmx_localtop_t *top,gmx_constr_t constr,
+ gmx_localtop_t *top,
+ gmx_mtop_t* mtop,
+ gmx_constr_t constr,
gmx_enerdata_t *enerd,t_fcdata *fcd,
t_state *state,rvec f[],
rvec buf[],tensor force_vir,
gmx_groups_t *groups,
struct gmx_shellfc *shfc,
t_forcerec *fr,
+ gmx_genborn_t *born, bool bBornRadii,
double t,rvec mu_tot,
int natoms,bool *bConverged,
gmx_vsite_t *vsite,
if (gmx_debug_at) {
pr_rvecs(debug,0,"x b4 do_force",state->x + start,homenr);
}
- do_force(fplog,cr,inputrec,mdstep,nrnb,wcycle,top,groups,
+ do_force(fplog,cr,inputrec,mdstep,nrnb,wcycle,top,mtop,groups,
state->box,state->x,&state->hist,
force[Min],buf,force_vir,md,enerd,fcd,
state->lambda,graph,
- fr,vsite,mu_tot,t,fp_field,NULL,
+ fr,vsite,mu_tot,t,fp_field,NULL,born,bBornRadii,
GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES |
(bDoNS ? GMX_FORCE_NS : 0) | GMX_FORCE_VIRIAL);
}
/* Try the new positions */
do_force(fplog,cr,inputrec,1,nrnb,wcycle,
- top,groups,state->box,pos[Try],&state->hist,
+ top,mtop,groups,state->box,pos[Try],&state->hist,
force[Try],buf,force_vir,
md,enerd,fcd,state->lambda,graph,
- fr,vsite,mu_tot,t,fp_field,NULL,
+ fr,vsite,mu_tot,t,fp_field,NULL,born,bBornRadii,
GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | GMX_FORCE_VIRIAL);
if (gmx_debug_at) {
#include "domdec.h"
#include "partdec.h"
#include "gmx_wallcycle.h"
+#include "genborn.h"
#ifdef GMX_MPI
#include "mpi.h"
t_inputrec *inputrec,
int step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
gmx_localtop_t *top,
+ gmx_mtop_t *mtop,
gmx_groups_t *groups,
matrix box,rvec x[],history_t *hist,
rvec f[],rvec buf[],
real lambda,t_graph *graph,
t_forcerec *fr,gmx_vsite_t *vsite,rvec mu_tot,
double t,FILE *field,gmx_edsam_t ed,
+ gmx_genborn_t *born, bool bBornRadii,
int flags)
{
static rvec box_size;
wallcycle_stop(wcycle,ewcNS);
}
+ if(inputrec->implicit_solvent && bNS)
+ gb_nblist_siev(cr,mdatoms->nr,inputrec->gb_algorithm, inputrec->rlist,x,fr,&top->idef.il[F_GB], (born->n12+born->n13)*3);
+
if (DOMAINDECOMP(cr)) {
if (!(cr->duty & DUTY_PME)) {
wallcycle_start(wcycle,ewcPPDURINGPME);
/* Compute the bonded and non-bonded forces */
do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
- x,hist,f,enerd,fcd,box,lambda,graph,&(top->excls),mu_tot_AB,
+ x,hist,f,enerd,fcd,mtop,born,
+ &(top->atomtypes),bBornRadii,box,
+ lambda,graph,&(top->excls),mu_tot_AB,
flags,&cycles_force);
GMX_BARRIER(cr->mpi_comm_mygroup);
return table;
}
+t_forcetable make_gb_table(FILE *out,const t_forcerec *fr,
+ const char *fn,
+ real rtab)
+{
+ const char *fns[3] = { "gbctab.xvg", "gbdtab.xvg", "gbrtab.xvg" };
+ const char *fns14[3] = { "gbctab14.xvg", "gbdtab14.xvg", "gbrtab14.xvg" };
+ FILE *fp;
+ t_tabledata *td;
+ bool bReadTab,bGenTab;
+ real x0,y0,yp;
+ int i,j,k,nx,nx0,tabsel[etiNR];
+ void * p_tmp;
+ double r,r2,Vtab,Ftab,expterm;
+
+ t_forcetable table;
+
+ double abs_error_r, abs_error_r2;
+ double rel_error_r, rel_error_r2;
+ double rel_error_r_old=0, rel_error_r2_old=0;
+ double x0_r_error, x0_r2_error;
+
+
+ /* Only set a Coulomb table for GB */
+ //tabsel[0]=etabGB;
+ //tabsel[1]=-1;
+ //tabsel[2]=-1;
+
+ /* Set the table dimensions for GB, not really necessary to
+ * use etiNR (since we only have one table, but ...)
+ */
+ snew(td,1);
+ table.r = fr->gbtabr;
+ table.scale = fr->gbtabscale;
+ table.scale_exp = 0;
+ table.n = table.scale*table.r;
+ nx0 = 0;
+ nx = table.scale*table.r;
+
+ /* Check whether we have to read or generate
+ * We will always generate a table, so remove the read code
+ * (Compare with original make_table function
+ */
+ bReadTab = FALSE;
+ bGenTab = TRUE;
+
+ /* Each table type (e.g. coul,lj6,lj12) requires four
+ * numbers per datapoint. For performance reasons we want
+ * the table data to be aligned to 16-byte. This is accomplished
+ * by allocating 16 bytes extra to a temporary pointer, and then
+ * calculating an aligned pointer. This new pointer must not be
+ * used in a free() call, but thankfully we're sloppy enough not
+ * to do this :-)
+ */
+
+ /* 4 fp entries per table point, nx+1 points, and 16 bytes extra to align it. */
+ p_tmp = malloc(4*(nx+1)*sizeof(real)+16);
+
+ /* align it - size_t has the same same as a pointer */
+ table.tab = (real *) (((size_t) p_tmp + 16) & (~((size_t) 15)));
+
+ init_table(out,nx,nx0,table.scale,&(td[0]),!bReadTab);
+
+ /* Local implementation so we don't have to use the etabGB
+ * enum above, which will cause problems later when
+ * making the other tables (right now even though we are using
+ * GB, the normal Coulomb tables will be created, but this
+ * will cause a problem since fr->eeltype==etabGB which will not
+ * be defined in fill_table and set_table_type
+ */
+ //fp=fopen("test.xvg","w");
+ for(i=nx0;i<nx;i++)
+ {
+ Vtab = 0.0;
+ Ftab = 0.0;
+ r = td->x[i];
+ r2 = r*r;
+ expterm = exp(-0.25*r2);
+
+ Vtab = 1/sqrt(r2+expterm);
+ Ftab = (r-0.25*r*expterm)/((r2+expterm)*sqrt(r2+expterm));
+
+ /* Convert to single precision when we store to mem */
+ td->v[i] = Vtab;
+ td->f[i] = Ftab;
+
+ }
+
+ copy2table(table.n,0,4,td[0].x,td[0].v,td[0].f,table.tab);
+
+ if(bDebugMode())
+ {
+ fp=xvgropen(fns[0],fns[0],"r","V");
+ /* plot the output 5 times denser than the table data */
+ //for(i=5*nx0;i<5*table.n;i++)
+ for(i=nx0;i<table.n;i++)
+ {
+ //x0=i*table.r/(5*table.n);
+ x0=i*table.r/table.n;
+ evaluate_table(table.tab,0,4,table.scale,x0,&y0,&yp);
+ fprintf(fp,"%15.10e %15.10e %15.10e\n",x0,y0,yp);
+
+ }
+ ffclose(fp);
+ }
+
+ /*
+ for(i=100*nx0;i<99.81*table.n;i++)
+ {
+ r = i*table.r/(100*table.n);
+ r2 = r*r;
+ expterm = exp(-0.25*r2);
+
+ Vtab = 1/sqrt(r2+expterm);
+ Ftab = (r-0.25*r*expterm)/((r2+expterm)*sqrt(r2+expterm));
+
+
+ evaluate_table(table.tab,0,4,table.scale,r,&y0,&yp);
+ printf("gb: i=%d, x0=%g, y0=%15.15f, Vtab=%15.15f, yp=%15.15f, Ftab=%15.15f\n",i,r, y0, Vtab, yp, Ftab);
+
+ abs_error_r=fabs(y0-Vtab);
+ abs_error_r2=fabs(yp-(-1)*Ftab);
+
+ rel_error_r=abs_error_r/y0;
+ rel_error_r2=fabs(abs_error_r2/yp);
+
+ //printf("rel_error=%15.15f, rel_error_r2=%15.15f\n",rel_error_r, rel_error_r2);
+ //printf("abs_error=%15.15f, abs_error_r2=%15.15f\n",abs_error_r, abs_error_r2);
+
+ if(rel_error_r>rel_error_r_old)
+ {
+ rel_error_r_old=rel_error_r;
+ x0_r_error=x0;
+ }
+
+ if(rel_error_r2>rel_error_r2_old)
+ {
+ rel_error_r2_old=rel_error_r2;
+ x0_r2_error=x0;
+ }
+ }
+
+ printf("gb: MAX REL ERROR IN R=%15.15f, MAX REL ERROR IN R2=%15.15f\n",rel_error_r_old, rel_error_r2_old);
+ printf("gb: XO_R=%g, X0_R2=%g\n",x0_r_error, x0_r2_error);
+
+ exit(1); */
+ done_tabledata(&(td[0]));
+ sfree(td);
+
+ return table;
+
+
+}
+
bondedtable_t make_bonded_table(FILE *fplog,char *fn,int angle)
{
t_tabledata td;
return tab;
}
+
+