2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
50 #include "gromacs/commandline/cmdlineoptionsmodule.h"
51 #include "gromacs/compat/make_unique.h"
52 #include "gromacs/fileio/confio.h"
53 #include "gromacs/fileio/filetypes.h"
54 #include "gromacs/fileio/gmxfio.h"
55 #include "gromacs/fileio/pdbio.h"
56 #include "gromacs/gmxlib/conformation-utilities.h"
57 #include "gromacs/gmxpreprocess/fflibutil.h"
58 #include "gromacs/gmxpreprocess/genhydro.h"
59 #include "gromacs/gmxpreprocess/grompp-impl.h"
60 #include "gromacs/gmxpreprocess/h_db.h"
61 #include "gromacs/gmxpreprocess/hackblock.h"
62 #include "gromacs/gmxpreprocess/hizzie.h"
63 #include "gromacs/gmxpreprocess/pdb2top.h"
64 #include "gromacs/gmxpreprocess/pgutil.h"
65 #include "gromacs/gmxpreprocess/resall.h"
66 #include "gromacs/gmxpreprocess/specbond.h"
67 #include "gromacs/gmxpreprocess/ter_db.h"
68 #include "gromacs/gmxpreprocess/toputil.h"
69 #include "gromacs/gmxpreprocess/xlate.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/options/basicoptions.h"
72 #include "gromacs/options/filenameoption.h"
73 #include "gromacs/options/ioptionscontainer.h"
74 #include "gromacs/topology/atomprop.h"
75 #include "gromacs/topology/block.h"
76 #include "gromacs/topology/index.h"
77 #include "gromacs/topology/residuetypes.h"
78 #include "gromacs/topology/symtab.h"
79 #include "gromacs/topology/topology.h"
80 #include "gromacs/utility/dir_separator.h"
81 #include "gromacs/utility/exceptions.h"
82 #include "gromacs/utility/fatalerror.h"
83 #include "gromacs/utility/path.h"
84 #include "gromacs/utility/smalloc.h"
85 #include "gromacs/utility/strdb.h"
86 #include "gromacs/utility/stringutil.h"
90 char gmx[RTP_MAXCHAR+2];
91 char main[RTP_MAXCHAR+2];
92 char nter[RTP_MAXCHAR+2];
93 char cter[RTP_MAXCHAR+2];
94 char bter[RTP_MAXCHAR+2];
97 static const char *res2bb_notermini(const char *name,
98 int nrr, const rtprename_t *rr)
100 /* NOTE: This function returns the main building block name,
101 * it does not take terminal renaming into account.
106 while (i < nrr && gmx_strcasecmp(name, rr[i].gmx) != 0)
111 return (i < nrr ? rr[i].main : name);
114 static const char *select_res(int nr, int resnr,
115 const char *name[], const char *expl[],
117 int nrr, const rtprename_t *rr)
121 printf("Which %s type do you want for residue %d\n", title, resnr+1);
122 for (sel = 0; (sel < nr); sel++)
124 printf("%d. %s (%s)\n",
125 sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
127 printf("\nType a number:"); fflush(stdout);
129 if (scanf("%d", &sel) != 1)
131 gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
137 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
142 const char *lh[easpNR] = { "ASP", "ASPH" };
143 const char *expl[easpNR] = {
144 "Not protonated (charge -1)",
145 "Protonated (charge 0)"
148 return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
151 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
156 const char *lh[egluNR] = { "GLU", "GLUH" };
157 const char *expl[egluNR] = {
158 "Not protonated (charge -1)",
159 "Protonated (charge 0)"
162 return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
165 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
170 const char *lh[eglnNR] = { "GLN", "QLN" };
171 const char *expl[eglnNR] = {
172 "Not protonated (charge 0)",
173 "Protonated (charge +1)"
176 return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
179 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
184 const char *lh[elysNR] = { "LYSN", "LYS" };
185 const char *expl[elysNR] = {
186 "Not protonated (charge 0)",
187 "Protonated (charge +1)"
190 return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
193 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
198 const char *lh[eargNR] = { "ARGN", "ARG" };
199 const char *expl[eargNR] = {
200 "Not protonated (charge 0)",
201 "Protonated (charge +1)"
204 return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
207 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
209 const char *expl[ehisNR] = {
216 return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
219 static void read_rtprename(const char *fname, FILE *fp,
220 int *nrtprename, rtprename_t **rtprename)
222 char line[STRLEN], buf[STRLEN];
231 while (get_a_line(fp, line, STRLEN))
234 /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
235 * For other args, we read up to 6 chars (so we can detect if the length is > 5).
236 * Note that the buffer length has been increased to 7 to allow this,
237 * so we just need to make sure the strings have zero-length initially.
240 rr[n].main[0] = '\0';
241 rr[n].nter[0] = '\0';
242 rr[n].cter[0] = '\0';
243 rr[n].bter[0] = '\0';
244 nc = sscanf(line, "%6s %6s %6s %6s %6s %s",
245 rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
246 if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
247 strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
249 gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
250 fname, RTP_MAXCHAR, line);
255 if (nc != 2 && nc != 5)
257 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d", fname, ncol, 2, 5);
263 gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
268 /* This file does not have special termini names, copy them from main */
269 strcpy(rr[n].nter, rr[n].main);
270 strcpy(rr[n].cter, rr[n].main);
271 strcpy(rr[n].bter, rr[n].main);
281 static char *search_resrename(int nrr, rtprename_t *rr,
283 bool bStart, bool bEnd,
284 bool bCompareFFRTPname)
292 while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx) != 0) ||
293 ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
298 /* If found in the database, rename this residue's rtp building block,
299 * otherwise keep the old name.
322 gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
329 static void rename_resrtp(t_atoms *pdba, int nterpairs, const int *r_start, const int *r_end,
330 int nrr, rtprename_t *rr, t_symtab *symtab,
338 bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
340 for (r = 0; r < pdba->nres; r++)
344 for (j = 0; j < nterpairs; j++)
351 for (j = 0; j < nterpairs; j++)
359 nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
361 if (bFFRTPTERRNM && nn == nullptr && (bStart || bEnd))
363 /* This is a terminal residue, but the residue name,
364 * currently stored in .rtp, is not a standard residue name,
365 * but probably a force field specific rtp name.
366 * Check if we need to rename it because it is terminal.
368 nn = search_resrename(nrr, rr,
369 *pdba->resinfo[r].rtp, bStart, bEnd, true);
372 if (nn != nullptr && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
376 printf("Changing rtp entry of residue %d %s to '%s'\n",
377 pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
379 pdba->resinfo[r].rtp = put_symtab(symtab, nn);
384 static void pdbres_to_gmxrtp(t_atoms *pdba)
388 for (i = 0; (i < pdba->nres); i++)
390 if (pdba->resinfo[i].rtp == nullptr)
392 pdba->resinfo[i].rtp = pdba->resinfo[i].name;
397 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
398 bool bFullCompare, t_symtab *symtab)
403 for (i = 0; (i < pdba->nres); i++)
405 resnm = *pdba->resinfo[i].name;
406 if ((bFullCompare && (gmx_strcasecmp(resnm, oldnm) == 0)) ||
407 (!bFullCompare && strstr(resnm, oldnm) != nullptr))
409 /* Rename the residue name (not the rtp name) */
410 pdba->resinfo[i].name = put_symtab(symtab, newnm);
415 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
416 bool bFullCompare, t_symtab *symtab)
421 for (i = 0; (i < pdba->nres); i++)
423 /* We have not set the rtp name yes, use the residue name */
424 bbnm = *pdba->resinfo[i].name;
425 if ((bFullCompare && (gmx_strcasecmp(bbnm, oldnm) == 0)) ||
426 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
428 /* Change the rtp builing block name */
429 pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
434 static void rename_bbint(t_atoms *pdba, const char *oldnm,
435 const char *gettp(int, int, const rtprename_t *),
438 int nrr, const rtprename_t *rr)
444 for (i = 0; i < pdba->nres; i++)
446 /* We have not set the rtp name yet, use the residue name */
447 bbnm = *pdba->resinfo[i].name;
448 if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
449 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
451 ptr = gettp(i, nrr, rr);
452 pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
457 static void check_occupancy(t_atoms *atoms, const char *filename, bool bVerbose)
463 ftp = fn2ftp(filename);
464 if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
466 fprintf(stderr, "No occupancies in %s\n", filename);
470 for (i = 0; (i < atoms->nr); i++)
472 if (atoms->pdbinfo[i].occup != 1)
476 fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
477 *atoms->resinfo[atoms->atom[i].resind].name,
478 atoms->resinfo[ atoms->atom[i].resind].nr,
480 atoms->pdbinfo[i].occup);
482 if (atoms->pdbinfo[i].occup == 0)
492 if (nzero == atoms->nr)
494 fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
496 else if ((nzero > 0) || (nnotone > 0))
500 "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
501 " occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
503 nzero, nnotone, atoms->nr);
507 fprintf(stderr, "All occupancies are one\n");
512 static void write_posres(const char *fn, t_atoms *pdba, real fc)
517 fp = gmx_fio_fopen(fn, "w");
519 "; In this topology include file, you will find position restraint\n"
520 "; entries for all the heavy atoms in your original pdb file.\n"
521 "; This means that all the protons which were added by pdb2gmx are\n"
522 "; not restrained.\n"
524 "[ position_restraints ]\n"
525 "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
527 for (i = 0; (i < pdba->nr); i++)
529 if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
531 fprintf(fp, "%6d%6d %g %g %g\n", i+1, 1, fc, fc, fc);
537 static int read_pdball(const char *inf, bool bOutput, const char *outf, char **title,
538 t_atoms *atoms, rvec **x,
539 int *ePBC, matrix box, bool bRemoveH,
540 t_symtab *symtab, gmx_residuetype_t *rt, const char *watres,
541 AtomProperties *aps, bool bVerbose)
542 /* Read a pdb file. (containing proteins) */
544 int natom, new_natom, i;
547 printf("Reading %s...\n", inf);
548 readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
550 if (atoms->pdbinfo == nullptr)
552 snew(atoms->pdbinfo, atoms->nr);
554 if (fn2ftp(inf) == efPDB)
556 get_pdb_atomnumber(atoms, aps);
561 for (i = 0; i < atoms->nr; i++)
563 if (!is_hydrogen(*atoms->atomname[i]))
565 atoms->atom[new_natom] = atoms->atom[i];
566 atoms->atomname[new_natom] = atoms->atomname[i];
567 atoms->pdbinfo[new_natom] = atoms->pdbinfo[i];
568 copy_rvec((*x)[i], (*x)[new_natom]);
572 atoms->nr = new_natom;
579 printf(" '%s',", *title);
581 printf(" %d atoms\n", natom);
583 /* Rename residues */
584 rename_pdbres(atoms, "HOH", watres, false, symtab);
585 rename_pdbres(atoms, "SOL", watres, false, symtab);
586 rename_pdbres(atoms, "WAT", watres, false, symtab);
588 rename_atoms("xlateat.dat", nullptr,
589 atoms, symtab, nullptr, true, rt, true, bVerbose);
597 write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
603 static void process_chain(t_atoms *pdba, rvec *x,
604 bool bTrpU, bool bPheU, bool bTyrU,
605 bool bLysMan, bool bAspMan, bool bGluMan,
606 bool bHisMan, bool bArgMan, bool bGlnMan,
607 real angle, real distance, t_symtab *symtab,
608 int nrr, const rtprename_t *rr)
610 /* Rename aromatics, lys, asp and histidine */
613 rename_bb(pdba, "TYR", "TYRU", false, symtab);
617 rename_bb(pdba, "TRP", "TRPU", false, symtab);
621 rename_bb(pdba, "PHE", "PHEU", false, symtab);
625 rename_bbint(pdba, "LYS", get_lystp, false, symtab, nrr, rr);
629 rename_bbint(pdba, "ARG", get_argtp, false, symtab, nrr, rr);
633 rename_bbint(pdba, "GLN", get_glntp, false, symtab, nrr, rr);
637 rename_bbint(pdba, "ASP", get_asptp, false, symtab, nrr, rr);
641 rename_bb(pdba, "ASPH", "ASP", false, symtab);
645 rename_bbint(pdba, "GLU", get_glutp, false, symtab, nrr, rr);
649 rename_bb(pdba, "GLUH", "GLU", false, symtab);
654 set_histp(pdba, x, angle, distance);
658 rename_bbint(pdba, "HIS", get_histp, true, symtab, nrr, rr);
661 /* Initialize the rtp builing block names with the residue names
662 * for the residues that have not been processed above.
664 pdbres_to_gmxrtp(pdba);
666 /* Now we have all rtp names set.
667 * The rtp names will conform to Gromacs naming,
668 * unless the input pdb file contained one or more force field specific
669 * rtp names as residue names.
673 /* struct for sorting the atoms from the pdb file */
675 int resnr; /* residue number */
676 int j; /* database order index */
677 int index; /* original atom number */
678 char anm1; /* second letter of atom name */
679 char altloc; /* alternate location indicator */
682 static bool pdbicomp(const t_pdbindex &a, const t_pdbindex &b)
684 int d = (a.resnr - b.resnr);
690 d = (a.anm1 - b.anm1);
693 d = (a.altloc - b.altloc);
700 static void sort_pdbatoms(t_restp restp[],
701 int natoms, t_atoms **pdbaptr, rvec **x,
702 t_blocka *block, char ***gnames)
704 t_atoms *pdba, *pdbnew;
718 for (i = 0; i < natoms; i++)
720 atomnm = *pdba->atomname[i];
721 rptr = &restp[pdba->atom[i].resind];
722 for (j = 0; (j < rptr->natom); j++)
724 if (gmx_strcasecmp(atomnm, *(rptr->atomname[j])) == 0)
729 if (j == rptr->natom)
734 "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
735 "while sorting atoms.\n%s", atomnm,
736 *pdba->resinfo[pdba->atom[i].resind].name,
737 pdba->resinfo[pdba->atom[i].resind].nr,
740 is_hydrogen(atomnm) ?
741 "\nFor a hydrogen, this can be a different protonation state, or it\n"
742 "might have had a different number in the PDB file and was rebuilt\n"
743 "(it might for instance have been H3, and we only expected H1 & H2).\n"
744 "Note that hydrogens might have been added to the entry for the N-terminus.\n"
745 "Remove this hydrogen or choose a different protonation state to solve it.\n"
746 "Option -ignh will ignore all hydrogens in the input." : ".");
747 gmx_fatal(FARGS, "%s", buf);
749 /* make shadow array to be sorted into indexgroup */
750 pdbi[i].resnr = pdba->atom[i].resind;
753 pdbi[i].anm1 = atomnm[1];
754 pdbi[i].altloc = pdba->pdbinfo[i].altloc;
756 std::sort(pdbi, pdbi+natoms, pdbicomp);
758 /* pdba is sorted in pdbnew using the pdbi index */
761 init_t_atoms(pdbnew, natoms, true);
763 pdbnew->nr = pdba->nr;
764 pdbnew->nres = pdba->nres;
765 sfree(pdbnew->resinfo);
766 pdbnew->resinfo = pdba->resinfo;
767 for (i = 0; i < natoms; i++)
769 pdbnew->atom[i] = pdba->atom[pdbi[i].index];
770 pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
771 pdbnew->pdbinfo[i] = pdba->pdbinfo[pdbi[i].index];
772 copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
773 /* make indexgroup in block */
774 a[i] = pdbi[i].index;
777 sfree(pdba->atomname);
779 sfree(pdba->pdbinfo);
782 /* copy the sorted pdbnew back to pdba */
785 add_grp(block, gnames, natoms, a, "prot_sort");
791 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], bool bVerbose)
793 int i, j, oldnatoms, ndel;
796 printf("Checking for duplicate atoms....\n");
797 oldnatoms = pdba->nr;
799 /* NOTE: pdba->nr is modified inside the loop */
800 for (i = 1; (i < pdba->nr); i++)
802 /* compare 'i' and 'i-1', throw away 'i' if they are identical
803 this is a 'while' because multiple alternate locations can be present */
804 while ( (i < pdba->nr) &&
805 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
806 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
811 ri = &pdba->resinfo[pdba->atom[i].resind];
812 printf("deleting duplicate atom %4s %s%4d%c",
813 *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
814 if (ri->chainid && (ri->chainid != ' '))
816 printf(" ch %c", ri->chainid);
820 if (pdba->pdbinfo[i].atomnr)
822 printf(" pdb nr %4d", pdba->pdbinfo[i].atomnr);
824 if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
826 printf(" altloc %c", pdba->pdbinfo[i].altloc);
832 /* We can not free, since it might be in the symtab */
833 /* sfree(pdba->atomname[i]); */
834 for (j = i; j < pdba->nr; j++)
836 pdba->atom[j] = pdba->atom[j+1];
837 pdba->atomname[j] = pdba->atomname[j+1];
840 pdba->pdbinfo[j] = pdba->pdbinfo[j+1];
842 copy_rvec(x[j+1], x[j]);
844 srenew(pdba->atom, pdba->nr);
845 /* srenew(pdba->atomname, pdba->nr); */
846 srenew(pdba->pdbinfo, pdba->nr);
849 if (pdba->nr != oldnatoms)
851 printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
858 checkResidueTypeSanity(t_atoms * pdba,
861 gmx_residuetype_t * rt)
863 std::string startResidueString = gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
864 std::string endResidueString = gmx::formatString("%s%d", *pdba->resinfo[r1-1].name, pdba->resinfo[r1-1].nr);
866 // Check whether all residues in chain have the same chain ID.
867 bool allResiduesHaveSameChainID = true;
868 char chainID0 = pdba->resinfo[r0].chainid;
870 std::string residueString;
872 for (int i = r0 + 1; i < r1; i++)
874 chainID = pdba->resinfo[i].chainid;
875 if (chainID != chainID0)
877 allResiduesHaveSameChainID = false;
878 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
883 if (!allResiduesHaveSameChainID)
886 "The chain covering the range %s--%s does not have a consistent chain ID. "
887 "The first residue has ID '%c', while residue %s has ID '%c'.",
888 startResidueString.c_str(), endResidueString.c_str(),
889 chainID0, residueString.c_str(), chainID);
892 // At this point all residues have the same ID. If they are also non-blank
893 // we can be a bit more aggressive and require the types match too.
896 bool allResiduesHaveSameType = true;
897 const char *restype0;
899 gmx_residuetype_get_type(rt, *pdba->resinfo[r0].name, &restype0);
901 for (int i = r0 + 1; i < r1; i++)
903 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &restype);
904 if (gmx_strcasecmp(restype, restype0))
906 allResiduesHaveSameType = false;
907 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
912 if (!allResiduesHaveSameType)
915 "The residues in the chain %s--%s do not have a consistent type. "
916 "The first residue has type '%s', while residue %s is of type '%s'. "
917 "Either there is a mistake in your chain, or it includes nonstandard "
918 "residue names that have not yet been added to the residuetypes.dat "
919 "file in the GROMACS library directory. If there are other molecules "
920 "such as ligands, they should not have the same chain ID as the "
921 "adjacent protein chain since it's a separate molecule.",
922 startResidueString.c_str(), endResidueString.c_str(),
923 restype0, residueString.c_str(), restype);
928 static void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
929 gmx_residuetype_t *rt)
932 const char *p_startrestype;
933 const char *p_restype;
938 int startWarnings = 0;
942 // Check that all residues have the same chain identifier, and if it is
943 // non-blank we also require the residue types to match.
944 checkResidueTypeSanity(pdba, r0, r1, rt);
946 // If we return correctly from checkResidueTypeSanity(), the only
947 // remaining cases where we can have non-matching residue types is if
948 // the chain ID was blank, which could be the case e.g. for a structure
949 // read from a GRO file or other file types without chain information.
950 // In that case we need to be a bit more liberal and detect chains based
951 // on the residue type.
953 // If we get here, the chain ID must be identical for all residues
954 char chainID = pdba->resinfo[r0].chainid;
956 /* Find the starting terminus (typially N or 5') */
957 for (i = r0; i < r1 && *r_start == -1; i++)
959 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_startrestype);
960 if (!gmx_strcasecmp(p_startrestype, "Protein") || !gmx_strcasecmp(p_startrestype, "DNA") || !gmx_strcasecmp(p_startrestype, "RNA") )
962 printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
965 else if (!gmx_strcasecmp(p_startrestype, "Ion"))
969 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
973 printf("Disabling further notes about ions.\n");
979 if (startWarnings < 5)
983 printf("\nWarning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n"
984 "This chain lacks identifiers, which makes it impossible to do strict\n"
985 "classification of the start/end residues. Here we need to guess this residue\n"
986 "should not be part of the chain and instead introduce a break, but that will\n"
987 "be catastrophic if they should in fact be linked. Please check your structure,\n"
988 "and add %s to residuetypes.dat if this was not correct.\n\n",
989 *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
993 printf("\nWarning: No residues in chain starting at %s%d identified as Protein/RNA/DNA.\n"
994 "This makes it impossible to link them into a molecule, which could either be\n"
995 "correct or a catastrophic error. Please check your structure, and add all\n"
996 "necessary residue names to residuetypes.dat if this was not correct.\n\n",
997 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1000 if (startWarnings == 4)
1002 printf("Disabling further warnings about unidentified residues at start of chain.\n");
1010 /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1011 for (i = *r_start; i < r1; i++)
1013 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_restype);
1014 if (!gmx_strcasecmp(p_restype, p_startrestype) && endWarnings == 0)
1018 else if (!gmx_strcasecmp(p_startrestype, "Ion"))
1022 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1026 printf("Disabling further notes about ions.\n");
1032 // This can only trigger if the chain ID is blank - otherwise the
1033 // call to checkResidueTypeSanity() will have caught the problem.
1034 if (endWarnings < 5)
1036 printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1037 "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1038 "it impossible to do strict classification of the start/end residues. Here we\n"
1039 "need to guess this residue should not be part of the chain and instead\n"
1040 "introduce a break, but that will be catastrophic if they should in fact be\n"
1041 "linked. Please check your structure, and add %s to residuetypes.dat\n"
1042 "if this was not correct.\n\n",
1043 *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype,
1044 *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype, *pdba->resinfo[i].name);
1046 if (endWarnings == 4)
1048 printf("Disabling further warnings about unidentified residues at end of chain.\n");
1057 printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1061 /* enum for chain separation */
1063 enChainSep_id_or_ter, enChainSep_id_and_ter, enChainSep_ter,
1064 enChainSep_id, enChainSep_interactive
1066 static const char *ChainSepEnum[] = {"id_or_ter", "id_and_ter", "ter", "id", "interactive"};
1067 static const char *ChainSepInfoString[] =
1069 "Splitting chemical chains based on TER records or chain id changing.\n",
1070 "Splitting chemical chains based on TER records and chain id changing.\n",
1071 "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1072 "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1073 "Splitting chemical chains interactively.\n"
1077 modify_chain_numbers(t_atoms * pdba,
1078 ChainSepType enumChainSep)
1081 char old_prev_chainid;
1082 char old_this_chainid;
1083 int old_prev_chainnum;
1084 int old_this_chainnum;
1086 char select[STRLEN];
1090 const char * prev_atomname;
1091 const char * this_atomname;
1092 const char * prev_resname;
1093 const char * this_resname;
1099 /* The default chain enumeration is based on TER records only */
1100 printf("%s", ChainSepInfoString[enumChainSep]);
1102 old_prev_chainid = '?';
1103 old_prev_chainnum = -1;
1106 this_atomname = nullptr;
1108 this_resname = nullptr;
1112 for (i = 0; i < pdba->nres; i++)
1114 ri = &pdba->resinfo[i];
1115 old_this_chainid = ri->chainid;
1116 old_this_chainnum = ri->chainnum;
1118 prev_atomname = this_atomname;
1119 prev_atomnum = this_atomnum;
1120 prev_resname = this_resname;
1121 prev_resnum = this_resnum;
1122 prev_chainid = this_chainid;
1124 this_atomname = *(pdba->atomname[i]);
1125 this_atomnum = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1126 this_resname = *ri->name;
1127 this_resnum = ri->nr;
1128 this_chainid = ri->chainid;
1130 switch (enumChainSep)
1132 case enChainSep_id_or_ter:
1133 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1139 case enChainSep_id_and_ter:
1140 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1147 if (old_this_chainid != old_prev_chainid)
1153 case enChainSep_ter:
1154 if (old_this_chainnum != old_prev_chainnum)
1159 case enChainSep_interactive:
1160 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1164 printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1166 "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1167 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1168 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1170 if (nullptr == fgets(select, STRLEN-1, stdin))
1172 gmx_fatal(FARGS, "Error reading from stdin");
1175 if (i == 0 || select[0] == 'y')
1182 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1184 old_prev_chainid = old_this_chainid;
1185 old_prev_chainnum = old_this_chainnum;
1187 ri->chainnum = new_chainnum;
1215 // TODO make all enums into scoped enums
1216 /* enum for vsites */
1218 enVSites_none, enVSites_hydrogens, enVSites_aromatics
1220 static const char *VSitesEnum[] = {"none", "hydrogens", "aromatics"};
1222 /* enum for water model */
1224 enWater_select, enWater_none, enWater_spc, enWater_spce,
1225 enWater_tip3p, enWater_tip4p, enWater_tip5p, enWater_tips3p
1227 static const char *WaterEnum[] = {
1228 "select", "none", "spc", "spce",
1229 "tip3p", "tip4p", "tip5p", "tips3p"
1232 /* enum for merge */
1234 enMerge_no, enMerge_all, enMerge_interactive
1236 static const char *MergeEnum[] = {"no", "all", "interactive"};
1244 class pdb2gmx : public ICommandLineOptionsModule
1248 bVsites_(FALSE), bPrevWat_(FALSE), bVsiteAromatics_(FALSE),
1249 enumChainSep_(enChainSep_id_or_ter),
1250 enumVSites_(enVSites_none),
1251 enumWater_(enWater_select),
1252 enumMerge_(enMerge_no),
1262 // From ICommandLineOptionsModule
1263 void init(CommandLineModuleSettings * /*settings*/) override
1267 void initOptions(IOptionsContainer *options,
1268 ICommandLineOptionsModuleSettings *settings) override;
1270 void optionsFinished() override;
1288 bool bAllowMissing_;
1292 bool bChargeGroups_;
1302 bool bVsiteAromatics_;
1306 real long_bond_dist_;
1307 real short_bond_dist_;
1309 std::string indexOutputFile_;
1310 std::string outputFile_;
1311 std::string topologyFile_;
1312 std::string includeTopologyFile_;
1313 std::string outputConfFile_;
1314 std::string inputConfFile_;
1315 std::string outFile_;
1318 ChainSepType enumChainSep_;
1319 VSitesType enumVSites_;
1320 WaterType enumWater_;
1321 MergeType enumMerge_;
1324 char forcefield_[STRLEN];
1325 char ffdir_[STRLEN];
1335 void pdb2gmx::initOptions(IOptionsContainer *options,
1336 ICommandLineOptionsModuleSettings *settings)
1338 const char *desc[] = {
1339 "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1340 "some database files, adds hydrogens to the molecules and generates",
1341 "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1342 "and a topology in GROMACS format.",
1343 "These files can subsequently be processed to generate a run input file.",
1345 "[THISMODULE] will search for force fields by looking for",
1346 "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1347 "of the current working directory and of the GROMACS library directory",
1348 "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1350 "By default the forcefield selection is interactive,",
1351 "but you can use the [TT]-ff[tt] option to specify one of the short names",
1352 "in the list on the command line instead. In that case [THISMODULE] just looks",
1353 "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1355 "After choosing a force field, all files will be read only from",
1356 "the corresponding force field directory.",
1357 "If you want to modify or add a residue types, you can copy the force",
1358 "field directory from the GROMACS library directory to your current",
1359 "working directory. If you want to add new protein residue types,",
1360 "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1361 "or copy the whole library directory to a local directory and set",
1362 "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1363 "Check Chapter 5 of the manual for more information about file formats.",
1366 "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1367 "need not necessarily contain a protein structure. Every kind of",
1368 "molecule for which there is support in the database can be converted.",
1369 "If there is no support in the database, you can add it yourself.[PAR]",
1371 "The program has limited intelligence, it reads a number of database",
1372 "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1373 "if necessary this can be done manually. The program can prompt the",
1374 "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1375 "desired. For Lys the choice is between neutral (two protons on NZ) or",
1376 "protonated (three protons, default), for Asp and Glu unprotonated",
1377 "(default) or protonated, for His the proton can be either on ND1,",
1378 "on NE2 or on both. By default these selections are done automatically.",
1379 "For His, this is based on an optimal hydrogen bonding",
1380 "conformation. Hydrogen bonds are defined based on a simple geometric",
1381 "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1382 "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1383 "[TT]-dist[tt] respectively.[PAR]",
1385 "The protonation state of N- and C-termini can be chosen interactively",
1386 "with the [TT]-ter[tt] flag. Default termini are ionized (NH3+ and COO-),",
1387 "respectively. Some force fields support zwitterionic forms for chains of",
1388 "one residue, but for polypeptides these options should NOT be selected.",
1389 "The AMBER force fields have unique forms for the terminal residues,",
1390 "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1391 "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1392 "respectively to use these forms, making sure you preserve the format",
1393 "of the coordinate file. Alternatively, use named terminating residues",
1394 "(e.g. ACE, NME).[PAR]",
1396 "The separation of chains is not entirely trivial since the markup",
1397 "in user-generated PDB files frequently varies and sometimes it",
1398 "is desirable to merge entries across a TER record, for instance",
1399 "if you want a disulfide bridge or distance restraints between",
1400 "two protein chains or if you have a HEME group bound to a protein.",
1401 "In such cases multiple chains should be contained in a single",
1402 "[TT]moleculetype[tt] definition.",
1403 "To handle this, [THISMODULE] uses two separate options.",
1404 "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1405 "start, and termini added when applicable. This can be done based on the",
1406 "existence of TER records, when the chain id changes, or combinations of either",
1407 "or both of these. You can also do the selection fully interactively.",
1408 "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1409 "are merged into one moleculetype, after adding all the chemical termini (or not).",
1410 "This can be turned off (no merging), all non-water chains can be merged into a",
1411 "single molecule, or the selection can be done interactively.[PAR]",
1413 "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1414 "If any of the occupancies are not one, indicating that the atom is",
1415 "not resolved well in the structure, a warning message is issued.",
1416 "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1417 "all occupancy fields may be zero. Either way, it is up to the user",
1418 "to verify the correctness of the input data (read the article!).[PAR]",
1420 "During processing the atoms will be reordered according to GROMACS",
1421 "conventions. With [TT]-n[tt] an index file can be generated that",
1422 "contains one group reordered in the same way. This allows you to",
1423 "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1424 "one limitation: reordering is done after the hydrogens are stripped",
1425 "from the input and before new hydrogens are added. This means that",
1426 "you should not use [TT]-ignh[tt].[PAR]",
1428 "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1429 "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1430 "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1433 "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1434 "motions. Angular and out-of-plane motions can be removed by changing",
1435 "hydrogens into virtual sites and fixing angles, which fixes their",
1436 "position relative to neighboring atoms. Additionally, all atoms in the",
1437 "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1438 "can be converted into virtual sites, eliminating the fast improper dihedral",
1439 "fluctuations in these rings (but this feature is deprecated).",
1440 "[BB]Note[bb] that in this case all other hydrogen",
1441 "atoms are also converted to virtual sites. The mass of all atoms that are",
1442 "converted into virtual sites, is added to the heavy atoms.[PAR]",
1443 "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1444 "done by increasing the hydrogen-mass by a factor of 4. This is also",
1445 "done for water hydrogens to slow down the rotational motion of water.",
1446 "The increase in mass of the hydrogens is subtracted from the bonded",
1447 "(heavy) atom so that the total mass of the system remains the same."
1450 settings->setHelpText(desc);
1452 options->addOption(BooleanOption("newrtp")
1453 .store(&bNewRTP_).defaultValue(false).hidden()
1454 .description("Write the residue database in new format to [TT]new.rtp[tt]"));
1455 options->addOption(RealOption("lb")
1456 .store(&long_bond_dist_).defaultValue(0.25).hidden()
1457 .description("Long bond warning distance"));
1458 options->addOption(RealOption("sb")
1459 .store(&short_bond_dist_).defaultValue(0.05).hidden()
1460 .description("Short bond warning distance"));
1461 options->addOption(EnumOption<ChainSepType>("chainsep").enumValue(ChainSepEnum)
1462 .store(&enumChainSep_)
1463 .description("Condition in PDB files when a new chain should be started (adding termini)"));
1464 options->addOption(EnumOption<MergeType>("merge").enumValue(MergeEnum)
1466 .description("Merge multiple chains into a single [moleculetype]"));
1467 options->addOption(StringOption("ff")
1468 .store(&ff_).defaultValue("select")
1469 .description("Force field, interactive by default. Use [TT]-h[tt] for information."));
1470 options->addOption(EnumOption<WaterType>("water")
1471 .store(&enumWater_).enumValue(WaterEnum)
1472 .description("Water model to use"));
1473 options->addOption(BooleanOption("inter")
1474 .store(&bInter_).defaultValue(false)
1475 .description("Set the next 8 options to interactive"));
1476 options->addOption(BooleanOption("ss")
1477 .store(&bCysMan_).defaultValue(false)
1478 .description("Interactive SS bridge selection"));
1479 options->addOption(BooleanOption("ter")
1480 .store(&bTerMan_).defaultValue(false)
1481 .description("Interactive termini selection, instead of charged (default)"));
1482 options->addOption(BooleanOption("lys")
1483 .store(&bLysMan_).defaultValue(false)
1484 .description("Interactive lysine selection, instead of charged"));
1485 options->addOption(BooleanOption("arg")
1486 .store(&bArgMan_).defaultValue(false)
1487 .description("Interactive arginine selection, instead of charged"));
1488 options->addOption(BooleanOption("asp")
1489 .store(&bAspMan_).defaultValue(false)
1490 .description("Interactive aspartic acid selection, instead of charged"));
1491 options->addOption(BooleanOption("glu")
1492 .store(&bGluMan_).defaultValue(false)
1493 .description("Interactive glutamic acid selection, instead of charged"));
1494 options->addOption(BooleanOption("gln")
1495 .store(&bGlnMan_).defaultValue(false)
1496 .description("Interactive glutamine selection, instead of charged"));
1497 options->addOption(BooleanOption("his")
1498 .store(&bHisMan_).defaultValue(false)
1499 .description("Interactive histidine selection, instead of checking H-bonds"));
1500 options->addOption(RealOption("angle")
1501 .store(&angle_).defaultValue(135.0)
1502 .description("Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1503 options->addOption(RealOption("dist")
1504 .store(&distance_).defaultValue(0.3)
1505 .description("Maximum donor-acceptor distance for a H-bond (nm)"));
1506 options->addOption(BooleanOption("una")
1507 .store(&bUnA_).defaultValue(false)
1508 .description("Select aromatic rings with united CH atoms on phenylalanine, tryptophane and tyrosine"));
1509 options->addOption(BooleanOption("sort")
1510 .store(&bSort_).defaultValue(true).hidden()
1511 .description("Sort the residues according to database, turning this off is dangerous as charge groups might be broken in parts"));
1512 options->addOption(BooleanOption("ignh")
1513 .store(&bRemoveH_).defaultValue(false)
1514 .description("Ignore hydrogen atoms that are in the coordinate file"));
1515 options->addOption(BooleanOption("missing")
1516 .store(&bAllowMissing_).defaultValue(false)
1517 .description("Continue when atoms are missing and bonds cannot be made, dangerous"));
1518 options->addOption(BooleanOption("v")
1519 .store(&bVerbose_).defaultValue(false)
1520 .description("Be slightly more verbose in messages"));
1521 options->addOption(RealOption("posrefc")
1522 .store(&posre_fc_).defaultValue(1000)
1523 .description("Force constant for position restraints"));
1524 options->addOption(EnumOption<VSitesType>("vsite")
1525 .store(&enumVSites_).enumValue(VSitesEnum)
1526 .description("Convert atoms to virtual sites"));
1527 options->addOption(BooleanOption("heavyh")
1528 .store(&bHeavyH_).defaultValue(false)
1529 .description("Make hydrogen atoms heavy"));
1530 options->addOption(BooleanOption("deuterate")
1531 .store(&bDeuterate_).defaultValue(false)
1532 .description("Change the mass of hydrogens to 2 amu"));
1533 options->addOption(BooleanOption("chargegrp")
1534 .store(&bChargeGroups_).defaultValue(true)
1535 .description("Use charge groups in the [REF].rtp[ref] file"));
1536 options->addOption(BooleanOption("cmap")
1537 .store(&bCmap_).defaultValue(true)
1538 .description("Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1539 options->addOption(BooleanOption("renum")
1540 .store(&bRenumRes_).defaultValue(false)
1541 .description("Renumber the residues consecutively in the output"));
1542 options->addOption(BooleanOption("rtpres")
1543 .store(&bRTPresname_).defaultValue(false)
1544 .description("Use [REF].rtp[ref] entry names as residue names"));
1545 options->addOption(FileNameOption("f")
1546 .legacyType(efSTX).inputFile()
1547 .store(&inputConfFile_).required()
1548 .defaultBasename("protein").defaultType(efPDB)
1549 .description("Structure file"));
1550 options->addOption(FileNameOption("o")
1551 .legacyType(efSTO).outputFile()
1552 .store(&outputConfFile_).required()
1553 .defaultBasename("conf")
1554 .description("Structure file"));
1555 options->addOption(FileNameOption("p")
1556 .legacyType(efTOP).outputFile()
1557 .store(&topologyFile_).required()
1558 .defaultBasename("topol")
1559 .description("Topology file"));
1560 options->addOption(FileNameOption("i")
1561 .legacyType(efITP).outputFile()
1562 .store(&includeTopologyFile_).required()
1563 .defaultBasename("posre")
1564 .description("Include file for topology"));
1565 options->addOption(FileNameOption("n")
1566 .legacyType(efNDX).outputFile()
1567 .store(&indexOutputFile_).storeIsSet(&bIndexSet_)
1568 .defaultBasename("index")
1569 .description("Index file"));
1570 options->addOption(FileNameOption("q")
1571 .legacyType(efSTO).outputFile()
1572 .store(&outFile_).storeIsSet(&bOutputSet_)
1573 .defaultBasename("clean").defaultType(efPDB)
1574 .description("Structure file"));
1577 void pdb2gmx::optionsFinished()
1579 if (inputConfFile_.empty())
1581 GMX_THROW(InconsistentInputError("You must supply an input file"));
1585 /* if anything changes here, also change description of -inter */
1600 else if (bDeuterate_)
1609 /* Force field selection, interactive or direct */
1610 choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(),
1611 forcefield_, sizeof(forcefield_),
1612 ffdir_, sizeof(ffdir_));
1614 if (strlen(forcefield_) > 0)
1616 ffname_ = forcefield_;
1617 ffname_[0] = std::toupper(ffname_[0]);
1621 gmx_fatal(FARGS, "Empty forcefield string");
1627 char select[STRLEN];
1630 t_hackblock *hb_chain;
1631 t_restp *restp_chain;
1635 const char *prev_atomname;
1636 const char *this_atomname;
1637 const char *prev_resname;
1638 const char *this_resname;
1643 int prev_chainnumber;
1644 int this_chainnumber;
1645 int this_chainstart;
1646 int prev_chainstart;
1648 printf("\nUsing the %s force field in directory %s\n\n",
1651 choose_watermodel(WaterEnum[enumWater_], ffdir_, &watermodel_);
1653 switch (enumVSites_)
1657 bVsiteAromatics_ = false;
1659 case enVSites_hydrogens:
1661 bVsiteAromatics_ = false;
1663 case enVSites_aromatics:
1665 bVsiteAromatics_ = true;
1668 gmx_fatal(FARGS, "Internal inconsistency: VSitesEnum='%s'", VSitesEnum[enumVSites_]);
1671 /* Open the symbol table */
1673 open_symtab(&symtab);
1675 /* Residue type database */
1676 gmx_residuetype_t *rt;
1677 gmx_residuetype_init(&rt);
1679 /* Read residue renaming database(s), if present */
1680 std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1683 rtprename_t *rtprename = nullptr;
1684 for (const auto &filename : rrn)
1686 printf("going to rename %s\n", filename.c_str());
1687 FILE *fp = fflib_open(filename);
1688 read_rtprename(filename.c_str(), fp, &nrtprename, &rtprename);
1692 /* Add all alternative names from the residue renaming database to the list
1693 of recognized amino/nucleic acids. */
1694 const char *p_restype;
1695 for (int i = 0; i < nrtprename; i++)
1697 int rc = gmx_residuetype_get_type(rt, rtprename[i].gmx, &p_restype);
1699 /* Only add names if the 'standard' gromacs/iupac base name was found */
1702 gmx_residuetype_add(rt, rtprename[i].main, p_restype);
1703 gmx_residuetype_add(rt, rtprename[i].nter, p_restype);
1704 gmx_residuetype_add(rt, rtprename[i].cter, p_restype);
1705 gmx_residuetype_add(rt, rtprename[i].bter, p_restype);
1712 if (watermodel_ != nullptr && (strstr(watermodel_, "4p") ||
1713 strstr(watermodel_, "4P")))
1717 else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") ||
1718 strstr(watermodel_, "5P")))
1732 int natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(),
1733 &title, &pdba_all, &pdbx, &ePBC, box, bRemoveH_,
1734 &symtab, rt, watres, &aps, bVerbose_);
1738 std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1739 GMX_THROW(InconsistentInputError(message));
1742 printf("Analyzing pdb file\n");
1743 int nwaterchain = 0;
1745 modify_chain_numbers(&pdba_all, enumChainSep_);
1747 int nchainmerges = 0;
1749 this_atomname = nullptr;
1751 this_resname = nullptr;
1754 this_chainnumber = -1;
1755 this_chainstart = 0;
1756 /* Keep the compiler happy */
1757 prev_chainstart = 0;
1762 snew(pdb_ch, maxch);
1765 bool bMerged = false;
1766 for (int i = 0; (i < natom); i++)
1768 ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1770 /* TODO this should live in a helper object, and consolidate
1771 that with code in modify_chain_numbers */
1772 prev_atomname = this_atomname;
1773 prev_atomnum = this_atomnum;
1774 prev_resname = this_resname;
1775 prev_resnum = this_resnum;
1776 prev_chainid = this_chainid;
1777 prev_chainnumber = this_chainnumber;
1780 prev_chainstart = this_chainstart;
1783 this_atomname = *pdba_all.atomname[i];
1784 this_atomnum = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1785 this_resname = *ri->name;
1786 this_resnum = ri->nr;
1787 this_chainid = ri->chainid;
1788 this_chainnumber = ri->chainnum;
1790 bWat_ = gmx_strcasecmp(*ri->name, watres) == 0;
1792 if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1794 this_chainstart = pdba_all.atom[i].resind;
1796 if (i > 0 && !bWat_)
1798 if (!strncmp(MergeEnum[enumMerge_], "int", 3))
1800 printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1801 "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1802 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1803 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1805 if (nullptr == fgets(select, STRLEN-1, stdin))
1807 gmx_fatal(FARGS, "Error reading from stdin");
1809 bMerged = (select[0] == 'y');
1811 else if (!strncmp(MergeEnum[enumMerge_], "all", 3))
1819 pdb_ch[numChains-1].chainstart[pdb_ch[numChains-1].nterpairs] =
1820 pdba_all.atom[i].resind - prev_chainstart;
1821 pdb_ch[numChains-1].nterpairs++;
1822 srenew(pdb_ch[numChains-1].chainstart, pdb_ch[numChains-1].nterpairs+1);
1827 /* set natom for previous chain */
1830 pdb_ch[numChains-1].natom = i-pdb_ch[numChains-1].start;
1837 /* check if chain identifier was used before */
1838 for (int j = 0; (j < numChains); j++)
1840 if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1842 printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1843 "They will be treated as separate chains unless you reorder your file.\n",
1847 // TODO This is too convoluted. Use a std::vector
1848 if (numChains == maxch)
1851 srenew(pdb_ch, maxch);
1853 pdb_ch[numChains].chainid = ri->chainid;
1854 pdb_ch[numChains].chainnum = ri->chainnum;
1855 pdb_ch[numChains].start = i;
1856 pdb_ch[numChains].bAllWat = bWat_;
1859 pdb_ch[numChains].nterpairs = 0;
1863 pdb_ch[numChains].nterpairs = 1;
1865 snew(pdb_ch[numChains].chainstart, pdb_ch[numChains].nterpairs+1);
1866 /* modified [numChains] to [0] below */
1867 pdb_ch[numChains].chainstart[0] = 0;
1873 pdb_ch[numChains-1].natom = natom-pdb_ch[numChains-1].start;
1875 /* set all the water blocks at the end of the chain */
1877 snew(swap_index, numChains);
1879 for (int i = 0; i < numChains; i++)
1881 if (!pdb_ch[i].bAllWat)
1887 for (int i = 0; i < numChains; i++)
1889 if (pdb_ch[i].bAllWat)
1895 if (nwaterchain > 1)
1897 printf("Moved all the water blocks to the end\n");
1902 snew(chains, numChains);
1903 /* copy pdb data and x for all chains */
1904 for (int i = 0; (i < numChains); i++)
1906 int si = swap_index[i];
1907 chains[i].chainid = pdb_ch[si].chainid;
1908 chains[i].chainnum = pdb_ch[si].chainnum;
1909 chains[i].bAllWat = pdb_ch[si].bAllWat;
1910 chains[i].nterpairs = pdb_ch[si].nterpairs;
1911 chains[i].chainstart = pdb_ch[si].chainstart;
1912 snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1913 snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1914 snew(chains[i].r_start, pdb_ch[si].nterpairs);
1915 snew(chains[i].r_end, pdb_ch[si].nterpairs);
1917 snew(chains[i].pdba, 1);
1918 init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
1919 snew(chains[i].x, chains[i].pdba->nr);
1920 for (j = 0; j < chains[i].pdba->nr; j++)
1922 chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1923 snew(chains[i].pdba->atomname[j], 1);
1924 *chains[i].pdba->atomname[j] =
1925 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1926 chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1927 copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1929 /* Re-index the residues assuming that the indices are continuous */
1930 int k = chains[i].pdba->atom[0].resind;
1931 int nres = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1932 chains[i].pdba->nres = nres;
1933 for (int j = 0; j < chains[i].pdba->nr; j++)
1935 chains[i].pdba->atom[j].resind -= k;
1937 srenew(chains[i].pdba->resinfo, nres);
1938 for (int j = 0; j < nres; j++)
1940 chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1941 snew(chains[i].pdba->resinfo[j].name, 1);
1942 *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1943 /* make all chain identifiers equal to that of the chain */
1944 chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1948 if (nchainmerges > 0)
1950 printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1954 printf("There are %d chains and %d blocks of water and "
1955 "%d residues with %d atoms\n",
1956 numChains-nwaterchain, nwaterchain,
1957 pdba_all.nres, natom);
1959 printf("\n %5s %4s %6s\n", "chain", "#res", "#atoms");
1960 for (int i = 0; (i < numChains); i++)
1962 printf(" %d '%c' %5d %6d %s\n",
1963 i+1, chains[i].chainid ? chains[i].chainid : '-',
1964 chains[i].pdba->nres, chains[i].pdba->nr,
1965 chains[i].bAllWat ? "(only water)" : "");
1969 check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_);
1971 /* Read atomtypes... */
1972 gpp_atomtype *atype = read_atype(ffdir_, &symtab);
1974 /* read residue database */
1975 printf("Reading residue database... (%s)\n", forcefield_);
1976 std::vector<std::string> rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
1978 t_restp *restp = nullptr;
1979 for (const auto &filename : rtpf)
1981 read_resall(filename.c_str(), &nrtp, &restp, atype, &symtab, false);
1985 /* Not correct with multiple rtp input files with different bonded types */
1986 FILE *fp = gmx_fio_fopen("new.rtp", "w");
1987 print_resall(fp, nrtp, restp, atype);
1991 /* read hydrogen database */
1993 int nah = read_h_db(ffdir_, &ah);
1995 /* Read Termini database... */
1999 t_hackblock **tdblist;
2000 int nNtdb = read_ter_db(ffdir_, 'n', &ntdb, atype);
2001 int nCtdb = read_ter_db(ffdir_, 'c', &ctdb, atype);
2003 FILE *top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2005 print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2009 for (int chain = 0; (chain < numChains); chain++)
2011 cc = &(chains[chain]);
2013 /* set pdba, natom and nres to the current chain */
2016 natom = cc->pdba->nr;
2017 int nres = cc->pdba->nres;
2019 if (cc->chainid && ( cc->chainid != ' ' ) )
2021 printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
2022 chain+1, cc->chainid, natom, nres);
2026 printf("Processing chain %d (%d atoms, %d residues)\n",
2027 chain+1, natom, nres);
2030 process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_,
2031 bHisMan_, bArgMan_, bGlnMan_, angle_, distance_, &symtab,
2032 nrtprename, rtprename);
2034 cc->chainstart[cc->nterpairs] = pdba->nres;
2036 for (int i = 0; i < cc->nterpairs; i++)
2038 find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
2039 &(cc->r_start[j]), &(cc->r_end[j]), rt);
2041 if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2047 if (cc->nterpairs == 0)
2049 printf("Problem with chain definition, or missing terminal residues.\n"
2050 "This chain does not appear to contain a recognized chain molecule.\n"
2051 "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
2054 /* Check for disulfides and other special bonds */
2055 nssbonds = mk_specbonds(pdba, x, bCysMan_, &ssbonds, bVerbose_);
2059 rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
2060 &symtab, bVerbose_);
2063 for (int i = 0; i < cc->nterpairs; i++)
2067 * We first apply a filter so we only have the
2068 * termini that can be applied to the residue in question
2069 * (or a generic terminus if no-residue specific is available).
2071 /* First the N terminus */
2074 tdblist = filter_ter(nNtdb, ntdb,
2075 *pdba->resinfo[cc->r_start[i]].name,
2079 printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
2080 "is already in a terminus-specific form and skipping terminus selection.\n");
2081 cc->ntdb[i] = nullptr;
2085 if (bTerMan_ && ntdblist > 1)
2087 sprintf(select, "Select start terminus type for %s-%d",
2088 *pdba->resinfo[cc->r_start[i]].name,
2089 pdba->resinfo[cc->r_start[i]].nr);
2090 cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
2094 cc->ntdb[i] = tdblist[0];
2097 printf("Start terminus %s-%d: %s\n",
2098 *pdba->resinfo[cc->r_start[i]].name,
2099 pdba->resinfo[cc->r_start[i]].nr,
2100 (cc->ntdb[i])->name);
2106 cc->ntdb[i] = nullptr;
2109 /* And the C terminus */
2112 tdblist = filter_ter(nCtdb, ctdb,
2113 *pdba->resinfo[cc->r_end[i]].name,
2117 printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
2118 "is already in a terminus-specific form and skipping terminus selection.\n");
2119 cc->ctdb[i] = nullptr;
2123 if (bTerMan_ && ntdblist > 1)
2125 sprintf(select, "Select end terminus type for %s-%d",
2126 *pdba->resinfo[cc->r_end[i]].name,
2127 pdba->resinfo[cc->r_end[i]].nr);
2128 cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
2132 cc->ctdb[i] = tdblist[0];
2134 printf("End terminus %s-%d: %s\n",
2135 *pdba->resinfo[cc->r_end[i]].name,
2136 pdba->resinfo[cc->r_end[i]].nr,
2137 (cc->ctdb[i])->name);
2143 cc->ctdb[i] = nullptr;
2147 /* lookup hackblocks and rtp for all residues */
2148 get_hackblocks_rtp(&hb_chain, &restp_chain,
2149 nrtp, restp, pdba->nres, pdba->resinfo,
2150 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end,
2152 /* ideally, now we would not need the rtp itself anymore, but do
2153 everything using the hb and restp arrays. Unfortunately, that
2154 requires some re-thinking of code in gen_vsite.c, which I won't
2155 do now :( AF 26-7-99 */
2157 rename_atoms(nullptr, ffdir_,
2158 pdba, &symtab, restp_chain, false, rt, false, bVerbose_);
2160 match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose_);
2166 block = new_blocka();
2168 sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
2169 remove_duplicate_atoms(pdba, x, bVerbose_);
2174 fprintf(stderr, "WARNING: with the -remh option the generated "
2175 "index file (%s) might be useless\n"
2176 "(the index file is generated before hydrogens are added)",
2177 indexOutputFile_.c_str());
2179 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2181 for (int i = 0; i < block->nr; i++)
2190 fprintf(stderr, "WARNING: "
2191 "without sorting no check for duplicate atoms can be done\n");
2194 /* Generate Hydrogen atoms (and termini) in the sequence */
2195 printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2196 add_h(&pdba, &x, nah, ah,
2197 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_,
2198 nullptr, nullptr, true, false);
2199 printf("Now there are %d residues with %d atoms\n",
2200 pdba->nres, pdba->nr);
2202 /* make up molecule name(s) */
2204 int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2206 gmx_residuetype_get_type(rt, *pdba->resinfo[k].name, &p_restype);
2208 std::string molname;
2216 this_chainid = cc->chainid;
2218 /* Add the chain id if we have one */
2219 if (this_chainid != ' ')
2221 suffix.append(formatString("_chain_%c", this_chainid));
2224 /* Check if there have been previous chains with the same id */
2226 for (int k = 0; k < chain; k++)
2228 if (cc->chainid == chains[k].chainid)
2233 /* Add the number for this chain identifier if there are multiple copies */
2236 suffix.append(formatString("%d", nid_used+1));
2239 if (suffix.length() > 0)
2241 molname.append(p_restype);
2242 molname.append(suffix);
2246 molname = p_restype;
2249 std::string itp_fn = topologyFile_;;
2250 std::string posre_fn = includeTopologyFile_;
2251 if ((numChains-nwaterchain > 1) && !cc->bAllWat)
2254 printf("Chain time...\n");
2255 //construct the itp file name
2256 itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2258 itp_fn.append(molname);
2259 itp_fn.append(".itp");
2260 //now do the same for posre
2261 posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2262 posre_fn.append("_");
2263 posre_fn.append(molname);
2264 posre_fn.append(".itp");
2265 if (posre_fn == itp_fn)
2267 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2270 srenew(incls_, nincl_);
2271 incls_[nincl_-1] = gmx_strdup(itp_fn.c_str());
2272 itp_file_ = gmx_fio_fopen(itp_fn.c_str(), "w");
2279 srenew(mols_, nmol_+1);
2282 mols_[nmol_].name = gmx_strdup("SOL");
2283 mols_[nmol_].nr = pdba->nres;
2287 mols_[nmol_].name = gmx_strdup(molname.c_str());
2288 mols_[nmol_].nr = 1;
2294 print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2300 top_file2 = nullptr;
2304 top_file2 = itp_file_;
2308 top_file2 = top_file;
2311 pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, atype, &symtab,
2313 restp_chain, hb_chain,
2315 bVsites_, bVsiteAromatics_, ffdir_,
2316 mHmult_, nssbonds, ssbonds,
2317 long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2318 bRenumRes_, bRTPresname_);
2322 write_posres(posre_fn.c_str(), pdba, posre_fc_);
2327 gmx_fio_fclose(itp_file_);
2330 /* pdba and natom have been reassigned somewhere so: */
2336 if (watermodel_ == nullptr)
2338 for (int chain = 0; chain < numChains; chain++)
2340 if (chains[chain].bAllWat)
2342 auto message = formatString("You have chosen not to include a water model, "
2343 "but there is water in the input file. Select a "
2344 "water model or remove the water from your input file.");
2345 GMX_THROW(InconsistentInputError(message));
2351 std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2352 if (!fflib_fexist(waterFile))
2354 auto message = formatString("The topology file '%s' for the selected water "
2355 "model '%s' can not be found in the force field "
2356 "directory. Select a different water model.",
2357 waterFile.c_str(), watermodel_);
2358 GMX_THROW(InconsistentInputError(message));
2362 print_top_mols(top_file, title, ffdir_, watermodel_, nincl_, incls_, nmol_, mols_);
2363 gmx_fio_fclose(top_file);
2365 gmx_residuetype_destroy(rt);
2367 /* now merge all chains back together */
2370 for (int i = 0; (i < numChains); i++)
2372 natom += chains[i].pdba->nr;
2373 nres += chains[i].pdba->nres;
2377 init_t_atoms(atoms, natom, false);
2378 for (int i = 0; i < atoms->nres; i++)
2380 sfree(atoms->resinfo[i].name);
2382 sfree(atoms->resinfo);
2384 snew(atoms->resinfo, nres);
2388 for (int i = 0; (i < numChains); i++)
2392 printf("Including chain %d in system: %d atoms %d residues\n",
2393 i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2395 for (int j = 0; (j < chains[i].pdba->nr); j++)
2397 atoms->atom[k] = chains[i].pdba->atom[j];
2398 atoms->atom[k].resind += l; /* l is processed nr of residues */
2399 atoms->atomname[k] = chains[i].pdba->atomname[j];
2400 atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2401 copy_rvec(chains[i].x[j], x[k]);
2404 for (int j = 0; (j < chains[i].pdba->nres); j++)
2406 atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2409 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2417 fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2418 print_sums(atoms, true);
2422 fprintf(stderr, "\nWriting coordinate file...\n");
2423 clear_rvec(box_space);
2426 make_new_box(atoms->nr, x, box, box_space, false);
2428 write_sto_conf(outputConfFile_.c_str(), title, atoms, x, nullptr, ePBC, box);
2430 printf("\t\t--------- PLEASE NOTE ------------\n");
2431 printf("You have successfully generated a topology from: %s.\n",
2432 inputConfFile_.c_str());
2433 if (watermodel_ != nullptr)
2435 printf("The %s force field and the %s water model are used.\n",
2436 ffname_, watermodel_);
2440 printf("The %s force field is used.\n",
2443 printf("\t\t--------- ETON ESAELP ------------\n");
2450 const char pdb2gmxInfo::name[] = "pdb2gmx";
2451 const char pdb2gmxInfo::shortDescription[] =
2452 "Convert coordinate files to topology and FF-compliant coordinate files";
2453 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2455 return compat::make_unique<pdb2gmx>();