2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7 * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
8 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9 * and including many others, as listed in the AUTHORS file in the
10 * top-level source directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
52 #include "gromacs/commandline/cmdlineoptionsmodule.h"
53 #include "gromacs/fileio/confio.h"
54 #include "gromacs/fileio/filetypes.h"
55 #include "gromacs/fileio/gmxfio.h"
56 #include "gromacs/fileio/pdbio.h"
57 #include "gromacs/gmxlib/conformation_utilities.h"
58 #include "gromacs/gmxpreprocess/fflibutil.h"
59 #include "gromacs/gmxpreprocess/genhydro.h"
60 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
61 #include "gromacs/gmxpreprocess/grompp_impl.h"
62 #include "gromacs/gmxpreprocess/h_db.h"
63 #include "gromacs/gmxpreprocess/hizzie.h"
64 #include "gromacs/gmxpreprocess/pdb2top.h"
65 #include "gromacs/gmxpreprocess/pgutil.h"
66 #include "gromacs/gmxpreprocess/specbond.h"
67 #include "gromacs/gmxpreprocess/ter_db.h"
68 #include "gromacs/gmxpreprocess/toputil.h"
69 #include "gromacs/gmxpreprocess/xlate.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/options/basicoptions.h"
72 #include "gromacs/options/filenameoption.h"
73 #include "gromacs/options/ioptionscontainer.h"
74 #include "gromacs/topology/atomprop.h"
75 #include "gromacs/topology/block.h"
76 #include "gromacs/topology/index.h"
77 #include "gromacs/topology/residuetypes.h"
78 #include "gromacs/topology/symtab.h"
79 #include "gromacs/topology/topology.h"
80 #include "gromacs/utility/dir_separator.h"
81 #include "gromacs/utility/exceptions.h"
82 #include "gromacs/utility/fatalerror.h"
83 #include "gromacs/utility/path.h"
84 #include "gromacs/utility/smalloc.h"
85 #include "gromacs/utility/strdb.h"
86 #include "gromacs/utility/stringutil.h"
88 #include "hackblock.h"
93 RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
108 static const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
110 /* NOTE: This function returns the main building block name,
111 * it does not take terminal renaming into account.
113 auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
114 return gmx::equalCaseInsensitive(name, rename.gmx);
116 return found != rr.end() ? found->main.c_str() : name.c_str();
119 static const char* select_res(int nr,
124 gmx::ArrayRef<const RtpRename> rr)
126 printf("Which %s type do you want for residue %d\n", title, resnr + 1);
127 for (int sel = 0; (sel < nr); sel++)
129 printf("%d. %s (%s)\n", sel, expl[sel], res2bb_notermini(name[sel], rr));
131 printf("\nType a number:");
135 if (scanf("%d", &userSelection) != 1)
137 gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
140 return name[userSelection];
143 static const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
151 const char* lh[easpNR] = { "ASP", "ASPH" };
152 const char* expl[easpNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
154 return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", rr);
157 static const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
165 const char* lh[egluNR] = { "GLU", "GLUH" };
166 const char* expl[egluNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
168 return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", rr);
171 static const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
179 const char* lh[eglnNR] = { "GLN", "QLN" };
180 const char* expl[eglnNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
182 return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", rr);
185 static const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
193 const char* lh[elysNR] = { "LYSN", "LYS" };
194 const char* expl[elysNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
196 return select_res(elysNR, resnr, lh, expl, "LYSINE", rr);
199 static const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
207 const char* lh[eargNR] = { "ARGN", "ARG" };
208 const char* expl[eargNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
210 return select_res(eargNR, resnr, lh, expl, "ARGININE", rr);
213 static const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
215 const char* expl[ehisNR] = { "H on ND1 only", "H on NE2 only", "H on ND1 and NE2",
218 return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", rr);
221 static void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
223 char line[STRLEN], buf[STRLEN];
226 while (get_a_line(fp, line, STRLEN))
228 /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
229 * For other args, we read up to 6 chars (so we can detect if the length is > 5).
230 * Note that the buffer length has been increased to 7 to allow this,
231 * so we just need to make sure the strings have zero-length initially.
243 int nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
244 RtpRename newEntry(gmx, main, nter, cter, bter);
247 if (nc != 2 && nc != 5)
249 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d",
257 "A line in residue renaming database '%s' has %d columns, while previous "
258 "lines have %d columns",
264 /* This file does not have special termini names, copy them from main */
265 newEntry.nter = newEntry.main;
266 newEntry.cter = newEntry.main;
267 newEntry.bter = newEntry.main;
269 rtprename->push_back(newEntry);
273 static std::string search_resrename(gmx::ArrayRef<const RtpRename> rr,
277 bool bCompareFFRTPname)
279 auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
280 return ((!bCompareFFRTPname && (name == rename.gmx))
281 || (bCompareFFRTPname && (name == rename.main)));
285 /* If found in the database, rename this residue's rtp building block,
286 * otherwise keep the old name.
288 if (found != rr.end())
292 newName = found->bter;
296 newName = found->nter;
300 newName = found->cter;
304 newName = found->main;
307 if (newName[0] == '-')
309 gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name,
310 bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
311 : (bEnd ? " as an ending terminus" : ""));
318 static void rename_resrtp(t_atoms* pdba,
320 gmx::ArrayRef<const int> r_start,
321 gmx::ArrayRef<const int> r_end,
322 gmx::ArrayRef<const RtpRename> rr,
326 bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
328 for (int r = 0; r < pdba->nres; r++)
332 for (int j = 0; j < nterpairs; j++)
339 for (int j = 0; j < nterpairs; j++)
347 std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
349 if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
351 /* This is a terminal residue, but the residue name,
352 * currently stored in .rtp, is not a standard residue name,
353 * but probably a force field specific rtp name.
354 * Check if we need to rename it because it is terminal.
356 newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
359 if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
363 printf("Changing rtp entry of residue %d %s to '%s'\n", pdba->resinfo[r].nr,
364 *pdba->resinfo[r].name, newName.c_str());
366 pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
371 static void pdbres_to_gmxrtp(t_atoms* pdba)
375 for (i = 0; (i < pdba->nres); i++)
377 if (pdba->resinfo[i].rtp == nullptr)
379 pdba->resinfo[i].rtp = pdba->resinfo[i].name;
384 static void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
389 for (i = 0; (i < pdba->nres); i++)
391 resnm = *pdba->resinfo[i].name;
392 if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
393 || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
395 /* Rename the residue name (not the rtp name) */
396 pdba->resinfo[i].name = put_symtab(symtab, newnm);
401 static void rename_bb(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
406 for (i = 0; (i < pdba->nres); i++)
408 /* We have not set the rtp name yes, use the residue name */
409 bbnm = *pdba->resinfo[i].name;
410 if ((bFullCompare && (gmx::equalCaseInsensitive(bbnm, oldnm)))
411 || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
413 /* Change the rtp builing block name */
414 pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
419 static void rename_bbint(t_atoms* pdba,
421 const char* gettp(int, gmx::ArrayRef<const RtpRename>),
424 gmx::ArrayRef<const RtpRename> rr)
430 for (i = 0; i < pdba->nres; i++)
432 /* We have not set the rtp name yet, use the residue name */
433 bbnm = *pdba->resinfo[i].name;
434 if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
437 pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
442 static void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose)
448 ftp = fn2ftp(filename);
449 if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
451 fprintf(stderr, "No occupancies in %s\n", filename);
455 for (i = 0; (i < atoms->nr); i++)
457 if (atoms->pdbinfo[i].occup != 1)
461 fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
462 *atoms->resinfo[atoms->atom[i].resind].name,
463 atoms->resinfo[atoms->atom[i].resind].nr, *atoms->atomname[i],
464 atoms->pdbinfo[i].occup);
466 if (atoms->pdbinfo[i].occup == 0)
476 if (nzero == atoms->nr)
478 fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
480 else if ((nzero > 0) || (nnotone > 0))
484 "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
485 " occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
487 nzero, nnotone, atoms->nr);
491 fprintf(stderr, "All occupancies are one\n");
496 static void write_posres(const char* fn, t_atoms* pdba, real fc)
501 fp = gmx_fio_fopen(fn, "w");
503 "; In this topology include file, you will find position restraint\n"
504 "; entries for all the heavy atoms in your original pdb file.\n"
505 "; This means that all the protons which were added by pdb2gmx are\n"
506 "; not restrained.\n"
508 "[ position_restraints ]\n"
509 "; %4s%6s%8s%8s%8s\n",
510 "atom", "type", "fx", "fy", "fz");
511 for (i = 0; (i < pdba->nr); i++)
513 if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
515 fprintf(fp, "%6d%6d %g %g %g\n", i + 1, 1, fc, fc, fc);
521 static int read_pdball(const char* inf,
535 /* Read a pdb file. (containing proteins) */
537 int natom, new_natom, i;
540 printf("Reading %s...\n", inf);
541 readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
543 if (atoms->pdbinfo == nullptr)
545 snew(atoms->pdbinfo, atoms->nr);
547 if (fn2ftp(inf) == efPDB)
549 get_pdb_atomnumber(atoms, aps);
554 for (i = 0; i < atoms->nr; i++)
556 if (!is_hydrogen(*atoms->atomname[i]))
558 atoms->atom[new_natom] = atoms->atom[i];
559 atoms->atomname[new_natom] = atoms->atomname[i];
560 atoms->pdbinfo[new_natom] = atoms->pdbinfo[i];
561 copy_rvec((*x)[i], (*x)[new_natom]);
565 atoms->nr = new_natom;
572 printf(" '%s',", *title);
574 printf(" %d atoms\n", natom);
576 /* Rename residues */
577 rename_pdbres(atoms, "HOH", watres, false, symtab);
578 rename_pdbres(atoms, "SOL", watres, false, symtab);
579 rename_pdbres(atoms, "WAT", watres, false, symtab);
581 rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, rt, true, bVerbose);
589 write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
595 static void process_chain(t_atoms* pdba,
596 gmx::ArrayRef<gmx::RVec> x,
609 gmx::ArrayRef<const RtpRename> rr)
611 /* Rename aromatics, lys, asp and histidine */
614 rename_bb(pdba, "TYR", "TYRU", false, symtab);
618 rename_bb(pdba, "TRP", "TRPU", false, symtab);
622 rename_bb(pdba, "PHE", "PHEU", false, symtab);
626 rename_bbint(pdba, "LYS", get_lystp, false, symtab, rr);
630 rename_bbint(pdba, "ARG", get_argtp, false, symtab, rr);
634 rename_bbint(pdba, "GLN", get_glntp, false, symtab, rr);
638 rename_bbint(pdba, "ASP", get_asptp, false, symtab, rr);
642 rename_bb(pdba, "ASPH", "ASP", false, symtab);
646 rename_bbint(pdba, "GLU", get_glutp, false, symtab, rr);
650 rename_bb(pdba, "GLUH", "GLU", false, symtab);
655 set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
659 rename_bbint(pdba, "HIS", get_histp, true, symtab, rr);
662 /* Initialize the rtp builing block names with the residue names
663 * for the residues that have not been processed above.
665 pdbres_to_gmxrtp(pdba);
667 /* Now we have all rtp names set.
668 * The rtp names will conform to Gromacs naming,
669 * unless the input pdb file contained one or more force field specific
670 * rtp names as residue names.
674 /* struct for sorting the atoms from the pdb file */
677 int resnr; /* residue number */
678 int j; /* database order index */
679 int index; /* original atom number */
680 char anm1; /* second letter of atom name */
681 char altloc; /* alternate location indicator */
684 static bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
686 int d = (a.resnr - b.resnr);
692 d = (a.anm1 - b.anm1);
695 d = (a.altloc - b.altloc);
702 static void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
705 t_atoms** newPdbAtoms,
706 std::vector<gmx::RVec>* x,
710 t_atoms* pdba = *pdbaptr;
711 std::vector<gmx::RVec> xnew;
718 for (int i = 0; i < natoms; i++)
720 atomnm = *pdba->atomname[i];
721 const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
723 std::find_if(localPpResidue->atomname.begin(), localPpResidue->atomname.end(),
724 [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
725 if (found == localPpResidue->atomname.end())
730 "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
731 "while sorting atoms.\n%s",
732 atomnm, *pdba->resinfo[pdba->atom[i].resind].name,
733 pdba->resinfo[pdba->atom[i].resind].nr, localPpResidue->resname.c_str(),
734 localPpResidue->natom(),
736 ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
737 "might have had a different number in the PDB file and was rebuilt\n"
738 "(it might for instance have been H3, and we only expected H1 & "
740 "Note that hydrogens might have been added to the entry for the "
742 "Remove this hydrogen or choose a different protonation state to "
744 "Option -ignh will ignore all hydrogens in the input."
746 gmx_fatal(FARGS, "%s", buf);
748 /* make shadow array to be sorted into indexgroup */
749 pdbi[i].resnr = pdba->atom[i].resind;
750 pdbi[i].j = std::distance(localPpResidue->atomname.begin(), found);
752 pdbi[i].anm1 = atomnm[1];
753 pdbi[i].altloc = pdba->pdbinfo[i].altloc;
755 std::sort(pdbi, pdbi + natoms, pdbicomp);
757 /* pdba is sorted in pdbnew using the pdbi index */
758 std::vector<int> a(natoms);
759 srenew(*newPdbAtoms, 1);
760 init_t_atoms((*newPdbAtoms), natoms, true);
761 (*newPdbAtoms)->nr = pdba->nr;
762 (*newPdbAtoms)->nres = pdba->nres;
763 srenew((*newPdbAtoms)->resinfo, pdba->nres);
764 std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
765 for (int i = 0; i < natoms; i++)
767 (*newPdbAtoms)->atom[i] = pdba->atom[pdbi[i].index];
768 (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
769 (*newPdbAtoms)->pdbinfo[i] = pdba->pdbinfo[pdbi[i].index];
770 xnew.push_back(x->at(pdbi[i].index));
771 /* make indexgroup in block */
772 a[i] = pdbi[i].index;
777 /* copy the sorted pdbnew back to pdba */
778 *pdbaptr = *newPdbAtoms;
780 add_grp(block, gnames, a, "prot_sort");
784 static int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose)
786 int i, j, oldnatoms, ndel;
789 printf("Checking for duplicate atoms....\n");
790 oldnatoms = pdba->nr;
792 /* NOTE: pdba->nr is modified inside the loop */
793 for (i = 1; (i < pdba->nr); i++)
795 /* compare 'i' and 'i-1', throw away 'i' if they are identical
796 this is a 'while' because multiple alternate locations can be present */
797 while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
798 && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
803 ri = &pdba->resinfo[pdba->atom[i].resind];
804 printf("deleting duplicate atom %4s %s%4d%c", *pdba->atomname[i], *ri->name,
806 if (ri->chainid && (ri->chainid != ' '))
808 printf(" ch %c", ri->chainid);
812 if (pdba->pdbinfo[i].atomnr)
814 printf(" pdb nr %4d", pdba->pdbinfo[i].atomnr);
816 if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
818 printf(" altloc %c", pdba->pdbinfo[i].altloc);
824 /* We can not free, since it might be in the symtab */
825 /* sfree(pdba->atomname[i]); */
826 for (j = i; j < pdba->nr; j++)
828 pdba->atom[j] = pdba->atom[j + 1];
829 pdba->atomname[j] = pdba->atomname[j + 1];
832 pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
834 copy_rvec(x[j + 1], x[j]);
836 srenew(pdba->atom, pdba->nr);
837 /* srenew(pdba->atomname, pdba->nr); */
838 srenew(pdba->pdbinfo, pdba->nr);
841 if (pdba->nr != oldnatoms)
843 printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
849 static void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, ResidueType* rt)
851 std::string startResidueString =
852 gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
853 std::string endResidueString =
854 gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
856 // Check whether all residues in chain have the same chain ID.
857 bool allResiduesHaveSameChainID = true;
858 char chainID0 = pdba->resinfo[r0].chainid;
860 std::string residueString;
862 for (int i = r0 + 1; i < r1; i++)
864 chainID = pdba->resinfo[i].chainid;
865 if (chainID != chainID0)
867 allResiduesHaveSameChainID = false;
868 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
873 if (!allResiduesHaveSameChainID)
876 "The chain covering the range %s--%s does not have a consistent chain ID. "
877 "The first residue has ID '%c', while residue %s has ID '%c'.",
878 startResidueString.c_str(), endResidueString.c_str(), chainID0,
879 residueString.c_str(), chainID);
882 // At this point all residues have the same ID. If they are also non-blank
883 // we can be a bit more aggressive and require the types match too.
886 bool allResiduesHaveSameType = true;
888 std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
890 for (int i = r0 + 1; i < r1; i++)
892 restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
893 if (!gmx::equalCaseInsensitive(restype, restype0))
895 allResiduesHaveSameType = false;
896 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
901 if (!allResiduesHaveSameType)
904 "The residues in the chain %s--%s do not have a consistent type. "
905 "The first residue has type '%s', while residue %s is of type '%s'. "
906 "Either there is a mistake in your chain, or it includes nonstandard "
907 "residue names that have not yet been added to the residuetypes.dat "
908 "file in the GROMACS library directory. If there are other molecules "
909 "such as ligands, they should not have the same chain ID as the "
910 "adjacent protein chain since it's a separate molecule.",
911 startResidueString.c_str(), endResidueString.c_str(), restype0.c_str(),
912 residueString.c_str(), restype.c_str());
917 static void find_nc_ter(t_atoms* pdba, int r0, int r1, int* r_start, int* r_end, ResidueType* rt)
920 gmx::compat::optional<std::string> startrestype;
925 int startWarnings = 0;
929 // Check that all residues have the same chain identifier, and if it is
930 // non-blank we also require the residue types to match.
931 checkResidueTypeSanity(pdba, r0, r1, rt);
933 // If we return correctly from checkResidueTypeSanity(), the only
934 // remaining cases where we can have non-matching residue types is if
935 // the chain ID was blank, which could be the case e.g. for a structure
936 // read from a GRO file or other file types without chain information.
937 // In that case we need to be a bit more liberal and detect chains based
938 // on the residue type.
940 // If we get here, the chain ID must be identical for all residues
941 char chainID = pdba->resinfo[r0].chainid;
943 /* Find the starting terminus (typially N or 5') */
944 for (i = r0; i < r1 && *r_start == -1; i++)
946 startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
951 if (gmx::equalCaseInsensitive(*startrestype, "Protein")
952 || gmx::equalCaseInsensitive(*startrestype, "DNA")
953 || gmx::equalCaseInsensitive(*startrestype, "RNA"))
955 printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name,
956 pdba->resinfo[i].nr);
959 else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
963 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n",
964 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
968 printf("Disabling further notes about ions.\n");
974 // Either no known residue type, or one not needing special handling
975 if (startWarnings < 5)
979 printf("\nWarning: Starting residue %s%d in chain not identified as "
981 "This chain lacks identifiers, which makes it impossible to do strict\n"
982 "classification of the start/end residues. Here we need to guess this "
984 "should not be part of the chain and instead introduce a break, but "
986 "be catastrophic if they should in fact be linked. Please check your "
988 "and add %s to residuetypes.dat if this was not correct.\n\n",
989 *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
993 printf("\nWarning: No residues in chain starting at %s%d identified as "
995 "This makes it impossible to link them into a molecule, which could "
997 "correct or a catastrophic error. Please check your structure, and add "
999 "necessary residue names to residuetypes.dat if this was not "
1001 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1004 if (startWarnings == 4)
1006 printf("Disabling further warnings about unidentified residues at start of "
1015 /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1016 for (int i = *r_start; i < r1; i++)
1018 gmx::compat::optional<std::string> restype =
1019 rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1024 if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
1028 else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1032 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n",
1033 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1037 printf("Disabling further notes about ions.\n");
1043 // Either no known residue type, or one not needing special handling.
1044 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1045 // Otherwise the call to checkResidueTypeSanity() will
1046 // have caught the problem.
1047 if (endWarnings < 5)
1049 printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1050 "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1051 "it impossible to do strict classification of the start/end residues. "
1053 "need to guess this residue should not be part of the chain and "
1055 "introduce a break, but that will be catastrophic if they should in "
1057 "linked. Please check your structure, and add %s to residuetypes.dat\n"
1058 "if this was not correct.\n\n",
1059 *pdba->resinfo[i].name, pdba->resinfo[i].nr, restype->c_str(),
1060 *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr,
1061 startrestype->c_str(), *pdba->resinfo[i].name);
1063 if (endWarnings == 4)
1065 printf("Disabling further warnings about unidentified residues at end of "
1075 printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name,
1076 pdba->resinfo[*r_end].nr);
1080 /* enum for chain separation */
1083 enChainSep_id_or_ter,
1084 enChainSep_id_and_ter,
1087 enChainSep_interactive
1089 static const char* ChainSepEnum[] = { "id_or_ter", "id_and_ter", "ter", "id", "interactive" };
1090 static const char* ChainSepInfoString[] = {
1091 "Splitting chemical chains based on TER records or chain id changing.\n",
1092 "Splitting chemical chains based on TER records and chain id changing.\n",
1093 "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1094 "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1095 "Splitting chemical chains interactively.\n"
1098 static void modify_chain_numbers(t_atoms* pdba, ChainSepType enumChainSep)
1101 char old_prev_chainid;
1102 char old_this_chainid;
1103 int old_prev_chainnum;
1104 int old_this_chainnum;
1106 char select[STRLEN];
1110 const char* prev_atomname;
1111 const char* this_atomname;
1112 const char* prev_resname;
1113 const char* this_resname;
1119 /* The default chain enumeration is based on TER records only */
1120 printf("%s", ChainSepInfoString[enumChainSep]);
1122 old_prev_chainid = '?';
1123 old_prev_chainnum = -1;
1126 this_atomname = nullptr;
1128 this_resname = nullptr;
1132 for (i = 0; i < pdba->nres; i++)
1134 ri = &pdba->resinfo[i];
1135 old_this_chainid = ri->chainid;
1136 old_this_chainnum = ri->chainnum;
1138 prev_atomname = this_atomname;
1139 prev_atomnum = this_atomnum;
1140 prev_resname = this_resname;
1141 prev_resnum = this_resnum;
1142 prev_chainid = this_chainid;
1144 this_atomname = *(pdba->atomname[i]);
1145 this_atomnum = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1146 this_resname = *ri->name;
1147 this_resnum = ri->nr;
1148 this_chainid = ri->chainid;
1150 switch (enumChainSep)
1152 case enChainSep_id_or_ter:
1153 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1159 case enChainSep_id_and_ter:
1160 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1167 if (old_this_chainid != old_prev_chainid)
1173 case enChainSep_ter:
1174 if (old_this_chainnum != old_prev_chainnum)
1179 case enChainSep_interactive:
1180 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1184 printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1186 "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1187 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1188 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1190 if (nullptr == fgets(select, STRLEN - 1, stdin))
1192 gmx_fatal(FARGS, "Error reading from stdin");
1195 if (i == 0 || select[0] == 'y')
1201 default: gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1203 old_prev_chainid = old_this_chainid;
1204 old_prev_chainnum = old_this_chainnum;
1206 ri->chainnum = new_chainnum;
1213 char chainnum = ' ';
1216 bool bAllWat = false;
1218 std::vector<int> chainstart;
1225 bool bAllWat = false;
1227 std::vector<int> chainstart;
1228 std::vector<MoleculePatchDatabase*> ntdb;
1229 std::vector<MoleculePatchDatabase*> ctdb;
1230 std::vector<int> r_start;
1231 std::vector<int> r_end;
1233 std::vector<gmx::RVec> x;
1236 // TODO make all enums into scoped enums
1237 /* enum for vsites */
1244 static const char* VSitesEnum[] = { "none", "hydrogens", "aromatics" };
1246 /* enum for water model */
1258 static const char* WaterEnum[] = { "select", "none", "spc", "spce",
1259 "tip3p", "tip4p", "tip5p", "tips3p" };
1261 /* enum for merge */
1268 static const char* MergeEnum[] = { "no", "all", "interactive" };
1276 class pdb2gmx : public ICommandLineOptionsModule
1282 bVsiteAromatics_(FALSE),
1283 enumChainSep_(enChainSep_id_or_ter),
1284 enumVSites_(enVSites_none),
1285 enumWater_(enWater_select),
1286 enumMerge_(enMerge_no),
1292 // From ICommandLineOptionsModule
1293 void init(CommandLineModuleSettings* /*settings*/) override {}
1295 void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1297 void optionsFinished() override;
1315 bool bAllowMissing_;
1319 bool bChargeGroups_;
1329 bool bVsiteAromatics_;
1333 real long_bond_dist_;
1334 real short_bond_dist_;
1336 std::string indexOutputFile_;
1337 std::string outputFile_;
1338 std::string topologyFile_;
1339 std::string includeTopologyFile_;
1340 std::string outputConfFile_;
1341 std::string inputConfFile_;
1342 std::string outFile_;
1345 ChainSepType enumChainSep_;
1346 VSitesType enumVSites_;
1347 WaterType enumWater_;
1348 MergeType enumMerge_;
1351 char forcefield_[STRLEN];
1352 char ffdir_[STRLEN];
1355 std::vector<std::string> incls_;
1356 std::vector<t_mols> mols_;
1360 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1362 const char* desc[] = {
1363 "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1364 "some database files, adds hydrogens to the molecules and generates",
1365 "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1366 "and a topology in GROMACS format.",
1367 "These files can subsequently be processed to generate a run input file.",
1369 "[THISMODULE] will search for force fields by looking for",
1370 "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1371 "of the current working directory and of the GROMACS library directory",
1372 "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1374 "By default the forcefield selection is interactive,",
1375 "but you can use the [TT]-ff[tt] option to specify one of the short names",
1376 "in the list on the command line instead. In that case [THISMODULE] just looks",
1377 "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1379 "After choosing a force field, all files will be read only from",
1380 "the corresponding force field directory.",
1381 "If you want to modify or add a residue types, you can copy the force",
1382 "field directory from the GROMACS library directory to your current",
1383 "working directory. If you want to add new protein residue types,",
1384 "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1385 "or copy the whole library directory to a local directory and set",
1386 "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1387 "Check Chapter 5 of the manual for more information about file formats.",
1390 "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1391 "need not necessarily contain a protein structure. Every kind of",
1392 "molecule for which there is support in the database can be converted.",
1393 "If there is no support in the database, you can add it yourself.[PAR]",
1395 "The program has limited intelligence, it reads a number of database",
1396 "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1397 "if necessary this can be done manually. The program can prompt the",
1398 "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1399 "desired. For Lys the choice is between neutral (two protons on NZ) or",
1400 "protonated (three protons, default), for Asp and Glu unprotonated",
1401 "(default) or protonated, for His the proton can be either on ND1,",
1402 "on NE2 or on both. By default these selections are done automatically.",
1403 "For His, this is based on an optimal hydrogen bonding",
1404 "conformation. Hydrogen bonds are defined based on a simple geometric",
1405 "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1406 "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1407 "[TT]-dist[tt] respectively.[PAR]",
1409 "The protonation state of N- and C-termini can be chosen interactively",
1410 "with the [TT]-ter[tt] flag. Default termini are ionized (NH3+ and COO-),",
1411 "respectively. Some force fields support zwitterionic forms for chains of",
1412 "one residue, but for polypeptides these options should NOT be selected.",
1413 "The AMBER force fields have unique forms for the terminal residues,",
1414 "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1415 "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1416 "respectively to use these forms, making sure you preserve the format",
1417 "of the coordinate file. Alternatively, use named terminating residues",
1418 "(e.g. ACE, NME).[PAR]",
1420 "The separation of chains is not entirely trivial since the markup",
1421 "in user-generated PDB files frequently varies and sometimes it",
1422 "is desirable to merge entries across a TER record, for instance",
1423 "if you want a disulfide bridge or distance restraints between",
1424 "two protein chains or if you have a HEME group bound to a protein.",
1425 "In such cases multiple chains should be contained in a single",
1426 "[TT]moleculetype[tt] definition.",
1427 "To handle this, [THISMODULE] uses two separate options.",
1428 "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1429 "start, and termini added when applicable. This can be done based on the",
1430 "existence of TER records, when the chain id changes, or combinations of either",
1431 "or both of these. You can also do the selection fully interactively.",
1432 "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1433 "are merged into one moleculetype, after adding all the chemical termini (or not).",
1434 "This can be turned off (no merging), all non-water chains can be merged into a",
1435 "single molecule, or the selection can be done interactively.[PAR]",
1437 "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1438 "If any of the occupancies are not one, indicating that the atom is",
1439 "not resolved well in the structure, a warning message is issued.",
1440 "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1441 "all occupancy fields may be zero. Either way, it is up to the user",
1442 "to verify the correctness of the input data (read the article!).[PAR]",
1444 "During processing the atoms will be reordered according to GROMACS",
1445 "conventions. With [TT]-n[tt] an index file can be generated that",
1446 "contains one group reordered in the same way. This allows you to",
1447 "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1448 "one limitation: reordering is done after the hydrogens are stripped",
1449 "from the input and before new hydrogens are added. This means that",
1450 "you should not use [TT]-ignh[tt].[PAR]",
1452 "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1453 "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1454 "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1457 "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1458 "motions. Angular and out-of-plane motions can be removed by changing",
1459 "hydrogens into virtual sites and fixing angles, which fixes their",
1460 "position relative to neighboring atoms. Additionally, all atoms in the",
1461 "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1462 "can be converted into virtual sites, eliminating the fast improper dihedral",
1463 "fluctuations in these rings (but this feature is deprecated).",
1464 "[BB]Note[bb] that in this case all other hydrogen",
1465 "atoms are also converted to virtual sites. The mass of all atoms that are",
1466 "converted into virtual sites, is added to the heavy atoms.[PAR]",
1467 "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1468 "done by increasing the hydrogen-mass by a factor of 4. This is also",
1469 "done for water hydrogens to slow down the rotational motion of water.",
1470 "The increase in mass of the hydrogens is subtracted from the bonded",
1471 "(heavy) atom so that the total mass of the system remains the same."
1474 settings->setHelpText(desc);
1476 options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1477 "Write the residue database in new format to [TT]new.rtp[tt]"));
1479 RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1481 RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1483 EnumOption<ChainSepType>("chainsep").enumValue(ChainSepEnum).store(&enumChainSep_).description("Condition in PDB files when a new chain should be started (adding termini)"));
1484 options->addOption(EnumOption<MergeType>("merge")
1485 .enumValue(MergeEnum)
1487 .description("Merge multiple chains into a single [moleculetype]"));
1488 options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1489 "Force field, interactive by default. Use [TT]-h[tt] for information."));
1491 EnumOption<WaterType>("water").store(&enumWater_).enumValue(WaterEnum).description("Water model to use"));
1492 options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1493 "Set the next 8 options to interactive"));
1494 options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1495 "Interactive SS bridge selection"));
1496 options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1497 "Interactive termini selection, instead of charged (default)"));
1498 options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1499 "Interactive lysine selection, instead of charged"));
1500 options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1501 "Interactive arginine selection, instead of charged"));
1502 options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1503 "Interactive aspartic acid selection, instead of charged"));
1504 options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1505 "Interactive glutamic acid selection, instead of charged"));
1506 options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1507 "Interactive glutamine selection, instead of charged"));
1508 options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1509 "Interactive histidine selection, instead of checking H-bonds"));
1510 options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1511 "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1513 RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1514 options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1515 "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1517 options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1518 "Sort the residues according to database, turning this off is dangerous as charge "
1519 "groups might be broken in parts"));
1521 BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1523 BooleanOption("missing")
1524 .store(&bAllowMissing_)
1525 .defaultValue(false)
1527 "Continue when atoms are missing and bonds cannot be made, dangerous"));
1529 BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1531 RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1533 EnumOption<VSitesType>("vsite").store(&enumVSites_).enumValue(VSitesEnum).description("Convert atoms to virtual sites"));
1534 options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1535 "Make hydrogen atoms heavy"));
1537 BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1538 options->addOption(BooleanOption("chargegrp")
1539 .store(&bChargeGroups_)
1541 .description("Use charge groups in the [REF].rtp[ref] file"));
1542 options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1543 "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1544 options->addOption(BooleanOption("renum")
1546 .defaultValue(false)
1547 .description("Renumber the residues consecutively in the output"));
1548 options->addOption(BooleanOption("rtpres")
1549 .store(&bRTPresname_)
1550 .defaultValue(false)
1551 .description("Use [REF].rtp[ref] entry names as residue names"));
1552 options->addOption(FileNameOption("f")
1555 .store(&inputConfFile_)
1557 .defaultBasename("protein")
1559 .description("Structure file"));
1560 options->addOption(FileNameOption("o")
1563 .store(&outputConfFile_)
1565 .defaultBasename("conf")
1566 .description("Structure file"));
1567 options->addOption(FileNameOption("p")
1570 .store(&topologyFile_)
1572 .defaultBasename("topol")
1573 .description("Topology file"));
1574 options->addOption(FileNameOption("i")
1577 .store(&includeTopologyFile_)
1579 .defaultBasename("posre")
1580 .description("Include file for topology"));
1581 options->addOption(FileNameOption("n")
1584 .store(&indexOutputFile_)
1585 .storeIsSet(&bIndexSet_)
1586 .defaultBasename("index")
1587 .description("Index file"));
1588 options->addOption(FileNameOption("q")
1592 .storeIsSet(&bOutputSet_)
1593 .defaultBasename("clean")
1595 .description("Structure file"));
1598 void pdb2gmx::optionsFinished()
1600 if (inputConfFile_.empty())
1602 GMX_THROW(InconsistentInputError("You must supply an input file"));
1606 /* if anything changes here, also change description of -inter */
1621 else if (bDeuterate_)
1630 /* Force field selection, interactive or direct */
1631 choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(), forcefield_,
1632 sizeof(forcefield_), ffdir_, sizeof(ffdir_));
1634 if (strlen(forcefield_) > 0)
1636 ffname_ = forcefield_;
1637 ffname_[0] = std::toupper(ffname_[0]);
1641 gmx_fatal(FARGS, "Empty forcefield string");
1647 char select[STRLEN];
1648 std::vector<DisulfideBond> ssbonds;
1652 const char* prev_atomname;
1653 const char* this_atomname;
1654 const char* prev_resname;
1655 const char* this_resname;
1660 int prev_chainnumber;
1661 int this_chainnumber;
1662 int this_chainstart;
1663 int prev_chainstart;
1665 printf("\nUsing the %s force field in directory %s\n\n", ffname_, ffdir_);
1667 choose_watermodel(WaterEnum[enumWater_], ffdir_, &watermodel_);
1669 switch (enumVSites_)
1673 bVsiteAromatics_ = false;
1675 case enVSites_hydrogens:
1677 bVsiteAromatics_ = false;
1679 case enVSites_aromatics:
1681 bVsiteAromatics_ = true;
1684 gmx_fatal(FARGS, "Internal inconsistency: VSitesEnum='%s'", VSitesEnum[enumVSites_]);
1687 /* Open the symbol table */
1689 open_symtab(&symtab);
1691 /* Residue type database */
1694 /* Read residue renaming database(s), if present */
1695 std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1697 std::vector<RtpRename> rtprename;
1698 for (const auto& filename : rrn)
1700 printf("going to rename %s\n", filename.c_str());
1701 FILE* fp = fflib_open(filename);
1702 read_rtprename(filename.c_str(), fp, &rtprename);
1706 /* Add all alternative names from the residue renaming database to the list
1707 of recognized amino/nucleic acids. */
1708 for (const auto& rename : rtprename)
1710 /* Only add names if the 'standard' gromacs/iupac base name was found */
1711 if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
1713 rt.addResidue(rename.main, *restype);
1714 rt.addResidue(rename.nter, *restype);
1715 rt.addResidue(rename.cter, *restype);
1716 rt.addResidue(rename.bter, *restype);
1723 if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
1727 else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
1737 char* title = nullptr;
1741 int natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(), &title, &pdba_all,
1742 &pdbx, &ePBC, box, bRemoveH_, &symtab, &rt, watres, &aps, bVerbose_);
1746 std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1747 GMX_THROW(InconsistentInputError(message));
1750 printf("Analyzing pdb file\n");
1751 int nwaterchain = 0;
1753 modify_chain_numbers(&pdba_all, enumChainSep_);
1755 int nchainmerges = 0;
1757 this_atomname = nullptr;
1759 this_resname = nullptr;
1762 this_chainnumber = -1;
1763 this_chainstart = 0;
1764 /* Keep the compiler happy */
1765 prev_chainstart = 0;
1768 std::vector<t_pdbchain> pdb_ch;
1771 bool bMerged = false;
1772 for (int i = 0; (i < natom); i++)
1774 ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1776 /* TODO this should live in a helper object, and consolidate
1777 that with code in modify_chain_numbers */
1778 prev_atomname = this_atomname;
1779 prev_atomnum = this_atomnum;
1780 prev_resname = this_resname;
1781 prev_resnum = this_resnum;
1782 prev_chainid = this_chainid;
1783 prev_chainnumber = this_chainnumber;
1786 prev_chainstart = this_chainstart;
1789 this_atomname = *pdba_all.atomname[i];
1790 this_atomnum = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
1791 this_resname = *ri->name;
1792 this_resnum = ri->nr;
1793 this_chainid = ri->chainid;
1794 this_chainnumber = ri->chainnum;
1796 bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
1798 if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1802 "Must have pdbinfo from reading a PDB file if chain number is changing");
1803 this_chainstart = pdba_all.atom[i].resind;
1805 if (i > 0 && !bWat_)
1807 if (!strncmp(MergeEnum[enumMerge_], "int", 3))
1809 printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and "
1810 "chain starting with\n"
1811 "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype "
1812 "(keeping termini)? [n/y]\n",
1813 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1814 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1816 if (nullptr == fgets(select, STRLEN - 1, stdin))
1818 gmx_fatal(FARGS, "Error reading from stdin");
1820 bMerged = (select[0] == 'y');
1822 else if (!strncmp(MergeEnum[enumMerge_], "all", 3))
1830 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
1831 pdba_all.atom[i].resind - prev_chainstart;
1832 pdb_ch[numChains - 1].nterpairs++;
1833 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
1838 /* set natom for previous chain */
1841 pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
1848 /* check if chain identifier was used before */
1849 for (int j = 0; (j < numChains); j++)
1851 if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1853 printf("WARNING: Chain identifier '%c' is used in two non-sequential "
1855 "They will be treated as separate chains unless you reorder your "
1860 t_pdbchain newChain;
1861 newChain.chainid = ri->chainid;
1862 newChain.chainnum = ri->chainnum;
1864 newChain.bAllWat = bWat_;
1867 newChain.nterpairs = 0;
1871 newChain.nterpairs = 1;
1873 newChain.chainstart.resize(newChain.nterpairs + 1);
1874 /* modified [numChains] to [0] below */
1875 newChain.chainstart[0] = 0;
1876 pdb_ch.push_back(newChain);
1882 pdb_ch.back().natom = natom - pdb_ch.back().start;
1884 /* set all the water blocks at the end of the chain */
1885 std::vector<int> swap_index(numChains);
1887 for (int i = 0; i < numChains; i++)
1889 if (!pdb_ch[i].bAllWat)
1895 for (int i = 0; i < numChains; i++)
1897 if (pdb_ch[i].bAllWat)
1903 if (nwaterchain > 1)
1905 printf("Moved all the water blocks to the end\n");
1909 std::vector<t_chain> chains(numChains);
1910 /* copy pdb data and x for all chains */
1911 for (int i = 0; (i < numChains); i++)
1913 int si = swap_index[i];
1914 chains[i].chainid = pdb_ch[si].chainid;
1915 chains[i].chainnum = pdb_ch[si].chainnum;
1916 chains[i].bAllWat = pdb_ch[si].bAllWat;
1917 chains[i].nterpairs = pdb_ch[si].nterpairs;
1918 chains[i].chainstart = pdb_ch[si].chainstart;
1919 chains[i].ntdb.clear();
1920 chains[i].ctdb.clear();
1921 chains[i].r_start.resize(pdb_ch[si].nterpairs);
1922 chains[i].r_end.resize(pdb_ch[si].nterpairs);
1924 snew(chains[i].pdba, 1);
1925 init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
1926 for (j = 0; j < chains[i].pdba->nr; j++)
1928 chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start + j];
1929 chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
1930 chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
1931 chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
1933 /* Re-index the residues assuming that the indices are continuous */
1934 int k = chains[i].pdba->atom[0].resind;
1935 int nres = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
1936 chains[i].pdba->nres = nres;
1937 for (int j = 0; j < chains[i].pdba->nr; j++)
1939 chains[i].pdba->atom[j].resind -= k;
1941 srenew(chains[i].pdba->resinfo, nres);
1942 for (int j = 0; j < nres; j++)
1944 chains[i].pdba->resinfo[j] = pdba_all.resinfo[k + j];
1945 chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
1946 /* make all chain identifiers equal to that of the chain */
1947 chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1951 if (nchainmerges > 0)
1953 printf("\nMerged chains into joint molecule definitions at %d places.\n\n", nchainmerges);
1956 printf("There are %d chains and %d blocks of water and "
1957 "%d residues with %d atoms\n",
1958 numChains - nwaterchain, nwaterchain, pdba_all.nres, natom);
1960 printf("\n %5s %4s %6s\n", "chain", "#res", "#atoms");
1961 for (int i = 0; (i < numChains); i++)
1963 printf(" %d '%c' %5d %6d %s\n", i + 1, chains[i].chainid ? chains[i].chainid : '-',
1964 chains[i].pdba->nres, chains[i].pdba->nr, chains[i].bAllWat ? "(only water)" : "");
1968 check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_);
1970 /* Read atomtypes... */
1971 PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
1973 /* read residue database */
1974 printf("Reading residue database... (%s)\n", forcefield_);
1975 std::vector<std::string> rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
1976 std::vector<PreprocessResidue> rtpFFDB;
1977 for (const auto& filename : rtpf)
1979 readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, false);
1983 /* Not correct with multiple rtp input files with different bonded types */
1984 FILE* fp = gmx_fio_fopen("new.rtp", "w");
1985 print_resall(fp, rtpFFDB, atype);
1989 /* read hydrogen database */
1990 std::vector<MoleculePatchDatabase> ah;
1991 read_h_db(ffdir_, &ah);
1993 /* Read Termini database... */
1994 std::vector<MoleculePatchDatabase> ntdb;
1995 std::vector<MoleculePatchDatabase> ctdb;
1996 std::vector<MoleculePatchDatabase*> tdblist;
1997 int nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
1998 int nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
2000 FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2002 print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2005 std::vector<gmx::RVec> x;
2006 /* new pdb datastructure for sorting. */
2007 t_atoms** sortAtoms = nullptr;
2008 t_atoms** localAtoms = nullptr;
2009 snew(sortAtoms, numChains);
2010 snew(localAtoms, numChains);
2011 for (int chain = 0; (chain < numChains); chain++)
2013 cc = &(chains[chain]);
2015 /* set pdba, natom and nres to the current chain */
2018 natom = cc->pdba->nr;
2019 int nres = cc->pdba->nres;
2021 if (cc->chainid && (cc->chainid != ' '))
2023 printf("Processing chain %d '%c' (%d atoms, %d residues)\n", chain + 1, cc->chainid,
2028 printf("Processing chain %d (%d atoms, %d residues)\n", chain + 1, natom, nres);
2031 process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_, bHisMan_,
2032 bArgMan_, bGlnMan_, angle_, distance_, &symtab, rtprename);
2034 cc->chainstart[cc->nterpairs] = pdba->nres;
2036 for (int i = 0; i < cc->nterpairs; i++)
2038 find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i + 1], &(cc->r_start[j]),
2039 &(cc->r_end[j]), &rt);
2041 if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2047 if (cc->nterpairs == 0)
2049 printf("Problem with chain definition, or missing terminal residues.\n"
2050 "This chain does not appear to contain a recognized chain molecule.\n"
2051 "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
2054 /* Check for disulfides and other special bonds */
2055 ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2057 if (!rtprename.empty())
2059 rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_);
2062 for (int i = 0; i < cc->nterpairs; i++)
2065 * We first apply a filter so we only have the
2066 * termini that can be applied to the residue in question
2067 * (or a generic terminus if no-residue specific is available).
2069 /* First the N terminus */
2072 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2073 if (tdblist.empty())
2075 printf("No suitable end (N or 5') terminus found in database - assuming this "
2077 "is already in a terminus-specific form and skipping terminus "
2079 cc->ntdb.push_back(nullptr);
2083 if (bTerMan_ && !tdblist.empty())
2085 sprintf(select, "Select start terminus type for %s-%d",
2086 *pdba->resinfo[cc->r_start[i]].name, pdba->resinfo[cc->r_start[i]].nr);
2087 cc->ntdb.push_back(choose_ter(tdblist, select));
2091 cc->ntdb.push_back(tdblist[0]);
2094 printf("Start terminus %s-%d: %s\n", *pdba->resinfo[cc->r_start[i]].name,
2095 pdba->resinfo[cc->r_start[i]].nr, (cc->ntdb[i])->name.c_str());
2101 cc->ntdb.push_back(nullptr);
2104 /* And the C terminus */
2107 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2108 if (tdblist.empty())
2110 printf("No suitable end (C or 3') terminus found in database - assuming this "
2112 "is already in a terminus-specific form and skipping terminus "
2114 cc->ctdb.push_back(nullptr);
2118 if (bTerMan_ && !tdblist.empty())
2120 sprintf(select, "Select end terminus type for %s-%d",
2121 *pdba->resinfo[cc->r_end[i]].name, pdba->resinfo[cc->r_end[i]].nr);
2122 cc->ctdb.push_back(choose_ter(tdblist, select));
2126 cc->ctdb.push_back(tdblist[0]);
2128 printf("End terminus %s-%d: %s\n", *pdba->resinfo[cc->r_end[i]].name,
2129 pdba->resinfo[cc->r_end[i]].nr, (cc->ctdb[i])->name.c_str());
2135 cc->ctdb.push_back(nullptr);
2138 std::vector<MoleculePatchDatabase> hb_chain;
2139 /* lookup hackblocks and rtp for all residues */
2140 std::vector<PreprocessResidue> restp_chain;
2141 get_hackblocks_rtp(&hb_chain, &restp_chain, rtpFFDB, pdba->nres, pdba->resinfo, cc->nterpairs,
2142 &symtab, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_);
2143 /* ideally, now we would not need the rtp itself anymore, but do
2144 everything using the hb and restp arrays. Unfortunately, that
2145 requires some re-thinking of code in gen_vsite.c, which I won't
2146 do now :( AF 26-7-99 */
2148 rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2150 match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_);
2155 t_blocka* block = new_blocka();
2157 sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2158 remove_duplicate_atoms(pdba, x, bVerbose_);
2164 "WARNING: with the -remh option the generated "
2165 "index file (%s) might be useless\n"
2166 "(the index file is generated before hydrogens are added)",
2167 indexOutputFile_.c_str());
2169 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2171 for (int i = 0; i < block->nr; i++)
2183 "without sorting no check for duplicate atoms can be done\n");
2186 /* Generate Hydrogen atoms (and termini) in the sequence */
2187 printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2188 add_h(&pdba, &localAtoms[chain], &x, ah, &symtab, cc->nterpairs, cc->ntdb, cc->ctdb,
2189 cc->r_start, cc->r_end, bAllowMissing_);
2190 printf("Now there are %d residues with %d atoms\n", pdba->nres, pdba->nr);
2192 /* make up molecule name(s) */
2194 int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2196 std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2198 std::string molname;
2206 this_chainid = cc->chainid;
2208 /* Add the chain id if we have one */
2209 if (this_chainid != ' ')
2211 suffix.append(formatString("_chain_%c", this_chainid));
2214 /* Check if there have been previous chains with the same id */
2216 for (int k = 0; k < chain; k++)
2218 if (cc->chainid == chains[k].chainid)
2223 /* Add the number for this chain identifier if there are multiple copies */
2226 suffix.append(formatString("%d", nid_used + 1));
2229 if (suffix.length() > 0)
2231 molname.append(restype);
2232 molname.append(suffix);
2239 std::string itp_fn = topologyFile_;
2241 std::string posre_fn = includeTopologyFile_;
2242 if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2245 printf("Chain time...\n");
2246 // construct the itp file name
2247 itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2249 itp_fn.append(molname);
2250 itp_fn.append(".itp");
2251 // now do the same for posre
2252 posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2253 posre_fn.append("_");
2254 posre_fn.append(molname);
2255 posre_fn.append(".itp");
2256 if (posre_fn == itp_fn)
2258 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2260 incls_.emplace_back();
2261 incls_.back() = itp_fn;
2262 itp_file_ = gmx_fio_fopen(itp_fn.c_str(), "w");
2269 mols_.emplace_back();
2272 mols_.back().name = "SOL";
2273 mols_.back().nr = pdba->nres;
2277 mols_.back().name = molname;
2278 mols_.back().nr = 1;
2283 print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2289 top_file2 = nullptr;
2293 top_file2 = itp_file_;
2297 top_file2 = top_file;
2300 pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, &atype, &symtab, rtpFFDB,
2301 restp_chain, hb_chain, bAllowMissing_, bVsites_, bVsiteAromatics_, ffdir_, mHmult_,
2302 ssbonds, long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2303 bRenumRes_, bRTPresname_);
2307 write_posres(posre_fn.c_str(), pdba, posre_fc_);
2312 gmx_fio_fclose(itp_file_);
2315 /* pdba and natom have been reassigned somewhere so: */
2320 if (watermodel_ == nullptr)
2322 for (int chain = 0; chain < numChains; chain++)
2324 if (chains[chain].bAllWat)
2326 auto message = formatString(
2327 "You have chosen not to include a water model, "
2328 "but there is water in the input file. Select a "
2329 "water model or remove the water from your input file.");
2330 GMX_THROW(InconsistentInputError(message));
2336 std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2337 if (!fflib_fexist(waterFile))
2339 auto message = formatString(
2340 "The topology file '%s' for the selected water "
2341 "model '%s' can not be found in the force field "
2342 "directory. Select a different water model.",
2343 waterFile.c_str(), watermodel_);
2344 GMX_THROW(InconsistentInputError(message));
2348 print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2349 gmx_fio_fclose(top_file);
2351 /* now merge all chains back together */
2354 for (int i = 0; (i < numChains); i++)
2356 natom += chains[i].pdba->nr;
2357 nres += chains[i].pdba->nres;
2361 init_t_atoms(atoms, natom, false);
2362 for (int i = 0; i < atoms->nres; i++)
2364 sfree(atoms->resinfo[i].name);
2367 srenew(atoms->resinfo, nres);
2371 for (int i = 0; (i < numChains); i++)
2375 printf("Including chain %d in system: %d atoms %d residues\n", i + 1,
2376 chains[i].pdba->nr, chains[i].pdba->nres);
2378 for (int j = 0; (j < chains[i].pdba->nr); j++)
2380 atoms->atom[k] = chains[i].pdba->atom[j];
2381 atoms->atom[k].resind += l; /* l is processed nr of residues */
2382 atoms->atomname[k] = chains[i].pdba->atomname[j];
2383 atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2384 x.push_back(chains[i].x[j]);
2387 for (int j = 0; (j < chains[i].pdba->nres); j++)
2389 atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2392 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2396 done_atom(chains[i].pdba);
2401 fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2402 print_sums(atoms, true);
2406 fprintf(stderr, "\nWriting coordinate file...\n");
2407 clear_rvec(box_space);
2410 make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2412 write_sto_conf(outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr, ePBC, box);
2414 done_symtab(&symtab);
2415 done_atom(&pdba_all);
2417 for (int chain = 0; chain < numChains; chain++)
2419 sfree(sortAtoms[chain]);
2420 sfree(localAtoms[chain]);
2427 printf("\t\t--------- PLEASE NOTE ------------\n");
2428 printf("You have successfully generated a topology from: %s.\n", inputConfFile_.c_str());
2429 if (watermodel_ != nullptr)
2431 printf("The %s force field and the %s water model are used.\n", ffname_, watermodel_);
2436 printf("The %s force field is used.\n", ffname_);
2438 printf("\t\t--------- ETON ESAELP ------------\n");
2445 const char pdb2gmxInfo::name[] = "pdb2gmx";
2446 const char pdb2gmxInfo::shortDescription[] =
2447 "Convert coordinate files to topology and FF-compliant coordinate files";
2448 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2450 return std::make_unique<pdb2gmx>();