2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
51 #include "gromacs/commandline/cmdlineoptionsmodule.h"
52 #include "gromacs/fileio/confio.h"
53 #include "gromacs/fileio/filetypes.h"
54 #include "gromacs/fileio/gmxfio.h"
55 #include "gromacs/fileio/pdbio.h"
56 #include "gromacs/gmxlib/conformation_utilities.h"
57 #include "gromacs/gmxpreprocess/fflibutil.h"
58 #include "gromacs/gmxpreprocess/genhydro.h"
59 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
60 #include "gromacs/gmxpreprocess/grompp_impl.h"
61 #include "gromacs/gmxpreprocess/h_db.h"
62 #include "gromacs/gmxpreprocess/hizzie.h"
63 #include "gromacs/gmxpreprocess/pdb2top.h"
64 #include "gromacs/gmxpreprocess/pgutil.h"
65 #include "gromacs/gmxpreprocess/specbond.h"
66 #include "gromacs/gmxpreprocess/ter_db.h"
67 #include "gromacs/gmxpreprocess/toputil.h"
68 #include "gromacs/gmxpreprocess/xlate.h"
69 #include "gromacs/math/vec.h"
70 #include "gromacs/options/basicoptions.h"
71 #include "gromacs/options/filenameoption.h"
72 #include "gromacs/options/ioptionscontainer.h"
73 #include "gromacs/topology/atomprop.h"
74 #include "gromacs/topology/block.h"
75 #include "gromacs/topology/index.h"
76 #include "gromacs/topology/residuetypes.h"
77 #include "gromacs/topology/symtab.h"
78 #include "gromacs/topology/topology.h"
79 #include "gromacs/utility/dir_separator.h"
80 #include "gromacs/utility/exceptions.h"
81 #include "gromacs/utility/fatalerror.h"
82 #include "gromacs/utility/path.h"
83 #include "gromacs/utility/smalloc.h"
84 #include "gromacs/utility/strdb.h"
85 #include "gromacs/utility/stringutil.h"
87 #include "hackblock.h"
92 RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
107 static const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
109 /* NOTE: This function returns the main building block name,
110 * it does not take terminal renaming into account.
112 auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
113 return gmx::equalCaseInsensitive(name, rename.gmx);
115 return found != rr.end() ? found->main.c_str() : name.c_str();
118 static const char* select_res(int nr,
123 gmx::ArrayRef<const RtpRename> rr)
125 printf("Which %s type do you want for residue %d\n", title, resnr + 1);
126 for (int sel = 0; (sel < nr); sel++)
128 printf("%d. %s (%s)\n", sel, expl[sel], res2bb_notermini(name[sel], rr));
130 printf("\nType a number:");
134 if (scanf("%d", &userSelection) != 1)
136 gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
139 return name[userSelection];
142 static const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
150 const char* lh[easpNR] = { "ASP", "ASPH" };
151 const char* expl[easpNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
153 return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", rr);
156 static const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
164 const char* lh[egluNR] = { "GLU", "GLUH" };
165 const char* expl[egluNR] = { "Not protonated (charge -1)", "Protonated (charge 0)" };
167 return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", rr);
170 static const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
178 const char* lh[eglnNR] = { "GLN", "QLN" };
179 const char* expl[eglnNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
181 return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", rr);
184 static const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
192 const char* lh[elysNR] = { "LYSN", "LYS" };
193 const char* expl[elysNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
195 return select_res(elysNR, resnr, lh, expl, "LYSINE", rr);
198 static const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
206 const char* lh[eargNR] = { "ARGN", "ARG" };
207 const char* expl[eargNR] = { "Not protonated (charge 0)", "Protonated (charge +1)" };
209 return select_res(eargNR, resnr, lh, expl, "ARGININE", rr);
212 static const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
214 const char* expl[ehisNR] = { "H on ND1 only", "H on NE2 only", "H on ND1 and NE2",
217 return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", rr);
220 static void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
222 char line[STRLEN], buf[STRLEN];
225 while (get_a_line(fp, line, STRLEN))
227 /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
228 * For other args, we read up to 6 chars (so we can detect if the length is > 5).
229 * Note that the buffer length has been increased to 7 to allow this,
230 * so we just need to make sure the strings have zero-length initially.
242 int nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
243 RtpRename newEntry(gmx, main, nter, cter, bter);
246 if (nc != 2 && nc != 5)
248 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d",
256 "A line in residue renaming database '%s' has %d columns, while previous "
257 "lines have %d columns",
263 /* This file does not have special termini names, copy them from main */
264 newEntry.nter = newEntry.main;
265 newEntry.cter = newEntry.main;
266 newEntry.bter = newEntry.main;
268 rtprename->push_back(newEntry);
272 static std::string search_resrename(gmx::ArrayRef<const RtpRename> rr,
276 bool bCompareFFRTPname)
278 auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
279 return ((!bCompareFFRTPname && (name == rename.gmx))
280 || (bCompareFFRTPname && (name == rename.main)));
284 /* If found in the database, rename this residue's rtp building block,
285 * otherwise keep the old name.
287 if (found != rr.end())
291 newName = found->bter;
295 newName = found->nter;
299 newName = found->cter;
303 newName = found->main;
306 if (newName[0] == '-')
308 gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name,
309 bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
310 : (bEnd ? " as an ending terminus" : ""));
317 static void rename_resrtp(t_atoms* pdba,
319 gmx::ArrayRef<const int> r_start,
320 gmx::ArrayRef<const int> r_end,
321 gmx::ArrayRef<const RtpRename> rr,
325 bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
327 for (int r = 0; r < pdba->nres; r++)
331 for (int j = 0; j < nterpairs; j++)
338 for (int j = 0; j < nterpairs; j++)
346 std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
348 if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
350 /* This is a terminal residue, but the residue name,
351 * currently stored in .rtp, is not a standard residue name,
352 * but probably a force field specific rtp name.
353 * Check if we need to rename it because it is terminal.
355 newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
358 if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
362 printf("Changing rtp entry of residue %d %s to '%s'\n", pdba->resinfo[r].nr,
363 *pdba->resinfo[r].name, newName.c_str());
365 pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
370 static void pdbres_to_gmxrtp(t_atoms* pdba)
374 for (i = 0; (i < pdba->nres); i++)
376 if (pdba->resinfo[i].rtp == nullptr)
378 pdba->resinfo[i].rtp = pdba->resinfo[i].name;
383 static void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
388 for (i = 0; (i < pdba->nres); i++)
390 resnm = *pdba->resinfo[i].name;
391 if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
392 || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
394 /* Rename the residue name (not the rtp name) */
395 pdba->resinfo[i].name = put_symtab(symtab, newnm);
400 static void rename_bb(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
405 for (i = 0; (i < pdba->nres); i++)
407 /* We have not set the rtp name yes, use the residue name */
408 bbnm = *pdba->resinfo[i].name;
409 if ((bFullCompare && (gmx::equalCaseInsensitive(bbnm, oldnm)))
410 || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
412 /* Change the rtp builing block name */
413 pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
418 static void rename_bbint(t_atoms* pdba,
420 const char* gettp(int, gmx::ArrayRef<const RtpRename>),
423 gmx::ArrayRef<const RtpRename> rr)
429 for (i = 0; i < pdba->nres; i++)
431 /* We have not set the rtp name yet, use the residue name */
432 bbnm = *pdba->resinfo[i].name;
433 if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) || (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
436 pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
441 static void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose)
447 ftp = fn2ftp(filename);
448 if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
450 fprintf(stderr, "No occupancies in %s\n", filename);
454 for (i = 0; (i < atoms->nr); i++)
456 if (atoms->pdbinfo[i].occup != 1)
460 fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
461 *atoms->resinfo[atoms->atom[i].resind].name,
462 atoms->resinfo[atoms->atom[i].resind].nr, *atoms->atomname[i],
463 atoms->pdbinfo[i].occup);
465 if (atoms->pdbinfo[i].occup == 0)
475 if (nzero == atoms->nr)
477 fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
479 else if ((nzero > 0) || (nnotone > 0))
483 "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
484 " occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
486 nzero, nnotone, atoms->nr);
490 fprintf(stderr, "All occupancies are one\n");
495 static void write_posres(const char* fn, t_atoms* pdba, real fc)
500 fp = gmx_fio_fopen(fn, "w");
502 "; In this topology include file, you will find position restraint\n"
503 "; entries for all the heavy atoms in your original pdb file.\n"
504 "; This means that all the protons which were added by pdb2gmx are\n"
505 "; not restrained.\n"
507 "[ position_restraints ]\n"
508 "; %4s%6s%8s%8s%8s\n",
509 "atom", "type", "fx", "fy", "fz");
510 for (i = 0; (i < pdba->nr); i++)
512 if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
514 fprintf(fp, "%6d%6d %g %g %g\n", i + 1, 1, fc, fc, fc);
520 static int read_pdball(const char* inf,
534 /* Read a pdb file. (containing proteins) */
536 int natom, new_natom, i;
539 printf("Reading %s...\n", inf);
540 readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
542 if (atoms->pdbinfo == nullptr)
544 snew(atoms->pdbinfo, atoms->nr);
546 if (fn2ftp(inf) == efPDB)
548 get_pdb_atomnumber(atoms, aps);
553 for (i = 0; i < atoms->nr; i++)
555 if (!is_hydrogen(*atoms->atomname[i]))
557 atoms->atom[new_natom] = atoms->atom[i];
558 atoms->atomname[new_natom] = atoms->atomname[i];
559 atoms->pdbinfo[new_natom] = atoms->pdbinfo[i];
560 copy_rvec((*x)[i], (*x)[new_natom]);
564 atoms->nr = new_natom;
571 printf(" '%s',", *title);
573 printf(" %d atoms\n", natom);
575 /* Rename residues */
576 rename_pdbres(atoms, "HOH", watres, false, symtab);
577 rename_pdbres(atoms, "SOL", watres, false, symtab);
578 rename_pdbres(atoms, "WAT", watres, false, symtab);
580 rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, rt, true, bVerbose);
588 write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
594 static void process_chain(t_atoms* pdba,
595 gmx::ArrayRef<gmx::RVec> x,
608 gmx::ArrayRef<const RtpRename> rr)
610 /* Rename aromatics, lys, asp and histidine */
613 rename_bb(pdba, "TYR", "TYRU", false, symtab);
617 rename_bb(pdba, "TRP", "TRPU", false, symtab);
621 rename_bb(pdba, "PHE", "PHEU", false, symtab);
625 rename_bbint(pdba, "LYS", get_lystp, false, symtab, rr);
629 rename_bbint(pdba, "ARG", get_argtp, false, symtab, rr);
633 rename_bbint(pdba, "GLN", get_glntp, false, symtab, rr);
637 rename_bbint(pdba, "ASP", get_asptp, false, symtab, rr);
641 rename_bb(pdba, "ASPH", "ASP", false, symtab);
645 rename_bbint(pdba, "GLU", get_glutp, false, symtab, rr);
649 rename_bb(pdba, "GLUH", "GLU", false, symtab);
654 set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
658 rename_bbint(pdba, "HIS", get_histp, true, symtab, rr);
661 /* Initialize the rtp builing block names with the residue names
662 * for the residues that have not been processed above.
664 pdbres_to_gmxrtp(pdba);
666 /* Now we have all rtp names set.
667 * The rtp names will conform to Gromacs naming,
668 * unless the input pdb file contained one or more force field specific
669 * rtp names as residue names.
673 /* struct for sorting the atoms from the pdb file */
676 int resnr; /* residue number */
677 int j; /* database order index */
678 int index; /* original atom number */
679 char anm1; /* second letter of atom name */
680 char altloc; /* alternate location indicator */
683 static bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
685 int d = (a.resnr - b.resnr);
691 d = (a.anm1 - b.anm1);
694 d = (a.altloc - b.altloc);
701 static void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
704 t_atoms** newPdbAtoms,
705 std::vector<gmx::RVec>* x,
709 t_atoms* pdba = *pdbaptr;
710 std::vector<gmx::RVec> xnew;
717 for (int i = 0; i < natoms; i++)
719 atomnm = *pdba->atomname[i];
720 const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
722 std::find_if(localPpResidue->atomname.begin(), localPpResidue->atomname.end(),
723 [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
724 if (found == localPpResidue->atomname.end())
729 "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
730 "while sorting atoms.\n%s",
731 atomnm, *pdba->resinfo[pdba->atom[i].resind].name,
732 pdba->resinfo[pdba->atom[i].resind].nr, localPpResidue->resname.c_str(),
733 localPpResidue->natom(),
735 ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
736 "might have had a different number in the PDB file and was rebuilt\n"
737 "(it might for instance have been H3, and we only expected H1 & "
739 "Note that hydrogens might have been added to the entry for the "
741 "Remove this hydrogen or choose a different protonation state to "
743 "Option -ignh will ignore all hydrogens in the input."
745 gmx_fatal(FARGS, "%s", buf);
747 /* make shadow array to be sorted into indexgroup */
748 pdbi[i].resnr = pdba->atom[i].resind;
749 pdbi[i].j = std::distance(localPpResidue->atomname.begin(), found);
751 pdbi[i].anm1 = atomnm[1];
752 pdbi[i].altloc = pdba->pdbinfo[i].altloc;
754 std::sort(pdbi, pdbi + natoms, pdbicomp);
756 /* pdba is sorted in pdbnew using the pdbi index */
757 std::vector<int> a(natoms);
758 srenew(*newPdbAtoms, 1);
759 init_t_atoms((*newPdbAtoms), natoms, true);
760 (*newPdbAtoms)->nr = pdba->nr;
761 (*newPdbAtoms)->nres = pdba->nres;
762 srenew((*newPdbAtoms)->resinfo, pdba->nres);
763 std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
764 for (int i = 0; i < natoms; i++)
766 (*newPdbAtoms)->atom[i] = pdba->atom[pdbi[i].index];
767 (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
768 (*newPdbAtoms)->pdbinfo[i] = pdba->pdbinfo[pdbi[i].index];
769 xnew.push_back(x->at(pdbi[i].index));
770 /* make indexgroup in block */
771 a[i] = pdbi[i].index;
776 /* copy the sorted pdbnew back to pdba */
777 *pdbaptr = *newPdbAtoms;
779 add_grp(block, gnames, a, "prot_sort");
783 static int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose)
785 int i, j, oldnatoms, ndel;
788 printf("Checking for duplicate atoms....\n");
789 oldnatoms = pdba->nr;
791 /* NOTE: pdba->nr is modified inside the loop */
792 for (i = 1; (i < pdba->nr); i++)
794 /* compare 'i' and 'i-1', throw away 'i' if they are identical
795 this is a 'while' because multiple alternate locations can be present */
796 while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
797 && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
802 ri = &pdba->resinfo[pdba->atom[i].resind];
803 printf("deleting duplicate atom %4s %s%4d%c", *pdba->atomname[i], *ri->name,
805 if (ri->chainid && (ri->chainid != ' '))
807 printf(" ch %c", ri->chainid);
811 if (pdba->pdbinfo[i].atomnr)
813 printf(" pdb nr %4d", pdba->pdbinfo[i].atomnr);
815 if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
817 printf(" altloc %c", pdba->pdbinfo[i].altloc);
823 /* We can not free, since it might be in the symtab */
824 /* sfree(pdba->atomname[i]); */
825 for (j = i; j < pdba->nr; j++)
827 pdba->atom[j] = pdba->atom[j + 1];
828 pdba->atomname[j] = pdba->atomname[j + 1];
831 pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
833 copy_rvec(x[j + 1], x[j]);
835 srenew(pdba->atom, pdba->nr);
836 /* srenew(pdba->atomname, pdba->nr); */
837 srenew(pdba->pdbinfo, pdba->nr);
840 if (pdba->nr != oldnatoms)
842 printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
848 static void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, ResidueType* rt)
850 std::string startResidueString =
851 gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
852 std::string endResidueString =
853 gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
855 // Check whether all residues in chain have the same chain ID.
856 bool allResiduesHaveSameChainID = true;
857 char chainID0 = pdba->resinfo[r0].chainid;
859 std::string residueString;
861 for (int i = r0 + 1; i < r1; i++)
863 chainID = pdba->resinfo[i].chainid;
864 if (chainID != chainID0)
866 allResiduesHaveSameChainID = false;
867 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
872 if (!allResiduesHaveSameChainID)
875 "The chain covering the range %s--%s does not have a consistent chain ID. "
876 "The first residue has ID '%c', while residue %s has ID '%c'.",
877 startResidueString.c_str(), endResidueString.c_str(), chainID0,
878 residueString.c_str(), chainID);
881 // At this point all residues have the same ID. If they are also non-blank
882 // we can be a bit more aggressive and require the types match too.
885 bool allResiduesHaveSameType = true;
887 std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
889 for (int i = r0 + 1; i < r1; i++)
891 restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
892 if (!gmx::equalCaseInsensitive(restype, restype0))
894 allResiduesHaveSameType = false;
895 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
900 if (!allResiduesHaveSameType)
903 "The residues in the chain %s--%s do not have a consistent type. "
904 "The first residue has type '%s', while residue %s is of type '%s'. "
905 "Either there is a mistake in your chain, or it includes nonstandard "
906 "residue names that have not yet been added to the residuetypes.dat "
907 "file in the GROMACS library directory. If there are other molecules "
908 "such as ligands, they should not have the same chain ID as the "
909 "adjacent protein chain since it's a separate molecule.",
910 startResidueString.c_str(), endResidueString.c_str(), restype0.c_str(),
911 residueString.c_str(), restype.c_str());
916 static void find_nc_ter(t_atoms* pdba, int r0, int r1, int* r_start, int* r_end, ResidueType* rt)
919 gmx::compat::optional<std::string> startrestype;
924 int startWarnings = 0;
928 // Check that all residues have the same chain identifier, and if it is
929 // non-blank we also require the residue types to match.
930 checkResidueTypeSanity(pdba, r0, r1, rt);
932 // If we return correctly from checkResidueTypeSanity(), the only
933 // remaining cases where we can have non-matching residue types is if
934 // the chain ID was blank, which could be the case e.g. for a structure
935 // read from a GRO file or other file types without chain information.
936 // In that case we need to be a bit more liberal and detect chains based
937 // on the residue type.
939 // If we get here, the chain ID must be identical for all residues
940 char chainID = pdba->resinfo[r0].chainid;
942 /* Find the starting terminus (typially N or 5') */
943 for (i = r0; i < r1 && *r_start == -1; i++)
945 startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
950 if (gmx::equalCaseInsensitive(*startrestype, "Protein")
951 || gmx::equalCaseInsensitive(*startrestype, "DNA")
952 || gmx::equalCaseInsensitive(*startrestype, "RNA"))
954 printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name,
955 pdba->resinfo[i].nr);
958 else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
962 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n",
963 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
967 printf("Disabling further notes about ions.\n");
973 // Either no known residue type, or one not needing special handling
974 if (startWarnings < 5)
978 printf("\nWarning: Starting residue %s%d in chain not identified as "
980 "This chain lacks identifiers, which makes it impossible to do strict\n"
981 "classification of the start/end residues. Here we need to guess this "
983 "should not be part of the chain and instead introduce a break, but "
985 "be catastrophic if they should in fact be linked. Please check your "
987 "and add %s to residuetypes.dat if this was not correct.\n\n",
988 *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
992 printf("\nWarning: No residues in chain starting at %s%d identified as "
994 "This makes it impossible to link them into a molecule, which could "
996 "correct or a catastrophic error. Please check your structure, and add "
998 "necessary residue names to residuetypes.dat if this was not "
1000 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1003 if (startWarnings == 4)
1005 printf("Disabling further warnings about unidentified residues at start of "
1014 /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1015 for (int i = *r_start; i < r1; i++)
1017 gmx::compat::optional<std::string> restype =
1018 rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1023 if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
1027 else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1031 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n",
1032 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1036 printf("Disabling further notes about ions.\n");
1042 // Either no known residue type, or one not needing special handling.
1043 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1044 // Otherwise the call to checkResidueTypeSanity() will
1045 // have caught the problem.
1046 if (endWarnings < 5)
1048 printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1049 "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1050 "it impossible to do strict classification of the start/end residues. "
1052 "need to guess this residue should not be part of the chain and "
1054 "introduce a break, but that will be catastrophic if they should in "
1056 "linked. Please check your structure, and add %s to residuetypes.dat\n"
1057 "if this was not correct.\n\n",
1058 *pdba->resinfo[i].name, pdba->resinfo[i].nr, restype->c_str(),
1059 *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr,
1060 startrestype->c_str(), *pdba->resinfo[i].name);
1062 if (endWarnings == 4)
1064 printf("Disabling further warnings about unidentified residues at end of "
1074 printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name,
1075 pdba->resinfo[*r_end].nr);
1079 /* enum for chain separation */
1082 enChainSep_id_or_ter,
1083 enChainSep_id_and_ter,
1086 enChainSep_interactive
1088 static const char* ChainSepEnum[] = { "id_or_ter", "id_and_ter", "ter", "id", "interactive" };
1089 static const char* ChainSepInfoString[] = {
1090 "Splitting chemical chains based on TER records or chain id changing.\n",
1091 "Splitting chemical chains based on TER records and chain id changing.\n",
1092 "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1093 "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1094 "Splitting chemical chains interactively.\n"
1097 static void modify_chain_numbers(t_atoms* pdba, ChainSepType enumChainSep)
1100 char old_prev_chainid;
1101 char old_this_chainid;
1102 int old_prev_chainnum;
1103 int old_this_chainnum;
1105 char select[STRLEN];
1109 const char* prev_atomname;
1110 const char* this_atomname;
1111 const char* prev_resname;
1112 const char* this_resname;
1118 /* The default chain enumeration is based on TER records only */
1119 printf("%s", ChainSepInfoString[enumChainSep]);
1121 old_prev_chainid = '?';
1122 old_prev_chainnum = -1;
1125 this_atomname = nullptr;
1127 this_resname = nullptr;
1131 for (i = 0; i < pdba->nres; i++)
1133 ri = &pdba->resinfo[i];
1134 old_this_chainid = ri->chainid;
1135 old_this_chainnum = ri->chainnum;
1137 prev_atomname = this_atomname;
1138 prev_atomnum = this_atomnum;
1139 prev_resname = this_resname;
1140 prev_resnum = this_resnum;
1141 prev_chainid = this_chainid;
1143 this_atomname = *(pdba->atomname[i]);
1144 this_atomnum = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1145 this_resname = *ri->name;
1146 this_resnum = ri->nr;
1147 this_chainid = ri->chainid;
1149 switch (enumChainSep)
1151 case enChainSep_id_or_ter:
1152 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1158 case enChainSep_id_and_ter:
1159 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1166 if (old_this_chainid != old_prev_chainid)
1172 case enChainSep_ter:
1173 if (old_this_chainnum != old_prev_chainnum)
1178 case enChainSep_interactive:
1179 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1183 printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1185 "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1186 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1187 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1189 if (nullptr == fgets(select, STRLEN - 1, stdin))
1191 gmx_fatal(FARGS, "Error reading from stdin");
1194 if (i == 0 || select[0] == 'y')
1200 default: gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1202 old_prev_chainid = old_this_chainid;
1203 old_prev_chainnum = old_this_chainnum;
1205 ri->chainnum = new_chainnum;
1212 char chainnum = ' ';
1215 bool bAllWat = false;
1217 std::vector<int> chainstart;
1224 bool bAllWat = false;
1226 std::vector<int> chainstart;
1227 std::vector<MoleculePatchDatabase*> ntdb;
1228 std::vector<MoleculePatchDatabase*> ctdb;
1229 std::vector<int> r_start;
1230 std::vector<int> r_end;
1232 std::vector<gmx::RVec> x;
1235 // TODO make all enums into scoped enums
1236 /* enum for vsites */
1243 static const char* VSitesEnum[] = { "none", "hydrogens", "aromatics" };
1245 /* enum for water model */
1257 static const char* WaterEnum[] = { "select", "none", "spc", "spce",
1258 "tip3p", "tip4p", "tip5p", "tips3p" };
1260 /* enum for merge */
1267 static const char* MergeEnum[] = { "no", "all", "interactive" };
1275 class pdb2gmx : public ICommandLineOptionsModule
1281 bVsiteAromatics_(FALSE),
1282 enumChainSep_(enChainSep_id_or_ter),
1283 enumVSites_(enVSites_none),
1284 enumWater_(enWater_select),
1285 enumMerge_(enMerge_no),
1291 // From ICommandLineOptionsModule
1292 void init(CommandLineModuleSettings* /*settings*/) override {}
1294 void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1296 void optionsFinished() override;
1314 bool bAllowMissing_;
1318 bool bChargeGroups_;
1328 bool bVsiteAromatics_;
1332 real long_bond_dist_;
1333 real short_bond_dist_;
1335 std::string indexOutputFile_;
1336 std::string outputFile_;
1337 std::string topologyFile_;
1338 std::string includeTopologyFile_;
1339 std::string outputConfFile_;
1340 std::string inputConfFile_;
1341 std::string outFile_;
1344 ChainSepType enumChainSep_;
1345 VSitesType enumVSites_;
1346 WaterType enumWater_;
1347 MergeType enumMerge_;
1350 char forcefield_[STRLEN];
1351 char ffdir_[STRLEN];
1354 std::vector<std::string> incls_;
1355 std::vector<t_mols> mols_;
1359 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1361 const char* desc[] = {
1362 "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1363 "some database files, adds hydrogens to the molecules and generates",
1364 "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1365 "and a topology in GROMACS format.",
1366 "These files can subsequently be processed to generate a run input file.",
1368 "[THISMODULE] will search for force fields by looking for",
1369 "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1370 "of the current working directory and of the GROMACS library directory",
1371 "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1373 "By default the forcefield selection is interactive,",
1374 "but you can use the [TT]-ff[tt] option to specify one of the short names",
1375 "in the list on the command line instead. In that case [THISMODULE] just looks",
1376 "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1378 "After choosing a force field, all files will be read only from",
1379 "the corresponding force field directory.",
1380 "If you want to modify or add a residue types, you can copy the force",
1381 "field directory from the GROMACS library directory to your current",
1382 "working directory. If you want to add new protein residue types,",
1383 "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1384 "or copy the whole library directory to a local directory and set",
1385 "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1386 "Check Chapter 5 of the manual for more information about file formats.",
1389 "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1390 "need not necessarily contain a protein structure. Every kind of",
1391 "molecule for which there is support in the database can be converted.",
1392 "If there is no support in the database, you can add it yourself.[PAR]",
1394 "The program has limited intelligence, it reads a number of database",
1395 "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1396 "if necessary this can be done manually. The program can prompt the",
1397 "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1398 "desired. For Lys the choice is between neutral (two protons on NZ) or",
1399 "protonated (three protons, default), for Asp and Glu unprotonated",
1400 "(default) or protonated, for His the proton can be either on ND1,",
1401 "on NE2 or on both. By default these selections are done automatically.",
1402 "For His, this is based on an optimal hydrogen bonding",
1403 "conformation. Hydrogen bonds are defined based on a simple geometric",
1404 "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1405 "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1406 "[TT]-dist[tt] respectively.[PAR]",
1408 "The protonation state of N- and C-termini can be chosen interactively",
1409 "with the [TT]-ter[tt] flag. Default termini are ionized (NH3+ and COO-),",
1410 "respectively. Some force fields support zwitterionic forms for chains of",
1411 "one residue, but for polypeptides these options should NOT be selected.",
1412 "The AMBER force fields have unique forms for the terminal residues,",
1413 "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1414 "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1415 "respectively to use these forms, making sure you preserve the format",
1416 "of the coordinate file. Alternatively, use named terminating residues",
1417 "(e.g. ACE, NME).[PAR]",
1419 "The separation of chains is not entirely trivial since the markup",
1420 "in user-generated PDB files frequently varies and sometimes it",
1421 "is desirable to merge entries across a TER record, for instance",
1422 "if you want a disulfide bridge or distance restraints between",
1423 "two protein chains or if you have a HEME group bound to a protein.",
1424 "In such cases multiple chains should be contained in a single",
1425 "[TT]moleculetype[tt] definition.",
1426 "To handle this, [THISMODULE] uses two separate options.",
1427 "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1428 "start, and termini added when applicable. This can be done based on the",
1429 "existence of TER records, when the chain id changes, or combinations of either",
1430 "or both of these. You can also do the selection fully interactively.",
1431 "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1432 "are merged into one moleculetype, after adding all the chemical termini (or not).",
1433 "This can be turned off (no merging), all non-water chains can be merged into a",
1434 "single molecule, or the selection can be done interactively.[PAR]",
1436 "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1437 "If any of the occupancies are not one, indicating that the atom is",
1438 "not resolved well in the structure, a warning message is issued.",
1439 "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1440 "all occupancy fields may be zero. Either way, it is up to the user",
1441 "to verify the correctness of the input data (read the article!).[PAR]",
1443 "During processing the atoms will be reordered according to GROMACS",
1444 "conventions. With [TT]-n[tt] an index file can be generated that",
1445 "contains one group reordered in the same way. This allows you to",
1446 "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1447 "one limitation: reordering is done after the hydrogens are stripped",
1448 "from the input and before new hydrogens are added. This means that",
1449 "you should not use [TT]-ignh[tt].[PAR]",
1451 "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1452 "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1453 "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1456 "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1457 "motions. Angular and out-of-plane motions can be removed by changing",
1458 "hydrogens into virtual sites and fixing angles, which fixes their",
1459 "position relative to neighboring atoms. Additionally, all atoms in the",
1460 "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1461 "can be converted into virtual sites, eliminating the fast improper dihedral",
1462 "fluctuations in these rings (but this feature is deprecated).",
1463 "[BB]Note[bb] that in this case all other hydrogen",
1464 "atoms are also converted to virtual sites. The mass of all atoms that are",
1465 "converted into virtual sites, is added to the heavy atoms.[PAR]",
1466 "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1467 "done by increasing the hydrogen-mass by a factor of 4. This is also",
1468 "done for water hydrogens to slow down the rotational motion of water.",
1469 "The increase in mass of the hydrogens is subtracted from the bonded",
1470 "(heavy) atom so that the total mass of the system remains the same."
1473 settings->setHelpText(desc);
1475 options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1476 "Write the residue database in new format to [TT]new.rtp[tt]"));
1478 RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1480 RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1482 EnumOption<ChainSepType>("chainsep").enumValue(ChainSepEnum).store(&enumChainSep_).description("Condition in PDB files when a new chain should be started (adding termini)"));
1483 options->addOption(EnumOption<MergeType>("merge")
1484 .enumValue(MergeEnum)
1486 .description("Merge multiple chains into a single [moleculetype]"));
1487 options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1488 "Force field, interactive by default. Use [TT]-h[tt] for information."));
1490 EnumOption<WaterType>("water").store(&enumWater_).enumValue(WaterEnum).description("Water model to use"));
1491 options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1492 "Set the next 8 options to interactive"));
1493 options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1494 "Interactive SS bridge selection"));
1495 options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1496 "Interactive termini selection, instead of charged (default)"));
1497 options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1498 "Interactive lysine selection, instead of charged"));
1499 options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1500 "Interactive arginine selection, instead of charged"));
1501 options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1502 "Interactive aspartic acid selection, instead of charged"));
1503 options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1504 "Interactive glutamic acid selection, instead of charged"));
1505 options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1506 "Interactive glutamine selection, instead of charged"));
1507 options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1508 "Interactive histidine selection, instead of checking H-bonds"));
1509 options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1510 "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1512 RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1513 options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1514 "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1516 options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1517 "Sort the residues according to database, turning this off is dangerous as charge "
1518 "groups might be broken in parts"));
1520 BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1522 BooleanOption("missing")
1523 .store(&bAllowMissing_)
1524 .defaultValue(false)
1526 "Continue when atoms are missing and bonds cannot be made, dangerous"));
1528 BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1530 RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1532 EnumOption<VSitesType>("vsite").store(&enumVSites_).enumValue(VSitesEnum).description("Convert atoms to virtual sites"));
1533 options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1534 "Make hydrogen atoms heavy"));
1536 BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1537 options->addOption(BooleanOption("chargegrp")
1538 .store(&bChargeGroups_)
1540 .description("Use charge groups in the [REF].rtp[ref] file"));
1541 options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1542 "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1543 options->addOption(BooleanOption("renum")
1545 .defaultValue(false)
1546 .description("Renumber the residues consecutively in the output"));
1547 options->addOption(BooleanOption("rtpres")
1548 .store(&bRTPresname_)
1549 .defaultValue(false)
1550 .description("Use [REF].rtp[ref] entry names as residue names"));
1551 options->addOption(FileNameOption("f")
1554 .store(&inputConfFile_)
1556 .defaultBasename("protein")
1558 .description("Structure file"));
1559 options->addOption(FileNameOption("o")
1562 .store(&outputConfFile_)
1564 .defaultBasename("conf")
1565 .description("Structure file"));
1566 options->addOption(FileNameOption("p")
1569 .store(&topologyFile_)
1571 .defaultBasename("topol")
1572 .description("Topology file"));
1573 options->addOption(FileNameOption("i")
1576 .store(&includeTopologyFile_)
1578 .defaultBasename("posre")
1579 .description("Include file for topology"));
1580 options->addOption(FileNameOption("n")
1583 .store(&indexOutputFile_)
1584 .storeIsSet(&bIndexSet_)
1585 .defaultBasename("index")
1586 .description("Index file"));
1587 options->addOption(FileNameOption("q")
1591 .storeIsSet(&bOutputSet_)
1592 .defaultBasename("clean")
1594 .description("Structure file"));
1597 void pdb2gmx::optionsFinished()
1599 if (inputConfFile_.empty())
1601 GMX_THROW(InconsistentInputError("You must supply an input file"));
1605 /* if anything changes here, also change description of -inter */
1620 else if (bDeuterate_)
1629 /* Force field selection, interactive or direct */
1630 choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(), forcefield_,
1631 sizeof(forcefield_), ffdir_, sizeof(ffdir_));
1633 if (strlen(forcefield_) > 0)
1635 ffname_ = forcefield_;
1636 ffname_[0] = std::toupper(ffname_[0]);
1640 gmx_fatal(FARGS, "Empty forcefield string");
1646 char select[STRLEN];
1647 std::vector<DisulfideBond> ssbonds;
1651 const char* prev_atomname;
1652 const char* this_atomname;
1653 const char* prev_resname;
1654 const char* this_resname;
1659 int prev_chainnumber;
1660 int this_chainnumber;
1661 int this_chainstart;
1662 int prev_chainstart;
1664 printf("\nUsing the %s force field in directory %s\n\n", ffname_, ffdir_);
1666 choose_watermodel(WaterEnum[enumWater_], ffdir_, &watermodel_);
1668 switch (enumVSites_)
1672 bVsiteAromatics_ = false;
1674 case enVSites_hydrogens:
1676 bVsiteAromatics_ = false;
1678 case enVSites_aromatics:
1680 bVsiteAromatics_ = true;
1683 gmx_fatal(FARGS, "Internal inconsistency: VSitesEnum='%s'", VSitesEnum[enumVSites_]);
1686 /* Open the symbol table */
1688 open_symtab(&symtab);
1690 /* Residue type database */
1693 /* Read residue renaming database(s), if present */
1694 std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1696 std::vector<RtpRename> rtprename;
1697 for (const auto& filename : rrn)
1699 printf("going to rename %s\n", filename.c_str());
1700 FILE* fp = fflib_open(filename);
1701 read_rtprename(filename.c_str(), fp, &rtprename);
1705 /* Add all alternative names from the residue renaming database to the list
1706 of recognized amino/nucleic acids. */
1707 for (const auto& rename : rtprename)
1709 /* Only add names if the 'standard' gromacs/iupac base name was found */
1710 if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
1712 rt.addResidue(rename.main, *restype);
1713 rt.addResidue(rename.nter, *restype);
1714 rt.addResidue(rename.cter, *restype);
1715 rt.addResidue(rename.bter, *restype);
1722 if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
1726 else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
1736 char* title = nullptr;
1740 int natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(), &title, &pdba_all,
1741 &pdbx, &ePBC, box, bRemoveH_, &symtab, &rt, watres, &aps, bVerbose_);
1745 std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1746 GMX_THROW(InconsistentInputError(message));
1749 printf("Analyzing pdb file\n");
1750 int nwaterchain = 0;
1752 modify_chain_numbers(&pdba_all, enumChainSep_);
1754 int nchainmerges = 0;
1756 this_atomname = nullptr;
1758 this_resname = nullptr;
1761 this_chainnumber = -1;
1762 this_chainstart = 0;
1763 /* Keep the compiler happy */
1764 prev_chainstart = 0;
1767 std::vector<t_pdbchain> pdb_ch;
1770 bool bMerged = false;
1771 for (int i = 0; (i < natom); i++)
1773 ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1775 /* TODO this should live in a helper object, and consolidate
1776 that with code in modify_chain_numbers */
1777 prev_atomname = this_atomname;
1778 prev_atomnum = this_atomnum;
1779 prev_resname = this_resname;
1780 prev_resnum = this_resnum;
1781 prev_chainid = this_chainid;
1782 prev_chainnumber = this_chainnumber;
1785 prev_chainstart = this_chainstart;
1788 this_atomname = *pdba_all.atomname[i];
1789 this_atomnum = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
1790 this_resname = *ri->name;
1791 this_resnum = ri->nr;
1792 this_chainid = ri->chainid;
1793 this_chainnumber = ri->chainnum;
1795 bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
1797 if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1801 "Must have pdbinfo from reading a PDB file if chain number is changing");
1802 this_chainstart = pdba_all.atom[i].resind;
1804 if (i > 0 && !bWat_)
1806 if (!strncmp(MergeEnum[enumMerge_], "int", 3))
1808 printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and "
1809 "chain starting with\n"
1810 "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype "
1811 "(keeping termini)? [n/y]\n",
1812 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1813 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1815 if (nullptr == fgets(select, STRLEN - 1, stdin))
1817 gmx_fatal(FARGS, "Error reading from stdin");
1819 bMerged = (select[0] == 'y');
1821 else if (!strncmp(MergeEnum[enumMerge_], "all", 3))
1829 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
1830 pdba_all.atom[i].resind - prev_chainstart;
1831 pdb_ch[numChains - 1].nterpairs++;
1832 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
1837 /* set natom for previous chain */
1840 pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
1847 /* check if chain identifier was used before */
1848 for (int j = 0; (j < numChains); j++)
1850 if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1852 printf("WARNING: Chain identifier '%c' is used in two non-sequential "
1854 "They will be treated as separate chains unless you reorder your "
1859 t_pdbchain newChain;
1860 newChain.chainid = ri->chainid;
1861 newChain.chainnum = ri->chainnum;
1863 newChain.bAllWat = bWat_;
1866 newChain.nterpairs = 0;
1870 newChain.nterpairs = 1;
1872 newChain.chainstart.resize(newChain.nterpairs + 1);
1873 /* modified [numChains] to [0] below */
1874 newChain.chainstart[0] = 0;
1875 pdb_ch.push_back(newChain);
1881 pdb_ch.back().natom = natom - pdb_ch.back().start;
1883 /* set all the water blocks at the end of the chain */
1884 std::vector<int> swap_index(numChains);
1886 for (int i = 0; i < numChains; i++)
1888 if (!pdb_ch[i].bAllWat)
1894 for (int i = 0; i < numChains; i++)
1896 if (pdb_ch[i].bAllWat)
1902 if (nwaterchain > 1)
1904 printf("Moved all the water blocks to the end\n");
1908 std::vector<t_chain> chains(numChains);
1909 /* copy pdb data and x for all chains */
1910 for (int i = 0; (i < numChains); i++)
1912 int si = swap_index[i];
1913 chains[i].chainid = pdb_ch[si].chainid;
1914 chains[i].chainnum = pdb_ch[si].chainnum;
1915 chains[i].bAllWat = pdb_ch[si].bAllWat;
1916 chains[i].nterpairs = pdb_ch[si].nterpairs;
1917 chains[i].chainstart = pdb_ch[si].chainstart;
1918 chains[i].ntdb.clear();
1919 chains[i].ctdb.clear();
1920 chains[i].r_start.resize(pdb_ch[si].nterpairs);
1921 chains[i].r_end.resize(pdb_ch[si].nterpairs);
1923 snew(chains[i].pdba, 1);
1924 init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
1925 for (j = 0; j < chains[i].pdba->nr; j++)
1927 chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start + j];
1928 chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
1929 chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
1930 chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
1932 /* Re-index the residues assuming that the indices are continuous */
1933 int k = chains[i].pdba->atom[0].resind;
1934 int nres = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
1935 chains[i].pdba->nres = nres;
1936 for (int j = 0; j < chains[i].pdba->nr; j++)
1938 chains[i].pdba->atom[j].resind -= k;
1940 srenew(chains[i].pdba->resinfo, nres);
1941 for (int j = 0; j < nres; j++)
1943 chains[i].pdba->resinfo[j] = pdba_all.resinfo[k + j];
1944 chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
1945 /* make all chain identifiers equal to that of the chain */
1946 chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1950 if (nchainmerges > 0)
1952 printf("\nMerged chains into joint molecule definitions at %d places.\n\n", nchainmerges);
1955 printf("There are %d chains and %d blocks of water and "
1956 "%d residues with %d atoms\n",
1957 numChains - nwaterchain, nwaterchain, pdba_all.nres, natom);
1959 printf("\n %5s %4s %6s\n", "chain", "#res", "#atoms");
1960 for (int i = 0; (i < numChains); i++)
1962 printf(" %d '%c' %5d %6d %s\n", i + 1, chains[i].chainid ? chains[i].chainid : '-',
1963 chains[i].pdba->nres, chains[i].pdba->nr, chains[i].bAllWat ? "(only water)" : "");
1967 check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_);
1969 /* Read atomtypes... */
1970 PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
1972 /* read residue database */
1973 printf("Reading residue database... (%s)\n", forcefield_);
1974 std::vector<std::string> rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
1975 std::vector<PreprocessResidue> rtpFFDB;
1976 for (const auto& filename : rtpf)
1978 readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, false);
1982 /* Not correct with multiple rtp input files with different bonded types */
1983 FILE* fp = gmx_fio_fopen("new.rtp", "w");
1984 print_resall(fp, rtpFFDB, atype);
1988 /* read hydrogen database */
1989 std::vector<MoleculePatchDatabase> ah;
1990 read_h_db(ffdir_, &ah);
1992 /* Read Termini database... */
1993 std::vector<MoleculePatchDatabase> ntdb;
1994 std::vector<MoleculePatchDatabase> ctdb;
1995 std::vector<MoleculePatchDatabase*> tdblist;
1996 int nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
1997 int nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
1999 FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2001 print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2004 std::vector<gmx::RVec> x;
2005 /* new pdb datastructure for sorting. */
2006 t_atoms** sortAtoms = nullptr;
2007 t_atoms** localAtoms = nullptr;
2008 snew(sortAtoms, numChains);
2009 snew(localAtoms, numChains);
2010 for (int chain = 0; (chain < numChains); chain++)
2012 cc = &(chains[chain]);
2014 /* set pdba, natom and nres to the current chain */
2017 natom = cc->pdba->nr;
2018 int nres = cc->pdba->nres;
2020 if (cc->chainid && (cc->chainid != ' '))
2022 printf("Processing chain %d '%c' (%d atoms, %d residues)\n", chain + 1, cc->chainid,
2027 printf("Processing chain %d (%d atoms, %d residues)\n", chain + 1, natom, nres);
2030 process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_, bHisMan_,
2031 bArgMan_, bGlnMan_, angle_, distance_, &symtab, rtprename);
2033 cc->chainstart[cc->nterpairs] = pdba->nres;
2035 for (int i = 0; i < cc->nterpairs; i++)
2037 find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i + 1], &(cc->r_start[j]),
2038 &(cc->r_end[j]), &rt);
2040 if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2046 if (cc->nterpairs == 0)
2048 printf("Problem with chain definition, or missing terminal residues.\n"
2049 "This chain does not appear to contain a recognized chain molecule.\n"
2050 "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
2053 /* Check for disulfides and other special bonds */
2054 ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2056 if (!rtprename.empty())
2058 rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_);
2061 for (int i = 0; i < cc->nterpairs; i++)
2064 * We first apply a filter so we only have the
2065 * termini that can be applied to the residue in question
2066 * (or a generic terminus if no-residue specific is available).
2068 /* First the N terminus */
2071 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2072 if (tdblist.empty())
2074 printf("No suitable end (N or 5') terminus found in database - assuming this "
2076 "is already in a terminus-specific form and skipping terminus "
2078 cc->ntdb.push_back(nullptr);
2082 if (bTerMan_ && !tdblist.empty())
2084 sprintf(select, "Select start terminus type for %s-%d",
2085 *pdba->resinfo[cc->r_start[i]].name, pdba->resinfo[cc->r_start[i]].nr);
2086 cc->ntdb.push_back(choose_ter(tdblist, select));
2090 cc->ntdb.push_back(tdblist[0]);
2093 printf("Start terminus %s-%d: %s\n", *pdba->resinfo[cc->r_start[i]].name,
2094 pdba->resinfo[cc->r_start[i]].nr, (cc->ntdb[i])->name.c_str());
2100 cc->ntdb.push_back(nullptr);
2103 /* And the C terminus */
2106 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2107 if (tdblist.empty())
2109 printf("No suitable end (C or 3') terminus found in database - assuming this "
2111 "is already in a terminus-specific form and skipping terminus "
2113 cc->ctdb.push_back(nullptr);
2117 if (bTerMan_ && !tdblist.empty())
2119 sprintf(select, "Select end terminus type for %s-%d",
2120 *pdba->resinfo[cc->r_end[i]].name, pdba->resinfo[cc->r_end[i]].nr);
2121 cc->ctdb.push_back(choose_ter(tdblist, select));
2125 cc->ctdb.push_back(tdblist[0]);
2127 printf("End terminus %s-%d: %s\n", *pdba->resinfo[cc->r_end[i]].name,
2128 pdba->resinfo[cc->r_end[i]].nr, (cc->ctdb[i])->name.c_str());
2134 cc->ctdb.push_back(nullptr);
2137 std::vector<MoleculePatchDatabase> hb_chain;
2138 /* lookup hackblocks and rtp for all residues */
2139 std::vector<PreprocessResidue> restp_chain;
2140 get_hackblocks_rtp(&hb_chain, &restp_chain, rtpFFDB, pdba->nres, pdba->resinfo, cc->nterpairs,
2141 &symtab, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_);
2142 /* ideally, now we would not need the rtp itself anymore, but do
2143 everything using the hb and restp arrays. Unfortunately, that
2144 requires some re-thinking of code in gen_vsite.c, which I won't
2145 do now :( AF 26-7-99 */
2147 rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2149 match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_);
2154 t_blocka* block = new_blocka();
2156 sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2157 remove_duplicate_atoms(pdba, x, bVerbose_);
2163 "WARNING: with the -remh option the generated "
2164 "index file (%s) might be useless\n"
2165 "(the index file is generated before hydrogens are added)",
2166 indexOutputFile_.c_str());
2168 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2170 for (int i = 0; i < block->nr; i++)
2182 "without sorting no check for duplicate atoms can be done\n");
2185 /* Generate Hydrogen atoms (and termini) in the sequence */
2186 printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2187 add_h(&pdba, &localAtoms[chain], &x, ah, &symtab, cc->nterpairs, cc->ntdb, cc->ctdb,
2188 cc->r_start, cc->r_end, bAllowMissing_);
2189 printf("Now there are %d residues with %d atoms\n", pdba->nres, pdba->nr);
2191 /* make up molecule name(s) */
2193 int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2195 std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2197 std::string molname;
2205 this_chainid = cc->chainid;
2207 /* Add the chain id if we have one */
2208 if (this_chainid != ' ')
2210 suffix.append(formatString("_chain_%c", this_chainid));
2213 /* Check if there have been previous chains with the same id */
2215 for (int k = 0; k < chain; k++)
2217 if (cc->chainid == chains[k].chainid)
2222 /* Add the number for this chain identifier if there are multiple copies */
2225 suffix.append(formatString("%d", nid_used + 1));
2228 if (suffix.length() > 0)
2230 molname.append(restype);
2231 molname.append(suffix);
2238 std::string itp_fn = topologyFile_;
2240 std::string posre_fn = includeTopologyFile_;
2241 if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2244 printf("Chain time...\n");
2245 // construct the itp file name
2246 itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2248 itp_fn.append(molname);
2249 itp_fn.append(".itp");
2250 // now do the same for posre
2251 posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2252 posre_fn.append("_");
2253 posre_fn.append(molname);
2254 posre_fn.append(".itp");
2255 if (posre_fn == itp_fn)
2257 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2259 incls_.emplace_back();
2260 incls_.back() = itp_fn;
2261 itp_file_ = gmx_fio_fopen(itp_fn.c_str(), "w");
2268 mols_.emplace_back();
2271 mols_.back().name = "SOL";
2272 mols_.back().nr = pdba->nres;
2276 mols_.back().name = molname;
2277 mols_.back().nr = 1;
2282 print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2288 top_file2 = nullptr;
2292 top_file2 = itp_file_;
2296 top_file2 = top_file;
2299 pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, &atype, &symtab, rtpFFDB,
2300 restp_chain, hb_chain, bAllowMissing_, bVsites_, bVsiteAromatics_, ffdir_, mHmult_,
2301 ssbonds, long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2302 bRenumRes_, bRTPresname_);
2306 write_posres(posre_fn.c_str(), pdba, posre_fc_);
2311 gmx_fio_fclose(itp_file_);
2314 /* pdba and natom have been reassigned somewhere so: */
2319 if (watermodel_ == nullptr)
2321 for (int chain = 0; chain < numChains; chain++)
2323 if (chains[chain].bAllWat)
2325 auto message = formatString(
2326 "You have chosen not to include a water model, "
2327 "but there is water in the input file. Select a "
2328 "water model or remove the water from your input file.");
2329 GMX_THROW(InconsistentInputError(message));
2335 std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2336 if (!fflib_fexist(waterFile))
2338 auto message = formatString(
2339 "The topology file '%s' for the selected water "
2340 "model '%s' can not be found in the force field "
2341 "directory. Select a different water model.",
2342 waterFile.c_str(), watermodel_);
2343 GMX_THROW(InconsistentInputError(message));
2347 print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2348 gmx_fio_fclose(top_file);
2350 /* now merge all chains back together */
2353 for (int i = 0; (i < numChains); i++)
2355 natom += chains[i].pdba->nr;
2356 nres += chains[i].pdba->nres;
2360 init_t_atoms(atoms, natom, false);
2361 for (int i = 0; i < atoms->nres; i++)
2363 sfree(atoms->resinfo[i].name);
2366 srenew(atoms->resinfo, nres);
2370 for (int i = 0; (i < numChains); i++)
2374 printf("Including chain %d in system: %d atoms %d residues\n", i + 1,
2375 chains[i].pdba->nr, chains[i].pdba->nres);
2377 for (int j = 0; (j < chains[i].pdba->nr); j++)
2379 atoms->atom[k] = chains[i].pdba->atom[j];
2380 atoms->atom[k].resind += l; /* l is processed nr of residues */
2381 atoms->atomname[k] = chains[i].pdba->atomname[j];
2382 atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2383 x.push_back(chains[i].x[j]);
2386 for (int j = 0; (j < chains[i].pdba->nres); j++)
2388 atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2391 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2395 done_atom(chains[i].pdba);
2400 fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2401 print_sums(atoms, true);
2405 fprintf(stderr, "\nWriting coordinate file...\n");
2406 clear_rvec(box_space);
2409 make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2411 write_sto_conf(outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr, ePBC, box);
2413 done_symtab(&symtab);
2414 done_atom(&pdba_all);
2416 for (int chain = 0; chain < numChains; chain++)
2418 sfree(sortAtoms[chain]);
2419 sfree(localAtoms[chain]);
2426 printf("\t\t--------- PLEASE NOTE ------------\n");
2427 printf("You have successfully generated a topology from: %s.\n", inputConfFile_.c_str());
2428 if (watermodel_ != nullptr)
2430 printf("The %s force field and the %s water model are used.\n", ffname_, watermodel_);
2435 printf("The %s force field is used.\n", ffname_);
2437 printf("\t\t--------- ETON ESAELP ------------\n");
2444 const char pdb2gmxInfo::name[] = "pdb2gmx";
2445 const char pdb2gmxInfo::shortDescription[] =
2446 "Convert coordinate files to topology and FF-compliant coordinate files";
2447 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2449 return std::make_unique<pdb2gmx>();