2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
51 #include "gromacs/commandline/cmdlineoptionsmodule.h"
52 #include "gromacs/fileio/confio.h"
53 #include "gromacs/fileio/filetypes.h"
54 #include "gromacs/fileio/gmxfio.h"
55 #include "gromacs/fileio/pdbio.h"
56 #include "gromacs/gmxlib/conformation-utilities.h"
57 #include "gromacs/gmxpreprocess/fflibutil.h"
58 #include "gromacs/gmxpreprocess/genhydro.h"
59 #include "gromacs/gmxpreprocess/grompp-impl.h"
60 #include "gromacs/gmxpreprocess/h_db.h"
61 #include "gromacs/gmxpreprocess/hackblock.h"
62 #include "gromacs/gmxpreprocess/hizzie.h"
63 #include "gromacs/gmxpreprocess/pdb2top.h"
64 #include "gromacs/gmxpreprocess/pgutil.h"
65 #include "gromacs/gmxpreprocess/resall.h"
66 #include "gromacs/gmxpreprocess/specbond.h"
67 #include "gromacs/gmxpreprocess/ter_db.h"
68 #include "gromacs/gmxpreprocess/toputil.h"
69 #include "gromacs/gmxpreprocess/xlate.h"
70 #include "gromacs/math/vec.h"
71 #include "gromacs/options/basicoptions.h"
72 #include "gromacs/options/filenameoption.h"
73 #include "gromacs/options/ioptionscontainer.h"
74 #include "gromacs/topology/atomprop.h"
75 #include "gromacs/topology/block.h"
76 #include "gromacs/topology/index.h"
77 #include "gromacs/topology/residuetypes.h"
78 #include "gromacs/topology/symtab.h"
79 #include "gromacs/topology/topology.h"
80 #include "gromacs/utility/dir_separator.h"
81 #include "gromacs/utility/exceptions.h"
82 #include "gromacs/utility/fatalerror.h"
83 #include "gromacs/utility/path.h"
84 #include "gromacs/utility/smalloc.h"
85 #include "gromacs/utility/strdb.h"
86 #include "gromacs/utility/stringutil.h"
90 char gmx[RTP_MAXCHAR+2];
91 char main[RTP_MAXCHAR+2];
92 char nter[RTP_MAXCHAR+2];
93 char cter[RTP_MAXCHAR+2];
94 char bter[RTP_MAXCHAR+2];
97 static const char *res2bb_notermini(const char *name,
98 int nrr, const rtprename_t *rr)
100 /* NOTE: This function returns the main building block name,
101 * it does not take terminal renaming into account.
106 while (i < nrr && !gmx::equalCaseInsensitive(name, rr[i].gmx))
111 return (i < nrr ? rr[i].main : name);
114 static const char *select_res(int nr, int resnr,
115 const char *name[], const char *expl[],
117 int nrr, const rtprename_t *rr)
121 printf("Which %s type do you want for residue %d\n", title, resnr+1);
122 for (sel = 0; (sel < nr); sel++)
124 printf("%d. %s (%s)\n",
125 sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
127 printf("\nType a number:"); fflush(stdout);
129 if (scanf("%d", &sel) != 1)
131 gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
137 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
142 const char *lh[easpNR] = { "ASP", "ASPH" };
143 const char *expl[easpNR] = {
144 "Not protonated (charge -1)",
145 "Protonated (charge 0)"
148 return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
151 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
156 const char *lh[egluNR] = { "GLU", "GLUH" };
157 const char *expl[egluNR] = {
158 "Not protonated (charge -1)",
159 "Protonated (charge 0)"
162 return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
165 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
170 const char *lh[eglnNR] = { "GLN", "QLN" };
171 const char *expl[eglnNR] = {
172 "Not protonated (charge 0)",
173 "Protonated (charge +1)"
176 return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
179 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
184 const char *lh[elysNR] = { "LYSN", "LYS" };
185 const char *expl[elysNR] = {
186 "Not protonated (charge 0)",
187 "Protonated (charge +1)"
190 return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
193 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
198 const char *lh[eargNR] = { "ARGN", "ARG" };
199 const char *expl[eargNR] = {
200 "Not protonated (charge 0)",
201 "Protonated (charge +1)"
204 return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
207 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
209 const char *expl[ehisNR] = {
216 return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
219 static void read_rtprename(const char *fname, FILE *fp,
220 int *nrtprename, rtprename_t **rtprename)
222 char line[STRLEN], buf[STRLEN];
231 while (get_a_line(fp, line, STRLEN))
234 /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
235 * For other args, we read up to 6 chars (so we can detect if the length is > 5).
236 * Note that the buffer length has been increased to 7 to allow this,
237 * so we just need to make sure the strings have zero-length initially.
240 rr[n].main[0] = '\0';
241 rr[n].nter[0] = '\0';
242 rr[n].cter[0] = '\0';
243 rr[n].bter[0] = '\0';
244 nc = sscanf(line, "%6s %6s %6s %6s %6s %s",
245 rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
246 if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
247 strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
249 gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
250 fname, RTP_MAXCHAR, line);
255 if (nc != 2 && nc != 5)
257 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d", fname, ncol, 2, 5);
263 gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
268 /* This file does not have special termini names, copy them from main */
269 strcpy(rr[n].nter, rr[n].main);
270 strcpy(rr[n].cter, rr[n].main);
271 strcpy(rr[n].bter, rr[n].main);
281 static char *search_resrename(int nrr, rtprename_t *rr,
283 bool bStart, bool bEnd,
284 bool bCompareFFRTPname)
292 while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx) != 0) ||
293 ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
298 /* If found in the database, rename this residue's rtp building block,
299 * otherwise keep the old name.
322 gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
329 static void rename_resrtp(t_atoms *pdba, int nterpairs, const int *r_start, const int *r_end,
330 int nrr, rtprename_t *rr, t_symtab *symtab,
338 bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
340 for (r = 0; r < pdba->nres; r++)
344 for (j = 0; j < nterpairs; j++)
351 for (j = 0; j < nterpairs; j++)
359 nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
361 if (bFFRTPTERRNM && nn == nullptr && (bStart || bEnd))
363 /* This is a terminal residue, but the residue name,
364 * currently stored in .rtp, is not a standard residue name,
365 * but probably a force field specific rtp name.
366 * Check if we need to rename it because it is terminal.
368 nn = search_resrename(nrr, rr,
369 *pdba->resinfo[r].rtp, bStart, bEnd, true);
372 if (nn != nullptr && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
376 printf("Changing rtp entry of residue %d %s to '%s'\n",
377 pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
379 pdba->resinfo[r].rtp = put_symtab(symtab, nn);
384 static void pdbres_to_gmxrtp(t_atoms *pdba)
388 for (i = 0; (i < pdba->nres); i++)
390 if (pdba->resinfo[i].rtp == nullptr)
392 pdba->resinfo[i].rtp = pdba->resinfo[i].name;
397 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
398 bool bFullCompare, t_symtab *symtab)
403 for (i = 0; (i < pdba->nres); i++)
405 resnm = *pdba->resinfo[i].name;
406 if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm))) ||
407 (!bFullCompare && strstr(resnm, oldnm) != nullptr))
409 /* Rename the residue name (not the rtp name) */
410 pdba->resinfo[i].name = put_symtab(symtab, newnm);
415 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
416 bool bFullCompare, t_symtab *symtab)
421 for (i = 0; (i < pdba->nres); i++)
423 /* We have not set the rtp name yes, use the residue name */
424 bbnm = *pdba->resinfo[i].name;
425 if ((bFullCompare && (gmx::equalCaseInsensitive(bbnm, oldnm))) ||
426 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
428 /* Change the rtp builing block name */
429 pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
434 static void rename_bbint(t_atoms *pdba, const char *oldnm,
435 const char *gettp(int, int, const rtprename_t *),
438 int nrr, const rtprename_t *rr)
444 for (i = 0; i < pdba->nres; i++)
446 /* We have not set the rtp name yet, use the residue name */
447 bbnm = *pdba->resinfo[i].name;
448 if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
449 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
451 ptr = gettp(i, nrr, rr);
452 pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
457 static void check_occupancy(t_atoms *atoms, const char *filename, bool bVerbose)
463 ftp = fn2ftp(filename);
464 if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
466 fprintf(stderr, "No occupancies in %s\n", filename);
470 for (i = 0; (i < atoms->nr); i++)
472 if (atoms->pdbinfo[i].occup != 1)
476 fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
477 *atoms->resinfo[atoms->atom[i].resind].name,
478 atoms->resinfo[ atoms->atom[i].resind].nr,
480 atoms->pdbinfo[i].occup);
482 if (atoms->pdbinfo[i].occup == 0)
492 if (nzero == atoms->nr)
494 fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
496 else if ((nzero > 0) || (nnotone > 0))
500 "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
501 " occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
503 nzero, nnotone, atoms->nr);
507 fprintf(stderr, "All occupancies are one\n");
512 static void write_posres(const char *fn, t_atoms *pdba, real fc)
517 fp = gmx_fio_fopen(fn, "w");
519 "; In this topology include file, you will find position restraint\n"
520 "; entries for all the heavy atoms in your original pdb file.\n"
521 "; This means that all the protons which were added by pdb2gmx are\n"
522 "; not restrained.\n"
524 "[ position_restraints ]\n"
525 "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
527 for (i = 0; (i < pdba->nr); i++)
529 if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
531 fprintf(fp, "%6d%6d %g %g %g\n", i+1, 1, fc, fc, fc);
537 static int read_pdball(const char *inf, bool bOutput, const char *outf, char **title,
538 t_atoms *atoms, rvec **x,
539 int *ePBC, matrix box, bool bRemoveH,
540 t_symtab *symtab, ResidueType *rt, const char *watres,
541 AtomProperties *aps, bool bVerbose)
542 /* Read a pdb file. (containing proteins) */
544 int natom, new_natom, i;
547 printf("Reading %s...\n", inf);
548 readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
550 if (atoms->pdbinfo == nullptr)
552 snew(atoms->pdbinfo, atoms->nr);
554 if (fn2ftp(inf) == efPDB)
556 get_pdb_atomnumber(atoms, aps);
561 for (i = 0; i < atoms->nr; i++)
563 if (!is_hydrogen(*atoms->atomname[i]))
565 atoms->atom[new_natom] = atoms->atom[i];
566 atoms->atomname[new_natom] = atoms->atomname[i];
567 atoms->pdbinfo[new_natom] = atoms->pdbinfo[i];
568 copy_rvec((*x)[i], (*x)[new_natom]);
572 atoms->nr = new_natom;
579 printf(" '%s',", *title);
581 printf(" %d atoms\n", natom);
583 /* Rename residues */
584 rename_pdbres(atoms, "HOH", watres, false, symtab);
585 rename_pdbres(atoms, "SOL", watres, false, symtab);
586 rename_pdbres(atoms, "WAT", watres, false, symtab);
588 rename_atoms("xlateat.dat", nullptr,
589 atoms, symtab, nullptr, true,
598 write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
604 static void process_chain(t_atoms *pdba, rvec *x,
605 bool bTrpU, bool bPheU, bool bTyrU,
606 bool bLysMan, bool bAspMan, bool bGluMan,
607 bool bHisMan, bool bArgMan, bool bGlnMan,
608 real angle, real distance, t_symtab *symtab,
609 int nrr, const rtprename_t *rr)
611 /* Rename aromatics, lys, asp and histidine */
614 rename_bb(pdba, "TYR", "TYRU", false, symtab);
618 rename_bb(pdba, "TRP", "TRPU", false, symtab);
622 rename_bb(pdba, "PHE", "PHEU", false, symtab);
626 rename_bbint(pdba, "LYS", get_lystp, false, symtab, nrr, rr);
630 rename_bbint(pdba, "ARG", get_argtp, false, symtab, nrr, rr);
634 rename_bbint(pdba, "GLN", get_glntp, false, symtab, nrr, rr);
638 rename_bbint(pdba, "ASP", get_asptp, false, symtab, nrr, rr);
642 rename_bb(pdba, "ASPH", "ASP", false, symtab);
646 rename_bbint(pdba, "GLU", get_glutp, false, symtab, nrr, rr);
650 rename_bb(pdba, "GLUH", "GLU", false, symtab);
655 set_histp(pdba, x, angle, distance);
659 rename_bbint(pdba, "HIS", get_histp, true, symtab, nrr, rr);
662 /* Initialize the rtp builing block names with the residue names
663 * for the residues that have not been processed above.
665 pdbres_to_gmxrtp(pdba);
667 /* Now we have all rtp names set.
668 * The rtp names will conform to Gromacs naming,
669 * unless the input pdb file contained one or more force field specific
670 * rtp names as residue names.
674 /* struct for sorting the atoms from the pdb file */
676 int resnr; /* residue number */
677 int j; /* database order index */
678 int index; /* original atom number */
679 char anm1; /* second letter of atom name */
680 char altloc; /* alternate location indicator */
683 static bool pdbicomp(const t_pdbindex &a, const t_pdbindex &b)
685 int d = (a.resnr - b.resnr);
691 d = (a.anm1 - b.anm1);
694 d = (a.altloc - b.altloc);
701 static void sort_pdbatoms(t_restp restp[],
702 int natoms, t_atoms **pdbaptr, rvec **x,
703 t_blocka *block, char ***gnames)
705 t_atoms *pdba, *pdbnew;
717 for (int i = 0; i < natoms; i++)
719 atomnm = *pdba->atomname[i];
720 rptr = &restp[pdba->atom[i].resind];
721 int j = std::find_if(rptr->atomname, rptr->atomname+rptr->natom,
722 [&atomnm](char** it){return gmx::equalCaseInsensitive(atomnm, *it); })
724 if (j == rptr->natom)
729 "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
730 "while sorting atoms.\n%s", atomnm,
731 *pdba->resinfo[pdba->atom[i].resind].name,
732 pdba->resinfo[pdba->atom[i].resind].nr,
735 is_hydrogen(atomnm) ?
736 "\nFor a hydrogen, this can be a different protonation state, or it\n"
737 "might have had a different number in the PDB file and was rebuilt\n"
738 "(it might for instance have been H3, and we only expected H1 & H2).\n"
739 "Note that hydrogens might have been added to the entry for the N-terminus.\n"
740 "Remove this hydrogen or choose a different protonation state to solve it.\n"
741 "Option -ignh will ignore all hydrogens in the input." : ".");
742 gmx_fatal(FARGS, "%s", buf);
744 /* make shadow array to be sorted into indexgroup */
745 pdbi[i].resnr = pdba->atom[i].resind;
748 pdbi[i].anm1 = atomnm[1];
749 pdbi[i].altloc = pdba->pdbinfo[i].altloc;
751 std::sort(pdbi, pdbi+natoms, pdbicomp);
753 /* pdba is sorted in pdbnew using the pdbi index */
754 std::vector<int> a(natoms);
756 init_t_atoms(pdbnew, natoms, true);
758 pdbnew->nr = pdba->nr;
759 pdbnew->nres = pdba->nres;
760 sfree(pdbnew->resinfo);
761 pdbnew->resinfo = pdba->resinfo;
762 for (int i = 0; i < natoms; i++)
764 pdbnew->atom[i] = pdba->atom[pdbi[i].index];
765 pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
766 pdbnew->pdbinfo[i] = pdba->pdbinfo[pdbi[i].index];
767 copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
768 /* make indexgroup in block */
769 a[i] = pdbi[i].index;
772 sfree(pdba->atomname);
774 sfree(pdba->pdbinfo);
777 /* copy the sorted pdbnew back to pdba */
780 add_grp(block, gnames, a, "prot_sort");
785 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], bool bVerbose)
787 int i, j, oldnatoms, ndel;
790 printf("Checking for duplicate atoms....\n");
791 oldnatoms = pdba->nr;
793 /* NOTE: pdba->nr is modified inside the loop */
794 for (i = 1; (i < pdba->nr); i++)
796 /* compare 'i' and 'i-1', throw away 'i' if they are identical
797 this is a 'while' because multiple alternate locations can be present */
798 while ( (i < pdba->nr) &&
799 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
800 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
805 ri = &pdba->resinfo[pdba->atom[i].resind];
806 printf("deleting duplicate atom %4s %s%4d%c",
807 *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
808 if (ri->chainid && (ri->chainid != ' '))
810 printf(" ch %c", ri->chainid);
814 if (pdba->pdbinfo[i].atomnr)
816 printf(" pdb nr %4d", pdba->pdbinfo[i].atomnr);
818 if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
820 printf(" altloc %c", pdba->pdbinfo[i].altloc);
826 /* We can not free, since it might be in the symtab */
827 /* sfree(pdba->atomname[i]); */
828 for (j = i; j < pdba->nr; j++)
830 pdba->atom[j] = pdba->atom[j+1];
831 pdba->atomname[j] = pdba->atomname[j+1];
834 pdba->pdbinfo[j] = pdba->pdbinfo[j+1];
836 copy_rvec(x[j+1], x[j]);
838 srenew(pdba->atom, pdba->nr);
839 /* srenew(pdba->atomname, pdba->nr); */
840 srenew(pdba->pdbinfo, pdba->nr);
843 if (pdba->nr != oldnatoms)
845 printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
852 checkResidueTypeSanity(t_atoms *pdba,
857 std::string startResidueString = gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
858 std::string endResidueString = gmx::formatString("%s%d", *pdba->resinfo[r1-1].name, pdba->resinfo[r1-1].nr);
860 // Check whether all residues in chain have the same chain ID.
861 bool allResiduesHaveSameChainID = true;
862 char chainID0 = pdba->resinfo[r0].chainid;
864 std::string residueString;
866 for (int i = r0 + 1; i < r1; i++)
868 chainID = pdba->resinfo[i].chainid;
869 if (chainID != chainID0)
871 allResiduesHaveSameChainID = false;
872 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
877 if (!allResiduesHaveSameChainID)
880 "The chain covering the range %s--%s does not have a consistent chain ID. "
881 "The first residue has ID '%c', while residue %s has ID '%c'.",
882 startResidueString.c_str(), endResidueString.c_str(),
883 chainID0, residueString.c_str(), chainID);
886 // At this point all residues have the same ID. If they are also non-blank
887 // we can be a bit more aggressive and require the types match too.
890 bool allResiduesHaveSameType = true;
892 std::string restype0 = rt->typeNameForIndexedResidue(*pdba->resinfo[r0].name);
894 for (int i = r0 + 1; i < r1; i++)
896 restype = rt->typeNameForIndexedResidue(*pdba->resinfo[i].name);
897 if (!gmx::equalCaseInsensitive(restype, restype0))
899 allResiduesHaveSameType = false;
900 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
905 if (!allResiduesHaveSameType)
908 "The residues in the chain %s--%s do not have a consistent type. "
909 "The first residue has type '%s', while residue %s is of type '%s'. "
910 "Either there is a mistake in your chain, or it includes nonstandard "
911 "residue names that have not yet been added to the residuetypes.dat "
912 "file in the GROMACS library directory. If there are other molecules "
913 "such as ligands, they should not have the same chain ID as the "
914 "adjacent protein chain since it's a separate molecule.",
915 startResidueString.c_str(), endResidueString.c_str(),
916 restype0.c_str(), residueString.c_str(), restype.c_str());
921 static void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
925 std::string p_startrestype;
930 int startWarnings = 0;
934 // Check that all residues have the same chain identifier, and if it is
935 // non-blank we also require the residue types to match.
936 checkResidueTypeSanity(pdba, r0, r1, rt);
938 // If we return correctly from checkResidueTypeSanity(), the only
939 // remaining cases where we can have non-matching residue types is if
940 // the chain ID was blank, which could be the case e.g. for a structure
941 // read from a GRO file or other file types without chain information.
942 // In that case we need to be a bit more liberal and detect chains based
943 // on the residue type.
945 // If we get here, the chain ID must be identical for all residues
946 char chainID = pdba->resinfo[r0].chainid;
948 /* Find the starting terminus (typially N or 5') */
949 for (i = r0; i < r1 && *r_start == -1; i++)
951 p_startrestype = rt->typeNameForIndexedResidue(*pdba->resinfo[i].name);
952 if (gmx::equalCaseInsensitive(p_startrestype, "Protein") ||
953 gmx::equalCaseInsensitive(p_startrestype, "DNA") ||
954 gmx::equalCaseInsensitive(p_startrestype, "RNA") )
956 printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
959 else if (gmx::equalCaseInsensitive(p_startrestype, "Ion"))
963 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
967 printf("Disabling further notes about ions.\n");
973 if (startWarnings < 5)
977 printf("\nWarning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n"
978 "This chain lacks identifiers, which makes it impossible to do strict\n"
979 "classification of the start/end residues. Here we need to guess this residue\n"
980 "should not be part of the chain and instead introduce a break, but that will\n"
981 "be catastrophic if they should in fact be linked. Please check your structure,\n"
982 "and add %s to residuetypes.dat if this was not correct.\n\n",
983 *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
987 printf("\nWarning: No residues in chain starting at %s%d identified as Protein/RNA/DNA.\n"
988 "This makes it impossible to link them into a molecule, which could either be\n"
989 "correct or a catastrophic error. Please check your structure, and add all\n"
990 "necessary residue names to residuetypes.dat if this was not correct.\n\n",
991 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
994 if (startWarnings == 4)
996 printf("Disabling further warnings about unidentified residues at start of chain.\n");
1004 /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1005 for (int i = *r_start; i < r1; i++)
1007 std::string p_restype = rt->typeNameForIndexedResidue(*pdba->resinfo[i].name);
1008 if (gmx::equalCaseInsensitive(p_restype, p_startrestype) && endWarnings == 0)
1012 else if (gmx::equalCaseInsensitive(p_startrestype, "Ion"))
1016 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1020 printf("Disabling further notes about ions.\n");
1026 // This can only trigger if the chain ID is blank - otherwise the
1027 // call to checkResidueTypeSanity() will have caught the problem.
1028 if (endWarnings < 5)
1030 printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1031 "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1032 "it impossible to do strict classification of the start/end residues. Here we\n"
1033 "need to guess this residue should not be part of the chain and instead\n"
1034 "introduce a break, but that will be catastrophic if they should in fact be\n"
1035 "linked. Please check your structure, and add %s to residuetypes.dat\n"
1036 "if this was not correct.\n\n",
1037 *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype.c_str(),
1038 *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype.c_str(), *pdba->resinfo[i].name);
1040 if (endWarnings == 4)
1042 printf("Disabling further warnings about unidentified residues at end of chain.\n");
1051 printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1055 /* enum for chain separation */
1057 enChainSep_id_or_ter, enChainSep_id_and_ter, enChainSep_ter,
1058 enChainSep_id, enChainSep_interactive
1060 static const char *ChainSepEnum[] = {"id_or_ter", "id_and_ter", "ter", "id", "interactive"};
1061 static const char *ChainSepInfoString[] =
1063 "Splitting chemical chains based on TER records or chain id changing.\n",
1064 "Splitting chemical chains based on TER records and chain id changing.\n",
1065 "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1066 "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1067 "Splitting chemical chains interactively.\n"
1071 modify_chain_numbers(t_atoms * pdba,
1072 ChainSepType enumChainSep)
1075 char old_prev_chainid;
1076 char old_this_chainid;
1077 int old_prev_chainnum;
1078 int old_this_chainnum;
1080 char select[STRLEN];
1084 const char * prev_atomname;
1085 const char * this_atomname;
1086 const char * prev_resname;
1087 const char * this_resname;
1093 /* The default chain enumeration is based on TER records only */
1094 printf("%s", ChainSepInfoString[enumChainSep]);
1096 old_prev_chainid = '?';
1097 old_prev_chainnum = -1;
1100 this_atomname = nullptr;
1102 this_resname = nullptr;
1106 for (i = 0; i < pdba->nres; i++)
1108 ri = &pdba->resinfo[i];
1109 old_this_chainid = ri->chainid;
1110 old_this_chainnum = ri->chainnum;
1112 prev_atomname = this_atomname;
1113 prev_atomnum = this_atomnum;
1114 prev_resname = this_resname;
1115 prev_resnum = this_resnum;
1116 prev_chainid = this_chainid;
1118 this_atomname = *(pdba->atomname[i]);
1119 this_atomnum = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1120 this_resname = *ri->name;
1121 this_resnum = ri->nr;
1122 this_chainid = ri->chainid;
1124 switch (enumChainSep)
1126 case enChainSep_id_or_ter:
1127 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1133 case enChainSep_id_and_ter:
1134 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1141 if (old_this_chainid != old_prev_chainid)
1147 case enChainSep_ter:
1148 if (old_this_chainnum != old_prev_chainnum)
1153 case enChainSep_interactive:
1154 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1158 printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1160 "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1161 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1162 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1164 if (nullptr == fgets(select, STRLEN-1, stdin))
1166 gmx_fatal(FARGS, "Error reading from stdin");
1169 if (i == 0 || select[0] == 'y')
1176 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1178 old_prev_chainid = old_this_chainid;
1179 old_prev_chainnum = old_this_chainnum;
1181 ri->chainnum = new_chainnum;
1209 // TODO make all enums into scoped enums
1210 /* enum for vsites */
1212 enVSites_none, enVSites_hydrogens, enVSites_aromatics
1214 static const char *VSitesEnum[] = {"none", "hydrogens", "aromatics"};
1216 /* enum for water model */
1218 enWater_select, enWater_none, enWater_spc, enWater_spce,
1219 enWater_tip3p, enWater_tip4p, enWater_tip5p, enWater_tips3p
1221 static const char *WaterEnum[] = {
1222 "select", "none", "spc", "spce",
1223 "tip3p", "tip4p", "tip5p", "tips3p"
1226 /* enum for merge */
1228 enMerge_no, enMerge_all, enMerge_interactive
1230 static const char *MergeEnum[] = {"no", "all", "interactive"};
1238 class pdb2gmx : public ICommandLineOptionsModule
1242 bVsites_(FALSE), bPrevWat_(FALSE), bVsiteAromatics_(FALSE),
1243 enumChainSep_(enChainSep_id_or_ter),
1244 enumVSites_(enVSites_none),
1245 enumWater_(enWater_select),
1246 enumMerge_(enMerge_no),
1256 // From ICommandLineOptionsModule
1257 void init(CommandLineModuleSettings * /*settings*/) override
1261 void initOptions(IOptionsContainer *options,
1262 ICommandLineOptionsModuleSettings *settings) override;
1264 void optionsFinished() override;
1282 bool bAllowMissing_;
1286 bool bChargeGroups_;
1296 bool bVsiteAromatics_;
1300 real long_bond_dist_;
1301 real short_bond_dist_;
1303 std::string indexOutputFile_;
1304 std::string outputFile_;
1305 std::string topologyFile_;
1306 std::string includeTopologyFile_;
1307 std::string outputConfFile_;
1308 std::string inputConfFile_;
1309 std::string outFile_;
1312 ChainSepType enumChainSep_;
1313 VSitesType enumVSites_;
1314 WaterType enumWater_;
1315 MergeType enumMerge_;
1318 char forcefield_[STRLEN];
1319 char ffdir_[STRLEN];
1329 void pdb2gmx::initOptions(IOptionsContainer *options,
1330 ICommandLineOptionsModuleSettings *settings)
1332 const char *desc[] = {
1333 "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1334 "some database files, adds hydrogens to the molecules and generates",
1335 "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1336 "and a topology in GROMACS format.",
1337 "These files can subsequently be processed to generate a run input file.",
1339 "[THISMODULE] will search for force fields by looking for",
1340 "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1341 "of the current working directory and of the GROMACS library directory",
1342 "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1344 "By default the forcefield selection is interactive,",
1345 "but you can use the [TT]-ff[tt] option to specify one of the short names",
1346 "in the list on the command line instead. In that case [THISMODULE] just looks",
1347 "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1349 "After choosing a force field, all files will be read only from",
1350 "the corresponding force field directory.",
1351 "If you want to modify or add a residue types, you can copy the force",
1352 "field directory from the GROMACS library directory to your current",
1353 "working directory. If you want to add new protein residue types,",
1354 "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1355 "or copy the whole library directory to a local directory and set",
1356 "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1357 "Check Chapter 5 of the manual for more information about file formats.",
1360 "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1361 "need not necessarily contain a protein structure. Every kind of",
1362 "molecule for which there is support in the database can be converted.",
1363 "If there is no support in the database, you can add it yourself.[PAR]",
1365 "The program has limited intelligence, it reads a number of database",
1366 "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1367 "if necessary this can be done manually. The program can prompt the",
1368 "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1369 "desired. For Lys the choice is between neutral (two protons on NZ) or",
1370 "protonated (three protons, default), for Asp and Glu unprotonated",
1371 "(default) or protonated, for His the proton can be either on ND1,",
1372 "on NE2 or on both. By default these selections are done automatically.",
1373 "For His, this is based on an optimal hydrogen bonding",
1374 "conformation. Hydrogen bonds are defined based on a simple geometric",
1375 "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1376 "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1377 "[TT]-dist[tt] respectively.[PAR]",
1379 "The protonation state of N- and C-termini can be chosen interactively",
1380 "with the [TT]-ter[tt] flag. Default termini are ionized (NH3+ and COO-),",
1381 "respectively. Some force fields support zwitterionic forms for chains of",
1382 "one residue, but for polypeptides these options should NOT be selected.",
1383 "The AMBER force fields have unique forms for the terminal residues,",
1384 "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1385 "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1386 "respectively to use these forms, making sure you preserve the format",
1387 "of the coordinate file. Alternatively, use named terminating residues",
1388 "(e.g. ACE, NME).[PAR]",
1390 "The separation of chains is not entirely trivial since the markup",
1391 "in user-generated PDB files frequently varies and sometimes it",
1392 "is desirable to merge entries across a TER record, for instance",
1393 "if you want a disulfide bridge or distance restraints between",
1394 "two protein chains or if you have a HEME group bound to a protein.",
1395 "In such cases multiple chains should be contained in a single",
1396 "[TT]moleculetype[tt] definition.",
1397 "To handle this, [THISMODULE] uses two separate options.",
1398 "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1399 "start, and termini added when applicable. This can be done based on the",
1400 "existence of TER records, when the chain id changes, or combinations of either",
1401 "or both of these. You can also do the selection fully interactively.",
1402 "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1403 "are merged into one moleculetype, after adding all the chemical termini (or not).",
1404 "This can be turned off (no merging), all non-water chains can be merged into a",
1405 "single molecule, or the selection can be done interactively.[PAR]",
1407 "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1408 "If any of the occupancies are not one, indicating that the atom is",
1409 "not resolved well in the structure, a warning message is issued.",
1410 "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1411 "all occupancy fields may be zero. Either way, it is up to the user",
1412 "to verify the correctness of the input data (read the article!).[PAR]",
1414 "During processing the atoms will be reordered according to GROMACS",
1415 "conventions. With [TT]-n[tt] an index file can be generated that",
1416 "contains one group reordered in the same way. This allows you to",
1417 "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1418 "one limitation: reordering is done after the hydrogens are stripped",
1419 "from the input and before new hydrogens are added. This means that",
1420 "you should not use [TT]-ignh[tt].[PAR]",
1422 "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1423 "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1424 "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1427 "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1428 "motions. Angular and out-of-plane motions can be removed by changing",
1429 "hydrogens into virtual sites and fixing angles, which fixes their",
1430 "position relative to neighboring atoms. Additionally, all atoms in the",
1431 "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1432 "can be converted into virtual sites, eliminating the fast improper dihedral",
1433 "fluctuations in these rings (but this feature is deprecated).",
1434 "[BB]Note[bb] that in this case all other hydrogen",
1435 "atoms are also converted to virtual sites. The mass of all atoms that are",
1436 "converted into virtual sites, is added to the heavy atoms.[PAR]",
1437 "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1438 "done by increasing the hydrogen-mass by a factor of 4. This is also",
1439 "done for water hydrogens to slow down the rotational motion of water.",
1440 "The increase in mass of the hydrogens is subtracted from the bonded",
1441 "(heavy) atom so that the total mass of the system remains the same."
1444 settings->setHelpText(desc);
1446 options->addOption(BooleanOption("newrtp")
1447 .store(&bNewRTP_).defaultValue(false).hidden()
1448 .description("Write the residue database in new format to [TT]new.rtp[tt]"));
1449 options->addOption(RealOption("lb")
1450 .store(&long_bond_dist_).defaultValue(0.25).hidden()
1451 .description("Long bond warning distance"));
1452 options->addOption(RealOption("sb")
1453 .store(&short_bond_dist_).defaultValue(0.05).hidden()
1454 .description("Short bond warning distance"));
1455 options->addOption(EnumOption<ChainSepType>("chainsep").enumValue(ChainSepEnum)
1456 .store(&enumChainSep_)
1457 .description("Condition in PDB files when a new chain should be started (adding termini)"));
1458 options->addOption(EnumOption<MergeType>("merge").enumValue(MergeEnum)
1460 .description("Merge multiple chains into a single [moleculetype]"));
1461 options->addOption(StringOption("ff")
1462 .store(&ff_).defaultValue("select")
1463 .description("Force field, interactive by default. Use [TT]-h[tt] for information."));
1464 options->addOption(EnumOption<WaterType>("water")
1465 .store(&enumWater_).enumValue(WaterEnum)
1466 .description("Water model to use"));
1467 options->addOption(BooleanOption("inter")
1468 .store(&bInter_).defaultValue(false)
1469 .description("Set the next 8 options to interactive"));
1470 options->addOption(BooleanOption("ss")
1471 .store(&bCysMan_).defaultValue(false)
1472 .description("Interactive SS bridge selection"));
1473 options->addOption(BooleanOption("ter")
1474 .store(&bTerMan_).defaultValue(false)
1475 .description("Interactive termini selection, instead of charged (default)"));
1476 options->addOption(BooleanOption("lys")
1477 .store(&bLysMan_).defaultValue(false)
1478 .description("Interactive lysine selection, instead of charged"));
1479 options->addOption(BooleanOption("arg")
1480 .store(&bArgMan_).defaultValue(false)
1481 .description("Interactive arginine selection, instead of charged"));
1482 options->addOption(BooleanOption("asp")
1483 .store(&bAspMan_).defaultValue(false)
1484 .description("Interactive aspartic acid selection, instead of charged"));
1485 options->addOption(BooleanOption("glu")
1486 .store(&bGluMan_).defaultValue(false)
1487 .description("Interactive glutamic acid selection, instead of charged"));
1488 options->addOption(BooleanOption("gln")
1489 .store(&bGlnMan_).defaultValue(false)
1490 .description("Interactive glutamine selection, instead of charged"));
1491 options->addOption(BooleanOption("his")
1492 .store(&bHisMan_).defaultValue(false)
1493 .description("Interactive histidine selection, instead of checking H-bonds"));
1494 options->addOption(RealOption("angle")
1495 .store(&angle_).defaultValue(135.0)
1496 .description("Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1497 options->addOption(RealOption("dist")
1498 .store(&distance_).defaultValue(0.3)
1499 .description("Maximum donor-acceptor distance for a H-bond (nm)"));
1500 options->addOption(BooleanOption("una")
1501 .store(&bUnA_).defaultValue(false)
1502 .description("Select aromatic rings with united CH atoms on phenylalanine, tryptophane and tyrosine"));
1503 options->addOption(BooleanOption("sort")
1504 .store(&bSort_).defaultValue(true).hidden()
1505 .description("Sort the residues according to database, turning this off is dangerous as charge groups might be broken in parts"));
1506 options->addOption(BooleanOption("ignh")
1507 .store(&bRemoveH_).defaultValue(false)
1508 .description("Ignore hydrogen atoms that are in the coordinate file"));
1509 options->addOption(BooleanOption("missing")
1510 .store(&bAllowMissing_).defaultValue(false)
1511 .description("Continue when atoms are missing and bonds cannot be made, dangerous"));
1512 options->addOption(BooleanOption("v")
1513 .store(&bVerbose_).defaultValue(false)
1514 .description("Be slightly more verbose in messages"));
1515 options->addOption(RealOption("posrefc")
1516 .store(&posre_fc_).defaultValue(1000)
1517 .description("Force constant for position restraints"));
1518 options->addOption(EnumOption<VSitesType>("vsite")
1519 .store(&enumVSites_).enumValue(VSitesEnum)
1520 .description("Convert atoms to virtual sites"));
1521 options->addOption(BooleanOption("heavyh")
1522 .store(&bHeavyH_).defaultValue(false)
1523 .description("Make hydrogen atoms heavy"));
1524 options->addOption(BooleanOption("deuterate")
1525 .store(&bDeuterate_).defaultValue(false)
1526 .description("Change the mass of hydrogens to 2 amu"));
1527 options->addOption(BooleanOption("chargegrp")
1528 .store(&bChargeGroups_).defaultValue(true)
1529 .description("Use charge groups in the [REF].rtp[ref] file"));
1530 options->addOption(BooleanOption("cmap")
1531 .store(&bCmap_).defaultValue(true)
1532 .description("Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1533 options->addOption(BooleanOption("renum")
1534 .store(&bRenumRes_).defaultValue(false)
1535 .description("Renumber the residues consecutively in the output"));
1536 options->addOption(BooleanOption("rtpres")
1537 .store(&bRTPresname_).defaultValue(false)
1538 .description("Use [REF].rtp[ref] entry names as residue names"));
1539 options->addOption(FileNameOption("f")
1540 .legacyType(efSTX).inputFile()
1541 .store(&inputConfFile_).required()
1542 .defaultBasename("protein").defaultType(efPDB)
1543 .description("Structure file"));
1544 options->addOption(FileNameOption("o")
1545 .legacyType(efSTO).outputFile()
1546 .store(&outputConfFile_).required()
1547 .defaultBasename("conf")
1548 .description("Structure file"));
1549 options->addOption(FileNameOption("p")
1550 .legacyType(efTOP).outputFile()
1551 .store(&topologyFile_).required()
1552 .defaultBasename("topol")
1553 .description("Topology file"));
1554 options->addOption(FileNameOption("i")
1555 .legacyType(efITP).outputFile()
1556 .store(&includeTopologyFile_).required()
1557 .defaultBasename("posre")
1558 .description("Include file for topology"));
1559 options->addOption(FileNameOption("n")
1560 .legacyType(efNDX).outputFile()
1561 .store(&indexOutputFile_).storeIsSet(&bIndexSet_)
1562 .defaultBasename("index")
1563 .description("Index file"));
1564 options->addOption(FileNameOption("q")
1565 .legacyType(efSTO).outputFile()
1566 .store(&outFile_).storeIsSet(&bOutputSet_)
1567 .defaultBasename("clean").defaultType(efPDB)
1568 .description("Structure file"));
1571 void pdb2gmx::optionsFinished()
1573 if (inputConfFile_.empty())
1575 GMX_THROW(InconsistentInputError("You must supply an input file"));
1579 /* if anything changes here, also change description of -inter */
1594 else if (bDeuterate_)
1603 /* Force field selection, interactive or direct */
1604 choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(),
1605 forcefield_, sizeof(forcefield_),
1606 ffdir_, sizeof(ffdir_));
1608 if (strlen(forcefield_) > 0)
1610 ffname_ = forcefield_;
1611 ffname_[0] = std::toupper(ffname_[0]);
1615 gmx_fatal(FARGS, "Empty forcefield string");
1621 char select[STRLEN];
1624 t_hackblock *hb_chain;
1625 t_restp *restp_chain;
1629 const char *prev_atomname;
1630 const char *this_atomname;
1631 const char *prev_resname;
1632 const char *this_resname;
1637 int prev_chainnumber;
1638 int this_chainnumber;
1639 int this_chainstart;
1640 int prev_chainstart;
1642 printf("\nUsing the %s force field in directory %s\n\n",
1645 choose_watermodel(WaterEnum[enumWater_], ffdir_, &watermodel_);
1647 switch (enumVSites_)
1651 bVsiteAromatics_ = false;
1653 case enVSites_hydrogens:
1655 bVsiteAromatics_ = false;
1657 case enVSites_aromatics:
1659 bVsiteAromatics_ = true;
1662 gmx_fatal(FARGS, "Internal inconsistency: VSitesEnum='%s'", VSitesEnum[enumVSites_]);
1665 /* Open the symbol table */
1667 open_symtab(&symtab);
1669 /* Residue type database */
1672 /* Read residue renaming database(s), if present */
1673 std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1676 rtprename_t *rtprename = nullptr;
1677 for (const auto &filename : rrn)
1679 printf("going to rename %s\n", filename.c_str());
1680 FILE *fp = fflib_open(filename);
1681 read_rtprename(filename.c_str(), fp, &nrtprename, &rtprename);
1685 /* Add all alternative names from the residue renaming database to the list
1686 of recognized amino/nucleic acids. */
1687 for (int i = 0; i < nrtprename; i++)
1689 /* Only add names if the 'standard' gromacs/iupac base name was found */
1691 /* TODO this should be changed with gmx::optional so that we only need
1692 * to search rt once.
1694 if (rt.nameIndexedInResidueTypes(rtprename[i].gmx))
1696 std::string restype = rt.typeNameForIndexedResidue(rtprename[i].gmx);
1697 rt.addResidue(rtprename[i].main, restype);
1698 rt.addResidue(rtprename[i].nter, restype);
1699 rt.addResidue(rtprename[i].cter, restype);
1700 rt.addResidue(rtprename[i].bter, restype);
1707 if (watermodel_ != nullptr && (strstr(watermodel_, "4p") ||
1708 strstr(watermodel_, "4P")))
1712 else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") ||
1713 strstr(watermodel_, "5P")))
1727 int natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(),
1728 &title, &pdba_all, &pdbx, &ePBC, box, bRemoveH_,
1729 &symtab, &rt, watres, &aps, bVerbose_);
1733 std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1734 GMX_THROW(InconsistentInputError(message));
1737 printf("Analyzing pdb file\n");
1738 int nwaterchain = 0;
1740 modify_chain_numbers(&pdba_all, enumChainSep_);
1742 int nchainmerges = 0;
1744 this_atomname = nullptr;
1746 this_resname = nullptr;
1749 this_chainnumber = -1;
1750 this_chainstart = 0;
1751 /* Keep the compiler happy */
1752 prev_chainstart = 0;
1757 snew(pdb_ch, maxch);
1760 bool bMerged = false;
1761 for (int i = 0; (i < natom); i++)
1763 ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1765 /* TODO this should live in a helper object, and consolidate
1766 that with code in modify_chain_numbers */
1767 prev_atomname = this_atomname;
1768 prev_atomnum = this_atomnum;
1769 prev_resname = this_resname;
1770 prev_resnum = this_resnum;
1771 prev_chainid = this_chainid;
1772 prev_chainnumber = this_chainnumber;
1775 prev_chainstart = this_chainstart;
1778 this_atomname = *pdba_all.atomname[i];
1779 this_atomnum = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1780 this_resname = *ri->name;
1781 this_resnum = ri->nr;
1782 this_chainid = ri->chainid;
1783 this_chainnumber = ri->chainnum;
1785 bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
1787 if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1789 this_chainstart = pdba_all.atom[i].resind;
1791 if (i > 0 && !bWat_)
1793 if (!strncmp(MergeEnum[enumMerge_], "int", 3))
1795 printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1796 "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1797 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1798 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1800 if (nullptr == fgets(select, STRLEN-1, stdin))
1802 gmx_fatal(FARGS, "Error reading from stdin");
1804 bMerged = (select[0] == 'y');
1806 else if (!strncmp(MergeEnum[enumMerge_], "all", 3))
1814 pdb_ch[numChains-1].chainstart[pdb_ch[numChains-1].nterpairs] =
1815 pdba_all.atom[i].resind - prev_chainstart;
1816 pdb_ch[numChains-1].nterpairs++;
1817 srenew(pdb_ch[numChains-1].chainstart, pdb_ch[numChains-1].nterpairs+1);
1822 /* set natom for previous chain */
1825 pdb_ch[numChains-1].natom = i-pdb_ch[numChains-1].start;
1832 /* check if chain identifier was used before */
1833 for (int j = 0; (j < numChains); j++)
1835 if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1837 printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1838 "They will be treated as separate chains unless you reorder your file.\n",
1842 // TODO This is too convoluted. Use a std::vector
1843 if (numChains == maxch)
1846 srenew(pdb_ch, maxch);
1848 pdb_ch[numChains].chainid = ri->chainid;
1849 pdb_ch[numChains].chainnum = ri->chainnum;
1850 pdb_ch[numChains].start = i;
1851 pdb_ch[numChains].bAllWat = bWat_;
1854 pdb_ch[numChains].nterpairs = 0;
1858 pdb_ch[numChains].nterpairs = 1;
1860 snew(pdb_ch[numChains].chainstart, pdb_ch[numChains].nterpairs+1);
1861 /* modified [numChains] to [0] below */
1862 pdb_ch[numChains].chainstart[0] = 0;
1868 pdb_ch[numChains-1].natom = natom-pdb_ch[numChains-1].start;
1870 /* set all the water blocks at the end of the chain */
1872 snew(swap_index, numChains);
1874 for (int i = 0; i < numChains; i++)
1876 if (!pdb_ch[i].bAllWat)
1882 for (int i = 0; i < numChains; i++)
1884 if (pdb_ch[i].bAllWat)
1890 if (nwaterchain > 1)
1892 printf("Moved all the water blocks to the end\n");
1897 snew(chains, numChains);
1898 /* copy pdb data and x for all chains */
1899 for (int i = 0; (i < numChains); i++)
1901 int si = swap_index[i];
1902 chains[i].chainid = pdb_ch[si].chainid;
1903 chains[i].chainnum = pdb_ch[si].chainnum;
1904 chains[i].bAllWat = pdb_ch[si].bAllWat;
1905 chains[i].nterpairs = pdb_ch[si].nterpairs;
1906 chains[i].chainstart = pdb_ch[si].chainstart;
1907 snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1908 snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1909 snew(chains[i].r_start, pdb_ch[si].nterpairs);
1910 snew(chains[i].r_end, pdb_ch[si].nterpairs);
1912 snew(chains[i].pdba, 1);
1913 init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
1914 snew(chains[i].x, chains[i].pdba->nr);
1915 for (j = 0; j < chains[i].pdba->nr; j++)
1917 chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1918 snew(chains[i].pdba->atomname[j], 1);
1919 *chains[i].pdba->atomname[j] =
1920 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1921 chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1922 copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1924 /* Re-index the residues assuming that the indices are continuous */
1925 int k = chains[i].pdba->atom[0].resind;
1926 int nres = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1927 chains[i].pdba->nres = nres;
1928 for (int j = 0; j < chains[i].pdba->nr; j++)
1930 chains[i].pdba->atom[j].resind -= k;
1932 srenew(chains[i].pdba->resinfo, nres);
1933 for (int j = 0; j < nres; j++)
1935 chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1936 snew(chains[i].pdba->resinfo[j].name, 1);
1937 *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1938 /* make all chain identifiers equal to that of the chain */
1939 chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1943 if (nchainmerges > 0)
1945 printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1949 printf("There are %d chains and %d blocks of water and "
1950 "%d residues with %d atoms\n",
1951 numChains-nwaterchain, nwaterchain,
1952 pdba_all.nres, natom);
1954 printf("\n %5s %4s %6s\n", "chain", "#res", "#atoms");
1955 for (int i = 0; (i < numChains); i++)
1957 printf(" %d '%c' %5d %6d %s\n",
1958 i+1, chains[i].chainid ? chains[i].chainid : '-',
1959 chains[i].pdba->nres, chains[i].pdba->nr,
1960 chains[i].bAllWat ? "(only water)" : "");
1964 check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_);
1966 /* Read atomtypes... */
1967 gpp_atomtype *atype = read_atype(ffdir_, &symtab);
1969 /* read residue database */
1970 printf("Reading residue database... (%s)\n", forcefield_);
1971 std::vector<std::string> rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
1973 t_restp *restp = nullptr;
1974 for (const auto &filename : rtpf)
1976 read_resall(filename.c_str(), &nrtp, &restp, atype, &symtab, false);
1980 /* Not correct with multiple rtp input files with different bonded types */
1981 FILE *fp = gmx_fio_fopen("new.rtp", "w");
1982 print_resall(fp, nrtp, restp, atype);
1986 /* read hydrogen database */
1988 int nah = read_h_db(ffdir_, &ah);
1990 /* Read Termini database... */
1994 t_hackblock **tdblist;
1995 int nNtdb = read_ter_db(ffdir_, 'n', &ntdb, atype);
1996 int nCtdb = read_ter_db(ffdir_, 'c', &ctdb, atype);
1998 FILE *top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2000 print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2004 for (int chain = 0; (chain < numChains); chain++)
2006 cc = &(chains[chain]);
2008 /* set pdba, natom and nres to the current chain */
2011 natom = cc->pdba->nr;
2012 int nres = cc->pdba->nres;
2014 if (cc->chainid && ( cc->chainid != ' ' ) )
2016 printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
2017 chain+1, cc->chainid, natom, nres);
2021 printf("Processing chain %d (%d atoms, %d residues)\n",
2022 chain+1, natom, nres);
2025 process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_,
2026 bHisMan_, bArgMan_, bGlnMan_, angle_, distance_, &symtab,
2027 nrtprename, rtprename);
2029 cc->chainstart[cc->nterpairs] = pdba->nres;
2031 for (int i = 0; i < cc->nterpairs; i++)
2033 find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
2034 &(cc->r_start[j]), &(cc->r_end[j]), &rt);
2036 if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2042 if (cc->nterpairs == 0)
2044 printf("Problem with chain definition, or missing terminal residues.\n"
2045 "This chain does not appear to contain a recognized chain molecule.\n"
2046 "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
2049 /* Check for disulfides and other special bonds */
2050 nssbonds = mk_specbonds(pdba, x, bCysMan_, &ssbonds, bVerbose_);
2054 rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
2055 &symtab, bVerbose_);
2058 for (int i = 0; i < cc->nterpairs; i++)
2062 * We first apply a filter so we only have the
2063 * termini that can be applied to the residue in question
2064 * (or a generic terminus if no-residue specific is available).
2066 /* First the N terminus */
2069 tdblist = filter_ter(nNtdb, ntdb,
2070 *pdba->resinfo[cc->r_start[i]].name,
2074 printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
2075 "is already in a terminus-specific form and skipping terminus selection.\n");
2076 cc->ntdb[i] = nullptr;
2080 if (bTerMan_ && ntdblist > 1)
2082 sprintf(select, "Select start terminus type for %s-%d",
2083 *pdba->resinfo[cc->r_start[i]].name,
2084 pdba->resinfo[cc->r_start[i]].nr);
2085 cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
2089 cc->ntdb[i] = tdblist[0];
2092 printf("Start terminus %s-%d: %s\n",
2093 *pdba->resinfo[cc->r_start[i]].name,
2094 pdba->resinfo[cc->r_start[i]].nr,
2095 (cc->ntdb[i])->name);
2101 cc->ntdb[i] = nullptr;
2104 /* And the C terminus */
2107 tdblist = filter_ter(nCtdb, ctdb,
2108 *pdba->resinfo[cc->r_end[i]].name,
2112 printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
2113 "is already in a terminus-specific form and skipping terminus selection.\n");
2114 cc->ctdb[i] = nullptr;
2118 if (bTerMan_ && ntdblist > 1)
2120 sprintf(select, "Select end terminus type for %s-%d",
2121 *pdba->resinfo[cc->r_end[i]].name,
2122 pdba->resinfo[cc->r_end[i]].nr);
2123 cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
2127 cc->ctdb[i] = tdblist[0];
2129 printf("End terminus %s-%d: %s\n",
2130 *pdba->resinfo[cc->r_end[i]].name,
2131 pdba->resinfo[cc->r_end[i]].nr,
2132 (cc->ctdb[i])->name);
2138 cc->ctdb[i] = nullptr;
2142 /* lookup hackblocks and rtp for all residues */
2143 get_hackblocks_rtp(&hb_chain, &restp_chain,
2144 nrtp, restp, pdba->nres, pdba->resinfo,
2145 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end,
2147 /* ideally, now we would not need the rtp itself anymore, but do
2148 everything using the hb and restp arrays. Unfortunately, that
2149 requires some re-thinking of code in gen_vsite.c, which I won't
2150 do now :( AF 26-7-99 */
2152 rename_atoms(nullptr, ffdir_,
2153 pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2155 match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose_);
2161 block = new_blocka();
2163 sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
2164 remove_duplicate_atoms(pdba, x, bVerbose_);
2169 fprintf(stderr, "WARNING: with the -remh option the generated "
2170 "index file (%s) might be useless\n"
2171 "(the index file is generated before hydrogens are added)",
2172 indexOutputFile_.c_str());
2174 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2176 for (int i = 0; i < block->nr; i++)
2185 fprintf(stderr, "WARNING: "
2186 "without sorting no check for duplicate atoms can be done\n");
2189 /* Generate Hydrogen atoms (and termini) in the sequence */
2190 printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2191 add_h(&pdba, &x, nah, ah,
2192 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_,
2193 nullptr, nullptr, true, false);
2194 printf("Now there are %d residues with %d atoms\n",
2195 pdba->nres, pdba->nr);
2197 /* make up molecule name(s) */
2199 int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2201 std::string restype = rt.typeNameForIndexedResidue(*pdba->resinfo[k].name);
2203 std::string molname;
2211 this_chainid = cc->chainid;
2213 /* Add the chain id if we have one */
2214 if (this_chainid != ' ')
2216 suffix.append(formatString("_chain_%c", this_chainid));
2219 /* Check if there have been previous chains with the same id */
2221 for (int k = 0; k < chain; k++)
2223 if (cc->chainid == chains[k].chainid)
2228 /* Add the number for this chain identifier if there are multiple copies */
2231 suffix.append(formatString("%d", nid_used+1));
2234 if (suffix.length() > 0)
2236 molname.append(restype);
2237 molname.append(suffix);
2244 std::string itp_fn = topologyFile_;;
2245 std::string posre_fn = includeTopologyFile_;
2246 if ((numChains-nwaterchain > 1) && !cc->bAllWat)
2249 printf("Chain time...\n");
2250 //construct the itp file name
2251 itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2253 itp_fn.append(molname);
2254 itp_fn.append(".itp");
2255 //now do the same for posre
2256 posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2257 posre_fn.append("_");
2258 posre_fn.append(molname);
2259 posre_fn.append(".itp");
2260 if (posre_fn == itp_fn)
2262 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2265 srenew(incls_, nincl_);
2266 incls_[nincl_-1] = gmx_strdup(itp_fn.c_str());
2267 itp_file_ = gmx_fio_fopen(itp_fn.c_str(), "w");
2274 srenew(mols_, nmol_+1);
2277 mols_[nmol_].name = gmx_strdup("SOL");
2278 mols_[nmol_].nr = pdba->nres;
2282 mols_[nmol_].name = gmx_strdup(molname.c_str());
2283 mols_[nmol_].nr = 1;
2289 print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2295 top_file2 = nullptr;
2299 top_file2 = itp_file_;
2303 top_file2 = top_file;
2306 pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, atype, &symtab,
2308 restp_chain, hb_chain,
2310 bVsites_, bVsiteAromatics_, ffdir_,
2311 mHmult_, nssbonds, ssbonds,
2312 long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2313 bRenumRes_, bRTPresname_);
2317 write_posres(posre_fn.c_str(), pdba, posre_fc_);
2322 gmx_fio_fclose(itp_file_);
2325 /* pdba and natom have been reassigned somewhere so: */
2331 if (watermodel_ == nullptr)
2333 for (int chain = 0; chain < numChains; chain++)
2335 if (chains[chain].bAllWat)
2337 auto message = formatString("You have chosen not to include a water model, "
2338 "but there is water in the input file. Select a "
2339 "water model or remove the water from your input file.");
2340 GMX_THROW(InconsistentInputError(message));
2346 std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2347 if (!fflib_fexist(waterFile))
2349 auto message = formatString("The topology file '%s' for the selected water "
2350 "model '%s' can not be found in the force field "
2351 "directory. Select a different water model.",
2352 waterFile.c_str(), watermodel_);
2353 GMX_THROW(InconsistentInputError(message));
2357 print_top_mols(top_file, title, ffdir_, watermodel_, nincl_, incls_, nmol_, mols_);
2358 gmx_fio_fclose(top_file);
2360 /* now merge all chains back together */
2363 for (int i = 0; (i < numChains); i++)
2365 natom += chains[i].pdba->nr;
2366 nres += chains[i].pdba->nres;
2370 init_t_atoms(atoms, natom, false);
2371 for (int i = 0; i < atoms->nres; i++)
2373 sfree(atoms->resinfo[i].name);
2375 sfree(atoms->resinfo);
2377 snew(atoms->resinfo, nres);
2381 for (int i = 0; (i < numChains); i++)
2385 printf("Including chain %d in system: %d atoms %d residues\n",
2386 i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2388 for (int j = 0; (j < chains[i].pdba->nr); j++)
2390 atoms->atom[k] = chains[i].pdba->atom[j];
2391 atoms->atom[k].resind += l; /* l is processed nr of residues */
2392 atoms->atomname[k] = chains[i].pdba->atomname[j];
2393 atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2394 copy_rvec(chains[i].x[j], x[k]);
2397 for (int j = 0; (j < chains[i].pdba->nres); j++)
2399 atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2402 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2410 fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2411 print_sums(atoms, true);
2415 fprintf(stderr, "\nWriting coordinate file...\n");
2416 clear_rvec(box_space);
2419 make_new_box(atoms->nr, x, box, box_space, false);
2421 write_sto_conf(outputConfFile_.c_str(), title, atoms, x, nullptr, ePBC, box);
2423 printf("\t\t--------- PLEASE NOTE ------------\n");
2424 printf("You have successfully generated a topology from: %s.\n",
2425 inputConfFile_.c_str());
2426 if (watermodel_ != nullptr)
2428 printf("The %s force field and the %s water model are used.\n",
2429 ffname_, watermodel_);
2433 printf("The %s force field is used.\n",
2436 printf("\t\t--------- ETON ESAELP ------------\n");
2443 const char pdb2gmxInfo::name[] = "pdb2gmx";
2444 const char pdb2gmxInfo::shortDescription[] =
2445 "Convert coordinate files to topology and FF-compliant coordinate files";
2446 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2448 return std::make_unique<pdb2gmx>();