2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
50 #include "gromacs/commandline/cmdlineoptionsmodule.h"
51 #include "gromacs/compat/make_unique.h"
52 #include "gromacs/fileio/confio.h"
53 #include "gromacs/fileio/filetypes.h"
54 #include "gromacs/fileio/gmxfio.h"
55 #include "gromacs/fileio/pdbio.h"
56 #include "gromacs/gmxlib/conformation-utilities.h"
57 #include "gromacs/gmxpreprocess/fflibutil.h"
58 #include "gromacs/gmxpreprocess/genhydro.h"
59 #include "gromacs/gmxpreprocess/h_db.h"
60 #include "gromacs/gmxpreprocess/hizzie.h"
61 #include "gromacs/gmxpreprocess/pdb2top.h"
62 #include "gromacs/gmxpreprocess/pgutil.h"
63 #include "gromacs/gmxpreprocess/resall.h"
64 #include "gromacs/gmxpreprocess/specbond.h"
65 #include "gromacs/gmxpreprocess/ter_db.h"
66 #include "gromacs/gmxpreprocess/toputil.h"
67 #include "gromacs/gmxpreprocess/xlate.h"
68 #include "gromacs/math/vec.h"
69 #include "gromacs/options/basicoptions.h"
70 #include "gromacs/options/filenameoption.h"
71 #include "gromacs/options/ioptionscontainer.h"
72 #include "gromacs/topology/atomprop.h"
73 #include "gromacs/topology/block.h"
74 #include "gromacs/topology/index.h"
75 #include "gromacs/topology/residuetypes.h"
76 #include "gromacs/topology/symtab.h"
77 #include "gromacs/topology/topology.h"
78 #include "gromacs/utility/dir_separator.h"
79 #include "gromacs/utility/exceptions.h"
80 #include "gromacs/utility/fatalerror.h"
81 #include "gromacs/utility/path.h"
82 #include "gromacs/utility/smalloc.h"
83 #include "gromacs/utility/strdb.h"
84 #include "gromacs/utility/stringutil.h"
88 char gmx[RTP_MAXCHAR+2];
89 char main[RTP_MAXCHAR+2];
90 char nter[RTP_MAXCHAR+2];
91 char cter[RTP_MAXCHAR+2];
92 char bter[RTP_MAXCHAR+2];
95 static const char *res2bb_notermini(const char *name,
96 int nrr, const rtprename_t *rr)
98 /* NOTE: This function returns the main building block name,
99 * it does not take terminal renaming into account.
104 while (i < nrr && gmx_strcasecmp(name, rr[i].gmx) != 0)
109 return (i < nrr ? rr[i].main : name);
112 static const char *select_res(int nr, int resnr,
113 const char *name[], const char *expl[],
115 int nrr, const rtprename_t *rr)
119 printf("Which %s type do you want for residue %d\n", title, resnr+1);
120 for (sel = 0; (sel < nr); sel++)
122 printf("%d. %s (%s)\n",
123 sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
125 printf("\nType a number:"); fflush(stdout);
127 if (scanf("%d", &sel) != 1)
129 gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
135 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
140 const char *lh[easpNR] = { "ASP", "ASPH" };
141 const char *expl[easpNR] = {
142 "Not protonated (charge -1)",
143 "Protonated (charge 0)"
146 return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
149 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
154 const char *lh[egluNR] = { "GLU", "GLUH" };
155 const char *expl[egluNR] = {
156 "Not protonated (charge -1)",
157 "Protonated (charge 0)"
160 return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
163 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
168 const char *lh[eglnNR] = { "GLN", "QLN" };
169 const char *expl[eglnNR] = {
170 "Not protonated (charge 0)",
171 "Protonated (charge +1)"
174 return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
177 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
182 const char *lh[elysNR] = { "LYSN", "LYS" };
183 const char *expl[elysNR] = {
184 "Not protonated (charge 0)",
185 "Protonated (charge +1)"
188 return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
191 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
196 const char *lh[eargNR] = { "ARGN", "ARG" };
197 const char *expl[eargNR] = {
198 "Not protonated (charge 0)",
199 "Protonated (charge +1)"
202 return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
205 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
207 const char *expl[ehisNR] = {
214 return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
217 static void read_rtprename(const char *fname, FILE *fp,
218 int *nrtprename, rtprename_t **rtprename)
220 char line[STRLEN], buf[STRLEN];
229 while (get_a_line(fp, line, STRLEN))
232 /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
233 * For other args, we read up to 6 chars (so we can detect if the length is > 5).
234 * Note that the buffer length has been increased to 7 to allow this,
235 * so we just need to make sure the strings have zero-length initially.
238 rr[n].main[0] = '\0';
239 rr[n].nter[0] = '\0';
240 rr[n].cter[0] = '\0';
241 rr[n].bter[0] = '\0';
242 nc = sscanf(line, "%6s %6s %6s %6s %6s %s",
243 rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
244 if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
245 strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
247 gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
248 fname, RTP_MAXCHAR, line);
253 if (nc != 2 && nc != 5)
255 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d", fname, ncol, 2, 5);
261 gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
266 /* This file does not have special termini names, copy them from main */
267 strcpy(rr[n].nter, rr[n].main);
268 strcpy(rr[n].cter, rr[n].main);
269 strcpy(rr[n].bter, rr[n].main);
279 static char *search_resrename(int nrr, rtprename_t *rr,
281 bool bStart, bool bEnd,
282 bool bCompareFFRTPname)
290 while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx) != 0) ||
291 ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
296 /* If found in the database, rename this residue's rtp building block,
297 * otherwise keep the old name.
320 gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
327 static void rename_resrtp(t_atoms *pdba, int nterpairs, const int *r_start, const int *r_end,
328 int nrr, rtprename_t *rr, t_symtab *symtab,
336 bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
338 for (r = 0; r < pdba->nres; r++)
342 for (j = 0; j < nterpairs; j++)
349 for (j = 0; j < nterpairs; j++)
357 nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
359 if (bFFRTPTERRNM && nn == nullptr && (bStart || bEnd))
361 /* This is a terminal residue, but the residue name,
362 * currently stored in .rtp, is not a standard residue name,
363 * but probably a force field specific rtp name.
364 * Check if we need to rename it because it is terminal.
366 nn = search_resrename(nrr, rr,
367 *pdba->resinfo[r].rtp, bStart, bEnd, true);
370 if (nn != nullptr && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
374 printf("Changing rtp entry of residue %d %s to '%s'\n",
375 pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
377 pdba->resinfo[r].rtp = put_symtab(symtab, nn);
382 static void pdbres_to_gmxrtp(t_atoms *pdba)
386 for (i = 0; (i < pdba->nres); i++)
388 if (pdba->resinfo[i].rtp == nullptr)
390 pdba->resinfo[i].rtp = pdba->resinfo[i].name;
395 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
396 bool bFullCompare, t_symtab *symtab)
401 for (i = 0; (i < pdba->nres); i++)
403 resnm = *pdba->resinfo[i].name;
404 if ((bFullCompare && (gmx_strcasecmp(resnm, oldnm) == 0)) ||
405 (!bFullCompare && strstr(resnm, oldnm) != nullptr))
407 /* Rename the residue name (not the rtp name) */
408 pdba->resinfo[i].name = put_symtab(symtab, newnm);
413 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
414 bool bFullCompare, t_symtab *symtab)
419 for (i = 0; (i < pdba->nres); i++)
421 /* We have not set the rtp name yes, use the residue name */
422 bbnm = *pdba->resinfo[i].name;
423 if ((bFullCompare && (gmx_strcasecmp(bbnm, oldnm) == 0)) ||
424 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
426 /* Change the rtp builing block name */
427 pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
432 static void rename_bbint(t_atoms *pdba, const char *oldnm,
433 const char *gettp(int, int, const rtprename_t *),
436 int nrr, const rtprename_t *rr)
442 for (i = 0; i < pdba->nres; i++)
444 /* We have not set the rtp name yet, use the residue name */
445 bbnm = *pdba->resinfo[i].name;
446 if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
447 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
449 ptr = gettp(i, nrr, rr);
450 pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
455 static void check_occupancy(t_atoms *atoms, const char *filename, bool bVerbose)
461 ftp = fn2ftp(filename);
462 if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
464 fprintf(stderr, "No occupancies in %s\n", filename);
468 for (i = 0; (i < atoms->nr); i++)
470 if (atoms->pdbinfo[i].occup != 1)
474 fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
475 *atoms->resinfo[atoms->atom[i].resind].name,
476 atoms->resinfo[ atoms->atom[i].resind].nr,
478 atoms->pdbinfo[i].occup);
480 if (atoms->pdbinfo[i].occup == 0)
490 if (nzero == atoms->nr)
492 fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
494 else if ((nzero > 0) || (nnotone > 0))
498 "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
499 " occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
501 nzero, nnotone, atoms->nr);
505 fprintf(stderr, "All occupancies are one\n");
510 static void write_posres(const char *fn, t_atoms *pdba, real fc)
515 fp = gmx_fio_fopen(fn, "w");
517 "; In this topology include file, you will find position restraint\n"
518 "; entries for all the heavy atoms in your original pdb file.\n"
519 "; This means that all the protons which were added by pdb2gmx are\n"
520 "; not restrained.\n"
522 "[ position_restraints ]\n"
523 "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
525 for (i = 0; (i < pdba->nr); i++)
527 if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
529 fprintf(fp, "%6d%6d %g %g %g\n", i+1, 1, fc, fc, fc);
535 static int read_pdball(const char *inf, bool bOutput, const char *outf, char **title,
536 t_atoms *atoms, rvec **x,
537 int *ePBC, matrix box, bool bRemoveH,
538 t_symtab *symtab, gmx_residuetype_t *rt, const char *watres,
539 AtomProperties *aps, bool bVerbose)
540 /* Read a pdb file. (containing proteins) */
542 int natom, new_natom, i;
545 printf("Reading %s...\n", inf);
546 readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
548 if (atoms->pdbinfo == nullptr)
550 snew(atoms->pdbinfo, atoms->nr);
552 if (fn2ftp(inf) == efPDB)
554 get_pdb_atomnumber(atoms, aps);
559 for (i = 0; i < atoms->nr; i++)
561 if (!is_hydrogen(*atoms->atomname[i]))
563 atoms->atom[new_natom] = atoms->atom[i];
564 atoms->atomname[new_natom] = atoms->atomname[i];
565 atoms->pdbinfo[new_natom] = atoms->pdbinfo[i];
566 copy_rvec((*x)[i], (*x)[new_natom]);
570 atoms->nr = new_natom;
577 printf(" '%s',", *title);
579 printf(" %d atoms\n", natom);
581 /* Rename residues */
582 rename_pdbres(atoms, "HOH", watres, false, symtab);
583 rename_pdbres(atoms, "SOL", watres, false, symtab);
584 rename_pdbres(atoms, "WAT", watres, false, symtab);
586 rename_atoms("xlateat.dat", nullptr,
587 atoms, symtab, nullptr, true, rt, true, bVerbose);
595 write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
601 static void process_chain(t_atoms *pdba, rvec *x,
602 bool bTrpU, bool bPheU, bool bTyrU,
603 bool bLysMan, bool bAspMan, bool bGluMan,
604 bool bHisMan, bool bArgMan, bool bGlnMan,
605 real angle, real distance, t_symtab *symtab,
606 int nrr, const rtprename_t *rr)
608 /* Rename aromatics, lys, asp and histidine */
611 rename_bb(pdba, "TYR", "TYRU", false, symtab);
615 rename_bb(pdba, "TRP", "TRPU", false, symtab);
619 rename_bb(pdba, "PHE", "PHEU", false, symtab);
623 rename_bbint(pdba, "LYS", get_lystp, false, symtab, nrr, rr);
627 rename_bbint(pdba, "ARG", get_argtp, false, symtab, nrr, rr);
631 rename_bbint(pdba, "GLN", get_glntp, false, symtab, nrr, rr);
635 rename_bbint(pdba, "ASP", get_asptp, false, symtab, nrr, rr);
639 rename_bb(pdba, "ASPH", "ASP", false, symtab);
643 rename_bbint(pdba, "GLU", get_glutp, false, symtab, nrr, rr);
647 rename_bb(pdba, "GLUH", "GLU", false, symtab);
652 set_histp(pdba, x, angle, distance);
656 rename_bbint(pdba, "HIS", get_histp, true, symtab, nrr, rr);
659 /* Initialize the rtp builing block names with the residue names
660 * for the residues that have not been processed above.
662 pdbres_to_gmxrtp(pdba);
664 /* Now we have all rtp names set.
665 * The rtp names will conform to Gromacs naming,
666 * unless the input pdb file contained one or more force field specific
667 * rtp names as residue names.
671 /* struct for sorting the atoms from the pdb file */
673 int resnr; /* residue number */
674 int j; /* database order index */
675 int index; /* original atom number */
676 char anm1; /* second letter of atom name */
677 char altloc; /* alternate location indicator */
680 static bool pdbicomp(const t_pdbindex &a, const t_pdbindex &b)
682 int d = (a.resnr - b.resnr);
688 d = (a.anm1 - b.anm1);
691 d = (a.altloc - b.altloc);
698 static void sort_pdbatoms(t_restp restp[],
699 int natoms, t_atoms **pdbaptr, rvec **x,
700 t_blocka *block, char ***gnames)
702 t_atoms *pdba, *pdbnew;
716 for (i = 0; i < natoms; i++)
718 atomnm = *pdba->atomname[i];
719 rptr = &restp[pdba->atom[i].resind];
720 for (j = 0; (j < rptr->natom); j++)
722 if (gmx_strcasecmp(atomnm, *(rptr->atomname[j])) == 0)
727 if (j == rptr->natom)
732 "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
733 "while sorting atoms.\n%s", atomnm,
734 *pdba->resinfo[pdba->atom[i].resind].name,
735 pdba->resinfo[pdba->atom[i].resind].nr,
738 is_hydrogen(atomnm) ?
739 "\nFor a hydrogen, this can be a different protonation state, or it\n"
740 "might have had a different number in the PDB file and was rebuilt\n"
741 "(it might for instance have been H3, and we only expected H1 & H2).\n"
742 "Note that hydrogens might have been added to the entry for the N-terminus.\n"
743 "Remove this hydrogen or choose a different protonation state to solve it.\n"
744 "Option -ignh will ignore all hydrogens in the input." : ".");
745 gmx_fatal(FARGS, "%s", buf);
747 /* make shadow array to be sorted into indexgroup */
748 pdbi[i].resnr = pdba->atom[i].resind;
751 pdbi[i].anm1 = atomnm[1];
752 pdbi[i].altloc = pdba->pdbinfo[i].altloc;
754 std::sort(pdbi, pdbi+natoms, pdbicomp);
756 /* pdba is sorted in pdbnew using the pdbi index */
759 init_t_atoms(pdbnew, natoms, true);
761 pdbnew->nr = pdba->nr;
762 pdbnew->nres = pdba->nres;
763 sfree(pdbnew->resinfo);
764 pdbnew->resinfo = pdba->resinfo;
765 for (i = 0; i < natoms; i++)
767 pdbnew->atom[i] = pdba->atom[pdbi[i].index];
768 pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
769 pdbnew->pdbinfo[i] = pdba->pdbinfo[pdbi[i].index];
770 copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
771 /* make indexgroup in block */
772 a[i] = pdbi[i].index;
775 sfree(pdba->atomname);
777 sfree(pdba->pdbinfo);
780 /* copy the sorted pdbnew back to pdba */
783 add_grp(block, gnames, natoms, a, "prot_sort");
789 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], bool bVerbose)
791 int i, j, oldnatoms, ndel;
794 printf("Checking for duplicate atoms....\n");
795 oldnatoms = pdba->nr;
797 /* NOTE: pdba->nr is modified inside the loop */
798 for (i = 1; (i < pdba->nr); i++)
800 /* compare 'i' and 'i-1', throw away 'i' if they are identical
801 this is a 'while' because multiple alternate locations can be present */
802 while ( (i < pdba->nr) &&
803 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
804 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
809 ri = &pdba->resinfo[pdba->atom[i].resind];
810 printf("deleting duplicate atom %4s %s%4d%c",
811 *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
812 if (ri->chainid && (ri->chainid != ' '))
814 printf(" ch %c", ri->chainid);
818 if (pdba->pdbinfo[i].atomnr)
820 printf(" pdb nr %4d", pdba->pdbinfo[i].atomnr);
822 if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
824 printf(" altloc %c", pdba->pdbinfo[i].altloc);
830 /* We can not free, since it might be in the symtab */
831 /* sfree(pdba->atomname[i]); */
832 for (j = i; j < pdba->nr; j++)
834 pdba->atom[j] = pdba->atom[j+1];
835 pdba->atomname[j] = pdba->atomname[j+1];
838 pdba->pdbinfo[j] = pdba->pdbinfo[j+1];
840 copy_rvec(x[j+1], x[j]);
842 srenew(pdba->atom, pdba->nr);
843 /* srenew(pdba->atomname, pdba->nr); */
844 srenew(pdba->pdbinfo, pdba->nr);
847 if (pdba->nr != oldnatoms)
849 printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
856 checkResidueTypeSanity(t_atoms * pdba,
859 gmx_residuetype_t * rt)
861 std::string startResidueString = gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
862 std::string endResidueString = gmx::formatString("%s%d", *pdba->resinfo[r1-1].name, pdba->resinfo[r1-1].nr);
864 // Check whether all residues in chain have the same chain ID.
865 bool allResiduesHaveSameChainID = true;
866 char chainID0 = pdba->resinfo[r0].chainid;
868 std::string residueString;
870 for (int i = r0 + 1; i < r1; i++)
872 chainID = pdba->resinfo[i].chainid;
873 if (chainID != chainID0)
875 allResiduesHaveSameChainID = false;
876 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
881 if (!allResiduesHaveSameChainID)
884 "The chain covering the range %s--%s does not have a consistent chain ID. "
885 "The first residue has ID '%c', while residue %s has ID '%c'.",
886 startResidueString.c_str(), endResidueString.c_str(),
887 chainID0, residueString.c_str(), chainID);
890 // At this point all residues have the same ID. If they are also non-blank
891 // we can be a bit more aggressive and require the types match too.
894 bool allResiduesHaveSameType = true;
895 const char *restype0;
897 gmx_residuetype_get_type(rt, *pdba->resinfo[r0].name, &restype0);
899 for (int i = r0 + 1; i < r1; i++)
901 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &restype);
902 if (gmx_strcasecmp(restype, restype0))
904 allResiduesHaveSameType = false;
905 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
910 if (!allResiduesHaveSameType)
913 "The residues in the chain %s--%s do not have a consistent type. "
914 "The first residue has type '%s', while residue %s is of type '%s'. "
915 "Either there is a mistake in your chain, or it includes nonstandard "
916 "residue names that have not yet been added to the residuetypes.dat "
917 "file in the GROMACS library directory. If there are other molecules "
918 "such as ligands, they should not have the same chain ID as the "
919 "adjacent protein chain since it's a separate molecule.",
920 startResidueString.c_str(), endResidueString.c_str(),
921 restype0, residueString.c_str(), restype);
926 static void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
927 gmx_residuetype_t *rt)
930 const char *p_startrestype;
931 const char *p_restype;
936 int startWarnings = 0;
940 // Check that all residues have the same chain identifier, and if it is
941 // non-blank we also require the residue types to match.
942 checkResidueTypeSanity(pdba, r0, r1, rt);
944 // If we return correctly from checkResidueTypeSanity(), the only
945 // remaining cases where we can have non-matching residue types is if
946 // the chain ID was blank, which could be the case e.g. for a structure
947 // read from a GRO file or other file types without chain information.
948 // In that case we need to be a bit more liberal and detect chains based
949 // on the residue type.
951 // If we get here, the chain ID must be identical for all residues
952 char chainID = pdba->resinfo[r0].chainid;
954 /* Find the starting terminus (typially N or 5') */
955 for (i = r0; i < r1 && *r_start == -1; i++)
957 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_startrestype);
958 if (!gmx_strcasecmp(p_startrestype, "Protein") || !gmx_strcasecmp(p_startrestype, "DNA") || !gmx_strcasecmp(p_startrestype, "RNA") )
960 printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
963 else if (!gmx_strcasecmp(p_startrestype, "Ion"))
967 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
971 printf("Disabling further notes about ions.\n");
977 if (startWarnings < 5)
981 printf("\nWarning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n"
982 "This chain lacks identifiers, which makes it impossible to do strict\n"
983 "classification of the start/end residues. Here we need to guess this residue\n"
984 "should not be part of the chain and instead introduce a break, but that will\n"
985 "be catastrophic if they should in fact be linked. Please check your structure,\n"
986 "and add %s to residuetypes.dat if this was not correct.\n\n",
987 *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
991 printf("\nWarning: No residues in chain starting at %s%d identified as Protein/RNA/DNA.\n"
992 "This makes it impossible to link them into a molecule, which could either be\n"
993 "correct or a catastrophic error. Please check your structure, and add all\n"
994 "necessary residue names to residuetypes.dat if this was not correct.\n\n",
995 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
998 if (startWarnings == 4)
1000 printf("Disabling further warnings about unidentified residues at start of chain.\n");
1008 /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1009 for (i = *r_start; i < r1; i++)
1011 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_restype);
1012 if (!gmx_strcasecmp(p_restype, p_startrestype) && endWarnings == 0)
1016 else if (!gmx_strcasecmp(p_startrestype, "Ion"))
1020 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1024 printf("Disabling further notes about ions.\n");
1030 // This can only trigger if the chain ID is blank - otherwise the
1031 // call to checkResidueTypeSanity() will have caught the problem.
1032 if (endWarnings < 5)
1034 printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1035 "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1036 "it impossible to do strict classification of the start/end residues. Here we\n"
1037 "need to guess this residue should not be part of the chain and instead\n"
1038 "introduce a break, but that will be catastrophic if they should in fact be\n"
1039 "linked. Please check your structure, and add %s to residuetypes.dat\n"
1040 "if this was not correct.\n\n",
1041 *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype,
1042 *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype, *pdba->resinfo[i].name);
1044 if (endWarnings == 4)
1046 printf("Disabling further warnings about unidentified residues at end of chain.\n");
1055 printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1059 /* enum for chain separation */
1061 enChainSep_id_or_ter, enChainSep_id_and_ter, enChainSep_ter,
1062 enChainSep_id, enChainSep_interactive
1064 static const char *ChainSepEnum[] = {"id_or_ter", "id_and_ter", "ter", "id", "interactive"};
1065 static const char *ChainSepInfoString[] =
1067 "Splitting chemical chains based on TER records or chain id changing.\n",
1068 "Splitting chemical chains based on TER records and chain id changing.\n",
1069 "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1070 "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1071 "Splitting chemical chains interactively.\n"
1075 modify_chain_numbers(t_atoms * pdba,
1076 ChainSepType enumChainSep)
1079 char old_prev_chainid;
1080 char old_this_chainid;
1081 int old_prev_chainnum;
1082 int old_this_chainnum;
1084 char select[STRLEN];
1088 const char * prev_atomname;
1089 const char * this_atomname;
1090 const char * prev_resname;
1091 const char * this_resname;
1097 /* The default chain enumeration is based on TER records only */
1098 printf("%s", ChainSepInfoString[enumChainSep]);
1100 old_prev_chainid = '?';
1101 old_prev_chainnum = -1;
1104 this_atomname = nullptr;
1106 this_resname = nullptr;
1110 for (i = 0; i < pdba->nres; i++)
1112 ri = &pdba->resinfo[i];
1113 old_this_chainid = ri->chainid;
1114 old_this_chainnum = ri->chainnum;
1116 prev_atomname = this_atomname;
1117 prev_atomnum = this_atomnum;
1118 prev_resname = this_resname;
1119 prev_resnum = this_resnum;
1120 prev_chainid = this_chainid;
1122 this_atomname = *(pdba->atomname[i]);
1123 this_atomnum = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1124 this_resname = *ri->name;
1125 this_resnum = ri->nr;
1126 this_chainid = ri->chainid;
1128 switch (enumChainSep)
1130 case enChainSep_id_or_ter:
1131 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1137 case enChainSep_id_and_ter:
1138 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1145 if (old_this_chainid != old_prev_chainid)
1151 case enChainSep_ter:
1152 if (old_this_chainnum != old_prev_chainnum)
1157 case enChainSep_interactive:
1158 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1162 printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1164 "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1165 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1166 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1168 if (nullptr == fgets(select, STRLEN-1, stdin))
1170 gmx_fatal(FARGS, "Error reading from stdin");
1173 if (i == 0 || select[0] == 'y')
1180 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1182 old_prev_chainid = old_this_chainid;
1183 old_prev_chainnum = old_this_chainnum;
1185 ri->chainnum = new_chainnum;
1213 // TODO make all enums into scoped enums
1214 /* enum for vsites */
1216 enVSites_none, enVSites_hydrogens, enVSites_aromatics
1218 static const char *VSitesEnum[] = {"none", "hydrogens", "aromatics"};
1220 /* enum for water model */
1222 enWater_select, enWater_none, enWater_spc, enWater_spce,
1223 enWater_tip3p, enWater_tip4p, enWater_tip5p, enWater_tips3p
1225 static const char *WaterEnum[] = {
1226 "select", "none", "spc", "spce",
1227 "tip3p", "tip4p", "tip5p", "tips3p"
1230 /* enum for merge */
1232 enMerge_no, enMerge_all, enMerge_interactive
1234 static const char *MergeEnum[] = {"no", "all", "interactive"};
1242 class pdb2gmx : public ICommandLineOptionsModule
1246 bVsites_(FALSE), bPrevWat_(FALSE), bVsiteAromatics_(FALSE),
1247 enumChainSep_(enChainSep_id_or_ter),
1248 enumVSites_(enVSites_none),
1249 enumWater_(enWater_select),
1250 enumMerge_(enMerge_no),
1260 // From ICommandLineOptionsModule
1261 void init(CommandLineModuleSettings * /*settings*/) override
1265 void initOptions(IOptionsContainer *options,
1266 ICommandLineOptionsModuleSettings *settings) override;
1268 void optionsFinished() override;
1286 bool bAllowMissing_;
1290 bool bChargeGroups_;
1300 bool bVsiteAromatics_;
1304 real long_bond_dist_;
1305 real short_bond_dist_;
1307 std::string indexOutputFile_;
1308 std::string outputFile_;
1309 std::string topologyFile_;
1310 std::string includeTopologyFile_;
1311 std::string outputConfFile_;
1312 std::string inputConfFile_;
1313 std::string outFile_;
1316 ChainSepType enumChainSep_;
1317 VSitesType enumVSites_;
1318 WaterType enumWater_;
1319 MergeType enumMerge_;
1322 char forcefield_[STRLEN];
1323 char ffdir_[STRLEN];
1333 void pdb2gmx::initOptions(IOptionsContainer *options,
1334 ICommandLineOptionsModuleSettings *settings)
1336 const char *desc[] = {
1337 "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1338 "some database files, adds hydrogens to the molecules and generates",
1339 "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1340 "and a topology in GROMACS format.",
1341 "These files can subsequently be processed to generate a run input file.",
1343 "[THISMODULE] will search for force fields by looking for",
1344 "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1345 "of the current working directory and of the GROMACS library directory",
1346 "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1348 "By default the forcefield selection is interactive,",
1349 "but you can use the [TT]-ff[tt] option to specify one of the short names",
1350 "in the list on the command line instead. In that case [THISMODULE] just looks",
1351 "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1353 "After choosing a force field, all files will be read only from",
1354 "the corresponding force field directory.",
1355 "If you want to modify or add a residue types, you can copy the force",
1356 "field directory from the GROMACS library directory to your current",
1357 "working directory. If you want to add new protein residue types,",
1358 "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1359 "or copy the whole library directory to a local directory and set",
1360 "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1361 "Check Chapter 5 of the manual for more information about file formats.",
1364 "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1365 "need not necessarily contain a protein structure. Every kind of",
1366 "molecule for which there is support in the database can be converted.",
1367 "If there is no support in the database, you can add it yourself.[PAR]",
1369 "The program has limited intelligence, it reads a number of database",
1370 "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1371 "if necessary this can be done manually. The program can prompt the",
1372 "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1373 "desired. For Lys the choice is between neutral (two protons on NZ) or",
1374 "protonated (three protons, default), for Asp and Glu unprotonated",
1375 "(default) or protonated, for His the proton can be either on ND1,",
1376 "on NE2 or on both. By default these selections are done automatically.",
1377 "For His, this is based on an optimal hydrogen bonding",
1378 "conformation. Hydrogen bonds are defined based on a simple geometric",
1379 "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1380 "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1381 "[TT]-dist[tt] respectively.[PAR]",
1383 "The protonation state of N- and C-termini can be chosen interactively",
1384 "with the [TT]-ter[tt] flag. Default termini are ionized (NH3+ and COO-),",
1385 "respectively. Some force fields support zwitterionic forms for chains of",
1386 "one residue, but for polypeptides these options should NOT be selected.",
1387 "The AMBER force fields have unique forms for the terminal residues,",
1388 "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1389 "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1390 "respectively to use these forms, making sure you preserve the format",
1391 "of the coordinate file. Alternatively, use named terminating residues",
1392 "(e.g. ACE, NME).[PAR]",
1394 "The separation of chains is not entirely trivial since the markup",
1395 "in user-generated PDB files frequently varies and sometimes it",
1396 "is desirable to merge entries across a TER record, for instance",
1397 "if you want a disulfide bridge or distance restraints between",
1398 "two protein chains or if you have a HEME group bound to a protein.",
1399 "In such cases multiple chains should be contained in a single",
1400 "[TT]moleculetype[tt] definition.",
1401 "To handle this, [THISMODULE] uses two separate options.",
1402 "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1403 "start, and termini added when applicable. This can be done based on the",
1404 "existence of TER records, when the chain id changes, or combinations of either",
1405 "or both of these. You can also do the selection fully interactively.",
1406 "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1407 "are merged into one moleculetype, after adding all the chemical termini (or not).",
1408 "This can be turned off (no merging), all non-water chains can be merged into a",
1409 "single molecule, or the selection can be done interactively.[PAR]",
1411 "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1412 "If any of the occupancies are not one, indicating that the atom is",
1413 "not resolved well in the structure, a warning message is issued.",
1414 "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1415 "all occupancy fields may be zero. Either way, it is up to the user",
1416 "to verify the correctness of the input data (read the article!).[PAR]",
1418 "During processing the atoms will be reordered according to GROMACS",
1419 "conventions. With [TT]-n[tt] an index file can be generated that",
1420 "contains one group reordered in the same way. This allows you to",
1421 "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1422 "one limitation: reordering is done after the hydrogens are stripped",
1423 "from the input and before new hydrogens are added. This means that",
1424 "you should not use [TT]-ignh[tt].[PAR]",
1426 "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1427 "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1428 "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1431 "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1432 "motions. Angular and out-of-plane motions can be removed by changing",
1433 "hydrogens into virtual sites and fixing angles, which fixes their",
1434 "position relative to neighboring atoms. Additionally, all atoms in the",
1435 "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1436 "can be converted into virtual sites, eliminating the fast improper dihedral",
1437 "fluctuations in these rings (but this feature is deprecated).",
1438 "[BB]Note[bb] that in this case all other hydrogen",
1439 "atoms are also converted to virtual sites. The mass of all atoms that are",
1440 "converted into virtual sites, is added to the heavy atoms.[PAR]",
1441 "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1442 "done by increasing the hydrogen-mass by a factor of 4. This is also",
1443 "done for water hydrogens to slow down the rotational motion of water.",
1444 "The increase in mass of the hydrogens is subtracted from the bonded",
1445 "(heavy) atom so that the total mass of the system remains the same."
1448 settings->setHelpText(desc);
1450 options->addOption(BooleanOption("newrtp")
1451 .store(&bNewRTP_).defaultValue(false).hidden()
1452 .description("Write the residue database in new format to [TT]new.rtp[tt]"));
1453 options->addOption(RealOption("lb")
1454 .store(&long_bond_dist_).defaultValue(0.25).hidden()
1455 .description("Long bond warning distance"));
1456 options->addOption(RealOption("sb")
1457 .store(&short_bond_dist_).defaultValue(0.05).hidden()
1458 .description("Short bond warning distance"));
1459 options->addOption(EnumOption<ChainSepType>("chainsep").enumValue(ChainSepEnum)
1460 .store(&enumChainSep_)
1461 .description("Condition in PDB files when a new chain should be started (adding termini)"));
1462 options->addOption(EnumOption<MergeType>("merge").enumValue(MergeEnum)
1464 .description("Merge multiple chains into a single [moleculetype]"));
1465 options->addOption(StringOption("ff")
1466 .store(&ff_).defaultValue("select")
1467 .description("Force field, interactive by default. Use [TT]-h[tt] for information."));
1468 options->addOption(EnumOption<WaterType>("water")
1469 .store(&enumWater_).enumValue(WaterEnum)
1470 .description("Water model to use"));
1471 options->addOption(BooleanOption("inter")
1472 .store(&bInter_).defaultValue(false)
1473 .description("Set the next 8 options to interactive"));
1474 options->addOption(BooleanOption("ss")
1475 .store(&bCysMan_).defaultValue(false)
1476 .description("Interactive SS bridge selection"));
1477 options->addOption(BooleanOption("ter")
1478 .store(&bTerMan_).defaultValue(false)
1479 .description("Interactive termini selection, instead of charged (default)"));
1480 options->addOption(BooleanOption("lys")
1481 .store(&bLysMan_).defaultValue(false)
1482 .description("Interactive lysine selection, instead of charged"));
1483 options->addOption(BooleanOption("arg")
1484 .store(&bArgMan_).defaultValue(false)
1485 .description("Interactive arginine selection, instead of charged"));
1486 options->addOption(BooleanOption("asp")
1487 .store(&bAspMan_).defaultValue(false)
1488 .description("Interactive aspartic acid selection, instead of charged"));
1489 options->addOption(BooleanOption("glu")
1490 .store(&bGluMan_).defaultValue(false)
1491 .description("Interactive glutamic acid selection, instead of charged"));
1492 options->addOption(BooleanOption("gln")
1493 .store(&bGlnMan_).defaultValue(false)
1494 .description("Interactive glutamine selection, instead of charged"));
1495 options->addOption(BooleanOption("his")
1496 .store(&bHisMan_).defaultValue(false)
1497 .description("Interactive histidine selection, instead of checking H-bonds"));
1498 options->addOption(RealOption("angle")
1499 .store(&angle_).defaultValue(135.0)
1500 .description("Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1501 options->addOption(RealOption("dist")
1502 .store(&distance_).defaultValue(0.3)
1503 .description("Maximum donor-acceptor distance for a H-bond (nm)"));
1504 options->addOption(BooleanOption("una")
1505 .store(&bUnA_).defaultValue(false)
1506 .description("Select aromatic rings with united CH atoms on phenylalanine, tryptophane and tyrosine"));
1507 options->addOption(BooleanOption("sort")
1508 .store(&bSort_).defaultValue(true).hidden()
1509 .description("Sort the residues according to database, turning this off is dangerous as charge groups might be broken in parts"));
1510 options->addOption(BooleanOption("ignh")
1511 .store(&bRemoveH_).defaultValue(false)
1512 .description("Ignore hydrogen atoms that are in the coordinate file"));
1513 options->addOption(BooleanOption("missing")
1514 .store(&bAllowMissing_).defaultValue(false)
1515 .description("Continue when atoms are missing and bonds cannot be made, dangerous"));
1516 options->addOption(BooleanOption("v")
1517 .store(&bVerbose_).defaultValue(false)
1518 .description("Be slightly more verbose in messages"));
1519 options->addOption(RealOption("posrefc")
1520 .store(&posre_fc_).defaultValue(1000)
1521 .description("Force constant for position restraints"));
1522 options->addOption(EnumOption<VSitesType>("vsite")
1523 .store(&enumVSites_).enumValue(VSitesEnum)
1524 .description("Convert atoms to virtual sites"));
1525 options->addOption(BooleanOption("heavyh")
1526 .store(&bHeavyH_).defaultValue(false)
1527 .description("Make hydrogen atoms heavy"));
1528 options->addOption(BooleanOption("deuterate")
1529 .store(&bDeuterate_).defaultValue(false)
1530 .description("Change the mass of hydrogens to 2 amu"));
1531 options->addOption(BooleanOption("chargegrp")
1532 .store(&bChargeGroups_).defaultValue(true)
1533 .description("Use charge groups in the [REF].rtp[ref] file"));
1534 options->addOption(BooleanOption("cmap")
1535 .store(&bCmap_).defaultValue(true)
1536 .description("Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1537 options->addOption(BooleanOption("renum")
1538 .store(&bRenumRes_).defaultValue(false)
1539 .description("Renumber the residues consecutively in the output"));
1540 options->addOption(BooleanOption("rtpres")
1541 .store(&bRTPresname_).defaultValue(false)
1542 .description("Use [REF].rtp[ref] entry names as residue names"));
1543 options->addOption(FileNameOption("f")
1544 .legacyType(efSTX).inputFile()
1545 .store(&inputConfFile_).required()
1546 .defaultBasename("protein").defaultType(efPDB)
1547 .description("Structure file"));
1548 options->addOption(FileNameOption("o")
1549 .legacyType(efSTO).outputFile()
1550 .store(&outputConfFile_).required()
1551 .defaultBasename("conf")
1552 .description("Structure file"));
1553 options->addOption(FileNameOption("p")
1554 .legacyType(efTOP).outputFile()
1555 .store(&topologyFile_).required()
1556 .defaultBasename("topol")
1557 .description("Topology file"));
1558 options->addOption(FileNameOption("i")
1559 .legacyType(efITP).outputFile()
1560 .store(&includeTopologyFile_).required()
1561 .defaultBasename("posre")
1562 .description("Include file for topology"));
1563 options->addOption(FileNameOption("n")
1564 .legacyType(efNDX).outputFile()
1565 .store(&indexOutputFile_).storeIsSet(&bIndexSet_)
1566 .defaultBasename("index")
1567 .description("Index file"));
1568 options->addOption(FileNameOption("q")
1569 .legacyType(efSTO).outputFile()
1570 .store(&outFile_).storeIsSet(&bOutputSet_)
1571 .defaultBasename("clean").defaultType(efPDB)
1572 .description("Structure file"));
1575 void pdb2gmx::optionsFinished()
1577 if (inputConfFile_.empty())
1579 GMX_THROW(InconsistentInputError("You must supply an input file"));
1583 /* if anything changes here, also change description of -inter */
1598 else if (bDeuterate_)
1607 /* Force field selection, interactive or direct */
1608 choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(),
1609 forcefield_, sizeof(forcefield_),
1610 ffdir_, sizeof(ffdir_));
1612 if (strlen(forcefield_) > 0)
1614 ffname_ = forcefield_;
1615 ffname_[0] = std::toupper(ffname_[0]);
1619 gmx_fatal(FARGS, "Empty forcefield string");
1625 char select[STRLEN];
1628 t_hackblock *hb_chain;
1629 t_restp *restp_chain;
1633 const char *prev_atomname;
1634 const char *this_atomname;
1635 const char *prev_resname;
1636 const char *this_resname;
1641 int prev_chainnumber;
1642 int this_chainnumber;
1643 int this_chainstart;
1644 int prev_chainstart;
1646 printf("\nUsing the %s force field in directory %s\n\n",
1649 choose_watermodel(WaterEnum[enumWater_], ffdir_, &watermodel_);
1651 switch (enumVSites_)
1655 bVsiteAromatics_ = false;
1657 case enVSites_hydrogens:
1659 bVsiteAromatics_ = false;
1661 case enVSites_aromatics:
1663 bVsiteAromatics_ = true;
1666 gmx_fatal(FARGS, "Internal inconsistency: VSitesEnum='%s'", VSitesEnum[enumVSites_]);
1669 /* Open the symbol table */
1671 open_symtab(&symtab);
1673 /* Residue type database */
1674 gmx_residuetype_t *rt;
1675 gmx_residuetype_init(&rt);
1677 /* Read residue renaming database(s), if present */
1678 std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
1681 rtprename_t *rtprename = nullptr;
1682 for (const auto &filename : rrn)
1684 printf("going to rename %s\n", filename.c_str());
1685 FILE *fp = fflib_open(filename);
1686 read_rtprename(filename.c_str(), fp, &nrtprename, &rtprename);
1690 /* Add all alternative names from the residue renaming database to the list
1691 of recognized amino/nucleic acids. */
1692 const char *p_restype;
1693 for (int i = 0; i < nrtprename; i++)
1695 int rc = gmx_residuetype_get_type(rt, rtprename[i].gmx, &p_restype);
1697 /* Only add names if the 'standard' gromacs/iupac base name was found */
1700 gmx_residuetype_add(rt, rtprename[i].main, p_restype);
1701 gmx_residuetype_add(rt, rtprename[i].nter, p_restype);
1702 gmx_residuetype_add(rt, rtprename[i].cter, p_restype);
1703 gmx_residuetype_add(rt, rtprename[i].bter, p_restype);
1710 if (watermodel_ != nullptr && (strstr(watermodel_, "4p") ||
1711 strstr(watermodel_, "4P")))
1715 else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") ||
1716 strstr(watermodel_, "5P")))
1730 int natom = read_pdball(inputConfFile_.c_str(), bOutputSet_, outFile_.c_str(),
1731 &title, &pdba_all, &pdbx, &ePBC, box, bRemoveH_,
1732 &symtab, rt, watres, &aps, bVerbose_);
1736 std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
1737 GMX_THROW(InconsistentInputError(message));
1740 printf("Analyzing pdb file\n");
1741 int nwaterchain = 0;
1743 modify_chain_numbers(&pdba_all, enumChainSep_);
1745 int nchainmerges = 0;
1747 this_atomname = nullptr;
1749 this_resname = nullptr;
1752 this_chainnumber = -1;
1753 this_chainstart = 0;
1754 /* Keep the compiler happy */
1755 prev_chainstart = 0;
1760 snew(pdb_ch, maxch);
1763 bool bMerged = false;
1764 for (int i = 0; (i < natom); i++)
1766 ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1768 /* TODO this should live in a helper object, and consolidate
1769 that with code in modify_chain_numbers */
1770 prev_atomname = this_atomname;
1771 prev_atomnum = this_atomnum;
1772 prev_resname = this_resname;
1773 prev_resnum = this_resnum;
1774 prev_chainid = this_chainid;
1775 prev_chainnumber = this_chainnumber;
1778 prev_chainstart = this_chainstart;
1781 this_atomname = *pdba_all.atomname[i];
1782 this_atomnum = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1783 this_resname = *ri->name;
1784 this_resnum = ri->nr;
1785 this_chainid = ri->chainid;
1786 this_chainnumber = ri->chainnum;
1788 bWat_ = gmx_strcasecmp(*ri->name, watres) == 0;
1790 if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
1792 this_chainstart = pdba_all.atom[i].resind;
1794 if (i > 0 && !bWat_)
1796 if (!strncmp(MergeEnum[enumMerge_], "int", 3))
1798 printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1799 "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1800 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1801 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1803 if (nullptr == fgets(select, STRLEN-1, stdin))
1805 gmx_fatal(FARGS, "Error reading from stdin");
1807 bMerged = (select[0] == 'y');
1809 else if (!strncmp(MergeEnum[enumMerge_], "all", 3))
1817 pdb_ch[numChains-1].chainstart[pdb_ch[numChains-1].nterpairs] =
1818 pdba_all.atom[i].resind - prev_chainstart;
1819 pdb_ch[numChains-1].nterpairs++;
1820 srenew(pdb_ch[numChains-1].chainstart, pdb_ch[numChains-1].nterpairs+1);
1825 /* set natom for previous chain */
1828 pdb_ch[numChains-1].natom = i-pdb_ch[numChains-1].start;
1835 /* check if chain identifier was used before */
1836 for (int j = 0; (j < numChains); j++)
1838 if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1840 printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1841 "They will be treated as separate chains unless you reorder your file.\n",
1845 // TODO This is too convoluted. Use a std::vector
1846 if (numChains == maxch)
1849 srenew(pdb_ch, maxch);
1851 pdb_ch[numChains].chainid = ri->chainid;
1852 pdb_ch[numChains].chainnum = ri->chainnum;
1853 pdb_ch[numChains].start = i;
1854 pdb_ch[numChains].bAllWat = bWat_;
1857 pdb_ch[numChains].nterpairs = 0;
1861 pdb_ch[numChains].nterpairs = 1;
1863 snew(pdb_ch[numChains].chainstart, pdb_ch[numChains].nterpairs+1);
1864 /* modified [numChains] to [0] below */
1865 pdb_ch[numChains].chainstart[0] = 0;
1871 pdb_ch[numChains-1].natom = natom-pdb_ch[numChains-1].start;
1873 /* set all the water blocks at the end of the chain */
1875 snew(swap_index, numChains);
1877 for (int i = 0; i < numChains; i++)
1879 if (!pdb_ch[i].bAllWat)
1885 for (int i = 0; i < numChains; i++)
1887 if (pdb_ch[i].bAllWat)
1893 if (nwaterchain > 1)
1895 printf("Moved all the water blocks to the end\n");
1900 snew(chains, numChains);
1901 /* copy pdb data and x for all chains */
1902 for (int i = 0; (i < numChains); i++)
1904 int si = swap_index[i];
1905 chains[i].chainid = pdb_ch[si].chainid;
1906 chains[i].chainnum = pdb_ch[si].chainnum;
1907 chains[i].bAllWat = pdb_ch[si].bAllWat;
1908 chains[i].nterpairs = pdb_ch[si].nterpairs;
1909 chains[i].chainstart = pdb_ch[si].chainstart;
1910 snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1911 snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1912 snew(chains[i].r_start, pdb_ch[si].nterpairs);
1913 snew(chains[i].r_end, pdb_ch[si].nterpairs);
1915 snew(chains[i].pdba, 1);
1916 init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
1917 snew(chains[i].x, chains[i].pdba->nr);
1918 for (j = 0; j < chains[i].pdba->nr; j++)
1920 chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1921 snew(chains[i].pdba->atomname[j], 1);
1922 *chains[i].pdba->atomname[j] =
1923 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1924 chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1925 copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1927 /* Re-index the residues assuming that the indices are continuous */
1928 int k = chains[i].pdba->atom[0].resind;
1929 int nres = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1930 chains[i].pdba->nres = nres;
1931 for (int j = 0; j < chains[i].pdba->nr; j++)
1933 chains[i].pdba->atom[j].resind -= k;
1935 srenew(chains[i].pdba->resinfo, nres);
1936 for (int j = 0; j < nres; j++)
1938 chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1939 snew(chains[i].pdba->resinfo[j].name, 1);
1940 *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1941 /* make all chain identifiers equal to that of the chain */
1942 chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1946 if (nchainmerges > 0)
1948 printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1952 printf("There are %d chains and %d blocks of water and "
1953 "%d residues with %d atoms\n",
1954 numChains-nwaterchain, nwaterchain,
1955 pdba_all.nres, natom);
1957 printf("\n %5s %4s %6s\n", "chain", "#res", "#atoms");
1958 for (int i = 0; (i < numChains); i++)
1960 printf(" %d '%c' %5d %6d %s\n",
1961 i+1, chains[i].chainid ? chains[i].chainid : '-',
1962 chains[i].pdba->nres, chains[i].pdba->nr,
1963 chains[i].bAllWat ? "(only water)" : "");
1967 check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_);
1969 /* Read atomtypes... */
1970 gpp_atomtype_t atype = read_atype(ffdir_, &symtab);
1972 /* read residue database */
1973 printf("Reading residue database... (%s)\n", forcefield_);
1974 std::vector<std::string> rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
1976 t_restp *restp = nullptr;
1977 for (const auto &filename : rtpf)
1979 read_resall(filename.c_str(), &nrtp, &restp, atype, &symtab, false);
1983 /* Not correct with multiple rtp input files with different bonded types */
1984 FILE *fp = gmx_fio_fopen("new.rtp", "w");
1985 print_resall(fp, nrtp, restp, atype);
1989 /* read hydrogen database */
1991 int nah = read_h_db(ffdir_, &ah);
1993 /* Read Termini database... */
1997 t_hackblock **tdblist;
1998 int nNtdb = read_ter_db(ffdir_, 'n', &ntdb, atype);
1999 int nCtdb = read_ter_db(ffdir_, 'c', &ctdb, atype);
2001 FILE *top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2003 print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2007 for (int chain = 0; (chain < numChains); chain++)
2009 cc = &(chains[chain]);
2011 /* set pdba, natom and nres to the current chain */
2014 natom = cc->pdba->nr;
2015 int nres = cc->pdba->nres;
2017 if (cc->chainid && ( cc->chainid != ' ' ) )
2019 printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
2020 chain+1, cc->chainid, natom, nres);
2024 printf("Processing chain %d (%d atoms, %d residues)\n",
2025 chain+1, natom, nres);
2028 process_chain(pdba, x, bUnA_, bUnA_, bUnA_, bLysMan_, bAspMan_, bGluMan_,
2029 bHisMan_, bArgMan_, bGlnMan_, angle_, distance_, &symtab,
2030 nrtprename, rtprename);
2032 cc->chainstart[cc->nterpairs] = pdba->nres;
2034 for (int i = 0; i < cc->nterpairs; i++)
2036 find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
2037 &(cc->r_start[j]), &(cc->r_end[j]), rt);
2039 if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2045 if (cc->nterpairs == 0)
2047 printf("Problem with chain definition, or missing terminal residues.\n"
2048 "This chain does not appear to contain a recognized chain molecule.\n"
2049 "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
2052 /* Check for disulfides and other special bonds */
2053 nssbonds = mk_specbonds(pdba, x, bCysMan_, &ssbonds, bVerbose_);
2057 rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
2058 &symtab, bVerbose_);
2061 for (int i = 0; i < cc->nterpairs; i++)
2065 * We first apply a filter so we only have the
2066 * termini that can be applied to the residue in question
2067 * (or a generic terminus if no-residue specific is available).
2069 /* First the N terminus */
2072 tdblist = filter_ter(nNtdb, ntdb,
2073 *pdba->resinfo[cc->r_start[i]].name,
2077 printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
2078 "is already in a terminus-specific form and skipping terminus selection.\n");
2079 cc->ntdb[i] = nullptr;
2083 if (bTerMan_ && ntdblist > 1)
2085 sprintf(select, "Select start terminus type for %s-%d",
2086 *pdba->resinfo[cc->r_start[i]].name,
2087 pdba->resinfo[cc->r_start[i]].nr);
2088 cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
2092 cc->ntdb[i] = tdblist[0];
2095 printf("Start terminus %s-%d: %s\n",
2096 *pdba->resinfo[cc->r_start[i]].name,
2097 pdba->resinfo[cc->r_start[i]].nr,
2098 (cc->ntdb[i])->name);
2104 cc->ntdb[i] = nullptr;
2107 /* And the C terminus */
2110 tdblist = filter_ter(nCtdb, ctdb,
2111 *pdba->resinfo[cc->r_end[i]].name,
2115 printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
2116 "is already in a terminus-specific form and skipping terminus selection.\n");
2117 cc->ctdb[i] = nullptr;
2121 if (bTerMan_ && ntdblist > 1)
2123 sprintf(select, "Select end terminus type for %s-%d",
2124 *pdba->resinfo[cc->r_end[i]].name,
2125 pdba->resinfo[cc->r_end[i]].nr);
2126 cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
2130 cc->ctdb[i] = tdblist[0];
2132 printf("End terminus %s-%d: %s\n",
2133 *pdba->resinfo[cc->r_end[i]].name,
2134 pdba->resinfo[cc->r_end[i]].nr,
2135 (cc->ctdb[i])->name);
2141 cc->ctdb[i] = nullptr;
2145 /* lookup hackblocks and rtp for all residues */
2146 get_hackblocks_rtp(&hb_chain, &restp_chain,
2147 nrtp, restp, pdba->nres, pdba->resinfo,
2148 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end,
2150 /* ideally, now we would not need the rtp itself anymore, but do
2151 everything using the hb and restp arrays. Unfortunately, that
2152 requires some re-thinking of code in gen_vsite.c, which I won't
2153 do now :( AF 26-7-99 */
2155 rename_atoms(nullptr, ffdir_,
2156 pdba, &symtab, restp_chain, false, rt, false, bVerbose_);
2158 match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose_);
2164 block = new_blocka();
2166 sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
2167 remove_duplicate_atoms(pdba, x, bVerbose_);
2172 fprintf(stderr, "WARNING: with the -remh option the generated "
2173 "index file (%s) might be useless\n"
2174 "(the index file is generated before hydrogens are added)",
2175 indexOutputFile_.c_str());
2177 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2179 for (int i = 0; i < block->nr; i++)
2188 fprintf(stderr, "WARNING: "
2189 "without sorting no check for duplicate atoms can be done\n");
2192 /* Generate Hydrogen atoms (and termini) in the sequence */
2193 printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2194 add_h(&pdba, &x, nah, ah,
2195 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing_,
2196 nullptr, nullptr, true, false);
2197 printf("Now there are %d residues with %d atoms\n",
2198 pdba->nres, pdba->nr);
2200 /* make up molecule name(s) */
2202 int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2204 gmx_residuetype_get_type(rt, *pdba->resinfo[k].name, &p_restype);
2206 std::string molname;
2214 this_chainid = cc->chainid;
2216 /* Add the chain id if we have one */
2217 if (this_chainid != ' ')
2219 suffix.append(formatString("_chain_%c", this_chainid));
2222 /* Check if there have been previous chains with the same id */
2224 for (int k = 0; k < chain; k++)
2226 if (cc->chainid == chains[k].chainid)
2231 /* Add the number for this chain identifier if there are multiple copies */
2234 suffix.append(formatString("%d", nid_used+1));
2237 if (suffix.length() > 0)
2239 molname.append(p_restype);
2240 molname.append(suffix);
2244 molname = p_restype;
2247 std::string itp_fn = topologyFile_;;
2248 std::string posre_fn = includeTopologyFile_;
2249 if ((numChains-nwaterchain > 1) && !cc->bAllWat)
2252 printf("Chain time...\n");
2253 //construct the itp file name
2254 itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2256 itp_fn.append(molname);
2257 itp_fn.append(".itp");
2258 //now do the same for posre
2259 posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2260 posre_fn.append("_");
2261 posre_fn.append(molname);
2262 posre_fn.append(".itp");
2263 if (posre_fn == itp_fn)
2265 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2268 srenew(incls_, nincl_);
2269 incls_[nincl_-1] = gmx_strdup(itp_fn.c_str());
2270 itp_file_ = gmx_fio_fopen(itp_fn.c_str(), "w");
2277 srenew(mols_, nmol_+1);
2280 mols_[nmol_].name = gmx_strdup("SOL");
2281 mols_[nmol_].nr = pdba->nres;
2285 mols_[nmol_].name = gmx_strdup(molname.c_str());
2286 mols_[nmol_].nr = 1;
2292 print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2298 top_file2 = nullptr;
2302 top_file2 = itp_file_;
2306 top_file2 = top_file;
2309 pdb2top(top_file2, posre_fn.c_str(), molname.c_str(), pdba, &x, atype, &symtab,
2311 restp_chain, hb_chain,
2313 bVsites_, bVsiteAromatics_, ffdir_,
2314 mHmult_, nssbonds, ssbonds,
2315 long_bond_dist_, short_bond_dist_, bDeuterate_, bChargeGroups_, bCmap_,
2316 bRenumRes_, bRTPresname_);
2320 write_posres(posre_fn.c_str(), pdba, posre_fc_);
2325 gmx_fio_fclose(itp_file_);
2328 /* pdba and natom have been reassigned somewhere so: */
2334 if (watermodel_ == nullptr)
2336 for (int chain = 0; chain < numChains; chain++)
2338 if (chains[chain].bAllWat)
2340 auto message = formatString("You have chosen not to include a water model, "
2341 "but there is water in the input file. Select a "
2342 "water model or remove the water from your input file.");
2343 GMX_THROW(InconsistentInputError(message));
2349 std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2350 if (!fflib_fexist(waterFile))
2352 auto message = formatString("The topology file '%s' for the selected water "
2353 "model '%s' can not be found in the force field "
2354 "directory. Select a different water model.",
2355 waterFile.c_str(), watermodel_);
2356 GMX_THROW(InconsistentInputError(message));
2360 print_top_mols(top_file, title, ffdir_, watermodel_, nincl_, incls_, nmol_, mols_);
2361 gmx_fio_fclose(top_file);
2363 gmx_residuetype_destroy(rt);
2365 /* now merge all chains back together */
2368 for (int i = 0; (i < numChains); i++)
2370 natom += chains[i].pdba->nr;
2371 nres += chains[i].pdba->nres;
2375 init_t_atoms(atoms, natom, false);
2376 for (int i = 0; i < atoms->nres; i++)
2378 sfree(atoms->resinfo[i].name);
2380 sfree(atoms->resinfo);
2382 snew(atoms->resinfo, nres);
2386 for (int i = 0; (i < numChains); i++)
2390 printf("Including chain %d in system: %d atoms %d residues\n",
2391 i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2393 for (int j = 0; (j < chains[i].pdba->nr); j++)
2395 atoms->atom[k] = chains[i].pdba->atom[j];
2396 atoms->atom[k].resind += l; /* l is processed nr of residues */
2397 atoms->atomname[k] = chains[i].pdba->atomname[j];
2398 atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2399 copy_rvec(chains[i].x[j], x[k]);
2402 for (int j = 0; (j < chains[i].pdba->nres); j++)
2404 atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2407 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2415 fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2416 print_sums(atoms, true);
2420 fprintf(stderr, "\nWriting coordinate file...\n");
2421 clear_rvec(box_space);
2424 make_new_box(atoms->nr, x, box, box_space, false);
2426 write_sto_conf(outputConfFile_.c_str(), title, atoms, x, nullptr, ePBC, box);
2428 printf("\t\t--------- PLEASE NOTE ------------\n");
2429 printf("You have successfully generated a topology from: %s.\n",
2430 inputConfFile_.c_str());
2431 if (watermodel_ != nullptr)
2433 printf("The %s force field and the %s water model are used.\n",
2434 ffname_, watermodel_);
2438 printf("The %s force field is used.\n",
2441 printf("\t\t--------- ETON ESAELP ------------\n");
2448 const char pdb2gmxInfo::name[] = "pdb2gmx";
2449 const char pdb2gmxInfo::shortDescription[] =
2450 "Convert coordinate files to topology and FF-compliant coordinate files";
2451 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2453 return compat::make_unique<pdb2gmx>();