2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
50 #include "gromacs/commandline/pargs.h"
51 #include "gromacs/fileio/confio.h"
52 #include "gromacs/fileio/gmxfio.h"
53 #include "gromacs/fileio/pdbio.h"
54 #include "gromacs/gmxlib/conformation-utilities.h"
55 #include "gromacs/gmxpreprocess/fflibutil.h"
56 #include "gromacs/gmxpreprocess/genhydro.h"
57 #include "gromacs/gmxpreprocess/h_db.h"
58 #include "gromacs/gmxpreprocess/hizzie.h"
59 #include "gromacs/gmxpreprocess/pdb2top.h"
60 #include "gromacs/gmxpreprocess/pgutil.h"
61 #include "gromacs/gmxpreprocess/resall.h"
62 #include "gromacs/gmxpreprocess/specbond.h"
63 #include "gromacs/gmxpreprocess/ter_db.h"
64 #include "gromacs/gmxpreprocess/toputil.h"
65 #include "gromacs/gmxpreprocess/xlate.h"
66 #include "gromacs/math/vec.h"
67 #include "gromacs/topology/atomprop.h"
68 #include "gromacs/topology/block.h"
69 #include "gromacs/topology/index.h"
70 #include "gromacs/topology/residuetypes.h"
71 #include "gromacs/topology/symtab.h"
72 #include "gromacs/topology/topology.h"
73 #include "gromacs/utility/arraysize.h"
74 #include "gromacs/utility/cstringutil.h"
75 #include "gromacs/utility/dir_separator.h"
76 #include "gromacs/utility/fatalerror.h"
77 #include "gromacs/utility/gmxassert.h"
78 #include "gromacs/utility/smalloc.h"
79 #include "gromacs/utility/strdb.h"
80 #include "gromacs/utility/stringutil.h"
84 char gmx[RTP_MAXCHAR+2];
85 char main[RTP_MAXCHAR+2];
86 char nter[RTP_MAXCHAR+2];
87 char cter[RTP_MAXCHAR+2];
88 char bter[RTP_MAXCHAR+2];
92 static const char *res2bb_notermini(const char *name,
93 int nrr, const rtprename_t *rr)
95 /* NOTE: This function returns the main building block name,
96 * it does not take terminal renaming into account.
101 while (i < nrr && gmx_strcasecmp(name, rr[i].gmx) != 0)
106 return (i < nrr ? rr[i].main : name);
109 static const char *select_res(int nr, int resnr,
110 const char *name[], const char *expl[],
112 int nrr, const rtprename_t *rr)
116 printf("Which %s type do you want for residue %d\n", title, resnr+1);
117 for (sel = 0; (sel < nr); sel++)
119 printf("%d. %s (%s)\n",
120 sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
122 printf("\nType a number:"); fflush(stdout);
124 if (scanf("%d", &sel) != 1)
126 gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
132 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
137 const char *lh[easpNR] = { "ASP", "ASPH" };
138 const char *expl[easpNR] = {
139 "Not protonated (charge -1)",
140 "Protonated (charge 0)"
143 return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
146 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
151 const char *lh[egluNR] = { "GLU", "GLUH" };
152 const char *expl[egluNR] = {
153 "Not protonated (charge -1)",
154 "Protonated (charge 0)"
157 return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
160 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
165 const char *lh[eglnNR] = { "GLN", "QLN" };
166 const char *expl[eglnNR] = {
167 "Not protonated (charge 0)",
168 "Protonated (charge +1)"
171 return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
174 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
179 const char *lh[elysNR] = { "LYSN", "LYS" };
180 const char *expl[elysNR] = {
181 "Not protonated (charge 0)",
182 "Protonated (charge +1)"
185 return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
188 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
193 const char *lh[eargNR] = { "ARGN", "ARG" };
194 const char *expl[eargNR] = {
195 "Not protonated (charge 0)",
196 "Protonated (charge +1)"
199 return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
202 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
204 const char *expl[ehisNR] = {
211 return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
214 static void read_rtprename(const char *fname, FILE *fp,
215 int *nrtprename, rtprename_t **rtprename)
217 char line[STRLEN], buf[STRLEN];
226 while (get_a_line(fp, line, STRLEN))
229 /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
230 * For other args, we read up to 6 chars (so we can detect if the length is > 5).
231 * Note that the buffer length has been increased to 7 to allow this,
232 * so we just need to make sure the strings have zero-length initially.
235 rr[n].main[0] = '\0';
236 rr[n].nter[0] = '\0';
237 rr[n].cter[0] = '\0';
238 rr[n].bter[0] = '\0';
239 nc = sscanf(line, "%6s %6s %6s %6s %6s %s",
240 rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
241 if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
242 strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
244 gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
245 fname, RTP_MAXCHAR, line);
250 if (nc != 2 && nc != 5)
252 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d or %d", fname, ncol, 2, 5);
258 gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
263 /* This file does not have special termini names, copy them from main */
264 strcpy(rr[n].nter, rr[n].main);
265 strcpy(rr[n].cter, rr[n].main);
266 strcpy(rr[n].bter, rr[n].main);
276 static char *search_resrename(int nrr, rtprename_t *rr,
278 bool bStart, bool bEnd,
279 bool bCompareFFRTPname)
287 while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx) != 0) ||
288 ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
293 /* If found in the database, rename this residue's rtp building block,
294 * otherwise keep the old name.
317 gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
325 static void rename_resrtp(t_atoms *pdba, int nterpairs, const int *r_start, const int *r_end,
326 int nrr, rtprename_t *rr, t_symtab *symtab,
334 bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
336 for (r = 0; r < pdba->nres; r++)
340 for (j = 0; j < nterpairs; j++)
347 for (j = 0; j < nterpairs; j++)
355 nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, FALSE);
357 if (bFFRTPTERRNM && nn == nullptr && (bStart || bEnd))
359 /* This is a terminal residue, but the residue name,
360 * currently stored in .rtp, is not a standard residue name,
361 * but probably a force field specific rtp name.
362 * Check if we need to rename it because it is terminal.
364 nn = search_resrename(nrr, rr,
365 *pdba->resinfo[r].rtp, bStart, bEnd, TRUE);
368 if (nn != nullptr && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
372 printf("Changing rtp entry of residue %d %s to '%s'\n",
373 pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
375 pdba->resinfo[r].rtp = put_symtab(symtab, nn);
380 static void pdbres_to_gmxrtp(t_atoms *pdba)
384 for (i = 0; (i < pdba->nres); i++)
386 if (pdba->resinfo[i].rtp == nullptr)
388 pdba->resinfo[i].rtp = pdba->resinfo[i].name;
393 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
394 bool bFullCompare, t_symtab *symtab)
399 for (i = 0; (i < pdba->nres); i++)
401 resnm = *pdba->resinfo[i].name;
402 if ((bFullCompare && (gmx_strcasecmp(resnm, oldnm) == 0)) ||
403 (!bFullCompare && strstr(resnm, oldnm) != nullptr))
405 /* Rename the residue name (not the rtp name) */
406 pdba->resinfo[i].name = put_symtab(symtab, newnm);
411 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
412 bool bFullCompare, t_symtab *symtab)
417 for (i = 0; (i < pdba->nres); i++)
419 /* We have not set the rtp name yes, use the residue name */
420 bbnm = *pdba->resinfo[i].name;
421 if ((bFullCompare && (gmx_strcasecmp(bbnm, oldnm) == 0)) ||
422 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
424 /* Change the rtp builing block name */
425 pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
430 static void rename_bbint(t_atoms *pdba, const char *oldnm,
431 const char *gettp(int, int, const rtprename_t *),
434 int nrr, const rtprename_t *rr)
440 for (i = 0; i < pdba->nres; i++)
442 /* We have not set the rtp name yes, use the residue name */
443 bbnm = *pdba->resinfo[i].name;
444 if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
445 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
447 ptr = gettp(i, nrr, rr);
448 pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
453 static void check_occupancy(t_atoms *atoms, const char *filename, gmx_bool bVerbose)
459 ftp = fn2ftp(filename);
460 if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
462 fprintf(stderr, "No occupancies in %s\n", filename);
466 for (i = 0; (i < atoms->nr); i++)
468 if (atoms->pdbinfo[i].occup != 1)
472 fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
473 *atoms->resinfo[atoms->atom[i].resind].name,
474 atoms->resinfo[ atoms->atom[i].resind].nr,
476 atoms->pdbinfo[i].occup);
478 if (atoms->pdbinfo[i].occup == 0)
488 if (nzero == atoms->nr)
490 fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
492 else if ((nzero > 0) || (nnotone > 0))
496 "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
497 " occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
499 nzero, nnotone, atoms->nr);
503 fprintf(stderr, "All occupancies are one\n");
508 static void write_posres(char *fn, t_atoms *pdba, real fc)
513 fp = gmx_fio_fopen(fn, "w");
515 "; In this topology include file, you will find position restraint\n"
516 "; entries for all the heavy atoms in your original pdb file.\n"
517 "; This means that all the protons which were added by pdb2gmx are\n"
518 "; not restrained.\n"
520 "[ position_restraints ]\n"
521 "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
523 for (i = 0; (i < pdba->nr); i++)
525 if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
527 fprintf(fp, "%6d%6d %g %g %g\n", i+1, 1, fc, fc, fc);
533 static int read_pdball(const char *inf, const char *outf, char **title,
534 t_atoms *atoms, rvec **x,
535 int *ePBC, matrix box, gmx_bool bRemoveH,
536 t_symtab *symtab, gmx_residuetype_t *rt, const char *watres,
537 gmx_atomprop_t aps, gmx_bool bVerbose)
538 /* Read a pdb file. (containing proteins) */
540 int natom, new_natom, i;
543 printf("Reading %s...\n", inf);
544 readConfAndAtoms(inf, symtab, title, atoms, ePBC, x, nullptr, box);
546 if (atoms->pdbinfo == nullptr)
548 snew(atoms->pdbinfo, atoms->nr);
550 if (fn2ftp(inf) == efPDB)
552 get_pdb_atomnumber(atoms, aps);
557 for (i = 0; i < atoms->nr; i++)
559 if (!is_hydrogen(*atoms->atomname[i]))
561 atoms->atom[new_natom] = atoms->atom[i];
562 atoms->atomname[new_natom] = atoms->atomname[i];
563 atoms->pdbinfo[new_natom] = atoms->pdbinfo[i];
564 copy_rvec((*x)[i], (*x)[new_natom]);
568 atoms->nr = new_natom;
575 printf(" '%s',", *title);
577 printf(" %d atoms\n", natom);
579 /* Rename residues */
580 rename_pdbres(atoms, "HOH", watres, FALSE, symtab);
581 rename_pdbres(atoms, "SOL", watres, FALSE, symtab);
582 rename_pdbres(atoms, "WAT", watres, FALSE, symtab);
584 rename_atoms("xlateat.dat", nullptr,
585 atoms, symtab, nullptr, TRUE, rt, TRUE, bVerbose);
594 write_sto_conf(outf, *title, atoms, *x, nullptr, *ePBC, box);
600 static void process_chain(t_atoms *pdba, rvec *x,
601 gmx_bool bTrpU, gmx_bool bPheU, gmx_bool bTyrU,
602 gmx_bool bLysMan, gmx_bool bAspMan, gmx_bool bGluMan,
603 gmx_bool bHisMan, gmx_bool bArgMan, gmx_bool bGlnMan,
604 real angle, real distance, t_symtab *symtab,
605 int nrr, const rtprename_t *rr)
607 /* Rename aromatics, lys, asp and histidine */
610 rename_bb(pdba, "TYR", "TYRU", FALSE, symtab);
614 rename_bb(pdba, "TRP", "TRPU", FALSE, symtab);
618 rename_bb(pdba, "PHE", "PHEU", FALSE, symtab);
622 rename_bbint(pdba, "LYS", get_lystp, FALSE, symtab, nrr, rr);
626 rename_bbint(pdba, "ARG", get_argtp, FALSE, symtab, nrr, rr);
630 rename_bbint(pdba, "GLN", get_glntp, FALSE, symtab, nrr, rr);
634 rename_bbint(pdba, "ASP", get_asptp, FALSE, symtab, nrr, rr);
638 rename_bb(pdba, "ASPH", "ASP", FALSE, symtab);
642 rename_bbint(pdba, "GLU", get_glutp, FALSE, symtab, nrr, rr);
646 rename_bb(pdba, "GLUH", "GLU", FALSE, symtab);
651 set_histp(pdba, x, angle, distance);
655 rename_bbint(pdba, "HIS", get_histp, TRUE, symtab, nrr, rr);
658 /* Initialize the rtp builing block names with the residue names
659 * for the residues that have not been processed above.
661 pdbres_to_gmxrtp(pdba);
663 /* Now we have all rtp names set.
664 * The rtp names will conform to Gromacs naming,
665 * unless the input pdb file contained one or more force field specific
666 * rtp names as residue names.
670 /* struct for sorting the atoms from the pdb file */
672 int resnr; /* residue number */
673 int j; /* database order index */
674 int index; /* original atom number */
675 char anm1; /* second letter of atom name */
676 char altloc; /* alternate location indicator */
679 static bool pdbicomp(const t_pdbindex &a, const t_pdbindex &b)
681 int d = (a.resnr - b.resnr);
687 d = (a.anm1 - b.anm1);
690 d = (a.altloc - b.altloc);
697 static void sort_pdbatoms(t_restp restp[],
698 int natoms, t_atoms **pdbaptr, rvec **x,
699 t_blocka *block, char ***gnames)
701 t_atoms *pdba, *pdbnew;
715 for (i = 0; i < natoms; i++)
717 atomnm = *pdba->atomname[i];
718 rptr = &restp[pdba->atom[i].resind];
719 for (j = 0; (j < rptr->natom); j++)
721 if (gmx_strcasecmp(atomnm, *(rptr->atomname[j])) == 0)
726 if (j == rptr->natom)
731 "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
732 "while sorting atoms.\n%s", atomnm,
733 *pdba->resinfo[pdba->atom[i].resind].name,
734 pdba->resinfo[pdba->atom[i].resind].nr,
737 is_hydrogen(atomnm) ?
738 "\nFor a hydrogen, this can be a different protonation state, or it\n"
739 "might have had a different number in the PDB file and was rebuilt\n"
740 "(it might for instance have been H3, and we only expected H1 & H2).\n"
741 "Note that hydrogens might have been added to the entry for the N-terminus.\n"
742 "Remove this hydrogen or choose a different protonation state to solve it.\n"
743 "Option -ignh will ignore all hydrogens in the input." : ".");
744 gmx_fatal(FARGS, "%s", buf);
746 /* make shadow array to be sorted into indexgroup */
747 pdbi[i].resnr = pdba->atom[i].resind;
750 pdbi[i].anm1 = atomnm[1];
751 pdbi[i].altloc = pdba->pdbinfo[i].altloc;
753 std::sort(pdbi, pdbi+natoms, pdbicomp);
755 /* pdba is sorted in pdbnew using the pdbi index */
758 init_t_atoms(pdbnew, natoms, TRUE);
760 pdbnew->nr = pdba->nr;
761 pdbnew->nres = pdba->nres;
762 sfree(pdbnew->resinfo);
763 pdbnew->resinfo = pdba->resinfo;
764 for (i = 0; i < natoms; i++)
766 pdbnew->atom[i] = pdba->atom[pdbi[i].index];
767 pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
768 pdbnew->pdbinfo[i] = pdba->pdbinfo[pdbi[i].index];
769 copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
770 /* make indexgroup in block */
771 a[i] = pdbi[i].index;
774 sfree(pdba->atomname);
776 sfree(pdba->pdbinfo);
779 /* copy the sorted pdbnew back to pdba */
782 add_grp(block, gnames, natoms, a, "prot_sort");
788 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], gmx_bool bVerbose)
790 int i, j, oldnatoms, ndel;
793 printf("Checking for duplicate atoms....\n");
794 oldnatoms = pdba->nr;
796 /* NOTE: pdba->nr is modified inside the loop */
797 for (i = 1; (i < pdba->nr); i++)
799 /* compare 'i' and 'i-1', throw away 'i' if they are identical
800 this is a 'while' because multiple alternate locations can be present */
801 while ( (i < pdba->nr) &&
802 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
803 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
808 ri = &pdba->resinfo[pdba->atom[i].resind];
809 printf("deleting duplicate atom %4s %s%4d%c",
810 *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
811 if (ri->chainid && (ri->chainid != ' '))
813 printf(" ch %c", ri->chainid);
817 if (pdba->pdbinfo[i].atomnr)
819 printf(" pdb nr %4d", pdba->pdbinfo[i].atomnr);
821 if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
823 printf(" altloc %c", pdba->pdbinfo[i].altloc);
829 /* We can not free, since it might be in the symtab */
830 /* sfree(pdba->atomname[i]); */
831 for (j = i; j < pdba->nr; j++)
833 pdba->atom[j] = pdba->atom[j+1];
834 pdba->atomname[j] = pdba->atomname[j+1];
837 pdba->pdbinfo[j] = pdba->pdbinfo[j+1];
839 copy_rvec(x[j+1], x[j]);
841 srenew(pdba->atom, pdba->nr);
842 /* srenew(pdba->atomname, pdba->nr); */
843 srenew(pdba->pdbinfo, pdba->nr);
846 if (pdba->nr != oldnatoms)
848 printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
855 checkResidueTypeSanity(t_atoms * pdba,
858 gmx_residuetype_t * rt)
860 std::string startResidueString = gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
861 std::string endResidueString = gmx::formatString("%s%d", *pdba->resinfo[r1-1].name, pdba->resinfo[r1-1].nr);
863 // Check whether all residues in chain have the same chain ID.
864 bool allResiduesHaveSameChainID = true;
865 char chainID0 = pdba->resinfo[r0].chainid;
867 std::string residueString;
869 for (int i = r0 + 1; i < r1; i++)
871 chainID = pdba->resinfo[i].chainid;
872 if (chainID != chainID0)
874 allResiduesHaveSameChainID = false;
875 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
880 if (!allResiduesHaveSameChainID)
883 "The chain covering the range %s--%s does not have a consistent chain ID. "
884 "The first residue has ID '%c', while residue %s has ID '%c'.",
885 startResidueString.c_str(), endResidueString.c_str(),
886 chainID0, residueString.c_str(), chainID);
889 // At this point all residues have the same ID. If they are also non-blank
890 // we can be a bit more aggressive and require the types match too.
893 bool allResiduesHaveSameType = true;
894 const char *restype0;
896 gmx_residuetype_get_type(rt, *pdba->resinfo[r0].name, &restype0);
898 for (int i = r0 + 1; i < r1; i++)
900 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &restype);
901 if (gmx_strcasecmp(restype, restype0))
903 allResiduesHaveSameType = false;
904 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
909 if (!allResiduesHaveSameType)
912 "The residues in the chain %s--%s do not have a consistent type. "
913 "The first residue has type '%s', while residue %s is of type '%s'. "
914 "Either there is a mistake in your chain, or it includes nonstandard "
915 "residue names that have not yet been added to the residuetypes.dat "
916 "file in the GROMACS library directory. If there are other molecules "
917 "such as ligands, they should not have the same chain ID as the "
918 "adjacent protein chain since it's a separate molecule.",
919 startResidueString.c_str(), endResidueString.c_str(),
920 restype0, residueString.c_str(), restype);
925 static void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
926 gmx_residuetype_t *rt)
929 const char *p_startrestype;
930 const char *p_restype;
935 int startWarnings = 0;
939 // Check that all residues have the same chain identifier, and if it is
940 // non-blank we also require the residue types to match.
941 checkResidueTypeSanity(pdba, r0, r1, rt);
943 // If we return correctly from checkResidueTypeSanity(), the only
944 // remaining cases where we can have non-matching residue types is if
945 // the chain ID was blank, which could be the case e.g. for a structure
946 // read from a GRO file or other file types without chain information.
947 // In that case we need to be a bit more liberal and detect chains based
948 // on the residue type.
950 // If we get here, the chain ID must be identical for all residues
951 char chainID = pdba->resinfo[r0].chainid;
953 /* Find the starting terminus (typially N or 5') */
954 for (i = r0; i < r1 && *r_start == -1; i++)
956 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_startrestype);
957 if (!gmx_strcasecmp(p_startrestype, "Protein") || !gmx_strcasecmp(p_startrestype, "DNA") || !gmx_strcasecmp(p_startrestype, "RNA") )
959 printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
962 else if (!gmx_strcasecmp(p_startrestype, "Ion"))
966 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
970 printf("Disabling further notes about ions.\n");
976 if (startWarnings < 5)
980 printf("\nWarning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n"
981 "This chain lacks identifiers, which makes it impossible to do strict\n"
982 "classification of the start/end residues. Here we need to guess this residue\n"
983 "should not be part of the chain and instead introduce a break, but that will\n"
984 "be catastrophic if they should in fact be linked. Please check your structure,\n"
985 "and add %s to residuetypes.dat if this was not correct.\n\n",
986 *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
990 printf("\nWarning: No residues in chain starting at %s%d identified as Protein/RNA/DNA.\n"
991 "This makes it impossible to link them into a molecule, which could either be\n"
992 "correct or a catastrophic error. Please check your structure, and add all\n"
993 "necessary residue names to residuetypes.dat if this was not correct.\n\n",
994 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
997 if (startWarnings == 4)
999 printf("Disabling further warnings about unidentified residues at start of chain.\n");
1007 /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1008 for (i = *r_start; i < r1; i++)
1010 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_restype);
1011 if (!gmx_strcasecmp(p_restype, p_startrestype) && endWarnings == 0)
1015 else if (!gmx_strcasecmp(p_startrestype, "Ion"))
1019 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1023 printf("Disabling further notes about ions.\n");
1029 // This can only trigger if the chain ID is blank - otherwise the
1030 // call to checkResidueTypeSanity() will have caught the problem.
1031 if (endWarnings < 5)
1033 printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1034 "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1035 "it impossible to do strict classification of the start/end residues. Here we\n"
1036 "need to guess this residue should not be part of the chain and instead\n"
1037 "introduce a break, but that will be catastrophic if they should in fact be\n"
1038 "linked. Please check your structure, and add %s to residuetypes.dat\n"
1039 "if this was not correct.\n\n",
1040 *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype,
1041 *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype, *pdba->resinfo[i].name);
1043 if (endWarnings == 4)
1045 printf("Disabling further warnings about unidentified residues at end of chain.\n");
1054 printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1068 static SplittingType getSplittingType(const char *chainsep)
1070 SplittingType splitting = SPLIT_TER_ONLY; /* keep compiler happy */
1072 /* Be a bit flexible to catch typos */
1073 if (!strncmp(chainsep, "id_o", 4))
1075 /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
1076 splitting = SPLIT_ID_OR_TER;
1077 printf("Splitting chemical chains based on TER records or chain id changing.\n");
1079 else if (!strncmp(chainsep, "int", 3))
1081 /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
1082 splitting = SPLIT_INTERACTIVE;
1083 printf("Splitting chemical chains interactively.\n");
1085 else if (!strncmp(chainsep, "id_a", 4))
1087 splitting = SPLIT_ID_AND_TER;
1088 printf("Splitting chemical chains based on TER records and chain id changing.\n");
1090 else if (strlen(chainsep) == 2 && !strncmp(chainsep, "id", 4))
1092 splitting = SPLIT_ID_ONLY;
1093 printf("Splitting chemical chains based on changing chain id only (ignoring TER records).\n");
1095 else if (chainsep[0] == 't')
1097 splitting = SPLIT_TER_ONLY;
1098 printf("Splitting chemical chains based on TER records only (ignoring chain id).\n");
1102 gmx_fatal(FARGS, "Unidentified setting for chain separation: %s\n", chainsep);
1108 modify_chain_numbers(t_atoms * pdba,
1109 const char * chainsep)
1112 char old_prev_chainid;
1113 char old_this_chainid;
1114 int old_prev_chainnum;
1115 int old_this_chainnum;
1117 char select[STRLEN];
1121 const char * prev_atomname;
1122 const char * this_atomname;
1123 const char * prev_resname;
1124 const char * this_resname;
1130 SplittingType splitting = getSplittingType(chainsep);
1132 /* The default chain enumeration is id_or_ter */
1134 old_prev_chainid = '?';
1135 old_prev_chainnum = -1;
1138 this_atomname = nullptr;
1140 this_resname = nullptr;
1144 for (i = 0; i < pdba->nres; i++)
1146 ri = &pdba->resinfo[i];
1147 old_this_chainid = ri->chainid;
1148 old_this_chainnum = ri->chainnum;
1150 prev_atomname = this_atomname;
1151 prev_atomnum = this_atomnum;
1152 prev_resname = this_resname;
1153 prev_resnum = this_resnum;
1154 prev_chainid = this_chainid;
1156 this_atomname = *(pdba->atomname[i]);
1157 this_atomnum = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1158 this_resname = *ri->name;
1159 this_resnum = ri->nr;
1160 this_chainid = ri->chainid;
1164 case SPLIT_ID_OR_TER:
1165 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1171 case SPLIT_ID_AND_TER:
1172 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1179 if (old_this_chainid != old_prev_chainid)
1185 case SPLIT_TER_ONLY:
1186 if (old_this_chainnum != old_prev_chainnum)
1191 case SPLIT_INTERACTIVE:
1192 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1196 printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\n"
1197 "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1198 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1199 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1201 if (nullptr == fgets(select, STRLEN-1, stdin))
1203 gmx_fatal(FARGS, "Error reading from stdin");
1206 if (i == 0 || select[0] == 'y')
1213 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1215 old_prev_chainid = old_this_chainid;
1216 old_prev_chainnum = old_this_chainnum;
1218 ri->chainnum = new_chainnum;
1247 int gmx_pdb2gmx(int argc, char *argv[])
1249 const char *desc[] = {
1250 "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1251 "some database files, adds hydrogens to the molecules and generates",
1252 "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1253 "and a topology in GROMACS format.",
1254 "These files can subsequently be processed to generate a run input file.",
1256 "[THISMODULE] will search for force fields by looking for",
1257 "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1258 "of the current working directory and of the GROMACS library directory",
1259 "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1261 "By default the forcefield selection is interactive,",
1262 "but you can use the [TT]-ff[tt] option to specify one of the short names",
1263 "in the list on the command line instead. In that case [THISMODULE] just looks",
1264 "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1266 "After choosing a force field, all files will be read only from",
1267 "the corresponding force field directory.",
1268 "If you want to modify or add a residue types, you can copy the force",
1269 "field directory from the GROMACS library directory to your current",
1270 "working directory. If you want to add new protein residue types,",
1271 "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1272 "or copy the whole library directory to a local directory and set",
1273 "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1274 "Check Chapter 5 of the manual for more information about file formats.",
1277 "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1278 "need not necessarily contain a protein structure. Every kind of",
1279 "molecule for which there is support in the database can be converted.",
1280 "If there is no support in the database, you can add it yourself.[PAR]",
1282 "The program has limited intelligence, it reads a number of database",
1283 "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1284 "if necessary this can be done manually. The program can prompt the",
1285 "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1286 "desired. For Lys the choice is between neutral (two protons on NZ) or",
1287 "protonated (three protons, default), for Asp and Glu unprotonated",
1288 "(default) or protonated, for His the proton can be either on ND1,",
1289 "on NE2 or on both. By default these selections are done automatically.",
1290 "For His, this is based on an optimal hydrogen bonding",
1291 "conformation. Hydrogen bonds are defined based on a simple geometric",
1292 "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1293 "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1294 "[TT]-dist[tt] respectively.[PAR]",
1296 "The protonation state of N- and C-termini can be chosen interactively",
1297 "with the [TT]-ter[tt] flag. Default termini are ionized (NH3+ and COO-),",
1298 "respectively. Some force fields support zwitterionic forms for chains of",
1299 "one residue, but for polypeptides these options should NOT be selected.",
1300 "The AMBER force fields have unique forms for the terminal residues,",
1301 "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1302 "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1303 "respectively to use these forms, making sure you preserve the format",
1304 "of the coordinate file. Alternatively, use named terminating residues",
1305 "(e.g. ACE, NME).[PAR]",
1307 "The separation of chains is not entirely trivial since the markup",
1308 "in user-generated PDB files frequently varies and sometimes it",
1309 "is desirable to merge entries across a TER record, for instance",
1310 "if you want a disulfide bridge or distance restraints between",
1311 "two protein chains or if you have a HEME group bound to a protein.",
1312 "In such cases multiple chains should be contained in a single",
1313 "[TT]moleculetype[tt] definition.",
1314 "To handle this, [THISMODULE] uses two separate options.",
1315 "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1316 "start, and termini added when applicable. This can be done based on the",
1317 "existence of TER records, when the chain id changes, or combinations of either",
1318 "or both of these. You can also do the selection fully interactively.",
1319 "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1320 "are merged into one moleculetype, after adding all the chemical termini (or not).",
1321 "This can be turned off (no merging), all non-water chains can be merged into a",
1322 "single molecule, or the selection can be done interactively.[PAR]",
1324 "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1325 "If any of the occupancies are not one, indicating that the atom is",
1326 "not resolved well in the structure, a warning message is issued.",
1327 "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1328 "all occupancy fields may be zero. Either way, it is up to the user",
1329 "to verify the correctness of the input data (read the article!).[PAR]",
1331 "During processing the atoms will be reordered according to GROMACS",
1332 "conventions. With [TT]-n[tt] an index file can be generated that",
1333 "contains one group reordered in the same way. This allows you to",
1334 "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1335 "one limitation: reordering is done after the hydrogens are stripped",
1336 "from the input and before new hydrogens are added. This means that",
1337 "you should not use [TT]-ignh[tt].[PAR]",
1339 "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1340 "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1341 "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1344 "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1345 "motions. Angular and out-of-plane motions can be removed by changing",
1346 "hydrogens into virtual sites and fixing angles, which fixes their",
1347 "position relative to neighboring atoms. Additionally, all atoms in the",
1348 "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1349 "can be converted into virtual sites, eliminating the fast improper dihedral",
1350 "fluctuations in these rings. [BB]Note[bb] that in this case all other hydrogen",
1351 "atoms are also converted to virtual sites. The mass of all atoms that are",
1352 "converted into virtual sites, is added to the heavy atoms.[PAR]",
1353 "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1354 "done by increasing the hydrogen-mass by a factor of 4. This is also",
1355 "done for water hydrogens to slow down the rotational motion of water.",
1356 "The increase in mass of the hydrogens is subtracted from the bonded",
1357 "(heavy) atom so that the total mass of the system remains the same."
1361 FILE *fp, *top_file, *top_file2, *itp_file = nullptr;
1363 t_atoms pdba_all, *pdba;
1367 int chain, nch, maxch, nwaterchain;
1369 t_chain *chains, *cc;
1370 char select[STRLEN];
1378 int i, j, k, l, nrtp;
1379 int *swap_index, si;
1383 gpp_atomtype_t atype;
1384 gmx_residuetype_t*rt;
1386 char itp_fn[STRLEN], posre_fn[STRLEN], buf_fn[STRLEN];
1387 char molname[STRLEN];
1388 char *c, forcefield[STRLEN], ffdir[STRLEN];
1389 char ffname[STRLEN], suffix[STRLEN], buf[STRLEN];
1393 rtprename_t *rtprename = nullptr;
1394 int nah, nNtdb, nCtdb, ntdblist;
1395 t_hackblock *ntdb, *ctdb, **tdblist;
1399 bool bVsites = FALSE, bWat, bPrevWat = FALSE, bITP, bVsiteAromatics = FALSE;
1401 t_hackblock *hb_chain;
1402 t_restp *restp_chain;
1403 gmx_output_env_t *oenv;
1404 const char *p_restype;
1408 const char * prev_atomname;
1409 const char * this_atomname;
1410 const char * prev_resname;
1411 const char * this_resname;
1416 int prev_chainnumber;
1417 int this_chainnumber;
1419 int this_chainstart;
1420 int prev_chainstart;
1427 { efSTX, "-f", "eiwit.pdb", ffREAD },
1428 { efSTO, "-o", "conf", ffWRITE },
1429 { efTOP, nullptr, nullptr, ffWRITE },
1430 { efITP, "-i", "posre", ffWRITE },
1431 { efNDX, "-n", "clean", ffOPTWR },
1432 { efSTO, "-q", "clean.pdb", ffOPTWR }
1434 #define NFILE asize(fnm)
1436 gmx_bool bNewRTP = FALSE;
1437 gmx_bool bInter = FALSE, bCysMan = FALSE;
1438 gmx_bool bLysMan = FALSE, bAspMan = FALSE, bGluMan = FALSE, bHisMan = FALSE;
1439 gmx_bool bGlnMan = FALSE, bArgMan = FALSE;
1440 gmx_bool bTerMan = FALSE, bUnA = FALSE, bHeavyH = FALSE;
1441 gmx_bool bSort = TRUE, bAllowMissing = FALSE, bRemoveH = FALSE;
1442 gmx_bool bDeuterate = FALSE, bVerbose = FALSE, bChargeGroups = TRUE, bCmap = TRUE;
1443 gmx_bool bRenumRes = FALSE, bRTPresname = FALSE;
1444 real angle = 135.0, distance = 0.3, posre_fc = 1000;
1445 real long_bond_dist = 0.25, short_bond_dist = 0.05;
1446 const char *vsitestr[] = { nullptr, "none", "hydrogens", "aromatics", nullptr };
1447 const char *watstr[] = { nullptr, "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p", nullptr };
1448 const char *chainsep[] = { nullptr, "id_or_ter", "id_and_ter", "ter", "id", "interactive", nullptr };
1449 const char *merge[] = {nullptr, "no", "all", "interactive", nullptr };
1450 const char *ff = "select";
1453 { "-newrtp", FALSE, etBOOL, {&bNewRTP},
1454 "HIDDENWrite the residue database in new format to [TT]new.rtp[tt]"},
1455 { "-lb", FALSE, etREAL, {&long_bond_dist},
1456 "HIDDENLong bond warning distance" },
1457 { "-sb", FALSE, etREAL, {&short_bond_dist},
1458 "HIDDENShort bond warning distance" },
1459 { "-chainsep", FALSE, etENUM, {chainsep},
1460 "Condition in PDB files when a new chain should be started (adding termini)" },
1461 { "-merge", FALSE, etENUM, {&merge},
1462 "Merge multiple chains into a single [moleculetype]" },
1463 { "-ff", FALSE, etSTR, {&ff},
1464 "Force field, interactive by default. Use [TT]-h[tt] for information." },
1465 { "-water", FALSE, etENUM, {watstr},
1466 "Water model to use" },
1467 { "-inter", FALSE, etBOOL, {&bInter},
1468 "Set the next 8 options to interactive"},
1469 { "-ss", FALSE, etBOOL, {&bCysMan},
1470 "Interactive SS bridge selection" },
1471 { "-ter", FALSE, etBOOL, {&bTerMan},
1472 "Interactive termini selection, instead of charged (default)" },
1473 { "-lys", FALSE, etBOOL, {&bLysMan},
1474 "Interactive lysine selection, instead of charged" },
1475 { "-arg", FALSE, etBOOL, {&bArgMan},
1476 "Interactive arginine selection, instead of charged" },
1477 { "-asp", FALSE, etBOOL, {&bAspMan},
1478 "Interactive aspartic acid selection, instead of charged" },
1479 { "-glu", FALSE, etBOOL, {&bGluMan},
1480 "Interactive glutamic acid selection, instead of charged" },
1481 { "-gln", FALSE, etBOOL, {&bGlnMan},
1482 "Interactive glutamine selection, instead of neutral" },
1483 { "-his", FALSE, etBOOL, {&bHisMan},
1484 "Interactive histidine selection, instead of checking H-bonds" },
1485 { "-angle", FALSE, etREAL, {&angle},
1486 "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)" },
1487 { "-dist", FALSE, etREAL, {&distance},
1488 "Maximum donor-acceptor distance for a H-bond (nm)" },
1489 { "-una", FALSE, etBOOL, {&bUnA},
1490 "Select aromatic rings with united CH atoms on phenylalanine, "
1491 "tryptophane and tyrosine" },
1492 { "-sort", FALSE, etBOOL, {&bSort},
1493 "HIDDENSort the residues according to database, turning this off is dangerous as charge groups might be broken in parts" },
1494 { "-ignh", FALSE, etBOOL, {&bRemoveH},
1495 "Ignore hydrogen atoms that are in the coordinate file" },
1496 { "-missing", FALSE, etBOOL, {&bAllowMissing},
1497 "Continue when atoms are missing and bonds cannot be made, dangerous" },
1498 { "-v", FALSE, etBOOL, {&bVerbose},
1499 "Be slightly more verbose in messages" },
1500 { "-posrefc", FALSE, etREAL, {&posre_fc},
1501 "Force constant for position restraints" },
1502 { "-vsite", FALSE, etENUM, {vsitestr},
1503 "Convert atoms to virtual sites" },
1504 { "-heavyh", FALSE, etBOOL, {&bHeavyH},
1505 "Make hydrogen atoms heavy" },
1506 { "-deuterate", FALSE, etBOOL, {&bDeuterate},
1507 "Change the mass of hydrogens to 2 amu" },
1508 { "-chargegrp", TRUE, etBOOL, {&bChargeGroups},
1509 "Use charge groups in the [REF].rtp[ref] file" },
1510 { "-cmap", TRUE, etBOOL, {&bCmap},
1511 "Use cmap torsions (if enabled in the [REF].rtp[ref] file)" },
1512 { "-renum", TRUE, etBOOL, {&bRenumRes},
1513 "Renumber the residues consecutively in the output" },
1514 { "-rtpres", TRUE, etBOOL, {&bRTPresname},
1515 "Use [REF].rtp[ref] entry names as residue names" }
1517 #define NPARGS asize(pa)
1519 if (!parse_common_args(&argc, argv, 0, NFILE, fnm, asize(pa), pa, asize(desc), desc,
1525 /* Force field selection, interactive or direct */
1526 choose_ff(strcmp(ff, "select") == 0 ? nullptr : ff,
1527 forcefield, sizeof(forcefield),
1528 ffdir, sizeof(ffdir));
1530 if (strlen(forcefield) > 0)
1532 strcpy(ffname, forcefield);
1533 ffname[0] = toupper(ffname[0]);
1537 gmx_fatal(FARGS, "Empty forcefield string");
1540 printf("\nUsing the %s force field in directory %s\n\n",
1543 choose_watermodel(watstr[0], ffdir, &watermodel);
1547 /* if anything changes here, also change description of -inter */
1562 else if (bDeuterate)
1571 /* parse_common_args ensures vsitestr has been selected, but
1572 clang-static-analyzer needs clues to know that */
1573 GMX_ASSERT(vsitestr[0], "-vsite default wasn't processed correctly");
1574 switch (vsitestr[0][0])
1576 case 'n': /* none */
1578 bVsiteAromatics = FALSE;
1580 case 'h': /* hydrogens */
1582 bVsiteAromatics = FALSE;
1584 case 'a': /* aromatics */
1586 bVsiteAromatics = TRUE;
1589 gmx_fatal(FARGS, "Internal inconsistency: vsitestr[0]='%s'", vsitestr[0]);
1592 /* Open the symbol table */
1593 open_symtab(&symtab);
1595 /* Residue type database */
1596 gmx_residuetype_init(&rt);
1598 /* Read residue renaming database(s), if present */
1599 std::vector<std::string> rrn = fflib_search_file_end(ffdir, ".r2b", FALSE);
1602 rtprename = nullptr;
1603 for (const auto &filename : rrn)
1605 fp = fflib_open(filename);
1606 read_rtprename(filename.c_str(), fp, &nrtprename, &rtprename);
1610 /* Add all alternative names from the residue renaming database to the list of recognized amino/nucleic acids. */
1611 for (i = 0; i < nrtprename; i++)
1613 rc = gmx_residuetype_get_type(rt, rtprename[i].gmx, &p_restype);
1615 /* Only add names if the 'standard' gromacs/iupac base name was found */
1618 gmx_residuetype_add(rt, rtprename[i].main, p_restype);
1619 gmx_residuetype_add(rt, rtprename[i].nter, p_restype);
1620 gmx_residuetype_add(rt, rtprename[i].cter, p_restype);
1621 gmx_residuetype_add(rt, rtprename[i].bter, p_restype);
1626 if (watermodel != nullptr && (strstr(watermodel, "4p") ||
1627 strstr(watermodel, "4P")))
1631 else if (watermodel != nullptr && (strstr(watermodel, "5p") ||
1632 strstr(watermodel, "5P")))
1641 aps = gmx_atomprop_init();
1643 natom = read_pdball(opt2fn("-f", NFILE, fnm), opt2fn_null("-q", NFILE, fnm), &title,
1644 &pdba_all, &pdbx, &ePBC, box, bRemoveH, &symtab, rt, watres,
1649 gmx_fatal(FARGS, "No atoms found in pdb file %s\n", opt2fn("-f", NFILE, fnm));
1652 printf("Analyzing pdb file\n");
1655 modify_chain_numbers(&pdba_all, chainsep[0]);
1659 this_atomname = nullptr;
1661 this_resname = nullptr;
1664 this_chainnumber = -1;
1665 this_chainstart = 0;
1666 /* Keep the compiler happy */
1667 prev_chainstart = 0;
1671 snew(pdb_ch, maxch);
1674 for (i = 0; (i < natom); i++)
1676 ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1678 /* TODO this should live in a helper object, and consolidate
1679 that with code in modify_chain_numbers */
1680 prev_atomname = this_atomname;
1681 prev_atomnum = this_atomnum;
1682 prev_resname = this_resname;
1683 prev_resnum = this_resnum;
1684 prev_chainid = this_chainid;
1685 prev_chainnumber = this_chainnumber;
1688 prev_chainstart = this_chainstart;
1691 this_atomname = *pdba_all.atomname[i];
1692 this_atomnum = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1693 this_resname = *ri->name;
1694 this_resnum = ri->nr;
1695 this_chainid = ri->chainid;
1696 this_chainnumber = ri->chainnum;
1698 bWat = gmx_strcasecmp(*ri->name, watres) == 0;
1700 if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat != bPrevWat))
1702 this_chainstart = pdba_all.atom[i].resind;
1706 if (!strncmp(merge[0], "int", 3))
1708 printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1709 "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1710 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1711 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1713 if (nullptr == fgets(select, STRLEN-1, stdin))
1715 gmx_fatal(FARGS, "Error reading from stdin");
1717 bMerged = (select[0] == 'y');
1719 else if (!strncmp(merge[0], "all", 3))
1727 pdb_ch[nch-1].chainstart[pdb_ch[nch-1].nterpairs] =
1728 pdba_all.atom[i].resind - prev_chainstart;
1729 pdb_ch[nch-1].nterpairs++;
1730 srenew(pdb_ch[nch-1].chainstart, pdb_ch[nch-1].nterpairs+1);
1735 /* set natom for previous chain */
1738 pdb_ch[nch-1].natom = i-pdb_ch[nch-1].start;
1745 /* check if chain identifier was used before */
1746 for (j = 0; (j < nch); j++)
1748 if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1750 printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1751 "They will be treated as separate chains unless you reorder your file.\n",
1755 // TODO This is too convoluted. Use a std::vector
1759 srenew(pdb_ch, maxch);
1761 pdb_ch[nch].chainid = ri->chainid;
1762 pdb_ch[nch].chainnum = ri->chainnum;
1763 pdb_ch[nch].start = i;
1764 pdb_ch[nch].bAllWat = bWat;
1767 pdb_ch[nch].nterpairs = 0;
1771 pdb_ch[nch].nterpairs = 1;
1773 snew(pdb_ch[nch].chainstart, pdb_ch[nch].nterpairs+1);
1774 /* modified [nch] to [0] below */
1775 pdb_ch[nch].chainstart[0] = 0;
1781 pdb_ch[nch-1].natom = natom-pdb_ch[nch-1].start;
1783 /* set all the water blocks at the end of the chain */
1784 snew(swap_index, nch);
1786 for (i = 0; i < nch; i++)
1788 if (!pdb_ch[i].bAllWat)
1794 for (i = 0; i < nch; i++)
1796 if (pdb_ch[i].bAllWat)
1802 if (nwaterchain > 1)
1804 printf("Moved all the water blocks to the end\n");
1808 /* copy pdb data and x for all chains */
1809 for (i = 0; (i < nch); i++)
1812 chains[i].chainid = pdb_ch[si].chainid;
1813 chains[i].chainnum = pdb_ch[si].chainnum;
1814 chains[i].bAllWat = pdb_ch[si].bAllWat;
1815 chains[i].nterpairs = pdb_ch[si].nterpairs;
1816 chains[i].chainstart = pdb_ch[si].chainstart;
1817 snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1818 snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1819 snew(chains[i].r_start, pdb_ch[si].nterpairs);
1820 snew(chains[i].r_end, pdb_ch[si].nterpairs);
1822 snew(chains[i].pdba, 1);
1823 init_t_atoms(chains[i].pdba, pdb_ch[si].natom, TRUE);
1824 snew(chains[i].x, chains[i].pdba->nr);
1825 for (j = 0; j < chains[i].pdba->nr; j++)
1827 chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1828 snew(chains[i].pdba->atomname[j], 1);
1829 *chains[i].pdba->atomname[j] =
1830 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1831 chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1832 copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1834 /* Re-index the residues assuming that the indices are continuous */
1835 k = chains[i].pdba->atom[0].resind;
1836 nres = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1837 chains[i].pdba->nres = nres;
1838 for (j = 0; j < chains[i].pdba->nr; j++)
1840 chains[i].pdba->atom[j].resind -= k;
1842 srenew(chains[i].pdba->resinfo, nres);
1843 for (j = 0; j < nres; j++)
1845 chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1846 snew(chains[i].pdba->resinfo[j].name, 1);
1847 *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1848 /* make all chain identifiers equal to that of the chain */
1849 chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1853 if (nchainmerges > 0)
1855 printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1859 printf("There are %d chains and %d blocks of water and "
1860 "%d residues with %d atoms\n",
1861 nch-nwaterchain, nwaterchain,
1862 pdba_all.nres, natom);
1864 printf("\n %5s %4s %6s\n", "chain", "#res", "#atoms");
1865 for (i = 0; (i < nch); i++)
1867 printf(" %d '%c' %5d %6d %s\n",
1868 i+1, chains[i].chainid ? chains[i].chainid : '-',
1869 chains[i].pdba->nres, chains[i].pdba->nr,
1870 chains[i].bAllWat ? "(only water)" : "");
1874 check_occupancy(&pdba_all, opt2fn("-f", NFILE, fnm), bVerbose);
1876 /* Read atomtypes... */
1877 atype = read_atype(ffdir, &symtab);
1879 /* read residue database */
1880 printf("Reading residue database... (%s)\n", forcefield);
1881 std::vector<std::string> rtpf = fflib_search_file_end(ffdir, ".rtp", TRUE);
1884 for (const auto &filename : rtpf)
1886 read_resall(filename.c_str(), &nrtp, &restp, atype, &symtab, FALSE);
1890 /* Not correct with multiple rtp input files with different bonded types */
1891 fp = gmx_fio_fopen("new.rtp", "w");
1892 print_resall(fp, nrtp, restp, atype);
1896 /* read hydrogen database */
1897 nah = read_h_db(ffdir, &ah);
1899 /* Read Termini database... */
1900 nNtdb = read_ter_db(ffdir, 'n', &ntdb, atype);
1901 nCtdb = read_ter_db(ffdir, 'c', &ctdb, atype);
1903 top_fn = ftp2fn(efTOP, NFILE, fnm);
1904 top_file = gmx_fio_fopen(top_fn, "w");
1906 print_top_header(top_file, top_fn, FALSE, ffdir, mHmult);
1912 for (chain = 0; (chain < nch); chain++)
1914 cc = &(chains[chain]);
1916 /* set pdba, natom and nres to the current chain */
1919 natom = cc->pdba->nr;
1920 nres = cc->pdba->nres;
1922 if (cc->chainid && ( cc->chainid != ' ' ) )
1924 printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
1925 chain+1, cc->chainid, natom, nres);
1929 printf("Processing chain %d (%d atoms, %d residues)\n",
1930 chain+1, natom, nres);
1933 process_chain(pdba, x, bUnA, bUnA, bUnA, bLysMan, bAspMan, bGluMan,
1934 bHisMan, bArgMan, bGlnMan, angle, distance, &symtab,
1935 nrtprename, rtprename);
1937 cc->chainstart[cc->nterpairs] = pdba->nres;
1939 for (i = 0; i < cc->nterpairs; i++)
1941 find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
1942 &(cc->r_start[j]), &(cc->r_end[j]), rt);
1944 if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
1950 if (cc->nterpairs == 0)
1952 printf("Problem with chain definition, or missing terminal residues.\n"
1953 "This chain does not appear to contain a recognized chain molecule.\n"
1954 "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
1957 /* Check for disulfides and other special bonds */
1958 nssbonds = mk_specbonds(pdba, x, bCysMan, &ssbonds, bVerbose);
1962 rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
1966 for (i = 0; i < cc->nterpairs; i++)
1970 * We first apply a filter so we only have the
1971 * termini that can be applied to the residue in question
1972 * (or a generic terminus if no-residue specific is available).
1974 /* First the N terminus */
1977 tdblist = filter_ter(nNtdb, ntdb,
1978 *pdba->resinfo[cc->r_start[i]].name,
1982 printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
1983 "is already in a terminus-specific form and skipping terminus selection.\n");
1984 cc->ntdb[i] = nullptr;
1988 if (bTerMan && ntdblist > 1)
1990 sprintf(select, "Select start terminus type for %s-%d",
1991 *pdba->resinfo[cc->r_start[i]].name,
1992 pdba->resinfo[cc->r_start[i]].nr);
1993 cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
1997 cc->ntdb[i] = tdblist[0];
2000 printf("Start terminus %s-%d: %s\n",
2001 *pdba->resinfo[cc->r_start[i]].name,
2002 pdba->resinfo[cc->r_start[i]].nr,
2003 (cc->ntdb[i])->name);
2009 cc->ntdb[i] = nullptr;
2012 /* And the C terminus */
2015 tdblist = filter_ter(nCtdb, ctdb,
2016 *pdba->resinfo[cc->r_end[i]].name,
2020 printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
2021 "is already in a terminus-specific form and skipping terminus selection.\n");
2022 cc->ctdb[i] = nullptr;
2026 if (bTerMan && ntdblist > 1)
2028 sprintf(select, "Select end terminus type for %s-%d",
2029 *pdba->resinfo[cc->r_end[i]].name,
2030 pdba->resinfo[cc->r_end[i]].nr);
2031 cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
2035 cc->ctdb[i] = tdblist[0];
2037 printf("End terminus %s-%d: %s\n",
2038 *pdba->resinfo[cc->r_end[i]].name,
2039 pdba->resinfo[cc->r_end[i]].nr,
2040 (cc->ctdb[i])->name);
2046 cc->ctdb[i] = nullptr;
2049 /* lookup hackblocks and rtp for all residues */
2050 get_hackblocks_rtp(&hb_chain, &restp_chain,
2051 nrtp, restp, pdba->nres, pdba->resinfo,
2052 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end,
2054 /* ideally, now we would not need the rtp itself anymore, but do
2055 everything using the hb and restp arrays. Unfortunately, that
2056 requires some re-thinking of code in gen_vsite.c, which I won't
2057 do now :( AF 26-7-99 */
2059 rename_atoms(nullptr, ffdir,
2060 pdba, &symtab, restp_chain, FALSE, rt, FALSE, bVerbose);
2062 match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose);
2066 block = new_blocka();
2068 sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
2069 remove_duplicate_atoms(pdba, x, bVerbose);
2070 if (ftp2bSet(efNDX, NFILE, fnm))
2074 fprintf(stderr, "WARNING: with the -remh option the generated "
2075 "index file (%s) might be useless\n"
2076 "(the index file is generated before hydrogens are added)",
2077 ftp2fn(efNDX, NFILE, fnm));
2079 write_index(ftp2fn(efNDX, NFILE, fnm), block, gnames, FALSE, 0);
2081 for (i = 0; i < block->nr; i++)
2090 fprintf(stderr, "WARNING: "
2091 "without sorting no check for duplicate atoms can be done\n");
2094 /* Generate Hydrogen atoms (and termini) in the sequence */
2095 printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2096 add_h(&pdba, &x, nah, ah,
2097 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing,
2098 nullptr, nullptr, TRUE, FALSE);
2099 printf("Now there are %d residues with %d atoms\n",
2100 pdba->nres, pdba->nr);
2102 strcpy(posre_fn, ftp2fn(efITP, NFILE, fnm));
2104 /* make up molecule name(s) */
2106 k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2108 gmx_residuetype_get_type(rt, *pdba->resinfo[k].name, &p_restype);
2114 sprintf(molname, "Water");
2118 this_chainid = cc->chainid;
2120 /* Add the chain id if we have one */
2121 if (this_chainid != ' ')
2123 sprintf(buf, "_chain_%c", this_chainid);
2124 strcat(suffix, buf);
2127 /* Check if there have been previous chains with the same id */
2129 for (k = 0; k < chain; k++)
2131 if (cc->chainid == chains[k].chainid)
2136 /* Add the number for this chain identifier if there are multiple copies */
2140 sprintf(buf, "%d", nid_used+1);
2141 strcat(suffix, buf);
2144 if (strlen(suffix) > 0)
2146 sprintf(molname, "%s%s", p_restype, suffix);
2150 strcpy(molname, p_restype);
2154 if ((nch-nwaterchain > 1) && !cc->bAllWat)
2157 strcpy(itp_fn, top_fn);
2158 printf("Chain time...\n");
2159 c = strrchr(itp_fn, '.');
2160 sprintf(c, "_%s.itp", molname);
2161 c = strrchr(posre_fn, '.');
2162 sprintf(c, "_%s.itp", molname);
2163 if (strcmp(itp_fn, posre_fn) == 0)
2165 strcpy(buf_fn, posre_fn);
2166 c = strrchr(buf_fn, '.');
2168 sprintf(posre_fn, "%s_pr.itp", buf_fn);
2172 srenew(incls, nincl);
2173 incls[nincl-1] = gmx_strdup(itp_fn);
2174 itp_file = gmx_fio_fopen(itp_fn, "w");
2181 srenew(mols, nmol+1);
2184 mols[nmol].name = gmx_strdup("SOL");
2185 mols[nmol].nr = pdba->nres;
2189 mols[nmol].name = gmx_strdup(molname);
2196 print_top_comment(itp_file, itp_fn, ffdir, TRUE);
2201 top_file2 = nullptr;
2205 top_file2 = itp_file;
2209 top_file2 = top_file;
2212 pdb2top(top_file2, posre_fn, molname, pdba, &x, atype, &symtab,
2214 restp_chain, hb_chain,
2216 bVsites, bVsiteAromatics, ffdir,
2217 mHmult, nssbonds, ssbonds,
2218 long_bond_dist, short_bond_dist, bDeuterate, bChargeGroups, bCmap,
2219 bRenumRes, bRTPresname);
2223 write_posres(posre_fn, pdba, posre_fc);
2228 gmx_fio_fclose(itp_file);
2231 /* pdba and natom have been reassigned somewhere so: */
2237 if (watermodel == nullptr)
2239 for (chain = 0; chain < nch; chain++)
2241 if (chains[chain].bAllWat)
2243 gmx_fatal(FARGS, "You have chosen not to include a water model, but there is water in the input file. Select a water model or remove the water from your input file.");
2249 sprintf(buf_fn, "%s%c%s.itp", ffdir, DIR_SEPARATOR, watermodel);
2250 if (!fflib_fexist(buf_fn))
2252 gmx_fatal(FARGS, "The topology file '%s' for the selected water model '%s' can not be found in the force field directory. Select a different water model.",
2253 buf_fn, watermodel);
2257 print_top_mols(top_file, title, ffdir, watermodel, nincl, incls, nmol, mols);
2258 gmx_fio_fclose(top_file);
2260 gmx_residuetype_destroy(rt);
2262 /* now merge all chains back together */
2265 for (i = 0; (i < nch); i++)
2267 natom += chains[i].pdba->nr;
2268 nres += chains[i].pdba->nres;
2271 init_t_atoms(atoms, natom, FALSE);
2272 for (i = 0; i < atoms->nres; i++)
2274 sfree(atoms->resinfo[i].name);
2276 sfree(atoms->resinfo);
2278 snew(atoms->resinfo, nres);
2282 for (i = 0; (i < nch); i++)
2286 printf("Including chain %d in system: %d atoms %d residues\n",
2287 i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2289 for (j = 0; (j < chains[i].pdba->nr); j++)
2291 atoms->atom[k] = chains[i].pdba->atom[j];
2292 atoms->atom[k].resind += l; /* l is processed nr of residues */
2293 atoms->atomname[k] = chains[i].pdba->atomname[j];
2294 atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2295 copy_rvec(chains[i].x[j], x[k]);
2298 for (j = 0; (j < chains[i].pdba->nres); j++)
2300 atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2303 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2311 fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2312 print_sums(atoms, TRUE);
2315 fprintf(stderr, "\nWriting coordinate file...\n");
2316 clear_rvec(box_space);
2319 make_new_box(atoms->nr, x, box, box_space, FALSE);
2321 write_sto_conf(ftp2fn(efSTO, NFILE, fnm), title, atoms, x, nullptr, ePBC, box);
2323 printf("\t\t--------- PLEASE NOTE ------------\n");
2324 printf("You have successfully generated a topology from: %s.\n",
2325 opt2fn("-f", NFILE, fnm));
2326 if (watermodel != nullptr)
2328 printf("The %s force field and the %s water model are used.\n",
2329 ffname, watermodel);
2333 printf("The %s force field is used.\n",
2336 printf("\t\t--------- ETON ESAELP ------------\n");