2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
46 #include "gromacs/commandline/pargs.h"
47 #include "gromacs/fileio/confio.h"
48 #include "gromacs/fileio/gmxfio.h"
49 #include "gromacs/fileio/pdbio.h"
50 #include "gromacs/gmxlib/conformation-utilities.h"
51 #include "gromacs/gmxpreprocess/fflibutil.h"
52 #include "gromacs/gmxpreprocess/genhydro.h"
53 #include "gromacs/gmxpreprocess/h_db.h"
54 #include "gromacs/gmxpreprocess/hizzie.h"
55 #include "gromacs/gmxpreprocess/pdb2top.h"
56 #include "gromacs/gmxpreprocess/pgutil.h"
57 #include "gromacs/gmxpreprocess/resall.h"
58 #include "gromacs/gmxpreprocess/specbond.h"
59 #include "gromacs/gmxpreprocess/ter_db.h"
60 #include "gromacs/gmxpreprocess/toputil.h"
61 #include "gromacs/gmxpreprocess/xlate.h"
62 #include "gromacs/math/vec.h"
63 #include "gromacs/topology/atomprop.h"
64 #include "gromacs/topology/block.h"
65 #include "gromacs/topology/index.h"
66 #include "gromacs/topology/residuetypes.h"
67 #include "gromacs/topology/symtab.h"
68 #include "gromacs/topology/topology.h"
69 #include "gromacs/utility/arraysize.h"
70 #include "gromacs/utility/cstringutil.h"
71 #include "gromacs/utility/dir_separator.h"
72 #include "gromacs/utility/fatalerror.h"
73 #include "gromacs/utility/gmxassert.h"
74 #include "gromacs/utility/smalloc.h"
75 #include "gromacs/utility/strdb.h"
76 #include "gromacs/utility/stringutil.h"
80 char gmx[RTP_MAXCHAR+2];
81 char main[RTP_MAXCHAR+2];
82 char nter[RTP_MAXCHAR+2];
83 char cter[RTP_MAXCHAR+2];
84 char bter[RTP_MAXCHAR+2];
88 static const char *res2bb_notermini(const char *name,
89 int nrr, const rtprename_t *rr)
91 /* NOTE: This function returns the main building block name,
92 * it does not take terminal renaming into account.
97 while (i < nrr && gmx_strcasecmp(name, rr[i].gmx) != 0)
102 return (i < nrr ? rr[i].main : name);
105 static const char *select_res(int nr, int resnr,
106 const char *name[], const char *expl[],
108 int nrr, const rtprename_t *rr)
112 printf("Which %s type do you want for residue %d\n", title, resnr+1);
113 for (sel = 0; (sel < nr); sel++)
115 printf("%d. %s (%s)\n",
116 sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
118 printf("\nType a number:"); fflush(stdout);
120 if (scanf("%d", &sel) != 1)
122 gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
128 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
133 const char *lh[easpNR] = { "ASP", "ASPH" };
134 const char *expl[easpNR] = {
135 "Not protonated (charge -1)",
136 "Protonated (charge 0)"
139 return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
142 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
147 const char *lh[egluNR] = { "GLU", "GLUH" };
148 const char *expl[egluNR] = {
149 "Not protonated (charge -1)",
150 "Protonated (charge 0)"
153 return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
156 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
161 const char *lh[eglnNR] = { "GLN", "QLN" };
162 const char *expl[eglnNR] = {
163 "Not protonated (charge 0)",
164 "Protonated (charge +1)"
167 return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
170 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
175 const char *lh[elysNR] = { "LYSN", "LYS" };
176 const char *expl[elysNR] = {
177 "Not protonated (charge 0)",
178 "Protonated (charge +1)"
181 return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
184 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
189 const char *lh[eargNR] = { "ARGN", "ARG" };
190 const char *expl[eargNR] = {
191 "Not protonated (charge 0)",
192 "Protonated (charge +1)"
195 return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
198 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
200 const char *expl[ehisNR] = {
207 return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
210 static void read_rtprename(const char *fname, FILE *fp,
211 int *nrtprename, rtprename_t **rtprename)
213 char line[STRLEN], buf[STRLEN];
222 while (get_a_line(fp, line, STRLEN))
225 /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
226 * For other args, we read up to 6 chars (so we can detect if the length is > 5).
227 * Note that the buffer length has been increased to 7 to allow this,
228 * so we just need to make sure the strings have zero-length initially.
231 rr[n].main[0] = '\0';
232 rr[n].nter[0] = '\0';
233 rr[n].cter[0] = '\0';
234 rr[n].bter[0] = '\0';
235 nc = sscanf(line, "%6s %6s %6s %6s %6s %s",
236 rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
237 if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
238 strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
240 gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
241 fname, RTP_MAXCHAR, line);
246 if (nc != 2 && nc != 5)
248 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d, %d or %d", fname, ncol, 2, 5);
254 gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
259 /* This file does not have special termini names, copy them from main */
260 strcpy(rr[n].nter, rr[n].main);
261 strcpy(rr[n].cter, rr[n].main);
262 strcpy(rr[n].bter, rr[n].main);
272 static char *search_resrename(int nrr, rtprename_t *rr,
274 gmx_bool bStart, gmx_bool bEnd,
275 gmx_bool bCompareFFRTPname)
283 while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx) != 0) ||
284 ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
289 /* If found in the database, rename this residue's rtp building block,
290 * otherwise keep the old name.
313 gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
321 static void rename_resrtp(t_atoms *pdba, int nterpairs, const int *r_start, const int *r_end,
322 int nrr, rtprename_t *rr, t_symtab *symtab,
326 gmx_bool bStart, bEnd;
328 gmx_bool bFFRTPTERRNM;
330 bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
332 for (r = 0; r < pdba->nres; r++)
336 for (j = 0; j < nterpairs; j++)
343 for (j = 0; j < nterpairs; j++)
351 nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, FALSE);
353 if (bFFRTPTERRNM && nn == nullptr && (bStart || bEnd))
355 /* This is a terminal residue, but the residue name,
356 * currently stored in .rtp, is not a standard residue name,
357 * but probably a force field specific rtp name.
358 * Check if we need to rename it because it is terminal.
360 nn = search_resrename(nrr, rr,
361 *pdba->resinfo[r].rtp, bStart, bEnd, TRUE);
364 if (nn != nullptr && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
368 printf("Changing rtp entry of residue %d %s to '%s'\n",
369 pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
371 pdba->resinfo[r].rtp = put_symtab(symtab, nn);
376 static void pdbres_to_gmxrtp(t_atoms *pdba)
380 for (i = 0; (i < pdba->nres); i++)
382 if (pdba->resinfo[i].rtp == nullptr)
384 pdba->resinfo[i].rtp = pdba->resinfo[i].name;
389 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
390 gmx_bool bFullCompare, t_symtab *symtab)
395 for (i = 0; (i < pdba->nres); i++)
397 resnm = *pdba->resinfo[i].name;
398 if ((bFullCompare && (gmx_strcasecmp(resnm, oldnm) == 0)) ||
399 (!bFullCompare && strstr(resnm, oldnm) != nullptr))
401 /* Rename the residue name (not the rtp name) */
402 pdba->resinfo[i].name = put_symtab(symtab, newnm);
407 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
408 gmx_bool bFullCompare, t_symtab *symtab)
413 for (i = 0; (i < pdba->nres); i++)
415 /* We have not set the rtp name yes, use the residue name */
416 bbnm = *pdba->resinfo[i].name;
417 if ((bFullCompare && (gmx_strcasecmp(bbnm, oldnm) == 0)) ||
418 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
420 /* Change the rtp builing block name */
421 pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
426 static void rename_bbint(t_atoms *pdba, const char *oldnm,
427 const char *gettp(int, int, const rtprename_t *),
428 gmx_bool bFullCompare,
430 int nrr, const rtprename_t *rr)
436 for (i = 0; i < pdba->nres; i++)
438 /* We have not set the rtp name yes, use the residue name */
439 bbnm = *pdba->resinfo[i].name;
440 if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
441 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
443 ptr = gettp(i, nrr, rr);
444 pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
449 static void check_occupancy(t_atoms *atoms, const char *filename, gmx_bool bVerbose)
455 ftp = fn2ftp(filename);
456 if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
458 fprintf(stderr, "No occupancies in %s\n", filename);
462 for (i = 0; (i < atoms->nr); i++)
464 if (atoms->pdbinfo[i].occup != 1)
468 fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
469 *atoms->resinfo[atoms->atom[i].resind].name,
470 atoms->resinfo[ atoms->atom[i].resind].nr,
472 atoms->pdbinfo[i].occup);
474 if (atoms->pdbinfo[i].occup == 0)
484 if (nzero == atoms->nr)
486 fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
488 else if ((nzero > 0) || (nnotone > 0))
492 "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
493 " occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
495 nzero, nnotone, atoms->nr);
499 fprintf(stderr, "All occupancies are one\n");
504 static void write_posres(char *fn, t_atoms *pdba, real fc)
509 fp = gmx_fio_fopen(fn, "w");
511 "; In this topology include file, you will find position restraint\n"
512 "; entries for all the heavy atoms in your original pdb file.\n"
513 "; This means that all the protons which were added by pdb2gmx are\n"
514 "; not restrained.\n"
516 "[ position_restraints ]\n"
517 "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
519 for (i = 0; (i < pdba->nr); i++)
521 if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
523 fprintf(fp, "%6d%6d %g %g %g\n", i+1, 1, fc, fc, fc);
529 static int read_pdball(const char *inf, const char *outf, char *title,
530 t_atoms *atoms, rvec **x,
531 int *ePBC, matrix box, gmx_bool bRemoveH,
532 t_symtab *symtab, gmx_residuetype_t *rt, const char *watres,
533 gmx_atomprop_t aps, gmx_bool bVerbose)
534 /* Read a pdb file. (containing proteins) */
536 int natom, new_natom, i;
539 printf("Reading %s...\n", inf);
542 read_tps_conf(inf, top, ePBC, x, nullptr, box, FALSE);
543 strncpy(title, *top->name, STRLEN);
544 title[STRLEN-1] = '\0';
548 if (atoms->pdbinfo == nullptr)
550 snew(atoms->pdbinfo, atoms->nr);
552 if (fn2ftp(inf) == efPDB)
554 get_pdb_atomnumber(atoms, aps);
559 for (i = 0; i < atoms->nr; i++)
561 if (!is_hydrogen(*atoms->atomname[i]))
563 atoms->atom[new_natom] = atoms->atom[i];
564 atoms->atomname[new_natom] = atoms->atomname[i];
565 atoms->pdbinfo[new_natom] = atoms->pdbinfo[i];
566 copy_rvec((*x)[i], (*x)[new_natom]);
570 atoms->nr = new_natom;
577 printf(" '%s',", title);
579 printf(" %d atoms\n", natom);
581 /* Rename residues */
582 rename_pdbres(atoms, "HOH", watres, FALSE, symtab);
583 rename_pdbres(atoms, "SOL", watres, FALSE, symtab);
584 rename_pdbres(atoms, "WAT", watres, FALSE, symtab);
586 rename_atoms("xlateat.dat", nullptr,
587 atoms, symtab, nullptr, TRUE, rt, TRUE, bVerbose);
596 write_sto_conf(outf, title, atoms, *x, nullptr, *ePBC, box);
602 static void process_chain(t_atoms *pdba, rvec *x,
603 gmx_bool bTrpU, gmx_bool bPheU, gmx_bool bTyrU,
604 gmx_bool bLysMan, gmx_bool bAspMan, gmx_bool bGluMan,
605 gmx_bool bHisMan, gmx_bool bArgMan, gmx_bool bGlnMan,
606 real angle, real distance, t_symtab *symtab,
607 int nrr, const rtprename_t *rr)
609 /* Rename aromatics, lys, asp and histidine */
612 rename_bb(pdba, "TYR", "TYRU", FALSE, symtab);
616 rename_bb(pdba, "TRP", "TRPU", FALSE, symtab);
620 rename_bb(pdba, "PHE", "PHEU", FALSE, symtab);
624 rename_bbint(pdba, "LYS", get_lystp, FALSE, symtab, nrr, rr);
628 rename_bbint(pdba, "ARG", get_argtp, FALSE, symtab, nrr, rr);
632 rename_bbint(pdba, "GLN", get_glntp, FALSE, symtab, nrr, rr);
636 rename_bbint(pdba, "ASP", get_asptp, FALSE, symtab, nrr, rr);
640 rename_bb(pdba, "ASPH", "ASP", FALSE, symtab);
644 rename_bbint(pdba, "GLU", get_glutp, FALSE, symtab, nrr, rr);
648 rename_bb(pdba, "GLUH", "GLU", FALSE, symtab);
653 set_histp(pdba, x, angle, distance);
657 rename_bbint(pdba, "HIS", get_histp, TRUE, symtab, nrr, rr);
660 /* Initialize the rtp builing block names with the residue names
661 * for the residues that have not been processed above.
663 pdbres_to_gmxrtp(pdba);
665 /* Now we have all rtp names set.
666 * The rtp names will conform to Gromacs naming,
667 * unless the input pdb file contained one or more force field specific
668 * rtp names as residue names.
672 /* struct for sorting the atoms from the pdb file */
674 int resnr; /* residue number */
675 int j; /* database order index */
676 int index; /* original atom number */
677 char anm1; /* second letter of atom name */
678 char altloc; /* alternate location indicator */
681 static int pdbicomp(const void *a, const void *b)
686 pa = (t_pdbindex *)a;
687 pb = (t_pdbindex *)b;
689 d = (pa->resnr - pb->resnr);
695 d = (pa->anm1 - pb->anm1);
698 d = (pa->altloc - pb->altloc);
706 static void sort_pdbatoms(t_restp restp[],
707 int natoms, t_atoms **pdbaptr, rvec **x,
708 t_blocka *block, char ***gnames)
710 t_atoms *pdba, *pdbnew;
724 for (i = 0; i < natoms; i++)
726 atomnm = *pdba->atomname[i];
727 rptr = &restp[pdba->atom[i].resind];
728 for (j = 0; (j < rptr->natom); j++)
730 if (gmx_strcasecmp(atomnm, *(rptr->atomname[j])) == 0)
735 if (j == rptr->natom)
740 "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
741 "while sorting atoms.\n%s", atomnm,
742 *pdba->resinfo[pdba->atom[i].resind].name,
743 pdba->resinfo[pdba->atom[i].resind].nr,
746 is_hydrogen(atomnm) ?
747 "\nFor a hydrogen, this can be a different protonation state, or it\n"
748 "might have had a different number in the PDB file and was rebuilt\n"
749 "(it might for instance have been H3, and we only expected H1 & H2).\n"
750 "Note that hydrogens might have been added to the entry for the N-terminus.\n"
751 "Remove this hydrogen or choose a different protonation state to solve it.\n"
752 "Option -ignh will ignore all hydrogens in the input." : ".");
753 gmx_fatal(FARGS, "%s", buf);
755 /* make shadow array to be sorted into indexgroup */
756 pdbi[i].resnr = pdba->atom[i].resind;
759 pdbi[i].anm1 = atomnm[1];
760 pdbi[i].altloc = pdba->pdbinfo[i].altloc;
762 qsort(pdbi, natoms, (size_t)sizeof(pdbi[0]), pdbicomp);
764 /* pdba is sorted in pdbnew using the pdbi index */
767 init_t_atoms(pdbnew, natoms, TRUE);
769 pdbnew->nr = pdba->nr;
770 pdbnew->nres = pdba->nres;
771 sfree(pdbnew->resinfo);
772 pdbnew->resinfo = pdba->resinfo;
773 for (i = 0; i < natoms; i++)
775 pdbnew->atom[i] = pdba->atom[pdbi[i].index];
776 pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
777 pdbnew->pdbinfo[i] = pdba->pdbinfo[pdbi[i].index];
778 copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
779 /* make indexgroup in block */
780 a[i] = pdbi[i].index;
783 sfree(pdba->atomname);
785 sfree(pdba->pdbinfo);
788 /* copy the sorted pdbnew back to pdba */
791 add_grp(block, gnames, natoms, a, "prot_sort");
797 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], gmx_bool bVerbose)
799 int i, j, oldnatoms, ndel;
802 printf("Checking for duplicate atoms....\n");
803 oldnatoms = pdba->nr;
805 /* NOTE: pdba->nr is modified inside the loop */
806 for (i = 1; (i < pdba->nr); i++)
808 /* compare 'i' and 'i-1', throw away 'i' if they are identical
809 this is a 'while' because multiple alternate locations can be present */
810 while ( (i < pdba->nr) &&
811 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
812 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
817 ri = &pdba->resinfo[pdba->atom[i].resind];
818 printf("deleting duplicate atom %4s %s%4d%c",
819 *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
820 if (ri->chainid && (ri->chainid != ' '))
822 printf(" ch %c", ri->chainid);
826 if (pdba->pdbinfo[i].atomnr)
828 printf(" pdb nr %4d", pdba->pdbinfo[i].atomnr);
830 if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
832 printf(" altloc %c", pdba->pdbinfo[i].altloc);
838 /* We can not free, since it might be in the symtab */
839 /* sfree(pdba->atomname[i]); */
840 for (j = i; j < pdba->nr; j++)
842 pdba->atom[j] = pdba->atom[j+1];
843 pdba->atomname[j] = pdba->atomname[j+1];
846 pdba->pdbinfo[j] = pdba->pdbinfo[j+1];
848 copy_rvec(x[j+1], x[j]);
850 srenew(pdba->atom, pdba->nr);
851 /* srenew(pdba->atomname, pdba->nr); */
852 srenew(pdba->pdbinfo, pdba->nr);
855 if (pdba->nr != oldnatoms)
857 printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
864 checkResidueTypeSanity(t_atoms * pdba,
867 gmx_residuetype_t * rt)
869 std::string startResidueString = gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
870 std::string endResidueString = gmx::formatString("%s%d", *pdba->resinfo[r1-1].name, pdba->resinfo[r1-1].nr);
872 // Check whether all residues in chain have the same chain ID.
873 bool allResiduesHaveSameChainID = true;
874 char chainID0 = pdba->resinfo[r0].chainid;
876 std::string residueString;
878 for (int i = r0 + 1; i < r1; i++)
880 chainID = pdba->resinfo[i].chainid;
881 if (chainID != chainID0)
883 allResiduesHaveSameChainID = false;
884 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
889 if (!allResiduesHaveSameChainID)
892 "The chain covering the range %s--%s does not have a consistent chain ID. "
893 "The first residue has ID '%c', while residue %s has ID '%c'.",
894 startResidueString.c_str(), endResidueString.c_str(),
895 chainID0, residueString.c_str(), chainID);
898 // At this point all residues have the same ID. If they are also non-blank
899 // we can be a bit more aggressive and require the types match too.
902 bool allResiduesHaveSameType = true;
903 const char *restype0;
905 gmx_residuetype_get_type(rt, *pdba->resinfo[r0].name, &restype0);
907 for (int i = r0 + 1; i < r1; i++)
909 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &restype);
910 if (gmx_strcasecmp(restype, restype0))
912 allResiduesHaveSameType = false;
913 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
918 if (!allResiduesHaveSameType)
921 "The residues in the chain %s--%s do not have a consistent type. "
922 "The first residue has type '%s', while residue %s is of type '%s'. "
923 "Either there is a mistake in your chain, or it includes nonstandard "
924 "residue names that have not yet been added to the residuetypes.dat "
925 "file in the GROMACS library directory. If there are other molecules "
926 "such as ligands, they should not have the same chain ID as the "
927 "adjacent protein chain since it's a separate molecule.",
928 startResidueString.c_str(), endResidueString.c_str(),
929 restype0, residueString.c_str(), restype);
934 static void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
935 gmx_residuetype_t *rt)
938 const char *p_startrestype;
939 const char *p_restype;
944 int startWarnings = 0;
948 // Check that all residues have the same chain identifier, and if it is
949 // non-blank we also require the residue types to match.
950 checkResidueTypeSanity(pdba, r0, r1, rt);
952 // If we return correctly from checkResidueTypeSanity(), the only
953 // remaining cases where we can have non-matching residue types is if
954 // the chain ID was blank, which could be the case e.g. for a structure
955 // read from a GRO file or other file types without chain information.
956 // In that case we need to be a bit more liberal and detect chains based
957 // on the residue type.
959 // If we get here, the chain ID must be identical for all residues
960 char chainID = pdba->resinfo[r0].chainid;
962 /* Find the starting terminus (typially N or 5') */
963 for (i = r0; i < r1 && *r_start == -1; i++)
965 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_startrestype);
966 if (!gmx_strcasecmp(p_startrestype, "Protein") || !gmx_strcasecmp(p_startrestype, "DNA") || !gmx_strcasecmp(p_startrestype, "RNA") )
968 printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
971 else if (!gmx_strcasecmp(p_startrestype, "Ion"))
975 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
979 printf("Disabling further notes about ions.\n");
985 if (startWarnings < 5)
989 printf("\nWarning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n"
990 "This chain lacks identifiers, which makes it impossible to do strict\n"
991 "classification of the start/end residues. Here we need to guess this residue\n"
992 "should not be part of the chain and instead introduce a break, but that will\n"
993 "be catastrophic if they should in fact be linked. Please check your structure,\n"
994 "and add %s to residuetypes.dat if this was not correct.\n\n",
995 *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
999 printf("\nWarning: No residues in chain starting at %s%d identified as Protein/RNA/DNA.\n"
1000 "This makes it impossible to link them into a molecule, which could either be\n"
1001 "correct or a catastrophic error. Please check your structure, and add all\n"
1002 "necessary residue names to residuetypes.dat if this was not correct.\n\n",
1003 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1006 if (startWarnings == 4)
1008 printf("Disabling further warnings about unidentified residues at start of chain.\n");
1016 /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1017 for (i = *r_start; i < r1; i++)
1019 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_restype);
1020 if (!gmx_strcasecmp(p_restype, p_startrestype) && endWarnings == 0)
1024 else if (!gmx_strcasecmp(p_startrestype, "Ion"))
1028 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1032 printf("Disabling further notes about ions.\n");
1038 // This can only trigger if the chain ID is blank - otherwise the
1039 // call to checkResidueTypeSanity() will have caught the problem.
1040 if (endWarnings < 5)
1042 printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1043 "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1044 "it impossible to do strict classification of the start/end residues. Here we\n"
1045 "need to guess this residue should not be part of the chain and instead\n"
1046 "introduce a break, but that will be catastrophic if they should in fact be\n"
1047 "linked. Please check your structure, and add %s to residuetypes.dat\n"
1048 "if this was not correct.\n\n",
1049 *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype,
1050 *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype, *pdba->resinfo[i].name);
1052 if (endWarnings == 4)
1054 printf("Disabling further warnings about unidentified residues at end of chain.\n");
1063 printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1077 static SplittingType getSplittingType(const char *chainsep)
1079 SplittingType splitting = SPLIT_TER_ONLY; /* keep compiler happy */
1081 /* Be a bit flexible to catch typos */
1082 if (!strncmp(chainsep, "id_o", 4))
1084 /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
1085 splitting = SPLIT_ID_OR_TER;
1086 printf("Splitting chemical chains based on TER records or chain id changing.\n");
1088 else if (!strncmp(chainsep, "int", 3))
1090 /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
1091 splitting = SPLIT_INTERACTIVE;
1092 printf("Splitting chemical chains interactively.\n");
1094 else if (!strncmp(chainsep, "id_a", 4))
1096 splitting = SPLIT_ID_AND_TER;
1097 printf("Splitting chemical chains based on TER records and chain id changing.\n");
1099 else if (strlen(chainsep) == 2 && !strncmp(chainsep, "id", 4))
1101 splitting = SPLIT_ID_ONLY;
1102 printf("Splitting chemical chains based on changing chain id only (ignoring TER records).\n");
1104 else if (chainsep[0] == 't')
1106 splitting = SPLIT_TER_ONLY;
1107 printf("Splitting chemical chains based on TER records only (ignoring chain id).\n");
1111 gmx_fatal(FARGS, "Unidentified setting for chain separation: %s\n", chainsep);
1117 modify_chain_numbers(t_atoms * pdba,
1118 const char * chainsep)
1121 char old_prev_chainid;
1122 char old_this_chainid;
1123 int old_prev_chainnum;
1124 int old_this_chainnum;
1126 char select[STRLEN];
1130 const char * prev_atomname;
1131 const char * this_atomname;
1132 const char * prev_resname;
1133 const char * this_resname;
1139 SplittingType splitting = getSplittingType(chainsep);
1141 /* The default chain enumeration is based on TER records only, which is reflected in chainnum below */
1143 old_prev_chainid = '?';
1144 old_prev_chainnum = -1;
1147 this_atomname = nullptr;
1149 this_resname = nullptr;
1153 for (i = 0; i < pdba->nres; i++)
1155 ri = &pdba->resinfo[i];
1156 old_this_chainid = ri->chainid;
1157 old_this_chainnum = ri->chainnum;
1159 prev_atomname = this_atomname;
1160 prev_atomnum = this_atomnum;
1161 prev_resname = this_resname;
1162 prev_resnum = this_resnum;
1163 prev_chainid = this_chainid;
1165 this_atomname = *(pdba->atomname[i]);
1166 this_atomnum = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1167 this_resname = *ri->name;
1168 this_resnum = ri->nr;
1169 this_chainid = ri->chainid;
1173 case SPLIT_ID_OR_TER:
1174 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1180 case SPLIT_ID_AND_TER:
1181 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1188 if (old_this_chainid != old_prev_chainid)
1194 case SPLIT_TER_ONLY:
1195 if (old_this_chainnum != old_prev_chainnum)
1200 case SPLIT_INTERACTIVE:
1201 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1205 printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\n"
1206 "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1207 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1208 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1210 if (nullptr == fgets(select, STRLEN-1, stdin))
1212 gmx_fatal(FARGS, "Error reading from stdin");
1215 if (i == 0 || select[0] == 'y')
1222 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1224 old_prev_chainid = old_this_chainid;
1225 old_prev_chainnum = old_this_chainnum;
1227 ri->chainnum = new_chainnum;
1256 int gmx_pdb2gmx(int argc, char *argv[])
1258 const char *desc[] = {
1259 "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1260 "some database files, adds hydrogens to the molecules and generates",
1261 "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1262 "and a topology in GROMACS format.",
1263 "These files can subsequently be processed to generate a run input file.",
1265 "[THISMODULE] will search for force fields by looking for",
1266 "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1267 "of the current working directory and of the GROMACS library directory",
1268 "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1270 "By default the forcefield selection is interactive,",
1271 "but you can use the [TT]-ff[tt] option to specify one of the short names",
1272 "in the list on the command line instead. In that case [THISMODULE] just looks",
1273 "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1275 "After choosing a force field, all files will be read only from",
1276 "the corresponding force field directory.",
1277 "If you want to modify or add a residue types, you can copy the force",
1278 "field directory from the GROMACS library directory to your current",
1279 "working directory. If you want to add new protein residue types,",
1280 "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1281 "or copy the whole library directory to a local directory and set",
1282 "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1283 "Check Chapter 5 of the manual for more information about file formats.",
1286 "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1287 "need not necessarily contain a protein structure. Every kind of",
1288 "molecule for which there is support in the database can be converted.",
1289 "If there is no support in the database, you can add it yourself.[PAR]",
1291 "The program has limited intelligence, it reads a number of database",
1292 "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1293 "if necessary this can be done manually. The program can prompt the",
1294 "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1295 "desired. For Lys the choice is between neutral (two protons on NZ) or",
1296 "protonated (three protons, default), for Asp and Glu unprotonated",
1297 "(default) or protonated, for His the proton can be either on ND1,",
1298 "on NE2 or on both. By default these selections are done automatically.",
1299 "For His, this is based on an optimal hydrogen bonding",
1300 "conformation. Hydrogen bonds are defined based on a simple geometric",
1301 "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1302 "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1303 "[TT]-dist[tt] respectively.[PAR]",
1305 "The protonation state of N- and C-termini can be chosen interactively",
1306 "with the [TT]-ter[tt] flag. Default termini are ionized (NH3+ and COO-),",
1307 "respectively. Some force fields support zwitterionic forms for chains of",
1308 "one residue, but for polypeptides these options should NOT be selected.",
1309 "The AMBER force fields have unique forms for the terminal residues,",
1310 "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1311 "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1312 "respectively to use these forms, making sure you preserve the format",
1313 "of the coordinate file. Alternatively, use named terminating residues",
1314 "(e.g. ACE, NME).[PAR]",
1316 "The separation of chains is not entirely trivial since the markup",
1317 "in user-generated PDB files frequently varies and sometimes it",
1318 "is desirable to merge entries across a TER record, for instance",
1319 "if you want a disulfide bridge or distance restraints between",
1320 "two protein chains or if you have a HEME group bound to a protein.",
1321 "In such cases multiple chains should be contained in a single",
1322 "[TT]moleculetype[tt] definition.",
1323 "To handle this, [THISMODULE] uses two separate options.",
1324 "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1325 "start, and termini added when applicable. This can be done based on the",
1326 "existence of TER records, when the chain id changes, or combinations of either",
1327 "or both of these. You can also do the selection fully interactively.",
1328 "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1329 "are merged into one moleculetype, after adding all the chemical termini (or not).",
1330 "This can be turned off (no merging), all non-water chains can be merged into a",
1331 "single molecule, or the selection can be done interactively.[PAR]",
1333 "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1334 "If any of the occupancies are not one, indicating that the atom is",
1335 "not resolved well in the structure, a warning message is issued.",
1336 "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1337 "all occupancy fields may be zero. Either way, it is up to the user",
1338 "to verify the correctness of the input data (read the article!).[PAR]",
1340 "During processing the atoms will be reordered according to GROMACS",
1341 "conventions. With [TT]-n[tt] an index file can be generated that",
1342 "contains one group reordered in the same way. This allows you to",
1343 "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1344 "one limitation: reordering is done after the hydrogens are stripped",
1345 "from the input and before new hydrogens are added. This means that",
1346 "you should not use [TT]-ignh[tt].[PAR]",
1348 "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1349 "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1350 "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1353 "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1354 "motions. Angular and out-of-plane motions can be removed by changing",
1355 "hydrogens into virtual sites and fixing angles, which fixes their",
1356 "position relative to neighboring atoms. Additionally, all atoms in the",
1357 "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1358 "can be converted into virtual sites, eliminating the fast improper dihedral",
1359 "fluctuations in these rings. [BB]Note[bb] that in this case all other hydrogen",
1360 "atoms are also converted to virtual sites. The mass of all atoms that are",
1361 "converted into virtual sites, is added to the heavy atoms.[PAR]",
1362 "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1363 "done by increasing the hydrogen-mass by a factor of 4. This is also",
1364 "done for water hydrogens to slow down the rotational motion of water.",
1365 "The increase in mass of the hydrogens is subtracted from the bonded",
1366 "(heavy) atom so that the total mass of the system remains the same."
1370 FILE *fp, *top_file, *top_file2, *itp_file = nullptr;
1372 t_atoms pdba_all, *pdba;
1376 int chain, nch, maxch, nwaterchain;
1378 t_chain *chains, *cc;
1379 char select[STRLEN];
1387 int i, j, k, l, nrtp;
1388 int *swap_index, si;
1392 gpp_atomtype_t atype;
1393 gmx_residuetype_t*rt;
1395 char fn[256], itp_fn[STRLEN], posre_fn[STRLEN], buf_fn[STRLEN];
1396 char molname[STRLEN], title[STRLEN];
1397 char *c, forcefield[STRLEN], ffdir[STRLEN];
1398 char ffname[STRLEN], suffix[STRLEN], buf[STRLEN];
1406 rtprename_t *rtprename = nullptr;
1407 int nah, nNtdb, nCtdb, ntdblist;
1408 t_hackblock *ntdb, *ctdb, **tdblist;
1412 gmx_bool bVsites = FALSE, bWat, bPrevWat = FALSE, bITP, bVsiteAromatics = FALSE;
1414 t_hackblock *hb_chain;
1415 t_restp *restp_chain;
1416 gmx_output_env_t *oenv;
1417 const char *p_restype;
1421 const char * prev_atomname;
1422 const char * this_atomname;
1423 const char * prev_resname;
1424 const char * this_resname;
1429 int prev_chainnumber;
1430 int this_chainnumber;
1432 int this_chainstart;
1433 int prev_chainstart;
1440 { efSTX, "-f", "eiwit.pdb", ffREAD },
1441 { efSTO, "-o", "conf", ffWRITE },
1442 { efTOP, nullptr, nullptr, ffWRITE },
1443 { efITP, "-i", "posre", ffWRITE },
1444 { efNDX, "-n", "clean", ffOPTWR },
1445 { efSTO, "-q", "clean.pdb", ffOPTWR }
1447 #define NFILE asize(fnm)
1449 gmx_bool bNewRTP = FALSE;
1450 gmx_bool bInter = FALSE, bCysMan = FALSE;
1451 gmx_bool bLysMan = FALSE, bAspMan = FALSE, bGluMan = FALSE, bHisMan = FALSE;
1452 gmx_bool bGlnMan = FALSE, bArgMan = FALSE;
1453 gmx_bool bTerMan = FALSE, bUnA = FALSE, bHeavyH = FALSE;
1454 gmx_bool bSort = TRUE, bAllowMissing = FALSE, bRemoveH = FALSE;
1455 gmx_bool bDeuterate = FALSE, bVerbose = FALSE, bChargeGroups = TRUE, bCmap = TRUE;
1456 gmx_bool bRenumRes = FALSE, bRTPresname = FALSE;
1457 real angle = 135.0, distance = 0.3, posre_fc = 1000;
1458 real long_bond_dist = 0.25, short_bond_dist = 0.05;
1459 const char *vsitestr[] = { nullptr, "none", "hydrogens", "aromatics", nullptr };
1460 const char *watstr[] = { nullptr, "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p", nullptr };
1461 const char *chainsep[] = { nullptr, "id_or_ter", "id_and_ter", "ter", "id", "interactive", nullptr };
1462 const char *merge[] = {nullptr, "no", "all", "interactive", nullptr };
1463 const char *ff = "select";
1466 { "-newrtp", FALSE, etBOOL, {&bNewRTP},
1467 "HIDDENWrite the residue database in new format to [TT]new.rtp[tt]"},
1468 { "-lb", FALSE, etREAL, {&long_bond_dist},
1469 "HIDDENLong bond warning distance" },
1470 { "-sb", FALSE, etREAL, {&short_bond_dist},
1471 "HIDDENShort bond warning distance" },
1472 { "-chainsep", FALSE, etENUM, {chainsep},
1473 "Condition in PDB files when a new chain should be started (adding termini)" },
1474 { "-merge", FALSE, etENUM, {&merge},
1475 "Merge multiple chains into a single [moleculetype]" },
1476 { "-ff", FALSE, etSTR, {&ff},
1477 "Force field, interactive by default. Use [TT]-h[tt] for information." },
1478 { "-water", FALSE, etENUM, {watstr},
1479 "Water model to use" },
1480 { "-inter", FALSE, etBOOL, {&bInter},
1481 "Set the next 8 options to interactive"},
1482 { "-ss", FALSE, etBOOL, {&bCysMan},
1483 "Interactive SS bridge selection" },
1484 { "-ter", FALSE, etBOOL, {&bTerMan},
1485 "Interactive termini selection, instead of charged (default)" },
1486 { "-lys", FALSE, etBOOL, {&bLysMan},
1487 "Interactive lysine selection, instead of charged" },
1488 { "-arg", FALSE, etBOOL, {&bArgMan},
1489 "Interactive arginine selection, instead of charged" },
1490 { "-asp", FALSE, etBOOL, {&bAspMan},
1491 "Interactive aspartic acid selection, instead of charged" },
1492 { "-glu", FALSE, etBOOL, {&bGluMan},
1493 "Interactive glutamic acid selection, instead of charged" },
1494 { "-gln", FALSE, etBOOL, {&bGlnMan},
1495 "Interactive glutamine selection, instead of neutral" },
1496 { "-his", FALSE, etBOOL, {&bHisMan},
1497 "Interactive histidine selection, instead of checking H-bonds" },
1498 { "-angle", FALSE, etREAL, {&angle},
1499 "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)" },
1500 { "-dist", FALSE, etREAL, {&distance},
1501 "Maximum donor-acceptor distance for a H-bond (nm)" },
1502 { "-una", FALSE, etBOOL, {&bUnA},
1503 "Select aromatic rings with united CH atoms on phenylalanine, "
1504 "tryptophane and tyrosine" },
1505 { "-sort", FALSE, etBOOL, {&bSort},
1506 "HIDDENSort the residues according to database, turning this off is dangerous as charge groups might be broken in parts" },
1507 { "-ignh", FALSE, etBOOL, {&bRemoveH},
1508 "Ignore hydrogen atoms that are in the coordinate file" },
1509 { "-missing", FALSE, etBOOL, {&bAllowMissing},
1510 "Continue when atoms are missing and bonds cannot be made, dangerous" },
1511 { "-v", FALSE, etBOOL, {&bVerbose},
1512 "Be slightly more verbose in messages" },
1513 { "-posrefc", FALSE, etREAL, {&posre_fc},
1514 "Force constant for position restraints" },
1515 { "-vsite", FALSE, etENUM, {vsitestr},
1516 "Convert atoms to virtual sites" },
1517 { "-heavyh", FALSE, etBOOL, {&bHeavyH},
1518 "Make hydrogen atoms heavy" },
1519 { "-deuterate", FALSE, etBOOL, {&bDeuterate},
1520 "Change the mass of hydrogens to 2 amu" },
1521 { "-chargegrp", TRUE, etBOOL, {&bChargeGroups},
1522 "Use charge groups in the [REF].rtp[ref] file" },
1523 { "-cmap", TRUE, etBOOL, {&bCmap},
1524 "Use cmap torsions (if enabled in the [REF].rtp[ref] file)" },
1525 { "-renum", TRUE, etBOOL, {&bRenumRes},
1526 "Renumber the residues consecutively in the output" },
1527 { "-rtpres", TRUE, etBOOL, {&bRTPresname},
1528 "Use [REF].rtp[ref] entry names as residue names" }
1530 #define NPARGS asize(pa)
1532 if (!parse_common_args(&argc, argv, 0, NFILE, fnm, asize(pa), pa, asize(desc), desc,
1538 /* Force field selection, interactive or direct */
1539 choose_ff(strcmp(ff, "select") == 0 ? nullptr : ff,
1540 forcefield, sizeof(forcefield),
1541 ffdir, sizeof(ffdir));
1543 if (strlen(forcefield) > 0)
1545 strcpy(ffname, forcefield);
1546 ffname[0] = toupper(ffname[0]);
1550 gmx_fatal(FARGS, "Empty forcefield string");
1553 printf("\nUsing the %s force field in directory %s\n\n",
1556 choose_watermodel(watstr[0], ffdir, &watermodel);
1560 /* if anything changes here, also change description of -inter */
1575 else if (bDeuterate)
1584 /* parse_common_args ensures vsitestr has been selected, but
1585 clang-static-analyzer needs clues to know that */
1586 GMX_ASSERT(vsitestr[0], "-vsite default wasn't processed correctly");
1587 switch (vsitestr[0][0])
1589 case 'n': /* none */
1591 bVsiteAromatics = FALSE;
1593 case 'h': /* hydrogens */
1595 bVsiteAromatics = FALSE;
1597 case 'a': /* aromatics */
1599 bVsiteAromatics = TRUE;
1602 gmx_fatal(FARGS, "DEATH HORROR in $s (%s): vsitestr[0]='%d'",
1603 __FILE__, __LINE__, vsitestr[0]);
1606 /* Open the symbol table */
1607 open_symtab(&symtab);
1609 /* Residue type database */
1610 gmx_residuetype_init(&rt);
1612 /* Read residue renaming database(s), if present */
1613 nrrn = fflib_search_file_end(ffdir, ".r2b", FALSE, &rrn);
1616 rtprename = nullptr;
1617 for (i = 0; i < nrrn; i++)
1619 fp = fflib_open(rrn[i]);
1620 read_rtprename(rrn[i], fp, &nrtprename, &rtprename);
1626 /* Add all alternative names from the residue renaming database to the list of recognized amino/nucleic acids. */
1627 for (i = 0; i < nrtprename; i++)
1629 rc = gmx_residuetype_get_type(rt, rtprename[i].gmx, &p_restype);
1631 /* Only add names if the 'standard' gromacs/iupac base name was found */
1634 gmx_residuetype_add(rt, rtprename[i].main, p_restype);
1635 gmx_residuetype_add(rt, rtprename[i].nter, p_restype);
1636 gmx_residuetype_add(rt, rtprename[i].cter, p_restype);
1637 gmx_residuetype_add(rt, rtprename[i].bter, p_restype);
1642 if (watermodel != nullptr && (strstr(watermodel, "4p") ||
1643 strstr(watermodel, "4P")))
1647 else if (watermodel != nullptr && (strstr(watermodel, "5p") ||
1648 strstr(watermodel, "5P")))
1657 aps = gmx_atomprop_init();
1658 natom = read_pdball(opt2fn("-f", NFILE, fnm), opt2fn_null("-q", NFILE, fnm), title,
1659 &pdba_all, &pdbx, &ePBC, box, bRemoveH, &symtab, rt, watres,
1664 gmx_fatal(FARGS, "No atoms found in pdb file %s\n", opt2fn("-f", NFILE, fnm));
1667 printf("Analyzing pdb file\n");
1670 modify_chain_numbers(&pdba_all, chainsep[0]);
1674 this_atomname = nullptr;
1676 this_resname = nullptr;
1679 this_chainnumber = -1;
1680 this_chainstart = 0;
1681 /* Keep the compiler happy */
1682 prev_chainstart = 0;
1686 snew(pdb_ch, maxch);
1689 for (i = 0; (i < natom); i++)
1691 ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1693 /* TODO this should live in a helper object, and consolidate
1694 that with code in modify_chain_numbers */
1695 prev_atomname = this_atomname;
1696 prev_atomnum = this_atomnum;
1697 prev_resname = this_resname;
1698 prev_resnum = this_resnum;
1699 prev_chainid = this_chainid;
1700 prev_chainnumber = this_chainnumber;
1703 prev_chainstart = this_chainstart;
1706 this_atomname = *pdba_all.atomname[i];
1707 this_atomnum = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1708 this_resname = *ri->name;
1709 this_resnum = ri->nr;
1710 this_chainid = ri->chainid;
1711 this_chainnumber = ri->chainnum;
1713 bWat = gmx_strcasecmp(*ri->name, watres) == 0;
1715 if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat != bPrevWat))
1717 this_chainstart = pdba_all.atom[i].resind;
1721 if (!strncmp(merge[0], "int", 3))
1723 printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1724 "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1725 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1726 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1728 if (nullptr == fgets(select, STRLEN-1, stdin))
1730 gmx_fatal(FARGS, "Error reading from stdin");
1732 bMerged = (select[0] == 'y');
1734 else if (!strncmp(merge[0], "all", 3))
1742 pdb_ch[nch-1].chainstart[pdb_ch[nch-1].nterpairs] =
1743 pdba_all.atom[i].resind - prev_chainstart;
1744 pdb_ch[nch-1].nterpairs++;
1745 srenew(pdb_ch[nch-1].chainstart, pdb_ch[nch-1].nterpairs+1);
1750 /* set natom for previous chain */
1753 pdb_ch[nch-1].natom = i-pdb_ch[nch-1].start;
1760 /* check if chain identifier was used before */
1761 for (j = 0; (j < nch); j++)
1763 if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1765 printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1766 "They will be treated as separate chains unless you reorder your file.\n",
1770 // TODO This is too convoluted. Use a std::vector
1774 srenew(pdb_ch, maxch);
1776 pdb_ch[nch].chainid = ri->chainid;
1777 pdb_ch[nch].chainnum = ri->chainnum;
1778 pdb_ch[nch].start = i;
1779 pdb_ch[nch].bAllWat = bWat;
1782 pdb_ch[nch].nterpairs = 0;
1786 pdb_ch[nch].nterpairs = 1;
1788 snew(pdb_ch[nch].chainstart, pdb_ch[nch].nterpairs+1);
1789 /* modified [nch] to [0] below */
1790 pdb_ch[nch].chainstart[0] = 0;
1796 pdb_ch[nch-1].natom = natom-pdb_ch[nch-1].start;
1798 /* set all the water blocks at the end of the chain */
1799 snew(swap_index, nch);
1801 for (i = 0; i < nch; i++)
1803 if (!pdb_ch[i].bAllWat)
1809 for (i = 0; i < nch; i++)
1811 if (pdb_ch[i].bAllWat)
1817 if (nwaterchain > 1)
1819 printf("Moved all the water blocks to the end\n");
1823 /* copy pdb data and x for all chains */
1824 for (i = 0; (i < nch); i++)
1827 chains[i].chainid = pdb_ch[si].chainid;
1828 chains[i].chainnum = pdb_ch[si].chainnum;
1829 chains[i].bAllWat = pdb_ch[si].bAllWat;
1830 chains[i].nterpairs = pdb_ch[si].nterpairs;
1831 chains[i].chainstart = pdb_ch[si].chainstart;
1832 snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1833 snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1834 snew(chains[i].r_start, pdb_ch[si].nterpairs);
1835 snew(chains[i].r_end, pdb_ch[si].nterpairs);
1837 snew(chains[i].pdba, 1);
1838 init_t_atoms(chains[i].pdba, pdb_ch[si].natom, TRUE);
1839 snew(chains[i].x, chains[i].pdba->nr);
1840 for (j = 0; j < chains[i].pdba->nr; j++)
1842 chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1843 snew(chains[i].pdba->atomname[j], 1);
1844 *chains[i].pdba->atomname[j] =
1845 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1846 chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1847 copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1849 /* Re-index the residues assuming that the indices are continuous */
1850 k = chains[i].pdba->atom[0].resind;
1851 nres = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1852 chains[i].pdba->nres = nres;
1853 for (j = 0; j < chains[i].pdba->nr; j++)
1855 chains[i].pdba->atom[j].resind -= k;
1857 srenew(chains[i].pdba->resinfo, nres);
1858 for (j = 0; j < nres; j++)
1860 chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1861 snew(chains[i].pdba->resinfo[j].name, 1);
1862 *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1863 /* make all chain identifiers equal to that of the chain */
1864 chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1868 if (nchainmerges > 0)
1870 printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1874 printf("There are %d chains and %d blocks of water and "
1875 "%d residues with %d atoms\n",
1876 nch-nwaterchain, nwaterchain,
1877 pdba_all.nres, natom);
1879 printf("\n %5s %4s %6s\n", "chain", "#res", "#atoms");
1880 for (i = 0; (i < nch); i++)
1882 printf(" %d '%c' %5d %6d %s\n",
1883 i+1, chains[i].chainid ? chains[i].chainid : '-',
1884 chains[i].pdba->nres, chains[i].pdba->nr,
1885 chains[i].bAllWat ? "(only water)" : "");
1889 check_occupancy(&pdba_all, opt2fn("-f", NFILE, fnm), bVerbose);
1891 /* Read atomtypes... */
1892 atype = read_atype(ffdir, &symtab);
1894 /* read residue database */
1895 printf("Reading residue database... (%s)\n", forcefield);
1896 nrtpf = fflib_search_file_end(ffdir, ".rtp", TRUE, &rtpf);
1899 for (i = 0; i < nrtpf; i++)
1901 read_resall(rtpf[i], &nrtp, &restp, atype, &symtab, FALSE);
1907 /* Not correct with multiple rtp input files with different bonded types */
1908 fp = gmx_fio_fopen("new.rtp", "w");
1909 print_resall(fp, nrtp, restp, atype);
1913 /* read hydrogen database */
1914 nah = read_h_db(ffdir, &ah);
1916 /* Read Termini database... */
1917 nNtdb = read_ter_db(ffdir, 'n', &ntdb, atype);
1918 nCtdb = read_ter_db(ffdir, 'c', &ctdb, atype);
1920 top_fn = ftp2fn(efTOP, NFILE, fnm);
1921 top_file = gmx_fio_fopen(top_fn, "w");
1923 print_top_header(top_file, top_fn, FALSE, ffdir, mHmult);
1929 for (chain = 0; (chain < nch); chain++)
1931 cc = &(chains[chain]);
1933 /* set pdba, natom and nres to the current chain */
1936 natom = cc->pdba->nr;
1937 nres = cc->pdba->nres;
1939 if (cc->chainid && ( cc->chainid != ' ' ) )
1941 printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
1942 chain+1, cc->chainid, natom, nres);
1946 printf("Processing chain %d (%d atoms, %d residues)\n",
1947 chain+1, natom, nres);
1950 process_chain(pdba, x, bUnA, bUnA, bUnA, bLysMan, bAspMan, bGluMan,
1951 bHisMan, bArgMan, bGlnMan, angle, distance, &symtab,
1952 nrtprename, rtprename);
1954 cc->chainstart[cc->nterpairs] = pdba->nres;
1956 for (i = 0; i < cc->nterpairs; i++)
1958 find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
1959 &(cc->r_start[j]), &(cc->r_end[j]), rt);
1961 if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
1967 if (cc->nterpairs == 0)
1969 printf("Problem with chain definition, or missing terminal residues.\n"
1970 "This chain does not appear to contain a recognized chain molecule.\n"
1971 "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
1974 /* Check for disulfides and other special bonds */
1975 nssbonds = mk_specbonds(pdba, x, bCysMan, &ssbonds, bVerbose);
1979 rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
1987 sprintf(fn, "chain.pdb");
1991 sprintf(fn, "chain_%c%d.pdb", cc->chainid, cc->chainnum);
1993 write_sto_conf(fn, title, pdba, x, nullptr, ePBC, box);
1997 for (i = 0; i < cc->nterpairs; i++)
2001 * We first apply a filter so we only have the
2002 * termini that can be applied to the residue in question
2003 * (or a generic terminus if no-residue specific is available).
2005 /* First the N terminus */
2008 tdblist = filter_ter(nNtdb, ntdb,
2009 *pdba->resinfo[cc->r_start[i]].name,
2013 printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
2014 "is already in a terminus-specific form and skipping terminus selection.\n");
2015 cc->ntdb[i] = nullptr;
2019 if (bTerMan && ntdblist > 1)
2021 sprintf(select, "Select start terminus type for %s-%d",
2022 *pdba->resinfo[cc->r_start[i]].name,
2023 pdba->resinfo[cc->r_start[i]].nr);
2024 cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
2028 cc->ntdb[i] = tdblist[0];
2031 printf("Start terminus %s-%d: %s\n",
2032 *pdba->resinfo[cc->r_start[i]].name,
2033 pdba->resinfo[cc->r_start[i]].nr,
2034 (cc->ntdb[i])->name);
2040 cc->ntdb[i] = nullptr;
2043 /* And the C terminus */
2046 tdblist = filter_ter(nCtdb, ctdb,
2047 *pdba->resinfo[cc->r_end[i]].name,
2051 printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
2052 "is already in a terminus-specific form and skipping terminus selection.\n");
2053 cc->ctdb[i] = nullptr;
2057 if (bTerMan && ntdblist > 1)
2059 sprintf(select, "Select end terminus type for %s-%d",
2060 *pdba->resinfo[cc->r_end[i]].name,
2061 pdba->resinfo[cc->r_end[i]].nr);
2062 cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
2066 cc->ctdb[i] = tdblist[0];
2068 printf("End terminus %s-%d: %s\n",
2069 *pdba->resinfo[cc->r_end[i]].name,
2070 pdba->resinfo[cc->r_end[i]].nr,
2071 (cc->ctdb[i])->name);
2077 cc->ctdb[i] = nullptr;
2080 /* lookup hackblocks and rtp for all residues */
2081 get_hackblocks_rtp(&hb_chain, &restp_chain,
2082 nrtp, restp, pdba->nres, pdba->resinfo,
2083 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end,
2085 /* ideally, now we would not need the rtp itself anymore, but do
2086 everything using the hb and restp arrays. Unfortunately, that
2087 requires some re-thinking of code in gen_vsite.c, which I won't
2088 do now :( AF 26-7-99 */
2090 rename_atoms(nullptr, ffdir,
2091 pdba, &symtab, restp_chain, FALSE, rt, FALSE, bVerbose);
2093 match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose);
2097 block = new_blocka();
2099 sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
2100 remove_duplicate_atoms(pdba, x, bVerbose);
2101 if (ftp2bSet(efNDX, NFILE, fnm))
2105 fprintf(stderr, "WARNING: with the -remh option the generated "
2106 "index file (%s) might be useless\n"
2107 "(the index file is generated before hydrogens are added)",
2108 ftp2fn(efNDX, NFILE, fnm));
2110 write_index(ftp2fn(efNDX, NFILE, fnm), block, gnames, FALSE, 0);
2112 for (i = 0; i < block->nr; i++)
2121 fprintf(stderr, "WARNING: "
2122 "without sorting no check for duplicate atoms can be done\n");
2125 /* Generate Hydrogen atoms (and termini) in the sequence */
2126 printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2127 natom = add_h(&pdba, &x, nah, ah,
2128 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing,
2129 nullptr, nullptr, TRUE, FALSE);
2130 printf("Now there are %d residues with %d atoms\n",
2131 pdba->nres, pdba->nr);
2134 write_pdbfile(debug, title, pdba, x, ePBC, box, ' ', 0, nullptr, TRUE);
2139 for (i = 0; (i < natom); i++)
2141 fprintf(debug, "Res %s%d atom %d %s\n",
2142 *(pdba->resinfo[pdba->atom[i].resind].name),
2143 pdba->resinfo[pdba->atom[i].resind].nr, i+1, *pdba->atomname[i]);
2147 strcpy(posre_fn, ftp2fn(efITP, NFILE, fnm));
2149 /* make up molecule name(s) */
2151 k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2153 gmx_residuetype_get_type(rt, *pdba->resinfo[k].name, &p_restype);
2159 sprintf(molname, "Water");
2163 this_chainid = cc->chainid;
2165 /* Add the chain id if we have one */
2166 if (this_chainid != ' ')
2168 sprintf(buf, "_chain_%c", this_chainid);
2169 strcat(suffix, buf);
2172 /* Check if there have been previous chains with the same id */
2174 for (k = 0; k < chain; k++)
2176 if (cc->chainid == chains[k].chainid)
2181 /* Add the number for this chain identifier if there are multiple copies */
2185 sprintf(buf, "%d", nid_used+1);
2186 strcat(suffix, buf);
2189 if (strlen(suffix) > 0)
2191 sprintf(molname, "%s%s", p_restype, suffix);
2195 strcpy(molname, p_restype);
2199 if ((nch-nwaterchain > 1) && !cc->bAllWat)
2202 strcpy(itp_fn, top_fn);
2203 printf("Chain time...\n");
2204 c = strrchr(itp_fn, '.');
2205 sprintf(c, "_%s.itp", molname);
2206 c = strrchr(posre_fn, '.');
2207 sprintf(c, "_%s.itp", molname);
2208 if (strcmp(itp_fn, posre_fn) == 0)
2210 strcpy(buf_fn, posre_fn);
2211 c = strrchr(buf_fn, '.');
2213 sprintf(posre_fn, "%s_pr.itp", buf_fn);
2217 srenew(incls, nincl);
2218 incls[nincl-1] = gmx_strdup(itp_fn);
2219 itp_file = gmx_fio_fopen(itp_fn, "w");
2226 srenew(mols, nmol+1);
2229 mols[nmol].name = gmx_strdup("SOL");
2230 mols[nmol].nr = pdba->nres;
2234 mols[nmol].name = gmx_strdup(molname);
2241 print_top_comment(itp_file, itp_fn, ffdir, TRUE);
2246 top_file2 = nullptr;
2250 top_file2 = itp_file;
2254 top_file2 = top_file;
2257 pdb2top(top_file2, posre_fn, molname, pdba, &x, atype, &symtab,
2259 restp_chain, hb_chain,
2261 bVsites, bVsiteAromatics, ffdir,
2262 mHmult, nssbonds, ssbonds,
2263 long_bond_dist, short_bond_dist, bDeuterate, bChargeGroups, bCmap,
2264 bRenumRes, bRTPresname);
2268 write_posres(posre_fn, pdba, posre_fc);
2273 gmx_fio_fclose(itp_file);
2276 /* pdba and natom have been reassigned somewhere so: */
2282 if (cc->chainid == ' ')
2284 sprintf(fn, "chain.pdb");
2288 sprintf(fn, "chain_%c.pdb", cc->chainid);
2290 write_sto_conf(fn, "", pdba, x, nullptr, ePBC, box);
2294 if (watermodel == nullptr)
2296 for (chain = 0; chain < nch; chain++)
2298 if (chains[chain].bAllWat)
2300 gmx_fatal(FARGS, "You have chosen not to include a water model, but there is water in the input file. Select a water model or remove the water from your input file.");
2306 sprintf(buf_fn, "%s%c%s.itp", ffdir, DIR_SEPARATOR, watermodel);
2307 if (!fflib_fexist(buf_fn))
2309 gmx_fatal(FARGS, "The topology file '%s' for the selected water model '%s' can not be found in the force field directory. Select a different water model.",
2310 buf_fn, watermodel);
2314 print_top_mols(top_file, title, ffdir, watermodel, nincl, incls, nmol, mols);
2315 gmx_fio_fclose(top_file);
2317 gmx_residuetype_destroy(rt);
2319 /* now merge all chains back together */
2322 for (i = 0; (i < nch); i++)
2324 natom += chains[i].pdba->nr;
2325 nres += chains[i].pdba->nres;
2328 init_t_atoms(atoms, natom, FALSE);
2329 for (i = 0; i < atoms->nres; i++)
2331 sfree(atoms->resinfo[i].name);
2333 sfree(atoms->resinfo);
2335 snew(atoms->resinfo, nres);
2339 for (i = 0; (i < nch); i++)
2343 printf("Including chain %d in system: %d atoms %d residues\n",
2344 i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2346 for (j = 0; (j < chains[i].pdba->nr); j++)
2348 atoms->atom[k] = chains[i].pdba->atom[j];
2349 atoms->atom[k].resind += l; /* l is processed nr of residues */
2350 atoms->atomname[k] = chains[i].pdba->atomname[j];
2351 atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2352 copy_rvec(chains[i].x[j], x[k]);
2355 for (j = 0; (j < chains[i].pdba->nres); j++)
2357 atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2360 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2368 fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2369 print_sums(atoms, TRUE);
2372 fprintf(stderr, "\nWriting coordinate file...\n");
2373 clear_rvec(box_space);
2376 make_new_box(atoms->nr, x, box, box_space, FALSE);
2378 write_sto_conf(ftp2fn(efSTO, NFILE, fnm), title, atoms, x, nullptr, ePBC, box);
2380 printf("\t\t--------- PLEASE NOTE ------------\n");
2381 printf("You have successfully generated a topology from: %s.\n",
2382 opt2fn("-f", NFILE, fnm));
2383 if (watermodel != nullptr)
2385 printf("The %s force field and the %s water model are used.\n",
2386 ffname, watermodel);
2390 printf("The %s force field is used.\n",
2393 printf("\t\t--------- ETON ESAELP ------------\n");