2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team.
6 * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
7 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
8 * and including many others, as listed in the AUTHORS file in the
9 * top-level source directory and at http://www.gromacs.org.
11 * GROMACS is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public License
13 * as published by the Free Software Foundation; either version 2.1
14 * of the License, or (at your option) any later version.
16 * GROMACS is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with GROMACS; if not, see
23 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
24 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
26 * If you want to redistribute modifications to GROMACS, please
27 * consider that scientific software is very special. Version
28 * control is crucial - bugs must be traceable. We will be happy to
29 * consider code for inclusion in the official distribution, but
30 * derived work must not be called official GROMACS. Details are found
31 * in the README & COPYING files - if they are missing, get the
32 * official version at http://www.gromacs.org.
34 * To help us fund GROMACS development, we humbly ask that you cite
35 * the research papers on the package. Check out http://www.gromacs.org.
46 #include "gromacs/commandline/pargs.h"
47 #include "gromacs/fileio/confio.h"
48 #include "gromacs/fileio/gmxfio.h"
49 #include "gromacs/fileio/pdbio.h"
50 #include "gromacs/gmxlib/conformation-utilities.h"
51 #include "gromacs/gmxpreprocess/fflibutil.h"
52 #include "gromacs/gmxpreprocess/genhydro.h"
53 #include "gromacs/gmxpreprocess/h_db.h"
54 #include "gromacs/gmxpreprocess/hizzie.h"
55 #include "gromacs/gmxpreprocess/pdb2top.h"
56 #include "gromacs/gmxpreprocess/pgutil.h"
57 #include "gromacs/gmxpreprocess/resall.h"
58 #include "gromacs/gmxpreprocess/specbond.h"
59 #include "gromacs/gmxpreprocess/ter_db.h"
60 #include "gromacs/gmxpreprocess/toputil.h"
61 #include "gromacs/gmxpreprocess/xlate.h"
62 #include "gromacs/math/vec.h"
63 #include "gromacs/topology/atomprop.h"
64 #include "gromacs/topology/block.h"
65 #include "gromacs/topology/index.h"
66 #include "gromacs/topology/residuetypes.h"
67 #include "gromacs/topology/symtab.h"
68 #include "gromacs/topology/topology.h"
69 #include "gromacs/utility/arraysize.h"
70 #include "gromacs/utility/cstringutil.h"
71 #include "gromacs/utility/dir_separator.h"
72 #include "gromacs/utility/fatalerror.h"
73 #include "gromacs/utility/gmxassert.h"
74 #include "gromacs/utility/smalloc.h"
75 #include "gromacs/utility/strdb.h"
76 #include "gromacs/utility/stringutil.h"
80 char gmx[RTP_MAXCHAR+2];
81 char main[RTP_MAXCHAR+2];
82 char nter[RTP_MAXCHAR+2];
83 char cter[RTP_MAXCHAR+2];
84 char bter[RTP_MAXCHAR+2];
88 static const char *res2bb_notermini(const char *name,
89 int nrr, const rtprename_t *rr)
91 /* NOTE: This function returns the main building block name,
92 * it does not take terminal renaming into account.
97 while (i < nrr && gmx_strcasecmp(name, rr[i].gmx) != 0)
102 return (i < nrr ? rr[i].main : name);
105 static const char *select_res(int nr, int resnr,
106 const char *name[], const char *expl[],
108 int nrr, const rtprename_t *rr)
112 printf("Which %s type do you want for residue %d\n", title, resnr+1);
113 for (sel = 0; (sel < nr); sel++)
115 printf("%d. %s (%s)\n",
116 sel, expl[sel], res2bb_notermini(name[sel], nrr, rr));
118 printf("\nType a number:"); fflush(stdout);
120 if (scanf("%d", &sel) != 1)
122 gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr+1);
128 static const char *get_asptp(int resnr, int nrr, const rtprename_t *rr)
133 const char *lh[easpNR] = { "ASP", "ASPH" };
134 const char *expl[easpNR] = {
135 "Not protonated (charge -1)",
136 "Protonated (charge 0)"
139 return select_res(easpNR, resnr, lh, expl, "ASPARTIC ACID", nrr, rr);
142 static const char *get_glutp(int resnr, int nrr, const rtprename_t *rr)
147 const char *lh[egluNR] = { "GLU", "GLUH" };
148 const char *expl[egluNR] = {
149 "Not protonated (charge -1)",
150 "Protonated (charge 0)"
153 return select_res(egluNR, resnr, lh, expl, "GLUTAMIC ACID", nrr, rr);
156 static const char *get_glntp(int resnr, int nrr, const rtprename_t *rr)
161 const char *lh[eglnNR] = { "GLN", "QLN" };
162 const char *expl[eglnNR] = {
163 "Not protonated (charge 0)",
164 "Protonated (charge +1)"
167 return select_res(eglnNR, resnr, lh, expl, "GLUTAMINE", nrr, rr);
170 static const char *get_lystp(int resnr, int nrr, const rtprename_t *rr)
175 const char *lh[elysNR] = { "LYSN", "LYS" };
176 const char *expl[elysNR] = {
177 "Not protonated (charge 0)",
178 "Protonated (charge +1)"
181 return select_res(elysNR, resnr, lh, expl, "LYSINE", nrr, rr);
184 static const char *get_argtp(int resnr, int nrr, const rtprename_t *rr)
189 const char *lh[eargNR] = { "ARGN", "ARG" };
190 const char *expl[eargNR] = {
191 "Not protonated (charge 0)",
192 "Protonated (charge +1)"
195 return select_res(eargNR, resnr, lh, expl, "ARGININE", nrr, rr);
198 static const char *get_histp(int resnr, int nrr, const rtprename_t *rr)
200 const char *expl[ehisNR] = {
207 return select_res(ehisNR, resnr, hh, expl, "HISTIDINE", nrr, rr);
210 static void read_rtprename(const char *fname, FILE *fp,
211 int *nrtprename, rtprename_t **rtprename)
213 char line[STRLEN], buf[STRLEN];
222 while (get_a_line(fp, line, STRLEN))
225 /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
226 * For other args, we read up to 6 chars (so we can detect if the length is > 5).
227 * Note that the buffer length has been increased to 7 to allow this,
228 * so we just need to make sure the strings have zero-length initially.
231 rr[n].main[0] = '\0';
232 rr[n].nter[0] = '\0';
233 rr[n].cter[0] = '\0';
234 rr[n].bter[0] = '\0';
235 nc = sscanf(line, "%6s %6s %6s %6s %6s %s",
236 rr[n].gmx, rr[n].main, rr[n].nter, rr[n].cter, rr[n].bter, buf);
237 if (strlen(rr[n].gmx) > RTP_MAXCHAR || strlen(rr[n].main) > RTP_MAXCHAR ||
238 strlen(rr[n].nter) > RTP_MAXCHAR || strlen(rr[n].cter) > RTP_MAXCHAR || strlen(rr[n].bter) > RTP_MAXCHAR)
240 gmx_fatal(FARGS, "Residue renaming database '%s' has strings longer than %d chars in first 5 columns:\n%s",
241 fname, RTP_MAXCHAR, line);
246 if (nc != 2 && nc != 5)
248 gmx_fatal(FARGS, "Residue renaming database '%s' has %d columns instead of %d, %d or %d", fname, ncol, 2, 5);
254 gmx_fatal(FARGS, "A line in residue renaming database '%s' has %d columns, while previous lines have %d columns", fname, nc, ncol);
259 /* This file does not have special termini names, copy them from main */
260 strcpy(rr[n].nter, rr[n].main);
261 strcpy(rr[n].cter, rr[n].main);
262 strcpy(rr[n].bter, rr[n].main);
272 static char *search_resrename(int nrr, rtprename_t *rr,
274 gmx_bool bStart, gmx_bool bEnd,
275 gmx_bool bCompareFFRTPname)
283 while (i < nrr && ((!bCompareFFRTPname && strcmp(name, rr[i].gmx) != 0) ||
284 ( bCompareFFRTPname && strcmp(name, rr[i].main) != 0)))
289 /* If found in the database, rename this residue's rtp building block,
290 * otherwise keep the old name.
313 gmx_fatal(FARGS, "In the chosen force field there is no residue type for '%s'%s", name, bStart ? ( bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus") : (bEnd ? " as an ending terminus" : ""));
321 static void rename_resrtp(t_atoms *pdba, int nterpairs, const int *r_start, const int *r_end,
322 int nrr, rtprename_t *rr, t_symtab *symtab,
326 gmx_bool bStart, bEnd;
328 gmx_bool bFFRTPTERRNM;
330 bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
332 for (r = 0; r < pdba->nres; r++)
336 for (j = 0; j < nterpairs; j++)
343 for (j = 0; j < nterpairs; j++)
351 nn = search_resrename(nrr, rr, *pdba->resinfo[r].rtp, bStart, bEnd, FALSE);
353 if (bFFRTPTERRNM && nn == nullptr && (bStart || bEnd))
355 /* This is a terminal residue, but the residue name,
356 * currently stored in .rtp, is not a standard residue name,
357 * but probably a force field specific rtp name.
358 * Check if we need to rename it because it is terminal.
360 nn = search_resrename(nrr, rr,
361 *pdba->resinfo[r].rtp, bStart, bEnd, TRUE);
364 if (nn != nullptr && strcmp(*pdba->resinfo[r].rtp, nn) != 0)
368 printf("Changing rtp entry of residue %d %s to '%s'\n",
369 pdba->resinfo[r].nr, *pdba->resinfo[r].name, nn);
371 pdba->resinfo[r].rtp = put_symtab(symtab, nn);
376 static void pdbres_to_gmxrtp(t_atoms *pdba)
380 for (i = 0; (i < pdba->nres); i++)
382 if (pdba->resinfo[i].rtp == nullptr)
384 pdba->resinfo[i].rtp = pdba->resinfo[i].name;
389 static void rename_pdbres(t_atoms *pdba, const char *oldnm, const char *newnm,
390 gmx_bool bFullCompare, t_symtab *symtab)
395 for (i = 0; (i < pdba->nres); i++)
397 resnm = *pdba->resinfo[i].name;
398 if ((bFullCompare && (gmx_strcasecmp(resnm, oldnm) == 0)) ||
399 (!bFullCompare && strstr(resnm, oldnm) != nullptr))
401 /* Rename the residue name (not the rtp name) */
402 pdba->resinfo[i].name = put_symtab(symtab, newnm);
407 static void rename_bb(t_atoms *pdba, const char *oldnm, const char *newnm,
408 gmx_bool bFullCompare, t_symtab *symtab)
413 for (i = 0; (i < pdba->nres); i++)
415 /* We have not set the rtp name yes, use the residue name */
416 bbnm = *pdba->resinfo[i].name;
417 if ((bFullCompare && (gmx_strcasecmp(bbnm, oldnm) == 0)) ||
418 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
420 /* Change the rtp builing block name */
421 pdba->resinfo[i].rtp = put_symtab(symtab, newnm);
426 static void rename_bbint(t_atoms *pdba, const char *oldnm,
427 const char *gettp(int, int, const rtprename_t *),
428 gmx_bool bFullCompare,
430 int nrr, const rtprename_t *rr)
436 for (i = 0; i < pdba->nres; i++)
438 /* We have not set the rtp name yes, use the residue name */
439 bbnm = *pdba->resinfo[i].name;
440 if ((bFullCompare && (strcmp(bbnm, oldnm) == 0)) ||
441 (!bFullCompare && strstr(bbnm, oldnm) != nullptr))
443 ptr = gettp(i, nrr, rr);
444 pdba->resinfo[i].rtp = put_symtab(symtab, ptr);
449 static void check_occupancy(t_atoms *atoms, const char *filename, gmx_bool bVerbose)
455 ftp = fn2ftp(filename);
456 if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
458 fprintf(stderr, "No occupancies in %s\n", filename);
462 for (i = 0; (i < atoms->nr); i++)
464 if (atoms->pdbinfo[i].occup != 1)
468 fprintf(stderr, "Occupancy for atom %s%d-%s is %f rather than 1\n",
469 *atoms->resinfo[atoms->atom[i].resind].name,
470 atoms->resinfo[ atoms->atom[i].resind].nr,
472 atoms->pdbinfo[i].occup);
474 if (atoms->pdbinfo[i].occup == 0)
484 if (nzero == atoms->nr)
486 fprintf(stderr, "All occupancy fields zero. This is probably not an X-Ray structure\n");
488 else if ((nzero > 0) || (nnotone > 0))
492 "WARNING: there were %d atoms with zero occupancy and %d atoms with\n"
493 " occupancy unequal to one (out of %d atoms). Check your pdb file.\n"
495 nzero, nnotone, atoms->nr);
499 fprintf(stderr, "All occupancies are one\n");
504 static void write_posres(char *fn, t_atoms *pdba, real fc)
509 fp = gmx_fio_fopen(fn, "w");
511 "; In this topology include file, you will find position restraint\n"
512 "; entries for all the heavy atoms in your original pdb file.\n"
513 "; This means that all the protons which were added by pdb2gmx are\n"
514 "; not restrained.\n"
516 "[ position_restraints ]\n"
517 "; %4s%6s%8s%8s%8s\n", "atom", "type", "fx", "fy", "fz"
519 for (i = 0; (i < pdba->nr); i++)
521 if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
523 fprintf(fp, "%6d%6d %g %g %g\n", i+1, 1, fc, fc, fc);
529 static int read_pdball(const char *inf, const char *outf, char *title,
530 t_atoms *atoms, rvec **x,
531 int *ePBC, matrix box, gmx_bool bRemoveH,
532 t_symtab *symtab, gmx_residuetype_t *rt, const char *watres,
533 gmx_atomprop_t aps, gmx_bool bVerbose)
534 /* Read a pdb file. (containing proteins) */
536 int natom, new_natom, i;
539 printf("Reading %s...\n", inf);
542 read_tps_conf(inf, top, ePBC, x, nullptr, box, FALSE);
543 strncpy(title, *top->name, STRLEN);
544 title[STRLEN-1] = '\0';
548 if (atoms->pdbinfo == nullptr)
550 snew(atoms->pdbinfo, atoms->nr);
552 if (fn2ftp(inf) == efPDB)
554 get_pdb_atomnumber(atoms, aps);
559 for (i = 0; i < atoms->nr; i++)
561 if (!is_hydrogen(*atoms->atomname[i]))
563 atoms->atom[new_natom] = atoms->atom[i];
564 atoms->atomname[new_natom] = atoms->atomname[i];
565 atoms->pdbinfo[new_natom] = atoms->pdbinfo[i];
566 copy_rvec((*x)[i], (*x)[new_natom]);
570 atoms->nr = new_natom;
577 printf(" '%s',", title);
579 printf(" %d atoms\n", natom);
581 /* Rename residues */
582 rename_pdbres(atoms, "HOH", watres, FALSE, symtab);
583 rename_pdbres(atoms, "SOL", watres, FALSE, symtab);
584 rename_pdbres(atoms, "WAT", watres, FALSE, symtab);
586 rename_atoms("xlateat.dat", nullptr,
587 atoms, symtab, nullptr, TRUE, rt, TRUE, bVerbose);
596 write_sto_conf(outf, title, atoms, *x, nullptr, *ePBC, box);
602 static void process_chain(t_atoms *pdba, rvec *x,
603 gmx_bool bTrpU, gmx_bool bPheU, gmx_bool bTyrU,
604 gmx_bool bLysMan, gmx_bool bAspMan, gmx_bool bGluMan,
605 gmx_bool bHisMan, gmx_bool bArgMan, gmx_bool bGlnMan,
606 real angle, real distance, t_symtab *symtab,
607 int nrr, const rtprename_t *rr)
609 /* Rename aromatics, lys, asp and histidine */
612 rename_bb(pdba, "TYR", "TYRU", FALSE, symtab);
616 rename_bb(pdba, "TRP", "TRPU", FALSE, symtab);
620 rename_bb(pdba, "PHE", "PHEU", FALSE, symtab);
624 rename_bbint(pdba, "LYS", get_lystp, FALSE, symtab, nrr, rr);
628 rename_bbint(pdba, "ARG", get_argtp, FALSE, symtab, nrr, rr);
632 rename_bbint(pdba, "GLN", get_glntp, FALSE, symtab, nrr, rr);
636 rename_bbint(pdba, "ASP", get_asptp, FALSE, symtab, nrr, rr);
640 rename_bb(pdba, "ASPH", "ASP", FALSE, symtab);
644 rename_bbint(pdba, "GLU", get_glutp, FALSE, symtab, nrr, rr);
648 rename_bb(pdba, "GLUH", "GLU", FALSE, symtab);
653 set_histp(pdba, x, angle, distance);
657 rename_bbint(pdba, "HIS", get_histp, TRUE, symtab, nrr, rr);
660 /* Initialize the rtp builing block names with the residue names
661 * for the residues that have not been processed above.
663 pdbres_to_gmxrtp(pdba);
665 /* Now we have all rtp names set.
666 * The rtp names will conform to Gromacs naming,
667 * unless the input pdb file contained one or more force field specific
668 * rtp names as residue names.
672 /* struct for sorting the atoms from the pdb file */
674 int resnr; /* residue number */
675 int j; /* database order index */
676 int index; /* original atom number */
677 char anm1; /* second letter of atom name */
678 char altloc; /* alternate location indicator */
681 static int pdbicomp(const void *a, const void *b)
686 pa = (t_pdbindex *)a;
687 pb = (t_pdbindex *)b;
689 d = (pa->resnr - pb->resnr);
695 d = (pa->anm1 - pb->anm1);
698 d = (pa->altloc - pb->altloc);
706 static void sort_pdbatoms(t_restp restp[],
707 int natoms, t_atoms **pdbaptr, rvec **x,
708 t_blocka *block, char ***gnames)
710 t_atoms *pdba, *pdbnew;
724 for (i = 0; i < natoms; i++)
726 atomnm = *pdba->atomname[i];
727 rptr = &restp[pdba->atom[i].resind];
728 for (j = 0; (j < rptr->natom); j++)
730 if (gmx_strcasecmp(atomnm, *(rptr->atomname[j])) == 0)
735 if (j == rptr->natom)
740 "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
741 "while sorting atoms.\n%s", atomnm,
742 *pdba->resinfo[pdba->atom[i].resind].name,
743 pdba->resinfo[pdba->atom[i].resind].nr,
746 is_hydrogen(atomnm) ?
747 "\nFor a hydrogen, this can be a different protonation state, or it\n"
748 "might have had a different number in the PDB file and was rebuilt\n"
749 "(it might for instance have been H3, and we only expected H1 & H2).\n"
750 "Note that hydrogens might have been added to the entry for the N-terminus.\n"
751 "Remove this hydrogen or choose a different protonation state to solve it.\n"
752 "Option -ignh will ignore all hydrogens in the input." : ".");
753 gmx_fatal(FARGS, buf);
755 /* make shadow array to be sorted into indexgroup */
756 pdbi[i].resnr = pdba->atom[i].resind;
759 pdbi[i].anm1 = atomnm[1];
760 pdbi[i].altloc = pdba->pdbinfo[i].altloc;
762 qsort(pdbi, natoms, (size_t)sizeof(pdbi[0]), pdbicomp);
764 /* pdba is sorted in pdbnew using the pdbi index */
767 init_t_atoms(pdbnew, natoms, TRUE);
769 pdbnew->nr = pdba->nr;
770 pdbnew->nres = pdba->nres;
771 sfree(pdbnew->resinfo);
772 pdbnew->resinfo = pdba->resinfo;
773 for (i = 0; i < natoms; i++)
775 pdbnew->atom[i] = pdba->atom[pdbi[i].index];
776 pdbnew->atomname[i] = pdba->atomname[pdbi[i].index];
777 pdbnew->pdbinfo[i] = pdba->pdbinfo[pdbi[i].index];
778 copy_rvec((*x)[pdbi[i].index], (*xnew)[i]);
779 /* make indexgroup in block */
780 a[i] = pdbi[i].index;
783 sfree(pdba->atomname);
785 sfree(pdba->pdbinfo);
788 /* copy the sorted pdbnew back to pdba */
791 add_grp(block, gnames, natoms, a, "prot_sort");
797 static int remove_duplicate_atoms(t_atoms *pdba, rvec x[], gmx_bool bVerbose)
799 int i, j, oldnatoms, ndel;
802 printf("Checking for duplicate atoms....\n");
803 oldnatoms = pdba->nr;
805 /* NOTE: pdba->nr is modified inside the loop */
806 for (i = 1; (i < pdba->nr); i++)
808 /* compare 'i' and 'i-1', throw away 'i' if they are identical
809 this is a 'while' because multiple alternate locations can be present */
810 while ( (i < pdba->nr) &&
811 (pdba->atom[i-1].resind == pdba->atom[i].resind) &&
812 (strcmp(*pdba->atomname[i-1], *pdba->atomname[i]) == 0) )
817 ri = &pdba->resinfo[pdba->atom[i].resind];
818 printf("deleting duplicate atom %4s %s%4d%c",
819 *pdba->atomname[i], *ri->name, ri->nr, ri->ic);
820 if (ri->chainid && (ri->chainid != ' '))
822 printf(" ch %c", ri->chainid);
826 if (pdba->pdbinfo[i].atomnr)
828 printf(" pdb nr %4d", pdba->pdbinfo[i].atomnr);
830 if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
832 printf(" altloc %c", pdba->pdbinfo[i].altloc);
838 /* We can not free, since it might be in the symtab */
839 /* sfree(pdba->atomname[i]); */
840 for (j = i; j < pdba->nr; j++)
842 pdba->atom[j] = pdba->atom[j+1];
843 pdba->atomname[j] = pdba->atomname[j+1];
846 pdba->pdbinfo[j] = pdba->pdbinfo[j+1];
848 copy_rvec(x[j+1], x[j]);
850 srenew(pdba->atom, pdba->nr);
851 /* srenew(pdba->atomname, pdba->nr); */
852 srenew(pdba->pdbinfo, pdba->nr);
855 if (pdba->nr != oldnatoms)
857 printf("Now there are %d atoms. Deleted %d duplicates.\n", pdba->nr, ndel);
864 checkResidueTypeSanity(t_atoms * pdba,
867 gmx_residuetype_t * rt)
869 std::string startResidueString = gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
870 std::string endResidueString = gmx::formatString("%s%d", *pdba->resinfo[r1-1].name, pdba->resinfo[r1-1].nr);
872 // Check whether all residues in chain have the same chain ID.
873 bool allResiduesHaveSameChainID = true;
874 char chainID0 = pdba->resinfo[r0].chainid;
876 std::string residueString;
878 for (int i = r0 + 1; i < r1; i++)
880 chainID = pdba->resinfo[i].chainid;
881 if (chainID != chainID0)
883 allResiduesHaveSameChainID = false;
884 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
889 if (!allResiduesHaveSameChainID)
892 "The chain covering the range %s--%s does not have a consistent chain ID. "
893 "The first residue has ID '%c', while residue %s has ID '%c'.",
894 startResidueString.c_str(), endResidueString.c_str(),
895 chainID0, residueString.c_str(), chainID);
898 // At this point all residues have the same ID. If they are also non-blank
899 // we can be a bit more aggressive and require the types match too.
902 bool allResiduesHaveSameType = true;
903 const char *restype0;
905 gmx_residuetype_get_type(rt, *pdba->resinfo[r0].name, &restype0);
907 for (int i = r0 + 1; i < r1; i++)
909 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &restype);
910 if (gmx_strcasecmp(restype, restype0))
912 allResiduesHaveSameType = false;
913 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
918 if (!allResiduesHaveSameType)
921 "The residues in the chain %s--%s do not have a consistent type. "
922 "The first residue has type '%s', while residue %s is of type '%s'. "
923 "Either there is a mistake in your chain, or it includes nonstandard "
924 "residue names that have not yet been added to the residuetypes.dat "
925 "file in the GROMACS library directory. If there are other molecules "
926 "such as ligands, they should not have the same chain ID as the "
927 "adjacent protein chain since it's a separate molecule.",
928 startResidueString.c_str(), endResidueString.c_str(),
929 restype0, residueString.c_str(), restype);
934 static void find_nc_ter(t_atoms *pdba, int r0, int r1, int *r_start, int *r_end,
935 gmx_residuetype_t *rt)
938 const char *p_startrestype;
939 const char *p_restype;
944 int startWarnings = 0;
948 // Check that all residues have the same chain identifier, and if it is
949 // non-blank we also require the residue types to match.
950 checkResidueTypeSanity(pdba, r0, r1, rt);
952 // If we return correctly from checkResidueTypeSanity(), the only
953 // remaining cases where we can have non-matching residue types is if
954 // the chain ID was blank, which could be the case e.g. for a structure
955 // read from a GRO file or other file types without chain information.
956 // In that case we need to be a bit more liberal and detect chains based
957 // on the residue type.
959 // If we get here, the chain ID must be identical for all residues
960 char chainID = pdba->resinfo[r0].chainid;
962 /* Find the starting terminus (typially N or 5') */
963 for (i = r0; i < r1 && *r_start == -1; i++)
965 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_startrestype);
966 if (!gmx_strcasecmp(p_startrestype, "Protein") || !gmx_strcasecmp(p_startrestype, "DNA") || !gmx_strcasecmp(p_startrestype, "RNA") )
968 printf("Identified residue %s%d as a starting terminus.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
971 else if (!gmx_strcasecmp(p_startrestype, "Ion"))
975 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
979 printf("Disabling further notes about ions.\n");
985 if (startWarnings < 5)
989 printf("\nWarning: Starting residue %s%d in chain not identified as Protein/RNA/DNA.\n"
990 "This chain lacks identifiers, which makes it impossible to do strict\n"
991 "classification of the start/end residues. Here we need to guess this residue\n"
992 "should not be part of the chain and instead introduce a break, but that will\n"
993 "be catastrophic if they should in fact be linked. Please check your structure,\n"
994 "and add %s to residuetypes.dat if this was not correct.\n\n",
995 *pdba->resinfo[i].name, pdba->resinfo[i].nr, *pdba->resinfo[i].name);
999 printf("\nWarning: No residues in chain starting at %s%d identified as Protein/RNA/DNA.\n"
1000 "This makes it impossible to link them into a molecule, which could either be\n"
1001 "correct or a catastrophic error. Please check your structure, and add all\n"
1002 "necessary residue names to residuetypes.dat if this was not correct.\n\n",
1003 *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1006 if (startWarnings == 4)
1008 printf("Disabling further warnings about unidentified residues at start of chain.\n");
1016 /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1017 for (i = *r_start; i < r1; i++)
1019 gmx_residuetype_get_type(rt, *pdba->resinfo[i].name, &p_restype);
1020 if (!gmx_strcasecmp(p_restype, p_startrestype) && endWarnings == 0)
1024 else if (!gmx_strcasecmp(p_startrestype, "Ion"))
1028 printf("Residue %s%d has type 'Ion', assuming it is not linked into a chain.\n", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1032 printf("Disabling further notes about ions.\n");
1038 // This can only trigger if the chain ID is blank - otherwise the
1039 // call to checkResidueTypeSanity() will have caught the problem.
1040 if (endWarnings < 5)
1042 printf("\nWarning: Residue %s%d in chain has different type ('%s') from\n"
1043 "residue %s%d ('%s'). This chain lacks identifiers, which makes\n"
1044 "it impossible to do strict classification of the start/end residues. Here we\n"
1045 "need to guess this residue should not be part of the chain and instead\n"
1046 "introduce a break, but that will be catastrophic if they should in fact be\n"
1047 "linked. Please check your structure, and add %s to residuetypes.dat\n"
1048 "if this was not correct.\n\n",
1049 *pdba->resinfo[i].name, pdba->resinfo[i].nr, p_restype,
1050 *pdba->resinfo[*r_start].name, pdba->resinfo[*r_start].nr, p_startrestype, *pdba->resinfo[i].name);
1052 if (endWarnings == 4)
1054 printf("Disabling further warnings about unidentified residues at end of chain.\n");
1063 printf("Identified residue %s%d as a ending terminus.\n", *pdba->resinfo[*r_end].name, pdba->resinfo[*r_end].nr);
1077 static SplittingType getSplittingType(const char *chainsep)
1079 SplittingType splitting = SPLIT_TER_ONLY; /* keep compiler happy */
1081 /* Be a bit flexible to catch typos */
1082 if (!strncmp(chainsep, "id_o", 4))
1084 /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
1085 splitting = SPLIT_ID_OR_TER;
1086 printf("Splitting chemical chains based on TER records or chain id changing.\n");
1088 else if (!strncmp(chainsep, "int", 3))
1090 /* For later interactive splitting we tentatively assign new chain numbers at either changing id or ter records */
1091 splitting = SPLIT_INTERACTIVE;
1092 printf("Splitting chemical chains interactively.\n");
1094 else if (!strncmp(chainsep, "id_a", 4))
1096 splitting = SPLIT_ID_AND_TER;
1097 printf("Splitting chemical chains based on TER records and chain id changing.\n");
1099 else if (strlen(chainsep) == 2 && !strncmp(chainsep, "id", 4))
1101 splitting = SPLIT_ID_ONLY;
1102 printf("Splitting chemical chains based on changing chain id only (ignoring TER records).\n");
1104 else if (chainsep[0] == 't')
1106 splitting = SPLIT_TER_ONLY;
1107 printf("Splitting chemical chains based on TER records only (ignoring chain id).\n");
1111 gmx_fatal(FARGS, "Unidentified setting for chain separation: %s\n", chainsep);
1117 modify_chain_numbers(t_atoms * pdba,
1118 const char * chainsep)
1121 char old_prev_chainid;
1122 char old_this_chainid;
1123 int old_prev_chainnum;
1124 int old_this_chainnum;
1126 char select[STRLEN];
1130 const char * prev_atomname;
1131 const char * this_atomname;
1132 const char * prev_resname;
1133 const char * this_resname;
1139 SplittingType splitting = getSplittingType(chainsep);
1141 /* The default chain enumeration is based on TER records only, which is reflected in chainnum below */
1143 old_prev_chainid = '?';
1144 old_prev_chainnum = -1;
1147 this_atomname = nullptr;
1149 this_resname = nullptr;
1153 for (i = 0; i < pdba->nres; i++)
1155 ri = &pdba->resinfo[i];
1156 old_this_chainid = ri->chainid;
1157 old_this_chainnum = ri->chainnum;
1159 prev_atomname = this_atomname;
1160 prev_atomnum = this_atomnum;
1161 prev_resname = this_resname;
1162 prev_resnum = this_resnum;
1163 prev_chainid = this_chainid;
1165 this_atomname = *(pdba->atomname[i]);
1166 this_atomnum = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i+1;
1167 this_resname = *ri->name;
1168 this_resnum = ri->nr;
1169 this_chainid = ri->chainid;
1173 case SPLIT_ID_OR_TER:
1174 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1180 case SPLIT_ID_AND_TER:
1181 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1188 if (old_this_chainid != old_prev_chainid)
1194 case SPLIT_TER_ONLY:
1195 if (old_this_chainnum != old_prev_chainnum)
1200 case SPLIT_INTERACTIVE:
1201 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1205 printf("Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\n"
1206 "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]\n",
1207 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1208 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1210 if (nullptr == fgets(select, STRLEN-1, stdin))
1212 gmx_fatal(FARGS, "Error reading from stdin");
1215 if (i == 0 || select[0] == 'y')
1222 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1225 old_prev_chainid = old_this_chainid;
1226 old_prev_chainnum = old_this_chainnum;
1228 ri->chainnum = new_chainnum;
1257 int gmx_pdb2gmx(int argc, char *argv[])
1259 const char *desc[] = {
1260 "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1261 "some database files, adds hydrogens to the molecules and generates",
1262 "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1263 "and a topology in GROMACS format.",
1264 "These files can subsequently be processed to generate a run input file.",
1266 "[THISMODULE] will search for force fields by looking for",
1267 "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1268 "of the current working directory and of the GROMACS library directory",
1269 "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1271 "By default the forcefield selection is interactive,",
1272 "but you can use the [TT]-ff[tt] option to specify one of the short names",
1273 "in the list on the command line instead. In that case [THISMODULE] just looks",
1274 "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1276 "After choosing a force field, all files will be read only from",
1277 "the corresponding force field directory.",
1278 "If you want to modify or add a residue types, you can copy the force",
1279 "field directory from the GROMACS library directory to your current",
1280 "working directory. If you want to add new protein residue types,",
1281 "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1282 "or copy the whole library directory to a local directory and set",
1283 "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1284 "Check Chapter 5 of the manual for more information about file formats.",
1287 "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1288 "need not necessarily contain a protein structure. Every kind of",
1289 "molecule for which there is support in the database can be converted.",
1290 "If there is no support in the database, you can add it yourself.[PAR]",
1292 "The program has limited intelligence, it reads a number of database",
1293 "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1294 "if necessary this can be done manually. The program can prompt the",
1295 "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1296 "desired. For Lys the choice is between neutral (two protons on NZ) or",
1297 "protonated (three protons, default), for Asp and Glu unprotonated",
1298 "(default) or protonated, for His the proton can be either on ND1,",
1299 "on NE2 or on both. By default these selections are done automatically.",
1300 "For His, this is based on an optimal hydrogen bonding",
1301 "conformation. Hydrogen bonds are defined based on a simple geometric",
1302 "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1303 "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1304 "[TT]-dist[tt] respectively.[PAR]",
1306 "The protonation state of N- and C-termini can be chosen interactively",
1307 "with the [TT]-ter[tt] flag. Default termini are ionized (NH3+ and COO-),",
1308 "respectively. Some force fields support zwitterionic forms for chains of",
1309 "one residue, but for polypeptides these options should NOT be selected.",
1310 "The AMBER force fields have unique forms for the terminal residues,",
1311 "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1312 "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1313 "respectively to use these forms, making sure you preserve the format",
1314 "of the coordinate file. Alternatively, use named terminating residues",
1315 "(e.g. ACE, NME).[PAR]",
1317 "The separation of chains is not entirely trivial since the markup",
1318 "in user-generated PDB files frequently varies and sometimes it",
1319 "is desirable to merge entries across a TER record, for instance",
1320 "if you want a disulfide bridge or distance restraints between",
1321 "two protein chains or if you have a HEME group bound to a protein.",
1322 "In such cases multiple chains should be contained in a single",
1323 "[TT]moleculetype[tt] definition.",
1324 "To handle this, [THISMODULE] uses two separate options.",
1325 "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1326 "start, and termini added when applicable. This can be done based on the",
1327 "existence of TER records, when the chain id changes, or combinations of either",
1328 "or both of these. You can also do the selection fully interactively.",
1329 "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1330 "are merged into one moleculetype, after adding all the chemical termini (or not).",
1331 "This can be turned off (no merging), all non-water chains can be merged into a",
1332 "single molecule, or the selection can be done interactively.[PAR]",
1334 "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1335 "If any of the occupancies are not one, indicating that the atom is",
1336 "not resolved well in the structure, a warning message is issued.",
1337 "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1338 "all occupancy fields may be zero. Either way, it is up to the user",
1339 "to verify the correctness of the input data (read the article!).[PAR]",
1341 "During processing the atoms will be reordered according to GROMACS",
1342 "conventions. With [TT]-n[tt] an index file can be generated that",
1343 "contains one group reordered in the same way. This allows you to",
1344 "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1345 "one limitation: reordering is done after the hydrogens are stripped",
1346 "from the input and before new hydrogens are added. This means that",
1347 "you should not use [TT]-ignh[tt].[PAR]",
1349 "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1350 "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1351 "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1354 "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1355 "motions. Angular and out-of-plane motions can be removed by changing",
1356 "hydrogens into virtual sites and fixing angles, which fixes their",
1357 "position relative to neighboring atoms. Additionally, all atoms in the",
1358 "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1359 "can be converted into virtual sites, eliminating the fast improper dihedral",
1360 "fluctuations in these rings. [BB]Note[bb] that in this case all other hydrogen",
1361 "atoms are also converted to virtual sites. The mass of all atoms that are",
1362 "converted into virtual sites, is added to the heavy atoms.[PAR]",
1363 "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1364 "done by increasing the hydrogen-mass by a factor of 4. This is also",
1365 "done for water hydrogens to slow down the rotational motion of water.",
1366 "The increase in mass of the hydrogens is subtracted from the bonded",
1367 "(heavy) atom so that the total mass of the system remains the same."
1371 FILE *fp, *top_file, *top_file2, *itp_file = nullptr;
1373 t_atoms pdba_all, *pdba;
1377 int chain, nch, maxch, nwaterchain;
1379 t_chain *chains, *cc;
1380 char select[STRLEN];
1388 int i, j, k, l, nrtp;
1389 int *swap_index, si;
1393 gpp_atomtype_t atype;
1394 gmx_residuetype_t*rt;
1396 char fn[256], itp_fn[STRLEN], posre_fn[STRLEN], buf_fn[STRLEN];
1397 char molname[STRLEN], title[STRLEN];
1398 char *c, forcefield[STRLEN], ffdir[STRLEN];
1399 char ffname[STRLEN], suffix[STRLEN], buf[STRLEN];
1407 rtprename_t *rtprename = nullptr;
1408 int nah, nNtdb, nCtdb, ntdblist;
1409 t_hackblock *ntdb, *ctdb, **tdblist;
1413 gmx_bool bVsites = FALSE, bWat, bPrevWat = FALSE, bITP, bVsiteAromatics = FALSE;
1415 t_hackblock *hb_chain;
1416 t_restp *restp_chain;
1417 gmx_output_env_t *oenv;
1418 const char *p_restype;
1422 const char * prev_atomname;
1423 const char * this_atomname;
1424 const char * prev_resname;
1425 const char * this_resname;
1430 int prev_chainnumber;
1431 int this_chainnumber;
1433 int this_chainstart;
1434 int prev_chainstart;
1441 { efSTX, "-f", "eiwit.pdb", ffREAD },
1442 { efSTO, "-o", "conf", ffWRITE },
1443 { efTOP, nullptr, nullptr, ffWRITE },
1444 { efITP, "-i", "posre", ffWRITE },
1445 { efNDX, "-n", "clean", ffOPTWR },
1446 { efSTO, "-q", "clean.pdb", ffOPTWR }
1448 #define NFILE asize(fnm)
1450 gmx_bool bNewRTP = FALSE;
1451 gmx_bool bInter = FALSE, bCysMan = FALSE;
1452 gmx_bool bLysMan = FALSE, bAspMan = FALSE, bGluMan = FALSE, bHisMan = FALSE;
1453 gmx_bool bGlnMan = FALSE, bArgMan = FALSE;
1454 gmx_bool bTerMan = FALSE, bUnA = FALSE, bHeavyH = FALSE;
1455 gmx_bool bSort = TRUE, bAllowMissing = FALSE, bRemoveH = FALSE;
1456 gmx_bool bDeuterate = FALSE, bVerbose = FALSE, bChargeGroups = TRUE, bCmap = TRUE;
1457 gmx_bool bRenumRes = FALSE, bRTPresname = FALSE;
1458 real angle = 135.0, distance = 0.3, posre_fc = 1000;
1459 real long_bond_dist = 0.25, short_bond_dist = 0.05;
1460 const char *vsitestr[] = { nullptr, "none", "hydrogens", "aromatics", nullptr };
1461 const char *watstr[] = { nullptr, "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p", nullptr };
1462 const char *chainsep[] = { nullptr, "id_or_ter", "id_and_ter", "ter", "id", "interactive", nullptr };
1463 const char *merge[] = {nullptr, "no", "all", "interactive", nullptr };
1464 const char *ff = "select";
1467 { "-newrtp", FALSE, etBOOL, {&bNewRTP},
1468 "HIDDENWrite the residue database in new format to [TT]new.rtp[tt]"},
1469 { "-lb", FALSE, etREAL, {&long_bond_dist},
1470 "HIDDENLong bond warning distance" },
1471 { "-sb", FALSE, etREAL, {&short_bond_dist},
1472 "HIDDENShort bond warning distance" },
1473 { "-chainsep", FALSE, etENUM, {chainsep},
1474 "Condition in PDB files when a new chain should be started (adding termini)" },
1475 { "-merge", FALSE, etENUM, {&merge},
1476 "Merge multiple chains into a single [moleculetype]" },
1477 { "-ff", FALSE, etSTR, {&ff},
1478 "Force field, interactive by default. Use [TT]-h[tt] for information." },
1479 { "-water", FALSE, etENUM, {watstr},
1480 "Water model to use" },
1481 { "-inter", FALSE, etBOOL, {&bInter},
1482 "Set the next 8 options to interactive"},
1483 { "-ss", FALSE, etBOOL, {&bCysMan},
1484 "Interactive SS bridge selection" },
1485 { "-ter", FALSE, etBOOL, {&bTerMan},
1486 "Interactive termini selection, instead of charged (default)" },
1487 { "-lys", FALSE, etBOOL, {&bLysMan},
1488 "Interactive lysine selection, instead of charged" },
1489 { "-arg", FALSE, etBOOL, {&bArgMan},
1490 "Interactive arginine selection, instead of charged" },
1491 { "-asp", FALSE, etBOOL, {&bAspMan},
1492 "Interactive aspartic acid selection, instead of charged" },
1493 { "-glu", FALSE, etBOOL, {&bGluMan},
1494 "Interactive glutamic acid selection, instead of charged" },
1495 { "-gln", FALSE, etBOOL, {&bGlnMan},
1496 "Interactive glutamine selection, instead of neutral" },
1497 { "-his", FALSE, etBOOL, {&bHisMan},
1498 "Interactive histidine selection, instead of checking H-bonds" },
1499 { "-angle", FALSE, etREAL, {&angle},
1500 "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)" },
1501 { "-dist", FALSE, etREAL, {&distance},
1502 "Maximum donor-acceptor distance for a H-bond (nm)" },
1503 { "-una", FALSE, etBOOL, {&bUnA},
1504 "Select aromatic rings with united CH atoms on phenylalanine, "
1505 "tryptophane and tyrosine" },
1506 { "-sort", FALSE, etBOOL, {&bSort},
1507 "HIDDENSort the residues according to database, turning this off is dangerous as charge groups might be broken in parts" },
1508 { "-ignh", FALSE, etBOOL, {&bRemoveH},
1509 "Ignore hydrogen atoms that are in the coordinate file" },
1510 { "-missing", FALSE, etBOOL, {&bAllowMissing},
1511 "Continue when atoms are missing and bonds cannot be made, dangerous" },
1512 { "-v", FALSE, etBOOL, {&bVerbose},
1513 "Be slightly more verbose in messages" },
1514 { "-posrefc", FALSE, etREAL, {&posre_fc},
1515 "Force constant for position restraints" },
1516 { "-vsite", FALSE, etENUM, {vsitestr},
1517 "Convert atoms to virtual sites" },
1518 { "-heavyh", FALSE, etBOOL, {&bHeavyH},
1519 "Make hydrogen atoms heavy" },
1520 { "-deuterate", FALSE, etBOOL, {&bDeuterate},
1521 "Change the mass of hydrogens to 2 amu" },
1522 { "-chargegrp", TRUE, etBOOL, {&bChargeGroups},
1523 "Use charge groups in the [REF].rtp[ref] file" },
1524 { "-cmap", TRUE, etBOOL, {&bCmap},
1525 "Use cmap torsions (if enabled in the [REF].rtp[ref] file)" },
1526 { "-renum", TRUE, etBOOL, {&bRenumRes},
1527 "Renumber the residues consecutively in the output" },
1528 { "-rtpres", TRUE, etBOOL, {&bRTPresname},
1529 "Use [REF].rtp[ref] entry names as residue names" }
1531 #define NPARGS asize(pa)
1533 if (!parse_common_args(&argc, argv, 0, NFILE, fnm, asize(pa), pa, asize(desc), desc,
1539 /* Force field selection, interactive or direct */
1540 choose_ff(strcmp(ff, "select") == 0 ? nullptr : ff,
1541 forcefield, sizeof(forcefield),
1542 ffdir, sizeof(ffdir));
1544 if (strlen(forcefield) > 0)
1546 strcpy(ffname, forcefield);
1547 ffname[0] = toupper(ffname[0]);
1551 gmx_fatal(FARGS, "Empty forcefield string");
1554 printf("\nUsing the %s force field in directory %s\n\n",
1557 choose_watermodel(watstr[0], ffdir, &watermodel);
1561 /* if anything changes here, also change description of -inter */
1576 else if (bDeuterate)
1585 /* parse_common_args ensures vsitestr has been selected, but
1586 clang-static-analyzer needs clues to know that */
1587 GMX_ASSERT(vsitestr[0], "-vsite default wasn't processed correctly");
1588 switch (vsitestr[0][0])
1590 case 'n': /* none */
1592 bVsiteAromatics = FALSE;
1594 case 'h': /* hydrogens */
1596 bVsiteAromatics = FALSE;
1598 case 'a': /* aromatics */
1600 bVsiteAromatics = TRUE;
1603 gmx_fatal(FARGS, "DEATH HORROR in $s (%d): vsitestr[0]='%s'",
1604 __FILE__, __LINE__, vsitestr[0]);
1607 /* Open the symbol table */
1608 open_symtab(&symtab);
1610 /* Residue type database */
1611 gmx_residuetype_init(&rt);
1613 /* Read residue renaming database(s), if present */
1614 nrrn = fflib_search_file_end(ffdir, ".r2b", FALSE, &rrn);
1617 rtprename = nullptr;
1618 for (i = 0; i < nrrn; i++)
1620 fp = fflib_open(rrn[i]);
1621 read_rtprename(rrn[i], fp, &nrtprename, &rtprename);
1627 /* Add all alternative names from the residue renaming database to the list of recognized amino/nucleic acids. */
1628 for (i = 0; i < nrtprename; i++)
1630 rc = gmx_residuetype_get_type(rt, rtprename[i].gmx, &p_restype);
1632 /* Only add names if the 'standard' gromacs/iupac base name was found */
1635 gmx_residuetype_add(rt, rtprename[i].main, p_restype);
1636 gmx_residuetype_add(rt, rtprename[i].nter, p_restype);
1637 gmx_residuetype_add(rt, rtprename[i].cter, p_restype);
1638 gmx_residuetype_add(rt, rtprename[i].bter, p_restype);
1643 if (watermodel != nullptr && (strstr(watermodel, "4p") ||
1644 strstr(watermodel, "4P")))
1648 else if (watermodel != nullptr && (strstr(watermodel, "5p") ||
1649 strstr(watermodel, "5P")))
1658 aps = gmx_atomprop_init();
1659 natom = read_pdball(opt2fn("-f", NFILE, fnm), opt2fn_null("-q", NFILE, fnm), title,
1660 &pdba_all, &pdbx, &ePBC, box, bRemoveH, &symtab, rt, watres,
1665 gmx_fatal(FARGS, "No atoms found in pdb file %s\n", opt2fn("-f", NFILE, fnm));
1668 printf("Analyzing pdb file\n");
1671 modify_chain_numbers(&pdba_all, chainsep[0]);
1675 this_atomname = nullptr;
1677 this_resname = nullptr;
1680 this_chainnumber = -1;
1681 this_chainstart = 0;
1682 /* Keep the compiler happy */
1683 prev_chainstart = 0;
1687 snew(pdb_ch, maxch);
1690 for (i = 0; (i < natom); i++)
1692 ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
1694 /* TODO this should live in a helper object, and consolidate
1695 that with code in modify_chain_numbers */
1696 prev_atomname = this_atomname;
1697 prev_atomnum = this_atomnum;
1698 prev_resname = this_resname;
1699 prev_resnum = this_resnum;
1700 prev_chainid = this_chainid;
1701 prev_chainnumber = this_chainnumber;
1704 prev_chainstart = this_chainstart;
1707 this_atomname = *pdba_all.atomname[i];
1708 this_atomnum = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i+1;
1709 this_resname = *ri->name;
1710 this_resnum = ri->nr;
1711 this_chainid = ri->chainid;
1712 this_chainnumber = ri->chainnum;
1714 bWat = gmx_strcasecmp(*ri->name, watres) == 0;
1716 if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat != bPrevWat))
1718 this_chainstart = pdba_all.atom[i].resind;
1722 if (!strncmp(merge[0], "int", 3))
1724 printf("Merge chain ending with residue %s%d (chain id '%c', atom %d %s) and chain starting with\n"
1725 "residue %s%d (chain id '%c', atom %d %s) into a single moleculetype (keeping termini)? [n/y]\n",
1726 prev_resname, prev_resnum, prev_chainid, prev_atomnum, prev_atomname,
1727 this_resname, this_resnum, this_chainid, this_atomnum, this_atomname);
1729 if (nullptr == fgets(select, STRLEN-1, stdin))
1731 gmx_fatal(FARGS, "Error reading from stdin");
1733 bMerged = (select[0] == 'y');
1735 else if (!strncmp(merge[0], "all", 3))
1743 pdb_ch[nch-1].chainstart[pdb_ch[nch-1].nterpairs] =
1744 pdba_all.atom[i].resind - prev_chainstart;
1745 pdb_ch[nch-1].nterpairs++;
1746 srenew(pdb_ch[nch-1].chainstart, pdb_ch[nch-1].nterpairs+1);
1751 /* set natom for previous chain */
1754 pdb_ch[nch-1].natom = i-pdb_ch[nch-1].start;
1761 /* check if chain identifier was used before */
1762 for (j = 0; (j < nch); j++)
1764 if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
1766 printf("WARNING: Chain identifier '%c' is used in two non-sequential blocks.\n"
1767 "They will be treated as separate chains unless you reorder your file.\n",
1771 // TODO This is too convoluted. Use a std::vector
1775 srenew(pdb_ch, maxch);
1777 pdb_ch[nch].chainid = ri->chainid;
1778 pdb_ch[nch].chainnum = ri->chainnum;
1779 pdb_ch[nch].start = i;
1780 pdb_ch[nch].bAllWat = bWat;
1783 pdb_ch[nch].nterpairs = 0;
1787 pdb_ch[nch].nterpairs = 1;
1789 snew(pdb_ch[nch].chainstart, pdb_ch[nch].nterpairs+1);
1790 /* modified [nch] to [0] below */
1791 pdb_ch[nch].chainstart[0] = 0;
1797 pdb_ch[nch-1].natom = natom-pdb_ch[nch-1].start;
1799 /* set all the water blocks at the end of the chain */
1800 snew(swap_index, nch);
1802 for (i = 0; i < nch; i++)
1804 if (!pdb_ch[i].bAllWat)
1810 for (i = 0; i < nch; i++)
1812 if (pdb_ch[i].bAllWat)
1818 if (nwaterchain > 1)
1820 printf("Moved all the water blocks to the end\n");
1824 /* copy pdb data and x for all chains */
1825 for (i = 0; (i < nch); i++)
1828 chains[i].chainid = pdb_ch[si].chainid;
1829 chains[i].chainnum = pdb_ch[si].chainnum;
1830 chains[i].bAllWat = pdb_ch[si].bAllWat;
1831 chains[i].nterpairs = pdb_ch[si].nterpairs;
1832 chains[i].chainstart = pdb_ch[si].chainstart;
1833 snew(chains[i].ntdb, pdb_ch[si].nterpairs);
1834 snew(chains[i].ctdb, pdb_ch[si].nterpairs);
1835 snew(chains[i].r_start, pdb_ch[si].nterpairs);
1836 snew(chains[i].r_end, pdb_ch[si].nterpairs);
1838 snew(chains[i].pdba, 1);
1839 init_t_atoms(chains[i].pdba, pdb_ch[si].natom, TRUE);
1840 snew(chains[i].x, chains[i].pdba->nr);
1841 for (j = 0; j < chains[i].pdba->nr; j++)
1843 chains[i].pdba->atom[j] = pdba_all.atom[pdb_ch[si].start+j];
1844 snew(chains[i].pdba->atomname[j], 1);
1845 *chains[i].pdba->atomname[j] =
1846 gmx_strdup(*pdba_all.atomname[pdb_ch[si].start+j]);
1847 chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start+j];
1848 copy_rvec(pdbx[pdb_ch[si].start+j], chains[i].x[j]);
1850 /* Re-index the residues assuming that the indices are continuous */
1851 k = chains[i].pdba->atom[0].resind;
1852 nres = chains[i].pdba->atom[chains[i].pdba->nr-1].resind - k + 1;
1853 chains[i].pdba->nres = nres;
1854 for (j = 0; j < chains[i].pdba->nr; j++)
1856 chains[i].pdba->atom[j].resind -= k;
1858 srenew(chains[i].pdba->resinfo, nres);
1859 for (j = 0; j < nres; j++)
1861 chains[i].pdba->resinfo[j] = pdba_all.resinfo[k+j];
1862 snew(chains[i].pdba->resinfo[j].name, 1);
1863 *chains[i].pdba->resinfo[j].name = gmx_strdup(*pdba_all.resinfo[k+j].name);
1864 /* make all chain identifiers equal to that of the chain */
1865 chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
1869 if (nchainmerges > 0)
1871 printf("\nMerged chains into joint molecule definitions at %d places.\n\n",
1875 printf("There are %d chains and %d blocks of water and "
1876 "%d residues with %d atoms\n",
1877 nch-nwaterchain, nwaterchain,
1878 pdba_all.nres, natom);
1880 printf("\n %5s %4s %6s\n", "chain", "#res", "#atoms");
1881 for (i = 0; (i < nch); i++)
1883 printf(" %d '%c' %5d %6d %s\n",
1884 i+1, chains[i].chainid ? chains[i].chainid : '-',
1885 chains[i].pdba->nres, chains[i].pdba->nr,
1886 chains[i].bAllWat ? "(only water)" : "");
1890 check_occupancy(&pdba_all, opt2fn("-f", NFILE, fnm), bVerbose);
1892 /* Read atomtypes... */
1893 atype = read_atype(ffdir, &symtab);
1895 /* read residue database */
1896 printf("Reading residue database... (%s)\n", forcefield);
1897 nrtpf = fflib_search_file_end(ffdir, ".rtp", TRUE, &rtpf);
1900 for (i = 0; i < nrtpf; i++)
1902 read_resall(rtpf[i], &nrtp, &restp, atype, &symtab, FALSE);
1908 /* Not correct with multiple rtp input files with different bonded types */
1909 fp = gmx_fio_fopen("new.rtp", "w");
1910 print_resall(fp, nrtp, restp, atype);
1914 /* read hydrogen database */
1915 nah = read_h_db(ffdir, &ah);
1917 /* Read Termini database... */
1918 nNtdb = read_ter_db(ffdir, 'n', &ntdb, atype);
1919 nCtdb = read_ter_db(ffdir, 'c', &ctdb, atype);
1921 top_fn = ftp2fn(efTOP, NFILE, fnm);
1922 top_file = gmx_fio_fopen(top_fn, "w");
1924 print_top_header(top_file, top_fn, FALSE, ffdir, mHmult);
1930 for (chain = 0; (chain < nch); chain++)
1932 cc = &(chains[chain]);
1934 /* set pdba, natom and nres to the current chain */
1937 natom = cc->pdba->nr;
1938 nres = cc->pdba->nres;
1940 if (cc->chainid && ( cc->chainid != ' ' ) )
1942 printf("Processing chain %d '%c' (%d atoms, %d residues)\n",
1943 chain+1, cc->chainid, natom, nres);
1947 printf("Processing chain %d (%d atoms, %d residues)\n",
1948 chain+1, natom, nres);
1951 process_chain(pdba, x, bUnA, bUnA, bUnA, bLysMan, bAspMan, bGluMan,
1952 bHisMan, bArgMan, bGlnMan, angle, distance, &symtab,
1953 nrtprename, rtprename);
1955 cc->chainstart[cc->nterpairs] = pdba->nres;
1957 for (i = 0; i < cc->nterpairs; i++)
1959 find_nc_ter(pdba, cc->chainstart[i], cc->chainstart[i+1],
1960 &(cc->r_start[j]), &(cc->r_end[j]), rt);
1962 if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
1968 if (cc->nterpairs == 0)
1970 printf("Problem with chain definition, or missing terminal residues.\n"
1971 "This chain does not appear to contain a recognized chain molecule.\n"
1972 "If this is incorrect, you can edit residuetypes.dat to modify the behavior.\n");
1975 /* Check for disulfides and other special bonds */
1976 nssbonds = mk_specbonds(pdba, x, bCysMan, &ssbonds, bVerbose);
1980 rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, nrtprename, rtprename,
1988 sprintf(fn, "chain.pdb");
1992 sprintf(fn, "chain_%c%d.pdb", cc->chainid, cc->chainnum);
1994 write_sto_conf(fn, title, pdba, x, nullptr, ePBC, box);
1998 for (i = 0; i < cc->nterpairs; i++)
2002 * We first apply a filter so we only have the
2003 * termini that can be applied to the residue in question
2004 * (or a generic terminus if no-residue specific is available).
2006 /* First the N terminus */
2009 tdblist = filter_ter(nNtdb, ntdb,
2010 *pdba->resinfo[cc->r_start[i]].name,
2014 printf("No suitable end (N or 5') terminus found in database - assuming this residue\n"
2015 "is already in a terminus-specific form and skipping terminus selection.\n");
2016 cc->ntdb[i] = nullptr;
2020 if (bTerMan && ntdblist > 1)
2022 sprintf(select, "Select start terminus type for %s-%d",
2023 *pdba->resinfo[cc->r_start[i]].name,
2024 pdba->resinfo[cc->r_start[i]].nr);
2025 cc->ntdb[i] = choose_ter(ntdblist, tdblist, select);
2029 cc->ntdb[i] = tdblist[0];
2032 printf("Start terminus %s-%d: %s\n",
2033 *pdba->resinfo[cc->r_start[i]].name,
2034 pdba->resinfo[cc->r_start[i]].nr,
2035 (cc->ntdb[i])->name);
2041 cc->ntdb[i] = nullptr;
2044 /* And the C terminus */
2047 tdblist = filter_ter(nCtdb, ctdb,
2048 *pdba->resinfo[cc->r_end[i]].name,
2052 printf("No suitable end (C or 3') terminus found in database - assuming this residue\n"
2053 "is already in a terminus-specific form and skipping terminus selection.\n");
2054 cc->ctdb[i] = nullptr;
2058 if (bTerMan && ntdblist > 1)
2060 sprintf(select, "Select end terminus type for %s-%d",
2061 *pdba->resinfo[cc->r_end[i]].name,
2062 pdba->resinfo[cc->r_end[i]].nr);
2063 cc->ctdb[i] = choose_ter(ntdblist, tdblist, select);
2067 cc->ctdb[i] = tdblist[0];
2069 printf("End terminus %s-%d: %s\n",
2070 *pdba->resinfo[cc->r_end[i]].name,
2071 pdba->resinfo[cc->r_end[i]].nr,
2072 (cc->ctdb[i])->name);
2078 cc->ctdb[i] = nullptr;
2081 /* lookup hackblocks and rtp for all residues */
2082 get_hackblocks_rtp(&hb_chain, &restp_chain,
2083 nrtp, restp, pdba->nres, pdba->resinfo,
2084 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end,
2086 /* ideally, now we would not need the rtp itself anymore, but do
2087 everything using the hb and restp arrays. Unfortunately, that
2088 requires some re-thinking of code in gen_vsite.c, which I won't
2089 do now :( AF 26-7-99 */
2091 rename_atoms(nullptr, ffdir,
2092 pdba, &symtab, restp_chain, FALSE, rt, FALSE, bVerbose);
2094 match_atomnames_with_rtp(restp_chain, hb_chain, pdba, x, bVerbose);
2098 block = new_blocka();
2100 sort_pdbatoms(restp_chain, natom, &pdba, &x, block, &gnames);
2101 remove_duplicate_atoms(pdba, x, bVerbose);
2102 if (ftp2bSet(efNDX, NFILE, fnm))
2106 fprintf(stderr, "WARNING: with the -remh option the generated "
2107 "index file (%s) might be useless\n"
2108 "(the index file is generated before hydrogens are added)",
2109 ftp2fn(efNDX, NFILE, fnm));
2111 write_index(ftp2fn(efNDX, NFILE, fnm), block, gnames, FALSE, 0);
2113 for (i = 0; i < block->nr; i++)
2122 fprintf(stderr, "WARNING: "
2123 "without sorting no check for duplicate atoms can be done\n");
2126 /* Generate Hydrogen atoms (and termini) in the sequence */
2127 printf("Generating any missing hydrogen atoms and/or adding termini.\n");
2128 natom = add_h(&pdba, &x, nah, ah,
2129 cc->nterpairs, cc->ntdb, cc->ctdb, cc->r_start, cc->r_end, bAllowMissing,
2130 nullptr, nullptr, TRUE, FALSE);
2131 printf("Now there are %d residues with %d atoms\n",
2132 pdba->nres, pdba->nr);
2135 write_pdbfile(debug, title, pdba, x, ePBC, box, ' ', 0, nullptr, TRUE);
2140 for (i = 0; (i < natom); i++)
2142 fprintf(debug, "Res %s%d atom %d %s\n",
2143 *(pdba->resinfo[pdba->atom[i].resind].name),
2144 pdba->resinfo[pdba->atom[i].resind].nr, i+1, *pdba->atomname[i]);
2148 strcpy(posre_fn, ftp2fn(efITP, NFILE, fnm));
2150 /* make up molecule name(s) */
2152 k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2154 gmx_residuetype_get_type(rt, *pdba->resinfo[k].name, &p_restype);
2160 sprintf(molname, "Water");
2164 this_chainid = cc->chainid;
2166 /* Add the chain id if we have one */
2167 if (this_chainid != ' ')
2169 sprintf(buf, "_chain_%c", this_chainid);
2170 strcat(suffix, buf);
2173 /* Check if there have been previous chains with the same id */
2175 for (k = 0; k < chain; k++)
2177 if (cc->chainid == chains[k].chainid)
2182 /* Add the number for this chain identifier if there are multiple copies */
2186 sprintf(buf, "%d", nid_used+1);
2187 strcat(suffix, buf);
2190 if (strlen(suffix) > 0)
2192 sprintf(molname, "%s%s", p_restype, suffix);
2196 strcpy(molname, p_restype);
2200 if ((nch-nwaterchain > 1) && !cc->bAllWat)
2203 strcpy(itp_fn, top_fn);
2204 printf("Chain time...\n");
2205 c = strrchr(itp_fn, '.');
2206 sprintf(c, "_%s.itp", molname);
2207 c = strrchr(posre_fn, '.');
2208 sprintf(c, "_%s.itp", molname);
2209 if (strcmp(itp_fn, posre_fn) == 0)
2211 strcpy(buf_fn, posre_fn);
2212 c = strrchr(buf_fn, '.');
2214 sprintf(posre_fn, "%s_pr.itp", buf_fn);
2218 srenew(incls, nincl);
2219 incls[nincl-1] = gmx_strdup(itp_fn);
2220 itp_file = gmx_fio_fopen(itp_fn, "w");
2227 srenew(mols, nmol+1);
2230 mols[nmol].name = gmx_strdup("SOL");
2231 mols[nmol].nr = pdba->nres;
2235 mols[nmol].name = gmx_strdup(molname);
2242 print_top_comment(itp_file, itp_fn, ffdir, TRUE);
2247 top_file2 = nullptr;
2251 top_file2 = itp_file;
2255 top_file2 = top_file;
2258 pdb2top(top_file2, posre_fn, molname, pdba, &x, atype, &symtab,
2260 restp_chain, hb_chain,
2262 bVsites, bVsiteAromatics, ffdir,
2263 mHmult, nssbonds, ssbonds,
2264 long_bond_dist, short_bond_dist, bDeuterate, bChargeGroups, bCmap,
2265 bRenumRes, bRTPresname);
2269 write_posres(posre_fn, pdba, posre_fc);
2274 gmx_fio_fclose(itp_file);
2277 /* pdba and natom have been reassigned somewhere so: */
2283 if (cc->chainid == ' ')
2285 sprintf(fn, "chain.pdb");
2289 sprintf(fn, "chain_%c.pdb", cc->chainid);
2291 write_sto_conf(fn, "", pdba, x, nullptr, ePBC, box);
2295 if (watermodel == nullptr)
2297 for (chain = 0; chain < nch; chain++)
2299 if (chains[chain].bAllWat)
2301 gmx_fatal(FARGS, "You have chosen not to include a water model, but there is water in the input file. Select a water model or remove the water from your input file.");
2307 sprintf(buf_fn, "%s%c%s.itp", ffdir, DIR_SEPARATOR, watermodel);
2308 if (!fflib_fexist(buf_fn))
2310 gmx_fatal(FARGS, "The topology file '%s' for the selected water model '%s' can not be found in the force field directory. Select a different water model.",
2311 buf_fn, watermodel);
2315 print_top_mols(top_file, title, ffdir, watermodel, nincl, incls, nmol, mols);
2316 gmx_fio_fclose(top_file);
2318 gmx_residuetype_destroy(rt);
2320 /* now merge all chains back together */
2323 for (i = 0; (i < nch); i++)
2325 natom += chains[i].pdba->nr;
2326 nres += chains[i].pdba->nres;
2329 init_t_atoms(atoms, natom, FALSE);
2330 for (i = 0; i < atoms->nres; i++)
2332 sfree(atoms->resinfo[i].name);
2334 sfree(atoms->resinfo);
2336 snew(atoms->resinfo, nres);
2340 for (i = 0; (i < nch); i++)
2344 printf("Including chain %d in system: %d atoms %d residues\n",
2345 i+1, chains[i].pdba->nr, chains[i].pdba->nres);
2347 for (j = 0; (j < chains[i].pdba->nr); j++)
2349 atoms->atom[k] = chains[i].pdba->atom[j];
2350 atoms->atom[k].resind += l; /* l is processed nr of residues */
2351 atoms->atomname[k] = chains[i].pdba->atomname[j];
2352 atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2353 copy_rvec(chains[i].x[j], x[k]);
2356 for (j = 0; (j < chains[i].pdba->nres); j++)
2358 atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2361 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2369 fprintf(stderr, "Now there are %d atoms and %d residues\n", k, l);
2370 print_sums(atoms, TRUE);
2373 fprintf(stderr, "\nWriting coordinate file...\n");
2374 clear_rvec(box_space);
2377 make_new_box(atoms->nr, x, box, box_space, FALSE);
2379 write_sto_conf(ftp2fn(efSTO, NFILE, fnm), title, atoms, x, nullptr, ePBC, box);
2381 printf("\t\t--------- PLEASE NOTE ------------\n");
2382 printf("You have successfully generated a topology from: %s.\n",
2383 opt2fn("-f", NFILE, fnm));
2384 if (watermodel != nullptr)
2386 printf("The %s force field and the %s water model are used.\n",
2387 ffname, watermodel);
2391 printf("The %s force field is used.\n",
2394 printf("\t\t--------- ETON ESAELP ------------\n");