Use more modern clang for linters and sanitizers
[alexxy/gromacs.git] / src / gromacs / gmxpreprocess / pdb2gmx.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38 #include "gmxpre.h"
39
40 #include "gromacs/utility/enumerationhelpers.h"
41 #include "pdb2gmx.h"
42
43 #include <cctype>
44 #include <cstdlib>
45 #include <cstring>
46 #include <ctime>
47
48 #include <algorithm>
49 #include <memory>
50 #include <string>
51 #include <vector>
52
53 #include "gromacs/commandline/cmdlineoptionsmodule.h"
54 #include "gromacs/fileio/confio.h"
55 #include "gromacs/fileio/filetypes.h"
56 #include "gromacs/fileio/gmxfio.h"
57 #include "gromacs/fileio/pdbio.h"
58 #include "gromacs/gmxlib/conformation_utilities.h"
59 #include "gromacs/gmxpreprocess/fflibutil.h"
60 #include "gromacs/gmxpreprocess/genhydro.h"
61 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
62 #include "gromacs/gmxpreprocess/grompp_impl.h"
63 #include "gromacs/gmxpreprocess/h_db.h"
64 #include "gromacs/gmxpreprocess/hizzie.h"
65 #include "gromacs/gmxpreprocess/pdb2top.h"
66 #include "gromacs/gmxpreprocess/pgutil.h"
67 #include "gromacs/gmxpreprocess/specbond.h"
68 #include "gromacs/gmxpreprocess/ter_db.h"
69 #include "gromacs/gmxpreprocess/toputil.h"
70 #include "gromacs/gmxpreprocess/xlate.h"
71 #include "gromacs/math/vec.h"
72 #include "gromacs/options/basicoptions.h"
73 #include "gromacs/options/filenameoption.h"
74 #include "gromacs/options/ioptionscontainer.h"
75 #include "gromacs/topology/atomprop.h"
76 #include "gromacs/topology/block.h"
77 #include "gromacs/topology/index.h"
78 #include "gromacs/topology/residuetypes.h"
79 #include "gromacs/topology/symtab.h"
80 #include "gromacs/topology/topology.h"
81 #include "gromacs/utility/dir_separator.h"
82 #include "gromacs/utility/exceptions.h"
83 #include "gromacs/utility/fatalerror.h"
84 #include "gromacs/utility/filestream.h"
85 #include "gromacs/utility/loggerbuilder.h"
86 #include "gromacs/utility/path.h"
87 #include "gromacs/utility/smalloc.h"
88 #include "gromacs/utility/strdb.h"
89 #include "gromacs/utility/stringutil.h"
90
91 #include "hackblock.h"
92 #include "resall.h"
93
94 struct RtpRename
95 {
96     RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
97         gmx(newGmx),
98         main(newMain),
99         nter(newNter),
100         cter(newCter),
101         bter(newBter)
102     {
103     }
104     std::string gmx;
105     std::string main;
106     std::string nter;
107     std::string cter;
108     std::string bter;
109 };
110
111 namespace
112 {
113
114 const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
115 {
116     /* NOTE: This function returns the main building block name,
117      *       it does not take terminal renaming into account.
118      */
119     auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
120         return gmx::equalCaseInsensitive(name, rename.gmx);
121     });
122     return found != rr.end() ? found->main.c_str() : name.c_str();
123 }
124
125 const char* enumValueToLongString(HistidineStates enumValue)
126 {
127     constexpr gmx::EnumerationArray<HistidineStates, const char*> histidineStatesLongNames = {
128         "H on ND1 only", "H on NE2 only", "H on ND1 and NE2", "Coupled to Heme"
129     };
130     return histidineStatesLongNames[enumValue];
131 }
132
133 enum class AspartateStates : int
134 {
135     Deprot,
136     Prot,
137     Count
138 };
139
140 const char* enumValueToString(AspartateStates enumValue)
141 {
142     constexpr gmx::EnumerationArray<AspartateStates, const char*> aspartateStateNames = { "ASP",
143                                                                                           "ASPH" };
144     return aspartateStateNames[enumValue];
145 }
146
147 const char* enumValueToLongString(AspartateStates enumValue)
148 {
149     constexpr gmx::EnumerationArray<AspartateStates, const char*> aspartateStateLongNames = {
150         "Not protonated (charge -1)", "Protonated (charge 0)"
151     };
152     return aspartateStateLongNames[enumValue];
153 }
154
155 enum class GlutamateStates : int
156 {
157     Deprot,
158     Prot,
159     Count
160 };
161
162 const char* enumValueToString(GlutamateStates enumValue)
163 {
164     constexpr gmx::EnumerationArray<GlutamateStates, const char*> glutamateStateNames = { "GLU",
165                                                                                           "GLUH" };
166     return glutamateStateNames[enumValue];
167 }
168
169 const char* enumValueToLongString(GlutamateStates enumValue)
170 {
171     constexpr gmx::EnumerationArray<GlutamateStates, const char*> glutamateStateLongNames = {
172         "Not protonated (charge -1)", "Protonated (charge 0)"
173     };
174     return glutamateStateLongNames[enumValue];
175 }
176
177 enum class GlutamineStates : int
178 {
179     Deprot,
180     Prot,
181     Count
182 };
183
184 const char* enumValueToString(GlutamineStates enumValue)
185 {
186     constexpr gmx::EnumerationArray<GlutamineStates, const char*> glutamineStateNames = { "GLN",
187                                                                                           "QLN" };
188     return glutamineStateNames[enumValue];
189 }
190
191 const char* enumValueToLongString(GlutamineStates enumValue)
192 {
193     constexpr gmx::EnumerationArray<GlutamineStates, const char*> glutamineStateLongNames = {
194         "Not protonated (charge 0)", "Protonated (charge +1)"
195     };
196     return glutamineStateLongNames[enumValue];
197 }
198
199 enum class LysineStates : int
200 {
201     Deprot,
202     Prot,
203     Count
204 };
205
206 const char* enumValueToString(LysineStates enumValue)
207 {
208     constexpr gmx::EnumerationArray<LysineStates, const char*> lysineStateNames = { "LYSN", "LYS" };
209     return lysineStateNames[enumValue];
210 }
211
212 const char* enumValueToLongString(LysineStates enumValue)
213 {
214     constexpr gmx::EnumerationArray<LysineStates, const char*> lysineStateLongNames = {
215         "Not protonated (charge 0)", "Protonated (charge +1)"
216     };
217     return lysineStateLongNames[enumValue];
218 }
219
220 enum class ArginineStates : int
221 {
222     Deprot,
223     Prot,
224     Count
225 };
226
227 const char* enumValueToString(ArginineStates enumValue)
228 {
229     constexpr gmx::EnumerationArray<ArginineStates, const char*> arginineStatesNames = { "ARGN",
230                                                                                          "ARG" };
231     return arginineStatesNames[enumValue];
232 }
233
234 const char* enumValueToLongString(ArginineStates enumValue)
235 {
236     constexpr gmx::EnumerationArray<ArginineStates, const char*> arginineStatesLongNames = {
237         "Not protonated (charge 0)", "Protonated (charge +1)"
238     };
239     return arginineStatesLongNames[enumValue];
240 }
241
242 template<typename EnumType>
243 const char* select_res(int resnr, const char* title, gmx::ArrayRef<const RtpRename> rr)
244 {
245     printf("Which %s type do you want for residue %d\n", title, resnr + 1);
246     for (auto sel : gmx::EnumerationWrapper<EnumType>{})
247     {
248         printf("%d. %s (%s)\n",
249                static_cast<int>(sel),
250                enumValueToString(sel),
251                res2bb_notermini(enumValueToString(sel), rr));
252     }
253     printf("\nType a number:");
254     fflush(stdout);
255
256     int userSelection;
257     if (scanf("%d", &userSelection) != 1)
258     {
259         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
260     }
261
262     return enumValueToLongString(static_cast<EnumType>(userSelection));
263 }
264
265 const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
266 {
267     return select_res<AspartateStates>(resnr, "ASPARTIC ACID", rr);
268 }
269
270 const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
271 {
272     return select_res<GlutamateStates>(resnr, "GLUTAMIC ACID", rr);
273 }
274
275 const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
276 {
277     return select_res<GlutamineStates>(resnr, "GLUTAMINE", rr);
278 }
279
280 const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
281 {
282     return select_res<LysineStates>(resnr, "LYSINE", rr);
283 }
284
285 const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
286 {
287     return select_res<ArginineStates>(resnr, "ARGININE", rr);
288 }
289
290 const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
291 {
292     return select_res<HistidineStates>(resnr, "HISTIDINE", rr);
293 }
294
295 void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
296 {
297     char line[STRLEN], buf[STRLEN];
298
299     int ncol = 0;
300     while (get_a_line(fp, line, STRLEN))
301     {
302         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
303          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
304          * Note that the buffer length has been increased to 7 to allow this,
305          * so we just need to make sure the strings have zero-length initially.
306          */
307         char gmx[STRLEN];
308         char main[STRLEN];
309         char nter[STRLEN];
310         char cter[STRLEN];
311         char bter[STRLEN];
312         gmx[0]       = '\0';
313         main[0]      = '\0';
314         nter[0]      = '\0';
315         cter[0]      = '\0';
316         bter[0]      = '\0';
317         int       nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
318         RtpRename newEntry(gmx, main, nter, cter, bter);
319         if (ncol == 0)
320         {
321             if (nc != 2 && nc != 5)
322             {
323                 gmx_fatal(FARGS,
324                           "Residue renaming database '%s' has %d columns instead of %d or %d",
325                           fname,
326                           ncol,
327                           2,
328                           5);
329             }
330             ncol = nc;
331         }
332         else if (nc != ncol)
333         {
334             gmx_fatal(FARGS,
335                       "A line in residue renaming database '%s' has %d columns, while previous "
336                       "lines have %d columns",
337                       fname,
338                       nc,
339                       ncol);
340         }
341
342         if (nc == 2)
343         {
344             /* This file does not have special termini names, copy them from main */
345             newEntry.nter = newEntry.main;
346             newEntry.cter = newEntry.main;
347             newEntry.bter = newEntry.main;
348         }
349         rtprename->push_back(newEntry);
350     }
351 }
352
353 std::string search_resrename(gmx::ArrayRef<const RtpRename> rr, const char* name, bool bStart, bool bEnd, bool bCompareFFRTPname)
354 {
355     auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
356         return ((!bCompareFFRTPname && (name == rename.gmx))
357                 || (bCompareFFRTPname && (name == rename.main)));
358     });
359
360     std::string newName;
361     /* If found in the database, rename this residue's rtp building block,
362      * otherwise keep the old name.
363      */
364     if (found != rr.end())
365     {
366         if (bStart && bEnd)
367         {
368             newName = found->bter;
369         }
370         else if (bStart)
371         {
372             newName = found->nter;
373         }
374         else if (bEnd)
375         {
376             newName = found->cter;
377         }
378         else
379         {
380             newName = found->main;
381         }
382
383         if (newName[0] == '-')
384         {
385             gmx_fatal(FARGS,
386                       "In the chosen force field there is no residue type for '%s'%s",
387                       name,
388                       bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
389                              : (bEnd ? " as an ending terminus" : ""));
390         }
391     }
392
393     return newName;
394 }
395
396 void rename_resrtp(t_atoms*                       pdba,
397                    int                            nterpairs,
398                    gmx::ArrayRef<const int>       r_start,
399                    gmx::ArrayRef<const int>       r_end,
400                    gmx::ArrayRef<const RtpRename> rr,
401                    t_symtab*                      symtab,
402                    bool                           bVerbose,
403                    const gmx::MDLogger&           logger)
404 {
405     bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
406
407     for (int r = 0; r < pdba->nres; r++)
408     {
409         bool bStart = false;
410         bool bEnd   = false;
411         for (int j = 0; j < nterpairs; j++)
412         {
413             if (r == r_start[j])
414             {
415                 bStart = true;
416             }
417         }
418         for (int j = 0; j < nterpairs; j++)
419         {
420             if (r == r_end[j])
421             {
422                 bEnd = true;
423             }
424         }
425
426         std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
427
428         if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
429         {
430             /* This is a terminal residue, but the residue name,
431              * currently stored in .rtp, is not a standard residue name,
432              * but probably a force field specific rtp name.
433              * Check if we need to rename it because it is terminal.
434              */
435             newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
436         }
437
438         if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
439         {
440             if (bVerbose)
441             {
442                 GMX_LOG(logger.info)
443                         .asParagraph()
444                         .appendTextFormatted("Changing rtp entry of residue %d %s to '%s'",
445                                              pdba->resinfo[r].nr,
446                                              *pdba->resinfo[r].name,
447                                              newName.c_str());
448             }
449             pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
450         }
451     }
452 }
453
454 void pdbres_to_gmxrtp(t_atoms* pdba)
455 {
456     int i;
457
458     for (i = 0; (i < pdba->nres); i++)
459     {
460         if (pdba->resinfo[i].rtp == nullptr)
461         {
462             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
463         }
464     }
465 }
466
467 void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
468 {
469     char* resnm;
470     int   i;
471
472     for (i = 0; (i < pdba->nres); i++)
473     {
474         resnm = *pdba->resinfo[i].name;
475         if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
476             || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
477         {
478             /* Rename the residue name (not the rtp name) */
479             pdba->resinfo[i].name = put_symtab(symtab, newnm);
480         }
481     }
482 }
483
484 /*! \brief Rename all residues named \c oldnm to \c newnm
485  *
486  * Search for residues for which the residue name from the input
487  * configuration file matches \c oldnm, and when found choose the rtp
488  * entry and name of \c newnm.
489  *
490  * \todo Refactor this function to accept a lambda that accepts i and
491  * numMatchesFound but always produces \c newnm. Then remove
492  * renameResiduesInteractively by calling this method with suitable
493  * lambdas that capture its parameter \c rr and ignores
494  * numMatchesFound. */
495 void renameResidue(const gmx::MDLogger& logger,
496                    t_atoms*             pdba,
497                    const char*          oldnm,
498                    const char*          newnm,
499                    bool                 bFullCompare,
500                    t_symtab*            symtab)
501 {
502     int numMatchesFound = 0;
503     for (int i = 0; (i < pdba->nres); i++)
504     {
505         /* We have not set the rtp name yet, use the residue name */
506         const char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
507         if ((bFullCompare && (gmx::equalCaseInsensitive(residueNameInInputConfiguration, oldnm)))
508             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
509         {
510             /* Change the rtp building block name */
511             pdba->resinfo[i].rtp  = put_symtab(symtab, newnm);
512             pdba->resinfo[i].name = pdba->resinfo[i].rtp;
513             numMatchesFound++;
514         }
515     }
516     if (numMatchesFound > 0)
517     {
518         GMX_LOG(logger.info)
519                 .asParagraph()
520                 .appendTextFormatted(
521                         "Replaced %d residue%s named %s to the default %s. Use interactive "
522                         "selection of protonated residues if that is what you need.",
523                         numMatchesFound,
524                         numMatchesFound > 1 ? "s" : "",
525                         oldnm,
526                         newnm);
527     }
528 }
529
530 /*! \brief Rename all residues named \c oldnm according to the user's
531  * interactive choice
532  *
533  * Search for residues for which the residue name from the input
534  * configuration file matches \c oldnm, and when found choose the rtp
535  * entry and name of the interactive choice from \c gettp.
536  *
537  * \todo Remove this function, per todo in \c renameResidue. */
538 void renameResidueInteractively(t_atoms*    pdba,
539                                 const char* oldnm,
540                                 const char* gettp(int, gmx::ArrayRef<const RtpRename>),
541                                 bool        bFullCompare,
542                                 t_symtab*   symtab,
543                                 gmx::ArrayRef<const RtpRename> rr)
544 {
545     // Search for residues i for which the residue name from the input
546     // configuration file matches oldnm, so it can replaced by the rtp
547     // entry and name of newnm.
548     for (int i = 0; i < pdba->nres; i++)
549     {
550         /* We have not set the rtp name yet, use the residue name */
551         char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
552         if ((bFullCompare && (strcmp(residueNameInInputConfiguration, oldnm) == 0))
553             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
554         {
555             const char* interactiveRtpChoice = gettp(i, rr);
556             pdba->resinfo[i].rtp             = put_symtab(symtab, interactiveRtpChoice);
557             pdba->resinfo[i].name            = pdba->resinfo[i].rtp;
558         }
559     }
560 }
561
562 void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose, const gmx::MDLogger& logger)
563 {
564     int i, ftp;
565     int nzero   = 0;
566     int nnotone = 0;
567
568     ftp = fn2ftp(filename);
569     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
570     {
571         GMX_LOG(logger.warning).asParagraph().appendTextFormatted("No occupancies in %s", filename);
572     }
573     else
574     {
575         for (i = 0; (i < atoms->nr); i++)
576         {
577             if (atoms->pdbinfo[i].occup != 1)
578             {
579                 if (bVerbose)
580                 {
581                     GMX_LOG(logger.warning)
582                             .asParagraph()
583                             .appendTextFormatted("Occupancy for atom %s%d-%s is %f rather than 1",
584                                                  *atoms->resinfo[atoms->atom[i].resind].name,
585                                                  atoms->resinfo[atoms->atom[i].resind].nr,
586                                                  *atoms->atomname[i],
587                                                  atoms->pdbinfo[i].occup);
588                 }
589                 if (atoms->pdbinfo[i].occup == 0)
590                 {
591                     nzero++;
592                 }
593                 else
594                 {
595                     nnotone++;
596                 }
597             }
598         }
599         if (nzero == atoms->nr)
600         {
601             GMX_LOG(logger.warning)
602                     .asParagraph()
603                     .appendTextFormatted(
604                             "All occupancy fields zero. This is probably not an X-Ray structure");
605         }
606         else if ((nzero > 0) || (nnotone > 0))
607         {
608             GMX_LOG(logger.warning)
609                     .asParagraph()
610                     .appendTextFormatted(
611                             "there were %d atoms with zero occupancy and %d atoms with "
612                             "         occupancy unequal to one (out of %d atoms). Check your pdb "
613                             "file.",
614                             nzero,
615                             nnotone,
616                             atoms->nr);
617         }
618         else
619         {
620             GMX_LOG(logger.warning).asParagraph().appendTextFormatted("All occupancies are one");
621         }
622     }
623 }
624
625 void write_posres(const char* fn, t_atoms* pdba, real fc)
626 {
627     FILE* fp;
628     int   i;
629
630     fp = gmx_fio_fopen(fn, "w");
631     fprintf(fp,
632             "; In this topology include file, you will find position restraint\n"
633             "; entries for all the heavy atoms in your original pdb file.\n"
634             "; This means that all the protons which were added by pdb2gmx are\n"
635             "; not restrained.\n"
636             "\n"
637             "[ position_restraints ]\n"
638             "; %4s%6s%8s%8s%8s\n",
639             "atom",
640             "type",
641             "fx",
642             "fy",
643             "fz");
644     for (i = 0; (i < pdba->nr); i++)
645     {
646         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
647         {
648             fprintf(fp, "%6d%6d  %g  %g  %g\n", i + 1, 1, fc, fc, fc);
649         }
650     }
651     gmx_fio_fclose(fp);
652 }
653
654 int read_pdball(const char*     inf,
655                 bool            bOutput,
656                 const char*     outf,
657                 char**          title,
658                 t_atoms*        atoms,
659                 rvec**          x,
660                 PbcType*        pbcType,
661                 matrix          box,
662                 bool            bRemoveH,
663                 t_symtab*       symtab,
664                 ResidueType*    rt,
665                 const char*     watres,
666                 AtomProperties* aps,
667                 bool            bVerbose)
668 /* Read a pdb file. (containing proteins) */
669 {
670     int natom, new_natom, i;
671
672     /* READ IT */
673     printf("Reading %s...\n", inf);
674     readConfAndAtoms(inf, symtab, title, atoms, pbcType, x, nullptr, box);
675     natom = atoms->nr;
676     if (atoms->pdbinfo == nullptr)
677     {
678         snew(atoms->pdbinfo, atoms->nr);
679     }
680     if (fn2ftp(inf) == efPDB)
681     {
682         get_pdb_atomnumber(atoms, aps);
683     }
684     if (bRemoveH)
685     {
686         new_natom = 0;
687         for (i = 0; i < atoms->nr; i++)
688         {
689             if (!is_hydrogen(*atoms->atomname[i]))
690             {
691                 atoms->atom[new_natom]     = atoms->atom[i];
692                 atoms->atomname[new_natom] = atoms->atomname[i];
693                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
694                 copy_rvec((*x)[i], (*x)[new_natom]);
695                 new_natom++;
696             }
697         }
698         atoms->nr = new_natom;
699         natom     = new_natom;
700     }
701
702     printf("Read");
703     if (title[0])
704     {
705         printf(" '%s',", *title);
706     }
707     printf(" %d atoms\n", natom);
708
709     /* Rename residues */
710     rename_pdbres(atoms, "HOH", watres, false, symtab);
711     rename_pdbres(atoms, "SOL", watres, false, symtab);
712     rename_pdbres(atoms, "WAT", watres, false, symtab);
713
714     rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, rt, true, bVerbose);
715
716     if (natom == 0)
717     {
718         return 0;
719     }
720     if (bOutput)
721     {
722         write_sto_conf(outf, *title, atoms, *x, nullptr, *pbcType, box);
723     }
724
725     return natom;
726 }
727
728 void process_chain(const gmx::MDLogger&           logger,
729                    t_atoms*                       pdba,
730                    gmx::ArrayRef<gmx::RVec>       x,
731                    bool                           bTrpU,
732                    bool                           bPheU,
733                    bool                           bTyrU,
734                    bool                           bLysMan,
735                    bool                           bAspMan,
736                    bool                           bGluMan,
737                    bool                           bHisMan,
738                    bool                           bArgMan,
739                    bool                           bGlnMan,
740                    real                           angle,
741                    real                           distance,
742                    t_symtab*                      symtab,
743                    gmx::ArrayRef<const RtpRename> rr)
744 {
745     /* Rename aromatics, lys, asp and histidine */
746     if (bTyrU)
747     {
748         renameResidue(logger, pdba, "TYR", "TYRU", false, symtab);
749     }
750     if (bTrpU)
751     {
752         renameResidue(logger, pdba, "TRP", "TRPU", false, symtab);
753     }
754     if (bPheU)
755     {
756         renameResidue(logger, pdba, "PHE", "PHEU", false, symtab);
757     }
758     if (bLysMan)
759     {
760         renameResidueInteractively(pdba, "LYS", get_lystp, false, symtab, rr);
761     }
762     if (bArgMan)
763     {
764         renameResidueInteractively(pdba, "ARG", get_argtp, false, symtab, rr);
765     }
766     if (bGlnMan)
767     {
768         renameResidueInteractively(pdba, "GLN", get_glntp, false, symtab, rr);
769     }
770     if (bAspMan)
771     {
772         renameResidueInteractively(pdba, "ASP", get_asptp, false, symtab, rr);
773     }
774     else
775     {
776         renameResidue(logger, pdba, "ASPH", "ASP", false, symtab);
777     }
778     if (bGluMan)
779     {
780         renameResidueInteractively(pdba, "GLU", get_glutp, false, symtab, rr);
781     }
782     else
783     {
784         renameResidue(logger, pdba, "GLUH", "GLU", false, symtab);
785     }
786
787     if (!bHisMan)
788     {
789         set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
790     }
791     else
792     {
793         renameResidueInteractively(pdba, "HIS", get_histp, true, symtab, rr);
794     }
795
796     /* Initialize the rtp builing block names with the residue names
797      * for the residues that have not been processed above.
798      */
799     pdbres_to_gmxrtp(pdba);
800
801     /* Now we have all rtp names set.
802      * The rtp names will conform to Gromacs naming,
803      * unless the input pdb file contained one or more force field specific
804      * rtp names as residue names.
805      */
806 }
807
808 /* struct for sorting the atoms from the pdb file */
809 typedef struct
810 {
811     int  resnr;  /* residue number               */
812     int  j;      /* database order index         */
813     int  index;  /* original atom number         */
814     char anm1;   /* second letter of atom name   */
815     char altloc; /* alternate location indicator */
816 } t_pdbindex;
817
818 bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
819 {
820     int d = (a.resnr - b.resnr);
821     if (d == 0)
822     {
823         d = (a.j - b.j);
824         if (d == 0)
825         {
826             d = (a.anm1 - b.anm1);
827             if (d == 0)
828             {
829                 d = (a.altloc - b.altloc);
830             }
831         }
832     }
833     return d < 0;
834 }
835
836 void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
837                    int                                    natoms,
838                    t_atoms**                              pdbaptr,
839                    t_atoms**                              newPdbAtoms,
840                    std::vector<gmx::RVec>*                x,
841                    t_blocka*                              block,
842                    char***                                gnames)
843 {
844     t_atoms*               pdba = *pdbaptr;
845     std::vector<gmx::RVec> xnew;
846     t_pdbindex*            pdbi;
847     char*                  atomnm;
848
849     natoms = pdba->nr;
850     snew(pdbi, natoms);
851
852     for (int i = 0; i < natoms; i++)
853     {
854         atomnm                                  = *pdba->atomname[i];
855         const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
856         auto                     found =
857                 std::find_if(localPpResidue->atomname.begin(),
858                              localPpResidue->atomname.end(),
859                              [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
860         if (found == localPpResidue->atomname.end())
861         {
862             std::string buf = gmx::formatString(
863                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
864                     "while sorting atoms.\n%s",
865                     atomnm,
866                     *pdba->resinfo[pdba->atom[i].resind].name,
867                     pdba->resinfo[pdba->atom[i].resind].nr,
868                     localPpResidue->resname.c_str(),
869                     localPpResidue->natom(),
870                     is_hydrogen(atomnm)
871                             ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
872                               "might have had a different number in the PDB file and was rebuilt\n"
873                               "(it might for instance have been H3, and we only expected H1 & "
874                               "H2).\n"
875                               "Note that hydrogens might have been added to the entry for the "
876                               "N-terminus.\n"
877                               "Remove this hydrogen or choose a different protonation state to "
878                               "solve it.\n"
879                               "Option -ignh will ignore all hydrogens in the input."
880                             : ".");
881             gmx_fatal(FARGS, "%s", buf.c_str());
882         }
883         /* make shadow array to be sorted into indexgroup */
884         pdbi[i].resnr  = pdba->atom[i].resind;
885         pdbi[i].j      = std::distance(localPpResidue->atomname.begin(), found);
886         pdbi[i].index  = i;
887         pdbi[i].anm1   = atomnm[1];
888         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
889     }
890     std::sort(pdbi, pdbi + natoms, pdbicomp);
891
892     /* pdba is sorted in pdbnew using the pdbi index */
893     std::vector<int> a(natoms);
894     srenew(*newPdbAtoms, 1);
895     init_t_atoms((*newPdbAtoms), natoms, true);
896     (*newPdbAtoms)->nr   = pdba->nr;
897     (*newPdbAtoms)->nres = pdba->nres;
898     srenew((*newPdbAtoms)->resinfo, pdba->nres);
899     std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
900     for (int i = 0; i < natoms; i++)
901     {
902         (*newPdbAtoms)->atom[i]     = pdba->atom[pdbi[i].index];
903         (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
904         (*newPdbAtoms)->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
905         xnew.push_back(x->at(pdbi[i].index));
906         /* make indexgroup in block */
907         a[i] = pdbi[i].index;
908     }
909     /* clean up */
910     done_atom(pdba);
911     sfree(pdba);
912     /* copy the sorted pdbnew back to pdba */
913     *pdbaptr = *newPdbAtoms;
914     *x       = xnew;
915     add_grp(block, gnames, a, "prot_sort");
916     sfree(pdbi);
917 }
918
919 int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose, const gmx::MDLogger& logger)
920 {
921     int        i, j, oldnatoms, ndel;
922     t_resinfo* ri;
923
924     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Checking for duplicate atoms....");
925     oldnatoms = pdba->nr;
926     ndel      = 0;
927     /* NOTE: pdba->nr is modified inside the loop */
928     for (i = 1; (i < pdba->nr); i++)
929     {
930         /* compare 'i' and 'i-1', throw away 'i' if they are identical
931            this is a 'while' because multiple alternate locations can be present */
932         while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
933                && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
934         {
935             ndel++;
936             if (bVerbose)
937             {
938                 ri = &pdba->resinfo[pdba->atom[i].resind];
939                 GMX_LOG(logger.info)
940                         .asParagraph()
941                         .appendTextFormatted("deleting duplicate atom %4s  %s%4d%c",
942                                              *pdba->atomname[i],
943                                              *ri->name,
944                                              ri->nr,
945                                              ri->ic);
946                 if (ri->chainid && (ri->chainid != ' '))
947                 {
948                     printf(" ch %c", ri->chainid);
949                 }
950                 if (pdba->pdbinfo)
951                 {
952                     if (pdba->pdbinfo[i].atomnr)
953                     {
954                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
955                     }
956                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
957                     {
958                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
959                     }
960                 }
961                 printf("\n");
962             }
963             pdba->nr--;
964             /* We can not free, since it might be in the symtab */
965             /* sfree(pdba->atomname[i]); */
966             for (j = i; j < pdba->nr; j++)
967             {
968                 pdba->atom[j]     = pdba->atom[j + 1];
969                 pdba->atomname[j] = pdba->atomname[j + 1];
970                 if (pdba->pdbinfo)
971                 {
972                     pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
973                 }
974                 copy_rvec(x[j + 1], x[j]);
975             }
976             srenew(pdba->atom, pdba->nr);
977             /* srenew(pdba->atomname, pdba->nr); */
978             srenew(pdba->pdbinfo, pdba->nr);
979         }
980     }
981     if (pdba->nr != oldnatoms)
982     {
983         GMX_LOG(logger.info)
984                 .asParagraph()
985                 .appendTextFormatted("Now there are %d atoms. Deleted %d duplicates.", pdba->nr, ndel);
986     }
987
988     return pdba->nr;
989 }
990
991 void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, ResidueType* rt)
992 {
993     std::string startResidueString =
994             gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
995     std::string endResidueString =
996             gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
997
998     // Check whether all residues in chain have the same chain ID.
999     bool        allResiduesHaveSameChainID = true;
1000     char        chainID0                   = pdba->resinfo[r0].chainid;
1001     char        chainID;
1002     std::string residueString;
1003
1004     for (int i = r0 + 1; i < r1; i++)
1005     {
1006         chainID = pdba->resinfo[i].chainid;
1007         if (chainID != chainID0)
1008         {
1009             allResiduesHaveSameChainID = false;
1010             residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1011             break;
1012         }
1013     }
1014
1015     if (!allResiduesHaveSameChainID)
1016     {
1017         gmx_fatal(FARGS,
1018                   "The chain covering the range %s--%s does not have a consistent chain ID. "
1019                   "The first residue has ID '%c', while residue %s has ID '%c'.",
1020                   startResidueString.c_str(),
1021                   endResidueString.c_str(),
1022                   chainID0,
1023                   residueString.c_str(),
1024                   chainID);
1025     }
1026
1027     // At this point all residues have the same ID. If they are also non-blank
1028     // we can be a bit more aggressive and require the types match too.
1029     if (chainID0 != ' ')
1030     {
1031         bool        allResiduesHaveSameType = true;
1032         std::string restype;
1033         std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
1034
1035         for (int i = r0 + 1; i < r1; i++)
1036         {
1037             restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1038             if (!gmx::equalCaseInsensitive(restype, restype0))
1039             {
1040                 allResiduesHaveSameType = false;
1041                 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1042                 break;
1043             }
1044         }
1045
1046         if (!allResiduesHaveSameType)
1047         {
1048             gmx_fatal(FARGS,
1049                       "The residues in the chain %s--%s do not have a consistent type. "
1050                       "The first residue has type '%s', while residue %s is of type '%s'. "
1051                       "Either there is a mistake in your chain, or it includes nonstandard "
1052                       "residue names that have not yet been added to the residuetypes.dat "
1053                       "file in the GROMACS library directory. If there are other molecules "
1054                       "such as ligands, they should not have the same chain ID as the "
1055                       "adjacent protein chain since it's a separate molecule.",
1056                       startResidueString.c_str(),
1057                       endResidueString.c_str(),
1058                       restype0.c_str(),
1059                       residueString.c_str(),
1060                       restype.c_str());
1061         }
1062     }
1063 }
1064
1065 void find_nc_ter(t_atoms* pdba, int r0, int r1, int* r_start, int* r_end, ResidueType* rt, const gmx::MDLogger& logger)
1066 {
1067     int                        i;
1068     std::optional<std::string> startrestype;
1069
1070     *r_start = -1;
1071     *r_end   = -1;
1072
1073     int startWarnings = 0;
1074     int endWarnings   = 0;
1075     int ionNotes      = 0;
1076
1077     // Check that all residues have the same chain identifier, and if it is
1078     // non-blank we also require the residue types to match.
1079     checkResidueTypeSanity(pdba, r0, r1, rt);
1080
1081     // If we return correctly from checkResidueTypeSanity(), the only
1082     // remaining cases where we can have non-matching residue types is if
1083     // the chain ID was blank, which could be the case e.g. for a structure
1084     // read from a GRO file or other file types without chain information.
1085     // In that case we need to be a bit more liberal and detect chains based
1086     // on the residue type.
1087
1088     // If we get here, the chain ID must be identical for all residues
1089     char chainID = pdba->resinfo[r0].chainid;
1090
1091     /* Find the starting terminus (typially N or 5') */
1092     for (i = r0; i < r1 && *r_start == -1; i++)
1093     {
1094         startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1095         if (!startrestype)
1096         {
1097             continue;
1098         }
1099         if (gmx::equalCaseInsensitive(*startrestype, "Protein")
1100             || gmx::equalCaseInsensitive(*startrestype, "DNA")
1101             || gmx::equalCaseInsensitive(*startrestype, "RNA"))
1102         {
1103             GMX_LOG(logger.info)
1104                     .asParagraph()
1105                     .appendTextFormatted("Identified residue %s%d as a starting terminus.",
1106                                          *pdba->resinfo[i].name,
1107                                          pdba->resinfo[i].nr);
1108             *r_start = i;
1109         }
1110         else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1111         {
1112             if (ionNotes < 5)
1113             {
1114                 GMX_LOG(logger.info)
1115                         .asParagraph()
1116                         .appendTextFormatted(
1117                                 "Residue %s%d has type 'Ion', assuming it is not linked into a "
1118                                 "chain.",
1119                                 *pdba->resinfo[i].name,
1120                                 pdba->resinfo[i].nr);
1121             }
1122             if (ionNotes == 4)
1123             {
1124                 GMX_LOG(logger.info)
1125                         .asParagraph()
1126                         .appendTextFormatted("Disabling further notes about ions.");
1127             }
1128             ionNotes++;
1129         }
1130         else
1131         {
1132             // Either no known residue type, or one not needing special handling
1133             if (startWarnings < 5)
1134             {
1135                 if (chainID == ' ')
1136                 {
1137                     GMX_LOG(logger.warning)
1138                             .asParagraph()
1139                             .appendTextFormatted(
1140                                     "Starting residue %s%d in chain not identified as "
1141                                     "Protein/RNA/DNA. "
1142                                     "This chain lacks identifiers, which makes it impossible to do "
1143                                     "strict "
1144                                     "classification of the start/end residues. Here we need to "
1145                                     "guess this residue "
1146                                     "should not be part of the chain and instead introduce a "
1147                                     "break, but that will "
1148                                     "be catastrophic if they should in fact be linked. Please "
1149                                     "check your structure, "
1150                                     "and add %s to residuetypes.dat if this was not correct.",
1151                                     *pdba->resinfo[i].name,
1152                                     pdba->resinfo[i].nr,
1153                                     *pdba->resinfo[i].name);
1154                 }
1155                 else
1156                 {
1157                     GMX_LOG(logger.warning)
1158                             .asParagraph()
1159                             .appendTextFormatted(
1160                                     "No residues in chain starting at %s%d identified as "
1161                                     "Protein/RNA/DNA. "
1162                                     "This makes it impossible to link them into a molecule, which "
1163                                     "could either be "
1164                                     "correct or a catastrophic error. Please check your structure, "
1165                                     "and add all "
1166                                     "necessary residue names to residuetypes.dat if this was not "
1167                                     "correct.",
1168                                     *pdba->resinfo[i].name,
1169                                     pdba->resinfo[i].nr);
1170                 }
1171             }
1172             if (startWarnings == 4)
1173             {
1174                 GMX_LOG(logger.warning)
1175                         .asParagraph()
1176                         .appendTextFormatted(
1177                                 "Disabling further warnings about unidentified residues at start "
1178                                 "of chain.");
1179             }
1180             startWarnings++;
1181         }
1182     }
1183
1184     if (*r_start >= 0)
1185     {
1186         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1187         for (int i = *r_start; i < r1; i++)
1188         {
1189             std::optional<std::string> restype =
1190                     rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1191             if (!restype)
1192             {
1193                 continue;
1194             }
1195             if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
1196             {
1197                 *r_end = i;
1198             }
1199             else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1200             {
1201                 if (ionNotes < 5)
1202                 {
1203                     GMX_LOG(logger.info)
1204                             .asParagraph()
1205                             .appendTextFormatted(
1206                                     "Residue %s%d has type 'Ion', assuming it is not linked into a "
1207                                     "chain.",
1208                                     *pdba->resinfo[i].name,
1209                                     pdba->resinfo[i].nr);
1210                 }
1211                 if (ionNotes == 4)
1212                 {
1213                     GMX_LOG(logger.info)
1214                             .asParagraph()
1215                             .appendTextFormatted("Disabling further notes about ions.");
1216                 }
1217                 ionNotes++;
1218             }
1219             else
1220             {
1221                 // Either no known residue type, or one not needing special handling.
1222                 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1223                 // Otherwise the call to checkResidueTypeSanity() will
1224                 // have caught the problem.
1225                 if (endWarnings < 5)
1226                 {
1227                     GMX_LOG(logger.warning)
1228                             .asParagraph()
1229                             .appendTextFormatted(
1230                                     "Residue %s%d in chain has different type ('%s') from "
1231                                     "residue %s%d ('%s'). This chain lacks identifiers, which "
1232                                     "makes "
1233                                     "it impossible to do strict classification of the start/end "
1234                                     "residues. Here we "
1235                                     "need to guess this residue should not be part of the chain "
1236                                     "and instead "
1237                                     "introduce a break, but that will be catastrophic if they "
1238                                     "should in fact be "
1239                                     "linked. Please check your structure, and add %s to "
1240                                     "residuetypes.dat "
1241                                     "if this was not correct.",
1242                                     *pdba->resinfo[i].name,
1243                                     pdba->resinfo[i].nr,
1244                                     restype->c_str(),
1245                                     *pdba->resinfo[*r_start].name,
1246                                     pdba->resinfo[*r_start].nr,
1247                                     startrestype->c_str(),
1248                                     *pdba->resinfo[i].name);
1249                 }
1250                 if (endWarnings == 4)
1251                 {
1252                     GMX_LOG(logger.warning)
1253                             .asParagraph()
1254                             .appendTextFormatted(
1255                                     "Disabling further warnings about unidentified residues at end "
1256                                     "of chain.");
1257                 }
1258                 endWarnings++;
1259             }
1260         }
1261     }
1262
1263     if (*r_end >= 0)
1264     {
1265         GMX_LOG(logger.info)
1266                 .asParagraph()
1267                 .appendTextFormatted("Identified residue %s%d as a ending terminus.",
1268                                      *pdba->resinfo[*r_end].name,
1269                                      pdba->resinfo[*r_end].nr);
1270     }
1271 }
1272
1273 enum class ChainSeparationType : int
1274 {
1275     IdOrTer,
1276     IdAndTer,
1277     Ter,
1278     Id,
1279     Interactive,
1280     Count
1281 };
1282 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNames = {
1283     { "id_or_ter", "id_and_ter", "ter", "id", "interactive" }
1284 };
1285 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNotificationMessages = {
1286     { "Splitting chemical chains based on TER records or chain id changing.\n",
1287       "Splitting chemical chains based on TER records and chain id changing.\n",
1288       "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1289       "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1290       "Splitting chemical chains interactively.\n" }
1291 };
1292
1293 void modify_chain_numbers(t_atoms* pdba, ChainSeparationType chainSeparation, const gmx::MDLogger& logger)
1294 {
1295     int         i;
1296     char        old_prev_chainid;
1297     char        old_this_chainid;
1298     int         old_prev_chainnum;
1299     int         old_this_chainnum;
1300     t_resinfo*  ri;
1301     char        select[STRLEN];
1302     int         new_chainnum;
1303     int         this_atomnum;
1304     int         prev_atomnum;
1305     const char* prev_atomname;
1306     const char* this_atomname;
1307     const char* prev_resname;
1308     const char* this_resname;
1309     int         prev_resnum;
1310     int         this_resnum;
1311     char        prev_chainid;
1312     char        this_chainid;
1313
1314     /* The default chain enumeration is based on TER records only */
1315     printf("%s", c_chainSeparationTypeNotificationMessages[chainSeparation]);
1316
1317     old_prev_chainid  = '?';
1318     old_prev_chainnum = -1;
1319     new_chainnum      = -1;
1320
1321     this_atomname = nullptr;
1322     this_atomnum  = -1;
1323     this_resname  = nullptr;
1324     this_resnum   = -1;
1325     this_chainid  = '?';
1326
1327     for (i = 0; i < pdba->nres; i++)
1328     {
1329         ri                = &pdba->resinfo[i];
1330         old_this_chainid  = ri->chainid;
1331         old_this_chainnum = ri->chainnum;
1332
1333         prev_atomname = this_atomname;
1334         prev_atomnum  = this_atomnum;
1335         prev_resname  = this_resname;
1336         prev_resnum   = this_resnum;
1337         prev_chainid  = this_chainid;
1338
1339         this_atomname = *(pdba->atomname[i]);
1340         this_atomnum  = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1341         this_resname  = *ri->name;
1342         this_resnum   = ri->nr;
1343         this_chainid  = ri->chainid;
1344
1345         switch (chainSeparation)
1346         {
1347             case ChainSeparationType::IdOrTer:
1348                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1349                 {
1350                     new_chainnum++;
1351                 }
1352                 break;
1353
1354             case ChainSeparationType::IdAndTer:
1355                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1356                 {
1357                     new_chainnum++;
1358                 }
1359                 break;
1360
1361             case ChainSeparationType::Id:
1362                 if (old_this_chainid != old_prev_chainid)
1363                 {
1364                     new_chainnum++;
1365                 }
1366                 break;
1367
1368             case ChainSeparationType::Ter:
1369                 if (old_this_chainnum != old_prev_chainnum)
1370                 {
1371                     new_chainnum++;
1372                 }
1373                 break;
1374             case ChainSeparationType::Interactive:
1375                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1376                 {
1377                     if (i > 0)
1378                     {
1379                         GMX_LOG(logger.info)
1380                                 .asParagraph()
1381                                 .appendTextFormatted(
1382                                         "Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1383 "
1384                                         "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]",
1385                                         prev_resname,
1386                                         prev_resnum,
1387                                         prev_chainid,
1388                                         prev_atomnum,
1389                                         prev_atomname,
1390                                         this_resname,
1391                                         this_resnum,
1392                                         this_chainid,
1393                                         this_atomnum,
1394                                         this_atomname);
1395
1396                         if (nullptr == fgets(select, STRLEN - 1, stdin))
1397                         {
1398                             gmx_fatal(FARGS, "Error reading from stdin");
1399                         }
1400                     }
1401                     if (i == 0 || select[0] == 'y')
1402                     {
1403                         new_chainnum++;
1404                     }
1405                 }
1406                 break;
1407             case ChainSeparationType::Count:
1408                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1409         }
1410         old_prev_chainid  = old_this_chainid;
1411         old_prev_chainnum = old_this_chainnum;
1412
1413         ri->chainnum = new_chainnum;
1414     }
1415 }
1416
1417 bool checkChainCyclicity(t_atoms*                               pdba,
1418                          rvec*                                  pdbx,
1419                          int                                    start_ter,
1420                          int                                    end_ter,
1421                          gmx::ArrayRef<const PreprocessResidue> rtpFFDB,
1422                          gmx::ArrayRef<const RtpRename>         rr,
1423                          real                                   long_bond_dist_,
1424                          real                                   short_bond_dist_)
1425 {
1426     int         ai = -1, aj = -1;
1427     char*       rtpname = *(pdba->resinfo[start_ter].rtp);
1428     std::string newName = search_resrename(rr, rtpname, false, false, false);
1429     if (newName.empty())
1430     {
1431         newName = rtpname;
1432     }
1433     auto        res = getDatabaseEntry(newName, rtpFFDB);
1434     const char *name_ai, *name_aj;
1435
1436     for (const auto& patch : res->rb[BondedTypes::Bonds].b)
1437     { /* Search backward bond for n/5' terminus */
1438         name_ai = patch.ai().c_str();
1439         name_aj = patch.aj().c_str();
1440         if (name_ai[0] == '-')
1441         {
1442             aj = search_res_atom(++name_ai, end_ter, pdba, "check", TRUE);
1443             ai = search_res_atom(name_aj, start_ter, pdba, "check", TRUE);
1444         }
1445         else if (name_aj[0] == '-')
1446         {
1447             aj = search_res_atom(++name_aj, end_ter, pdba, "check", TRUE);
1448             ai = search_res_atom(name_ai, start_ter, pdba, "check", TRUE);
1449         }
1450         if (ai >= 0 && aj >= 0)
1451         {
1452             break; /* Found */
1453         }
1454     }
1455
1456     if (!(ai >= 0 && aj >= 0))
1457     {
1458         rtpname = *(pdba->resinfo[end_ter].rtp);
1459         newName = search_resrename(rr, rtpname, false, false, false);
1460         if (newName.empty())
1461         {
1462             newName = rtpname;
1463         }
1464         res = getDatabaseEntry(newName, rtpFFDB);
1465         for (const auto& patch : res->rb[BondedTypes::Bonds].b)
1466         {
1467             /* Seach forward bond for c/3' terminus */
1468             name_ai = patch.ai().c_str();
1469             name_aj = patch.aj().c_str();
1470
1471             if (name_ai[0] == '+')
1472             {
1473                 ai = search_res_atom(name_aj, end_ter, pdba, "check", TRUE);
1474                 aj = search_res_atom(++name_ai, start_ter, pdba, "check", TRUE);
1475             }
1476             else if (name_aj[0] == '+')
1477             {
1478                 ai = search_res_atom(name_ai, end_ter, pdba, "check", TRUE);
1479                 aj = search_res_atom(++name_aj, start_ter, pdba, "check", TRUE);
1480             }
1481             if (ai >= 0 && aj >= 0)
1482             {
1483                 break;
1484             }
1485         }
1486     }
1487
1488     if (ai >= 0 && aj >= 0)
1489     {
1490         real dist = distance2(pdbx[ai], pdbx[aj]);
1491         /* it is better to read bond length from ffbonded.itp */
1492         return (dist < gmx::square(long_bond_dist_) && dist > gmx::square(short_bond_dist_));
1493     }
1494     else
1495     {
1496         return false;
1497     }
1498 }
1499
1500 struct t_pdbchain
1501 {
1502     char             chainid   = ' ';
1503     int              chainnum  = ' '; // char, but stored as int to make clang-tidy happy
1504     int              start     = -1;
1505     int              natom     = -1;
1506     bool             bAllWat   = false;
1507     int              nterpairs = -1;
1508     std::vector<int> chainstart;
1509 };
1510
1511 struct t_chain
1512 {
1513     char                                chainid   = ' ';
1514     int                                 chainnum  = ' ';
1515     bool                                bAllWat   = false;
1516     int                                 nterpairs = -1;
1517     std::vector<int>                    chainstart;
1518     std::vector<MoleculePatchDatabase*> ntdb;
1519     std::vector<MoleculePatchDatabase*> ctdb;
1520     std::vector<int>                    r_start;
1521     std::vector<int>                    r_end;
1522     t_atoms*                            pdba;
1523     std::vector<gmx::RVec>              x;
1524     std::vector<int>                    cyclicBondsIndex;
1525 };
1526
1527 enum class VSitesType : int
1528 {
1529     None,
1530     Hydrogens,
1531     Aromatics,
1532     Count
1533 };
1534 const gmx::EnumerationArray<VSitesType, const char*> c_vsitesTypeNames = {
1535     { "none", "hydrogens", "aromatics" }
1536 };
1537
1538 enum class WaterType : int
1539 {
1540     Select,
1541     None,
1542     Spc,
1543     SpcE,
1544     Tip3p,
1545     Tip4p,
1546     Tip5p,
1547     Tips3p,
1548     Count
1549 };
1550 const gmx::EnumerationArray<WaterType, const char*> c_waterTypeNames = {
1551     { "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p" }
1552 };
1553
1554 enum class MergeType : int
1555 {
1556     No,
1557     All,
1558     Interactive,
1559     Count
1560 };
1561 const gmx::EnumerationArray<MergeType, const char*> c_mergeTypeNames = {
1562     { "no", "all", "interactive" }
1563 };
1564
1565 } // namespace
1566
1567 namespace gmx
1568 {
1569
1570 namespace
1571 {
1572
1573 class pdb2gmx : public ICommandLineOptionsModule
1574 {
1575 public:
1576     pdb2gmx() :
1577         bVsites_(FALSE),
1578         bPrevWat_(FALSE),
1579         bVsiteAromatics_(FALSE),
1580         chainSeparation_(ChainSeparationType::IdOrTer),
1581         vsitesType_(VSitesType::None),
1582         waterType_(WaterType::Select),
1583         mergeType_(MergeType::No),
1584         itp_file_(nullptr),
1585         mHmult_(0)
1586     {
1587         gmx::LoggerBuilder builder;
1588         builder.addTargetStream(gmx::MDLogger::LogLevel::Info, &gmx::TextOutputFile::standardOutput());
1589         builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, &gmx::TextOutputFile::standardError());
1590         loggerOwner_ = std::make_unique<LoggerOwner>(builder.build());
1591     }
1592
1593     // From ICommandLineOptionsModule
1594     void init(CommandLineModuleSettings* /*settings*/) override {}
1595
1596     void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1597
1598     void optionsFinished() override;
1599
1600     int run() override;
1601
1602 private:
1603     bool bNewRTP_;
1604     bool bInter_;
1605     bool bCysMan_;
1606     bool bLysMan_;
1607     bool bAspMan_;
1608     bool bGluMan_;
1609     bool bHisMan_;
1610     bool bGlnMan_;
1611     bool bArgMan_;
1612     bool bTerMan_;
1613     bool bUnA_;
1614     bool bHeavyH_;
1615     bool bSort_;
1616     bool bAllowMissing_;
1617     bool bRemoveH_;
1618     bool bDeuterate_;
1619     bool bVerbose_;
1620     bool bChargeGroups_;
1621     bool bCmap_;
1622     bool bRenumRes_;
1623     bool bRTPresname_;
1624     bool bIndexSet_;
1625     bool bOutputSet_;
1626     bool bVsites_;
1627     bool bWat_;
1628     bool bPrevWat_;
1629     bool bITP_;
1630     bool bVsiteAromatics_;
1631     real angle_;
1632     real distance_;
1633     real posre_fc_;
1634     real long_bond_dist_;
1635     real short_bond_dist_;
1636
1637     std::string indexOutputFile_;
1638     std::string outputFile_;
1639     std::string topologyFile_;
1640     std::string includeTopologyFile_;
1641     std::string outputConfFile_;
1642     std::string inputConfFile_;
1643     std::string outFile_;
1644     std::string ff_;
1645
1646     ChainSeparationType chainSeparation_;
1647     VSitesType          vsitesType_;
1648     WaterType           waterType_;
1649     MergeType           mergeType_;
1650
1651     FILE*                        itp_file_;
1652     char                         forcefield_[STRLEN];
1653     char                         ffdir_[STRLEN];
1654     char*                        ffname_;
1655     char*                        watermodel_;
1656     std::vector<std::string>     incls_;
1657     std::vector<t_mols>          mols_;
1658     real                         mHmult_;
1659     std::unique_ptr<LoggerOwner> loggerOwner_;
1660 };
1661
1662 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1663 {
1664     const char* desc[] = {
1665         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1666         "some database files, adds hydrogens to the molecules and generates",
1667         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1668         "and a topology in GROMACS format.",
1669         "These files can subsequently be processed to generate a run input file.",
1670         "[PAR]",
1671         "[THISMODULE] will search for force fields by looking for",
1672         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1673         "of the current working directory and of the GROMACS library directory",
1674         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1675         "variable.",
1676         "By default the forcefield selection is interactive,",
1677         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1678         "in the list on the command line instead. In that case [THISMODULE] just looks",
1679         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1680         "[PAR]",
1681         "After choosing a force field, all files will be read only from",
1682         "the corresponding force field directory.",
1683         "If you want to modify or add a residue types, you can copy the force",
1684         "field directory from the GROMACS library directory to your current",
1685         "working directory. If you want to add new protein residue types,",
1686         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1687         "or copy the whole library directory to a local directory and set",
1688         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1689         "Check Chapter 5 of the manual for more information about file formats.",
1690         "[PAR]",
1691
1692         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1693         "need not necessarily contain a protein structure. Every kind of",
1694         "molecule for which there is support in the database can be converted.",
1695         "If there is no support in the database, you can add it yourself.[PAR]",
1696
1697         "The program has limited intelligence, it reads a number of database",
1698         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1699         "if necessary this can be done manually. The program can prompt the",
1700         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1701         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1702         "protonated (three protons, default), for Asp and Glu unprotonated",
1703         "(default) or protonated, for His the proton can be either on ND1,",
1704         "on NE2 or on both. By default these selections are done automatically.",
1705         "For His, this is based on an optimal hydrogen bonding",
1706         "conformation. Hydrogen bonds are defined based on a simple geometric",
1707         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1708         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1709         "[TT]-dist[tt] respectively.[PAR]",
1710
1711         "The protonation state of N- and C-termini can be chosen interactively",
1712         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1713         "respectively.  Some force fields support zwitterionic forms for chains of",
1714         "one residue, but for polypeptides these options should NOT be selected.",
1715         "The AMBER force fields have unique forms for the terminal residues,",
1716         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1717         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1718         "respectively to use these forms, making sure you preserve the format",
1719         "of the coordinate file. Alternatively, use named terminating residues",
1720         "(e.g. ACE, NME).[PAR]",
1721
1722         "The separation of chains is not entirely trivial since the markup",
1723         "in user-generated PDB files frequently varies and sometimes it",
1724         "is desirable to merge entries across a TER record, for instance",
1725         "if you want a disulfide bridge or distance restraints between",
1726         "two protein chains or if you have a HEME group bound to a protein.",
1727         "In such cases multiple chains should be contained in a single",
1728         "[TT]moleculetype[tt] definition.",
1729         "To handle this, [THISMODULE] uses two separate options.",
1730         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1731         "start, and termini added when applicable. This can be done based on the",
1732         "existence of TER records, when the chain id changes, or combinations of either",
1733         "or both of these. You can also do the selection fully interactively.",
1734         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1735         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1736         "This can be turned off (no merging), all non-water chains can be merged into a",
1737         "single molecule, or the selection can be done interactively.[PAR]",
1738
1739         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1740         "If any of the occupancies are not one, indicating that the atom is",
1741         "not resolved well in the structure, a warning message is issued.",
1742         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1743         "all occupancy fields may be zero. Either way, it is up to the user",
1744         "to verify the correctness of the input data (read the article!).[PAR]",
1745
1746         "During processing the atoms will be reordered according to GROMACS",
1747         "conventions. With [TT]-n[tt] an index file can be generated that",
1748         "contains one group reordered in the same way. This allows you to",
1749         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1750         "one limitation: reordering is done after the hydrogens are stripped",
1751         "from the input and before new hydrogens are added. This means that",
1752         "you should not use [TT]-ignh[tt].[PAR]",
1753
1754         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1755         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1756         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1757         "[PAR]",
1758
1759         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1760         "motions. Angular and out-of-plane motions can be removed by changing",
1761         "hydrogens into virtual sites and fixing angles, which fixes their",
1762         "position relative to neighboring atoms. Additionally, all atoms in the",
1763         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1764         "can be converted into virtual sites, eliminating the fast improper dihedral",
1765         "fluctuations in these rings (but this feature is deprecated).",
1766         "[BB]Note[bb] that in this case all other hydrogen",
1767         "atoms are also converted to virtual sites. The mass of all atoms that are",
1768         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1769         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1770         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1771         "done for water hydrogens to slow down the rotational motion of water.",
1772         "The increase in mass of the hydrogens is subtracted from the bonded",
1773         "(heavy) atom so that the total mass of the system remains the same.",
1774
1775         "As a special case, ring-closed (or cyclic) molecules are considered.",
1776         "[THISMODULE] automatically determines if a cyclic molecule is present",
1777         "by evaluating the distance between the terminal atoms of a given chain.",
1778         "If this distance is greater than the [TT]-sb[tt]",
1779         "(\"Short bond warning distance\", default 0.05 nm)",
1780         "and less than the [TT]-lb[tt] (\"Long bond warning distance\", default 0.25 nm)",
1781         "the molecule is considered to be ring closed and will be processed as such.",
1782         "Please note that this does not detect cyclic bonds over periodic boundaries."
1783     };
1784
1785     settings->setHelpText(desc);
1786
1787     options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1788             "Write the residue database in new format to [TT]new.rtp[tt]"));
1789     options->addOption(
1790             RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1791     options->addOption(
1792             RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1793     options->addOption(EnumOption<ChainSeparationType>("chainsep")
1794                                .enumValue(c_chainSeparationTypeNames)
1795                                .store(&chainSeparation_)
1796                                .description("Condition in PDB files when a new chain should be "
1797                                             "started (adding termini)"));
1798     options->addOption(EnumOption<MergeType>("merge")
1799                                .enumValue(c_mergeTypeNames)
1800                                .store(&mergeType_)
1801                                .description("Merge multiple chains into a single [moleculetype]"));
1802     options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1803             "Force field, interactive by default. Use [TT]-h[tt] for information."));
1804     options->addOption(
1805             EnumOption<WaterType>("water").store(&waterType_).enumValue(c_waterTypeNames).description("Water model to use"));
1806     options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1807             "Set the next 8 options to interactive"));
1808     options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1809             "Interactive SS bridge selection"));
1810     options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1811             "Interactive termini selection, instead of charged (default)"));
1812     options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1813             "Interactive lysine selection, instead of charged"));
1814     options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1815             "Interactive arginine selection, instead of charged"));
1816     options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1817             "Interactive aspartic acid selection, instead of charged"));
1818     options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1819             "Interactive glutamic acid selection, instead of charged"));
1820     options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1821             "Interactive glutamine selection, instead of charged"));
1822     options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1823             "Interactive histidine selection, instead of checking H-bonds"));
1824     options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1825             "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1826     options->addOption(
1827             RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1828     options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1829             "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1830             "tyrosine"));
1831     options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1832             "Sort the residues according to database, turning this off is dangerous as charge "
1833             "groups might be broken in parts"));
1834     options->addOption(
1835             BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1836     options->addOption(
1837             BooleanOption("missing")
1838                     .store(&bAllowMissing_)
1839                     .defaultValue(false)
1840                     .description(
1841                             "Continue when atoms are missing and bonds cannot be made, dangerous"));
1842     options->addOption(
1843             BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1844     options->addOption(
1845             RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1846     options->addOption(EnumOption<VSitesType>("vsite")
1847                                .store(&vsitesType_)
1848                                .enumValue(c_vsitesTypeNames)
1849                                .description("Convert atoms to virtual sites"));
1850     options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1851             "Make hydrogen atoms heavy"));
1852     options->addOption(
1853             BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1854     options->addOption(BooleanOption("chargegrp")
1855                                .store(&bChargeGroups_)
1856                                .defaultValue(true)
1857                                .description("Use charge groups in the [REF].rtp[ref] file"));
1858     options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1859             "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1860     options->addOption(BooleanOption("renum")
1861                                .store(&bRenumRes_)
1862                                .defaultValue(false)
1863                                .description("Renumber the residues consecutively in the output"));
1864     options->addOption(BooleanOption("rtpres")
1865                                .store(&bRTPresname_)
1866                                .defaultValue(false)
1867                                .description("Use [REF].rtp[ref] entry names as residue names"));
1868     options->addOption(FileNameOption("f")
1869                                .legacyType(efSTX)
1870                                .inputFile()
1871                                .store(&inputConfFile_)
1872                                .required()
1873                                .defaultBasename("protein")
1874                                .defaultType(efPDB)
1875                                .description("Structure file"));
1876     options->addOption(FileNameOption("o")
1877                                .legacyType(efSTO)
1878                                .outputFile()
1879                                .store(&outputConfFile_)
1880                                .required()
1881                                .defaultBasename("conf")
1882                                .description("Structure file"));
1883     options->addOption(FileNameOption("p")
1884                                .legacyType(efTOP)
1885                                .outputFile()
1886                                .store(&topologyFile_)
1887                                .required()
1888                                .defaultBasename("topol")
1889                                .description("Topology file"));
1890     options->addOption(FileNameOption("i")
1891                                .legacyType(efITP)
1892                                .outputFile()
1893                                .store(&includeTopologyFile_)
1894                                .required()
1895                                .defaultBasename("posre")
1896                                .description("Include file for topology"));
1897     options->addOption(FileNameOption("n")
1898                                .legacyType(efNDX)
1899                                .outputFile()
1900                                .store(&indexOutputFile_)
1901                                .storeIsSet(&bIndexSet_)
1902                                .defaultBasename("index")
1903                                .description("Index file"));
1904     options->addOption(FileNameOption("q")
1905                                .legacyType(efSTO)
1906                                .outputFile()
1907                                .store(&outFile_)
1908                                .storeIsSet(&bOutputSet_)
1909                                .defaultBasename("clean")
1910                                .defaultType(efPDB)
1911                                .description("Structure file"));
1912 }
1913
1914 void pdb2gmx::optionsFinished()
1915 {
1916     if (inputConfFile_.empty())
1917     {
1918         GMX_THROW(InconsistentInputError("You must supply an input file"));
1919     }
1920     if (bInter_)
1921     {
1922         /* if anything changes here, also change description of -inter */
1923         bCysMan_ = true;
1924         bTerMan_ = true;
1925         bLysMan_ = true;
1926         bArgMan_ = true;
1927         bAspMan_ = true;
1928         bGluMan_ = true;
1929         bGlnMan_ = true;
1930         bHisMan_ = true;
1931     }
1932
1933     if (bHeavyH_)
1934     {
1935         mHmult_ = 4.0;
1936     }
1937     else if (bDeuterate_)
1938     {
1939         mHmult_ = 2.0;
1940     }
1941     else
1942     {
1943         mHmult_ = 1.0;
1944     }
1945
1946     /* Force field selection, interactive or direct */
1947     choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(),
1948               forcefield_,
1949               sizeof(forcefield_),
1950               ffdir_,
1951               sizeof(ffdir_),
1952               loggerOwner_->logger());
1953
1954     if (strlen(forcefield_) > 0)
1955     {
1956         ffname_    = forcefield_;
1957         ffname_[0] = std::toupper(ffname_[0]);
1958     }
1959     else
1960     {
1961         gmx_fatal(FARGS, "Empty forcefield string");
1962     }
1963 }
1964
1965 int pdb2gmx::run()
1966 {
1967     char                       select[STRLEN];
1968     std::vector<DisulfideBond> ssbonds;
1969
1970     int         this_atomnum;
1971     int         prev_atomnum;
1972     const char* prev_atomname;
1973     const char* this_atomname;
1974     const char* prev_resname;
1975     const char* this_resname;
1976     int         prev_resnum;
1977     int         this_resnum;
1978     char        prev_chainid;
1979     char        this_chainid;
1980     int         prev_chainnumber;
1981     int         this_chainnumber;
1982     int         this_chainstart;
1983     int         prev_chainstart;
1984
1985     const gmx::MDLogger& logger = loggerOwner_->logger();
1986
1987     GMX_LOG(logger.info)
1988             .asParagraph()
1989             .appendTextFormatted("Using the %s force field in directory %s", ffname_, ffdir_);
1990
1991     choose_watermodel(c_waterTypeNames[waterType_], ffdir_, &watermodel_, logger);
1992
1993     switch (vsitesType_)
1994     {
1995         case VSitesType::None:
1996             bVsites_         = false;
1997             bVsiteAromatics_ = false;
1998             break;
1999         case VSitesType::Hydrogens:
2000             bVsites_         = true;
2001             bVsiteAromatics_ = false;
2002             break;
2003         case VSitesType::Aromatics:
2004             bVsites_         = true;
2005             bVsiteAromatics_ = true;
2006             break;
2007         case VSitesType::Count:
2008             gmx_fatal(FARGS, "Internal inconsistency: VSitesType is out of range.");
2009     } /* end switch */
2010
2011     /* Open the symbol table */
2012     t_symtab symtab;
2013     open_symtab(&symtab);
2014
2015     /* Residue type database */
2016     ResidueType rt;
2017
2018     /* Read residue renaming database(s), if present */
2019     std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
2020
2021     std::vector<RtpRename> rtprename;
2022     for (const auto& filename : rrn)
2023     {
2024         GMX_LOG(logger.info).asParagraph().appendTextFormatted("going to rename %s", filename.c_str());
2025         FILE* fp = fflib_open(filename);
2026         read_rtprename(filename.c_str(), fp, &rtprename);
2027         gmx_ffclose(fp);
2028     }
2029
2030     /* Add all alternative names from the residue renaming database to the list
2031        of recognized amino/nucleic acids. */
2032     for (const auto& rename : rtprename)
2033     {
2034         /* Only add names if the 'standard' gromacs/iupac base name was found */
2035         if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
2036         {
2037             rt.addResidue(rename.main, *restype);
2038             rt.addResidue(rename.nter, *restype);
2039             rt.addResidue(rename.cter, *restype);
2040             rt.addResidue(rename.bter, *restype);
2041         }
2042     }
2043
2044     matrix      box;
2045     const char* watres;
2046     clear_mat(box);
2047     if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
2048     {
2049         watres = "HO4";
2050     }
2051     else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
2052     {
2053         watres = "HO5";
2054     }
2055     else
2056     {
2057         watres = "HOH";
2058     }
2059
2060     AtomProperties aps;
2061     char*          title = nullptr;
2062     PbcType        pbcType;
2063     t_atoms        pdba_all;
2064     rvec*          pdbx;
2065     int            natom = read_pdball(inputConfFile_.c_str(),
2066                             bOutputSet_,
2067                             outFile_.c_str(),
2068                             &title,
2069                             &pdba_all,
2070                             &pdbx,
2071                             &pbcType,
2072                             box,
2073                             bRemoveH_,
2074                             &symtab,
2075                             &rt,
2076                             watres,
2077                             &aps,
2078                             bVerbose_);
2079
2080     if (natom == 0)
2081     {
2082         std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
2083         GMX_THROW(InconsistentInputError(message));
2084     }
2085
2086     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Analyzing pdb file");
2087     int nwaterchain = 0;
2088
2089     modify_chain_numbers(&pdba_all, chainSeparation_, logger);
2090
2091     int nchainmerges = 0;
2092
2093     this_atomname    = nullptr;
2094     this_atomnum     = -1;
2095     this_resname     = nullptr;
2096     this_resnum      = -1;
2097     this_chainid     = '?';
2098     this_chainnumber = -1;
2099     this_chainstart  = 0;
2100     /* Keep the compiler happy */
2101     prev_chainstart = 0;
2102
2103     int                     numChains = 0;
2104     std::vector<t_pdbchain> pdb_ch;
2105
2106     t_resinfo* ri;
2107     bool       bMerged = false;
2108     for (int i = 0; (i < natom); i++)
2109     {
2110         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
2111
2112         /* TODO this should live in a helper object, and consolidate
2113            that with code in modify_chain_numbers */
2114         prev_atomname    = this_atomname;
2115         prev_atomnum     = this_atomnum;
2116         prev_resname     = this_resname;
2117         prev_resnum      = this_resnum;
2118         prev_chainid     = this_chainid;
2119         prev_chainnumber = this_chainnumber;
2120         if (!bMerged)
2121         {
2122             prev_chainstart = this_chainstart;
2123         }
2124
2125         this_atomname    = *pdba_all.atomname[i];
2126         this_atomnum     = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
2127         this_resname     = *ri->name;
2128         this_resnum      = ri->nr;
2129         this_chainid     = ri->chainid;
2130         this_chainnumber = ri->chainnum;
2131
2132         bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
2133
2134         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
2135         {
2136             GMX_RELEASE_ASSERT(
2137                     pdba_all.pdbinfo,
2138                     "Must have pdbinfo from reading a PDB file if chain number is changing");
2139             this_chainstart = pdba_all.atom[i].resind;
2140             bMerged         = false;
2141             if (i > 0 && !bWat_)
2142             {
2143                 if (!strncmp(c_mergeTypeNames[mergeType_], "int", 3))
2144                 {
2145                     GMX_LOG(logger.info)
2146                             .asParagraph()
2147                             .appendTextFormatted(
2148                                     "Merge chain ending with residue %s%d (chain id '%c', atom %d "
2149                                     "%s) and chain starting with "
2150                                     "residue %s%d (chain id '%c', atom %d %s) into a single "
2151                                     "moleculetype (keeping termini)? [n/y]",
2152                                     prev_resname,
2153                                     prev_resnum,
2154                                     prev_chainid,
2155                                     prev_atomnum,
2156                                     prev_atomname,
2157                                     this_resname,
2158                                     this_resnum,
2159                                     this_chainid,
2160                                     this_atomnum,
2161                                     this_atomname);
2162
2163                     if (nullptr == fgets(select, STRLEN - 1, stdin))
2164                     {
2165                         gmx_fatal(FARGS, "Error reading from stdin");
2166                     }
2167                     bMerged = (select[0] == 'y');
2168                 }
2169                 else if (!strncmp(c_mergeTypeNames[mergeType_], "all", 3))
2170                 {
2171                     bMerged = true;
2172                 }
2173             }
2174
2175             if (bMerged)
2176             {
2177                 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
2178                         pdba_all.atom[i].resind - prev_chainstart;
2179                 pdb_ch[numChains - 1].nterpairs++;
2180                 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
2181                 nchainmerges++;
2182             }
2183             else
2184             {
2185                 /* set natom for previous chain */
2186                 if (numChains > 0)
2187                 {
2188                     pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
2189                 }
2190                 if (bWat_)
2191                 {
2192                     nwaterchain++;
2193                     ri->chainid = ' ';
2194                 }
2195                 /* check if chain identifier was used before */
2196                 for (int j = 0; (j < numChains); j++)
2197                 {
2198                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
2199                     {
2200                         GMX_LOG(logger.warning)
2201                                 .asParagraph()
2202                                 .appendTextFormatted(
2203                                         "Chain identifier '%c' is used in two non-sequential "
2204                                         "blocks. "
2205                                         "They will be treated as separate chains unless you "
2206                                         "reorder your file.",
2207                                         ri->chainid);
2208                     }
2209                 }
2210                 t_pdbchain newChain;
2211                 newChain.chainid  = ri->chainid;
2212                 newChain.chainnum = ri->chainnum;
2213                 newChain.start    = i;
2214                 newChain.bAllWat  = bWat_;
2215                 if (bWat_)
2216                 {
2217                     newChain.nterpairs = 0;
2218                 }
2219                 else
2220                 {
2221                     newChain.nterpairs = 1;
2222                 }
2223                 newChain.chainstart.resize(newChain.nterpairs + 1);
2224                 /* modified [numChains] to [0] below */
2225                 newChain.chainstart[0] = 0;
2226                 pdb_ch.push_back(newChain);
2227                 numChains++;
2228             }
2229         }
2230         bPrevWat_ = bWat_;
2231     }
2232     pdb_ch.back().natom = natom - pdb_ch.back().start;
2233
2234     /* set all the water blocks at the end of the chain */
2235     std::vector<int> swap_index(numChains);
2236     int              j = 0;
2237     for (int i = 0; i < numChains; i++)
2238     {
2239         if (!pdb_ch[i].bAllWat)
2240         {
2241             swap_index[j] = i;
2242             j++;
2243         }
2244     }
2245     for (int i = 0; i < numChains; i++)
2246     {
2247         if (pdb_ch[i].bAllWat)
2248         {
2249             swap_index[j] = i;
2250             j++;
2251         }
2252     }
2253     if (nwaterchain > 1)
2254     {
2255         GMX_LOG(logger.info)
2256                 .asParagraph()
2257                 .appendTextFormatted("Moved all the water blocks to the end");
2258     }
2259
2260     t_atoms*             pdba;
2261     std::vector<t_chain> chains(numChains);
2262     /* copy pdb data and x for all chains */
2263     for (int i = 0; (i < numChains); i++)
2264     {
2265         int si               = swap_index[i];
2266         chains[i].chainid    = pdb_ch[si].chainid;
2267         chains[i].chainnum   = pdb_ch[si].chainnum;
2268         chains[i].bAllWat    = pdb_ch[si].bAllWat;
2269         chains[i].nterpairs  = pdb_ch[si].nterpairs;
2270         chains[i].chainstart = pdb_ch[si].chainstart;
2271         chains[i].ntdb.clear();
2272         chains[i].ctdb.clear();
2273         chains[i].r_start.resize(pdb_ch[si].nterpairs);
2274         chains[i].r_end.resize(pdb_ch[si].nterpairs);
2275
2276         snew(chains[i].pdba, 1);
2277         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
2278         for (j = 0; j < chains[i].pdba->nr; j++)
2279         {
2280             chains[i].pdba->atom[j]     = pdba_all.atom[pdb_ch[si].start + j];
2281             chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
2282             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
2283             chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
2284         }
2285         /* Re-index the residues assuming that the indices are continuous */
2286         int k                = chains[i].pdba->atom[0].resind;
2287         int nres             = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
2288         chains[i].pdba->nres = nres;
2289         for (int j = 0; j < chains[i].pdba->nr; j++)
2290         {
2291             chains[i].pdba->atom[j].resind -= k;
2292         }
2293         srenew(chains[i].pdba->resinfo, nres);
2294         for (int j = 0; j < nres; j++)
2295         {
2296             chains[i].pdba->resinfo[j]      = pdba_all.resinfo[k + j];
2297             chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
2298             /* make all chain identifiers equal to that of the chain */
2299             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
2300         }
2301     }
2302
2303     if (nchainmerges > 0)
2304     {
2305         GMX_LOG(logger.info)
2306                 .asParagraph()
2307                 .appendTextFormatted("Merged chains into joint molecule definitions at %d places.",
2308                                      nchainmerges);
2309     }
2310
2311     GMX_LOG(logger.info)
2312             .asParagraph()
2313             .appendTextFormatted(
2314                     "There are %d chains and %d blocks of water and "
2315                     "%d residues with %d atoms",
2316                     numChains - nwaterchain,
2317                     nwaterchain,
2318                     pdba_all.nres,
2319                     natom);
2320
2321     GMX_LOG(logger.info)
2322             .asParagraph()
2323             .appendTextFormatted("  %5s  %4s %6s", "chain", "#res", "#atoms");
2324     for (int i = 0; (i < numChains); i++)
2325     {
2326         GMX_LOG(logger.info)
2327                 .asParagraph()
2328                 .appendTextFormatted("  %d '%c' %5d %6d  %s\n",
2329                                      i + 1,
2330                                      chains[i].chainid ? chains[i].chainid : '-',
2331                                      chains[i].pdba->nres,
2332                                      chains[i].pdba->nr,
2333                                      chains[i].bAllWat ? "(only water)" : "");
2334     }
2335
2336     check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_, logger);
2337
2338     /* Read atomtypes... */
2339     PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
2340
2341     /* read residue database */
2342     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Reading residue database... (%s)", forcefield_);
2343     std::vector<std::string>       rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
2344     std::vector<PreprocessResidue> rtpFFDB;
2345     for (const auto& filename : rtpf)
2346     {
2347         readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, logger, false);
2348     }
2349     if (bNewRTP_)
2350     {
2351         /* Not correct with multiple rtp input files with different bonded types */
2352         FILE* fp = gmx_fio_fopen("new.rtp", "w");
2353         print_resall(fp, rtpFFDB, atype);
2354         gmx_fio_fclose(fp);
2355     }
2356
2357     /* read hydrogen database */
2358     std::vector<MoleculePatchDatabase> ah;
2359     read_h_db(ffdir_, &ah);
2360
2361     /* Read Termini database... */
2362     std::vector<MoleculePatchDatabase>  ntdb;
2363     std::vector<MoleculePatchDatabase>  ctdb;
2364     std::vector<MoleculePatchDatabase*> tdblist;
2365     int                                 nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
2366     int                                 nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
2367
2368     FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2369
2370     print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2371
2372     t_chain*               cc;
2373     std::vector<gmx::RVec> x;
2374     /* new pdb datastructure for sorting. */
2375     t_atoms** sortAtoms  = nullptr;
2376     t_atoms** localAtoms = nullptr;
2377     snew(sortAtoms, numChains);
2378     snew(localAtoms, numChains);
2379     for (int chain = 0; (chain < numChains); chain++)
2380     {
2381         cc = &(chains[chain]);
2382
2383         /* set pdba, natom and nres to the current chain */
2384         pdba     = cc->pdba;
2385         x        = cc->x;
2386         natom    = cc->pdba->nr;
2387         int nres = cc->pdba->nres;
2388
2389         if (cc->chainid && (cc->chainid != ' '))
2390         {
2391             GMX_LOG(logger.info)
2392                     .asParagraph()
2393                     .appendTextFormatted("Processing chain %d '%c' (%d atoms, %d residues)",
2394                                          chain + 1,
2395                                          cc->chainid,
2396                                          natom,
2397                                          nres);
2398         }
2399         else
2400         {
2401             GMX_LOG(logger.info)
2402                     .asParagraph()
2403                     .appendTextFormatted(
2404                             "Processing chain %d (%d atoms, %d residues)", chain + 1, natom, nres);
2405         }
2406
2407         process_chain(logger,
2408                       pdba,
2409                       x,
2410                       bUnA_,
2411                       bUnA_,
2412                       bUnA_,
2413                       bLysMan_,
2414                       bAspMan_,
2415                       bGluMan_,
2416                       bHisMan_,
2417                       bArgMan_,
2418                       bGlnMan_,
2419                       angle_,
2420                       distance_,
2421                       &symtab,
2422                       rtprename);
2423
2424         cc->chainstart[cc->nterpairs] = pdba->nres;
2425         j                             = 0;
2426         for (int i = 0; i < cc->nterpairs; i++)
2427         {
2428             find_nc_ter(
2429                     pdba, cc->chainstart[i], cc->chainstart[i + 1], &(cc->r_start[j]), &(cc->r_end[j]), &rt, logger);
2430             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2431             {
2432                 if (checkChainCyclicity(
2433                             pdba, pdbx, cc->r_start[j], cc->r_end[j], rtpFFDB, rtprename, long_bond_dist_, short_bond_dist_))
2434                 {
2435                     cc->cyclicBondsIndex.push_back(cc->r_start[j]);
2436                     cc->cyclicBondsIndex.push_back(cc->r_end[j]);
2437                     cc->r_start[j] = -1;
2438                     cc->r_end[j]   = -1;
2439                 }
2440                 else
2441                 {
2442                     j++;
2443                 }
2444             }
2445         }
2446         cc->nterpairs = j;
2447         if (cc->nterpairs == 0 && cc->cyclicBondsIndex.empty())
2448         {
2449             GMX_LOG(logger.info)
2450                     .asParagraph()
2451                     .appendTextFormatted(
2452                             "Problem with chain definition, or missing terminal residues. "
2453                             "This chain does not appear to contain a recognized chain molecule. "
2454                             "If this is incorrect, you can edit residuetypes.dat to modify the "
2455                             "behavior.");
2456         }
2457
2458         /* Check for disulfides and other special bonds */
2459         ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2460
2461         if (!rtprename.empty())
2462         {
2463             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_, logger);
2464         }
2465
2466         for (int i = 0; i < cc->nterpairs; i++)
2467         {
2468             /* Set termini.
2469              * We first apply a filter so we only have the
2470              * termini that can be applied to the residue in question
2471              * (or a generic terminus if no-residue specific is available).
2472              */
2473             /* First the N terminus */
2474             if (nNtdb > 0)
2475             {
2476                 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2477                 if (tdblist.empty())
2478                 {
2479                     GMX_LOG(logger.info)
2480                             .asParagraph()
2481                             .appendTextFormatted(
2482                                     "No suitable end (N or 5') terminus found in database - "
2483                                     "assuming this residue "
2484                                     "is already in a terminus-specific form and skipping terminus "
2485                                     "selection.");
2486                     cc->ntdb.push_back(nullptr);
2487                 }
2488                 else
2489                 {
2490                     if (bTerMan_ && !tdblist.empty())
2491                     {
2492                         sprintf(select,
2493                                 "Select start terminus type for %s-%d",
2494                                 *pdba->resinfo[cc->r_start[i]].name,
2495                                 pdba->resinfo[cc->r_start[i]].nr);
2496                         cc->ntdb.push_back(choose_ter(tdblist, select));
2497                     }
2498                     else
2499                     {
2500                         cc->ntdb.push_back(tdblist[0]);
2501                     }
2502
2503                     printf("Start terminus %s-%d: %s\n",
2504                            *pdba->resinfo[cc->r_start[i]].name,
2505                            pdba->resinfo[cc->r_start[i]].nr,
2506                            (cc->ntdb[i])->name.c_str());
2507                     tdblist.clear();
2508                 }
2509             }
2510             else
2511             {
2512                 cc->ntdb.push_back(nullptr);
2513             }
2514
2515             /* And the C terminus */
2516             if (nCtdb > 0)
2517             {
2518                 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2519                 if (tdblist.empty())
2520                 {
2521                     GMX_LOG(logger.info)
2522                             .asParagraph()
2523                             .appendTextFormatted(
2524                                     "No suitable end (C or 3') terminus found in database - "
2525                                     "assuming this residue"
2526                                     "is already in a terminus-specific form and skipping terminus "
2527                                     "selection.");
2528                     cc->ctdb.push_back(nullptr);
2529                 }
2530                 else
2531                 {
2532                     if (bTerMan_ && !tdblist.empty())
2533                     {
2534                         sprintf(select,
2535                                 "Select end terminus type for %s-%d",
2536                                 *pdba->resinfo[cc->r_end[i]].name,
2537                                 pdba->resinfo[cc->r_end[i]].nr);
2538                         cc->ctdb.push_back(choose_ter(tdblist, select));
2539                     }
2540                     else
2541                     {
2542                         cc->ctdb.push_back(tdblist[0]);
2543                     }
2544                     printf("End terminus %s-%d: %s\n",
2545                            *pdba->resinfo[cc->r_end[i]].name,
2546                            pdba->resinfo[cc->r_end[i]].nr,
2547                            (cc->ctdb[i])->name.c_str());
2548                     tdblist.clear();
2549                 }
2550             }
2551             else
2552             {
2553                 cc->ctdb.push_back(nullptr);
2554             }
2555         }
2556         std::vector<MoleculePatchDatabase> hb_chain;
2557         /* lookup hackblocks and rtp for all residues */
2558         std::vector<PreprocessResidue> restp_chain;
2559         get_hackblocks_rtp(&hb_chain,
2560                            &restp_chain,
2561                            rtpFFDB,
2562                            pdba->nres,
2563                            pdba->resinfo,
2564                            cc->nterpairs,
2565                            &symtab,
2566                            cc->ntdb,
2567                            cc->ctdb,
2568                            cc->r_start,
2569                            cc->r_end,
2570                            bAllowMissing_,
2571                            logger);
2572         /* ideally, now we would not need the rtp itself anymore, but do
2573            everything using the hb and restp arrays. Unfortunately, that
2574            requires some re-thinking of code in gen_vsite.c, which I won't
2575            do now :( AF 26-7-99 */
2576
2577         rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2578
2579         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_, logger);
2580
2581         if (bSort_)
2582         {
2583             char**    gnames;
2584             t_blocka* block = new_blocka();
2585             snew(gnames, 1);
2586             sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2587             remove_duplicate_atoms(pdba, x, bVerbose_, logger);
2588             if (bIndexSet_)
2589             {
2590                 if (bRemoveH_)
2591                 {
2592                     GMX_LOG(logger.warning)
2593                             .asParagraph()
2594                             .appendTextFormatted(
2595                                     "With the -remh option the generated "
2596                                     "index file (%s) might be useless "
2597                                     "(the index file is generated before hydrogens are added)",
2598                                     indexOutputFile_.c_str());
2599                 }
2600                 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2601             }
2602             for (int i = 0; i < block->nr; i++)
2603             {
2604                 sfree(gnames[i]);
2605             }
2606             sfree(gnames);
2607             done_blocka(block);
2608             sfree(block);
2609         }
2610         else
2611         {
2612             GMX_LOG(logger.warning)
2613                     .asParagraph()
2614                     .appendTextFormatted(
2615                             "Without sorting no check for duplicate atoms can be done");
2616         }
2617
2618         /* Generate Hydrogen atoms (and termini) in the sequence */
2619         GMX_LOG(logger.info)
2620                 .asParagraph()
2621                 .appendTextFormatted(
2622                         "Generating any missing hydrogen atoms and/or adding termini.");
2623         add_h(&pdba,
2624               &localAtoms[chain],
2625               &x,
2626               ah,
2627               &symtab,
2628               cc->nterpairs,
2629               cc->ntdb,
2630               cc->ctdb,
2631               cc->r_start,
2632               cc->r_end,
2633               bAllowMissing_,
2634               cc->cyclicBondsIndex);
2635         GMX_LOG(logger.info)
2636                 .asParagraph()
2637                 .appendTextFormatted("Now there are %d residues with %d atoms", pdba->nres, pdba->nr);
2638
2639         /* make up molecule name(s) */
2640
2641         int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2642
2643         std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2644
2645         std::string molname;
2646         std::string suffix;
2647         if (cc->bAllWat)
2648         {
2649             molname = "Water";
2650         }
2651         else
2652         {
2653             this_chainid = cc->chainid;
2654
2655             /* Add the chain id if we have one */
2656             if (this_chainid != ' ')
2657             {
2658                 suffix.append(formatString("_chain_%c", this_chainid));
2659             }
2660
2661             /* Check if there have been previous chains with the same id */
2662             int nid_used = 0;
2663             for (int k = 0; k < chain; k++)
2664             {
2665                 if (cc->chainid == chains[k].chainid)
2666                 {
2667                     nid_used++;
2668                 }
2669             }
2670             /* Add the number for this chain identifier if there are multiple copies */
2671             if (nid_used > 0)
2672             {
2673                 suffix.append(formatString("%d", nid_used + 1));
2674             }
2675
2676             if (suffix.length() > 0)
2677             {
2678                 molname.append(restype);
2679                 molname.append(suffix);
2680             }
2681             else
2682             {
2683                 molname = restype;
2684             }
2685         }
2686         std::string itp_fn = topologyFile_;
2687
2688         std::string posre_fn = includeTopologyFile_;
2689         if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2690         {
2691             bITP_ = true;
2692             printf("Chain time...\n");
2693             // construct the itp file name
2694             itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2695             itp_fn.append("_");
2696             itp_fn.append(molname);
2697             itp_fn.append(".itp");
2698             // now do the same for posre
2699             posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2700             posre_fn.append("_");
2701             posre_fn.append(molname);
2702             posre_fn.append(".itp");
2703             if (posre_fn == itp_fn)
2704             {
2705                 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2706             }
2707             incls_.emplace_back();
2708             incls_.back() = itp_fn;
2709             itp_file_     = gmx_fio_fopen(itp_fn.c_str(), "w");
2710         }
2711         else
2712         {
2713             bITP_ = false;
2714         }
2715
2716         mols_.emplace_back();
2717         if (cc->bAllWat)
2718         {
2719             mols_.back().name = "SOL";
2720             mols_.back().nr   = pdba->nres;
2721         }
2722         else
2723         {
2724             mols_.back().name = molname;
2725             mols_.back().nr   = 1;
2726         }
2727
2728         if (bITP_)
2729         {
2730             print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2731         }
2732
2733         FILE* top_file2;
2734         if (cc->bAllWat)
2735         {
2736             top_file2 = nullptr;
2737         }
2738         else if (bITP_)
2739         {
2740             top_file2 = itp_file_;
2741         }
2742         else
2743         {
2744             top_file2 = top_file;
2745         }
2746
2747         pdb2top(top_file2,
2748                 posre_fn.c_str(),
2749                 molname.c_str(),
2750                 pdba,
2751                 &x,
2752                 &atype,
2753                 &symtab,
2754                 rtpFFDB,
2755                 restp_chain,
2756                 hb_chain,
2757                 bAllowMissing_,
2758                 bVsites_,
2759                 bVsiteAromatics_,
2760                 ffdir_,
2761                 mHmult_,
2762                 ssbonds,
2763                 long_bond_dist_,
2764                 short_bond_dist_,
2765                 bDeuterate_,
2766                 bChargeGroups_,
2767                 bCmap_,
2768                 bRenumRes_,
2769                 bRTPresname_,
2770                 cc->cyclicBondsIndex,
2771                 logger);
2772
2773         if (!cc->bAllWat)
2774         {
2775             write_posres(posre_fn.c_str(), pdba, posre_fc_);
2776         }
2777
2778         if (bITP_)
2779         {
2780             gmx_fio_fclose(itp_file_);
2781         }
2782
2783         /* pdba and natom have been reassigned somewhere so: */
2784         cc->pdba = pdba;
2785         cc->x    = x;
2786     }
2787
2788     if (watermodel_ == nullptr)
2789     {
2790         for (int chain = 0; chain < numChains; chain++)
2791         {
2792             if (chains[chain].bAllWat)
2793             {
2794                 auto message = formatString(
2795                         "You have chosen not to include a water model, "
2796                         "but there is water in the input file. Select a "
2797                         "water model or remove the water from your input file.");
2798                 GMX_THROW(InconsistentInputError(message));
2799             }
2800         }
2801     }
2802     else
2803     {
2804         std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2805         if (!fflib_fexist(waterFile))
2806         {
2807             auto message = formatString(
2808                     "The topology file '%s' for the selected water "
2809                     "model '%s' can not be found in the force field "
2810                     "directory. Select a different water model.",
2811                     waterFile.c_str(),
2812                     watermodel_);
2813             GMX_THROW(InconsistentInputError(message));
2814         }
2815     }
2816
2817     print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2818     gmx_fio_fclose(top_file);
2819
2820     /* now merge all chains back together */
2821     natom    = 0;
2822     int nres = 0;
2823     for (int i = 0; (i < numChains); i++)
2824     {
2825         natom += chains[i].pdba->nr;
2826         nres += chains[i].pdba->nres;
2827     }
2828     t_atoms* atoms;
2829     snew(atoms, 1);
2830     init_t_atoms(atoms, natom, false);
2831     for (int i = 0; i < atoms->nres; i++)
2832     {
2833         sfree(atoms->resinfo[i].name);
2834     }
2835     atoms->nres = nres;
2836     srenew(atoms->resinfo, nres);
2837     x.clear();
2838     int k = 0;
2839     int l = 0;
2840     for (int i = 0; (i < numChains); i++)
2841     {
2842         if (numChains > 1)
2843         {
2844             GMX_LOG(logger.info)
2845                     .asParagraph()
2846                     .appendTextFormatted("Including chain %d in system: %d atoms %d residues",
2847                                          i + 1,
2848                                          chains[i].pdba->nr,
2849                                          chains[i].pdba->nres);
2850         }
2851         for (int j = 0; (j < chains[i].pdba->nr); j++)
2852         {
2853             atoms->atom[k] = chains[i].pdba->atom[j];
2854             atoms->atom[k].resind += l; /* l is processed nr of residues */
2855             atoms->atomname[k]                            = chains[i].pdba->atomname[j];
2856             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2857             x.push_back(chains[i].x[j]);
2858             k++;
2859         }
2860         for (int j = 0; (j < chains[i].pdba->nres); j++)
2861         {
2862             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2863             if (bRTPresname_)
2864             {
2865                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2866             }
2867             l++;
2868         }
2869         done_atom(chains[i].pdba);
2870     }
2871
2872     if (numChains > 1)
2873     {
2874         GMX_LOG(logger.info)
2875                 .asParagraph()
2876                 .appendTextFormatted("Now there are %d atoms and %d residues", k, l);
2877         print_sums(atoms, true, logger);
2878     }
2879
2880     rvec box_space;
2881     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Writing coordinate file...");
2882     clear_rvec(box_space);
2883     if (box[0][0] == 0)
2884     {
2885         make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2886     }
2887     write_sto_conf(
2888             outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr, pbcType, box);
2889
2890     done_symtab(&symtab);
2891     done_atom(&pdba_all);
2892     done_atom(atoms);
2893     for (int chain = 0; chain < numChains; chain++)
2894     {
2895         sfree(sortAtoms[chain]);
2896         sfree(localAtoms[chain]);
2897     }
2898     sfree(sortAtoms);
2899     sfree(localAtoms);
2900     sfree(atoms);
2901     sfree(title);
2902     sfree(pdbx);
2903
2904     GMX_LOG(logger.info)
2905             .asParagraph()
2906             .appendTextFormatted("\t\t--------- PLEASE NOTE ------------");
2907     GMX_LOG(logger.info)
2908             .asParagraph()
2909             .appendTextFormatted("You have successfully generated a topology from: %s.",
2910                                  inputConfFile_.c_str());
2911     if (watermodel_ != nullptr)
2912     {
2913         GMX_LOG(logger.info)
2914                 .asParagraph()
2915                 .appendTextFormatted(
2916                         "The %s force field and the %s water model are used.", ffname_, watermodel_);
2917         sfree(watermodel_);
2918     }
2919     else
2920     {
2921         GMX_LOG(logger.info).asParagraph().appendTextFormatted("The %s force field is used.", ffname_);
2922     }
2923     GMX_LOG(logger.info)
2924             .asParagraph()
2925             .appendTextFormatted("\t\t--------- ETON ESAELP ------------");
2926
2927     return 0;
2928 }
2929
2930 } // namespace
2931
2932 const char pdb2gmxInfo::name[] = "pdb2gmx";
2933 const char pdb2gmxInfo::shortDescription[] =
2934         "Convert coordinate files to topology and FF-compliant coordinate files";
2935 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2936 {
2937     return std::make_unique<pdb2gmx>();
2938 }
2939
2940 } // namespace gmx