Fix random typos
[alexxy/gromacs.git] / src / gromacs / gmxpreprocess / pdb2gmx.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38 #include "gmxpre.h"
39
40 #include "gromacs/utility/enumerationhelpers.h"
41 #include "pdb2gmx.h"
42
43 #include <cctype>
44 #include <cstdlib>
45 #include <cstring>
46 #include <ctime>
47
48 #include <algorithm>
49 #include <memory>
50 #include <string>
51 #include <vector>
52
53 #include "gromacs/commandline/cmdlineoptionsmodule.h"
54 #include "gromacs/fileio/confio.h"
55 #include "gromacs/fileio/filetypes.h"
56 #include "gromacs/fileio/gmxfio.h"
57 #include "gromacs/fileio/pdbio.h"
58 #include "gromacs/gmxlib/conformation_utilities.h"
59 #include "gromacs/gmxpreprocess/fflibutil.h"
60 #include "gromacs/gmxpreprocess/genhydro.h"
61 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
62 #include "gromacs/gmxpreprocess/grompp_impl.h"
63 #include "gromacs/gmxpreprocess/h_db.h"
64 #include "gromacs/gmxpreprocess/hizzie.h"
65 #include "gromacs/gmxpreprocess/pdb2top.h"
66 #include "gromacs/gmxpreprocess/pgutil.h"
67 #include "gromacs/gmxpreprocess/specbond.h"
68 #include "gromacs/gmxpreprocess/ter_db.h"
69 #include "gromacs/gmxpreprocess/toputil.h"
70 #include "gromacs/gmxpreprocess/xlate.h"
71 #include "gromacs/math/vec.h"
72 #include "gromacs/options/basicoptions.h"
73 #include "gromacs/options/filenameoption.h"
74 #include "gromacs/options/ioptionscontainer.h"
75 #include "gromacs/topology/atomprop.h"
76 #include "gromacs/topology/block.h"
77 #include "gromacs/topology/index.h"
78 #include "gromacs/topology/residuetypes.h"
79 #include "gromacs/topology/symtab.h"
80 #include "gromacs/topology/topology.h"
81 #include "gromacs/utility/dir_separator.h"
82 #include "gromacs/utility/exceptions.h"
83 #include "gromacs/utility/fatalerror.h"
84 #include "gromacs/utility/filestream.h"
85 #include "gromacs/utility/loggerbuilder.h"
86 #include "gromacs/utility/path.h"
87 #include "gromacs/utility/smalloc.h"
88 #include "gromacs/utility/strdb.h"
89 #include "gromacs/utility/stringutil.h"
90
91 #include "hackblock.h"
92 #include "resall.h"
93
94 struct RtpRename
95 {
96     RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
97         gmx(newGmx), main(newMain), nter(newNter), cter(newCter), bter(newBter)
98     {
99     }
100     std::string gmx;
101     std::string main;
102     std::string nter;
103     std::string cter;
104     std::string bter;
105 };
106
107 namespace
108 {
109
110 const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
111 {
112     /* NOTE: This function returns the main building block name,
113      *       it does not take terminal renaming into account.
114      */
115     auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
116         return gmx::equalCaseInsensitive(name, rename.gmx);
117     });
118     return found != rr.end() ? found->main.c_str() : name.c_str();
119 }
120
121 const char* enumValueToLongString(HistidineStates enumValue)
122 {
123     constexpr gmx::EnumerationArray<HistidineStates, const char*> histidineStatesLongNames = {
124         "H on ND1 only", "H on NE2 only", "H on ND1 and NE2", "Coupled to Heme"
125     };
126     return histidineStatesLongNames[enumValue];
127 }
128
129 enum class AspartateStates : int
130 {
131     Deprot,
132     Prot,
133     Count
134 };
135
136 const char* enumValueToString(AspartateStates enumValue)
137 {
138     constexpr gmx::EnumerationArray<AspartateStates, const char*> aspartateStateNames = { "ASP",
139                                                                                           "ASPH" };
140     return aspartateStateNames[enumValue];
141 }
142
143 const char* enumValueToLongString(AspartateStates enumValue)
144 {
145     constexpr gmx::EnumerationArray<AspartateStates, const char*> aspartateStateLongNames = {
146         "Not protonated (charge -1)", "Protonated (charge 0)"
147     };
148     return aspartateStateLongNames[enumValue];
149 }
150
151 enum class GlutamateStates : int
152 {
153     Deprot,
154     Prot,
155     Count
156 };
157
158 const char* enumValueToString(GlutamateStates enumValue)
159 {
160     constexpr gmx::EnumerationArray<GlutamateStates, const char*> glutamateStateNames = { "GLU",
161                                                                                           "GLUH" };
162     return glutamateStateNames[enumValue];
163 }
164
165 const char* enumValueToLongString(GlutamateStates enumValue)
166 {
167     constexpr gmx::EnumerationArray<GlutamateStates, const char*> glutamateStateLongNames = {
168         "Not protonated (charge -1)", "Protonated (charge 0)"
169     };
170     return glutamateStateLongNames[enumValue];
171 }
172
173 enum class GlutamineStates : int
174 {
175     Deprot,
176     Prot,
177     Count
178 };
179
180 const char* enumValueToString(GlutamineStates enumValue)
181 {
182     constexpr gmx::EnumerationArray<GlutamineStates, const char*> glutamineStateNames = { "GLN",
183                                                                                           "QLN" };
184     return glutamineStateNames[enumValue];
185 }
186
187 const char* enumValueToLongString(GlutamineStates enumValue)
188 {
189     constexpr gmx::EnumerationArray<GlutamineStates, const char*> glutamineStateLongNames = {
190         "Not protonated (charge 0)", "Protonated (charge +1)"
191     };
192     return glutamineStateLongNames[enumValue];
193 }
194
195 enum class LysineStates : int
196 {
197     Deprot,
198     Prot,
199     Count
200 };
201
202 const char* enumValueToString(LysineStates enumValue)
203 {
204     constexpr gmx::EnumerationArray<LysineStates, const char*> lysineStateNames = { "LYSN", "LYS" };
205     return lysineStateNames[enumValue];
206 }
207
208 const char* enumValueToLongString(LysineStates enumValue)
209 {
210     constexpr gmx::EnumerationArray<LysineStates, const char*> lysineStateLongNames = {
211         "Not protonated (charge 0)", "Protonated (charge +1)"
212     };
213     return lysineStateLongNames[enumValue];
214 }
215
216 enum class ArginineStates : int
217 {
218     Deprot,
219     Prot,
220     Count
221 };
222
223 const char* enumValueToString(ArginineStates enumValue)
224 {
225     constexpr gmx::EnumerationArray<ArginineStates, const char*> arginineStatesNames = { "ARGN",
226                                                                                          "ARG" };
227     return arginineStatesNames[enumValue];
228 }
229
230 const char* enumValueToLongString(ArginineStates enumValue)
231 {
232     constexpr gmx::EnumerationArray<ArginineStates, const char*> arginineStatesLongNames = {
233         "Not protonated (charge 0)", "Protonated (charge +1)"
234     };
235     return arginineStatesLongNames[enumValue];
236 }
237
238 template<typename EnumType>
239 const char* select_res(int resnr, const char* title, gmx::ArrayRef<const RtpRename> rr)
240 {
241     printf("Which %s type do you want for residue %d\n", title, resnr + 1);
242     for (auto sel : gmx::EnumerationWrapper<EnumType>{})
243     {
244         printf("%d. %s (%s)\n",
245                static_cast<int>(sel),
246                enumValueToString(sel),
247                res2bb_notermini(enumValueToString(sel), rr));
248     }
249     printf("\nType a number:");
250     fflush(stdout);
251
252     int userSelection;
253     if (scanf("%d", &userSelection) != 1)
254     {
255         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
256     }
257
258     return enumValueToLongString(static_cast<EnumType>(userSelection));
259 }
260
261 const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
262 {
263     return select_res<AspartateStates>(resnr, "ASPARTIC ACID", rr);
264 }
265
266 const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
267 {
268     return select_res<GlutamateStates>(resnr, "GLUTAMIC ACID", rr);
269 }
270
271 const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
272 {
273     return select_res<GlutamineStates>(resnr, "GLUTAMINE", rr);
274 }
275
276 const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
277 {
278     return select_res<LysineStates>(resnr, "LYSINE", rr);
279 }
280
281 const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
282 {
283     return select_res<ArginineStates>(resnr, "ARGININE", rr);
284 }
285
286 const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
287 {
288     return select_res<HistidineStates>(resnr, "HISTIDINE", rr);
289 }
290
291 void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
292 {
293     char line[STRLEN], buf[STRLEN];
294
295     int ncol = 0;
296     while (get_a_line(fp, line, STRLEN))
297     {
298         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
299          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
300          * Note that the buffer length has been increased to 7 to allow this,
301          * so we just need to make sure the strings have zero-length initially.
302          */
303         char gmx[STRLEN];
304         char main[STRLEN];
305         char nter[STRLEN];
306         char cter[STRLEN];
307         char bter[STRLEN];
308         gmx[0]       = '\0';
309         main[0]      = '\0';
310         nter[0]      = '\0';
311         cter[0]      = '\0';
312         bter[0]      = '\0';
313         int       nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
314         RtpRename newEntry(gmx, main, nter, cter, bter);
315         if (ncol == 0)
316         {
317             if (nc != 2 && nc != 5)
318             {
319                 gmx_fatal(FARGS,
320                           "Residue renaming database '%s' has %d columns instead of %d or %d",
321                           fname,
322                           ncol,
323                           2,
324                           5);
325             }
326             ncol = nc;
327         }
328         else if (nc != ncol)
329         {
330             gmx_fatal(FARGS,
331                       "A line in residue renaming database '%s' has %d columns, while previous "
332                       "lines have %d columns",
333                       fname,
334                       nc,
335                       ncol);
336         }
337
338         if (nc == 2)
339         {
340             /* This file does not have special termini names, copy them from main */
341             newEntry.nter = newEntry.main;
342             newEntry.cter = newEntry.main;
343             newEntry.bter = newEntry.main;
344         }
345         rtprename->push_back(newEntry);
346     }
347 }
348
349 std::string search_resrename(gmx::ArrayRef<const RtpRename> rr, const char* name, bool bStart, bool bEnd, bool bCompareFFRTPname)
350 {
351     auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
352         return ((!bCompareFFRTPname && (name == rename.gmx))
353                 || (bCompareFFRTPname && (name == rename.main)));
354     });
355
356     std::string newName;
357     /* If found in the database, rename this residue's rtp building block,
358      * otherwise keep the old name.
359      */
360     if (found != rr.end())
361     {
362         if (bStart && bEnd)
363         {
364             newName = found->bter;
365         }
366         else if (bStart)
367         {
368             newName = found->nter;
369         }
370         else if (bEnd)
371         {
372             newName = found->cter;
373         }
374         else
375         {
376             newName = found->main;
377         }
378
379         if (newName[0] == '-')
380         {
381             gmx_fatal(FARGS,
382                       "In the chosen force field there is no residue type for '%s'%s",
383                       name,
384                       bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
385                              : (bEnd ? " as an ending terminus" : ""));
386         }
387     }
388
389     return newName;
390 }
391
392 void rename_resrtp(t_atoms*                       pdba,
393                    int                            nterpairs,
394                    gmx::ArrayRef<const int>       r_start,
395                    gmx::ArrayRef<const int>       r_end,
396                    gmx::ArrayRef<const RtpRename> rr,
397                    t_symtab*                      symtab,
398                    bool                           bVerbose,
399                    const gmx::MDLogger&           logger)
400 {
401     bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
402
403     for (int r = 0; r < pdba->nres; r++)
404     {
405         bool bStart = false;
406         bool bEnd   = false;
407         for (int j = 0; j < nterpairs; j++)
408         {
409             if (r == r_start[j])
410             {
411                 bStart = true;
412             }
413         }
414         for (int j = 0; j < nterpairs; j++)
415         {
416             if (r == r_end[j])
417             {
418                 bEnd = true;
419             }
420         }
421
422         std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
423
424         if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
425         {
426             /* This is a terminal residue, but the residue name,
427              * currently stored in .rtp, is not a standard residue name,
428              * but probably a force field specific rtp name.
429              * Check if we need to rename it because it is terminal.
430              */
431             newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
432         }
433
434         if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
435         {
436             if (bVerbose)
437             {
438                 GMX_LOG(logger.info)
439                         .asParagraph()
440                         .appendTextFormatted("Changing rtp entry of residue %d %s to '%s'",
441                                              pdba->resinfo[r].nr,
442                                              *pdba->resinfo[r].name,
443                                              newName.c_str());
444             }
445             pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
446         }
447     }
448 }
449
450 void pdbres_to_gmxrtp(t_atoms* pdba)
451 {
452     int i;
453
454     for (i = 0; (i < pdba->nres); i++)
455     {
456         if (pdba->resinfo[i].rtp == nullptr)
457         {
458             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
459         }
460     }
461 }
462
463 void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
464 {
465     char* resnm;
466     int   i;
467
468     for (i = 0; (i < pdba->nres); i++)
469     {
470         resnm = *pdba->resinfo[i].name;
471         if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
472             || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
473         {
474             /* Rename the residue name (not the rtp name) */
475             pdba->resinfo[i].name = put_symtab(symtab, newnm);
476         }
477     }
478 }
479
480 /*! \brief Rename all residues named \c oldnm to \c newnm
481  *
482  * Search for residues for which the residue name from the input
483  * configuration file matches \c oldnm, and when found choose the rtp
484  * entry and name of \c newnm.
485  *
486  * \todo Refactor this function to accept a lambda that accepts i and
487  * numMatchesFound but always produces \c newnm. Then remove
488  * renameResiduesInteractively by calling this method with suitable
489  * lambdas that capture its parameter \c rr and ignores
490  * numMatchesFound. */
491 void renameResidue(const gmx::MDLogger& logger,
492                    t_atoms*             pdba,
493                    const char*          oldnm,
494                    const char*          newnm,
495                    bool                 bFullCompare,
496                    t_symtab*            symtab)
497 {
498     int numMatchesFound = 0;
499     for (int i = 0; (i < pdba->nres); i++)
500     {
501         /* We have not set the rtp name yet, use the residue name */
502         const char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
503         if ((bFullCompare && (gmx::equalCaseInsensitive(residueNameInInputConfiguration, oldnm)))
504             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
505         {
506             /* Change the rtp building block name */
507             pdba->resinfo[i].rtp  = put_symtab(symtab, newnm);
508             pdba->resinfo[i].name = pdba->resinfo[i].rtp;
509             numMatchesFound++;
510         }
511     }
512     if (numMatchesFound > 0)
513     {
514         GMX_LOG(logger.info)
515                 .asParagraph()
516                 .appendTextFormatted(
517                         "Replaced %d residue%s named %s to the default %s. Use interactive "
518                         "selection of protonated residues if that is what you need.",
519                         numMatchesFound,
520                         numMatchesFound > 1 ? "s" : "",
521                         oldnm,
522                         newnm);
523     }
524 }
525
526 /*! \brief Rename all residues named \c oldnm according to the user's
527  * interactive choice
528  *
529  * Search for residues for which the residue name from the input
530  * configuration file matches \c oldnm, and when found choose the rtp
531  * entry and name of the interactive choice from \c gettp.
532  *
533  * \todo Remove this function, per todo in \c renameResidue. */
534 void renameResidueInteractively(t_atoms*    pdba,
535                                 const char* oldnm,
536                                 const char* gettp(int, gmx::ArrayRef<const RtpRename>),
537                                 bool        bFullCompare,
538                                 t_symtab*   symtab,
539                                 gmx::ArrayRef<const RtpRename> rr)
540 {
541     // Search for residues i for which the residue name from the input
542     // configuration file matches oldnm, so it can replaced by the rtp
543     // entry and name of newnm.
544     for (int i = 0; i < pdba->nres; i++)
545     {
546         /* We have not set the rtp name yet, use the residue name */
547         char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
548         if ((bFullCompare && (strcmp(residueNameInInputConfiguration, oldnm) == 0))
549             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
550         {
551             const char* interactiveRtpChoice = gettp(i, rr);
552             pdba->resinfo[i].rtp             = put_symtab(symtab, interactiveRtpChoice);
553             pdba->resinfo[i].name            = pdba->resinfo[i].rtp;
554         }
555     }
556 }
557
558 void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose, const gmx::MDLogger& logger)
559 {
560     int i, ftp;
561     int nzero   = 0;
562     int nnotone = 0;
563
564     ftp = fn2ftp(filename);
565     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
566     {
567         GMX_LOG(logger.warning).asParagraph().appendTextFormatted("No occupancies in %s", filename);
568     }
569     else
570     {
571         for (i = 0; (i < atoms->nr); i++)
572         {
573             if (atoms->pdbinfo[i].occup != 1)
574             {
575                 if (bVerbose)
576                 {
577                     GMX_LOG(logger.warning)
578                             .asParagraph()
579                             .appendTextFormatted("Occupancy for atom %s%d-%s is %f rather than 1",
580                                                  *atoms->resinfo[atoms->atom[i].resind].name,
581                                                  atoms->resinfo[atoms->atom[i].resind].nr,
582                                                  *atoms->atomname[i],
583                                                  atoms->pdbinfo[i].occup);
584                 }
585                 if (atoms->pdbinfo[i].occup == 0)
586                 {
587                     nzero++;
588                 }
589                 else
590                 {
591                     nnotone++;
592                 }
593             }
594         }
595         if (nzero == atoms->nr)
596         {
597             GMX_LOG(logger.warning)
598                     .asParagraph()
599                     .appendTextFormatted(
600                             "All occupancy fields zero. This is probably not an X-Ray structure");
601         }
602         else if ((nzero > 0) || (nnotone > 0))
603         {
604             GMX_LOG(logger.warning)
605                     .asParagraph()
606                     .appendTextFormatted(
607                             "there were %d atoms with zero occupancy and %d atoms with "
608                             "         occupancy unequal to one (out of %d atoms). Check your pdb "
609                             "file.",
610                             nzero,
611                             nnotone,
612                             atoms->nr);
613         }
614         else
615         {
616             GMX_LOG(logger.warning).asParagraph().appendTextFormatted("All occupancies are one");
617         }
618     }
619 }
620
621 void write_posres(const char* fn, t_atoms* pdba, real fc)
622 {
623     FILE* fp;
624     int   i;
625
626     fp = gmx_fio_fopen(fn, "w");
627     fprintf(fp,
628             "; In this topology include file, you will find position restraint\n"
629             "; entries for all the heavy atoms in your original pdb file.\n"
630             "; This means that all the protons which were added by pdb2gmx are\n"
631             "; not restrained.\n"
632             "\n"
633             "[ position_restraints ]\n"
634             "; %4s%6s%8s%8s%8s\n",
635             "atom",
636             "type",
637             "fx",
638             "fy",
639             "fz");
640     for (i = 0; (i < pdba->nr); i++)
641     {
642         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
643         {
644             fprintf(fp, "%6d%6d  %g  %g  %g\n", i + 1, 1, fc, fc, fc);
645         }
646     }
647     gmx_fio_fclose(fp);
648 }
649
650 int read_pdball(const char*           inf,
651                 bool                  bOutput,
652                 const char*           outf,
653                 char**                title,
654                 t_atoms*              atoms,
655                 rvec**                x,
656                 PbcType*              pbcType,
657                 matrix                box,
658                 bool                  bRemoveH,
659                 t_symtab*             symtab,
660                 const ResidueTypeMap& residueTypeMap,
661                 const char*           watres,
662                 AtomProperties*       aps,
663                 bool                  bVerbose)
664 /* Read a pdb file. (containing proteins) */
665 {
666     int natom, new_natom, i;
667
668     /* READ IT */
669     printf("Reading %s...\n", inf);
670     readConfAndAtoms(inf, symtab, title, atoms, pbcType, x, nullptr, box);
671     natom = atoms->nr;
672     if (atoms->pdbinfo == nullptr)
673     {
674         snew(atoms->pdbinfo, atoms->nr);
675     }
676     if (fn2ftp(inf) == efPDB)
677     {
678         get_pdb_atomnumber(atoms, aps);
679     }
680     if (bRemoveH)
681     {
682         new_natom = 0;
683         for (i = 0; i < atoms->nr; i++)
684         {
685             if (!is_hydrogen(*atoms->atomname[i]))
686             {
687                 atoms->atom[new_natom]     = atoms->atom[i];
688                 atoms->atomname[new_natom] = atoms->atomname[i];
689                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
690                 copy_rvec((*x)[i], (*x)[new_natom]);
691                 new_natom++;
692             }
693         }
694         atoms->nr = new_natom;
695         natom     = new_natom;
696     }
697
698     printf("Read");
699     if (title[0])
700     {
701         printf(" '%s',", *title);
702     }
703     printf(" %d atoms\n", natom);
704
705     /* Rename residues */
706     rename_pdbres(atoms, "HOH", watres, false, symtab);
707     rename_pdbres(atoms, "SOL", watres, false, symtab);
708     rename_pdbres(atoms, "WAT", watres, false, symtab);
709
710     rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, residueTypeMap, true, bVerbose);
711
712     if (natom == 0)
713     {
714         return 0;
715     }
716     if (bOutput)
717     {
718         write_sto_conf(outf, *title, atoms, *x, nullptr, *pbcType, box);
719     }
720
721     return natom;
722 }
723
724 void process_chain(const gmx::MDLogger&           logger,
725                    t_atoms*                       pdba,
726                    gmx::ArrayRef<gmx::RVec>       x,
727                    bool                           bTrpU,
728                    bool                           bPheU,
729                    bool                           bTyrU,
730                    bool                           bLysMan,
731                    bool                           bAspMan,
732                    bool                           bGluMan,
733                    bool                           bHisMan,
734                    bool                           bArgMan,
735                    bool                           bGlnMan,
736                    real                           angle,
737                    real                           distance,
738                    t_symtab*                      symtab,
739                    gmx::ArrayRef<const RtpRename> rr)
740 {
741     /* Rename aromatics, lys, asp and histidine */
742     if (bTyrU)
743     {
744         renameResidue(logger, pdba, "TYR", "TYRU", false, symtab);
745     }
746     if (bTrpU)
747     {
748         renameResidue(logger, pdba, "TRP", "TRPU", false, symtab);
749     }
750     if (bPheU)
751     {
752         renameResidue(logger, pdba, "PHE", "PHEU", false, symtab);
753     }
754     if (bLysMan)
755     {
756         renameResidueInteractively(pdba, "LYS", get_lystp, false, symtab, rr);
757     }
758     if (bArgMan)
759     {
760         renameResidueInteractively(pdba, "ARG", get_argtp, false, symtab, rr);
761     }
762     if (bGlnMan)
763     {
764         renameResidueInteractively(pdba, "GLN", get_glntp, false, symtab, rr);
765     }
766     if (bAspMan)
767     {
768         renameResidueInteractively(pdba, "ASP", get_asptp, false, symtab, rr);
769     }
770     else
771     {
772         renameResidue(logger, pdba, "ASPH", "ASP", false, symtab);
773     }
774     if (bGluMan)
775     {
776         renameResidueInteractively(pdba, "GLU", get_glutp, false, symtab, rr);
777     }
778     else
779     {
780         renameResidue(logger, pdba, "GLUH", "GLU", false, symtab);
781     }
782
783     if (!bHisMan)
784     {
785         set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
786     }
787     else
788     {
789         renameResidueInteractively(pdba, "HIS", get_histp, true, symtab, rr);
790     }
791
792     /* Initialize the rtp builing block names with the residue names
793      * for the residues that have not been processed above.
794      */
795     pdbres_to_gmxrtp(pdba);
796
797     /* Now we have all rtp names set.
798      * The rtp names will conform to Gromacs naming,
799      * unless the input pdb file contained one or more force field specific
800      * rtp names as residue names.
801      */
802 }
803
804 /* struct for sorting the atoms from the pdb file */
805 typedef struct
806 {
807     int  resnr;  /* residue number               */
808     int  j;      /* database order index         */
809     int  index;  /* original atom number         */
810     char anm1;   /* second letter of atom name   */
811     char altloc; /* alternate location indicator */
812 } t_pdbindex;
813
814 bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
815 {
816     int d = (a.resnr - b.resnr);
817     if (d == 0)
818     {
819         d = (a.j - b.j);
820         if (d == 0)
821         {
822             d = (a.anm1 - b.anm1);
823             if (d == 0)
824             {
825                 d = (a.altloc - b.altloc);
826             }
827         }
828     }
829     return d < 0;
830 }
831
832 void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
833                    int                                    natoms,
834                    t_atoms**                              pdbaptr,
835                    t_atoms**                              newPdbAtoms,
836                    std::vector<gmx::RVec>*                x,
837                    t_blocka*                              block,
838                    char***                                gnames)
839 {
840     t_atoms*               pdba = *pdbaptr;
841     std::vector<gmx::RVec> xnew;
842     t_pdbindex*            pdbi;
843     char*                  atomnm;
844
845     natoms = pdba->nr;
846     snew(pdbi, natoms);
847
848     for (int i = 0; i < natoms; i++)
849     {
850         atomnm                                  = *pdba->atomname[i];
851         const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
852         auto                     found =
853                 std::find_if(localPpResidue->atomname.begin(),
854                              localPpResidue->atomname.end(),
855                              [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
856         if (found == localPpResidue->atomname.end())
857         {
858             std::string buf = gmx::formatString(
859                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
860                     "while sorting atoms.\n%s",
861                     atomnm,
862                     *pdba->resinfo[pdba->atom[i].resind].name,
863                     pdba->resinfo[pdba->atom[i].resind].nr,
864                     localPpResidue->resname.c_str(),
865                     localPpResidue->natom(),
866                     is_hydrogen(atomnm)
867                             ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
868                               "might have had a different number in the PDB file and was rebuilt\n"
869                               "(it might for instance have been H3, and we only expected H1 & "
870                               "H2).\n"
871                               "Note that hydrogens might have been added to the entry for the "
872                               "N-terminus.\n"
873                               "Remove this hydrogen or choose a different protonation state to "
874                               "solve it.\n"
875                               "Option -ignh will ignore all hydrogens in the input."
876                             : ".");
877             gmx_fatal(FARGS, "%s", buf.c_str());
878         }
879         /* make shadow array to be sorted into indexgroup */
880         pdbi[i].resnr  = pdba->atom[i].resind;
881         pdbi[i].j      = std::distance(localPpResidue->atomname.begin(), found);
882         pdbi[i].index  = i;
883         pdbi[i].anm1   = atomnm[1];
884         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
885     }
886     std::sort(pdbi, pdbi + natoms, pdbicomp);
887
888     /* pdba is sorted in pdbnew using the pdbi index */
889     std::vector<int> a(natoms);
890     srenew(*newPdbAtoms, 1);
891     init_t_atoms((*newPdbAtoms), natoms, true);
892     (*newPdbAtoms)->nr   = pdba->nr;
893     (*newPdbAtoms)->nres = pdba->nres;
894     srenew((*newPdbAtoms)->resinfo, pdba->nres);
895     std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
896     for (int i = 0; i < natoms; i++)
897     {
898         (*newPdbAtoms)->atom[i]     = pdba->atom[pdbi[i].index];
899         (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
900         (*newPdbAtoms)->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
901         xnew.push_back(x->at(pdbi[i].index));
902         /* make indexgroup in block */
903         a[i] = pdbi[i].index;
904     }
905     /* clean up */
906     done_atom(pdba);
907     sfree(pdba);
908     /* copy the sorted pdbnew back to pdba */
909     *pdbaptr = *newPdbAtoms;
910     *x       = xnew;
911     add_grp(block, gnames, a, "prot_sort");
912     sfree(pdbi);
913 }
914
915 int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose, const gmx::MDLogger& logger)
916 {
917     int        i, j, oldnatoms, ndel;
918     t_resinfo* ri;
919
920     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Checking for duplicate atoms....");
921     oldnatoms = pdba->nr;
922     ndel      = 0;
923     /* NOTE: pdba->nr is modified inside the loop */
924     for (i = 1; (i < pdba->nr); i++)
925     {
926         /* compare 'i' and 'i-1', throw away 'i' if they are identical
927            this is a 'while' because multiple alternate locations can be present */
928         while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
929                && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
930         {
931             ndel++;
932             if (bVerbose)
933             {
934                 ri = &pdba->resinfo[pdba->atom[i].resind];
935                 GMX_LOG(logger.info)
936                         .asParagraph()
937                         .appendTextFormatted("deleting duplicate atom %4s  %s%4d%c",
938                                              *pdba->atomname[i],
939                                              *ri->name,
940                                              ri->nr,
941                                              ri->ic);
942                 if (ri->chainid && (ri->chainid != ' '))
943                 {
944                     printf(" ch %c", ri->chainid);
945                 }
946                 if (pdba->pdbinfo)
947                 {
948                     if (pdba->pdbinfo[i].atomnr)
949                     {
950                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
951                     }
952                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
953                     {
954                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
955                     }
956                 }
957                 printf("\n");
958             }
959             pdba->nr--;
960             /* We can not free, since it might be in the symtab */
961             /* sfree(pdba->atomname[i]); */
962             for (j = i; j < pdba->nr; j++)
963             {
964                 pdba->atom[j]     = pdba->atom[j + 1];
965                 pdba->atomname[j] = pdba->atomname[j + 1];
966                 if (pdba->pdbinfo)
967                 {
968                     pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
969                 }
970                 copy_rvec(x[j + 1], x[j]);
971             }
972             srenew(pdba->atom, pdba->nr);
973             /* srenew(pdba->atomname, pdba->nr); */
974             srenew(pdba->pdbinfo, pdba->nr);
975         }
976     }
977     if (pdba->nr != oldnatoms)
978     {
979         GMX_LOG(logger.info)
980                 .asParagraph()
981                 .appendTextFormatted("Now there are %d atoms. Deleted %d duplicates.", pdba->nr, ndel);
982     }
983
984     return pdba->nr;
985 }
986
987 void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, const ResidueTypeMap& residueTypeMap)
988 {
989     std::string startResidueString =
990             gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
991     std::string endResidueString =
992             gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
993
994     // Check whether all residues in chain have the same chain ID.
995     bool        allResiduesHaveSameChainID = true;
996     char        chainID0                   = pdba->resinfo[r0].chainid;
997     char        chainID;
998     std::string residueString;
999
1000     for (int i = r0 + 1; i < r1; i++)
1001     {
1002         chainID = pdba->resinfo[i].chainid;
1003         if (chainID != chainID0)
1004         {
1005             allResiduesHaveSameChainID = false;
1006             residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1007             break;
1008         }
1009     }
1010
1011     if (!allResiduesHaveSameChainID)
1012     {
1013         gmx_fatal(FARGS,
1014                   "The chain covering the range %s--%s does not have a consistent chain ID. "
1015                   "The first residue has ID '%c', while residue %s has ID '%c'.",
1016                   startResidueString.c_str(),
1017                   endResidueString.c_str(),
1018                   chainID0,
1019                   residueString.c_str(),
1020                   chainID);
1021     }
1022
1023     // At this point all residues have the same ID. If they are also non-blank
1024     // we can be a bit more aggressive and require the types match too.
1025     if (chainID0 != ' ')
1026     {
1027         bool        allResiduesHaveSameType = true;
1028         std::string restype;
1029         std::string restype0 = typeOfNamedDatabaseResidue(residueTypeMap, *pdba->resinfo[r0].name);
1030
1031         for (int i = r0 + 1; i < r1; i++)
1032         {
1033             restype = typeOfNamedDatabaseResidue(residueTypeMap, *pdba->resinfo[i].name);
1034             if (!gmx::equalCaseInsensitive(restype, restype0))
1035             {
1036                 allResiduesHaveSameType = false;
1037                 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1038                 break;
1039             }
1040         }
1041
1042         if (!allResiduesHaveSameType)
1043         {
1044             gmx_fatal(FARGS,
1045                       "The residues in the chain %s--%s do not have a consistent type. "
1046                       "The first residue has type '%s', while residue %s is of type '%s'. "
1047                       "Either there is a mistake in your chain, or it includes nonstandard "
1048                       "residue names that have not yet been added to the residuetypes.dat "
1049                       "file in the GROMACS library directory. If there are other molecules "
1050                       "such as ligands, they should not have the same chain ID as the "
1051                       "adjacent protein chain since it's a separate molecule.",
1052                       startResidueString.c_str(),
1053                       endResidueString.c_str(),
1054                       restype0.c_str(),
1055                       residueString.c_str(),
1056                       restype.c_str());
1057         }
1058     }
1059 }
1060
1061 void find_nc_ter(t_atoms*              pdba,
1062                  int                   r0,
1063                  int                   r1,
1064                  int*                  r_start,
1065                  int*                  r_end,
1066                  const ResidueTypeMap& residueTypeMap,
1067                  const gmx::MDLogger&  logger)
1068 {
1069     std::optional<std::string> residueTypeOfStartingTerminus;
1070
1071     *r_start = -1;
1072     *r_end   = -1;
1073
1074     int startWarnings = 0;
1075     int endWarnings   = 0;
1076     int ionNotes      = 0;
1077
1078     // Check that all residues have the same chain identifier, and if it is
1079     // non-blank we also require the residue types to match.
1080     checkResidueTypeSanity(pdba, r0, r1, residueTypeMap);
1081
1082     // If we return correctly from checkResidueTypeSanity(), the only
1083     // remaining cases where we can have non-matching residue types is if
1084     // the chain ID was blank, which could be the case e.g. for a structure
1085     // read from a GRO file or other file types without chain information.
1086     // In that case we need to be a bit more liberal and detect chains based
1087     // on the residue type.
1088
1089     // If we get here, the chain ID must be identical for all residues
1090     char chainID = pdba->resinfo[r0].chainid;
1091
1092     /* Find the starting terminus (typially N or 5') */
1093     for (int i = r0; i < r1 && *r_start == -1; i++)
1094     {
1095         auto foundIt = residueTypeMap.find(*pdba->resinfo[i].name);
1096         if (foundIt == residueTypeMap.end())
1097         {
1098             continue;
1099         }
1100         residueTypeOfStartingTerminus = foundIt->second;
1101         if (gmx::equalCaseInsensitive(residueTypeOfStartingTerminus.value(), "Protein")
1102             || gmx::equalCaseInsensitive(residueTypeOfStartingTerminus.value(), "DNA")
1103             || gmx::equalCaseInsensitive(residueTypeOfStartingTerminus.value(), "RNA"))
1104         {
1105             GMX_LOG(logger.info)
1106                     .asParagraph()
1107                     .appendTextFormatted("Identified residue %s%d as a starting terminus.",
1108                                          *pdba->resinfo[i].name,
1109                                          pdba->resinfo[i].nr);
1110             *r_start = i;
1111         }
1112         else if (gmx::equalCaseInsensitive(residueTypeOfStartingTerminus.value(), "Ion"))
1113         {
1114             if (ionNotes < 5)
1115             {
1116                 GMX_LOG(logger.info)
1117                         .asParagraph()
1118                         .appendTextFormatted(
1119                                 "Residue %s%d has type 'Ion', assuming it is not linked into a "
1120                                 "chain.",
1121                                 *pdba->resinfo[i].name,
1122                                 pdba->resinfo[i].nr);
1123             }
1124             if (ionNotes == 4)
1125             {
1126                 GMX_LOG(logger.info)
1127                         .asParagraph()
1128                         .appendTextFormatted("Disabling further notes about ions.");
1129             }
1130             ionNotes++;
1131         }
1132         else
1133         {
1134             // Either no known residue type, or one not needing special handling
1135             if (startWarnings < 5)
1136             {
1137                 if (chainID == ' ')
1138                 {
1139                     GMX_LOG(logger.warning)
1140                             .asParagraph()
1141                             .appendTextFormatted(
1142                                     "Starting residue %s%d in chain not identified as "
1143                                     "Protein/RNA/DNA. "
1144                                     "This chain lacks identifiers, which makes it impossible to do "
1145                                     "strict "
1146                                     "classification of the start/end residues. Here we need to "
1147                                     "guess this residue "
1148                                     "should not be part of the chain and instead introduce a "
1149                                     "break, but that will "
1150                                     "be catastrophic if they should in fact be linked. Please "
1151                                     "check your structure, "
1152                                     "and add %s to residuetypes.dat if this was not correct.",
1153                                     *pdba->resinfo[i].name,
1154                                     pdba->resinfo[i].nr,
1155                                     *pdba->resinfo[i].name);
1156                 }
1157                 else
1158                 {
1159                     GMX_LOG(logger.warning)
1160                             .asParagraph()
1161                             .appendTextFormatted(
1162                                     "No residues in chain starting at %s%d identified as "
1163                                     "Protein/RNA/DNA. "
1164                                     "This makes it impossible to link them into a molecule, which "
1165                                     "could either be "
1166                                     "correct or a catastrophic error. Please check your structure, "
1167                                     "and add all "
1168                                     "necessary residue names to residuetypes.dat if this was not "
1169                                     "correct.",
1170                                     *pdba->resinfo[i].name,
1171                                     pdba->resinfo[i].nr);
1172                 }
1173             }
1174             if (startWarnings == 4)
1175             {
1176                 GMX_LOG(logger.warning)
1177                         .asParagraph()
1178                         .appendTextFormatted(
1179                                 "Disabling further warnings about unidentified residues at start "
1180                                 "of chain.");
1181             }
1182             startWarnings++;
1183         }
1184     }
1185
1186     if (*r_start >= 0)
1187     {
1188         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1189         for (int i = *r_start; i < r1; i++)
1190         {
1191             auto foundIt = residueTypeMap.find(*pdba->resinfo[i].name);
1192             if (foundIt == residueTypeMap.end())
1193             {
1194                 continue;
1195             }
1196             const std::string& residueTypeOfCurrentResidue = foundIt->second;
1197             if (gmx::equalCaseInsensitive(residueTypeOfCurrentResidue, residueTypeOfStartingTerminus.value())
1198                 && endWarnings == 0)
1199             {
1200                 *r_end = i;
1201             }
1202             else if (gmx::equalCaseInsensitive(residueTypeOfStartingTerminus.value(), "Ion"))
1203             {
1204                 if (ionNotes < 5)
1205                 {
1206                     GMX_LOG(logger.info)
1207                             .asParagraph()
1208                             .appendTextFormatted(
1209                                     "Residue %s%d has type 'Ion', assuming it is not linked into a "
1210                                     "chain.",
1211                                     *pdba->resinfo[i].name,
1212                                     pdba->resinfo[i].nr);
1213                 }
1214                 if (ionNotes == 4)
1215                 {
1216                     GMX_LOG(logger.info)
1217                             .asParagraph()
1218                             .appendTextFormatted("Disabling further notes about ions.");
1219                 }
1220                 ionNotes++;
1221             }
1222             else
1223             {
1224                 // Either no known residue type, or one not needing special handling.
1225                 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1226                 // Otherwise the call to checkResidueTypeSanity() will
1227                 // have caught the problem.
1228                 if (endWarnings < 5)
1229                 {
1230                     GMX_LOG(logger.warning)
1231                             .asParagraph()
1232                             .appendTextFormatted(
1233                                     "Residue %s%d in chain has different type ('%s') from "
1234                                     "residue %s%d ('%s'). This chain lacks identifiers, which "
1235                                     "makes "
1236                                     "it impossible to do strict classification of the start/end "
1237                                     "residues. Here we "
1238                                     "need to guess this residue should not be part of the chain "
1239                                     "and instead "
1240                                     "introduce a break, but that will be catastrophic if they "
1241                                     "should in fact be "
1242                                     "linked. Please check your structure, and add %s to "
1243                                     "residuetypes.dat "
1244                                     "if this was not correct.",
1245                                     *pdba->resinfo[i].name,
1246                                     pdba->resinfo[i].nr,
1247                                     residueTypeOfCurrentResidue.c_str(),
1248                                     *pdba->resinfo[*r_start].name,
1249                                     pdba->resinfo[*r_start].nr,
1250                                     residueTypeOfStartingTerminus.value().c_str(),
1251                                     *pdba->resinfo[i].name);
1252                 }
1253                 if (endWarnings == 4)
1254                 {
1255                     GMX_LOG(logger.warning)
1256                             .asParagraph()
1257                             .appendTextFormatted(
1258                                     "Disabling further warnings about unidentified residues at end "
1259                                     "of chain.");
1260                 }
1261                 endWarnings++;
1262             }
1263         }
1264     }
1265
1266     if (*r_end >= 0)
1267     {
1268         GMX_LOG(logger.info)
1269                 .asParagraph()
1270                 .appendTextFormatted("Identified residue %s%d as a ending terminus.",
1271                                      *pdba->resinfo[*r_end].name,
1272                                      pdba->resinfo[*r_end].nr);
1273     }
1274 }
1275
1276 enum class ChainSeparationType : int
1277 {
1278     IdOrTer,
1279     IdAndTer,
1280     Ter,
1281     Id,
1282     Interactive,
1283     Count
1284 };
1285 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNames = {
1286     { "id_or_ter", "id_and_ter", "ter", "id", "interactive" }
1287 };
1288 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNotificationMessages = {
1289     { "Splitting chemical chains based on TER records or chain id changing.\n",
1290       "Splitting chemical chains based on TER records and chain id changing.\n",
1291       "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1292       "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1293       "Splitting chemical chains interactively.\n" }
1294 };
1295
1296 void modify_chain_numbers(t_atoms* pdba, ChainSeparationType chainSeparation, const gmx::MDLogger& logger)
1297 {
1298     int         i;
1299     char        old_prev_chainid;
1300     char        old_this_chainid;
1301     int         old_prev_chainnum;
1302     int         old_this_chainnum;
1303     t_resinfo*  ri;
1304     char        select[STRLEN];
1305     int         new_chainnum;
1306     int         this_atomnum;
1307     int         prev_atomnum;
1308     const char* prev_atomname;
1309     const char* this_atomname;
1310     const char* prev_resname;
1311     const char* this_resname;
1312     int         prev_resnum;
1313     int         this_resnum;
1314     char        prev_chainid;
1315     char        this_chainid;
1316
1317     /* The default chain enumeration is based on TER records only */
1318     printf("%s", c_chainSeparationTypeNotificationMessages[chainSeparation]);
1319
1320     old_prev_chainid  = '?';
1321     old_prev_chainnum = -1;
1322     new_chainnum      = -1;
1323
1324     this_atomname = nullptr;
1325     this_atomnum  = -1;
1326     this_resname  = nullptr;
1327     this_resnum   = -1;
1328     this_chainid  = '?';
1329
1330     for (i = 0; i < pdba->nres; i++)
1331     {
1332         ri                = &pdba->resinfo[i];
1333         old_this_chainid  = ri->chainid;
1334         old_this_chainnum = ri->chainnum;
1335
1336         prev_atomname = this_atomname;
1337         prev_atomnum  = this_atomnum;
1338         prev_resname  = this_resname;
1339         prev_resnum   = this_resnum;
1340         prev_chainid  = this_chainid;
1341
1342         this_atomname = *(pdba->atomname[i]);
1343         this_atomnum  = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1344         this_resname  = *ri->name;
1345         this_resnum   = ri->nr;
1346         this_chainid  = ri->chainid;
1347
1348         switch (chainSeparation)
1349         {
1350             case ChainSeparationType::IdOrTer:
1351                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1352                 {
1353                     new_chainnum++;
1354                 }
1355                 break;
1356
1357             case ChainSeparationType::IdAndTer:
1358                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1359                 {
1360                     new_chainnum++;
1361                 }
1362                 break;
1363
1364             case ChainSeparationType::Id:
1365                 if (old_this_chainid != old_prev_chainid)
1366                 {
1367                     new_chainnum++;
1368                 }
1369                 break;
1370
1371             case ChainSeparationType::Ter:
1372                 if (old_this_chainnum != old_prev_chainnum)
1373                 {
1374                     new_chainnum++;
1375                 }
1376                 break;
1377             case ChainSeparationType::Interactive:
1378                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1379                 {
1380                     if (i > 0)
1381                     {
1382                         GMX_LOG(logger.info)
1383                                 .asParagraph()
1384                                 .appendTextFormatted(
1385                                         "Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1386 "
1387                                         "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]",
1388                                         prev_resname,
1389                                         prev_resnum,
1390                                         prev_chainid,
1391                                         prev_atomnum,
1392                                         prev_atomname,
1393                                         this_resname,
1394                                         this_resnum,
1395                                         this_chainid,
1396                                         this_atomnum,
1397                                         this_atomname);
1398
1399                         if (nullptr == fgets(select, STRLEN - 1, stdin))
1400                         {
1401                             gmx_fatal(FARGS, "Error reading from stdin");
1402                         }
1403                     }
1404                     if (i == 0 || select[0] == 'y')
1405                     {
1406                         new_chainnum++;
1407                     }
1408                 }
1409                 break;
1410             case ChainSeparationType::Count:
1411                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1412         }
1413         old_prev_chainid  = old_this_chainid;
1414         old_prev_chainnum = old_this_chainnum;
1415
1416         ri->chainnum = new_chainnum;
1417     }
1418 }
1419
1420 bool checkChainCyclicity(t_atoms*                               pdba,
1421                          rvec*                                  pdbx,
1422                          int                                    start_ter,
1423                          int                                    end_ter,
1424                          gmx::ArrayRef<const PreprocessResidue> rtpFFDB,
1425                          gmx::ArrayRef<const RtpRename>         rr,
1426                          real                                   long_bond_dist_,
1427                          real                                   short_bond_dist_)
1428 {
1429     // if start and end are the same, we can't have a cycle
1430     if (start_ter == end_ter)
1431     {
1432         return false;
1433     }
1434     int         ai = -1, aj = -1;
1435     char*       rtpname = *(pdba->resinfo[start_ter].rtp);
1436     std::string newName = search_resrename(rr, rtpname, false, false, false);
1437     if (newName.empty())
1438     {
1439         newName = rtpname;
1440     }
1441     auto        res = getDatabaseEntry(newName, rtpFFDB);
1442     const char *name_ai, *name_aj;
1443
1444     for (const auto& patch : res->rb[BondedTypes::Bonds].b)
1445     { /* Search backward bond for n/5' terminus */
1446         name_ai = patch.ai().c_str();
1447         name_aj = patch.aj().c_str();
1448         if (name_ai[0] == '-')
1449         {
1450             aj = search_res_atom(++name_ai, end_ter, pdba, "check", TRUE);
1451             ai = search_res_atom(name_aj, start_ter, pdba, "check", TRUE);
1452         }
1453         else if (name_aj[0] == '-')
1454         {
1455             aj = search_res_atom(++name_aj, end_ter, pdba, "check", TRUE);
1456             ai = search_res_atom(name_ai, start_ter, pdba, "check", TRUE);
1457         }
1458         if (ai >= 0 && aj >= 0)
1459         {
1460             break; /* Found */
1461         }
1462     }
1463
1464     if (!(ai >= 0 && aj >= 0))
1465     {
1466         rtpname = *(pdba->resinfo[end_ter].rtp);
1467         newName = search_resrename(rr, rtpname, false, false, false);
1468         if (newName.empty())
1469         {
1470             newName = rtpname;
1471         }
1472         res = getDatabaseEntry(newName, rtpFFDB);
1473         for (const auto& patch : res->rb[BondedTypes::Bonds].b)
1474         {
1475             /* Seach forward bond for c/3' terminus */
1476             name_ai = patch.ai().c_str();
1477             name_aj = patch.aj().c_str();
1478
1479             if (name_ai[0] == '+')
1480             {
1481                 ai = search_res_atom(name_aj, end_ter, pdba, "check", TRUE);
1482                 aj = search_res_atom(++name_ai, start_ter, pdba, "check", TRUE);
1483             }
1484             else if (name_aj[0] == '+')
1485             {
1486                 ai = search_res_atom(name_ai, end_ter, pdba, "check", TRUE);
1487                 aj = search_res_atom(++name_aj, start_ter, pdba, "check", TRUE);
1488             }
1489             if (ai >= 0 && aj >= 0)
1490             {
1491                 break;
1492             }
1493         }
1494     }
1495
1496     if (ai >= 0 && aj >= 0)
1497     {
1498         real dist = distance2(pdbx[ai], pdbx[aj]);
1499         /* it is better to read bond length from ffbonded.itp */
1500         return (dist < gmx::square(long_bond_dist_) && dist > gmx::square(short_bond_dist_));
1501     }
1502     else
1503     {
1504         return false;
1505     }
1506 }
1507
1508 struct t_pdbchain
1509 {
1510     char             chainid   = ' ';
1511     int              chainnum  = ' '; // char, but stored as int to make clang-tidy happy
1512     int              start     = -1;
1513     int              natom     = -1;
1514     bool             bAllWat   = false;
1515     int              nterpairs = -1;
1516     std::vector<int> chainstart;
1517 };
1518
1519 struct t_chain
1520 {
1521     char                                chainid   = ' ';
1522     int                                 chainnum  = ' ';
1523     bool                                bAllWat   = false;
1524     int                                 nterpairs = -1;
1525     std::vector<int>                    chainstart;
1526     std::vector<MoleculePatchDatabase*> ntdb;
1527     std::vector<MoleculePatchDatabase*> ctdb;
1528     std::vector<int>                    r_start;
1529     std::vector<int>                    r_end;
1530     t_atoms*                            pdba;
1531     std::vector<gmx::RVec>              x;
1532     std::vector<int>                    cyclicBondsIndex;
1533 };
1534
1535 enum class VSitesType : int
1536 {
1537     None,
1538     Hydrogens,
1539     Aromatics,
1540     Count
1541 };
1542 const gmx::EnumerationArray<VSitesType, const char*> c_vsitesTypeNames = {
1543     { "none", "hydrogens", "aromatics" }
1544 };
1545
1546 enum class WaterType : int
1547 {
1548     Select,
1549     None,
1550     Spc,
1551     SpcE,
1552     Tip3p,
1553     Tip4p,
1554     Tip5p,
1555     Tips3p,
1556     Count
1557 };
1558 const gmx::EnumerationArray<WaterType, const char*> c_waterTypeNames = {
1559     { "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p" }
1560 };
1561
1562 enum class MergeType : int
1563 {
1564     No,
1565     All,
1566     Interactive,
1567     Count
1568 };
1569 const gmx::EnumerationArray<MergeType, const char*> c_mergeTypeNames = {
1570     { "no", "all", "interactive" }
1571 };
1572
1573 } // namespace
1574
1575 namespace gmx
1576 {
1577
1578 namespace
1579 {
1580
1581 class pdb2gmx : public ICommandLineOptionsModule
1582 {
1583 public:
1584     pdb2gmx() :
1585         bVsites_(FALSE),
1586         bPrevWat_(FALSE),
1587         bVsiteAromatics_(FALSE),
1588         chainSeparation_(ChainSeparationType::IdOrTer),
1589         vsitesType_(VSitesType::None),
1590         waterType_(WaterType::Select),
1591         mergeType_(MergeType::No),
1592         itp_file_(nullptr),
1593         mHmult_(0)
1594     {
1595         gmx::LoggerBuilder builder;
1596         builder.addTargetStream(gmx::MDLogger::LogLevel::Info, &gmx::TextOutputFile::standardOutput());
1597         builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, &gmx::TextOutputFile::standardError());
1598         loggerOwner_ = std::make_unique<LoggerOwner>(builder.build());
1599     }
1600
1601     // From ICommandLineOptionsModule
1602     void init(CommandLineModuleSettings* /*settings*/) override {}
1603
1604     void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1605
1606     void optionsFinished() override;
1607
1608     int run() override;
1609
1610 private:
1611     bool bNewRTP_;
1612     bool bInter_;
1613     bool bCysMan_;
1614     bool bLysMan_;
1615     bool bAspMan_;
1616     bool bGluMan_;
1617     bool bHisMan_;
1618     bool bGlnMan_;
1619     bool bArgMan_;
1620     bool bTerMan_;
1621     bool bUnA_;
1622     bool bHeavyH_;
1623     bool bSort_;
1624     bool bAllowMissing_;
1625     bool bRemoveH_;
1626     bool bDeuterate_;
1627     bool bVerbose_;
1628     bool bChargeGroups_;
1629     bool bCmap_;
1630     bool bRenumRes_;
1631     bool bRTPresname_;
1632     bool bIndexSet_;
1633     bool bOutputSet_;
1634     bool bVsites_;
1635     bool bWat_;
1636     bool bPrevWat_;
1637     bool bITP_;
1638     bool bVsiteAromatics_;
1639     real angle_;
1640     real distance_;
1641     real posre_fc_;
1642     real long_bond_dist_;
1643     real short_bond_dist_;
1644
1645     std::string indexOutputFile_;
1646     std::string outputFile_;
1647     std::string topologyFile_;
1648     std::string includeTopologyFile_;
1649     std::string outputConfFile_;
1650     std::string inputConfFile_;
1651     std::string outFile_;
1652     std::string ff_;
1653
1654     ChainSeparationType chainSeparation_;
1655     VSitesType          vsitesType_;
1656     WaterType           waterType_;
1657     MergeType           mergeType_;
1658
1659     FILE*                        itp_file_;
1660     char                         forcefield_[STRLEN];
1661     char                         ffdir_[STRLEN];
1662     char*                        ffname_;
1663     char*                        watermodel_;
1664     std::vector<std::string>     incls_;
1665     std::vector<t_mols>          mols_;
1666     real                         mHmult_;
1667     std::unique_ptr<LoggerOwner> loggerOwner_;
1668 };
1669
1670 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1671 {
1672     const char* desc[] = {
1673         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1674         "some database files, adds hydrogens to the molecules and generates",
1675         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1676         "and a topology in GROMACS format.",
1677         "These files can subsequently be processed to generate a run input file.",
1678         "[PAR]",
1679         "[THISMODULE] will search for force fields by looking for",
1680         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1681         "of the current working directory and of the GROMACS library directory",
1682         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1683         "variable.",
1684         "By default the forcefield selection is interactive,",
1685         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1686         "in the list on the command line instead. In that case [THISMODULE] just looks",
1687         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1688         "[PAR]",
1689         "After choosing a force field, all files will be read only from",
1690         "the corresponding force field directory.",
1691         "If you want to modify or add a residue types, you can copy the force",
1692         "field directory from the GROMACS library directory to your current",
1693         "working directory. If you want to add new protein residue types,",
1694         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1695         "or copy the whole library directory to a local directory and set",
1696         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1697         "Check Chapter 5 of the manual for more information about file formats.",
1698         "[PAR]",
1699
1700         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1701         "need not necessarily contain a protein structure. Every kind of",
1702         "molecule for which there is support in the database can be converted.",
1703         "If there is no support in the database, you can add it yourself.[PAR]",
1704
1705         "The program has limited intelligence, it reads a number of database",
1706         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1707         "if necessary this can be done manually. The program can prompt the",
1708         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1709         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1710         "protonated (three protons, default), for Asp and Glu unprotonated",
1711         "(default) or protonated, for His the proton can be either on ND1,",
1712         "on NE2 or on both. By default these selections are done automatically.",
1713         "For His, this is based on an optimal hydrogen bonding",
1714         "conformation. Hydrogen bonds are defined based on a simple geometric",
1715         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1716         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1717         "[TT]-dist[tt] respectively.[PAR]",
1718
1719         "The protonation state of N- and C-termini can be chosen interactively",
1720         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1721         "respectively.  Some force fields support zwitterionic forms for chains of",
1722         "one residue, but for polypeptides these options should NOT be selected.",
1723         "The AMBER force fields have unique forms for the terminal residues,",
1724         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1725         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1726         "respectively to use these forms, making sure you preserve the format",
1727         "of the coordinate file. Alternatively, use named terminating residues",
1728         "(e.g. ACE, NME).[PAR]",
1729
1730         "The separation of chains is not entirely trivial since the markup",
1731         "in user-generated PDB files frequently varies and sometimes it",
1732         "is desirable to merge entries across a TER record, for instance",
1733         "if you want a disulfide bridge or distance restraints between",
1734         "two protein chains or if you have a HEME group bound to a protein.",
1735         "In such cases multiple chains should be contained in a single",
1736         "[TT]moleculetype[tt] definition.",
1737         "To handle this, [THISMODULE] uses two separate options.",
1738         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1739         "start, and termini added when applicable. This can be done based on the",
1740         "existence of TER records, when the chain id changes, or combinations of either",
1741         "or both of these. You can also do the selection fully interactively.",
1742         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1743         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1744         "This can be turned off (no merging), all non-water chains can be merged into a",
1745         "single molecule, or the selection can be done interactively.[PAR]",
1746
1747         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1748         "If any of the occupancies are not one, indicating that the atom is",
1749         "not resolved well in the structure, a warning message is issued.",
1750         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1751         "all occupancy fields may be zero. Either way, it is up to the user",
1752         "to verify the correctness of the input data (read the article!).[PAR]",
1753
1754         "During processing the atoms will be reordered according to GROMACS",
1755         "conventions. With [TT]-n[tt] an index file can be generated that",
1756         "contains one group reordered in the same way. This allows you to",
1757         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1758         "one limitation: reordering is done after the hydrogens are stripped",
1759         "from the input and before new hydrogens are added. This means that",
1760         "you should not use [TT]-ignh[tt].[PAR]",
1761
1762         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1763         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1764         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1765         "[PAR]",
1766
1767         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1768         "motions. Angular and out-of-plane motions can be removed by changing",
1769         "hydrogens into virtual sites and fixing angles, which fixes their",
1770         "position relative to neighboring atoms. Additionally, all atoms in the",
1771         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1772         "can be converted into virtual sites, eliminating the fast improper dihedral",
1773         "fluctuations in these rings (but this feature is deprecated).",
1774         "[BB]Note[bb] that in this case all other hydrogen",
1775         "atoms are also converted to virtual sites. The mass of all atoms that are",
1776         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1777         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1778         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1779         "done for water hydrogens to slow down the rotational motion of water.",
1780         "The increase in mass of the hydrogens is subtracted from the bonded",
1781         "(heavy) atom so that the total mass of the system remains the same.",
1782
1783         "As a special case, ring-closed (or cyclic) molecules are considered.",
1784         "[THISMODULE] automatically determines if a cyclic molecule is present",
1785         "by evaluating the distance between the terminal atoms of a given chain.",
1786         "If this distance is greater than the [TT]-sb[tt]",
1787         "(\"Short bond warning distance\", default 0.05 nm)",
1788         "and less than the [TT]-lb[tt] (\"Long bond warning distance\", default 0.25 nm)",
1789         "the molecule is considered to be ring closed and will be processed as such.",
1790         "Please note that this does not detect cyclic bonds over periodic boundaries."
1791     };
1792
1793     settings->setHelpText(desc);
1794
1795     options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1796             "Write the residue database in new format to [TT]new.rtp[tt]"));
1797     options->addOption(
1798             RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1799     options->addOption(
1800             RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1801     options->addOption(EnumOption<ChainSeparationType>("chainsep")
1802                                .enumValue(c_chainSeparationTypeNames)
1803                                .store(&chainSeparation_)
1804                                .description("Condition in PDB files when a new chain should be "
1805                                             "started (adding termini)"));
1806     options->addOption(EnumOption<MergeType>("merge")
1807                                .enumValue(c_mergeTypeNames)
1808                                .store(&mergeType_)
1809                                .description("Merge multiple chains into a single [moleculetype]"));
1810     options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1811             "Force field, interactive by default. Use [TT]-h[tt] for information."));
1812     options->addOption(
1813             EnumOption<WaterType>("water").store(&waterType_).enumValue(c_waterTypeNames).description("Water model to use"));
1814     options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1815             "Set the next 8 options to interactive"));
1816     options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1817             "Interactive SS bridge selection"));
1818     options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1819             "Interactive termini selection, instead of charged (default)"));
1820     options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1821             "Interactive lysine selection, instead of charged"));
1822     options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1823             "Interactive arginine selection, instead of charged"));
1824     options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1825             "Interactive aspartic acid selection, instead of charged"));
1826     options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1827             "Interactive glutamic acid selection, instead of charged"));
1828     options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1829             "Interactive glutamine selection, instead of charged"));
1830     options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1831             "Interactive histidine selection, instead of checking H-bonds"));
1832     options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1833             "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1834     options->addOption(
1835             RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1836     options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1837             "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1838             "tyrosine"));
1839     options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1840             "Sort the residues according to database, turning this off is dangerous as charge "
1841             "groups might be broken in parts"));
1842     options->addOption(
1843             BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1844     options->addOption(
1845             BooleanOption("missing")
1846                     .store(&bAllowMissing_)
1847                     .defaultValue(false)
1848                     .description(
1849                             "Continue when atoms are missing and bonds cannot be made, dangerous"));
1850     options->addOption(
1851             BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1852     options->addOption(
1853             RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1854     options->addOption(EnumOption<VSitesType>("vsite")
1855                                .store(&vsitesType_)
1856                                .enumValue(c_vsitesTypeNames)
1857                                .description("Convert atoms to virtual sites"));
1858     options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1859             "Make hydrogen atoms heavy"));
1860     options->addOption(
1861             BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1862     options->addOption(BooleanOption("chargegrp")
1863                                .store(&bChargeGroups_)
1864                                .defaultValue(true)
1865                                .description("Use charge groups in the [REF].rtp[ref] file"));
1866     options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1867             "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1868     options->addOption(BooleanOption("renum")
1869                                .store(&bRenumRes_)
1870                                .defaultValue(false)
1871                                .description("Renumber the residues consecutively in the output"));
1872     options->addOption(BooleanOption("rtpres")
1873                                .store(&bRTPresname_)
1874                                .defaultValue(false)
1875                                .description("Use [REF].rtp[ref] entry names as residue names"));
1876     options->addOption(FileNameOption("f")
1877                                .legacyType(efSTX)
1878                                .inputFile()
1879                                .store(&inputConfFile_)
1880                                .required()
1881                                .defaultBasename("protein")
1882                                .defaultType(efPDB)
1883                                .description("Structure file"));
1884     options->addOption(FileNameOption("o")
1885                                .legacyType(efSTO)
1886                                .outputFile()
1887                                .store(&outputConfFile_)
1888                                .required()
1889                                .defaultBasename("conf")
1890                                .description("Structure file"));
1891     options->addOption(FileNameOption("p")
1892                                .legacyType(efTOP)
1893                                .outputFile()
1894                                .store(&topologyFile_)
1895                                .required()
1896                                .defaultBasename("topol")
1897                                .description("Topology file"));
1898     options->addOption(FileNameOption("i")
1899                                .legacyType(efITP)
1900                                .outputFile()
1901                                .store(&includeTopologyFile_)
1902                                .required()
1903                                .defaultBasename("posre")
1904                                .description("Include file for topology"));
1905     options->addOption(FileNameOption("n")
1906                                .legacyType(efNDX)
1907                                .outputFile()
1908                                .store(&indexOutputFile_)
1909                                .storeIsSet(&bIndexSet_)
1910                                .defaultBasename("index")
1911                                .description("Index file"));
1912     options->addOption(FileNameOption("q")
1913                                .legacyType(efSTO)
1914                                .outputFile()
1915                                .store(&outFile_)
1916                                .storeIsSet(&bOutputSet_)
1917                                .defaultBasename("clean")
1918                                .defaultType(efPDB)
1919                                .description("Structure file"));
1920 }
1921
1922 void pdb2gmx::optionsFinished()
1923 {
1924     if (inputConfFile_.empty())
1925     {
1926         GMX_THROW(InconsistentInputError("You must supply an input file"));
1927     }
1928     if (bInter_)
1929     {
1930         /* if anything changes here, also change description of -inter */
1931         bCysMan_ = true;
1932         bTerMan_ = true;
1933         bLysMan_ = true;
1934         bArgMan_ = true;
1935         bAspMan_ = true;
1936         bGluMan_ = true;
1937         bGlnMan_ = true;
1938         bHisMan_ = true;
1939     }
1940
1941     if (bHeavyH_)
1942     {
1943         mHmult_ = 4.0;
1944     }
1945     else if (bDeuterate_)
1946     {
1947         mHmult_ = 2.0;
1948     }
1949     else
1950     {
1951         mHmult_ = 1.0;
1952     }
1953
1954     /* Force field selection, interactive or direct */
1955     choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(),
1956               forcefield_,
1957               sizeof(forcefield_),
1958               ffdir_,
1959               sizeof(ffdir_),
1960               loggerOwner_->logger());
1961
1962     if (strlen(forcefield_) > 0)
1963     {
1964         ffname_    = forcefield_;
1965         ffname_[0] = std::toupper(ffname_[0]);
1966     }
1967     else
1968     {
1969         gmx_fatal(FARGS, "Empty forcefield string");
1970     }
1971 }
1972
1973 int pdb2gmx::run()
1974 {
1975     char                       select[STRLEN];
1976     std::vector<DisulfideBond> ssbonds;
1977
1978     int         this_atomnum;
1979     int         prev_atomnum;
1980     const char* prev_atomname;
1981     const char* this_atomname;
1982     const char* prev_resname;
1983     const char* this_resname;
1984     int         prev_resnum;
1985     int         this_resnum;
1986     char        prev_chainid;
1987     char        this_chainid;
1988     int         prev_chainnumber;
1989     int         this_chainnumber;
1990     int         this_chainstart;
1991     int         prev_chainstart;
1992
1993     const gmx::MDLogger& logger = loggerOwner_->logger();
1994
1995     GMX_LOG(logger.info)
1996             .asParagraph()
1997             .appendTextFormatted("Using the %s force field in directory %s", ffname_, ffdir_);
1998
1999     choose_watermodel(c_waterTypeNames[waterType_], ffdir_, &watermodel_, logger);
2000
2001     switch (vsitesType_)
2002     {
2003         case VSitesType::None:
2004             bVsites_         = false;
2005             bVsiteAromatics_ = false;
2006             break;
2007         case VSitesType::Hydrogens:
2008             bVsites_         = true;
2009             bVsiteAromatics_ = false;
2010             break;
2011         case VSitesType::Aromatics:
2012             bVsites_         = true;
2013             bVsiteAromatics_ = true;
2014             break;
2015         case VSitesType::Count:
2016             gmx_fatal(FARGS, "Internal inconsistency: VSitesType is out of range.");
2017     } /* end switch */
2018
2019     /* Open the symbol table */
2020     t_symtab symtab;
2021     open_symtab(&symtab);
2022
2023     /* Residue type database */
2024     ResidueTypeMap residueTypeMap = residueTypeMapFromLibraryFile("residuetypes.dat");
2025
2026     /* Read residue renaming database(s), if present */
2027     std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
2028
2029     std::vector<RtpRename> rtprename;
2030     for (const auto& filename : rrn)
2031     {
2032         GMX_LOG(logger.info).asParagraph().appendTextFormatted("going to rename %s", filename.c_str());
2033         FILE* fp = fflib_open(filename);
2034         read_rtprename(filename.c_str(), fp, &rtprename);
2035         gmx_ffclose(fp);
2036     }
2037
2038     /* Add all alternative names from the residue renaming database to the list
2039        of recognized amino/nucleic acids. */
2040     for (const auto& rename : rtprename)
2041     {
2042         /* Only add names if the 'standard' gromacs/iupac base name was found */
2043         if (auto foundIt = residueTypeMap.find(rename.gmx); foundIt != residueTypeMap.end())
2044         {
2045             // Add the renamed forms with the same residue type as the standard form
2046             auto& residueType = foundIt->second;
2047             addResidue(&residueTypeMap, rename.main, residueType);
2048             addResidue(&residueTypeMap, rename.nter, residueType);
2049             addResidue(&residueTypeMap, rename.cter, residueType);
2050             addResidue(&residueTypeMap, rename.bter, residueType);
2051         }
2052     }
2053
2054     matrix      box;
2055     const char* watres;
2056     clear_mat(box);
2057     if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
2058     {
2059         watres = "HO4";
2060     }
2061     else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
2062     {
2063         watres = "HO5";
2064     }
2065     else
2066     {
2067         watres = "HOH";
2068     }
2069
2070     AtomProperties aps;
2071     char*          title = nullptr;
2072     PbcType        pbcType;
2073     t_atoms        pdba_all;
2074     rvec*          pdbx;
2075     int            natom = read_pdball(inputConfFile_.c_str(),
2076                             bOutputSet_,
2077                             outFile_.c_str(),
2078                             &title,
2079                             &pdba_all,
2080                             &pdbx,
2081                             &pbcType,
2082                             box,
2083                             bRemoveH_,
2084                             &symtab,
2085                             residueTypeMap,
2086                             watres,
2087                             &aps,
2088                             bVerbose_);
2089
2090     if (natom == 0)
2091     {
2092         std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
2093         GMX_THROW(InconsistentInputError(message));
2094     }
2095
2096     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Analyzing pdb file");
2097     int nwaterchain = 0;
2098
2099     modify_chain_numbers(&pdba_all, chainSeparation_, logger);
2100
2101     int nchainmerges = 0;
2102
2103     this_atomname    = nullptr;
2104     this_atomnum     = -1;
2105     this_resname     = nullptr;
2106     this_resnum      = -1;
2107     this_chainid     = '?';
2108     this_chainnumber = -1;
2109     this_chainstart  = 0;
2110     /* Keep the compiler happy */
2111     prev_chainstart = 0;
2112
2113     int                     numChains = 0;
2114     std::vector<t_pdbchain> pdb_ch;
2115
2116     t_resinfo* ri;
2117     bool       bMerged = false;
2118     for (int i = 0; (i < natom); i++)
2119     {
2120         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
2121
2122         /* TODO this should live in a helper object, and consolidate
2123            that with code in modify_chain_numbers */
2124         prev_atomname    = this_atomname;
2125         prev_atomnum     = this_atomnum;
2126         prev_resname     = this_resname;
2127         prev_resnum      = this_resnum;
2128         prev_chainid     = this_chainid;
2129         prev_chainnumber = this_chainnumber;
2130         if (!bMerged)
2131         {
2132             prev_chainstart = this_chainstart;
2133         }
2134
2135         this_atomname    = *pdba_all.atomname[i];
2136         this_atomnum     = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
2137         this_resname     = *ri->name;
2138         this_resnum      = ri->nr;
2139         this_chainid     = ri->chainid;
2140         this_chainnumber = ri->chainnum;
2141
2142         bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
2143
2144         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
2145         {
2146             GMX_RELEASE_ASSERT(
2147                     pdba_all.pdbinfo,
2148                     "Must have pdbinfo from reading a PDB file if chain number is changing");
2149             this_chainstart = pdba_all.atom[i].resind;
2150             bMerged         = false;
2151             if (i > 0 && !bWat_)
2152             {
2153                 if (!strncmp(c_mergeTypeNames[mergeType_], "int", 3))
2154                 {
2155                     GMX_LOG(logger.info)
2156                             .asParagraph()
2157                             .appendTextFormatted(
2158                                     "Merge chain ending with residue %s%d (chain id '%c', atom %d "
2159                                     "%s) and chain starting with "
2160                                     "residue %s%d (chain id '%c', atom %d %s) into a single "
2161                                     "moleculetype (keeping termini)? [n/y]",
2162                                     prev_resname,
2163                                     prev_resnum,
2164                                     prev_chainid,
2165                                     prev_atomnum,
2166                                     prev_atomname,
2167                                     this_resname,
2168                                     this_resnum,
2169                                     this_chainid,
2170                                     this_atomnum,
2171                                     this_atomname);
2172
2173                     if (nullptr == fgets(select, STRLEN - 1, stdin))
2174                     {
2175                         gmx_fatal(FARGS, "Error reading from stdin");
2176                     }
2177                     bMerged = (select[0] == 'y');
2178                 }
2179                 else if (!strncmp(c_mergeTypeNames[mergeType_], "all", 3))
2180                 {
2181                     bMerged = true;
2182                 }
2183             }
2184
2185             if (bMerged)
2186             {
2187                 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
2188                         pdba_all.atom[i].resind - prev_chainstart;
2189                 pdb_ch[numChains - 1].nterpairs++;
2190                 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
2191                 nchainmerges++;
2192             }
2193             else
2194             {
2195                 /* set natom for previous chain */
2196                 if (numChains > 0)
2197                 {
2198                     pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
2199                 }
2200                 if (bWat_)
2201                 {
2202                     nwaterchain++;
2203                     ri->chainid = ' ';
2204                 }
2205                 /* check if chain identifier was used before */
2206                 for (int j = 0; (j < numChains); j++)
2207                 {
2208                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
2209                     {
2210                         GMX_LOG(logger.warning)
2211                                 .asParagraph()
2212                                 .appendTextFormatted(
2213                                         "Chain identifier '%c' is used in two non-sequential "
2214                                         "blocks. "
2215                                         "They will be treated as separate chains unless you "
2216                                         "reorder your file.",
2217                                         ri->chainid);
2218                     }
2219                 }
2220                 t_pdbchain newChain;
2221                 newChain.chainid  = ri->chainid;
2222                 newChain.chainnum = ri->chainnum;
2223                 newChain.start    = i;
2224                 newChain.bAllWat  = bWat_;
2225                 if (bWat_)
2226                 {
2227                     newChain.nterpairs = 0;
2228                 }
2229                 else
2230                 {
2231                     newChain.nterpairs = 1;
2232                 }
2233                 newChain.chainstart.resize(newChain.nterpairs + 1);
2234                 /* modified [numChains] to [0] below */
2235                 newChain.chainstart[0] = 0;
2236                 pdb_ch.push_back(newChain);
2237                 numChains++;
2238             }
2239         }
2240         bPrevWat_ = bWat_;
2241     }
2242     pdb_ch.back().natom = natom - pdb_ch.back().start;
2243
2244     /* set all the water blocks at the end of the chain */
2245     std::vector<int> swap_index(numChains);
2246     int              j = 0;
2247     for (int i = 0; i < numChains; i++)
2248     {
2249         if (!pdb_ch[i].bAllWat)
2250         {
2251             swap_index[j] = i;
2252             j++;
2253         }
2254     }
2255     for (int i = 0; i < numChains; i++)
2256     {
2257         if (pdb_ch[i].bAllWat)
2258         {
2259             swap_index[j] = i;
2260             j++;
2261         }
2262     }
2263     if (nwaterchain > 1)
2264     {
2265         GMX_LOG(logger.info)
2266                 .asParagraph()
2267                 .appendTextFormatted("Moved all the water blocks to the end");
2268     }
2269
2270     t_atoms*             pdba;
2271     std::vector<t_chain> chains(numChains);
2272     /* copy pdb data and x for all chains */
2273     for (int i = 0; (i < numChains); i++)
2274     {
2275         int si               = swap_index[i];
2276         chains[i].chainid    = pdb_ch[si].chainid;
2277         chains[i].chainnum   = pdb_ch[si].chainnum;
2278         chains[i].bAllWat    = pdb_ch[si].bAllWat;
2279         chains[i].nterpairs  = pdb_ch[si].nterpairs;
2280         chains[i].chainstart = pdb_ch[si].chainstart;
2281         chains[i].ntdb.clear();
2282         chains[i].ctdb.clear();
2283         chains[i].r_start.resize(pdb_ch[si].nterpairs);
2284         chains[i].r_end.resize(pdb_ch[si].nterpairs);
2285
2286         snew(chains[i].pdba, 1);
2287         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
2288         for (j = 0; j < chains[i].pdba->nr; j++)
2289         {
2290             chains[i].pdba->atom[j]     = pdba_all.atom[pdb_ch[si].start + j];
2291             chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
2292             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
2293             chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
2294         }
2295         /* Re-index the residues assuming that the indices are continuous */
2296         int k                = chains[i].pdba->atom[0].resind;
2297         int nres             = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
2298         chains[i].pdba->nres = nres;
2299         for (int j = 0; j < chains[i].pdba->nr; j++)
2300         {
2301             chains[i].pdba->atom[j].resind -= k;
2302         }
2303         srenew(chains[i].pdba->resinfo, nres);
2304         for (int j = 0; j < nres; j++)
2305         {
2306             chains[i].pdba->resinfo[j]      = pdba_all.resinfo[k + j];
2307             chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
2308             /* make all chain identifiers equal to that of the chain */
2309             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
2310         }
2311     }
2312
2313     if (nchainmerges > 0)
2314     {
2315         GMX_LOG(logger.info)
2316                 .asParagraph()
2317                 .appendTextFormatted("Merged chains into joint molecule definitions at %d places.",
2318                                      nchainmerges);
2319     }
2320
2321     GMX_LOG(logger.info)
2322             .asParagraph()
2323             .appendTextFormatted(
2324                     "There are %d chains and %d blocks of water and "
2325                     "%d residues with %d atoms",
2326                     numChains - nwaterchain,
2327                     nwaterchain,
2328                     pdba_all.nres,
2329                     natom);
2330
2331     GMX_LOG(logger.info)
2332             .asParagraph()
2333             .appendTextFormatted("  %5s  %4s %6s", "chain", "#res", "#atoms");
2334     for (int i = 0; (i < numChains); i++)
2335     {
2336         GMX_LOG(logger.info)
2337                 .asParagraph()
2338                 .appendTextFormatted("  %d '%c' %5d %6d  %s\n",
2339                                      i + 1,
2340                                      chains[i].chainid ? chains[i].chainid : '-',
2341                                      chains[i].pdba->nres,
2342                                      chains[i].pdba->nr,
2343                                      chains[i].bAllWat ? "(only water)" : "");
2344     }
2345
2346     check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_, logger);
2347
2348     /* Read atomtypes... */
2349     PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
2350
2351     /* read residue database */
2352     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Reading residue database... (%s)", forcefield_);
2353     std::vector<std::string>       rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
2354     std::vector<PreprocessResidue> rtpFFDB;
2355     for (const auto& filename : rtpf)
2356     {
2357         readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, logger, false);
2358     }
2359     if (bNewRTP_)
2360     {
2361         /* Not correct with multiple rtp input files with different bonded types */
2362         FILE* fp = gmx_fio_fopen("new.rtp", "w");
2363         print_resall(fp, rtpFFDB, atype);
2364         gmx_fio_fclose(fp);
2365     }
2366
2367     /* read hydrogen database */
2368     std::vector<MoleculePatchDatabase> ah;
2369     read_h_db(ffdir_, &ah);
2370
2371     /* Read Termini database... */
2372     std::vector<MoleculePatchDatabase>  ntdb;
2373     std::vector<MoleculePatchDatabase>  ctdb;
2374     std::vector<MoleculePatchDatabase*> tdblist;
2375     int                                 nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
2376     int                                 nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
2377
2378     FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2379
2380     print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2381
2382     t_chain*               cc;
2383     std::vector<gmx::RVec> x;
2384     /* new pdb datastructure for sorting. */
2385     t_atoms** sortAtoms  = nullptr;
2386     t_atoms** localAtoms = nullptr;
2387     snew(sortAtoms, numChains);
2388     snew(localAtoms, numChains);
2389     for (int chain = 0; (chain < numChains); chain++)
2390     {
2391         cc = &(chains[chain]);
2392
2393         /* set pdba, natom and nres to the current chain */
2394         pdba     = cc->pdba;
2395         x        = cc->x;
2396         natom    = cc->pdba->nr;
2397         int nres = cc->pdba->nres;
2398
2399         if (cc->chainid && (cc->chainid != ' '))
2400         {
2401             GMX_LOG(logger.info)
2402                     .asParagraph()
2403                     .appendTextFormatted("Processing chain %d '%c' (%d atoms, %d residues)",
2404                                          chain + 1,
2405                                          cc->chainid,
2406                                          natom,
2407                                          nres);
2408         }
2409         else
2410         {
2411             GMX_LOG(logger.info)
2412                     .asParagraph()
2413                     .appendTextFormatted(
2414                             "Processing chain %d (%d atoms, %d residues)", chain + 1, natom, nres);
2415         }
2416
2417         process_chain(logger,
2418                       pdba,
2419                       x,
2420                       bUnA_,
2421                       bUnA_,
2422                       bUnA_,
2423                       bLysMan_,
2424                       bAspMan_,
2425                       bGluMan_,
2426                       bHisMan_,
2427                       bArgMan_,
2428                       bGlnMan_,
2429                       angle_,
2430                       distance_,
2431                       &symtab,
2432                       rtprename);
2433
2434         cc->chainstart[cc->nterpairs] = pdba->nres;
2435         j                             = 0;
2436         for (int i = 0; i < cc->nterpairs; i++)
2437         {
2438             find_nc_ter(pdba,
2439                         cc->chainstart[i],
2440                         cc->chainstart[i + 1],
2441                         &(cc->r_start[j]),
2442                         &(cc->r_end[j]),
2443                         residueTypeMap,
2444                         logger);
2445             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2446             {
2447                 if (checkChainCyclicity(
2448                             pdba, pdbx, cc->r_start[j], cc->r_end[j], rtpFFDB, rtprename, long_bond_dist_, short_bond_dist_))
2449                 {
2450                     cc->cyclicBondsIndex.push_back(cc->r_start[j]);
2451                     cc->cyclicBondsIndex.push_back(cc->r_end[j]);
2452                     cc->r_start[j] = -1;
2453                     cc->r_end[j]   = -1;
2454                 }
2455                 else
2456                 {
2457                     j++;
2458                 }
2459             }
2460         }
2461         cc->nterpairs = j;
2462         if (cc->nterpairs == 0 && cc->cyclicBondsIndex.empty())
2463         {
2464             GMX_LOG(logger.info)
2465                     .asParagraph()
2466                     .appendTextFormatted(
2467                             "Problem with chain definition, or missing terminal residues. "
2468                             "This chain does not appear to contain a recognized chain molecule. "
2469                             "If this is incorrect, you can edit residuetypes.dat to modify the "
2470                             "behavior.");
2471         }
2472
2473         /* Check for disulfides and other special bonds */
2474         ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2475
2476         if (!rtprename.empty())
2477         {
2478             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_, logger);
2479         }
2480
2481         for (int i = 0; i < cc->nterpairs; i++)
2482         {
2483             /* Set termini.
2484              * We first apply a filter so we only have the
2485              * termini that can be applied to the residue in question
2486              * (or a generic terminus if no-residue specific is available).
2487              */
2488             /* First the N terminus */
2489             if (nNtdb > 0)
2490             {
2491                 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2492                 if (tdblist.empty())
2493                 {
2494                     GMX_LOG(logger.info)
2495                             .asParagraph()
2496                             .appendTextFormatted(
2497                                     "No suitable end (N or 5') terminus found in database - "
2498                                     "assuming this residue "
2499                                     "is already in a terminus-specific form and skipping terminus "
2500                                     "selection.");
2501                     cc->ntdb.push_back(nullptr);
2502                 }
2503                 else
2504                 {
2505                     if (bTerMan_ && !tdblist.empty())
2506                     {
2507                         sprintf(select,
2508                                 "Select start terminus type for %s-%d",
2509                                 *pdba->resinfo[cc->r_start[i]].name,
2510                                 pdba->resinfo[cc->r_start[i]].nr);
2511                         cc->ntdb.push_back(choose_ter(tdblist, select));
2512                     }
2513                     else
2514                     {
2515                         cc->ntdb.push_back(tdblist[0]);
2516                     }
2517
2518                     printf("Start terminus %s-%d: %s\n",
2519                            *pdba->resinfo[cc->r_start[i]].name,
2520                            pdba->resinfo[cc->r_start[i]].nr,
2521                            (cc->ntdb[i])->name.c_str());
2522                     tdblist.clear();
2523                 }
2524             }
2525             else
2526             {
2527                 cc->ntdb.push_back(nullptr);
2528             }
2529
2530             /* And the C terminus */
2531             if (nCtdb > 0)
2532             {
2533                 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2534                 if (tdblist.empty())
2535                 {
2536                     GMX_LOG(logger.info)
2537                             .asParagraph()
2538                             .appendTextFormatted(
2539                                     "No suitable end (C or 3') terminus found in database - "
2540                                     "assuming this residue"
2541                                     "is already in a terminus-specific form and skipping terminus "
2542                                     "selection.");
2543                     cc->ctdb.push_back(nullptr);
2544                 }
2545                 else
2546                 {
2547                     if (bTerMan_ && !tdblist.empty())
2548                     {
2549                         sprintf(select,
2550                                 "Select end terminus type for %s-%d",
2551                                 *pdba->resinfo[cc->r_end[i]].name,
2552                                 pdba->resinfo[cc->r_end[i]].nr);
2553                         cc->ctdb.push_back(choose_ter(tdblist, select));
2554                     }
2555                     else
2556                     {
2557                         cc->ctdb.push_back(tdblist[0]);
2558                     }
2559                     printf("End terminus %s-%d: %s\n",
2560                            *pdba->resinfo[cc->r_end[i]].name,
2561                            pdba->resinfo[cc->r_end[i]].nr,
2562                            (cc->ctdb[i])->name.c_str());
2563                     tdblist.clear();
2564                 }
2565             }
2566             else
2567             {
2568                 cc->ctdb.push_back(nullptr);
2569             }
2570         }
2571         std::vector<MoleculePatchDatabase> hb_chain;
2572         /* lookup hackblocks and rtp for all residues */
2573         std::vector<PreprocessResidue> restp_chain;
2574         get_hackblocks_rtp(&hb_chain,
2575                            &restp_chain,
2576                            rtpFFDB,
2577                            pdba->nres,
2578                            pdba->resinfo,
2579                            cc->nterpairs,
2580                            &symtab,
2581                            cc->ntdb,
2582                            cc->ctdb,
2583                            cc->r_start,
2584                            cc->r_end,
2585                            bAllowMissing_,
2586                            logger);
2587         /* ideally, now we would not need the rtp itself anymore, but do
2588            everything using the hb and restp arrays. Unfortunately, that
2589            requires some re-thinking of code in gen_vsite.c, which I won't
2590            do now :( AF 26-7-99 */
2591
2592         rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, residueTypeMap, false, bVerbose_);
2593
2594         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_, logger);
2595
2596         if (bSort_)
2597         {
2598             char**    gnames;
2599             t_blocka* block = new_blocka();
2600             snew(gnames, 1);
2601             sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2602             remove_duplicate_atoms(pdba, x, bVerbose_, logger);
2603             if (bIndexSet_)
2604             {
2605                 if (bRemoveH_)
2606                 {
2607                     GMX_LOG(logger.warning)
2608                             .asParagraph()
2609                             .appendTextFormatted(
2610                                     "With the -remh option the generated "
2611                                     "index file (%s) might be useless "
2612                                     "(the index file is generated before hydrogens are added)",
2613                                     indexOutputFile_.c_str());
2614                 }
2615                 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2616             }
2617             for (int i = 0; i < block->nr; i++)
2618             {
2619                 sfree(gnames[i]);
2620             }
2621             sfree(gnames);
2622             done_blocka(block);
2623             sfree(block);
2624         }
2625         else
2626         {
2627             GMX_LOG(logger.warning)
2628                     .asParagraph()
2629                     .appendTextFormatted(
2630                             "Without sorting no check for duplicate atoms can be done");
2631         }
2632
2633         /* Generate Hydrogen atoms (and termini) in the sequence */
2634         GMX_LOG(logger.info)
2635                 .asParagraph()
2636                 .appendTextFormatted(
2637                         "Generating any missing hydrogen atoms and/or adding termini.");
2638         add_h(&pdba,
2639               &localAtoms[chain],
2640               &x,
2641               ah,
2642               &symtab,
2643               cc->nterpairs,
2644               cc->ntdb,
2645               cc->ctdb,
2646               cc->r_start,
2647               cc->r_end,
2648               bAllowMissing_,
2649               cc->cyclicBondsIndex);
2650         GMX_LOG(logger.info)
2651                 .asParagraph()
2652                 .appendTextFormatted("Now there are %d residues with %d atoms", pdba->nres, pdba->nr);
2653
2654         /* make up molecule name(s) */
2655
2656         int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2657
2658         std::string restype = typeOfNamedDatabaseResidue(residueTypeMap, *pdba->resinfo[k].name);
2659
2660         std::string molname;
2661         std::string suffix;
2662         if (cc->bAllWat)
2663         {
2664             molname = "Water";
2665         }
2666         else
2667         {
2668             this_chainid = cc->chainid;
2669
2670             /* Add the chain id if we have one */
2671             if (this_chainid != ' ')
2672             {
2673                 suffix.append(formatString("_chain_%c", this_chainid));
2674             }
2675
2676             /* Check if there have been previous chains with the same id */
2677             int nid_used = 0;
2678             for (int k = 0; k < chain; k++)
2679             {
2680                 if (cc->chainid == chains[k].chainid)
2681                 {
2682                     nid_used++;
2683                 }
2684             }
2685             /* Add the number for this chain identifier if there are multiple copies */
2686             if (nid_used > 0)
2687             {
2688                 suffix.append(formatString("%d", nid_used + 1));
2689             }
2690
2691             if (suffix.length() > 0)
2692             {
2693                 molname.append(restype);
2694                 molname.append(suffix);
2695             }
2696             else
2697             {
2698                 molname = restype;
2699             }
2700         }
2701         std::string itp_fn = topologyFile_;
2702
2703         std::string posre_fn = includeTopologyFile_;
2704         if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2705         {
2706             bITP_ = true;
2707             printf("Chain time...\n");
2708             // construct the itp file name
2709             itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2710             itp_fn.append("_");
2711             itp_fn.append(molname);
2712             itp_fn.append(".itp");
2713             // now do the same for posre
2714             posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2715             posre_fn.append("_");
2716             posre_fn.append(molname);
2717             posre_fn.append(".itp");
2718             if (posre_fn == itp_fn)
2719             {
2720                 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2721             }
2722             incls_.emplace_back();
2723             incls_.back() = itp_fn;
2724             itp_file_     = gmx_fio_fopen(itp_fn.c_str(), "w");
2725         }
2726         else
2727         {
2728             bITP_ = false;
2729         }
2730
2731         mols_.emplace_back();
2732         if (cc->bAllWat)
2733         {
2734             mols_.back().name = "SOL";
2735             mols_.back().nr   = pdba->nres;
2736         }
2737         else
2738         {
2739             mols_.back().name = molname;
2740             mols_.back().nr   = 1;
2741         }
2742
2743         if (bITP_)
2744         {
2745             print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2746         }
2747
2748         FILE* top_file2;
2749         if (cc->bAllWat)
2750         {
2751             top_file2 = nullptr;
2752         }
2753         else if (bITP_)
2754         {
2755             top_file2 = itp_file_;
2756         }
2757         else
2758         {
2759             top_file2 = top_file;
2760         }
2761
2762         pdb2top(top_file2,
2763                 posre_fn.c_str(),
2764                 molname.c_str(),
2765                 pdba,
2766                 &x,
2767                 &atype,
2768                 &symtab,
2769                 rtpFFDB,
2770                 restp_chain,
2771                 hb_chain,
2772                 bAllowMissing_,
2773                 bVsites_,
2774                 bVsiteAromatics_,
2775                 ffdir_,
2776                 mHmult_,
2777                 ssbonds,
2778                 long_bond_dist_,
2779                 short_bond_dist_,
2780                 bDeuterate_,
2781                 bChargeGroups_,
2782                 bCmap_,
2783                 bRenumRes_,
2784                 bRTPresname_,
2785                 cc->cyclicBondsIndex,
2786                 logger);
2787
2788         if (!cc->bAllWat)
2789         {
2790             write_posres(posre_fn.c_str(), pdba, posre_fc_);
2791         }
2792
2793         if (bITP_)
2794         {
2795             gmx_fio_fclose(itp_file_);
2796         }
2797
2798         /* pdba and natom have been reassigned somewhere so: */
2799         cc->pdba = pdba;
2800         cc->x    = x;
2801     }
2802
2803     if (watermodel_ == nullptr)
2804     {
2805         for (int chain = 0; chain < numChains; chain++)
2806         {
2807             if (chains[chain].bAllWat)
2808             {
2809                 auto message = formatString(
2810                         "You have chosen not to include a water model, "
2811                         "but there is water in the input file. Select a "
2812                         "water model or remove the water from your input file.");
2813                 GMX_THROW(InconsistentInputError(message));
2814             }
2815         }
2816     }
2817     else
2818     {
2819         std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2820         if (!fflib_fexist(waterFile))
2821         {
2822             auto message = formatString(
2823                     "The topology file '%s' for the selected water "
2824                     "model '%s' can not be found in the force field "
2825                     "directory. Select a different water model.",
2826                     waterFile.c_str(),
2827                     watermodel_);
2828             GMX_THROW(InconsistentInputError(message));
2829         }
2830     }
2831
2832     print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2833     gmx_fio_fclose(top_file);
2834
2835     /* now merge all chains back together */
2836     natom    = 0;
2837     int nres = 0;
2838     for (int i = 0; (i < numChains); i++)
2839     {
2840         natom += chains[i].pdba->nr;
2841         nres += chains[i].pdba->nres;
2842     }
2843     t_atoms* atoms;
2844     snew(atoms, 1);
2845     init_t_atoms(atoms, natom, false);
2846     for (int i = 0; i < atoms->nres; i++)
2847     {
2848         sfree(atoms->resinfo[i].name);
2849     }
2850     atoms->nres = nres;
2851     srenew(atoms->resinfo, nres);
2852     x.clear();
2853     int k = 0;
2854     int l = 0;
2855     for (int i = 0; (i < numChains); i++)
2856     {
2857         if (numChains > 1)
2858         {
2859             GMX_LOG(logger.info)
2860                     .asParagraph()
2861                     .appendTextFormatted("Including chain %d in system: %d atoms %d residues",
2862                                          i + 1,
2863                                          chains[i].pdba->nr,
2864                                          chains[i].pdba->nres);
2865         }
2866         for (int j = 0; (j < chains[i].pdba->nr); j++)
2867         {
2868             atoms->atom[k] = chains[i].pdba->atom[j];
2869             atoms->atom[k].resind += l; /* l is processed nr of residues */
2870             atoms->atomname[k]                            = chains[i].pdba->atomname[j];
2871             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2872             x.push_back(chains[i].x[j]);
2873             k++;
2874         }
2875         for (int j = 0; (j < chains[i].pdba->nres); j++)
2876         {
2877             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2878             if (bRTPresname_)
2879             {
2880                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2881             }
2882             l++;
2883         }
2884         done_atom(chains[i].pdba);
2885     }
2886
2887     if (numChains > 1)
2888     {
2889         GMX_LOG(logger.info)
2890                 .asParagraph()
2891                 .appendTextFormatted("Now there are %d atoms and %d residues", k, l);
2892         print_sums(atoms, true, logger);
2893     }
2894
2895     rvec box_space;
2896     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Writing coordinate file...");
2897     clear_rvec(box_space);
2898     if (box[0][0] == 0)
2899     {
2900         make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2901     }
2902     write_sto_conf(
2903             outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr, pbcType, box);
2904
2905     done_symtab(&symtab);
2906     done_atom(&pdba_all);
2907     done_atom(atoms);
2908     for (int chain = 0; chain < numChains; chain++)
2909     {
2910         sfree(sortAtoms[chain]);
2911         sfree(localAtoms[chain]);
2912     }
2913     sfree(sortAtoms);
2914     sfree(localAtoms);
2915     sfree(atoms);
2916     sfree(title);
2917     sfree(pdbx);
2918
2919     GMX_LOG(logger.info)
2920             .asParagraph()
2921             .appendTextFormatted("\t\t--------- PLEASE NOTE ------------");
2922     GMX_LOG(logger.info)
2923             .asParagraph()
2924             .appendTextFormatted("You have successfully generated a topology from: %s.",
2925                                  inputConfFile_.c_str());
2926     if (watermodel_ != nullptr)
2927     {
2928         GMX_LOG(logger.info)
2929                 .asParagraph()
2930                 .appendTextFormatted(
2931                         "The %s force field and the %s water model are used.", ffname_, watermodel_);
2932         sfree(watermodel_);
2933     }
2934     else
2935     {
2936         GMX_LOG(logger.info).asParagraph().appendTextFormatted("The %s force field is used.", ffname_);
2937     }
2938     GMX_LOG(logger.info)
2939             .asParagraph()
2940             .appendTextFormatted("\t\t--------- ETON ESAELP ------------");
2941
2942     return 0;
2943 }
2944
2945 } // namespace
2946
2947 const char pdb2gmxInfo::name[] = "pdb2gmx";
2948 const char pdb2gmxInfo::shortDescription[] =
2949         "Convert coordinate files to topology and FF-compliant coordinate files";
2950 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2951 {
2952     return std::make_unique<pdb2gmx>();
2953 }
2954
2955 } // namespace gmx