067a1b86835792589ae2018dd47918573f17a77f
[alexxy/gromacs.git] / src / gromacs / gmxpreprocess / pdb2gmx.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38 #include "gmxpre.h"
39
40 #include "gromacs/utility/enumerationhelpers.h"
41 #include "pdb2gmx.h"
42
43 #include <cctype>
44 #include <cstdlib>
45 #include <cstring>
46 #include <ctime>
47
48 #include <algorithm>
49 #include <memory>
50 #include <string>
51 #include <vector>
52
53 #include "gromacs/commandline/cmdlineoptionsmodule.h"
54 #include "gromacs/fileio/confio.h"
55 #include "gromacs/fileio/filetypes.h"
56 #include "gromacs/fileio/gmxfio.h"
57 #include "gromacs/fileio/pdbio.h"
58 #include "gromacs/gmxlib/conformation_utilities.h"
59 #include "gromacs/gmxpreprocess/fflibutil.h"
60 #include "gromacs/gmxpreprocess/genhydro.h"
61 #include "gromacs/gmxpreprocess/gpp_atomtype.h"
62 #include "gromacs/gmxpreprocess/grompp_impl.h"
63 #include "gromacs/gmxpreprocess/h_db.h"
64 #include "gromacs/gmxpreprocess/hizzie.h"
65 #include "gromacs/gmxpreprocess/pdb2top.h"
66 #include "gromacs/gmxpreprocess/pgutil.h"
67 #include "gromacs/gmxpreprocess/specbond.h"
68 #include "gromacs/gmxpreprocess/ter_db.h"
69 #include "gromacs/gmxpreprocess/toputil.h"
70 #include "gromacs/gmxpreprocess/xlate.h"
71 #include "gromacs/math/vec.h"
72 #include "gromacs/options/basicoptions.h"
73 #include "gromacs/options/filenameoption.h"
74 #include "gromacs/options/ioptionscontainer.h"
75 #include "gromacs/topology/atomprop.h"
76 #include "gromacs/topology/block.h"
77 #include "gromacs/topology/index.h"
78 #include "gromacs/topology/residuetypes.h"
79 #include "gromacs/topology/symtab.h"
80 #include "gromacs/topology/topology.h"
81 #include "gromacs/utility/dir_separator.h"
82 #include "gromacs/utility/exceptions.h"
83 #include "gromacs/utility/fatalerror.h"
84 #include "gromacs/utility/filestream.h"
85 #include "gromacs/utility/loggerbuilder.h"
86 #include "gromacs/utility/path.h"
87 #include "gromacs/utility/smalloc.h"
88 #include "gromacs/utility/strdb.h"
89 #include "gromacs/utility/stringutil.h"
90
91 #include "hackblock.h"
92 #include "resall.h"
93
94 struct RtpRename
95 {
96     RtpRename(const char* newGmx, const char* newMain, const char* newNter, const char* newCter, const char* newBter) :
97         gmx(newGmx), main(newMain), nter(newNter), cter(newCter), bter(newBter)
98     {
99     }
100     std::string gmx;
101     std::string main;
102     std::string nter;
103     std::string cter;
104     std::string bter;
105 };
106
107 namespace
108 {
109
110 const char* res2bb_notermini(const std::string& name, gmx::ArrayRef<const RtpRename> rr)
111 {
112     /* NOTE: This function returns the main building block name,
113      *       it does not take terminal renaming into account.
114      */
115     auto found = std::find_if(rr.begin(), rr.end(), [&name](const auto& rename) {
116         return gmx::equalCaseInsensitive(name, rename.gmx);
117     });
118     return found != rr.end() ? found->main.c_str() : name.c_str();
119 }
120
121 const char* enumValueToLongString(HistidineStates enumValue)
122 {
123     constexpr gmx::EnumerationArray<HistidineStates, const char*> histidineStatesLongNames = {
124         "H on ND1 only", "H on NE2 only", "H on ND1 and NE2", "Coupled to Heme"
125     };
126     return histidineStatesLongNames[enumValue];
127 }
128
129 enum class AspartateStates : int
130 {
131     Deprot,
132     Prot,
133     Count
134 };
135
136 const char* enumValueToString(AspartateStates enumValue)
137 {
138     constexpr gmx::EnumerationArray<AspartateStates, const char*> aspartateStateNames = { "ASP",
139                                                                                           "ASPH" };
140     return aspartateStateNames[enumValue];
141 }
142
143 const char* enumValueToLongString(AspartateStates enumValue)
144 {
145     constexpr gmx::EnumerationArray<AspartateStates, const char*> aspartateStateLongNames = {
146         "Not protonated (charge -1)", "Protonated (charge 0)"
147     };
148     return aspartateStateLongNames[enumValue];
149 }
150
151 enum class GlutamateStates : int
152 {
153     Deprot,
154     Prot,
155     Count
156 };
157
158 const char* enumValueToString(GlutamateStates enumValue)
159 {
160     constexpr gmx::EnumerationArray<GlutamateStates, const char*> glutamateStateNames = { "GLU",
161                                                                                           "GLUH" };
162     return glutamateStateNames[enumValue];
163 }
164
165 const char* enumValueToLongString(GlutamateStates enumValue)
166 {
167     constexpr gmx::EnumerationArray<GlutamateStates, const char*> glutamateStateLongNames = {
168         "Not protonated (charge -1)", "Protonated (charge 0)"
169     };
170     return glutamateStateLongNames[enumValue];
171 }
172
173 enum class GlutamineStates : int
174 {
175     Deprot,
176     Prot,
177     Count
178 };
179
180 const char* enumValueToString(GlutamineStates enumValue)
181 {
182     constexpr gmx::EnumerationArray<GlutamineStates, const char*> glutamineStateNames = { "GLN",
183                                                                                           "QLN" };
184     return glutamineStateNames[enumValue];
185 }
186
187 const char* enumValueToLongString(GlutamineStates enumValue)
188 {
189     constexpr gmx::EnumerationArray<GlutamineStates, const char*> glutamineStateLongNames = {
190         "Not protonated (charge 0)", "Protonated (charge +1)"
191     };
192     return glutamineStateLongNames[enumValue];
193 }
194
195 enum class LysineStates : int
196 {
197     Deprot,
198     Prot,
199     Count
200 };
201
202 const char* enumValueToString(LysineStates enumValue)
203 {
204     constexpr gmx::EnumerationArray<LysineStates, const char*> lysineStateNames = { "LYSN", "LYS" };
205     return lysineStateNames[enumValue];
206 }
207
208 const char* enumValueToLongString(LysineStates enumValue)
209 {
210     constexpr gmx::EnumerationArray<LysineStates, const char*> lysineStateLongNames = {
211         "Not protonated (charge 0)", "Protonated (charge +1)"
212     };
213     return lysineStateLongNames[enumValue];
214 }
215
216 enum class ArginineStates : int
217 {
218     Deprot,
219     Prot,
220     Count
221 };
222
223 const char* enumValueToString(ArginineStates enumValue)
224 {
225     constexpr gmx::EnumerationArray<ArginineStates, const char*> arginineStatesNames = { "ARGN",
226                                                                                          "ARG" };
227     return arginineStatesNames[enumValue];
228 }
229
230 const char* enumValueToLongString(ArginineStates enumValue)
231 {
232     constexpr gmx::EnumerationArray<ArginineStates, const char*> arginineStatesLongNames = {
233         "Not protonated (charge 0)", "Protonated (charge +1)"
234     };
235     return arginineStatesLongNames[enumValue];
236 }
237
238 template<typename EnumType>
239 const char* select_res(int resnr, const char* title, gmx::ArrayRef<const RtpRename> rr)
240 {
241     printf("Which %s type do you want for residue %d\n", title, resnr + 1);
242     for (auto sel : gmx::EnumerationWrapper<EnumType>{})
243     {
244         printf("%d. %s (%s)\n",
245                static_cast<int>(sel),
246                enumValueToString(sel),
247                res2bb_notermini(enumValueToString(sel), rr));
248     }
249     printf("\nType a number:");
250     fflush(stdout);
251
252     int userSelection;
253     if (scanf("%d", &userSelection) != 1)
254     {
255         gmx_fatal(FARGS, "Answer me for res %s %d!", title, resnr + 1);
256     }
257
258     return enumValueToLongString(static_cast<EnumType>(userSelection));
259 }
260
261 const char* get_asptp(int resnr, gmx::ArrayRef<const RtpRename> rr)
262 {
263     return select_res<AspartateStates>(resnr, "ASPARTIC ACID", rr);
264 }
265
266 const char* get_glutp(int resnr, gmx::ArrayRef<const RtpRename> rr)
267 {
268     return select_res<GlutamateStates>(resnr, "GLUTAMIC ACID", rr);
269 }
270
271 const char* get_glntp(int resnr, gmx::ArrayRef<const RtpRename> rr)
272 {
273     return select_res<GlutamineStates>(resnr, "GLUTAMINE", rr);
274 }
275
276 const char* get_lystp(int resnr, gmx::ArrayRef<const RtpRename> rr)
277 {
278     return select_res<LysineStates>(resnr, "LYSINE", rr);
279 }
280
281 const char* get_argtp(int resnr, gmx::ArrayRef<const RtpRename> rr)
282 {
283     return select_res<ArginineStates>(resnr, "ARGININE", rr);
284 }
285
286 const char* get_histp(int resnr, gmx::ArrayRef<const RtpRename> rr)
287 {
288     return select_res<HistidineStates>(resnr, "HISTIDINE", rr);
289 }
290
291 void read_rtprename(const char* fname, FILE* fp, std::vector<RtpRename>* rtprename)
292 {
293     char line[STRLEN], buf[STRLEN];
294
295     int ncol = 0;
296     while (get_a_line(fp, line, STRLEN))
297     {
298         /* line is NULL-terminated and length<STRLEN, so final arg cannot overflow.
299          * For other args, we read up to 6 chars (so we can detect if the length is > 5).
300          * Note that the buffer length has been increased to 7 to allow this,
301          * so we just need to make sure the strings have zero-length initially.
302          */
303         char gmx[STRLEN];
304         char main[STRLEN];
305         char nter[STRLEN];
306         char cter[STRLEN];
307         char bter[STRLEN];
308         gmx[0]       = '\0';
309         main[0]      = '\0';
310         nter[0]      = '\0';
311         cter[0]      = '\0';
312         bter[0]      = '\0';
313         int       nc = sscanf(line, "%6s %6s %6s %6s %6s %s", gmx, main, nter, cter, bter, buf);
314         RtpRename newEntry(gmx, main, nter, cter, bter);
315         if (ncol == 0)
316         {
317             if (nc != 2 && nc != 5)
318             {
319                 gmx_fatal(FARGS,
320                           "Residue renaming database '%s' has %d columns instead of %d or %d",
321                           fname,
322                           ncol,
323                           2,
324                           5);
325             }
326             ncol = nc;
327         }
328         else if (nc != ncol)
329         {
330             gmx_fatal(FARGS,
331                       "A line in residue renaming database '%s' has %d columns, while previous "
332                       "lines have %d columns",
333                       fname,
334                       nc,
335                       ncol);
336         }
337
338         if (nc == 2)
339         {
340             /* This file does not have special termini names, copy them from main */
341             newEntry.nter = newEntry.main;
342             newEntry.cter = newEntry.main;
343             newEntry.bter = newEntry.main;
344         }
345         rtprename->push_back(newEntry);
346     }
347 }
348
349 std::string search_resrename(gmx::ArrayRef<const RtpRename> rr, const char* name, bool bStart, bool bEnd, bool bCompareFFRTPname)
350 {
351     auto found = std::find_if(rr.begin(), rr.end(), [&name, &bCompareFFRTPname](const auto& rename) {
352         return ((!bCompareFFRTPname && (name == rename.gmx))
353                 || (bCompareFFRTPname && (name == rename.main)));
354     });
355
356     std::string newName;
357     /* If found in the database, rename this residue's rtp building block,
358      * otherwise keep the old name.
359      */
360     if (found != rr.end())
361     {
362         if (bStart && bEnd)
363         {
364             newName = found->bter;
365         }
366         else if (bStart)
367         {
368             newName = found->nter;
369         }
370         else if (bEnd)
371         {
372             newName = found->cter;
373         }
374         else
375         {
376             newName = found->main;
377         }
378
379         if (newName[0] == '-')
380         {
381             gmx_fatal(FARGS,
382                       "In the chosen force field there is no residue type for '%s'%s",
383                       name,
384                       bStart ? (bEnd ? " as a standalone (starting & ending) residue" : " as a starting terminus")
385                              : (bEnd ? " as an ending terminus" : ""));
386         }
387     }
388
389     return newName;
390 }
391
392 void rename_resrtp(t_atoms*                       pdba,
393                    int                            nterpairs,
394                    gmx::ArrayRef<const int>       r_start,
395                    gmx::ArrayRef<const int>       r_end,
396                    gmx::ArrayRef<const RtpRename> rr,
397                    t_symtab*                      symtab,
398                    bool                           bVerbose,
399                    const gmx::MDLogger&           logger)
400 {
401     bool bFFRTPTERRNM = (getenv("GMX_NO_FFRTP_TER_RENAME") == nullptr);
402
403     for (int r = 0; r < pdba->nres; r++)
404     {
405         bool bStart = false;
406         bool bEnd   = false;
407         for (int j = 0; j < nterpairs; j++)
408         {
409             if (r == r_start[j])
410             {
411                 bStart = true;
412             }
413         }
414         for (int j = 0; j < nterpairs; j++)
415         {
416             if (r == r_end[j])
417             {
418                 bEnd = true;
419             }
420         }
421
422         std::string newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, false);
423
424         if (bFFRTPTERRNM && newName.empty() && (bStart || bEnd))
425         {
426             /* This is a terminal residue, but the residue name,
427              * currently stored in .rtp, is not a standard residue name,
428              * but probably a force field specific rtp name.
429              * Check if we need to rename it because it is terminal.
430              */
431             newName = search_resrename(rr, *pdba->resinfo[r].rtp, bStart, bEnd, true);
432         }
433
434         if (!newName.empty() && newName != *pdba->resinfo[r].rtp)
435         {
436             if (bVerbose)
437             {
438                 GMX_LOG(logger.info)
439                         .asParagraph()
440                         .appendTextFormatted("Changing rtp entry of residue %d %s to '%s'",
441                                              pdba->resinfo[r].nr,
442                                              *pdba->resinfo[r].name,
443                                              newName.c_str());
444             }
445             pdba->resinfo[r].rtp = put_symtab(symtab, newName.c_str());
446         }
447     }
448 }
449
450 void pdbres_to_gmxrtp(t_atoms* pdba)
451 {
452     int i;
453
454     for (i = 0; (i < pdba->nres); i++)
455     {
456         if (pdba->resinfo[i].rtp == nullptr)
457         {
458             pdba->resinfo[i].rtp = pdba->resinfo[i].name;
459         }
460     }
461 }
462
463 void rename_pdbres(t_atoms* pdba, const char* oldnm, const char* newnm, bool bFullCompare, t_symtab* symtab)
464 {
465     char* resnm;
466     int   i;
467
468     for (i = 0; (i < pdba->nres); i++)
469     {
470         resnm = *pdba->resinfo[i].name;
471         if ((bFullCompare && (gmx::equalCaseInsensitive(resnm, oldnm)))
472             || (!bFullCompare && strstr(resnm, oldnm) != nullptr))
473         {
474             /* Rename the residue name (not the rtp name) */
475             pdba->resinfo[i].name = put_symtab(symtab, newnm);
476         }
477     }
478 }
479
480 /*! \brief Rename all residues named \c oldnm to \c newnm
481  *
482  * Search for residues for which the residue name from the input
483  * configuration file matches \c oldnm, and when found choose the rtp
484  * entry and name of \c newnm.
485  *
486  * \todo Refactor this function to accept a lambda that accepts i and
487  * numMatchesFound but always produces \c newnm. Then remove
488  * renameResiduesInteractively by calling this method with suitable
489  * lambdas that capture its parameter \c rr and ignores
490  * numMatchesFound. */
491 void renameResidue(const gmx::MDLogger& logger,
492                    t_atoms*             pdba,
493                    const char*          oldnm,
494                    const char*          newnm,
495                    bool                 bFullCompare,
496                    t_symtab*            symtab)
497 {
498     int numMatchesFound = 0;
499     for (int i = 0; (i < pdba->nres); i++)
500     {
501         /* We have not set the rtp name yet, use the residue name */
502         const char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
503         if ((bFullCompare && (gmx::equalCaseInsensitive(residueNameInInputConfiguration, oldnm)))
504             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
505         {
506             /* Change the rtp building block name */
507             pdba->resinfo[i].rtp  = put_symtab(symtab, newnm);
508             pdba->resinfo[i].name = pdba->resinfo[i].rtp;
509             numMatchesFound++;
510         }
511     }
512     if (numMatchesFound > 0)
513     {
514         GMX_LOG(logger.info)
515                 .asParagraph()
516                 .appendTextFormatted(
517                         "Replaced %d residue%s named %s to the default %s. Use interactive "
518                         "selection of protonated residues if that is what you need.",
519                         numMatchesFound,
520                         numMatchesFound > 1 ? "s" : "",
521                         oldnm,
522                         newnm);
523     }
524 }
525
526 /*! \brief Rename all residues named \c oldnm according to the user's
527  * interactive choice
528  *
529  * Search for residues for which the residue name from the input
530  * configuration file matches \c oldnm, and when found choose the rtp
531  * entry and name of the interactive choice from \c gettp.
532  *
533  * \todo Remove this function, per todo in \c renameResidue. */
534 void renameResidueInteractively(t_atoms*    pdba,
535                                 const char* oldnm,
536                                 const char* gettp(int, gmx::ArrayRef<const RtpRename>),
537                                 bool        bFullCompare,
538                                 t_symtab*   symtab,
539                                 gmx::ArrayRef<const RtpRename> rr)
540 {
541     // Search for residues i for which the residue name from the input
542     // configuration file matches oldnm, so it can replaced by the rtp
543     // entry and name of newnm.
544     for (int i = 0; i < pdba->nres; i++)
545     {
546         /* We have not set the rtp name yet, use the residue name */
547         char* residueNameInInputConfiguration = *pdba->resinfo[i].name;
548         if ((bFullCompare && (strcmp(residueNameInInputConfiguration, oldnm) == 0))
549             || (!bFullCompare && strstr(residueNameInInputConfiguration, oldnm) != nullptr))
550         {
551             const char* interactiveRtpChoice = gettp(i, rr);
552             pdba->resinfo[i].rtp             = put_symtab(symtab, interactiveRtpChoice);
553             pdba->resinfo[i].name            = pdba->resinfo[i].rtp;
554         }
555     }
556 }
557
558 void check_occupancy(t_atoms* atoms, const char* filename, bool bVerbose, const gmx::MDLogger& logger)
559 {
560     int i, ftp;
561     int nzero   = 0;
562     int nnotone = 0;
563
564     ftp = fn2ftp(filename);
565     if (!atoms->pdbinfo || ((ftp != efPDB) && (ftp != efBRK) && (ftp != efENT)))
566     {
567         GMX_LOG(logger.warning).asParagraph().appendTextFormatted("No occupancies in %s", filename);
568     }
569     else
570     {
571         for (i = 0; (i < atoms->nr); i++)
572         {
573             if (atoms->pdbinfo[i].occup != 1)
574             {
575                 if (bVerbose)
576                 {
577                     GMX_LOG(logger.warning)
578                             .asParagraph()
579                             .appendTextFormatted("Occupancy for atom %s%d-%s is %f rather than 1",
580                                                  *atoms->resinfo[atoms->atom[i].resind].name,
581                                                  atoms->resinfo[atoms->atom[i].resind].nr,
582                                                  *atoms->atomname[i],
583                                                  atoms->pdbinfo[i].occup);
584                 }
585                 if (atoms->pdbinfo[i].occup == 0)
586                 {
587                     nzero++;
588                 }
589                 else
590                 {
591                     nnotone++;
592                 }
593             }
594         }
595         if (nzero == atoms->nr)
596         {
597             GMX_LOG(logger.warning)
598                     .asParagraph()
599                     .appendTextFormatted(
600                             "All occupancy fields zero. This is probably not an X-Ray structure");
601         }
602         else if ((nzero > 0) || (nnotone > 0))
603         {
604             GMX_LOG(logger.warning)
605                     .asParagraph()
606                     .appendTextFormatted(
607                             "there were %d atoms with zero occupancy and %d atoms with "
608                             "         occupancy unequal to one (out of %d atoms). Check your pdb "
609                             "file.",
610                             nzero,
611                             nnotone,
612                             atoms->nr);
613         }
614         else
615         {
616             GMX_LOG(logger.warning).asParagraph().appendTextFormatted("All occupancies are one");
617         }
618     }
619 }
620
621 void write_posres(const char* fn, t_atoms* pdba, real fc)
622 {
623     FILE* fp;
624     int   i;
625
626     fp = gmx_fio_fopen(fn, "w");
627     fprintf(fp,
628             "; In this topology include file, you will find position restraint\n"
629             "; entries for all the heavy atoms in your original pdb file.\n"
630             "; This means that all the protons which were added by pdb2gmx are\n"
631             "; not restrained.\n"
632             "\n"
633             "[ position_restraints ]\n"
634             "; %4s%6s%8s%8s%8s\n",
635             "atom",
636             "type",
637             "fx",
638             "fy",
639             "fz");
640     for (i = 0; (i < pdba->nr); i++)
641     {
642         if (!is_hydrogen(*pdba->atomname[i]) && !is_dummymass(*pdba->atomname[i]))
643         {
644             fprintf(fp, "%6d%6d  %g  %g  %g\n", i + 1, 1, fc, fc, fc);
645         }
646     }
647     gmx_fio_fclose(fp);
648 }
649
650 int read_pdball(const char*     inf,
651                 bool            bOutput,
652                 const char*     outf,
653                 char**          title,
654                 t_atoms*        atoms,
655                 rvec**          x,
656                 PbcType*        pbcType,
657                 matrix          box,
658                 bool            bRemoveH,
659                 t_symtab*       symtab,
660                 ResidueTypeMap* rt,
661                 const char*     watres,
662                 AtomProperties* aps,
663                 bool            bVerbose)
664 /* Read a pdb file. (containing proteins) */
665 {
666     int natom, new_natom, i;
667
668     /* READ IT */
669     printf("Reading %s...\n", inf);
670     readConfAndAtoms(inf, symtab, title, atoms, pbcType, x, nullptr, box);
671     natom = atoms->nr;
672     if (atoms->pdbinfo == nullptr)
673     {
674         snew(atoms->pdbinfo, atoms->nr);
675     }
676     if (fn2ftp(inf) == efPDB)
677     {
678         get_pdb_atomnumber(atoms, aps);
679     }
680     if (bRemoveH)
681     {
682         new_natom = 0;
683         for (i = 0; i < atoms->nr; i++)
684         {
685             if (!is_hydrogen(*atoms->atomname[i]))
686             {
687                 atoms->atom[new_natom]     = atoms->atom[i];
688                 atoms->atomname[new_natom] = atoms->atomname[i];
689                 atoms->pdbinfo[new_natom]  = atoms->pdbinfo[i];
690                 copy_rvec((*x)[i], (*x)[new_natom]);
691                 new_natom++;
692             }
693         }
694         atoms->nr = new_natom;
695         natom     = new_natom;
696     }
697
698     printf("Read");
699     if (title[0])
700     {
701         printf(" '%s',", *title);
702     }
703     printf(" %d atoms\n", natom);
704
705     /* Rename residues */
706     rename_pdbres(atoms, "HOH", watres, false, symtab);
707     rename_pdbres(atoms, "SOL", watres, false, symtab);
708     rename_pdbres(atoms, "WAT", watres, false, symtab);
709
710     rename_atoms("xlateat.dat", nullptr, atoms, symtab, {}, true, rt, true, bVerbose);
711
712     if (natom == 0)
713     {
714         return 0;
715     }
716     if (bOutput)
717     {
718         write_sto_conf(outf, *title, atoms, *x, nullptr, *pbcType, box);
719     }
720
721     return natom;
722 }
723
724 void process_chain(const gmx::MDLogger&           logger,
725                    t_atoms*                       pdba,
726                    gmx::ArrayRef<gmx::RVec>       x,
727                    bool                           bTrpU,
728                    bool                           bPheU,
729                    bool                           bTyrU,
730                    bool                           bLysMan,
731                    bool                           bAspMan,
732                    bool                           bGluMan,
733                    bool                           bHisMan,
734                    bool                           bArgMan,
735                    bool                           bGlnMan,
736                    real                           angle,
737                    real                           distance,
738                    t_symtab*                      symtab,
739                    gmx::ArrayRef<const RtpRename> rr)
740 {
741     /* Rename aromatics, lys, asp and histidine */
742     if (bTyrU)
743     {
744         renameResidue(logger, pdba, "TYR", "TYRU", false, symtab);
745     }
746     if (bTrpU)
747     {
748         renameResidue(logger, pdba, "TRP", "TRPU", false, symtab);
749     }
750     if (bPheU)
751     {
752         renameResidue(logger, pdba, "PHE", "PHEU", false, symtab);
753     }
754     if (bLysMan)
755     {
756         renameResidueInteractively(pdba, "LYS", get_lystp, false, symtab, rr);
757     }
758     if (bArgMan)
759     {
760         renameResidueInteractively(pdba, "ARG", get_argtp, false, symtab, rr);
761     }
762     if (bGlnMan)
763     {
764         renameResidueInteractively(pdba, "GLN", get_glntp, false, symtab, rr);
765     }
766     if (bAspMan)
767     {
768         renameResidueInteractively(pdba, "ASP", get_asptp, false, symtab, rr);
769     }
770     else
771     {
772         renameResidue(logger, pdba, "ASPH", "ASP", false, symtab);
773     }
774     if (bGluMan)
775     {
776         renameResidueInteractively(pdba, "GLU", get_glutp, false, symtab, rr);
777     }
778     else
779     {
780         renameResidue(logger, pdba, "GLUH", "GLU", false, symtab);
781     }
782
783     if (!bHisMan)
784     {
785         set_histp(pdba, gmx::as_rvec_array(x.data()), symtab, angle, distance);
786     }
787     else
788     {
789         renameResidueInteractively(pdba, "HIS", get_histp, true, symtab, rr);
790     }
791
792     /* Initialize the rtp builing block names with the residue names
793      * for the residues that have not been processed above.
794      */
795     pdbres_to_gmxrtp(pdba);
796
797     /* Now we have all rtp names set.
798      * The rtp names will conform to Gromacs naming,
799      * unless the input pdb file contained one or more force field specific
800      * rtp names as residue names.
801      */
802 }
803
804 /* struct for sorting the atoms from the pdb file */
805 typedef struct
806 {
807     int  resnr;  /* residue number               */
808     int  j;      /* database order index         */
809     int  index;  /* original atom number         */
810     char anm1;   /* second letter of atom name   */
811     char altloc; /* alternate location indicator */
812 } t_pdbindex;
813
814 bool pdbicomp(const t_pdbindex& a, const t_pdbindex& b)
815 {
816     int d = (a.resnr - b.resnr);
817     if (d == 0)
818     {
819         d = (a.j - b.j);
820         if (d == 0)
821         {
822             d = (a.anm1 - b.anm1);
823             if (d == 0)
824             {
825                 d = (a.altloc - b.altloc);
826             }
827         }
828     }
829     return d < 0;
830 }
831
832 void sort_pdbatoms(gmx::ArrayRef<const PreprocessResidue> restp_chain,
833                    int                                    natoms,
834                    t_atoms**                              pdbaptr,
835                    t_atoms**                              newPdbAtoms,
836                    std::vector<gmx::RVec>*                x,
837                    t_blocka*                              block,
838                    char***                                gnames)
839 {
840     t_atoms*               pdba = *pdbaptr;
841     std::vector<gmx::RVec> xnew;
842     t_pdbindex*            pdbi;
843     char*                  atomnm;
844
845     natoms = pdba->nr;
846     snew(pdbi, natoms);
847
848     for (int i = 0; i < natoms; i++)
849     {
850         atomnm                                  = *pdba->atomname[i];
851         const PreprocessResidue* localPpResidue = &restp_chain[pdba->atom[i].resind];
852         auto                     found =
853                 std::find_if(localPpResidue->atomname.begin(),
854                              localPpResidue->atomname.end(),
855                              [&atomnm](char** it) { return gmx::equalCaseInsensitive(atomnm, *it); });
856         if (found == localPpResidue->atomname.end())
857         {
858             std::string buf = gmx::formatString(
859                     "Atom %s in residue %s %d was not found in rtp entry %s with %d atoms\n"
860                     "while sorting atoms.\n%s",
861                     atomnm,
862                     *pdba->resinfo[pdba->atom[i].resind].name,
863                     pdba->resinfo[pdba->atom[i].resind].nr,
864                     localPpResidue->resname.c_str(),
865                     localPpResidue->natom(),
866                     is_hydrogen(atomnm)
867                             ? "\nFor a hydrogen, this can be a different protonation state, or it\n"
868                               "might have had a different number in the PDB file and was rebuilt\n"
869                               "(it might for instance have been H3, and we only expected H1 & "
870                               "H2).\n"
871                               "Note that hydrogens might have been added to the entry for the "
872                               "N-terminus.\n"
873                               "Remove this hydrogen or choose a different protonation state to "
874                               "solve it.\n"
875                               "Option -ignh will ignore all hydrogens in the input."
876                             : ".");
877             gmx_fatal(FARGS, "%s", buf.c_str());
878         }
879         /* make shadow array to be sorted into indexgroup */
880         pdbi[i].resnr  = pdba->atom[i].resind;
881         pdbi[i].j      = std::distance(localPpResidue->atomname.begin(), found);
882         pdbi[i].index  = i;
883         pdbi[i].anm1   = atomnm[1];
884         pdbi[i].altloc = pdba->pdbinfo[i].altloc;
885     }
886     std::sort(pdbi, pdbi + natoms, pdbicomp);
887
888     /* pdba is sorted in pdbnew using the pdbi index */
889     std::vector<int> a(natoms);
890     srenew(*newPdbAtoms, 1);
891     init_t_atoms((*newPdbAtoms), natoms, true);
892     (*newPdbAtoms)->nr   = pdba->nr;
893     (*newPdbAtoms)->nres = pdba->nres;
894     srenew((*newPdbAtoms)->resinfo, pdba->nres);
895     std::copy(pdba->resinfo, pdba->resinfo + pdba->nres, (*newPdbAtoms)->resinfo);
896     for (int i = 0; i < natoms; i++)
897     {
898         (*newPdbAtoms)->atom[i]     = pdba->atom[pdbi[i].index];
899         (*newPdbAtoms)->atomname[i] = pdba->atomname[pdbi[i].index];
900         (*newPdbAtoms)->pdbinfo[i]  = pdba->pdbinfo[pdbi[i].index];
901         xnew.push_back(x->at(pdbi[i].index));
902         /* make indexgroup in block */
903         a[i] = pdbi[i].index;
904     }
905     /* clean up */
906     done_atom(pdba);
907     sfree(pdba);
908     /* copy the sorted pdbnew back to pdba */
909     *pdbaptr = *newPdbAtoms;
910     *x       = xnew;
911     add_grp(block, gnames, a, "prot_sort");
912     sfree(pdbi);
913 }
914
915 int remove_duplicate_atoms(t_atoms* pdba, gmx::ArrayRef<gmx::RVec> x, bool bVerbose, const gmx::MDLogger& logger)
916 {
917     int        i, j, oldnatoms, ndel;
918     t_resinfo* ri;
919
920     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Checking for duplicate atoms....");
921     oldnatoms = pdba->nr;
922     ndel      = 0;
923     /* NOTE: pdba->nr is modified inside the loop */
924     for (i = 1; (i < pdba->nr); i++)
925     {
926         /* compare 'i' and 'i-1', throw away 'i' if they are identical
927            this is a 'while' because multiple alternate locations can be present */
928         while ((i < pdba->nr) && (pdba->atom[i - 1].resind == pdba->atom[i].resind)
929                && (strcmp(*pdba->atomname[i - 1], *pdba->atomname[i]) == 0))
930         {
931             ndel++;
932             if (bVerbose)
933             {
934                 ri = &pdba->resinfo[pdba->atom[i].resind];
935                 GMX_LOG(logger.info)
936                         .asParagraph()
937                         .appendTextFormatted("deleting duplicate atom %4s  %s%4d%c",
938                                              *pdba->atomname[i],
939                                              *ri->name,
940                                              ri->nr,
941                                              ri->ic);
942                 if (ri->chainid && (ri->chainid != ' '))
943                 {
944                     printf(" ch %c", ri->chainid);
945                 }
946                 if (pdba->pdbinfo)
947                 {
948                     if (pdba->pdbinfo[i].atomnr)
949                     {
950                         printf("  pdb nr %4d", pdba->pdbinfo[i].atomnr);
951                     }
952                     if (pdba->pdbinfo[i].altloc && (pdba->pdbinfo[i].altloc != ' '))
953                     {
954                         printf("  altloc %c", pdba->pdbinfo[i].altloc);
955                     }
956                 }
957                 printf("\n");
958             }
959             pdba->nr--;
960             /* We can not free, since it might be in the symtab */
961             /* sfree(pdba->atomname[i]); */
962             for (j = i; j < pdba->nr; j++)
963             {
964                 pdba->atom[j]     = pdba->atom[j + 1];
965                 pdba->atomname[j] = pdba->atomname[j + 1];
966                 if (pdba->pdbinfo)
967                 {
968                     pdba->pdbinfo[j] = pdba->pdbinfo[j + 1];
969                 }
970                 copy_rvec(x[j + 1], x[j]);
971             }
972             srenew(pdba->atom, pdba->nr);
973             /* srenew(pdba->atomname, pdba->nr); */
974             srenew(pdba->pdbinfo, pdba->nr);
975         }
976     }
977     if (pdba->nr != oldnatoms)
978     {
979         GMX_LOG(logger.info)
980                 .asParagraph()
981                 .appendTextFormatted("Now there are %d atoms. Deleted %d duplicates.", pdba->nr, ndel);
982     }
983
984     return pdba->nr;
985 }
986
987 void checkResidueTypeSanity(t_atoms* pdba, int r0, int r1, ResidueTypeMap* rt)
988 {
989     std::string startResidueString =
990             gmx::formatString("%s%d", *pdba->resinfo[r0].name, pdba->resinfo[r0].nr);
991     std::string endResidueString =
992             gmx::formatString("%s%d", *pdba->resinfo[r1 - 1].name, pdba->resinfo[r1 - 1].nr);
993
994     // Check whether all residues in chain have the same chain ID.
995     bool        allResiduesHaveSameChainID = true;
996     char        chainID0                   = pdba->resinfo[r0].chainid;
997     char        chainID;
998     std::string residueString;
999
1000     for (int i = r0 + 1; i < r1; i++)
1001     {
1002         chainID = pdba->resinfo[i].chainid;
1003         if (chainID != chainID0)
1004         {
1005             allResiduesHaveSameChainID = false;
1006             residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1007             break;
1008         }
1009     }
1010
1011     if (!allResiduesHaveSameChainID)
1012     {
1013         gmx_fatal(FARGS,
1014                   "The chain covering the range %s--%s does not have a consistent chain ID. "
1015                   "The first residue has ID '%c', while residue %s has ID '%c'.",
1016                   startResidueString.c_str(),
1017                   endResidueString.c_str(),
1018                   chainID0,
1019                   residueString.c_str(),
1020                   chainID);
1021     }
1022
1023     // At this point all residues have the same ID. If they are also non-blank
1024     // we can be a bit more aggressive and require the types match too.
1025     if (chainID0 != ' ')
1026     {
1027         bool        allResiduesHaveSameType = true;
1028         std::string restype;
1029         std::string restype0 = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[r0].name);
1030
1031         for (int i = r0 + 1; i < r1; i++)
1032         {
1033             restype = rt->typeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1034             if (!gmx::equalCaseInsensitive(restype, restype0))
1035             {
1036                 allResiduesHaveSameType = false;
1037                 residueString = gmx::formatString("%s%d", *pdba->resinfo[i].name, pdba->resinfo[i].nr);
1038                 break;
1039             }
1040         }
1041
1042         if (!allResiduesHaveSameType)
1043         {
1044             gmx_fatal(FARGS,
1045                       "The residues in the chain %s--%s do not have a consistent type. "
1046                       "The first residue has type '%s', while residue %s is of type '%s'. "
1047                       "Either there is a mistake in your chain, or it includes nonstandard "
1048                       "residue names that have not yet been added to the residuetypes.dat "
1049                       "file in the GROMACS library directory. If there are other molecules "
1050                       "such as ligands, they should not have the same chain ID as the "
1051                       "adjacent protein chain since it's a separate molecule.",
1052                       startResidueString.c_str(),
1053                       endResidueString.c_str(),
1054                       restype0.c_str(),
1055                       residueString.c_str(),
1056                       restype.c_str());
1057         }
1058     }
1059 }
1060
1061 void find_nc_ter(t_atoms* pdba, int r0, int r1, int* r_start, int* r_end, ResidueTypeMap* rt, const gmx::MDLogger& logger)
1062 {
1063     int                        i;
1064     std::optional<std::string> startrestype;
1065
1066     *r_start = -1;
1067     *r_end   = -1;
1068
1069     int startWarnings = 0;
1070     int endWarnings   = 0;
1071     int ionNotes      = 0;
1072
1073     // Check that all residues have the same chain identifier, and if it is
1074     // non-blank we also require the residue types to match.
1075     checkResidueTypeSanity(pdba, r0, r1, rt);
1076
1077     // If we return correctly from checkResidueTypeSanity(), the only
1078     // remaining cases where we can have non-matching residue types is if
1079     // the chain ID was blank, which could be the case e.g. for a structure
1080     // read from a GRO file or other file types without chain information.
1081     // In that case we need to be a bit more liberal and detect chains based
1082     // on the residue type.
1083
1084     // If we get here, the chain ID must be identical for all residues
1085     char chainID = pdba->resinfo[r0].chainid;
1086
1087     /* Find the starting terminus (typially N or 5') */
1088     for (i = r0; i < r1 && *r_start == -1; i++)
1089     {
1090         startrestype = rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1091         if (!startrestype)
1092         {
1093             continue;
1094         }
1095         if (gmx::equalCaseInsensitive(*startrestype, "Protein")
1096             || gmx::equalCaseInsensitive(*startrestype, "DNA")
1097             || gmx::equalCaseInsensitive(*startrestype, "RNA"))
1098         {
1099             GMX_LOG(logger.info)
1100                     .asParagraph()
1101                     .appendTextFormatted("Identified residue %s%d as a starting terminus.",
1102                                          *pdba->resinfo[i].name,
1103                                          pdba->resinfo[i].nr);
1104             *r_start = i;
1105         }
1106         else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1107         {
1108             if (ionNotes < 5)
1109             {
1110                 GMX_LOG(logger.info)
1111                         .asParagraph()
1112                         .appendTextFormatted(
1113                                 "Residue %s%d has type 'Ion', assuming it is not linked into a "
1114                                 "chain.",
1115                                 *pdba->resinfo[i].name,
1116                                 pdba->resinfo[i].nr);
1117             }
1118             if (ionNotes == 4)
1119             {
1120                 GMX_LOG(logger.info)
1121                         .asParagraph()
1122                         .appendTextFormatted("Disabling further notes about ions.");
1123             }
1124             ionNotes++;
1125         }
1126         else
1127         {
1128             // Either no known residue type, or one not needing special handling
1129             if (startWarnings < 5)
1130             {
1131                 if (chainID == ' ')
1132                 {
1133                     GMX_LOG(logger.warning)
1134                             .asParagraph()
1135                             .appendTextFormatted(
1136                                     "Starting residue %s%d in chain not identified as "
1137                                     "Protein/RNA/DNA. "
1138                                     "This chain lacks identifiers, which makes it impossible to do "
1139                                     "strict "
1140                                     "classification of the start/end residues. Here we need to "
1141                                     "guess this residue "
1142                                     "should not be part of the chain and instead introduce a "
1143                                     "break, but that will "
1144                                     "be catastrophic if they should in fact be linked. Please "
1145                                     "check your structure, "
1146                                     "and add %s to residuetypes.dat if this was not correct.",
1147                                     *pdba->resinfo[i].name,
1148                                     pdba->resinfo[i].nr,
1149                                     *pdba->resinfo[i].name);
1150                 }
1151                 else
1152                 {
1153                     GMX_LOG(logger.warning)
1154                             .asParagraph()
1155                             .appendTextFormatted(
1156                                     "No residues in chain starting at %s%d identified as "
1157                                     "Protein/RNA/DNA. "
1158                                     "This makes it impossible to link them into a molecule, which "
1159                                     "could either be "
1160                                     "correct or a catastrophic error. Please check your structure, "
1161                                     "and add all "
1162                                     "necessary residue names to residuetypes.dat if this was not "
1163                                     "correct.",
1164                                     *pdba->resinfo[i].name,
1165                                     pdba->resinfo[i].nr);
1166                 }
1167             }
1168             if (startWarnings == 4)
1169             {
1170                 GMX_LOG(logger.warning)
1171                         .asParagraph()
1172                         .appendTextFormatted(
1173                                 "Disabling further warnings about unidentified residues at start "
1174                                 "of chain.");
1175             }
1176             startWarnings++;
1177         }
1178     }
1179
1180     if (*r_start >= 0)
1181     {
1182         /* Go through the rest of the residues, check that they are the same class, and identify the ending terminus. */
1183         for (int i = *r_start; i < r1; i++)
1184         {
1185             std::optional<std::string> restype =
1186                     rt->optionalTypeOfNamedDatabaseResidue(*pdba->resinfo[i].name);
1187             if (!restype)
1188             {
1189                 continue;
1190             }
1191             if (gmx::equalCaseInsensitive(*restype, *startrestype) && endWarnings == 0)
1192             {
1193                 *r_end = i;
1194             }
1195             else if (gmx::equalCaseInsensitive(*startrestype, "Ion"))
1196             {
1197                 if (ionNotes < 5)
1198                 {
1199                     GMX_LOG(logger.info)
1200                             .asParagraph()
1201                             .appendTextFormatted(
1202                                     "Residue %s%d has type 'Ion', assuming it is not linked into a "
1203                                     "chain.",
1204                                     *pdba->resinfo[i].name,
1205                                     pdba->resinfo[i].nr);
1206                 }
1207                 if (ionNotes == 4)
1208                 {
1209                     GMX_LOG(logger.info)
1210                             .asParagraph()
1211                             .appendTextFormatted("Disabling further notes about ions.");
1212                 }
1213                 ionNotes++;
1214             }
1215             else
1216             {
1217                 // Either no known residue type, or one not needing special handling.
1218                 GMX_RELEASE_ASSERT(chainID == ' ', "Chain ID must be blank");
1219                 // Otherwise the call to checkResidueTypeSanity() will
1220                 // have caught the problem.
1221                 if (endWarnings < 5)
1222                 {
1223                     GMX_LOG(logger.warning)
1224                             .asParagraph()
1225                             .appendTextFormatted(
1226                                     "Residue %s%d in chain has different type ('%s') from "
1227                                     "residue %s%d ('%s'). This chain lacks identifiers, which "
1228                                     "makes "
1229                                     "it impossible to do strict classification of the start/end "
1230                                     "residues. Here we "
1231                                     "need to guess this residue should not be part of the chain "
1232                                     "and instead "
1233                                     "introduce a break, but that will be catastrophic if they "
1234                                     "should in fact be "
1235                                     "linked. Please check your structure, and add %s to "
1236                                     "residuetypes.dat "
1237                                     "if this was not correct.",
1238                                     *pdba->resinfo[i].name,
1239                                     pdba->resinfo[i].nr,
1240                                     restype->c_str(),
1241                                     *pdba->resinfo[*r_start].name,
1242                                     pdba->resinfo[*r_start].nr,
1243                                     startrestype->c_str(),
1244                                     *pdba->resinfo[i].name);
1245                 }
1246                 if (endWarnings == 4)
1247                 {
1248                     GMX_LOG(logger.warning)
1249                             .asParagraph()
1250                             .appendTextFormatted(
1251                                     "Disabling further warnings about unidentified residues at end "
1252                                     "of chain.");
1253                 }
1254                 endWarnings++;
1255             }
1256         }
1257     }
1258
1259     if (*r_end >= 0)
1260     {
1261         GMX_LOG(logger.info)
1262                 .asParagraph()
1263                 .appendTextFormatted("Identified residue %s%d as a ending terminus.",
1264                                      *pdba->resinfo[*r_end].name,
1265                                      pdba->resinfo[*r_end].nr);
1266     }
1267 }
1268
1269 enum class ChainSeparationType : int
1270 {
1271     IdOrTer,
1272     IdAndTer,
1273     Ter,
1274     Id,
1275     Interactive,
1276     Count
1277 };
1278 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNames = {
1279     { "id_or_ter", "id_and_ter", "ter", "id", "interactive" }
1280 };
1281 const gmx::EnumerationArray<ChainSeparationType, const char*> c_chainSeparationTypeNotificationMessages = {
1282     { "Splitting chemical chains based on TER records or chain id changing.\n",
1283       "Splitting chemical chains based on TER records and chain id changing.\n",
1284       "Splitting chemical chains based on TER records only (ignoring chain id).\n",
1285       "Splitting chemical chains based on changing chain id only (ignoring TER records).\n",
1286       "Splitting chemical chains interactively.\n" }
1287 };
1288
1289 void modify_chain_numbers(t_atoms* pdba, ChainSeparationType chainSeparation, const gmx::MDLogger& logger)
1290 {
1291     int         i;
1292     char        old_prev_chainid;
1293     char        old_this_chainid;
1294     int         old_prev_chainnum;
1295     int         old_this_chainnum;
1296     t_resinfo*  ri;
1297     char        select[STRLEN];
1298     int         new_chainnum;
1299     int         this_atomnum;
1300     int         prev_atomnum;
1301     const char* prev_atomname;
1302     const char* this_atomname;
1303     const char* prev_resname;
1304     const char* this_resname;
1305     int         prev_resnum;
1306     int         this_resnum;
1307     char        prev_chainid;
1308     char        this_chainid;
1309
1310     /* The default chain enumeration is based on TER records only */
1311     printf("%s", c_chainSeparationTypeNotificationMessages[chainSeparation]);
1312
1313     old_prev_chainid  = '?';
1314     old_prev_chainnum = -1;
1315     new_chainnum      = -1;
1316
1317     this_atomname = nullptr;
1318     this_atomnum  = -1;
1319     this_resname  = nullptr;
1320     this_resnum   = -1;
1321     this_chainid  = '?';
1322
1323     for (i = 0; i < pdba->nres; i++)
1324     {
1325         ri                = &pdba->resinfo[i];
1326         old_this_chainid  = ri->chainid;
1327         old_this_chainnum = ri->chainnum;
1328
1329         prev_atomname = this_atomname;
1330         prev_atomnum  = this_atomnum;
1331         prev_resname  = this_resname;
1332         prev_resnum   = this_resnum;
1333         prev_chainid  = this_chainid;
1334
1335         this_atomname = *(pdba->atomname[i]);
1336         this_atomnum  = (pdba->pdbinfo != nullptr) ? pdba->pdbinfo[i].atomnr : i + 1;
1337         this_resname  = *ri->name;
1338         this_resnum   = ri->nr;
1339         this_chainid  = ri->chainid;
1340
1341         switch (chainSeparation)
1342         {
1343             case ChainSeparationType::IdOrTer:
1344                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1345                 {
1346                     new_chainnum++;
1347                 }
1348                 break;
1349
1350             case ChainSeparationType::IdAndTer:
1351                 if (old_this_chainid != old_prev_chainid && old_this_chainnum != old_prev_chainnum)
1352                 {
1353                     new_chainnum++;
1354                 }
1355                 break;
1356
1357             case ChainSeparationType::Id:
1358                 if (old_this_chainid != old_prev_chainid)
1359                 {
1360                     new_chainnum++;
1361                 }
1362                 break;
1363
1364             case ChainSeparationType::Ter:
1365                 if (old_this_chainnum != old_prev_chainnum)
1366                 {
1367                     new_chainnum++;
1368                 }
1369                 break;
1370             case ChainSeparationType::Interactive:
1371                 if (old_this_chainid != old_prev_chainid || old_this_chainnum != old_prev_chainnum)
1372                 {
1373                     if (i > 0)
1374                     {
1375                         GMX_LOG(logger.info)
1376                                 .asParagraph()
1377                                 .appendTextFormatted(
1378                                         "Split the chain (and introduce termini) between residue %s%d (chain id '%c', atom %d %s)\
1379 "
1380                                         "and residue %s%d (chain id '%c', atom %d %s) ? [n/y]",
1381                                         prev_resname,
1382                                         prev_resnum,
1383                                         prev_chainid,
1384                                         prev_atomnum,
1385                                         prev_atomname,
1386                                         this_resname,
1387                                         this_resnum,
1388                                         this_chainid,
1389                                         this_atomnum,
1390                                         this_atomname);
1391
1392                         if (nullptr == fgets(select, STRLEN - 1, stdin))
1393                         {
1394                             gmx_fatal(FARGS, "Error reading from stdin");
1395                         }
1396                     }
1397                     if (i == 0 || select[0] == 'y')
1398                     {
1399                         new_chainnum++;
1400                     }
1401                 }
1402                 break;
1403             case ChainSeparationType::Count:
1404                 gmx_fatal(FARGS, "Internal inconsistency - this shouldn't happen...");
1405         }
1406         old_prev_chainid  = old_this_chainid;
1407         old_prev_chainnum = old_this_chainnum;
1408
1409         ri->chainnum = new_chainnum;
1410     }
1411 }
1412
1413 bool checkChainCyclicity(t_atoms*                               pdba,
1414                          rvec*                                  pdbx,
1415                          int                                    start_ter,
1416                          int                                    end_ter,
1417                          gmx::ArrayRef<const PreprocessResidue> rtpFFDB,
1418                          gmx::ArrayRef<const RtpRename>         rr,
1419                          real                                   long_bond_dist_,
1420                          real                                   short_bond_dist_)
1421 {
1422     // if start and end are the same, we can't have a cycle
1423     if (start_ter == end_ter)
1424     {
1425         return false;
1426     }
1427     int         ai = -1, aj = -1;
1428     char*       rtpname = *(pdba->resinfo[start_ter].rtp);
1429     std::string newName = search_resrename(rr, rtpname, false, false, false);
1430     if (newName.empty())
1431     {
1432         newName = rtpname;
1433     }
1434     auto        res = getDatabaseEntry(newName, rtpFFDB);
1435     const char *name_ai, *name_aj;
1436
1437     for (const auto& patch : res->rb[BondedTypes::Bonds].b)
1438     { /* Search backward bond for n/5' terminus */
1439         name_ai = patch.ai().c_str();
1440         name_aj = patch.aj().c_str();
1441         if (name_ai[0] == '-')
1442         {
1443             aj = search_res_atom(++name_ai, end_ter, pdba, "check", TRUE);
1444             ai = search_res_atom(name_aj, start_ter, pdba, "check", TRUE);
1445         }
1446         else if (name_aj[0] == '-')
1447         {
1448             aj = search_res_atom(++name_aj, end_ter, pdba, "check", TRUE);
1449             ai = search_res_atom(name_ai, start_ter, pdba, "check", TRUE);
1450         }
1451         if (ai >= 0 && aj >= 0)
1452         {
1453             break; /* Found */
1454         }
1455     }
1456
1457     if (!(ai >= 0 && aj >= 0))
1458     {
1459         rtpname = *(pdba->resinfo[end_ter].rtp);
1460         newName = search_resrename(rr, rtpname, false, false, false);
1461         if (newName.empty())
1462         {
1463             newName = rtpname;
1464         }
1465         res = getDatabaseEntry(newName, rtpFFDB);
1466         for (const auto& patch : res->rb[BondedTypes::Bonds].b)
1467         {
1468             /* Seach forward bond for c/3' terminus */
1469             name_ai = patch.ai().c_str();
1470             name_aj = patch.aj().c_str();
1471
1472             if (name_ai[0] == '+')
1473             {
1474                 ai = search_res_atom(name_aj, end_ter, pdba, "check", TRUE);
1475                 aj = search_res_atom(++name_ai, start_ter, pdba, "check", TRUE);
1476             }
1477             else if (name_aj[0] == '+')
1478             {
1479                 ai = search_res_atom(name_ai, end_ter, pdba, "check", TRUE);
1480                 aj = search_res_atom(++name_aj, start_ter, pdba, "check", TRUE);
1481             }
1482             if (ai >= 0 && aj >= 0)
1483             {
1484                 break;
1485             }
1486         }
1487     }
1488
1489     if (ai >= 0 && aj >= 0)
1490     {
1491         real dist = distance2(pdbx[ai], pdbx[aj]);
1492         /* it is better to read bond length from ffbonded.itp */
1493         return (dist < gmx::square(long_bond_dist_) && dist > gmx::square(short_bond_dist_));
1494     }
1495     else
1496     {
1497         return false;
1498     }
1499 }
1500
1501 struct t_pdbchain
1502 {
1503     char             chainid   = ' ';
1504     int              chainnum  = ' '; // char, but stored as int to make clang-tidy happy
1505     int              start     = -1;
1506     int              natom     = -1;
1507     bool             bAllWat   = false;
1508     int              nterpairs = -1;
1509     std::vector<int> chainstart;
1510 };
1511
1512 struct t_chain
1513 {
1514     char                                chainid   = ' ';
1515     int                                 chainnum  = ' ';
1516     bool                                bAllWat   = false;
1517     int                                 nterpairs = -1;
1518     std::vector<int>                    chainstart;
1519     std::vector<MoleculePatchDatabase*> ntdb;
1520     std::vector<MoleculePatchDatabase*> ctdb;
1521     std::vector<int>                    r_start;
1522     std::vector<int>                    r_end;
1523     t_atoms*                            pdba;
1524     std::vector<gmx::RVec>              x;
1525     std::vector<int>                    cyclicBondsIndex;
1526 };
1527
1528 enum class VSitesType : int
1529 {
1530     None,
1531     Hydrogens,
1532     Aromatics,
1533     Count
1534 };
1535 const gmx::EnumerationArray<VSitesType, const char*> c_vsitesTypeNames = {
1536     { "none", "hydrogens", "aromatics" }
1537 };
1538
1539 enum class WaterType : int
1540 {
1541     Select,
1542     None,
1543     Spc,
1544     SpcE,
1545     Tip3p,
1546     Tip4p,
1547     Tip5p,
1548     Tips3p,
1549     Count
1550 };
1551 const gmx::EnumerationArray<WaterType, const char*> c_waterTypeNames = {
1552     { "select", "none", "spc", "spce", "tip3p", "tip4p", "tip5p", "tips3p" }
1553 };
1554
1555 enum class MergeType : int
1556 {
1557     No,
1558     All,
1559     Interactive,
1560     Count
1561 };
1562 const gmx::EnumerationArray<MergeType, const char*> c_mergeTypeNames = {
1563     { "no", "all", "interactive" }
1564 };
1565
1566 } // namespace
1567
1568 namespace gmx
1569 {
1570
1571 namespace
1572 {
1573
1574 class pdb2gmx : public ICommandLineOptionsModule
1575 {
1576 public:
1577     pdb2gmx() :
1578         bVsites_(FALSE),
1579         bPrevWat_(FALSE),
1580         bVsiteAromatics_(FALSE),
1581         chainSeparation_(ChainSeparationType::IdOrTer),
1582         vsitesType_(VSitesType::None),
1583         waterType_(WaterType::Select),
1584         mergeType_(MergeType::No),
1585         itp_file_(nullptr),
1586         mHmult_(0)
1587     {
1588         gmx::LoggerBuilder builder;
1589         builder.addTargetStream(gmx::MDLogger::LogLevel::Info, &gmx::TextOutputFile::standardOutput());
1590         builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, &gmx::TextOutputFile::standardError());
1591         loggerOwner_ = std::make_unique<LoggerOwner>(builder.build());
1592     }
1593
1594     // From ICommandLineOptionsModule
1595     void init(CommandLineModuleSettings* /*settings*/) override {}
1596
1597     void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
1598
1599     void optionsFinished() override;
1600
1601     int run() override;
1602
1603 private:
1604     bool bNewRTP_;
1605     bool bInter_;
1606     bool bCysMan_;
1607     bool bLysMan_;
1608     bool bAspMan_;
1609     bool bGluMan_;
1610     bool bHisMan_;
1611     bool bGlnMan_;
1612     bool bArgMan_;
1613     bool bTerMan_;
1614     bool bUnA_;
1615     bool bHeavyH_;
1616     bool bSort_;
1617     bool bAllowMissing_;
1618     bool bRemoveH_;
1619     bool bDeuterate_;
1620     bool bVerbose_;
1621     bool bChargeGroups_;
1622     bool bCmap_;
1623     bool bRenumRes_;
1624     bool bRTPresname_;
1625     bool bIndexSet_;
1626     bool bOutputSet_;
1627     bool bVsites_;
1628     bool bWat_;
1629     bool bPrevWat_;
1630     bool bITP_;
1631     bool bVsiteAromatics_;
1632     real angle_;
1633     real distance_;
1634     real posre_fc_;
1635     real long_bond_dist_;
1636     real short_bond_dist_;
1637
1638     std::string indexOutputFile_;
1639     std::string outputFile_;
1640     std::string topologyFile_;
1641     std::string includeTopologyFile_;
1642     std::string outputConfFile_;
1643     std::string inputConfFile_;
1644     std::string outFile_;
1645     std::string ff_;
1646
1647     ChainSeparationType chainSeparation_;
1648     VSitesType          vsitesType_;
1649     WaterType           waterType_;
1650     MergeType           mergeType_;
1651
1652     FILE*                        itp_file_;
1653     char                         forcefield_[STRLEN];
1654     char                         ffdir_[STRLEN];
1655     char*                        ffname_;
1656     char*                        watermodel_;
1657     std::vector<std::string>     incls_;
1658     std::vector<t_mols>          mols_;
1659     real                         mHmult_;
1660     std::unique_ptr<LoggerOwner> loggerOwner_;
1661 };
1662
1663 void pdb2gmx::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
1664 {
1665     const char* desc[] = {
1666         "[THISMODULE] reads a [REF].pdb[ref] (or [REF].gro[ref]) file, reads",
1667         "some database files, adds hydrogens to the molecules and generates",
1668         "coordinates in GROMACS (GROMOS), or optionally [REF].pdb[ref], format",
1669         "and a topology in GROMACS format.",
1670         "These files can subsequently be processed to generate a run input file.",
1671         "[PAR]",
1672         "[THISMODULE] will search for force fields by looking for",
1673         "a [TT]forcefield.itp[tt] file in subdirectories [TT]<forcefield>.ff[tt]",
1674         "of the current working directory and of the GROMACS library directory",
1675         "as inferred from the path of the binary or the [TT]GMXLIB[tt] environment",
1676         "variable.",
1677         "By default the forcefield selection is interactive,",
1678         "but you can use the [TT]-ff[tt] option to specify one of the short names",
1679         "in the list on the command line instead. In that case [THISMODULE] just looks",
1680         "for the corresponding [TT]<forcefield>.ff[tt] directory.",
1681         "[PAR]",
1682         "After choosing a force field, all files will be read only from",
1683         "the corresponding force field directory.",
1684         "If you want to modify or add a residue types, you can copy the force",
1685         "field directory from the GROMACS library directory to your current",
1686         "working directory. If you want to add new protein residue types,",
1687         "you will need to modify [TT]residuetypes.dat[tt] in the library directory",
1688         "or copy the whole library directory to a local directory and set",
1689         "the environment variable [TT]GMXLIB[tt] to the name of that directory.",
1690         "Check Chapter 5 of the manual for more information about file formats.",
1691         "[PAR]",
1692
1693         "Note that a [REF].pdb[ref] file is nothing more than a file format, and it",
1694         "need not necessarily contain a protein structure. Every kind of",
1695         "molecule for which there is support in the database can be converted.",
1696         "If there is no support in the database, you can add it yourself.[PAR]",
1697
1698         "The program has limited intelligence, it reads a number of database",
1699         "files, that allow it to make special bonds (Cys-Cys, Heme-His, etc.),",
1700         "if necessary this can be done manually. The program can prompt the",
1701         "user to select which kind of LYS, ASP, GLU, CYS or HIS residue is",
1702         "desired. For Lys the choice is between neutral (two protons on NZ) or",
1703         "protonated (three protons, default), for Asp and Glu unprotonated",
1704         "(default) or protonated, for His the proton can be either on ND1,",
1705         "on NE2 or on both. By default these selections are done automatically.",
1706         "For His, this is based on an optimal hydrogen bonding",
1707         "conformation. Hydrogen bonds are defined based on a simple geometric",
1708         "criterion, specified by the maximum hydrogen-donor-acceptor angle",
1709         "and donor-acceptor distance, which are set by [TT]-angle[tt] and",
1710         "[TT]-dist[tt] respectively.[PAR]",
1711
1712         "The protonation state of N- and C-termini can be chosen interactively",
1713         "with the [TT]-ter[tt] flag.  Default termini are ionized (NH3+ and COO-),",
1714         "respectively.  Some force fields support zwitterionic forms for chains of",
1715         "one residue, but for polypeptides these options should NOT be selected.",
1716         "The AMBER force fields have unique forms for the terminal residues,",
1717         "and these are incompatible with the [TT]-ter[tt] mechanism. You need",
1718         "to prefix your N- or C-terminal residue names with \"N\" or \"C\"",
1719         "respectively to use these forms, making sure you preserve the format",
1720         "of the coordinate file. Alternatively, use named terminating residues",
1721         "(e.g. ACE, NME).[PAR]",
1722
1723         "The separation of chains is not entirely trivial since the markup",
1724         "in user-generated PDB files frequently varies and sometimes it",
1725         "is desirable to merge entries across a TER record, for instance",
1726         "if you want a disulfide bridge or distance restraints between",
1727         "two protein chains or if you have a HEME group bound to a protein.",
1728         "In such cases multiple chains should be contained in a single",
1729         "[TT]moleculetype[tt] definition.",
1730         "To handle this, [THISMODULE] uses two separate options.",
1731         "First, [TT]-chainsep[tt] allows you to choose when a new chemical chain should",
1732         "start, and termini added when applicable. This can be done based on the",
1733         "existence of TER records, when the chain id changes, or combinations of either",
1734         "or both of these. You can also do the selection fully interactively.",
1735         "In addition, there is a [TT]-merge[tt] option that controls how multiple chains",
1736         "are merged into one moleculetype, after adding all the chemical termini (or not).",
1737         "This can be turned off (no merging), all non-water chains can be merged into a",
1738         "single molecule, or the selection can be done interactively.[PAR]",
1739
1740         "[THISMODULE] will also check the occupancy field of the [REF].pdb[ref] file.",
1741         "If any of the occupancies are not one, indicating that the atom is",
1742         "not resolved well in the structure, a warning message is issued.",
1743         "When a [REF].pdb[ref] file does not originate from an X-ray structure determination",
1744         "all occupancy fields may be zero. Either way, it is up to the user",
1745         "to verify the correctness of the input data (read the article!).[PAR]",
1746
1747         "During processing the atoms will be reordered according to GROMACS",
1748         "conventions. With [TT]-n[tt] an index file can be generated that",
1749         "contains one group reordered in the same way. This allows you to",
1750         "convert a GROMOS trajectory and coordinate file to GROMOS. There is",
1751         "one limitation: reordering is done after the hydrogens are stripped",
1752         "from the input and before new hydrogens are added. This means that",
1753         "you should not use [TT]-ignh[tt].[PAR]",
1754
1755         "The [REF].gro[ref] and [TT].g96[tt] file formats do not support chain",
1756         "identifiers. Therefore it is useful to enter a [REF].pdb[ref] file name at",
1757         "the [TT]-o[tt] option when you want to convert a multi-chain [REF].pdb[ref] file.",
1758         "[PAR]",
1759
1760         "The option [TT]-vsite[tt] removes hydrogen and fast improper dihedral",
1761         "motions. Angular and out-of-plane motions can be removed by changing",
1762         "hydrogens into virtual sites and fixing angles, which fixes their",
1763         "position relative to neighboring atoms. Additionally, all atoms in the",
1764         "aromatic rings of the standard amino acids (i.e. PHE, TRP, TYR and HIS)",
1765         "can be converted into virtual sites, eliminating the fast improper dihedral",
1766         "fluctuations in these rings (but this feature is deprecated).",
1767         "[BB]Note[bb] that in this case all other hydrogen",
1768         "atoms are also converted to virtual sites. The mass of all atoms that are",
1769         "converted into virtual sites, is added to the heavy atoms.[PAR]",
1770         "Also slowing down of dihedral motion can be done with [TT]-heavyh[tt]",
1771         "done by increasing the hydrogen-mass by a factor of 4. This is also",
1772         "done for water hydrogens to slow down the rotational motion of water.",
1773         "The increase in mass of the hydrogens is subtracted from the bonded",
1774         "(heavy) atom so that the total mass of the system remains the same.",
1775
1776         "As a special case, ring-closed (or cyclic) molecules are considered.",
1777         "[THISMODULE] automatically determines if a cyclic molecule is present",
1778         "by evaluating the distance between the terminal atoms of a given chain.",
1779         "If this distance is greater than the [TT]-sb[tt]",
1780         "(\"Short bond warning distance\", default 0.05 nm)",
1781         "and less than the [TT]-lb[tt] (\"Long bond warning distance\", default 0.25 nm)",
1782         "the molecule is considered to be ring closed and will be processed as such.",
1783         "Please note that this does not detect cyclic bonds over periodic boundaries."
1784     };
1785
1786     settings->setHelpText(desc);
1787
1788     options->addOption(BooleanOption("newrtp").store(&bNewRTP_).defaultValue(false).hidden().description(
1789             "Write the residue database in new format to [TT]new.rtp[tt]"));
1790     options->addOption(
1791             RealOption("lb").store(&long_bond_dist_).defaultValue(0.25).hidden().description("Long bond warning distance"));
1792     options->addOption(
1793             RealOption("sb").store(&short_bond_dist_).defaultValue(0.05).hidden().description("Short bond warning distance"));
1794     options->addOption(EnumOption<ChainSeparationType>("chainsep")
1795                                .enumValue(c_chainSeparationTypeNames)
1796                                .store(&chainSeparation_)
1797                                .description("Condition in PDB files when a new chain should be "
1798                                             "started (adding termini)"));
1799     options->addOption(EnumOption<MergeType>("merge")
1800                                .enumValue(c_mergeTypeNames)
1801                                .store(&mergeType_)
1802                                .description("Merge multiple chains into a single [moleculetype]"));
1803     options->addOption(StringOption("ff").store(&ff_).defaultValue("select").description(
1804             "Force field, interactive by default. Use [TT]-h[tt] for information."));
1805     options->addOption(
1806             EnumOption<WaterType>("water").store(&waterType_).enumValue(c_waterTypeNames).description("Water model to use"));
1807     options->addOption(BooleanOption("inter").store(&bInter_).defaultValue(false).description(
1808             "Set the next 8 options to interactive"));
1809     options->addOption(BooleanOption("ss").store(&bCysMan_).defaultValue(false).description(
1810             "Interactive SS bridge selection"));
1811     options->addOption(BooleanOption("ter").store(&bTerMan_).defaultValue(false).description(
1812             "Interactive termini selection, instead of charged (default)"));
1813     options->addOption(BooleanOption("lys").store(&bLysMan_).defaultValue(false).description(
1814             "Interactive lysine selection, instead of charged"));
1815     options->addOption(BooleanOption("arg").store(&bArgMan_).defaultValue(false).description(
1816             "Interactive arginine selection, instead of charged"));
1817     options->addOption(BooleanOption("asp").store(&bAspMan_).defaultValue(false).description(
1818             "Interactive aspartic acid selection, instead of charged"));
1819     options->addOption(BooleanOption("glu").store(&bGluMan_).defaultValue(false).description(
1820             "Interactive glutamic acid selection, instead of charged"));
1821     options->addOption(BooleanOption("gln").store(&bGlnMan_).defaultValue(false).description(
1822             "Interactive glutamine selection, instead of charged"));
1823     options->addOption(BooleanOption("his").store(&bHisMan_).defaultValue(false).description(
1824             "Interactive histidine selection, instead of checking H-bonds"));
1825     options->addOption(RealOption("angle").store(&angle_).defaultValue(135.0).description(
1826             "Minimum hydrogen-donor-acceptor angle for a H-bond (degrees)"));
1827     options->addOption(
1828             RealOption("dist").store(&distance_).defaultValue(0.3).description("Maximum donor-acceptor distance for a H-bond (nm)"));
1829     options->addOption(BooleanOption("una").store(&bUnA_).defaultValue(false).description(
1830             "Select aromatic rings with united CH atoms on phenylalanine, tryptophane and "
1831             "tyrosine"));
1832     options->addOption(BooleanOption("sort").store(&bSort_).defaultValue(true).hidden().description(
1833             "Sort the residues according to database, turning this off is dangerous as charge "
1834             "groups might be broken in parts"));
1835     options->addOption(
1836             BooleanOption("ignh").store(&bRemoveH_).defaultValue(false).description("Ignore hydrogen atoms that are in the coordinate file"));
1837     options->addOption(
1838             BooleanOption("missing")
1839                     .store(&bAllowMissing_)
1840                     .defaultValue(false)
1841                     .description(
1842                             "Continue when atoms are missing and bonds cannot be made, dangerous"));
1843     options->addOption(
1844             BooleanOption("v").store(&bVerbose_).defaultValue(false).description("Be slightly more verbose in messages"));
1845     options->addOption(
1846             RealOption("posrefc").store(&posre_fc_).defaultValue(1000).description("Force constant for position restraints"));
1847     options->addOption(EnumOption<VSitesType>("vsite")
1848                                .store(&vsitesType_)
1849                                .enumValue(c_vsitesTypeNames)
1850                                .description("Convert atoms to virtual sites"));
1851     options->addOption(BooleanOption("heavyh").store(&bHeavyH_).defaultValue(false).description(
1852             "Make hydrogen atoms heavy"));
1853     options->addOption(
1854             BooleanOption("deuterate").store(&bDeuterate_).defaultValue(false).description("Change the mass of hydrogens to 2 amu"));
1855     options->addOption(BooleanOption("chargegrp")
1856                                .store(&bChargeGroups_)
1857                                .defaultValue(true)
1858                                .description("Use charge groups in the [REF].rtp[ref] file"));
1859     options->addOption(BooleanOption("cmap").store(&bCmap_).defaultValue(true).description(
1860             "Use cmap torsions (if enabled in the [REF].rtp[ref] file)"));
1861     options->addOption(BooleanOption("renum")
1862                                .store(&bRenumRes_)
1863                                .defaultValue(false)
1864                                .description("Renumber the residues consecutively in the output"));
1865     options->addOption(BooleanOption("rtpres")
1866                                .store(&bRTPresname_)
1867                                .defaultValue(false)
1868                                .description("Use [REF].rtp[ref] entry names as residue names"));
1869     options->addOption(FileNameOption("f")
1870                                .legacyType(efSTX)
1871                                .inputFile()
1872                                .store(&inputConfFile_)
1873                                .required()
1874                                .defaultBasename("protein")
1875                                .defaultType(efPDB)
1876                                .description("Structure file"));
1877     options->addOption(FileNameOption("o")
1878                                .legacyType(efSTO)
1879                                .outputFile()
1880                                .store(&outputConfFile_)
1881                                .required()
1882                                .defaultBasename("conf")
1883                                .description("Structure file"));
1884     options->addOption(FileNameOption("p")
1885                                .legacyType(efTOP)
1886                                .outputFile()
1887                                .store(&topologyFile_)
1888                                .required()
1889                                .defaultBasename("topol")
1890                                .description("Topology file"));
1891     options->addOption(FileNameOption("i")
1892                                .legacyType(efITP)
1893                                .outputFile()
1894                                .store(&includeTopologyFile_)
1895                                .required()
1896                                .defaultBasename("posre")
1897                                .description("Include file for topology"));
1898     options->addOption(FileNameOption("n")
1899                                .legacyType(efNDX)
1900                                .outputFile()
1901                                .store(&indexOutputFile_)
1902                                .storeIsSet(&bIndexSet_)
1903                                .defaultBasename("index")
1904                                .description("Index file"));
1905     options->addOption(FileNameOption("q")
1906                                .legacyType(efSTO)
1907                                .outputFile()
1908                                .store(&outFile_)
1909                                .storeIsSet(&bOutputSet_)
1910                                .defaultBasename("clean")
1911                                .defaultType(efPDB)
1912                                .description("Structure file"));
1913 }
1914
1915 void pdb2gmx::optionsFinished()
1916 {
1917     if (inputConfFile_.empty())
1918     {
1919         GMX_THROW(InconsistentInputError("You must supply an input file"));
1920     }
1921     if (bInter_)
1922     {
1923         /* if anything changes here, also change description of -inter */
1924         bCysMan_ = true;
1925         bTerMan_ = true;
1926         bLysMan_ = true;
1927         bArgMan_ = true;
1928         bAspMan_ = true;
1929         bGluMan_ = true;
1930         bGlnMan_ = true;
1931         bHisMan_ = true;
1932     }
1933
1934     if (bHeavyH_)
1935     {
1936         mHmult_ = 4.0;
1937     }
1938     else if (bDeuterate_)
1939     {
1940         mHmult_ = 2.0;
1941     }
1942     else
1943     {
1944         mHmult_ = 1.0;
1945     }
1946
1947     /* Force field selection, interactive or direct */
1948     choose_ff(strcmp(ff_.c_str(), "select") == 0 ? nullptr : ff_.c_str(),
1949               forcefield_,
1950               sizeof(forcefield_),
1951               ffdir_,
1952               sizeof(ffdir_),
1953               loggerOwner_->logger());
1954
1955     if (strlen(forcefield_) > 0)
1956     {
1957         ffname_    = forcefield_;
1958         ffname_[0] = std::toupper(ffname_[0]);
1959     }
1960     else
1961     {
1962         gmx_fatal(FARGS, "Empty forcefield string");
1963     }
1964 }
1965
1966 int pdb2gmx::run()
1967 {
1968     char                       select[STRLEN];
1969     std::vector<DisulfideBond> ssbonds;
1970
1971     int         this_atomnum;
1972     int         prev_atomnum;
1973     const char* prev_atomname;
1974     const char* this_atomname;
1975     const char* prev_resname;
1976     const char* this_resname;
1977     int         prev_resnum;
1978     int         this_resnum;
1979     char        prev_chainid;
1980     char        this_chainid;
1981     int         prev_chainnumber;
1982     int         this_chainnumber;
1983     int         this_chainstart;
1984     int         prev_chainstart;
1985
1986     const gmx::MDLogger& logger = loggerOwner_->logger();
1987
1988     GMX_LOG(logger.info)
1989             .asParagraph()
1990             .appendTextFormatted("Using the %s force field in directory %s", ffname_, ffdir_);
1991
1992     choose_watermodel(c_waterTypeNames[waterType_], ffdir_, &watermodel_, logger);
1993
1994     switch (vsitesType_)
1995     {
1996         case VSitesType::None:
1997             bVsites_         = false;
1998             bVsiteAromatics_ = false;
1999             break;
2000         case VSitesType::Hydrogens:
2001             bVsites_         = true;
2002             bVsiteAromatics_ = false;
2003             break;
2004         case VSitesType::Aromatics:
2005             bVsites_         = true;
2006             bVsiteAromatics_ = true;
2007             break;
2008         case VSitesType::Count:
2009             gmx_fatal(FARGS, "Internal inconsistency: VSitesType is out of range.");
2010     } /* end switch */
2011
2012     /* Open the symbol table */
2013     t_symtab symtab;
2014     open_symtab(&symtab);
2015
2016     /* Residue type database */
2017     ResidueTypeMap rt;
2018
2019     /* Read residue renaming database(s), if present */
2020     std::vector<std::string> rrn = fflib_search_file_end(ffdir_, ".r2b", FALSE);
2021
2022     std::vector<RtpRename> rtprename;
2023     for (const auto& filename : rrn)
2024     {
2025         GMX_LOG(logger.info).asParagraph().appendTextFormatted("going to rename %s", filename.c_str());
2026         FILE* fp = fflib_open(filename);
2027         read_rtprename(filename.c_str(), fp, &rtprename);
2028         gmx_ffclose(fp);
2029     }
2030
2031     /* Add all alternative names from the residue renaming database to the list
2032        of recognized amino/nucleic acids. */
2033     for (const auto& rename : rtprename)
2034     {
2035         /* Only add names if the 'standard' gromacs/iupac base name was found */
2036         if (auto restype = rt.optionalTypeOfNamedDatabaseResidue(rename.gmx))
2037         {
2038             rt.addResidue(rename.main, *restype);
2039             rt.addResidue(rename.nter, *restype);
2040             rt.addResidue(rename.cter, *restype);
2041             rt.addResidue(rename.bter, *restype);
2042         }
2043     }
2044
2045     matrix      box;
2046     const char* watres;
2047     clear_mat(box);
2048     if (watermodel_ != nullptr && (strstr(watermodel_, "4p") || strstr(watermodel_, "4P")))
2049     {
2050         watres = "HO4";
2051     }
2052     else if (watermodel_ != nullptr && (strstr(watermodel_, "5p") || strstr(watermodel_, "5P")))
2053     {
2054         watres = "HO5";
2055     }
2056     else
2057     {
2058         watres = "HOH";
2059     }
2060
2061     AtomProperties aps;
2062     char*          title = nullptr;
2063     PbcType        pbcType;
2064     t_atoms        pdba_all;
2065     rvec*          pdbx;
2066     int            natom = read_pdball(inputConfFile_.c_str(),
2067                             bOutputSet_,
2068                             outFile_.c_str(),
2069                             &title,
2070                             &pdba_all,
2071                             &pdbx,
2072                             &pbcType,
2073                             box,
2074                             bRemoveH_,
2075                             &symtab,
2076                             &rt,
2077                             watres,
2078                             &aps,
2079                             bVerbose_);
2080
2081     if (natom == 0)
2082     {
2083         std::string message = formatString("No atoms found in pdb file %s\n", inputConfFile_.c_str());
2084         GMX_THROW(InconsistentInputError(message));
2085     }
2086
2087     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Analyzing pdb file");
2088     int nwaterchain = 0;
2089
2090     modify_chain_numbers(&pdba_all, chainSeparation_, logger);
2091
2092     int nchainmerges = 0;
2093
2094     this_atomname    = nullptr;
2095     this_atomnum     = -1;
2096     this_resname     = nullptr;
2097     this_resnum      = -1;
2098     this_chainid     = '?';
2099     this_chainnumber = -1;
2100     this_chainstart  = 0;
2101     /* Keep the compiler happy */
2102     prev_chainstart = 0;
2103
2104     int                     numChains = 0;
2105     std::vector<t_pdbchain> pdb_ch;
2106
2107     t_resinfo* ri;
2108     bool       bMerged = false;
2109     for (int i = 0; (i < natom); i++)
2110     {
2111         ri = &pdba_all.resinfo[pdba_all.atom[i].resind];
2112
2113         /* TODO this should live in a helper object, and consolidate
2114            that with code in modify_chain_numbers */
2115         prev_atomname    = this_atomname;
2116         prev_atomnum     = this_atomnum;
2117         prev_resname     = this_resname;
2118         prev_resnum      = this_resnum;
2119         prev_chainid     = this_chainid;
2120         prev_chainnumber = this_chainnumber;
2121         if (!bMerged)
2122         {
2123             prev_chainstart = this_chainstart;
2124         }
2125
2126         this_atomname    = *pdba_all.atomname[i];
2127         this_atomnum     = (pdba_all.pdbinfo != nullptr) ? pdba_all.pdbinfo[i].atomnr : i + 1;
2128         this_resname     = *ri->name;
2129         this_resnum      = ri->nr;
2130         this_chainid     = ri->chainid;
2131         this_chainnumber = ri->chainnum;
2132
2133         bWat_ = gmx::equalCaseInsensitive(*ri->name, watres);
2134
2135         if ((i == 0) || (this_chainnumber != prev_chainnumber) || (bWat_ != bPrevWat_))
2136         {
2137             GMX_RELEASE_ASSERT(
2138                     pdba_all.pdbinfo,
2139                     "Must have pdbinfo from reading a PDB file if chain number is changing");
2140             this_chainstart = pdba_all.atom[i].resind;
2141             bMerged         = false;
2142             if (i > 0 && !bWat_)
2143             {
2144                 if (!strncmp(c_mergeTypeNames[mergeType_], "int", 3))
2145                 {
2146                     GMX_LOG(logger.info)
2147                             .asParagraph()
2148                             .appendTextFormatted(
2149                                     "Merge chain ending with residue %s%d (chain id '%c', atom %d "
2150                                     "%s) and chain starting with "
2151                                     "residue %s%d (chain id '%c', atom %d %s) into a single "
2152                                     "moleculetype (keeping termini)? [n/y]",
2153                                     prev_resname,
2154                                     prev_resnum,
2155                                     prev_chainid,
2156                                     prev_atomnum,
2157                                     prev_atomname,
2158                                     this_resname,
2159                                     this_resnum,
2160                                     this_chainid,
2161                                     this_atomnum,
2162                                     this_atomname);
2163
2164                     if (nullptr == fgets(select, STRLEN - 1, stdin))
2165                     {
2166                         gmx_fatal(FARGS, "Error reading from stdin");
2167                     }
2168                     bMerged = (select[0] == 'y');
2169                 }
2170                 else if (!strncmp(c_mergeTypeNames[mergeType_], "all", 3))
2171                 {
2172                     bMerged = true;
2173                 }
2174             }
2175
2176             if (bMerged)
2177             {
2178                 pdb_ch[numChains - 1].chainstart[pdb_ch[numChains - 1].nterpairs] =
2179                         pdba_all.atom[i].resind - prev_chainstart;
2180                 pdb_ch[numChains - 1].nterpairs++;
2181                 pdb_ch[numChains - 1].chainstart.resize(pdb_ch[numChains - 1].nterpairs + 1);
2182                 nchainmerges++;
2183             }
2184             else
2185             {
2186                 /* set natom for previous chain */
2187                 if (numChains > 0)
2188                 {
2189                     pdb_ch[numChains - 1].natom = i - pdb_ch[numChains - 1].start;
2190                 }
2191                 if (bWat_)
2192                 {
2193                     nwaterchain++;
2194                     ri->chainid = ' ';
2195                 }
2196                 /* check if chain identifier was used before */
2197                 for (int j = 0; (j < numChains); j++)
2198                 {
2199                     if (pdb_ch[j].chainid != ' ' && pdb_ch[j].chainid == ri->chainid)
2200                     {
2201                         GMX_LOG(logger.warning)
2202                                 .asParagraph()
2203                                 .appendTextFormatted(
2204                                         "Chain identifier '%c' is used in two non-sequential "
2205                                         "blocks. "
2206                                         "They will be treated as separate chains unless you "
2207                                         "reorder your file.",
2208                                         ri->chainid);
2209                     }
2210                 }
2211                 t_pdbchain newChain;
2212                 newChain.chainid  = ri->chainid;
2213                 newChain.chainnum = ri->chainnum;
2214                 newChain.start    = i;
2215                 newChain.bAllWat  = bWat_;
2216                 if (bWat_)
2217                 {
2218                     newChain.nterpairs = 0;
2219                 }
2220                 else
2221                 {
2222                     newChain.nterpairs = 1;
2223                 }
2224                 newChain.chainstart.resize(newChain.nterpairs + 1);
2225                 /* modified [numChains] to [0] below */
2226                 newChain.chainstart[0] = 0;
2227                 pdb_ch.push_back(newChain);
2228                 numChains++;
2229             }
2230         }
2231         bPrevWat_ = bWat_;
2232     }
2233     pdb_ch.back().natom = natom - pdb_ch.back().start;
2234
2235     /* set all the water blocks at the end of the chain */
2236     std::vector<int> swap_index(numChains);
2237     int              j = 0;
2238     for (int i = 0; i < numChains; i++)
2239     {
2240         if (!pdb_ch[i].bAllWat)
2241         {
2242             swap_index[j] = i;
2243             j++;
2244         }
2245     }
2246     for (int i = 0; i < numChains; i++)
2247     {
2248         if (pdb_ch[i].bAllWat)
2249         {
2250             swap_index[j] = i;
2251             j++;
2252         }
2253     }
2254     if (nwaterchain > 1)
2255     {
2256         GMX_LOG(logger.info)
2257                 .asParagraph()
2258                 .appendTextFormatted("Moved all the water blocks to the end");
2259     }
2260
2261     t_atoms*             pdba;
2262     std::vector<t_chain> chains(numChains);
2263     /* copy pdb data and x for all chains */
2264     for (int i = 0; (i < numChains); i++)
2265     {
2266         int si               = swap_index[i];
2267         chains[i].chainid    = pdb_ch[si].chainid;
2268         chains[i].chainnum   = pdb_ch[si].chainnum;
2269         chains[i].bAllWat    = pdb_ch[si].bAllWat;
2270         chains[i].nterpairs  = pdb_ch[si].nterpairs;
2271         chains[i].chainstart = pdb_ch[si].chainstart;
2272         chains[i].ntdb.clear();
2273         chains[i].ctdb.clear();
2274         chains[i].r_start.resize(pdb_ch[si].nterpairs);
2275         chains[i].r_end.resize(pdb_ch[si].nterpairs);
2276
2277         snew(chains[i].pdba, 1);
2278         init_t_atoms(chains[i].pdba, pdb_ch[si].natom, true);
2279         for (j = 0; j < chains[i].pdba->nr; j++)
2280         {
2281             chains[i].pdba->atom[j]     = pdba_all.atom[pdb_ch[si].start + j];
2282             chains[i].pdba->atomname[j] = put_symtab(&symtab, *pdba_all.atomname[pdb_ch[si].start + j]);
2283             chains[i].pdba->pdbinfo[j] = pdba_all.pdbinfo[pdb_ch[si].start + j];
2284             chains[i].x.emplace_back(pdbx[pdb_ch[si].start + j]);
2285         }
2286         /* Re-index the residues assuming that the indices are continuous */
2287         int k                = chains[i].pdba->atom[0].resind;
2288         int nres             = chains[i].pdba->atom[chains[i].pdba->nr - 1].resind - k + 1;
2289         chains[i].pdba->nres = nres;
2290         for (int j = 0; j < chains[i].pdba->nr; j++)
2291         {
2292             chains[i].pdba->atom[j].resind -= k;
2293         }
2294         srenew(chains[i].pdba->resinfo, nres);
2295         for (int j = 0; j < nres; j++)
2296         {
2297             chains[i].pdba->resinfo[j]      = pdba_all.resinfo[k + j];
2298             chains[i].pdba->resinfo[j].name = put_symtab(&symtab, *pdba_all.resinfo[k + j].name);
2299             /* make all chain identifiers equal to that of the chain */
2300             chains[i].pdba->resinfo[j].chainid = pdb_ch[si].chainid;
2301         }
2302     }
2303
2304     if (nchainmerges > 0)
2305     {
2306         GMX_LOG(logger.info)
2307                 .asParagraph()
2308                 .appendTextFormatted("Merged chains into joint molecule definitions at %d places.",
2309                                      nchainmerges);
2310     }
2311
2312     GMX_LOG(logger.info)
2313             .asParagraph()
2314             .appendTextFormatted(
2315                     "There are %d chains and %d blocks of water and "
2316                     "%d residues with %d atoms",
2317                     numChains - nwaterchain,
2318                     nwaterchain,
2319                     pdba_all.nres,
2320                     natom);
2321
2322     GMX_LOG(logger.info)
2323             .asParagraph()
2324             .appendTextFormatted("  %5s  %4s %6s", "chain", "#res", "#atoms");
2325     for (int i = 0; (i < numChains); i++)
2326     {
2327         GMX_LOG(logger.info)
2328                 .asParagraph()
2329                 .appendTextFormatted("  %d '%c' %5d %6d  %s\n",
2330                                      i + 1,
2331                                      chains[i].chainid ? chains[i].chainid : '-',
2332                                      chains[i].pdba->nres,
2333                                      chains[i].pdba->nr,
2334                                      chains[i].bAllWat ? "(only water)" : "");
2335     }
2336
2337     check_occupancy(&pdba_all, inputConfFile_.c_str(), bVerbose_, logger);
2338
2339     /* Read atomtypes... */
2340     PreprocessingAtomTypes atype = read_atype(ffdir_, &symtab);
2341
2342     /* read residue database */
2343     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Reading residue database... (%s)", forcefield_);
2344     std::vector<std::string>       rtpf = fflib_search_file_end(ffdir_, ".rtp", true);
2345     std::vector<PreprocessResidue> rtpFFDB;
2346     for (const auto& filename : rtpf)
2347     {
2348         readResidueDatabase(filename, &rtpFFDB, &atype, &symtab, logger, false);
2349     }
2350     if (bNewRTP_)
2351     {
2352         /* Not correct with multiple rtp input files with different bonded types */
2353         FILE* fp = gmx_fio_fopen("new.rtp", "w");
2354         print_resall(fp, rtpFFDB, atype);
2355         gmx_fio_fclose(fp);
2356     }
2357
2358     /* read hydrogen database */
2359     std::vector<MoleculePatchDatabase> ah;
2360     read_h_db(ffdir_, &ah);
2361
2362     /* Read Termini database... */
2363     std::vector<MoleculePatchDatabase>  ntdb;
2364     std::vector<MoleculePatchDatabase>  ctdb;
2365     std::vector<MoleculePatchDatabase*> tdblist;
2366     int                                 nNtdb = read_ter_db(ffdir_, 'n', &ntdb, &atype);
2367     int                                 nCtdb = read_ter_db(ffdir_, 'c', &ctdb, &atype);
2368
2369     FILE* top_file = gmx_fio_fopen(topologyFile_.c_str(), "w");
2370
2371     print_top_header(top_file, topologyFile_.c_str(), FALSE, ffdir_, mHmult_);
2372
2373     t_chain*               cc;
2374     std::vector<gmx::RVec> x;
2375     /* new pdb datastructure for sorting. */
2376     t_atoms** sortAtoms  = nullptr;
2377     t_atoms** localAtoms = nullptr;
2378     snew(sortAtoms, numChains);
2379     snew(localAtoms, numChains);
2380     for (int chain = 0; (chain < numChains); chain++)
2381     {
2382         cc = &(chains[chain]);
2383
2384         /* set pdba, natom and nres to the current chain */
2385         pdba     = cc->pdba;
2386         x        = cc->x;
2387         natom    = cc->pdba->nr;
2388         int nres = cc->pdba->nres;
2389
2390         if (cc->chainid && (cc->chainid != ' '))
2391         {
2392             GMX_LOG(logger.info)
2393                     .asParagraph()
2394                     .appendTextFormatted("Processing chain %d '%c' (%d atoms, %d residues)",
2395                                          chain + 1,
2396                                          cc->chainid,
2397                                          natom,
2398                                          nres);
2399         }
2400         else
2401         {
2402             GMX_LOG(logger.info)
2403                     .asParagraph()
2404                     .appendTextFormatted(
2405                             "Processing chain %d (%d atoms, %d residues)", chain + 1, natom, nres);
2406         }
2407
2408         process_chain(logger,
2409                       pdba,
2410                       x,
2411                       bUnA_,
2412                       bUnA_,
2413                       bUnA_,
2414                       bLysMan_,
2415                       bAspMan_,
2416                       bGluMan_,
2417                       bHisMan_,
2418                       bArgMan_,
2419                       bGlnMan_,
2420                       angle_,
2421                       distance_,
2422                       &symtab,
2423                       rtprename);
2424
2425         cc->chainstart[cc->nterpairs] = pdba->nres;
2426         j                             = 0;
2427         for (int i = 0; i < cc->nterpairs; i++)
2428         {
2429             find_nc_ter(
2430                     pdba, cc->chainstart[i], cc->chainstart[i + 1], &(cc->r_start[j]), &(cc->r_end[j]), &rt, logger);
2431             if (cc->r_start[j] >= 0 && cc->r_end[j] >= 0)
2432             {
2433                 if (checkChainCyclicity(
2434                             pdba, pdbx, cc->r_start[j], cc->r_end[j], rtpFFDB, rtprename, long_bond_dist_, short_bond_dist_))
2435                 {
2436                     cc->cyclicBondsIndex.push_back(cc->r_start[j]);
2437                     cc->cyclicBondsIndex.push_back(cc->r_end[j]);
2438                     cc->r_start[j] = -1;
2439                     cc->r_end[j]   = -1;
2440                 }
2441                 else
2442                 {
2443                     j++;
2444                 }
2445             }
2446         }
2447         cc->nterpairs = j;
2448         if (cc->nterpairs == 0 && cc->cyclicBondsIndex.empty())
2449         {
2450             GMX_LOG(logger.info)
2451                     .asParagraph()
2452                     .appendTextFormatted(
2453                             "Problem with chain definition, or missing terminal residues. "
2454                             "This chain does not appear to contain a recognized chain molecule. "
2455                             "If this is incorrect, you can edit residuetypes.dat to modify the "
2456                             "behavior.");
2457         }
2458
2459         /* Check for disulfides and other special bonds */
2460         ssbonds = makeDisulfideBonds(pdba, gmx::as_rvec_array(x.data()), bCysMan_, bVerbose_);
2461
2462         if (!rtprename.empty())
2463         {
2464             rename_resrtp(pdba, cc->nterpairs, cc->r_start, cc->r_end, rtprename, &symtab, bVerbose_, logger);
2465         }
2466
2467         for (int i = 0; i < cc->nterpairs; i++)
2468         {
2469             /* Set termini.
2470              * We first apply a filter so we only have the
2471              * termini that can be applied to the residue in question
2472              * (or a generic terminus if no-residue specific is available).
2473              */
2474             /* First the N terminus */
2475             if (nNtdb > 0)
2476             {
2477                 tdblist = filter_ter(ntdb, *pdba->resinfo[cc->r_start[i]].name);
2478                 if (tdblist.empty())
2479                 {
2480                     GMX_LOG(logger.info)
2481                             .asParagraph()
2482                             .appendTextFormatted(
2483                                     "No suitable end (N or 5') terminus found in database - "
2484                                     "assuming this residue "
2485                                     "is already in a terminus-specific form and skipping terminus "
2486                                     "selection.");
2487                     cc->ntdb.push_back(nullptr);
2488                 }
2489                 else
2490                 {
2491                     if (bTerMan_ && !tdblist.empty())
2492                     {
2493                         sprintf(select,
2494                                 "Select start terminus type for %s-%d",
2495                                 *pdba->resinfo[cc->r_start[i]].name,
2496                                 pdba->resinfo[cc->r_start[i]].nr);
2497                         cc->ntdb.push_back(choose_ter(tdblist, select));
2498                     }
2499                     else
2500                     {
2501                         cc->ntdb.push_back(tdblist[0]);
2502                     }
2503
2504                     printf("Start terminus %s-%d: %s\n",
2505                            *pdba->resinfo[cc->r_start[i]].name,
2506                            pdba->resinfo[cc->r_start[i]].nr,
2507                            (cc->ntdb[i])->name.c_str());
2508                     tdblist.clear();
2509                 }
2510             }
2511             else
2512             {
2513                 cc->ntdb.push_back(nullptr);
2514             }
2515
2516             /* And the C terminus */
2517             if (nCtdb > 0)
2518             {
2519                 tdblist = filter_ter(ctdb, *pdba->resinfo[cc->r_end[i]].name);
2520                 if (tdblist.empty())
2521                 {
2522                     GMX_LOG(logger.info)
2523                             .asParagraph()
2524                             .appendTextFormatted(
2525                                     "No suitable end (C or 3') terminus found in database - "
2526                                     "assuming this residue"
2527                                     "is already in a terminus-specific form and skipping terminus "
2528                                     "selection.");
2529                     cc->ctdb.push_back(nullptr);
2530                 }
2531                 else
2532                 {
2533                     if (bTerMan_ && !tdblist.empty())
2534                     {
2535                         sprintf(select,
2536                                 "Select end terminus type for %s-%d",
2537                                 *pdba->resinfo[cc->r_end[i]].name,
2538                                 pdba->resinfo[cc->r_end[i]].nr);
2539                         cc->ctdb.push_back(choose_ter(tdblist, select));
2540                     }
2541                     else
2542                     {
2543                         cc->ctdb.push_back(tdblist[0]);
2544                     }
2545                     printf("End terminus %s-%d: %s\n",
2546                            *pdba->resinfo[cc->r_end[i]].name,
2547                            pdba->resinfo[cc->r_end[i]].nr,
2548                            (cc->ctdb[i])->name.c_str());
2549                     tdblist.clear();
2550                 }
2551             }
2552             else
2553             {
2554                 cc->ctdb.push_back(nullptr);
2555             }
2556         }
2557         std::vector<MoleculePatchDatabase> hb_chain;
2558         /* lookup hackblocks and rtp for all residues */
2559         std::vector<PreprocessResidue> restp_chain;
2560         get_hackblocks_rtp(&hb_chain,
2561                            &restp_chain,
2562                            rtpFFDB,
2563                            pdba->nres,
2564                            pdba->resinfo,
2565                            cc->nterpairs,
2566                            &symtab,
2567                            cc->ntdb,
2568                            cc->ctdb,
2569                            cc->r_start,
2570                            cc->r_end,
2571                            bAllowMissing_,
2572                            logger);
2573         /* ideally, now we would not need the rtp itself anymore, but do
2574            everything using the hb and restp arrays. Unfortunately, that
2575            requires some re-thinking of code in gen_vsite.c, which I won't
2576            do now :( AF 26-7-99 */
2577
2578         rename_atoms(nullptr, ffdir_, pdba, &symtab, restp_chain, false, &rt, false, bVerbose_);
2579
2580         match_atomnames_with_rtp(restp_chain, hb_chain, pdba, &symtab, x, bVerbose_, logger);
2581
2582         if (bSort_)
2583         {
2584             char**    gnames;
2585             t_blocka* block = new_blocka();
2586             snew(gnames, 1);
2587             sort_pdbatoms(restp_chain, natom, &pdba, &sortAtoms[chain], &x, block, &gnames);
2588             remove_duplicate_atoms(pdba, x, bVerbose_, logger);
2589             if (bIndexSet_)
2590             {
2591                 if (bRemoveH_)
2592                 {
2593                     GMX_LOG(logger.warning)
2594                             .asParagraph()
2595                             .appendTextFormatted(
2596                                     "With the -remh option the generated "
2597                                     "index file (%s) might be useless "
2598                                     "(the index file is generated before hydrogens are added)",
2599                                     indexOutputFile_.c_str());
2600                 }
2601                 write_index(indexOutputFile_.c_str(), block, gnames, false, 0);
2602             }
2603             for (int i = 0; i < block->nr; i++)
2604             {
2605                 sfree(gnames[i]);
2606             }
2607             sfree(gnames);
2608             done_blocka(block);
2609             sfree(block);
2610         }
2611         else
2612         {
2613             GMX_LOG(logger.warning)
2614                     .asParagraph()
2615                     .appendTextFormatted(
2616                             "Without sorting no check for duplicate atoms can be done");
2617         }
2618
2619         /* Generate Hydrogen atoms (and termini) in the sequence */
2620         GMX_LOG(logger.info)
2621                 .asParagraph()
2622                 .appendTextFormatted(
2623                         "Generating any missing hydrogen atoms and/or adding termini.");
2624         add_h(&pdba,
2625               &localAtoms[chain],
2626               &x,
2627               ah,
2628               &symtab,
2629               cc->nterpairs,
2630               cc->ntdb,
2631               cc->ctdb,
2632               cc->r_start,
2633               cc->r_end,
2634               bAllowMissing_,
2635               cc->cyclicBondsIndex);
2636         GMX_LOG(logger.info)
2637                 .asParagraph()
2638                 .appendTextFormatted("Now there are %d residues with %d atoms", pdba->nres, pdba->nr);
2639
2640         /* make up molecule name(s) */
2641
2642         int k = (cc->nterpairs > 0 && cc->r_start[0] >= 0) ? cc->r_start[0] : 0;
2643
2644         std::string restype = rt.typeOfNamedDatabaseResidue(*pdba->resinfo[k].name);
2645
2646         std::string molname;
2647         std::string suffix;
2648         if (cc->bAllWat)
2649         {
2650             molname = "Water";
2651         }
2652         else
2653         {
2654             this_chainid = cc->chainid;
2655
2656             /* Add the chain id if we have one */
2657             if (this_chainid != ' ')
2658             {
2659                 suffix.append(formatString("_chain_%c", this_chainid));
2660             }
2661
2662             /* Check if there have been previous chains with the same id */
2663             int nid_used = 0;
2664             for (int k = 0; k < chain; k++)
2665             {
2666                 if (cc->chainid == chains[k].chainid)
2667                 {
2668                     nid_used++;
2669                 }
2670             }
2671             /* Add the number for this chain identifier if there are multiple copies */
2672             if (nid_used > 0)
2673             {
2674                 suffix.append(formatString("%d", nid_used + 1));
2675             }
2676
2677             if (suffix.length() > 0)
2678             {
2679                 molname.append(restype);
2680                 molname.append(suffix);
2681             }
2682             else
2683             {
2684                 molname = restype;
2685             }
2686         }
2687         std::string itp_fn = topologyFile_;
2688
2689         std::string posre_fn = includeTopologyFile_;
2690         if ((numChains - nwaterchain > 1) && !cc->bAllWat)
2691         {
2692             bITP_ = true;
2693             printf("Chain time...\n");
2694             // construct the itp file name
2695             itp_fn = stripSuffixIfPresent(itp_fn, ".top");
2696             itp_fn.append("_");
2697             itp_fn.append(molname);
2698             itp_fn.append(".itp");
2699             // now do the same for posre
2700             posre_fn = stripSuffixIfPresent(posre_fn, ".itp");
2701             posre_fn.append("_");
2702             posre_fn.append(molname);
2703             posre_fn.append(".itp");
2704             if (posre_fn == itp_fn)
2705             {
2706                 posre_fn = Path::concatenateBeforeExtension(posre_fn, "_pr");
2707             }
2708             incls_.emplace_back();
2709             incls_.back() = itp_fn;
2710             itp_file_     = gmx_fio_fopen(itp_fn.c_str(), "w");
2711         }
2712         else
2713         {
2714             bITP_ = false;
2715         }
2716
2717         mols_.emplace_back();
2718         if (cc->bAllWat)
2719         {
2720             mols_.back().name = "SOL";
2721             mols_.back().nr   = pdba->nres;
2722         }
2723         else
2724         {
2725             mols_.back().name = molname;
2726             mols_.back().nr   = 1;
2727         }
2728
2729         if (bITP_)
2730         {
2731             print_top_comment(itp_file_, itp_fn.c_str(), ffdir_, true);
2732         }
2733
2734         FILE* top_file2;
2735         if (cc->bAllWat)
2736         {
2737             top_file2 = nullptr;
2738         }
2739         else if (bITP_)
2740         {
2741             top_file2 = itp_file_;
2742         }
2743         else
2744         {
2745             top_file2 = top_file;
2746         }
2747
2748         pdb2top(top_file2,
2749                 posre_fn.c_str(),
2750                 molname.c_str(),
2751                 pdba,
2752                 &x,
2753                 &atype,
2754                 &symtab,
2755                 rtpFFDB,
2756                 restp_chain,
2757                 hb_chain,
2758                 bAllowMissing_,
2759                 bVsites_,
2760                 bVsiteAromatics_,
2761                 ffdir_,
2762                 mHmult_,
2763                 ssbonds,
2764                 long_bond_dist_,
2765                 short_bond_dist_,
2766                 bDeuterate_,
2767                 bChargeGroups_,
2768                 bCmap_,
2769                 bRenumRes_,
2770                 bRTPresname_,
2771                 cc->cyclicBondsIndex,
2772                 logger);
2773
2774         if (!cc->bAllWat)
2775         {
2776             write_posres(posre_fn.c_str(), pdba, posre_fc_);
2777         }
2778
2779         if (bITP_)
2780         {
2781             gmx_fio_fclose(itp_file_);
2782         }
2783
2784         /* pdba and natom have been reassigned somewhere so: */
2785         cc->pdba = pdba;
2786         cc->x    = x;
2787     }
2788
2789     if (watermodel_ == nullptr)
2790     {
2791         for (int chain = 0; chain < numChains; chain++)
2792         {
2793             if (chains[chain].bAllWat)
2794             {
2795                 auto message = formatString(
2796                         "You have chosen not to include a water model, "
2797                         "but there is water in the input file. Select a "
2798                         "water model or remove the water from your input file.");
2799                 GMX_THROW(InconsistentInputError(message));
2800             }
2801         }
2802     }
2803     else
2804     {
2805         std::string waterFile = formatString("%s%c%s.itp", ffdir_, DIR_SEPARATOR, watermodel_);
2806         if (!fflib_fexist(waterFile))
2807         {
2808             auto message = formatString(
2809                     "The topology file '%s' for the selected water "
2810                     "model '%s' can not be found in the force field "
2811                     "directory. Select a different water model.",
2812                     waterFile.c_str(),
2813                     watermodel_);
2814             GMX_THROW(InconsistentInputError(message));
2815         }
2816     }
2817
2818     print_top_mols(top_file, title, ffdir_, watermodel_, incls_, mols_);
2819     gmx_fio_fclose(top_file);
2820
2821     /* now merge all chains back together */
2822     natom    = 0;
2823     int nres = 0;
2824     for (int i = 0; (i < numChains); i++)
2825     {
2826         natom += chains[i].pdba->nr;
2827         nres += chains[i].pdba->nres;
2828     }
2829     t_atoms* atoms;
2830     snew(atoms, 1);
2831     init_t_atoms(atoms, natom, false);
2832     for (int i = 0; i < atoms->nres; i++)
2833     {
2834         sfree(atoms->resinfo[i].name);
2835     }
2836     atoms->nres = nres;
2837     srenew(atoms->resinfo, nres);
2838     x.clear();
2839     int k = 0;
2840     int l = 0;
2841     for (int i = 0; (i < numChains); i++)
2842     {
2843         if (numChains > 1)
2844         {
2845             GMX_LOG(logger.info)
2846                     .asParagraph()
2847                     .appendTextFormatted("Including chain %d in system: %d atoms %d residues",
2848                                          i + 1,
2849                                          chains[i].pdba->nr,
2850                                          chains[i].pdba->nres);
2851         }
2852         for (int j = 0; (j < chains[i].pdba->nr); j++)
2853         {
2854             atoms->atom[k] = chains[i].pdba->atom[j];
2855             atoms->atom[k].resind += l; /* l is processed nr of residues */
2856             atoms->atomname[k]                            = chains[i].pdba->atomname[j];
2857             atoms->resinfo[atoms->atom[k].resind].chainid = chains[i].chainid;
2858             x.push_back(chains[i].x[j]);
2859             k++;
2860         }
2861         for (int j = 0; (j < chains[i].pdba->nres); j++)
2862         {
2863             atoms->resinfo[l] = chains[i].pdba->resinfo[j];
2864             if (bRTPresname_)
2865             {
2866                 atoms->resinfo[l].name = atoms->resinfo[l].rtp;
2867             }
2868             l++;
2869         }
2870         done_atom(chains[i].pdba);
2871     }
2872
2873     if (numChains > 1)
2874     {
2875         GMX_LOG(logger.info)
2876                 .asParagraph()
2877                 .appendTextFormatted("Now there are %d atoms and %d residues", k, l);
2878         print_sums(atoms, true, logger);
2879     }
2880
2881     rvec box_space;
2882     GMX_LOG(logger.info).asParagraph().appendTextFormatted("Writing coordinate file...");
2883     clear_rvec(box_space);
2884     if (box[0][0] == 0)
2885     {
2886         make_new_box(atoms->nr, gmx::as_rvec_array(x.data()), box, box_space, false);
2887     }
2888     write_sto_conf(
2889             outputConfFile_.c_str(), title, atoms, gmx::as_rvec_array(x.data()), nullptr, pbcType, box);
2890
2891     done_symtab(&symtab);
2892     done_atom(&pdba_all);
2893     done_atom(atoms);
2894     for (int chain = 0; chain < numChains; chain++)
2895     {
2896         sfree(sortAtoms[chain]);
2897         sfree(localAtoms[chain]);
2898     }
2899     sfree(sortAtoms);
2900     sfree(localAtoms);
2901     sfree(atoms);
2902     sfree(title);
2903     sfree(pdbx);
2904
2905     GMX_LOG(logger.info)
2906             .asParagraph()
2907             .appendTextFormatted("\t\t--------- PLEASE NOTE ------------");
2908     GMX_LOG(logger.info)
2909             .asParagraph()
2910             .appendTextFormatted("You have successfully generated a topology from: %s.",
2911                                  inputConfFile_.c_str());
2912     if (watermodel_ != nullptr)
2913     {
2914         GMX_LOG(logger.info)
2915                 .asParagraph()
2916                 .appendTextFormatted(
2917                         "The %s force field and the %s water model are used.", ffname_, watermodel_);
2918         sfree(watermodel_);
2919     }
2920     else
2921     {
2922         GMX_LOG(logger.info).asParagraph().appendTextFormatted("The %s force field is used.", ffname_);
2923     }
2924     GMX_LOG(logger.info)
2925             .asParagraph()
2926             .appendTextFormatted("\t\t--------- ETON ESAELP ------------");
2927
2928     return 0;
2929 }
2930
2931 } // namespace
2932
2933 const char pdb2gmxInfo::name[] = "pdb2gmx";
2934 const char pdb2gmxInfo::shortDescription[] =
2935         "Convert coordinate files to topology and FF-compliant coordinate files";
2936 ICommandLineOptionsModulePointer pdb2gmxInfo::create()
2937 {
2938     return std::make_unique<pdb2gmx>();
2939 }
2940
2941 } // namespace gmx