From 988c773acd759dbebc9f2a1c1ebca460a4e6f305 Mon Sep 17 00:00:00 2001 From: Paul Bauer Date: Wed, 9 Oct 2019 15:16:58 +0200 Subject: [PATCH] Add validation of tarball builds Allows validation of GROMACS builds against a file stored in the release tarball to give users an indication if the source they are using is actually unmodified from the release version. This only applies to builds from tarballs, as the git version checking already does all of this for us. Hash can be generated by the new cmake target checksum-files, and applies for files in src/ and python_packaging/, if they are included in a whitelist defined in the script. Refs #2128 Change-Id: Ia3bc41c0d0566993166f567409510cf9c9ec2020 --- admin/builds/source-package.py | 3 +- admin/createFileHash.py | 149 ++++++++++++++++++ cmake/VersionInfo.cmake.cmakein | 4 +- cmake/gmxVersionInfo.cmake | 75 +++++++++ docs/install-guide/index.rst | 23 ++- .../2020/major/miscellaneous.rst | 15 ++ src/gromacs/CMakeLists.txt | 2 + .../utility/baseversion-gen.cpp.cmakein | 2 + src/gromacs/utility/baseversion.cpp | 10 ++ src/gromacs/utility/baseversion.h | 16 +- src/gromacs/utility/baseversion_gen.h | 4 + src/gromacs/utility/binaryinformation.cpp | 26 +++ 12 files changed, 322 insertions(+), 7 deletions(-) create mode 100644 admin/createFileHash.py diff --git a/admin/builds/source-package.py b/admin/builds/source-package.py index 6c6322e1f9..58e80fa263 100644 --- a/admin/builds/source-package.py +++ b/admin/builds/source-package.py @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2015,2016,2017, by the GROMACS development team, led by +# Copyright (c) 2015,2016,2017,2019, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. 
@@ -54,6 +54,7 @@ def do_build(context): context.build_target(target='man') context.build_target(target='completion') context.build_target(target='install-guide') + context.build_target(target='checksum-files') context.build_target(target='package_source') diff --git a/admin/createFileHash.py b/admin/createFileHash.py new file mode 100644 index 0000000000..5aedafc96d --- /dev/null +++ b/admin/createFileHash.py @@ -0,0 +1,149 @@ +#! /usr/bin/env python +# This file is part of the GROMACS molecular simulation package. +# +# Copyright (c) 2019, by the GROMACS development team, led by +# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, +# and including many others, as listed in the AUTHORS file in the +# top-level source directory and at http://www.gromacs.org. +# +# GROMACS is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License +# as published by the Free Software Foundation; either version 2.1 +# of the License, or (at your option) any later version. +# +# GROMACS is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with GROMACS; if not, see +# http://www.gnu.org/licenses, or write to the Free Software Foundation, +# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# If you want to redistribute modifications to GROMACS, please +# consider that scientific software is very special. Version +# control is crucial - bugs must be traceable. We will be happy to +# consider code for inclusion in the official distribution, but +# derived work must not be called official GROMACS. Details are found +# in the README & COPYING files - if they are missing, get the +# official version at http://www.gromacs.org. 
+# +# To help us fund GROMACS development, we humbly ask that you cite +# the research papers on the package. Check out http://www.gromacs.org. +import hashlib, hmac, os, stat, sys, re +from re import search + +""" +Calculate hash of files in build tree to allow checking against +stored hashes in case of the tree not being in git (e.g. if the +program is build from a release tarball. + +Based on example script found here: + https://unix.stackexchange.com/a/35847 +""" + +def is_in_whitelist(name): + """Return true if file is white listed to be included in hash calculation.""" + in_whitelist = False + whitelist = ["\.cpp$", "\.h$", "\.cuh$", "\.cu$", "\.clh$", "CMakeList.txt$", "\.cmake$", "\.in$", "\.cmakein$", "\.py$"] + for item in whitelist: + if search(item, name): + in_whitelist = True + break + + return in_whitelist + +def is_blacklisted(name): + """Return if a file has been explicitly blacklisted. + + """ + is_blacklisted = False + blacklist = ["gmx-completion"] + for item in blacklist: + if search(item, name): + is_blacklisted = True + break + + return is_blacklisted + +def file_hash(name): + """Return the hash of the contents of the specified file, as a hex string + + Reads file in chunks of 16384 bytes and calculates the hash of the complete + file afterwards. + The hashing algorithm used is sha256, to avoid accidental clashes when using + a more simple algorithm such as md5. + """ + f = open(name, 'rb') + h = hashlib.sha256() + while True: + buf = f.read(16384) + if len(buf) == 0: break + h.update(buf) + f.close() + return h.hexdigest() + +def traverse(h, path, original_path): + """Recursive function to traverse a file path until a regular file is found. + Walks down the path given as the input and updates the hash function with + information of new files that are found on bottom of the list. + + Information used to calculate the hash are the name and the contents of the file. 
+ Uses both absolute and relative path to make sure only the relative path is used + to calculate the hash. + + Ignores files that are not in the white-list and also skips files that are + explicitly blacklisted. + Other things that are ignored are symlinks and all kinds of special files. + """ + rs = os.lstat(path) + quoted_name = repr(os.path.relpath(path, original_path)) + if stat.S_ISDIR(rs.st_mode): + for entry in sorted(os.listdir(path)): + traverse(h, os.path.join(path, entry), original_path) + elif stat.S_ISREG(rs.st_mode): + # Only test files that actually take part in building GROMACS + if (is_in_whitelist(path) and not is_blacklisted(path)): + fullname = 'reg ' + quoted_name + ' ' + fullname += str(rs.st_size) + ' ' + fullname += file_hash(path) + '\n' + h.update(fullname.encode('utf-8')) + else: pass # silently symlinks and other special files + +def main(): + """Run the hashing script. + + Takes single directory to hash files in. + + """ + import os + import sys + import argparse + + parser = argparse.ArgumentParser(description='Hash all white listed files in a single directory') + parser.add_argument('-s', + '--source-root', + help='Source tree directory, can be specified multiple times to get several directories hashed', + nargs='*', + required=True) + parser.add_argument('-o', + '--output-file', + help='File to write hash to.', + default='hashresult') + + args = parser.parse_args() + + outfile_path = args.output_file + h = hashlib.sha256() + for input_sources in args.source_root: + traverse(h, input_sources, input_sources) + + end = 'end\n' + h.update(end.encode('utf-8')) + outputfile = open(outfile_path, 'w') + outputfile.write(h.hexdigest()) + +if __name__ == '__main__': + main() + diff --git a/cmake/VersionInfo.cmake.cmakein b/cmake/VersionInfo.cmake.cmakein index 4eefad669e..e95494a01b 100644 --- a/cmake/VersionInfo.cmake.cmakein +++ b/cmake/VersionInfo.cmake.cmakein @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation 
package. # -# Copyright (c) 2014,2015, by the GROMACS development team, led by +# Copyright (c) 2014,2015,2019, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -40,3 +40,5 @@ set(GMX_VERSION_STRING "@GMX_VERSION_STRING@") set(GMX_VERSION_STRING_FULL "@GMX_VERSION_STRING_FULL@") set(GMX_VERSION_FULL_HASH "@GMX_VERSION_FULL_HASH@") set(GMX_VERSION_CENTRAL_BASE_HASH "@GMX_VERSION_CENTRAL_BASE_HASH@") +set(GMX_RELEASE_SOURCE_FILE_CHECKSUM "@GMX_RELEASE_SOURCE_FILE_CHECKSUM@") +set(GMX_CURRENT_SOURCE_FILE_CHECKSUM "@GMX_CURRENT_SOURCE_FILE_CHECKSUM@") diff --git a/cmake/gmxVersionInfo.cmake b/cmake/gmxVersionInfo.cmake index ff8a3db586..7ee447aa00 100644 --- a/cmake/gmxVersionInfo.cmake +++ b/cmake/gmxVersionInfo.cmake @@ -330,6 +330,12 @@ set(VERSION_INFO_DEPS ${VERSION_INFO_CMAKE_FILE}) # the function below. set(VERSION_INFO_CMAKEIN_FILE ${CMAKE_CURRENT_LIST_DIR}/VersionInfo.cmake.cmakein) set(VERSION_INFO_CONFIGURE_SCRIPT ${CMAKE_CURRENT_LIST_DIR}/gmxConfigureVersionInfo.cmake) +# A set of directories to scan for calculating the hash of source files. +set(SET_OF_DIRECTORIES_TO_CHECKSUM "${PROJECT_SOURCE_DIR}/src") +list(APPEND SET_OF_DIRECTORIES_TO_CHECKSUM "${PROJECT_SOURCE_DIR}/python_packaging") +# Try to find python for the checksumming script +set(PythonInterp_FIND_QUIETLY ON) +find_package(PythonInterp 3.5) # Rules to create the VersionInfo.cmake file. # For git info, the sequence is: @@ -394,11 +400,80 @@ else() set(GMX_VERSION_STRING_FULL ${GMX_VERSION_STRING}) set(GMX_VERSION_FULL_HASH "") set(GMX_VERSION_CENTRAL_BASE_HASH "") + # To notify the user during compilation and at runtime that the build source + # has not been modified after unpacking the source tarball, the contents are hashed + # to be compared to a hash computed during the release process. 
If the hash matches + # all is fine and the user gets a message in the log file indicating that. + # If either the release hash file is missing, or if the hash does not match + # a different message is printed to indicate that the source has been changed + # compared to the version actually released. This is not needed in case a build + # is done in git, as we have the information there already. + # This is not done if the user has explicitly set an additional custom version string with + # -DGMX_VERSION_STRING_OF_FORK, as this indicates that they know that a custom + # version of GROMACS is in use. + set(RELEASE_CHECKSUM_FILE "${PROJECT_SOURCE_DIR}/src/reference_checksum") + if(NOT GMX_VERSION_STRING_OF_FORK OR "${GMX_VERSION_STRING_OF_FORK}" STREQUAL "") + if(EXISTS ${RELEASE_CHECKSUM_FILE} AND PythonInterp_FOUND) + file(READ ${RELEASE_CHECKSUM_FILE} GMX_RELEASE_SOURCE_FILE_CHECKSUM) + string(STRIP "${GMX_RELEASE_SOURCE_FILE_CHECKSUM}" GMX_RELEASE_SOURCE_FILE_CHECKSUM) + set(CHECKSUM_RESULT_FILE "${CMAKE_CURRENT_BINARY_DIR}/computed_checksum") + execute_process(COMMAND ${PYTHON_EXECUTABLE} + ${PROJECT_SOURCE_DIR}/admin/createFileHash.py + -s ${SET_OF_DIRECTORIES_TO_CHECKSUM} + -o ${CHECKSUM_RESULT_FILE} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + OUTPUT_QUIET) + file(READ ${CHECKSUM_RESULT_FILE} GMX_CURRENT_SOURCE_FILE_CHECKSUM) + string(STRIP "${GMX_CURRENT_SOURCE_FILE_CHECKSUM}" GMX_CURRENT_SOURCE_FILE_CHECKSUM) + if(NOT "${GMX_RELEASE_SOURCE_FILE_CHECKSUM}" STREQUAL "${GMX_CURRENT_SOURCE_FILE_CHECKSUM}") + set(GMX_VERSION_STRING_FULL "${GMX_VERSION_STRING_FULL}_MODIFIED") + message(STATUS "The source code for this GROMACS installation is different from the officially released version.") + endif() + elseif(PythonInterp_FOUND) + set(GMX_VERSION_STRING_FULL "${GMX_VERSION_STRING_FULL}_UNCHECKED") + set(GMX_RELEASE_SOURCE_FILE_CHECKSUM "NoChecksumFile") + set(GMX_CURRENT_SOURCE_FILE_CHECKSUM "NoChecksumFile") + message(WARNING "Could not validate the GROMACS 
source due to missing reference checksum file.") + else() + set(GMX_VERSION_STRING_FULL "${GMX_VERSION_STRING_FULL}_UNCHECKED") + set(GMX_RELEASE_SOURCE_FILE_CHECKSUM "NoPythonAvailable") + set(GMX_CURRENT_SOURCE_FILE_CHECKSUM "NoPythonAvailable") + message(STATUS "Could not calculate checksum of source files without Python") + endif() + endif() configure_file(${VERSION_INFO_CMAKEIN_FILE} ${VERSION_INFO_CMAKE_FILE}) endif() unset(GMX_VERSION_STRING_FULL) unset(GMX_VERSION_FULL_HASH) unset(GMX_VERSION_CENTRAL_BASE_HASH) +unset(GMX_RELEASE_SOURCE_FILE_CHECKSUM) +unset(GMX_CURRENT_SOURCE_FILE_CHECKSUM) + + +# What file the checksum should be written to +set(CHECKSUM_FILE "${PROJECT_SOURCE_DIR}/src/reference_checksum") + +# Target that allows checksumming a source tree when producing a tarball. +# Allows verification of builds from the tarball to make sure the source had +# not been tampered with. +# Note: The RUN_ALWAYS here is to regenerate the hash file only, it does not +# mean that the target is run in all builds +if (PYTHONINTERP_FOUND) + gmx_add_custom_output_target(checksum-files RUN_ALWAYS + OUTPUT ${CHECKSUM_FILE} + COMMAND ${PYTHON_EXECUTABLE} + ${PROJECT_SOURCE_DIR}/admin/createFileHash.py + -s ${SET_OF_DIRECTORIES_TO_CHECKSUM} + -o ${CHECKSUM_FILE} + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + COMMENT "Generating checksum of source files") +else() + add_custom_target(checksum-files + COMMAND ${CMAKE_COMMAND} -E echo + "Can not checksum files without python being available" + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + COMMENT "Generating checksum of source files") +endif() # The main user-visible interface to the machinery. # See documentation at the top of the script. diff --git a/docs/install-guide/index.rst b/docs/install-guide/index.rst index 861846d28d..c0e7238c99 100644 --- a/docs/install-guide/index.rst +++ b/docs/install-guide/index.rst @@ -154,10 +154,6 @@ If you are running on Mac OS X, the best option is the Intel compiler. 
Both clang and gcc will work, but they produce lower performance and each have some shortcomings. clang 3.8 now offers support for OpenMP, and so may provide decent performance. -The CMake variable ``CMAKE_OSX_DEPLOYMENT_TARGET`` influences CMake's -choice of C++ stdlib implementation. Setting to ``10.9`` (default) or -higher is the simplest way to find a compatible compiler and stdlib -implementation. For all non-x86 platforms, your best option is typically to use gcc or the vendor's default or recommended compiler, and check for @@ -1117,6 +1113,25 @@ the performance of |Gromacs|. Until that is ready, we recommend that you try a few different parallelization options, and experiment with tools such as ``gmx tune_pme``. +Validating |Gromacs| for source code modifications +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When building |Gromacs| from a release tarball, the build process automatically +checks whether any file contributing to the build process has been modified since it was +packed into the archive. This results in the marking of the version as either ``MODIFIED`` +(if the source files have been modified) or ``UNCHECKED`` (if no validation was possible, e.g. +if no Python installation was found). The actual checking is performed by comparing a checksum +stored in the release tarball against one generated by the ``createFileHash.py`` Python script +during the build configuration. When running a |Gromacs| binary, the checksum is also printed +in the log file, together with a message if there is a mismatch or no validation has been possible. + +This allows users to check whether the binary they are using was built from source code that is +identical to the source code released by the |Gromacs| team. Thus, unintentional modifications +to the source code for building binaries that are used for running production simulations +are easily detectable. 
Additionally, by manually setting a version tag using the +``GMX_VERSION_STRING_OF_FORK`` CMake option, users can mark modified |Gromacs| release +code with their custom version string suffix. + Having difficulty? ^^^^^^^^^^^^^^^^^^ diff --git a/docs/release-notes/2020/major/miscellaneous.rst b/docs/release-notes/2020/major/miscellaneous.rst index 12a23aa356..a2864e30ec 100644 --- a/docs/release-notes/2020/major/miscellaneous.rst +++ b/docs/release-notes/2020/major/miscellaneous.rst @@ -21,3 +21,18 @@ being used, anybody providing a forked version of |Gromacs| shall set GMX_VERSION_STRING_OF_FORK in the source code (or if necessary when running CMake). It will then appear in the log file and users will know which version and fork of the code produced the result. + +Provide checksum to validate release tarballs +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +Released versions of |Gromacs| will now provide a checksum calculated +from the files participating in building the binaries. When building +|Gromacs| from the tarball, the files will be checksummed again and +compared against the checksum generated during the release build. If the +checksums don't match, the version string is modified to indicate that +the source tree has been modified, and the information is printed in the +log files for the users. If checksumming has not been possible (either due +to missing Python during installation, or because the original checksum file +is missing), this is indicated through a different version string. 
+ +:issue:`2128` + diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt index 70260da57e..2462d1a989 100644 --- a/src/gromacs/CMakeLists.txt +++ b/src/gromacs/CMakeLists.txt @@ -154,6 +154,8 @@ gmx_configure_version_file( REMOTE_HASH EXTRA_VARS GMX_SOURCE_DOI + GMX_RELEASE_HASH + GMX_SOURCE_HASH ) list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE}) diff --git a/src/gromacs/utility/baseversion-gen.cpp.cmakein b/src/gromacs/utility/baseversion-gen.cpp.cmakein index 17daa56644..e9038ce9ad 100644 --- a/src/gromacs/utility/baseversion-gen.cpp.cmakein +++ b/src/gromacs/utility/baseversion-gen.cpp.cmakein @@ -38,3 +38,5 @@ const char _gmx_ver_string[] = "@GMX_VERSION_STRING_FULL@"; const char _gmx_full_git_hash[] = "@GMX_VERSION_FULL_HASH@"; const char _gmx_central_base_hash[] = "@GMX_VERSION_CENTRAL_BASE_HASH@"; const char gmxSourceDoiString[] = "@GMX_SOURCE_DOI@"; +const char gmxReleaseSourceFileChecksum[] = "@GMX_RELEASE_SOURCE_FILE_CHECKSUM@"; +const char gmxCurrentSourceFileChecksum[] = "@GMX_CURRENT_SOURCE_FILE_CHECKSUM@"; diff --git a/src/gromacs/utility/baseversion.cpp b/src/gromacs/utility/baseversion.cpp index ee195b2b3f..23daa6d9a7 100644 --- a/src/gromacs/utility/baseversion.cpp +++ b/src/gromacs/utility/baseversion.cpp @@ -60,6 +60,16 @@ const char *gmxDOI() return gmxSourceDoiString; } +const char *gmxReleaseSourceChecksum() +{ + return gmxReleaseSourceFileChecksum; +} + +const char *gmxCurrentSourceChecksum() +{ + return gmxCurrentSourceFileChecksum; +} + #if GMX_DOUBLE void gmx_is_double_precision() { diff --git a/src/gromacs/utility/baseversion.h b/src/gromacs/utility/baseversion.h index b4e7fc4a74..7a1ef6f84a 100644 --- a/src/gromacs/utility/baseversion.h +++ b/src/gromacs/utility/baseversion.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
* - * Copyright (c) 2014,2015,2018, by the GROMACS development team, led by + * Copyright (c) 2014,2015,2018,2019, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -101,4 +101,18 @@ const char *getGpuImplementationString(); */ const char *gmxDOI(); +/*! \brief + * Hash of the complete source released in the tarball. + * + * Empty when not a release tarball build. + */ +const char *gmxReleaseSourceChecksum(); + +/*! \brief + * Hash of the complete source actually used when building. + * + * Always computed when building from tarball. + */ +const char *gmxCurrentSourceChecksum(); + #endif diff --git a/src/gromacs/utility/baseversion_gen.h b/src/gromacs/utility/baseversion_gen.h index ee793533a5..f970e18a48 100644 --- a/src/gromacs/utility/baseversion_gen.h +++ b/src/gromacs/utility/baseversion_gen.h @@ -61,6 +61,10 @@ extern const char _gmx_central_base_hash[]; * referencing of different \Gromacs releases. */ extern const char gmxSourceDoiString[]; +//! Sha256 checksum of source and header files, populated for release builds. +extern const char gmxReleaseSourceFileChecksum[]; +//! Sha256 checksum of source and header files, populated for builds from tarball. +extern const char gmxCurrentSourceFileChecksum[]; //! \} //! 
\endcond diff --git a/src/gromacs/utility/binaryinformation.cpp b/src/gromacs/utility/binaryinformation.cpp index 390a0385e9..47798cd9e2 100644 --- a/src/gromacs/utility/binaryinformation.cpp +++ b/src/gromacs/utility/binaryinformation.cpp @@ -238,6 +238,32 @@ void gmx_print_version_info(gmx::TextWriter *writer) { writer->writeLine(formatString("Branched from: %s", base_hash)); } + const char *const releaseSourceChecksum = gmxReleaseSourceChecksum(); + const char *const currentSourceChecksum = gmxCurrentSourceChecksum(); + if (releaseSourceChecksum[0] != '\0') + { + if (std::strcmp(releaseSourceChecksum, "NoChecksumFile") == 0) + { + writer->writeLine(formatString("The source code this program was compiled from has not been verified because the reference checksum was missing during compilation. This means you have an incomplete GROMACS distribution, please make sure to download an intact source distribution and compile that before proceeding.")); + writer->writeLine(formatString("Computed checksum: %s", currentSourceChecksum)); + } + else if (std::strcmp(releaseSourceChecksum, "NoPythonAvailable") == 0) + { + writer->writeLine(formatString("Build source could not be verified, because the checksum could not be computed.")); + } + else if (std::strcmp(releaseSourceChecksum, currentSourceChecksum) != 0) + { + writer->writeLine(formatString("This program has been built from source code that has been altered and does not match the code released as part of the official GROMACS version %s. 
If you did not intend to use an altered GROMACS version, make sure to download an intact source distribution and compile that before proceeding.", gmx_version())); + writer->writeLine(formatString("If you have modified the source code, you are strongly encouraged to set your custom version suffix (using -DGMX_VERSION_STRING_OF_FORK) which can help later with scientific reproducibility and also when reporting bugs.")); + writer->writeLine(formatString("Release checksum: %s", releaseSourceChecksum)); + writer->writeLine(formatString("Computed checksum: %s", currentSourceChecksum)); + } + else + { + writer->writeLine(formatString("Verified release checksum is %s", releaseSourceChecksum)); + } + } + #if GMX_DOUBLE writer->writeLine("Precision: double"); -- 2.22.0