python_packaging/src/gmxapi/commandline.py

   1 #
   2 # This file is part of the GROMACS molecular simulation package.
   3 #
   4 # Copyright (c) 2019,2020, by the GROMACS development team, led by
   5 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6 # and including many others, as listed in the AUTHORS file in the
   7 # top-level source directory and at http://www.gromacs.org.
   8 #
   9 # GROMACS is free software; you can redistribute it and/or
  10 # modify it under the terms of the GNU Lesser General Public License
  11 # as published by the Free Software Foundation; either version 2.1
  12 # of the License, or (at your option) any later version.
  13 #
  14 # GROMACS is distributed in the hope that it will be useful,
  15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17 # Lesser General Public License for more details.
  18 #
  19 # You should have received a copy of the GNU Lesser General Public
  20 # License along with GROMACS; if not, see
  21 # http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22 # Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23 #
  24 # If you want to redistribute modifications to GROMACS, please
  25 # consider that scientific software is very special. Version
  26 # control is crucial - bugs must be traceable. We will be happy to
  27 # consider code for inclusion in the official distribution, but
  28 # derived work must not be called official GROMACS. Details are found
  29 # in the README & COPYING files - if they are missing, get the
  30 # official version at http://www.gromacs.org.
  31 #
  32 # To help us fund GROMACS development, we humbly ask that you cite
  33 # the research papers on the package. Check out http://www.gromacs.org.
  34
  35 """
  36 Provide command line operation.
  37 """
  38
  39 __all__ = ['commandline_operation']
  40
  41 import os
  42 import shutil
  43 import subprocess
  44
  45 import gmxapi as gmx
  46 from gmxapi import exceptions
  47 from gmxapi import logger as root_logger
  48 from gmxapi.datamodel import NDArray
  49 from gmxapi.operation import OutputCollectionDescription
  50
  51 # Module-level logger
  52 logger = root_logger.getChild('commandline')
  53 logger.info('Importing {}'.format(__name__))
  54
  55
  56 # Create an Operation that consumes a list and a boolean to produce a string and an integer.
  57 #
  58 # Wrap the defined function using a decorator that
  59 #    * strips the `output` parameter from the signature
  60 #    * provides `output` publishing proxy to the inner function and
  61 #    * produce a result with attributes for
  62 #       * file: mapping of output flags to output filenames
  63 #       * stdout: process STDOUT
#       * stderr: process STDERR
  65 #       * returncode: integer return code of wrapped command
  66 #
  67 # Note that the existence of the 'file' output map is expressed here, but
  68 # the keys of the map are not implicit or set by the wrapped function.
  69 # For the map to be non-empty, it must be defined before the resulting helper
  70 # function is called.
  71 #
  72 # TODO: Operation returns the output object when called with the shorter signature.
  73 #
  74 @gmx.function_wrapper(output={'stdout': str,
  75                               'stderr': str,
  76                               'returncode': int})
  77 def cli(command: NDArray, shell: bool, output: OutputCollectionDescription, stdin: str = ''):
  78     """Execute a command line program in a subprocess.
  79
  80     Configure an executable in a subprocess. Executes when run in an execution
  81     Context, as part of a work graph or via gmx.run(). Runs in the current
  82     working directory.
  83
  84     Shell processing is not enabled, but can be considered for a future version.
  85     This means that shell expansions such as environment variables, globbing (`*`),
  86     and other special symbols (like `~` for home directory) are not available.
  87     This allows a simpler and more robust implementation, as well as a better
  88     ability to uniquely identify the effects of a command line operation. If you
  89     think this disallows important use cases, please let us know.
  90
  91     Arguments:
  92          command: a tuple (or list) to be the subprocess arguments, including `executable`
  93          output: mapping of command line flags to output filename arguments
  94          shell: unused (provides forward-compatibility)
  95          stdin (str): String input to send to STDIN (terminal input) of the executable.
  96
  97     Multi-line text sent to *stdin* should be joined into a single string
  98     (e.g. ``'\n'.join(list_of_strings) + '\n'``).
  99     If multiple strings are provided to *stdin*, gmxapi will assume an ensemble,
 100     and will run one operation for each provided string.
 101
 102     Only string input (:py:func:str) to *stdin* is currently supported.
 103     If you have a use case that requires streaming input or binary input,
 104     please open an issue or contact the author(s).
 105
 106     Arguments are iteratively added to the command line with standard Python
 107     iteration, so you should use a tuple or list even if you have only one parameter.
 108     I.e. If you provide a string with `arguments="asdf"` then it will be passed as
 109     `... "a" "s" "d" "f"`. To pass a single string argument, `arguments=("asdf")`
 110     or `arguments=["asdf"]`.
 111
 112     `input` and `output` should be a dictionary with string keys, where the keys
 113     name command line "flags" or options.
 114
 115     Example:
 116         Execute a command named `exe` that takes a flagged option for file name
 117         (stored in a local Python variable `my_filename`) and an `origin` flag
 118         that uses the next three arguments to define a vector.
 119
 120             >>> my_filename = "somefilename"
 121             >>> result = cli(('exe', '--origin', 1.0, 2.0, 3.0, '-f', my_filename), shell=False)
 122             >>> assert hasattr(result, 'file')
 123             >>> assert hasattr(result, 'stdout')
 124             >>> assert hasattr(result, 'stderr')
 125             >>> assert hasattr(result, 'returncode')
 126
 127     Returns:
 128         A data structure with attributes for each of the results `file`, `stdout`, `stderr`, and `returncode`
 129
 130     Result object attributes:
 131         * `file`: the mapping of CLI flags to filename strings resulting from the `output` kwarg
 132         * `stdout`: A string mapping from process STDOUT.
 133         * `stderr`: A string mapping from process STDERR; it will be the
 134                     error output (if any) if the process failed.
 135         * `returncode`: return code of the subprocess.
 136
 137     """
 138     # In the operation implementation, we expect the `shell` parameter to be intercepted by the
 139     # wrapper and set to False.
 140     if shell:
 141         raise exceptions.UsageError("Operation does not support shell processing.")
 142
 143     if stdin == '':
 144         stdin = None
 145
 146     if isinstance(command, (str, bytes)):
 147         command = [command]
 148     command = list([arg for arg in command])
 149
 150     executable = shutil.which(command[0])
 151     if executable is None:
 152         raise exceptions.ValueError('"{}" is not found or not executable.'.format(command[0]))
 153     command[0] = executable
 154
 155     # TODO: (FR9) Can OS input/output filehandles be a responsibility of
 156     #  the code providing 'resources'?
 157
 158     stdout = ''
 159     stderr = ''
 160     logger.debug('executing subprocess')
 161     try:
 162         completed_process = subprocess.run(command,
 163                                            shell=shell,
 164                                            input=stdin,
 165                                            check=True,
 166                                            stdout=subprocess.PIPE,
 167                                            stderr=subprocess.PIPE,
 168                                            encoding='utf-8',
 169                                            universal_newlines=True
 170                                            )
 171         returncode = completed_process.returncode
 172         # TODO: Resource management code should manage a safe data object for `output`.
 173         logger.debug('STDOUT:')
 174         if completed_process.stderr is not None:
 175             for line in completed_process.stdout.split('\n'):
 176                 logger.debug(line)
 177         else:
 178             logger.debug('STDOUT is empty')
 179         logger.debug('STDERR:')
 180         if completed_process.stderr is not None:
 181             for line in completed_process.stderr.split('\n'):
 182                 logger.debug(line)
 183         else:
 184             logger.debug('STDERR is empty')
 185
 186         stdout = completed_process.stdout
 187         stderr = completed_process.stderr
 188
 189     except subprocess.CalledProcessError as e:
 190         logger.info("commandline operation had non-zero return status"
 191                     "when calling {}".format(e.cmd))
 192         stdout = e.stdout
 193         stderr = e.stderr
 194         returncode = e.returncode
 195
 196     # Publish outputs.
 197     output.stdout = stdout
 198     output.stderr = stderr
 199     output.returncode = returncode
 200
 201
 202 # TODO: (FR4) Make this a formal operation to properly handle gmxapi data dependencies.
 203 #  The consumer of this operation has an NDArray input. filemap may contain gmxapi data flow
 204 #  aspects that we want the framework to handle for us.
 205 def filemap_to_flag_list(filemap: dict = None):
 206     """Convert a map of command line flags and filenames to a list of command line arguments.
 207
 208     Used to map inputs and outputs of command line tools to and from gmxapi data handles.
 209     User provides mappings of flags and filenames so that gmxapi can construct an
 210     executable command line.
 211
 212     Primary use case is implicit. commandline_operation() instantiates this operation based on
 213     user input, and sends the output to cli()
 214
 215     Arguments:
 216         filemap: key-value map of command line flags and filename arguments
 217
 218     Returns:
 219         list of strings and/or gmxapi data references
 220     """
 221     result = []
 222     if filemap is not None:
 223         for key, value in filemap.items():
 224             # Note that the value may be a string, a list, an ndarray, or a future
 225             if not isinstance(value, (list, tuple, NDArray)):
 226                 if hasattr(value, 'result') and value.dtype == NDArray:
 227                     pass
 228                 elif hasattr(value, 'result') and value.dtype != NDArray:
 229                     # TODO: Fix this ugly hack when we have proper Future slicing and can make NDArray futures.
 230                     result_function = value.result
 231                     value.result = lambda function=result_function: [function()]
 232                 else:
 233                     value = [value]
 234             result = gmx.join_arrays(front=result, back=gmx.join_arrays(front=[key], back=value))
 235     return result
 236
 237
 238 # TODO: (FR4) Use generating function or decorator that can validate kwargs?
 239 # TODO: (FR4) Outputs need to be fully formed and typed in the object returned
 240 #  from the helper (decorated function).
 241 def commandline_operation(executable=None,
 242                           arguments=(),
 243                           input_files: dict = None,
 244                           output_files: dict = None,
 245                           stdin: str = None,
 246                           **kwargs):
 247     """Helper function to define a new operation that executes a subprocess in gmxapi data flow.
 248
 249     Define a new Operation for a particular executable and input/output parameter set.
 250     Generate a chain of operations to process the named key word arguments and handle
 251     input/output data dependencies.
 252
 253     Arguments:
 254         executable: name of an executable on the path
 255         arguments: list of positional arguments to insert at ``argv[1]``
 256         input_files: mapping of command-line flags to input file names
 257         output_files: mapping of command-line flags to output file names
 258         stdin (str): String input to send to STDIN (terminal input) of the executable (optional).
 259
 260     Multi-line text sent to *stdin* should be joined into a single string.
 261     E.g.::
 262
 263         commandline_operation(..., stdin='\\n'.join(list_of_strings) + '\\n')
 264
 265     If multiple strings are provided to *stdin*, gmxapi will assume an ensemble,
 266     and will run one operation for each provided string.
 267
 268     Only string input (:py:func:`str`) to *stdin* is currently supported.
 269     If you have a use case that requires streaming input or binary input,
 270     please open an issue or contact the author(s).
 271
 272     Output:
 273         The output node of the resulting operation handle contains
 274
 275         * ``file``: the mapping of CLI flags to filename strings resulting from the ``output_files`` kwarg
 276         * ``stdout``: A string mapping from process STDOUT.
 277         * ``stderr``: A string mapping from process STDERR; it will be the
 278                       error output (if any) if the process failed.
 279         * ``returncode``: return code of the subprocess.
 280
 281     """
 282
 283     # Implementation details: When used in a script, this function returns an
 284     # instance of an operation. However, because of the dynamic specification of
 285     # inputs and outputs, each invocation may have the overhead of defining new
 286     # types to express the data flow topology, regardless of the executable.
 287     # If this overhead is problematic, consider exposing the intermediate step
 288     # at which the Operation is fully specified to facilitate reuse.
 289
 290     ##
 291     # 1. Define a new operation with outputs from `cli()` plus `file` from `output_files`
 292
 293     # output_files is essentially passed through, but we need assurance that results
 294     # will not be published until the rest of the operation has run (i.e. the cli() executable.)
 295
 296     # Warning: decorating a local function like this is counter to the notion of Operations
 297     # as portable (importable, serializable/deserializable). The big picture here needs
 298     # some more consideration.
 299     # TODO: (NOW) Distinguish portable Operations from relocatable Futures.
 300     # There is nothing antithetical about objects implementing gmxapi data interfaces
 301     # that are only resolvable by a certain Context as long as that Context can convey
 302     # the results to another Context upon request. Re-instantiating Operations is
 303     # only one way of relocating Futures. In this case, though, the dynamic creation of
 304     # merged_ops doesn't seem right, and commandline_operation should probably be
 305     # a proper Operation.
 306     #
 307     # TODO: (FR4+) Characterize the `file` dictionary key type:
 308     #  explicitly sequences rather than maybe-string/maybe-sequence-of-strings
 309     @gmx.function_wrapper(output={'stdout': str,
 310                                   'stderr': str,
 311                                   'returncode': int,
 312                                   'file': dict})
 313     def merged_ops(stdout: str = None,
 314                    stderr: str = None,
 315                    returncode: int = None,
 316                    file: dict = None,
 317                    output: OutputCollectionDescription = None):
 318         assert stdout is not None
 319         assert stderr is not None
 320         assert returncode is not None
 321         assert file is not None
 322         assert output is not None
 323         output.returncode = returncode
 324         output.stdout = stdout
 325         output.stderr = stderr
 326         if returncode == 0:
 327             output.file = file
 328         else:
 329             output.file = {}
 330
 331     ##
 332     # 2. Prepare data flow.
 333
 334     if input_files is None:
 335         input_files = {}
 336     if output_files is None:
 337         output_files = {}
 338     if isinstance(arguments, (str, bytes)):
 339         arguments = [arguments]
 340     command = gmx.concatenate_lists([[executable],
 341                                      arguments,
 342                                      filemap_to_flag_list(input_files),
 343                                      filemap_to_flag_list(output_files)])
 344     shell = gmx.make_constant(False)
 345     cli_args = {'command': command,
 346                 'shell': shell}
 347     cli_args.update(**kwargs)
 348     if stdin is not None:
 349         cli_args['stdin'] = str(stdin)
 350
 351     ##
 352     # 3. Merge operations
 353     #
 354     # Note: Without a `label` argument, repeated calls to cli(**cli_args) should
 355     # produce references to the same unique resource. Creating this handle
 356     # separately should not be necessary, but we've got a way to go until we have the
 357     # fingerprinting and Context resource management we need for that.
 358     # TODO: ``label`` kwarg
 359     # TODO: input fingerprinting
 360     cli_result = cli(**cli_args)
 361     merged_result = merged_ops(stdout=cli_result.output.stdout,
 362                                stderr=cli_result.output.stderr,
 363                                returncode=cli_result.output.returncode,
 364                                file=output_files,
 365                                **kwargs)
 366
 367     # Return an object with an OutputCollection granting access to outputs of
 368     # cli() and of output_files (as "file")
 369     return merged_result