diff --git a/src/shapepipe/run.py b/src/shapepipe/run.py
index 8212fdfb5..7a1faf869 100644
--- a/src/shapepipe/run.py
+++ b/src/shapepipe/run.py
@@ -6,6 +6,7 @@
 
 """
 
+import os
 import sys
 from datetime import datetime
 from importlib.metadata import requires
@@ -22,12 +23,27 @@
 from shapepipe.pipeline.job_handler import JobHandler
 from shapepipe.pipeline.mpi_run import split_mpi_jobs, submit_mpi_jobs
 
-try:
-    from mpi4py import MPI
-except ImportError:  # pragma: no cover
-    import_mpi = False
+# Importing mpi4py initializes MPI immediately, which aborts the whole
+# process when no MPI launcher is available — e.g. inside an
+# ``srun``-launched shell on a SLURM cluster, where Open MPI detects the
+# SLURM step environment, expects a PMI server that srun never started,
+# and calls MPI_Abort before even ``shapepipe_run -h`` can print (#744).
+# Only import (and hence initialize) MPI when a launcher environment is
+# actually present: ``mpirun``/``orterun`` set OMPI_COMM_WORLD_SIZE,
+# ``srun --mpi=pmi2`` sets PMI_RANK and ``srun --mpi=pmix`` sets
+# PMIX_RANK. A bare ``shapepipe_run`` (login node, compute-node shell,
+# container) runs in SMP mode without ever touching MPI.
+_MPI_LAUNCHER_VARS = ("OMPI_COMM_WORLD_SIZE", "PMI_RANK", "PMIX_RANK")
+
+if any(var in os.environ for var in _MPI_LAUNCHER_VARS):
+    try:
+        from mpi4py import MPI
+    except ImportError:  # pragma: no cover
+        import_mpi = False
+    else:
+        import_mpi = True
 else:
-    import_mpi = True
+    import_mpi = False
 
 
 class ShapePipe:
@@ -178,7 +194,7 @@ def _check_dependencies(self):
         module_dep = self._get_module_depends("depends") + __installs__
         module_exe = self._get_module_depends("executes")
 
-        module_dep += ["mpi4py"] if import_mpi else module_dep
+        module_dep += ["mpi4py"] if import_mpi else []
 
         exe_to_module = {
             exe: module
diff --git a/src/shapepipe/tests/test_run.py b/src/shapepipe/tests/test_run.py
new file mode 100644
index 000000000..03e6da22f
--- /dev/null
+++ b/src/shapepipe/tests/test_run.py
@@ -0,0 +1,54 @@
+"""UNIT TESTS FOR RUN.
+
+This module contains unit tests for the shapepipe.run module, in
+particular the MPI-launcher gating of the mpi4py import (#744): a bare
+``shapepipe_run`` must never initialize MPI, otherwise the whole process
+aborts inside an ``srun``-launched shell whose Open MPI lacks SLURM PMI
+support.
+
+:Author: Claude (on behalf of Cail Daley)
+
+"""
+
+import os
+import subprocess
+import sys
+
+import pytest
+
+SNIPPET = "import shapepipe.run as r; print(r.import_mpi)"
+
+# Env vars that either mark an MPI launcher (the gate) or make Open MPI
+# believe it was direct-launched by srun (the failure mode under test).
+_SCRUBBED_PREFIXES = ("OMPI_", "PMI_", "PMIX_", "SLURM_")
+
+
+def _import_mpi_flag(extra_env):
+    """Report shapepipe.run.import_mpi in a subprocess with a clean env."""
+    env = {
+        key: value
+        for key, value in os.environ.items()
+        if not key.startswith(_SCRUBBED_PREFIXES)
+    }
+    env.update(extra_env)
+    result = subprocess.run(
+        [sys.executable, "-c", SNIPPET],
+        env=env,
+        capture_output=True,
+        text=True,
+    )
+    assert result.returncode == 0, (
+        f"subprocess failed (exit {result.returncode}): {result.stderr}"
+    )
+    return result.stdout.strip()
+
+
+def test_bare_launch_skips_mpi():
+    """A bare launch (no MPI launcher env) must not import/init MPI."""
+    assert _import_mpi_flag({}) == "False"
+
+
+def test_mpirun_launch_imports_mpi():
+    """An mpirun-style env (OMPI_COMM_WORLD_SIZE) must import MPI."""
+    pytest.importorskip("mpi4py")
+    assert _import_mpi_flag({"OMPI_COMM_WORLD_SIZE": "1"}) == "True"