From 519a786c8978772770337003c9208c97a328c99a Mon Sep 17 00:00:00 2001 From: Cail Daley Date: Thu, 11 Jun 2026 17:14:10 +0200 Subject: [PATCH 1/2] fix(run): only import mpi4py when an MPI launcher is present MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Importing mpi4py initializes MPI at import time, which aborts the whole process when Open MPI detects a SLURM step environment but no PMI server — i.e. inside any srun-launched shell on a cluster whose container OMPI lacks SLURM PMI support. Even shapepipe_run -h dies before printing (#744; empirically bisected to SLURM_STEP_ID being set). Gate the import on launcher-set env vars (OMPI_COMM_WORLD_SIZE for mpirun, PMI_RANK for srun --mpi=pmi2, PMIX_RANK for srun --mpi=pmix). A bare shapepipe_run never touches MPI and runs SMP as before; mpirun launches are unchanged. Verified on candide: bare run under srun now exits 0 (was OPAL abort), mpirun -n 1 and -n 2 paths intact. Behavior note: a config with MODE = MPI launched without any MPI launcher now falls back to SMP instead of running single-rank MPI. Closes #744 Co-Authored-By: Claude Fable 5 --- src/shapepipe/run.py | 26 +++++++++++++---- src/shapepipe/tests/test_run.py | 52 +++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 src/shapepipe/tests/test_run.py diff --git a/src/shapepipe/run.py b/src/shapepipe/run.py index 8212fdfb5..f377c24ba 100644 --- a/src/shapepipe/run.py +++ b/src/shapepipe/run.py @@ -6,6 +6,7 @@ """ +import os import sys from datetime import datetime from importlib.metadata import requires @@ -22,12 +23,27 @@ from shapepipe.pipeline.job_handler import JobHandler from shapepipe.pipeline.mpi_run import split_mpi_jobs, submit_mpi_jobs -try: - from mpi4py import MPI -except ImportError: # pragma: no cover - import_mpi = False +# Importing mpi4py initializes MPI immediately, which aborts the whole +# process when no MPI launcher is available — e.g. inside an +# ``srun``-launched shell on a SLURM cluster, where Open MPI detects the +# SLURM step environment, expects a PMI server that srun never started, +# and calls MPI_Abort before even ``shapepipe_run -h`` can print (#744). +# Only import (and hence initialize) MPI when a launcher environment is +# actually present: ``mpirun``/``orterun`` set OMPI_COMM_WORLD_SIZE, +# ``srun --mpi=pmi2`` sets PMI_RANK and ``srun --mpi=pmix`` sets +# PMIX_RANK. A bare ``shapepipe_run`` (login node, compute-node shell, +# container) runs in SMP mode without ever touching MPI. +_MPI_LAUNCHER_VARS = ("OMPI_COMM_WORLD_SIZE", "PMI_RANK", "PMIX_RANK") + +if any(var in os.environ for var in _MPI_LAUNCHER_VARS): + try: + from mpi4py import MPI + except ImportError: # pragma: no cover + import_mpi = False + else: + import_mpi = True else: - import_mpi = True + import_mpi = False class ShapePipe: diff --git a/src/shapepipe/tests/test_run.py b/src/shapepipe/tests/test_run.py new file mode 100644 index 000000000..bb60370ca --- /dev/null +++ b/src/shapepipe/tests/test_run.py @@ -0,0 +1,52 @@ +"""UNIT TESTS FOR RUN. + +This module contains unit tests for the shapepipe.run module, in +particular the MPI-launcher gating of the mpi4py import (#744): a bare +``shapepipe_run`` must never initialize MPI, otherwise the whole process +aborts inside an ``srun``-launched shell whose Open MPI lacks SLURM PMI +support. + +:Author: Claude (on behalf of Cail Daley) + +""" + +import os +import subprocess +import sys + +import pytest + +SNIPPET = "import shapepipe.run as r; print(r.import_mpi)" + +# Env vars that either mark an MPI launcher (the gate) or make Open MPI +# believe it was direct-launched by srun (the failure mode under test). +_SCRUBBED_PREFIXES = ("OMPI_", "PMI_", "PMIX_", "SLURM_") + + +def _import_mpi_flag(extra_env): + """Report shapepipe.run.import_mpi in a subprocess with a clean env.""" + env = { + key: value + for key, value in os.environ.items() + if not key.startswith(_SCRUBBED_PREFIXES) + } + env.update(extra_env) + result = subprocess.run( + [sys.executable, "-c", SNIPPET], + env=env, + capture_output=True, + text=True, + check=True, + ) + return result.stdout.strip() + + +def test_bare_launch_skips_mpi(): + """A bare launch (no MPI launcher env) must not import/init MPI.""" + assert _import_mpi_flag({}) == "False" + + +def test_mpirun_launch_imports_mpi(): + """An mpirun-style env (OMPI_COMM_WORLD_SIZE) must import MPI.""" + pytest.importorskip("mpi4py") + assert _import_mpi_flag({"OMPI_COMM_WORLD_SIZE": "1"}) == "True" From dab2d5c1cf1cac701c2985bb4cd5a6efb8df9803 Mon Sep 17 00:00:00 2001 From: Cail Daley Date: Thu, 11 Jun 2026 17:25:00 +0200 Subject: [PATCH 2/2] =?UTF-8?q?fix(run):=20review=20nits=20=E2=80=94=20mod?= =?UTF-8?q?ule=5Fdep=20self-append,=20surface=20test=20stderr?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The else-branch of the mpi4py dependency line appended the whole list to itself (harmless only because DependencyHandler dedups); with the launcher gate this became the common path. And the regression test now surfaces the subprocess stderr on failure instead of swallowing it in a bare CalledProcessError. Co-Authored-By: Claude Fable 5 --- src/shapepipe/run.py | 2 +- src/shapepipe/tests/test_run.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/shapepipe/run.py b/src/shapepipe/run.py index f377c24ba..7a1faf869 100644 --- a/src/shapepipe/run.py +++ b/src/shapepipe/run.py @@ -194,7 +194,7 @@ def _check_dependencies(self): module_dep = self._get_module_depends("depends") + __installs__ module_exe = self._get_module_depends("executes") - module_dep += ["mpi4py"] if import_mpi else module_dep + module_dep += ["mpi4py"] if import_mpi else [] exe_to_module = { exe: module diff --git a/src/shapepipe/tests/test_run.py b/src/shapepipe/tests/test_run.py index bb60370ca..03e6da22f 100644 --- a/src/shapepipe/tests/test_run.py +++ b/src/shapepipe/tests/test_run.py @@ -36,7 +36,9 @@ def _import_mpi_flag(extra_env): env=env, capture_output=True, text=True, - check=True, + ) + assert result.returncode == 0, ( + f"subprocess failed (exit {result.returncode}): {result.stderr}" ) return result.stdout.strip()