Skip to content

Commit d0b9e00

Browse files
doquanghuyclaude
andcommitted
feat(workflows): add continue_on_error step field
Adds an optional `continue_on_error: bool` field on every step. When set to `true` and the step fails, the engine records the result (`exit_code`, `stderr` on `steps.<id>.output` plus `status` as a sibling key on `steps.<id>`) and continues to the next sibling step instead of halting the run. Downstream `if`, `switch`, or `gate` steps can then branch on `{{ steps.<id>.output.exit_code }}` to route the recovery path. Engine details -------------- `WorkflowEngine._execute_steps` now consults the step config when a step returns `StepStatus.FAILED`: - Gate aborts (`output.aborted`) always halt the run — operator decisions take precedence over the flag. - Otherwise, if `continue_on_error` is the literal `True`, log a `step_continue_on_error` event and proceed to the next sibling. The runtime check uses identity comparison (`is True`) rather than truthiness, so truthy non-bool values like the string `"true"` cannot silently change run semantics even if a caller bypasses `validate_workflow()`. - Otherwise, behave as before: log `step_failed`, set `RunStatus.FAILED`, and return. Validation ---------- `_validate_steps` rejects non-bool values for `continue_on_error`. Coerced strings like `"true"` are not accepted so authoring mistakes surface at validation time rather than silently changing run semantics. Tests ----- `TestContinueOnError` in `tests/test_workflows.py` (8 tests): - `test_undeclared_failure_halts_run` — default halt behaviour. - `test_declared_and_fired_continues_run` — flag + fail → continue. - `test_declared_but_step_succeeded_is_noop` — flag + success → no-op. - `test_if_branch_routes_around_failure` — end-to-end recovery. - `test_gate_abort_still_halts_with_continue_on_error` — abort always halts. - `test_validation_rejects_non_bool_continue_on_error` — `"true"` rejected at validation. - `test_validation_accepts_bool_continue_on_error` — `true`/`false` pass cleanly. - `test_engine_ignores_truthy_non_bool_continue_on_error` — defense-in-depth: engine ignores string `"true"` even when validation is bypassed. Rebased onto current upstream/main (post #2664 merge); the new `TestContinueOnError` class sits immediately after upstream's `TestContextRunId` so the two feature suites coexist cleanly. Closes #2591. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent c6afe4c commit d0b9e00

3 files changed

Lines changed: 396 additions & 31 deletions

File tree

src/specify_cli/workflows/engine.py

Lines changed: 54 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,20 @@ def _validate_steps(
231231
step_errors = step_impl.validate(step_config)
232232
errors.extend(step_errors)
233233

234+
# Validate optional `continue_on_error` field. The engine honours
235+
# this on any step that returns FAILED so the pipeline can route
236+
# around the failure via downstream `if`/`switch`/`gate`. The
237+
# field must be a literal boolean — coercion from truthy strings
238+
# is deliberately not supported so authoring mistakes surface
239+
# at validation time rather than silently changing run semantics.
240+
if "continue_on_error" in step_config:
241+
coe = step_config["continue_on_error"]
242+
if not isinstance(coe, bool):
243+
errors.append(
244+
f"Step {step_id!r}: 'continue_on_error' must be a "
245+
f"boolean, got {type(coe).__name__}."
246+
)
247+
234248
# Recursively validate nested steps
235249
for nested_key in ("then", "else", "steps"):
236250
nested = step_config.get(nested_key)
@@ -622,7 +636,10 @@ def _execute_steps(
622636

623637
# Handle failures
624638
if result.status == StepStatus.FAILED:
625-
# Gate abort (output.aborted) maps to ABORTED status
639+
# Gate abort (output.aborted) maps to ABORTED status.
640+
# Aborts are deliberate operator decisions, so
641+
# `continue_on_error` does NOT override them — that flag
642+
# is for transient/expected step failures only.
626643
if result.output.get("aborted"):
627644
state.status = RunStatus.ABORTED
628645
state.append_log(
@@ -631,15 +648,48 @@ def _execute_steps(
631648
"step_id": step_id,
632649
}
633650
)
634-
else:
635-
state.status = RunStatus.FAILED
651+
state.save()
652+
return
653+
654+
# `continue_on_error: true` lets the pipeline route
655+
# around the failure instead of halting. The step
656+
# result (including exit_code, stderr, status) is
657+
# still recorded so downstream `if`/`switch`/`gate`
658+
# steps can branch on it. Log a single, unambiguous
659+
# event per failure resolution — either the run
660+
# continued past it, or it halted.
661+
#
662+
# Use identity comparison (`is True`) rather than
663+
# truthiness so that only a literal boolean enables
664+
# the behaviour, even if validation was skipped.
665+
# Validation rejects non-bool values at parse time,
666+
# but `WorkflowEngine.execute()` does not auto-validate
667+
# (see `WorkflowEngine.load_workflow`, whose docstring
668+
# explicitly notes "not yet validated; call
669+
# `validate_workflow()` or `engine.validate()`
670+
# separately"), so a caller passing an unvalidated
671+
# definition could otherwise see truthy non-bool
672+
# values like the string `"true"` silently change
673+
# run semantics.
674+
if step_config.get("continue_on_error") is True:
636675
state.append_log(
637676
{
638-
"event": "step_failed",
677+
"event": "step_continue_on_error",
639678
"step_id": step_id,
640679
"error": result.error,
641680
}
642681
)
682+
state.save()
683+
continue
684+
685+
state.status = RunStatus.FAILED
686+
state.append_log(
687+
{
688+
"event": "step_failed",
689+
"step_id": step_id,
690+
"error": result.error,
691+
}
692+
)
643693
state.save()
644694
return
645695

tests/test_workflows.py

Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2333,6 +2333,303 @@ def test_workflow_without_context_reference_unchanged(self, project_dir):
23332333
assert state.step_results["only-step"]["output"]["stdout"].strip() == "hello"
23342334

23352335

2336+
# ===== continue_on_error Tests =====
2337+
#
2338+
# Locks the contract documented in workflows/README.md "Error Handling"
2339+
# section: when an executable step fails and `continue_on_error: true`
2340+
# is declared, the engine records the step's `output` (with `exit_code`
2341+
# and `stderr` from the failure) and its `status` (sibling key on
2342+
# `steps.<id>`, not nested under `output`) and continues to the next
2343+
# sibling step instead of halting the run. Gate aborts
2344+
# (`output.aborted`) still halt regardless of the flag.
2345+
2346+
2347+
class TestContinueOnError:
2348+
"""Test the `continue_on_error` step-level field."""
2349+
2350+
def test_undeclared_failure_halts_run(self, project_dir):
2351+
"""Default behaviour (no `continue_on_error`): a failing step
2352+
halts the workflow run with `status == FAILED`.
2353+
2354+
Locks the byte-equivalent default — workflows that do not
2355+
declare the flag must behave exactly as before this feature.
2356+
"""
2357+
from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
2358+
from specify_cli.workflows.base import RunStatus
2359+
2360+
definition = WorkflowDefinition.from_string("""
2361+
schema_version: "1.0"
2362+
workflow:
2363+
id: "halt-on-fail"
2364+
name: "Halt On Fail"
2365+
version: "1.0.0"
2366+
steps:
2367+
- id: fail-step
2368+
type: shell
2369+
run: "exit 7"
2370+
- id: after
2371+
type: shell
2372+
run: "echo should-not-run"
2373+
""")
2374+
engine = WorkflowEngine(project_dir)
2375+
state = engine.execute(definition)
2376+
2377+
assert state.status == RunStatus.FAILED
2378+
assert "fail-step" in state.step_results
2379+
assert state.step_results["fail-step"]["output"]["exit_code"] == 7
2380+
# Subsequent step never executes when the flag is absent.
2381+
assert "after" not in state.step_results
2382+
2383+
def test_declared_and_fired_continues_run(self, project_dir):
2384+
"""`continue_on_error: true` + failing step: the run keeps
2385+
going, the failed step's result is recorded, and the
2386+
downstream step runs.
2387+
"""
2388+
from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
2389+
from specify_cli.workflows.base import RunStatus
2390+
2391+
definition = WorkflowDefinition.from_string("""
2392+
schema_version: "1.0"
2393+
workflow:
2394+
id: "continue-past-fail"
2395+
name: "Continue Past Fail"
2396+
version: "1.0.0"
2397+
steps:
2398+
- id: flaky-step
2399+
type: shell
2400+
run: "exit 42"
2401+
continue_on_error: true
2402+
- id: after
2403+
type: shell
2404+
run: "echo did-run"
2405+
""")
2406+
engine = WorkflowEngine(project_dir)
2407+
state = engine.execute(definition)
2408+
2409+
assert state.status == RunStatus.COMPLETED
2410+
# Failed step's exit_code is preserved so downstream branching
2411+
# can inspect it.
2412+
assert state.step_results["flaky-step"]["output"]["exit_code"] == 42
2413+
assert state.step_results["flaky-step"]["status"] == "failed"
2414+
# Downstream step ran successfully.
2415+
assert state.step_results["after"]["output"]["exit_code"] == 0
2416+
2417+
def test_declared_but_step_succeeded_is_noop(self, project_dir):
2418+
"""`continue_on_error: true` on a step that succeeds is a
2419+
no-op — the flag only changes behaviour on FAILED status.
2420+
"""
2421+
from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
2422+
from specify_cli.workflows.base import RunStatus
2423+
2424+
definition = WorkflowDefinition.from_string("""
2425+
schema_version: "1.0"
2426+
workflow:
2427+
id: "flag-but-success"
2428+
name: "Flag But Success"
2429+
version: "1.0.0"
2430+
steps:
2431+
- id: ok-step
2432+
type: shell
2433+
run: "echo ok"
2434+
continue_on_error: true
2435+
- id: after
2436+
type: shell
2437+
run: "echo done"
2438+
""")
2439+
engine = WorkflowEngine(project_dir)
2440+
state = engine.execute(definition)
2441+
2442+
assert state.status == RunStatus.COMPLETED
2443+
assert state.step_results["ok-step"]["status"] == "completed"
2444+
assert state.step_results["ok-step"]["output"]["exit_code"] == 0
2445+
assert state.step_results["after"]["output"]["exit_code"] == 0
2446+
2447+
def test_if_branch_routes_around_failure(self, project_dir):
2448+
"""End-to-end: `continue_on_error` + `if` cleanly routes around
2449+
a failure. The recovery branch runs; the success branch does
2450+
not.
2451+
2452+
Mirrors the canonical usage pattern from the original feature
2453+
discussion in issue #2591.
2454+
"""
2455+
from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
2456+
from specify_cli.workflows.base import RunStatus
2457+
2458+
definition = WorkflowDefinition.from_string("""
2459+
schema_version: "1.0"
2460+
workflow:
2461+
id: "route-around"
2462+
name: "Route Around Failure"
2463+
version: "1.0.0"
2464+
steps:
2465+
- id: heavy-thing
2466+
type: shell
2467+
run: "exit 1"
2468+
continue_on_error: true
2469+
- id: check-result
2470+
type: if
2471+
condition: "{{ steps.heavy-thing.output.exit_code != 0 }}"
2472+
then:
2473+
- id: recovery
2474+
type: shell
2475+
run: "echo recovery-ran"
2476+
else:
2477+
- id: happy-path
2478+
type: shell
2479+
run: "echo happy-path-ran"
2480+
""")
2481+
engine = WorkflowEngine(project_dir)
2482+
state = engine.execute(definition)
2483+
2484+
assert state.status == RunStatus.COMPLETED
2485+
assert "recovery" in state.step_results
2486+
assert "happy-path" not in state.step_results
2487+
2488+
def test_gate_abort_still_halts_with_continue_on_error(
2489+
self, project_dir, monkeypatch
2490+
):
2491+
"""`continue_on_error` does NOT override a deliberate gate
2492+
abort. `output.aborted` always halts the run with
2493+
`status == ABORTED`.
2494+
2495+
Aborts are explicit operator decisions; continue_on_error
2496+
is for transient/expected step failures only.
2497+
"""
2498+
from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
2499+
from specify_cli.workflows.base import RunStatus
2500+
from specify_cli.workflows.steps.gate import GateStep
2501+
from specify_cli.workflows.steps import gate as gate_module
2502+
2503+
# Force the gate step into interactive mode and feed a "reject"
2504+
# choice so the abort path actually runs in the test env
2505+
# (default behaviour returns PAUSED when stdin is not a TTY).
2506+
# Swap sys.stdin itself for a stub: setattr on the real
2507+
# TextIOWrapper's `isatty` method is not assignable under some
2508+
# runners (e.g. pytest with capture disabled).
2509+
class _TTYStdin:
2510+
def isatty(self) -> bool:
2511+
return True
2512+
2513+
monkeypatch.setattr(gate_module.sys, "stdin", _TTYStdin())
2514+
monkeypatch.setattr(
2515+
GateStep, "_prompt", staticmethod(lambda _msg, _opts: "reject")
2516+
)
2517+
2518+
definition = WorkflowDefinition.from_string("""
2519+
schema_version: "1.0"
2520+
workflow:
2521+
id: "gate-abort-halts"
2522+
name: "Gate Abort Halts"
2523+
version: "1.0.0"
2524+
steps:
2525+
- id: gate-step
2526+
type: gate
2527+
message: "Approve?"
2528+
options: [approve, reject]
2529+
on_reject: abort
2530+
continue_on_error: true
2531+
- id: should-not-run
2532+
type: shell
2533+
run: "echo nope"
2534+
""")
2535+
engine = WorkflowEngine(project_dir)
2536+
state = engine.execute(definition)
2537+
2538+
assert state.status == RunStatus.ABORTED
2539+
assert "should-not-run" not in state.step_results
2540+
2541+
def test_validation_rejects_non_bool_continue_on_error(self):
2542+
"""`continue_on_error` must be a literal boolean; coerced
2543+
strings like `"true"` are rejected at validation time so
2544+
authoring mistakes surface before execution.
2545+
"""
2546+
from specify_cli.workflows.engine import (
2547+
WorkflowDefinition,
2548+
validate_workflow,
2549+
)
2550+
2551+
definition = WorkflowDefinition.from_string("""
2552+
schema_version: "1.0"
2553+
workflow:
2554+
id: "bad-coe"
2555+
name: "Bad COE"
2556+
version: "1.0.0"
2557+
steps:
2558+
- id: step-one
2559+
type: shell
2560+
run: "true"
2561+
continue_on_error: "true"
2562+
""")
2563+
errors = validate_workflow(definition)
2564+
assert any(
2565+
"continue_on_error" in e and "boolean" in e for e in errors
2566+
), errors
2567+
2568+
def test_validation_accepts_bool_continue_on_error(self):
2569+
"""Boolean values pass validation cleanly."""
2570+
from specify_cli.workflows.engine import (
2571+
WorkflowDefinition,
2572+
validate_workflow,
2573+
)
2574+
2575+
for value in (True, False):
2576+
yaml_value = "true" if value else "false"
2577+
definition = WorkflowDefinition.from_string(f"""
2578+
schema_version: "1.0"
2579+
workflow:
2580+
id: "good-coe"
2581+
name: "Good COE"
2582+
version: "1.0.0"
2583+
steps:
2584+
- id: step-one
2585+
type: shell
2586+
run: "true"
2587+
continue_on_error: {yaml_value}
2588+
""")
2589+
errors = validate_workflow(definition)
2590+
assert errors == [], errors
2591+
2592+
def test_engine_ignores_truthy_non_bool_continue_on_error(self, project_dir):
2593+
"""Defense-in-depth: even if a caller bypasses
2594+
`validate_workflow()` and feeds the engine a definition with
2595+
`continue_on_error: "true"` (a string), the engine must NOT
2596+
honour the flag — only a literal boolean enables the
2597+
behaviour. `WorkflowEngine.execute()` does not auto-validate
2598+
(the `WorkflowEngine.load_workflow` docstring explicitly
2599+
notes the definition is "not yet validated; call
2600+
`validate_workflow()` or `engine.validate()` separately"),
2601+
so the engine guards against truthy non-bool values itself
2602+
via an identity check rather than truthiness.
2603+
"""
2604+
from specify_cli.workflows.engine import WorkflowDefinition, WorkflowEngine
2605+
from specify_cli.workflows.base import RunStatus
2606+
2607+
# Bypass `validate_workflow()` — execute() is what would
2608+
# be called by a caller that skipped validation.
2609+
definition = WorkflowDefinition.from_string("""
2610+
schema_version: "1.0"
2611+
workflow:
2612+
id: "string-coe"
2613+
name: "String COE"
2614+
version: "1.0.0"
2615+
steps:
2616+
- id: fail-step
2617+
type: shell
2618+
run: "exit 1"
2619+
continue_on_error: "true"
2620+
- id: should-not-run
2621+
type: shell
2622+
run: "echo should-not-run"
2623+
""")
2624+
engine = WorkflowEngine(project_dir)
2625+
state = engine.execute(definition)
2626+
2627+
# String "true" is truthy but not a literal boolean, so the
2628+
# engine must treat the step as a halting failure.
2629+
assert state.status == RunStatus.FAILED
2630+
assert "should-not-run" not in state.step_results
2631+
2632+
23362633
# ===== State Persistence Tests =====
23372634

23382635
class TestRunState:

0 commit comments

Comments
 (0)