Skip to content

Commit d9c1feb

Browse files
author
EgonBot
committed
fix: per-archetype PremortemTask decomposition for small-model compatibility
PremortemAnalysis required the LLM to emit a deeply nested schema in one call: 3 AssumptionItem + 3 FailureModeItem, 11+ required fields each, with linked cross-reference IDs. Local small models (Qwen 3.5-35B, GLM 4.7 Flash) echoed the schema structure back instead of producing values, exhausting all retries. Fix: decompose into one independent LLM call per archetype using ArchetypeNarrative (6 plain text fields, no IDs). Code assembles AssumptionItem + FailureModeItem from the narrative and assigns all IDs and cross-references. Changes (3 hunks): 1. Add ArchetypeNarrative schema (after PremortemAnalysis). Includes an 'archetype' field so the LLM can adapt the category name to the specific project rather than being locked to hardcoded labels. 2. Rewrite execute() to run num_rounds × 3 archetypes (3×3=9 calls in ALL_DETAILS, 1×3=3 in FAST mode), restoring the original 9+9 assumption/failure-mode volume. Archetype suggestions guide the LLM; the returned narrative.archetype is used in the output (LLM may rename/adapt it per project). Failed archetype calls are skipped gracefully; first call failure raises. 3. Fix _calculate_risk_level_verbose: return 'Not Scored' when likelihood or impact is None (was rendering 'Likelihood None/5, Impact None/5'). Validated: PremortemTask PASSED on GLM 4.7 Flash (HVT_minimal run).
1 parent 70918f9 commit d9c1feb

1 file changed

Lines changed: 66 additions & 65 deletions

File tree

worker_plan/worker_plan_internal/diagnostics/premortem.py

Lines changed: 66 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,15 @@ class PremortemAnalysis(BaseModel):
7171
assumptions_to_kill: List[AssumptionItem] = Field(description="A list of 3 new, critical, underlying assumptions to test immediately.")
7272
failure_modes: List[FailureModeItem] = Field(description="A list containing exactly 3 distinct failure failure_modes, one for each archetype.")
7373

74+
class ArchetypeNarrative(BaseModel):
    """One archetype's premortem narrative, kept deliberately flat.

    Contains only plain free-text fields so that small local models can fill
    it in reliably. All IDs and cross-references (assumption_id,
    failure_mode_index, root-cause links) are assigned by the calling code
    afterwards — the LLM never has to emit linked identifiers.
    """

    # Category label for this failure; the LLM may rename/adapt it per project.
    archetype: str = Field(description="The failure archetype category most relevant to this project and scenario (e.g. 'Process/Financial', 'Technical/Logistical', 'Market/Human', or a more specific variant).")
    # The single load-bearing assumption behind this failure scenario.
    assumption: str = Field(description="One critical assumption the project is making that, if false, would cause this failure.")
    # Immediate, concrete validation step for the assumption above.
    test_now: str = Field(description="One concrete action to immediately test if this assumption holds.")
    # Headline for the failure scenario.
    failure_title: str = Field(description="A short, compelling title for this failure scenario (e.g. 'The Gridlock Gamble').")
    # Cause → chain of events → impact, as narrative prose.
    failure_story: str = Field(description="A detailed narrative of how this failure unfolds. Explain causes, chain of events, and impact.")
    # Observable leading indicators (description asks for 2-4; not enforced by schema).
    warning_signs: List[str] = Field(description="2-4 observable signals that this failure is beginning to occur.")
82+
7483
PREMORTEM_SYSTEM_PROMPT = """
7584
Persona: You are a senior project analyst. Your primary goal is to write compelling, detailed, and distinct failure stories that are also operationally actionable.
7685
@@ -119,92 +128,84 @@ def execute(cls, llm_executor: LLMExecutor, speed_vs_detail: SpeedVsDetailEnum,
119128
logger.debug(f"User Prompt:\n{user_prompt}")
120129
system_prompt = PREMORTEM_SYSTEM_PROMPT.strip()
121130

122-
accumulated_chat_message_list = [
123-
ChatMessage(
124-
role=MessageRole.SYSTEM,
125-
content=system_prompt,
126-
)
127-
]
128-
129-
user_prompt_list = [
130-
user_prompt,
131-
"Generate 3 new assumptions that are thematically different from the previous ones. Start assumption_id at A4.",
132-
"Generate 3 new assumptions that are thematically different from the previous ones and covers different archetypes. Start assumption_id at A7.",
133-
]
131+
# Archetype suggestions guide the LLM; the LLM writes the actual archetype name
132+
# in ArchetypeNarrative.archetype, adapting it to the specific project if needed.
133+
archetype_suggestions = ["Process/Financial", "Technical/Logistical", "Market/Human"]
134+
num_rounds = 1 if speed_vs_detail == SpeedVsDetailEnum.FAST_BUT_SKIP_DETAILS else 3
134135
if speed_vs_detail == SpeedVsDetailEnum.FAST_BUT_SKIP_DETAILS:
135-
user_prompt_list = user_prompt_list[:1]
136-
logger.info("Running in FAST_BUT_SKIP_DETAILS mode. Omitting some assumptions.")
136+
logger.info("Running in FAST_BUT_SKIP_DETAILS mode. 1 round × 3 archetypes = 3 calls.")
137137
else:
138-
logger.info("Running in ALL_DETAILS_BUT_SLOW mode. Processing all assumptions.")
138+
logger.info("Running in ALL_DETAILS_BUT_SLOW mode. 3 rounds × 3 archetypes = 9 calls.")
139139

140-
responses: list[PremortemAnalysis] = []
140+
assumptions_to_kill: list[AssumptionItem] = []
141+
failure_modes: list[FailureModeItem] = []
141142
metadata_list: list[dict] = []
142-
for user_prompt_index, user_prompt_item in enumerate(user_prompt_list):
143-
logger.info(f"Processing user_prompt_index: {user_prompt_index+1} of {len(user_prompt_list)}")
144-
chat_message_list = accumulated_chat_message_list.copy()
145-
chat_message_list.append(
146-
ChatMessage(
147-
role=MessageRole.USER,
148-
content=user_prompt_item,
143+
144+
call_index = 0
145+
first_call = True
146+
for round_index in range(num_rounds):
147+
for archetype_suggestion in archetype_suggestions:
148+
call_index += 1
149+
assumption_id = f"A{len(assumptions_to_kill) + 1}"
150+
failure_mode_index = len(failure_modes) + 1
151+
logger.info(f"Call {call_index}: round={round_index+1}, suggestion={archetype_suggestion!r}")
152+
153+
archetype_user_prompt = (
154+
f"{user_prompt}\n\n"
155+
f"Suggested archetype: {archetype_suggestion}\n"
156+
f"Write one assumption and one failure scenario. "
157+
f"Adapt the archetype name if a more specific label fits the project."
149158
)
150-
)
159+
chat_message_list = [
160+
ChatMessage(role=MessageRole.SYSTEM, content=system_prompt),
161+
ChatMessage(role=MessageRole.USER, content=archetype_user_prompt),
162+
]
151163

152164
def execute_function(llm: LLM) -> dict:
153-
sllm = llm.as_structured_llm(PremortemAnalysis)
165+
sllm = llm.as_structured_llm(ArchetypeNarrative)
154166
start_time = time.perf_counter()
155-
156167
chat_response = sllm.chat(chat_message_list)
157-
pydantic_response = chat_response.raw
158-
168+
narrative = require_raw(chat_response, ArchetypeNarrative)
159169
end_time = time.perf_counter()
160170
duration = int(ceil(end_time - start_time))
161-
162171
metadata = dict(llm.metadata)
163172
metadata["llm_classname"] = llm.class_name()
164173
metadata["duration"] = duration
165-
166-
return {
167-
"pydantic_response": pydantic_response,
168-
"metadata": metadata,
169-
"duration": duration
170-
}
174+
return {"narrative": narrative, "metadata": metadata}
171175

172176
try:
173177
result = llm_executor.run(execute_function)
174178
except PipelineStopRequested:
175-
# Re-raise PipelineStopRequested without wrapping it
176179
raise
177180
except Exception as e:
178-
logger.debug(f"LLM chat interaction failed: {e}")
179-
logger.error("LLM chat interaction failed.", exc_info=True)
180-
if user_prompt_index == 0:
181-
logger.error("The first user prompt failed. This is a critical error. Please check the system prompt and user prompt.")
182-
raise ValueError("LLM chat interaction failed.") from e
183-
else:
184-
logger.error(f"User prompt {user_prompt_index+1} failed. Continuing with next user prompt.")
185-
continue
186-
187-
assistant_content_raw: dict = result["pydantic_response"].model_dump()
188-
# Compact JSON without newlines and spaces, since it's going to be parsed by the LLM. Pretty printing wastes input tokens for the LLM.
189-
assistant_content: str = json.dumps(assistant_content_raw, separators=(',', ':'))
190-
191-
chat_message_list.append(
192-
ChatMessage(
193-
role=MessageRole.ASSISTANT,
194-
content=assistant_content,
195-
)
196-
)
197-
198-
responses.append(result["pydantic_response"])
181+
logger.error(f"Call {call_index} failed: {e}", exc_info=True)
182+
if first_call:
183+
raise ValueError(f"First archetype call failed: {e}") from e
184+
logger.warning(f"Skipping suggestion {archetype_suggestion!r} due to failure.")
185+
continue
186+
187+
first_call = False
188+
narrative: ArchetypeNarrative = result["narrative"]
199189
metadata_list.append(result["metadata"])
200-
accumulated_chat_message_list = chat_message_list.copy()
201190

202-
# Use the last response as the primary result
203-
assumptions_to_kill: list[AssumptionItem] = []
204-
failure_modes: list[FailureModeItem] = []
205-
for response in responses:
206-
assumptions_to_kill.extend(response.assumptions_to_kill)
207-
failure_modes.extend(response.failure_modes)
191+
# Code assigns IDs and cross-references — the LLM only provides narrative text.
192+
# The actual archetype name comes from narrative.archetype (LLM adapts to the project).
193+
assumption = AssumptionItem(
194+
assumption_id=assumption_id,
195+
statement=narrative.assumption,
196+
test_now=narrative.test_now,
197+
falsifier=f"Result of: {narrative.test_now} — reveals the assumption does not hold.",
198+
)
199+
failure_mode = FailureModeItem(
200+
failure_mode_index=failure_mode_index,
201+
root_cause_assumption_id=assumption_id,
202+
failure_mode_archetype=narrative.archetype,
203+
failure_mode_title=narrative.failure_title,
204+
risk_analysis=narrative.failure_story,
205+
early_warning_signs=narrative.warning_signs,
206+
)
207+
assumptions_to_kill.append(assumption)
208+
failure_modes.append(failure_mode)
208209

209210
final_response = PremortemAnalysis(
210211
assumptions_to_kill=assumptions_to_kill,
@@ -286,7 +287,7 @@ def _calculate_risk_level_brief(likelihood: Optional[int], impact: Optional[int]
286287
def _calculate_risk_level_verbose(likelihood: Optional[int], impact: Optional[int]) -> str:
287288
"""Calculates a qualitative risk level from likelihood and impact scores."""
288289
if likelihood is None or impact is None:
289-
return f"Likelihood {likelihood}/5, Impact {impact}/5"
290+
return "Not Scored"
290291

291292
score = likelihood * impact
292293
if score >= 15:

0 commit comments

Comments
 (0)