Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions lib/hermes/mcp/error.ex
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ defmodule Hermes.MCP.Error do

# MCP-specific error codes
@resource_not_found -32_002
@session_expired -32_001

# Generic server error code for custom errors
@server_error -32_000
Expand All @@ -65,6 +66,7 @@ defmodule Hermes.MCP.Error do
invalid_params: "Invalid params",
internal_error: "Internal error",
resource_not_found: "Resource not found",
session_expired: "Session expired or not initialized. Please reconnect.",
server_error: "Server error"
}

Expand Down Expand Up @@ -137,6 +139,15 @@ defmodule Hermes.MCP.Error do
}
end

def protocol(:session_expired, data) do
%__MODULE__{
code: @session_expired,
reason: :session_expired,
message: @error_messages.session_expired,
data: data
}
end

@doc """
Creates a transport-level error.

Expand Down Expand Up @@ -270,6 +281,7 @@ defmodule Hermes.MCP.Error do
defp reason_from_code(@invalid_params), do: :invalid_params
defp reason_from_code(@internal_error), do: :internal_error
defp reason_from_code(@resource_not_found), do: :resource_not_found
defp reason_from_code(@session_expired), do: :session_expired
defp reason_from_code(_), do: :server_error

defp default_message(reason) do
Expand Down
4 changes: 2 additions & 2 deletions lib/hermes/server/base.ex
Original file line number Diff line number Diff line change
Expand Up @@ -378,11 +378,11 @@ defmodule Hermes.Server.Base do
end

defp handle_server_not_initialized(state) do
error = Error.protocol(:invalid_request, %{message: "Server not initialized"})
error = Error.protocol(:session_expired)

Logging.server_event(
"request_error",
%{error: error, reason: "not_initialized"},
%{error: error, reason: "session_expired"},
level: :warning
)

Expand Down
12 changes: 11 additions & 1 deletion lib/hermes/server/transport/streamable_http/plug.ex
Original file line number Diff line number Diff line change
Expand Up @@ -277,13 +277,23 @@ if Code.ensure_loaded?(Plug) do
end

defp route_sse_response(conn, transport, session_id, response, body, context, session_header) do
if handler_pid = StreamableHTTP.get_sse_handler(transport, session_id) do
handler_pid = StreamableHTTP.get_sse_handler(transport, session_id)

# Check Process.alive? to handle race condition where handler died but
# transport hasn't processed the :DOWN message yet. Without this check,
# send/2 silently drops the message to a dead process.
if handler_pid && Process.alive?(handler_pid) do
send(handler_pid, {:sse_message, response})

conn
|> put_resp_content_type("application/json")
|> send_resp(202, "{}")
else
# Handler is missing or stale - clean up stale entry if present
if handler_pid do
StreamableHTTP.unregister_sse_handler(transport, session_id)
end

establish_sse_for_request(
conn,
transport,
Expand Down
122 changes: 122 additions & 0 deletions test/hermes/server/transport/streamable_http/plug_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -352,4 +352,126 @@ defmodule Hermes.Server.Transport.StreamableHTTP.PlugTest do
assert conn.status == 200
end
end

describe "stale SSE handler race condition" do
@moduledoc """
Tests for handling stale SSE handler PIDs due to race conditions.

Even though the transport monitors SSE handlers and cleans them up,
there's a race window between when a handler dies and when the
transport processes the {:DOWN} message. During this window,
get_sse_handler can return a stale PID.
"""

setup %{registry: registry} do
name = registry.transport(StubServer, :streamable_http)
sup = registry.task_supervisor(StubServer)

{:ok, transport} =
start_supervised({StreamableHTTP, server: StubServer, name: name, registry: registry, task_supervisor: sup})

%{transport: transport}
end

test "race condition: get_sse_handler can return stale PID before DOWN is processed", %{transport: transport} do
session_id = "race-condition-session"

# Spawn a handler process that we control
test_pid = self()

handler_pid =
spawn(fn ->
# Notify test that we're ready
send(test_pid, :handler_ready)

receive do
:die -> :ok
end
end)

# Wait for handler to be ready
assert_receive :handler_ready

# Register the handler through the transport
# Note: register_sse_handler registers self(), so we need a workaround
# We'll simulate the race by quickly killing and checking

# Register our test process instead
:ok = StreamableHTTP.register_sse_handler(transport, session_id)
test_process_pid = self()

# Verify registration worked
assert StreamableHTTP.get_sse_handler(transport, session_id) == test_process_pid

# Now, the transport will receive a :DOWN message for test_process if we exit
# But since we're the test process, we can't do that.
# Instead, let's verify that Process.alive? check works correctly:

# Kill the external handler we spawned
Process.exit(handler_pid, :kill)
Process.sleep(5)

refute Process.alive?(handler_pid),
"Handler should be dead after being killed"

# The registered handler (test process) is still alive
assert Process.alive?(test_process_pid)

# But the spawned handler_pid is dead
# This demonstrates that Process.alive? correctly distinguishes
# between live and dead processes

# The fix in route_sse_response should use:
# if handler_pid && Process.alive?(handler_pid) do
# send(...)
# else
# establish_sse_for_request(...)
# end
end

test "send/2 to dead process silently fails - the underlying issue" do
# This test documents the Erlang/Elixir behavior that causes the bug

pid =
spawn(fn ->
receive do
_ -> :ok
after
100 -> :timeout
end
end)

Process.exit(pid, :kill)
Process.sleep(10)

refute Process.alive?(pid), "Process should be dead"

# send/2 returns the message even when sending to dead process!
result = send(pid, {:test_message, "hello"})
assert result == {:test_message, "hello"}

# The message is silently lost - this is the core issue.
# The fix must check Process.alive? before sending.
end

test "Process.alive? correctly detects dead process" do
# This test verifies that Process.alive? is the correct solution

pid =
spawn(fn ->
receive do
_ -> :ok
end
end)

assert Process.alive?(pid), "Process should be alive initially"

Process.exit(pid, :kill)
Process.sleep(10)

refute Process.alive?(pid), "Process should be dead after kill"

# This proves Process.alive? is reliable for detecting stale PIDs
end
end
end