Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,24 @@
UNRELEASED
----------

### Bug Fixes

- Fix HTTP/2 pooled connections wedging under sustained concurrent load
(#836). The pool checks out a TCP connection first then upgrades to
SSL+ALPN; `connected(enter)` armed the 2s pool idle timer while the
protocol was still classified as HTTP/1.1, and the timer then fired
on a busy multiplexed HTTP/2 connection, terminating it mid-request.
`init_h2_connection` / `init_h2_after_upgrade` now explicitly cancel
the idle timer. hackney_conn also traps `EXIT` from the linked
`h2_connection` and stays alive briefly in `closed` state so late
calls that raced the pool checkout get a proper error reply instead
of `exit:{normal, _}`. Pool's `checkout_h2` validates the state of
the connection process (not just `is_process_alive`).
- Bump `h2` dependency to 0.4.0.

### Refactor

- HTTP/2 is now delegated to the `erlang_h2` library (hex `h2` 0.3.0).
- HTTP/2 is now delegated to the `erlang_h2` library (hex `h2` 0.4.0).
Hackney no longer ships its own HTTP/2 framing, HPACK codec, or
connection/stream state machine:
- `hackney_http2.erl`, `hackney_http2_machine.erl`, `hackney_hpack.erl`
Expand Down
2 changes: 1 addition & 1 deletion rebar.config
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
%% Pure Erlang QUIC + HTTP/3 stack
{quic, "1.0.0"},
%% Pure Erlang HTTP/2 stack
{h2, "0.3.0"},
{h2, "0.4.0"},
{idna, "~>7.1.0"},
{mimerl, "~>1.4"},
{certifi, "~>2.16.0"},
Expand Down
71 changes: 55 additions & 16 deletions src/hackney_conn.erl
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@

-define(CONNECT_TIMEOUT, 8000).
-define(IDLE_TIMEOUT, infinity).
%% Grace window for pooled hackney_conn in `closed` state, during which
%% late-arriving calls race the pool DOWN cleanup and still get a proper
%% error reply instead of exit:{normal, _}. See issue #836.
-define(CLOSED_GRACE_MS, 50).

%% State data record
-record(conn_data, {
Expand Down Expand Up @@ -895,6 +899,10 @@ connected({call, From}, {send_headers, Method, Path, Headers}, Data) ->
%% HTTP/2 owner messages from h2 library
connected(info, {h2, H2Conn, Event}, #conn_data{h2_conn = H2Conn} = Data) ->
handle_h2_event(Event, Data);
%% h2_connection is linked via start_link; trap_exit surfaces its termination
%% as an 'EXIT' signal. Convert to the same cleanup path as the monitor DOWN.
connected(info, {'EXIT', H2Conn, Reason}, #conn_data{h2_conn = H2Conn} = Data) ->
h2_on_closed(Reason, Data#conn_data{h2_conn = undefined, h2_mon = undefined});
connected(info, {'DOWN', Mon, process, _Pid, Reason},
#conn_data{h2_mon = Mon} = Data) ->
h2_on_closed(Reason, Data#conn_data{h2_conn = undefined, h2_mon = undefined});
Expand Down Expand Up @@ -1406,15 +1414,23 @@ closed(enter, _OldState, #conn_data{socket = Socket, transport = Transport, pool
undefined -> ok;
_ -> Transport:close(Socket)
end,
%% For pooled connections, stop the process so pool can clean up
%% For non-pooled connections, stay alive to allow reconnection
%% Pooled connections used to stop immediately here, but that made
%% late-arriving {call, From, {request, _}} messages from workers that
%% raced the pool checkout race a terminating gen_statem — which
%% surfaces as `exit:{normal, _}` in the caller (issue #836). Stay
%% alive briefly so those late calls get a proper `{error, {closed, _}}`
%% reply via handle_common's invalid_state fallback, then stop.
case PoolPid of
undefined ->
{keep_state, Data#conn_data{socket = undefined}};
_ ->
{stop, normal, Data#conn_data{socket = undefined}}
{keep_state, Data#conn_data{socket = undefined},
[{state_timeout, ?CLOSED_GRACE_MS, closed_grace_expired}]}
end;

closed(state_timeout, closed_grace_expired, Data) ->
{stop, normal, Data};

closed({call, From}, connect, Data) ->
%% Allow reconnection from closed state
#conn_data{
Expand Down Expand Up @@ -1576,6 +1592,12 @@ handle_common(info, {select, _Resource, _Ref, ready_input},
_ = hackney_h3:process(ConnRef),
keep_state_and_data;

%% With trap_exit = true, an EXIT signal from any linked process (other than
%% h2_conn, handled in connected/3) arrives here. Swallow it rather than
%% propagating — avoids tearing down the gen_statem on stray links.
handle_common(info, {'EXIT', _Pid, _Reason}, _State, _Data) ->
keep_state_and_data;

handle_common(info, _Msg, _State, _Data) ->
keep_state_and_data.

Expand Down Expand Up @@ -2303,13 +2325,18 @@ start_h2_connection(Socket, Data, From, Origin) ->
h2_mon = Mon,
h2_streams = #{}
},
%% Cancel any pending idle_timeout armed by the
%% TCP-first connected(enter): HTTP/2 connections
%% multiplex and stay in `connected`, so the 2s
%% pool default would kill a busy conn (#836).
CancelIdle = {state_timeout, infinity, idle_timeout},
case Origin of
first_connect ->
{next_state, connected, NewData,
[{reply, From, ok}]};
[CancelIdle, {reply, From, ok}]};
after_upgrade ->
{keep_state, NewData,
[{reply, From, ok}]}
[CancelIdle, {reply, From, ok}]}
end;
{error, WaitErr} ->
catch h2_connection:close(H2Conn),
Expand Down Expand Up @@ -2364,17 +2391,26 @@ do_h2_send(From, Method, Path, Headers, Body, StreamState, Mode, Data) ->
B when is_binary(B) -> B;
L -> iolist_to_binary(L)
end,
SendRes = case BodyBin of
<<>> -> h2_connection:send_request_headers(H2Conn, H2Headers, true);
_ ->
case h2_connection:send_request_headers(H2Conn, H2Headers, false) of
{ok, SId} ->
case h2_connection:send_data(H2Conn, SId, BodyBin, true) of
ok -> {ok, SId};
{error, _} = E1 -> E1
end;
Err -> Err
end
%% h2_connection can die between pool checkout and this call; gen_statem:call
%% on a dead pid raises exit:noproc. Catch that and normalise to an error
%% so the caller sees {error, {closed, _}} instead of a gen_statem:call
%% blowing up (issue #836).
SendRes = try
case BodyBin of
<<>> -> h2_connection:send_request_headers(H2Conn, H2Headers, true);
_ ->
case h2_connection:send_request_headers(H2Conn, H2Headers, false) of
{ok, SId} ->
case h2_connection:send_data(H2Conn, SId, BodyBin, true) of
ok -> {ok, SId};
{error, _} = E1 -> E1
end;
Err -> Err
end
end
catch
exit:{ExitReason, _} -> {error, {closed, ExitReason}};
exit:ExitReason -> {error, {closed, ExitReason}}
end,
case SendRes of
{ok, StreamId} ->
Expand Down Expand Up @@ -2544,6 +2580,9 @@ h2_on_closed(Reason, Data) ->
{Replies, Data1} = collect_h2_aborts({closed, Reason}, Data),
Stripped = Data1#conn_data{h2_conn = undefined, h2_mon = undefined,
socket = undefined},
%% Transition to closed. For pooled conns, closed(enter,...) keeps the
%% process alive for ?CLOSED_GRACE_MS so calls from workers that raced
%% the pool checkout get a proper error reply (issue #836).
{next_state, closed, Stripped, Replies}.

collect_h2_aborts(Err, #conn_data{h2_streams = Streams} = Data) ->
Expand Down
23 changes: 19 additions & 4 deletions src/hackney_pool.erl
Original file line number Diff line number Diff line change
Expand Up @@ -490,17 +490,18 @@ handle_call({checkin_sync, Pid, ShouldClose}, _From, State) ->
{reply, ok, State2};

handle_call({checkout_h2, Key}, _From, #state{h2_connections = H2Conns} = State) ->
%% HTTP/2 checkout - return existing connection if available
%% HTTP/2 checkout - return existing connection if available.
%% Liveness check includes both process_alive and gen_statem state, so a
%% hackney_conn that already transitioned to `closed` (e.g. after an h2
%% GOAWAY) but has not yet been removed via 'DOWN' is not handed out.
case maps:get(Key, H2Conns, undefined) of
undefined ->
{reply, none, State};
Pid ->
%% Verify connection is still alive
case erlang:is_process_alive(Pid) of
case h2_conn_usable(Pid) of
true ->
{reply, {ok, Pid}, State};
false ->
%% Connection died, remove from pool
H2Conns2 = maps:remove(Key, H2Conns),
{reply, none, State#state{h2_connections = H2Conns2}}
end
Expand Down Expand Up @@ -913,6 +914,20 @@ do_checkin_with_close_flag(Pid, ShouldClose, State) ->
State
end.

%% @private Check that a pooled HTTP/2 conn is alive and in `connected` state.
%% Short timeout so a stuck conn doesn't wedge the pool; any failure → unusable.
h2_conn_usable(Pid) ->
case erlang:is_process_alive(Pid) of
false -> false;
true ->
try hackney_conn:get_state(Pid) of
{ok, connected} -> true;
_ -> false
catch
_:_ -> false
end
end.

%% @private Remove an HTTP/2 connection from the pool
do_unregister_h2(Pid, State) ->
#state{h2_connections = H2Conns, pid_monitors = PidMonitors} = State,
Expand Down
77 changes: 77 additions & 0 deletions test/hackney_http2_concurrency_tests.erl
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
%%% Regression tests for issue #836: HTTP/2 pooled shared connection wedges
%%% under concurrent sustained load.
%%%
%%% Before the fix, connected(enter) armed a 2s idle_timeout while the
%%% connection was still classified as HTTP/1.1 (pool checks out TCP, then
%%% upgrades to SSL+ALPN); the timer fired on a perfectly busy H2 conn and
%%% tore it down mid-request, crashing in-flight `gen_statem:call`s with
%%% `exit:{normal, _}`.
-module(hackney_http2_concurrency_tests).

-include_lib("eunit/include/eunit.hrl").

%% Resolve test cert dir from the module's beam location so the paths work
%% regardless of where eunit is run from.
cert_dir() ->
BeamDir = filename:dirname(code:which(?MODULE)),
%% _build/test/lib/hackney/test -> project root -> test/certs
Root = filename:join([BeamDir, "..", "..", "..", "..", ".."]),
filename:join([filename:absname(Root), "test", "certs"]).

concurrent_tight_loop_test_() ->
{timeout, 30, fun run_concurrent_tight_loop/0}.

run_concurrent_tight_loop() ->
_ = application:ensure_all_started(hackney),
_ = application:ensure_all_started(h2),
Handler = fun(Conn, Sid, _M, _P, _H) ->
ok = h2:send_response(Conn, Sid, 200,
[{<<"content-type">>, <<"text/plain">>}]),
ok = h2:send_data(Conn, Sid, <<"ok">>, true)
end,
Certs = cert_dir(),
{ok, Server} = h2:start_server(0, #{
cert => filename:join(Certs, "server.pem"),
key => filename:join(Certs, "server.key"),
handler => Handler
}),
Port = h2:server_port(Server),
Pool = hackney_h2_concurrency_pool,
_ = hackney_pool:start_pool(Pool, [{max_connections, 10}]),
try
URL = iolist_to_binary([<<"https://localhost:">>,
integer_to_list(Port), <<"/">>]),
Opts = [{pool, Pool},
{protocols, [http2]},
{recv_timeout, 5000},
{ssl_options, [{insecure, true}, {verify, verify_none}]}],
{ok, 200, _, _} = hackney:request(get, URL, [], <<>>, Opts),
Parent = self(),
Deadline = erlang:monotonic_time(millisecond) + 3000,
Worker = fun(Name) ->
(fun Self(Count) ->
case erlang:monotonic_time(millisecond) < Deadline of
true ->
case hackney:request(get, URL, [], <<>>, Opts) of
{ok, 200, _, _} -> Self(Count + 1);
Other ->
Parent ! {worker, Name, {error, Count, Other}}
end;
false -> Parent ! {worker, Name, {done, Count}}
end
end)(0)
end,
spawn_link(fun() -> Worker(p1) end),
spawn_link(fun() -> Worker(p2) end),
R1 = receive {worker, p1, V1} -> V1 after 8000 -> stall end,
R2 = receive {worker, p2, V2} -> V2 after 2000 -> stall end,
?assertMatch({done, _}, R1),
?assertMatch({done, _}, R2),
{done, N1} = R1,
{done, N2} = R2,
?assert(N1 > 0),
?assert(N2 > 0)
after
catch hackney_pool:stop_pool(Pool),
catch h2:stop_server(Server)
end.
Loading