diff --git a/NEWS.md b/NEWS.md
index bb9b7230..4e834108 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,9 +3,24 @@
 UNRELEASED
 ----------
 
+### Bug Fixes
+
+- Fix HTTP/2 pooled connections wedging under sustained concurrent load
+  (#836). The pool checks out a TCP connection first, then upgrades to
+  SSL+ALPN; `connected(enter)` armed the 2s pool idle timer while the
+  protocol was still classified as HTTP/1.1, and the timer then fired
+  on a busy multiplexed HTTP/2 connection, terminating it mid-request.
+  `init_h2_connection` / `init_h2_after_upgrade` now explicitly cancel
+  the idle timer. `hackney_conn` also traps `EXIT` from the linked
+  `h2_connection` and stays alive briefly in `closed` state so late
+  calls that raced the pool checkout get a proper error reply instead
+  of `exit:{normal, _}`. The pool's `checkout_h2` validates the state
+  of the connection process (not just `is_process_alive`).
+- Bump `h2` dependency to 0.4.0.
+
 ### Refactor
 
-- HTTP/2 is now delegated to the `erlang_h2` library (hex `h2` 0.3.0).
+- HTTP/2 is now delegated to the `erlang_h2` library (hex `h2` 0.4.0).
   Hackney no longer ships its own HTTP/2 framing, HPACK codec, or
   connection/stream state machine:
   - `hackney_http2.erl`, `hackney_http2_machine.erl`, `hackney_hpack.erl`
diff --git a/rebar.config b/rebar.config
index d32bb642..6f76a53d 100644
--- a/rebar.config
+++ b/rebar.config
@@ -60,7 +60,7 @@
     %% Pure Erlang QUIC + HTTP/3 stack
     {quic, "1.0.0"},
     %% Pure Erlang HTTP/2 stack
-    {h2, "0.3.0"},
+    {h2, "0.4.0"},
     {idna, "~>7.1.0"},
     {mimerl, "~>1.4"},
     {certifi, "~>2.16.0"},
diff --git a/src/hackney_conn.erl b/src/hackney_conn.erl
index cf7a7a49..02a96595 100644
--- a/src/hackney_conn.erl
+++ b/src/hackney_conn.erl
@@ -107,6 +107,10 @@
 
 -define(CONNECT_TIMEOUT, 8000).
 -define(IDLE_TIMEOUT, infinity).
+%% Grace window for pooled hackney_conn in `closed` state, during which
+%% late-arriving calls race the pool DOWN cleanup and still get a proper
+%% error reply instead of exit:{normal, _}. See issue #836.
+-define(CLOSED_GRACE_MS, 50).
 
 %% State data record
 -record(conn_data, {
@@ -895,6 +899,10 @@ connected({call, From}, {send_headers, Method, Path, Headers}, Data) ->
 
 %% HTTP/2 owner messages from h2 library
 connected(info, {h2, H2Conn, Event}, #conn_data{h2_conn = H2Conn} = Data) ->
     handle_h2_event(Event, Data);
+%% h2_connection is linked via start_link; trap_exit surfaces its termination
+%% as an 'EXIT' signal. Convert to the same cleanup path as the monitor DOWN.
+connected(info, {'EXIT', H2Conn, Reason}, #conn_data{h2_conn = H2Conn} = Data) ->
+    h2_on_closed(Reason, Data#conn_data{h2_conn = undefined, h2_mon = undefined});
 connected(info, {'DOWN', Mon, process, _Pid, Reason},
           #conn_data{h2_mon = Mon} = Data) ->
     h2_on_closed(Reason, Data#conn_data{h2_conn = undefined, h2_mon = undefined});
@@ -1406,15 +1414,23 @@ closed(enter, _OldState, #conn_data{socket = Socket, transport = Transport, pool
         undefined -> ok;
         _ -> Transport:close(Socket)
     end,
-    %% For pooled connections, stop the process so pool can clean up
-    %% For non-pooled connections, stay alive to allow reconnection
+    %% Pooled connections used to stop immediately here, but that made
+    %% late-arriving {call, From, {request, _}} messages from workers that
+    %% raced the pool checkout arrive at a terminating gen_statem, which
+    %% surfaces as `exit:{normal, _}` in the caller (issue #836). Stay
+    %% alive briefly so those late calls get a proper `{error, {closed, _}}`
+    %% reply via handle_common's invalid_state fallback, then stop.
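+    %% (Typical race, as described in the pool change below: a worker checks
+    %% this conn out just as a GOAWAY or socket close flips it to `closed`;
+    %% the worker's call then lands inside this grace window instead of
+    %% hitting a dead process.)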
     case PoolPid of
         undefined ->
             {keep_state, Data#conn_data{socket = undefined}};
         _ ->
-            {stop, normal, Data#conn_data{socket = undefined}}
+            {keep_state, Data#conn_data{socket = undefined},
+             [{state_timeout, ?CLOSED_GRACE_MS, closed_grace_expired}]}
     end;
 
+closed(state_timeout, closed_grace_expired, Data) ->
+    {stop, normal, Data};
+
 closed({call, From}, connect, Data) ->
     %% Allow reconnection from closed state
     #conn_data{
@@ -1576,6 +1592,12 @@ handle_common(info, {select, _Resource, _Ref, ready_input},
     _ = hackney_h3:process(ConnRef),
     keep_state_and_data;
 
+%% With trap_exit = true, an EXIT signal from any linked process (other than
+%% h2_conn, handled in connected/3) arrives here. Swallow it instead of
+%% letting a stray link tear down the gen_statem.
+handle_common(info, {'EXIT', _Pid, _Reason}, _State, _Data) ->
+    keep_state_and_data;
+
 handle_common(info, _Msg, _State, _Data) ->
     keep_state_and_data.
 
@@ -2303,13 +2325,18 @@ start_h2_connection(Socket, Data, From, Origin) ->
                 h2_mon = Mon,
                 h2_streams = #{}
             },
+            %% Cancel any pending idle_timeout armed by the
+            %% TCP-first connected(enter): HTTP/2 connections
+            %% multiplex and stay in `connected`, so the 2s
+            %% pool default would kill a busy conn (#836).
+            CancelIdle = {state_timeout, infinity, idle_timeout},
             case Origin of
                 first_connect ->
                     {next_state, connected, NewData,
-                     [{reply, From, ok}]};
+                     [CancelIdle, {reply, From, ok}]};
                 after_upgrade ->
                     {keep_state, NewData,
-                     [{reply, From, ok}]}
+                     [CancelIdle, {reply, From, ok}]}
             end;
         {error, WaitErr} ->
             catch h2_connection:close(H2Conn),
@@ -2364,17 +2391,26 @@ do_h2_send(From, Method, Path, Headers, Body, StreamState, Mode, Data) ->
                   B when is_binary(B) -> B;
                   L -> iolist_to_binary(L)
               end,
-    SendRes = case BodyBin of
-        <<>> -> h2_connection:send_request_headers(H2Conn, H2Headers, true);
-        _ ->
-            case h2_connection:send_request_headers(H2Conn, H2Headers, false) of
-                {ok, SId} ->
-                    case h2_connection:send_data(H2Conn, SId, BodyBin, true) of
-                        ok -> {ok, SId};
-                        {error, _} = E1 -> E1
-                    end;
-                Err -> Err
-            end
+    %% h2_connection can die between pool checkout and this call; gen_statem:call
+    %% on a dead pid exits with {noproc, _}. Catch that and normalise to an error
+    %% so the caller sees {error, {closed, _}} instead of an uncaught
+    %% gen_statem:call exit (issue #836).
+    SendRes = try
+        case BodyBin of
+            <<>> -> h2_connection:send_request_headers(H2Conn, H2Headers, true);
+            _ ->
+                case h2_connection:send_request_headers(H2Conn, H2Headers, false) of
+                    {ok, SId} ->
+                        case h2_connection:send_data(H2Conn, SId, BodyBin, true) of
+                            ok -> {ok, SId};
+                            {error, _} = E1 -> E1
+                        end;
+                    Err -> Err
+                end
+        end
+    catch
+        exit:{ExitReason, _} -> {error, {closed, ExitReason}};
+        exit:ExitReason -> {error, {closed, ExitReason}}
    end,
    case SendRes of
        {ok, StreamId} ->
@@ -2544,6 +2580,9 @@ h2_on_closed(Reason, Data) ->
    {Replies, Data1} = collect_h2_aborts({closed, Reason}, Data),
    Stripped = Data1#conn_data{h2_conn = undefined, h2_mon = undefined,
                               socket = undefined},
+    %% Transition to closed. For pooled conns, closed(enter,...) keeps the
+    %% process alive for ?CLOSED_GRACE_MS so calls from workers that raced
+    %% the pool checkout get a proper error reply (issue #836).
    {next_state, closed, Stripped, Replies}.
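+%% The pooled failure path, end to end:
+%%   'EXIT'/'DOWN' from h2_conn
+%%     -> h2_on_closed/2 replies {error, {closed, Reason}} to in-flight streams
+%%     -> closed(enter) arms {state_timeout, ?CLOSED_GRACE_MS, closed_grace_expired}
+%%     -> late calls inside the window get {error, {closed, _}}
+%%     -> closed_grace_expired -> {stop, normal}.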
collect_h2_aborts(Err, #conn_data{h2_streams = Streams} = Data) ->
diff --git a/src/hackney_pool.erl b/src/hackney_pool.erl
index 9b20c422..3b210f80 100644
--- a/src/hackney_pool.erl
+++ b/src/hackney_pool.erl
@@ -490,17 +490,18 @@ handle_call({checkin_sync, Pid, ShouldClose}, _From, State) ->
     {reply, ok, State2};
 
 handle_call({checkout_h2, Key}, _From, #state{h2_connections = H2Conns} = State) ->
-    %% HTTP/2 checkout - return existing connection if available
+    %% HTTP/2 checkout - return existing connection if available.
+    %% Liveness check includes both process_alive and gen_statem state, so a
+    %% hackney_conn that already transitioned to `closed` (e.g. after an h2
+    %% GOAWAY) but has not yet been removed via 'DOWN' is not handed out.
     case maps:get(Key, H2Conns, undefined) of
         undefined ->
             {reply, none, State};
         Pid ->
-            %% Verify connection is still alive
-            case erlang:is_process_alive(Pid) of
+            case h2_conn_usable(Pid) of
                 true ->
                     {reply, {ok, Pid}, State};
                 false ->
-                    %% Connection died, remove from pool
                     H2Conns2 = maps:remove(Key, H2Conns),
                     {reply, none, State#state{h2_connections = H2Conns2}}
             end
@@ -913,6 +914,20 @@ do_checkin_with_close_flag(Pid, ShouldClose, State) ->
             State
     end.
 
+%% @private Check that a pooled HTTP/2 conn is alive and in `connected` state.
+%% Any failure (dead pid, unexpected state, or a crashed or timed-out call)
+%% marks the conn unusable.
+h2_conn_usable(Pid) ->
+    case erlang:is_process_alive(Pid) of
+        false -> false;
+        true ->
+            try hackney_conn:get_state(Pid) of
+                {ok, connected} -> true;
+                _ -> false
+            catch
+                _:_ -> false
+            end
+    end.
+
 %% @private Remove an HTTP/2 connection from the pool
 do_unregister_h2(Pid, State) ->
     #state{h2_connections = H2Conns, pid_monitors = PidMonitors} = State,
diff --git a/test/hackney_http2_concurrency_tests.erl b/test/hackney_http2_concurrency_tests.erl
new file mode 100644
index 00000000..57bcf59c
--- /dev/null
+++ b/test/hackney_http2_concurrency_tests.erl
@@ -0,0 +1,77 @@
+%%% Regression tests for issue #836: HTTP/2 pooled shared connection wedges
+%%% under concurrent sustained load.
+%%%
+%%% Before the fix, connected(enter) armed a 2s idle_timeout while the
+%%% connection was still classified as HTTP/1.1 (pool checks out TCP, then
+%%% upgrades to SSL+ALPN); the timer fired on a busy, perfectly healthy H2
+%%% conn and tore it down mid-request, crashing in-flight `gen_statem:call`s
+%%% with `exit:{normal, _}`.
+-module(hackney_http2_concurrency_tests).
+
+-include_lib("eunit/include/eunit.hrl").
+
+%% Resolve test cert dir from the module's beam location so the paths work
+%% regardless of where eunit is run from.
+cert_dir() ->
+    BeamDir = filename:dirname(code:which(?MODULE)),
+    %% _build/test/lib/hackney/test -> project root -> test/certs
+    Root = filename:join([BeamDir, "..", "..", "..", "..", ".."]),
+    filename:join([filename:absname(Root), "test", "certs"]).
+
+concurrent_tight_loop_test_() ->
+    {timeout, 30, fun run_concurrent_tight_loop/0}.
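+%% The 30s wrapper above is deliberate: eunit's default per-test timeout is
+%% 5s, well under the 3s tight-loop deadline plus pool and TLS warm-up.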
+
+run_concurrent_tight_loop() ->
+    _ = application:ensure_all_started(hackney),
+    _ = application:ensure_all_started(h2),
+    Handler = fun(Conn, Sid, _M, _P, _H) ->
+        ok = h2:send_response(Conn, Sid, 200,
+                              [{<<"content-type">>, <<"text/plain">>}]),
+        ok = h2:send_data(Conn, Sid, <<"ok">>, true)
+    end,
+    Certs = cert_dir(),
+    {ok, Server} = h2:start_server(0, #{
+        cert => filename:join(Certs, "server.pem"),
+        key => filename:join(Certs, "server.key"),
+        handler => Handler
+    }),
+    Port = h2:server_port(Server),
+    Pool = hackney_h2_concurrency_pool,
+    _ = hackney_pool:start_pool(Pool, [{max_connections, 10}]),
+    try
+        URL = iolist_to_binary([<<"https://localhost:">>,
+                                integer_to_list(Port), <<"/">>]),
+        %% `insecure` is a hackney-level option, not an ssl one; keep it out
+        %% of ssl_options so it is not forwarded to the ssl module.
+        Opts = [{pool, Pool},
+                {protocols, [http2]},
+                {insecure, true},
+                {recv_timeout, 5000},
+                {ssl_options, [{verify, verify_none}]}],
+        {ok, 200, _, _} = hackney:request(get, URL, [], <<>>, Opts),
+        Parent = self(),
+        Deadline = erlang:monotonic_time(millisecond) + 3000,
+        Worker = fun(Name) ->
+            (fun Self(Count) ->
+                case erlang:monotonic_time(millisecond) < Deadline of
+                    true ->
+                        case hackney:request(get, URL, [], <<>>, Opts) of
+                            {ok, 200, _, _} -> Self(Count + 1);
+                            Other ->
+                                Parent ! {worker, Name, {error, Count, Other}}
+                        end;
+                    false -> Parent ! {worker, Name, {done, Count}}
+                end
+            end)(0)
+        end,
+        spawn_link(fun() -> Worker(p1) end),
+        spawn_link(fun() -> Worker(p2) end),
+        R1 = receive {worker, p1, V1} -> V1 after 8000 -> stall end,
+        R2 = receive {worker, p2, V2} -> V2 after 2000 -> stall end,
+        ?assertMatch({done, _}, R1),
+        ?assertMatch({done, _}, R2),
+        {done, N1} = R1,
+        {done, N2} = R2,
+        ?assert(N1 > 0),
+        ?assert(N2 > 0)
+    after
+        catch hackney_pool:stop_pool(Pool),
+        catch h2:stop_server(Server)
+    end.
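
The timer mechanics that `CancelIdle` relies on reduce to the minimal gen_statem sketch below. The module and function names (`idle_cancel_demo`, `upgrade/1`) are hypothetical, chosen only to illustrate the pattern; this is not hackney code. A `state_timeout` armed by the `enter` callback is cancelled by re-arming it with `infinity` from a later event that keeps the state (so no new enter call re-arms it), which is the shape of the `after_upgrade` path in `start_h2_connection`.

-module(idle_cancel_demo).
-behaviour(gen_statem).

-export([start_link/0, upgrade/1]).
-export([init/1, callback_mode/0, connected/3]).

-define(IDLE_MS, 2000).

start_link() ->
    gen_statem:start_link(?MODULE, [], []).

%% Stands in for the SSL+ALPN upgrade: reclassify the conn as multiplexed.
upgrade(Pid) ->
    gen_statem:call(Pid, upgrade).

callback_mode() ->
    [state_functions, state_enter].

init([]) ->
    {ok, connected, http1}.

%% Entering `connected` always arms the idle timer: the pre-fix behaviour
%% that tore down busy HTTP/2 connections (#836).
connected(enter, _OldState, _Proto) ->
    {keep_state_and_data, [{state_timeout, ?IDLE_MS, idle_timeout}]};
%% keep_state performs no enter call, so re-arming with `infinity` is final:
%% gen_statem treats a state_timeout of infinity as a cancellation.
connected({call, From}, upgrade, _Proto) ->
    {keep_state, http2,
     [{state_timeout, infinity, idle_timeout}, {reply, From, ok}]};
%% Reachable only while still http1; after upgrade/1 the timer is gone.
connected(state_timeout, idle_timeout, _Proto) ->
    {stop, normal}.

Calling `upgrade/1` within the 2s window keeps the process alive indefinitely; without it the sketch stops at 2s, which is exactly how pooled HTTP/2 connections were dying mid-request before this change.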