Skip to content

Commit

Permalink
Ensure rexi_buffer metric includes the internal bufferd messages
Browse files Browse the repository at this point in the history
Previously, the rexi_buffer metric reported just the mailbox size of the
buffers. However rexi buffers are special as they themselves act as explicit
message queues, and current length of those explicit buffers wasn't reflected
anywhere in metrics.

So, improve the metric usefulness and reflect both the explicit and the
implicit queue lengths.

There was already an existent `gen_buffered_count` gen_server call, however
since the buffers could potentially be in the hotpath, avoid using and instead
use a persistent term + a counter scheme similar how we do for all couch_stats
metrics.

In addition, noticed that rexi had no tests at all. It is of course, battle
tested in production, but since we made changes to it, added some tests to
cover the changed bits.
  • Loading branch information
nickva committed Jul 24, 2024
1 parent 4214120 commit d0cf54e
Show file tree
Hide file tree
Showing 4 changed files with 284 additions and 12 deletions.
52 changes: 40 additions & 12 deletions src/rexi/src/rexi_buffer.erl
Original file line number Diff line number Diff line change
Expand Up @@ -18,55 +18,76 @@
init/1,
handle_call/3,
handle_cast/2,
handle_info/2
handle_info/2,
terminate/2
]).

-export([
send/2,
start_link/1
start_link/1,
get_buffered_count/1,
erase_buffer/1
]).

-define(BUFFER_COUNT_DEFAULT, 2000).
-define(COUNTER, counter).

-record(state, {
server_id,
buffer = queue:new(),
sender = nil,
count = 0,
counter,
max_count
}).

start_link(ServerId) ->
gen_server:start_link({local, ServerId}, ?MODULE, nil, []).
gen_server:start_link({local, ServerId}, ?MODULE, [ServerId], []).

send(Dest, Msg) ->
Server = list_to_atom(lists:concat([rexi_buffer, "_", get_node(Dest)])),
gen_server:cast(Server, {deliver, Dest, Msg}).

init(_) ->
get_buffered_count(ServerId) when is_atom(ServerId) ->
case persistent_term:get(counter_key(ServerId), undefined) of
undefined -> 0;
Ref -> counters:get(Ref, 1)
end.

erase_buffer(ServerId) ->
gen_server:call(ServerId, erase_buffer, infinity).

init([ServerId]) ->
%% TODO Leverage os_mon to discover available memory in the system
Max = list_to_integer(config:get("rexi", "buffer_count", "2000")),
{ok, #state{max_count = Max}}.
Counter = counters:new(1, []),
persistent_term:put(counter_key(ServerId), Counter),
Max = config:get_integer("rexi", "buffer_count", ?BUFFER_COUNT_DEFAULT),
{ok, #state{server_id = ServerId, max_count = Max, counter = Counter}}.

handle_call(erase_buffer, _From, State) ->
{reply, ok, State#state{buffer = queue:new(), count = 0}, 0};
handle_call(get_buffered_count, _From, State) ->
{reply, State#state.count, State, 0}.
handle_call(erase_buffer, _From, #state{counter = Counter} = State) ->
counters:put(Counter, 1, 0),
{reply, ok, State#state{buffer = queue:new(), count = 0}, 0}.

handle_cast({deliver, Dest, Msg}, #state{buffer = Q, count = C} = State) ->
handle_cast({deliver, Dest, Msg}, #state{} = State) ->
#state{counter = Counter, buffer = Q, count = C} = State,
couch_stats:increment_counter([rexi, buffered]),
Q2 = queue:in({Dest, Msg}, Q),
case should_drop(State) of
true ->
couch_stats:increment_counter([rexi, dropped]),
{noreply, State#state{buffer = queue:drop(Q2)}, 0};
false ->
counters:add(Counter, 1, 1),
{noreply, State#state{buffer = Q2, count = C + 1}, 0}
end.

handle_info(timeout, #state{sender = nil, buffer = {[], []}, count = 0} = State) ->
{noreply, State};
handle_info(timeout, #state{sender = nil, count = C} = State) when C > 0 ->
#state{buffer = Q, count = C} = State,
#state{counter = Counter, buffer = Q} = State,
{{value, {Dest, Msg}}, Q2} = queue:out_r(Q),
NewState = State#state{buffer = Q2, count = C - 1},
counters:add(Counter, 1, -1),
case erlang:send(Dest, Msg, [noconnect, nosuspend]) of
ok when C =:= 1 ->
% We just sent the last queued messsage, we'll use this opportunity
Expand All @@ -86,10 +107,17 @@ handle_info(timeout, State) ->
handle_info({'DOWN', Ref, _, Pid, _}, #state{sender = {Pid, Ref}} = State) ->
{noreply, State#state{sender = nil}, 0}.

terminate(_Reason, #state{server_id = ServerId}) ->
persistent_term:erase(counter_key(ServerId)),
ok.

should_drop(#state{count = Count, max_count = Max}) ->
Count >= Max.

get_node({_, Node}) when is_atom(Node) ->
Node;
get_node(Pid) when is_pid(Pid) ->
node(Pid).

counter_key(ServerId) when is_atom(ServerId) ->
{?MODULE, ?COUNTER, ServerId}.
8 changes: 8 additions & 0 deletions src/rexi/src/rexi_server_mon.erl
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ start_link(ChildMod) ->
status() ->
gen_server:call(?MODULE, status).

aggregate_queue_len(rexi_buffer) ->
% rexi_buffer acts as an explicit message queue. In order to get useful
% metrics from it we really need to add both its process' message queue and
% already buffered messages.
ServerIds = server_ids(rexi_buffer),
MQLengths = [message_queue_len(ServerId) || ServerId <- ServerIds],
BufLengths = [rexi_buffer:get_buffered_count(ServerId) || ServerId <- ServerIds],
lists:sum(MQLengths) + lists:sum(BufLengths);
aggregate_queue_len(ChildMod) ->
lists:sum([message_queue_len(ServerId) || ServerId <- server_ids(ChildMod)]).

Expand Down
114 changes: 114 additions & 0 deletions src/rexi/test/rexi_buffer_tests.erl
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
% Licensed under the Apache License, Version 2.0 (the "License"); you may not
% use this file except in compliance with the License. You may obtain a copy of
% the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
% License for the specific language governing permissions and limitations under
% the License.

-module(rexi_buffer_tests).

-include_lib("couch/include/couch_eunit.hrl").

rexi_buffer_test_() ->
{
foreach,
fun setup/0,
fun teardown/1,
[
?TDEF_FE(t_send),
?TDEF_FE(t_get_buffered_count),
?TDEF_FE(t_buffer_erase),
?TDEF_FE(t_terminate_clears_persistent_term)
]
}.

setup() ->
Module = atom_to_binary(?MODULE),
RandSuffix = binary:encode_hex(rand:bytes(4)),
ServerId = binary_to_atom(<<Module/binary, "_", RandSuffix/binary>>),
{ok, Pid} = rexi_buffer:start_link(ServerId),
unlink(Pid),
{ServerId, Pid}.

teardown({_ServerId, Pid}) ->
case is_process_alive(Pid) of
true -> test_util:stop_sync(Pid);
false -> ok
end.

t_send({ServerId, Pid}) ->
?assert(is_process_alive(Pid)),
?assertEqual(Pid, whereis(ServerId)),
{DestPid, DestRef} = spawn_monitor(fun() ->
receive
Msg -> exit({got, Msg})
end
end),
gen_server:cast(ServerId, {deliver, DestPid, potato}),
ReceivedVal =
receive
{'DOWN', DestRef, process, DestPid, Res} -> Res
end,
?assertEqual({got, potato}, ReceivedVal).

t_get_buffered_count({ServerId, _}) ->
NonExistentDest = {foo, '[email protected]'},
?assertEqual(0, rexi_buffer:get_buffered_count('nonexistent_server_id')),
?assertEqual(0, rexi_buffer:get_buffered_count(ServerId)),
% Set a fake sender to make the buffer block
sys:replace_state(ServerId, fun(OldSt) -> setelement(4, OldSt, {foo, bar}) end),
gen_server:cast(ServerId, {deliver, NonExistentDest, potato}),
test_util:wait(fun() ->
case rexi_buffer:get_buffered_count(ServerId) of
0 -> wait;
N when is_integer(N), N > 0 -> ok
end
end),
?assertEqual(1, rexi_buffer:get_buffered_count(ServerId)),
gen_server:cast(ServerId, {deliver, NonExistentDest, tomato}),
gen_server:cast(ServerId, {deliver, NonExistentDest, cabbage}),
test_util:wait(fun() ->
case rexi_buffer:get_buffered_count(ServerId) of
N when is_integer(N), N =< 2 -> wait;
N when is_integer(N), N > 2 -> ok
end
end),
?assertEqual(3, rexi_buffer:get_buffered_count(ServerId)),
% Unblock sender
sys:replace_state(ServerId, fun(OldSt) -> setelement(4, OldSt, nil) end),
gen_server:cast(ServerId, {deliver, NonExistentDest, cucumber}),
test_util:wait(fun() ->
case rexi_buffer:get_buffered_count(ServerId) of
N when is_integer(N), N > 0 -> wait;
0 -> ok
end
end),
?assertEqual(ok, rexi_buffer:erase_buffer(ServerId)),
?assertEqual(0, rexi_buffer:get_buffered_count(ServerId)).

t_buffer_erase({ServerId, _}) ->
NonExistentDest = {foo, '[email protected]'},
?assertEqual(0, rexi_buffer:get_buffered_count('nonexistent_server_id')),
?assertEqual(0, rexi_buffer:get_buffered_count(ServerId)),
% Set a fake sender to make the buffer block
sys:replace_state(ServerId, fun(OldSt) -> setelement(4, OldSt, {foo, bar}) end),
gen_server:cast(ServerId, {deliver, NonExistentDest, potato}),
test_util:wait(fun() ->
case rexi_buffer:get_buffered_count(ServerId) of
0 -> wait;
N when is_integer(N), N > 0 -> ok
end
end),
?assertEqual(1, rexi_buffer:get_buffered_count(ServerId)),
?assertEqual(ok, rexi_buffer:erase_buffer(ServerId)),
?assertEqual(0, rexi_buffer:get_buffered_count(ServerId)).

t_terminate_clears_persistent_term({ServerId, Pid}) ->
?assertNotEqual(undefined, persistent_term:get({rexi_buffer, counter, ServerId}, undefined)),
?assertEqual(ok, gen_server:stop(Pid, shutdown, infinity)),
?assertEqual(undefined, persistent_term:get({rexi_buffer, counter, ServerId}, undefined)).
122 changes: 122 additions & 0 deletions src/rexi/test/rexi_tests.erl
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
% Licensed under the Apache License, Version 2.0 (the "License"); you may not
% use this file except in compliance with the License. You may obtain a copy of
% the License at
%
% http://www.apache.org/licenses/LICENSE-2.0
%
% Unless required by applicable law or agreed to in writing, software
% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
% License for the specific language governing permissions and limitations under
% the License.

-module(rexi_tests).

-export([
rpc_test_fun/1
]).

-include_lib("couch/include/couch_eunit.hrl").

rexi_buffer_test_() ->
{
foreach,
fun setup/0,
fun teardown/1,
[
?TDEF_FE(t_cast),
?TDEF_FE(t_sync_cast),
?TDEF_FE(t_kill),
?TDEF_FE(t_cast_error),
?TDEF_FE(t_metrics),
?TDEF_FE(t_ping)
]
}.

setup() ->
test_util:start_couch([rexi]).

teardown(Ctx) ->
test_util:stop_couch(Ctx).

rpc_test_fun({sleep, MSec}) ->
rexi:reply({sleeping, self()}),
timer:sleep(MSec);
rpc_test_fun({error, Error}) ->
error(Error);
rpc_test_fun(ping) ->
rexi:ping();
rpc_test_fun(Arg) ->
rexi:reply({Arg, get()}).

t_cast(_) ->
?assertMatch({RexiServer, node42} when is_atom(RexiServer), rexi_utils:server_pid(node42)),
put(nonce, yup),
Ref = rexi:cast(node(), {?MODULE, rpc_test_fun, [potato]}),
{Res, Dict} =
receive
{Ref, {R, D}} -> {R, maps:from_list(D)}
end,
?assertEqual(potato, Res),
?assertMatch(
#{
nonce := yup,
'$initial_call' := {?MODULE, rpc_test_fun, 1},
rexi_from := {_Pid, _Ref}
},
Dict
).

t_sync_cast(_) ->
?assertMatch({RexiServer, node42} when is_atom(RexiServer), rexi_utils:server_pid(node42)),
put(nonce, yup),
Ref = rexi:cast(node(), self(), {?MODULE, rpc_test_fun, [potato]}, [sync]),
{Res, Dict} =
receive
{Ref, {R, D}} -> {R, maps:from_list(D)}
end,
?assertEqual(potato, Res),
?assertMatch(
#{
nonce := yup,
'$initial_call' := {?MODULE, rpc_test_fun, 1},
rexi_from := {_Pid, _Ref}
},
Dict
).

t_cast_error(_) ->
?assertMatch({RexiServer, node42} when is_atom(RexiServer), rexi_utils:server_pid(node42)),
Ref = rexi:cast(node(), self(), {?MODULE, rpc_test_fun, [{error, tomato}]}, []),
Res =
receive
{Ref, RexiExit} -> RexiExit
end,
?assertMatch({rexi_EXIT, {tomato, [{?MODULE, rpc_test_fun, 1, _} | _]}}, Res).

t_kill(_) ->
Ref = rexi:cast(node(), {?MODULE, rpc_test_fun, [{sleep, 10000}]}),
WorkerPid =
receive
{Ref, {sleeping, Pid}} -> Pid
end,
?assert(is_process_alive(WorkerPid)),
Mon = monitor(process, WorkerPid),
rexi:kill_all([{node(), Ref}]),
KillReason =
receive
{'DOWN', Mon, _, _, Res} -> Res
end,
?assertEqual(killed, KillReason).

t_metrics(_) ->
?assertEqual(0, rexi:aggregate_buffer_queue_len()),
?assertEqual(0, rexi:aggregate_server_queue_len()).

t_ping(_) ->
rexi:cast(node(), {?MODULE, rpc_test_fun, [ping]}),
Res =
receive
{rexi, Ping} -> Ping
end,
?assertEqual('$rexi_ping', Res).

0 comments on commit d0cf54e

Please sign in to comment.