最近线上热更新时,有几次发现:在某些进程堆积大量消息或者卡死的情况下,热更也会被卡住,主要表现为code_server进入waiting状态,并且会导致remsh也没法连进去查看节点信息。

热更新代码:
c.erl模块

1
2
3
4
5
6
7
8
%% l(Mod)
%% Reload module Mod from file of same name.
%% Purges Mod first and then loads it again. The return value of
%% code:purge/1 is discarded; the result of code:load_file/1 is what
%% the caller gets back.
-spec l(Module) -> code:load_ret() when
Module :: module().

l(Mod) ->
%% Purge before loading; the outcome of the purge is not inspected.
code:purge(Mod),
code:load_file(Mod).

实际场景中的批量热更方式

1
2
3
%% Sketch of the batch upgrade sequence (separators between the
%% expressions are omitted in this outline).
%% Step 1: soft-purge Mod; the return value is not checked here.
%% NOTE(review): presumably repeated for every module in Mods - confirm.
code:soft_purge(Mod)
%% Step 2: prepare Mods for loading; the match fails (badmatch) unless
%% the call returns {ok, Prepared}.
{ok, Prepared} = code:prepare_loading(Mods)
%% Step 3: finish loading what was prepared in step 2.
code:finish_loading(Prepared)

code_server卡住时的进程状态:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
[{meta, [{registered_name, code_server}, 
{dictionary, []},
{status, waiting}]},
{signals, [{links, []},
{monitors, []},
{monitored_by, [...]},
{trap_exit, true}]},

{location, [{initial_call, {erlang, apply, 2}},
{current_stacktrace, [{erts_code_purger, soft_purge, 1, []},
{code_server, handle_call, 3, [{file, "code_server.erl"}, {line, 333}]},
{code_server, loop, 1, [{file, "code_server.erl"}, {line, 154}]}]}]},
{memory_used, [{memory, 319596},
{message_queue_len, 10},
{heap_size, 10958},
{total_heap_size, 39648},
{garbage_collection, [{max_heap_size, #{error_logger => true, kill => true, size => 0}},
{min_bin_vheap_size, 46422},
{min_heap_size, 10958},
{fullsweep_after, 65535},
{minor_gcs, 536}]}]},
{work, [{reductions, 30906022}]}]

从进程状态发现进程停在soft_purge,查看源码发现这里可能会卡住:
code:soft_purge -> code_server:do_soft_purge -> erts_code_purger:soft_purge -> erts_code_purger:cpc_receive

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
%% soft_purge(Mod)
%% Client side of a soft purge: sends a request to the registered
%% process erts_code_purger and returns the Result carried by the
%% matching reply.
soft_purge(Mod) ->
%% A fresh reference correlates the reply with this particular request.
Ref = make_ref(),
erts_code_purger ! {soft_purge, Mod, self(), Ref},
%% Selective receive on Ref with no 'after' clause: the calling process
%% stays blocked here until a reply with this Ref arrives.
receive
{reply, soft_purge, Result, Ref} ->
Result
end.

%% do_soft_purge(Mod, Reqs)
%% Performs the soft purge of Mod and returns a pair: the purge result
%% and the list of pending purge requests.
%%   Mod  - module to purge
%%   Reqs - purge requests collected so far; passed through to
%%          check_proc_code/4, which may hand back an extended list
do_soft_purge(Mod, Reqs) ->
case erts_internal:purge_module(Mod, prepare) of
false ->
%% Prepare step returned false: report true without checking any
%% process (presumably there is nothing to purge - confirm).
{true, Reqs};
true ->
%% Soft check (Hard = false) against every process that exists
%% at this moment.
{PurgeOp, NewReqs} = check_proc_code(erlang:processes(),
Mod, false, Reqs),
%% PurgeOp comes from the check above and is passed on as the second
%% argument of purge_module/2.
{erts_internal:purge_module(Mod, PurgeOp), NewReqs}
end.

......

%% check_proc_code(Pids, Mod, Hard, PReqs)
%% Starts one check session for Mod over the processes in Pids and
%% returns whatever cpc_receive/4 returns.
%%   Pids  - processes to check
%%   Mod   - module being purged
%%   Hard  - true for a hard check, false for a soft check
%%   PReqs - purge requests already queued
check_proc_code(Pids, Mod, Hard, PReqs) ->
%% Fresh tag so that replies belonging to this session can be matched.
Tag = erlang:make_ref(),
CpcS = #cpc_static{hard = Hard,
module = Mod,
tag = Tag,
purge_requests = PReqs},
%% The value returned by cpc_init/3 is used by cpc_receive/4 as the
%% number of replies still awaited; the loop starts with an empty
%% #cpc_kill{} record and an empty list of newly arrived requests.
cpc_receive(CpcS, cpc_init(CpcS, Pids, 0), #cpc_kill{}, []).

%% cpc_receive(CpcS, NoReq, KillState, PReqs)
%% Receive loop that collects the check_process_code replies belonging
%% to this session (identified by the tag stored in CpcS).
%%   CpcS      - #cpc_static{} record built by check_proc_code/4
%%   NoReq     - number of replies still awaited; decremented for each
%%               check_process_code reply that is handled
%%   KillState - #cpc_kill{} record tracking scheduled kills (hard check)
%%   PReqs     - purge requests that arrived meanwhile, newest first
%% The first two clauses end the loop once NoReq is 0. Otherwise the
%% function blocks in a receive that has no 'after' clause, so it only
%% proceeds when another message arrives.
cpc_receive(#cpc_static{hard = true} = CpcS,
0,
#cpc_kill{outstanding = [], waiting = [], killed = Killed},
PReqs) ->
%% No outstanding cpc requests. We did a hard check, so result is
%% whether or not we killed any processes...
cpc_result(CpcS, PReqs, Killed);
cpc_receive(#cpc_static{hard = false} = CpcS, 0, _KillState, PReqs) ->
%% No outstanding cpc requests and we did a soft check that succeeded...
cpc_result(CpcS, PReqs, complete);
%% Reached while replies are still awaited, or for a hard check with
%% NoReq = 0 whose kill state still has outstanding or waiting entries.
cpc_receive(#cpc_static{tag = Tag} = CpcS, NoReq, KillState0, PReqs) ->
receive
{check_process_code, {Tag, _Pid}, false} ->
%% Process not referring the module; done with this process...
cpc_receive(CpcS, NoReq-1, KillState0, PReqs);
{check_process_code, {Tag, Pid}, true} ->
%% Process referring the module...
case CpcS#cpc_static.hard of
false ->
%% ... and soft check. The whole operation failed so
%% no point continuing; fail straight away. Garbage
%% messages from this session will be ignored
%% by following sessions...
cpc_result(CpcS, PReqs, abort);
true ->
%% ... and hard check; schedule kill of it...
KillState1 = cpc_sched_kill(Pid, KillState0),
cpc_receive(CpcS, NoReq-1, KillState1, PReqs)
end;
{'DOWN', MonRef, process, _, _} ->
%% A monitored process went down; only the kill state is updated,
%% NoReq is left unchanged.
KillState1 = cpc_handle_down(MonRef, KillState0),
cpc_receive(CpcS, NoReq, KillState1, PReqs);
PReq when element(1, PReq) == purge;
element(1, PReq) == soft_purge;
element(1, PReq) == test_purge ->
%% A new purge request; save it until later...
cpc_receive(CpcS, NoReq, KillState0, [PReq | PReqs]);
_Garbage ->
%% Garbage message; ignore it...
cpc_receive(CpcS, NoReq, KillState0, PReqs)
end.

无论是用code:purge还是soft_purge,最终都会走到cpc_receive这里,这里的receive是没有timeout,进程会一直waiting,
如果有进程卡住没响应的情况下,同样热更新时候的code_server也会卡住,这个坑暂时只能避过,尽量不要在有进程卡住的情况下做热更。

也有人向官方提过一个类似的问题
http://erlang.org/pipermail/erlang-questions/2018-June/095677.html
实际测试了里面的代码,发现erlang:process_flag(priority, high)是影响erlang:check_process_code(Pid, rec)的关键;也就是说,就算进程进入死循环,只要进程的优先级是normal,影响就不大。

实在没办法、有进程卡住导致code_server也卡住、而又想让热更继续往下走的时候,也可以通过erl_call把卡住的进程kill掉。
echo "exit(c:pid(X,X,X),kill)."|/usr/local/lib/erlang/lib/erl_interface-3.13/bin/erl_call -e -name game@127.0.0.1 -c cookie