erlang热更卡住 - fycheung的博客

最近线上热更新时，有几次发现：在某些进程堆积了大量消息或者卡死的情况下，热更也会被卡住，主要表现为code_server进入waiting状态，并且导致remsh也没法连进节点查看信息。

热更新代码：
c.erl模块

%% l(Module)
%%  Reload Module: first call code:purge/1 on it, then code:load_file/1.
%%  The value returned is the result of the load step; the result of
%%  the purge step is discarded.
-spec l(Module) -> code:load_ret() when
      Module :: module().

l(Module) ->
    code:purge(Module),
    LoadResult = code:load_file(Module),
    LoadResult.

实际场景中的批量热更方式

1
2
3

code:soft_purge(Mod)
{ok, Prepared} = code:prepare_loading(Mods)
code:finish_loading(Prepared)

code_server卡住时的进程状态：

[{meta, [{registered_name, code_server}, 
         {dictionary, []}, 
         {status, waiting}]}, 
         {signals, [{links, []}, 
                    {monitors, []}, 
                    {monitored_by, [...]}, 
                    {trap_exit, true}]},
                    
        {location, [{initial_call, {erlang, apply, 2}}, 
                    {current_stacktrace, [{erts_code_purger, soft_purge, 1, []}, 
                                          {code_server, handle_call, 3, [{file, "code_server.erl"}, {line, 333}]}, 
                                          {code_server, loop, 1, [{file, "code_server.erl"}, {line, 154}]}]}]}, 
        {memory_used, [{memory, 319596}, 
                       {message_queue_len, 10}, 
                       {heap_size, 10958}, 
                       {total_heap_size, 39648}, 
                       {garbage_collection, [{max_heap_size, #{error_logger => true, kill => true, size => 0}}, 
                                             {min_bin_vheap_size, 46422}, 
                                             {min_heap_size, 10958}, 
                                             {fullsweep_after, 65535}, 
                                             {minor_gcs, 536}]}]}, 
        {work, [{reductions, 30906022}]}]

从进程状态可以看到进程停在soft_purge，查看源码发现这里可能会卡住：
code:soft_purge -> code_server:do_soft_purge -> erts_code_purger:soft_purge -> erts_code_purger:cpc_receive

%% Send a soft_purge request for Mod to the registered erts_code_purger
%% process and wait for the answer.
%% A fresh reference is sent with the request and matched in the reply,
%% so only the answer to this particular request is accepted.
%% NOTE: the receive has no 'after' clause, so the caller stays blocked
%% for as long as no matching reply arrives.
soft_purge(Mod) ->
    ReqRef = make_ref(),
    Request = {soft_purge, Mod, self(), ReqRef},
    erts_code_purger ! Request,
    receive
        {reply, soft_purge, Outcome, ReqRef} -> Outcome
    end.

%% Soft purge of Mod.
%% Returns {Result, Reqs1} where Reqs1 is the (possibly extended) list of
%% pending purge requests.
%% The 'prepare' step decides whether any process check is needed at all:
%% when it returns false the result is true and Reqs is returned untouched.
%% Otherwise every process returned by erlang:processes() is checked
%% (soft check, Hard = false) and the operation chosen by that check is
%% handed back to erts_internal:purge_module/2.
do_soft_purge(Mod, Reqs) ->
    Prepared = erts_internal:purge_module(Mod, prepare),
    case Prepared of
        true ->
            AllPids = erlang:processes(),
            {PurgeOp, Reqs1} = check_proc_code(AllPids, Mod, false, Reqs),
            Purged = erts_internal:purge_module(Mod, PurgeOp),
            {Purged, Reqs1};
        false ->
            {true, Reqs}
    end.
    
......

%% Start a check session for Mod over Pids and wait for its result.
%% A fresh reference is stored as the session tag in #cpc_static{},
%% together with the hard/soft flag and the purge requests known so far.
%% The value returned by cpc_init/3 is passed to cpc_receive/4 as its
%% request counter; the kill state starts as an empty #cpc_kill{} and the
%% list of requests collected during the session starts as [].
check_proc_code(Pids, Mod, Hard, PReqs) ->
    Static = #cpc_static{module = Mod,
                         hard = Hard,
                         purge_requests = PReqs,
                         tag = erlang:make_ref()},
    NoReq0 = cpc_init(Static, Pids, 0),
    cpc_receive(Static, NoReq0, #cpc_kill{}, []).

%% Collects the replies for one check session and turns them into the
%% session result via cpc_result/3.
%%
%%   CpcS      - #cpc_static{} describing the session (hard or soft
%%               check, and the tag carried by this session's replies)
%%   NoReq     - counter decremented for every reply that is counted;
%%               0 means no request is outstanding
%%   KillState - #cpc_kill{} bookkeeping, updated by hard checks and by
%%               'DOWN' messages
%%   PReqs     - purge requests received while the session is running
%%
%% NOTE: the receive has no 'after' clause. NoReq only decreases when a
%% check_process_code reply carrying this session's tag arrives, so the
%% loop keeps waiting for as long as such a reply is missing.
cpc_receive(#cpc_static{hard = true} = CpcS,
        0,
        #cpc_kill{outstanding = [], waiting = [], killed = Killed},
        PReqs) ->
    %% No outstanding cpc requests. We did a hard check, so result is
    %% whether or not we killed any processes...
    cpc_result(CpcS, PReqs, Killed);
cpc_receive(#cpc_static{hard = false} = CpcS, 0, _KillState, PReqs) ->
    %% No outstanding cpc requests and we did a soft check that succeeded...
    cpc_result(CpcS, PReqs, complete);
%% Reached when NoReq is not 0, or for a hard check whose #cpc_kill{}
%% still has outstanding or waiting entries: wait for the next message.
cpc_receive(#cpc_static{tag = Tag} = CpcS, NoReq, KillState0, PReqs) ->
    receive
    {check_process_code, {Tag, _Pid}, false} ->
        %% Process not referring the module; done with this process...
        cpc_receive(CpcS, NoReq-1, KillState0, PReqs);
    {check_process_code, {Tag, Pid}, true} ->
        %% Process referring the module...
        case CpcS#cpc_static.hard of
        false ->
            %% ... and soft check. The whole operation failed so
            %% no point continuing; fail straight away. Garbage
            %% messages from this session will be ignored
            %% by following sessions...
            cpc_result(CpcS, PReqs, abort);
        true ->
            %% ... and hard check; schedule kill of it...
            KillState1 = cpc_sched_kill(Pid, KillState0),
            cpc_receive(CpcS, NoReq-1, KillState1, PReqs)
        end;
    {'DOWN', MonRef, process, _, _} ->
        %% Only the kill state is updated here; NoReq is left unchanged.
        KillState1 = cpc_handle_down(MonRef, KillState0),
        cpc_receive(CpcS, NoReq, KillState1, PReqs);
    PReq when element(1, PReq) == purge;
          element(1, PReq) == soft_purge;
          element(1, PReq) == test_purge ->
        %% A new purge request; save it until later...
        %% (prepended, so PReqs holds the newest request first)
        cpc_receive(CpcS, NoReq, KillState0, [PReq | PReqs]);
    _Garbage ->
        %% Garbage message; ignore it...
        %% (this also covers check_process_code replies whose tag does
        %% not match this session's Tag)
        cpc_receive(CpcS, NoReq, KillState0, PReqs)
    end.

无论是用code:purge还是code:soft_purge，最终都会走到cpc_receive这里。这里的receive没有设置timeout，进程会一直处于waiting状态；
如果有进程卡住没有响应，热更新时code_server同样也会被卡住。这个坑暂时只能避开：尽量不要在有进程卡住的情况下做热更。

也有人向官方提过一个类似的问题
http://erlang.org/pipermail/erlang-questions/2018-June/095677.html
实际测试了里面的代码，erlang:process_flag(priority, high)是影响erlang:check_process_code(Pid, rec)的关键；也就是说，就算进程进入了死循环，只要进程的优先级是normal，影响就不大。

如果实在没有办法、进程卡住导致code_server也卡住，而又想让热更继续往下走的时候，也可以通过erl_call把卡住的进程kill掉。
echo "exit(c:pid(X,X,X),kill)."|/usr/local/lib/erlang/lib/erl_interface-3.13/bin/erl_call -e -name game@127.0.0.1 -c cookie