Android 12 init 子进程回收与服务重启分析

佚名 2024-05-10 11:33:59 举报

篇首语：本文由小编为大家整理，主要介绍了Android 12 init 子进程回收与服务重启分析相关的知识，希望对你有一定的参考价值。

文章托管在gitee上 Android Notes , 同步csdn
本文基于android12 分析

在init运行过程中，不可避免的会出现子进程或服务退出，需要做一些针对性处理：

对于已终止的子进程需要将其回收掉，防止产生僵尸进程
对于非oneshot服务，需要重新将其拉起，防止服务因异常退出而长期不可用。

处理子进程退出

在init中通过监听信号 SIGCHLD，来获取子进程终止事件，然后做一些针对性动作。

InstallSignalFdHandler

初始化信号处理器，注册子进程终止的监听

/// @system/core/init/init.cppstatic void InstallSignalFdHandler(Epoll* epoll)     // Applying SA_NOCLDSTOP to a defaulted SIGCHLD handler prevents the signalfd from receiving    // SIGCHLD when a child process stops or continues (b/77867680#comment9).    const struct sigaction act  .sa_handler = SIG_DFL, .sa_flags = SA_NOCLDSTOP ;    sigaction(SIGCHLD, &act, nullptr);// 添加flag ，不接收进程 stop/continue 事件    sigset_t mask;    sigemptyset(&mask);    sigaddset(&mask, SIGCHLD);    if (!IsRebootCapable())  // 没有CAP_SYS_BOOT capability，不具备重启能力        // If init does not have the CAP_SYS_BOOT capability, it is running in a container.        // In that case, receiving SIGTERM will cause the system to shut down.        sigaddset(&mask, SIGTERM); // 添加SIGTERM到信号集        if (sigprocmask(SIG_BLOCK, &mask, nullptr) == -1)  // block这些信号，与signalfd匹配使用        PLOG(FATAL) << "failed to block signals";        // Register a handler to unblock signals in the child processes.    // UnblockSignals在fork返回之前，在子进程上下文中被执行，使得子进程不block这些信号    const int result = pthread_atfork(nullptr, nullptr, &UnblockSignals);    if (result != 0)         LOG(FATAL) << "Failed to register a fork handler: " << strerror(result);        signal_fd = signalfd(-1, &mask, SFD_CLOEXEC); // 创建fd，用于读取被block的信号    if (signal_fd == -1)         PLOG(FATAL) << "failed to create signalfd";        // 通过 epoll 监听新的信号到来    if (auto result = epoll->RegisterHandler(signal_fd, HandleSignalFd); !result.ok())         LOG(FATAL) << result.error();

UnblockSignals

在子进程执行该函数，即子进程默认是不阻塞这些信号的。

/// @system/core/init/init.cppstatic void UnblockSignals()     const struct sigaction act  .sa_handler = SIG_DFL ;    sigaction(SIGCHLD, &act, nullptr);    sigset_t mask;    sigemptyset(&mask);    sigaddset(&mask, SIGCHLD);    sigaddset(&mask, SIGTERM);    if (sigprocmask(SIG_UNBLOCK, &mask, nullptr) == -1)         PLOG(FATAL) << "failed to unblock signals for PID " << getpid();

当epoll监听到signal_fd有事件到来，即产生了相关信号，则会回调HandleSignalFd来处理

HandleSignalFd

/// system/core/init/init.cppstatic void HandleSignalFd()     signalfd_siginfo siginfo;    // 从fd读取信号信息    ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo)));    if (bytes_read != sizeof(siginfo))         PLOG(ERROR) << "Failed to read siginfo from signal_fd";        return;        switch (siginfo.ssi_signo)         case SIGCHLD: // 子进程终止事件            ReapAnyOutstandingChildren();            break;        case SIGTERM: // 信号15，kill命令默认发送的信号            HandleSigtermSignal(siginfo);            break;        default:            PLOG(ERROR) << "signal_fd: received unexpected signal " << siginfo.ssi_signo;            break;

处理 SIGCHLD 会调用ReapAnyOutstandingChildren，它实现了所有终止子进程的回收

ReapAnyOutstandingChildren

/// @system/core/init/sigchld_handler.cppvoid ReapAnyOutstandingChildren()     while (ReapOneProcess() != 0)  // 循环处理所有已终止的进程（调用exit或被信号杀死）

ReapOneProcess

这个函数的作用如下：

调用waitid（带 WNOWAIT，此时并不回收）获取一个已经终止的子进程，待函数返回时再由 scope guard 中的 waitpid 真正将其回收
打印进程死亡原因，被信号kill或者调用exit退出
针对 service 调用其 Reap 函数，清理状态、处理重启及 onrestart 命令

/// @system/core/init/sigchld_handler.cppstatic pid_t ReapOneProcess()     siginfo_t siginfo = ;    // This returns a zombie pid or informs us that there are no zombies left to be reaped.    // It does NOT reap the pid; that is done below.    if (TEMP_FAILURE_RETRY(waitid(P_ALL, 0, &siginfo, WEXITED | WNOHANG | WNOWAIT)) != 0)         PLOG(ERROR) << "waitid failed";        return 0;        auto pid = siginfo.si_pid;    if (pid == 0) return 0;    // At this point we know we have a zombie pid, so we use this scopeguard to reap the pid    // whenever the function returns from this point forward.    // We do NOT want to reap the zombie earlier as in Service::Reap(), we kill(-pid, ...) and we    // want the pid to remain valid throughout that (and potentially future) usages.    auto reaper = make_scope_guard([pid]  TEMP_FAILURE_RETRY(waitpid(pid, nullptr, WNOHANG)); );    std::string name;    std::string wait_string;    Service* service = nullptr;    if (SubcontextChildReap(pid))  // 处理Subcontext进程退出，非正在关机中会重启该进程        name = "Subcontext";     else       // 判断该进程是否是某个服务，比如surfaceflinger        service = ServiceList::GetInstance().FindService(pid, &Service::pid);        if (service)  // 服务存在            name = StringPrintf("Service "%s" (pid %d)", service->name().c_str(), pid);            if (service->flags() & SVC_EXEC)  // 通过"exec" or "exec_start" 启动的可执行程序进程                auto exec_duration = boot_clock::now() - service->time_started();                auto exec_duration_ms =                    std::chrono::duration_cast<std::chrono::milliseconds>(exec_duration).count();                wait_string = StringPrintf(" waiting took %f seconds", exec_duration_ms / 1000.0f);             else if (service->flags() & SVC_ONESHOT)  // 一次性的服务                auto exec_duration = boot_clock::now() - service->time_started();                auto exec_duration_ms =                        std::chrono::duration_cast<std::chrono::milliseconds>(exec_duration)                                
.count();                wait_string = StringPrintf(" oneshot service took %f seconds in background",                                           exec_duration_ms / 1000.0f);                     else             name = StringPrintf("Untracked pid %d", pid); // 非服务进程，未追踪的进程退出                if (siginfo.si_code == CLD_EXITED)  // 进程 exit        LOG(INFO) << name << " exited with status " << siginfo.si_status << wait_string;     else  // 进程被 kill        LOG(INFO) << name << " received signal " << siginfo.si_status << wait_string;        if (!service) return pid;    service->Reap(siginfo); // 调用Reap，做清理工作，并重启非oneshot的服务    if (service->flags() & SVC_TEMPORARY)  // 通过"exec" 启动的服务        ServiceList::GetInstance().RemoveService(*service);        return pid;

Service::Reap

kill进程组所有进程
清理所有socket资源相关文件
回调reap_callbacks_，比如之前设置的启动失败回调
critical服务在 fatal_crash_window_ 窗口时间内(文中为4分钟)或开机完成之前退出超过4次，则重启到BootLoader
标记服务SVC_RESTARTING，在HandleProcessActions中重启服务
执行onrestart命令
通知服务状态改变

/// @system/core/init/service.cpp  void Service::Reap(const siginfo_t& siginfo)       if (!(flags_ & SVC_ONESHOT) || (flags_ & SVC_RESTART)) // 不是一次性的或者需要重启的          KillProcessGroup(SIGKILL, false); // 服务死亡，杀死其进程组所有进程， 第二个参数表示是否report_oneshot       else           // Legacy behavior from ~2007 until Android R: this else branch did not exist and we did not          // kill the process group in this case.          if (SelinuxGetVendorAndroidVersion() >= __ANDROID_API_R__)  // 杀死oneshot服务的进程组              // The new behavior in Android R is to kill these process groups in all cases.  The              // "true" parameter instructions KillProcessGroup() to report a warning message where it              // detects a difference in behavior has occurred.              KillProcessGroup(SIGKILL, true);                      // Remove any socket resources we may have created.      for (const auto& socket : sockets_)  // 清理该服务创建的socket 路径文件          auto path = ANDROID_SOCKET_DIR "/" + socket.name;          unlink(path.c_str());            for (const auto& f : reap_callbacks_)  // 执行通过 AddReapCallback 添加的reap操作的回调          f(siginfo);            if ((siginfo.si_code != CLD_EXITED || siginfo.si_status != 0) && on_failure_reboot_target_)           LOG(ERROR) << "Service with "reboot_on_failure" option failed, shutting down system.";          trigger_shutdown(*on_failure_reboot_target_);// 带有reboot_on_failure选项的服务，非正常退出则会触发关机            if (flags_ & SVC_EXEC) UnSetExec();  // 重置 is_exec_service_running_ flag      if (flags_ & SVC_TEMPORARY) return; // 临时oneshot服务，返回      pid_ = 0;      flags_ &= (~SVC_RUNNING);      start_order_ = 0;      // Oneshot processes go into the disabled state on exit,      // except when manually restarted.      
// 标记为 SVC_RESTART 的，是需要重启服务的。在StopOrReset函数先kill进程，然后标记为SVC_RESTART，到回收后则进行重启      if ((flags_ & SVC_ONESHOT) && !(flags_ & SVC_RESTART) && !(flags_ & SVC_RESET))           flags_ |= SVC_DISABLED; // oneshot服务置disabled状态            // Disabled and reset processes do not get restarted automatically.      if (flags_ & (SVC_DISABLED | SVC_RESET))   // disabled 和 reset 状态服务不重启          NotifyStateChange("stopped");          return;        #if INIT_FULL_SOURCES      static bool is_apex_updatable = android::sysprop::ApexProperties::updatable().value_or(false);  #else      static bool is_apex_updatable = false;  #endif      const bool is_process_updatable = !use_bootstrap_ns_ && is_apex_updatable;      // If we crash > 4 times in "fatal_crash_window_" minutes or before boot_completed,      // reboot into bootloader or set crashing property      boot_clock::time_point now = boot_clock::now();      // critica或可更新(如apex) 并且 服务未标记要重启      if (((flags_ & SVC_CRITICAL) || is_process_updatable) && !(flags_ & SVC_RESTART))           bool boot_completed = GetBoolProperty("sys.boot_completed", false);          if (now < time_crashed_ + fatal_crash_window_ || !boot_completed)  // 在窗口时间内 或 开机流程未完成              if (++crash_count_ > 4)                   auto exit_reason = boot_completed ?                      "in " + std::to_string(fatal_crash_window_.count()) + " minutes" :                      "before boot completed";                  if (flags_ & SVC_CRITICAL)  // critical 服务在窗口时间(4分钟内)或开机完成前 crash超过4次，则会重启到 bootloader                      if (!GetBoolProperty("init.svc_debug.no_fatal." + name_, false))                           // Aborts into "atal_reboot_target_".                          
SetFatalRebootTarget(fatal_reboot_target_);                          LOG(FATAL) << "critical process "" << name_ << "" exited 4 times "                                     << exit_reason;                                         else  // 非 critical 服务只有一个打印，然后记录到属性                      LOG(ERROR) << "process with updatable components "" << name_                                 << "" exited 4 times " << exit_reason;                      // Notifies update_verifier and apexd                      SetProperty("sys.init.updatable_crashing_process_name", name_);                      SetProperty("sys.init.updatable_crashing", "1");                                           else  // 重新记录时间和次数              time_crashed_ = now;              crash_count_ = 1;                      flags_ &= (~SVC_RESTART);      flags_ |= SVC_RESTARTING; // 注意此处标记，是服务重启的关键      // Execute all onrestart commands for this service.      onrestart_.ExecuteAllCommands(); // 执行所有 onrestart 命令， 在rc里面配置的      NotifyStateChange("restarting");      return;

Service::KillProcessGroup

void Service::KillProcessGroup(int signal, bool report_oneshot)     // If we"ve already seen a successful result from killProcessGroup*(), then we have removed    // the cgroup already and calling these functions a second time will simply result in an error.    // This is true regardless of which signal was sent.    // These functions handle their own logging, so no additional logging is needed.    if (!process_cgroup_empty_)         LOG(INFO) << "Sending signal " << signal << " to service "" << name_ << "" (pid " << pid_                  << ") process group...";        int max_processes = 0;        int r;        if (signal == SIGTERM)             r = killProcessGroupOnce(proc_attr_.uid, pid_, signal, &max_processes);         else             r = killProcessGroup(proc_attr_.uid, pid_, signal, &max_processes);                if (report_oneshot && max_processes > 0)             LOG(WARNING)                    << "Killed " << max_processes                    << " additional processes from a oneshot process group for service "" << name_                    << "". This is new behavior, previously child processes would not be killed in "                       "this case.";                if (r == 0) process_cgroup_empty_ = true;        if (oom_score_adjust_ != DEFAULT_OOM_SCORE_ADJUST)         LmkdUnregister(name_, pid_); // 从lmkd移除进程信息

上面两个killProcessGroup实现如下：

/// @system/core/libprocessgroup/processgroup.cppint killProcessGroup(uid_t uid, int initialPid, int signal, int* max_processes)   // 内部调用DoKillProcessGroupOnce去kill进程组    return KillProcessGroup(uid, initialPid, signal, 40 /*retries*/, max_processes);int killProcessGroupOnce(uid_t uid, int initialPid, int signal, int* max_processes)     return KillProcessGroup(uid, initialPid, signal, 0 /*retries*/, max_processes);

关于cgroup配置可参见 cgroups.json

/// @system/core/libprocessgroup/profiles/cgroups.json  "Cgroups": [          "Controller": "blkio",      "Path": "/dev/blkio",      "Mode": "0755",      "UID": "system",      "GID": "system"    ,          "Controller": "cpu",      "Path": "/dev/cpuctl",      "Mode": "0755",      "UID": "system",      "GID": "system"    ,          "Controller": "cpuset",      "Path": "/dev/cpuset",      "Mode": "0755",      "UID": "system",      "GID": "system"    ,          "Controller": "memory",      "Path": "/dev/memcg",      "Mode": "0700",      
            以上是关于Android 12 init 子进程回收与服务重启分析的主要内容，如果未能解决你的问题，请参考以下文章