[fix] fix restart/shutdown races in process monitor

This commit is contained in:
dijunkun
2026-03-20 14:50:42 +08:00
parent 56c0bca62f
commit 38b7775b1b
2 changed files with 82 additions and 59 deletions

View File

@@ -34,9 +34,16 @@
#endif #endif
#ifndef _WIN32 #ifndef _WIN32
Daemon* Daemon::instance_ = nullptr; volatile std::sig_atomic_t Daemon::stop_requested_ = 0;
#endif #endif
namespace {
constexpr int kRestartDelayMs = 1000;
#ifndef _WIN32
constexpr int kWaitPollIntervalMs = 200;
#endif
} // namespace
// get executable file path // get executable file path
static std::string GetExecutablePath() { static std::string GetExecutablePath() {
#ifdef _WIN32 #ifdef _WIN32
@@ -66,33 +73,35 @@ static std::string GetExecutablePath() {
return ""; return "";
} }
Daemon::Daemon(const std::string& name) Daemon::Daemon(const std::string& name) : name_(name), running_(false) {}
: name_(name)
#ifdef _WIN32 void Daemon::stop() {
, running_.store(false);
running_(false) #ifndef _WIN32
#else stop_requested_ = 1;
,
running_(true)
#endif #endif
{
} }
void Daemon::stop() { running_ = false; } bool Daemon::isRunning() const {
#ifndef _WIN32
bool Daemon::isRunning() const { return running_; } return running_.load() && (stop_requested_ == 0);
#else
return running_.load();
#endif
}
bool Daemon::start(MainLoopFunc loop) { bool Daemon::start(MainLoopFunc loop) {
#ifdef _WIN32 #ifdef _WIN32
running_ = true; running_.store(true);
return runWithRestart(loop); return runWithRestart(loop);
#elif __APPLE__ #elif __APPLE__
// macOS: Use child process monitoring (like Windows) to preserve GUI // macOS: Use child process monitoring (like Windows) to preserve GUI
running_ = true; stop_requested_ = 0;
running_.store(true);
return runWithRestart(loop); return runWithRestart(loop);
#else #else
// linux: Daemonize first, then run with restart monitoring // linux: Daemonize first, then run with restart monitoring
instance_ = this; stop_requested_ = 0;
// check if running from terminal before fork // check if running from terminal before fork
bool from_terminal = bool from_terminal =
@@ -134,29 +143,13 @@ bool Daemon::start(MainLoopFunc loop) {
} }
// set up signal handlers // set up signal handlers
signal(SIGTERM, [](int) { signal(SIGTERM, [](int) { stop_requested_ = 1; });
if (instance_) instance_->stop(); signal(SIGINT, [](int) { stop_requested_ = 1; });
});
signal(SIGINT, [](int) {
if (instance_) instance_->stop();
});
// ignore SIGPIPE // ignore SIGPIPE
signal(SIGPIPE, SIG_IGN); signal(SIGPIPE, SIG_IGN);
// set up SIGCHLD handler to reap zombie processes running_.store(true);
struct sigaction sa_chld;
sa_chld.sa_handler = [](int) {
// reap zombie processes
while (waitpid(-1, nullptr, WNOHANG) > 0) {
// continue until no more zombie children
}
};
sigemptyset(&sa_chld.sa_mask);
sa_chld.sa_flags = SA_RESTART | SA_NOCLDSTOP;
sigaction(SIGCHLD, &sa_chld, nullptr);
running_ = true;
return runWithRestart(loop); return runWithRestart(loop);
#endif #endif
} }
@@ -204,8 +197,7 @@ bool Daemon::runWithRestart(MainLoopFunc loop) {
restart_count++; restart_count++;
std::cerr << "Exception caught, restarting... (attempt " std::cerr << "Exception caught, restarting... (attempt "
<< restart_count << ")" << std::endl; << restart_count << ")" << std::endl;
std::this_thread::sleep_for( std::this_thread::sleep_for(std::chrono::milliseconds(kRestartDelayMs));
std::chrono::milliseconds(DAEMON_DEFAULT_RESTART_DELAY_MS));
} }
} }
return true; return true;
@@ -237,27 +229,41 @@ bool Daemon::runWithRestart(MainLoopFunc loop) {
if (!success) { if (!success) {
std::cerr << "Failed to create child process, error: " << GetLastError() std::cerr << "Failed to create child process, error: " << GetLastError()
<< std::endl; << std::endl;
std::this_thread::sleep_for( std::this_thread::sleep_for(std::chrono::milliseconds(kRestartDelayMs));
std::chrono::milliseconds(DAEMON_DEFAULT_RESTART_DELAY_MS));
restart_count++; restart_count++;
continue; continue;
} }
while (isRunning()) {
DWORD wait_result = WaitForSingleObject(pi.hProcess, 200);
if (wait_result == WAIT_OBJECT_0) {
break;
}
if (wait_result == WAIT_FAILED) {
std::cerr << "Failed waiting child process, error: " << GetLastError()
<< std::endl;
break;
}
}
if (!isRunning()) {
TerminateProcess(pi.hProcess, 1);
WaitForSingleObject(pi.hProcess, 3000);
}
DWORD exit_code = 0; DWORD exit_code = 0;
WaitForSingleObject(pi.hProcess, INFINITE);
GetExitCodeProcess(pi.hProcess, &exit_code); GetExitCodeProcess(pi.hProcess, &exit_code);
CloseHandle(pi.hProcess); CloseHandle(pi.hProcess);
CloseHandle(pi.hThread); CloseHandle(pi.hThread);
if (exit_code == 0) { if (!isRunning() || exit_code == 0) {
break; // normal exit break; // normal exit
} }
restart_count++; restart_count++;
std::cerr << "Child process exited with code " << exit_code std::cerr << "Child process exited with code " << exit_code
<< ", restarting... (attempt " << restart_count << ")" << ", restarting... (attempt " << restart_count << ")"
<< std::endl; << std::endl;
std::this_thread::sleep_for( std::this_thread::sleep_for(std::chrono::milliseconds(kRestartDelayMs));
std::chrono::milliseconds(DAEMON_DEFAULT_RESTART_DELAY_MS));
#else #else
// linux: use fork + exec to create child process // linux: use fork + exec to create child process
pid_t pid = fork(); pid_t pid = fork();
@@ -266,21 +272,39 @@ bool Daemon::runWithRestart(MainLoopFunc loop) {
_exit(1); // exec failed _exit(1); // exec failed
} else if (pid > 0) { } else if (pid > 0) {
int status = 0; int status = 0;
pid_t waited_pid = waitpid(pid, &status, 0); pid_t waited_pid = -1;
while (isRunning()) {
waited_pid = waitpid(pid, &status, WNOHANG);
if (waited_pid == pid) {
break;
}
if (waited_pid < 0 && errno != EINTR) {
break;
}
std::this_thread::sleep_for(
std::chrono::milliseconds(kWaitPollIntervalMs));
}
if (!isRunning() && waited_pid != pid) {
kill(pid, SIGTERM);
waited_pid = waitpid(pid, &status, 0);
}
if (waited_pid < 0) { if (waited_pid < 0) {
if (!isRunning()) {
break;
}
restart_count++; restart_count++;
std::cerr << "waitpid failed, errno: " << errno std::cerr << "waitpid failed, errno: " << errno
<< ", restarting... (attempt " << restart_count << ")" << ", restarting... (attempt " << restart_count << ")"
<< std::endl; << std::endl;
std::this_thread::sleep_for( std::this_thread::sleep_for(std::chrono::milliseconds(kRestartDelayMs));
std::chrono::milliseconds(DAEMON_DEFAULT_RESTART_DELAY_MS));
continue; continue;
} }
if (WIFEXITED(status)) { if (WIFEXITED(status)) {
int exit_code = WEXITSTATUS(status); int exit_code = WEXITSTATUS(status);
if (exit_code == 0) { if (!isRunning() || exit_code == 0) {
break; // normal exit break; // normal exit
} }
restart_count++; restart_count++;
@@ -288,6 +312,9 @@ bool Daemon::runWithRestart(MainLoopFunc loop) {
<< ", restarting... (attempt " << restart_count << ")" << ", restarting... (attempt " << restart_count << ")"
<< std::endl; << std::endl;
} else if (WIFSIGNALED(status)) { } else if (WIFSIGNALED(status)) {
if (!isRunning()) {
break;
}
restart_count++; restart_count++;
std::cerr << "Child process crashed with signal " << WTERMSIG(status) std::cerr << "Child process crashed with signal " << WTERMSIG(status)
<< ", restarting... (attempt " << restart_count << ")" << ", restarting... (attempt " << restart_count << ")"
@@ -298,12 +325,10 @@ bool Daemon::runWithRestart(MainLoopFunc loop) {
"(attempt " "(attempt "
<< restart_count << ")" << std::endl; << restart_count << ")" << std::endl;
} }
std::this_thread::sleep_for( std::this_thread::sleep_for(std::chrono::milliseconds(kRestartDelayMs));
std::chrono::milliseconds(DAEMON_DEFAULT_RESTART_DELAY_MS));
} else { } else {
std::cerr << "Failed to fork child process" << std::endl; std::cerr << "Failed to fork child process" << std::endl;
std::this_thread::sleep_for( std::this_thread::sleep_for(std::chrono::milliseconds(kRestartDelayMs));
std::chrono::milliseconds(DAEMON_DEFAULT_RESTART_DELAY_MS));
restart_count++; restart_count++;
} }
#endif #endif

View File

@@ -7,11 +7,11 @@
#ifndef _DAEMON_H_ #ifndef _DAEMON_H_
#define _DAEMON_H_ #define _DAEMON_H_
#include <atomic>
#include <csignal>
#include <functional> #include <functional>
#include <string> #include <string>
#define DAEMON_DEFAULT_RESTART_DELAY_MS 1000
class Daemon { class Daemon {
public: public:
using MainLoopFunc = std::function<void()>; using MainLoopFunc = std::function<void()>;
@@ -28,12 +28,10 @@ class Daemon {
std::string name_; std::string name_;
bool runWithRestart(MainLoopFunc loop); bool runWithRestart(MainLoopFunc loop);
#ifdef _WIN32 #ifndef _WIN32
bool running_; static volatile std::sig_atomic_t stop_requested_;
#else
static Daemon* instance_;
volatile bool running_;
#endif #endif
std::atomic<bool> running_;
}; };
#endif #endif