From 8bdacaecd100a1110fd4e675e22455e976f4ec8b Mon Sep 17 00:00:00 2001 From: Chen Chen Date: Fri, 29 May 2026 16:01:07 -0400 Subject: [PATCH] fix(logwatchers/journald): back off when sd_journal_wait fails sd_journal_wait can return immediately with a negative errno when sd_journal_get_fd is unable to allocate an inotify instance (e.g. when the host has exhausted fs.inotify.max_user_instances). The previous code discarded Wait's return value, so the watchLoop spun at 100% CPU until inotify pressure cleared. Check the return code and, when Wait returns a negative value (i.e. it failed without waiting), log the error and sleep for waitLogTimeout, preserving the intended 5s polling cadence. --- .../logwatchers/journald/log_watcher.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pkg/systemlogmonitor/logwatchers/journald/log_watcher.go b/pkg/systemlogmonitor/logwatchers/journald/log_watcher.go index 15477c463..cea831d9a 100644 --- a/pkg/systemlogmonitor/logwatchers/journald/log_watcher.go +++ b/pkg/systemlogmonitor/logwatchers/journald/log_watcher.go @@ -114,7 +114,16 @@ func (j *journaldWatcher) watchLoop() { } // If next reaches the end, wait for waitLogTimeout. if n == 0 { - j.journal.Wait(waitLogTimeout) + // sd_journal_wait can return immediately with a negative + // errno (e.g. -ENOSPC when inotify user instances are + // exhausted, so sd_journal_get_fd cannot create the + // inotify instance it needs). Fall back to a manual sleep + // so the loop honors waitLogTimeout instead of spinning + // at 100% CPU. + if r := j.journal.Wait(waitLogTimeout); r < 0 { + klog.Errorf("sd_journal_wait failed (%d), backing off for %v", r, waitLogTimeout) + time.Sleep(waitLogTimeout) + } continue }