k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/logs/container_log_manager.go (about) 1 /* 2 Copyright 2018 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package logs 18 19 import ( 20 "compress/gzip" 21 "context" 22 "fmt" 23 "io" 24 "os" 25 "path/filepath" 26 "sort" 27 "strings" 28 "sync" 29 30 "k8s.io/client-go/util/workqueue" 31 "k8s.io/klog/v2" 32 33 "k8s.io/apimachinery/pkg/api/resource" 34 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 35 "k8s.io/apimachinery/pkg/util/wait" 36 internalapi "k8s.io/cri-api/pkg/apis" 37 runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" 38 kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" 39 "k8s.io/utils/clock" 40 ) 41 42 const ( 43 // timestampFormat is format of the timestamp suffix for rotated log. 44 // See https://golang.org/pkg/time/#Time.Format. 45 timestampFormat = "20060102-150405" 46 // compressSuffix is the suffix for compressed log. 47 compressSuffix = ".gz" 48 // tmpSuffix is the suffix for temporary file. 49 tmpSuffix = ".tmp" 50 ) 51 52 // ContainerLogManager manages lifecycle of all container logs. 53 // 54 // Implementation is thread-safe. 55 type ContainerLogManager interface { 56 // TODO(random-liu): Add RotateLogs function and call it under disk pressure. 57 // Start container log manager. 58 Start() 59 // Clean removes all logs of specified container. 60 Clean(ctx context.Context, containerID string) error 61 } 62 63 // LogRotatePolicy is a policy for container log rotation. The policy applies to all 64 // containers managed by kubelet. 65 type LogRotatePolicy struct { 66 // MaxSize in bytes of the container log file before it is rotated. Negative 67 // number means to disable container log rotation. 68 MaxSize int64 69 // MaxFiles is the maximum number of log files that can be present. 70 // If rotating the logs creates excess files, the oldest file is removed. 71 MaxFiles int 72 } 73 74 // GetAllLogs gets all inuse (rotated/compressed) logs for a specific container log. 75 // Returned logs are sorted in oldest to newest order. 76 // TODO(#59902): Leverage this function to support log rotation in `kubectl logs`. 77 func GetAllLogs(log string) ([]string, error) { 78 // pattern is used to match all rotated files. 79 pattern := fmt.Sprintf("%s.*", log) 80 logs, err := filepath.Glob(pattern) 81 if err != nil { 82 return nil, fmt.Errorf("failed to list all log files with pattern %q: %v", pattern, err) 83 } 84 inuse, _ := filterUnusedLogs(logs) 85 sort.Strings(inuse) 86 return append(inuse, log), nil 87 } 88 89 // compressReadCloser wraps gzip.Reader with a function to close file handler. 90 type compressReadCloser struct { 91 f *os.File 92 *gzip.Reader 93 } 94 95 func (rc *compressReadCloser) Close() error { 96 ferr := rc.f.Close() 97 rerr := rc.Reader.Close() 98 if ferr != nil { 99 return ferr 100 } 101 if rerr != nil { 102 return rerr 103 } 104 return nil 105 } 106 107 // UncompressLog compresses a compressed log and return a readcloser for the 108 // stream of the uncompressed content. 109 // TODO(#59902): Leverage this function to support log rotation in `kubectl logs`. 110 func UncompressLog(log string) (_ io.ReadCloser, retErr error) { 111 if !strings.HasSuffix(log, compressSuffix) { 112 return nil, fmt.Errorf("log is not compressed") 113 } 114 f, err := os.Open(log) 115 if err != nil { 116 return nil, fmt.Errorf("failed to open log: %v", err) 117 } 118 defer func() { 119 if retErr != nil { 120 f.Close() 121 } 122 }() 123 r, err := gzip.NewReader(f) 124 if err != nil { 125 return nil, fmt.Errorf("failed to create gzip reader: %v", err) 126 } 127 return &compressReadCloser{f: f, Reader: r}, nil 128 } 129 130 // parseMaxSize parses quantity string to int64 max size in bytes. 131 func parseMaxSize(size string) (int64, error) { 132 quantity, err := resource.ParseQuantity(size) 133 if err != nil { 134 return 0, err 135 } 136 maxSize, ok := quantity.AsInt64() 137 if !ok { 138 return 0, fmt.Errorf("invalid max log size") 139 } 140 return maxSize, nil 141 } 142 143 type containerLogManager struct { 144 runtimeService internalapi.RuntimeService 145 osInterface kubecontainer.OSInterface 146 policy LogRotatePolicy 147 clock clock.Clock 148 mutex sync.Mutex 149 queue workqueue.TypedRateLimitingInterface[string] 150 maxWorkers int 151 monitoringPeriod metav1.Duration 152 } 153 154 // NewContainerLogManager creates a new container log manager. 155 func NewContainerLogManager(runtimeService internalapi.RuntimeService, osInterface kubecontainer.OSInterface, maxSize string, maxFiles int, maxWorkers int, monitorInterval metav1.Duration) (ContainerLogManager, error) { 156 if maxFiles <= 1 { 157 return nil, fmt.Errorf("invalid MaxFiles %d, must be > 1", maxFiles) 158 } 159 parsedMaxSize, err := parseMaxSize(maxSize) 160 if err != nil { 161 return nil, fmt.Errorf("failed to parse container log max size %q: %v", maxSize, err) 162 } 163 // Negative number means to disable container log rotation 164 if parsedMaxSize < 0 { 165 return NewStubContainerLogManager(), nil 166 } 167 // policy LogRotatePolicy 168 return &containerLogManager{ 169 osInterface: osInterface, 170 runtimeService: runtimeService, 171 policy: LogRotatePolicy{ 172 MaxSize: parsedMaxSize, 173 MaxFiles: maxFiles, 174 }, 175 clock: clock.RealClock{}, 176 mutex: sync.Mutex{}, 177 maxWorkers: maxWorkers, 178 queue: workqueue.NewTypedRateLimitingQueueWithConfig( 179 workqueue.DefaultTypedControllerRateLimiter[string](), 180 workqueue.TypedRateLimitingQueueConfig[string]{Name: "kubelet_log_rotate_manager"}, 181 ), 182 monitoringPeriod: monitorInterval, 183 }, nil 184 } 185 186 // Start the container log manager. 187 func (c *containerLogManager) Start() { 188 ctx := context.Background() 189 klog.InfoS("Initializing container log rotate workers", "workers", c.maxWorkers, "monitorPeriod", c.monitoringPeriod) 190 for i := 0; i < c.maxWorkers; i++ { 191 worker := i + 1 192 go c.processQueueItems(ctx, worker) 193 } 194 // Start a goroutine periodically does container log rotation. 195 go wait.Forever(func() { 196 if err := c.rotateLogs(ctx); err != nil { 197 klog.ErrorS(err, "Failed to rotate container logs") 198 } 199 }, c.monitoringPeriod.Duration) 200 } 201 202 // Clean removes all logs of specified container (including rotated one). 203 func (c *containerLogManager) Clean(ctx context.Context, containerID string) error { 204 c.mutex.Lock() 205 defer c.mutex.Unlock() 206 resp, err := c.runtimeService.ContainerStatus(ctx, containerID, false) 207 if err != nil { 208 return fmt.Errorf("failed to get container status %q: %v", containerID, err) 209 } 210 if resp.GetStatus() == nil { 211 return fmt.Errorf("container status is nil for %q", containerID) 212 } 213 pattern := fmt.Sprintf("%s*", resp.GetStatus().GetLogPath()) 214 logs, err := c.osInterface.Glob(pattern) 215 if err != nil { 216 return fmt.Errorf("failed to list all log files with pattern %q: %v", pattern, err) 217 } 218 219 for _, l := range logs { 220 if err := c.osInterface.Remove(l); err != nil && !os.IsNotExist(err) { 221 return fmt.Errorf("failed to remove container %q log %q: %v", containerID, l, err) 222 } 223 } 224 225 return nil 226 } 227 228 func (c *containerLogManager) processQueueItems(ctx context.Context, worker int) { 229 klog.V(4).InfoS("Starting container log rotation worker", "workerID", worker) 230 for c.processContainer(ctx, worker) { 231 } 232 klog.V(4).InfoS("Terminating container log rotation worker", "workerID", worker) 233 } 234 235 func (c *containerLogManager) rotateLogs(ctx context.Context) error { 236 c.mutex.Lock() 237 defer c.mutex.Unlock() 238 klog.V(4).InfoS("Starting container log rotation sequence") 239 // TODO(#59998): Use kubelet pod cache. 240 containers, err := c.runtimeService.ListContainers(ctx, &runtimeapi.ContainerFilter{}) 241 if err != nil { 242 return fmt.Errorf("failed to list containers: %v", err) 243 } 244 for _, container := range containers { 245 // Only rotate logs for running containers. Non-running containers won't 246 // generate new output, it doesn't make sense to keep an empty latest log. 247 if container.GetState() != runtimeapi.ContainerState_CONTAINER_RUNNING { 248 continue 249 } 250 // Doing this to avoid additional overhead with logging of label like arguments that can prove costly 251 if v := klog.V(4); v.Enabled() { 252 klog.V(4).InfoS("Adding new entry to the queue for processing", "id", container.GetId(), "name", container.Metadata.GetName(), "labels", container.GetLabels()) 253 } 254 c.queue.Add(container.GetId()) 255 } 256 return nil 257 } 258 259 func (c *containerLogManager) processContainer(ctx context.Context, worker int) (ok bool) { 260 key, quit := c.queue.Get() 261 if quit { 262 return false 263 } 264 defer func() { 265 c.queue.Done(key) 266 c.queue.Forget(key) 267 }() 268 // Always default the return to true to keep the processing of Queue ongoing 269 ok = true 270 id := key 271 272 resp, err := c.runtimeService.ContainerStatus(ctx, id, false) 273 if err != nil { 274 klog.ErrorS(err, "Failed to get container status", "worker", worker, "containerID", id) 275 return 276 } 277 if resp.GetStatus() == nil { 278 klog.ErrorS(err, "Container status is nil", "worker", worker, "containerID", id) 279 return 280 } 281 path := resp.GetStatus().GetLogPath() 282 info, err := c.osInterface.Stat(path) 283 284 if err != nil { 285 if !os.IsNotExist(err) { 286 klog.ErrorS(err, "Failed to stat container log", "worker", worker, "containerID", id, "path", path) 287 return 288 } 289 290 if err = c.runtimeService.ReopenContainerLog(ctx, id); err != nil { 291 klog.ErrorS(err, "Container log doesn't exist, reopen container log failed", "worker", worker, "containerID", id, "path", path) 292 return 293 } 294 295 info, err = c.osInterface.Stat(path) 296 if err != nil { 297 klog.ErrorS(err, "Failed to stat container log after reopen", "worker", worker, "containerID", id, "path", path) 298 return 299 } 300 } 301 if info.Size() < c.policy.MaxSize { 302 klog.V(7).InfoS("log file doesn't need to be rotated", "worker", worker, "containerID", id, "path", path, "currentSize", info.Size(), "maxSize", c.policy.MaxSize) 303 return 304 } 305 306 if err := c.rotateLog(ctx, id, path); err != nil { 307 klog.ErrorS(err, "Failed to rotate log for container", "worker", worker, "containerID", id, "path", path, "currentSize", info.Size(), "maxSize", c.policy.MaxSize) 308 return 309 } 310 return 311 } 312 313 func (c *containerLogManager) rotateLog(ctx context.Context, id, log string) error { 314 // pattern is used to match all rotated files. 315 pattern := fmt.Sprintf("%s.*", log) 316 logs, err := filepath.Glob(pattern) 317 if err != nil { 318 return fmt.Errorf("failed to list all log files with pattern %q: %v", pattern, err) 319 } 320 321 logs, err = c.cleanupUnusedLogs(logs) 322 if err != nil { 323 return fmt.Errorf("failed to cleanup logs: %v", err) 324 } 325 326 logs, err = c.removeExcessLogs(logs) 327 if err != nil { 328 return fmt.Errorf("failed to remove excess logs: %v", err) 329 } 330 331 // Compress uncompressed log files. 332 for _, l := range logs { 333 if strings.HasSuffix(l, compressSuffix) { 334 continue 335 } 336 if err := c.compressLog(l); err != nil { 337 return fmt.Errorf("failed to compress log %q: %v", l, err) 338 } 339 } 340 341 if err := c.rotateLatestLog(ctx, id, log); err != nil { 342 return fmt.Errorf("failed to rotate log %q: %v", log, err) 343 } 344 345 return nil 346 } 347 348 // cleanupUnusedLogs cleans up temporary or unused log files generated by previous log rotation 349 // failure. 350 func (c *containerLogManager) cleanupUnusedLogs(logs []string) ([]string, error) { 351 inuse, unused := filterUnusedLogs(logs) 352 for _, l := range unused { 353 if err := c.osInterface.Remove(l); err != nil { 354 return nil, fmt.Errorf("failed to remove unused log %q: %v", l, err) 355 } 356 } 357 return inuse, nil 358 } 359 360 // filterUnusedLogs splits logs into 2 groups, the 1st group is in used logs, 361 // the second group is unused logs. 362 func filterUnusedLogs(logs []string) (inuse []string, unused []string) { 363 for _, l := range logs { 364 if isInUse(l, logs) { 365 inuse = append(inuse, l) 366 } else { 367 unused = append(unused, l) 368 } 369 } 370 return inuse, unused 371 } 372 373 // isInUse checks whether a container log file is still inuse. 374 func isInUse(l string, logs []string) bool { 375 // All temporary files are not in use. 376 if strings.HasSuffix(l, tmpSuffix) { 377 return false 378 } 379 // All compressed logs are in use. 380 if strings.HasSuffix(l, compressSuffix) { 381 return true 382 } 383 // Files has already been compressed are not in use. 384 for _, another := range logs { 385 if l+compressSuffix == another { 386 return false 387 } 388 } 389 return true 390 } 391 392 // removeExcessLogs removes old logs to make sure there are only at most MaxFiles log files. 393 func (c *containerLogManager) removeExcessLogs(logs []string) ([]string, error) { 394 // Sort log files in oldest to newest order. 395 sort.Strings(logs) 396 // Container will create a new log file, and we'll rotate the latest log file. 397 // Other than those 2 files, we can have at most MaxFiles-2 rotated log files. 398 // Keep MaxFiles-2 files by removing old files. 399 // We should remove from oldest to newest, so as not to break ongoing `kubectl logs`. 400 maxRotatedFiles := c.policy.MaxFiles - 2 401 if maxRotatedFiles < 0 { 402 maxRotatedFiles = 0 403 } 404 i := 0 405 for ; i < len(logs)-maxRotatedFiles; i++ { 406 if err := c.osInterface.Remove(logs[i]); err != nil { 407 return nil, fmt.Errorf("failed to remove old log %q: %v", logs[i], err) 408 } 409 } 410 logs = logs[i:] 411 return logs, nil 412 } 413 414 // compressLog compresses a log to log.gz with gzip. 415 func (c *containerLogManager) compressLog(log string) error { 416 r, err := c.osInterface.Open(log) 417 if err != nil { 418 return fmt.Errorf("failed to open log %q: %v", log, err) 419 } 420 defer r.Close() 421 tmpLog := log + tmpSuffix 422 f, err := c.osInterface.OpenFile(tmpLog, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) 423 if err != nil { 424 return fmt.Errorf("failed to create temporary log %q: %v", tmpLog, err) 425 } 426 defer func() { 427 // Best effort cleanup of tmpLog. 428 c.osInterface.Remove(tmpLog) 429 }() 430 defer f.Close() 431 w := gzip.NewWriter(f) 432 defer w.Close() 433 if _, err := io.Copy(w, r); err != nil { 434 return fmt.Errorf("failed to compress %q to %q: %v", log, tmpLog, err) 435 } 436 // The archive needs to be closed before renaming, otherwise an error will occur on Windows. 437 w.Close() 438 f.Close() 439 compressedLog := log + compressSuffix 440 if err := c.osInterface.Rename(tmpLog, compressedLog); err != nil { 441 return fmt.Errorf("failed to rename %q to %q: %v", tmpLog, compressedLog, err) 442 } 443 // Remove old log file. 444 r.Close() 445 if err := c.osInterface.Remove(log); err != nil { 446 return fmt.Errorf("failed to remove log %q after compress: %v", log, err) 447 } 448 return nil 449 } 450 451 // rotateLatestLog rotates latest log without compression, so that container can still write 452 // and fluentd can finish reading. 453 func (c *containerLogManager) rotateLatestLog(ctx context.Context, id, log string) error { 454 timestamp := c.clock.Now().Format(timestampFormat) 455 rotated := fmt.Sprintf("%s.%s", log, timestamp) 456 if err := c.osInterface.Rename(log, rotated); err != nil { 457 return fmt.Errorf("failed to rotate log %q to %q: %v", log, rotated, err) 458 } 459 if err := c.runtimeService.ReopenContainerLog(ctx, id); err != nil { 460 // Rename the rotated log back, so that we can try rotating it again 461 // next round. 462 // If kubelet gets restarted at this point, we'll lose original log. 463 if renameErr := c.osInterface.Rename(rotated, log); renameErr != nil { 464 // This shouldn't happen. 465 // Report an error if this happens, because we will lose original 466 // log. 467 klog.ErrorS(renameErr, "Failed to rename rotated log", "rotatedLog", rotated, "newLog", log, "containerID", id) 468 } 469 return fmt.Errorf("failed to reopen container log %q: %v", id, err) 470 } 471 return nil 472 }