k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/kubelet/logs/container_log_manager.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package logs
    18  
    19  import (
    20  	"compress/gzip"
    21  	"context"
    22  	"fmt"
    23  	"io"
    24  	"os"
    25  	"path/filepath"
    26  	"sort"
    27  	"strings"
    28  	"sync"
    29  
    30  	"k8s.io/client-go/util/workqueue"
    31  	"k8s.io/klog/v2"
    32  
    33  	"k8s.io/apimachinery/pkg/api/resource"
    34  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    35  	"k8s.io/apimachinery/pkg/util/wait"
    36  	internalapi "k8s.io/cri-api/pkg/apis"
    37  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    38  	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
    39  	"k8s.io/utils/clock"
    40  )
    41  
    42  const (
    43  	// timestampFormat is format of the timestamp suffix for rotated log.
    44  	// See https://golang.org/pkg/time/#Time.Format.
    45  	timestampFormat = "20060102-150405"
    46  	// compressSuffix is the suffix for compressed log.
    47  	compressSuffix = ".gz"
    48  	// tmpSuffix is the suffix for temporary file.
    49  	tmpSuffix = ".tmp"
    50  )
    51  
    52  // ContainerLogManager manages lifecycle of all container logs.
    53  //
    54  // Implementation is thread-safe.
    55  type ContainerLogManager interface {
    56  	// TODO(random-liu): Add RotateLogs function and call it under disk pressure.
    57  	// Start container log manager.
    58  	Start()
    59  	// Clean removes all logs of specified container.
    60  	Clean(ctx context.Context, containerID string) error
    61  }
    62  
    63  // LogRotatePolicy is a policy for container log rotation. The policy applies to all
    64  // containers managed by kubelet.
    65  type LogRotatePolicy struct {
    66  	// MaxSize in bytes of the container log file before it is rotated. Negative
    67  	// number means to disable container log rotation.
    68  	MaxSize int64
    69  	// MaxFiles is the maximum number of log files that can be present.
    70  	// If rotating the logs creates excess files, the oldest file is removed.
    71  	MaxFiles int
    72  }
    73  
    74  // GetAllLogs gets all inuse (rotated/compressed) logs for a specific container log.
    75  // Returned logs are sorted in oldest to newest order.
    76  // TODO(#59902): Leverage this function to support log rotation in `kubectl logs`.
    77  func GetAllLogs(log string) ([]string, error) {
    78  	// pattern is used to match all rotated files.
    79  	pattern := fmt.Sprintf("%s.*", log)
    80  	logs, err := filepath.Glob(pattern)
    81  	if err != nil {
    82  		return nil, fmt.Errorf("failed to list all log files with pattern %q: %v", pattern, err)
    83  	}
    84  	inuse, _ := filterUnusedLogs(logs)
    85  	sort.Strings(inuse)
    86  	return append(inuse, log), nil
    87  }
    88  
    89  // compressReadCloser wraps gzip.Reader with a function to close file handler.
    90  type compressReadCloser struct {
    91  	f *os.File
    92  	*gzip.Reader
    93  }
    94  
    95  func (rc *compressReadCloser) Close() error {
    96  	ferr := rc.f.Close()
    97  	rerr := rc.Reader.Close()
    98  	if ferr != nil {
    99  		return ferr
   100  	}
   101  	if rerr != nil {
   102  		return rerr
   103  	}
   104  	return nil
   105  }
   106  
   107  // UncompressLog compresses a compressed log and return a readcloser for the
   108  // stream of the uncompressed content.
   109  // TODO(#59902): Leverage this function to support log rotation in `kubectl logs`.
   110  func UncompressLog(log string) (_ io.ReadCloser, retErr error) {
   111  	if !strings.HasSuffix(log, compressSuffix) {
   112  		return nil, fmt.Errorf("log is not compressed")
   113  	}
   114  	f, err := os.Open(log)
   115  	if err != nil {
   116  		return nil, fmt.Errorf("failed to open log: %v", err)
   117  	}
   118  	defer func() {
   119  		if retErr != nil {
   120  			f.Close()
   121  		}
   122  	}()
   123  	r, err := gzip.NewReader(f)
   124  	if err != nil {
   125  		return nil, fmt.Errorf("failed to create gzip reader: %v", err)
   126  	}
   127  	return &compressReadCloser{f: f, Reader: r}, nil
   128  }
   129  
   130  // parseMaxSize parses quantity string to int64 max size in bytes.
   131  func parseMaxSize(size string) (int64, error) {
   132  	quantity, err := resource.ParseQuantity(size)
   133  	if err != nil {
   134  		return 0, err
   135  	}
   136  	maxSize, ok := quantity.AsInt64()
   137  	if !ok {
   138  		return 0, fmt.Errorf("invalid max log size")
   139  	}
   140  	return maxSize, nil
   141  }
   142  
   143  type containerLogManager struct {
   144  	runtimeService   internalapi.RuntimeService
   145  	osInterface      kubecontainer.OSInterface
   146  	policy           LogRotatePolicy
   147  	clock            clock.Clock
   148  	mutex            sync.Mutex
   149  	queue            workqueue.TypedRateLimitingInterface[string]
   150  	maxWorkers       int
   151  	monitoringPeriod metav1.Duration
   152  }
   153  
   154  // NewContainerLogManager creates a new container log manager.
   155  func NewContainerLogManager(runtimeService internalapi.RuntimeService, osInterface kubecontainer.OSInterface, maxSize string, maxFiles int, maxWorkers int, monitorInterval metav1.Duration) (ContainerLogManager, error) {
   156  	if maxFiles <= 1 {
   157  		return nil, fmt.Errorf("invalid MaxFiles %d, must be > 1", maxFiles)
   158  	}
   159  	parsedMaxSize, err := parseMaxSize(maxSize)
   160  	if err != nil {
   161  		return nil, fmt.Errorf("failed to parse container log max size %q: %v", maxSize, err)
   162  	}
   163  	// Negative number means to disable container log rotation
   164  	if parsedMaxSize < 0 {
   165  		return NewStubContainerLogManager(), nil
   166  	}
   167  	// policy LogRotatePolicy
   168  	return &containerLogManager{
   169  		osInterface:    osInterface,
   170  		runtimeService: runtimeService,
   171  		policy: LogRotatePolicy{
   172  			MaxSize:  parsedMaxSize,
   173  			MaxFiles: maxFiles,
   174  		},
   175  		clock:      clock.RealClock{},
   176  		mutex:      sync.Mutex{},
   177  		maxWorkers: maxWorkers,
   178  		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
   179  			workqueue.DefaultTypedControllerRateLimiter[string](),
   180  			workqueue.TypedRateLimitingQueueConfig[string]{Name: "kubelet_log_rotate_manager"},
   181  		),
   182  		monitoringPeriod: monitorInterval,
   183  	}, nil
   184  }
   185  
   186  // Start the container log manager.
   187  func (c *containerLogManager) Start() {
   188  	ctx := context.Background()
   189  	klog.InfoS("Initializing container log rotate workers", "workers", c.maxWorkers, "monitorPeriod", c.monitoringPeriod)
   190  	for i := 0; i < c.maxWorkers; i++ {
   191  		worker := i + 1
   192  		go c.processQueueItems(ctx, worker)
   193  	}
   194  	// Start a goroutine periodically does container log rotation.
   195  	go wait.Forever(func() {
   196  		if err := c.rotateLogs(ctx); err != nil {
   197  			klog.ErrorS(err, "Failed to rotate container logs")
   198  		}
   199  	}, c.monitoringPeriod.Duration)
   200  }
   201  
   202  // Clean removes all logs of specified container (including rotated one).
   203  func (c *containerLogManager) Clean(ctx context.Context, containerID string) error {
   204  	c.mutex.Lock()
   205  	defer c.mutex.Unlock()
   206  	resp, err := c.runtimeService.ContainerStatus(ctx, containerID, false)
   207  	if err != nil {
   208  		return fmt.Errorf("failed to get container status %q: %v", containerID, err)
   209  	}
   210  	if resp.GetStatus() == nil {
   211  		return fmt.Errorf("container status is nil for %q", containerID)
   212  	}
   213  	pattern := fmt.Sprintf("%s*", resp.GetStatus().GetLogPath())
   214  	logs, err := c.osInterface.Glob(pattern)
   215  	if err != nil {
   216  		return fmt.Errorf("failed to list all log files with pattern %q: %v", pattern, err)
   217  	}
   218  
   219  	for _, l := range logs {
   220  		if err := c.osInterface.Remove(l); err != nil && !os.IsNotExist(err) {
   221  			return fmt.Errorf("failed to remove container %q log %q: %v", containerID, l, err)
   222  		}
   223  	}
   224  
   225  	return nil
   226  }
   227  
   228  func (c *containerLogManager) processQueueItems(ctx context.Context, worker int) {
   229  	klog.V(4).InfoS("Starting container log rotation worker", "workerID", worker)
   230  	for c.processContainer(ctx, worker) {
   231  	}
   232  	klog.V(4).InfoS("Terminating container log rotation worker", "workerID", worker)
   233  }
   234  
   235  func (c *containerLogManager) rotateLogs(ctx context.Context) error {
   236  	c.mutex.Lock()
   237  	defer c.mutex.Unlock()
   238  	klog.V(4).InfoS("Starting container log rotation sequence")
   239  	// TODO(#59998): Use kubelet pod cache.
   240  	containers, err := c.runtimeService.ListContainers(ctx, &runtimeapi.ContainerFilter{})
   241  	if err != nil {
   242  		return fmt.Errorf("failed to list containers: %v", err)
   243  	}
   244  	for _, container := range containers {
   245  		// Only rotate logs for running containers. Non-running containers won't
   246  		// generate new output, it doesn't make sense to keep an empty latest log.
   247  		if container.GetState() != runtimeapi.ContainerState_CONTAINER_RUNNING {
   248  			continue
   249  		}
   250  		// Doing this to avoid additional overhead with logging of label like arguments that can prove costly
   251  		if v := klog.V(4); v.Enabled() {
   252  			klog.V(4).InfoS("Adding new entry to the queue for processing", "id", container.GetId(), "name", container.Metadata.GetName(), "labels", container.GetLabels())
   253  		}
   254  		c.queue.Add(container.GetId())
   255  	}
   256  	return nil
   257  }
   258  
   259  func (c *containerLogManager) processContainer(ctx context.Context, worker int) (ok bool) {
   260  	key, quit := c.queue.Get()
   261  	if quit {
   262  		return false
   263  	}
   264  	defer func() {
   265  		c.queue.Done(key)
   266  		c.queue.Forget(key)
   267  	}()
   268  	// Always default the return to true to keep the processing of Queue ongoing
   269  	ok = true
   270  	id := key
   271  
   272  	resp, err := c.runtimeService.ContainerStatus(ctx, id, false)
   273  	if err != nil {
   274  		klog.ErrorS(err, "Failed to get container status", "worker", worker, "containerID", id)
   275  		return
   276  	}
   277  	if resp.GetStatus() == nil {
   278  		klog.ErrorS(err, "Container status is nil", "worker", worker, "containerID", id)
   279  		return
   280  	}
   281  	path := resp.GetStatus().GetLogPath()
   282  	info, err := c.osInterface.Stat(path)
   283  
   284  	if err != nil {
   285  		if !os.IsNotExist(err) {
   286  			klog.ErrorS(err, "Failed to stat container log", "worker", worker, "containerID", id, "path", path)
   287  			return
   288  		}
   289  
   290  		if err = c.runtimeService.ReopenContainerLog(ctx, id); err != nil {
   291  			klog.ErrorS(err, "Container log doesn't exist, reopen container log failed", "worker", worker, "containerID", id, "path", path)
   292  			return
   293  		}
   294  
   295  		info, err = c.osInterface.Stat(path)
   296  		if err != nil {
   297  			klog.ErrorS(err, "Failed to stat container log after reopen", "worker", worker, "containerID", id, "path", path)
   298  			return
   299  		}
   300  	}
   301  	if info.Size() < c.policy.MaxSize {
   302  		klog.V(7).InfoS("log file doesn't need to be rotated", "worker", worker, "containerID", id, "path", path, "currentSize", info.Size(), "maxSize", c.policy.MaxSize)
   303  		return
   304  	}
   305  
   306  	if err := c.rotateLog(ctx, id, path); err != nil {
   307  		klog.ErrorS(err, "Failed to rotate log for container", "worker", worker, "containerID", id, "path", path, "currentSize", info.Size(), "maxSize", c.policy.MaxSize)
   308  		return
   309  	}
   310  	return
   311  }
   312  
   313  func (c *containerLogManager) rotateLog(ctx context.Context, id, log string) error {
   314  	// pattern is used to match all rotated files.
   315  	pattern := fmt.Sprintf("%s.*", log)
   316  	logs, err := filepath.Glob(pattern)
   317  	if err != nil {
   318  		return fmt.Errorf("failed to list all log files with pattern %q: %v", pattern, err)
   319  	}
   320  
   321  	logs, err = c.cleanupUnusedLogs(logs)
   322  	if err != nil {
   323  		return fmt.Errorf("failed to cleanup logs: %v", err)
   324  	}
   325  
   326  	logs, err = c.removeExcessLogs(logs)
   327  	if err != nil {
   328  		return fmt.Errorf("failed to remove excess logs: %v", err)
   329  	}
   330  
   331  	// Compress uncompressed log files.
   332  	for _, l := range logs {
   333  		if strings.HasSuffix(l, compressSuffix) {
   334  			continue
   335  		}
   336  		if err := c.compressLog(l); err != nil {
   337  			return fmt.Errorf("failed to compress log %q: %v", l, err)
   338  		}
   339  	}
   340  
   341  	if err := c.rotateLatestLog(ctx, id, log); err != nil {
   342  		return fmt.Errorf("failed to rotate log %q: %v", log, err)
   343  	}
   344  
   345  	return nil
   346  }
   347  
   348  // cleanupUnusedLogs cleans up temporary or unused log files generated by previous log rotation
   349  // failure.
   350  func (c *containerLogManager) cleanupUnusedLogs(logs []string) ([]string, error) {
   351  	inuse, unused := filterUnusedLogs(logs)
   352  	for _, l := range unused {
   353  		if err := c.osInterface.Remove(l); err != nil {
   354  			return nil, fmt.Errorf("failed to remove unused log %q: %v", l, err)
   355  		}
   356  	}
   357  	return inuse, nil
   358  }
   359  
   360  // filterUnusedLogs splits logs into 2 groups, the 1st group is in used logs,
   361  // the second group is unused logs.
   362  func filterUnusedLogs(logs []string) (inuse []string, unused []string) {
   363  	for _, l := range logs {
   364  		if isInUse(l, logs) {
   365  			inuse = append(inuse, l)
   366  		} else {
   367  			unused = append(unused, l)
   368  		}
   369  	}
   370  	return inuse, unused
   371  }
   372  
   373  // isInUse checks whether a container log file is still inuse.
   374  func isInUse(l string, logs []string) bool {
   375  	// All temporary files are not in use.
   376  	if strings.HasSuffix(l, tmpSuffix) {
   377  		return false
   378  	}
   379  	// All compressed logs are in use.
   380  	if strings.HasSuffix(l, compressSuffix) {
   381  		return true
   382  	}
   383  	// Files has already been compressed are not in use.
   384  	for _, another := range logs {
   385  		if l+compressSuffix == another {
   386  			return false
   387  		}
   388  	}
   389  	return true
   390  }
   391  
   392  // removeExcessLogs removes old logs to make sure there are only at most MaxFiles log files.
   393  func (c *containerLogManager) removeExcessLogs(logs []string) ([]string, error) {
   394  	// Sort log files in oldest to newest order.
   395  	sort.Strings(logs)
   396  	// Container will create a new log file, and we'll rotate the latest log file.
   397  	// Other than those 2 files, we can have at most MaxFiles-2 rotated log files.
   398  	// Keep MaxFiles-2 files by removing old files.
   399  	// We should remove from oldest to newest, so as not to break ongoing `kubectl logs`.
   400  	maxRotatedFiles := c.policy.MaxFiles - 2
   401  	if maxRotatedFiles < 0 {
   402  		maxRotatedFiles = 0
   403  	}
   404  	i := 0
   405  	for ; i < len(logs)-maxRotatedFiles; i++ {
   406  		if err := c.osInterface.Remove(logs[i]); err != nil {
   407  			return nil, fmt.Errorf("failed to remove old log %q: %v", logs[i], err)
   408  		}
   409  	}
   410  	logs = logs[i:]
   411  	return logs, nil
   412  }
   413  
   414  // compressLog compresses a log to log.gz with gzip.
   415  func (c *containerLogManager) compressLog(log string) error {
   416  	r, err := c.osInterface.Open(log)
   417  	if err != nil {
   418  		return fmt.Errorf("failed to open log %q: %v", log, err)
   419  	}
   420  	defer r.Close()
   421  	tmpLog := log + tmpSuffix
   422  	f, err := c.osInterface.OpenFile(tmpLog, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
   423  	if err != nil {
   424  		return fmt.Errorf("failed to create temporary log %q: %v", tmpLog, err)
   425  	}
   426  	defer func() {
   427  		// Best effort cleanup of tmpLog.
   428  		c.osInterface.Remove(tmpLog)
   429  	}()
   430  	defer f.Close()
   431  	w := gzip.NewWriter(f)
   432  	defer w.Close()
   433  	if _, err := io.Copy(w, r); err != nil {
   434  		return fmt.Errorf("failed to compress %q to %q: %v", log, tmpLog, err)
   435  	}
   436  	// The archive needs to be closed before renaming, otherwise an error will occur on Windows.
   437  	w.Close()
   438  	f.Close()
   439  	compressedLog := log + compressSuffix
   440  	if err := c.osInterface.Rename(tmpLog, compressedLog); err != nil {
   441  		return fmt.Errorf("failed to rename %q to %q: %v", tmpLog, compressedLog, err)
   442  	}
   443  	// Remove old log file.
   444  	r.Close()
   445  	if err := c.osInterface.Remove(log); err != nil {
   446  		return fmt.Errorf("failed to remove log %q after compress: %v", log, err)
   447  	}
   448  	return nil
   449  }
   450  
   451  // rotateLatestLog rotates latest log without compression, so that container can still write
   452  // and fluentd can finish reading.
   453  func (c *containerLogManager) rotateLatestLog(ctx context.Context, id, log string) error {
   454  	timestamp := c.clock.Now().Format(timestampFormat)
   455  	rotated := fmt.Sprintf("%s.%s", log, timestamp)
   456  	if err := c.osInterface.Rename(log, rotated); err != nil {
   457  		return fmt.Errorf("failed to rotate log %q to %q: %v", log, rotated, err)
   458  	}
   459  	if err := c.runtimeService.ReopenContainerLog(ctx, id); err != nil {
   460  		// Rename the rotated log back, so that we can try rotating it again
   461  		// next round.
   462  		// If kubelet gets restarted at this point, we'll lose original log.
   463  		if renameErr := c.osInterface.Rename(rotated, log); renameErr != nil {
   464  			// This shouldn't happen.
   465  			// Report an error if this happens, because we will lose original
   466  			// log.
   467  			klog.ErrorS(renameErr, "Failed to rename rotated log", "rotatedLog", rotated, "newLog", log, "containerID", id)
   468  		}
   469  		return fmt.Errorf("failed to reopen container log %q: %v", id, err)
   470  	}
   471  	return nil
   472  }