k8s.io/kubernetes@v1.29.3/pkg/kubelet/logs/container_log_manager.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package logs
    18  
    19  import (
    20  	"compress/gzip"
    21  	"context"
    22  	"fmt"
    23  	"io"
    24  	"os"
    25  	"path/filepath"
    26  	"sort"
    27  	"strings"
    28  	"sync"
    29  	"time"
    30  
    31  	"k8s.io/klog/v2"
    32  
    33  	"k8s.io/apimachinery/pkg/api/resource"
    34  	"k8s.io/apimachinery/pkg/util/wait"
    35  	internalapi "k8s.io/cri-api/pkg/apis"
    36  	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
    37  	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
    38  	"k8s.io/utils/clock"
    39  )
    40  
    41  const (
    42  	// logMonitorPeriod is the period container log manager monitors
    43  	// container logs and performs log rotation.
    44  	logMonitorPeriod = 10 * time.Second
    45  	// timestampFormat is format of the timestamp suffix for rotated log.
    46  	// See https://golang.org/pkg/time/#Time.Format.
    47  	timestampFormat = "20060102-150405"
    48  	// compressSuffix is the suffix for compressed log.
    49  	compressSuffix = ".gz"
    50  	// tmpSuffix is the suffix for temporary file.
    51  	tmpSuffix = ".tmp"
    52  )
    53  
    54  // ContainerLogManager manages lifecycle of all container logs.
    55  //
    56  // Implementation is thread-safe.
    57  type ContainerLogManager interface {
    58  	// TODO(random-liu): Add RotateLogs function and call it under disk pressure.
    59  	// Start container log manager.
    60  	Start()
    61  	// Clean removes all logs of specified container.
    62  	Clean(ctx context.Context, containerID string) error
    63  }
    64  
    65  // LogRotatePolicy is a policy for container log rotation. The policy applies to all
    66  // containers managed by kubelet.
    67  type LogRotatePolicy struct {
    68  	// MaxSize in bytes of the container log file before it is rotated. Negative
    69  	// number means to disable container log rotation.
    70  	MaxSize int64
    71  	// MaxFiles is the maximum number of log files that can be present.
    72  	// If rotating the logs creates excess files, the oldest file is removed.
    73  	MaxFiles int
    74  }
    75  
    76  // GetAllLogs gets all inuse (rotated/compressed) logs for a specific container log.
    77  // Returned logs are sorted in oldest to newest order.
    78  // TODO(#59902): Leverage this function to support log rotation in `kubectl logs`.
    79  func GetAllLogs(log string) ([]string, error) {
    80  	// pattern is used to match all rotated files.
    81  	pattern := fmt.Sprintf("%s.*", log)
    82  	logs, err := filepath.Glob(pattern)
    83  	if err != nil {
    84  		return nil, fmt.Errorf("failed to list all log files with pattern %q: %v", pattern, err)
    85  	}
    86  	inuse, _ := filterUnusedLogs(logs)
    87  	sort.Strings(inuse)
    88  	return append(inuse, log), nil
    89  }
    90  
    91  // compressReadCloser wraps gzip.Reader with a function to close file handler.
    92  type compressReadCloser struct {
    93  	f *os.File
    94  	*gzip.Reader
    95  }
    96  
    97  func (rc *compressReadCloser) Close() error {
    98  	ferr := rc.f.Close()
    99  	rerr := rc.Reader.Close()
   100  	if ferr != nil {
   101  		return ferr
   102  	}
   103  	if rerr != nil {
   104  		return rerr
   105  	}
   106  	return nil
   107  }
   108  
   109  // UncompressLog compresses a compressed log and return a readcloser for the
   110  // stream of the uncompressed content.
   111  // TODO(#59902): Leverage this function to support log rotation in `kubectl logs`.
   112  func UncompressLog(log string) (_ io.ReadCloser, retErr error) {
   113  	if !strings.HasSuffix(log, compressSuffix) {
   114  		return nil, fmt.Errorf("log is not compressed")
   115  	}
   116  	f, err := os.Open(log)
   117  	if err != nil {
   118  		return nil, fmt.Errorf("failed to open log: %v", err)
   119  	}
   120  	defer func() {
   121  		if retErr != nil {
   122  			f.Close()
   123  		}
   124  	}()
   125  	r, err := gzip.NewReader(f)
   126  	if err != nil {
   127  		return nil, fmt.Errorf("failed to create gzip reader: %v", err)
   128  	}
   129  	return &compressReadCloser{f: f, Reader: r}, nil
   130  }
   131  
   132  // parseMaxSize parses quantity string to int64 max size in bytes.
   133  func parseMaxSize(size string) (int64, error) {
   134  	quantity, err := resource.ParseQuantity(size)
   135  	if err != nil {
   136  		return 0, err
   137  	}
   138  	maxSize, ok := quantity.AsInt64()
   139  	if !ok {
   140  		return 0, fmt.Errorf("invalid max log size")
   141  	}
   142  	return maxSize, nil
   143  }
   144  
   145  type containerLogManager struct {
   146  	runtimeService internalapi.RuntimeService
   147  	osInterface    kubecontainer.OSInterface
   148  	policy         LogRotatePolicy
   149  	clock          clock.Clock
   150  	mutex          sync.Mutex
   151  }
   152  
   153  // NewContainerLogManager creates a new container log manager.
   154  func NewContainerLogManager(runtimeService internalapi.RuntimeService, osInterface kubecontainer.OSInterface, maxSize string, maxFiles int) (ContainerLogManager, error) {
   155  	if maxFiles <= 1 {
   156  		return nil, fmt.Errorf("invalid MaxFiles %d, must be > 1", maxFiles)
   157  	}
   158  	parsedMaxSize, err := parseMaxSize(maxSize)
   159  	if err != nil {
   160  		return nil, fmt.Errorf("failed to parse container log max size %q: %v", maxSize, err)
   161  	}
   162  	// Negative number means to disable container log rotation
   163  	if parsedMaxSize < 0 {
   164  		return NewStubContainerLogManager(), nil
   165  	}
   166  	// policy LogRotatePolicy
   167  	return &containerLogManager{
   168  		osInterface:    osInterface,
   169  		runtimeService: runtimeService,
   170  		policy: LogRotatePolicy{
   171  			MaxSize:  parsedMaxSize,
   172  			MaxFiles: maxFiles,
   173  		},
   174  		clock: clock.RealClock{},
   175  		mutex: sync.Mutex{},
   176  	}, nil
   177  }
   178  
   179  // Start the container log manager.
   180  func (c *containerLogManager) Start() {
   181  	ctx := context.Background()
   182  	// Start a goroutine periodically does container log rotation.
   183  	go wait.Forever(func() {
   184  		if err := c.rotateLogs(ctx); err != nil {
   185  			klog.ErrorS(err, "Failed to rotate container logs")
   186  		}
   187  	}, logMonitorPeriod)
   188  }
   189  
   190  // Clean removes all logs of specified container (including rotated one).
   191  func (c *containerLogManager) Clean(ctx context.Context, containerID string) error {
   192  	c.mutex.Lock()
   193  	defer c.mutex.Unlock()
   194  	resp, err := c.runtimeService.ContainerStatus(ctx, containerID, false)
   195  	if err != nil {
   196  		return fmt.Errorf("failed to get container status %q: %v", containerID, err)
   197  	}
   198  	if resp.GetStatus() == nil {
   199  		return fmt.Errorf("container status is nil for %q", containerID)
   200  	}
   201  	pattern := fmt.Sprintf("%s*", resp.GetStatus().GetLogPath())
   202  	logs, err := c.osInterface.Glob(pattern)
   203  	if err != nil {
   204  		return fmt.Errorf("failed to list all log files with pattern %q: %v", pattern, err)
   205  	}
   206  
   207  	for _, l := range logs {
   208  		if err := c.osInterface.Remove(l); err != nil && !os.IsNotExist(err) {
   209  			return fmt.Errorf("failed to remove container %q log %q: %v", containerID, l, err)
   210  		}
   211  	}
   212  
   213  	return nil
   214  }
   215  
   216  func (c *containerLogManager) rotateLogs(ctx context.Context) error {
   217  	c.mutex.Lock()
   218  	defer c.mutex.Unlock()
   219  	// TODO(#59998): Use kubelet pod cache.
   220  	containers, err := c.runtimeService.ListContainers(ctx, &runtimeapi.ContainerFilter{})
   221  	if err != nil {
   222  		return fmt.Errorf("failed to list containers: %v", err)
   223  	}
   224  	// NOTE(random-liu): Figure out whether we need to rotate container logs in parallel.
   225  	for _, container := range containers {
   226  		// Only rotate logs for running containers. Non-running containers won't
   227  		// generate new output, it doesn't make sense to keep an empty latest log.
   228  		if container.GetState() != runtimeapi.ContainerState_CONTAINER_RUNNING {
   229  			continue
   230  		}
   231  		id := container.GetId()
   232  		// Note that we should not block log rotate for an error of a single container.
   233  		resp, err := c.runtimeService.ContainerStatus(ctx, id, false)
   234  		if err != nil {
   235  			klog.ErrorS(err, "Failed to get container status", "containerID", id)
   236  			continue
   237  		}
   238  		if resp.GetStatus() == nil {
   239  			klog.ErrorS(err, "Container status is nil", "containerID", id)
   240  			continue
   241  		}
   242  		path := resp.GetStatus().GetLogPath()
   243  		info, err := c.osInterface.Stat(path)
   244  		if err != nil {
   245  			if !os.IsNotExist(err) {
   246  				klog.ErrorS(err, "Failed to stat container log", "path", path)
   247  				continue
   248  			}
   249  			// In rotateLatestLog, there are several cases that we may
   250  			// lose original container log after ReopenContainerLog fails.
   251  			// We try to recover it by reopening container log.
   252  			if err := c.runtimeService.ReopenContainerLog(ctx, id); err != nil {
   253  				klog.ErrorS(err, "Container log doesn't exist, reopen container log failed", "containerID", id, "path", path)
   254  				continue
   255  			}
   256  			// The container log should be recovered.
   257  			info, err = c.osInterface.Stat(path)
   258  			if err != nil {
   259  				klog.ErrorS(err, "Failed to stat container log after reopen", "path", path)
   260  				continue
   261  			}
   262  		}
   263  		if info.Size() < c.policy.MaxSize {
   264  			continue
   265  		}
   266  		// Perform log rotation.
   267  		if err := c.rotateLog(ctx, id, path); err != nil {
   268  			klog.ErrorS(err, "Failed to rotate log for container", "path", path, "containerID", id)
   269  			continue
   270  		}
   271  	}
   272  	return nil
   273  }
   274  
   275  func (c *containerLogManager) rotateLog(ctx context.Context, id, log string) error {
   276  	// pattern is used to match all rotated files.
   277  	pattern := fmt.Sprintf("%s.*", log)
   278  	logs, err := filepath.Glob(pattern)
   279  	if err != nil {
   280  		return fmt.Errorf("failed to list all log files with pattern %q: %v", pattern, err)
   281  	}
   282  
   283  	logs, err = c.cleanupUnusedLogs(logs)
   284  	if err != nil {
   285  		return fmt.Errorf("failed to cleanup logs: %v", err)
   286  	}
   287  
   288  	logs, err = c.removeExcessLogs(logs)
   289  	if err != nil {
   290  		return fmt.Errorf("failed to remove excess logs: %v", err)
   291  	}
   292  
   293  	// Compress uncompressed log files.
   294  	for _, l := range logs {
   295  		if strings.HasSuffix(l, compressSuffix) {
   296  			continue
   297  		}
   298  		if err := c.compressLog(l); err != nil {
   299  			return fmt.Errorf("failed to compress log %q: %v", l, err)
   300  		}
   301  	}
   302  
   303  	if err := c.rotateLatestLog(ctx, id, log); err != nil {
   304  		return fmt.Errorf("failed to rotate log %q: %v", log, err)
   305  	}
   306  
   307  	return nil
   308  }
   309  
   310  // cleanupUnusedLogs cleans up temporary or unused log files generated by previous log rotation
   311  // failure.
   312  func (c *containerLogManager) cleanupUnusedLogs(logs []string) ([]string, error) {
   313  	inuse, unused := filterUnusedLogs(logs)
   314  	for _, l := range unused {
   315  		if err := c.osInterface.Remove(l); err != nil {
   316  			return nil, fmt.Errorf("failed to remove unused log %q: %v", l, err)
   317  		}
   318  	}
   319  	return inuse, nil
   320  }
   321  
   322  // filterUnusedLogs splits logs into 2 groups, the 1st group is in used logs,
   323  // the second group is unused logs.
   324  func filterUnusedLogs(logs []string) (inuse []string, unused []string) {
   325  	for _, l := range logs {
   326  		if isInUse(l, logs) {
   327  			inuse = append(inuse, l)
   328  		} else {
   329  			unused = append(unused, l)
   330  		}
   331  	}
   332  	return inuse, unused
   333  }
   334  
   335  // isInUse checks whether a container log file is still inuse.
   336  func isInUse(l string, logs []string) bool {
   337  	// All temporary files are not in use.
   338  	if strings.HasSuffix(l, tmpSuffix) {
   339  		return false
   340  	}
   341  	// All compressed logs are in use.
   342  	if strings.HasSuffix(l, compressSuffix) {
   343  		return true
   344  	}
   345  	// Files has already been compressed are not in use.
   346  	for _, another := range logs {
   347  		if l+compressSuffix == another {
   348  			return false
   349  		}
   350  	}
   351  	return true
   352  }
   353  
   354  // removeExcessLogs removes old logs to make sure there are only at most MaxFiles log files.
   355  func (c *containerLogManager) removeExcessLogs(logs []string) ([]string, error) {
   356  	// Sort log files in oldest to newest order.
   357  	sort.Strings(logs)
   358  	// Container will create a new log file, and we'll rotate the latest log file.
   359  	// Other than those 2 files, we can have at most MaxFiles-2 rotated log files.
   360  	// Keep MaxFiles-2 files by removing old files.
   361  	// We should remove from oldest to newest, so as not to break ongoing `kubectl logs`.
   362  	maxRotatedFiles := c.policy.MaxFiles - 2
   363  	if maxRotatedFiles < 0 {
   364  		maxRotatedFiles = 0
   365  	}
   366  	i := 0
   367  	for ; i < len(logs)-maxRotatedFiles; i++ {
   368  		if err := c.osInterface.Remove(logs[i]); err != nil {
   369  			return nil, fmt.Errorf("failed to remove old log %q: %v", logs[i], err)
   370  		}
   371  	}
   372  	logs = logs[i:]
   373  	return logs, nil
   374  }
   375  
   376  // compressLog compresses a log to log.gz with gzip.
   377  func (c *containerLogManager) compressLog(log string) error {
   378  	r, err := c.osInterface.Open(log)
   379  	if err != nil {
   380  		return fmt.Errorf("failed to open log %q: %v", log, err)
   381  	}
   382  	defer r.Close()
   383  	tmpLog := log + tmpSuffix
   384  	f, err := c.osInterface.OpenFile(tmpLog, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
   385  	if err != nil {
   386  		return fmt.Errorf("failed to create temporary log %q: %v", tmpLog, err)
   387  	}
   388  	defer func() {
   389  		// Best effort cleanup of tmpLog.
   390  		c.osInterface.Remove(tmpLog)
   391  	}()
   392  	defer f.Close()
   393  	w := gzip.NewWriter(f)
   394  	defer w.Close()
   395  	if _, err := io.Copy(w, r); err != nil {
   396  		return fmt.Errorf("failed to compress %q to %q: %v", log, tmpLog, err)
   397  	}
   398  	// The archive needs to be closed before renaming, otherwise an error will occur on Windows.
   399  	w.Close()
   400  	f.Close()
   401  	compressedLog := log + compressSuffix
   402  	if err := c.osInterface.Rename(tmpLog, compressedLog); err != nil {
   403  		return fmt.Errorf("failed to rename %q to %q: %v", tmpLog, compressedLog, err)
   404  	}
   405  	// Remove old log file.
   406  	r.Close()
   407  	if err := c.osInterface.Remove(log); err != nil {
   408  		return fmt.Errorf("failed to remove log %q after compress: %v", log, err)
   409  	}
   410  	return nil
   411  }
   412  
   413  // rotateLatestLog rotates latest log without compression, so that container can still write
   414  // and fluentd can finish reading.
   415  func (c *containerLogManager) rotateLatestLog(ctx context.Context, id, log string) error {
   416  	timestamp := c.clock.Now().Format(timestampFormat)
   417  	rotated := fmt.Sprintf("%s.%s", log, timestamp)
   418  	if err := c.osInterface.Rename(log, rotated); err != nil {
   419  		return fmt.Errorf("failed to rotate log %q to %q: %v", log, rotated, err)
   420  	}
   421  	if err := c.runtimeService.ReopenContainerLog(ctx, id); err != nil {
   422  		// Rename the rotated log back, so that we can try rotating it again
   423  		// next round.
   424  		// If kubelet gets restarted at this point, we'll lose original log.
   425  		if renameErr := c.osInterface.Rename(rotated, log); renameErr != nil {
   426  			// This shouldn't happen.
   427  			// Report an error if this happens, because we will lose original
   428  			// log.
   429  			klog.ErrorS(renameErr, "Failed to rename rotated log", "rotatedLog", rotated, "newLog", log, "containerID", id)
   430  		}
   431  		return fmt.Errorf("failed to reopen container log %q: %v", id, err)
   432  	}
   433  	return nil
   434  }