k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml

# This ConfigMap is used to ingest logs against new resources like
# "k8s_container" and "k8s_node" when $LOGGING_STACKDRIVER_RESOURCE_TYPES is set
# to "new".
# When $LOGGING_STACKDRIVER_RESOURCE_TYPES is set to "old", the ConfigMap in
# fluentd-gcp-configmap-old.yaml will be used for ingesting logs against old
# resources like "gke_container" and "gce_instance".
kind: ConfigMap
apiVersion: v1
data:
  containers.input.conf: |-
    # This Fluentd configuration watches for changes to Docker log files that
    # live in the directory /var/lib/docker/containers/ and are symbolically
    # linked to from the /var/log/containers directory using names that capture
    # the pod name and container name. These logs are then submitted to Google
    # Cloud Logging, which assumes that the cloud-logging plug-in is installed.
    #
    # Example
    # =======
    # A line in the Docker log file might look like this JSON:
    #
    # {"log":"2014/09/25 21:15:03 Got request with path wombat\\n",
    #  "stream":"stderr",
    #  "time":"2014-09-25T21:15:03.499185026Z"}
    #
    # The original tag is derived from the log file's location.
    # For example, a Docker container's logs might be in the directory
    #   /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b
    # in the file
    #   997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
    # where 997599971ee6... is the Docker ID of the running container.
    # The Kubernetes kubelet makes a symbolic link to this file on the host
    # machine in the /var/log/containers directory whose name includes the pod
    # name, the namespace name and the Kubernetes container name:
    #   synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
    #   ->
    #   /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
    # The /var/log directory on the host is mapped to the /var/log directory in
    # the container running this instance of Fluentd, so we end up collecting
    # the file
    #   /var/log/containers/synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
    # This results in the tag
    #   var.log.containers.synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
    # where 'synthetic-logger-0.25lps-pod' is the pod name, 'default' is the
    # namespace name, 'synth-lgr' is the container name and '997599971ee6...' is
    # the container ID.
    # The record reformer is used to extract pod_name, namespace_name and
    # container_name from the tag and set them in a local_resource_id in the
    # format
    #   'k8s_container.<NAMESPACE_NAME>.<POD_NAME>.<CONTAINER_NAME>'.
    # The reformer also changes the tags to 'stderr' or 'stdout' based on the
    # value of 'stream'.
    # local_resource_id is later used by the google_cloud plugin to determine
    # the monitored resource to ingest logs against.
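    #
    # For the example above, the record reformer therefore emits
    #   local_resource_id: 'k8s_container.default.synthetic-logger-0.25lps-pod.synth-lgr'
    # and the entry is retagged 'stderr' or 'stdout' according to its 'stream'
    # field.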

    # Json Log Example:
    # {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
    # CRI Log Example:
    # 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here
    <source>
      @type tail
      path /var/log/containers/*.log
      pos_file /var/log/gcp-containers.log.pos
      # Tags at this point are in the format of:
      # reform.var.log.containers.<POD_NAME>_<NAMESPACE_NAME>_<CONTAINER_NAME>-<CONTAINER_ID>.log
      tag reform.*
      read_from_head true
      <parse>
        @type multi_format
        <pattern>
          format json
          time_key time
          time_format %Y-%m-%dT%H:%M:%S.%NZ
        </pattern>
        <pattern>
          format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
          time_format %Y-%m-%dT%H:%M:%S.%N%:z
        </pattern>
      </parse>
    </source>

    # Parse glog-formatted lines embedded in the container output, extracting
    # 'severity', 'time', 'pid' and 'source'; records that do not match pass
    # through unchanged.
    <filter reform.**>
      @type parser
      format /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<log>.*)/
      reserve_data true
      suppress_parse_error_log true
      emit_invalid_record_to_error false
      key_name log
    </filter>

    <match reform.**>
      @type record_reformer
      enable_ruby true
      <record>
        # Extract local_resource_id from the tag for the 'k8s_container'
        # monitored resource. The format is:
        # 'k8s_container.<namespace_name>.<pod_name>.<container_name>'.
        # tag_suffix[4] is '<pod_name>_<namespace_name>_<container_name>-<container_id>.log';
        # rpartition('.') strips the '.log' extension, split('_') separates the
        # pod, namespace and container fields, and rpartition('-') strips the
        # trailing container ID.
        "logging.googleapis.com/local_resource_id" ${"k8s_container.#{tag_suffix[4].rpartition('.')[0].split('_')[1]}.#{tag_suffix[4].rpartition('.')[0].split('_')[0]}.#{tag_suffix[4].rpartition('.')[0].split('_')[2].rpartition('-')[0]}"}
        # Rename the field 'log' to the more generic field 'message'. This way
        # fluent-plugin-google-cloud knows to flatten the field as textPayload
        # instead of jsonPayload after extracting 'time', 'severity' and
        # 'stream' from the record.
        message ${record['log']}
        # If 'severity' is not set, assume stderr is ERROR and stdout is INFO.
        severity ${record['severity'] || if record['stream'] == 'stderr' then 'ERROR' else 'INFO' end}
      </record>
      tag ${if record['stream'] == 'stderr' then 'raw.stderr' else 'raw.stdout' end}
      remove_keys stream,log
    </match>

    # Detect exceptions in the log output and forward them as one log entry.
    <match {raw.stderr,raw.stdout}>
      @type detect_exceptions

      remove_tag_prefix raw
      message message
      # Group exception detection per container, i.e. per local_resource_id.
      stream "logging.googleapis.com/local_resource_id"
      multiline_flush_interval 5
      max_bytes 500000
      max_lines 1000
    </match>
  system.input.conf: |-
    # Example:
    # Dec 21 23:17:22 gke-foo-1-1-4b5cbd14-node-4eoj startupscript: Finished running startup script /var/run/google.startup.script
    <source>
      @type tail
      format syslog
      path /var/log/startupscript.log
      pos_file /var/log/gcp-startupscript.log.pos
      tag startupscript
    </source>

    # Examples:
    # time="2016-02-04T06:51:03.053580605Z" level=info msg="GET /containers/json"
    # time="2016-02-04T07:53:57.505612354Z" level=error msg="HTTP Error" err="No such image: -f" statusCode=404
    # TODO(random-liu): Remove this after cri container runtime rolls out.
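    # The regexp below parses the second example line into
    #   severity='error', message='HTTP Error', error='No such image: -f',
    #   status_code='404'
    # with the timestamp taken from the 'time' field.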
    <source>
      @type tail
      format /^time="(?<time>[^"]*)" level=(?<severity>[^ ]*) msg="(?<message>[^"]*)"( err="(?<error>[^"]*)")?( statusCode=(?<status_code>\d+))?/
      path /var/log/docker.log
      pos_file /var/log/gcp-docker.log.pos
      tag docker
    </source>

    # Example:
    # 2016/02/04 06:52:38 filePurge: successfully removed file /var/etcd/data/member/wal/00000000000006d0-00000000010a23d1.wal
    <source>
      @type tail
      # Not parsing this, because it doesn't have anything particularly useful to
      # parse out of it (like severities).
      format none
      path /var/log/etcd.log
      pos_file /var/log/gcp-etcd.log.pos
      tag etcd
    </source>

    # Multi-line parsing is required for all the kube logs because very large log
    # statements, such as those that include entire object bodies, get split into
    # multiple lines by glog.

    # Example:
    # I0204 07:32:30.020537 3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537]
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/kubelet.log
      pos_file /var/log/gcp-kubelet.log.pos
      tag kubelet
    </source>

    # Example:
    # I1118 21:26:53.975789 6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/kube-proxy.log
      pos_file /var/log/gcp-kube-proxy.log.pos
      tag kube-proxy
    </source>

    # Example:
    # I0204 07:00:19.604280 5 handlers.go:131] GET /api/v1/nodes: (1.624207ms) 200 [[kube-controller-manager/v1.1.3 (linux/amd64) kubernetes/6a81b50] 127.0.0.1:38266]
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/kube-apiserver.log
      pos_file /var/log/gcp-kube-apiserver.log.pos
      tag kube-apiserver
    </source>

    # Example:
    # I0204 06:55:31.872680 5 servicecontroller.go:277] LB already exists and doesn't need update for service kube-system/kube-ui
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/kube-controller-manager.log
      pos_file /var/log/gcp-kube-controller-manager.log.pos
      tag kube-controller-manager
    </source>

    # Example:
    # W0204 06:49:18.239674 7 reflector.go:245] pkg/scheduler/factory/factory.go:193: watch of *api.Service ended with: 401: The event in requested index is outdated and cleared (the requested history has been cleared [2578313/2577886]) [2579312]
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/kube-scheduler.log
      pos_file /var/log/gcp-kube-scheduler.log.pos
      tag kube-scheduler
    </source>
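    # All of the kube component sources above (and glbc/cluster-autoscaler
    # below) share the same glog format1 regexp. For the kubelet example line
    #   I0204 07:32:30.020537 3368 server.go:1048] POST /stats/container/: ...
    # it captures severity='I', time='0204 07:32:30.020537', pid='3368',
    # source='server.go:1048' and the remainder as 'message'.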

    # Example:
    # I0603 15:31:05.793605 6 cluster_manager.go:230] Reading config from path /etc/gce.conf
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/glbc.log
      pos_file /var/log/gcp-glbc.log.pos
      tag glbc
    </source>

    # Example:
    # I0603 15:31:05.793605 6 cluster_manager.go:230] Reading config from path /etc/gce.conf
    <source>
      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
      format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
      time_format %m%d %H:%M:%S.%N
      path /var/log/cluster-autoscaler.log
      pos_file /var/log/gcp-cluster-autoscaler.log.pos
      tag cluster-autoscaler
    </source>

    # Logs from systemd-journal for interesting services.
    # TODO(random-liu): Keep this for compatibility, remove this after
    # cri container runtime rolls out.
    <source>
      @type systemd
      filters [{ "_SYSTEMD_UNIT": "docker.service" }]
      pos_file /var/log/gcp-journald-docker.pos
      read_from_head true
      tag docker
    </source>

    <source>
      @type systemd
      filters [{ "_SYSTEMD_UNIT": "{{ fluentd_container_runtime_service }}.service" }]
      pos_file /var/log/gcp-journald-container-runtime.pos
      read_from_head true
      tag container-runtime
    </source>

    <source>
      @type systemd
      filters [{ "_SYSTEMD_UNIT": "kubelet.service" }]
      pos_file /var/log/gcp-journald-kubelet.pos
      read_from_head true
      tag kubelet
    </source>

    <source>
      @type systemd
      filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }]
      pos_file /var/log/gcp-journald-node-problem-detector.pos
      read_from_head true
      tag node-problem-detector
    </source>

    # BEGIN_NODE_JOURNAL
    # Whether to include the node journal is determined when the cluster is
    # started; it is not changed while the cluster is running.
    <source>
      @type systemd
      pos_file /var/log/gcp-journald.pos
      read_from_head true
      tag node-journal
    </source>

    # Exclude units that are already collected by the dedicated sources above.
    <filter node-journal>
      @type grep
      <exclude>
        key _SYSTEMD_UNIT
        pattern ^(docker|{{ fluentd_container_runtime_service }}|kubelet|node-problem-detector)\.service$
      </exclude>
    </filter>
    # END_NODE_JOURNAL
  monitoring.conf: |-
    # This source is used to acquire the approximate process start timestamp,
    # whose purpose is explained in the comment above the corresponding output
    # plugin.
    <source>
      @type exec
      command /bin/sh -c 'date +%s'
      tag process_start
      time_format %Y-%m-%d %H:%M:%S
      keys process_start_timestamp
    </source>

    # This filter converts the process start timestamp to an integer value so
    # that it is ingested correctly by the prometheus output plugin.
    <filter process_start>
      @type record_transformer
      enable_ruby true
      auto_typecast true
      <record>
        process_start_timestamp ${record["process_start_timestamp"].to_i}
      </record>
    </filter>
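    # Since no run_interval is configured, in_exec should run the command just
    # once at startup, emitting a single record such as
    #   {"process_start_timestamp": "1455667445"}
    # which the filter above turns into the integer 1455667445.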
  output.conf: |-
    # This match is placed before the all-matching output to provide the metric
    # exporter with a process start timestamp for correct exporting of
    # cumulative metrics to Stackdriver.
    <match process_start>
      @type prometheus

      <metric>
        type gauge
        name process_start_time_seconds
        desc Timestamp of the process start in seconds
        key process_start_timestamp
      </metric>
    </match>

    # This filter counts the number of log entries read by fluentd before they
    # are processed by the output plugin. This in turn makes it possible to
    # monitor the number of log entries that were read but never sent, e.g.
    # because the liveness probe removed the buffer.
    <filter **>
      @type prometheus
      <metric>
        type counter
        name logging_entry_count
        desc Total number of log entries generated by either application containers or system components
      </metric>
    </filter>

    # This section is exclusive for k8s_container logs. Those come with
    # 'stderr'/'stdout' tags.
    # TODO(instrumentation): Reconsider this workaround later.
    # Trim entries that exceed slightly less than 100KB to avoid dropping them
    # entirely. This is a necessity, because Stackdriver only supports entries
    # that are up to 100KB in size.
    <filter {stderr,stdout}>
      @type record_transformer
      enable_ruby true
      <record>
        message ${record['message'].length > 100000 ? "[Trimmed]#{record['message'][0..100000]}..." : record['message']}
      </record>
    </filter>

    # Do not collect fluentd's own logs to avoid infinite loops.
    <match fluent.**>
      @type null
    </match>

    # Add a unique insertId to each log entry that doesn't already have it.
    # This helps guarantee the order and prevent log duplication.
    <filter **>
      @type add_insert_ids
    </filter>

    # This section is exclusive for k8s_container logs. These logs come with
    # 'stderr'/'stdout' tags.
    # We use a separate output stanza for 'k8s_node' logs with a smaller buffer
    # because node logs are less important than users' container logs.
    <match {stderr,stdout}>
      @type google_cloud

      # Try to detect JSON formatted log entries.
      detect_json true
      # Collect metrics in Prometheus registry about plugin activity.
      enable_monitoring true
      monitoring_type prometheus
      # Allow log entries from multiple containers to be sent in the same request.
      split_logs_by_tag false
      # Set the buffer type to file to improve reliability and reduce memory
      # consumption.
      buffer_type file
      buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
      # Set queue_full action to block, because we want to pause gracefully
      # under excessive load instead of throwing an exception.
      buffer_queue_full_action block
      # Set the chunk limit conservatively to avoid exceeding the recommended
      # chunk size of 5MB per write request.
      buffer_chunk_limit 512k
      # Cap the combined memory usage of this buffer and the one below to
      # 512KiB/chunk * (6 + 2) chunks = 4 MiB
      buffer_queue_limit 6
      # Never wait more than 5 seconds before flushing logs in the non-error case.
      flush_interval 5s
      # Never wait longer than 30 seconds between retries.
      max_retry_wait 30
      # Disable the limit on the number of retries (retry forever).
      disable_retry_limit
      # Use multiple threads for processing.
      num_threads 2
      use_grpc true
      # Skip timestamp adjustment, as this is in a controlled environment with
      # a known timestamp format. This helps with CPU usage.
      adjust_invalid_timestamps false
    </match>
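    # Everything tagged 'stderr'/'stdout' has been consumed by the match above,
    # so only node-level logs (docker, kubelet, etcd, journal, ...) reach the
    # filter and match below.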
    # Attach local_resource_id for the 'k8s_node' monitored resource.
    <filter **>
      @type record_transformer
      enable_ruby true
      <record>
        "logging.googleapis.com/local_resource_id" ${"k8s_node.#{ENV['NODE_NAME']}"}
      </record>
    </filter>

    # This section is exclusive for 'k8s_node' logs. These logs come with tags
    # that are neither 'stderr' nor 'stdout'.
    # We use a separate output stanza for 'k8s_container' logs with a larger
    # buffer because users' container logs are more important than node logs.
    <match **>
      @type google_cloud

      detect_json true
      enable_monitoring true
      monitoring_type prometheus
      # Allow entries from multiple system logs to be sent in the same request.
      split_logs_by_tag false
      detect_subservice false
      buffer_type file
      buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
      buffer_queue_full_action block
      buffer_chunk_limit 512k
      buffer_queue_limit 2
      flush_interval 5s
      max_retry_wait 30
      disable_retry_limit
      num_threads 2
      use_grpc true
      # Skip timestamp adjustment, as this is in a controlled environment with
      # a known timestamp format. This helps with CPU usage.
      adjust_invalid_timestamps false
    </match>
metadata:
  name: fluentd-gcp-config-v1.2.5
  namespace: kube-system
  labels:
    addonmanager.kubernetes.io/mode: Reconcile