k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/cluster/gce/windows/k8s-node-setup.psm1 (about) 1 # Copyright 2019 The Kubernetes Authors. 2 # 3 # Licensed under the Apache License, Version 2.0 (the "License"); 4 # you may not use this file except in compliance with the License. 5 # You may obtain a copy of the License at 6 # 7 # http://www.apache.org/licenses/LICENSE-2.0 8 # 9 # Unless required by applicable law or agreed to in writing, software 10 # distributed under the License is distributed on an "AS IS" BASIS, 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 # See the License for the specific language governing permissions and 13 # limitations under the License. 14 15 <# 16 .SYNOPSIS 17 Library for configuring Windows nodes and joining them to the cluster. 18 19 .NOTES 20 This module depends on common.psm1. 21 22 Some portions copied / adapted from 23 https://github.com/Microsoft/SDN/blob/master/Kubernetes/windows/start-kubelet.ps1. 24 25 .EXAMPLE 26 Suggested usage for dev/test: 27 [Net.ServicePointManager]::SecurityProtocol = ` 28 [Net.SecurityProtocolType]::Tls12 29 Invoke-WebRequest ` 30 https://github.com/kubernetes/kubernetes/raw/master/cluster/gce/windows/k8s-node-setup.psm1 ` 31 -OutFile C:\k8s-node-setup.psm1 32 Invoke-WebRequest ` 33 https://github.com/kubernetes/kubernetes/raw/master/cluster/gce/windows/configure.ps1 ` 34 -OutFile C:\configure.ps1 35 Import-Module -Force C:\k8s-node-setup.psm1 # -Force to override existing 36 # Execute functions manually or run configure.ps1. 37 #> 38 39 # IMPORTANT PLEASE NOTE: 40 # Any time the file structure in the `windows` directory changes, `windows/BUILD` 41 # and `k8s.io/release/lib/releaselib.sh` must be manually updated with the changes. 42 # We HIGHLY recommend not changing the file structure, because consumers of 43 # Kubernetes releases depend on the release structure remaining stable. 
# TODO: update scripts for these style guidelines:
#  - Remove {} around variable references unless actually needed for clarity.
#  - Always use single-quoted strings unless actually interpolating variables
#    or using escape characters.
#  - Use "approved verbs":
#    https://docs.microsoft.com/en-us/powershell/developer/cmdlet/approved-verbs-for-windows-powershell-commands
#  - Document functions using proper syntax:
#    https://technet.microsoft.com/en-us/library/hh847834(v=wps.620).aspx

# Address of the GCE metadata server.
$GCE_METADATA_SERVER = '169.254.169.254'
# The "management" interface is used by the kubelet and by Windows pods to talk
# to the rest of the Kubernetes cluster *without NAT*. This interface does not
# exist until an initial HNS network has been created on the Windows node - see
# Add_InitialHnsNetwork().
$MGMT_ADAPTER_NAME = 'vEthernet (Ethernet*'
$CRICTL_VERSION = 'v1.30.0'
$CRICTL_SHA256 = '43d37d94c0dc03830c0988049537fc22fe4b0ad4273ec9066e03586dc8920eb0'

Import-Module -Force C:\common.psm1

# Emits "TODO: <Message>" through Log-Output.
function Log_Todo {
  param (
    [Parameter(Mandatory = $true)]
    [string]$Message
  )
  Log-Output "TODO: $Message"
}

# Emits "Not implemented yet: <Message>" through Log-Output with -Fatal set,
# which is expected to terminate the script.
function Log_NotImplemented {
  param (
    [Parameter(Mandatory = $true)]
    [string]$Message
  )
  Log-Output "Not implemented yet: $Message" -Fatal
}

# Fails and exits if the route to the GCE metadata server is not present,
# otherwise does nothing and emits nothing.
function Verify_GceMetadataServerRouteIsPresent {
  # -ErrorAction Stop turns the "route not found" condition into an exception
  # that the typed Catch below can intercept.
  $route_query = @{
    ErrorAction       = 'Stop'
    AddressFamily     = 'IPv4'
    DestinationPrefix = "${GCE_METADATA_SERVER}/32"
  }
  Try {
    Get-NetRoute @route_query | Out-Null
  } Catch [Microsoft.PowerShell.Cmdletization.Cim.CimJobException] {
    # Include the full IPv4 routing table in the fatal message for debugging.
    $all_routes = Get-NetRoute -AddressFamily IPv4 | Out-String
    Log-Output -Fatal `
        ("GCE metadata server route is not present as expected.`n" +
         $all_routes)
  }
}

# Checks if the route to the GCE metadata server is present. Returns when the
# route is NOT present or after a timeout has expired.
function WaitFor_GceMetadataServerRouteToBeRemoved {
  $timeout = 60
  $poll_interval = 2
  Log-Output ("Waiting up to ${timeout} seconds for GCE metadata server " +
              "route to be removed")
  for ($waited = 0; $waited -lt $timeout; $waited += $poll_interval) {
    Try {
      Get-NetRoute `
          -ErrorAction 'Stop' `
          -AddressFamily IPv4 `
          -DestinationPrefix "${GCE_METADATA_SERVER}/32" | Out-Null
    } Catch [Microsoft.PowerShell.Cmdletization.Cim.CimJobException] {
      # The lookup failed, i.e. the route is gone: stop waiting.
      break
    }
    Start-Sleep $poll_interval
  }
}

# Adds a route to the GCE metadata server to every network interface.
function Add_GceMetadataServerRoute {
  # Before setting up HNS the Windows VM has a "vEthernet (nat)" interface and
  # a "Ethernet" interface, and the route to the metadata server exists on the
  # Ethernet interface. After adding the HNS network a "vEthernet (Ethernet)"
  # interface is added, and it seems to subsume the routes of the "Ethernet"
  # interface (trying to add routes on the Ethernet interface at this point just
  # results in "New-NetRoute : Element not found" errors). I don't know what's
  # up with that, but since it's hard to know what's the right thing to do here
  # we just try to add the route on all of the network adapters.
  foreach ($adapter in Get-NetAdapter) {
    # Errors are ignored on purpose: not every adapter accepts the route.
    New-NetRoute `
        -ErrorAction Ignore `
        -DestinationPrefix "${GCE_METADATA_SERVER}/32" `
        -InterfaceIndex $adapter.InterfaceIndex | Out-Null
  }
}

# Returns a PowerShell object representing the Windows version, with the
# properties Major, Minor, Build and Revision.
function Get_WindowsVersion {
  # Unlike checking `[System.Environment]::OSVersion.Version`, this long-winded
  # approach gets the OS revision/patch number correctly
  # (https://superuser.com/a/1160428/652018).
  $reg_path = 'Registry::HKEY_LOCAL_MACHINE\Software\Microsoft\Windows NT\CurrentVersion'
  # Output property name -> registry value name, in output order.
  $fields = [ordered]@{
    Major    = 'CurrentMajorVersionNumber'
    Minor    = 'CurrentMinorVersionNumber'
    Build    = 'CurrentBuild'
    Revision = 'UBR'
  }

  $win_ver = New-Object -TypeName PSObject
  foreach ($field in $fields.GetEnumerator()) {
    $reg_name = $field.Value
    $reg_value = (Get-ItemProperty -Path $reg_path $reg_name).$reg_name
    $win_ver | Add-Member -MemberType NoteProperty -Name $field.Key -Value $reg_value
  }
  return $win_ver
}

# Writes debugging information, such as Windows version and patch info, to the
# console.
function Dump-DebugInfoToConsole {
  Try {
    $version = Get_WindowsVersion | Out-String
    $hotfixes = "$(Get-Hotfix | Out-String)"
    $image = "$(Get-InstanceMetadata 'image' | Out-String)"
    Log-Output "Windows version:`n$version"
    Log-Output "Installed hotfixes:`n$hotfixes"
    Log-Output "GCE Windows image:`n$image"
  } Catch {
    # Best effort only: failing to gather debug info must never abort setup.
  }
}

# Configures Windows Defender preferences: disables sample submission and
# Microsoft Active Protection Service (MAPS) reporting, then emits the
# resulting preferences. Does nothing if the Windows-Defender feature is not
# installed.
function Configure-WindowsDefender {
  if (-not (Get-WindowsFeature -Name 'Windows-Defender').Installed) {
    return
  }

  Log-Output "Configuring Windows Defender preferences"
  # Each message is logged before the action it describes so that a failure
  # of Set-MpPreference is attributed to the right step.
  Log-Output "Disabling Windows Defender sample submission"
  Set-MpPreference -SubmitSamplesConsent NeverSend
  Log-Output "Disabling Windows Defender Microsoft Active Protection Service Reporting"
  Set-MpPreference -MAPSReporting Disabled

  Log-Output "Defender Preferences"
  Get-MpPreference
}

# Converts the kube-env string (flat YAML "KEY: value" records) into a table.
#
# A record starts on a line whose first character is not whitespace; any
# following lines that start with whitespace are appended to it verbatim.
# Each record is split at its first ':' and the value is stripped of
# surrounding spaces, single quotes and double quotes. Records that contain
# no ':' are reported and skipped.
#
# Returns: a PowerShell Hashtable object containing the key-value pairs from
# kube-env.
function ConvertFrom_Yaml_KubeEnv {
  param (
    [parameter(Mandatory=$true)] [string]$kube_env_str
  )

  # Pass 1: stitch continuation lines onto the record they belong to.
  $records = New-Object System.Collections.Generic.List[string]
  $current = $null
  foreach ($line in (${kube_env_str} -split '\r?\n')) {
    if ($line -match '^\S') {
      # Start of a new record: flush the previous one, if any.
      if ($null -ne $current) {
        $records.Add($current)
      }
      $current = $line
    }
    elseif ($line -match '^\s+') {
      # Continuation of the current record.
      $current += $line
    }
    # Empty lines match neither pattern and are ignored.
  }
  if ($current) {
    $records.Add($current)
  }

  # Pass 2: split each record into key and value.
  $kube_env_table = @{}
  foreach ($record in $records) {
    $key, $val = $record -split ':', 2
    if ($null -eq $val) {
      # The content is deliberately not logged: it may be part of a secret.
      Log-Output "Skipping malformed kube-env record (no ':' separator found)"
      continue
    }
    $kube_env_table[$key] = $val.Trim("'", " ", "`"")
  }

  return ${kube_env_table}
}

# Fetches the kube-env from the instance metadata.
#
# Logs every entry whose key does not contain CERT, KEY or TOKEN; those
# entries hold credentials (e.g. KUBE_PROXY_TOKEN) and must not reach the
# logs.
#
# Returns: a PowerShell Hashtable object containing the key-value pairs from
# kube-env.
function Fetch-KubeEnv {
  # Testing / debugging:
  # First:
  #   ${kube_env} = Get-InstanceMetadataAttribute 'kube-env'
  # or:
  #   ${kube_env} = [IO.File]::ReadAllText(".\kubeEnv.txt")
  # ${kube_env_table} = ConvertFrom_Yaml_KubeEnv ${kube_env}
  # ${kube_env_table}
  # ${kube_env_table}.GetType()

  # The type of kube_env is a powershell String.
  $kube_env = Get-InstanceMetadataAttribute 'kube-env'
  $kube_env_table = ConvertFrom_Yaml_KubeEnv ${kube_env}

  # Substrings (case-sensitive) that mark a key as carrying secret material.
  $secret_markers = @('CERT', 'KEY', 'TOKEN')

  Log-Output "Logging kube-env key-value pairs except CERT, KEY and TOKEN values"
  foreach ($entry in $kube_env_table.GetEnumerator()) {
    $is_secret = $false
    foreach ($marker in $secret_markers) {
      if ($entry.Name.Contains($marker)) {
        $is_secret = $true
        break
      }
    }
    if (-not $is_secret) {
      Log-Output "$($entry.Name): $($entry.Value)"
    }
  }
  return ${kube_env_table}
}

# Sets the environment variable $Key to $Value at the Machine scope (will
# be present in the environment for all new shells after a reboot).
function Set_MachineEnvironmentVar {
  param (
    [parameter(Mandatory=$true)] [string]$Key,
    [parameter(Mandatory=$true)] [AllowEmptyString()] [string]$Value
  )
  [Environment]::SetEnvironmentVariable($Key, $Value, "Machine")
}

# Sets the environment variable $Key to $Value in the current shell.
#
# The value is passed straight to the process environment instead of being
# spliced into a script string, so quotes, '$' and backticks in $Value are
# stored literally rather than being re-interpreted by the parser.
function Set_CurrentShellEnvironmentVar {
  param (
    [parameter(Mandatory=$true)] [string]$Key,
    [parameter(Mandatory=$true)] [AllowEmptyString()] [string]$Value
  )
  [Environment]::SetEnvironmentVariable($Key, $Value, "Process")
}

# Sets environment variables used by Kubernetes binaries and by other functions
# in this module. Depends on numerous ${kube_env} keys.
function Set-EnvironmentVars {
  # Prefer the Windows-specific runtime endpoint; fall back to the generic one.
  if ($kube_env.ContainsKey('WINDOWS_CONTAINER_RUNTIME_ENDPOINT')) {
    $container_runtime_endpoint = ${kube_env}['WINDOWS_CONTAINER_RUNTIME_ENDPOINT']
  } else {
    Log-Output "ERROR: WINDOWS_CONTAINER_RUNTIME_ENDPOINT not set in kube-env, falling back in CONTAINER_RUNTIME_ENDPOINT"
    $container_runtime_endpoint = ${kube_env}['CONTAINER_RUNTIME_ENDPOINT']
  }
  # Turning the kube-env values into environment variables is not required but
  # it makes debugging this script easier, and it also makes the syntax a lot
  # easier (${env:K8S_DIR} can be expanded within a string but
  # ${kube_env}['K8S_DIR'] cannot be afaik).
  $env_vars = @{
    "K8S_DIR" = ${kube_env}['K8S_DIR']
    # Typically 'C:\etc\kubernetes\node\bin' (not just 'C:\etc\kubernetes\node')
    "NODE_DIR" = ${kube_env}['NODE_DIR']
    "CNI_DIR" = ${kube_env}['CNI_DIR']
    "CNI_CONFIG_DIR" = ${kube_env}['CNI_CONFIG_DIR']
    "WINDOWS_CNI_STORAGE_PATH" = ${kube_env}['WINDOWS_CNI_STORAGE_PATH']
    "WINDOWS_CNI_VERSION" = ${kube_env}['WINDOWS_CNI_VERSION']
    "CSI_PROXY_STORAGE_PATH" = ${kube_env}['CSI_PROXY_STORAGE_PATH']
    "CSI_PROXY_VERSION" = ${kube_env}['CSI_PROXY_VERSION']
    "CSI_PROXY_FLAGS" = ${kube_env}['CSI_PROXY_FLAGS']
    "ENABLE_CSI_PROXY" = ${kube_env}['ENABLE_CSI_PROXY']
    "PKI_DIR" = ${kube_env}['PKI_DIR']
    "CA_FILE_PATH" = ${kube_env}['CA_FILE_PATH']
    "KUBELET_CONFIG" = ${kube_env}['KUBELET_CONFIG_FILE']
    "BOOTSTRAP_KUBECONFIG" = ${kube_env}['BOOTSTRAP_KUBECONFIG_FILE']
    "KUBECONFIG" = ${kube_env}['KUBECONFIG_FILE']
    "KUBEPROXY_KUBECONFIG" = ${kube_env}['KUBEPROXY_KUBECONFIG_FILE']
    "LOGS_DIR" = ${kube_env}['LOGS_DIR']
    "MANIFESTS_DIR" = ${kube_env}['MANIFESTS_DIR']
    "INFRA_CONTAINER" = ${kube_env}['WINDOWS_INFRA_CONTAINER']
    "WINDOWS_ENABLE_PIGZ" = ${kube_env}['WINDOWS_ENABLE_PIGZ']
    "WINDOWS_ENABLE_HYPERV" = ${kube_env}['WINDOWS_ENABLE_HYPERV']
    "ENABLE_NODE_PROBLEM_DETECTOR" = ${kube_env}['ENABLE_NODE_PROBLEM_DETECTOR']
    "NODEPROBLEMDETECTOR_KUBECONFIG_FILE" = ${kube_env}['WINDOWS_NODEPROBLEMDETECTOR_KUBECONFIG_FILE']
    "ENABLE_AUTH_PROVIDER_GCP" = ${kube_env}['ENABLE_AUTH_PROVIDER_GCP']
    "AUTH_PROVIDER_GCP_STORAGE_PATH" = ${kube_env}['AUTH_PROVIDER_GCP_STORAGE_PATH']
    "AUTH_PROVIDER_GCP_VERSION" = ${kube_env}['AUTH_PROVIDER_GCP_VERSION']
    "AUTH_PROVIDER_GCP_HASH_WINDOWS_AMD64" = ${kube_env}['AUTH_PROVIDER_GCP_HASH_WINDOWS_AMD64']
    "AUTH_PROVIDER_GCP_WINDOWS_BIN_DIR" = ${kube_env}['AUTH_PROVIDER_GCP_WINDOWS_BIN_DIR']
    "AUTH_PROVIDER_GCP_WINDOWS_CONF_FILE" = ${kube_env}['AUTH_PROVIDER_GCP_WINDOWS_CONF_FILE']

    "Path" = ${env:Path} + ";" + ${kube_env}['NODE_DIR']
    "KUBE_NETWORK" = "l2bridge".ToLower()
    "KUBELET_CERT_PATH" = ${kube_env}['PKI_DIR'] + '\kubelet.crt'
    "KUBELET_KEY_PATH" = ${kube_env}['PKI_DIR'] + '\kubelet.key'

    "CONTAINER_RUNTIME_ENDPOINT" = $container_runtime_endpoint

    'LICENSE_DIR' = 'C:\Program Files\Google\Compute Engine\THIRD_PARTY_NOTICES'
  }

  # Set the environment variables in two ways: permanently on the machine (only
  # takes effect after a reboot), and in the current shell.
  $env_vars.GetEnumerator() | ForEach-Object{
    $message = "Setting environment variable: " + $_.key + " = " + $_.value
    Log-Output ${message}
    Set_MachineEnvironmentVar $_.key $_.value
    Set_CurrentShellEnvironmentVar $_.key $_.value
  }
}

# Configures various settings and prerequisites needed for the rest of the
# functions in this module and the Kubernetes binaries to operate properly.
function Set-PrerequisiteOptions {
  # Windows updates cause the node to reboot at arbitrary times.
  Log-Output "Disabling Windows Update service"
  & sc.exe config wuauserv start=disabled
  & sc.exe stop wuauserv
  Write-VerboseServiceInfoToConsole -Service 'wuauserv' -Delay 1

  # Use TLS 1.2: needed for Invoke-WebRequest downloads from github.com.
  [Net.ServicePointManager]::SecurityProtocol = `
      [Net.SecurityProtocolType]::Tls12

  Configure-WindowsDefender
}

# Creates directories where other functions in this module will read and write
# data.
# Note: C:\tmp is required for running certain kubernetes tests.
#       C:\var\log is used by kubelet to stored container logs and also
#       hard-coded in the fluentd/stackdriver config for log collection.
function Create-Directories {
  Log-Output "Creating ${env:K8S_DIR} and its subdirectories."
  # One flat list; each directory is created on its own so that a problem
  # with one path does not affect the others.
  $dirs = @(
    "${env:K8S_DIR}", "${env:NODE_DIR}", "${env:LOGS_DIR}",
    "${env:CNI_DIR}", "${env:CNI_CONFIG_DIR}", "${env:MANIFESTS_DIR}",
    "${env:PKI_DIR}", "${env:LICENSE_DIR}",
    "C:\tmp", "C:\var\log"
  )
  ForEach ($dir in $dirs) {
    mkdir -Force $dir
  }
}

# Downloads some external helper scripts needed by other functions in this
# module.
function Download-HelperScripts {
  if (ShouldWrite-File ${env:K8S_DIR}\hns.psm1) {
    MustDownload-File `
        -OutFile ${env:K8S_DIR}\hns.psm1 `
        -URLs 'https://storage.googleapis.com/gke-release/winnode/config/sdn/master/hns.psm1'
  }
}

# Downloads the Kubernetes binaries from kube-env's NODE_BINARY_TAR_URL and
# puts them in a subdirectory of $env:K8S_DIR.
#
# Required ${kube_env} keys:
#   NODE_BINARY_TAR_URL
# Optional ${kube_env} keys:
#   NODE_BINARY_TAR_HASH: passed to MustDownload-File as -Hash when present.
function DownloadAndInstall-KubernetesBinaries {
  # Assume that presence of kubelet.exe indicates that the kubernetes binaries
  # were already previously downloaded to this node.
  if (-not (ShouldWrite-File ${env:NODE_DIR}\kubelet.exe)) {
    return
  }

  $tmp_dir = 'C:\k8s_tmp'
  New-Item -Force -ItemType 'directory' $tmp_dir | Out-Null

  # NODE_BINARY_TAR_URL may hold several comma-separated mirrors; the archive
  # name is taken from the first one.
  $urls = ${kube_env}['NODE_BINARY_TAR_URL'].Split(",")
  $filename = Split-Path -leaf $urls[0]
  $hash = $null
  if ($kube_env.ContainsKey('NODE_BINARY_TAR_HASH')) {
    $hash = ${kube_env}['NODE_BINARY_TAR_HASH']
  }
  MustDownload-File -Hash $hash -OutFile $tmp_dir\$filename -URLs $urls

  tar xzvf $tmp_dir\$filename -C $tmp_dir
  # tar is a native executable, so a failed extraction is only visible via
  # its exit code. Stop here rather than moving a partial set of binaries.
  if ($LASTEXITCODE -ne 0) {
    Log-Output -Fatal `
        "Failed to extract $tmp_dir\$filename (tar exit code ${LASTEXITCODE})"
  }
  Move-Item -Force $tmp_dir\kubernetes\node\bin\* ${env:NODE_DIR}\
  Move-Item -Force `
      $tmp_dir\kubernetes\LICENSES ${env:LICENSE_DIR}\LICENSES_kubernetes

  # Clean up the temporary directory
  Remove-Item -Force -Recurse $tmp_dir
}

# Downloads the csi-proxy binaries from kube-env's CSI_PROXY_STORAGE_PATH and
# CSI_PROXY_VERSION, and then puts them in a subdirectory of $env:NODE_DIR.
# Note: for now the installation is skipped for non-test clusters. Will be
# installed for all cluster after tests pass.
# Required ${kube_env} keys:
#   CSI_PROXY_STORAGE_PATH and CSI_PROXY_VERSION
function DownloadAndInstall-CSIProxyBinaries {
  # Nothing to do unless CSI proxy is enabled and the binary still needs to
  # be written.
  if ("${env:ENABLE_CSI_PROXY}" -ne "true") {
    return
  }
  if (-not (ShouldWrite-File ${env:NODE_DIR}\csi-proxy.exe)) {
    return
  }

  $staging_dir = 'C:\k8s_tmp'
  $binary = 'csi-proxy.exe'
  New-Item -Force -ItemType 'directory' $staging_dir | Out-Null
  $source_url = "${env:CSI_PROXY_STORAGE_PATH}/${env:CSI_PROXY_VERSION}/$binary"
  MustDownload-File -OutFile $staging_dir\$binary -URLs $source_url
  Move-Item -Force $staging_dir\$binary ${env:NODE_DIR}\$binary
  # Clean up the temporary directory
  Remove-Item -Force -Recurse $staging_dir
}

# Registers csi-proxy.exe as the Windows service "csiproxy" (restart 10s after
# a failure) and starts it. Does nothing unless $env:ENABLE_CSI_PROXY is
# "true".
function Start-CSIProxy {
  if ("${env:ENABLE_CSI_PROXY}" -ne "true") {
    return
  }

  Log-Output "Creating CSI Proxy Service"
  $flags = "-windows-service -log_file=${env:LOGS_DIR}\csi-proxy.log -logtostderr=false ${env:CSI_PROXY_FLAGS}"
  & sc.exe create csiproxy binPath= "${env:NODE_DIR}\csi-proxy.exe $flags"
  & sc.exe failure csiproxy reset= 0 actions= restart/10000
  Log-Output "Starting CSI Proxy Service"
  & sc.exe start csiproxy
  Write-VerboseServiceInfoToConsole -Service 'csiproxy' -Delay 1
}

# Converts an IPv4 address to its 32-bit unsigned integer value, most
# significant octet first (e.g. 10.0.0.1 -> 167772161).
#
# TODO(pjh): this is copied from
# https://github.com/Microsoft/SDN/blob/master/Kubernetes/windows/start-kubelet.ps1#L98.
# See if there's a way to fetch or construct the "management subnet" so that
# this is not needed.
function ConvertTo_DecimalIP
{
  param(
    [parameter(Mandatory = $true, Position = 0)]
    [Net.IPAddress] $IPAddress
  )

  $octets = $IPAddress.GetAddressBytes()
  $total = 0
  for ($pos = 0; $pos -lt $octets.Length; $pos++) {
    # The first octet carries weight 256^3, the next 256^2, and so on.
    $total += $octets[$pos] * [Math]::Pow(256, 3 - $pos)
  }
  return [UInt32]$total
}

# TODO(pjh): this is copied from
# https://github.com/Microsoft/SDN/blob/master/Kubernetes/windows/start-kubelet.ps1#L98.
# See if there's a way to fetch or construct the "management subnet" so that
# this is not needed.
function ConvertTo_DottedDecimalIP
{
  param(
    [parameter(Mandatory = $true, Position = 0)]
    [Uint32] $IPAddress
  )

  # Peel off one octet per iteration, most significant first.
  $octets = @()
  $rest = $IPAddress
  foreach ($power in 3..0) {
    $unit = [Math]::Pow(256, $power)
    $tail = $rest % $unit
    $octets += ($rest - $tail) / $unit
    $rest = [UInt32]$tail
  }
  return [String]::Join(".", $octets)
}

# Returns the number of 1-bits in the given subnet mask, i.e. the prefix
# length for a contiguous mask (255.255.255.0 -> 24).
#
# TODO(pjh): this is copied from
# https://github.com/Microsoft/SDN/blob/master/Kubernetes/windows/start-kubelet.ps1#L98.
# See if there's a way to fetch or construct the "management subnet" so that
# this is not needed.
function ConvertTo_MaskLength
{
  param(
    [parameter(Mandatory = $True, Position = 0)]
    [Net.IPAddress] $SubnetMask
  )

  $binary_octets = foreach ($octet in $SubnetMask.GetAddressBytes()) {
    [Convert]::ToString($octet, 2)
  }
  # Joining the octets inserts separators; drop those and every 0 so that
  # only the 1 digits remain.
  $ones = "$binary_octets" -replace "[\s0]"
  return $ones.Length
}

# Returns a network adapter object for the "management" interface via which the
# Windows pods+kubelet will communicate with the rest of the Kubernetes cluster.
#
# This function will fail if Add_InitialHnsNetwork() has not been called first.
function Get_MgmtNetAdapter {
  $mgmt_adapter = Get-NetAdapter |
      Where-Object { $_.Name -like ${MGMT_ADAPTER_NAME} }
  if (${mgmt_adapter}) {
    return $mgmt_adapter
  }

  Throw ("Failed to find a suitable network adapter, check your network " +
         "settings.")
}

# Decodes the base64 $Data string and writes it as binary to $File. Does
# nothing if $File already exists and $REDO_STEPS is not set.
function Write_PkiData {
  param (
    [parameter(Mandatory=$true)] [string] $Data,
    [parameter(Mandatory=$true)] [string] $File
  )

  if (ShouldWrite-File $File) {
    # This command writes out a PEM certificate file, analogous to "base64
    # --decode" on Linux. See https://stackoverflow.com/a/51914136/1230197.
    $decoded = [Convert]::FromBase64String($Data)
    [IO.File]::WriteAllBytes($File, $decoded)
    Log_Todo ("need to set permissions correctly on ${File}; not sure what the " +
              "Windows equivalent of 'umask 077' is")
    # Linux: owned by root, rw by user only.
    #   -rw------- 1 root root 1.2K Oct 12 00:56 ca-certificates.crt
    #   -rw------- 1 root root 1.3K Oct 12 00:56 kubelet.crt
    #   -rw------- 1 root root 1.7K Oct 12 00:56 kubelet.key
    # Windows:
    #   https://docs.microsoft.com/en-us/dotnet/api/system.io.fileattributes
  }
}

# Creates the node PKI files in $env:PKI_DIR and then lists that directory.
#
# Required ${kube_env} keys (a missing key is reported with Log-Output -Fatal):
#   CA_CERT
#   KUBELET_CERT
#   KUBELET_KEY
function Create-NodePki {
  Log-Output 'Creating node pki files'

  # kube-env key -> destination file, processed in this order.
  $pki_items = @(
    @{ EnvKey = 'CA_CERT';      File = ${env:CA_FILE_PATH} },
    @{ EnvKey = 'KUBELET_CERT'; File = ${env:KUBELET_CERT_PATH} },
    @{ EnvKey = 'KUBELET_KEY';  File = ${env:KUBELET_KEY_PATH} }
  )
  foreach ($item in $pki_items) {
    $env_key = $item.EnvKey
    if (-not $kube_env.ContainsKey($env_key)) {
      Log-Output -Fatal "${env_key} not present in kube-env"
    }
    else {
      $encoded = ${kube_env}[$env_key]
      Write_PkiData "${encoded}" $item.File
    }
  }

  Get-ChildItem ${env:PKI_DIR}
}

# Creates the bootstrap kubelet kubeconfig at $env:BOOTSTRAP_KUBECONFIG.
# https://kubernetes.io/docs/reference/command-line-tools-reference/kubelet-tls-bootstrapping/
#
# Create-NodePki() must be called first.
#
# Required ${kube_env} keys:
#   KUBERNETES_MASTER_NAME: the apiserver IP address.
function Write_BootstrapKubeconfig {
  if (-not (ShouldWrite-File ${env:BOOTSTRAP_KUBECONFIG})) {
    return
  }

  # TODO(mtaufen): is user "kubelet" correct? Other examples use e.g.
  #   "system:node:$(hostname)".

  $apiserverAddress = ${kube_env}['KUBERNETES_MASTER_NAME']
  New-Item -Force -ItemType file ${env:BOOTSTRAP_KUBECONFIG} | Out-Null
  # The template is a single-quoted literal (no interpolation); the upper-case
  # placeholders are substituted with case-sensitive String.Replace calls.
  Set-Content ${env:BOOTSTRAP_KUBECONFIG} `
'apiVersion: v1
kind: Config
users:
- name: kubelet
  user:
    client-certificate: KUBELET_CERT_PATH
    client-key: KUBELET_KEY_PATH
clusters:
- name: local
  cluster:
    server: https://APISERVER_ADDRESS
    certificate-authority: CA_FILE_PATH
contexts:
- context:
    cluster: local
    user: kubelet
  name: service-account-context
current-context: service-account-context'.`
    replace('KUBELET_CERT_PATH', ${env:KUBELET_CERT_PATH}).`
    replace('KUBELET_KEY_PATH', ${env:KUBELET_KEY_PATH}).`
    replace('APISERVER_ADDRESS', ${apiserverAddress}).`
    replace('CA_FILE_PATH', ${env:CA_FILE_PATH})
  Log-Output ("kubelet bootstrap kubeconfig:`n" +
              "$(Get-Content -Raw ${env:BOOTSTRAP_KUBECONFIG})")
}

# Fetches the kubelet kubeconfig from the metadata server and writes it to
# $env:KUBECONFIG.
#
# Create-NodePki() must be called first.
function Write_KubeconfigFromMetadata {
  if (-not (ShouldWrite-File ${env:KUBECONFIG})) {
    return
  }

  $kubeconfig = Get-InstanceMetadataAttribute 'kubeconfig'
  # $null goes on the left: with a collection on the left-hand side, -eq
  # filters the elements instead of testing the variable itself.
  if ($null -eq $kubeconfig) {
    Log-Output `
        "kubeconfig metadata key not found, can't write ${env:KUBECONFIG}" `
        -Fatal
  }
  Set-Content ${env:KUBECONFIG} $kubeconfig
  Log-Output ("kubelet kubeconfig from metadata (non-bootstrap):`n" +
              "$(Get-Content -Raw ${env:KUBECONFIG})")
}

# Creates the kubelet kubeconfig at $env:KUBECONFIG for nodes that use an
# authentication plugin, or at $env:BOOTSTRAP_KUBECONFIG for nodes that do not.
#
# NOTE(review): as written, this function only ever writes the bootstrap
# kubeconfig (Write_BootstrapKubeconfig); Write_KubeconfigFromMetadata is not
# called from here.
#
# Create-NodePki() must be called first.
#
# Required ${kube_env} keys:
#   KUBERNETES_MASTER_NAME: the apiserver IP address.
function Create-KubeletKubeconfig {
  Write_BootstrapKubeconfig
}

# Creates the kubeconfig user file for applications that communicate with Kubernetes.
#
# Parameters:
#   Name:  user name written into the kubeconfig.
#   Path:  file to create; skipped when ShouldWrite-File says not to write it.
#   Token: bearer token for the user. It is written to the file but masked in
#          the log output.
#
# Create-NodePki() must be called first.
#
# Required ${kube_env} keys:
#   CA_CERT
#   KUBERNETES_MASTER_NAME
function Create-Kubeconfig {
  param (
    [parameter(Mandatory=$true)] [string]$Name,
    [parameter(Mandatory=$true)] [string]$Path,
    [parameter(Mandatory=$true)] [string]$Token
  )
  if (-not (ShouldWrite-File $Path)) {
    return
  }

  New-Item -Force -ItemType file $Path | Out-Null

  # In configure-helper.sh kubelet kubeconfig uses certificate-authority while
  # kubeproxy kubeconfig uses certificate-authority-data, ugh. Does it matter?
  # Use just one or the other for consistency?
  Set-Content $Path `
'apiVersion: v1
kind: Config
users:
- name: APP_NAME
  user:
    token: APP_TOKEN
clusters:
- name: local
  cluster:
    server: https://APISERVER_ADDRESS
    certificate-authority-data: CA_CERT
contexts:
- context:
    cluster: local
    user: APP_NAME
  name: service-account-context
current-context: service-account-context'.`
    replace('APP_NAME', $Name).`
    replace('APP_TOKEN', $Token).`
    replace('CA_CERT', ${kube_env}['CA_CERT']).`
    replace('APISERVER_ADDRESS', ${kube_env}['KUBERNETES_MASTER_NAME'])

  # Never write the bearer token to the log: mask it in the logged copy only.
  $loggable = (Get-Content -Raw ${Path}).Replace($Token, '[REDACTED]')
  Log-Output ("${Name} kubeconfig:`n" + $loggable)
}

# Creates the kube-proxy user kubeconfig file at $env:KUBEPROXY_KUBECONFIG.
#
# Create-NodePki() must be called first.
#
# Required ${kube_env} keys:
#   CA_CERT
#   KUBE_PROXY_TOKEN
function Create-KubeproxyKubeconfig {
  Create-Kubeconfig -Name 'kube-proxy' `
    -Path ${env:KUBEPROXY_KUBECONFIG} `
    -Token ${kube_env}['KUBE_PROXY_TOKEN']
}

# Returns the IP alias range configured for this GCE instance, read from the
# first ip-alias of the first network interface in the instance metadata.
function Get_IpAliasRange {
  $url = ("http://${GCE_METADATA_SERVER}/computeMetadata/v1/instance/" +
          "network-interfaces/0/ip-aliases/0")
  $client = New-Object Net.WebClient
  # The metadata server rejects requests that lack this header.
  $client.Headers.Add('Metadata-Flavor', 'Google')
  return ($client.DownloadString($url)).Trim()
}

# Retrieves the pod CIDR and sets it in $env:POD_CIDR.
#
# Retries once per second, without an upper bound, until the metadata server
# returns a non-empty IP alias range.
function Set-PodCidr {
  $pod_cidr = $null
  while ($true) {
    try {
      $pod_cidr = Get_IpAliasRange
    }
    catch {
      # Typically a WebException while the metadata server is unreachable.
      Log-Output "Get_IpAliasRange failed: $($_.Exception.Message)"
      $pod_cidr = $null
    }
    # An empty answer is as useless as a failed request: keep retrying.
    if ($pod_cidr) {
      break
    }
    Log-Output "Retrying Get_IpAliasRange..."
    Start-Sleep -sec 1
  }

  Log-Output "fetched pod CIDR (same as IP alias range): ${pod_cidr}"
  Set_MachineEnvironmentVar "POD_CIDR" ${pod_cidr}
  Set_CurrentShellEnvironmentVar "POD_CIDR" ${pod_cidr}
}

# Adds an initial HNS network on the Windows node which forces the creation of
# a virtual switch and the "management" interface that will be used to
# communicate with the rest of the Kubernetes cluster without NAT.
#
# Note that adding the initial HNS network may cause connectivity to the GCE
# metadata server to be lost due to a Windows bug.
# Configure-HostNetworkingService() restores connectivity, look there for
# details.
#
# Download-HelperScripts() must have been called first.
function Add_InitialHnsNetwork {
  $INITIAL_HNS_NETWORK = 'External'

  # This comes from
  # https://github.com/Microsoft/SDN/blob/master/Kubernetes/flannel/l2bridge/start.ps1#L74
  # (or
  # https://github.com/Microsoft/SDN/blob/master/Kubernetes/windows/start-kubelet.ps1#L206).
  #
  # daschott noted on Slack: "L2bridge networks require an external vSwitch.
  # The first network ("External") with hardcoded values in the script is just
  # a placeholder to create an external vSwitch. This is purely for convenience
  # to be able to remove/modify the actual HNS network ("cbr0") or rejoin the
  # nodes without a network blip. Creating a vSwitch takes time, causes network
  # blips, and it makes it more likely to hit the issue where flanneld is
  # stuck, so we want to do this as rarely as possible."
  $hns_network = Get-HnsNetwork | Where-Object Name -eq $INITIAL_HNS_NETWORK
  if ($hns_network) {
    if (-not $REDO_STEPS) {
      Log-Output ("Skip: initial '$INITIAL_HNS_NETWORK' HNS network " +
                  "already exists, not recreating it")
      return
    }
    Log-Output ("Warning: initial '$INITIAL_HNS_NETWORK' HNS network " +
                "already exists, removing it and recreating it")
    $hns_network | Remove-HnsNetwork
    $hns_network = $null
  }
  Log-Output ("Creating initial HNS network to force creation of " +
              "${MGMT_ADAPTER_NAME} interface")
  # Note: RDP connection will hiccup when running this command.
  New-HNSNetwork `
      -Type "L2Bridge" `
      -AddressPrefix "192.168.255.0/30" `
      -Gateway "192.168.255.1" `
      -Name $INITIAL_HNS_NETWORK `
      -Verbose
}

# Get the network in uint32 for the given cidr.
#
# The address part of the CIDR (everything before the first '/') is converted
# as-is; the prefix length is not applied as a mask, so callers are expected
# to pass a CIDR whose address is already the network address.
function Get_NetworkDecimal_From_CIDR([string] $cidr) {
  $network = $cidr.Split('/')[0]
  $decimal_network = ConvertTo_DecimalIP($network)
  return $decimal_network
}

# Get gateway ip string (the first address) based on pod cidr.
# For Windows nodes the pod gateway IP address is the first address in the pod
# CIDR for the host.
function Get_Gateway_From_CIDR([string] $cidr) {
  $base_address = Get_NetworkDecimal_From_CIDR $cidr
  return ConvertTo_DottedDecimalIP ($base_address + 1)
}

# Get endpoint gateway ip string (the second address) based on pod cidr.
# For Windows nodes the pod gateway IP address is the first address in the pod
# CIDR for the host, but from inside containers it's the second address.
function Get_Endpoint_Gateway_From_CIDR([string] $cidr) {
  $base_address = Get_NetworkDecimal_From_CIDR $cidr
  return ConvertTo_DottedDecimalIP ($base_address + 2)
}

# Get the start of the pod IP range (the third address) based on pod cidr.
# We reserve the first two in the cidr range for gateways. Start the cidr
# range from the third so that IPAM does not allocate those IPs to pods.
function Get_PodIP_Range_Start([string] $cidr) {
  $base_address = Get_NetworkDecimal_From_CIDR $cidr
  return ConvertTo_DottedDecimalIP ($base_address + 3)
}

# Configures HNS on the Windows node to enable Kubernetes networking:
#   - Creates the "management" interface associated with an initial HNS network.
#   - Creates the HNS network $env:KUBE_NETWORK for pod networking.
#   - Creates an HNS endpoint for pod networking.
#   - Adds necessary routes on the management interface.
#   - Verifies that the GCE metadata server connection remains intact.
#
# Prerequisites:
#   $env:POD_CIDR is set (by Set-PodCidr).
#   Download-HelperScripts() has been called.
function Configure-HostNetworkingService {
  Import-Module -Force ${env:K8S_DIR}\hns.psm1

  Add_InitialHnsNetwork

  $pod_gateway = Get_Gateway_From_CIDR(${env:POD_CIDR})
  $pod_endpoint_gateway = Get_Endpoint_Gateway_From_CIDR(${env:POD_CIDR})
  Log-Output ("Setting up Windows node HNS networking: " +
              "podCidr = ${env:POD_CIDR}, podGateway = ${pod_gateway}, " +
              "podEndpointGateway = ${pod_endpoint_gateway}")

  $hns_network = Get-HnsNetwork | Where-Object Name -eq ${env:KUBE_NETWORK}
  if ($hns_network) {
    if ($REDO_STEPS) {
      Log-Output ("Warning: ${env:KUBE_NETWORK} HNS network already exists, " +
                  "removing it and recreating it")
      $hns_network | Remove-HnsNetwork
      $hns_network = $null
    }
    else {
      Log-Output "Skip: ${env:KUBE_NETWORK} HNS network already exists"
    }
  }
  # Remember whether the network was created in this call: only then does
  # the metadata-server route workaround further down need to run.
  $created_hns_network = $false
  if (-not $hns_network) {
    # Note: RDP connection will hiccup when running this command.
    $hns_network = New-HNSNetwork `
        -Type "L2Bridge" `
        -AddressPrefix ${env:POD_CIDR} `
        -Gateway ${pod_gateway} `
        -Name ${env:KUBE_NETWORK} `
        -Verbose
    $created_hns_network = $true
  }
  # This name of endpoint is referred in pkg/proxy/winkernel/proxier.go as part of
  # kube-proxy as well. A health check port for every service that is specified as
  # "externalTrafficPolicy: local" will be added on the endpoint.
  # PLEASE KEEP THEM CONSISTENT!!!
  $endpoint_name = "cbr0"

  $vnic_name = "vEthernet (${endpoint_name})"

  $hns_endpoint = Get-HnsEndpoint | Where-Object Name -eq $endpoint_name
  # Note: we don't expect to ever enter this block currently - while the HNS
  # network does seem to persist across reboots, the HNS endpoints do not.
  if ($hns_endpoint) {
    if ($REDO_STEPS) {
      Log-Output ("Warning: HNS endpoint $endpoint_name already exists, " +
                  "removing it and recreating it")
      $hns_endpoint | Remove-HnsEndpoint
      $hns_endpoint = $null
    }
    else {
      Log-Output "Skip: HNS endpoint $endpoint_name already exists"
    }
  }
  if (-not $hns_endpoint) {
    $hns_endpoint = New-HnsEndpoint `
        -NetworkId ${hns_network}.Id `
        -Name ${endpoint_name} `
        -IPAddress ${pod_endpoint_gateway} `
        -Gateway "0.0.0.0" `
        -Verbose
    # TODO(pjh): find out: why is this always CompartmentId 1?
    Attach-HnsHostEndpoint `
        -EndpointID ${hns_endpoint}.Id `
        -CompartmentID 1 `
        -Verbose
    netsh interface ipv4 set interface "${vnic_name}" forwarding=enabled
  }

  # Best effort: clear any existing HNS policy lists. A failure here must not
  # abort node setup, but it is logged instead of being silently dropped.
  Try {
    Get-HNSPolicyList | Remove-HnsPolicyList
  } Catch {
    Log-Output "Warning: failed to remove existing HNS policy lists: $_"
  }

  # Add a route from the management NIC to the pod CIDR.
  #
  # When a packet from a Kubernetes service backend arrives on the destination
  # Windows node, the reverse SNAT will be applied and the source address of
  # the packet gets replaced from the pod IP to the service VIP. The packet
  # will then leave the VM and return back through hairpinning.
  #
  # When IP alias is enabled, IP forwarding is disabled for anti-spoofing;
  # the packet with the service VIP will get blocked and be lost. With this
  # route, the packet will be routed to the pod subnetwork, and not leave the
  # VM.
  $mgmt_net_adapter = Get_MgmtNetAdapter
  New-NetRoute `
      -ErrorAction Ignore `
      -InterfaceAlias ${mgmt_net_adapter}.ifAlias `
      -DestinationPrefix ${env:POD_CIDR} `
      -NextHop "0.0.0.0" `
      -Verbose

  if ($created_hns_network) {
    # There is an HNS bug where the route to the GCE metadata server will be
    # removed when the HNS network is created:
    # https://github.com/Microsoft/hcsshim/issues/299#issuecomment-425491610.
    # The behavior here is very unpredictable: the route may only be removed
    # after some delay, or it may appear to be removed then you'll add it back
    # but then it will be removed once again. So, we first wait a long
    # unfortunate amount of time to ensure that things have quiesced, then we
    # wait until we're sure the route is really gone before re-adding it again.
    Log-Output "Waiting 45 seconds for host network state to quiesce"
    Start-Sleep 45
    WaitFor_GceMetadataServerRouteToBeRemoved
    Log-Output "Re-adding the GCE metadata server route"
    Add_GceMetadataServerRoute
  }
  Verify_GceMetadataServerRouteIsPresent

  Log-Output "Host network setup complete"
}

# Downloads GetGcePdName.dll into $env:K8S_DIR (when ShouldWrite-File allows
# it) and registers it in the machine-wide PowerShell profile so that the
# module is unblocked and imported by new PowerShell sessions.
#
# The profile snippet is appended only if the profile does not already
# reference the module path, so repeated runs do not pile up duplicate
# Import-Module lines.
function Configure-GcePdTools {
  $module_path = "${env:K8S_DIR}\GetGcePdName.dll"
  if (ShouldWrite-File $module_path) {
    MustDownload-File -OutFile $module_path `
      -URLs "https://storage.googleapis.com/gke-release/winnode/config/gce-tools/master/GetGcePdName/GetGcePdName.dll"
  }

  $profile_path = "$PsHome\profile.ps1"
  if (-not (Test-Path $profile_path)) {
    New-Item -path $profile_path -type file
  }

  # Get-Content -Raw yields nothing for an empty file; the [string] cast turns
  # that into '' so that Contains() below is always safe to call.
  [string]$profile_text = Get-Content -Raw $profile_path
  if ($profile_text.Contains($module_path)) {
    Log-Output "Skip: ${profile_path} already imports ${module_path}"
    return
  }

  $snippet = '$modulePath = "K8S_DIR\GetGcePdName.dll"
Unblock-File $modulePath
Import-Module -Name $modulePath'.replace('K8S_DIR', ${env:K8S_DIR})
  Add-Content $profile_path $snippet
}

# Setup cni network for containerd.
function Prepare-CniNetworking {
  Configure_Containerd_CniNetworking
}

# Obtain the host dns conf and save it to a file so that kubelet/CNI
# can use it to configure dns suffix search list for pods.
# The value of DNS server is ignored right now because the pod will
# always only use cluster DNS service, but for consistency, we still
# parsed them here in the same format as Linux resolv.conf.
# This function must be called after Configure-HostNetworkingService.
function Configure-HostDnsConf {
  $net_adapter = Get_MgmtNetAdapter
  $server_ips = (Get-DnsClientServerAddress `
          -InterfaceAlias ${net_adapter}.Name).ServerAddresses
  $search_list = (Get-DnsClient).ConnectionSpecificSuffixSearchList
  # Build resolv.conf-style content: one "nameserver" line per DNS server,
  # followed by a single "search" line.
  $conf = ""
  ForEach ($ip in $server_ips) {
    $conf = $conf + "nameserver $ip`r`n"
  }
  $conf = $conf + "search $search_list"
  # Do not put hostdns.conf into the CNI config directory so as to
  # avoid the container runtime treating it as CNI config.
  $hostdns_conf = "${env:CNI_DIR}\hostdns.conf"
  New-Item -Force -ItemType file ${hostdns_conf} | Out-Null
  Set-Content ${hostdns_conf} $conf
  Log-Output "HOST dns conf:`n$(Get-Content -Raw ${hostdns_conf})"
}

# Fetches the kubelet config from the instance metadata and puts it at
# $env:KUBELET_CONFIG.
function Configure-Kubelet {
  if (-not (ShouldWrite-File ${env:KUBELET_CONFIG})) {
    return
  }

  # The Kubelet config is built by build-kubelet-config() in
  # cluster/gce/util.sh, and stored in the metadata server under the
  # 'kubelet-config' key.
  $kubelet_config = Get-InstanceMetadataAttribute 'kubelet-config'
  Set-Content ${env:KUBELET_CONFIG} $kubelet_config
  Log-Output "Kubelet config:`n$(Get-Content -Raw ${env:KUBELET_CONFIG})"
}

# Sets up the kubelet and kube-proxy arguments and starts them as native
# Windows services.
#
# Required ${kube_env} keys:
#   KUBELET_ARGS
#   KUBEPROXY_ARGS
#   CLUSTER_IP_RANGE
function Start-WorkerServices {
  # Compute kubelet args
  $kubelet_args_str = ${kube_env}['KUBELET_ARGS']
  $kubelet_args = $kubelet_args_str.Split(" ")
  Log-Output "kubelet_args from metadata: ${kubelet_args}"

  # To join GCE instances to AD, we need to shorten their names, as NetBIOS name
  # must be <= 15 characters, and GKE generated names are longer than that.
  # To perform the join in an automated way, it's preferable to apply the rename
  # and domain join in the GCESysprep step. However, after sysprep is complete
  # and the machine restarts, kubelet bootstrapping should not use the shortened
  # computer name, and instead use the instance's name by using --hostname-override,
  # otherwise kubelet and kube-proxy will not be able to run properly.
  #
  # Out-String appends a trailing newline; trim it so that it does not end up
  # inside the --hostname-override value and the service command lines.
  $instance_name = "$(Get-InstanceMetadata 'name' | Out-String)".Trim()
  $default_kubelet_args = @(
      "--pod-infra-container-image=${env:INFRA_CONTAINER}",
      "--hostname-override=${instance_name}"
  )

  $kubelet_args = ${default_kubelet_args} + ${kubelet_args}
  Log-Output 'Using bootstrap kubeconfig for authentication'
  $kubelet_args = (${kubelet_args} +
                   "--bootstrap-kubeconfig=${env:BOOTSTRAP_KUBECONFIG}")
  Log-Output "Final kubelet_args: ${kubelet_args}"

  # Compute kube-proxy args
  $kubeproxy_args_str = ${kube_env}['KUBEPROXY_ARGS']
  $kubeproxy_args = $kubeproxy_args_str.Split(" ")
  Log-Output "kubeproxy_args from metadata: ${kubeproxy_args}"

  # kubeproxy is started on Linux nodes using
  # kube-manifests/kubernetes/gci-trusty/kube-proxy.manifest, which is
  # generated by start-kube-proxy in configure-helper.sh and contains e.g.:
  #   kube-proxy --master=https://35.239.84.171
  #   --kubeconfig=/var/lib/kube-proxy/kubeconfig --cluster-cidr=10.64.0.0/14
  #   --oom-score-adj=-998 --v=2
  #   --iptables-sync-period=1m --iptables-min-sync-period=10s
  #   --ipvs-sync-period=1m --ipvs-min-sync-period=10s
  # And also with various volumeMounts and "securityContext: privileged: true".
  $default_kubeproxy_args = @(
      "--kubeconfig=${env:KUBEPROXY_KUBECONFIG}",
      "--cluster-cidr=$(${kube_env}['CLUSTER_IP_RANGE'])",
      "--hostname-override=${instance_name}"
  )

  $kubeproxy_args = ${default_kubeproxy_args} + ${kubeproxy_args}
  Log-Output "Final kubeproxy_args: ${kubeproxy_args}"

  # TODO(pjh): kubelet is emitting these messages:
  # I1023 23:44:11.761915    2468 kubelet.go:274] Adding pod path:
  # C:\etc\kubernetes
  # I1023 23:44:11.775601    2468 file.go:68] Watching path
  # "C:\\etc\\kubernetes"
  # ...
  # E1023 23:44:31.794327    2468 file.go:182] Can't process manifest file
  # "C:\\etc\\kubernetes\\hns.psm1": C:\etc\kubernetes\hns.psm1: couldn't parse
  # as pod(yaml: line 10: did not find expected <document start>), please check
  # config file.
  #
  # Figure out how to change the directory that the kubelet monitors for new
  # pod manifests.

  # We configure the service to restart on failure, after 10s wait. We reset
  # the restart count to 0 each time, so we re-use our restart/10000 action on
  # each failure. Note it currently restarts even when explicitly stopped, you
  # have to delete the service entry to *really* kill it (e.g. `sc.exe delete
  # kubelet`). See issue #72900.
  if (Get-Process | Where-Object Name -eq "kubelet") {
    Log-Output -Fatal `
        "A kubelet process is already running, don't know what to do"
  }
  Log-Output "Creating kubelet service"
  & sc.exe create kubelet binPath= "${env:NODE_DIR}\kube-log-runner.exe -log-file=${env:LOGS_DIR}\kubelet.log ${env:NODE_DIR}\kubelet.exe ${kubelet_args}" start= demand
  & sc.exe failure kubelet reset= 0 actions= restart/10000
  Log-Output "Starting kubelet service"
  & sc.exe start kubelet

  Log-Output "Waiting 10 seconds for kubelet to stabilize"
  Start-Sleep 10
  Write-VerboseServiceInfoToConsole -Service 'kubelet'

  if (Get-Process | Where-Object Name -eq "kube-proxy") {
    Log-Output -Fatal `
        "A kube-proxy process is already running, don't know what to do"
  }
  Log-Output "Creating kube-proxy service"
  & sc.exe create kube-proxy binPath= "${env:NODE_DIR}\kube-log-runner.exe -log-file=${env:LOGS_DIR}\kube-proxy.log ${env:NODE_DIR}\kube-proxy.exe ${kubeproxy_args}" start= demand
  & sc.exe failure kube-proxy reset= 0 actions= restart/10000
  Log-Output "Starting kube-proxy service"
  & sc.exe start kube-proxy
  Write-VerboseServiceInfoToConsole -Service 'kube-proxy' -Delay 1

  # F1020 23:08:52.000083    9136 server.go:361] unable to load in-cluster
  # configuration, KUBERNETES_SERVICE_HOST and KUBERNETES_SERVICE_PORT must be
  # defined
  # TODO(pjh): still getting errors like these in kube-proxy log:
  # E1023 04:03:58.143449    4840 reflector.go:205] k8s.io/kubernetes/pkg/client/informers/informers_generated/internalversion/factory.go:129: Failed to list *core.Endpoints: Get https://35.239.84.171/api/v1/endpoints?limit=500&resourceVersion=0: dial tcp 35.239.84.171:443: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
  # E1023 04:03:58.150266    4840 reflector.go:205] k8s.io/kubernetes/pkg/client/informers/informers_generated/internalversion/factory.go:129: Failed to list *core.Service: Get https://35.239.84.171/api/v1/services?limit=500&resourceVersion=0: dial tcp 35.239.84.171:443: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
  WaitFor_KubeletAndKubeProxyReady
  Verify_GceMetadataServerRouteIsPresent
  Log-Output "Kubernetes components started successfully"
}

# Stop and unregister both kubelet & kube-proxy services.
function Unregister-WorkerServices {
  & sc.exe delete kube-proxy
  & sc.exe delete kubelet
}

# Wait for kubelet and kube-proxy to be ready within 10s.
#
# Polls once per second. Throws if either service is still not 'Running'
# once the timeout has elapsed.
function WaitFor_KubeletAndKubeProxyReady {
  $waited = 0
  $timeout = 10
  # True while at least one of the two services is not yet running.
  $not_ready = {
    (Get-Service kube-proxy).Status -ne 'Running' -or
    (Get-Service kubelet).Status -ne 'Running'
  }

  while ((& $not_ready) -and $waited -lt $timeout) {
    Start-Sleep 1
    $waited++
  }

  # Decide on the actual service state rather than on the counter, so that
  # services which came up during the final one-second wait are not reported
  # as a timeout.
  if (& $not_ready) {
    Log-Output "$(Get-Service kube* | Out-String)"
    Throw ("Timeout while waiting ${timeout} seconds for kubelet and kube-proxy services to start")
  }
}

# Runs 'kubectl get nodes'.
# Runs additional verification commands to ensure node successfully joined cluster
# and that it connects to the API Server.
function Verify-WorkerServices {
  $timeout = 12
  $retries = 0
  $retryDelayInSeconds = 5
  $kubectl = "${env:NODE_DIR}\kubectl.exe"
  $node_name = hostname

  Log-Output ("Testing node connection to API server...")
  while ($true) {
    $retries++
    $nodes_list = & $kubectl get nodes -o=custom-columns=:.metadata.name -A | Out-String
    # NOTE(review): this prints the *type* of the fifth node condition, so the
    # check below passes once that condition is named "Ready"; it does not
    # look at the condition's status. Confirm whether that is intended.
    $host_status = & $kubectl get nodes $node_name -o=custom-columns=:.status.conditions[4].type | Out-String

    $verified = ($nodes_list -and
                 $nodes_list.contains($node_name) -and
                 $host_status.contains("Ready"))
    # Same attempt budget as before ($timeout + 1 attempts), but only sleep
    # when another attempt will actually follow.
    if ($verified -or ($retries -gt $timeout)) {
      break
    }
    Start-Sleep $retryDelayInSeconds
  }

  If (-Not $nodes_list) {
    Throw ("Node: '$(hostname)' failed to connect to API server")
  }
  ElseIf (-Not $nodes_list.contains($node_name)) {
    Throw ("Node: '$(hostname)' failed to join the cluster; NODES: '`n $($nodes_list)'")
  }
  ElseIf (-Not $host_status.contains("Ready")) {
    Throw ("Node: '$(hostname)' is not in Ready state")
  }

  Log-Output ("Node: $(hostname) successfully joined cluster `n NODES: `n $($nodes_list)")
  Verify_GceMetadataServerRouteIsPresent
}

# Downloads the Windows crictl package and installs its contents (e.g.
# crictl.exe) in $env:NODE_DIR.
#
# Does nothing when ShouldWrite-File reports that crictl.exe must not be
# (re)written. Throws if the archive cannot be extracted.
function DownloadAndInstall-Crictl {
  if (-not (ShouldWrite-File ${env:NODE_DIR}\crictl.exe)) {
    return
  }
  $CRI_TOOLS_GCS_BUCKET = 'k8s-artifacts-cri-tools'
  $url = ('https://storage.googleapis.com/' + $CRI_TOOLS_GCS_BUCKET +
          '/release/' + $CRICTL_VERSION + '/crictl-' + $CRICTL_VERSION +
          '-windows-amd64.tar.gz')
  MustDownload-File `
      -URLs $url `
      -OutFile ${env:NODE_DIR}\crictl.tar.gz `
      -Hash $CRICTL_SHA256 `
      -Algorithm SHA256
  tar xzvf ${env:NODE_DIR}\crictl.tar.gz -C ${env:NODE_DIR}
  # Fail here rather than later, when crictl.exe turns out to be missing.
  if ($LASTEXITCODE -ne 0) {
    throw "Error extracting ${env:NODE_DIR}\crictl.tar.gz (tar exit code ${LASTEXITCODE})"
  }
}

# Sets crictl configuration values.
function Configure-Crictl {
  # Only touch the crictl config when a runtime endpoint has been provided.
  if (${env:CONTAINER_RUNTIME_ENDPOINT}) {
    & "${env:NODE_DIR}\crictl.exe" config runtime-endpoint `
        ${env:CONTAINER_RUNTIME_ENDPOINT}
  }
}

# Pulls the infra/pause container image onto the node so that it will be
# immediately available when the kubelet tries to run pods.
# TODO(pjh): downloading the container image may take a few minutes;
# figure out how to run this in the background while perform the rest of the
# node startup steps!
# Pull-InfraContainer must be called AFTER Verify-WorkerServices.
function Pull-InfraContainer {
  $name, $label = ${env:INFRA_CONTAINER} -split ':',2

  # The image name contains regex metacharacters (e.g. '.'), so escape both
  # parts, and match line by line so that the name and the tag have to appear
  # on the same 'crictl images' row.
  $image_pattern = [regex]::Escape("$name") + '.*' + [regex]::Escape("$label")
  $matching_rows = @(& crictl images | Where-Object { $_ -match $image_pattern })

  if ($matching_rows.Count -eq 0) {
    & crictl pull ${env:INFRA_CONTAINER}
    if (!$?) {
      throw "Error running 'crictl pull ${env:INFRA_CONTAINER}'"
    }
  }
  $inspect = "$(& crictl inspecti ${env:INFRA_CONTAINER} | Out-String)"
  Log-Output "Infra/pause container:`n$inspect"
}

# Setup the containerd on the node.
function Setup-ContainerRuntime {
  Install-Pigz
  Install_Containerd
  Configure_Containerd
  Start_Containerd
}

# Returns whether the Windows 'Containers' feature is installed.
function Test-ContainersFeatureInstalled {
  return (Get-WindowsFeature Containers).Installed
}

# After this function returns, the computer must be restarted to complete
# the installation!
function Install-ContainersFeature {
  Log-Output "Installing Windows 'Containers' feature"
  Install-WindowsFeature Containers
}

# Returns $true when $env:WINDOWS_ENABLE_HYPERV is "true", i.e. when Hyper-V
# should be enabled on the node.
function Test-ShouldEnableHyperVFeature {
  $hyperv_requested = "${env:WINDOWS_ENABLE_HYPERV}" -eq "true"
  return $hyperv_requested
}

# Returns $true when the Microsoft-Hyper-V optional feature is in the
# 'Enabled' state.
function Test-HyperVFeatureEnabled {
  $hyperv_feature = Get-WindowsOptionalFeature -Online -FeatureName Microsoft-Hyper-V
  return ($hyperv_feature.State -eq 'Enabled')
}

# After this function returns, the computer must be restarted to complete
# the installation!
function Enable-HyperVFeature {
  Log-Output "Enabling Windows 'HyperV' feature"
  $feature_names = @(
    'Microsoft-Hyper-V',
    'Microsoft-Hyper-V-Management-PowerShell'
  )
  foreach ($feature_name in $feature_names) {
    Enable-WindowsOptionalFeature -Online -FeatureName $feature_name -All -NoRestart
  }
}

# Configures the TCP/IP parameters to be in sync with the GCP recommendation.
# Not setting these values correctly can cause network issues for connections
# that live longer than 10 minutes.
# See: https://cloud.google.com/compute/docs/troubleshooting/general-tips#idle-connections
function Set-WindowsTCPParameters {
  $tcpip_key = 'HKLM:\SYSTEM\CurrentControlSet\Services\Tcpip\Parameters'
  # Registry value name -> DWORD value, applied in this order.
  $tcp_settings = [ordered]@{
    'KeepAliveInterval'         = 1000
    'KeepAliveTime'             = 60000
    'TcpMaxDataRetransmissions' = 10
  }
  foreach ($setting in $tcp_settings.GetEnumerator()) {
    Set-ItemProperty -Force -Confirm:$false `
      -Path $tcpip_key `
      -Name $setting.Key -Type Dword -Value $setting.Value
  }

  Log-Output 'TCP/IP Parameters'
  Get-ItemProperty -Path $tcpip_key
}

# Writes a CNI config file under $env:CNI_CONFIG_DIR for containerd.
#
# Prerequisites:
#   $env:POD_CIDR is set (by Set-PodCidr).
#   The "management" interface exists (Configure-HostNetworkingService).
#   The HNS network for pod networking has been configured
#     (Configure-HostNetworkingService).
#   Containerd is installed (Install_Containerd).
#
# Required ${kube_env} keys:
#   DNS_SERVER_IP
#   DNS_DOMAIN
#   SERVICE_CLUSTER_IP_RANGE
function Configure_Containerd_CniNetworking {
  $l2bridge_conf = "${env:CNI_CONFIG_DIR}\l2bridge.conf"
  if (-not (ShouldWrite-File ${l2bridge_conf})) {
    return
  }

  $mgmt_adapter = Get_MgmtNetAdapter
  $mgmt_ip = ($mgmt_adapter | Get-NetIPAddress -AddressFamily IPv4).IPAddress

  $pod_gateway = Get_Endpoint_Gateway_From_CIDR(${env:POD_CIDR})

  # Explanation of the CNI config values:
  #   POD_CIDR: the pod CIDR assigned to this node.
  #   POD_GATEWAY: the gateway IP.
  #   MGMT_IP: the IP address assigned to the node's primary network interface
  #     (i.e. the internal IP of the GCE VM).
  #   SERVICE_CIDR: the CIDR used for kubernetes services.
  #   DNS_SERVER_IP: the cluster's DNS server IP address.
  #   DNS_DOMAIN: the cluster's DNS domain, e.g. "cluster.local".
  #
  # OutBoundNAT ExceptionList: No SNAT for CIDRs in the list, the same as default GKE non-masquerade destination ranges listed at https://cloud.google.com/kubernetes-engine/docs/how-to/ip-masquerade-agent#default-non-masq-dests

  New-Item -Force -ItemType file ${l2bridge_conf} | Out-Null

  # Template with upper-case placeholders; each one is substituted below.
  $conf_template = '{
  "cniVersion":  "0.2.0",
  "name":  "l2bridge",
  "type":  "sdnbridge",
  "master": "Ethernet",
  "capabilities":  {
    "portMappings":  true,
    "dns": true
  },
  "ipam":  {
    "subnet": "POD_CIDR",
    "routes": [
      {
        "GW": "POD_GATEWAY"
      }
    ]
  },
  "dns":  {
    "Nameservers":  [
      "DNS_SERVER_IP"
    ],
    "Search": [
      "DNS_DOMAIN"
    ]
  },
  "AdditionalArgs": [
    {
      "Name":  "EndpointPolicy",
      "Value":  {
        "Type":  "OutBoundNAT",
        "Settings": {
          "Exceptions":  [
            "169.254.0.0/16",
            "10.0.0.0/8",
            "172.16.0.0/12",
            "192.168.0.0/16",
            "100.64.0.0/10",
            "192.0.0.0/24",
            "192.0.2.0/24",
            "192.88.99.0/24",
            "198.18.0.0/15",
            "198.51.100.0/24",
            "203.0.113.0/24",
            "240.0.0.0/4"
          ]
        }
      }
    },
    {
      "Name":  "EndpointPolicy",
      "Value":  {
        "Type":  "SDNRoute",
        "Settings": {
          "DestinationPrefix":  "SERVICE_CIDR",
          "NeedEncap":  true
        }
      }
    },
    {
      "Name":  "EndpointPolicy",
      "Value":  {
        "Type":  "SDNRoute",
        "Settings": {
          "DestinationPrefix":  "MGMT_IP/32",
          "NeedEncap":  true
        }
      }
    }
  ]
}'

  # Substitutions are applied in the same order as the placeholders were
  # originally replaced.
  $conf_text = $conf_template.replace('POD_CIDR', ${env:POD_CIDR})
  $conf_text = $conf_text.replace('POD_GATEWAY', ${pod_gateway})
  $conf_text = $conf_text.replace('DNS_SERVER_IP', ${kube_env}['DNS_SERVER_IP'])
  $conf_text = $conf_text.replace('DNS_DOMAIN', ${kube_env}['DNS_DOMAIN'])
  $conf_text = $conf_text.replace('MGMT_IP', ${mgmt_ip})
  $conf_text = $conf_text.replace('SERVICE_CIDR', ${kube_env}['SERVICE_CLUSTER_IP_RANGE'])
  Set-Content ${l2bridge_conf} $conf_text

  Log-Output "containerd CNI config:`n$(Get-Content -Raw ${l2bridge_conf})"
}

# Download and install containerd and CNI binaries into $env:NODE_DIR.
function Install_Containerd {
  # Assume that presence of containerd.exe indicates that all containerd
  # binaries were already previously downloaded to this node.
  if (-not (ShouldWrite-File ${env:NODE_DIR}\containerd.exe)) {
    return
  }

  $tmp_dir = 'C:\containerd_tmp'
  New-Item $tmp_dir -ItemType 'directory' -Force | Out-Null

  # TODO(ibrahimab) Change this to a gcs bucket with CI maintained and accessible by community.
  $version = '1.6.2'
  $release_base = "https://github.com/containerd/containerd/releases/download/v${version}/"
  $tar_url = $release_base + "cri-containerd-cni-${version}-windows-amd64.tar.gz"
  $sha_url = $tar_url + ".sha256sum"

  # The published checksum file starts with the hash; take the first
  # space-separated field and upper-case it before handing it to the download.
  MustDownload-File -URLs $sha_url -OutFile $tmp_dir\sha256sum
  $sha = $(Get-Content $tmp_dir\sha256sum).Split(" ")[0].ToUpper()

  MustDownload-File `
    -URLs $tar_url `
    -OutFile $tmp_dir\containerd.tar.gz `
    -Hash $sha `
    -Algorithm SHA256

  tar xzvf $tmp_dir\containerd.tar.gz -C $tmp_dir
  Move-Item -Force $tmp_dir\cni\bin\*.exe "${env:CNI_DIR}\"
  Move-Item -Force $tmp_dir\*.exe "${env:NODE_DIR}\"
  Remove-Item -Force -Recurse $tmp_dir

  # Exclusion for Defender.
  Add-MpPreference -ExclusionProcess "${env:NODE_DIR}\containerd.exe"
}

# Lookup the path of containerd config if exists, else returns a default.
# The path is taken from the '--config' argument of the registered containerd
# service's command line, when such a service exists.
function Get_Containerd_ConfigPath {
  $default_path = 'C:\Program Files\containerd\config.toml'
  $service = Get-WMIObject -Class Win32_Service -Filter "Name='containerd'"

  $has_config_arg = ($null -ne $service) -and
      ($service.PathName -match ".*\s--config\s*(\S+).*") -and
      ($matches.Count -eq 2)
  if ($has_config_arg) {
    return $matches[1]
  }
  return $default_path
}

# Generates the containerd config.toml file.
function Configure_Containerd {
  $config_path = Get_Containerd_ConfigPath
  $config_dir = [System.IO.Path]::GetDirectoryName($config_path)
  New-Item $config_dir -ItemType 'directory' -Force | Out-Null

  # TOML template; the upper-case placeholders are substituted below.
  $config_template = @"
[plugins.scheduler]
  schedule_delay = '0s'
  startup_delay = '0s'
[plugins.cri]
  sandbox_image = 'INFRA_CONTAINER_IMAGE'
[plugins.cri.containerd]
  snapshotter = 'windows'
  default_runtime_name = 'runhcs-wcow-process'
  disable_snapshot_annotations = true
  discard_unpacked_layers = true
[plugins.cri.cni]
  bin_dir = 'CNI_BIN_DIR'
  conf_dir = 'CNI_CONF_DIR'
"@

  $config_text = $config_template.replace('INFRA_CONTAINER_IMAGE', ${env:INFRA_CONTAINER})
  $config_text = $config_text.replace('CNI_BIN_DIR', "${env:CNI_DIR}")
  $config_text = $config_text.replace('CNI_CONF_DIR', "${env:CNI_CONFIG_DIR}")
  Set-Content ${config_path} $config_text
}

# Register if needed and start containerd service.
function Start_Containerd {
  # Do the registration only if the containerd service does not exist.
  $existing_service = Get-WMIObject -Class Win32_Service -Filter "Name='containerd'"
  if ($null -eq $existing_service) {
    Log-Output "Creating containerd service"
    & containerd.exe --register-service --log-file "${env:LOGS_DIR}/containerd.log"
  }

  Log-Output "Starting containerd service"
  Restart-Service containerd
}

# Pigz Resources
$PIGZ_ROOT = 'C:\pigz'
$PIGZ_VERSION = '2.3.1'
$PIGZ_TAR_URL = "https://storage.googleapis.com/gke-release/winnode/pigz/prod/gke_windows/pigz/release/5/20201104-134221/pigz-$PIGZ_VERSION.zip"
$PIGZ_TAR_HASH = '5a6f8f5530acc85ea51797f58c1409e5af6b69e55da243ffc608784cf14fec0cd16f74cc61c564d69e1a267750aecfc1e4c53b5219ff5f893b42a7576306f34c'

# Install Pigz (https://github.com/madler/pigz) into Windows for improved image
# extraction performance.
#
# Only runs when $env:WINDOWS_ENABLE_PIGZ is "true". Pigz counts as installed
# once unpigz.exe exists under $PIGZ_ROOT; checking for the binary (instead of
# just the directory) lets a run that failed while downloading or extracting
# be retried on the next invocation.
function Install-Pigz {
  if ("${env:WINDOWS_ENABLE_PIGZ}" -ne "true") {
    return
  }
  if (Test-Path "$PIGZ_ROOT\unpigz.exe") {
    Log-Output "Pigz already installed."
    return
  }

  Log-Output "Installing Pigz $PIGZ_VERSION"
  $pigz_zip = "$PIGZ_ROOT\pigz-$PIGZ_VERSION.zip"
  # -Force: the directory may be left over from an earlier, incomplete attempt.
  New-Item -Path $PIGZ_ROOT -ItemType Directory -Force
  MustDownload-File `
    -URLs $PIGZ_TAR_URL `
    -OutFile $pigz_zip `
    -Hash $PIGZ_TAR_HASH `
    -Algorithm SHA512
  Expand-Archive -Path $pigz_zip `
    -DestinationPath $PIGZ_ROOT `
    -Force
  Remove-Item -Path $pigz_zip
  # Containerd search for unpigz.exe on the first container image
  # pull request after the service is started. If unpigz.exe is in the
  # Windows path it'll use it instead of the default unzipper.
  # See: https://github.com/containerd/containerd/issues/1896
  Add-MachineEnvironmentPath -Path $PIGZ_ROOT
  # Add process exclusion for Windows Defender to boost performance.
  Add-MpPreference -ExclusionProcess "$PIGZ_ROOT\unpigz.exe"
  Log-Output "Installed Pigz $PIGZ_VERSION"
}

# Node Problem Detector Resources
$NPD_SERVICE = "node-problem-detector"
$DEFAULT_NPD_VERSION = '0.8.10-gke0.1'
$DEFAULT_NPD_RELEASE_PATH = 'https://storage.googleapis.com/gke-release/winnode'
$DEFAULT_NPD_HASH = '97ddfe3544da9e02a1cfb55d24f329eb29d606fca7fbbf800415d5de9dbc29a00563f8e0d1919595c8e316fd989d45b09b13c07be528841fc5fd37e21d016a2d'

# Install Node Problem Detector (NPD).
# NPD analyzes the host for problems that can disrupt workloads.
# https://github.com/kubernetes/node-problem-detector
#
# Only runs when $env:ENABLE_NODE_PROBLEM_DETECTOR is "standalone". Version,
# hash and release path default to the $DEFAULT_NPD_* values and can be
# overridden through the NODE_PROBLEM_DETECTOR_* keys of ${kube_env}.
function DownloadAndInstall-NodeProblemDetector {
  if ("${env:ENABLE_NODE_PROBLEM_DETECTOR}" -ne "standalone") {
    return
  }
  if (-not (ShouldWrite-File "${env:NODE_DIR}\node-problem-detector.exe")) {
    Log-Output "Node Problem Detector already installed."
    return
  }

  # A custom version always comes with its own hash.
  $custom_version = ${kube_env}['NODE_PROBLEM_DETECTOR_VERSION']
  if ([string]::IsNullOrEmpty($custom_version)) {
    $npd_version = $DEFAULT_NPD_VERSION
    $npd_hash = $DEFAULT_NPD_HASH
  }
  else {
    $npd_version = $custom_version
    $npd_hash = ${kube_env}['NODE_PROBLEM_DETECTOR_TAR_HASH']
  }

  $custom_release_path = ${kube_env}['NODE_PROBLEM_DETECTOR_RELEASE_PATH']
  if ([string]::IsNullOrEmpty($custom_release_path)) {
    $npd_release_path = $DEFAULT_NPD_RELEASE_PATH
  }
  else {
    $npd_release_path = $custom_release_path
  }

  $npd_tar = "node-problem-detector-v${npd_version}-windows_amd64.tar.gz"

  Log-Output "Downloading ${npd_tar}."

  $npd_dir = "${env:K8S_DIR}\node-problem-detector"
  $npd_tar_path = "${npd_dir}\${npd_tar}"
  New-Item -Path $npd_dir -ItemType Directory -Force -Confirm:$false

  MustDownload-File `
      -URLs "${npd_release_path}/node-problem-detector/${npd_tar}" `
      -Hash $npd_hash `
      -Algorithm SHA512 `
      -OutFile $npd_tar_path

  # Unpack, move the binaries next to the other node binaries, then clean up
  # the now-empty bin directory and the tarball.
  tar xzvf $npd_tar_path -C $npd_dir
  Move-Item "${npd_dir}\bin\*" "${env:NODE_DIR}\" -Force -Confirm:$false
  Remove-Item "${npd_dir}\bin" -Force -Confirm:$false
  Remove-Item $npd_tar_path -Force -Confirm:$false
}

# Creates the node-problem-detector user kubeconfig file at
# $env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE (if defined).
#
# Create-NodePki() must be called first.
#
# Required ${kube_env} keys:
#   CA_CERT
#   NODE_PROBLEM_DETECTOR_TOKEN
function Create-NodeProblemDetectorKubeConfig {
  if ([string]::IsNullOrEmpty(${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE})) {
    return
  }
  Create-Kubeconfig -Name 'node-problem-detector' `
    -Path ${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE} `
    -Token ${kube_env}['NODE_PROBLEM_DETECTOR_TOKEN']
}

# Configures NPD to run with the bundled monitor configs and report against the Kubernetes api server.
#
# Only acts when $env:ENABLE_NODE_PROBLEM_DETECTOR is "standalone" and the NPD
# binary exists in $env:NODE_DIR. Creates and starts the $NPD_SERVICE Windows
# service unless a service with that name is already registered.
function Configure-NodeProblemDetector {
  $npd_bin = "${env:NODE_DIR}\node-problem-detector.exe"
  if (-not ("${env:ENABLE_NODE_PROBLEM_DETECTOR}" -eq "standalone" -and (Test-Path $npd_bin))) {
    return
  }

  $npd_svc = Get-Service -Name $NPD_SERVICE -ErrorAction SilentlyContinue
  if ($null -ne $npd_svc) {
    Log-Output "${NPD_SERVICE} already configured."
    return
  }

  $npd_dir = "${env:K8S_DIR}\node-problem-detector"
  $npd_logs_dir = "${env:LOGS_DIR}\node-problem-detector"

  New-Item -Path $npd_logs_dir -Type Directory -Force -Confirm:$false

  $flags = ''
  if ([string]::IsNullOrEmpty(${kube_env}['NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS'])) {
    # No custom flags given: run with the bundled monitor configs. The groups
    # are emitted in this order: system log, system stats, custom plugin.
    $monitor_groups = @(
      @{
        Flag    = 'system-log-monitor'
        Configs = @(
          # NPD Configuration for CRI monitor
          "${npd_dir}\config\windows-containerd-monitor-filelog.json"
        )
      },
      @{
        Flag    = 'system-stats-monitor'
        Configs = @(
          "${npd_dir}\config\windows-system-stats-monitor.json"
        )
      },
      @{
        Flag    = 'custom-plugin-monitor'
        Configs = @(
          "${npd_dir}\config\windows-health-checker-kubelet.json",
          "${npd_dir}\config\windows-health-checker-kubeproxy.json",
          "${npd_dir}\config\windows-defender-monitor.json",
          # NPD Configuration for CRI monitor
          "${npd_dir}\config\windows-health-checker-containerd.json"
        )
      }
    )

    $flag_parts = @("--v=2 --port=20256 --log_dir=${npd_logs_dir}")
    foreach ($group in $monitor_groups) {
      # A group without configs contributes no flag at all.
      if ($group.Configs.Count -gt 0) {
        $flag_parts += "--config.$($group.Flag)=$($group.Configs -join ',')"
      }
    }
    $flags = $flag_parts -join ' '
  }
  else {
    $flags = ${kube_env}['NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS']
  }
  $kubernetes_master_name = ${kube_env}['KUBERNETES_MASTER_NAME']
  $flags = "${flags} --apiserver-override=`"https://${kubernetes_master_name}?inClusterConfig=false&auth=${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE}`""

  Log-Output "Creating service: ${NPD_SERVICE}"
  Log-Output "${npd_bin} ${flags}"
  sc.exe create $NPD_SERVICE binpath= "${npd_bin} ${flags}" displayName= "Node Problem Detector"
  sc.exe failure $NPD_SERVICE reset= 30 actions= restart/5000
  sc.exe start $NPD_SERVICE

  Write-VerboseServiceInfoToConsole -Service $NPD_SERVICE
}

# TODO(pjh): move the logging agent code below into a separate
# module; it was put here temporarily to avoid disrupting the file layout in
# the K8s release machinery.
$LOGGINGAGENT_VERSION = '1.8.10'
$LOGGINGAGENT_ROOT = 'C:\fluent-bit'
$LOGGINGAGENT_SERVICE = 'fluent-bit'
$LOGGINGAGENT_CMDLINE = '*fluent-bit.exe*'

$LOGGINGEXPORTER_VERSION = 'v0.17.0'
$LOGGINGEXPORTER_ROOT = 'C:\flb-exporter'
$LOGGINGEXPORTER_SERVICE = 'flb-exporter'
$LOGGINGEXPORTER_CMDLINE = '*flb-exporter.exe*'
$LOGGINGEXPORTER_HASH = 'c808c9645d84b06b89932bd707d51a9d1d0b451b5a702a5f9b2b4462c8be6502'

# Restart Logging agent or starts it if it is not currently running
function Restart-LoggingAgent {
  # A node that has the Stackdriver agent uses it instead of fluent-bit.
  if (IsStackdriverAgentInstalled) {
    Restart-StackdriverAgent
    return
  }

  Restart-LogService $LOGGINGEXPORTER_SERVICE $LOGGINGEXPORTER_CMDLINE
  Restart-LogService $LOGGINGAGENT_SERVICE $LOGGINGAGENT_CMDLINE
}

# Restarts the service, or starts it if it is not currently
# running. A standard `Restart-Service` may fail because
# the process is sometimes unstoppable, so this function works around it
# by killing the processes.
#
# $service: name of the Windows service to restart.
# $cmdline: -Like pattern matched against process command lines to find the
#   processes to kill when the service does not stop on its own.
#
# Throws if the service is still not stopped 60 seconds after the kill.
function Restart-LogService([string]$service, [string]$cmdline) {
  Stop-Service -NoWait -ErrorAction Ignore $service

  # Wait (if necessary) for service to stop.
  $timeout = 10
  $stopped = (Get-Service $service).Status -eq 'Stopped'
  for ($i = 0; $i -lt $timeout -and !($stopped); $i++) {
    Start-Sleep 1
    $stopped = (Get-Service $service).Status -eq 'Stopped'
  }

  if ((Get-Service $service).Status -ne 'Stopped') {
    # Force kill the processes. Stop-Process rejects a null -Id, so it is only
    # called when at least one process matches the command line pattern.
    $process_ids = @(Get-WmiObject win32_process |
      Where-Object CommandLine -Like $cmdline |
      ForEach-Object { $_.ProcessId })
    if ($process_ids.Count -gt 0) {
      Stop-Process -Force -PassThru -Id $process_ids
    }

    # Wait until process has stopped.
    $waited = 0
    $log_period = 10
    $timeout = 60
    while ((Get-Service $service).Status -ne 'Stopped' -and $waited -lt $timeout) {
      Start-Sleep 1
      $waited++

      if ($waited % $log_period -eq 0) {
        Log-Output "Waiting for ${service} service to stop"
      }
    }

    # Timeout occurred. Check the service state rather than the counter: a
    # service that stopped during the very last poll is not a timeout.
    if ((Get-Service $service).Status -ne 'Stopped') {
      Throw ("Timeout while waiting for ${service} service to stop")
    }
  }

  Start-Service $service
}

# Check whether the logging agent is installed by whether it's registered as service
function IsLoggingAgentInstalled {
  $logging_status = (Get-Service $LOGGINGAGENT_SERVICE -ErrorAction Ignore).Status
  return -not [string]::IsNullOrEmpty($logging_status)
}

# Installs the logging agent according to https://docs.fluentbit.io/manual/installation/windows#
# Also installs fluent bit stackdriver exporter
function Install-LoggingAgent {
  if (IsStackdriverAgentInstalled) {
    # Remove the existing storage.json file if it exists. This is a workaround
    # for the bug where the logging agent cannot start up if the file is
    # corrupted.
    Remove-Item `
        -Force `
        -ErrorAction Ignore `
        ("$STACKDRIVER_ROOT\LoggingAgent\Main\pos\winevtlog.pos\worker0\" +
         "storage.json")
    Log-Output ("Skip: Stackdriver logging agent is already installed")
    return
  }

  if (IsLoggingAgentInstalled) {
    # Note: we should reinstall the agent if $REDO_STEPS is true
    # here, but we don't know how to run the installer without it prompting
    # when logging agent is already installed. We dumped the strings in the
    # installer binary and searched for flags to do this but found nothing. Oh
    # well.
    Log-Output ("Skip: Fluentbit logging agent is already installed")
    return
  }

  DownloadAndInstall-LoggingAgents
  Create-LoggingAgentServices
}

# Downloads the fluent-bit archive and the flb-exporter binary, each only when
# ShouldWrite-File allows writing its target path, and places them under
# $LOGGINGAGENT_ROOT and $LOGGINGEXPORTER_ROOT respectively.
function DownloadAndInstall-LoggingAgents {
  # Install Logging agent if not present
  if (ShouldWrite-File $LOGGINGAGENT_ROOT\td-agent-bit-${LOGGINGAGENT_VERSION}-win64) {
    $install_dir = 'C:\flb-installers'
    $url = ("https://storage.googleapis.com/gke-release/winnode/fluentbit/td-agent-bit-${LOGGINGAGENT_VERSION}-win64.zip")

    Log-Output 'Downloading Logging agent'
    New-Item $install_dir -ItemType 'directory' -Force | Out-Null
    # NOTE(review): unlike the exporter download below, no -Hash is passed
    # here, so the archive content is not pinned to a checksum.
    MustDownload-File -OutFile $install_dir\td.zip -URLs $url

    cd $install_dir
    Log-Output 'Extracting Logging agent'
    Expand-Archive td.zip
    mv .\td\td-agent-bit-${LOGGINGAGENT_VERSION}-win64\ $LOGGINGAGENT_ROOT
    # Leave the installer directory before deleting it.
    cd C:\
    Remove-Item -Force -Recurse $install_dir
  }

  # Download Logging exporter if needed
  if (ShouldWrite-File $LOGGINGEXPORTER_ROOT\flb-exporter.exe) {
    $url = ("https://storage.googleapis.com/gke-release/winnode/fluentbit-exporter/${LOGGINGEXPORTER_VERSION}/flb-exporter-${LOGGINGEXPORTER_VERSION}.exe")
    Log-Output 'Downloading logging exporter'
    New-Item $LOGGINGEXPORTER_ROOT -ItemType 'directory' -Force | Out-Null
    MustDownload-File `
        -OutFile $LOGGINGEXPORTER_ROOT\flb-exporter.exe `
        -URLs $url `
        -Hash $LOGGINGEXPORTER_HASH `
        -Algorithm SHA256
  }
}

# Registers the fluent-bit and flb-exporter Windows services with sc.exe and
# configures both to restart on failure. The services are not started here.
function Create-LoggingAgentServices {
  cd $LOGGINGAGENT_ROOT

  Log-Output "Creating service: ${LOGGINGAGENT_SERVICE}"
  sc.exe create $LOGGINGAGENT_SERVICE binpath= "${LOGGINGAGENT_ROOT}\bin\fluent-bit.exe -c \fluent-bit\conf\fluent-bit.conf"
  sc.exe failure $LOGGINGAGENT_SERVICE reset= 30 actions= restart/5000
  Write-VerboseServiceInfoToConsole -Service $LOGGINGAGENT_SERVICE

  Log-Output "Creating service: ${LOGGINGEXPORTER_SERVICE}"
  sc.exe create $LOGGINGEXPORTER_SERVICE binpath= "${LOGGINGEXPORTER_ROOT}\flb-exporter.exe --kubernetes-separator=_ --stackdriver-resource-model=k8s --enable-pod-label-discovery --logtostderr --winsvc --pod-label-dot-replacement=_"
  sc.exe failure $LOGGINGEXPORTER_SERVICE reset= 30 actions= restart/5000
  Write-VerboseServiceInfoToConsole -Service $LOGGINGEXPORTER_SERVICE
}

# Writes the logging configuration file for Logging agent. Restart-LoggingAgent
# should then be called to pick up the new configuration.
function Configure-LoggingAgent {
  if (IsStackdriverAgentInstalled) {
    Configure-StackdriverAgent
    return
  }

  $fluentbit_config_file = "$LOGGINGAGENT_ROOT\conf\fluent-bit.conf"
  $FLUENTBIT_CONFIG | Out-File -FilePath $fluentbit_config_file -Encoding ASCII
  Log-Output "Wrote logging config to $fluentbit_config_file"

  $fluentbit_parser_file = "$LOGGINGAGENT_ROOT\conf\parsers.conf"
  $PARSERS_CONFIG | Out-File -FilePath $fluentbit_parser_file -Encoding ASCII

  # Create directory for all the log position files.
  New-Item -Type Directory -Path "/var/run/google-fluentbit/pos-files/" -Force | Out-Null

  Log-Output "Wrote logging config to $fluentbit_parser_file"
}

# Fluentbit main config file
$FLUENTBIT_CONFIG = @'
[SERVICE]
    Flush 5
    Grace 120
    Log_Level info
    Log_File /var/log/fluentbit.log
    Daemon off
    Parsers_File parsers.conf
    HTTP_Server off
    HTTP_Listen 0.0.0.0
    HTTP_PORT 2020
    plugins_file plugins.conf

    # Storage
    # =======
    # Fluent Bit can use memory and filesystem buffering based mechanisms
    #
    # - https://docs.fluentbit.io/manual/administration/buffering-and-storage
    #
    # storage metrics
    # ---------------
    # publish storage pipeline metrics in '/api/v1/storage'. The metrics are
    # exported only if the 'http_server' option is enabled.
    #
    # storage.metrics on

    # storage.path
    # ------------
    # absolute file system path to store filesystem data buffers (chunks).
    #
    # storage.path /tmp/storage

    # storage.sync
    # ------------
    # configure the synchronization mode used to store the data into the
    # filesystem. It can take the values normal or full.
    #
    # storage.sync normal

    # storage.checksum
    # ----------------
    # enable the data integrity check when writing and reading data from the
    # filesystem. The storage layer uses the CRC32 algorithm.
    #
    # storage.checksum off

    # storage.backlog.mem_limit
    # -------------------------
    # if storage.path is set, Fluent Bit will look for data chunks that were
    # not delivered and are still in the storage layer, these are called
    # backlog data. This option configure a hint of maximum value of memory
    # to use when processing these records.
    #
    # storage.backlog.mem_limit 5M

[INPUT]
    Name winlog
    Interval_Sec 2
    # Channels Setup,Windows PowerShell
    Channels application,system,security
    Tag winevt.raw
    DB /var/run/google-fluentbit/pos-files/winlog.db

# Json Log Example:
# {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
[INPUT]
    Name tail
    Alias kube_containers
    Tag kube_<namespace_name>_<pod_name>_<container_name>
    Tag_Regex (?<pod_name>[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-
    Mem_Buf_Limit 5MB
    Skip_Long_Lines On
    Refresh_Interval 5
    Path C:\var\log\containers\*.log
    DB /var/run/google-fluentbit/pos-files/flb_kube.db

[FILTER]
    Name parser
    Match kube_*
    Key_Name log
    Reserve_Data True
    Parser docker
    Parser containerd

# Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
# Example:
# I0716 02:08:55.559351 3356 log_spam.go:42] Command line arguments:
[INPUT]
    Name tail
    Alias node-problem-detector
    Tag node-problem-detector
    Mem_Buf_Limit 5MB
    Skip_Long_Lines On
    Refresh_Interval 5
    Path C:\etc\kubernetes\logs\node-problem-detector\*.log.INFO*
    DB /var/run/google-fluentbit/pos-files/node-problem-detector.db
    Multiline On
    Parser_Firstline glog

# Example:
# I0928 03:15:50.440223 4880 main.go:51] Starting CSI-Proxy Server ...
[INPUT]
    Name tail
    Alias csi-proxy
    Tag csi-proxy
    Mem_Buf_Limit 5MB
    Skip_Long_Lines On
    Refresh_Interval 5
    Path /etc/kubernetes/logs/csi-proxy.log
    DB /var/run/google-fluentbit/pos-files/csi-proxy.db
    Multiline On
    Parser_Firstline glog

# I1118 21:26:53.975789 6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
[INPUT]
    Name tail
    Alias kube-proxy
    Tag kube-proxy
    Mem_Buf_Limit 5MB
    Skip_Long_Lines On
    Refresh_Interval 5
    Path /etc/kubernetes/logs/kube-proxy.log
    DB /var/run/google-fluentbit/pos-files/kube-proxy.db
    Multiline On
    Parser_Firstline glog

# Example:
# time="2019-12-10T21:27:59.836946700Z" level=info msg="loading plugin \"io.containerd.grpc.v1.cri\"..." type=io.containerd.grpc.v1
[INPUT]
    Name tail
    Alias container-runtime
    Tag container-runtime
    Mem_Buf_Limit 5MB
    Skip_Long_Lines On
    Refresh_Interval 5
    Path /etc/kubernetes/logs/containerd.log
    DB /var/run/google-fluentbit/pos-files/container-runtime.db
    # TODO: Add custom parser for containerd logs once format is settled.

# Example:
# I0204 07:32:30.020537 3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537]
[INPUT]
    Name tail
    Alias kubelet
    Tag kubelet
    Mem_Buf_Limit 5MB
    Skip_Long_Lines On
    Refresh_Interval 5
    Path /etc/kubernetes/logs/kubelet.log
    DB /var/run/google-fluentbit/pos-files/kubelet.db
    Multiline On
    Parser_Firstline glog

[FILTER]
    Name modify
    Match *
    Hard_rename log message

[FILTER]
    Name modify
    Match winevt.raw
    Hard_rename Message message

[FILTER]
    Name parser
    Match kube_*
    Key_Name message
    Reserve_Data True
    Parser glog
    Parser json

[OUTPUT]
    Name http
    Match *
    Host 127.0.0.1
    Port 2021
    URI /logs
    header_tag FLUENT-TAG
    Format msgpack
    Retry_Limit 2
'@

# Fluentbit parsers config file
$PARSERS_CONFIG = @'
[PARSER]
    Name docker
    Format json
    Time_Key time
    Time_Format %Y-%m-%dT%H:%M:%S.%L%z

[PARSER]
    Name containerd
    Format regex
    Regex ^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$
    Time_Key time
    Time_Format %Y-%m-%dT%H:%M:%S.%L%z

[PARSER]
    Name json
    Format json

[PARSER]
    Name syslog
    Format regex
    Regex ^\<(?<pri>[0-9]+)\>(?<time>[^ ]* {1,2}[^ ]* [^ ]*) (?<host>[^ ]*) (?<ident>[a-zA-Z0-9_\/\.\-]*)(?:\[(?<pid>[0-9]+)\])?(?:[^\:]*\:)? *(?<message>.*)$
    Time_Key time
    Time_Format %b %d %H:%M:%S

[PARSER]
    Name glog
    Format regex
    Regex ^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source_file>[^ \]]+)\:(?<source_line>\d+)\]\s(?<message>.*)$
    Time_Key time
    Time_Format %m%d %H:%M:%S.%L

[PARSER]
    Name network-log
    Format json
    Time_Key timestamp
    Time_Format %Y-%m-%dT%H:%M:%S.%L%z

[PARSER]
    Name syslog-rfc5424
    Format regex
    Regex ^\<(?<pri>[0-9]{1,5})\>1 (?<time>[^ ]+) (?<host>[^ ]+) (?<ident>[^ ]+) (?<pid>[-0-9]+) (?<msgid>[^ ]+) (?<extradata>(\[(.*?)\]|-)) (?<message>.+)$
    Time_Key time
    Time_Format %Y-%m-%dT%H:%M:%S.%L%z
    Time_Keep On

[PARSER]
    Name syslog-rfc3164-local
    Format regex
    Regex ^\<(?<pri>[0-9]+)\>(?<time>[^ ]* {1,2}[^ ]* [^ ]*) (?<ident>[a-zA-Z0-9_\/\.\-]*)(?:\[(?<pid>[0-9]+)\])?(?:[^\:]*\:)? *(?<message>.*)$
    Time_Key time
    Time_Format %b %d %H:%M:%S
    Time_Keep On

[PARSER]
    Name syslog-rfc3164
    Format regex
    Regex /^\<(?<pri>[0-9]+)\>(?<time>[^ ]* {1,2}[^ ]* [^ ]*) (?<host>[^ ]*) (?<ident>[a-zA-Z0-9_\/\.\-]*)(?:\[(?<pid>[0-9]+)\])?(?:[^\:]*\:)? *(?<message>.*)$/
    Time_Key time
    Time_Format %b %d %H:%M:%S
    Time_Keep On

[PARSER]
    Name kube-custom
    Format regex
    Regex (?<tag>[^.]+)?\.?(?<pod_name>[a-z0-9](?:[-a-z0-9]*[a-z0-9])?(?:\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*)_(?<namespace_name>[^_]+)_(?<container_name>.+)-(?<docker_id>[a-z0-9]{64})\.log$
'@


# ----------- Stackdriver logging setup --------------------------
# This section would be deprecated soon
#

$STACKDRIVER_ROOT = 'C:\Program Files (x86)\Stackdriver'

# Restarts the Stackdriver logging agent, or starts it if it is not currently
# running. A standard `Restart-Service StackdriverLogging` may fail because
# StackdriverLogging sometimes is unstoppable, so this function works around it
# by killing the processes.
function Restart-StackdriverAgent {
  # Same stop / force-kill / wait / start sequence as the fluent-bit services,
  # so reuse the shared helper instead of keeping a second copy of the loop.
  Restart-LogService 'StackdriverLogging' '*Stackdriver/logging*'
}

# Check whether the logging agent is installed by whether it's registered as service
function IsStackdriverAgentInstalled {
  $stackdriver_status = (Get-Service StackdriverLogging -ErrorAction Ignore).Status
  return -not [string]::IsNullOrEmpty($stackdriver_status)
}

# Writes the logging configuration file for Stackdriver. Restart-LoggingAgent
# should then be called to pick up the new configuration.
function Configure-StackdriverAgent {
  $conf_dir = "$STACKDRIVER_ROOT\LoggingAgent\config.d"
  $conf_file = "$conf_dir\k8s_containers.conf"

  # The agent installation normally creates config.d already; -Force makes
  # this a no-op in that case and creates the directory otherwise.
  New-Item $conf_dir -ItemType 'directory' -Force | Out-Null

  # Render the kubernetes containers configuration by substituting this
  # node's hostname for the placeholder, then write it out as ASCII.
  $FLUENTD_CONFIG.replace('NODE_NAME', (hostname)) |
    Out-File -FilePath $conf_file -Encoding ASCII
  Log-Output "Wrote fluentd logging config to $conf_file"

  # Have the service manager restart StackdriverLogging when it fails: the
  # actions below are a restart after 1000 ms followed by a restart after
  # 10000 ms. The logging agent may die due to various disruptions but can be
  # resumed.
  sc.exe failure StackdriverLogging reset= 0 actions= restart/1000/restart/10000
  Write-VerboseServiceInfoToConsole -Service 'StackdriverLogging'
}

# The NODE_NAME placeholder must be replaced with the node's name (hostname).
$FLUENTD_CONFIG = @'
# This configuration file for Fluentd is used to watch changes to kubernetes
# container logs in the directory /var/lib/docker/containers/ and submit the
# log records to Google Cloud Logging using the cloud-logging plugin.
#
# Example
# =======
# A line in the Docker log file might look like this JSON:
#
# {"log":"2014/09/25 21:15:03 Got request with path wombat\\n",
# "stream":"stderr",
# "time":"2014-09-25T21:15:03.499185026Z"}
#
# The original tag is derived from the log file's location.
# For example a Docker container's logs might be in the directory:
# /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b
# and in the file:
# 997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
# where 997599971ee6... is the Docker ID of the running container.
# The Kubernetes kubelet makes a symbolic link to this file on the host
# machine in the /var/log/containers directory which includes the pod name,
# the namespace name and the Kubernetes container name:
# synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
# ->
# /var/lib/docker/containers/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b/997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b-json.log
# The /var/log directory on the host is mapped to the /var/log directory in the container
# running this instance of Fluentd and we end up collecting the file:
# /var/log/containers/synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
# This results in the tag:
# var.log.containers.synthetic-logger-0.25lps-pod_default_synth-lgr-997599971ee6366d4a5920d25b79286ad45ff37a74494f262e3bc98d909d0a7b.log
# where 'synthetic-logger-0.25lps-pod' is the pod name, 'default' is the
# namespace name, 'synth-lgr' is the container name and '997599971ee6..' is
# the container ID.
# The record reformer is used to extract pod_name, namespace_name and
# container_name from the tag and set them in a local_resource_id in the
# format of:
# 'k8s_container.<NAMESPACE_NAME>.<POD_NAME>.<CONTAINER_NAME>'.
# The reformer also changes the tags to 'stderr' or 'stdout' based on the
# value of 'stream'.
# local_resource_id is later used by google_cloud plugin to determine the
# monitored resource to ingest logs against.
# Json Log Example:
# {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
# CRI Log Example:
# 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here
<source>
  @type tail
  path /var/log/containers/*.log
  pos_file /var/log/gcp-containers.log.pos
  # Tags at this point are in the format of:
  # reform.var.log.containers.<POD_NAME>_<NAMESPACE_NAME>_<CONTAINER_NAME>-<CONTAINER_ID>.log
  tag reform.*
  read_from_head true
  <parse>
    @type multi_format
    <pattern>
      format json
      time_key time
      time_format %Y-%m-%dT%H:%M:%S.%NZ
      keep_time_key
    </pattern>
    <pattern>
      format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
      time_format %Y-%m-%dT%H:%M:%S.%N%:z
    </pattern>
  </parse>
</source>
# Example:
# I0204 07:32:30.020537 3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537]
<source>
  @type tail
  format multiline
  multiline_flush_interval 5s
  format_firstline /^\w\d{4}/
  format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
  time_format %m%d %H:%M:%S.%N
  path /etc/kubernetes/logs/kubelet.log
  pos_file /etc/kubernetes/logs/gcp-kubelet.log.pos
  tag kubelet
</source>
# Example:
# I1118 21:26:53.975789 6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
<source>
  @type tail
  format multiline
  multiline_flush_interval 5s
  format_firstline /^\w\d{4}/
  format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
  time_format %m%d %H:%M:%S.%N
  path /etc/kubernetes/logs/kube-proxy.log
  pos_file /etc/kubernetes/logs/gcp-kube-proxy.log.pos
  tag kube-proxy
</source>
# Example:
# I0928 03:15:50.440223 4880 main.go:51] Starting CSI-Proxy Server ...
<source>
  @type tail
  format multiline
  multiline_flush_interval 5s
  format_firstline /^\w\d{4}/
  format1 /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<message>.*)/
  time_format %m%d %H:%M:%S.%N
  path /etc/kubernetes/logs/csi-proxy.log
  pos_file /etc/kubernetes/logs/gcp-csi-proxy.log.pos
  tag csi-proxy
</source>
# Example:
# time="2019-12-10T21:27:59.836946700Z" level=info msg="loading plugin \"io.containerd.grpc.v1.cri\"..." type=io.containerd.grpc.v1
<source>
  @type tail
  format multiline
  multiline_flush_interval 5s
  format_firstline /^time=/
  format1 /^time="(?<time>[^ ]*)" level=(?<severity>\w*) (?<message>.*)/
  time_format %Y-%m-%dT%H:%M:%S.%N%z
  path /etc/kubernetes/logs/containerd.log
  pos_file /etc/kubernetes/logs/gcp-containerd.log.pos
  tag container-runtime
</source>
<match reform.**>
  @type record_reformer
  enable_ruby true
  <record>
    # Extract local_resource_id from tag for 'k8s_container' monitored
    # resource. The format is:
    # 'k8s_container.<namespace_name>.<pod_name>.<container_name>'.
    "logging.googleapis.com/local_resource_id" ${"k8s_container.#{tag_suffix[4].rpartition('.')[0].split('_')[1]}.#{tag_suffix[4].rpartition('.')[0].split('_')[0]}.#{tag_suffix[4].rpartition('.')[0].split('_')[2].rpartition('-')[0]}"}
    # Rename the field 'log' to a more generic field 'message'. This way the
    # fluent-plugin-google-cloud knows to flatten the field as textPayload
    # instead of jsonPayload after extracting 'time', 'severity' and
    # 'stream' from the record.
    message ${record['log']}
    # If 'severity' is not set, assume stderr is ERROR and stdout is INFO.
    severity ${record['severity'] || if record['stream'] == 'stderr' then 'ERROR' else 'INFO' end}
  </record>
  tag ${if record['stream'] == 'stderr' then 'raw.stderr' else 'raw.stdout' end}
  remove_keys stream,log
</match>
# TODO: detect exceptions and forward them as one log entry using the
# detect_exceptions plugin
# This section is exclusive for k8s_container logs. These logs come with
# 'raw.stderr' or 'raw.stdout' tags.
<match {raw.stderr,raw.stdout}>
  @type google_cloud
  # Try to detect JSON formatted log entries.
  detect_json true
  # Allow log entries from multiple containers to be sent in the same request.
  split_logs_by_tag false
  # Set the buffer type to file to improve the reliability and reduce the memory consumption
  buffer_type file
  buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
  # Set queue_full action to block because we want to pause gracefully
  # in case of the off-the-limits load instead of throwing an exception
  buffer_queue_full_action block
  # Set the chunk limit conservatively to avoid exceeding the recommended
  # chunk size of 5MB per write request.
  buffer_chunk_limit 512k
  # Cap the combined memory usage of this buffer and the one below to
  # 512KiB/chunk * (6 + 2) chunks = 4 MiB
  buffer_queue_limit 6
  # Never wait more than 5 seconds before flushing logs in the non-error case.
  flush_interval 5s
  # Never wait longer than 30 seconds between retries.
  max_retry_wait 30
  # Disable the limit on the number of retries (retry forever).
  disable_retry_limit
  # Use multiple threads for processing.
  num_threads 2
  use_grpc true
  # Skip timestamp adjustment as this is in a controlled environment with
  # known timestamp format. This helps with CPU usage.
  adjust_invalid_timestamps false
</match>
# Attach local_resource_id for 'k8s_node' monitored resource.
<filter **>
  @type record_transformer
  enable_ruby true
  <record>
    "logging.googleapis.com/local_resource_id" ${"k8s_node.NODE_NAME"}
  </record>
</filter>
'@

# Downloads the out-of-tree kubelet image credential provider binaries.
# Only acts when ENABLE_AUTH_PROVIDER_GCP is "true" and ShouldWrite-File
# allows writing the binary into AUTH_PROVIDER_GCP_WINDOWS_BIN_DIR.
function DownloadAndInstall-AuthProviderGcpBinary {
  if ("${env:ENABLE_AUTH_PROVIDER_GCP}" -ne "true") {
    return
  }

  $binary_name = 'auth-provider-gcp.exe'
  if (-not (ShouldWrite-File ${env:AUTH_PROVIDER_GCP_WINDOWS_BIN_DIR}\$binary_name)) {
    Log-Output "Skipping auth provider gcp binaries installation, auth-provider-gcp.exe file already exists."
    return
  }

  Log-Output "Installing auth provider gcp binaries"

  # Download into a scratch directory first, then move the binary into place
  # and remove the scratch directory.
  $staging_dir = 'C:\k8s_tmp'
  New-Item -Force -ItemType 'directory' $staging_dir | Out-Null
  $download_url = "${env:AUTH_PROVIDER_GCP_STORAGE_PATH}/${env:AUTH_PROVIDER_GCP_VERSION}/windows_amd64/$binary_name"
  # NOTE(review): the hash is read from a PowerShell variable rather than from
  # ${env:...} like the other settings here, and that variable is not defined
  # in the visible part of this module - confirm where it is set.
  MustDownload-File `
      -Hash $AUTH_PROVIDER_GCP_HASH_WINDOWS_AMD64 `
      -Algorithm SHA512 `
      -OutFile $staging_dir\$binary_name `
      -URLs $download_url
  Move-Item -Force $staging_dir\$binary_name ${env:AUTH_PROVIDER_GCP_WINDOWS_BIN_DIR}
  Remove-Item -Force -Recurse $staging_dir
}

# Creates config file for the out-of-tree kubelet image credential provider.
function Create-AuthProviderGcpConfig {
  # Only relevant when the GCP auth provider has been enabled.
  if ("${env:ENABLE_AUTH_PROVIDER_GCP}" -ne "true") {
    return
  }

  # Leave the config file alone when ShouldWrite-File says not to write it.
  if (-not (ShouldWrite-File ${env:AUTH_PROVIDER_GCP_WINDOWS_CONF_FILE})) {
    Log-Output "Skipping auth provider gcp config file creation, it already exists"
    return
  }

  Log-Output "Creating auth provider gcp config file"
  # The here-string below is written verbatim (single-quoted, so nothing in it
  # is expanded) as the CredentialProviderConfig.
  Set-Content ${env:AUTH_PROVIDER_GCP_WINDOWS_CONF_FILE} @'
kind: CredentialProviderConfig
apiVersion: kubelet.config.k8s.io/v1
providers:
  - name: auth-provider-gcp.exe
    apiVersion: credentialprovider.kubelet.k8s.io/v1
    matchImages:
    - "container.cloud.google.com"
    - "gcr.io"
    - "*.gcr.io"
    - "*.pkg.dev"
    args:
    - get-credentials
    - --v=3
    defaultCacheDuration: 1m
'@
}


# Export all public functions:
Export-ModuleMember -Function *-*