#!/bin/bash

# Copyright 2019 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -o errexit
set -o nounset
set -o pipefail

# If /proc/self/uid_map contains the identity mapping "0 0 4294967295", we are
# in the initial user namespace, i.e. the host.
# Otherwise we are in a non-initial user namespace.
# https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118
userns=""
if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
  userns="1"
  echo 'INFO: running in a user namespace (experimental)'
fi

validate_userns() {
  if [[ -z "${userns}" ]]; then
    return
  fi

  local nofile_hard
  nofile_hard="$(ulimit -Hn)"
  local nofile_hard_expected="64000"
  if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then
    echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
  fi

  if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
    echo "ERROR: UserNS: cgroup v2 needs to be enabled" >&2
    exit 1
  fi
  for f in cpu memory pids; do
    if ! grep -qw $f /sys/fs/cgroup/cgroup.controllers; then
      echo "ERROR: UserNS: $f controller needs to be delegated" >&2
      exit 1
    fi
  done
}

fake_file_with_content(){
  local path="$1"
  local content="$2"
  local base="/run/fake"
  local fake_path="${base}/${path}"
  mkdir -p "$(dirname "${fake_path}")"
  echo "INFO: UserNS: faking ${path} to be \"${content}\" (writable)"
  echo "${content}" > "${fake_path}"
  mount --bind "${fake_path}" "${path}"
}

fake_sysctl() {
  local key="$1"
  local key_slash
  # shellcheck disable=SC2001
  key_slash="$(echo "${key}" | sed -e s@\\.@/@g)"
  local path="/proc/sys/${key_slash}"
  if [[ -f "${path}" ]]; then
    local content
    content="$(cat "${path}")"
    fake_file_with_content "${path}" "${content}"
  fi
}
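
# Illustrative usage of the helpers above (hypothetical key, not executed
# here): assuming kernel.dmesg_restrict exists under /proc/sys,
#
#   fake_sysctl "kernel.dmesg_restrict"
#
# would copy the current value to /run/fake/proc/sys/kernel/dmesg_restrict and
# bind-mount that writable copy over /proc/sys/kernel/dmesg_restrict, which is
# useful inside a user namespace where the real sysctl file is not writable.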

configure_proxy() {
  # ensure all processes receive the proxy settings by default
  # https://www.freedesktop.org/software/systemd/man/systemd-system.conf.html
  mkdir -p /etc/systemd/system.conf.d/
  cat <<EOF >/etc/systemd/system.conf.d/proxy-default-environment.conf
[Manager]
DefaultEnvironment="HTTP_PROXY=${HTTP_PROXY:-}" "HTTPS_PROXY=${HTTPS_PROXY:-}" "NO_PROXY=${NO_PROXY:-}"
EOF
}

fix_mount() {
  echo 'INFO: ensuring we can execute mount/umount even with userns-remap'
  # necessary only when userns-remap is enabled on the host, but harmless
  # The mount/umount binaries should be owned by root, with the setuid bit
  # removed (chmod -s), so they run correctly under userns-remap
  chown root:root "$(which mount)" "$(which umount)"
  chmod -s "$(which mount)" "$(which umount)"

  # This is a workaround to an AUFS bug that might cause `Text file
  # busy` on the `mount` command below. See more details in
  # https://github.com/moby/moby/issues/9547
  if [[ "$(stat -f -c %T "$(which mount)")" == 'aufs' ]]; then
    echo 'INFO: detected aufs, calling sync' >&2
    sync
  fi

  if [[ -z "${userns}" ]]; then
    echo 'INFO: remounting /sys read-only'
    # systemd-in-a-container should have read only /sys
    # https://systemd.io/CONTAINER_INTERFACE/
    # however, we need other things from `docker run --privileged` ...
    # and this flag also happens to make /sys rw, amongst other things
    #
    # This step is skipped when running inside UserNS, because it fails with EACCES.
    mount -o remount,ro /sys
  fi

  echo 'INFO: making mounts shared' >&2
  # for mount propagation
  mount --make-rshared /
}

# helper used by fix_cgroup
mount_kubelet_cgroup_root() {
  local cgroup_root=$1
  local subsystem=$2
  if [ -z "${cgroup_root}" ]; then
    return 0
  fi
  mkdir -p "${subsystem}/${cgroup_root}"
  if [ "${subsystem}" == "/sys/fs/cgroup/cpuset" ]; then
    # This is needed. Otherwise, assigning a process to the cgroup
    # (or any nested cgroup) would result in ENOSPC.
    cat "${subsystem}/cpuset.cpus" > "${subsystem}/${cgroup_root}/cpuset.cpus"
    cat "${subsystem}/cpuset.mems" > "${subsystem}/${cgroup_root}/cpuset.mems"
  fi
  # We need to perform a self bind mount here because otherwise,
  # systemd might delete the cgroup unintentionally before the
  # kubelet starts.
  mount --bind "${subsystem}/${cgroup_root}" "${subsystem}/${cgroup_root}"
}
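
# Illustrative call (hypothetical subsystem path): on a cgroup v1 host,
#
#   mount_kubelet_cgroup_root "/kubelet" "/sys/fs/cgroup/cpuset"
#
# creates /sys/fs/cgroup/cpuset/kubelet, seeds its cpuset.cpus and cpuset.mems
# from the parent (avoiding ENOSPC when tasks are assigned), and bind-mounts
# the directory onto itself so systemd will not remove it before the kubelet
# starts using it as its cgroup root.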

fix_cgroup() {
  if [[ -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
    echo 'INFO: detected cgroup v2'
    # Both Docker and Podman enable CgroupNS on cgroup v2 hosts by default.
    #
    # So mostly we do not need to mess around with the cgroup path stuff,
    # however, we still need to create the "/kubelet" cgroup at least.
    # (Otherwise kubelet fails with a `cgroup-root ["kubelet"] doesn't exist` error, see #1969)
    #
    # The "/kubelet" cgroup is created in ExecStartPre of the kubeadm service.
    #
    # [FAQ: Why not create the "/kubelet" cgroup here?]
    # We can't create the cgroup with controllers here, because /sys/fs/cgroup/cgroup.subtree_control is empty.
    # And yet we can't write controllers to /sys/fs/cgroup/cgroup.subtree_control by ourselves either, because
    # /sys/fs/cgroup/cgroup.procs is not empty at this moment.
    #
    # After switching from this entrypoint script to systemd, systemd evacuates the processes in the root
    # group to the "/init.scope" group, so we can write the root subtree_control and create the "/kubelet" cgroup.
  else
    echo 'INFO: detected cgroup v1'
    echo 'INFO: fix cgroup mounts for all subsystems'
    # See: https://d2iq.com/blog/running-kind-inside-a-kubernetes-cluster-for-continuous-integration
    # Capture initial state before modifying.
    #
    # Basically we're looking for the cgroup-path for the cpu controller for the
    # current process. This tells us what cgroup-path the container is in.
    # Then we collect the subsystems that are active on this path.
    # We assume the cpu controller is in use on all node containers.
    #
    # See: https://man7.org/linux/man-pages/man7/cgroups.7.html
    local current_cgroup
    current_cgroup=$(grep -E '^[^:]*:([^:]*,)?cpu(,[^,:]*)?:.*' /proc/self/cgroup | cut -d: -f3)
    local cgroup_subsystems
    cgroup_subsystems=$(findmnt -lun -o source,target -t cgroup | grep "${current_cgroup}" | awk '{print $2}')
    # For each cgroup subsystem, Docker does a bind mount from the current
    # cgroup to the root of the cgroup subsystem. For instance:
    #   /sys/fs/cgroup/memory/docker/<cid> -> /sys/fs/cgroup/memory
    #
    # This will confuse kubelet and cadvisor and will dump the following error
    # messages in the kubelet log:
    #   `summary_sys_containers.go:47] Failed to get system container stats for ".../kubelet.service"`
    #
    # This is because `/proc/<pid>/cgroup` is not affected by the bind mount.
    # The following is a workaround to recreate the original cgroup
    # environment by doing another bind mount for each subsystem.
    local cgroup_mounts
    # xref: https://github.com/kubernetes/minikube/pull/9508
    # Example inputs:
    #
    # Docker: /docker/562a56986a84b3cd38d6a32ac43fdfcc8ad4d2473acf2839cbf549273f35c206 /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:143 master:23 - cgroup devices rw,devices
    # podman: /libpod_parent/libpod-73a4fb9769188ae5dc51cb7e24b9f2752a4af7b802a8949f06a7b2f2363ab0e9 ...
    # Cloud Shell: /kubepods/besteffort/pod3d6beaa3004913efb68ce073d73494b0/accdf94879f0a494f317e9a0517f23cdd18b35ff9439efd0175f17bbc56877c4 /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime master:19 - cgroup cgroup rw,memory
    # GitHub actions #9304: /actions_job/0924fbbcf7b18d2a00c171482b4600747afc367a9dfbeac9d6b14b35cda80399 /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:263 master:24 - cgroup cgroup rw,memory
    cgroup_mounts=$(grep -E -o '/[[:alnum:]].* /sys/fs/cgroup.*.*cgroup' /proc/self/mountinfo || true)
    if [[ -n "${cgroup_mounts}" ]]; then
      local mount_root
      mount_root=$(head -n 1 <<<"${cgroup_mounts}" | cut -d' ' -f1)
      for mount_point in $(echo "${cgroup_mounts}" | cut -d' ' -f 2); do
        # bind mount each mount_point to mount_point + mount_root, e.g.
        #   mount --bind /sys/fs/cgroup/cpu /sys/fs/cgroup/cpu/docker/fb07bb6daf7730a3cb14fc7ff3e345d1e47423756ce54409e66e01911bab2160
        local target="${mount_point}${mount_root}"
        if ! findmnt "${target}"; then
          mkdir -p "${target}"
          mount --bind "${mount_point}" "${target}"
        fi
      done
    fi
    # kubelet will try to manage cgroups / pods that are not owned by it when
    # "nesting" clusters, unless we instruct it to use a different cgroup root.
    # We do this, and when doing so we must fix up this alternative root;
    # currently this is hardcoded to be /kubelet.
    mount --make-rprivate /sys/fs/cgroup
    echo "${cgroup_subsystems}" |
      while IFS= read -r subsystem; do
        mount_kubelet_cgroup_root "/kubelet" "${subsystem}"
      done
  fi
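
  # Illustrative walk-through of the v1 branch above (values hypothetical):
  # given a /proc/self/cgroup line such as
  #   4:cpu,cpuacct:/docker/<cid>
  # current_cgroup becomes "/docker/<cid>", cgroup_subsystems lists mounts like
  # /sys/fs/cgroup/cpu,cpuacct, and each mount point gets "/docker/<cid>"
  # bind-mounted back under it to restore the paths kubelet expects.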

  # fix "cgroups: cannot found cgroup mount destination: unknown",
  # see: https://github.com/docker/for-linux/issues/219
  echo 'INFO: fix cgroup mounts for systemd'
  # kernel provides cgroups?
  if [ ! -e /proc/cgroups ]; then
    echo 'INFO: do not have /proc/cgroups'
    exit 0
  fi

  # if we don't even have the directory we need, something else must be wrong
  if [ ! -d /sys/fs/cgroup ]; then
    echo 'INFO: do not have /sys/fs/cgroup'
    exit 0
  fi

  # mount /sys/fs/cgroup if not already done
  if ! mountpoint -q /sys/fs/cgroup; then
    mount -t tmpfs -o uid=0,gid=0,mode=0755 cgroup /sys/fs/cgroup
  fi
  mkdir /sys/fs/cgroup/systemd || true
  mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd || true
}

fix_machine_id() {
  # Deletes the machine-id embedded in the node image and generates a new one.
  # This is necessary because both kubelet and other components like weave net
  # use machine-id internally to distinguish nodes.
  echo 'INFO: clearing and regenerating /etc/machine-id' >&2
  rm -f /etc/machine-id
  systemd-machine-id-setup
}

fix_product_name() {
  # this is a small fix to hide the underlying hardware and fix issue #426
  # https://github.com/kubernetes-sigs/kind/issues/426
  if [[ -f /sys/class/dmi/id/product_name ]]; then
    echo 'INFO: faking /sys/class/dmi/id/product_name to be "sealer"' >&2
    echo 'sealer' > /etc/product_name
    mount -o ro,bind /etc/product_name /sys/class/dmi/id/product_name
  fi
}

fix_product_uuid() {
  # The system UUID is usually read from DMI via sysfs; the problem is that
  # in the kind case this means that all (container) nodes share the same
  # system/product uuid, as they share the same DMI.
  # Note: the UUID is read from DMI; this tool overwrites the sysfs files,
  # which should fix the attached issue, but this workaround does not address
  # the issue if a tool is reading directly from DMI.
  # https://github.com/kubernetes-sigs/kind/issues/1027
  [[ ! -f /etc/product_uuid ]] && cat /proc/sys/kernel/random/uuid > /etc/product_uuid
  if [[ -f /sys/class/dmi/id/product_uuid ]]; then
    echo 'INFO: faking /sys/class/dmi/id/product_uuid to be random' >&2
    mount -o ro,bind /etc/product_uuid /sys/class/dmi/id/product_uuid
  fi
  if [[ -f /sys/devices/virtual/dmi/id/product_uuid ]]; then
    echo 'INFO: faking /sys/devices/virtual/dmi/id/product_uuid as well' >&2
    mount -o ro,bind /etc/product_uuid /sys/devices/virtual/dmi/id/product_uuid
  fi
}
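
# Illustrative check (not executed here): after fix_product_name and
# fix_product_uuid run, reads inside the node container go through the bind
# mounts instead of the host DMI:
#
#   cat /sys/class/dmi/id/product_name   # -> sealer
#   cat /sys/class/dmi/id/product_uuid   # -> this container's random UUID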

fix_kmsg() {
  # In environments where /dev/kmsg is not available, the kubelet (1.15+) won't
  # start because it cannot open /dev/kmsg when starting the kmsgparser in the
  # OOM parser.
  # To support those environments, we link /dev/kmsg to /dev/console.
  # https://github.com/kubernetes-sigs/kind/issues/662
  if [[ ! -e /dev/kmsg ]]; then
    if [[ -e /dev/console ]]; then
      echo 'WARN: /dev/kmsg does not exist, symlinking /dev/console' >&2
      ln -s /dev/console /dev/kmsg
    else
      echo 'WARN: /dev/kmsg does not exist, nor does /dev/console!' >&2
    fi
  elif [[ -n "${userns}" ]]; then
    if [[ -f "/proc/sys/kernel/dmesg_restrict" ]]; then
      if [[ "$(cat /proc/sys/kernel/dmesg_restrict)" = "1" ]]; then
        echo 'WARN: UserNS: /dev/kmsg is not readable, faking with /dev/null (hint: set sysctl value "kernel.dmesg_restrict" to 0)' >&2
        mount --bind /dev/null /dev/kmsg
      fi
    fi
  fi
}

select_iptables() {
  # based on: https://github.com/kubernetes-sigs/iptables-wrappers/blob/97b01f43a8e8db07840fc4b95e833a37c0d36b12/iptables-wrapper-installer.sh
  local mode num_legacy_lines num_nft_lines
  num_legacy_lines=$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep '^-' | wc -l || true)
  num_nft_lines=$( (timeout 5 sh -c "iptables-nft-save; ip6tables-nft-save" || true) 2>/dev/null | grep '^-' | wc -l || true)
  if [ "${num_legacy_lines}" -ge "${num_nft_lines}" ]; then
    mode=legacy
  else
    mode=nft
  fi
  echo "INFO: setting iptables to detected mode: ${mode}" >&2
  update-alternatives --set iptables "/usr/sbin/iptables-${mode}" > /dev/null
  update-alternatives --set ip6tables "/usr/sbin/ip6tables-${mode}" > /dev/null
}
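
# Illustrative outcome (counts hypothetical): if iptables-legacy-save emits no
# '^-' rule lines while iptables-nft-save emits a dozen, then
# num_legacy_lines < num_nft_lines, so mode=nft and update-alternatives points
# iptables/ip6tables at the nft backends to match the host.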

enable_network_magic(){
  # well-known docker embedded DNS is at 127.0.0.11:53
  local docker_embedded_dns_ip='127.0.0.11'

  # first we need to detect an IP to use for reaching the docker host
  local docker_host_ip
  docker_host_ip="$( (head -n1 <(getent ahostsv4 'host.docker.internal') | cut -d' ' -f1) || true)"
  # if the ip doesn't exist or is a loopback address, use the default gateway
  if [[ -z "${docker_host_ip}" ]] || [[ $docker_host_ip =~ ^127\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
    docker_host_ip=$(ip -4 route show default | cut -d' ' -f3)
  fi

  # patch docker's iptables rules to switch out the DNS IP
  iptables-save \
    | sed \
      `# switch docker DNS DNAT rules to our chosen IP` \
      -e "s/-d ${docker_embedded_dns_ip}/-d ${docker_host_ip}/g" \
      `# we need to also apply these rules to non-local traffic (from pods)` \
      -e 's/-A OUTPUT \(.*\) -j DOCKER_OUTPUT/\0\n-A PREROUTING \1 -j DOCKER_OUTPUT/' \
      `# switch docker DNS SNAT rules to our chosen IP` \
      -e "s/--to-source :53/--to-source ${docker_host_ip}:53/g" \
    | iptables-restore

  # now we can ensure that DNS is configured to use our IP
  cp /etc/resolv.conf /etc/resolv.conf.original
  sed -e "s/${docker_embedded_dns_ip}/${docker_host_ip}/g" /etc/resolv.conf.original >/etc/resolv.conf

  # fixup IPs in manifests ...
  curr_ipv4="$( (head -n1 <(getent ahostsv4 "$(hostname)") | cut -d' ' -f1) || true)"
  echo "INFO: Detected IPv4 address: ${curr_ipv4}" >&2
  if [ -f /etc/old-ipv4 ]; then
    old_ipv4=$(cat /etc/old-ipv4)
    echo "INFO: Detected old IPv4 address: ${old_ipv4}" >&2
    # sanity check that we have a current address
    if [[ -z $curr_ipv4 ]]; then
      echo "ERROR: Have an old IPv4 address but no current IPv4 address (!)" >&2
      exit 1
    fi
    # kubernetes manifests are only present on control-plane nodes
    sed -i "s#${old_ipv4}#${curr_ipv4}#" /etc/kubernetes/manifests/*.yaml || true
    # this is no longer required with autodiscovery
    sed -i "s#${old_ipv4}#${curr_ipv4}#" /var/lib/kubelet/kubeadm-flags.env || true
  fi
  if [[ -n $curr_ipv4 ]]; then
    echo -n "${curr_ipv4}" >/etc/old-ipv4
  fi

  # do IPv6
  curr_ipv6="$( (head -n1 <(getent ahostsv6 "$(hostname)") | cut -d' ' -f1) || true)"
  echo "INFO: Detected IPv6 address: ${curr_ipv6}" >&2
  if [ -f /etc/old-ipv6 ]; then
    old_ipv6=$(cat /etc/old-ipv6)
    echo "INFO: Detected old IPv6 address: ${old_ipv6}" >&2
    # sanity check that we have a current address
    if [[ -z $curr_ipv6 ]]; then
      echo "ERROR: Have an old IPv6 address but no current IPv6 address (!)" >&2
    fi
    # kubernetes manifests are only present on control-plane nodes
    sed -i "s#${old_ipv6}#${curr_ipv6}#" /etc/kubernetes/manifests/*.yaml || true
    # this is no longer required with autodiscovery
    sed -i "s#${old_ipv6}#${curr_ipv6}#" /var/lib/kubelet/kubeadm-flags.env || true
  fi
  if [[ -n $curr_ipv6 ]]; then
    echo -n "${curr_ipv6}" >/etc/old-ipv6
  fi
}

# validate state
validate_userns

# run pre-init fixups
# NOTE: it's important that we do configure* first, in this order, to avoid races
configure_proxy
fix_kmsg
fix_mount
fix_cgroup
fix_machine_id
fix_product_name
fix_product_uuid
select_iptables
enable_network_magic

# we want the command (expected to be systemd) to be PID1, so exec to it
exec "$@"
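
# Illustrative invocation (image name and flags hypothetical): the runtime is
# expected to pass systemd as the command so the final `exec "$@"` makes it
# PID 1, e.g.
#
#   docker run --privileged --entrypoint /usr/local/bin/entrypoint <node-image> /sbin/init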