github.com/alibaba/sealer@v0.8.6-0.20220430115802-37a2bdaa8173/pkg/infra/container/imagecontext/arm/entrypoint

#!/bin/bash

set -o errexit
set -o nounset
set -o pipefail

# If /proc/self/uid_map contains the full-range mapping "0 0 4294967295",
# we are in the initial user namespace, i.e. the host.
# Otherwise we are in a non-initial user namespace.
# https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118
userns=""
if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
  userns="1"
  echo 'INFO: running in a user namespace (experimental)'
fi

validate_userns() {
  if [[ -z "${userns}" ]]; then
    return
  fi

  local nofile_hard
  nofile_hard="$(ulimit -Hn)"
  local nofile_hard_expected="64000"
  if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then
    echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
  fi

  if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
    echo "ERROR: UserNS: cgroup v2 needs to be enabled" >&2
    exit 1
  fi
  for f in cpu memory pids; do
    if ! grep -qw "$f" /sys/fs/cgroup/cgroup.controllers; then
      echo "ERROR: UserNS: $f controller needs to be delegated" >&2
      exit 1
    fi
  done
}

fake_file_with_content() {
  local path="$1"
  local content="$2"
  local base="/run/fake"
  local fake_path="${base}/${path}"
  mkdir -p "$(dirname "${fake_path}")"
  echo "INFO: UserNS: faking ${path} to be \"${content}\" (writable)"
  echo "${content}" > "${fake_path}"
  mount --bind "${fake_path}" "${path}"
}

fake_sysctl() {
  local key="$1"
  local key_slash
  # shellcheck disable=SC2001
  key_slash="$(echo "${key}" | sed -e s@\\.@/@g)"
  local path="/proc/sys/${key_slash}"
  if [[ -f "${path}" ]]; then
    local content
    content="$(cat "${path}")"
    fake_file_with_content "${path}" "${content}"
  fi
}

configure_proxy() {
  # ensure all processes receive the proxy settings by default
  # https://www.freedesktop.org/software/systemd/man/systemd-system.conf.html
  mkdir -p /etc/systemd/system.conf.d/
  cat <<EOF >/etc/systemd/system.conf.d/proxy-default-environment.conf
[Manager]
DefaultEnvironment="HTTP_PROXY=${HTTP_PROXY:-}" "HTTPS_PROXY=${HTTPS_PROXY:-}" "NO_PROXY=${NO_PROXY:-}"
EOF
}
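
# For illustration (hypothetical values, not part of the upstream script):
# with HTTP_PROXY=http://proxy.example:3128 exported when the container is
# started, configure_proxy() writes a drop-in like
#
#   [Manager]
#   DefaultEnvironment="HTTP_PROXY=http://proxy.example:3128" "HTTPS_PROXY=" "NO_PROXY="
#
# which systemd then passes to the environment of every unit it starts.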

fix_mount() {
  echo 'INFO: ensuring we can execute mount/umount even with userns-remap'
  # necessary only when userns-remap is enabled on the host, but harmless otherwise.
  # The mount/umount binaries must be owned by root; the setuid bit is removed
  # since we already run as root, and under userns-remap a setuid bit combined
  # with a remapped owner would make mount run unprivileged and fail.
  chown root:root "$(which mount)" "$(which umount)"
  chmod -s "$(which mount)" "$(which umount)"

  # This is a workaround for an AUFS bug that might cause `Text file
  # busy` on the `mount` command below. See more details in
  # https://github.com/moby/moby/issues/9547
  if [[ "$(stat -f -c %T "$(which mount)")" == 'aufs' ]]; then
    echo 'INFO: detected aufs, calling sync' >&2
    sync
  fi

  if [[ -z "${userns}" ]]; then
    echo 'INFO: remounting /sys read-only'
    # systemd-in-a-container should have a read-only /sys
    # https://systemd.io/CONTAINER_INTERFACE/
    # however, we need other things from `docker run --privileged` ...
    # and this flag also happens to make /sys rw, amongst other things.
    #
    # This step is skipped when running inside UserNS, because it fails with EACCES.
    mount -o remount,ro /sys
  fi

  echo 'INFO: making mounts shared' >&2
  # for mount propagation
  mount --make-rshared /
}

# helper used by fix_cgroup
mount_kubelet_cgroup_root() {
  local cgroup_root=$1
  local subsystem=$2
  if [ -z "${cgroup_root}" ]; then
    return 0
  fi
  mkdir -p "${subsystem}/${cgroup_root}"
  if [ "${subsystem}" == "/sys/fs/cgroup/cpuset" ]; then
    # This is needed. Otherwise, assigning a process to the cgroup
    # (or any nested cgroup) would result in ENOSPC.
    cat "${subsystem}/cpuset.cpus" > "${subsystem}/${cgroup_root}/cpuset.cpus"
    cat "${subsystem}/cpuset.mems" > "${subsystem}/${cgroup_root}/cpuset.mems"
  fi
  # We need to perform a self bind mount here because otherwise,
  # systemd might delete the cgroup unintentionally before the
  # kubelet starts.
  mount --bind "${subsystem}/${cgroup_root}" "${subsystem}/${cgroup_root}"
}
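
# Illustrative example (hypothetical layout, not from the upstream script): on
# a cgroup v1 Docker host, /proc/self/cgroup contains lines such as
#   4:cpu,cpuacct:/docker/562a56986a84b3cd38d6a32ac43fdfcc8ad4d2473acf2839cbf549273f35c206
# fix_cgroup() below extracts the third colon-separated field of the cpu line
# to learn which cgroup path this container was started in, then uses findmnt
# to collect the v1 subsystem mounts that share that path.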

fix_cgroup() {
  if [[ -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
    echo 'INFO: detected cgroup v2'
    # Both Docker and Podman enable CgroupNS on cgroup v2 hosts by default.
    #
    # So mostly we do not need to mess around with the cgroup path stuff,
    # however, we still need to create the "/kubelet" cgroup at least.
    # (Otherwise kubelet fails with `cgroup-root ["kubelet"] doesn't exist` error, see #1969)
    #
    # The "/kubelet" cgroup is created in ExecStartPre of the kubeadm service.
    #
    # [FAQ: Why not create the "/kubelet" cgroup here?]
    # We can't create the cgroup with controllers here, because /sys/fs/cgroup/cgroup.subtree_control is empty.
    # And yet we can't write controllers to /sys/fs/cgroup/cgroup.subtree_control by ourselves either, because
    # /sys/fs/cgroup/cgroup.procs is not empty at this moment.
    #
    # After switching from this entrypoint script to systemd, systemd evacuates the processes in the root
    # group to the "/init.scope" group, so we can write the root subtree_control and create the "/kubelet" cgroup.
    return
  fi
  echo 'INFO: detected cgroup v1'
  echo 'INFO: fixing cgroup mounts for all subsystems'
  # See: https://d2iq.com/blog/running-kind-inside-a-kubernetes-cluster-for-continuous-integration
  # Capture the initial state before modifying it.
  #
  # Basically we're looking for the cgroup path of the cpu controller for the
  # current process; this tells us what cgroup path the container is in.
  # Then we collect the subsystems that are active on this path.
  # We assume the cpu controller is in use on all node containers.
  #
  # See: https://man7.org/linux/man-pages/man7/cgroups.7.html
  local current_cgroup
  current_cgroup=$(grep -E '^[^:]*:([^:]*,)?cpu(,[^,:]*)?:.*' /proc/self/cgroup | cut -d: -f3)
  local cgroup_subsystems
  cgroup_subsystems=$(findmnt -lun -o source,target -t cgroup | grep "${current_cgroup}" | awk '{print $2}')
  # For each cgroup subsystem, Docker does a bind mount from the current
  # cgroup to the root of the cgroup subsystem. For instance:
  #   /sys/fs/cgroup/memory/docker/<cid> -> /sys/fs/cgroup/memory
  #
  # This confuses kubelet and cadvisor and will dump the following error
  # messages in the kubelet log:
  #   `summary_sys_containers.go:47] Failed to get system container stats for ".../kubelet.service"`
  #
  # This is because `/proc/<pid>/cgroup` is not affected by the bind mount.
  # The following is a workaround to recreate the original cgroup
  # environment by doing another bind mount for each subsystem.
  local cgroup_mounts
  # xref: https://github.com/kubernetes/minikube/pull/9508
  # Example inputs:
  #
  # Docker: /docker/562a56986a84b3cd38d6a32ac43fdfcc8ad4d2473acf2839cbf549273f35c206 /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:143 master:23 - cgroup devices rw,devices
  # podman: /libpod_parent/libpod-73a4fb9769188ae5dc51cb7e24b9f2752a4af7b802a8949f06a7b2f2363ab0e9 ...
  # Cloud Shell: /kubepods/besteffort/pod3d6beaa3004913efb68ce073d73494b0/accdf94879f0a494f317e9a0517f23cdd18b35ff9439efd0175f17bbc56877c4 /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime master:19 - cgroup cgroup rw,memory
  # GitHub actions #9304: /actions_job/0924fbbcf7b18d2a00c171482b4600747afc367a9dfbeac9d6b14b35cda80399 /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:263 master:24 - cgroup cgroup rw,memory
  cgroup_mounts=$(grep -E -o '/[[:alnum:]].* /sys/fs/cgroup.*.*cgroup' /proc/self/mountinfo || true)
  if [[ -n "${cgroup_mounts}" ]]; then
    local mount_root
    mount_root=$(head -n 1 <<<"${cgroup_mounts}" | cut -d' ' -f1)
    for mount_point in $(echo "${cgroup_mounts}" | cut -d' ' -f 2); do
      # bind mount each mount_point to mount_point + mount_root, e.g.:
      # mount --bind /sys/fs/cgroup/cpu /sys/fs/cgroup/cpu/docker/fb07bb6daf7730a3cb14fc7ff3e345d1e47423756ce54409e66e01911bab2160
      local target="${mount_point}${mount_root}"
      if ! findmnt "${target}"; then
        mkdir -p "${target}"
        mount --bind "${mount_point}" "${target}"
      fi
    done
  fi
  # kubelet will try to manage cgroups / pods that are not owned by it when
  # "nesting" clusters, unless we instruct it to use a different cgroup root.
  # We do this, and when doing so we must fix up this alternative root;
  # currently it is hardcoded to be /kubelet.
  mount --make-rprivate /sys/fs/cgroup
  echo "${cgroup_subsystems}" |
  while IFS= read -r subsystem; do
    mount_kubelet_cgroup_root "/kubelet" "${subsystem}"
  done
}
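
# Sanity-check sketch (illustrative, not executed by this script): on a cgroup
# v1 host, after fix_cgroup() has run, every subsystem should carry a
# self-bind-mounted /kubelet cgroup root, visible with e.g.
#   findmnt -R /sys/fs/cgroup | grep kubelet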

fix_machine_id() {
  # Deletes the machine-id embedded in the node image and generates a new one.
  # This is necessary because both kubelet and other components like weave net
  # use machine-id internally to distinguish nodes.
  echo 'INFO: clearing and regenerating /etc/machine-id' >&2
  rm -f /etc/machine-id
  systemd-machine-id-setup
}

fix_product_name() {
  # this is a small fix to hide the underlying hardware and fix issue #426
  # https://github.com/kubernetes-sigs/kind/issues/426
  if [[ -f /sys/class/dmi/id/product_name ]]; then
    echo 'INFO: faking /sys/class/dmi/id/product_name to be "sealer"' >&2
    echo 'sealer' > /etc/product_name
    mount -o ro,bind /etc/product_name /sys/class/dmi/id/product_name
  fi
}

fix_product_uuid() {
  # The system UUID is usually read from DMI via sysfs; the problem is that
  # in the kind case this means that all (container) nodes share the same
  # system/product uuid, as they share the same DMI.
  # Note: The UUID is read from DMI; this tool overwrites the sysfs files,
  # which should fix the attached issue, but this workaround does not address
  # the issue if a tool reads directly from DMI.
  # https://github.com/kubernetes-sigs/kind/issues/1027
  [[ ! -f /etc/product_uuid ]] && cat /proc/sys/kernel/random/uuid > /etc/product_uuid
  if [[ -f /sys/class/dmi/id/product_uuid ]]; then
    echo 'INFO: faking /sys/class/dmi/id/product_uuid to be random' >&2
    mount -o ro,bind /etc/product_uuid /sys/class/dmi/id/product_uuid
  fi
  if [[ -f /sys/devices/virtual/dmi/id/product_uuid ]]; then
    echo 'INFO: faking /sys/devices/virtual/dmi/id/product_uuid as well' >&2
    mount -o ro,bind /etc/product_uuid /sys/devices/virtual/dmi/id/product_uuid
  fi
}

fix_kmsg() {
  # In environments where /dev/kmsg is not available, the kubelet (1.15+) won't
  # start because it cannot open /dev/kmsg when starting the kmsgparser in the
  # OOM parser.
  # To support those environments, we link /dev/kmsg to /dev/console.
  # https://github.com/kubernetes-sigs/kind/issues/662
  if [[ ! -e /dev/kmsg ]]; then
    if [[ -e /dev/console ]]; then
      echo 'WARN: /dev/kmsg does not exist, symlinking /dev/console' >&2
      ln -s /dev/console /dev/kmsg
    else
      echo 'WARN: /dev/kmsg does not exist, nor does /dev/console!' >&2
    fi
  elif [[ -n "${userns}" ]]; then
    if [[ -f "/proc/sys/kernel/dmesg_restrict" ]]; then
      if [[ "$(cat /proc/sys/kernel/dmesg_restrict)" = "1" ]]; then
        echo 'WARN: UserNS: /dev/kmsg is not readable, faking with /dev/null (hint: set sysctl value "kernel.dmesg_restrict" to 0)' >&2
        mount --bind /dev/null /dev/kmsg
      fi
    fi
  fi
}

# validate state
validate_userns

# run pre-init fixups
# NOTE: it's important that we run configure* first, in this order, to avoid races
configure_proxy
fix_kmsg
fix_mount
fix_cgroup
fix_machine_id
fix_product_name
fix_product_uuid

# we want the command (expected to be systemd) to be PID 1, so exec to it
exec "$@"
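
# Usage note (illustrative; mirrors the kind-style node images this script is
# derived from, not verified against sealer's image build): the image typically
# sets this script as the ENTRYPOINT with systemd as the command, e.g.
#   ENTRYPOINT ["/usr/local/bin/entrypoint", "/sbin/init"]
# so the final exec above replaces this shell with systemd as PID 1.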