github.com/sealerio/sealer@v0.11.1-0.20240507115618-f4f89c5853ae/pkg/infra/container/imagecontext/arm/entrypoint (about)

     1  #!/bin/bash
     2  
     3  set -o errexit
     4  set -o nounset
     5  set -o pipefail
     6  
     7  # If /proc/self/uid_map 4294967295 mappings, we are in the initial user namespace, i.e. the host.
     8  # Otherwise we are in a non-initial user namespace.
     9  # https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118
    10  userns=""
    11  if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
    12    userns="1"
    13    echo 'INFO: running in a user namespace (experimental)'
    14  fi
    15  
    16  validate_userns() {
    17    if [[ -z "${userns}" ]]; then
    18      return
    19    fi
    20  
    21    local nofile_hard
    22    nofile_hard="$(ulimit -Hn)"
    23    local nofile_hard_expected="64000"
    24    if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then
    25      echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
    26    fi
    27  
    28    if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
    29      echo "ERROR: UserNS: cgroup v2 needs to be enabled" >&2
    30      exit 1
    31    fi
    32    for f in cpu memory pids; do
    33      if ! grep -qw $f /sys/fs/cgroup/cgroup.controllers; then
    34        echo "ERROR: UserNS: $f controller needs to be delegated" >&2
    35      exit 1
    36      fi
    37    done
    38  }
    39  
    40  fake_file_with_content(){
    41    local path="$1"
    42    local content="$2"
    43    local base="/run/fake"
    44    local fake_path="${base}/${path}"
    45    mkdir -p "$(dirname "${fake_path}")"
    46    echo "INFO: UserNS: faking ${path} to be \"${content}\" (writable)"
    47    echo "${content}" > "${fake_path}"
    48    mount --bind "${fake_path}" "${path}"
    49  }
    50  
    51  fake_sysctl() {
    52    local key="$1"
    53    local key_slash
    54    # shellcheck disable=SC2001
    55    key_slash="$(echo "${key}" | sed -e s@\\.@/@g)"
    56    local path="/proc/sys/${key_slash}"
    57    if [[ -f "${path}" ]]; then
    58      local content
    59      content="$(cat "${path}")"
    60      fake_file_with_content "${path}" "${content}"
    61    fi
    62  }
    63  
    64  
    65  configure_proxy() {
    66    # ensure all processes receive the proxy settings by default
    67    # https://www.freedesktop.org/software/systemd/man/systemd-system.conf.html
    68    mkdir -p /etc/systemd/system.conf.d/
    69    cat <<EOF >/etc/systemd/system.conf.d/proxy-default-environment.conf
    70  [Manager]
    71  DefaultEnvironment="HTTP_PROXY=${HTTP_PROXY:-}" "HTTPS_PROXY=${HTTPS_PROXY:-}" "NO_PROXY=${NO_PROXY:-}"
    72  EOF
    73  }
    74  
    75  fix_mount() {
    76    echo 'INFO: ensuring we can execute mount/umount even with userns-remap' 
    77    # necessary only when userns-remap is enabled on the host, but harmless
    78    # The binary /bin/mount should be owned by root and have the setuid bit
    79    chown root:root "$(which mount)" "$(which umount)"
    80    chmod -s "$(which mount)" "$(which umount)"
    81  
    82    # This is a workaround to an AUFS bug that might cause `Text file
    83    # busy` on `mount` command below. See more details in
    84    # https://github.com/moby/moby/issues/9547
    85    if [[ "$(stat -f -c %T "$(which mount)")" == 'aufs' ]]; then
    86      echo 'INFO: detected aufs, calling sync' >&2
    87      sync
    88    fi
    89  
    90    if [[ -z "${userns}" ]]; then
    91      echo 'INFO: remounting /sys read-only'
    92      # systemd-in-a-container should have read only /sys
    93      # https://systemd.io/CONTAINER_INTERFACE/
    94      # however, we need other things from `docker run --privileged` ...
    95      # and this flag also happens to make /sys rw, amongst other things
    96      #
    97      # This step is skipped when running inside UserNS, because it fails with EACCES.
    98      mount -o remount,ro /sys
    99    fi
   100  
   101    echo 'INFO: making mounts shared' >&2
   102    # for mount propagation
   103    mount --make-rshared /
   104  }
   105  
   106  # helper used by fix_cgroup
   107  mount_kubelet_cgroup_root() {
   108    local cgroup_root=$1
   109    local subsystem=$2
   110    if [ -z "${cgroup_root}" ]; then
   111      return 0
   112    fi
   113    mkdir -p "${subsystem}/${cgroup_root}"
   114    if [ "${subsystem}" == "/sys/fs/cgroup/cpuset" ]; then
   115      # This is needed. Otherwise, assigning process to the cgroup
   116      # (or any nested cgroup) would result in ENOSPC.
   117      cat "${subsystem}/cpuset.cpus" > "${subsystem}/${cgroup_root}/cpuset.cpus"
   118      cat "${subsystem}/cpuset.mems" > "${subsystem}/${cgroup_root}/cpuset.mems"
   119    fi
   120    # We need to perform a self bind mount here because otherwise,
   121    # systemd might delete the cgroup unintentionally before the
   122    # kubelet starts.
   123    mount --bind "${subsystem}/${cgroup_root}" "${subsystem}/${cgroup_root}"
   124  }
   125  
# Recreate a cgroup environment that kubelet/cadvisor can make sense of
# when this container is itself placed inside a host cgroup. On v2 this
# is mostly a no-op; on v1 it re-bind-mounts each subsystem so paths in
# /proc/self/cgroup resolve, then prepares the "/kubelet" cgroup root.
# Finally it ensures the name=systemd v1 hierarchy is mounted.
fix_cgroup() {
  if [[ -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
    echo 'INFO: detected cgroup v2'
    # Both Docker and Podman enable CgroupNS on cgroup v2 hosts by default.
    #
    # So mostly we do not need to mess around with the cgroup path stuff,
    # however, we still need to create the "/kubelet" cgroup at least.
    # (Otherwise kubelet fails with `cgroup-root ["kubelet"] doesn't exist` error, see #1969)
    #
    # The "/kubelet" cgroup is created in ExecStartPre of the kubeadm service.
    #
    # [FAQ: Why not create "/kubelet" cgroup here?]
    # We can't create the cgroup with controllers here, because /sys/fs/cgroup/cgroup.subtree_control is empty.
    # And yet we can't write controllers to /sys/fs/cgroup/cgroup.subtree_control by ourselves either, because
    # /sys/fs/cgroup/cgroup.procs is not empty at this moment.
    #
    # After switching from this entrypoint script to systemd, systemd evacuates the processes in the root
    # group to "/init.scope" group, so we can write the root subtree_control and create "/kubelet" cgroup.
  else
    echo 'INFO: detected cgroup v1'
    echo 'INFO: fix cgroup mounts for all subsystems'
    # See: https://d2iq.com/blog/running-kind-inside-a-kubernetes-cluster-for-continuous-integration
    # Capture initial state before modifying
    #
    # Basically we're looking for the cgroup-path for the cpu controller for the
    # current process. this tells us what cgroup-path the container is in.
    # Then we collect the subsystems that are active on this path.
    # We assume the cpu controller is in use on all node containers.
    #
    # See: https://man7.org/linux/man-pages/man7/cgroups.7.html
    local current_cgroup
    current_cgroup=$(grep -E '^[^:]*:([^:]*,)?cpu(,[^,:]*)?:.*' /proc/self/cgroup | cut -d: -f3)
    local cgroup_subsystems
    cgroup_subsystems=$(findmnt -lun -o source,target -t cgroup | grep "${current_cgroup}" | awk '{print $2}')
    # For each cgroup subsystem, Docker does a bind mount from the current
    # cgroup to the root of the cgroup subsystem. For instance:
    #   /sys/fs/cgroup/memory/docker/<cid> -> /sys/fs/cgroup/memory
    #
    # This will confuse Kubelet and cadvisor and will dump the following error
    # messages in kubelet log:
    #   `summary_sys_containers.go:47] Failed to get system container stats for ".../kubelet.service"`
    #
    # This is because `/proc/<pid>/cgroup` is not affected by the bind mount.
    # The following is a workaround to recreate the original cgroup
    # environment by doing another bind mount for each subsystem.
    local cgroup_mounts
    # xref: https://github.com/kubernetes/minikube/pull/9508
    # Example inputs:
    #
    # Docker:               /docker/562a56986a84b3cd38d6a32ac43fdfcc8ad4d2473acf2839cbf549273f35c206 /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:143 master:23 - cgroup devices rw,devices
    # podman:               /libpod_parent/libpod-73a4fb9769188ae5dc51cb7e24b9f2752a4af7b802a8949f06a7b2f2363ab0e9 ...
    # Cloud Shell:          /kubepods/besteffort/pod3d6beaa3004913efb68ce073d73494b0/accdf94879f0a494f317e9a0517f23cdd18b35ff9439efd0175f17bbc56877c4 /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime master:19 - cgroup cgroup rw,memory
    # GitHub actions #9304: /actions_job/0924fbbcf7b18d2a00c171482b4600747afc367a9dfbeac9d6b14b35cda80399 /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:263 master:24 - cgroup cgroup rw,memory
    # `|| true`: no match simply means there is nothing to remap.
    cgroup_mounts=$(grep -E -o '/[[:alnum:]].* /sys/fs/cgroup.*.*cgroup' /proc/self/mountinfo || true)
    if [[ -n "${cgroup_mounts}" ]]; then
      local mount_root
      mount_root=$(head -n 1 <<<"${cgroup_mounts}" | cut -d' ' -f1)
      for mount_point in $(echo "${cgroup_mounts}" | cut -d' ' -f 2); do
        # bind mount each mount_point to mount_point + mount_root
        # mount --bind /sys/fs/cgroup/cpu /sys/fs/cgroup/cpu/docker/fb07bb6daf7730a3cb14fc7ff3e345d1e47423756ce54409e66e01911bab2160
        local target="${mount_point}${mount_root}"
        if ! findmnt "${target}"; then
          mkdir -p "${target}"
          mount --bind "${mount_point}" "${target}"
        fi
      done
    fi
    # kubelet will try to manage cgroups / pods that are not owned by it when
    # "nesting" clusters, unless we instruct it to use a different cgroup root.
    # We do this, and when doing so we must fixup this alternative root
    # currently this is hardcoded to be /kubelet
    mount --make-rprivate /sys/fs/cgroup
    # NOTE(review): the while loop runs in a pipeline subshell; that is fine
    # here because mount_kubelet_cgroup_root only performs mounts (no
    # variables need to survive the loop).
    echo "${cgroup_subsystems}" |
    while IFS= read -r subsystem; do
      mount_kubelet_cgroup_root "/kubelet" "${subsystem}"
    done
  fi

  # fix cgroups: cannot found cgroup mount destination: unknown,see:https://github.com/docker/for-linux/issues/219
  echo 'INFO: fix cgroup mounts for systemd'
  # kernel provides cgroups?
  # NOTE: exit 0 (not return) — without cgroups the rest of the entrypoint
  # is pointless, so the script stops here.
  if [ ! -e /proc/cgroups ]; then
    echo 'INFO:do not have /proc/cgroups'
    exit 0
  fi

  # if we don't even have the directory we need, something else must be wrong
  if [ ! -d /sys/fs/cgroup ]; then
    echo 'INFO:do not have /sys/fs/cgroup'
    exit 0
  fi

  # mount /sys/fs/cgroup if not already done
  if ! mountpoint -q /sys/fs/cgroup; then
    mount -t tmpfs -o uid=0,gid=0,mode=0755 cgroup /sys/fs/cgroup
  fi
  # `|| true`: both may already exist/be mounted; best-effort by design.
  mkdir /sys/fs/cgroup/systemd || true
  mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd || true
}
   225  
   226  fix_machine_id() {
   227    # Deletes the machine-id embedded in the node image and generates a new one.
   228    # This is necessary because both kubelet and other components like weave net
   229    # use machine-id internally to distinguish nodes.
   230    echo 'INFO: clearing and regenerating /etc/machine-id' >&2
   231    rm -f /etc/machine-id
   232    systemd-machine-id-setup
   233  }
   234  
   235  fix_product_name() {
   236    # this is a small fix to hide the underlying hardware and fix issue #426
   237    # https://github.com/kubernetes-sigs/kind/issues/426
   238    if [[ -f /sys/class/dmi/id/product_name ]]; then
   239      echo 'INFO: faking /sys/class/dmi/id/product_name to be "sealer"' >&2
   240      echo 'sealer' > /etc/product_name
   241      mount -o ro,bind /etc/product_name /sys/class/dmi/id/product_name
   242    fi
   243  }
   244  
   245  fix_product_uuid() {
   246    # The system UUID is usually read from DMI via sysfs, the problem is that
   247    # in the kind case this means that all (container) nodes share the same
   248    # system/product uuid, as they share the same DMI.
   249    # Note: The UUID is read from DMI, this tool is overwriting the sysfs files
   250    # which should fix the attached issue, but this workaround does not address
   251    # the issue if a tool is reading directly from DMI.
   252    # https://github.com/kubernetes-sigs/kind/issues/1027
   253    [[ ! -f /etc/product_uuid ]] && cat /proc/sys/kernel/random/uuid > /etc/product_uuid
   254    if [[ -f /sys/class/dmi/id/product_uuid ]]; then
   255      echo 'INFO: faking /sys/class/dmi/id/product_uuid to be random' >&2
   256      mount -o ro,bind /etc/product_uuid /sys/class/dmi/id/product_uuid
   257    fi
   258    if [[ -f /sys/devices/virtual/dmi/id/product_uuid ]]; then
   259      echo 'INFO: faking /sys/devices/virtual/dmi/id/product_uuid as well' >&2
   260      mount -o ro,bind /etc/product_uuid /sys/devices/virtual/dmi/id/product_uuid
   261    fi
   262  }
   263  
   264  fix_kmsg() {
   265    # In environments where /dev/kmsg is not available, the kubelet (1.15+) won't
   266    # start because it cannot open /dev/kmsg when starting the kmsgparser in the
   267    # OOM parser.
   268    # To support those environments, we link /dev/kmsg to /dev/console.
   269    # https://github.com/kubernetes-sigs/kind/issues/662
   270    if [[ ! -e /dev/kmsg ]]; then
   271      if [[ -e /dev/console ]]; then
   272        echo 'WARN: /dev/kmsg does not exist, symlinking /dev/console' >&2
   273        ln -s /dev/console /dev/kmsg
   274      else
   275        echo 'WARN: /dev/kmsg does not exist, nor does /dev/console!' >&2
   276      fi
   277    elif [[ -n "${userns}" ]]; then
   278      if [[ -f "/proc/sys/kernel/dmesg_restrict" ]]; then
   279        if [[ "$(cat /proc/sys/kernel/dmesg_restrict)" = "1" ]]; then
   280          echo 'WARN: UserNS: /dev/kmsg is not readable, faking with /dev/null (hint: set sysctl value "kernel.dmesg_restrict" to 0)' >&2
   281          mount --bind /dev/null /dev/kmsg
   282        fi
   283      fi
   284    fi
   285  }
   286  
   287  # validate state
   288  validate_userns
   289  
   290  # run pre-init fixups
   291  # NOTE: it's important that we do configure* first in this order to avoid races
   292  configure_proxy
   293  fix_kmsg
   294  fix_mount
   295  fix_cgroup
   296  fix_machine_id
   297  fix_product_name
   298  fix_product_uuid
   299  
   300  # we want the command (expected to be systemd) to be PID1, so exec to it
   301  exec "$@"