#!/bin/bash

# Copyright 2019 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -o errexit
set -o nounset
set -o pipefail

# If /proc/self/uid_map contains the identity mapping "0 0 4294967295", we are
# in the initial user namespace, i.e. the host.
# Otherwise we are in a non-initial user namespace.
# https://github.com/opencontainers/runc/blob/v1.0.0-rc92/libcontainer/system/linux.go#L109-L118
userns=""
if grep -Eqv "0[[:space:]]+0[[:space:]]+4294967295" /proc/self/uid_map; then
  userns="1"
  echo 'INFO: running in a user namespace (experimental)'
fi

validate_userns() {
  if [[ -z "${userns}" ]]; then
    return
  fi

  local nofile_hard
  nofile_hard="$(ulimit -Hn)"
  local nofile_hard_expected="64000"
  if [[ "${nofile_hard}" -lt "${nofile_hard_expected}" ]]; then
    echo "WARN: UserNS: expected RLIMIT_NOFILE to be at least ${nofile_hard_expected}, got ${nofile_hard}" >&2
  fi

  if [[ ! -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
    echo "ERROR: UserNS: cgroup v2 needs to be enabled" >&2
    exit 1
  fi
  for f in cpu memory pids; do
    if ! grep -qw $f /sys/fs/cgroup/cgroup.controllers; then
      echo "ERROR: UserNS: $f controller needs to be delegated" >&2
      exit 1
    fi
  done
}

fake_file_with_content(){
  local path="$1"
  local content="$2"
  local base="/run/fake"
  local fake_path="${base}/${path}"
  mkdir -p "$(dirname "${fake_path}")"
  echo "INFO: UserNS: faking ${path} to be \"${content}\" (writable)"
  echo "${content}" > "${fake_path}"
  mount --bind "${fake_path}" "${path}"
}

fake_sysctl() {
  local key="$1"
  local key_slash
  # shellcheck disable=SC2001
  key_slash="$(echo "${key}" | sed -e s@\\.@/@g)"
  local path="/proc/sys/${key_slash}"
  if [[ -f "${path}" ]]; then
    local content
    content="$(cat "${path}")"
    fake_file_with_content "${path}" "${content}"
  fi
}
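
# Illustrative usage of the helpers above (hypothetical key, not executed
# here): assuming kernel.dmesg_restrict exists under /proc/sys,
#
#   fake_sysctl "kernel.dmesg_restrict"
#
# would copy the current value to /run/fake/proc/sys/kernel/dmesg_restrict and
# bind-mount that writable copy over /proc/sys/kernel/dmesg_restrict, which is
# useful inside a user namespace where the real sysctl file is not writable.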

configure_proxy() {
  # ensure all processes receive the proxy settings by default
  # https://www.freedesktop.org/software/systemd/man/systemd-system.conf.html
  mkdir -p /etc/systemd/system.conf.d/
  cat <<EOF >/etc/systemd/system.conf.d/proxy-default-environment.conf
[Manager]
DefaultEnvironment="HTTP_PROXY=${HTTP_PROXY:-}" "HTTPS_PROXY=${HTTPS_PROXY:-}" "NO_PROXY=${NO_PROXY:-}"
EOF
}

fix_mount() {
  echo 'INFO: ensuring we can execute mount/umount even with userns-remap'
  # necessary only when userns-remap is enabled on the host, but harmless
  # The mount/umount binaries should be owned by root, with the setuid bit
  # removed (chmod -s), so they run correctly under userns-remap
  chown root:root "$(which mount)" "$(which umount)"
  chmod -s "$(which mount)" "$(which umount)"

  # This is a workaround to an AUFS bug that might cause `Text file
  # busy` on the `mount` command below. See more details in
  # https://github.com/moby/moby/issues/9547
  if [[ "$(stat -f -c %T "$(which mount)")" == 'aufs' ]]; then
    echo 'INFO: detected aufs, calling sync' >&2
    sync
  fi

  if [[ -z "${userns}" ]]; then
    echo 'INFO: remounting /sys read-only'
    # systemd-in-a-container should have read only /sys
    # https://systemd.io/CONTAINER_INTERFACE/
    # however, we need other things from `docker run --privileged` ...
    # and this flag also happens to make /sys rw, amongst other things
    #
    # This step is skipped when running inside UserNS, because it fails with EACCES.
    mount -o remount,ro /sys
  fi

  echo 'INFO: making mounts shared' >&2
  # for mount propagation
  mount --make-rshared /
}

# helper used by fix_cgroup
mount_kubelet_cgroup_root() {
  local cgroup_root=$1
  local subsystem=$2
  if [ -z "${cgroup_root}" ]; then
    return 0
  fi
  mkdir -p "${subsystem}/${cgroup_root}"
  if [ "${subsystem}" == "/sys/fs/cgroup/cpuset" ]; then
    # This is needed. Otherwise, assigning a process to the cgroup
    # (or any nested cgroup) would result in ENOSPC.
    cat "${subsystem}/cpuset.cpus" > "${subsystem}/${cgroup_root}/cpuset.cpus"
    cat "${subsystem}/cpuset.mems" > "${subsystem}/${cgroup_root}/cpuset.mems"
  fi
  # We need to perform a self bind mount here because otherwise,
  # systemd might delete the cgroup unintentionally before the
  # kubelet starts.
  mount --bind "${subsystem}/${cgroup_root}" "${subsystem}/${cgroup_root}"
}
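
# Illustrative call (hypothetical subsystem path): on a cgroup v1 host,
#
#   mount_kubelet_cgroup_root "/kubelet" "/sys/fs/cgroup/cpuset"
#
# creates /sys/fs/cgroup/cpuset/kubelet, seeds its cpuset.cpus and cpuset.mems
# from the parent (avoiding ENOSPC when tasks are assigned), and bind-mounts
# the directory onto itself so systemd will not remove it before the kubelet
# starts using it as its cgroup root.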

fix_cgroup() {
  if [[ -f "/sys/fs/cgroup/cgroup.controllers" ]]; then
    echo 'INFO: detected cgroup v2'
    # Both Docker and Podman enable CgroupNS on cgroup v2 hosts by default.
    #
    # So mostly we do not need to mess around with the cgroup path stuff,
    # however, we still need to create the "/kubelet" cgroup at least.
    # (Otherwise kubelet fails with a `cgroup-root ["kubelet"] doesn't exist` error, see #1969)
    #
    # The "/kubelet" cgroup is created in ExecStartPre of the kubeadm service.
    #
    # [FAQ: Why not create the "/kubelet" cgroup here?]
    # We can't create the cgroup with controllers here, because /sys/fs/cgroup/cgroup.subtree_control is empty.
    # And yet we can't write controllers to /sys/fs/cgroup/cgroup.subtree_control by ourselves either, because
    # /sys/fs/cgroup/cgroup.procs is not empty at this moment.
    #
    # After switching from this entrypoint script to systemd, systemd evacuates the processes in the root
    # group to the "/init.scope" group, so we can write the root subtree_control and create the "/kubelet" cgroup.
  else
    echo 'INFO: detected cgroup v1'
    echo 'INFO: fix cgroup mounts for all subsystems'
    # See: https://d2iq.com/blog/running-kind-inside-a-kubernetes-cluster-for-continuous-integration
    # Capture initial state before modifying.
    #
    # Basically we're looking for the cgroup-path for the cpu controller for the
    # current process. This tells us what cgroup-path the container is in.
    # Then we collect the subsystems that are active on this path.
    # We assume the cpu controller is in use on all node containers.
    #
    # See: https://man7.org/linux/man-pages/man7/cgroups.7.html
    local current_cgroup
    current_cgroup=$(grep -E '^[^:]*:([^:]*,)?cpu(,[^,:]*)?:.*' /proc/self/cgroup | cut -d: -f3)
    local cgroup_subsystems
    cgroup_subsystems=$(findmnt -lun -o source,target -t cgroup | grep "${current_cgroup}" | awk '{print $2}')
    # For each cgroup subsystem, Docker does a bind mount from the current
    # cgroup to the root of the cgroup subsystem. For instance:
    #   /sys/fs/cgroup/memory/docker/<cid> -> /sys/fs/cgroup/memory
    #
    # This will confuse kubelet and cadvisor and will dump the following error
    # messages in the kubelet log:
    #   `summary_sys_containers.go:47] Failed to get system container stats for ".../kubelet.service"`
    #
    # This is because `/proc/<pid>/cgroup` is not affected by the bind mount.
    # The following is a workaround to recreate the original cgroup
    # environment by doing another bind mount for each subsystem.
    local cgroup_mounts
    # xref: https://github.com/kubernetes/minikube/pull/9508
    # Example inputs:
    #
    # Docker: /docker/562a56986a84b3cd38d6a32ac43fdfcc8ad4d2473acf2839cbf549273f35c206 /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:143 master:23 - cgroup devices rw,devices
    # podman: /libpod_parent/libpod-73a4fb9769188ae5dc51cb7e24b9f2752a4af7b802a8949f06a7b2f2363ab0e9 ...
    # Cloud Shell: /kubepods/besteffort/pod3d6beaa3004913efb68ce073d73494b0/accdf94879f0a494f317e9a0517f23cdd18b35ff9439efd0175f17bbc56877c4 /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime master:19 - cgroup cgroup rw,memory
    # GitHub actions #9304: /actions_job/0924fbbcf7b18d2a00c171482b4600747afc367a9dfbeac9d6b14b35cda80399 /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:263 master:24 - cgroup cgroup rw,memory
    cgroup_mounts=$(grep -E -o '/[[:alnum:]].* /sys/fs/cgroup.*.*cgroup' /proc/self/mountinfo || true)
    if [[ -n "${cgroup_mounts}" ]]; then
      local mount_root
      mount_root=$(head -n 1 <<<"${cgroup_mounts}" | cut -d' ' -f1)
      for mount_point in $(echo "${cgroup_mounts}" | cut -d' ' -f 2); do
        # bind mount each mount_point to mount_point + mount_root, e.g.
        #   mount --bind /sys/fs/cgroup/cpu /sys/fs/cgroup/cpu/docker/fb07bb6daf7730a3cb14fc7ff3e345d1e47423756ce54409e66e01911bab2160
        local target="${mount_point}${mount_root}"
        if ! findmnt "${target}"; then
          mkdir -p "${target}"
          mount --bind "${mount_point}" "${target}"
        fi
      done
    fi
    # kubelet will try to manage cgroups / pods that are not owned by it when
    # "nesting" clusters, unless we instruct it to use a different cgroup root.
    # We do this, and when doing so we must fix up this alternative root;
    # currently this is hardcoded to be /kubelet.
    mount --make-rprivate /sys/fs/cgroup
    echo "${cgroup_subsystems}" |
      while IFS= read -r subsystem; do
        mount_kubelet_cgroup_root "/kubelet" "${subsystem}"
      done
  fi
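
  # Illustrative walk-through of the v1 branch above (values hypothetical):
  # given a /proc/self/cgroup line such as
  #   4:cpu,cpuacct:/docker/<cid>
  # current_cgroup becomes "/docker/<cid>", cgroup_subsystems lists mounts like
  # /sys/fs/cgroup/cpu,cpuacct, and each mount point gets "/docker/<cid>"
  # bind-mounted back under it to restore the paths kubelet expects.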

  # fix "cgroups: cannot found cgroup mount destination: unknown",
  # see: https://github.com/docker/for-linux/issues/219
  echo 'INFO: fix cgroup mounts for systemd'
  # kernel provides cgroups?
  if [ ! -e /proc/cgroups ]; then
    echo 'INFO: do not have /proc/cgroups'
    exit 0
  fi

  # if we don't even have the directory we need, something else must be wrong
  if [ ! -d /sys/fs/cgroup ]; then
    echo 'INFO: do not have /sys/fs/cgroup'
    exit 0
  fi

  # mount /sys/fs/cgroup if not already done
  if ! mountpoint -q /sys/fs/cgroup; then
    mount -t tmpfs -o uid=0,gid=0,mode=0755 cgroup /sys/fs/cgroup
  fi
  mkdir /sys/fs/cgroup/systemd || true
  mount -t cgroup -o none,name=systemd cgroup /sys/fs/cgroup/systemd || true
}

fix_machine_id() {
  # Deletes the machine-id embedded in the node image and generates a new one.
  # This is necessary because both kubelet and other components like weave net
  # use machine-id internally to distinguish nodes.
  echo 'INFO: clearing and regenerating /etc/machine-id' >&2
  rm -f /etc/machine-id
  systemd-machine-id-setup
}

fix_product_name() {
  # this is a small fix to hide the underlying hardware and fix issue #426
  # https://github.com/kubernetes-sigs/kind/issues/426
  if [[ -f /sys/class/dmi/id/product_name ]]; then
    echo 'INFO: faking /sys/class/dmi/id/product_name to be "sealer"' >&2
    echo 'sealer' > /etc/product_name
    mount -o ro,bind /etc/product_name /sys/class/dmi/id/product_name
  fi
}

fix_product_uuid() {
  # The system UUID is usually read from DMI via sysfs; the problem is that
  # in the kind case this means that all (container) nodes share the same
  # system/product uuid, as they share the same DMI.
  # Note: the UUID is read from DMI; this tool overwrites the sysfs files,
  # which should fix the attached issue, but this workaround does not address
  # the issue if a tool is reading directly from DMI.
  # https://github.com/kubernetes-sigs/kind/issues/1027
  [[ ! -f /etc/product_uuid ]] && cat /proc/sys/kernel/random/uuid > /etc/product_uuid
  if [[ -f /sys/class/dmi/id/product_uuid ]]; then
    echo 'INFO: faking /sys/class/dmi/id/product_uuid to be random' >&2
    mount -o ro,bind /etc/product_uuid /sys/class/dmi/id/product_uuid
  fi
  if [[ -f /sys/devices/virtual/dmi/id/product_uuid ]]; then
    echo 'INFO: faking /sys/devices/virtual/dmi/id/product_uuid as well' >&2
    mount -o ro,bind /etc/product_uuid /sys/devices/virtual/dmi/id/product_uuid
  fi
}
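
# Illustrative check (not executed here): after fix_product_name and
# fix_product_uuid run, reads inside the node container go through the bind
# mounts instead of the host DMI:
#
#   cat /sys/class/dmi/id/product_name   # -> sealer
#   cat /sys/class/dmi/id/product_uuid   # -> this container's random UUID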

fix_kmsg() {
  # In environments where /dev/kmsg is not available, the kubelet (1.15+) won't
  # start because it cannot open /dev/kmsg when starting the kmsgparser in the
  # OOM parser.
  # To support those environments, we link /dev/kmsg to /dev/console.
  # https://github.com/kubernetes-sigs/kind/issues/662
  if [[ ! -e /dev/kmsg ]]; then
    if [[ -e /dev/console ]]; then
      echo 'WARN: /dev/kmsg does not exist, symlinking /dev/console' >&2
      ln -s /dev/console /dev/kmsg
    else
      echo 'WARN: /dev/kmsg does not exist, nor does /dev/console!' >&2
    fi
  elif [[ -n "${userns}" ]]; then
    if [[ -f "/proc/sys/kernel/dmesg_restrict" ]]; then
      if [[ "$(cat /proc/sys/kernel/dmesg_restrict)" = "1" ]]; then
        echo 'WARN: UserNS: /dev/kmsg is not readable, faking with /dev/null (hint: set sysctl value "kernel.dmesg_restrict" to 0)' >&2
        mount --bind /dev/null /dev/kmsg
      fi
    fi
  fi
}

select_iptables() {
  # based on: https://github.com/kubernetes-sigs/iptables-wrappers/blob/97b01f43a8e8db07840fc4b95e833a37c0d36b12/iptables-wrapper-installer.sh
  local mode num_legacy_lines num_nft_lines
  num_legacy_lines=$( (iptables-legacy-save || true; ip6tables-legacy-save || true) 2>/dev/null | grep '^-' | wc -l || true)
  num_nft_lines=$( (timeout 5 sh -c "iptables-nft-save; ip6tables-nft-save" || true) 2>/dev/null | grep '^-' | wc -l || true)
  if [ "${num_legacy_lines}" -ge "${num_nft_lines}" ]; then
    mode=legacy
  else
    mode=nft
  fi
  echo "INFO: setting iptables to detected mode: ${mode}" >&2
  update-alternatives --set iptables "/usr/sbin/iptables-${mode}" > /dev/null
  update-alternatives --set ip6tables "/usr/sbin/ip6tables-${mode}" > /dev/null
}
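
# Illustrative outcome (counts hypothetical): if iptables-legacy-save emits no
# '^-' rule lines while iptables-nft-save emits a dozen, then
# num_legacy_lines < num_nft_lines, so mode=nft and update-alternatives points
# iptables/ip6tables at the nft backends to match the host.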

enable_network_magic(){
  # well-known docker embedded DNS is at 127.0.0.11:53
  local docker_embedded_dns_ip='127.0.0.11'

  # first we need to detect an IP to use for reaching the docker host
  local docker_host_ip
  docker_host_ip="$( (head -n1 <(getent ahostsv4 'host.docker.internal') | cut -d' ' -f1) || true)"
  # if the ip doesn't exist or is a loopback address, use the default gateway
  if [[ -z "${docker_host_ip}" ]] || [[ $docker_host_ip =~ ^127\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
    docker_host_ip=$(ip -4 route show default | cut -d' ' -f3)
  fi

  # patch docker's iptables rules to switch out the DNS IP
  iptables-save \
    | sed \
      `# switch docker DNS DNAT rules to our chosen IP` \
      -e "s/-d ${docker_embedded_dns_ip}/-d ${docker_host_ip}/g" \
      `# we need to also apply these rules to non-local traffic (from pods)` \
      -e 's/-A OUTPUT \(.*\) -j DOCKER_OUTPUT/\0\n-A PREROUTING \1 -j DOCKER_OUTPUT/' \
      `# switch docker DNS SNAT rules to our chosen IP` \
      -e "s/--to-source :53/--to-source ${docker_host_ip}:53/g" \
    | iptables-restore

  # now we can ensure that DNS is configured to use our IP
  cp /etc/resolv.conf /etc/resolv.conf.original
  sed -e "s/${docker_embedded_dns_ip}/${docker_host_ip}/g" /etc/resolv.conf.original >/etc/resolv.conf

  # fixup IPs in manifests ...
  curr_ipv4="$( (head -n1 <(getent ahostsv4 "$(hostname)") | cut -d' ' -f1) || true)"
  echo "INFO: Detected IPv4 address: ${curr_ipv4}" >&2
  if [ -f /etc/old-ipv4 ]; then
    old_ipv4=$(cat /etc/old-ipv4)
    echo "INFO: Detected old IPv4 address: ${old_ipv4}" >&2
    # sanity check that we have a current address
    if [[ -z $curr_ipv4 ]]; then
      echo "ERROR: Have an old IPv4 address but no current IPv4 address (!)" >&2
      exit 1
    fi
    # kubernetes manifests are only present on control-plane nodes
    sed -i "s#${old_ipv4}#${curr_ipv4}#" /etc/kubernetes/manifests/*.yaml || true
    # this is no longer required with autodiscovery
    sed -i "s#${old_ipv4}#${curr_ipv4}#" /var/lib/kubelet/kubeadm-flags.env || true
  fi
  if [[ -n $curr_ipv4 ]]; then
    echo -n "${curr_ipv4}" >/etc/old-ipv4
  fi

  # do IPv6
  curr_ipv6="$( (head -n1 <(getent ahostsv6 "$(hostname)") | cut -d' ' -f1) || true)"
  echo "INFO: Detected IPv6 address: ${curr_ipv6}" >&2
  if [ -f /etc/old-ipv6 ]; then
    old_ipv6=$(cat /etc/old-ipv6)
    echo "INFO: Detected old IPv6 address: ${old_ipv6}" >&2
    # sanity check that we have a current address
    if [[ -z $curr_ipv6 ]]; then
      echo "ERROR: Have an old IPv6 address but no current IPv6 address (!)" >&2
    fi
    # kubernetes manifests are only present on control-plane nodes
    sed -i "s#${old_ipv6}#${curr_ipv6}#" /etc/kubernetes/manifests/*.yaml || true
    # this is no longer required with autodiscovery
    sed -i "s#${old_ipv6}#${curr_ipv6}#" /var/lib/kubelet/kubeadm-flags.env || true
  fi
  if [[ -n $curr_ipv6 ]]; then
    echo -n "${curr_ipv6}" >/etc/old-ipv6
  fi
}

# validate state
validate_userns

# run pre-init fixups
# NOTE: it's important that we do configure* first, in this order, to avoid races
configure_proxy
fix_kmsg
fix_mount
fix_cgroup
fix_machine_id
fix_product_name
fix_product_uuid
select_iptables
enable_network_magic

# we want the command (expected to be systemd) to be PID1, so exec to it
exec "$@"
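
# Illustrative invocation (image name and flags hypothetical): the runtime is
# expected to pass systemd as the command so the final `exec "$@"` makes it
# PID 1, e.g.
#
#   docker run --privileged --entrypoint /usr/local/bin/entrypoint <node-image> /sbin/init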