github.com/pachyderm/pachyderm@v1.13.4/etc/reset.py (about)

     1  #!/usr/bin/env python3
     2  
     3  import os
     4  import re
     5  import json
     6  import asyncio
     7  import secrets
     8  import argparse
     9  import collections
    10  import http.client
    11  from pathlib import Path
    12  
# etcd image pre-pulled and pushed to the cluster during deploy.
ETCD_IMAGE = "pachyderm/etcd:v3.3.5"
# Locally-built IDE (JupyterHub) images pushed when deploying with --ide.
IDE_USER_IMAGE = "pachyderm/ide-user:local"
IDE_HUB_IMAGE = "pachyderm/ide-hub:local"
# Directory containing one subdirectory per pipeline-build language image.
PIPELINE_BUILD_DIR = "etc/pipeline-build"

# RBAC resource types that `pachctl undeploy` leaves behind; they're deleted
# explicitly during a reset.
DELETABLE_RESOURCES = [
    "roles.rbac.authorization.k8s.io",
    "rolebindings.rbac.authorization.k8s.io"
]

# Matches the boundary between adjacent JSON objects in `pachctl deploy
# --dry-run` output, which emits newline-separated objects rather than a
# single JSON array.
NEWLINE_SEPARATE_OBJECTS_PATTERN = re.compile(r"\}\n+\{", re.MULTILINE)

# Path to file used for ensuring minikube doesn't need to be deleted.
# With newer versions of minikube, cluster state (host paths, pods, etc.) is
# persisted across host system restarts, but credentials aren't, causing
# permissions failures on k8s admin calls. We use this file to reference
# whether minikube has been started since the last host system restart, which
# will allow us to figure out whether to reset the cluster state so we don't
# get the permissions errors. It's stored in `/tmp` because that directory
# is wiped on every system restart, and doesn't require root to write to.
MINIKUBE_RUN_FILE = Path("/tmp/pachyderm-minikube-reset")

# Result of a subprocess run: return code plus decoded stdout/stderr (both
# None when the output wasn't captured).
RunResult = collections.namedtuple("RunResult", ["rc", "stdout", "stderr"])
    36  
    37  client_version = None
    38  async def get_client_version():
    39      global client_version
    40      if client_version is None:
    41          client_version = (await capture("pachctl", "version", "--client-only")).strip()
    42      return client_version
    43  
class BaseDriver:
    """Deploys pachyderm onto whatever cluster the active kubectl context
    points at. Subclasses customize per-target behavior: image naming,
    pushing images, and the `pachctl deploy` arguments."""

    def image(self, name):
        # Returned unchanged here; remote drivers (e.g. GCP) prefix a
        # registry instead.
        return name

    async def reset(self):
        """Undeploy any existing pachyderm install (including the IDE when
        present) and delete leftover RBAC resources."""
        # Check for the presence of the pachyderm IDE to see whether it should
        # be undeployed too. Using kubectl rather than helm here because
        # this'll work even if the helm CLI is not installed.
        undeploy_args = []
        jupyterhub_apps = json.loads(await capture("kubectl", "get", "pod", "-lapp=jupyterhub", "-o", "json"))
        if len(jupyterhub_apps["items"]) > 0:
            undeploy_args.append("--ide")

        # ignore errors here because most likely no cluster is just deployed
        # yet
        await run("pachctl", "undeploy", "--metadata", *undeploy_args, stdin="y\n", raise_on_error=False)
        # clear out resources not removed from the undeploy process
        await run("kubectl", "delete", ",".join(DELETABLE_RESOURCES), "-l", "suite=pachyderm")

    async def push_image(self, images):
        # No-op by default: the base driver assumes the cluster can already
        # see locally-built images. Subclasses push to minikube or a registry.
        pass

    def deploy_args(self):
        # We use hostpaths for storage. On docker for mac and minikube,
        # hostpaths aren't cleared until the VM is restarted. Because of this
        # behavior, re-deploying on the same hostpath without a restart will
        # cause us to bring up a new pachyderm cluster with access to the old
        # cluster volume, causing a bad state. This works around the issue by
        # just using a different hostpath on every deployment.
        host_path = Path("/var") / f"pachyderm-{secrets.token_hex(5)}"
        return ["local", "-d", "--no-guaranteed", f"--host-path={host_path}"]

    async def deploy(self, dash, ide, builder_images):
        """Generate manifests via `pachctl deploy --dry-run`, pre-pull/push
        the images they reference, apply the manifests, wait for pachd to
        respond, and optionally deploy the IDE."""
        deploy_args = ["pachctl", "deploy", *self.deploy_args(), "--dry-run", "--create-context", "--log-level=debug"]
        if not dash:
            deploy_args.append("--no-dashboard")
        if os.environ.get("STORAGE_V2") == "true":
            deploy_args.append("--storage-v2")

        deployments_str = await capture(*deploy_args)
        # The dry-run output is a stream of newline-separated JSON objects;
        # rewrite the boundaries so it parses as one JSON array.
        deployments_json = json.loads("[{}]".format(NEWLINE_SEPARATE_OBJECTS_PATTERN.sub("},{", deployments_str)))

        # Locate the dash and grpc-proxy container specs (if present) so
        # their images can be pre-pulled below.
        dash_spec = find_in_json(deployments_json, lambda j: \
            isinstance(j, dict) and j.get("name") == "dash" and j.get("image") is not None)
        grpc_proxy_spec = find_in_json(deployments_json, lambda j: \
            isinstance(j, dict) and j.get("name") == "grpc-proxy")

        pull_images = [run("docker", "pull", ETCD_IMAGE)]
        if dash_spec is not None:
            pull_images.append(run("docker", "pull", dash_spec["image"]))
        if grpc_proxy_spec is not None:
            pull_images.append(run("docker", "pull", grpc_proxy_spec["image"]))
        await asyncio.gather(*pull_images)

        push_images = [ETCD_IMAGE, "pachyderm/pachd:local", "pachyderm/worker:local", *builder_images]
        if dash_spec is not None:
            push_images.append(dash_spec["image"])
        if grpc_proxy_spec is not None:
            push_images.append(grpc_proxy_spec["image"])

        await asyncio.gather(*[self.push_image(i) for i in push_images])
        await run("kubectl", "create", "-f", "-", stdin=deployments_str)

        # Wait for pachd to come up before doing anything else against it.
        await retry(ping, attempts=60)

        if ide:
            await asyncio.gather(*[self.push_image(i) for i in [IDE_USER_IMAGE, IDE_HUB_IMAGE]])

            # The IDE requires the enterprise key and auth to be activated
            # before it can be deployed.
            await run("pachctl", "enterprise", "activate", stdin=os.environ["PACH_ENTERPRISE_KEY"])
            await run("pachctl", "auth", "activate", stdin="admin\n")
            await run("pachctl", "deploy", "ide",
                "--user-image", self.image(IDE_USER_IMAGE),
                "--hub-image", self.image(IDE_HUB_IMAGE),
            )
   118  
   119  class MinikubeDriver(BaseDriver):
   120      async def reset(self):
   121          is_minikube_running = True
   122  
   123          async def minikube_status():
   124              await run("minikube", "status", capture_output=True)
   125  
   126          if MINIKUBE_RUN_FILE.exists():
   127              try:
   128                  await minikube_status()
   129              except:
   130                  is_minikube_running = False
   131          else:
   132              await run("minikube", "delete")
   133              is_minikube_running = False
   134  
   135          if not is_minikube_running:
   136              await run("minikube", "start")
   137              await retry(minikube_status)
   138              MINIKUBE_RUN_FILE.touch()
   139  
   140          await super().reset()
   141  
   142      async def push_image(self, image):
   143          await run("./etc/kube/push-to-minikube.sh", image)
   144  
   145      async def deploy(self, dash, ide, builder_images):
   146          await super().deploy(dash, ide, builder_images)
   147  
   148          # enable direct connect
   149          ip = (await capture("minikube", "ip")).strip()
   150          await run("pachctl", "config", "update", "context", f"--pachd-address={ip}:30650")
   151  
   152          # the config update above will cause subsequent deploys to create a
   153          # new context, so to prevent the config from growing in size on every
   154          # deploy, we'll go ahead and delete any "orphaned" contexts now
   155          for line in (await capture("pachctl", "config", "list", "context")).strip().split("\n")[1:]:
   156              context = line.strip()
   157              if context.startswith("local"):
   158                  await run("pachctl", "config", "delete", "context", context)
   159  
   160  class GCPDriver(BaseDriver):
   161      def __init__(self, project_id, cluster_name=None):
   162          if cluster_name is None:
   163              cluster_name = f"pach-{secrets.token_hex(5)}"
   164          self.cluster_name = cluster_name
   165          self.object_storage_name = f"{cluster_name}-storage"
   166          self.project_id = project_id
   167  
   168      def image(self, name):
   169          return f"gcr.io/{self.project_id}/{name}"
   170  
   171      async def reset(self):
   172          cluster_exists = (await run("gcloud", "container", "clusters", "describe", self.cluster_name,
   173              raise_on_error=False, capture_output=True)).rc == 0
   174  
   175          if cluster_exists:
   176              await super().reset()
   177          else:
   178              await run("gcloud", "config", "set", "container/cluster", self.cluster_name)
   179              await run("gcloud", "container", "clusters", "create", self.cluster_name, "--scopes=storage-rw",
   180                  "--machine-type=n1-standard-8", "--num-nodes=2")
   181  
   182              account = (await capture("gcloud", "config", "get-value", "account")).strip()
   183              await run("kubectl", "create", "clusterrolebinding", "cluster-admin-binding",
   184                  "--clusterrole=cluster-admin", f"--user={account}")
   185  
   186              await run("gsutil", "mb", f"gs://{self.object_storage_name}")
   187  
   188              docker_config_path = Path.home() / ".docker" / "config.json"
   189              await run("kubectl", "create", "secret", "generic", "regcred",
   190                  f"--from-file=.dockerconfigjson={docker_config_path}",
   191                  "--type=kubernetes.io/dockerconfigjson")
   192  
   193      async def push_image(self, image):
   194          image_url = self.image(image)
   195          if ":local" in image_url:
   196              image_url = image_url.replace(":local", ":" + (await get_client_version()))
   197          await run("docker", "tag", image, image_url)
   198          await run("docker", "push", image_url)
   199  
   200      def deploy_args(self):
   201          return ["google", self.object_storage_name, "32", "--dynamic-etcd-nodes=1", "--image-pull-secret=regcred",
   202              f"--registry=gcr.io/{self.project_id}"]
   203  
   204  class HubApiError(Exception):
   205      def __init__(self, errors):
   206          def get_message(error):
   207              try:
   208                  return f"{error['title']}: {error['detail']}"
   209              except KeyError:
   210                  return json.dumps(error)
   211  
   212          if len(errors) > 1:
   213              message = ["multiple errors:"]
   214              for error in errors:
   215                  message.append(f"- {get_message(error)}")
   216              message = "\n".join(message)
   217          else:
   218              message = get_message(errors[0])
   219  
   220          super().__init__(message)
   221          self.errors = errors
   222  
   223  class HubDriver:
   224      def __init__(self, api_key, org_id, cluster_name):
   225          self.api_key = api_key
   226          self.org_id = org_id
   227          self.cluster_name = cluster_name
   228          self.old_cluster_names = []
   229  
   230      def request(self, method, endpoint, body=None):
   231          headers = {
   232              "Authorization": f"Api-Key {self.api_key}",
   233          }
   234          if body is not None:
   235              body = json.dumps({
   236                  "data": {
   237                      "attributes": body,
   238                  }
   239              })
   240  
   241          conn = http.client.HTTPSConnection(f"hub.pachyderm.com")
   242          conn.request(method, f"/api/v1{endpoint}", headers=headers, body=body)
   243          response = conn.getresponse()
   244          j = json.load(response)
   245  
   246          if "errors" in j:
   247              raise HubApiError(j["errors"])
   248          
   249          return j["data"]
   250  
   251      async def push_image(self, src):
   252          dst = src.replace(":local", ":" + (await get_client_version()))
   253          await run("docker", "tag", src, dst)
   254          await run("docker", "push", dst)
   255  
   256      async def reset(self):
   257          if self.cluster_name is None:
   258              return
   259  
   260          for pach in self.request("GET", f"/organizations/{self.org_id}/pachs?limit=100"):
   261              if pach["attributes"]["name"].startswith(f"{self.cluster_name}-"):
   262                  self.request("DELETE", f"/organizations/{self.org_id}/pachs/{pach['id']}")
   263                  self.old_cluster_names.append(pach["attributes"]["name"])
   264  
   265      async def deploy(self, dash, ide, builder_images):
   266          if ide:
   267              raise Exception("cannot deploy IDE in hub")
   268          if len(builder_images):
   269              raise Exception("cannot deploy builder images")
   270  
   271          await asyncio.gather(
   272              self.push_image("pachyderm/pachd:local"),
   273              self.push_image("pachyderm/worker:local"),
   274          )
   275  
   276          response = self.request("POST", f"/organizations/{self.org_id}/pachs", body={
   277              "name": self.cluster_name or "sandbox",
   278              "pachVersion": await get_client_version(),
   279          })
   280  
   281          cluster_name = response["attributes"]["name"]
   282          gke_name = response["attributes"]["gkeName"]
   283          pach_id = response["id"]
   284  
   285          await run("pachctl", "config", "set", "context", cluster_name, stdin=json.dumps({
   286              "source": 2,
   287              "pachd_address": f"grpcs://{gke_name}.clusters.pachyderm.io:31400",
   288          }))
   289  
   290          await run("pachctl", "config", "set", "active-context", cluster_name)
   291  
   292          # hack-ey way to clean up the old contexts, now that the active
   293          # context has been swapped to the new cluster
   294          await asyncio.gather(*[
   295              run("pachctl", "config", "delete", "context", n, raise_on_error=False) for n in self.old_cluster_names
   296          ])
   297  
   298          await retry(ping, attempts=100)
   299  
   300          async def get_otp():
   301              response = self.request("GET", f"/organizations/{self.org_id}/pachs/{pach_id}/otps")
   302              return response["attributes"]["otp"]
   303          otp = await retry(get_otp, sleep=5)
   304  
   305          await run("pachctl", "auth", "login", "--one-time-password", stdin=f"{otp}\n")
   306  
   307  async def run(cmd, *args, raise_on_error=True, stdin=None, capture_output=False, timeout=None, cwd=None):
   308      print_status("running: `{} {}`".format(cmd, " ".join(args)))
   309  
   310      proc = await asyncio.create_subprocess_exec(
   311          cmd, *args,
   312          stdin=asyncio.subprocess.PIPE if stdin is not None else None,
   313          stdout=asyncio.subprocess.PIPE if capture_output else None,
   314          stderr=asyncio.subprocess.PIPE if capture_output else None,
   315          cwd=cwd,
   316      )
   317      
   318      future = proc.communicate(input=stdin.encode("utf8") if stdin is not None else None)
   319      result = await (future if timeout is None else asyncio.wait_for(future, timeout=timeout))
   320  
   321      if capture_output:
   322          stdout, stderr = result
   323          stdout = stdout.decode("utf8")
   324          stderr = stderr.decode("utf8")
   325      else:
   326          stdout, stderr = None, None
   327  
   328      if raise_on_error and proc.returncode:
   329          raise Exception(f"unexpected return code from `{cmd}`: {proc.returncode}")
   330  
   331      return RunResult(rc=proc.returncode, stdout=stdout, stderr=stderr)
   332  
   333  async def capture(cmd, *args, **kwargs):
   334      _, stdout, _ = await run(cmd, *args, capture_output=True, **kwargs)
   335      return stdout
   336  
   337  def find_in_json(j, f):
   338      if f(j):
   339          return j
   340  
   341      iter = None
   342      if isinstance(j, dict):
   343          iter = j.values()
   344      elif isinstance(j, list):
   345          iter = j
   346  
   347      if iter is not None:
   348          for sub_j in iter:
   349              v = find_in_json(sub_j, f)
   350              if v is not None:
   351                  return v
   352  
   353  def print_status(status):
   354      print(f"===> {status}")
   355  
   356  async def retry(f, attempts=10, sleep=1.0):
   357      """
   358      Repeatedly retries operation up to `attempts` times, with a given `sleep`
   359      between runs.
   360      """
   361      for i in range(attempts):
   362          try:
   363              return await f()
   364          except:
   365              if i == attempts - 1:
   366                  raise
   367              await asyncio.sleep(sleep)
   368  
   369  async def ping():
   370      await run("pachctl", "version", capture_output=True, timeout=5)
   371  
   372  async def main():
   373      parser = argparse.ArgumentParser(description="Resets a pachyderm cluster.")
   374      parser.add_argument("--target", default="", help="Where to deploy")
   375      parser.add_argument("--dash", action="store_true", help="Deploy dash")
   376      parser.add_argument("--ide", action="store_true", help="Deploy IDE")
   377      parser.add_argument("--builders", action="store_true", help="Deploy images used in pipeline builds")
   378      args = parser.parse_args()
   379  
   380      if "GOPATH" not in os.environ:
   381          raise Exception("Must set GOPATH")
   382      if "PACH_CA_CERTS" in os.environ:
   383          raise Exception("Must unset PACH_CA_CERTS\nRun:\nunset PACH_CA_CERTS")
   384      if args.ide and "PACH_ENTERPRISE_KEY" not in os.environ:
   385          raise Exception("Must set PACH_ENTERPRISE_KEY")
   386  
   387      driver = None
   388  
   389      if args.target == "":
   390          # derive which driver to use from the k8s context name
   391          kube_context = await capture("kubectl", "config", "current-context", raise_on_error=False)
   392          kube_context = kube_context.strip() if kube_context else ""
   393          if kube_context == "minikube":
   394              print_status("using the minikube driver")
   395              driver = MinikubeDriver()
   396          elif kube_context == "docker-desktop":
   397              print_status("using the base driver")
   398              driver = BaseDriver()
   399          if driver is None:
   400              # minikube won't set the k8s context if the VM isn't running. This
   401              # checks for the presence of the minikube executable as an
   402              # alternate means.
   403              try:
   404                  await run("minikube", "version", capture_output=True)
   405              except:
   406                  pass
   407              else:
   408                  driver = MinikubeDriver()
   409          if driver is None:
   410              raise Exception(f"could not derive driver from context name: {kube_context}")
   411      elif args.target == "minikube":
   412          print_status("using the minikube driver")
   413          driver = MinikubeDriver()
   414      elif args.target == "base":
   415          print_status("using the base driver")
   416          driver = BaseDriver()
   417      elif args.target.startswith("gcp"):
   418          print_status("using the gcp driver")
   419          project_id = (await capture("gcloud", "config", "get-value", "project")).strip()
   420          target_parts = args.target.split(":", maxsplit=1)
   421          cluster_name = target_parts[1] if len(target_parts) == 2 else None
   422          driver = GCPDriver(project_id, cluster_name)
   423      elif args.target.startswith("hub"):
   424          print_status("using the hub driver")
   425          target_parts = args.target.split(":", maxsplit=1)
   426          cluster_name = target_parts[1] if len(target_parts) == 2 else None
   427          driver = HubDriver(os.environ["PACH_HUB_API_KEY"], os.environ["PACH_HUB_ORG_ID"], cluster_name)
   428      else:
   429          raise Exception(f"unknown target: {args.target}")
   430  
   431      await asyncio.gather(
   432          run("make", "docker-build"),
   433          run("make", "install"),
   434          driver.reset(),
   435      )
   436  
   437      builder_images = []
   438      if args.builders:
   439          procs = []
   440          version = await get_client_version()
   441          for language in (d for d in os.listdir(PIPELINE_BUILD_DIR) if os.path.isdir(os.path.join(PIPELINE_BUILD_DIR, d))):
   442              builder_image = f"pachyderm/{language}-build:{version}"
   443              procs.append(run("docker", "build", "-t", builder_image, ".", cwd=os.path.join(PIPELINE_BUILD_DIR, language)))
   444              builder_images.append(builder_image)
   445          await asyncio.gather(*procs)
   446      
   447      await driver.deploy(args.dash, args.ide, builder_images)
   448  
# Script entry point. Debug mode makes asyncio report issues such as
# never-awaited coroutines while this long orchestration runs.
if __name__ == "__main__":
    asyncio.run(main(), debug=True)