#!/usr/bin/env python3
"""Resets a pachyderm cluster for development.

Rebuilds docker images and pachctl, tears down any existing deployment, and
re-deploys pachyderm onto one of several targets (minikube, docker-desktop,
GCP, or hub), each implemented as a driver class.
"""

import os
import re
import json
import asyncio
import secrets
import argparse
import collections
import http.client
from pathlib import Path

ETCD_IMAGE = "pachyderm/etcd:v3.3.5"
IDE_USER_IMAGE = "pachyderm/ide-user:local"
IDE_HUB_IMAGE = "pachyderm/ide-hub:local"
PIPELINE_BUILD_DIR = "etc/pipeline-build"

# Resources that `pachctl undeploy` leaves behind and must be deleted manually.
DELETABLE_RESOURCES = [
    "roles.rbac.authorization.k8s.io",
    "rolebindings.rbac.authorization.k8s.io"
]

# `pachctl deploy --dry-run` emits newline-separated JSON objects; this pattern
# finds the boundaries so the stream can be rewritten into one JSON array.
NEWLINE_SEPARATE_OBJECTS_PATTERN = re.compile(r"\}\n+\{", re.MULTILINE)

# Path to file used for ensuring minikube doesn't need to be deleted.
# With newer versions of minikube, cluster state (host paths, pods, etc.) is
# persisted across host system restarts, but credentials aren't, causing
# permissions failures on k8s admin calls. We use this file to reference
# whether minikube has been started since the last host system restart, which
# will allow us to figure out whether to reset the cluster state so we don't
# get the permissions errors. It's stored in `/tmp` because that directory
# is wiped on every system restart, and doesn't require root to write to.
MINIKUBE_RUN_FILE = Path("/tmp/pachyderm-minikube-reset")

RunResult = collections.namedtuple("RunResult", ["rc", "stdout", "stderr"])

client_version = None
async def get_client_version():
    """Return the pachctl client version string, caching it after first use."""
    global client_version
    if client_version is None:
        client_version = (await capture("pachctl", "version", "--client-only")).strip()
    return client_version

class BaseDriver:
    """Deploys to the cluster of the current kubectl context (docker-desktop)."""

    def image(self, name):
        """Return the image reference to use for `name` on this target."""
        return name

    async def reset(self):
        """Undeploy any existing pachyderm cluster and leftover resources."""
        # Check for the presence of the pachyderm IDE to see whether it should
        # be undeployed too. Using kubectl rather than helm here because
        # this'll work even if the helm CLI is not installed.
        undeploy_args = []
        jupyterhub_apps = json.loads(await capture("kubectl", "get", "pod", "-lapp=jupyterhub", "-o", "json"))
        if len(jupyterhub_apps["items"]) > 0:
            undeploy_args.append("--ide")

        # ignore errors here because most likely no cluster is just deployed
        # yet
        await run("pachctl", "undeploy", "--metadata", *undeploy_args, stdin="y\n", raise_on_error=False)
        # clear out resources not removed from the undeploy process
        await run("kubectl", "delete", ",".join(DELETABLE_RESOURCES), "-l", "suite=pachyderm")

    async def push_image(self, images):
        """Make an image available to the cluster; no-op on this target."""
        pass

    def deploy_args(self):
        """Return target-specific arguments for `pachctl deploy`."""
        # We use hostpaths for storage. On docker for mac and minikube,
        # hostpaths aren't cleared until the VM is restarted. Because of this
        # behavior, re-deploying on the same hostpath without a restart will
        # cause us to bring up a new pachyderm cluster with access to the old
        # cluster volume, causing a bad state. This works around the issue by
        # just using a different hostpath on every deployment.
        host_path = Path("/var") / f"pachyderm-{secrets.token_hex(5)}"
        return ["local", "-d", "--no-guaranteed", f"--host-path={host_path}"]

    async def deploy(self, dash, ide, builder_images):
        """Deploy pachyderm (and optionally the dash/IDE) onto the cluster."""
        deploy_args = ["pachctl", "deploy", *self.deploy_args(), "--dry-run", "--create-context", "--log-level=debug"]
        if not dash:
            deploy_args.append("--no-dashboard")
        if os.environ.get("STORAGE_V2") == "true":
            deploy_args.append("--storage-v2")

        deployments_str = await capture(*deploy_args)
        deployments_json = json.loads("[{}]".format(NEWLINE_SEPARATE_OBJECTS_PATTERN.sub("},{", deployments_str)))

        dash_spec = find_in_json(deployments_json, lambda j:
            isinstance(j, dict) and j.get("name") == "dash" and j.get("image") is not None)
        grpc_proxy_spec = find_in_json(deployments_json, lambda j:
            isinstance(j, dict) and j.get("name") == "grpc-proxy")

        pull_images = [run("docker", "pull", ETCD_IMAGE)]
        if dash_spec is not None:
            pull_images.append(run("docker", "pull", dash_spec["image"]))
        if grpc_proxy_spec is not None:
            pull_images.append(run("docker", "pull", grpc_proxy_spec["image"]))
        await asyncio.gather(*pull_images)

        push_images = [ETCD_IMAGE, "pachyderm/pachd:local", "pachyderm/worker:local", *builder_images]
        if dash_spec is not None:
            push_images.append(dash_spec["image"])
        if grpc_proxy_spec is not None:
            push_images.append(grpc_proxy_spec["image"])

        await asyncio.gather(*[self.push_image(i) for i in push_images])
        await run("kubectl", "create", "-f", "-", stdin=deployments_str)

        await retry(ping, attempts=60)

        if ide:
            await asyncio.gather(*[self.push_image(i) for i in [IDE_USER_IMAGE, IDE_HUB_IMAGE]])

            await run("pachctl", "enterprise", "activate", stdin=os.environ["PACH_ENTERPRISE_KEY"])
            await run("pachctl", "auth", "activate", stdin="admin\n")
            await run("pachctl", "deploy", "ide",
                "--user-image", self.image(IDE_USER_IMAGE),
                "--hub-image", self.image(IDE_HUB_IMAGE),
            )

class MinikubeDriver(BaseDriver):
    """Deploys to a local minikube cluster, starting/resetting it as needed."""

    async def reset(self):
        is_minikube_running = True

        async def minikube_status():
            await run("minikube", "status", capture_output=True)

        if MINIKUBE_RUN_FILE.exists():
            try:
                await minikube_status()
            except Exception:
                is_minikube_running = False
        else:
            # minikube hasn't been started since the last host restart, so its
            # credentials are stale (see MINIKUBE_RUN_FILE comment) -- delete.
            await run("minikube", "delete")
            is_minikube_running = False

        if not is_minikube_running:
            await run("minikube", "start")
            await retry(minikube_status)
            MINIKUBE_RUN_FILE.touch()

        await super().reset()

    async def push_image(self, image):
        await run("./etc/kube/push-to-minikube.sh", image)

    async def deploy(self, dash, ide, builder_images):
        await super().deploy(dash, ide, builder_images)

        # enable direct connect
        ip = (await capture("minikube", "ip")).strip()
        await run("pachctl", "config", "update", "context", f"--pachd-address={ip}:30650")

        # the config update above will cause subsequent deploys to create a
        # new context, so to prevent the config from growing in size on every
        # deploy, we'll go ahead and delete any "orphaned" contexts now
        for line in (await capture("pachctl", "config", "list", "context")).strip().split("\n")[1:]:
            context = line.strip()
            if context.startswith("local"):
                await run("pachctl", "config", "delete", "context", context)

class GCPDriver(BaseDriver):
    """Deploys to a GKE cluster, creating it (and a GCS bucket) if needed."""

    def __init__(self, project_id, cluster_name=None):
        if cluster_name is None:
            cluster_name = f"pach-{secrets.token_hex(5)}"
        self.cluster_name = cluster_name
        self.object_storage_name = f"{cluster_name}-storage"
        self.project_id = project_id

    def image(self, name):
        return f"gcr.io/{self.project_id}/{name}"

    async def reset(self):
        cluster_exists = (await run("gcloud", "container", "clusters", "describe", self.cluster_name,
            raise_on_error=False, capture_output=True)).rc == 0

        if cluster_exists:
            await super().reset()
        else:
            await run("gcloud", "config", "set", "container/cluster", self.cluster_name)
            await run("gcloud", "container", "clusters", "create", self.cluster_name, "--scopes=storage-rw",
                "--machine-type=n1-standard-8", "--num-nodes=2")

            account = (await capture("gcloud", "config", "get-value", "account")).strip()
            await run("kubectl", "create", "clusterrolebinding", "cluster-admin-binding",
                "--clusterrole=cluster-admin", f"--user={account}")

            await run("gsutil", "mb", f"gs://{self.object_storage_name}")

            docker_config_path = Path.home() / ".docker" / "config.json"
            await run("kubectl", "create", "secret", "generic", "regcred",
                f"--from-file=.dockerconfigjson={docker_config_path}",
                "--type=kubernetes.io/dockerconfigjson")

    async def push_image(self, image):
        image_url = self.image(image)
        if ":local" in image_url:
            # gcr can't use the mutable ":local" tag; retag with the version
            image_url = image_url.replace(":local", ":" + (await get_client_version()))
        await run("docker", "tag", image, image_url)
        await run("docker", "push", image_url)

    def deploy_args(self):
        return ["google", self.object_storage_name, "32", "--dynamic-etcd-nodes=1", "--image-pull-secret=regcred",
            f"--registry=gcr.io/{self.project_id}"]

class HubApiError(Exception):
    """Error(s) returned by the pachyderm hub REST API."""

    def __init__(self, errors):
        def get_message(error):
            try:
                return f"{error['title']}: {error['detail']}"
            except KeyError:
                return json.dumps(error)

        if len(errors) > 1:
            message = ["multiple errors:"]
            for error in errors:
                message.append(f"- {get_message(error)}")
            message = "\n".join(message)
        else:
            message = get_message(errors[0])

        super().__init__(message)
        self.errors = errors

class HubDriver:
    """Deploys to pachyderm hub via its REST API (no local k8s cluster)."""

    def __init__(self, api_key, org_id, cluster_name):
        self.api_key = api_key
        self.org_id = org_id
        self.cluster_name = cluster_name
        self.old_cluster_names = []

    def request(self, method, endpoint, body=None):
        """Issue an authenticated hub API request; return the `data` payload.

        Raises HubApiError if the response contains an `errors` key.
        """
        headers = {
            "Authorization": f"Api-Key {self.api_key}",
        }
        if body is not None:
            body = json.dumps({
                "data": {
                    "attributes": body,
                }
            })

        conn = http.client.HTTPSConnection("hub.pachyderm.com")
        conn.request(method, f"/api/v1{endpoint}", headers=headers, body=body)
        response = conn.getresponse()
        j = json.load(response)

        if "errors" in j:
            raise HubApiError(j["errors"])

        return j["data"]

    async def push_image(self, src):
        # hub can't use the mutable ":local" tag; retag with the version
        dst = src.replace(":local", ":" + (await get_client_version()))
        await run("docker", "tag", src, dst)
        await run("docker", "push", dst)

    async def reset(self):
        if self.cluster_name is None:
            return

        # delete any previous incarnations of this named cluster, remembering
        # their names so their pachctl contexts can be cleaned up post-deploy
        for pach in self.request("GET", f"/organizations/{self.org_id}/pachs?limit=100"):
            if pach["attributes"]["name"].startswith(f"{self.cluster_name}-"):
                self.request("DELETE", f"/organizations/{self.org_id}/pachs/{pach['id']}")
                self.old_cluster_names.append(pach["attributes"]["name"])

    async def deploy(self, dash, ide, builder_images):
        if ide:
            raise Exception("cannot deploy IDE in hub")
        if len(builder_images):
            raise Exception("cannot deploy builder images")

        await asyncio.gather(
            self.push_image("pachyderm/pachd:local"),
            self.push_image("pachyderm/worker:local"),
        )

        response = self.request("POST", f"/organizations/{self.org_id}/pachs", body={
            "name": self.cluster_name or "sandbox",
            "pachVersion": await get_client_version(),
        })

        cluster_name = response["attributes"]["name"]
        gke_name = response["attributes"]["gkeName"]
        pach_id = response["id"]

        await run("pachctl", "config", "set", "context", cluster_name, stdin=json.dumps({
            "source": 2,
            "pachd_address": f"grpcs://{gke_name}.clusters.pachyderm.io:31400",
        }))

        await run("pachctl", "config", "set", "active-context", cluster_name)

        # hack-ey way to clean up the old contexts, now that the active
        # context has been swapped to the new cluster
        await asyncio.gather(*[
            run("pachctl", "config", "delete", "context", n, raise_on_error=False) for n in self.old_cluster_names
        ])

        await retry(ping, attempts=100)

        async def get_otp():
            response = self.request("GET", f"/organizations/{self.org_id}/pachs/{pach_id}/otps")
            return response["attributes"]["otp"]
        otp = await retry(get_otp, sleep=5)

        await run("pachctl", "auth", "login", "--one-time-password", stdin=f"{otp}\n")

async def run(cmd, *args, raise_on_error=True, stdin=None, capture_output=False, timeout=None, cwd=None):
    """Run a subprocess, optionally feeding stdin and capturing output.

    Returns a RunResult(rc, stdout, stderr); stdout/stderr are None unless
    `capture_output` is set. Raises on a nonzero exit unless `raise_on_error`
    is False.
    """
    print_status("running: `{} {}`".format(cmd, " ".join(args)))

    proc = await asyncio.create_subprocess_exec(
        cmd, *args,
        stdin=asyncio.subprocess.PIPE if stdin is not None else None,
        stdout=asyncio.subprocess.PIPE if capture_output else None,
        stderr=asyncio.subprocess.PIPE if capture_output else None,
        cwd=cwd,
    )

    future = proc.communicate(input=stdin.encode("utf8") if stdin is not None else None)
    result = await (future if timeout is None else asyncio.wait_for(future, timeout=timeout))

    if capture_output:
        stdout, stderr = result
        stdout = stdout.decode("utf8")
        stderr = stderr.decode("utf8")
    else:
        stdout, stderr = None, None

    if raise_on_error and proc.returncode:
        # include captured stderr (when available) so failures are debuggable
        message = f"unexpected return code from `{cmd}`: {proc.returncode}"
        if stderr:
            message = f"{message}\n{stderr}"
        raise Exception(message)

    return RunResult(rc=proc.returncode, stdout=stdout, stderr=stderr)

async def capture(cmd, *args, **kwargs):
    """Run a subprocess and return its stdout as a string."""
    _, stdout, _ = await run(cmd, *args, capture_output=True, **kwargs)
    return stdout

def find_in_json(j, f):
    """Depth-first search of a parsed-JSON tree; return the first node where
    `f(node)` is truthy, or None if there is no match."""
    if f(j):
        return j

    children = None
    if isinstance(j, dict):
        children = j.values()
    elif isinstance(j, list):
        children = j

    if children is not None:
        for sub_j in children:
            v = find_in_json(sub_j, f)
            if v is not None:
                return v

def print_status(status):
    """Print a highlighted status line."""
    print(f"===> {status}")

async def retry(f, attempts=10, sleep=1.0):
    """
    Repeatedly retries operation up to `attempts` times, with a given `sleep`
    between runs.
    """
    for i in range(attempts):
        try:
            return await f()
        except Exception:
            if i == attempts - 1:
                raise
            await asyncio.sleep(sleep)

async def ping():
    """Check that pachd is reachable; raises on failure or a 5s timeout."""
    await run("pachctl", "version", capture_output=True, timeout=5)

async def main():
    parser = argparse.ArgumentParser(description="Resets a pachyderm cluster.")
    parser.add_argument("--target", default="", help="Where to deploy")
    parser.add_argument("--dash", action="store_true", help="Deploy dash")
    parser.add_argument("--ide", action="store_true", help="Deploy IDE")
    parser.add_argument("--builders", action="store_true", help="Deploy images used in pipeline builds")
    args = parser.parse_args()

    if "GOPATH" not in os.environ:
        raise Exception("Must set GOPATH")
    if "PACH_CA_CERTS" in os.environ:
        raise Exception("Must unset PACH_CA_CERTS\nRun:\nunset PACH_CA_CERTS")
    if args.ide and "PACH_ENTERPRISE_KEY" not in os.environ:
        raise Exception("Must set PACH_ENTERPRISE_KEY")

    driver = None

    if args.target == "":
        # derive which driver to use from the k8s context name
        kube_context = await capture("kubectl", "config", "current-context", raise_on_error=False)
        kube_context = kube_context.strip() if kube_context else ""
        if kube_context == "minikube":
            print_status("using the minikube driver")
            driver = MinikubeDriver()
        elif kube_context == "docker-desktop":
            print_status("using the base driver")
            driver = BaseDriver()
        if driver is None:
            # minikube won't set the k8s context if the VM isn't running. This
            # checks for the presence of the minikube executable as an
            # alternate means.
            try:
                await run("minikube", "version", capture_output=True)
            except Exception:
                pass
            else:
                driver = MinikubeDriver()
        if driver is None:
            raise Exception(f"could not derive driver from context name: {kube_context}")
    elif args.target == "minikube":
        print_status("using the minikube driver")
        driver = MinikubeDriver()
    elif args.target == "base":
        print_status("using the base driver")
        driver = BaseDriver()
    elif args.target.startswith("gcp"):
        print_status("using the gcp driver")
        project_id = (await capture("gcloud", "config", "get-value", "project")).strip()
        target_parts = args.target.split(":", maxsplit=1)
        cluster_name = target_parts[1] if len(target_parts) == 2 else None
        driver = GCPDriver(project_id, cluster_name)
    elif args.target.startswith("hub"):
        print_status("using the hub driver")
        target_parts = args.target.split(":", maxsplit=1)
        cluster_name = target_parts[1] if len(target_parts) == 2 else None
        driver = HubDriver(os.environ["PACH_HUB_API_KEY"], os.environ["PACH_HUB_ORG_ID"], cluster_name)
    else:
        raise Exception(f"unknown target: {args.target}")

    await asyncio.gather(
        run("make", "docker-build"),
        run("make", "install"),
        driver.reset(),
    )

    builder_images = []
    if args.builders:
        procs = []
        version = await get_client_version()
        for language in (d for d in os.listdir(PIPELINE_BUILD_DIR) if os.path.isdir(os.path.join(PIPELINE_BUILD_DIR, d))):
            builder_image = f"pachyderm/{language}-build:{version}"
            procs.append(run("docker", "build", "-t", builder_image, ".", cwd=os.path.join(PIPELINE_BUILD_DIR, language)))
            builder_images.append(builder_image)
        await asyncio.gather(*procs)

    await driver.deploy(args.dash, args.ide, builder_images)

if __name__ == "__main__":
    asyncio.run(main(), debug=True)