github.com/filecoin-project/bacalhau@v0.3.23-0.20230228154132-45c989550ace/integration/airflow/bacalhau_airflow/operators.py

"""
Airflow operators for Bacalhau.
"""
import time
from typing import Optional

from openlineage.airflow.extractors.base import OperatorLineage
from openlineage.client.facet import BaseFacet
from openlineage.client.run import Dataset

from airflow.compat.functools import cached_property
from airflow.models import BaseOperator
from airflow.models.baseoperator import BaseOperatorLink
from airflow.models.taskinstance import TaskInstanceKey
from airflow.utils.context import Context
from bacalhau_airflow.hooks import BacalhauHook


class BacalhauLink(BaseOperatorLink):
    """Link to the Bacalhau service."""

    name = "Bacalhau"

    def get_link(self, operator: BaseOperator, *, ti_key: TaskInstanceKey):
        """Get the URL of the Bacalhau public documentation."""
        return "https://docs.bacalhau.org/"


class BacalhauSubmitJobOperator(BaseOperator):
    """Submit a job to the Bacalhau service."""

    ui_color = "#36cbfa"
    ui_fgcolor = "#0554f9"
    custom_operator_name = "BacalhauSubmitJob"

    template_fields = ("input_volumes",)

    def __init__(
        self,
        api_version: str,
        job_spec: dict,
        # inputs: dict = None,
        input_volumes: Optional[list] = None,
        **kwargs,
    ) -> None:
        """Construct an operator that submits a Bacalhau job.

        Args:
            api_version (str): The API version to use. Example: "V1beta1".
            job_spec (dict): A dictionary with the job specification. See the example DAGs for more details.
            input_volumes (list, optional):
                Use this parameter to pipe an upstream task's output into a Bacalhau task.

                This makes use of Airflow's XComs to support communication between tasks.
                Learn more about XComs here: https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/xcoms.html

                Every `BacalhauSubmitJobOperator` task stores an XCom key-value named `cids` (type `str`), a comma-separated list of the output shards' CIDs.
                That way, a downstream task can use the `input_volumes` parameter to mount the upstream task's output shards as its own input volumes.

                The format of this parameter is a list of strings, where each string is a `cid` and a `mount_point` separated by a colon.
                Defaults to an empty list.

                For example, the list `[ "{{ task_instance.xcom_pull(task_ids='run-1', key='cids') }}:/datasets" ]` takes all shards created by task "run-1" and mounts them at "/datasets".
        """
        super().__init__(**kwargs)
        # On-start properties
        self.api_version = api_version
        self.job_spec = job_spec
        # Default to None instead of a mutable default argument, then normalize to a list.
        self.input_volumes = input_volumes if input_volumes is not None else []
        # On-complete properties
        self.bacalhau_job_id = ""

    def execute(self, context: Context) -> str:
        """Execute the operator.

        Args:
            context (Context): The Airflow task context.

        Returns:
            str: The ID of the job that was created.
        """

        # TODO do the same for inputs?
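
        # Illustration of the expansion performed below (the CIDs are made up):
        # an `input_volumes` entry such as
        #     "QmAaa,QmBbb:/datasets"
        # is unravelled into two entries appended to `job_spec["inputs"]`:
        #     {"cid": "QmAaa", "path": "/datasets", "storagesource": "ipfs"}
        #     {"cid": "QmBbb", "path": "/datasets", "storagesource": "ipfs"}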

        # TODO handle the case when 1+ CIDs are passed in input_volumes and must be mounted at distinct child mount points
        # 'failed to create container: Error response from daemon: Duplicate

        unravelled_input_volumes = []
        if self.input_volumes:
            for input_volume in self.input_volumes:
                if isinstance(input_volume, str):
                    cids_str, mount_point = input_volume.split(":")
                    # A single entry may carry several comma-separated CIDs.
                    for cid in cids_str.split(","):
                        unravelled_input_volumes.append(
                            {
                                "cid": cid,
                                "path": mount_point,
                                "storagesource": "ipfs",  # TODO make this configurable (filecoin, etc.)
                            }
                        )

        if unravelled_input_volumes:
            if "inputs" not in self.job_spec:
                self.job_spec["inputs"] = []
            self.job_spec["inputs"] = self.job_spec["inputs"] + unravelled_input_volumes

        self.log.info("job_spec: %s", self.job_spec)

        job_id = self.hook.submit_job(
            api_version=self.api_version, job_spec=self.job_spec
        )
        self.bacalhau_job_id = job_id
        context["ti"].xcom_push(key="bacalhau_job_id", value=job_id)
        self.log.info("bacalhau_job_id: %s", job_id)

        # use the hook to wait for the job to complete
        # TODO move this logic to a hook
        while True:
            events = self.hook.get_events(job_id)

            terminate = False
            for event in events["events"]:
                self.log.debug("event: %s", event)
                if "event_name" in event:
                    # TODO handle the case when an event hangs/errors out/never completes
                    if event["event_name"] in (
                        "ComputeError",
                        "Error",
                        "ResultsPublished",
                        "Completed",
                    ):
                        terminate = True
                        break
            if terminate:
                break
            self.log.info("waiting for job %s to complete...", job_id)
            time.sleep(2)

        # fetch all shards' resulting CIDs and join them into a comma-separated string
        results = self.hook.get_results(job_id)
        cids_str = ",".join(result["data"]["cid"] for result in results)
        context["ti"].xcom_push(key="cids", value=cids_str)

        return job_id

    @cached_property
    def hook(self):
        """Create and return a BacalhauHook (cached)."""
        return BacalhauHook()

    def get_hook(self):
        """Return the cached BacalhauHook."""
        return self.hook
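
    # Payload shapes assumed by execute() above, inferred from its accesses; the
    # authoritative schema is defined by the Bacalhau API exposed through BacalhauHook:
    #
    #   hook.get_events(job_id)  -> {"events": [{"event_name": "Completed", ...}, ...]}
    #   hook.get_results(job_id) -> [{"data": {"cid": "Qm...", ...}, ...}]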

    # get_openlineage_facets_on_start() is run by OpenLineage/Marquez before the execute() function is run,
    # allowing metadata to be collected before the task executes.
    # Implementation details can be found in the OpenLineage docs: https://openlineage.io/docs/integrations/airflow/operator#implementation
    # TODO this piece of code has not been tested and should be refactored before being used
    # (it references attributes such as self.command, and os.getenv without importing os, which this module does not currently define).
    # def get_openlineage_facets_on_start(self) -> OperatorLineage:
    #     return OperatorLineage(
    #         inputs=[
    #             Dataset(
    #                 namespace=f'{os.getenv("BACALHAU_API_HOST")}:1234',
    #                 name="inputs",
    #                 facets={
    #                     "command": self.command,
    #                     "concurrency": self.concurrency,
    #                     "dry_run": self.dry_run,
    #                     "env": self.env,
    #                     "gpu": self.gpu,
    #                     "input_urls": self.input_urls,
    #                     "input_volumes": self.input_volumes,
    #                     "inputs": self.inputs,
    #                     "output_volumes": self.output_volumes,
    #                     "publisher": self.publisher,
    #                     "workdir": self.workdir,
    #                 },
    #             )
    #         ],
    #         outputs=[],
    #         run_facets={},
    #         job_facets={},
    #     )
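

# Example usage (illustrative sketch only, not part of this module): the DAG id,
# task ids, dates, and job_spec values below are hypothetical; refer to the example
# DAGs shipped with this package for tested, complete job specifications. The sketch
# chains two BacalhauSubmitJobOperator tasks; the second mounts the first task's
# output shards by pulling its `cids` XCom through the templated `input_volumes` field.
#
# from datetime import datetime
#
# from airflow import DAG
# from bacalhau_airflow.operators import BacalhauSubmitJobOperator
#
# with DAG("bacalhau_demo", start_date=datetime(2023, 1, 1), schedule_interval=None) as dag:
#     run_1 = BacalhauSubmitJobOperator(
#         task_id="run-1",
#         api_version="V1beta1",
#         job_spec={
#             # illustrative Docker job; adjust to the schema your Bacalhau version expects
#             "engine": "Docker",
#             "verifier": "Noop",
#             "publisher": "IPFS",
#             "docker": {"image": "ubuntu", "entrypoint": ["echo", "hello"]},
#             "deal": {"concurrency": 1},
#         },
#     )
#     run_2 = BacalhauSubmitJobOperator(
#         task_id="run-2",
#         api_version="V1beta1",
#         input_volumes=[
#             "{{ task_instance.xcom_pull(task_ids='run-1', key='cids') }}:/datasets",
#         ],
#         job_spec={
#             "engine": "Docker",
#             "verifier": "Noop",
#             "publisher": "IPFS",
#             "docker": {"image": "ubuntu", "entrypoint": ["ls", "/datasets"]},
#             "deal": {"concurrency": 1},
#         },
#     )
#     run_1 >> run_2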