github.com/filecoin-project/bacalhau@v0.3.23-0.20230228154132-45c989550ace/integration/airflow/bacalhau_airflow/operators.py

"""
Airflow operators for Bacalhau.
"""
import time
from typing import Optional

from openlineage.airflow.extractors.base import OperatorLineage
from openlineage.client.facet import BaseFacet
from openlineage.client.run import Dataset

from airflow.compat.functools import cached_property
from airflow.models import BaseOperator
from airflow.models.baseoperator import BaseOperatorLink
from airflow.models.taskinstance import TaskInstanceKey
from airflow.utils.context import Context
from bacalhau_airflow.hooks import BacalhauHook


class BacalhauLink(BaseOperatorLink):
    """Link to the Bacalhau service."""

    name = "Bacalhau"

    def get_link(self, operator: BaseOperator, *, ti_key: TaskInstanceKey):
        """Get the URL of the Bacalhau public documentation."""
        return "https://docs.bacalhau.org/"


class BacalhauSubmitJobOperator(BaseOperator):
    """Submit a job to the Bacalhau service."""

    ui_color = "#36cbfa"
    ui_fgcolor = "#0554f9"
    custom_operator_name = "BacalhauSubmitJob"

    template_fields = ("input_volumes",)

    def __init__(
        self,
        api_version: str,
        job_spec: dict,
        # inputs: dict = None,
        input_volumes: Optional[list] = None,
        **kwargs,
    ) -> None:
        """Construct an operator that submits a Bacalhau job.

        Args:
            api_version (str): The API version to use. Example: "V1beta1".
            job_spec (dict): A dictionary with the job specification. See the example DAGs for more details.
            input_volumes (list, optional):
                Use this parameter to pipe an upstream task's output into a Bacalhau task.

                This makes use of Airflow's XComs to support communication between tasks.
                Learn more about XComs here: https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/xcoms.html

                Every `BacalhauSubmitJobOperator` task stores an XCom key-value named `cids` (type `str`), a comma-separated list of the output shards' CIDs.
                That way, a downstream task can use the `input_volumes` parameter to mount the upstream task's output shards as its own input volumes.

                The format of this parameter is a list of strings, where each string is a `cid` and a `mount_point` separated by a colon.
                Defaults to an empty list.

                For example, the list `[ "{{ task_instance.xcom_pull(task_ids='run-1', key='cids') }}:/datasets" ]` takes all shards created by task "run-1" and mounts them at "/datasets".
        """
        super().__init__(**kwargs)
        # On-start properties
        self.api_version = api_version
        self.job_spec = job_spec
        # Default to None instead of a mutable default argument, then normalize to a list.
        self.input_volumes = input_volumes if input_volumes is not None else []
        # On-complete properties
        self.bacalhau_job_id = ""

    def execute(self, context: Context) -> str:
        """Execute the operator.

        Args:
            context (Context): The Airflow task context.

        Returns:
            str: The ID of the job that was created.
        """

        # TODO do the same for inputs?
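
        # Illustration of the expansion performed below (the CIDs are made up):
        # an `input_volumes` entry such as
        #     "QmAaa,QmBbb:/datasets"
        # is unravelled into two entries appended to `job_spec["inputs"]`:
        #     {"cid": "QmAaa", "path": "/datasets", "storagesource": "ipfs"}
        #     {"cid": "QmBbb", "path": "/datasets", "storagesource": "ipfs"}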

        # TODO handle the case when 1+ CIDs are passed in input_volumes and must be mounted at distinct child mount points
        # 'failed to create container: Error response from daemon: Duplicate

        unravelled_input_volumes = []
        if self.input_volumes:
            for input_volume in self.input_volumes:
                if isinstance(input_volume, str):
                    cids_str, mount_point = input_volume.split(":")
                    # A single entry may carry several comma-separated CIDs.
                    for cid in cids_str.split(","):
                        unravelled_input_volumes.append(
                            {
                                "cid": cid,
                                "path": mount_point,
                                "storagesource": "ipfs",  # TODO make this configurable (filecoin, etc.)
                            }
                        )

        if unravelled_input_volumes:
            if "inputs" not in self.job_spec:
                self.job_spec["inputs"] = []
            self.job_spec["inputs"] = self.job_spec["inputs"] + unravelled_input_volumes

        self.log.info("job_spec: %s", self.job_spec)

        job_id = self.hook.submit_job(
            api_version=self.api_version, job_spec=self.job_spec
        )
        self.bacalhau_job_id = job_id
        context["ti"].xcom_push(key="bacalhau_job_id", value=job_id)
        self.log.info("bacalhau_job_id: %s", job_id)

        # use the hook to wait for the job to complete
        # TODO move this logic to a hook
        while True:
            events = self.hook.get_events(job_id)

            terminate = False
            for event in events["events"]:
                self.log.debug("event: %s", event)
                if "event_name" in event:
                    # TODO handle the case when an event hangs/errors out/never completes
                    if event["event_name"] in (
                        "ComputeError",
                        "Error",
                        "ResultsPublished",
                        "Completed",
                    ):
                        terminate = True
                        break
            if terminate:
                break
            self.log.info("waiting for job %s to complete...", job_id)
            time.sleep(2)

        # fetch all shards' resulting CIDs and join them into a comma-separated string
        results = self.hook.get_results(job_id)
        cids_str = ",".join(result["data"]["cid"] for result in results)
        context["ti"].xcom_push(key="cids", value=cids_str)

        return job_id

    @cached_property
    def hook(self):
        """Create and return a BacalhauHook (cached)."""
        return BacalhauHook()

    def get_hook(self):
        """Return the cached BacalhauHook."""
        return self.hook
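
    # Payload shapes assumed by execute() above, inferred from its accesses; the
    # authoritative schema is defined by the Bacalhau API exposed through BacalhauHook:
    #
    #   hook.get_events(job_id)  -> {"events": [{"event_name": "Completed", ...}, ...]}
    #   hook.get_results(job_id) -> [{"data": {"cid": "Qm...", ...}, ...}]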

    # get_openlineage_facets_on_start() is run by OpenLineage/Marquez before the execute() function is run,
    # allowing metadata to be collected before the task executes.
    # Implementation details can be found in the OpenLineage docs: https://openlineage.io/docs/integrations/airflow/operator#implementation
    # TODO this piece of code has not been tested and should be refactored before being used
    # (it references attributes such as self.command, and os.getenv without importing os, which this module does not currently define).
    # def get_openlineage_facets_on_start(self) -> OperatorLineage:
    #     return OperatorLineage(
    #         inputs=[
    #             Dataset(
    #                 namespace=f'{os.getenv("BACALHAU_API_HOST")}:1234',
    #                 name="inputs",
    #                 facets={
    #                     "command": self.command,
    #                     "concurrency": self.concurrency,
    #                     "dry_run": self.dry_run,
    #                     "env": self.env,
    #                     "gpu": self.gpu,
    #                     "input_urls": self.input_urls,
    #                     "input_volumes": self.input_volumes,
    #                     "inputs": self.inputs,
    #                     "output_volumes": self.output_volumes,
    #                     "publisher": self.publisher,
    #                     "workdir": self.workdir,
    #                 },
    #             )
    #         ],
    #         outputs=[],
    #         run_facets={},
    #         job_facets={},
    #     )
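

# Example usage (illustrative sketch only, not part of this module): the DAG id,
# task ids, dates, and job_spec values below are hypothetical; refer to the example
# DAGs shipped with this package for tested, complete job specifications. The sketch
# chains two BacalhauSubmitJobOperator tasks; the second mounts the first task's
# output shards by pulling its `cids` XCom through the templated `input_volumes` field.
#
# from datetime import datetime
#
# from airflow import DAG
# from bacalhau_airflow.operators import BacalhauSubmitJobOperator
#
# with DAG("bacalhau_demo", start_date=datetime(2023, 1, 1), schedule_interval=None) as dag:
#     run_1 = BacalhauSubmitJobOperator(
#         task_id="run-1",
#         api_version="V1beta1",
#         job_spec={
#             # illustrative Docker job; adjust to the schema your Bacalhau version expects
#             "engine": "Docker",
#             "verifier": "Noop",
#             "publisher": "IPFS",
#             "docker": {"image": "ubuntu", "entrypoint": ["echo", "hello"]},
#             "deal": {"concurrency": 1},
#         },
#     )
#     run_2 = BacalhauSubmitJobOperator(
#         task_id="run-2",
#         api_version="V1beta1",
#         input_volumes=[
#             "{{ task_instance.xcom_pull(task_ids='run-1', key='cids') }}:/datasets",
#         ],
#         job_spec={
#             "engine": "Docker",
#             "verifier": "Noop",
#             "publisher": "IPFS",
#             "docker": {"image": "ubuntu", "entrypoint": ["ls", "/datasets"]},
#             "deal": {"concurrency": 1},
#         },
#     )
#     run_1 >> run_2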