github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/clients/python-wrapper/lakefs/import_manager.py (about)

     1  """
     2  Import module provides a simpler interface to the lakeFS SDK import functionality
     3  """
     4  
from __future__ import annotations

import asyncio
import time
from datetime import timedelta
from typing import Optional, Dict, List

import lakefs_sdk

from lakefs.models import ImportStatus, _OBJECT, _COMMON_PREFIX
from lakefs.client import Client, _BaseLakeFSObject
from lakefs.exceptions import ImportManagerException, api_exception_handler
    16  
    17  
    18  class ImportManager(_BaseLakeFSObject):
    19      """
    20      ImportManager provides an easy-to-use interface to perform imports with multiple sources.
    21      It provides both synchronous and asynchronous functionality allowing the user to start an import process,
    22      continue executing logic and poll for the import completion.
    23  
    24      ImportManager usage example:
    25  
    26      .. code-block:: python
    27  
    28          import lakefs
    29  
    30          branch = lakefs.repository("<repository_name>").branch("<branch_name>")
    31          mgr = branch.import_data(commit_message="my imported data", metadata={"foo": "bar"})
    32  
    33          # add sources for import
    34          mgr.prefix(object_store_uri="s3://import-bucket/data1/",
    35                     destination="import-prefix/").object(object_store_uri="s3://import-bucket/data2/imported_file",
    36                                                          destination="import-prefix/imported_file")
    37          # start import and wait
    38          mgr.run()
    39  
    40      """
    41      _repo_id: str
    42      _branch_id: str
    43      _in_progress: bool = False
    44      _import_id: str = None
    45      commit_message: str
    46      commit_metadata: Optional[Dict]
    47      sources: List[lakefs_sdk.ImportLocation]
    48  
    49      def __init__(self, repository_id: str, branch_id: str, commit_message: str = "",
    50                   commit_metadata: Optional[Dict] = None, client: Optional[Client] = None) -> None:
    51          self._repo_id = repository_id
    52          self._branch_id = branch_id
    53          self.commit_message = commit_message
    54          self.commit_metadata = commit_metadata
    55          self.sources = []
    56          super().__init__(client)
    57  
    58      @property
    59      def import_id(self) -> str:
    60          """
    61          Returns the id of the current import process
    62          """
    63          return self._import_id
    64  
    65      def _append_source(self, import_location: lakefs_sdk.ImportLocation):
    66          if self._import_id is not None:
    67              raise ImportManagerException("Cannot add additional sources to an already started import")
    68  
    69          self.sources.append(import_location)
    70  
    71      def prefix(self, object_store_uri: str, destination: str) -> ImportManager:
    72          """
    73          Creates a new import source of type "common_prefix" and adds it to the list of sources
    74  
    75          :param object_store_uri: The URI from which to import the objects
    76          :param destination: The destination prefix relative to the branch
    77          :return: The ImportManager instance (self) after update, to allow operations chaining
    78          """
    79          self._append_source(lakefs_sdk.ImportLocation(type=_COMMON_PREFIX,
    80                                                        path=object_store_uri,
    81                                                        destination=destination))
    82          return self
    83  
    84      def object(self, object_store_uri: str, destination: str) -> ImportManager:
    85          """
    86          Creates a new import source of type "object" and adds it to the list of sources
    87  
    88          :param object_store_uri: The URI from which to import the object
    89          :param destination: The destination path for the object relative to the branch
    90          :return: The ImportManager instance (self) after update, to allow operations chaining
    91          """
    92          self._append_source(lakefs_sdk.ImportLocation(type=_OBJECT, path=object_store_uri, destination=destination))
    93          return self
    94  
    95      def start(self) -> str:
    96          """
    97          Start import, reporting back (and storing) a process id
    98  
    99          :return: The import process identifier in lakeFS
   100          :raise ImportManagerException: if an import process is already in progress
   101          :raise NotFoundException: if branch or repository do not exist
   102          :raise NotAuthorizedException: if user is not authorized to perform this operation
   103          :raise ValidationError: if path_type is not one of the allowed values
   104          :raise ServerException: for any other errors
   105          """
   106          if self._in_progress:
   107              raise ImportManagerException("Import in progress")
   108          if self._import_id is not None:
   109              raise ImportManagerException("Import Manager can only be used once")
   110  
   111          creation = lakefs_sdk.ImportCreation(paths=self.sources,
   112                                               commit=lakefs_sdk.CommitCreation(message=self.commit_message,
   113                                                                                metadata=self.commit_metadata))
   114          with api_exception_handler():
   115              res = self._client.sdk_client.import_api.import_start(repository=self._repo_id,
   116                                                                    branch=self._branch_id,
   117                                                                    import_creation=creation)
   118              self._import_id = res.id
   119              self._in_progress = True
   120  
   121          return self._import_id
   122  
   123      async def _wait_for_completion(self, poll_interval: timedelta) -> lakefs_sdk.ImportStatus:
   124          while True:
   125              await asyncio.sleep(poll_interval.total_seconds())
   126              with api_exception_handler():
   127                  resp = self._client.sdk_client.import_api.import_status(repository=self._repo_id,
   128                                                                          branch=self._branch_id,
   129                                                                          id=self._import_id)
   130              if resp.completed:
   131                  return resp
   132              if resp.error is not None:
   133                  raise ImportManagerException(f"Import Error: {resp.error.message}")
   134  
   135      def wait(self, poll_interval: Optional[timedelta] = timedelta(seconds=2)) -> ImportStatus:
   136          """
   137          Poll a started import task ID, blocking until completion
   138  
   139          :param poll_interval: The interval for polling the import status.
   140          :return: Import status as returned by the lakeFS server
   141          :raise ImportManagerException: if no import is in progress
   142          :raise NotFoundException: if branch, repository or import id do not exist
   143          :raise NotAuthorizedException: if user is not authorized to perform this operation
   144          :raise ServerException: for any other errors
   145          """
   146          if self._import_id is None:
   147              raise ImportManagerException("No import in progress")
   148  
   149          res = asyncio.run(self._wait_for_completion(poll_interval))
   150          self._in_progress = False
   151          return ImportStatus(**res.dict())
   152  
   153      def run(self, poll_interval: Optional[timedelta] = None) -> ImportStatus:
   154          """
   155          Same as calling start() and then wait()
   156  
   157          :param poll_interval: The interval for polling the import status.
   158          :return: Import status as returned by the lakeFS server
   159          :raises: See start(), wait()
   160          """
   161          self.start()
   162          wait_kwargs = {} if poll_interval is None else {"poll_interval": poll_interval}
   163          return self.wait(**wait_kwargs)
   164  
   165      def cancel(self) -> None:
   166          """
   167          Cancel an ongoing import process
   168  
   169          :raise NotFoundException: if branch, repository or import id do not exist
   170          :raise NotAuthorizedException: if user is not authorized to perform this operation
   171          :raise ConflictException: if the import was already completed
   172          :raise ServerException: for any other errors
   173          """
   174          if self._import_id is None:  # Can't cancel on no id
   175              raise ImportManagerException("No import in progress")
   176  
   177          with api_exception_handler():
   178              self._client.sdk_client.import_api.import_cancel(repository=self._repo_id,
   179                                                               branch=self._branch_id,
   180                                                               id=self._import_id)
   181              self._in_progress = False
   182  
   183      def status(self) -> ImportStatus:
   184          """
   185          Get the current import status
   186  
   187          :return: Import status as returned by the lakeFS server
   188          :raise ImportManagerException: if no import is in progress
   189          :raise NotFoundException: if branch, repository or import id do not exist
   190          :raise NotAuthorizedException: if user is not authorized to perform this operation
   191          :raise ServerException: for any other errors
   192          """
   193  
   194          if self._import_id is None:
   195              raise ImportManagerException("No import in progress")
   196  
   197          with api_exception_handler():
   198              res = self._client.sdk_client.import_api.import_status(repository=self._repo_id,
   199                                                                     branch=self._branch_id,
   200                                                                     id=self._import_id)
   201  
   202              if res.completed:
   203                  self._in_progress = False
   204              return ImportStatus(**res.dict())