# Source: github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0
#         clients/python-wrapper/lakefs/import_manager.py
"""
Import module provides a simpler interface to the lakeFS SDK import functionality
"""

from __future__ import annotations

import asyncio
from datetime import timedelta
from typing import Optional, Dict, List

import lakefs_sdk

from lakefs.models import ImportStatus, _OBJECT, _COMMON_PREFIX
from lakefs.client import Client, _BaseLakeFSObject
from lakefs.exceptions import ImportManagerException, api_exception_handler

# Polling interval used by ImportManager.wait() when the caller does not supply one
_DEFAULT_POLL_INTERVAL = timedelta(seconds=2)


class ImportManager(_BaseLakeFSObject):
    """
    ImportManager provides an easy-to-use interface to perform imports with multiple sources.
    It provides both synchronous and asynchronous functionality allowing the user to start an import process,
    continue executing logic and poll for the import completion.

    ImportManager usage example:

    .. code-block:: python

        import lakefs

        branch = lakefs.repository("<repository_name>").branch("<branch_name>")
        mgr = branch.import_data(commit_message="my imported data", metadata={"foo": "bar"})

        # add sources for import
        mgr.prefix(object_store_uri="s3://import-bucket/data1/",
                   destination="import-prefix/").object(object_store_uri="s3://import-bucket/data2/imported_file",
                                                        destination="import-prefix/imported_file")
        # start import and wait
        mgr.run()

    """
    _repo_id: str
    _branch_id: str
    # True from a successful start() until the import is seen completed, failed or is cancelled
    _in_progress: bool = False
    # None until start() succeeds; an instance can start at most one import
    _import_id: Optional[str] = None
    commit_message: str
    commit_metadata: Optional[Dict]
    sources: List[lakefs_sdk.ImportLocation]

    def __init__(self, repository_id: str, branch_id: str, commit_message: str = "",
                 commit_metadata: Optional[Dict] = None, client: Optional[Client] = None) -> None:
        self._repo_id = repository_id
        self._branch_id = branch_id
        self.commit_message = commit_message
        self.commit_metadata = commit_metadata
        self.sources = []
        super().__init__(client)

    @property
    def import_id(self) -> Optional[str]:
        """
        Returns the id of the current import process, or None if no import was started yet
        """
        return self._import_id

    def _append_source(self, import_location: lakefs_sdk.ImportLocation):
        """
        Add an import location to the list of sources

        :param import_location: The location to add
        :raise ImportManagerException: if an import was already started by this manager
        """
        if self._import_id is not None:
            raise ImportManagerException("Cannot add additional sources to an already started import")

        self.sources.append(import_location)

    def prefix(self, object_store_uri: str, destination: str) -> ImportManager:
        """
        Creates a new import source of type "common_prefix" and adds it to the list of sources

        :param object_store_uri: The URI from which to import the objects
        :param destination: The destination prefix relative to the branch
        :return: The ImportManager instance (self) after update, to allow operations chaining
        :raise ImportManagerException: if an import was already started by this manager
        """
        self._append_source(lakefs_sdk.ImportLocation(type=_COMMON_PREFIX,
                                                      path=object_store_uri,
                                                      destination=destination))
        return self

    def object(self, object_store_uri: str, destination: str) -> ImportManager:
        """
        Creates a new import source of type "object" and adds it to the list of sources

        :param object_store_uri: The URI from which to import the object
        :param destination: The destination path for the object relative to the branch
        :return: The ImportManager instance (self) after update, to allow operations chaining
        :raise ImportManagerException: if an import was already started by this manager
        """
        self._append_source(lakefs_sdk.ImportLocation(type=_OBJECT, path=object_store_uri, destination=destination))
        return self

    def start(self) -> str:
        """
        Start import, reporting back (and storing) a process id

        :return: The import process identifier in lakeFS
        :raise ImportManagerException: if an import process is already in progress, or if this manager
            was already used to start an import
        :raise NotFoundException: if branch or repository do not exist
        :raise NotAuthorizedException: if user is not authorized to perform this operation
        :raise ValidationError: if path_type is not one of the allowed values
        :raise ServerException: for any other errors
        """
        if self._in_progress:
            raise ImportManagerException("Import in progress")
        if self._import_id is not None:
            raise ImportManagerException("Import Manager can only be used once")

        creation = lakefs_sdk.ImportCreation(paths=self.sources,
                                             commit=lakefs_sdk.CommitCreation(message=self.commit_message,
                                                                              metadata=self.commit_metadata))
        with api_exception_handler():
            res = self._client.sdk_client.import_api.import_start(repository=self._repo_id,
                                                                  branch=self._branch_id,
                                                                  import_creation=creation)
            self._import_id = res.id
            self._in_progress = True

        return self._import_id

    async def _wait_for_completion(self, poll_interval: timedelta) -> lakefs_sdk.ImportStatus:
        """
        Poll the import status every poll_interval until it is completed or reports an error

        :param poll_interval: The time to sleep before each status request
        :return: The SDK import status of the completed import
        :raise ImportManagerException: if the import status reports an error
        """
        while True:
            await asyncio.sleep(poll_interval.total_seconds())
            with api_exception_handler():
                resp = self._client.sdk_client.import_api.import_status(repository=self._repo_id,
                                                                        branch=self._branch_id,
                                                                        id=self._import_id)
            if resp.completed:
                return resp
            if resp.error is not None:
                # Polling stops here, so the import must not be left marked as in progress
                self._in_progress = False
                raise ImportManagerException(f"Import Error: {resp.error.message}")

    def wait(self, poll_interval: Optional[timedelta] = _DEFAULT_POLL_INTERVAL) -> ImportStatus:
        """
        Poll a started import task ID, blocking until completion

        Note: polling is driven with asyncio.run(), so this method cannot be called from a thread
        that is already running an event loop.

        :param poll_interval: The interval for polling the import status. None selects the default (2 seconds).
        :return: Import status as returned by the lakeFS server
        :raise ImportManagerException: if no import is in progress, or if the import reported an error
        :raise NotFoundException: if branch, repository or import id do not exist
        :raise NotAuthorizedException: if user is not authorized to perform this operation
        :raise ServerException: for any other errors
        """
        if self._import_id is None:
            raise ImportManagerException("No import in progress")

        # The parameter is declared Optional: treat an explicit None as "use the default"
        # instead of failing with AttributeError on None.total_seconds()
        if poll_interval is None:
            poll_interval = _DEFAULT_POLL_INTERVAL

        res = asyncio.run(self._wait_for_completion(poll_interval))
        self._in_progress = False
        return ImportStatus(**res.dict())

    def run(self, poll_interval: Optional[timedelta] = None) -> ImportStatus:
        """
        Same as calling start() and then wait()

        :param poll_interval: The interval for polling the import status.
        :return: Import status as returned by the lakeFS server
        :raises: See start(), wait()
        """
        self.start()
        # Forward the interval only when given, so that wait() applies its own default otherwise
        wait_kwargs = {} if poll_interval is None else {"poll_interval": poll_interval}
        return self.wait(**wait_kwargs)

    def cancel(self) -> None:
        """
        Cancel an ongoing import process

        :raise ImportManagerException: if no import was started
        :raise NotFoundException: if branch, repository or import id do not exist
        :raise NotAuthorizedException: if user is not authorized to perform this operation
        :raise ConflictException: if the import was already completed
        :raise ServerException: for any other errors
        """
        if self._import_id is None:  # Can't cancel on no id
            raise ImportManagerException("No import in progress")

        with api_exception_handler():
            self._client.sdk_client.import_api.import_cancel(repository=self._repo_id,
                                                             branch=self._branch_id,
                                                             id=self._import_id)
            self._in_progress = False

    def status(self) -> ImportStatus:
        """
        Get the current import status

        :return: Import status as returned by the lakeFS server
        :raise ImportManagerException: if no import is in progress
        :raise NotFoundException: if branch, repository or import id do not exist
        :raise NotAuthorizedException: if user is not authorized to perform this operation
        :raise ServerException: for any other errors
        """

        if self._import_id is None:
            raise ImportManagerException("No import in progress")

        with api_exception_handler():
            res = self._client.sdk_client.import_api.import_status(repository=self._repo_id,
                                                                   branch=self._branch_id,
                                                                   id=self._import_id)

        # Clear the flag on completion or on a reported error, the two states that end polling in wait()
        if res.completed or res.error is not None:
            self._in_progress = False
        return ImportStatus(**res.dict())