// Source: github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/webui/src/pages/repositories/repository/fileRenderers/duckdb.tsx
import * as duckdb from '@duckdb/duckdb-wasm';
import * as arrow from 'apache-arrow';
import {AsyncDuckDB, AsyncDuckDBConnection, DuckDBDataProtocol} from '@duckdb/duckdb-wasm';
import duckdb_wasm from '@duckdb/duckdb-wasm/dist/duckdb-mvp.wasm?url';
import mvp_worker from '@duckdb/duckdb-wasm/dist/duckdb-browser-mvp.worker.js?url';
import duckdb_wasm_eh from '@duckdb/duckdb-wasm/dist/duckdb-eh.wasm?url';
import eh_worker from '@duckdb/duckdb-wasm/dist/duckdb-browser-eh.worker.js?url';

// Locally bundled DuckDB builds (WASM module + worker script) that
// duckdb.selectBundle() chooses between.
const MANUAL_BUNDLES: duckdb.DuckDBBundles = {
    mvp: {
        mainModule: duckdb_wasm,
        mainWorker: mvp_worker,
    },
    eh: {
        mainModule: duckdb_wasm_eh,
        mainWorker: eh_worker,
    },
};

// The initialized, module-wide DuckDB instance (null until first successful init).
let _db: AsyncDuckDB | null = null;
// The in-flight (or completed) initialization, shared by concurrent callers.
let _dbInit: Promise<AsyncDuckDB> | null = null;

/**
 * Creates a new DuckDB instance on a dedicated worker and verifies that a
 * connection can be opened on it.
 *
 * @throws Error when the selected bundle has no worker script, or whatever
 *         instantiating/connecting throws. The worker is terminated in the
 *         latter case so a failed attempt does not leave it running.
 */
async function instantiateDuckDB(): Promise<AsyncDuckDB> {
    const bundle = await duckdb.selectBundle(MANUAL_BUNDLES)
    if (!bundle.mainWorker) {
        throw Error("could not initialize DuckDB")
    }
    const worker = new Worker(bundle.mainWorker)
    try {
        const logger = new duckdb.VoidLogger()
        const db = new duckdb.AsyncDuckDB(logger, worker)
        await db.instantiate(bundle.mainModule, bundle.pthreadWorker)
        // open and close a connection once to make sure the instance is usable
        const conn = await db.connect()
        await conn.close()
        return db
    } catch (err) {
        worker.terminate()
        throw err
    }
}

/**
 * Returns the module-wide DuckDB instance, creating it on first use.
 *
 * Callers that arrive while initialization is still running share the same
 * pending initialization instead of each spawning their own worker. If
 * initialization fails, the pending state is cleared so a later call retries.
 */
async function getDuckDB(): Promise<duckdb.AsyncDuckDB> {
    if (_db !== null) {
        return _db
    }
    if (_dbInit === null) {
        _dbInit = instantiateDuckDB()
    }
    const pending = _dbInit
    try {
        _db = await pending
        return _db
    } catch (err) {
        // only clear the attempt that actually failed, not a newer retry
        if (_dbInit === pending) {
            _dbInit = null
        }
        throw err
    }
}


// taken from @duckdb/duckdb-wasm/dist/types/src/bindings/tokens.d.ts
// which, unfortunately, we cannot import.
const DUCKDB_STRING_CONSTANT = 2;
// Matches a token of the form `'lakefs://...'` (optionally followed by whitespace);
// group 2 is the full lakefs URI, group 3 is the part after `lakefs://`.
const LAKEFS_URI_PATTERN = /^(['"]?)(lakefs:\/\/(.*))(['"])\s*$/;

/**
 * Returns a mapping of the `lakefs://..` URIs found in string constants of
 * `sql` to their `s3://...` equivalent.
 *
 * @param conn connection whose bindings are used to tokenize the statement
 * @param sql  the SQL text to scan
 */
async function extractFiles(conn: AsyncDuckDBConnection, sql: string): Promise<{ [name: string]: string }> {
    const tokenized = await conn.bindings.tokenize(sql)
    const r = Math.random(); // random number to make sure the S3 gateway picks up the request
    const fileMap: { [name: string]: string } = {};
    const tokenCount = tokenized.offsets.length;
    let prev = 0;
    for (let i = 0; i < tokenCount; i++) {
        // a token's text runs up to the start of the next token (or the end of the input)
        const end = i < tokenCount - 1 ? tokenized.offsets[i + 1] : sql.length;
        const part = sql.substring(prev, end);
        prev = end;
        if (tokenized.types[i] !== DUCKDB_STRING_CONSTANT) {
            continue;
        }
        const matches = part.match(LAKEFS_URI_PATTERN)
        if (matches !== null) {
            fileMap[matches[2]] = `s3://${matches[3]}?r=${r}`;
        }
    }
    return fileMap
}

/**
 * Runs `sql` on the shared DuckDB instance and returns the resulting Arrow table.
 *
 * Every `lakefs://` URI that appears as a string constant in the statement is
 * registered as an S3 file URL for the duration of the query. The registrations
 * are dropped afterwards, also when registration or the query fails, and the
 * connection is always closed.
 *
 * @param sql the SQL statement to execute
 * @throws whatever configuring the connection, registering files or running the query throws
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export async function runDuckDBQuery(sql: string): Promise<arrow.Table<any>> {
    const db = await getDuckDB()
    const conn = await db.connect()
    try {
        // TODO (ozk): read this from the server's configuration?
        await conn.query(`SET s3_region='us-east-1';`)
        // set the example values (used to make sure the S3 gateway picks up the request)
        // real authentication is done using the existing swagger cookie or token
        await conn.query(`SET s3_access_key_id='use_swagger_credentials';`)
        await conn.query(`SET s3_secret_access_key='these_are_meaningless_but_must_be_set';`)
        await conn.query(`SET s3_endpoint='${document.location.protocol}//${document.location.host}'`)

        // register lakefs uri-ed files as s3 files
        const fileMap = await extractFiles(conn, sql)
        const fileNames = Object.getOwnPropertyNames(fileMap)
        const dropRegistrations = () => Promise.all(fileNames.map(fileName => db.dropFile(fileName)))

        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        let result: arrow.Table<any>
        try {
            await Promise.all(fileNames.map(
                fileName => db.registerFileURL(fileName, fileMap[fileName], DuckDBDataProtocol.S3, true)
            ))
            // execute the query
            result = await conn.query(sql)
        } catch (err) {
            // best-effort cleanup: the registration/query error is the one the caller needs to see
            await dropRegistrations().catch(() => undefined)
            throw err
        }

        // remove registrations
        await dropRegistrations()
        return result
    } finally {
        await conn.close()
    }
}