github.com/apache/arrow/go/v16@v16.1.0/arrow/compute/internal/kernels/string_casts.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 //go:build go1.18 18 19 package kernels 20 21 import ( 22 "fmt" 23 "strconv" 24 "unicode/utf8" 25 26 "github.com/apache/arrow/go/v16/arrow" 27 "github.com/apache/arrow/go/v16/arrow/array" 28 "github.com/apache/arrow/go/v16/arrow/bitutil" 29 "github.com/apache/arrow/go/v16/arrow/compute/exec" 30 "github.com/apache/arrow/go/v16/arrow/float16" 31 "github.com/apache/arrow/go/v16/internal/bitutils" 32 ) 33 34 func validateUtf8Fsb(input *exec.ArraySpan) error { 35 var ( 36 inputData = input.Buffers[1].Buf 37 width = int64(input.Type.(*arrow.FixedSizeBinaryType).ByteWidth) 38 bitmap = input.Buffers[0].Buf 39 ) 40 41 return bitutils.VisitBitBlocksShort(bitmap, input.Offset, input.Len, 42 func(pos int64) error { 43 pos += input.Offset 44 beg := pos * width 45 end := (pos + 1) * width 46 if !utf8.Valid(inputData[beg:end]) { 47 return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, inputData[beg:end]) 48 } 49 return nil 50 }, func() error { return nil }) 51 } 52 53 func validateUtf8[OffsetT int32 | int64](input *exec.ArraySpan) error { 54 var ( 55 inputOffsets = exec.GetSpanOffsets[OffsetT](input, 1) 56 inputData = input.Buffers[2].Buf 57 bitmap = input.Buffers[0].Buf 58 ) 59 60 return bitutils.VisitBitBlocksShort(bitmap, input.Offset, input.Len, 61 func(pos int64) error { 62 v := inputData[inputOffsets[pos]:inputOffsets[pos+1]] 63 if !utf8.Valid(v) { 64 return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, v) 65 } 66 return nil 67 }, func() error { return nil }) 68 } 69 70 func CastFsbToFsb(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { 71 inputWidth := batch.Values[0].Array.Type.(*arrow.FixedSizeBinaryType).ByteWidth 72 outputWidth := ctx.State.(CastState).ToType.(*arrow.FixedSizeBinaryType).ByteWidth 73 74 if inputWidth != outputWidth { 75 return fmt.Errorf("%w: failed casting from %s to %s: widths must match", 76 arrow.ErrInvalid, batch.Values[0].Array.Type, out.Type) 77 } 78 79 return ZeroCopyCastExec(ctx, batch, out) 80 } 81 82 func CastBinaryToBinary[InOffsetsT, OutOffsetsT int32 | int64](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { 83 opts := ctx.State.(CastState) 84 input := &batch.Values[0].Array 85 86 if !input.Type.(arrow.BinaryDataType).IsUtf8() && out.Type.(arrow.BinaryDataType).IsUtf8() && !opts.AllowInvalidUtf8 { 87 if err := validateUtf8[InOffsetsT](input); err != nil { 88 return err 89 } 90 } 91 92 // start with a zero-copy cast, then change the indices to the 93 // expected size 94 if err := ZeroCopyCastExec(ctx, batch, out); err != nil { 95 return err 96 } 97 98 switch { 99 case SizeOf[InOffsetsT]() == SizeOf[OutOffsetsT](): 100 // offsets are the same width, nothing more to do 101 return nil 102 case SizeOf[InOffsetsT]() > SizeOf[OutOffsetsT](): 103 // downcast from int64 -> int32 104 inputOffsets := exec.GetSpanOffsets[InOffsetsT](input, 1) 105 106 // binary offsets are ascending, so it's enough to check 107 // the last one for overflow 108 if inputOffsets[input.Len] > InOffsetsT(MaxOf[OutOffsetsT]()) { 109 return fmt.Errorf("%w: failed casting from %s to %s: input array too large", 110 arrow.ErrInvalid, input.Type, out.Type) 111 } 112 113 buf := ctx.Allocate(out.Type.(arrow.OffsetsDataType).OffsetTypeTraits().BytesRequired(int(out.Len + out.Offset + 1))) 114 out.Buffers[1].WrapBuffer(buf) 115 116 outOffsets := exec.GetSpanOffsets[OutOffsetsT](out, 1) 117 118 castNumericUnsafe(arrow.INT64, arrow.INT32, 119 arrow.GetBytes(inputOffsets), arrow.GetBytes(outOffsets), len(inputOffsets)) 120 return nil 121 default: 122 // upcast from int32 -> int64 123 buf := ctx.Allocate(out.Type.(arrow.OffsetsDataType).OffsetTypeTraits().BytesRequired(int(out.Len + out.Offset + 1))) 124 out.Buffers[1].WrapBuffer(buf) 125 126 inputOffsets := exec.GetSpanOffsets[InOffsetsT](input, 1) 127 outOffsets := exec.GetSpanOffsets[OutOffsetsT](out, 1) 128 129 castNumericUnsafe(arrow.INT32, arrow.INT64, 130 arrow.GetBytes(inputOffsets), arrow.GetBytes(outOffsets), len(inputOffsets)) 131 return nil 132 } 133 } 134 135 func CastFsbToBinary[OffsetsT int32 | int64](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { 136 opts := ctx.State.(CastState) 137 input := &batch.Values[0].Array 138 139 if out.Type.(arrow.BinaryDataType).IsUtf8() && !opts.AllowInvalidUtf8 { 140 if err := validateUtf8Fsb(input); err != nil { 141 return err 142 } 143 } 144 145 // check for overflow 146 maxOffset := int64(MaxOf[OffsetsT]()) 147 width := OffsetsT(input.Type.(*arrow.FixedSizeBinaryType).ByteWidth) 148 if (int64(width) * input.Len) > maxOffset { 149 return fmt.Errorf("%w: failed casting from %s to %s: input array too large", 150 arrow.ErrInvalid, input.Type, out.Type) 151 } 152 153 out.Len = input.Len 154 out.Nulls = input.Nulls 155 if input.Offset == out.Offset { 156 out.Buffers[0].SetBuffer(input.GetBuffer(0)) 157 } else { 158 out.Buffers[0].WrapBuffer(ctx.AllocateBitmap(input.Len)) 159 bitutil.CopyBitmap(input.Buffers[0].Buf, int(input.Offset), int(input.Len), out.Buffers[0].Buf, int(out.Offset)) 160 } 161 162 // this buffer is preallocated 163 offsets := exec.GetSpanOffsets[OffsetsT](out, 1) 164 offsets[0] = OffsetsT(input.Offset) * width 165 for i := 0; i < int(input.Len); i++ { 166 offsets[i+1] = offsets[i] + width 167 } 168 169 if len(input.Buffers[1].Buf) > 0 { 170 out.Buffers[2] = input.Buffers[1] 171 } 172 173 return nil 174 } 175 176 func addBinaryToBinaryCast[InOffsetT, OutOffsetT int32 | int64](inType arrow.Type, outType exec.OutputType) exec.ScalarKernel { 177 return exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(inType)}, 178 outType, CastBinaryToBinary[InOffsetT, OutOffsetT], nil) 179 } 180 181 func addToBinaryKernels[OffsetsT int32 | int64](outType exec.OutputType, kernels []exec.ScalarKernel) []exec.ScalarKernel { 182 return append(kernels, 183 addBinaryToBinaryCast[int32, OffsetsT](arrow.STRING, outType), 184 addBinaryToBinaryCast[int32, OffsetsT](arrow.BINARY, outType), 185 addBinaryToBinaryCast[int64, OffsetsT](arrow.LARGE_STRING, outType), 186 addBinaryToBinaryCast[int64, OffsetsT](arrow.LARGE_BINARY, outType), 187 exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(arrow.FIXED_SIZE_BINARY)}, 188 outType, CastFsbToBinary[OffsetsT], nil), 189 ) 190 } 191 192 func GetFsbCastKernels() []exec.ScalarKernel { 193 outputType := exec.NewComputedOutputType(resolveOutputFromOptions) 194 out := GetCommonCastKernels(arrow.FIXED_SIZE_BINARY, outputType) 195 kernel := exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(arrow.FIXED_SIZE_BINARY)}, 196 OutputFirstType, CastFsbToFsb, nil) 197 kernel.NullHandling = exec.NullComputedNoPrealloc 198 return append(out, kernel) 199 } 200 201 func float16Formatter(v float16.Num) string { return v.String() } 202 func date32Formatter(v arrow.Date32) string { return v.FormattedString() } 203 func date64Formatter(v arrow.Date64) string { return v.FormattedString() } 204 func numericFormatterSigned[T arrow.IntType](v T) string { return strconv.FormatInt(int64(v), 10) } 205 func numericFormatterUnsigned[T arrow.UintType](v T) string { return strconv.FormatUint(uint64(v), 10) } 206 func float32Formatter(v float32) string { return strconv.FormatFloat(float64(v), 'g', -1, 32) } 207 func float64Formatter(v float64) string { return strconv.FormatFloat(v, 'g', -1, 64) } 208 209 func boolToStringCastExec(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { 210 var ( 211 input = &batch.Values[0].Array 212 bldr = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder) 213 ) 214 defer bldr.Release() 215 216 bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, 217 func(pos int64) { 218 bldr.Append(strconv.FormatBool(bitutil.BitIsSet(input.Buffers[1].Buf, int(pos)))) 219 }, func() { bldr.AppendNull() }) 220 221 arr := bldr.NewArray() 222 out.TakeOwnership(arr.Data()) 223 return nil 224 } 225 226 type timeIntrinsic interface { 227 arrow.Time32 | arrow.Time64 228 FormattedString(arrow.TimeUnit) string 229 } 230 231 func timeToStringCastExec[T timeIntrinsic](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { 232 var ( 233 input = &batch.Values[0].Array 234 inputData = exec.GetSpanValues[T](input, 1) 235 bldr = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder) 236 inputType = input.Type.(arrow.TemporalWithUnit) 237 ) 238 defer bldr.Release() 239 240 bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, 241 func(pos int64) { 242 bldr.Append(inputData[pos].FormattedString(inputType.TimeUnit())) 243 }, func() { bldr.AppendNull() }) 244 245 arr := bldr.NewArray() 246 out.TakeOwnership(arr.Data()) 247 return nil 248 } 249 250 func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType](formatter func(T) string) exec.ArrayKernelExec { 251 return func(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { 252 var ( 253 input = &batch.Values[0].Array 254 inputData = exec.GetSpanValues[T](input, 1) 255 bldr = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder) 256 ) 257 defer bldr.Release() 258 259 bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, 260 func(pos int64) { 261 bldr.Append(formatter(inputData[pos])) 262 }, func() { bldr.AppendNull() }) 263 264 arr := bldr.NewArray() 265 out.TakeOwnership(arr.Data()) 266 return nil 267 } 268 } 269 270 func castTimestampToString(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error { 271 var ( 272 input = &batch.Values[0].Array 273 inputData = exec.GetSpanValues[arrow.Timestamp](input, 1) 274 inputType = input.Type.(*arrow.TimestampType) 275 bldr = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder) 276 ) 277 defer bldr.Release() 278 279 toTime, err := inputType.GetToTimeFunc() 280 if err != nil { 281 return err 282 } 283 284 // prealloc 285 fmtstring := "2006-01-02 15:04:05" 286 switch inputType.Unit { 287 case arrow.Millisecond: 288 fmtstring += ".000" 289 case arrow.Microsecond: 290 fmtstring += ".000000" 291 case arrow.Nanosecond: 292 fmtstring += ".000000000" 293 } 294 295 switch inputType.TimeZone { 296 case "UTC": 297 fmtstring += "Z" 298 case "": 299 default: 300 fmtstring += "-0700" 301 } 302 303 strlen := len(fmtstring) 304 bldr.Reserve(int(input.Len)) 305 bldr.ReserveData(int(input.Len-input.Nulls) * strlen) 306 307 bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len, 308 func(pos int64) { 309 bldr.Append(toTime(inputData[pos]).Format(fmtstring)) 310 }, 311 func() { bldr.AppendNull() }) 312 313 arr := bldr.NewArray() 314 out.TakeOwnership(arr.Data()) 315 return nil 316 } 317 318 func getNumericToStringCastExec(inType arrow.Type) exec.ArrayKernelExec { 319 switch inType { 320 case arrow.INT8: 321 return numericToStringCastExec(numericFormatterSigned[int8]) 322 case arrow.UINT8: 323 return numericToStringCastExec(numericFormatterUnsigned[uint8]) 324 case arrow.INT16: 325 return numericToStringCastExec(numericFormatterSigned[int16]) 326 case arrow.UINT16: 327 return numericToStringCastExec(numericFormatterUnsigned[uint16]) 328 case arrow.INT32: 329 return numericToStringCastExec(numericFormatterSigned[int32]) 330 case arrow.UINT32: 331 return numericToStringCastExec(numericFormatterUnsigned[uint32]) 332 case arrow.INT64: 333 return numericToStringCastExec(numericFormatterSigned[int64]) 334 case arrow.UINT64: 335 return numericToStringCastExec(numericFormatterUnsigned[uint64]) 336 case arrow.FLOAT16: 337 return numericToStringCastExec(float16Formatter) 338 case arrow.FLOAT32: 339 return numericToStringCastExec(float32Formatter) 340 case arrow.FLOAT64: 341 return numericToStringCastExec(float64Formatter) 342 case arrow.BOOL: 343 return boolToStringCastExec 344 case arrow.DATE32: 345 return numericToStringCastExec(date32Formatter) 346 case arrow.DATE64: 347 return numericToStringCastExec(date64Formatter) 348 case arrow.TIME32: 349 return timeToStringCastExec[arrow.Time32] 350 case arrow.TIME64: 351 return timeToStringCastExec[arrow.Time64] 352 case arrow.TIMESTAMP: 353 return castTimestampToString 354 } 355 panic("unimplemented cast: " + inType.String()) 356 } 357 358 func addNumericAndTemporalToStringCasts(outType exec.OutputType, out []exec.ScalarKernel) []exec.ScalarKernel { 359 k := exec.NewScalarKernel([]exec.InputType{exec.NewExactInput(arrow.FixedWidthTypes.Boolean)}, outType, 360 getNumericToStringCastExec(arrow.BOOL), nil) 361 k.NullHandling = exec.NullComputedNoPrealloc 362 out = append(out, k) 363 364 for _, dt := range numericTypes { 365 k = exec.NewScalarKernel([]exec.InputType{exec.NewExactInput(dt)}, outType, 366 getNumericToStringCastExec(dt.ID()), nil) 367 k.NullHandling = exec.NullComputedNoPrealloc 368 out = append(out, k) 369 } 370 371 for _, dt := range []arrow.DataType{arrow.FixedWidthTypes.Date32, arrow.FixedWidthTypes.Date64} { 372 k = exec.NewScalarKernel([]exec.InputType{exec.NewExactInput(dt)}, outType, 373 getNumericToStringCastExec(dt.ID()), nil) 374 k.NullHandling = exec.NullComputedNoPrealloc 375 out = append(out, k) 376 } 377 378 for _, id := range []arrow.Type{arrow.TIME32, arrow.TIME64, arrow.TIMESTAMP} { 379 k = exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(id)}, outType, 380 getNumericToStringCastExec(id), nil) 381 k.NullHandling = exec.NullComputedNoPrealloc 382 out = append(out, k) 383 } 384 385 return out 386 } 387 388 func GetToBinaryKernels(outType arrow.DataType) []exec.ScalarKernel { 389 if outType.ID() == arrow.FIXED_SIZE_BINARY { 390 return nil 391 } 392 393 outputType := exec.NewOutputType(outType) 394 out := GetCommonCastKernels(outType.ID(), outputType) 395 396 switch outType.ID() { 397 case arrow.BINARY: 398 return addToBinaryKernels[int32](outputType, out) 399 case arrow.LARGE_BINARY: 400 return addToBinaryKernels[int64](outputType, out) 401 case arrow.STRING: 402 out = addToBinaryKernels[int32](outputType, out) 403 return addNumericAndTemporalToStringCasts(outputType, out) 404 case arrow.LARGE_STRING: 405 out = addToBinaryKernels[int64](outputType, out) 406 return addNumericAndTemporalToStringCasts(outputType, out) 407 } 408 return nil 409 }