github.com/apache/arrow/go/v16@v16.1.0/arrow/compute/internal/kernels/string_casts.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  //go:build go1.18
    18  
    19  package kernels
    20  
    21  import (
    22  	"fmt"
    23  	"strconv"
    24  	"unicode/utf8"
    25  
    26  	"github.com/apache/arrow/go/v16/arrow"
    27  	"github.com/apache/arrow/go/v16/arrow/array"
    28  	"github.com/apache/arrow/go/v16/arrow/bitutil"
    29  	"github.com/apache/arrow/go/v16/arrow/compute/exec"
    30  	"github.com/apache/arrow/go/v16/arrow/float16"
    31  	"github.com/apache/arrow/go/v16/internal/bitutils"
    32  )
    33  
    34  func validateUtf8Fsb(input *exec.ArraySpan) error {
    35  	var (
    36  		inputData = input.Buffers[1].Buf
    37  		width     = int64(input.Type.(*arrow.FixedSizeBinaryType).ByteWidth)
    38  		bitmap    = input.Buffers[0].Buf
    39  	)
    40  
    41  	return bitutils.VisitBitBlocksShort(bitmap, input.Offset, input.Len,
    42  		func(pos int64) error {
    43  			pos += input.Offset
    44  			beg := pos * width
    45  			end := (pos + 1) * width
    46  			if !utf8.Valid(inputData[beg:end]) {
    47  				return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, inputData[beg:end])
    48  			}
    49  			return nil
    50  		}, func() error { return nil })
    51  }
    52  
    53  func validateUtf8[OffsetT int32 | int64](input *exec.ArraySpan) error {
    54  	var (
    55  		inputOffsets = exec.GetSpanOffsets[OffsetT](input, 1)
    56  		inputData    = input.Buffers[2].Buf
    57  		bitmap       = input.Buffers[0].Buf
    58  	)
    59  
    60  	return bitutils.VisitBitBlocksShort(bitmap, input.Offset, input.Len,
    61  		func(pos int64) error {
    62  			v := inputData[inputOffsets[pos]:inputOffsets[pos+1]]
    63  			if !utf8.Valid(v) {
    64  				return fmt.Errorf("%w: invalid UTF8 bytes: %x", arrow.ErrInvalid, v)
    65  			}
    66  			return nil
    67  		}, func() error { return nil })
    68  }
    69  
    70  func CastFsbToFsb(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
    71  	inputWidth := batch.Values[0].Array.Type.(*arrow.FixedSizeBinaryType).ByteWidth
    72  	outputWidth := ctx.State.(CastState).ToType.(*arrow.FixedSizeBinaryType).ByteWidth
    73  
    74  	if inputWidth != outputWidth {
    75  		return fmt.Errorf("%w: failed casting from %s to %s: widths must match",
    76  			arrow.ErrInvalid, batch.Values[0].Array.Type, out.Type)
    77  	}
    78  
    79  	return ZeroCopyCastExec(ctx, batch, out)
    80  }
    81  
    82  func CastBinaryToBinary[InOffsetsT, OutOffsetsT int32 | int64](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
    83  	opts := ctx.State.(CastState)
    84  	input := &batch.Values[0].Array
    85  
    86  	if !input.Type.(arrow.BinaryDataType).IsUtf8() && out.Type.(arrow.BinaryDataType).IsUtf8() && !opts.AllowInvalidUtf8 {
    87  		if err := validateUtf8[InOffsetsT](input); err != nil {
    88  			return err
    89  		}
    90  	}
    91  
    92  	// start with a zero-copy cast, then change the indices to the
    93  	// expected size
    94  	if err := ZeroCopyCastExec(ctx, batch, out); err != nil {
    95  		return err
    96  	}
    97  
    98  	switch {
    99  	case SizeOf[InOffsetsT]() == SizeOf[OutOffsetsT]():
   100  		// offsets are the same width, nothing more to do
   101  		return nil
   102  	case SizeOf[InOffsetsT]() > SizeOf[OutOffsetsT]():
   103  		// downcast from int64 -> int32
   104  		inputOffsets := exec.GetSpanOffsets[InOffsetsT](input, 1)
   105  
   106  		// binary offsets are ascending, so it's enough to check
   107  		// the last one for overflow
   108  		if inputOffsets[input.Len] > InOffsetsT(MaxOf[OutOffsetsT]()) {
   109  			return fmt.Errorf("%w: failed casting from %s to %s: input array too large",
   110  				arrow.ErrInvalid, input.Type, out.Type)
   111  		}
   112  
   113  		buf := ctx.Allocate(out.Type.(arrow.OffsetsDataType).OffsetTypeTraits().BytesRequired(int(out.Len + out.Offset + 1)))
   114  		out.Buffers[1].WrapBuffer(buf)
   115  
   116  		outOffsets := exec.GetSpanOffsets[OutOffsetsT](out, 1)
   117  
   118  		castNumericUnsafe(arrow.INT64, arrow.INT32,
   119  			arrow.GetBytes(inputOffsets), arrow.GetBytes(outOffsets), len(inputOffsets))
   120  		return nil
   121  	default:
   122  		// upcast from int32 -> int64
   123  		buf := ctx.Allocate(out.Type.(arrow.OffsetsDataType).OffsetTypeTraits().BytesRequired(int(out.Len + out.Offset + 1)))
   124  		out.Buffers[1].WrapBuffer(buf)
   125  
   126  		inputOffsets := exec.GetSpanOffsets[InOffsetsT](input, 1)
   127  		outOffsets := exec.GetSpanOffsets[OutOffsetsT](out, 1)
   128  
   129  		castNumericUnsafe(arrow.INT32, arrow.INT64,
   130  			arrow.GetBytes(inputOffsets), arrow.GetBytes(outOffsets), len(inputOffsets))
   131  		return nil
   132  	}
   133  }
   134  
   135  func CastFsbToBinary[OffsetsT int32 | int64](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
   136  	opts := ctx.State.(CastState)
   137  	input := &batch.Values[0].Array
   138  
   139  	if out.Type.(arrow.BinaryDataType).IsUtf8() && !opts.AllowInvalidUtf8 {
   140  		if err := validateUtf8Fsb(input); err != nil {
   141  			return err
   142  		}
   143  	}
   144  
   145  	// check for overflow
   146  	maxOffset := int64(MaxOf[OffsetsT]())
   147  	width := OffsetsT(input.Type.(*arrow.FixedSizeBinaryType).ByteWidth)
   148  	if (int64(width) * input.Len) > maxOffset {
   149  		return fmt.Errorf("%w: failed casting from %s to %s: input array too large",
   150  			arrow.ErrInvalid, input.Type, out.Type)
   151  	}
   152  
   153  	out.Len = input.Len
   154  	out.Nulls = input.Nulls
   155  	if input.Offset == out.Offset {
   156  		out.Buffers[0].SetBuffer(input.GetBuffer(0))
   157  	} else {
   158  		out.Buffers[0].WrapBuffer(ctx.AllocateBitmap(input.Len))
   159  		bitutil.CopyBitmap(input.Buffers[0].Buf, int(input.Offset), int(input.Len), out.Buffers[0].Buf, int(out.Offset))
   160  	}
   161  
   162  	// this buffer is preallocated
   163  	offsets := exec.GetSpanOffsets[OffsetsT](out, 1)
   164  	offsets[0] = OffsetsT(input.Offset) * width
   165  	for i := 0; i < int(input.Len); i++ {
   166  		offsets[i+1] = offsets[i] + width
   167  	}
   168  
   169  	if len(input.Buffers[1].Buf) > 0 {
   170  		out.Buffers[2] = input.Buffers[1]
   171  	}
   172  
   173  	return nil
   174  }
   175  
   176  func addBinaryToBinaryCast[InOffsetT, OutOffsetT int32 | int64](inType arrow.Type, outType exec.OutputType) exec.ScalarKernel {
   177  	return exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(inType)},
   178  		outType, CastBinaryToBinary[InOffsetT, OutOffsetT], nil)
   179  }
   180  
   181  func addToBinaryKernels[OffsetsT int32 | int64](outType exec.OutputType, kernels []exec.ScalarKernel) []exec.ScalarKernel {
   182  	return append(kernels,
   183  		addBinaryToBinaryCast[int32, OffsetsT](arrow.STRING, outType),
   184  		addBinaryToBinaryCast[int32, OffsetsT](arrow.BINARY, outType),
   185  		addBinaryToBinaryCast[int64, OffsetsT](arrow.LARGE_STRING, outType),
   186  		addBinaryToBinaryCast[int64, OffsetsT](arrow.LARGE_BINARY, outType),
   187  		exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(arrow.FIXED_SIZE_BINARY)},
   188  			outType, CastFsbToBinary[OffsetsT], nil),
   189  	)
   190  }
   191  
   192  func GetFsbCastKernels() []exec.ScalarKernel {
   193  	outputType := exec.NewComputedOutputType(resolveOutputFromOptions)
   194  	out := GetCommonCastKernels(arrow.FIXED_SIZE_BINARY, outputType)
   195  	kernel := exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(arrow.FIXED_SIZE_BINARY)},
   196  		OutputFirstType, CastFsbToFsb, nil)
   197  	kernel.NullHandling = exec.NullComputedNoPrealloc
   198  	return append(out, kernel)
   199  }
   200  
   201  func float16Formatter(v float16.Num) string                 { return v.String() }
   202  func date32Formatter(v arrow.Date32) string                 { return v.FormattedString() }
   203  func date64Formatter(v arrow.Date64) string                 { return v.FormattedString() }
   204  func numericFormatterSigned[T arrow.IntType](v T) string    { return strconv.FormatInt(int64(v), 10) }
   205  func numericFormatterUnsigned[T arrow.UintType](v T) string { return strconv.FormatUint(uint64(v), 10) }
   206  func float32Formatter(v float32) string                     { return strconv.FormatFloat(float64(v), 'g', -1, 32) }
   207  func float64Formatter(v float64) string                     { return strconv.FormatFloat(v, 'g', -1, 64) }
   208  
   209  func boolToStringCastExec(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
   210  	var (
   211  		input = &batch.Values[0].Array
   212  		bldr  = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder)
   213  	)
   214  	defer bldr.Release()
   215  
   216  	bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len,
   217  		func(pos int64) {
   218  			bldr.Append(strconv.FormatBool(bitutil.BitIsSet(input.Buffers[1].Buf, int(pos))))
   219  		}, func() { bldr.AppendNull() })
   220  
   221  	arr := bldr.NewArray()
   222  	out.TakeOwnership(arr.Data())
   223  	return nil
   224  }
   225  
// timeIntrinsic is the type constraint used by timeToStringCastExec: it is
// satisfied by arrow.Time32 and arrow.Time64, and requires a FormattedString
// method that takes the time unit and returns the textual form of the value.
type timeIntrinsic interface {
	arrow.Time32 | arrow.Time64
	FormattedString(arrow.TimeUnit) string
}
   230  
   231  func timeToStringCastExec[T timeIntrinsic](ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
   232  	var (
   233  		input     = &batch.Values[0].Array
   234  		inputData = exec.GetSpanValues[T](input, 1)
   235  		bldr      = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder)
   236  		inputType = input.Type.(arrow.TemporalWithUnit)
   237  	)
   238  	defer bldr.Release()
   239  
   240  	bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len,
   241  		func(pos int64) {
   242  			bldr.Append(inputData[pos].FormattedString(inputType.TimeUnit()))
   243  		}, func() { bldr.AppendNull() })
   244  
   245  	arr := bldr.NewArray()
   246  	out.TakeOwnership(arr.Data())
   247  	return nil
   248  }
   249  
   250  func numericToStringCastExec[T arrow.IntType | arrow.UintType | arrow.FloatType](formatter func(T) string) exec.ArrayKernelExec {
   251  	return func(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
   252  		var (
   253  			input     = &batch.Values[0].Array
   254  			inputData = exec.GetSpanValues[T](input, 1)
   255  			bldr      = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder)
   256  		)
   257  		defer bldr.Release()
   258  
   259  		bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len,
   260  			func(pos int64) {
   261  				bldr.Append(formatter(inputData[pos]))
   262  			}, func() { bldr.AppendNull() })
   263  
   264  		arr := bldr.NewArray()
   265  		out.TakeOwnership(arr.Data())
   266  		return nil
   267  	}
   268  }
   269  
   270  func castTimestampToString(ctx *exec.KernelCtx, batch *exec.ExecSpan, out *exec.ExecResult) error {
   271  	var (
   272  		input     = &batch.Values[0].Array
   273  		inputData = exec.GetSpanValues[arrow.Timestamp](input, 1)
   274  		inputType = input.Type.(*arrow.TimestampType)
   275  		bldr      = array.NewBuilder(exec.GetAllocator(ctx.Ctx), out.Type).(array.StringLikeBuilder)
   276  	)
   277  	defer bldr.Release()
   278  
   279  	toTime, err := inputType.GetToTimeFunc()
   280  	if err != nil {
   281  		return err
   282  	}
   283  
   284  	// prealloc
   285  	fmtstring := "2006-01-02 15:04:05"
   286  	switch inputType.Unit {
   287  	case arrow.Millisecond:
   288  		fmtstring += ".000"
   289  	case arrow.Microsecond:
   290  		fmtstring += ".000000"
   291  	case arrow.Nanosecond:
   292  		fmtstring += ".000000000"
   293  	}
   294  
   295  	switch inputType.TimeZone {
   296  	case "UTC":
   297  		fmtstring += "Z"
   298  	case "":
   299  	default:
   300  		fmtstring += "-0700"
   301  	}
   302  
   303  	strlen := len(fmtstring)
   304  	bldr.Reserve(int(input.Len))
   305  	bldr.ReserveData(int(input.Len-input.Nulls) * strlen)
   306  
   307  	bitutils.VisitBitBlocks(input.Buffers[0].Buf, input.Offset, input.Len,
   308  		func(pos int64) {
   309  			bldr.Append(toTime(inputData[pos]).Format(fmtstring))
   310  		},
   311  		func() { bldr.AppendNull() })
   312  
   313  	arr := bldr.NewArray()
   314  	out.TakeOwnership(arr.Data())
   315  	return nil
   316  }
   317  
   318  func getNumericToStringCastExec(inType arrow.Type) exec.ArrayKernelExec {
   319  	switch inType {
   320  	case arrow.INT8:
   321  		return numericToStringCastExec(numericFormatterSigned[int8])
   322  	case arrow.UINT8:
   323  		return numericToStringCastExec(numericFormatterUnsigned[uint8])
   324  	case arrow.INT16:
   325  		return numericToStringCastExec(numericFormatterSigned[int16])
   326  	case arrow.UINT16:
   327  		return numericToStringCastExec(numericFormatterUnsigned[uint16])
   328  	case arrow.INT32:
   329  		return numericToStringCastExec(numericFormatterSigned[int32])
   330  	case arrow.UINT32:
   331  		return numericToStringCastExec(numericFormatterUnsigned[uint32])
   332  	case arrow.INT64:
   333  		return numericToStringCastExec(numericFormatterSigned[int64])
   334  	case arrow.UINT64:
   335  		return numericToStringCastExec(numericFormatterUnsigned[uint64])
   336  	case arrow.FLOAT16:
   337  		return numericToStringCastExec(float16Formatter)
   338  	case arrow.FLOAT32:
   339  		return numericToStringCastExec(float32Formatter)
   340  	case arrow.FLOAT64:
   341  		return numericToStringCastExec(float64Formatter)
   342  	case arrow.BOOL:
   343  		return boolToStringCastExec
   344  	case arrow.DATE32:
   345  		return numericToStringCastExec(date32Formatter)
   346  	case arrow.DATE64:
   347  		return numericToStringCastExec(date64Formatter)
   348  	case arrow.TIME32:
   349  		return timeToStringCastExec[arrow.Time32]
   350  	case arrow.TIME64:
   351  		return timeToStringCastExec[arrow.Time64]
   352  	case arrow.TIMESTAMP:
   353  		return castTimestampToString
   354  	}
   355  	panic("unimplemented cast: " + inType.String())
   356  }
   357  
   358  func addNumericAndTemporalToStringCasts(outType exec.OutputType, out []exec.ScalarKernel) []exec.ScalarKernel {
   359  	k := exec.NewScalarKernel([]exec.InputType{exec.NewExactInput(arrow.FixedWidthTypes.Boolean)}, outType,
   360  		getNumericToStringCastExec(arrow.BOOL), nil)
   361  	k.NullHandling = exec.NullComputedNoPrealloc
   362  	out = append(out, k)
   363  
   364  	for _, dt := range numericTypes {
   365  		k = exec.NewScalarKernel([]exec.InputType{exec.NewExactInput(dt)}, outType,
   366  			getNumericToStringCastExec(dt.ID()), nil)
   367  		k.NullHandling = exec.NullComputedNoPrealloc
   368  		out = append(out, k)
   369  	}
   370  
   371  	for _, dt := range []arrow.DataType{arrow.FixedWidthTypes.Date32, arrow.FixedWidthTypes.Date64} {
   372  		k = exec.NewScalarKernel([]exec.InputType{exec.NewExactInput(dt)}, outType,
   373  			getNumericToStringCastExec(dt.ID()), nil)
   374  		k.NullHandling = exec.NullComputedNoPrealloc
   375  		out = append(out, k)
   376  	}
   377  
   378  	for _, id := range []arrow.Type{arrow.TIME32, arrow.TIME64, arrow.TIMESTAMP} {
   379  		k = exec.NewScalarKernel([]exec.InputType{exec.NewIDInput(id)}, outType,
   380  			getNumericToStringCastExec(id), nil)
   381  		k.NullHandling = exec.NullComputedNoPrealloc
   382  		out = append(out, k)
   383  	}
   384  
   385  	return out
   386  }
   387  
   388  func GetToBinaryKernels(outType arrow.DataType) []exec.ScalarKernel {
   389  	if outType.ID() == arrow.FIXED_SIZE_BINARY {
   390  		return nil
   391  	}
   392  
   393  	outputType := exec.NewOutputType(outType)
   394  	out := GetCommonCastKernels(outType.ID(), outputType)
   395  
   396  	switch outType.ID() {
   397  	case arrow.BINARY:
   398  		return addToBinaryKernels[int32](outputType, out)
   399  	case arrow.LARGE_BINARY:
   400  		return addToBinaryKernels[int64](outputType, out)
   401  	case arrow.STRING:
   402  		out = addToBinaryKernels[int32](outputType, out)
   403  		return addNumericAndTemporalToStringCasts(outputType, out)
   404  	case arrow.LARGE_STRING:
   405  		out = addToBinaryKernels[int64](outputType, out)
   406  		return addNumericAndTemporalToStringCasts(outputType, out)
   407  	}
   408  	return nil
   409  }