go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/resultdb/util/chunking.go

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/resultdb/util/chunking.go (about)

     1  // Copyright 2024 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package util contains utility functions.
    16  package util
    17  
    18  import "go.chromium.org/luci/common/errors"
    19  
    20  // SplitToChunks splits the contents into chunks, each chunk does not exceed
    21  // maxChunkSize number of bytes.
    22  // Assuming the content is encoded in UTF-8.
    23  // The function will guarantee not to split a multi-byte Unicode into
    24  // different chunks.
    25  // The function will attempt to split the chunk as close to maxChunkSize
    26  // as it can, but it will also prefer splitting at line breaks ("\r\n",
    27  // otherwise "\r" or "\n"), or whitespaces. It will scan for the last lookbackWindow bytes
    28  // for line break/white space to split.
    29  // If there is no linebreak or whitespace within lookbackWindow bytes,
    30  // it will split the chunk as close to maxChunkSize (without breaking
    31  // a multi-byte UTF-8 character).
    32  func SplitToChunks(content []byte, maxChunkSize int, lookbackWindow int) ([]string, error) {
    33  	if lookbackWindow > maxChunkSize {
    34  		return nil, errors.Reason("lookback window %d must not be bigger than maxChunkSize %d", lookbackWindow, maxChunkSize).Err()
    35  	}
    36  	// Start index of a chunk.
    37  	startIndex := 0
    38  	chunks := []string{}
    39  
    40  	// Continue chunking if the remaining content is still bigger than maxSize.
    41  	for len(content)-startIndex > maxChunkSize {
    42  		// Look for the byte at the end of the chunk that we can split without
    43  		// breaking multi-byte character.
    44  		utf8StartIndex, err := firstCharacterIndexBackward(content, startIndex+maxChunkSize)
    45  		if err != nil {
    46  			return nil, errors.Annotate(err, "indexOfUTF8Backward").Err()
    47  		}
    48  		// endIndex is the biggest index of that we can potentially split.
    49  		endIndex := utf8StartIndex - 1
    50  
    51  		// Look backward within lookbackWindow to find linebreak/whitespace.
    52  		whiteSpaceIndex, whiteSpaceLength := newLineWhiteSpace(content, endIndex, lookbackWindow)
    53  		// Found new line or white space.
    54  		if whiteSpaceIndex != -1 {
    55  			chunk := string(content[startIndex : whiteSpaceIndex+whiteSpaceLength])
    56  			chunks = append(chunks, chunk)
    57  			startIndex = whiteSpaceIndex + whiteSpaceLength
    58  		} else { // No new line or white space, we should split at max size.
    59  			chunk := string(content[startIndex : endIndex+1])
    60  			chunks = append(chunks, chunk)
    61  			startIndex = endIndex + 1
    62  		}
    63  	}
    64  	// Add the last chunk.
    65  	if startIndex < len(content) {
    66  		chunk := string(content[startIndex:])
    67  		chunks = append(chunks, chunk)
    68  	}
    69  	return chunks, nil
    70  }
    71  
    72  // newLineWhiteSpace starts at endIndex and looks back at most
    73  // lookbackWindow size to find a new line or white space character.
    74  // It prioritizes in the following order:
    75  //   - \r\n
    76  //   - \n or \r
    77  //   - ' ' or \t
    78  //
    79  // If no such character can be found, return -1.
    80  func newLineWhiteSpace(content []byte, endIndex int, lookbackWindow int) (index int, length int) {
    81  	nrIndex := -1
    82  	whiteSpaceIndex := -1
    83  	lookUntil := endIndex - lookbackWindow + 1
    84  	if lookUntil < 0 {
    85  		lookUntil = 0
    86  	}
    87  	for i := endIndex; i >= lookUntil; i-- {
    88  		ch := content[i]
    89  		// Check for \n\r. If we see it, return immediately.
    90  		if ch == '\r' && i < endIndex && content[i+1] == '\n' {
    91  			return i, 2
    92  		}
    93  		if ch == '\n' || ch == '\r' {
    94  			if nrIndex == -1 {
    95  				nrIndex = i
    96  			}
    97  		}
    98  		if ch == ' ' || ch == '\t' {
    99  			if whiteSpaceIndex == -1 {
   100  				whiteSpaceIndex = i
   101  			}
   102  		}
   103  	}
   104  	if nrIndex != -1 {
   105  		return nrIndex, 1
   106  	}
   107  	if whiteSpaceIndex != -1 {
   108  		return whiteSpaceIndex, 1
   109  	}
   110  	return -1, 0
   111  }
   112  
   113  // firstCharacterIndexBackward looks backward from fromPosition to find
   114  // the first index of byte that mark the start of a UTF-8 character.
   115  func firstCharacterIndexBackward(content []byte, fromPosition int) (int, error) {
   116  	// A UTF-8 character can take 4 bytes at most.
   117  	toPosition := fromPosition - 3
   118  	if toPosition < 0 {
   119  		toPosition = 0
   120  	}
   121  	for i := fromPosition; i >= toPosition; i-- {
   122  		if isUTF8StartByte(content[i]) {
   123  			return i, nil
   124  		}
   125  	}
   126  	// After 4 bytes, if we cannot find, it means the string is not in UTF-8.
   127  	return -1, errors.New("byte slice may not be in UTF-8 format")
   128  }
   129  
   130  // Return true if the byte mark the start of a UTF-8 character.
   131  // A Unicode character maybe encoded using from 1-4 bytes.
   132  // See https://en.wikipedia.org/wiki/UTF-8
   133  func isUTF8StartByte(b byte) bool {
   134  	// This is an ASCII character, which only takes 1 byte.
   135  	if b <= 0x7F {
   136  		return true
   137  	}
   138  	// Multi-byte character patterns, starts with 110xxxxx, 1110xxxx, or 11110xxx.
   139  	return b&0xC0 == 0xC0
   140  }