go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/resultdb/util/chunking.go (about) 1 // Copyright 2024 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package util contains utility functions. 16 package util 17 18 import "go.chromium.org/luci/common/errors" 19 20 // SplitToChunks splits the contents into chunks, each chunk does not exceed 21 // maxChunkSize number of bytes. 22 // Assuming the content is encoded in UTF-8. 23 // The function will guarantee not to split a multi-byte Unicode into 24 // different chunks. 25 // The function will attempt to split the chunk as close to maxChunkSize 26 // as it can, but it will also prefer splitting at line breaks ("\r\n", 27 // otherwise "\r" or "\n"), or whitespaces. It will scan for the last lookbackWindow bytes 28 // for line break/white space to split. 29 // If there is no linebreak or whitespace within lookbackWindow bytes, 30 // it will split the chunk as close to maxChunkSize (without breaking 31 // a multi-byte UTF-8 character). 32 func SplitToChunks(content []byte, maxChunkSize int, lookbackWindow int) ([]string, error) { 33 if lookbackWindow > maxChunkSize { 34 return nil, errors.Reason("lookback window %d must not be bigger than maxChunkSize %d", lookbackWindow, maxChunkSize).Err() 35 } 36 // Start index of a chunk. 37 startIndex := 0 38 chunks := []string{} 39 40 // Continue chunking if the remaining content is still bigger than maxSize. 41 for len(content)-startIndex > maxChunkSize { 42 // Look for the byte at the end of the chunk that we can split without 43 // breaking multi-byte character. 44 utf8StartIndex, err := firstCharacterIndexBackward(content, startIndex+maxChunkSize) 45 if err != nil { 46 return nil, errors.Annotate(err, "indexOfUTF8Backward").Err() 47 } 48 // endIndex is the biggest index of that we can potentially split. 49 endIndex := utf8StartIndex - 1 50 51 // Look backward within lookbackWindow to find linebreak/whitespace. 52 whiteSpaceIndex, whiteSpaceLength := newLineWhiteSpace(content, endIndex, lookbackWindow) 53 // Found new line or white space. 54 if whiteSpaceIndex != -1 { 55 chunk := string(content[startIndex : whiteSpaceIndex+whiteSpaceLength]) 56 chunks = append(chunks, chunk) 57 startIndex = whiteSpaceIndex + whiteSpaceLength 58 } else { // No new line or white space, we should split at max size. 59 chunk := string(content[startIndex : endIndex+1]) 60 chunks = append(chunks, chunk) 61 startIndex = endIndex + 1 62 } 63 } 64 // Add the last chunk. 65 if startIndex < len(content) { 66 chunk := string(content[startIndex:]) 67 chunks = append(chunks, chunk) 68 } 69 return chunks, nil 70 } 71 72 // newLineWhiteSpace starts at endIndex and looks back at most 73 // lookbackWindow size to find a new line or white space character. 74 // It prioritizes in the following order: 75 // - \r\n 76 // - \n or \r 77 // - ' ' or \t 78 // 79 // If no such character can be found, return -1. 80 func newLineWhiteSpace(content []byte, endIndex int, lookbackWindow int) (index int, length int) { 81 nrIndex := -1 82 whiteSpaceIndex := -1 83 lookUntil := endIndex - lookbackWindow + 1 84 if lookUntil < 0 { 85 lookUntil = 0 86 } 87 for i := endIndex; i >= lookUntil; i-- { 88 ch := content[i] 89 // Check for \n\r. If we see it, return immediately. 90 if ch == '\r' && i < endIndex && content[i+1] == '\n' { 91 return i, 2 92 } 93 if ch == '\n' || ch == '\r' { 94 if nrIndex == -1 { 95 nrIndex = i 96 } 97 } 98 if ch == ' ' || ch == '\t' { 99 if whiteSpaceIndex == -1 { 100 whiteSpaceIndex = i 101 } 102 } 103 } 104 if nrIndex != -1 { 105 return nrIndex, 1 106 } 107 if whiteSpaceIndex != -1 { 108 return whiteSpaceIndex, 1 109 } 110 return -1, 0 111 } 112 113 // firstCharacterIndexBackward looks backward from fromPosition to find 114 // the first index of byte that mark the start of a UTF-8 character. 115 func firstCharacterIndexBackward(content []byte, fromPosition int) (int, error) { 116 // A UTF-8 character can take 4 bytes at most. 117 toPosition := fromPosition - 3 118 if toPosition < 0 { 119 toPosition = 0 120 } 121 for i := fromPosition; i >= toPosition; i-- { 122 if isUTF8StartByte(content[i]) { 123 return i, nil 124 } 125 } 126 // After 4 bytes, if we cannot find, it means the string is not in UTF-8. 127 return -1, errors.New("byte slice may not be in UTF-8 format") 128 } 129 130 // Return true if the byte mark the start of a UTF-8 character. 131 // A Unicode character maybe encoded using from 1-4 bytes. 132 // See https://en.wikipedia.org/wiki/UTF-8 133 func isUTF8StartByte(b byte) bool { 134 // This is an ASCII character, which only takes 1 byte. 135 if b <= 0x7F { 136 return true 137 } 138 // Multi-byte character patterns, starts with 110xxxxx, 1110xxxx, or 11110xxx. 139 return b&0xC0 == 0xC0 140 }