github.com/aretext/aretext@v1.3.0/text/reader.go (about) 1 package text 2 3 import ( 4 "io" 5 "unicode/utf8" 6 7 textUtf8 "github.com/aretext/aretext/text/utf8" 8 ) 9 10 // Reader reads UTF-8 bytes from a text.Tree. 11 // It implements io.Reader. 12 // Copying the struct produces a new, independent reader. 13 // text.Tree is NOT thread-safe, so reading from a tree while modifying it is undefined behavior! 14 type Reader struct { 15 group *leafNodeGroup 16 nodeIdx uint64 17 textByteOffset uint64 18 } 19 20 // Read implements io.Reader#Read 21 func (r *Reader) Read(b []byte) (int, error) { 22 i := 0 23 for { 24 if i == len(b) { 25 return i, nil 26 } 27 28 if r.group.next == nil && r.nodeIdx == r.group.numNodes { 29 return i, io.EOF 30 } 31 32 node := &r.group.nodes[r.nodeIdx] 33 bytesWritten := copy(b[i:], node.textBytes[r.textByteOffset:node.numBytes]) 34 i += bytesWritten 35 r.advance(uint64(bytesWritten)) 36 } 37 } 38 39 func (r *Reader) advance(n uint64) { 40 // Assumes that there are at least n bytes in the current leaf. 41 r.textByteOffset += n 42 if r.textByteOffset == uint64(r.group.nodes[r.nodeIdx].numBytes) { 43 r.nodeIdx++ 44 r.textByteOffset = 0 45 } 46 if r.nodeIdx == r.group.numNodes && r.group.next != nil { 47 r.group = r.group.next 48 r.nodeIdx = 0 49 r.textByteOffset = 0 50 } 51 } 52 53 func (r *Reader) readNextByte() (byte, error) { 54 // Fast path: next byte is in current leaf. 55 if r.nodeIdx < r.group.numNodes && r.textByteOffset < uint64(r.group.nodes[r.nodeIdx].numBytes) { 56 b := r.group.nodes[r.nodeIdx].textBytes[r.textByteOffset] 57 r.advance(1) 58 return b, nil 59 } 60 61 // Slow path: fallback to default read. 62 var buf [1]byte 63 _, err := r.Read(buf[:]) 64 return buf[0], err 65 } 66 67 // ReadRune implements io.RuneReader#ReadRune 68 // If the next bytes in the reader are not valid UTF8, it returns ErrInvalidUtf8. 69 // If there are no more bytes to read, it returns io.EOF. 70 func (r *Reader) ReadRune() (rune, int, error) { 71 var buf [4]byte 72 73 // Read the next byte to determine the number of bytes in the next rune. 74 firstByte, err := r.readNextByte() 75 if err != nil { 76 return '\x00', 0, err 77 } 78 79 n := textUtf8.CharWidth[firstByte] 80 if n == 0 { 81 return '\x00', 0, ErrInvalidUtf8 82 } else if n == 1 { 83 // Fast path for ASCII. 84 return rune(firstByte), 1, nil 85 } 86 87 // Read remaining bytes in the rune. 88 buf[0] = firstByte 89 if _, err := r.Read(buf[1:n]); err != nil { 90 return '\x00', 0, ErrInvalidUtf8 91 } 92 93 // Decode the multi-byte rune. 94 rn, sz := utf8.DecodeRune(buf[:n]) 95 if sz != int(n) { 96 return '\x00', 0, ErrInvalidUtf8 97 } 98 return rn, sz, nil 99 } 100 101 // ReverseReader reads bytes in reverse order. 102 type ReverseReader struct { 103 Reader 104 } 105 106 // Read implements io.Reader#Read 107 func (r *ReverseReader) Read(b []byte) (int, error) { 108 i := 0 109 for { 110 if i == len(b) { 111 return i, nil 112 } 113 114 if r.group.prev == nil && r.nodeIdx == 0 && r.textByteOffset == 0 { 115 return i, io.EOF 116 } 117 118 node := &r.group.nodes[r.nodeIdx] 119 bytesWritten := 0 120 for i+bytesWritten < len(b) && r.textByteOffset > uint64(bytesWritten) { 121 b[i+bytesWritten] = node.textBytes[r.textByteOffset-1-uint64(bytesWritten)] 122 bytesWritten++ 123 } 124 r.textByteOffset -= uint64(bytesWritten) 125 i += bytesWritten 126 127 if r.textByteOffset > 0 { 128 continue 129 } 130 131 if r.nodeIdx > 0 { 132 r.nodeIdx-- 133 r.textByteOffset = uint64(r.group.nodes[r.nodeIdx].numBytes) 134 continue 135 } 136 137 if r.group.prev != nil { 138 r.group = r.group.prev 139 r.nodeIdx = r.group.numNodes - 1 140 r.textByteOffset = uint64(r.group.nodes[r.nodeIdx].numBytes) 141 } 142 } 143 } 144 145 // ReadRune implements io.RuneReader#ReadRune 146 func (r *ReverseReader) ReadRune() (rune, int, error) { 147 n, err := r.lookaheadToRuneStartByte() 148 if err != nil { 149 return '\x00', 0, err 150 } 151 152 var buf [4]byte 153 if _, err := r.Read(buf[:n]); err != nil { 154 return '\x00', 0, err 155 } 156 157 // Bytes were read in reverse order, so we need to swap them to decode as UTF-8. 158 if n == 2 { 159 buf[0], buf[1] = buf[1], buf[0] 160 } else if n == 3 { 161 buf[0], buf[2] = buf[2], buf[0] 162 } else if n == 4 { 163 buf[0], buf[3] = buf[3], buf[0] 164 buf[1], buf[2] = buf[2], buf[1] 165 } 166 167 rn, sz := utf8.DecodeRune(buf[:n]) 168 if sz != n { 169 return '\x00', 0, ErrInvalidUtf8 170 } 171 return rn, sz, nil 172 } 173 174 func (r *ReverseReader) lookaheadToRuneStartByte() (int, error) { 175 rcopy := *r // Copy the struct to produce a new, independent reader for lookahead. 176 var buf [4]byte // At most 4 bytes to the start of the next rune in valid UTF-8 encoding. 177 n, _ := rcopy.Read(buf[:]) 178 if n == 0 { 179 return 0, io.EOF 180 } 181 182 for i := 0; i < n; i++ { 183 if textUtf8.StartByteIndicator[buf[i]] > 0 { 184 // Found the start byte. 185 return i + 1, nil 186 } 187 } 188 189 // Could not find the start byte, so this is not a valid UTF-8 encoding. 190 return 0, ErrInvalidUtf8 191 }