github.com/aretext/aretext@v1.3.0/text/reader.go (about)

     1  package text
     2  
     3  import (
     4  	"io"
     5  	"unicode/utf8"
     6  
     7  	textUtf8 "github.com/aretext/aretext/text/utf8"
     8  )
     9  
    10  // Reader reads UTF-8 bytes from a text.Tree.
    11  // It implements io.Reader.
    12  // Copying the struct produces a new, independent reader.
    13  // text.Tree is NOT thread-safe, so reading from a tree while modifying it is undefined behavior!
    14  type Reader struct {
    15  	group          *leafNodeGroup
    16  	nodeIdx        uint64
    17  	textByteOffset uint64
    18  }
    19  
    20  // Read implements io.Reader#Read
    21  func (r *Reader) Read(b []byte) (int, error) {
    22  	i := 0
    23  	for {
    24  		if i == len(b) {
    25  			return i, nil
    26  		}
    27  
    28  		if r.group.next == nil && r.nodeIdx == r.group.numNodes {
    29  			return i, io.EOF
    30  		}
    31  
    32  		node := &r.group.nodes[r.nodeIdx]
    33  		bytesWritten := copy(b[i:], node.textBytes[r.textByteOffset:node.numBytes])
    34  		i += bytesWritten
    35  		r.advance(uint64(bytesWritten))
    36  	}
    37  }
    38  
    39  func (r *Reader) advance(n uint64) {
    40  	// Assumes that there are at least n bytes in the current leaf.
    41  	r.textByteOffset += n
    42  	if r.textByteOffset == uint64(r.group.nodes[r.nodeIdx].numBytes) {
    43  		r.nodeIdx++
    44  		r.textByteOffset = 0
    45  	}
    46  	if r.nodeIdx == r.group.numNodes && r.group.next != nil {
    47  		r.group = r.group.next
    48  		r.nodeIdx = 0
    49  		r.textByteOffset = 0
    50  	}
    51  }
    52  
    53  func (r *Reader) readNextByte() (byte, error) {
    54  	// Fast path: next byte is in current leaf.
    55  	if r.nodeIdx < r.group.numNodes && r.textByteOffset < uint64(r.group.nodes[r.nodeIdx].numBytes) {
    56  		b := r.group.nodes[r.nodeIdx].textBytes[r.textByteOffset]
    57  		r.advance(1)
    58  		return b, nil
    59  	}
    60  
    61  	// Slow path: fallback to default read.
    62  	var buf [1]byte
    63  	_, err := r.Read(buf[:])
    64  	return buf[0], err
    65  }
    66  
    67  // ReadRune implements io.RuneReader#ReadRune
    68  // If the next bytes in the reader are not valid UTF8, it returns ErrInvalidUtf8.
    69  // If there are no more bytes to read, it returns io.EOF.
    70  func (r *Reader) ReadRune() (rune, int, error) {
    71  	var buf [4]byte
    72  
    73  	// Read the next byte to determine the number of bytes in the next rune.
    74  	firstByte, err := r.readNextByte()
    75  	if err != nil {
    76  		return '\x00', 0, err
    77  	}
    78  
    79  	n := textUtf8.CharWidth[firstByte]
    80  	if n == 0 {
    81  		return '\x00', 0, ErrInvalidUtf8
    82  	} else if n == 1 {
    83  		// Fast path for ASCII.
    84  		return rune(firstByte), 1, nil
    85  	}
    86  
    87  	// Read remaining bytes in the rune.
    88  	buf[0] = firstByte
    89  	if _, err := r.Read(buf[1:n]); err != nil {
    90  		return '\x00', 0, ErrInvalidUtf8
    91  	}
    92  
    93  	// Decode the multi-byte rune.
    94  	rn, sz := utf8.DecodeRune(buf[:n])
    95  	if sz != int(n) {
    96  		return '\x00', 0, ErrInvalidUtf8
    97  	}
    98  	return rn, sz, nil
    99  }
   100  
   101  // ReverseReader reads bytes in reverse order.
   102  type ReverseReader struct {
   103  	Reader
   104  }
   105  
   106  // Read implements io.Reader#Read
   107  func (r *ReverseReader) Read(b []byte) (int, error) {
   108  	i := 0
   109  	for {
   110  		if i == len(b) {
   111  			return i, nil
   112  		}
   113  
   114  		if r.group.prev == nil && r.nodeIdx == 0 && r.textByteOffset == 0 {
   115  			return i, io.EOF
   116  		}
   117  
   118  		node := &r.group.nodes[r.nodeIdx]
   119  		bytesWritten := 0
   120  		for i+bytesWritten < len(b) && r.textByteOffset > uint64(bytesWritten) {
   121  			b[i+bytesWritten] = node.textBytes[r.textByteOffset-1-uint64(bytesWritten)]
   122  			bytesWritten++
   123  		}
   124  		r.textByteOffset -= uint64(bytesWritten)
   125  		i += bytesWritten
   126  
   127  		if r.textByteOffset > 0 {
   128  			continue
   129  		}
   130  
   131  		if r.nodeIdx > 0 {
   132  			r.nodeIdx--
   133  			r.textByteOffset = uint64(r.group.nodes[r.nodeIdx].numBytes)
   134  			continue
   135  		}
   136  
   137  		if r.group.prev != nil {
   138  			r.group = r.group.prev
   139  			r.nodeIdx = r.group.numNodes - 1
   140  			r.textByteOffset = uint64(r.group.nodes[r.nodeIdx].numBytes)
   141  		}
   142  	}
   143  }
   144  
   145  // ReadRune implements io.RuneReader#ReadRune
   146  func (r *ReverseReader) ReadRune() (rune, int, error) {
   147  	n, err := r.lookaheadToRuneStartByte()
   148  	if err != nil {
   149  		return '\x00', 0, err
   150  	}
   151  
   152  	var buf [4]byte
   153  	if _, err := r.Read(buf[:n]); err != nil {
   154  		return '\x00', 0, err
   155  	}
   156  
   157  	// Bytes were read in reverse order, so we need to swap them to decode as UTF-8.
   158  	if n == 2 {
   159  		buf[0], buf[1] = buf[1], buf[0]
   160  	} else if n == 3 {
   161  		buf[0], buf[2] = buf[2], buf[0]
   162  	} else if n == 4 {
   163  		buf[0], buf[3] = buf[3], buf[0]
   164  		buf[1], buf[2] = buf[2], buf[1]
   165  	}
   166  
   167  	rn, sz := utf8.DecodeRune(buf[:n])
   168  	if sz != n {
   169  		return '\x00', 0, ErrInvalidUtf8
   170  	}
   171  	return rn, sz, nil
   172  }
   173  
   174  func (r *ReverseReader) lookaheadToRuneStartByte() (int, error) {
   175  	rcopy := *r     // Copy the struct to produce a new, independent reader for lookahead.
   176  	var buf [4]byte // At most 4 bytes to the start of the next rune in valid UTF-8 encoding.
   177  	n, _ := rcopy.Read(buf[:])
   178  	if n == 0 {
   179  		return 0, io.EOF
   180  	}
   181  
   182  	for i := 0; i < n; i++ {
   183  		if textUtf8.StartByteIndicator[buf[i]] > 0 {
   184  			// Found the start byte.
   185  			return i + 1, nil
   186  		}
   187  	}
   188  
   189  	// Could not find the start byte, so this is not a valid UTF-8 encoding.
   190  	return 0, ErrInvalidUtf8
   191  }