github.com/vertgenlab/gonomics@v1.0.0/fileio/byteio.go (about)

     1  package fileio
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"compress/gzip"
     7  	"errors"
     8  	"fmt"
     9  	"io"
    10  	"log"
    11  	"os"
    12  	"strconv"
    13  	"strings"
    14  
    15  	"github.com/vertgenlab/gonomics/exception"
    16  )
    17  
    18  const (
    19  	defaultBufSize = 4096
    20  )
    21  
    22  // ByteReader implements the io.Reader interface by providing
    23  // the Read(b []byte) method. The struct contains an embedded *bufio.Reader
    24  // and a pointer to os.File for closure when reading is complete.
    25  // The primary advantage of ByteReader over EasyReader is that data
    26  // is read into a shared bytes.Buffer instead of allocating memory for a
    27  // string as is done with EasyReader.
    28  // ByteReader can be be used to efficiently parse files on the byte level
    29  // which may be significantly faster and require less memory than using
    30  // strings.Split on a string derived from EasyReader.
    31  // The drawback is that ByteReader is not as easy to use as EasyReader
    32  // and should generally be reserved for performance intensive tasks.
    33  type ByteReader struct {
    34  	*bufio.Reader
    35  	File         *os.File
    36  	internalGzip *gzip.Reader
    37  	line         []byte
    38  	Buffer       *bytes.Buffer
    39  }
    40  
    41  // Read reads data into p and is a method required to implement the io.Reader interface.
    42  // It returns the number of bytes read into p.
    43  func (reader *ByteReader) Read(b []byte) (n int, err error) {
    44  	return reader.Read(b)
    45  }
    46  
    47  // NewByteReader will process a given file and performs error handling if an error occurs.
    48  // ByteReader will process gzipped files accordingly by performing a check on the suffix
    49  // of the provided file.
    50  func NewByteReader(filename string) *ByteReader {
    51  	var err error
    52  	file := MustOpen(filename)
    53  	var answer ByteReader = ByteReader{
    54  		File:   file,
    55  		Buffer: &bytes.Buffer{},
    56  	}
    57  	switch true {
    58  	case strings.HasSuffix(filename, ".gz"):
    59  		answer.internalGzip, err = gzip.NewReader(file)
    60  		exception.PanicOnErr(err)
    61  		answer.Reader = bufio.NewReader(answer.internalGzip)
    62  	default:
    63  		answer.Reader = bufio.NewReader(file)
    64  	}
    65  	return &answer
    66  }
    67  
    68  // ReadLine will return a bytes.Buffer pointing to the internal slice of bytes. Provided this function is called within a loop,
    69  // the function will read one line at a time, and return bool to continue reading. Important to note the buffer return points to
    70  // the internal slice belonging to the reader, meaning the slice will be overridden if the data is not copied.
    71  func ReadLine(reader *ByteReader) (*bytes.Buffer, bool) {
    72  	var err error
    73  	reader.line, err = reader.ReadSlice('\n')
    74  	reader.Buffer.Reset()
    75  	if err == nil {
    76  		if reader.line[len(reader.line)-1] == '\n' {
    77  			return bytesToBuffer(reader), false
    78  		} else {
    79  			log.Panicf("Error: end of line did not end with an end of line character...\n")
    80  		}
    81  	} else {
    82  		if err == bufio.ErrBufferFull {
    83  			reader.line = readMore(reader)
    84  			return bytesToBuffer(reader), false
    85  		} else {
    86  			CatchErrThrowEOF(err)
    87  		}
    88  	}
    89  	return nil, true
    90  }
    91  
    92  // readMore is a private helper function to deal with very long lines to
    93  // avoid alocating too much memory upfront and only resize the size of the buffer
    94  // only when necessary.
    95  func readMore(reader *ByteReader) []byte {
    96  	_, err := reader.Buffer.Write(reader.line)
    97  	exception.PanicOnErr(err)
    98  	reader.line, err = reader.ReadSlice('\n')
    99  	if err == nil {
   100  		return reader.line
   101  	}
   102  	if err == bufio.ErrBufferFull {
   103  		_, err = reader.Buffer.Write(reader.line)
   104  		exception.PanicOnErr(err)
   105  		// recursive call to read next bytes until reaching end of line character
   106  		return readMore(reader)
   107  	}
   108  	exception.PanicOnErr(err)
   109  	return reader.line
   110  }
   111  
   112  // CatchErrThrowEOF will silently handles and throws the EOF error and will log and exit any other errors.
   113  func CatchErrThrowEOF(err error) {
   114  	if err == io.EOF {
   115  		return
   116  	} else {
   117  		exception.PanicOnErr(err)
   118  	}
   119  }
   120  
   121  // bytesToBuffer will parse []byte and return a pointer to the same underlying bytes.Buffer.
   122  func bytesToBuffer(reader *ByteReader) *bytes.Buffer {
   123  	var err error
   124  	if reader.line[len(reader.line)-2] == '\r' {
   125  		_, err = reader.Buffer.Write(reader.line[:len(reader.line)-2])
   126  	} else {
   127  		_, err = reader.Buffer.Write(reader.line[:len(reader.line)-1])
   128  	}
   129  	exception.PanicOnErr(err)
   130  	return reader.Buffer
   131  }
   132  
   133  // Close closes the File, rendering it unusable for I/O. On files that support SetDeadline,
   134  // any pending I/O operations will be canceled and return immediately with an error.
   135  // Close will return an error if it has already been called.
   136  func (br *ByteReader) Close() error {
   137  	var gzErr, fileErr error
   138  	if br.internalGzip != nil {
   139  		gzErr = br.internalGzip.Close()
   140  	}
   141  	if br.File != nil {
   142  		fileErr = br.File.Close()
   143  	} else {
   144  		return errors.New("no file found")
   145  	}
   146  
   147  	switch { // Handle error returns. Priority is gzErr > fileErr
   148  	case gzErr != nil:
   149  		return gzErr
   150  
   151  	case fileErr != nil:
   152  		log.Println("WARNING: attempted to close file, but file already closed")
   153  		return nil
   154  
   155  	default:
   156  		return nil
   157  	}
   158  }
   159  
   160  // StringToIntSlice will process a row of data separated by commas, convert the slice into a slice of type int.
   161  // PSL and genePred formats have a trailing comma we need to account for and the check at the beginning will adjust
   162  // the length of the working slice.
   163  func StringToIntSlice(line string) []int {
   164  	work := strings.Split(line, ",")
   165  	var sliceSize int = len(work)
   166  	if line[len(line)-1] == ',' {
   167  		sliceSize--
   168  	}
   169  	var answer []int = make([]int, sliceSize)
   170  	var err error
   171  	for i := 0; i < sliceSize; i++ {
   172  		answer[i], err = strconv.Atoi(work[i])
   173  		exception.PanicOnErr(err)
   174  	}
   175  	return answer
   176  }
   177  
   178  // IntListToString will process a slice of type int as an input and return a each value separated by a comma as a string.
   179  // Important Note: string will include a trailing comma to satisfy UCSC's anomalies.
   180  func IntSliceToString(nums []int) string {
   181  	ans := strings.Builder{}
   182  	ans.Grow(2 * len(nums))
   183  	for i := 0; i < len(nums); i++ {
   184  		ans.WriteString(fmt.Sprintf("%d", nums[i]))
   185  		ans.WriteByte(',')
   186  	}
   187  	return ans.String()
   188  }
   189  
   190  // IntToString a function that converts a number of type int and return a string.
   191  func IntToString(i int) string {
   192  	return fmt.Sprintf("%d", i)
   193  }