github.com/mondo192/jfrog-client-go@v1.0.0/utils/io/content/contentreader.go (about)

     1  package content
     2  
     3  import (
     4  	"bufio"
     5  	"encoding/json"
     6  	"errors"
     7  	"github.com/mondo192/jfrog-client-go/utils"
     8  	"github.com/mondo192/jfrog-client-go/utils/errorutils"
     9  	"github.com/mondo192/jfrog-client-go/utils/log"
    10  	"io"
    11  	"os"
    12  	"reflect"
    13  	"sort"
    14  	"sync"
    15  )
    16  
// Open and read JSON files, find the array key inside it and load its value into the memory in small chunks.
// Currently, 'ContentReader' only support extracting a single value for a given key (arrayKey), other keys are ignored.
// The value must be of type array.
// Each array value can be fetched using 'NextRecord' (thread-safe).
// This technique solves the limit of memory size which may be too small to fit large JSON.
type ContentReader struct {
	// filesPaths - source data file paths.
	filesPaths []string
	// arrayKey - Read the value of the specific object in JSON.
	arrayKey string
	// The objects from the source data file are being pushed into the data channel.
	// A background goroutine (started by the first NextRecord call) is the producer;
	// NextRecord is the consumer. The channel is buffered with utils.MaxBufferSize.
	dataChannel chan map[string]interface{}
	// Accumulates errors encountered while opening/decoding the source files.
	errorsQueue *utils.ErrorsQueue
	// Guards the one-time start of the producing goroutine; replaced by Reset()
	// so the files can be streamed again.
	once *sync.Once
	// Number of elements in the array (cache). Populated as records are consumed;
	// Length() drains the reader once to fill it.
	length int
	// True when the reader was built with no source files (see NewEmptyContentReader).
	empty bool
}
    35  
    36  func NewContentReader(filePath string, arrayKey string) *ContentReader {
    37  	self := NewMultiSourceContentReader([]string{filePath}, arrayKey)
    38  	self.empty = filePath == ""
    39  	return self
    40  }
    41  
    42  func NewMultiSourceContentReader(filePaths []string, arrayKey string) *ContentReader {
    43  	self := ContentReader{}
    44  	self.filesPaths = filePaths
    45  	self.arrayKey = arrayKey
    46  	self.dataChannel = make(chan map[string]interface{}, utils.MaxBufferSize)
    47  	self.errorsQueue = utils.NewErrorsQueue(utils.MaxBufferSize)
    48  	self.once = new(sync.Once)
    49  	self.empty = len(filePaths) == 0
    50  	return &self
    51  }
    52  
    53  func NewEmptyContentReader(arrayKey string) *ContentReader {
    54  	self := NewContentReader("", arrayKey)
    55  	return self
    56  }
    57  
// IsEmpty reports whether the reader was created without any source data
// (empty file path or no file paths at all).
func (cr *ContentReader) IsEmpty() bool {
	return cr.empty
}
    61  
    62  // Each call to 'NextRecord()' will return a single element from the channel.
    63  // Only the first call invokes a goroutine to read data from the file and push it into the channel.
    64  // 'io.EOF' will be returned if no data is left.
    65  func (cr *ContentReader) NextRecord(recordOutput interface{}) error {
    66  	if cr.empty {
    67  		return errorutils.CheckErrorf("Empty")
    68  	}
    69  	cr.once.Do(func() {
    70  		go func() {
    71  			defer close(cr.dataChannel)
    72  			cr.length = 0
    73  			cr.run()
    74  		}()
    75  	})
    76  	record, ok := <-cr.dataChannel
    77  	if !ok {
    78  		return io.EOF
    79  	}
    80  	// Transform the data into a Go type
    81  	err := ConvertToStruct(record, &recordOutput)
    82  	if err != nil {
    83  		cr.errorsQueue.AddError(err)
    84  		return err
    85  	}
    86  	cr.length++
    87  	return err
    88  }
    89  
// Prepare the reader to read the file all over again (not thread-safe).
// Replaces the data channel and the sync.Once so the next NextRecord call
// restarts the producing goroutine from the beginning of the files.
func (cr *ContentReader) Reset() {
	cr.dataChannel = make(chan map[string]interface{}, utils.MaxBufferSize)
	cr.once = new(sync.Once)
}
    95  
    96  // Cleanup the reader data.
    97  func (cr *ContentReader) Close() error {
    98  	for _, filePath := range cr.filesPaths {
    99  		if filePath == "" {
   100  			continue
   101  		}
   102  		if err := errorutils.CheckError(os.Remove(filePath)); err != nil {
   103  			return errors.New("Failed to close reader: " + err.Error())
   104  		}
   105  	}
   106  	cr.filesPaths = nil
   107  	return nil
   108  }
   109  
// GetFilesPaths returns the reader's source data file paths.
func (cr *ContentReader) GetFilesPaths() []string {
	return cr.filesPaths
}
   113  
   114  // Number of element in the array.
   115  func (cr *ContentReader) Length() (int, error) {
   116  	if cr.empty {
   117  		return 0, nil
   118  	}
   119  	if cr.length == 0 {
   120  		for item := new(interface{}); cr.NextRecord(item) == nil; item = new(interface{}) {
   121  		}
   122  		cr.Reset()
   123  		if err := cr.GetError(); err != nil {
   124  			return 0, err
   125  		}
   126  	}
   127  	return cr.length, nil
   128  }
   129  
   130  // Open and read the files one by one. Push each array element into the channel.
   131  // The channel may block the thread, therefore should run async.
   132  func (cr *ContentReader) run() {
   133  	for _, filePath := range cr.filesPaths {
   134  		cr.readSingleFile(filePath)
   135  	}
   136  }
   137  
   138  func (cr *ContentReader) readSingleFile(filePath string) {
   139  	fd, err := os.Open(filePath)
   140  	if err != nil {
   141  		log.Error(err.Error())
   142  		cr.errorsQueue.AddError(errorutils.CheckError(err))
   143  		return
   144  	}
   145  	defer func() {
   146  		err = fd.Close()
   147  		if err != nil {
   148  			log.Error(err.Error())
   149  			cr.errorsQueue.AddError(errorutils.CheckError(err))
   150  		}
   151  	}()
   152  	br := bufio.NewReaderSize(fd, 65536)
   153  	dec := json.NewDecoder(br)
   154  	err = findDecoderTargetPosition(dec, cr.arrayKey, true)
   155  	if err != nil {
   156  		if err == io.EOF {
   157  			cr.errorsQueue.AddError(errorutils.CheckErrorf(cr.arrayKey + " not found"))
   158  			return
   159  		}
   160  		cr.errorsQueue.AddError(err)
   161  		log.Error(err.Error())
   162  		return
   163  	}
   164  	for dec.More() {
   165  		var ResultItem map[string]interface{}
   166  		err := dec.Decode(&ResultItem)
   167  		if err != nil {
   168  			log.Error(err)
   169  			cr.errorsQueue.AddError(errorutils.CheckError(err))
   170  			return
   171  		}
   172  		cr.dataChannel <- ResultItem
   173  	}
   174  }
   175  
// GetError returns an error collected while reading the source files, if any
// (delegates to the underlying errors queue).
func (cr *ContentReader) GetError() error {
	return cr.errorsQueue.GetError()
}
   179  
   180  // Search and set the decoder's position at the desired key in the JSON file.
   181  // If the desired key is not found, return io.EOF
   182  func findDecoderTargetPosition(dec *json.Decoder, target string, isArray bool) error {
   183  	for dec.More() {
   184  		// Token returns the next JSON token in the input stream.
   185  		t, err := dec.Token()
   186  		if err != nil {
   187  			return errorutils.CheckError(err)
   188  		}
   189  		if t == target {
   190  			if isArray {
   191  				// Skip '['
   192  				_, err = dec.Token()
   193  			}
   194  			return errorutils.CheckError(err)
   195  		}
   196  	}
   197  	return nil
   198  }
   199  
   200  func MergeReaders(arr []*ContentReader, arrayKey string) (contentReader *ContentReader, err error) {
   201  	cw, err := NewContentWriter(arrayKey, true, false)
   202  	if err != nil {
   203  		return nil, err
   204  	}
   205  	defer func() {
   206  		e := cw.Close()
   207  		if err == nil {
   208  			err = e
   209  		}
   210  	}()
   211  	for _, cr := range arr {
   212  		for item := new(interface{}); cr.NextRecord(item) == nil; item = new(interface{}) {
   213  			cw.Write(*item)
   214  		}
   215  		if err := cr.GetError(); err != nil {
   216  			return nil, err
   217  		}
   218  	}
   219  	contentReader = NewContentReader(cw.GetFilePath(), arrayKey)
   220  	return contentReader, nil
   221  }
   222  
   223  // Sort a content-reader in the required order (ascending or descending).
   224  // Performs a merge-sort on the reader, splitting the reader to multiple readers of size 'utils.MaxBufferSize'.
   225  // Sort each of the split readers, and merge them into a single sorted reader.
   226  // In case of multiple items with the same key - all the items will appear in the sorted reader, but their order is not guaranteed to be preserved.
   227  func SortContentReader(readerRecord SortableContentItem, reader *ContentReader, ascendingOrder bool) (*ContentReader, error) {
   228  	getSortKeyFunc := func(record interface{}) (string, error) {
   229  		// Get the expected record type from the reader.
   230  		recordType := reflect.ValueOf(readerRecord).Type()
   231  		recordItem := (reflect.New(recordType)).Interface()
   232  		err := ConvertToStruct(record, &recordItem)
   233  		if err != nil {
   234  			return "", err
   235  		}
   236  		contentItem, ok := recordItem.(SortableContentItem)
   237  		if !ok {
   238  			return "", errorutils.CheckErrorf("attempting to sort a content-reader with unsortable items")
   239  		}
   240  		return contentItem.GetSortKey(), nil
   241  	}
   242  	return SortContentReaderByCalculatedKey(reader, getSortKeyFunc, ascendingOrder)
   243  }
   244  
   245  type keyCalculationFunc func(interface{}) (string, error)
   246  
   247  type SortRecord struct {
   248  	Key    string      `json:"key,omitempty"`
   249  	Record interface{} `json:"record,omitempty"`
   250  }
   251  
   252  func (sr SortRecord) GetSortKey() string {
   253  	return sr.Key
   254  }
   255  
   256  // Sort a ContentReader, according to a key generated by getKeyFunc.
   257  // getKeyFunc gets an item from the reader and returns the key of the item.
   258  // Attention! In case of multiple items with the same key - only the first item in the original reader will appear in the sorted one! The other items will be removed.
   259  // Also pay attention that the order of the fields inside the objects might change.
   260  func SortContentReaderByCalculatedKey(reader *ContentReader, getKeyFunc keyCalculationFunc, ascendingOrder bool) (contentReader *ContentReader, err error) {
   261  	var sortedReaders []*ContentReader
   262  	defer func() {
   263  		for _, r := range sortedReaders {
   264  			e := r.Close()
   265  			if err == nil {
   266  				err = e
   267  			}
   268  		}
   269  	}()
   270  
   271  	// Split reader to multiple sorted readers of size 'utils.MaxBufferSize'.
   272  	sortedReaders, err = splitReaderToSortedBufferSizeReadersByCalculatedKey(reader, getKeyFunc, ascendingOrder)
   273  	if err != nil {
   274  		return nil, err
   275  	}
   276  
   277  	// Merge the sorted readers.
   278  	return mergeSortedReadersByCalculatedKey(sortedReaders, ascendingOrder)
   279  }
   280  
   281  // Split the reader to multiple readers of size 'utils.MaxBufferSize' to prevent memory overflow.
   282  // Sort each split-reader content according to the provided 'ascendingOrder'.
   283  func splitReaderToSortedBufferSizeReadersByCalculatedKey(reader *ContentReader, getKeyFunc keyCalculationFunc, ascendingOrder bool) ([]*ContentReader, error) {
   284  	var splitReaders []*ContentReader
   285  
   286  	// Split and sort.
   287  	keysToContentItems := make(map[string]SortableContentItem)
   288  	allKeys := make([]string, 0, utils.MaxBufferSize)
   289  	for newRecord := new(interface{}); reader.NextRecord(newRecord) == nil; newRecord = new(interface{}) {
   290  		sortKey, err := getKeyFunc(newRecord)
   291  		if err != nil {
   292  			return nil, err
   293  		}
   294  
   295  		if _, exist := keysToContentItems[sortKey]; !exist {
   296  			recordWrapper := &SortRecord{Key: sortKey, Record: newRecord}
   297  			keysToContentItems[sortKey] = recordWrapper
   298  			allKeys = append(allKeys, sortKey)
   299  			if len(allKeys) == utils.MaxBufferSize {
   300  				sortedFile, err := SortAndSaveBufferToFile(keysToContentItems, allKeys, ascendingOrder)
   301  				if err != nil {
   302  					return nil, err
   303  				}
   304  				splitReaders = append(splitReaders, sortedFile)
   305  				keysToContentItems = make(map[string]SortableContentItem)
   306  				allKeys = make([]string, 0, utils.MaxBufferSize)
   307  			}
   308  		}
   309  	}
   310  	if err := reader.GetError(); err != nil {
   311  		return nil, err
   312  	}
   313  	reader.Reset()
   314  	if len(allKeys) > 0 {
   315  		sortedFile, err := SortAndSaveBufferToFile(keysToContentItems, allKeys, ascendingOrder)
   316  		if err != nil {
   317  			return nil, err
   318  		}
   319  		splitReaders = append(splitReaders, sortedFile)
   320  	}
   321  
   322  	return splitReaders, nil
   323  }
   324  
// mergeSortedReadersByCalculatedKey performs a k-way merge of the given sorted
// readers (each holding SortRecord items) into a single sorted reader.
// Items whose key equals the current write-candidate's key are discarded, so
// only the first item per key survives the merge.
// The result writer's Close error, if any, is surfaced through err.
func mergeSortedReadersByCalculatedKey(sortedReaders []*ContentReader, ascendingOrder bool) (contentReader *ContentReader, err error) {
	if len(sortedReaders) == 0 {
		contentReader = NewEmptyContentReader(DefaultKey)
		return contentReader, nil
	}
	resultWriter, err := NewContentWriter(DefaultKey, true, false)
	if err != nil {
		return nil, err
	}
	defer func() {
		e := resultWriter.Close()
		if err == nil {
			err = e
		}
	}()
	// currentContentItem[i] holds the head (next unwritten record) of reader i.
	currentContentItem := make([]*SortRecord, len(sortedReaders))
	// Clone the slice so entries can be nil'ed out as readers are exhausted
	// without mutating the caller's slice.
	sortedFilesClone := make([]*ContentReader, len(sortedReaders))
	copy(sortedFilesClone, sortedReaders)

	for {
		var candidateToWrite *SortRecord
		smallestIndex := 0
		for i := 0; i < len(sortedFilesClone); i++ {
			// Refill reader i's head if it was consumed; any NextRecord error
			// (including io.EOF) marks the reader as exhausted.
			if currentContentItem[i] == nil && sortedFilesClone[i] != nil {
				record := new(SortRecord)
				if err := sortedFilesClone[i].NextRecord(record); nil != err {
					sortedFilesClone[i] = nil
					continue
				}
				currentContentItem[i] = record
			}

			var candidateKey, currentKey string
			if candidateToWrite != nil && currentContentItem[i] != nil {
				candidateKey = candidateToWrite.Key
				currentKey = currentContentItem[i].Key

				// If there are two items with the same key - the second one will be removed
				if candidateKey == currentKey {
					currentContentItem[i] = nil
				}
			}
			// Adopt head i as the new candidate if there is no candidate yet,
			// or if it should be written before the current candidate.
			if candidateToWrite == nil || (currentContentItem[i] != nil && compareStrings(candidateKey, currentKey, ascendingOrder)) {
				candidateToWrite = currentContentItem[i]
				smallestIndex = i
			}
		}
		// No candidate means every reader is exhausted.
		if candidateToWrite == nil {
			break
		}
		resultWriter.Write(candidateToWrite.Record)
		currentContentItem[smallestIndex] = nil
	}
	contentReader = NewContentReader(resultWriter.GetFilePath(), resultWriter.GetArrayKey())
	return contentReader, nil
}
   381  
// Merge a slice of sorted content-readers into a single sorted content-reader.
// readerRecord is used only as a type template: each record is decoded into a
// fresh value of its concrete type, which must implement SortableContentItem.
// Unlike mergeSortedReadersByCalculatedKey, duplicate keys are NOT removed here.
// The result writer's Close error, if any, is surfaced through err.
func MergeSortedReaders(readerRecord SortableContentItem, sortedReaders []*ContentReader, ascendingOrder bool) (contentReader *ContentReader, err error) {
	if len(sortedReaders) == 0 {
		return NewEmptyContentReader(DefaultKey), nil
	}
	resultWriter, err := NewContentWriter(DefaultKey, true, false)
	if err != nil {
		return nil, err
	}
	defer func() {
		e := resultWriter.Close()
		if err == nil {
			err = e
		}
	}()

	// Get the expected record type from the reader.
	value := reflect.ValueOf(readerRecord)
	valueType := value.Type()

	// currentContentItem[i] holds the head (next unwritten record) of reader i;
	// the clone lets us nil out exhausted readers without touching the input slice.
	currentContentItem := make([]*SortableContentItem, len(sortedReaders))
	sortedFilesClone := make([]*ContentReader, len(sortedReaders))
	copy(sortedFilesClone, sortedReaders)

	for {
		var candidateToWrite *SortableContentItem
		smallestIndex := 0
		for i := 0; i < len(sortedFilesClone); i++ {
			// Refill reader i's head if consumed; any NextRecord error
			// (including io.EOF) marks the reader as exhausted.
			if currentContentItem[i] == nil && sortedFilesClone[i] != nil {
				temp := (reflect.New(valueType)).Interface()
				if err := sortedFilesClone[i].NextRecord(temp); nil != err {
					sortedFilesClone[i] = nil
					continue
				}
				// Expect to receive 'SortableContentItem'.
				contentItem, ok := (temp).(SortableContentItem)
				if !ok {
					return nil, errorutils.CheckErrorf("Attempting to sort a content-reader with unsortable items.")
				}
				currentContentItem[i] = &contentItem
			}

			// Adopt head i as the new candidate if there is none yet, or if it
			// sorts before the current candidate.
			if candidateToWrite == nil || (currentContentItem[i] != nil && compareStrings((*candidateToWrite).GetSortKey(),
				(*currentContentItem[i]).GetSortKey(), ascendingOrder)) {
				candidateToWrite = currentContentItem[i]
				smallestIndex = i
			}
		}
		// No candidate means every reader is exhausted.
		if candidateToWrite == nil {
			break
		}
		resultWriter.Write(*candidateToWrite)
		currentContentItem[smallestIndex] = nil
	}
	contentReader = NewContentReader(resultWriter.GetFilePath(), resultWriter.GetArrayKey())
	return contentReader, nil
}
   439  
   440  func compareStrings(src, against string, ascendingOrder bool) bool {
   441  	if ascendingOrder {
   442  		return src > against
   443  	}
   444  	return src < against
   445  }
   446  
   447  func SortAndSaveBufferToFile(keysToContentItems map[string]SortableContentItem, allKeys []string, increasingOrder bool) (contentReader *ContentReader, err error) {
   448  	if len(allKeys) == 0 {
   449  		return nil, nil
   450  	}
   451  	writer, err := NewContentWriter(DefaultKey, true, false)
   452  	if err != nil {
   453  		return nil, err
   454  	}
   455  	defer func() {
   456  		e := writer.Close()
   457  		if err == nil {
   458  			err = e
   459  		}
   460  	}()
   461  	if increasingOrder {
   462  		sort.Strings(allKeys)
   463  	} else {
   464  		sort.Sort(sort.Reverse(sort.StringSlice(allKeys)))
   465  	}
   466  	for _, v := range allKeys {
   467  		writer.Write(keysToContentItems[v])
   468  	}
   469  	contentReader = NewContentReader(writer.GetFilePath(), writer.GetArrayKey())
   470  	return contentReader, nil
   471  }
   472  
   473  func ConvertToStruct(record, recordOutput interface{}) error {
   474  	data, err := json.Marshal(record)
   475  	if errorutils.CheckError(err) != nil {
   476  		return err
   477  	}
   478  	err = errorutils.CheckError(json.Unmarshal(data, recordOutput))
   479  	return err
   480  }