github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/search.go (about)

     1  package parquet
     2  
     3  // Search is like Find, but uses the default ordering of the given type. Search
     4  // and Find are scoped to a given ColumnChunk and find the pages within a
     5  // ColumnChunk which might contain the result.  See Find for more details.
     6  func Search(index ColumnIndex, value Value, typ Type) int {
     7  	return Find(index, value, CompareNullsLast(typ.Compare))
     8  }
     9  
    10  // Find uses the ColumnIndex passed as argument to find the page in a column
    11  // chunk (determined by the given ColumnIndex) that the given value is expected
    12  // to be found in.
    13  //
    14  // The function returns the index of the first page that might contain the
    15  // value. If the function determines that the value does not exist in the
    16  // index, NumPages is returned.
    17  //
    18  // If you want to search the entire parquet file, you must iterate over the
    19  // RowGroups and search each one individually, if there are multiple in the
    20  // file. If you call writer.Flush before closing the file, then you will have
    21  // multiple RowGroups to iterate over, otherwise Flush is called once on Close.
    22  //
    23  // The comparison function passed as last argument is used to determine the
    24  // relative order of values. This should generally be the Compare method of
    25  // the column type, but can sometimes be customized to modify how null values
    26  // are interpreted, for example:
    27  //
    28  //	pageIndex := parquet.Find(columnIndex, value,
    29  //		parquet.CompareNullsFirst(typ.Compare),
    30  //	)
    31  func Find(index ColumnIndex, value Value, cmp func(Value, Value) int) int {
    32  	switch {
    33  	case index.IsAscending():
    34  		return binarySearch(index, value, cmp)
    35  	default:
    36  		return linearSearch(index, value, cmp)
    37  	}
    38  }
    39  
    40  func binarySearch(index ColumnIndex, value Value, cmp func(Value, Value) int) int {
    41  	n := index.NumPages()
    42  	curIdx := 0
    43  	topIdx := n
    44  
    45  	// while there's at least one more page to check
    46  	for (topIdx - curIdx) > 1 {
    47  
    48  		// nextIdx is set to halfway between curIdx and topIdx
    49  		nextIdx := ((topIdx - curIdx) / 2) + curIdx
    50  
    51  		smallerThanMin := cmp(value, index.MinValue(nextIdx))
    52  
    53  		switch {
    54  		// search below pages[nextIdx]
    55  		case smallerThanMin < 0:
    56  			topIdx = nextIdx
    57  		// search pages[nextIdx] and above
    58  		case smallerThanMin > 0:
    59  			curIdx = nextIdx
    60  		case smallerThanMin == 0:
    61  			// this case is hit when winValue == value of nextIdx
    62  			// we must check below this index to find if there's
    63  			// another page before this.
    64  			// e.g. searching for first page 3 is in:
    65  			// [1,2,3]
    66  			// [3,4,5]
    67  			// [6,7,8]
    68  
    69  			// if the page proceeding this has a maxValue matching the value we're
    70  			// searching, continue the search.
    71  			// otherwise, we can return early
    72  			//
    73  			// cases covered by else block
    74  			// if cmp(value, index.MaxValue(nextIdx-1)) < 0: the value is only in this page
    75  			// if cmp(value, index.MaxValue(nextIdx-1)) > 0: we've got a sorting problem with overlapping pages
    76  			//
    77  			// bounds check not needed for nextIdx-1 because nextIdx is guaranteed to be at least curIdx + 1
    78  			// line 82 & 85 above
    79  			if cmp(value, index.MaxValue(nextIdx-1)) == 0 {
    80  				topIdx = nextIdx
    81  			} else {
    82  				return nextIdx
    83  			}
    84  		}
    85  	}
    86  
    87  	// last page check, if it wasn't explicitly found above
    88  	if curIdx < n {
    89  
    90  		// check pages[curIdx] for value
    91  		min := index.MinValue(curIdx)
    92  		max := index.MaxValue(curIdx)
    93  
    94  		// if value is not in pages[curIdx], then it's not in this columnChunk
    95  		if cmp(value, min) < 0 || cmp(value, max) > 0 {
    96  			curIdx = n
    97  		}
    98  	}
    99  
   100  	return curIdx
   101  }
   102  
   103  func linearSearch(index ColumnIndex, value Value, cmp func(Value, Value) int) int {
   104  	n := index.NumPages()
   105  
   106  	for i := 0; i < n; i++ {
   107  		min := index.MinValue(i)
   108  		max := index.MaxValue(i)
   109  
   110  		if cmp(min, value) <= 0 && cmp(value, max) <= 0 {
   111  			return i
   112  		}
   113  	}
   114  
   115  	return n
   116  }