github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/search.go (about) 1 package parquet 2 3 // Search is like Find, but uses the default ordering of the given type. Search 4 // and Find are scoped to a given ColumnChunk and find the pages within a 5 // ColumnChunk which might contain the result. See Find for more details. 6 func Search(index ColumnIndex, value Value, typ Type) int { 7 return Find(index, value, CompareNullsLast(typ.Compare)) 8 } 9 10 // Find uses the ColumnIndex passed as argument to find the page in a column 11 // chunk (determined by the given ColumnIndex) that the given value is expected 12 // to be found in. 13 // 14 // The function returns the index of the first page that might contain the 15 // value. If the function determines that the value does not exist in the 16 // index, NumPages is returned. 17 // 18 // If you want to search the entire parquet file, you must iterate over the 19 // RowGroups and search each one individually, if there are multiple in the 20 // file. If you call writer.Flush before closing the file, then you will have 21 // multiple RowGroups to iterate over, otherwise Flush is called once on Close. 22 // 23 // The comparison function passed as last argument is used to determine the 24 // relative order of values. This should generally be the Compare method of 25 // the column type, but can sometimes be customized to modify how null values 26 // are interpreted, for example: 27 // 28 // pageIndex := parquet.Find(columnIndex, value, 29 // parquet.CompareNullsFirst(typ.Compare), 30 // ) 31 func Find(index ColumnIndex, value Value, cmp func(Value, Value) int) int { 32 switch { 33 case index.IsAscending(): 34 return binarySearch(index, value, cmp) 35 default: 36 return linearSearch(index, value, cmp) 37 } 38 } 39 40 func binarySearch(index ColumnIndex, value Value, cmp func(Value, Value) int) int { 41 n := index.NumPages() 42 curIdx := 0 43 topIdx := n 44 45 // while there's at least one more page to check 46 for (topIdx - curIdx) > 1 { 47 48 // nextIdx is set to halfway between curIdx and topIdx 49 nextIdx := ((topIdx - curIdx) / 2) + curIdx 50 51 smallerThanMin := cmp(value, index.MinValue(nextIdx)) 52 53 switch { 54 // search below pages[nextIdx] 55 case smallerThanMin < 0: 56 topIdx = nextIdx 57 // search pages[nextIdx] and above 58 case smallerThanMin > 0: 59 curIdx = nextIdx 60 case smallerThanMin == 0: 61 // this case is hit when winValue == value of nextIdx 62 // we must check below this index to find if there's 63 // another page before this. 64 // e.g. searching for first page 3 is in: 65 // [1,2,3] 66 // [3,4,5] 67 // [6,7,8] 68 69 // if the page proceeding this has a maxValue matching the value we're 70 // searching, continue the search. 71 // otherwise, we can return early 72 // 73 // cases covered by else block 74 // if cmp(value, index.MaxValue(nextIdx-1)) < 0: the value is only in this page 75 // if cmp(value, index.MaxValue(nextIdx-1)) > 0: we've got a sorting problem with overlapping pages 76 // 77 // bounds check not needed for nextIdx-1 because nextIdx is guaranteed to be at least curIdx + 1 78 // line 82 & 85 above 79 if cmp(value, index.MaxValue(nextIdx-1)) == 0 { 80 topIdx = nextIdx 81 } else { 82 return nextIdx 83 } 84 } 85 } 86 87 // last page check, if it wasn't explicitly found above 88 if curIdx < n { 89 90 // check pages[curIdx] for value 91 min := index.MinValue(curIdx) 92 max := index.MaxValue(curIdx) 93 94 // if value is not in pages[curIdx], then it's not in this columnChunk 95 if cmp(value, min) < 0 || cmp(value, max) > 0 { 96 curIdx = n 97 } 98 } 99 100 return curIdx 101 } 102 103 func linearSearch(index ColumnIndex, value Value, cmp func(Value, Value) int) int { 104 n := index.NumPages() 105 106 for i := 0; i < n; i++ { 107 min := index.MinValue(i) 108 max := index.MaxValue(i) 109 110 if cmp(min, value) <= 0 && cmp(value, max) <= 0 { 111 return i 112 } 113 } 114 115 return n 116 }