github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/hll.cu (about) 1 // Copyright (c) 2017-2018 Uber Technologies, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include <thrust/iterator/discard_iterator.h> 16 #include <thrust/transform.h> 17 #include <algorithm> 18 #include "query/algorithm.hpp" 19 #include "query/memory.hpp" 20 21 CGoCallResHandle HyperLogLog(DimensionColumnVector prevDimOut, 22 DimensionColumnVector curDimOut, 23 uint32_t *prevValuesOut, 24 uint32_t *curValuesOut, 25 int prevResultSize, 26 int curBatchSize, 27 bool isLastBatch, 28 uint8_t **hllVectorPtr, 29 size_t *hllVectorSizePtr, 30 uint16_t **hllRegIDCountPerDimPtr, 31 void *stream, 32 int device) { 33 CGoCallResHandle resHandle = {nullptr, nullptr}; 34 try { 35 #ifdef RUN_ON_DEVICE 36 cudaSetDevice(device); 37 #endif 38 cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream); 39 resHandle.res = 40 reinterpret_cast<void *>(ares::hyperloglog(prevDimOut, 41 curDimOut, 42 prevValuesOut, 43 curValuesOut, 44 prevResultSize, 45 curBatchSize, 46 isLastBatch, 47 hllVectorPtr, 48 hllVectorSizePtr, 49 hllRegIDCountPerDimPtr, 50 cudaStream)); 51 CheckCUDAError("HyperLogLog"); 52 return resHandle; 53 } 54 catch (std::exception &e) { 55 std::cerr << "Exception happend when doing HyperLogLog:" << e.what() 56 << std::endl; 57 resHandle.pStrErr = strdup(e.what()); 58 } 59 return resHandle; 60 } 61 62 namespace ares { 63 64 // hll sort batch sort current batch using higher 48bits of the hash produced 65 // from dim values + 16bit reg_id from hll value, 66 // Note: we store the actual dim values of the current batch into the 67 // prevDimOut.DimValues vector, but we write the index vector, hash vector, hll 68 // value vector into the curDimOut, index vector will be initialized to range 69 // [prevResultSize, preResultSize + curBatchSize) 70 void sortCurrentBatch(uint8_t *dimValues, uint64_t *hashValues, 71 uint32_t *indexVector, 72 uint8_t numDimsPerDimWidth[NUM_DIM_WIDTH], 73 int vectorCapacity, uint32_t *curValuesOut, 74 int prevResultSize, int curBatchSize, 75 cudaStream_t cudaStream) { 76 DimensionHashIterator hashIter(dimValues, indexVector, numDimsPerDimWidth, 77 vectorCapacity); 78 auto zippedValueIter = thrust::make_zip_iterator( 79 thrust::make_tuple(indexVector, curValuesOut)); 80 thrust::transform( 81 GET_EXECUTION_POLICY(cudaStream), 82 hashIter, hashIter + curBatchSize, curValuesOut, hashValues, 83 HLLHashFunctor()); 84 thrust::stable_sort_by_key( 85 GET_EXECUTION_POLICY(cudaStream), 86 hashValues, hashValues + curBatchSize, 87 zippedValueIter); 88 } 89 90 // prepareHeadFlags prepares dimHeadFlags determines whether a element is the 91 // head of a dimension partition 92 template<typename DimHeadIter> 93 void prepareHeadFlags(uint64_t *hashVector, DimHeadIter dimHeadFlags, 94 int resultSize, cudaStream_t cudaStream) { 95 HLLDimNotEqualFunctor dimNotEqual; 96 // TODO(jians): if we see performance issue here, we can try to use custome 97 // kernel to utilize shared memory 98 thrust::transform( 99 GET_EXECUTION_POLICY(cudaStream), 100 hashVector, hashVector + resultSize - 1, hashVector + 1, dimHeadFlags + 1, 101 dimNotEqual); 102 } 103 104 // createAndCopyHLLVector creates the hll vector based on 105 // scanned count of reg_id counts per dimension and copy hll value 106 // reg_id < 4096, copy hll measure value in sparse format 107 // reg_id >= 4096, copy hll measure value in dense format 108 void createAndCopyHLLVector(uint64_t *hashVector, 109 uint8_t **hllVectorPtr, 110 size_t *hllVectorSizePtr, 111 uint16_t **hllRegIDCountPerDimPtr, 112 unsigned int *dimCumCount, 113 uint32_t *values, 114 int resultSizeWithRegIDs, 115 int resultSize, 116 cudaStream_t cudaStream) { 117 HLLRegIDHeadFlagIterator regIDHeadFlagIterator(hashVector); 118 // allocate dimRegIDCount vector 119 ares::deviceMalloc(reinterpret_cast<void **>(hllRegIDCountPerDimPtr), 120 (size_t)resultSize * sizeof(uint16_t)); 121 // produce dimRegIDCount vector 122 thrust::reduce_by_key( 123 GET_EXECUTION_POLICY(cudaStream), 124 dimCumCount, dimCumCount + resultSizeWithRegIDs, 125 regIDHeadFlagIterator, thrust::make_discard_iterator(), 126 *hllRegIDCountPerDimPtr); 127 128 // iterator for get byte count for each dim according to reg id count 129 auto hllDimByteCountIter = thrust::make_transform_iterator( 130 *hllRegIDCountPerDimPtr, HLLDimByteCountFunctor()); 131 132 auto hllDimRegIDCountIter = thrust::make_transform_iterator( 133 *hllRegIDCountPerDimPtr, CastFunctor<uint16_t, uint64_t>()); 134 // get dim reg id cumulative count (cumulative count of reg_id per each 135 // dimension value) 136 ares::device_vector<uint64_t> hllDimRegIDCumCount(resultSize + 1, 0); 137 ares::device_vector<uint64_t> hllVectorOffsets(resultSize + 1, 0); 138 ares::device_vector<uint64_t> hllRegIDCumCount(resultSizeWithRegIDs); 139 thrust::inclusive_scan( 140 GET_EXECUTION_POLICY(cudaStream), 141 hllDimRegIDCountIter, hllDimRegIDCountIter + resultSize, 142 hllDimRegIDCumCount.begin() + 1); 143 thrust::inclusive_scan( 144 GET_EXECUTION_POLICY(cudaStream), 145 hllDimByteCountIter, hllDimByteCountIter + resultSize, 146 hllVectorOffsets.begin() + 1); 147 thrust::inclusive_scan( 148 GET_EXECUTION_POLICY(cudaStream), 149 regIDHeadFlagIterator, 150 regIDHeadFlagIterator + resultSizeWithRegIDs, 151 hllRegIDCumCount.begin()); 152 *hllVectorSizePtr = hllVectorOffsets[resultSize]; 153 HLLValueOutputIterator hllValueOutputIter( 154 dimCumCount, values, thrust::raw_pointer_cast(hllRegIDCumCount.data()), 155 thrust::raw_pointer_cast(hllDimRegIDCumCount.data()), 156 thrust::raw_pointer_cast(hllVectorOffsets.data())); 157 158 // allocate dense vector 159 deviceMalloc(reinterpret_cast<void **>(hllVectorPtr), *hllVectorSizePtr); 160 deviceMemset(*hllVectorPtr, 0, *hllVectorSizePtr); 161 thrust::transform_if( 162 GET_EXECUTION_POLICY(cudaStream), 163 hllValueOutputIter, hllValueOutputIter + resultSizeWithRegIDs, 164 regIDHeadFlagIterator, thrust::make_discard_iterator(), 165 CopyHLLFunctor(*hllVectorPtr), thrust::identity<unsigned int>()); 166 } 167 168 // copyDim is the same as regular dimension copy in regular reduce operations 169 void copyDim(DimensionColumnVector inputKeys, 170 DimensionColumnVector outputKeys, int outputLength, 171 cudaStream_t cudaStream) { 172 DimensionColumnPermutateIterator iterIn( 173 inputKeys.DimValues, outputKeys.IndexVector, inputKeys.VectorCapacity, 174 outputLength, inputKeys.NumDimsPerDimWidth); 175 176 DimensionColumnOutputIterator iterOut(outputKeys.DimValues, 177 inputKeys.VectorCapacity, outputLength, 178 inputKeys.NumDimsPerDimWidth, 0); 179 180 int numDims = 0; 181 for (int i = 0; i < NUM_DIM_WIDTH; i++) { 182 numDims += inputKeys.NumDimsPerDimWidth[i]; 183 } 184 185 thrust::copy(GET_EXECUTION_POLICY(cudaStream), 186 iterIn, iterIn + numDims * 2 * outputLength, iterOut); 187 } 188 189 // merge merges previous batch results with current batch results 190 // based on hash value (asce) and hll value (desc) 191 void merge(uint64_t *inputHashValues, uint64_t *outputHashValues, 192 uint32_t *inputValues, uint32_t *outputValues, 193 uint32_t *inputIndexVector, uint32_t *outputIndexVector, 194 int prevResultSize, int curBatchResultSize, 195 cudaStream_t cudaStream) { 196 auto zippedPrevBatchMergeKey = thrust::make_zip_iterator( 197 thrust::make_tuple(inputHashValues, inputValues)); 198 auto zippedCurBatchMergeKey = thrust::make_zip_iterator(thrust::make_tuple( 199 inputHashValues + prevResultSize, inputValues + prevResultSize)); 200 auto zippedOutputKey = thrust::make_zip_iterator( 201 thrust::make_tuple(outputHashValues, outputValues)); 202 203 thrust::merge_by_key( 204 GET_EXECUTION_POLICY(cudaStream), 205 zippedPrevBatchMergeKey, zippedPrevBatchMergeKey + prevResultSize, 206 zippedCurBatchMergeKey, zippedCurBatchMergeKey + curBatchResultSize, 207 inputIndexVector, inputIndexVector + prevResultSize, zippedOutputKey, 208 outputIndexVector, HLLMergeComparator()); 209 } 210 211 int reduceCurrentBatch(uint64_t *inputHashValues, 212 uint32_t *inputIndexVector, 213 uint32_t *inputValues, 214 uint64_t *outputHashValues, 215 uint32_t *outputIndexVector, 216 uint32_t *outputValues, 217 int length, 218 cudaStream_t cudaStream) { 219 thrust::equal_to<uint64_t> binaryPred; 220 thrust::maximum<uint32_t> maxOp; 221 ReduceByHashFunctor<thrust::maximum<uint32_t> > reduceFunc(maxOp); 222 auto zippedInputIter = thrust::make_zip_iterator( 223 thrust::make_tuple(inputIndexVector, inputValues)); 224 auto zippedOutputIter = thrust::make_zip_iterator( 225 thrust::make_tuple(outputIndexVector, outputValues)); 226 auto resEnd = thrust::reduce_by_key( 227 GET_EXECUTION_POLICY(cudaStream), 228 inputHashValues, inputHashValues + length, zippedInputIter, 229 outputHashValues, zippedOutputIter, binaryPred, reduceFunc); 230 return thrust::get<0>(resEnd) - outputHashValues; 231 } 232 233 int makeHLLVector(uint64_t *hashValues, uint32_t *indexVector, 234 uint32_t *values, int resultSize, uint8_t **hllVectorPtr, 235 size_t *hllVectorSizePtr, uint16_t **hllRegIDCountPerDimPtr, 236 cudaStream_t cudaStream) { 237 ares::device_vector<unsigned int> dimHeadFlags(resultSize, 1); 238 prepareHeadFlags(hashValues, dimHeadFlags.begin(), resultSize, cudaStream); 239 240 int reducedResultSize = 241 thrust::remove_if( 242 GET_EXECUTION_POLICY(cudaStream), 243 indexVector, indexVector + resultSize, dimHeadFlags.begin(), 244 thrust::detail::equal_to_value<unsigned int>(0)) - 245 indexVector; 246 thrust::inclusive_scan( 247 GET_EXECUTION_POLICY(cudaStream), 248 dimHeadFlags.begin(), dimHeadFlags.end(), dimHeadFlags.begin()); 249 createAndCopyHLLVector(hashValues, hllVectorPtr, hllVectorSizePtr, 250 hllRegIDCountPerDimPtr, 251 thrust::raw_pointer_cast(dimHeadFlags.data()), values, 252 resultSize, reducedResultSize, cudaStream); 253 return reducedResultSize; 254 } 255 256 // the steps for hyperloglog: 257 // 1. sort current batch 258 // 2. reduce current batch 259 // 3. merge current batch result with result from previous batches 260 // 4. (last batch only) create dense hll vector 261 // 5. copy dimension values 262 int hyperloglog(DimensionColumnVector prevDimOut, 263 DimensionColumnVector curDimOut, uint32_t *prevValuesOut, 264 uint32_t *curValuesOut, int prevResultSize, int curBatchSize, 265 bool isLastBatch, uint8_t **hllVectorPtr, 266 size_t *hllVectorSizePtr, uint16_t **hllRegIDCountPerDimPtr, 267 cudaStream_t cudaStream) { 268 sortCurrentBatch(prevDimOut.DimValues, curDimOut.HashValues, 269 curDimOut.IndexVector, curDimOut.NumDimsPerDimWidth, 270 curDimOut.VectorCapacity, curValuesOut, prevResultSize, 271 curBatchSize, cudaStream); 272 int curResultSize = reduceCurrentBatch( 273 curDimOut.HashValues, curDimOut.IndexVector, curValuesOut, 274 prevDimOut.HashValues + prevResultSize, 275 prevDimOut.IndexVector + prevResultSize, prevValuesOut + prevResultSize, 276 curBatchSize, cudaStream); 277 278 merge(prevDimOut.HashValues, curDimOut.HashValues, prevValuesOut, 279 curValuesOut, prevDimOut.IndexVector, curDimOut.IndexVector, 280 prevResultSize, curResultSize, cudaStream); 281 282 int resSize = prevResultSize + curResultSize; 283 if (isLastBatch && resSize > 0) { 284 resSize = makeHLLVector( 285 curDimOut.HashValues, curDimOut.IndexVector, curValuesOut, resSize, 286 hllVectorPtr, hllVectorSizePtr, hllRegIDCountPerDimPtr, cudaStream); 287 } 288 copyDim(prevDimOut, curDimOut, resSize, cudaStream); 289 return resSize; 290 } 291 292 } // namespace ares