github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/hll.cu

//  Copyright (c) 2017-2018 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <thrust/iterator/discard_iterator.h>
#include <thrust/transform.h>
#include <algorithm>
#include "query/algorithm.hpp"
#include "query/memory.hpp"

CGoCallResHandle HyperLogLog(DimensionColumnVector prevDimOut,
                             DimensionColumnVector curDimOut,
                             uint32_t *prevValuesOut,
                             uint32_t *curValuesOut,
                             int prevResultSize,
                             int curBatchSize,
                             bool isLastBatch,
                             uint8_t **hllVectorPtr,
                             size_t *hllVectorSizePtr,
                             uint16_t **hllRegIDCountPerDimPtr,
                             void *stream,
                             int device) {
  CGoCallResHandle resHandle = {nullptr, nullptr};
  try {
#ifdef RUN_ON_DEVICE
    cudaSetDevice(device);
#endif
    cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
    resHandle.res =
        reinterpret_cast<void *>(ares::hyperloglog(prevDimOut,
                                                   curDimOut,
                                                   prevValuesOut,
                                                   curValuesOut,
                                                   prevResultSize,
                                                   curBatchSize,
                                                   isLastBatch,
                                                   hllVectorPtr,
                                                   hllVectorSizePtr,
                                                   hllRegIDCountPerDimPtr,
                                                   cudaStream));
    CheckCUDAError("HyperLogLog");
    return resHandle;
  }
  catch (std::exception &e) {
    std::cerr << "Exception happened when doing HyperLogLog: " << e.what()
              << std::endl;
    resHandle.pStrErr = strdup(e.what());
  }
  return resHandle;
}

namespace ares {

// sortCurrentBatch sorts the current batch using the higher 48 bits of the
// hash produced from the dim values plus the 16-bit reg_id from the hll value.
// Note: we store the actual dim values of the current batch into the
// prevDimOut.DimValues vector, but we write the index vector, hash vector, and
// hll value vector into curDimOut; the index vector is initialized to the
// range [prevResultSize, prevResultSize + curBatchSize).
void sortCurrentBatch(uint8_t *dimValues, uint64_t *hashValues,
                      uint32_t *indexVector,
                      uint8_t numDimsPerDimWidth[NUM_DIM_WIDTH],
                      int vectorCapacity, uint32_t *curValuesOut,
                      int prevResultSize, int curBatchSize,
                      cudaStream_t cudaStream) {
  DimensionHashIterator hashIter(dimValues, indexVector, numDimsPerDimWidth,
                                 vectorCapacity);
  auto zippedValueIter = thrust::make_zip_iterator(
      thrust::make_tuple(indexVector, curValuesOut));
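  // combine the dim-value hash with the reg_id carried in the hll value into
  // the 64-bit sort key (see the function comment above)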
  thrust::transform(
      GET_EXECUTION_POLICY(cudaStream),
      hashIter, hashIter + curBatchSize, curValuesOut, hashValues,
      HLLHashFunctor());
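  // stable sort by the combined key; the zipped (row index, hll value) pairs
  // are permuted along with the keys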
  thrust::stable_sort_by_key(
      GET_EXECUTION_POLICY(cudaStream),
      hashValues, hashValues + curBatchSize,
      zippedValueIter);
}

// prepareHeadFlags prepares dimHeadFlags, which determines whether an element
// is the head of a dimension partition.
template<typename DimHeadIter>
void prepareHeadFlags(uint64_t *hashVector, DimHeadIter dimHeadFlags,
                      int resultSize, cudaStream_t cudaStream) {
  HLLDimNotEqualFunctor dimNotEqual;
  // TODO(jians): if we see performance issues here, we can try to use a custom
  // kernel to utilize shared memory
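  // an element is flagged as a head when the dim part of its hash differs from
  // the previous element's; element 0 keeps the caller's initial flag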
  thrust::transform(
      GET_EXECUTION_POLICY(cudaStream),
      hashVector, hashVector + resultSize - 1, hashVector + 1, dimHeadFlags + 1,
      dimNotEqual);
}

// createAndCopyHLLVector creates the hll vector based on the scanned reg_id
// counts per dimension and copies the hll values:
// reg_id count < 4096: copy the hll measure value in sparse format
// reg_id count >= 4096: copy the hll measure value in dense format
void createAndCopyHLLVector(uint64_t *hashVector,
                            uint8_t **hllVectorPtr,
                            size_t *hllVectorSizePtr,
                            uint16_t **hllRegIDCountPerDimPtr,
                            unsigned int *dimCumCount,
                            uint32_t *values,
                            int resultSizeWithRegIDs,
                            int resultSize,
                            cudaStream_t cudaStream) {
  HLLRegIDHeadFlagIterator regIDHeadFlagIterator(hashVector);
  // allocate dimRegIDCount vector
  ares::deviceMalloc(reinterpret_cast<void **>(hllRegIDCountPerDimPtr),
                     (size_t)resultSize * sizeof(uint16_t));
  // produce dimRegIDCount vector
  thrust::reduce_by_key(
      GET_EXECUTION_POLICY(cudaStream),
      dimCumCount, dimCumCount + resultSizeWithRegIDs,
      regIDHeadFlagIterator, thrust::make_discard_iterator(),
      *hllRegIDCountPerDimPtr);

  // iterator to get the byte count for each dim according to its reg_id count
  auto hllDimByteCountIter = thrust::make_transform_iterator(
      *hllRegIDCountPerDimPtr, HLLDimByteCountFunctor());

  auto hllDimRegIDCountIter = thrust::make_transform_iterator(
      *hllRegIDCountPerDimPtr, CastFunctor<uint16_t, uint64_t>());
  // get the dim reg_id cumulative count (cumulative count of reg_ids per
  // dimension value)
  ares::device_vector<uint64_t> hllDimRegIDCumCount(resultSize + 1, 0);
  ares::device_vector<uint64_t> hllVectorOffsets(resultSize + 1, 0);
  ares::device_vector<uint64_t> hllRegIDCumCount(resultSizeWithRegIDs);
  thrust::inclusive_scan(
      GET_EXECUTION_POLICY(cudaStream),
      hllDimRegIDCountIter, hllDimRegIDCountIter + resultSize,
      hllDimRegIDCumCount.begin() + 1);
  thrust::inclusive_scan(
      GET_EXECUTION_POLICY(cudaStream),
      hllDimByteCountIter, hllDimByteCountIter + resultSize,
      hllVectorOffsets.begin() + 1);
  thrust::inclusive_scan(
      GET_EXECUTION_POLICY(cudaStream),
      regIDHeadFlagIterator,
      regIDHeadFlagIterator + resultSizeWithRegIDs,
      hllRegIDCumCount.begin());
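  // the total byte size of the hll vector is the last element of the scanned
  // per-dim byte counts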
  *hllVectorSizePtr = hllVectorOffsets[resultSize];
  HLLValueOutputIterator hllValueOutputIter(
      dimCumCount, values, thrust::raw_pointer_cast(hllRegIDCumCount.data()),
      thrust::raw_pointer_cast(hllDimRegIDCumCount.data()),
      thrust::raw_pointer_cast(hllVectorOffsets.data()));

  // allocate dense vector
  deviceMalloc(reinterpret_cast<void **>(hllVectorPtr), *hllVectorSizePtr);
  deviceMemset(*hllVectorPtr, 0, *hllVectorSizePtr);
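  // copy hll values into the zeroed vector; CopyHLLFunctor writes at the
  // offsets computed above, and only where the reg_id head flag is set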
  thrust::transform_if(
      GET_EXECUTION_POLICY(cudaStream),
      hllValueOutputIter, hllValueOutputIter + resultSizeWithRegIDs,
      regIDHeadFlagIterator, thrust::make_discard_iterator(),
      CopyHLLFunctor(*hllVectorPtr), thrust::identity<unsigned int>());
}

// copyDim copies dimension values in the same way as the dimension copy in
// regular reduce operations
void copyDim(DimensionColumnVector inputKeys,
             DimensionColumnVector outputKeys, int outputLength,
             cudaStream_t cudaStream) {
  DimensionColumnPermutateIterator iterIn(
      inputKeys.DimValues, outputKeys.IndexVector, inputKeys.VectorCapacity,
      outputLength, inputKeys.NumDimsPerDimWidth);

  DimensionColumnOutputIterator iterOut(outputKeys.DimValues,
                                        inputKeys.VectorCapacity, outputLength,
                                        inputKeys.NumDimsPerDimWidth, 0);

  int numDims = 0;
  for (int i = 0; i < NUM_DIM_WIDTH; i++) {
    numDims += inputKeys.NumDimsPerDimWidth[i];
  }

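  // each dimension has both a value vector and a null vector, hence
  // numDims * 2 * outputLength elements to copy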
  thrust::copy(GET_EXECUTION_POLICY(cudaStream),
               iterIn, iterIn + numDims * 2 * outputLength, iterOut);
}

// merge merges previous batch results with current batch results
// based on hash value (ascending) and hll value (descending)
void merge(uint64_t *inputHashValues, uint64_t *outputHashValues,
           uint32_t *inputValues, uint32_t *outputValues,
           uint32_t *inputIndexVector, uint32_t *outputIndexVector,
           int prevResultSize, int curBatchResultSize,
           cudaStream_t cudaStream) {
  auto zippedPrevBatchMergeKey = thrust::make_zip_iterator(
      thrust::make_tuple(inputHashValues, inputValues));
  auto zippedCurBatchMergeKey = thrust::make_zip_iterator(thrust::make_tuple(
      inputHashValues + prevResultSize, inputValues + prevResultSize));
  auto zippedOutputKey = thrust::make_zip_iterator(
      thrust::make_tuple(outputHashValues, outputValues));

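  // both inputs are already sorted; HLLMergeComparator orders by hash
  // ascending and, for equal hashes, by hll value descending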
  thrust::merge_by_key(
      GET_EXECUTION_POLICY(cudaStream),
      zippedPrevBatchMergeKey, zippedPrevBatchMergeKey + prevResultSize,
      zippedCurBatchMergeKey, zippedCurBatchMergeKey + curBatchResultSize,
      inputIndexVector, inputIndexVector + prevResultSize, zippedOutputKey,
      outputIndexVector, HLLMergeComparator());
}

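// reduceCurrentBatch reduces rows of the sorted current batch that share the
// same hash, keeping the maximum hll value per hash, and returns the number
// of reduced rows.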
int reduceCurrentBatch(uint64_t *inputHashValues,
                       uint32_t *inputIndexVector,
                       uint32_t *inputValues,
                       uint64_t *outputHashValues,
                       uint32_t *outputIndexVector,
                       uint32_t *outputValues,
                       int length,
                       cudaStream_t cudaStream) {
  thrust::equal_to<uint64_t> binaryPred;
  thrust::maximum<uint32_t> maxOp;
  ReduceByHashFunctor<thrust::maximum<uint32_t> > reduceFunc(maxOp);
  auto zippedInputIter = thrust::make_zip_iterator(
      thrust::make_tuple(inputIndexVector, inputValues));
  auto zippedOutputIter = thrust::make_zip_iterator(
      thrust::make_tuple(outputIndexVector, outputValues));
  auto resEnd = thrust::reduce_by_key(
      GET_EXECUTION_POLICY(cudaStream),
      inputHashValues, inputHashValues + length, zippedInputIter,
      outputHashValues, zippedOutputIter, binaryPred, reduceFunc);
  return thrust::get<0>(resEnd) - outputHashValues;
}

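// makeHLLVector collapses the merged rows to one entry per distinct dimension
// value, builds the hll vector and the per-dim reg_id counts, and returns the
// number of distinct dimension values.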
int makeHLLVector(uint64_t *hashValues, uint32_t *indexVector,
                  uint32_t *values, int resultSize, uint8_t **hllVectorPtr,
                  size_t *hllVectorSizePtr, uint16_t **hllRegIDCountPerDimPtr,
                  cudaStream_t cudaStream) {
  ares::device_vector<unsigned int> dimHeadFlags(resultSize, 1);
  prepareHeadFlags(hashValues, dimHeadFlags.begin(), resultSize, cudaStream);

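  // drop index entries that are not dimension heads; what remains is one row
  // per distinct dimension-value combination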
  int reducedResultSize =
      thrust::remove_if(
          GET_EXECUTION_POLICY(cudaStream),
          indexVector, indexVector + resultSize, dimHeadFlags.begin(),
          thrust::detail::equal_to_value<unsigned int>(0)) -
      indexVector;
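  // turn the 0/1 head flags into a cumulative dim count, used as the per-row
  // dim group id inside createAndCopyHLLVector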
  thrust::inclusive_scan(
      GET_EXECUTION_POLICY(cudaStream),
      dimHeadFlags.begin(), dimHeadFlags.end(), dimHeadFlags.begin());
  createAndCopyHLLVector(hashValues, hllVectorPtr, hllVectorSizePtr,
                         hllRegIDCountPerDimPtr,
                         thrust::raw_pointer_cast(dimHeadFlags.data()), values,
                         resultSize, reducedResultSize, cudaStream);
  return reducedResultSize;
}

// the steps for hyperloglog:
// 1. sort current batch
// 2. reduce current batch
// 3. merge current batch result with result from previous batches
// 4. (last batch only) create dense hll vector
// 5. copy dimension values
int hyperloglog(DimensionColumnVector prevDimOut,
                DimensionColumnVector curDimOut, uint32_t *prevValuesOut,
                uint32_t *curValuesOut, int prevResultSize, int curBatchSize,
                bool isLastBatch, uint8_t **hllVectorPtr,
                size_t *hllVectorSizePtr, uint16_t **hllRegIDCountPerDimPtr,
                cudaStream_t cudaStream) {
  sortCurrentBatch(prevDimOut.DimValues, curDimOut.HashValues,
                   curDimOut.IndexVector, curDimOut.NumDimsPerDimWidth,
                   curDimOut.VectorCapacity, curValuesOut, prevResultSize,
                   curBatchSize, cudaStream);
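  // reduce the sorted current batch, writing the reduced rows into the tail
  // of the previous-batch buffers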
  int curResultSize = reduceCurrentBatch(
      curDimOut.HashValues, curDimOut.IndexVector, curValuesOut,
      prevDimOut.HashValues + prevResultSize,
      prevDimOut.IndexVector + prevResultSize, prevValuesOut + prevResultSize,
      curBatchSize, cudaStream);

  merge(prevDimOut.HashValues, curDimOut.HashValues, prevValuesOut,
        curValuesOut, prevDimOut.IndexVector, curDimOut.IndexVector,
        prevResultSize, curResultSize, cudaStream);

  int resSize = prevResultSize + curResultSize;
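  // on the last batch, collapse the merged rows to one per dimension value
  // and build the hll vector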
  if (isLastBatch && resSize > 0) {
    resSize = makeHLLVector(
        curDimOut.HashValues, curDimOut.IndexVector, curValuesOut, resSize,
        hllVectorPtr, hllVectorSizePtr, hllRegIDCountPerDimPtr, cudaStream);
  }
  copyDim(prevDimOut, curDimOut, resSize, cudaStream);
  return resSize;
}

}  // namespace ares