github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/sort_reduce.cu (about)

     1  //  Copyright (c) 2017-2018 Uber Technologies, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  #include <thrust/iterator/zip_iterator.h>
    16  #include <thrust/iterator/discard_iterator.h>
    17  #include <thrust/gather.h>
    18  #include <thrust/transform.h>
    19  #include <cstring>
    20  #include <algorithm>
    21  #include <exception>
    22  #include "query/algorithm.hpp"
    23  #include "query/iterator.hpp"
    24  #include "query/time_series_aggregate.h"
    25  #include "memory.hpp"
    26  
    27  CGoCallResHandle Sort(DimensionColumnVector keys,
    28                        int length,
    29                        void *cudaStream,
    30                        int device) {
    31    CGoCallResHandle resHandle = {nullptr, nullptr};
    32    try {
    33  #ifdef RUN_ON_DEVICE
    34      cudaSetDevice(device);
    35  #endif
    36      ares::sort(keys, length,
    37                 reinterpret_cast<cudaStream_t>(cudaStream));
    38      CheckCUDAError("Sort");
    39    }
    40    catch (std::exception &e) {
    41      std::cerr << "Exception happend when doing Sort:" << e.what()
    42                << std::endl;
    43      resHandle.pStrErr = strdup(e.what());
    44    }
    45    return resHandle;
    46  }
    47  
    48  CGoCallResHandle Reduce(DimensionColumnVector inputKeys,
    49                          uint8_t *inputValues,
    50                          DimensionColumnVector outputKeys,
    51                          uint8_t *outputValues,
    52                          int valueBytes,
    53                          int length,
    54                          AggregateFunction aggFunc,
    55                          void *stream,
    56                          int device) {
    57    CGoCallResHandle resHandle = {nullptr, nullptr};
    58    try {
    59  #ifdef RUN_ON_DEVICE
    60      cudaSetDevice(device);
    61  #endif
    62      cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
    63      resHandle.res = reinterpret_cast<void *>(ares::reduce(inputKeys,
    64                                                            inputValues,
    65                                                            outputKeys,
    66                                                            outputValues,
    67                                                            valueBytes,
    68                                                            length,
    69                                                            aggFunc,
    70                                                            cudaStream));
    71      CheckCUDAError("Reduce");
    72      return resHandle;
    73    }
    74    catch (std::exception &e) {
    75      std::cerr << "Exception happend when doing Reduce:" << e.what()
    76                << std::endl;
    77      resHandle.pStrErr = strdup(e.what());
    78    }
    79    return resHandle;
    80  }
    81  
    82  
    83  CGoCallResHandle Expand(DimensionColumnVector inputKeys,
    84                          DimensionColumnVector outputKeys,
    85                          uint32_t *baseCounts,
    86                          uint32_t *indexVector,
    87                          int indexVectorLen,
    88                          int outputOccupiedLen,
    89                          void *stream,
    90                          int device) {
    91    CGoCallResHandle resHandle = {nullptr, nullptr};
    92  
    93    try {
    94      SET_DEVICE(device);
    95      cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
    96      resHandle.res = reinterpret_cast<void *>(ares::expand(inputKeys,
    97                                                            outputKeys,
    98                                                            baseCounts,
    99                                                            indexVector,
   100                                                            indexVectorLen,
   101                                                            outputOccupiedLen,
   102                                                            cudaStream));
   103      CheckCUDAError("Expand");
   104      return resHandle;
   105    }
   106    catch (std::exception &e) {
   107      std::cerr << "Exception happend when doing Expand:" << e.what()
   108                << std::endl;
   109      resHandle.pStrErr = strdup(e.what());
   110    }
   111  
   112    return resHandle;
   113  }
   114  
   115  namespace ares {
   116  
   117  // sort based on DimensionColumnVector
   118  void sort(DimensionColumnVector keys,
   119            int length,
   120            cudaStream_t cudaStream) {
   121    DimensionHashIterator hashIter(keys.DimValues,
   122                                   keys.IndexVector,
   123                                   keys.NumDimsPerDimWidth,
   124                                   keys.VectorCapacity);
   125    thrust::copy(GET_EXECUTION_POLICY(cudaStream),
   126                 hashIter,
   127                 hashIter + length,
   128                 keys.HashValues);
   129    thrust::stable_sort_by_key(GET_EXECUTION_POLICY(cudaStream),
   130                               keys.HashValues,
   131                               keys.HashValues + length,
   132                               keys.IndexVector);
   133  }
   134  
   135  template<typename Value, typename AggFunc>
   136  int reduceInternal(uint64_t *inputHashValues, uint32_t *inputIndexVector,
   137                     uint8_t *inputValues, uint64_t *outputHashValues,
   138                     uint32_t *outputIndexVector, uint8_t *outputValues,
   139                     int length, cudaStream_t cudaStream) {
   140    thrust::equal_to<uint64_t> binaryPred;
   141    AggFunc aggFunc;
   142    ReduceByHashFunctor<AggFunc> reduceFunc(aggFunc);
   143    auto zippedInputIter = thrust::make_zip_iterator(thrust::make_tuple(
   144        inputIndexVector,
   145        thrust::make_permutation_iterator(reinterpret_cast<Value *>(inputValues),
   146                                          inputIndexVector)));
   147    auto zippedOutputIter = thrust::make_zip_iterator(thrust::make_tuple(
   148        outputIndexVector, reinterpret_cast<Value *>(outputValues)));
   149    auto resEnd = thrust::reduce_by_key(GET_EXECUTION_POLICY(cudaStream),
   150                                        inputHashValues,
   151                                        inputHashValues + length,
   152                                        zippedInputIter,
   153                                        thrust::make_discard_iterator(),
   154                                        zippedOutputIter,
   155                                        binaryPred,
   156                                        reduceFunc);
   157    return thrust::get<1>(resEnd) - zippedOutputIter;
   158  }
   159  
   160  struct rolling_avg {
   161    typedef uint64_t first_argument_type;
   162    typedef uint64_t second_argument_type;
   163    typedef uint64_t result_type;
   164  
   165    __host__  __device__ uint64_t operator()(
   166        uint64_t lhs, uint64_t rhs) const {
   167      uint32_t lCount = lhs >> 32;
   168      uint32_t rCount = rhs >> 32;
   169      uint32_t totalCount = lCount + rCount;
   170      if (totalCount == 0) {
   171        return 0;
   172      }
   173  
   174      uint64_t res = 0;
   175      *(reinterpret_cast<uint32_t *>(&res) + 1) = totalCount;
   176      // do division first to avoid overflow.
   177      *reinterpret_cast<float_t*>(&res) =
   178          *reinterpret_cast<float_t*>(&lhs) / totalCount * lCount +
   179          *reinterpret_cast<float_t*>(&rhs) / totalCount * rCount;
   180      return res;
   181    }
   182  };
   183  
   184  int bindValueAndAggFunc(uint64_t *inputHashValues,
   185                          uint32_t *inputIndexVector,
   186                          uint8_t *inputValues,
   187                          uint64_t *outputHashValues,
   188                          uint32_t *outputIndexVector,
   189                          uint8_t *outputValues,
   190                          int valueBytes,
   191                          int length,
   192                          AggregateFunction aggFunc,
   193                          cudaStream_t cudaStream) {
   194    switch (aggFunc) {
   195      #define REDUCE_INTERNAL(ValueType, AggFunc) \
   196        return reduceInternal< ValueType, AggFunc >( \
   197              inputHashValues, \
   198              inputIndexVector, \
   199              inputValues, \
   200              outputHashValues, \
   201              outputIndexVector, \
   202              outputValues, \
   203              length, \
   204              cudaStream);
   205  
   206      case AGGR_SUM_UNSIGNED:
   207        if (valueBytes == 4) {
   208          REDUCE_INTERNAL(uint32_t, thrust::plus<uint32_t>)
   209        } else {
   210          REDUCE_INTERNAL(uint64_t, thrust::plus<uint64_t>)
   211        }
   212      case AGGR_SUM_SIGNED:
   213        if (valueBytes == 4) {
   214          REDUCE_INTERNAL(int32_t, thrust::plus<int32_t>)
   215        } else {
   216          REDUCE_INTERNAL(int64_t, thrust::plus<int64_t>)
   217        }
   218      case AGGR_SUM_FLOAT:
   219        if (valueBytes == 4) {
   220          REDUCE_INTERNAL(float_t, thrust::plus<float_t>)
   221        } else {
   222          REDUCE_INTERNAL(double_t, thrust::plus<double_t>)
   223        }
   224      case AGGR_MIN_UNSIGNED:
   225        REDUCE_INTERNAL(uint32_t, thrust::minimum<uint32_t>)
   226      case AGGR_MIN_SIGNED:
   227        REDUCE_INTERNAL(int32_t, thrust::minimum<int32_t>)
   228      case AGGR_MIN_FLOAT:
   229        REDUCE_INTERNAL(float_t, thrust::minimum<float_t>)
   230      case AGGR_MAX_UNSIGNED:
   231        REDUCE_INTERNAL(uint32_t, thrust::maximum<uint32_t>)
   232      case AGGR_MAX_SIGNED:
   233        REDUCE_INTERNAL(int32_t, thrust::maximum<int32_t>)
   234      case AGGR_MAX_FLOAT:
   235        REDUCE_INTERNAL(float_t, thrust::maximum<float_t>)
   236      case AGGR_AVG_FLOAT:
   237        REDUCE_INTERNAL(uint64_t, rolling_avg)
   238      default:
   239        throw std::invalid_argument("Unsupported aggregation function type");
   240    }
   241  }
   242  
   243  int reduce(DimensionColumnVector inputKeys, uint8_t *inputValues,
   244             DimensionColumnVector outputKeys, uint8_t *outputValues,
   245             int valueBytes, int length, AggregateFunction aggFunc,
   246             cudaStream_t cudaStream) {
   247    int outputLength = bindValueAndAggFunc(
   248        inputKeys.HashValues,
   249        inputKeys.IndexVector,
   250        inputValues,
   251        outputKeys.HashValues,
   252        outputKeys.IndexVector,
   253        outputValues,
   254        valueBytes,
   255        length,
   256        aggFunc,
   257        cudaStream);
   258    DimensionColumnPermutateIterator iterIn(
   259        inputKeys.DimValues, outputKeys.IndexVector, inputKeys.VectorCapacity,
   260        outputLength, inputKeys.NumDimsPerDimWidth);
   261    DimensionColumnOutputIterator iterOut(outputKeys.DimValues,
   262                                          inputKeys.VectorCapacity, outputLength,
   263                                          inputKeys.NumDimsPerDimWidth, 0);
   264  
   265    int numDims = 0;
   266    for (int i = 0; i < NUM_DIM_WIDTH; i++) {
   267      numDims += inputKeys.NumDimsPerDimWidth[i];
   268    }
   269    // copy dim values into output
   270    thrust::copy(GET_EXECUTION_POLICY(cudaStream),
   271        iterIn, iterIn + numDims * 2 * outputLength, iterOut);
   272    return outputLength;
   273  }
   274  
   275  
   276  int expand(DimensionColumnVector inputKeys,
   277             DimensionColumnVector outputKeys,
   278             uint32_t *baseCounts,
   279             uint32_t *indexVector,
   280             int indexVectorLen,
   281             int outputOccupiedLen,
   282             cudaStream_t cudaStream) {
   283    // create count interator from baseCount and indexVector
   284    IndexCountIterator countIter = IndexCountIterator(baseCounts, indexVector);
   285  
   286    // total item counts by adding counts together
   287    uint32_t totalCount = thrust::reduce(GET_EXECUTION_POLICY(cudaStream),
   288                                         countIter,
   289                                         countIter+indexVectorLen);
   290  
   291    // scan the counts to obtain output offsets for each input element
   292    ares::device_vector<uint32_t> offsets(indexVectorLen);
   293    thrust::exclusive_scan(GET_EXECUTION_POLICY(cudaStream),
   294                           countIter,
   295                           countIter+indexVectorLen,
   296                           offsets.begin());
   297  
   298    // scatter the nonzero counts into their corresponding output positions
   299    ares::device_vector<uint32_t> indices(totalCount);
   300    thrust::scatter_if(GET_EXECUTION_POLICY(cudaStream),
   301                       thrust::counting_iterator<uint32_t>(0),
   302                       thrust::counting_iterator<uint32_t>(indexVectorLen),
   303                       offsets.begin(),
   304                       countIter,
   305                       indices.begin());
   306  
   307    // compute max-scan over the indices, filling in the holes
   308    thrust::inclusive_scan(GET_EXECUTION_POLICY(cudaStream),
   309                           indices.begin(),
   310                           indices.end(),
   311                           indices.begin(),
   312                           thrust::maximum<uint32_t>());
   313  
   314    // get the raw pointer from device/host vector
   315    uint32_t * newIndexVector = thrust::raw_pointer_cast(&indices[0]);
   316  
   317    int outputLen = min(totalCount, outputKeys.VectorCapacity
   318                          - outputOccupiedLen);
   319    // start the real copy operation
   320    DimensionColumnPermutateIterator iterIn(
   321        inputKeys.DimValues, newIndexVector, inputKeys.VectorCapacity,
   322        outputLen, inputKeys.NumDimsPerDimWidth);
   323  
   324    DimensionColumnOutputIterator iterOut(outputKeys.DimValues,
   325                                          outputKeys.VectorCapacity, outputLen,
   326                                          inputKeys.NumDimsPerDimWidth,
   327                                          outputOccupiedLen);
   328  
   329    int numDims = 0;
   330    for (int i = 0; i < NUM_DIM_WIDTH; i++) {
   331        numDims += inputKeys.NumDimsPerDimWidth[i];
   332    }
   333    // copy dim values into output
   334    thrust::copy(GET_EXECUTION_POLICY(cudaStream), iterIn,
   335                  iterIn + numDims * 2 * outputLen, iterOut);
   336    // return total count in the output dimensionVector
   337    return outputLen + outputOccupiedLen;
   338  }
   339  
   340  }  // namespace ares