github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/sort_reduce.cu

// Copyright (c) 2017-2018 Uber Technologies, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/gather.h>
#include <thrust/transform.h>
#include <cstring>
#include <algorithm>
#include <exception>
#include "query/algorithm.hpp"
#include "query/iterator.hpp"
#include "query/time_series_aggregate.h"
#include "memory.hpp"

CGoCallResHandle Sort(DimensionColumnVector keys,
                      int length,
                      void *cudaStream,
                      int device) {
  CGoCallResHandle resHandle = {nullptr, nullptr};
  try {
#ifdef RUN_ON_DEVICE
    cudaSetDevice(device);
#endif
    ares::sort(keys, length,
               reinterpret_cast<cudaStream_t>(cudaStream));
    CheckCUDAError("Sort");
  }
  catch (std::exception &e) {
    std::cerr << "Exception happened when doing Sort:" << e.what()
              << std::endl;
    resHandle.pStrErr = strdup(e.what());
  }
  return resHandle;
}

CGoCallResHandle Reduce(DimensionColumnVector inputKeys,
                        uint8_t *inputValues,
                        DimensionColumnVector outputKeys,
                        uint8_t *outputValues,
                        int valueBytes,
                        int length,
                        AggregateFunction aggFunc,
                        void *stream,
                        int device) {
  CGoCallResHandle resHandle = {nullptr, nullptr};
  try {
#ifdef RUN_ON_DEVICE
    cudaSetDevice(device);
#endif
    cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
    resHandle.res = reinterpret_cast<void *>(ares::reduce(inputKeys,
                                                          inputValues,
                                                          outputKeys,
                                                          outputValues,
                                                          valueBytes,
                                                          length,
                                                          aggFunc,
                                                          cudaStream));
    CheckCUDAError("Reduce");
    return resHandle;
  }
  catch (std::exception &e) {
    std::cerr << "Exception happened when doing Reduce:" << e.what()
              << std::endl;
    resHandle.pStrErr = strdup(e.what());
  }
  return resHandle;
}


CGoCallResHandle Expand(DimensionColumnVector inputKeys,
                        DimensionColumnVector outputKeys,
                        uint32_t *baseCounts,
                        uint32_t *indexVector,
                        int indexVectorLen,
                        int outputOccupiedLen,
                        void *stream,
                        int device) {
  CGoCallResHandle resHandle = {nullptr, nullptr};

  try {
    SET_DEVICE(device);
    cudaStream_t cudaStream = reinterpret_cast<cudaStream_t>(stream);
    resHandle.res = reinterpret_cast<void *>(ares::expand(inputKeys,
                                                          outputKeys,
                                                          baseCounts,
                                                          indexVector,
                                                          indexVectorLen,
                                                          outputOccupiedLen,
                                                          cudaStream));
    CheckCUDAError("Expand");
    return resHandle;
  }
  catch (std::exception &e) {
    std::cerr << "Exception happened when doing Expand:" << e.what()
              << std::endl;
    resHandle.pStrErr = strdup(e.what());
  }

  return resHandle;
}

namespace ares {

// sort based on DimensionColumnVector
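// Each row's dimension values are hashed into a 64-bit key via
// DimensionHashIterator; the hashes are materialized into keys.HashValues and
// keys.IndexVector is then stable-sorted by hash, so that entries with equal
// hashes (equal dimension values, barring hash collisions) become adjacent.
// reduce() below relies on this ordering for its reduce_by_key pass.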
void sort(DimensionColumnVector keys,
          int length,
          cudaStream_t cudaStream) {
  DimensionHashIterator hashIter(keys.DimValues,
                                 keys.IndexVector,
                                 keys.NumDimsPerDimWidth,
                                 keys.VectorCapacity);
  thrust::copy(GET_EXECUTION_POLICY(cudaStream),
               hashIter,
               hashIter + length,
               keys.HashValues);
  thrust::stable_sort_by_key(GET_EXECUTION_POLICY(cudaStream),
                             keys.HashValues,
                             keys.HashValues + length,
                             keys.IndexVector);
}

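// reduceInternal assumes the input hash values are already sorted (see sort()
// above): thrust::reduce_by_key only merges runs of equal, adjacent keys. The
// zipped input iterator pairs each row's index with its measure value
// (gathered through a permutation iterator over inputIndexVector), the
// per-group hash keys are discarded, and the return value is the number of
// distinct groups, i.e. the length of the reduced output.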
template<typename Value, typename AggFunc>
int reduceInternal(uint64_t *inputHashValues, uint32_t *inputIndexVector,
                   uint8_t *inputValues, uint64_t *outputHashValues,
                   uint32_t *outputIndexVector, uint8_t *outputValues,
                   int length, cudaStream_t cudaStream) {
  thrust::equal_to<uint64_t> binaryPred;
  AggFunc aggFunc;
  ReduceByHashFunctor<AggFunc> reduceFunc(aggFunc);
  auto zippedInputIter = thrust::make_zip_iterator(thrust::make_tuple(
      inputIndexVector,
      thrust::make_permutation_iterator(reinterpret_cast<Value *>(inputValues),
                                        inputIndexVector)));
  auto zippedOutputIter = thrust::make_zip_iterator(thrust::make_tuple(
      outputIndexVector, reinterpret_cast<Value *>(outputValues)));
  auto resEnd = thrust::reduce_by_key(GET_EXECUTION_POLICY(cudaStream),
                                      inputHashValues,
                                      inputHashValues + length,
                                      zippedInputIter,
                                      thrust::make_discard_iterator(),
                                      zippedOutputIter,
                                      binaryPred,
                                      reduceFunc);
  return thrust::get<1>(resEnd) - zippedOutputIter;
}

// rolling_avg packs each accumulator into a single uint64_t: the count in the
// upper 32 bits and the running average (stored as a float) in the lower 32
// bits. Merging two accumulators produces the weighted mean of the two
// averages.
struct rolling_avg {
  typedef uint64_t first_argument_type;
  typedef uint64_t second_argument_type;
  typedef uint64_t result_type;

  __host__ __device__ uint64_t operator()(
      uint64_t lhs, uint64_t rhs) const {
    uint32_t lCount = lhs >> 32;
    uint32_t rCount = rhs >> 32;
    uint32_t totalCount = lCount + rCount;
    if (totalCount == 0) {
      return 0;
    }

    uint64_t res = 0;
    *(reinterpret_cast<uint32_t *>(&res) + 1) = totalCount;
    // do division first to avoid overflow.
    *reinterpret_cast<float_t *>(&res) =
        *reinterpret_cast<float_t *>(&lhs) / totalCount * lCount +
        *reinterpret_cast<float_t *>(&rhs) / totalCount * rCount;
    return res;
  }
};

int bindValueAndAggFunc(uint64_t *inputHashValues,
                        uint32_t *inputIndexVector,
                        uint8_t *inputValues,
                        uint64_t *outputHashValues,
                        uint32_t *outputIndexVector,
                        uint8_t *outputValues,
                        int valueBytes,
                        int length,
                        AggregateFunction aggFunc,
                        cudaStream_t cudaStream) {
  switch (aggFunc) {
  #define REDUCE_INTERNAL(ValueType, AggFunc) \
    return reduceInternal< ValueType, AggFunc >( \
      inputHashValues, \
      inputIndexVector, \
      inputValues, \
      outputHashValues, \
      outputIndexVector, \
      outputValues, \
      length, \
      cudaStream);

    case AGGR_SUM_UNSIGNED:
      if (valueBytes == 4) {
        REDUCE_INTERNAL(uint32_t, thrust::plus<uint32_t>)
      } else {
        REDUCE_INTERNAL(uint64_t, thrust::plus<uint64_t>)
      }
    case AGGR_SUM_SIGNED:
      if (valueBytes == 4) {
        REDUCE_INTERNAL(int32_t, thrust::plus<int32_t>)
      } else {
        REDUCE_INTERNAL(int64_t, thrust::plus<int64_t>)
      }
    case AGGR_SUM_FLOAT:
      if (valueBytes == 4) {
        REDUCE_INTERNAL(float_t, thrust::plus<float_t>)
      } else {
        REDUCE_INTERNAL(double_t, thrust::plus<double_t>)
      }
    case AGGR_MIN_UNSIGNED:
      REDUCE_INTERNAL(uint32_t, thrust::minimum<uint32_t>)
    case AGGR_MIN_SIGNED:
      REDUCE_INTERNAL(int32_t, thrust::minimum<int32_t>)
    case AGGR_MIN_FLOAT:
      REDUCE_INTERNAL(float_t, thrust::minimum<float_t>)
    case AGGR_MAX_UNSIGNED:
      REDUCE_INTERNAL(uint32_t, thrust::maximum<uint32_t>)
    case AGGR_MAX_SIGNED:
      REDUCE_INTERNAL(int32_t, thrust::maximum<int32_t>)
    case AGGR_MAX_FLOAT:
      REDUCE_INTERNAL(float_t, thrust::maximum<float_t>)
    case AGGR_AVG_FLOAT:
      REDUCE_INTERNAL(uint64_t, rolling_avg)
    default:
      throw std::invalid_argument("Unsupported aggregation function type");
  }
}

int reduce(DimensionColumnVector inputKeys, uint8_t *inputValues,
           DimensionColumnVector outputKeys, uint8_t *outputValues,
           int valueBytes, int length, AggregateFunction aggFunc,
           cudaStream_t cudaStream) {
  int outputLength = bindValueAndAggFunc(
      inputKeys.HashValues,
      inputKeys.IndexVector,
      inputValues,
      outputKeys.HashValues,
      outputKeys.IndexVector,
      outputValues,
      valueBytes,
      length,
      aggFunc,
      cudaStream);
  DimensionColumnPermutateIterator iterIn(
      inputKeys.DimValues, outputKeys.IndexVector, inputKeys.VectorCapacity,
      outputLength, inputKeys.NumDimsPerDimWidth);
  DimensionColumnOutputIterator iterOut(outputKeys.DimValues,
                                        inputKeys.VectorCapacity, outputLength,
                                        inputKeys.NumDimsPerDimWidth, 0);

  int numDims = 0;
  for (int i = 0; i < NUM_DIM_WIDTH; i++) {
    numDims += inputKeys.NumDimsPerDimWidth[i];
  }
  // copy dim values into output
  thrust::copy(GET_EXECUTION_POLICY(cudaStream),
               iterIn, iterIn + numDims * 2 * outputLength, iterOut);
  return outputLength;
}

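// expand() materializes one output row per base count using a standard Thrust
// expand pattern: an exclusive scan of the counts yields the output offset of
// each input position, scatter_if writes each position that has a nonzero
// count at its offset, and an inclusive max-scan fills the holes in between.
// A small worked example (counts as read through countIter):
//   counts          [2, 0, 3]
//   exclusive_scan  [0, 2, 2]        <- output offsets
//   scatter_if      [0, ., 2, ., .]  <- holes stay 0 in the fresh vector
//   max-scan        [0, 0, 2, 2, 2]  <- expanded index vector
// The expanded index vector is then used to gather dimension values into the
// output, capped by the remaining capacity of the output key vector.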
int expand(DimensionColumnVector inputKeys,
           DimensionColumnVector outputKeys,
           uint32_t *baseCounts,
           uint32_t *indexVector,
           int indexVectorLen,
           int outputOccupiedLen,
           cudaStream_t cudaStream) {
  // create the count iterator from baseCounts and indexVector
  IndexCountIterator countIter = IndexCountIterator(baseCounts, indexVector);

  // total item count, obtained by adding the counts together
  uint32_t totalCount =
      thrust::reduce(GET_EXECUTION_POLICY(cudaStream),
                     countIter,
                     countIter + indexVectorLen);

  // scan the counts to obtain output offsets for each input element
  ares::device_vector<uint32_t> offsets(indexVectorLen);
  thrust::exclusive_scan(GET_EXECUTION_POLICY(cudaStream),
                         countIter,
                         countIter + indexVectorLen,
                         offsets.begin());

  // scatter the nonzero counts into their corresponding output positions
  ares::device_vector<uint32_t> indices(totalCount);
  thrust::scatter_if(GET_EXECUTION_POLICY(cudaStream),
                     thrust::counting_iterator<uint32_t>(0),
                     thrust::counting_iterator<uint32_t>(indexVectorLen),
                     offsets.begin(),
                     countIter,
                     indices.begin());

  // compute max-scan over the indices, filling in the holes
  thrust::inclusive_scan(GET_EXECUTION_POLICY(cudaStream),
                         indices.begin(),
                         indices.end(),
                         indices.begin(),
                         thrust::maximum<uint32_t>());

  // get the raw pointer from the device/host vector
  uint32_t *newIndexVector = thrust::raw_pointer_cast(&indices[0]);

  int outputLen = min(totalCount, outputKeys.VectorCapacity
      - outputOccupiedLen);
  // start the real copy operation
  DimensionColumnPermutateIterator iterIn(
      inputKeys.DimValues, newIndexVector, inputKeys.VectorCapacity,
      outputLen, inputKeys.NumDimsPerDimWidth);

  DimensionColumnOutputIterator iterOut(outputKeys.DimValues,
                                        outputKeys.VectorCapacity, outputLen,
                                        inputKeys.NumDimsPerDimWidth,
                                        outputOccupiedLen);

  int numDims = 0;
  for (int i = 0; i < NUM_DIM_WIDTH; i++) {
    numDims += inputKeys.NumDimsPerDimWidth[i];
  }
  // copy dim values into output
  thrust::copy(GET_EXECUTION_POLICY(cudaStream), iterIn,
               iterIn + numDims * 2 * outputLen, iterOut);
  // return the total count in the output dimension vector
  return outputLen + outputOccupiedLen;
}

}  // namespace ares
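/*
 * Illustrative sketch of the sort-then-reduce scheme implemented above, using
 * plain Thrust containers instead of DimensionColumnVector. Names and data
 * below are hypothetical examples, not part of this API:
 *
 *   #include <cstdint>
 *   #include <thrust/device_vector.h>
 *   #include <thrust/sort.h>
 *   #include <thrust/reduce.h>
 *
 *   uint64_t hostHashes[] = {3, 3, 7, 7, 7};            // per-row dim hash
 *   float hostValues[] = {1.f, 2.f, 10.f, 20.f, 30.f};  // per-row measure
 *   thrust::device_vector<uint64_t> hashes(hostHashes, hostHashes + 5);
 *   thrust::device_vector<float> values(hostValues, hostValues + 5);
 *
 *   // group rows by hash, then aggregate each group with a sum
 *   thrust::sort_by_key(hashes.begin(), hashes.end(), values.begin());
 *   thrust::device_vector<uint64_t> outKeys(5);
 *   thrust::device_vector<float> outValues(5);
 *   auto resEnd = thrust::reduce_by_key(hashes.begin(), hashes.end(),
 *                                       values.begin(),
 *                                       outKeys.begin(), outValues.begin());
 *   int numGroups = resEnd.first - outKeys.begin();  // 2 groups: 3.f, 60.f
 */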