github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/iterator.hpp

github.com/rohankumardubey/aresdb@v0.0.2-0.20190517170215-e54e3ca06b9c/query/iterator.hpp (about)

     1  //  Copyright (c) 2017-2018 Uber Technologies, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  #ifndef QUERY_ITERATOR_HPP_
    16  #define QUERY_ITERATOR_HPP_
    17  
    18  #include <cuda_runtime.h>
    19  #include <thrust/detail/internal_functional.h>
    20  #include <thrust/iterator/constant_iterator.h>
    21  #include <thrust/iterator/iterator_adaptor.h>
    22  #include <thrust/iterator/permutation_iterator.h>
    23  #include <thrust/iterator/zip_iterator.h>
    24  #include <thrust/tuple.h>
    25  #include <thrust/iterator/detail/normal_iterator.h>
    26  #include <cfloat>
    27  #include <cmath>
    28  #include <tuple>
    29  #include "query/time_series_aggregate.h"
    30  #include "query/utils.hpp"
    31  
    32  namespace ares {
    33  
    34  // VectorPartyIterator iterates <value,validity> tuples against a column
    35  // (vector parties) under 3 modes:
    36  // 1: all values are present
    37  // 2: uncompressed
    38  // 3: compressed
    39  // based on the presences of 3 vectors (values, nulls, counts).
    40  // Only 4 type of values vector is supported here:
    41  // Uint32, Int32, Float32 and Bool. Bool vector is packed into 1 bit.
    42  // More details
    43  // can be found here:
    44  // https://github.com/uber/aresdb/wiki/VectorStore
    45  //
    46  // This struct aims at reduce overall struct size so that we will use as few
    47  // registers per thread as possible when launching kernels.
    48  // We made following efforts to reduce overall size:
    49  // 1. Put in a single struct to avoid too much padding.
    50  // 2. Passing values pointers only and passing nulls and counts as offset to
    51  // value pointers. When allocating device space, we will store values, nulls
    52  // and counts consecutively.
    53  // 3. Base counts and start count will share the same memory address.
    54  //
    55  // Mode 0 vp is modeled as constant vector so value pointer always presents.
    56  // Length is only used by compressed column.
    57  //
    58  // The mode of the column is judged by following logic:
    59  // If valuesOffset == 0, it means it’s a mode 1 vector.
    60  // Otherwise if nullsOffset == 0, it’s a mode 2 vector.
    61  // Otherwise it’s a mode 3 vector.
    62  template <typename Value>
    63  class VectorPartyIterator
    64      : public thrust::iterator_adaptor<
    65            VectorPartyIterator<Value>, uint32_t *, thrust::tuple<Value, bool>,
    66            thrust::use_default, thrust::use_default, thrust::tuple<Value, bool>,
    67            thrust::use_default> {
    68   public:
    69    friend class thrust::iterator_core_access;
    70  
    71    typedef thrust::iterator_adaptor<VectorPartyIterator<Value>,
    72                                     uint32_t *, thrust::tuple<Value, bool>,
    73                                     thrust::use_default, thrust::use_default,
    74                                     thrust::tuple<Value, bool>,
    75                                     thrust::use_default> super_t;
    76  
    77    __host__ __device__ VectorPartyIterator() {}
    78  
    79    // base is counts vector if mode 0. nulls vector if mode 2, values vector
    80    // if mode 0.
    81    __host__ __device__
    82    VectorPartyIterator(
    83        uint32_t *baseCounts,
    84        const uint32_t startCount,
    85        uint8_t *basePtr,
    86        uint32_t nullsOffset,
    87        uint32_t valuesOffset,
    88        uint32_t length,
    89        uint8_t stepInBytes,
    90        uint8_t nullBitOffset)
    91        : super_t(baseCounts != nullptr
    92                  ? baseCounts : reinterpret_cast<uint32_t *>(startCount)),
    93          basePtr(basePtr),
    94          nullsOffset(nullsOffset),
    95          valuesOffset(valuesOffset),
    96          length(length),
    97          currentPtrIndex(0),
    98          stepInBytes(stepInBytes),
    99          nullBitOffset(nullBitOffset) {
   100      mode = judge_mode();
   101      hasBaseCounts = baseCounts != nullptr;
   102    }
   103  
   104   private:
   105    uint8_t *basePtr;
   106    uint32_t nullsOffset;  // Offset of nullsPtr.
   107    uint32_t valuesOffset;  // Offset of basePtr.
   108    uint32_t currentPtrIndex;
   109    uint32_t length;
   110    uint8_t stepInBytes;
   111    uint8_t hasBaseCounts;  // Whether the base iterator is a base count vector or
   112    // a start count.
   113    uint8_t nullBitOffset;  // Starting bit offset in null vector.
   114    uint8_t mode;  // For convenience we store here as this does not add to the
   115    // final space of this iterator.
   116  
   117    __host__ __device__
   118    uint8_t judge_mode() const {
   119      if (valuesOffset == 0) {
   120        return 1;
   121      }
   122      if (nullsOffset == 0) {
   123        return 2;
   124      }
   125      return 3;
   126    }
   127  
   128    // offset must be greater than 0, otherwise it may lead to access to undesired
   129    // location.
   130    __host__ __device__
   131    bool get_null() const {
   132      if (mode >= 2) {
   133        return get_bool(basePtr + nullsOffset);
   134      }
   135      return true;
   136    }
   137  
   138    __host__ __device__
   139    bool get_bool(uint8_t *ptr) const {
   140      uint32_t nullIndex = currentPtrIndex + nullBitOffset;
   141      return ptr[nullIndex / 8] & (1 << nullIndex % 8);
   142    }
   143  
   144    // overloaded functions for different value types.
   145    __host__ __device__
   146    uint32_t get_value(uint32_t *values) const {
   147      uint8_t *ptr =
   148          reinterpret_cast<uint8_t *>(values) + currentPtrIndex * stepInBytes;
   149      switch (stepInBytes) {
   150        case 2:return *reinterpret_cast<uint16_t *>(ptr);
   151        case 4:return *reinterpret_cast<uint32_t *>(ptr);
   152        default:return *ptr;
   153      }
   154    }
   155  
   156    __host__ __device__
   157    int32_t get_value(int32_t *values) const {
   158      int8_t *ptr =
   159          reinterpret_cast<int8_t *>(values) + currentPtrIndex * stepInBytes;
   160      switch (stepInBytes) {
   161        case 2:return *reinterpret_cast<int16_t *>(ptr);
   162        case 4:return *reinterpret_cast<int32_t *>(ptr);
   163        default:return *ptr;
   164      }
   165    }
   166  
   167    __host__ __device__
   168    float_t get_value(float_t *values) const {
   169      return values[currentPtrIndex];
   170    }
   171  
   172    __host__ __device__
   173    GeoPointT get_value(GeoPointT *values) const {
   174      return values[currentPtrIndex];
   175    }
   176  
   177    __host__ __device__
   178    int64_t get_value(int64_t *values) const {
   179      return values[currentPtrIndex];
   180    }
   181  
   182    __host__ __device__
   183    UUIDT get_value(UUIDT *values) const {
   184      return values[currentPtrIndex];
   185    }
   186  
   187    __host__ __device__
   188    bool get_value(bool *values) const {
   189      return get_bool(reinterpret_cast<uint8_t *>(values));
   190    }
   191  
   192    __host__ __device__
   193    Value get_value() const {
   194      return get_value(reinterpret_cast<Value *>(
   195                           basePtr + valuesOffset));
   196    }
   197  
   198    // Should be called only if it's a mode 3 vector.
   199    __host__ __device__
   200    uint32_t get_count(uint32_t index) const {
   201      return reinterpret_cast<uint32_t *>(basePtr)[currentPtrIndex + index];
   202    }
   203  
   204    __host__ __device__
   205    thrust::tuple<Value, bool> dereference() const {
   206      return thrust::make_tuple(get_value(),
   207                                get_null());
   208    }
   209  
   210    // For compressed vp, moving strategy is like following steps:
   211    //  1. if the offset is less than next count, we will not move the iterator.
   212    //  2. if the offset is less than the warp size(32 or 64), we will do a
   213    // linear search.
   214    //  3. otherwise, we will do a binary search to go to the correct position.
   215    // It works best if n is larger than 0 since we will do quick check and
   216    // linear check, otherwise we will do binary search for negative n.
   217    __host__ __device__
   218    void advance(typename super_t::difference_type n) {
   219      if (hasBaseCounts) {
   220        this->base_reference() += n;
   221      } else {
   222        this->base_reference() = reinterpret_cast<uint32_t *>(
   223            reinterpret_cast<uintptr_t >(this->base_reference()) + n);
   224      }
   225  
   226      if (mode == 3) {
   227        uint32_t newIndex;
   228        if (hasBaseCounts) {
   229          newIndex = *this->base_reference();
   230        } else {
   231          newIndex = reinterpret_cast<uintptr_t>(this->base_reference());
   232        }
   233  
   234        // quick check to avoid binary search
   235        if (newIndex >= get_count(0)
   236            && newIndex < get_count(1)) {
   237          return;
   238        }
   239  
   240        // do linear search if n is less than or equal to wrap size.
   241        if (n <= WARP_SIZE && n >= 0) {
   242          for (int i = 1; i < length - currentPtrIndex; i++) {
   243            if (newIndex >= get_count(i)
   244                && newIndex < get_count(i + 1)) {
   245              currentPtrIndex += i;
   246              return;
   247            }
   248          }
   249        }
   250  
   251        uint32_t first = currentPtrIndex;
   252        uint32_t last = length;
   253  
   254        if (n < 0) {
   255          first = 0;
   256          last = currentPtrIndex;
   257        }
   258  
   259        // this algorithm computes upper_bound - 1 of [first, last),
   260        // which is the last element in the [first, last) that is
   261        // smaller or equal to the given index
   262        while (first < last) {
   263          uint32_t mid = first + (last - first) / 2;
   264          if (get_count(mid) > newIndex) {
   265            last = mid;
   266          } else {
   267            first = mid + 1;
   268          }
   269        }
   270        currentPtrIndex = first - 1;
   271      } else {
   272        // If this column is not compressed, this means the counting
   273        // iterator is simply thrust counting iterator since base count
   274        // column should be also uncompressed as well. We can safely
   275        // advance our zip iterator with n;
   276        currentPtrIndex += n;
   277      }
   278    }
   279  
   280    __host__ __device__
   281    void increment() {
   282      advance(1);
   283    }
   284  
   285    __host__ __device__
   286    void decrement() {
   287      advance(-1);
   288    }
   289  };
   290  
   291  template <>
   292  class VectorPartyIterator<GeoPointT>
   293      : public thrust::iterator_adaptor<
   294            VectorPartyIterator<GeoPointT>, uint32_t *,
   295            thrust::tuple<GeoPointT, bool>, thrust::use_default,
   296            thrust::use_default, thrust::tuple<GeoPointT, bool>,
   297            thrust::use_default> {
   298   public:
   299    friend class thrust::iterator_core_access;
   300  
   301    typedef thrust::iterator_adaptor<
   302        VectorPartyIterator<GeoPointT>, uint32_t *,
   303        thrust::tuple<GeoPointT, bool>, thrust::use_default, thrust::use_default,
   304        thrust::tuple<GeoPointT, bool>, thrust::use_default>
   305        super_t;
   306  
   307    __host__ __device__ VectorPartyIterator() {}
   308  
   309    __host__ __device__ VectorPartyIterator(
   310        uint32_t *baseCounts, const uint32_t startCount, uint8_t *basePtr,
   311        uint32_t nullsOffset, uint32_t valuesOffset, uint32_t length,
   312        uint8_t stepInBytes, uint8_t nullBitOffset)
   313        : super_t(reinterpret_cast<uint32_t *>(startCount)),
   314          basePtr(basePtr),
   315          valuesOffset(valuesOffset),
   316          currentPtrIndex(0),
   317          nullBitOffset(nullBitOffset) {
   318    }
   319  
   320   private:
   321    uint8_t *basePtr;
   322    uint32_t currentPtrIndex;
   323    uint32_t valuesOffset;
   324    uint8_t nullBitOffset;
   325  
   326    __host__ __device__ bool get_null() const {
   327      // mode 1
   328      if (valuesOffset == 0) {
   329        return true;
   330      }
   331      return get_bool(basePtr);
   332    }
   333  
   334    __host__ __device__ GeoPointT get_value() const {
   335      return reinterpret_cast<GeoPointT *>(basePtr+valuesOffset)[currentPtrIndex];
   336    }
   337  
   338    __host__ __device__ bool get_bool(uint8_t *ptr) const {
   339      uint32_t nullIndex = currentPtrIndex + nullBitOffset;
   340      return ptr[nullIndex / 8] & (1 << nullIndex % 8);
   341    }
   342  
   343    __host__ __device__ thrust::tuple<GeoPointT, bool> dereference() const {
   344      return thrust::make_tuple(get_value(), get_null());
   345    }
   346  
   347    __host__ __device__ void advance(typename super_t::difference_type n) {
   348      this->base_reference() = reinterpret_cast<uint32_t *>(
   349          reinterpret_cast<uintptr_t>(this->base_reference()) + n);
   350      currentPtrIndex += n;
   351    }
   352  
   353    __host__ __device__ void increment() { advance(1); }
   354  
   355    __host__ __device__ void decrement() { advance(-1); }
   356  };
   357  
   358  template<typename Value>
   359  using ColumnIterator =
   360  thrust::permutation_iterator<VectorPartyIterator<Value>, uint32_t *>;
   361  
   362  // Helper function for creating VectorPartyIterator without specifying
   363  // Value template argument.
   364  template <class Value>
   365  ColumnIterator<Value> make_column_iterator(
   366      uint32_t *indexVector, uint32_t *baseCounts, uint32_t startCount,
   367      uint8_t *basePtr, uint32_t nullsOffset, uint32_t valuesOffset,
   368      uint32_t length, uint8_t stepInBytes, uint8_t nullBitOffset) {
   369    return thrust::make_permutation_iterator(
   370        VectorPartyIterator<Value>(baseCounts, startCount, basePtr, nullsOffset,
   371                                   valuesOffset, length, stepInBytes,
   372                                   nullBitOffset),
   373        indexVector);
   374  }
   375  
   376  // SimpleIterator combines interators in 4 different cases:
   377  // 1. Constant value.
   378  // 2. Mode 0 column, equivalent to constant value.
   379  // 3. Scratch space.
   380  //
   381  // We will use a value pointer and null offset to represent values
   382  // and nulls. If values or nulls are missing, we will reuse the space
   383  // to store constant values.
   384  // Only 4 Value types are supported: Uint32, Int32, Float32 and Bool.
   385  // Each bool value take 1 byte.
   386  // We will use the first 4 bytes of the value pointer to store default value
   387  // and rest 4 bytes as the position of the iterator.
   388  template<typename Value>
   389  class SimpleIterator :
   390      public thrust::iterator_adaptor<SimpleIterator<Value>,
   391                                      Value *, thrust::tuple<Value, bool>,
   392                                      thrust::use_default, thrust::use_default,
   393                                      thrust::tuple<Value, bool>,
   394                                      thrust::use_default> {
   395   public:
   396    friend class thrust::iterator_core_access;
   397  
   398    typedef thrust::iterator_adaptor<SimpleIterator<Value>,
   399                                     Value *, thrust::tuple<Value, bool>,
   400                                     thrust::use_default, thrust::use_default,
   401                                     thrust::tuple<Value, bool>,
   402                                     thrust::use_default> super_t;
   403  
   404    __host__ __device__ SimpleIterator() {}
   405  
   406    SimpleIterator(
   407        Value *values,
   408        uint32_t nullsOffset,
   409        Value defaultValue,
   410        bool defaultNull
   411    )
   412        : super_t(values != nullptr
   413                  ? values : reinterpret_cast<Value *>(
   414                      *reinterpret_cast<uintptr_t *>(&defaultValue) << 32)),
   415          useDefault(values == nullptr) {
   416      if (useDefault) {
   417        nulls.defaultNull = defaultNull;
   418      } else {
   419        nulls.nullsOffset = nullsOffset;
   420      }
   421    }
   422  
   423   private:
   424    union {
   425      uint32_t nullsOffset;
   426      bool defaultNull;
   427    } nulls;
   428    bool useDefault;
   429  
   430    __host__ __device__
   431    thrust::tuple<Value, bool> dereference() const {
   432      if (useDefault) {
   433        uintptr_t val = reinterpret_cast<uintptr_t>(this->base_reference()) >> 32;
   434        return thrust::make_tuple(*reinterpret_cast<Value *>(&val),
   435                                  nulls.defaultNull);
   436      }
   437      return thrust::make_tuple(*this->base_reference(),
   438                                *(reinterpret_cast<bool *>(this->base_reference())
   439                                    + nulls.nullsOffset));
   440    }
   441  
   442    __host__ __device__
   443    void advance(typename super_t::difference_type n) {
   444      this->base_reference() += n;
   445      if (!useDefault) {
   446        nulls.nullsOffset += (1 - sizeof(Value)) * n;
   447      }
   448    }
   449  
   450    __host__ __device__
   451    void increment() {
   452      advance(1);
   453    }
   454  
   455    __host__ __device__
   456    void decrement() {
   457      advance(-1);
   458    }
   459  };
   460  
   461  template <typename Value>
   462  struct ConstantIterator {
   463    typedef typename std::conditional<
   464        std::is_same<Value, UUIDT>::value || std::is_same<Value, int64_t>::value,
   465        thrust::constant_iterator<thrust::tuple<Value, bool>>,
   466        SimpleIterator<Value>>::type type;
   467  };
   468  
   469  template <typename Value>
   470  inline typename ConstantIterator<Value>::type make_constant_iterator(
   471      Value defaultValue, bool defaultNull) {
   472    return SimpleIterator<Value>(nullptr, 0, defaultValue, defaultNull);
   473  }
   474  
   475  template <>
   476  inline typename ConstantIterator<int64_t>::type make_constant_iterator(
   477      int64_t defaultValue, bool defaultNull) {
   478    return thrust::make_constant_iterator(
   479        thrust::make_tuple<int64_t, bool>(defaultValue, defaultNull));
   480  }
   481  
   482  template <>
   483  inline typename ConstantIterator<UUIDT>::type make_constant_iterator(
   484      UUIDT defaultValue, bool defaultNull) {
   485    return thrust::make_constant_iterator(
   486        thrust::make_tuple<UUIDT, bool>(defaultValue, defaultNull));
   487  }
   488  
   489  using GeoSimpleIterator = thrust::constant_iterator<thrust::tuple<GeoPointT,
   490                                                                    bool>>;
   491  
   492  // DimensionOutputIterator is for writing individual dimension transform
   493  // results into a consecutive chunk of memory which later will be used
   494  // as the key for sort and reduce.
   495  template<typename Value>
   496  using DimensionOutputIterator = thrust::zip_iterator<
   497      thrust::tuple<thrust::detail::normal_iterator<Value *>,
   498                    thrust::detail::normal_iterator<bool *> > >;
   499  
   500  template<typename Value>
   501  DimensionOutputIterator<Value> make_dimension_output_iterator(
   502      uint8_t *dimValues, uint8_t *dimNulls) {
   503    return thrust::make_zip_iterator(
   504        thrust::make_tuple(reinterpret_cast<Value *>(dimValues),
   505                           reinterpret_cast<bool *>(dimNulls)));
   506  }
   507  
   508  template<typename Value>
   509  SimpleIterator<Value> make_scratch_space_input_iterator(
   510      uint8_t *valueIter, uint32_t nullOffset) {
   511    return SimpleIterator<Value>(reinterpret_cast<Value *>(valueIter),
   512                                 nullOffset, 0, 0);
   513  }
   514  
   515  template<typename Value>
   516  using ScratchSpaceOutputIterator = thrust::zip_iterator<
   517      thrust::tuple<
   518          thrust::detail::normal_iterator<Value *>,
   519          thrust::detail::normal_iterator<bool *> > >;
   520  
   521  // Helper function for creating ScratchSpaceIterator without specifying
   522  // Value template argument.
   523  template<typename Value>
   524  ScratchSpaceOutputIterator<Value> make_scratch_space_output_iterator(
   525      Value *v, uint32_t nullOffset) {
   526    return thrust::make_zip_iterator(
   527        thrust::make_tuple(v, reinterpret_cast<bool *>(v) + nullOffset));
   528  }
   529  
   530  template<typename Value>
   531  class MeasureProxy {
   532   public:
   533    __host__ __device__
   534    MeasureProxy(Value *outputIter, Value identity, uint32_t count, bool isAvg)
   535        : outputIter(outputIter),
   536          count(count),
   537          identity(identity), isAvg(isAvg) {
   538    }
   539  
   540    __host__ __device__
   541    MeasureProxy operator=(thrust::tuple<Value, bool> t) const {
   542      if (!thrust::get<1>(t)) {
   543        *outputIter = identity;
   544      } else if (isAvg) {
   545        return assignAvg(t);
   546      } else {
   547        *outputIter = thrust::get<0>(t) * count;
   548      }
   549      return *this;
   550    }
   551  
   552    __host__ __device__
   553    MeasureProxy assignAvg(thrust::tuple<Value, bool> t) const {
   554      // Each operand is 64bits where higher 32bits are used for count and
   555      // lower bits are for intermediate average.
   556      float_t v = thrust::get<0>(t);
   557      *reinterpret_cast<float_t *>(outputIter) = v;
   558      *(reinterpret_cast<uint32_t *>(outputIter) + 1) = count;
   559      return *this;
   560    }
   561  
   562   private:
   563    Value *outputIter;
   564    Value identity;
   565    uint32_t count;
   566    bool isAvg;
   567  };
   568  
   569  // MeasureOutputIterator is the output iterator for the final step of measure
   570  // transformation and as the input for aggregation. It needs to take care
   571  // of 2 things:
   572  //  1. Given a <value, validity> tuple, write to a single value vector while
   573  // write null value as the identity value for different aggregation function
   574  //  2. If base column is a compressed column, also need to recover the original
   575  // value instead of the compressed value. e.g. if we have a value 3 with count 3
   576  // and the aggregation function is sum. We need to write 3 * 3 to measure
   577  // output iterator.
   578  template<typename Value>
   579  class MeasureOutputIterator : public thrust::iterator_adaptor<
   580      MeasureOutputIterator<Value>, Value *, thrust::tuple<Value, bool>,
   581      thrust::use_default, thrust::use_default, MeasureProxy<Value>,
   582      thrust::use_default> {
   583   public:
   584    friend class thrust::iterator_core_access;
   585  
   586    // shorthand for the name of the iterator_adaptor we're deriving from.
   587    typedef thrust::iterator_adaptor<MeasureOutputIterator<Value>,
   588                                     Value *,
   589                                     thrust::tuple<Value, bool>,
   590                                     thrust::use_default,
   591                                     thrust::use_default,
   592                                     MeasureProxy<Value>,
   593                                     thrust::use_default> super_t;
   594  
   595    __host__ __device__
   596    MeasureOutputIterator(Value *base, const uint32_t *baseCounts,
   597                          const uint32_t *indexVector,
   598                          AggregateFunction aggFunc)
   599        : super_t(base),
   600          baseCounts(baseCounts),
   601          indexVector(indexVector) {
   602      // We don't need to do anything for this value if it's not sum.
   603      if (!((aggFunc >= AGGR_SUM_UNSIGNED && aggFunc <= AGGR_SUM_FLOAT)
   604          || (aggFunc == AGGR_AVG_FLOAT))) {
   605        skipCount = true;
   606      }
   607      isAvg = aggFunc == AGGR_AVG_FLOAT;
   608      identity = get_identity_value<Value>(aggFunc);
   609    }
   610  
   611   private:
   612    const uint32_t *baseCounts;
   613    const uint32_t *indexVector;
   614    Value identity;
   615    bool skipCount = false;
   616    bool isAvg = false;
   617  
   618    __host__ __device__
   619    typename super_t::reference dereference() const {
   620      uint32_t index = *indexVector;
   621      uint32_t count = !skipCount && baseCounts != nullptr ? baseCounts[index + 1]
   622          - baseCounts[index] : 1;
   623      return MeasureProxy<Value>(this->base_reference(), identity, count, isAvg);
   624    }
   625  
   626    __host__ __device__
   627    void advance(typename super_t::difference_type n) {
   628      this->base_reference() += n;
   629      indexVector += n;
   630    }
   631  
   632    __host__ __device__
   633    void increment() {
   634      advance(1);
   635    }
   636  
   637    __host__ __device__
   638    void decrement() {
   639      advance(-1);
   640    }
   641  };
   642  
   643  // Helper function for creating MeasureOutputIterator without specifying
   644  // Value template argument.
   645  template<typename Value>
   646  MeasureOutputIterator<Value> make_measure_output_iterator(
   647      Value *v, uint32_t *indexVector, uint32_t *baseCounts,
   648      AggregateFunction aggFunc) {
   649    return MeasureOutputIterator<Value>(v, baseCounts,
   650                                        indexVector, aggFunc);
   651  }
   652  
   653  class IndexProxy {
   654   public:
   655    __host__ __device__
   656    explicit IndexProxy(uint32_t *outputIndex)
   657        : outputIndex(outputIndex) {
   658    }
   659  
   660    // Parameter is a tuple of <index row number, index value>.
   661    __host__ __device__
   662    IndexProxy operator=(thrust::tuple<uint32_t, uint32_t> t) const {
   663      *outputIndex = thrust::get<1>(t);
   664      return *this;
   665    }
   666  
   667   private:
   668    uint32_t *outputIndex;
   669  };
   670  
   671  // IndexOutputIterator is the output iterator for writing filtered index
   672  // to. The input will be a tuple of <index row number, index value>. We
   673  // will discard the index row number and only write the filtered index value
   674  // into the output index value vector which is a uint32_t pointer.
   675  class IndexOutputIterator :
   676      public thrust::iterator_adaptor<IndexOutputIterator,
   677                                      uint32_t *,
   678                                      thrust::tuple<
   679                                          uint32_t,
   680                                          uint32_t>,
   681                                      thrust::use_default,
   682                                      thrust::use_default,
   683                                      IndexProxy,
   684                                      thrust::use_default> {
   685   public:
   686    friend class thrust::iterator_core_access;
   687  
   688    // shorthand for the name of the iterator_adaptor we're deriving from.
   689    typedef thrust::iterator_adaptor<IndexOutputIterator,
   690                                     uint32_t *,
   691                                     thrust::tuple<uint32_t, uint32_t>,
   692                                     thrust::use_default,
   693                                     thrust::use_default,
   694                                     IndexProxy,
   695                                     thrust::use_default> super_t;
   696  
   697    __host__ __device__
   698    explicit IndexOutputIterator(uint32_t *base)
   699        : super_t(base) {
   700    }
   701  
   702   private:
   703    __host__ __device__
   704    typename super_t::reference dereference() const {
   705      return IndexProxy(this->base_reference());
   706    }
   707  };
   708  
   709  // The column mode for foreign tables are either 0 or 2, which is
   710  // presented by two different type of iterators: SimpleIterator and
   711  // VectorPartyIterator. And the mode of batch columns will be interleaving.
   712  // To make all iterators have the same size, we put both cases into a
   713  // single union and has a boolean flag to tell which iterator it is.
   714  template <typename Value>
   715  struct ForeignTableIterator {
   716    __host__ __device__ ForeignTableIterator() {}
   717  
   718    __host__ __device__
   719    ForeignTableIterator(const ForeignTableIterator<Value> &other) {
   720      this->isConst = other.isConst;
   721      if (this->isConst) {
   722        this->iter.constantIter = other.iter.constantIter;
   723      } else {
   724        this->iter.columnIter = other.iter.columnIter;
   725      }
   726    }
   727  
   728    __host__ __device__ ForeignTableIterator &operator=(
   729        const ForeignTableIterator<Value> &other) {
   730      this->isConst = other.isConst;
   731      if (this->isConst) {
   732        this->iter.constantIter = other.iter.constantIter;
   733      } else {
   734        this->iter.columnIter = other.iter.columnIter;
   735      }
   736      return *this;
   737    }
   738  
   739    explicit ForeignTableIterator(VectorPartyIterator<Value> columnIter)
   740        : isConst(false) {
   741      iter.columnIter = columnIter;
   742    }
   743  
   744    explicit ForeignTableIterator(
   745        typename ConstantIterator<Value>::type constantIter)
   746        : isConst(true) {
   747      iter.constantIter = constantIter;
   748    }
   749  
   750    union ForeignBatchIter {
   751      VectorPartyIterator<Value> columnIter;
   752      typename ConstantIterator<Value>::type constantIter;
   753      __host__ __device__ ForeignBatchIter() {}
   754    } iter;
   755    bool isConst;
   756  };
   757  
   758  // RecordIDJoinIterator reads the foreign table column.
   759  template<typename Value>
   760  class RecordIDJoinIterator
   761      : public thrust::iterator_adaptor<
   762          RecordIDJoinIterator<Value>, RecordID *,
   763          thrust::tuple<Value, bool>, thrust::use_default, thrust::use_default,
   764          thrust::tuple<Value, bool>,
   765          thrust::use_default> {
   766   public:
   767    friend class thrust::iterator_core_access;
   768  
   769    typedef ForeignTableIterator<Value> ValueIter;
   770  
   771    typedef thrust::iterator_adaptor<
   772        RecordIDJoinIterator<Value>, RecordID *,
   773        thrust::tuple<Value, bool>, thrust::use_default, thrust::use_default,
   774        thrust::tuple<Value, bool>, thrust::use_default>
   775        super_t;
   776  
   777    __host__ __device__ RecordIDJoinIterator(
   778        RecordID *base,
   779        int32_t numBatches,
   780        int32_t baseBatchID,
   781        ValueIter *batches,
   782        int32_t numRecordsInLastBatch,
   783        int16_t *timezoneLookupTable,
   784        int16_t timezoneLookupSize)
   785        : super_t(base),
   786          batches(batches),
   787          baseBatchID(baseBatchID),
   788          numBatches(numBatches),
   789          numRecordsInLastBatch(numRecordsInLastBatch),
   790          timezoneLookupTable(timezoneLookupTable),
   791          timezoneLookupSize(timezoneLookupSize) {
   792    }
   793  
   794   private:
   795    ValueIter *batches;
   796    int32_t baseBatchID;
   797    int32_t numBatches;
   798    int32_t numRecordsInLastBatch;
   799    int16_t *timezoneLookupTable;
   800    int16_t timezoneLookupSize;
   801  
   802    __host__ __device__
   803    thrust::tuple<GeoPointT, bool> timezoneLookup(
   804        thrust::tuple<GeoPointT, bool> val) const {
   805      return val;
   806    }
   807  
   808    __host__ __device__ thrust::tuple<UUIDT, bool> timezoneLookup(
   809        thrust::tuple<UUIDT, bool> val) const {
   810      return val;
   811    }
   812  
   813    template<typename V>
   814    __host__ __device__
   815    thrust::tuple<V, bool> timezoneLookup(
   816        thrust::tuple<V, bool> val) const {
   817      if (timezoneLookupTable) {
   818        // if timezoneLookup is not null, we must be iterating a enum column
   819        int enumVal = static_cast<int>(thrust::get<0>(val));
   820        if (enumVal < timezoneLookupSize) {
   821          return thrust::make_tuple(
   822              timezoneLookupTable[enumVal], thrust::get<1>(val));
   823        } else {
   824          return thrust::make_tuple(0, thrust::get<1>(val));
   825        }
   826      }
   827      return val;
   828    }
   829  
   830    __host__ __device__ typename super_t::reference dereference() const {
   831      RecordID recordID = *this->base_reference();
   832      if (recordID.batchID && (recordID.batchID - baseBatchID < numBatches - 1 ||
   833          recordID.index < numRecordsInLastBatch)) {
   834        ForeignTableIterator<Value>
   835            iter = batches[recordID.batchID - baseBatchID];
   836        if (iter.isConst) {
   837          return iter.iter.constantIter[recordID.index];
   838        }
   839        thrust::tuple<Value, bool> val = iter.iter.columnIter[recordID.index];
   840        return timezoneLookup(val);
   841      }
   842      Value defaultValue;
   843      return thrust::make_tuple(defaultValue, false);
   844    }
   845  };
   846  
   847  // DimensionHashIterator reads DimensionColumnVector and produce hashed values
   848  class DimensionHashIterator
   849      : public thrust::iterator_adaptor<
   850          DimensionHashIterator, uint32_t *, uint64_t, thrust::use_default,
   851          thrust::use_default, uint64_t, thrust::use_default> {
   852   public:
   853    friend class thrust::iterator_core_access;
   854    typedef thrust::iterator_adaptor<DimensionHashIterator, uint32_t *, uint64_t,
   855                                     thrust::use_default, thrust::use_default,
   856                                     uint64_t, thrust::use_default>
   857        super_t;
   858  
   859    __host__ __device__
   860    DimensionHashIterator(uint8_t *dimValues, uint32_t *indexVector,
   861                          uint8_t _numDimsPerDimWidth[NUM_DIM_WIDTH], int length)
   862        : super_t(indexVector), dimValues(dimValues), length(length) {
   863      totalNumDims = 0;
   864      rowBytes = 0;
   865      for (int i = 0; i < NUM_DIM_WIDTH; i++) {
   866        numDimsPerDimWidth[i] = _numDimsPerDimWidth[i];
   867        totalNumDims += _numDimsPerDimWidth[i];
   868        uint8_t dimBytes = 1 << (NUM_DIM_WIDTH - 1 - i);
   869        rowBytes += dimBytes * numDimsPerDimWidth[i];
   870      }
   871      nullValues = dimValues + rowBytes * length;
   872      // include null values, 1 byte per each dim
   873      rowBytes += totalNumDims;
   874    }
   875  
   876   private:
   877    uint8_t *dimValues;
   878    uint8_t *nullValues;
   879    uint8_t numDimsPerDimWidth[NUM_DIM_WIDTH];
   880    uint8_t totalNumDims;
   881    uint8_t rowBytes;
   882    int length;
   883  
   884    __host__ __device__ typename super_t::reference dereference() const {
   885      uint32_t index = *this->base_reference();
   886      uint8_t dimRow[MAX_DIMENSION_BYTES] = {0};
   887      uint64_t hashedOutput[2];
   888      // read from
   889      uint8_t *inputValueStart = dimValues;
   890      uint8_t *inputNullStart = nullValues;
   891      // write to
   892      uint8_t *outputValuePtr = dimRow;
   893      uint8_t *outputNullPtr = outputValuePtr + (rowBytes - totalNumDims);
   894      uint8_t numDims = 0;
   895      for (int i = 0; i < NUM_DIM_WIDTH; i++) {
   896        uint8_t dimBytes = 1 << (NUM_DIM_WIDTH - 1 - i);
   897        for (int j = numDims; j < numDims + numDimsPerDimWidth[i]; j++) {
   898          switch (dimBytes) {
   899            case 16:
   900              *reinterpret_cast<UUIDT *>(outputValuePtr) =
   901                  reinterpret_cast<UUIDT *>(inputValueStart)[index];
   902              break;
   903            case 8:
   904              *reinterpret_cast<uint64_t *>(outputValuePtr) =
   905                  reinterpret_cast<uint64_t *>(inputValueStart)[index];
   906              break;
   907            case 4:
   908              *reinterpret_cast<uint32_t *>(outputValuePtr) =
   909                  reinterpret_cast<uint32_t *>(inputValueStart)[index];
   910              break;
   911            case 2:
   912              *reinterpret_cast<uint16_t *>(outputValuePtr) =
   913                  reinterpret_cast<uint16_t *>(inputValueStart)[index];
   914              break;
   915            case 1:
   916              *outputValuePtr = inputValueStart[index];
   917              break;
   918          }
   919          outputValuePtr += dimBytes;
   920          inputValueStart += dimBytes * length;
   921          *outputNullPtr = inputNullStart[index];
   922          outputNullPtr++;
   923          inputNullStart += length;
   924        }
   925        numDims += numDimsPerDimWidth[i];
   926      }
   927      murmur3sum128(dimRow, rowBytes, 0, hashedOutput);
   928      // only use the first 64bit of the 128bit hash
   929      return hashedOutput[0];
   930    }
   931  };
   932  
   933  class DimValueProxy {
   934   public:
   935    __host__ __device__ DimValueProxy(uint8_t *ptr, int dimBytes)
   936        : ptr(ptr), dimBytes(dimBytes) {}
   937  
   938    __host__ __device__ DimValueProxy operator=(DimValueProxy t) {
   939      switch (dimBytes) {
   940        case 16:
   941          *reinterpret_cast<UUIDT *>(ptr) = *reinterpret_cast<UUIDT *>(t.ptr);
   942        case 8:
   943          *reinterpret_cast<uint64_t *>(ptr) =
   944              *reinterpret_cast<uint64_t *>(t.ptr);
   945        case 4:
   946          *reinterpret_cast<uint32_t *>(ptr) =
   947              *reinterpret_cast<uint32_t *>(t.ptr);
   948        case 2:
   949          *reinterpret_cast<uint16_t *>(ptr) =
   950              *reinterpret_cast<uint16_t *>(t.ptr);
   951        case 1:*ptr = *t.ptr;
   952      }
   953      return *this;
   954    }
   955  
   956   private:
   957    uint8_t *ptr;
   958    int dimBytes;
   959  };
   960  
   961  class DimensionColumnPermutateIterator
   962      : public thrust::iterator_adaptor<DimensionColumnPermutateIterator,
   963                                        uint32_t *, DimValueProxy,
   964                                        thrust::use_default, thrust::use_default,
   965                                        DimValueProxy, thrust::use_default> {
   966   public:
   967    friend class thrust::iterator_core_access;
   968    typedef thrust::iterator_adaptor<DimensionColumnPermutateIterator, uint32_t *,
   969                                     DimValueProxy, thrust::use_default,
   970                                     thrust::use_default, DimValueProxy,
   971                                     thrust::use_default>
   972        super_t;
   973  
   974    __host__ __device__ DimensionColumnPermutateIterator(
   975        uint8_t *values, uint32_t *indexVector, int dimInputLength,
   976        int dimOutputLength, uint8_t _numDimsPerDimWidth[NUM_DIM_WIDTH])
   977        : super_t(indexVector),
   978          begin(indexVector),
   979          values(values),
   980          dimInputLength(dimInputLength),
   981          dimOutputLength(dimOutputLength) {
   982      for (int i = 0; i < NUM_DIM_WIDTH; i++) {
   983        numDimsPerDimWidth[i] = _numDimsPerDimWidth[i];
   984      }
   985    }
   986  
   987   private:
   988    uint8_t *values;
   989    uint32_t *begin;
   990    int dimInputLength;
   991    int dimOutputLength;
   992    uint8_t numDimsPerDimWidth[NUM_DIM_WIDTH];
   993  
   994    __host__ __device__ typename super_t::reference dereference() const {
   995      int baseIndex = this->base_reference() - begin;
   996      int dimIndex = baseIndex / dimOutputLength;
   997      // index in current dimension vector
   998      int localIndex = *(begin + (baseIndex % dimOutputLength));
   999      int bytes = 0;
  1000      uint8_t numDims = 0;
  1001      uint8_t dimBytes = 0;
  1002      int i = 0;
  1003      for (; i < NUM_DIM_WIDTH; i++) {
  1004        dimBytes = 1 << (NUM_DIM_WIDTH - i - 1);
  1005        if (dimIndex < numDims + numDimsPerDimWidth[i]) {
  1006          bytes +=
  1007              ((dimIndex - numDims) * dimInputLength + localIndex) * dimBytes;
  1008          break;
  1009        } else {
  1010          bytes += numDimsPerDimWidth[i] * dimInputLength * dimBytes;
  1011        }
  1012        numDims += numDimsPerDimWidth[i];
  1013      }
  1014      // null vector
  1015      if (i == NUM_DIM_WIDTH) {
  1016        bytes += ((dimIndex - numDims) * dimInputLength + localIndex) * dimBytes;
  1017        return DimValueProxy(values + bytes, dimBytes);
  1018      }
  1019      return DimValueProxy(values + bytes, dimBytes);
  1020    }
  1021  };
  1022  
  1023  // DimensionColumnOutputIterator output dimension of dimOutputLength rows into
  1024  // a bigger dimension space which can hold capacity rows, the smallest unit of
  1025  // the output is one single dimension column
  1026  class DimensionColumnOutputIterator
  1027      : public thrust::iterator_adaptor<
  1028          DimensionColumnOutputIterator, thrust::counting_iterator<int>,
  1029          DimValueProxy, thrust::use_default, thrust::use_default,
  1030          DimValueProxy, thrust::use_default> {
  1031   public:
  1032    friend class thrust::iterator_core_access;
  1033    typedef thrust::iterator_adaptor<
  1034        DimensionColumnOutputIterator, thrust::counting_iterator<int>,
  1035        DimValueProxy, thrust::use_default, thrust::use_default, DimValueProxy,
  1036        thrust::use_default>
  1037        super_t;
  1038  
  1039    __host__ __device__ DimensionColumnOutputIterator(
  1040        uint8_t *values, int capacity, int dimOutputLength,
  1041        uint8_t _numDimsPerDimWidth[NUM_DIM_WIDTH],
  1042        int offset)
  1043        : super_t(thrust::make_counting_iterator<int>(0)),
  1044          values(values),
  1045          dimOutputLength(dimOutputLength),
  1046          capacity(capacity),
  1047          offset(offset) {
  1048      for (int i = 0; i < NUM_DIM_WIDTH; i++) {
  1049        numDimsPerDimWidth[i] = _numDimsPerDimWidth[i];
  1050      }
  1051    }
  1052  
  1053   private:
  1054    uint8_t *values;
  1055    int capacity;
  1056    int dimOutputLength;
  1057    uint8_t numDimsPerDimWidth[NUM_DIM_WIDTH];
  1058    // the start row to write output data, this is used to append operation
  1059    int offset;
  1060  
  1061    __host__ __device__ typename super_t::reference dereference() const {
  1062      int baseIndex = *this->base_reference();
  1063      int dimIndex = baseIndex / dimOutputLength;
  1064      int globalIndex = baseIndex + (capacity - dimOutputLength) * dimIndex
  1065                          + offset;
  1066      uint8_t numDims = 0;
  1067      int bytes = 0;
  1068      uint8_t dimBytes = 0;
  1069      int i = 0;
  1070      for (; i < NUM_DIM_WIDTH; i++) {
  1071        dimBytes = 1 << (NUM_DIM_WIDTH - i - 1);
  1072        if (dimIndex < numDims + numDimsPerDimWidth[i]) {
  1073          bytes += (globalIndex - numDims * capacity) * dimBytes;
  1074          break;
  1075        } else {
  1076          bytes += numDimsPerDimWidth[i] * capacity * dimBytes;
  1077        }
  1078        numDims += numDimsPerDimWidth[i];
  1079      }
  1080      if (i == NUM_DIM_WIDTH) {
  1081        bytes += (globalIndex - numDims * capacity) * dimBytes;
  1082        return DimValueProxy(values + bytes, dimBytes);
  1083      }
  1084      return DimValueProxy(values + bytes, dimBytes);
  1085    }
  1086  };
  1087  
  1088  // HLLRegIDHeadFlagIterator
  1089  class HLLRegIDHeadFlagIterator
  1090      : public thrust::iterator_adaptor<
  1091          HLLRegIDHeadFlagIterator, thrust::counting_iterator<int>,
  1092          unsigned int, thrust::use_default, thrust::use_default, unsigned int,
  1093          thrust::use_default> {
  1094   public:
  1095    friend class thrust::iterator_core_access;
  1096    typedef thrust::iterator_adaptor<HLLRegIDHeadFlagIterator,
  1097                                     thrust::counting_iterator<int>,
  1098                                     unsigned int,
  1099                                     thrust::use_default, thrust::use_default,
  1100                                     unsigned int, thrust::use_default>
  1101        super_t;
  1102  
  1103    __host__ __device__ HLLRegIDHeadFlagIterator(uint64_t *hashValues)
  1104        : super_t(thrust::make_counting_iterator(0)), hashValues(hashValues) {}
  1105  
  1106   private:
  1107    uint64_t *hashValues;
  1108  
  1109    __host__ __device__ typename super_t::reference dereference() const {
  1110      int index = *this->base_reference();
  1111      return (index == 0 || hashValues[index] != hashValues[index - 1]);
  1112    }
  1113  };
  1114  
  1115  // HLLValueOutputIterator
  1116  class HLLValueOutputIterator
  1117      : public thrust::iterator_adaptor<
  1118          HLLValueOutputIterator, thrust::counting_iterator<int>,
  1119          thrust::tuple<uint64_t, uint32_t, int>, thrust::use_default,
  1120          thrust::use_default, thrust::tuple<uint64_t, uint32_t, int>,
  1121          thrust::use_default> {
  1122   public:
  1123    friend class thrust::iterator_core_access;
  1124    typedef thrust::iterator_adaptor<
  1125        HLLValueOutputIterator, thrust::counting_iterator<int>,
  1126        thrust::tuple<uint64_t, uint32_t, int>, thrust::use_default,
  1127        thrust::use_default, thrust::tuple<uint64_t, uint32_t, int>,
  1128        thrust::use_default>
  1129        super_t;
  1130  
  1131    __host__ __device__ HLLValueOutputIterator(unsigned int *dimCount,
  1132                                               uint32_t *hllMeasureValues,
  1133                                               uint64_t *hllRegIDCumCount,
  1134                                               uint64_t *hllDimRegIDCumCount,
  1135                                               uint64_t *hllVectorOffsets)
  1136        : super_t(thrust::make_counting_iterator(0)),
  1137          dimCount(dimCount),
  1138          hllMeasureValues(hllMeasureValues),
  1139          hllRegIDCumCount(hllRegIDCumCount),
  1140          hllDimRegIDCumCount(hllDimRegIDCumCount),
  1141          hllVectorOffsets(hllVectorOffsets) {}
  1142  
  1143   private:
  1144    unsigned int *dimCount;
  1145    uint32_t *hllMeasureValues;
  1146    uint64_t *hllRegIDCumCount;
  1147    uint64_t *hllDimRegIDCumCount;
  1148    uint64_t *hllVectorOffsets;
  1149  
  1150    // get the reg id within dim group
  1151    // index:               0 1 2 3 4 5 6 7 8 9 10 11
  1152    // regHeadIter:         1 0 0 0 1 0 0 0 1 0 0 0
  1153    // hllRegIDCumCount:    1 1 1 1 2 2 2 2 3 3 3 3
  1154    // dimCount:            1 1 1 1 1 1 1 1 2 2 2 2
  1155    // hllDimRegIDCumCount: 0 2 3
  1156    // hllVectorOffsets:    0 8 12
  1157    __host__ __device__ typename super_t::reference dereference() const {
  1158      int index = *this->base_reference();
  1159      unsigned int dimIndex = dimCount[index] - 1;
  1160      int dimRegIDCount =
  1161          hllDimRegIDCumCount[dimIndex + 1] - hllDimRegIDCumCount[dimIndex];
  1162      uint64_t hllVectorOffset = hllVectorOffsets[dimIndex];
  1163      if (dimRegIDCount < HLL_DENSE_THRESHOLD) {
  1164        // sparse mode
  1165        uint64_t regIDCumCount = hllRegIDCumCount[index];
  1166        uint64_t dimRegIDcumCount = hllDimRegIDCumCount[dimIndex];
  1167        int regIDIndexWithinDim = regIDCumCount - dimRegIDcumCount - 1;
  1168        return thrust::make_tuple(hllVectorOffset + regIDIndexWithinDim * 4,
  1169                                  hllMeasureValues[index], 4);
  1170      } else {
  1171        // dense mode
  1172        int regID = hllMeasureValues[index] & 0x3FFF;
  1173        return thrust::make_tuple(hllVectorOffset + regID,
  1174                                  hllMeasureValues[index], 1);
  1175      }
  1176    }
  1177  };
  1178  
  1179  // GeoPredicateIterator reads geo intersection output predicate vector.
  1180  // It output the first intersected geoshape index, or -1 if no intersected
  1181  // geoshape found.
  1182  class GeoPredicateIterator
  1183      : public thrust::iterator_adaptor<GeoPredicateIterator, uint32_t *, int8_t,
  1184                                        thrust::use_default, thrust::use_default,
  1185                                        int8_t, thrust::use_default> {
  1186   public:
  1187    friend class thrust::iterator_core_access;
  1188  
  1189    typedef thrust::iterator_adaptor<GeoPredicateIterator, uint32_t *, int8_t,
  1190                                     thrust::use_default, thrust::use_default,
  1191                                     int8_t, thrust::use_default>
  1192        super_t;
  1193  
  1194    __host__ __device__ GeoPredicateIterator(uint32_t *predicateIter,
  1195                                             uint8_t stepInWords)
  1196        : super_t(predicateIter), stepInWords(stepInWords) {}
  1197  
  1198   private:
  1199    uint8_t stepInWords;
  1200  
  1201    __host__ __device__ int8_t get_first_none_zero(uint32_t word) const {
  1202      int i = 0;
  1203      while (i < 32) {
  1204        if ((word >> i) & 1) {
  1205          return i;
  1206        }
  1207        i++;
  1208      }
  1209      return -1;
  1210    }
  1211  
  1212    __host__ __device__ typename super_t::reference dereference() const {
  1213      for (int i = 0; i < stepInWords; i++) {
  1214        int8_t index = get_first_none_zero(this->base_reference()[i]);
  1215        if (index >= 0) {
  1216          return (int8_t)(i * 32 + index);
  1217        }
  1218      }
  1219      return -1;
  1220    }
  1221  
  1222    __host__ __device__ void advance(typename super_t::difference_type n) {
  1223      this->base_reference() += n * stepInWords;
  1224    }
  1225  
  1226    __host__ __device__ void increment() { advance(1); }
  1227  
  1228    __host__ __device__ void decrement() { advance(-1); }
  1229  
  1230    __host__ __device__ typename super_t::difference_type distance_to(
  1231        const GeoPredicateIterator &other) const {
  1232      typename super_t::difference_type dist =
  1233          other.base_reference() - this->base_reference();
  1234      return dist / stepInWords;
  1235    }
  1236  };
  1237  
  1238  // Used as void value and reference type.
  1239  struct EmptyStruct {};
  1240  
  1241  // GeoBatchIntersectIterator is the iterator to compute whether the
  1242  // semi-infinite ray horizontally emitted from the geo point crosses a single
  1243  // edge of the geoshape. Note value and dereference type of this iterator are
  1244  // all default which means accessing and assigning value to this iterator has
  1245  // no meaning. The dereference function will directly write to the output
  1246  // predicate vector using atomicXor.
  1247  template <typename GeoInputIterator>
  1248  class GeoBatchIntersectIterator
  1249      : public thrust::iterator_adaptor<
  1250          GeoBatchIntersectIterator<GeoInputIterator>,
  1251          GeoInputIterator, EmptyStruct,
  1252          thrust::use_default, thrust::use_default,
  1253          EmptyStruct, thrust::use_default> {
  1254   public:
  1255    friend class thrust::iterator_core_access;
  1256  
  1257    typedef thrust::iterator_adaptor<GeoBatchIntersectIterator<GeoInputIterator>,
  1258                                     GeoInputIterator, EmptyStruct,
  1259                                     thrust::use_default, thrust::use_default,
  1260                                     EmptyStruct, thrust::use_default>
  1261        super_t;
  1262  
  1263    __host__ __device__ GeoBatchIntersectIterator() {}
  1264  
  1265    __host__ __device__ GeoBatchIntersectIterator(
  1266        GeoInputIterator geoPoints, GeoShapeBatch geoShapes,
  1267        uint32_t *outputPredicate, bool inOrOut)
  1268        : super_t(geoPoints),
  1269          geoShapes(geoShapes),
  1270          outputPredicate(outputPredicate),
  1271          pointIndex(0),
  1272          inOrOut(inOrOut) {}
  1273  
  1274   private:
  1275    GeoShapeBatch geoShapes;
  1276    uint32_t *outputPredicate;
  1277    int32_t pointIndex;
  1278    bool inOrOut;
  1279  
  1280    __host_or_device__
  1281    typename super_t::reference dereference() const {
  1282      // offset to shapeVector in Bytes is totalNumberPoints * 2 * 4
  1283      uint8_t shapeIndex =
  1284          geoShapes.LatLongs[geoShapes.TotalNumPoints * 2 * 4 + pointIndex];
  1285      EmptyStruct emptyRes;
  1286      // nothing need to be done for the last point of a shape.
  1287      if (pointIndex >= geoShapes.TotalNumPoints - 1) {
  1288        return emptyRes;
  1289      }
  1290  
  1291      // shapeIndex change marks the last point of a shape, therefore nothing
  1292      // needs to be done
  1293      if (shapeIndex !=
  1294          geoShapes.LatLongs[geoShapes.TotalNumPoints * 2 * 4 + pointIndex + 1]) {
  1295        return emptyRes;
  1296      }
  1297  
  1298      auto testPoint = *this->base_reference();
  1299      // testPoint is null, we just write false to the output predicate.
  1300      if (!thrust::get<1>(testPoint)) {
  1301        // only first pointer is responsible for write.
  1302        if (pointIndex == 0) {
  1303          for (int i = 0; i < geoShapes.TotalWords; i++) {
  1304            outputPredicate[i] = !inOrOut;
  1305          }
  1306        }
  1307        return emptyRes;
  1308      }
  1309  
  1310      float testLat = thrust::get<0>(testPoint).Lat;
  1311      float testLong = thrust::get<0>(testPoint).Long;
  1312      // the latitude of first point of the edge.
  1313      float edgeLat1 = reinterpret_cast<float *>(geoShapes.LatLongs)[pointIndex];
  1314      // the latitude of second point of the edge.
  1315      float edgeLat2 =
  1316          reinterpret_cast<float *>(geoShapes.LatLongs)[pointIndex + 1];
  1317      if (edgeLat1 < FLT_MAX && edgeLat2 < FLT_MAX) {
  1318        float edgeLong1 = reinterpret_cast<float *>(
  1319            geoShapes.LatLongs)[geoShapes.TotalNumPoints + pointIndex];
  1320        float edgeLong2 = reinterpret_cast<float *>(
  1321            geoShapes.LatLongs)[geoShapes.TotalNumPoints + pointIndex + 1];
  1322        if (((edgeLong1 > testLong) != (edgeLong2 > testLong)) &&
  1323            (testLat < (edgeLat2 - edgeLat1) * (testLong - edgeLong1) /
  1324                               (edgeLong2 - edgeLong1) +
  1325                           edgeLat1)) {
  1326  #ifdef RUN_ON_DEVICE
  1327          atomicXor(outputPredicate + (shapeIndex / 32),
  1328                    (1 << (shapeIndex % 32)));
  1329  #else
  1330          // When we are running in host mode, we are running sequentially.
  1331          // So non atomic access is ok for now.
  1332          // If we switch to parallel host execution in future,
  1333          // we need to find out how to do atomic operation on an existing
  1334          // pointer in c++.
  1335          outputPredicate[shapeIndex / 32] ^= (1 << (shapeIndex % 32));
  1336  #endif
  1337        }
  1338      }
  1339      return emptyRes;
  1340    }
  1341  
  1342    __host__ __device__ void advance(typename super_t::difference_type n) {
  1343      int64_t newPointIndex = (int64_t)pointIndex + n;
  1344      if (newPointIndex >= geoShapes.TotalNumPoints) {
  1345        int steps = newPointIndex / geoShapes.TotalNumPoints;
  1346        this->base_reference() += steps;
  1347        pointIndex = newPointIndex %= geoShapes.TotalNumPoints;
  1348        outputPredicate += steps * geoShapes.TotalWords;
  1349      } else if (newPointIndex < 0) {
  1350        int steps = (newPointIndex - geoShapes.TotalNumPoints + 1) /
  1351                    geoShapes.TotalNumPoints;
  1352        this->base_reference() += steps;
  1353        pointIndex = newPointIndex -= steps * geoShapes.TotalNumPoints;
  1354        outputPredicate += steps * geoShapes.TotalWords;
  1355      } else {
  1356        pointIndex = (int32_t)newPointIndex;
  1357      }
  1358    }
  1359  
  1360    __host__ __device__ void increment() { advance(1); }
  1361  
  1362    __host__ __device__ void decrement() { advance(-1); }
  1363  
  1364    __host__ __device__ typename super_t::difference_type distance_to(
  1365        const GeoBatchIntersectIterator &other) const {
  1366      typename super_t::difference_type dist =
  1367          other.base_reference() - this->base_reference();
  1368      return dist * geoShapes.TotalNumPoints +
  1369             (other.pointIndex - this->pointIndex);
  1370    }
  1371  };
  1372  
  1373  template<typename GeoInputIterator>
  1374  GeoBatchIntersectIterator<GeoInputIterator> make_geo_batch_intersect_iterator(
  1375      GeoInputIterator points, GeoShapeBatch geoShape,
  1376      uint32_t *outputPredicate, bool inOrOut) {
  1377    return GeoBatchIntersectIterator<GeoInputIterator>(points,
  1378                                                       geoShape,
  1379                                                       outputPredicate,
  1380                                                       inOrOut);
  1381  }
  1382  
  1383  // Iterator to retrieve counts on index, which is usually for
  1384  // mode 3 archive vectorparty
  1385  class IndexCountIterator :
  1386    public thrust::iterator_adaptor<IndexCountIterator,
  1387                                      uint32_t *, uint32_t,
  1388                                      thrust::use_default, thrust::use_default,
  1389                                      uint32_t,
  1390                                      thrust::use_default> {
  1391   public:
  1392      friend class thrust::iterator_core_access;
  1393  
  1394      typedef thrust::iterator_adaptor<IndexCountIterator,
  1395                                     uint32_t *, uint32_t,
  1396                                     thrust::use_default, thrust::use_default,
  1397                                     uint32_t,
  1398                                     thrust::use_default> super_t;
  1399  
  1400      __host__ __device__ IndexCountIterator() {}
  1401  
  1402      IndexCountIterator(uint32_t *baseCount, uint32_t *indexVector) :
  1403          super_t(indexVector), baseCount(baseCount) {}
  1404  
  1405   private:
  1406      uint32_t *baseCount;
  1407  
  1408      __host__ __device__ uint32_t dereference() const {
  1409        return baseCount[*this->base_reference()+1]
  1410              - baseCount[*this->base_reference()];
  1411    }
  1412  };
  1413  
  1414  }  // namespace ares
  1415  
  1416  namespace thrust {
  1417  namespace detail {
  1418  // For execution policy with cuda, it requires the output iterator's reference
  1419  // type to be a actual reference. However when using MeasureProxy and
  1420  // IndexProxy, they are value types. So we need to specialize the
  1421  // is_non_const_reference to traits to treat all MeasureProxy template classes
  1422  // as reference type.
  1423  template<>
  1424  struct is_non_const_reference<
  1425      ares::MeasureProxy<int32_t> > : public true_type {
  1426  };
  1427  
  1428  template<>
  1429  struct is_non_const_reference<
  1430      ares::MeasureProxy<uint32_t> > : public true_type {
  1431  };
  1432  
  1433  template<>
  1434  struct is_non_const_reference<
  1435      ares::MeasureProxy<float_t> > : public true_type {
  1436  };
  1437  
  1438  template<>
  1439  struct is_non_const_reference<
  1440      ares::MeasureProxy<int64_t> > : public true_type {
  1441  };
  1442  
  1443  template<>
  1444  struct is_non_const_reference<
  1445      ares::MeasureProxy<double_t> > : public true_type {
  1446  };
  1447  
  1448  template<>
  1449  struct is_non_const_reference<ares::IndexProxy> : public true_type {
  1450  };
  1451  
  1452  template<>
  1453  struct is_non_const_reference<ares::DimValueProxy> : public true_type {
  1454  };
  1455  }  // namespace detail
  1456  }  // namespace thrust
  1457  #endif  // QUERY_ITERATOR_HPP_