github.com/cybriq/giocore@v0.0.7-0.20210703034601-cfb9cb5f3900/gpu/shaders/binning.comp

github.com/cybriq/giocore@v0.0.7-0.20210703034601-cfb9cb5f3900/gpu/shaders/binning.comp (about)

     1  // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
     2  
     3  // The binning stage of the pipeline.
     4  //
     5  // Each workgroup processes N_TILE paths.
     6  // Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask
     7  // based on the path bounding box to bin the paths.
     8  
     9  #version 450
    10  #extension GL_GOOGLE_include_directive : enable
    11  
    12  #include "mem.h"
    13  #include "setup.h"
    14  
    15  layout(local_size_x = N_TILE, local_size_y = 1) in; // N_TILE invocations per workgroup, one per element of the partition
    16  // Read-only configuration; main() reads n_elements, anno_alloc, bin_alloc, width_in_tiles and height_in_tiles from it.
    17  layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    18      Config conf;
    19  };
    20  
    21  #include "annotated.h"
    22  #include "bins.h"
    23  
    24  // Scale factors for converting bbox coordinates to bin coordinates; a bin spans N_TILE_X x N_TILE_Y tiles.
    25  #define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
    26  #define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))
    27  
    28  // GLSL has no built-in infinity constant. Also consider uintBitsToFloat(0x7f800000), the IEEE 754 bit pattern of +infinity.
    29  #define INFINITY (1.0 / 0.0)
    30  
    31  // Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
    32  // Each bitmap is sliced: 256 bits are split into 8 (N_SLICE) 32-bit submaps.
    33  shared uint bitmaps[N_SLICE][N_TILE]; // bitmaps[slice][bin]: bit (element & 31) is set when that element covers the bin
    34  shared uint count[N_SLICE][N_TILE]; // count[slice][bin]: covering elements in slices 0..slice (inclusive running sum)
    35  shared Alloc sh_chunk_alloc[N_TILE]; // output chunk per bin; written only for bins with at least one covering element
    36  shared bool sh_alloc_failed; // set when any invocation's malloc reports failure
    37  
    38  void main() { // Each invocation bins one annotated element, then writes the (count, chunk offset) pair for one bin.
    39      uint my_n_elements = conf.n_elements;
    40      uint my_partition = gl_WorkGroupID.x; // one partition of N_TILE elements per workgroup
    41      // Clear this invocation's entry in every bitmap slice and reset the shared failure flag.
    42      for (uint i = 0; i < N_SLICE; i++) {
    43          bitmaps[i][gl_LocalInvocationID.x] = 0;
    44      }
    45      if (gl_LocalInvocationID.x == 0) {
    46          sh_alloc_failed = false;
    47      }
    48      barrier(); // shared state is fully initialized before any invocation starts setting bits
    49  
    50      // Read inputs and determine coverage of bins
    51      uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
    52      AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
    53      uint tag = Annotated_Nop; // invocations past the end of the input keep the Nop tag
    54      if (element_ix < my_n_elements) {
    55          tag = Annotated_tag(conf.anno_alloc, ref).tag;
    56      }
    57      int x0 = 0, y0 = 0, x1 = 0, y1 = 0; // bbox in bin coordinates; stays empty for tags not handled below
    58      switch (tag) {
    59      case Annotated_Color:
    60      case Annotated_Image:
    61      case Annotated_BeginClip:
    62      case Annotated_EndClip:
    63          // Note: we take advantage of the fact that these drawing elements
    64          // have the bbox at the same place in their layout.
    65          AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
    66          x0 = int(floor(clip.bbox.x * SX)); // min corner rounds down, max corner rounds up: bbox grows to whole bins
    67          y0 = int(floor(clip.bbox.y * SY));
    68          x1 = int(ceil(clip.bbox.z * SX));
    69          y1 = int(ceil(clip.bbox.w * SY));
    70          break;
    71      }
    72  
    73      // At this point, we run an iterator over the coverage area,
    74      // trying to keep divergence low.
    75      // Right now, it's just a bbox, but we'll get finer with
    76      // segments.
    77      uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X; // tiles -> bins, rounding up
    78      uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
    79      x0 = clamp(x0, 0, int(width_in_bins)); // clamp to the bin grid while keeping x0 <= x1 and y0 <= y1
    80      x1 = clamp(x1, x0, int(width_in_bins));
    81      y0 = clamp(y0, 0, int(height_in_bins));
    82      y1 = clamp(y1, y0, int(height_in_bins));
    83      if (x0 == x1) y1 = y0; // zero-width bbox: empty the y range too so both bbox loops below are skipped
    84      int x = x0, y = y0;
    85      uint my_slice = gl_LocalInvocationID.x / 32; // which 32-bit submap holds this element's bit
    86      uint my_mask = 1 << (gl_LocalInvocationID.x & 31); // this element's bit within that submap
    87      while (y < y1) {
    88          atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask); // mark this element as covering bin (x, y)
    89          x++;
    90          if (x == x1) {
    91              x = x0;
    92              y++;
    93          }
    94      }
    95  
    96      barrier(); // every coverage bit is set before the per-bin counts are read
    97      // Allocate output segments. From here on, gl_LocalInvocationID.x is also used as a bin index.
    98      uint element_count = 0;
    99      for (uint i = 0; i < N_SLICE; i++) {
   100          element_count += bitCount(bitmaps[i][gl_LocalInvocationID.x]);
   101          count[i][gl_LocalInvocationID.x] = element_count; // inclusive running sum over slices 0..i
   102      }
   103      // element_count is number of elements covering bin for this invocation.
   104      Alloc chunk_alloc = new_alloc(0, 0, true); // value kept (and its offset written out) when no element covers this bin
   105      if (element_count != 0) {
   106          // TODO: aggregate atomic adds (subgroup is probably fastest)
   107          MallocResult chunk = malloc(element_count * BinInstance_size);
   108          chunk_alloc = chunk.alloc;
   109          sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc; // shared so covering elements can look it up in the final loop
   110          if (chunk.failed) {
   111              sh_alloc_failed = true;
   112          }
   113      }
   114      // Note: it might be more efficient for reading to do this in the
   115      // other order (each bin is a contiguous sequence of partitions)
   116      uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; // two words per (partition, bin)
   117      write_mem(conf.bin_alloc, out_ix, element_count);
   118      write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
   119  
   120      barrier(); // sh_chunk_alloc and sh_alloc_failed are complete before they are read below
   121      if (sh_alloc_failed || mem_error != NO_ERROR) { // skip writing instances if an allocation failed or mem_error is set
   122          return;
   123      }
   124  
   125      // Use similar strategy as Laine & Karras paper; loop over bbox of bins
   126      // touched by this element
   127      x = x0;
   128      y = y0;
   129      while (y < y1) {
   130          uint bin_ix = y * width_in_bins + x;
   131          uint out_mask = bitmaps[my_slice][bin_ix];
   132          if ((out_mask & my_mask) != 0) {
   133              uint idx = bitCount(out_mask & (my_mask - 1)); // covering elements with lower bit positions in the same slice
   134              if (my_slice > 0) {
   135                  idx += count[my_slice - 1][bin_ix]; // plus all covering elements in earlier slices
   136              }
   137              Alloc out_alloc = sh_chunk_alloc[bin_ix];
   138              uint out_offset = out_alloc.offset + idx * BinInstance_size; // slot idx within the bin's chunk
   139              BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
   140          }
   141          x++;
   142          if (x == x1) {
   143              x = x0;
   144              y++;
   145          }
   146      }
   147  }