github.com/cybriq/giocore@v0.0.7-0.20210703034601-cfb9cb5f3900/gpu/shaders/elements.comp (about)

     1  // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
     2  
     3  // The element processing stage, first in the pipeline.
     4  //
     5  // This stage is primarily about applying transforms and computing bounding
     6  // boxes. It is organized as a scan over the input elements, producing
     7  // annotated output elements.
     8  
     9  #version 450
    10  #extension GL_GOOGLE_include_directive : enable
    11  
    12  #include "mem.h"
    13  #include "setup.h"
    14  
    15  #define N_ROWS 4
    16  #define WG_SIZE 32
    17  #define LG_WG_SIZE 5
    18  #define PARTITION_SIZE (WG_SIZE * N_ROWS)
    19  
    20  layout(local_size_x = WG_SIZE, local_size_y = 1) in;
    21  
    22  layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    23      Config conf;
    24  };
    25  
    26  layout(set = 0, binding = 2) readonly buffer SceneBuf {
    27      uint[] scene;
    28  };
    29  
    30  // It would be better to use the Vulkan memory model than
    31  // "volatile" but shooting for compatibility here rather
    32  // than doing things right.
    33  layout(set = 0, binding = 3) volatile buffer StateBuf {
    34      uint part_counter;
    35      uint[] state;
    36  };
    37  
    38  #include "scene.h"
    39  #include "state.h"
    40  #include "annotated.h"
    41  #include "pathseg.h"
    42  #include "tile.h"
    43  
    44  #define StateBuf_stride (4 + 2 * State_size)
    45  
    46  StateRef state_aggregate_ref(uint partition_ix) {
    47      return StateRef(4 + partition_ix * StateBuf_stride);
    48  }
    49  
    50  StateRef state_prefix_ref(uint partition_ix) {
    51      return StateRef(4 + partition_ix * StateBuf_stride + State_size);
    52  }
    53  
    54  uint state_flag_index(uint partition_ix) {
    55      return partition_ix * (StateBuf_stride / 4);
    56  }
    57  
    58  // These correspond to X, A, P respectively in the prefix sum paper.
    59  #define FLAG_NOT_READY 0
    60  #define FLAG_AGGREGATE_READY 1
    61  #define FLAG_PREFIX_READY 2
    62  
    63  #define FLAG_SET_LINEWIDTH 1
    64  #define FLAG_SET_BBOX 2
    65  #define FLAG_RESET_BBOX 4
    66  #define FLAG_SET_FILL_MODE 8
    67  // Fill modes take up the next bit. Non-zero fill is 0, stroke is 1.
    68  #define LG_FILL_MODE 4
    69  #define FILL_MODE_BITS 1
    70  #define FILL_MODE_MASK (FILL_MODE_BITS << LG_FILL_MODE)
    71  
    72  // This is almost like a monoid (the interaction between transformation and
    73  // bounding boxes is approximate)
// This is almost like a monoid (the interaction between transformation and
// bounding boxes is approximate)
//
// Combines two States, with `a` logically preceding `b` in the scene stream:
// composes the affine transforms, merges bounding boxes (transforming b's
// bbox by a's transform), propagates linewidth/fill-mode overrides, and sums
// the path/pathseg/transform counters.
State combine_state(State a, State b) {
    State c;
    // Axis-aligned bbox of b.bbox's corners mapped through a's affine
    // transform (mat is a 2x2 stored as vec4, plus translate).
    c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    // If b's bbox is empty (z<=x and w<=y) and a does not reset the bbox,
    // just carry a's bbox forward.
    if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
        c.bbox = a.bbox;
    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
    {
        // Both sides contribute: take the union of a's (nonempty) bbox and
        // the transformed b bbox computed above.
        c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
    }
    // It would be more concise to cast to matrix types; ah well.
    // 2x2 matrix product a.mat * b.mat, components spelled out.
    c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
    c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
    c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
    c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
    // Composed translation: a.mat * b.translate + a.translate.
    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
    // Last SetLineWidth wins: b's linewidth overrides a's only if b set one.
    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
    // Sticky flags from a (linewidth/bbox/fill-mode "was set") OR'd with all
    // of b's flags.
    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX | FLAG_SET_FILL_MODE)) | b.flags;
    // A RESET_BBOX (4) on the left becomes SET_BBOX (2) on the combined
    // state: the shift by 1 maps one flag bit onto the other.
    c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
    // Fill mode: take b's if b set one, otherwise inherit a's.
    uint fill_mode = (b.flags & FLAG_SET_FILL_MODE) == 0 ? a.flags : b.flags;
    fill_mode &= FILL_MODE_MASK;
    c.flags = (c.flags & ~FILL_MODE_MASK) | fill_mode;
    // Counters are plainly additive.
    c.path_count = a.path_count + b.path_count;
    c.pathseg_count = a.pathseg_count + b.pathseg_count;
    c.trans_count = a.trans_count + b.trans_count;
    return c;
}
   106  
   107  State map_element(ElementRef ref) {
   108      // TODO: it would *probably* be more efficient to make the memory read patterns less
   109      // divergent, though it would be more wasted memory.
   110      uint tag = Element_tag(ref).tag;
   111      State c;
   112      c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
   113      c.mat = vec4(1.0, 0.0, 0.0, 1.0);
   114      c.translate = vec2(0.0, 0.0);
   115      c.linewidth = 1.0; // TODO should be 0.0
   116      c.flags = 0;
   117      c.path_count = 0;
   118      c.pathseg_count = 0;
   119      c.trans_count = 0;
   120      switch (tag) {
   121      case Element_Line:
   122          LineSeg line = Element_Line_read(ref);
   123          c.bbox.xy = min(line.p0, line.p1);
   124          c.bbox.zw = max(line.p0, line.p1);
   125          c.pathseg_count = 1;
   126          break;
   127      case Element_Quad:
   128          QuadSeg quad = Element_Quad_read(ref);
   129          c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
   130          c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
   131          c.pathseg_count = 1;
   132          break;
   133      case Element_Cubic:
   134          CubicSeg cubic = Element_Cubic_read(ref);
   135          c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
   136          c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
   137          c.pathseg_count = 1;
   138          break;
   139      case Element_FillColor:
   140      case Element_FillImage:
   141      case Element_BeginClip:
   142          c.flags = FLAG_RESET_BBOX;
   143          c.path_count = 1;
   144          break;
   145      case Element_EndClip:
   146          c.path_count = 1;
   147          break;
   148      case Element_SetLineWidth:
   149          SetLineWidth lw = Element_SetLineWidth_read(ref);
   150          c.linewidth = lw.width;
   151          c.flags = FLAG_SET_LINEWIDTH;
   152          break;
   153      case Element_Transform:
   154          Transform t = Element_Transform_read(ref);
   155          c.mat = t.mat;
   156          c.translate = t.translate;
   157          c.trans_count = 1;
   158          break;
   159      case Element_SetFillMode:
   160          SetFillMode fm = Element_SetFillMode_read(ref);
   161          c.flags = FLAG_SET_FILL_MODE | (fm.fill_mode << LG_FILL_MODE);
   162          break;
   163      }
   164      return c;
   165  }
   166  
   167  // Get the bounding box of a circle transformed by the matrix into an ellipse.
   168  vec2 get_linewidth(State st) {
   169      // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
   170      return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw));
   171  }
   172  
   173  shared State sh_state[WG_SIZE];
   174  
   175  shared uint sh_part_ix;
   176  shared State sh_prefix;
   177  
   178  void main() {
   179      State th_state[N_ROWS];
   180      // Determine partition to process by atomic counter (described in Section
   181      // 4.4 of prefix sum paper).
   182      if (gl_LocalInvocationID.x == 0) {
   183          sh_part_ix = atomicAdd(part_counter, 1);
   184      }
   185      barrier();
   186      uint part_ix = sh_part_ix;
   187  
   188      uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
   189      ElementRef ref = ElementRef(ix * Element_size);
   190  
   191      th_state[0] = map_element(ref);
   192      for (uint i = 1; i < N_ROWS; i++) {
   193          // discussion question: would it be faster to load using more coherent patterns
   194          // into thread memory? This is kinda strided.
   195          th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
   196      }
   197      State agg = th_state[N_ROWS - 1];
   198      sh_state[gl_LocalInvocationID.x] = agg;
   199      for (uint i = 0; i < LG_WG_SIZE; i++) {
   200          barrier();
   201          if (gl_LocalInvocationID.x >= (1 << i)) {
   202              State other = sh_state[gl_LocalInvocationID.x - (1 << i)];
   203              agg = combine_state(other, agg);
   204          }
   205          barrier();
   206          sh_state[gl_LocalInvocationID.x] = agg;
   207      }
   208  
   209      State exclusive;
   210      exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
   211      exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
   212      exclusive.translate = vec2(0.0, 0.0);
   213      exclusive.linewidth = 1.0; //TODO should be 0.0
   214      exclusive.flags = 0;
   215      exclusive.path_count = 0;
   216      exclusive.pathseg_count = 0;
   217      exclusive.trans_count = 0;
   218  
   219      // Publish aggregate for this partition
   220      if (gl_LocalInvocationID.x == WG_SIZE - 1) {
   221          // Note: with memory model, we'd want to generate the atomic store version of this.
   222          State_write(state_aggregate_ref(part_ix), agg);
   223          uint flag = FLAG_AGGREGATE_READY;
   224          memoryBarrierBuffer();
   225          if (part_ix == 0) {
   226              State_write(state_prefix_ref(part_ix), agg);
   227              flag = FLAG_PREFIX_READY;
   228          }
   229          state[state_flag_index(part_ix)] = flag;
   230          if (part_ix != 0) {
   231              // step 4 of paper: decoupled lookback
   232              uint look_back_ix = part_ix - 1;
   233  
   234              State their_agg;
   235              uint their_ix = 0;
   236              while (true) {
   237                  flag = state[state_flag_index(look_back_ix)];
   238                  if (flag == FLAG_PREFIX_READY) {
   239                      State their_prefix = State_read(state_prefix_ref(look_back_ix));
   240                      exclusive = combine_state(their_prefix, exclusive);
   241                      break;
   242                  } else if (flag == FLAG_AGGREGATE_READY) {
   243                      their_agg = State_read(state_aggregate_ref(look_back_ix));
   244                      exclusive = combine_state(their_agg, exclusive);
   245                      look_back_ix--;
   246                      their_ix = 0;
   247                      continue;
   248                  }
   249                  // else spin
   250  
   251                  // Unfortunately there's no guarantee of forward progress of other
   252                  // workgroups, so compute a bit of the aggregate before trying again.
   253                  // In the worst case, spinning stops when the aggregate is complete.
   254                  ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
   255                  State s = map_element(ref);
   256                  if (their_ix == 0) {
   257                      their_agg = s;
   258                  } else {
   259                      their_agg = combine_state(their_agg, s);
   260                  }
   261                  their_ix++;
   262                  if (their_ix == PARTITION_SIZE) {
   263                      exclusive = combine_state(their_agg, exclusive);
   264                      if (look_back_ix == 0) {
   265                          break;
   266                      }
   267                      look_back_ix--;
   268                      their_ix = 0;
   269                  }
   270              }
   271  
   272              // step 5 of paper: compute inclusive prefix
   273              State inclusive_prefix = combine_state(exclusive, agg);
   274              sh_prefix = exclusive;
   275              State_write(state_prefix_ref(part_ix), inclusive_prefix);
   276              memoryBarrierBuffer();
   277              flag = FLAG_PREFIX_READY;
   278              state[state_flag_index(part_ix)] = flag;
   279          }
   280      }
   281      barrier();
   282      if (part_ix != 0) {
   283          exclusive = sh_prefix;
   284      }
   285  
   286      State row = exclusive;
   287      if (gl_LocalInvocationID.x > 0) {
   288          State other = sh_state[gl_LocalInvocationID.x - 1];
   289          row = combine_state(row, other);
   290      }
   291      for (uint i = 0; i < N_ROWS; i++) {
   292          State st = combine_state(row, th_state[i]);
   293  
   294          // Here we read again from the original scene. There may be
   295          // gains to be had from stashing in shared memory or possibly
   296          // registers (though register pressure is an issue).
   297          ElementRef this_ref = Element_index(ref, i);
   298          ElementTag tag = Element_tag(this_ref);
   299          uint fill_mode = fill_mode_from_flags(st.flags >> LG_FILL_MODE);
   300          bool is_stroke = fill_mode == MODE_STROKE;
   301          switch (tag.tag) {
   302          case Element_Line:
   303              LineSeg line = Element_Line_read(this_ref);
   304              PathCubic path_cubic;
   305              path_cubic.p0 = line.p0;
   306              path_cubic.p1 = mix(line.p0, line.p1, 1.0 / 3.0);
   307              path_cubic.p2 = mix(line.p1, line.p0, 1.0 / 3.0);
   308              path_cubic.p3 = line.p1;
   309              path_cubic.path_ix = st.path_count;
   310              path_cubic.trans_ix = st.trans_count;
   311              if (is_stroke) {
   312                  path_cubic.stroke = get_linewidth(st);
   313              } else {
   314                  path_cubic.stroke = vec2(0.0);
   315              }
   316              PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
   317              PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
   318              break;
   319          case Element_Quad:
   320              QuadSeg quad = Element_Quad_read(this_ref);
   321              path_cubic.p0 = quad.p0;
   322              path_cubic.p1 = mix(quad.p1, quad.p0, 1.0 / 3.0);
   323              path_cubic.p2 = mix(quad.p1, quad.p2, 1.0 / 3.0);
   324              path_cubic.p3 = quad.p2;
   325              path_cubic.path_ix = st.path_count;
   326              path_cubic.trans_ix = st.trans_count;
   327              if (is_stroke) {
   328                  path_cubic.stroke = get_linewidth(st);
   329              } else {
   330                  path_cubic.stroke = vec2(0.0);
   331              }
   332              path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
   333              PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
   334              break;
   335          case Element_Cubic:
   336              CubicSeg cubic = Element_Cubic_read(this_ref);
   337              path_cubic.p0 = cubic.p0;
   338              path_cubic.p1 = cubic.p1;
   339              path_cubic.p2 = cubic.p2;
   340              path_cubic.p3 = cubic.p3;
   341              path_cubic.path_ix = st.path_count;
   342              path_cubic.trans_ix = st.trans_count;
   343              if (is_stroke) {
   344                  path_cubic.stroke = get_linewidth(st);
   345              } else {
   346                  path_cubic.stroke = vec2(0.0);
   347              }
   348              path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
   349              PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
   350              break;
   351          case Element_FillColor:
   352              FillColor fill = Element_FillColor_read(this_ref);
   353              AnnoColor anno_fill;
   354              anno_fill.rgba_color = fill.rgba_color;
   355              if (is_stroke) {
   356                  vec2 lw = get_linewidth(st);
   357                  anno_fill.bbox = st.bbox + vec4(-lw, lw);
   358                  anno_fill.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
   359              } else {
   360                  anno_fill.bbox = st.bbox;
   361                  anno_fill.linewidth = 0.0;
   362              }
   363              AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
   364              Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
   365              break;
   366          case Element_FillImage:
   367              FillImage fill_img = Element_FillImage_read(this_ref);
   368              AnnoImage anno_img;
   369              anno_img.index = fill_img.index;
   370              anno_img.offset = fill_img.offset;
   371              if (is_stroke) {
   372                  vec2 lw = get_linewidth(st);
   373                  anno_img.bbox = st.bbox + vec4(-lw, lw);
   374                  anno_img.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
   375              } else {
   376                  anno_img.bbox = st.bbox;
   377                  anno_img.linewidth = 0.0;
   378              }
   379              out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
   380              Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
   381              break;
   382          case Element_BeginClip:
   383              Clip begin_clip = Element_BeginClip_read(this_ref);
   384              AnnoBeginClip anno_begin_clip;
   385              // This is the absolute bbox, it's been transformed during encoding.
   386              anno_begin_clip.bbox = begin_clip.bbox;
   387              if (is_stroke) {
   388                  vec2 lw = get_linewidth(st);
   389                  anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
   390              } else {
   391                  anno_fill.linewidth = 0.0;
   392              }
   393              out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
   394              Annotated_BeginClip_write(conf.anno_alloc, out_ref, fill_mode, anno_begin_clip);
   395              break;
   396          case Element_EndClip:
   397              Clip end_clip = Element_EndClip_read(this_ref);
   398              // This bbox is expected to be the same as the begin one.
   399              AnnoEndClip anno_end_clip = AnnoEndClip(end_clip.bbox);
   400              out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
   401              Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
   402              break;
   403          case Element_Transform:
   404              TransformSeg transform = TransformSeg(st.mat, st.translate);
   405              TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (st.trans_count - 1) * TransformSeg_size);
   406              TransformSeg_write(conf.trans_alloc, trans_ref, transform);
   407              break;
   408          }
   409      }
   410  }