github.com/cybriq/giocore@v0.0.7-0.20210703034601-cfb9cb5f3900/gpu/shaders/path_coarse.comp

github.com/cybriq/giocore@v0.0.7-0.20210703034601-cfb9cb5f3900/gpu/shaders/path_coarse.comp (about)

     1  // SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
     2  
     3  // Coarse rasterization of path segments.
     4  
     5  // Allocation and initialization of tiles for paths.
     6  
     7  #version 450
     8  #extension GL_GOOGLE_include_directive : enable
     9  
    10  #include "mem.h"
    11  #include "setup.h"
    12  
    13  #define LG_COARSE_WG 5
    14  #define COARSE_WG (1 << LG_COARSE_WG)
    15  
    16  layout(local_size_x = COARSE_WG, local_size_y = 1) in;
    17  
    18  layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    19      Config conf;
    20  };
    21  
    22  #include "pathseg.h"
    23  #include "tile.h"
    24  
    25  // scale factors useful for converting coordinates to tiles
    26  #define SX (1.0 / float(TILE_WIDTH_PX))
    27  #define SY (1.0 / float(TILE_HEIGHT_PX))
    28  
    29  #define ACCURACY 0.25
    30  #define Q_ACCURACY (ACCURACY * 0.1)
    31  #define REM_ACCURACY (ACCURACY - Q_ACCURACY)
    32  #define MAX_HYPOT2 (432.0 * Q_ACCURACY * Q_ACCURACY)
    33  
    34  vec2 eval_quad(vec2 p0, vec2 p1, vec2 p2, float t) {
    35      float mt = 1.0 - t;
    36      return p0 * (mt * mt) + (p1 * (mt * 2.0) + p2 * t) * t;
    37  }
    38  
    39  vec2 eval_cubic(vec2 p0, vec2 p1, vec2 p2, vec2 p3, float t) {
    40      float mt = 1.0 - t;
    41      return p0 * (mt * mt * mt) + (p1 * (mt * mt * 3.0) + (p2 * (mt * 3.0) + p3 * t) * t) * t;
    42  }
    43  
    44  struct SubdivResult {
    45      float val;
    46      float a0;
    47      float a2;
    48  };
    49  
    50  /// An approximation to $\int (1 + 4x^2) ^ -0.25 dx$
    51  ///
    52  /// This is used for flattening curves.
    53  #define D 0.67
    54  float approx_parabola_integral(float x) {
    55      return x * inversesqrt(sqrt(1.0 - D + (D * D * D * D + 0.25 * x * x)));
    56  }
    57  
    58  /// An approximation to the inverse parabola integral.
    59  #define B 0.39
    60  float approx_parabola_inv_integral(float x) {
    61      return x * sqrt(1.0 - B + (B * B + 0.25 * x * x));
    62  }
    63  
    64  SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
    65      vec2 d01 = p1 - p0;
    66      vec2 d12 = p2 - p1;
    67      vec2 dd = d01 - d12;
    68      float cross = (p2.x - p0.x) * dd.y - (p2.y - p0.y) * dd.x;
    69      float x0 = (d01.x * dd.x + d01.y * dd.y) / cross;
    70      float x2 = (d12.x * dd.x + d12.y * dd.y) / cross;
    71      float scale = abs(cross / (length(dd) * (x2 - x0)));
    72  
    73      float a0 = approx_parabola_integral(x0);
    74      float a2 = approx_parabola_integral(x2);
    75      float val = 0.0;
    76      if (scale < 1e9) {
    77          float da = abs(a2 - a0);
    78          float sqrt_scale = sqrt(scale);
    79          if (sign(x0) == sign(x2)) {
    80              val = da * sqrt_scale;
    81          } else {
    82              float xmin = sqrt_tol / sqrt_scale;
    83              val = sqrt_tol * da / approx_parabola_integral(xmin);
    84          }
    85      }
    86      return SubdivResult(val, a0, a2);
    87  }
    88  
    89  void main() {
    90      uint element_ix = gl_GlobalInvocationID.x;
    91      PathSegRef ref = PathSegRef(conf.pathseg_alloc.offset + element_ix * PathSeg_size);
    92  
    93      PathSegTag tag = PathSegTag(PathSeg_Nop, 0);
    94      if (element_ix < conf.n_pathseg) {
    95          tag = PathSeg_tag(conf.pathseg_alloc, ref);
    96      }
    97      bool mem_ok = mem_error == NO_ERROR;
    98      switch (tag.tag) {
    99      case PathSeg_Cubic:
   100          PathCubic cubic = PathSeg_Cubic_read(conf.pathseg_alloc, ref);
   101  
   102          uint trans_ix = cubic.trans_ix;
   103          if (trans_ix > 0) {
   104              TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (trans_ix - 1) * TransformSeg_size);
   105              TransformSeg trans = TransformSeg_read(conf.trans_alloc, trans_ref);
   106              cubic.p0 = trans.mat.xy * cubic.p0.x + trans.mat.zw * cubic.p0.y + trans.translate;
   107              cubic.p1 = trans.mat.xy * cubic.p1.x + trans.mat.zw * cubic.p1.y + trans.translate;
   108              cubic.p2 = trans.mat.xy * cubic.p2.x + trans.mat.zw * cubic.p2.y + trans.translate;
   109              cubic.p3 = trans.mat.xy * cubic.p3.x + trans.mat.zw * cubic.p3.y + trans.translate;
   110          }
   111  
   112          vec2 err_v = 3.0 * (cubic.p2 - cubic.p1) + cubic.p0 - cubic.p3;
   113          float err = err_v.x * err_v.x + err_v.y * err_v.y;
   114          // The number of quadratics.
   115          uint n_quads = max(uint(ceil(pow(err * (1.0 / MAX_HYPOT2), 1.0 / 6.0))), 1);
   116          // Iterate over quadratics and tote up the estimated number of segments.
   117          float val = 0.0;
   118          vec2 qp0 = cubic.p0;
   119          float step = 1.0 / float(n_quads);
   120          for (uint i = 0; i < n_quads; i++) {
   121              float t = float(i + 1) * step;
   122              vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
   123              vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
   124              qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
   125              SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
   126              val += params.val;
   127  
   128              qp0 = qp2;
   129          }
   130          uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);
   131  
   132          bool is_stroke = fill_mode_from_flags(tag.flags) == MODE_STROKE;
   133          uint path_ix = cubic.path_ix;
   134          Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
   135          Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
   136          ivec4 bbox = ivec4(path.bbox);
   137          vec2 p0 = cubic.p0;
   138          qp0 = cubic.p0;
   139          float v_step = val / float(n);
   140          int n_out = 1;
   141          float val_sum = 0.0;
   142          for (uint i = 0; i < n_quads; i++) {
   143              float t = float(i + 1) * step;
   144              vec2 qp2 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t);
   145              vec2 qp1 = eval_cubic(cubic.p0, cubic.p1, cubic.p2, cubic.p3, t - 0.5 * step);
   146              qp1 = 2.0 * qp1 - 0.5 * (qp0 + qp2);
   147              SubdivResult params = estimate_subdiv(qp0, qp1, qp2, sqrt(REM_ACCURACY));
   148              float u0 = approx_parabola_inv_integral(params.a0);
   149              float u2 = approx_parabola_inv_integral(params.a2);
   150              float uscale = 1.0 / (u2 - u0);
   151              float target = float(n_out) * v_step;
   152              while (n_out == n || target < val_sum + params.val) {
   153                  vec2 p1;
   154                  if (n_out == n) {
   155                      p1 = cubic.p3;
   156                  } else {
   157                      float u = (target - val_sum) / params.val;
   158                      float a = mix(params.a0, params.a2, u);
   159                      float au = approx_parabola_inv_integral(a);
   160                      float t = (au - u0) * uscale;
   161                      p1 = eval_quad(qp0, qp1, qp2, t);
   162                  }
   163  
   164                  // Output line segment
   165  
   166                  // Bounding box of element in pixel coordinates.
   167                  float xmin = min(p0.x, p1.x) - cubic.stroke.x;
   168                  float xmax = max(p0.x, p1.x) + cubic.stroke.x;
   169                  float ymin = min(p0.y, p1.y) - cubic.stroke.y;
   170                  float ymax = max(p0.y, p1.y) + cubic.stroke.y;
   171                  float dx = p1.x - p0.x;
   172                  float dy = p1.y - p0.y;
   173                  // Set up for per-scanline coverage formula, below.
   174                  float invslope = abs(dy) < 1e-9 ? 1e9 : dx / dy;
   175                  float c = (cubic.stroke.x + abs(invslope) * (0.5 * float(TILE_HEIGHT_PX) + cubic.stroke.y)) * SX;
   176                  float b = invslope; // Note: assumes square tiles, otherwise scale.
   177                  float a = (p0.x - (p0.y - 0.5 * float(TILE_HEIGHT_PX)) * b) * SX;
   178  
   179                  int x0 = int(floor(xmin * SX));
   180                  int x1 = int(floor(xmax * SX) + 1);
   181                  int y0 = int(floor(ymin * SY));
   182                  int y1 = int(floor(ymax * SY) + 1);
   183  
   184                  x0 = clamp(x0, bbox.x, bbox.z);
   185                  y0 = clamp(y0, bbox.y, bbox.w);
   186                  x1 = clamp(x1, bbox.x, bbox.z);
   187                  y1 = clamp(y1, bbox.y, bbox.w);
   188                  float xc = a + b * float(y0);
   189                  int stride = bbox.z - bbox.x;
   190                  int base = (y0 - bbox.y) * stride - bbox.x;
   191                  // TODO: can be tighter, use c to bound width
   192                  uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
   193                  // Consider using subgroups to aggregate atomic add.
   194                  MallocResult tile_alloc = malloc(n_tile_alloc * TileSeg_size);
   195                  if (tile_alloc.failed || !mem_ok) {
   196                      return;
   197                  }
   198                  uint tile_offset = tile_alloc.alloc.offset;
   199  
   200                  TileSeg tile_seg;
   201  
   202                  int xray = int(floor(p0.x*SX));
   203                  int last_xray = int(floor(p1.x*SX));
   204                  if (p0.y > p1.y) {
   205                      int tmp = xray;
   206                      xray = last_xray;
   207                      last_xray = tmp;
   208                  }
   209                  for (int y = y0; y < y1; y++) {
   210                      float tile_y0 = float(y * TILE_HEIGHT_PX);
   211                      int xbackdrop = max(xray + 1, bbox.x);
   212                      if (!is_stroke && min(p0.y, p1.y) < tile_y0 && xbackdrop < bbox.z) {
   213                          int backdrop = p1.y < p0.y ? 1 : -1;
   214                          TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
   215                          uint tile_el = tile_ref.offset >> 2;
   216                          if (touch_mem(path_alloc, tile_el + 1)) {
   217                              atomicAdd(memory[tile_el + 1], backdrop);
   218                          }
   219                      }
   220  
   221                      // next_xray is the xray for the next scanline; the line segment intersects
   222                      // all tiles between xray and next_xray.
   223                      int next_xray = last_xray;
   224                      if (y < y1 - 1) {
   225                          float tile_y1 = float((y + 1) * TILE_HEIGHT_PX);
   226                          float x_edge = mix(p0.x, p1.x, (tile_y1 - p0.y) / dy);
   227                          next_xray = int(floor(x_edge*SX));
   228                      }
   229  
   230                      int min_xray = min(xray, next_xray);
   231                      int max_xray = max(xray, next_xray);
   232                      int xx0 = min(int(floor(xc - c)), min_xray);
   233                      int xx1 = max(int(ceil(xc + c)), max_xray + 1);
   234                      xx0 = clamp(xx0, x0, x1);
   235                      xx1 = clamp(xx1, x0, x1);
   236  
   237                      for (int x = xx0; x < xx1; x++) {
   238                          float tile_x0 = float(x * TILE_WIDTH_PX);
   239                          TileRef tile_ref = Tile_index(TileRef(path.tiles.offset), uint(base + x));
   240                          uint tile_el = tile_ref.offset >> 2;
   241                          uint old = 0;
   242                          if (touch_mem(path_alloc, tile_el)) {
   243                              old = atomicExchange(memory[tile_el], tile_offset);
   244                          }
   245                          tile_seg.origin = p0;
   246                          tile_seg.vector = p1 - p0;
   247                          float y_edge = 0.0;
   248                          if (!is_stroke) {
   249                              y_edge = mix(p0.y, p1.y, (tile_x0 - p0.x) / dx);
   250                              if (min(p0.x, p1.x) < tile_x0) {
   251                                  vec2 p = vec2(tile_x0, y_edge);
   252                                  if (p0.x > p1.x) {
   253                                      tile_seg.vector = p - p0;
   254                                  } else {
   255                                      tile_seg.origin = p;
   256                                      tile_seg.vector = p1 - p;
   257                                  }
   258                                  // kernel4 uses sign(vector.x) for the sign of the intersection backdrop.
   259                                  // Nudge zeroes towards the intended sign.
   260                                  if (tile_seg.vector.x == 0) {
   261                                      tile_seg.vector.x = sign(p1.x - p0.x)*1e-9;
   262                                  }
   263                              }
   264                              if (x <= min_xray || max_xray < x) {
   265                                  // Reject inconsistent intersections.
   266                                  y_edge = 1e9;
   267                              }
   268                          }
   269                          tile_seg.y_edge = y_edge;
   270                          tile_seg.next.offset = old;
   271                          TileSeg_write(tile_alloc.alloc, TileSegRef(tile_offset), tile_seg);
   272                          tile_offset += TileSeg_size;
   273                      }
   274                      xc += b;
   275                      base += stride;
   276                      xray = next_xray;
   277                  }
   278  
   279                  n_out += 1;
   280                  target += v_step;
   281                  p0 = p1;
   282              }
   283              val_sum += params.val;
   284  
   285              qp0 = qp2;
   286          }
   287  
   288          break;
   289      }
   290  }