github.com/cybriq/giocore@v0.0.7-0.20210703034601-cfb9cb5f3900/gpu/shaders/coarse.comp

// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The coarse rasterizer stage of the pipeline.
//
// As input we have the ordered partitions of paths from the binning phase and
// the annotated tile list of segments and backdrop per path.
//
// Each workgroup operates on one bin, stream compacting the elements
// corresponding to that bin.
//
// As output we have an ordered command stream per tile. Every tile from a
// path (backdrop + segment list) will be encoded.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

layout(local_size_x = N_TILE, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

#include "annotated.h"
#include "bins.h"
#include "tile.h"
#include "ptcl.h"

#define LG_N_PART_READ (7 + LG_WG_FACTOR)
#define N_PART_READ (1 << LG_N_PART_READ)

shared uint sh_elements[N_TILE];

// Number of elements in the partition; prefix sum.
shared uint sh_part_count[N_PART_READ];
shared Alloc sh_part_elements[N_PART_READ];

shared uint sh_bitmaps[N_SLICE][N_TILE];

shared uint sh_tile_count[N_TILE];
// The width of the tile rect for the element, intersected with this bin.
shared uint sh_tile_width[N_TILE];
shared uint sh_tile_x0[N_TILE];
shared uint sh_tile_y0[N_TILE];

// These are set up so base + tile_y * stride + tile_x points to a Tile.
shared uint sh_tile_base[N_TILE];
shared uint sh_tile_stride[N_TILE];

#ifdef MEM_DEBUG
// Store allocs only when MEM_DEBUG; otherwise skip them to save shared
// memory traffic.
shared Alloc sh_tile_alloc[N_TILE];

void write_tile_alloc(uint el_ix, Alloc a) {
    sh_tile_alloc[el_ix] = a;
}

Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
    return sh_tile_alloc[el_ix];
}
#else
void write_tile_alloc(uint el_ix, Alloc a) {
    // No-op
}

Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
    // All memory.
    return new_alloc(0, memory.length()*4, mem_ok);
}
#endif

// The maximum number of commands per annotated element.
#define ANNO_COMMANDS 2

// Perhaps cmd_alloc should be a global? This is a style question.
bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
    if (cmd_ref.offset < cmd_limit) {
        return true;
    }
    MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC);
    if (new_cmd.failed) {
        return false;
    }
    CmdJump jump = CmdJump(new_cmd.alloc.offset);
    Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
    cmd_alloc = new_cmd.alloc;
    cmd_ref = CmdRef(cmd_alloc.offset);
    // Reserve space for the maximum number of commands and a potential jump.
    cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
    return true;
}
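
// How the command list grows: each tile's ptcl stream begins in a fixed
// PTCL_INITIAL_ALLOC slice. When the write cursor reaches cmd_limit,
// alloc_cmd mallocs a fresh chunk and links it in with a Cmd_Jump, so the
// stream becomes a linked list of chunks. Because cmd_limit reserves room
// for ANNO_COMMANDS commands plus the jump itself, one alloc_cmd check per
// element suffices. As a purely illustrative example (these are not the
// real constants; see setup.h and ptcl.h for those), with
// PTCL_INITIAL_ALLOC = 1024 and Cmd_size = 20, the limit would be
// offset + 1024 - 3 * 20 = offset + 964.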

void write_fill(Alloc alloc, inout CmdRef cmd_ref, uint flags, Tile tile, float linewidth) {
    if (fill_mode_from_flags(flags) == MODE_NONZERO) {
        if (tile.tile.offset != 0) {
            CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
            Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
            cmd_ref.offset += 4 + CmdFill_size;
        } else {
            Cmd_Solid_write(alloc, cmd_ref);
            cmd_ref.offset += 4;
        }
    } else {
        CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
        Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
        cmd_ref.offset += 4 + CmdStroke_size;
    }
}

void main() {
    // Could use either linear or 2d layouts for both dispatch and
    // invocations within the workgroup. We'll use variables to abstract.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
    uint bin_ix = width_in_bins * gl_WorkGroupID.y + gl_WorkGroupID.x;
    uint partition_ix = 0;
    uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE;
    uint th_ix = gl_LocalInvocationID.x;

    // Coordinates of top left of bin, in tiles.
    uint bin_tile_x = N_TILE_X * gl_WorkGroupID.x;
    uint bin_tile_y = N_TILE_Y * gl_WorkGroupID.y;

    // Per-tile state
    uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
    uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
    uint this_tile_ix = (bin_tile_y + tile_y) * conf.width_in_tiles + bin_tile_x + tile_x;
    Alloc cmd_alloc = slice_mem(conf.ptcl_alloc, this_tile_ix * PTCL_INITIAL_ALLOC, PTCL_INITIAL_ALLOC);
    CmdRef cmd_ref = CmdRef(cmd_alloc.offset);
    // Reserve space for the maximum number of commands and a potential jump.
    uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
    // The nesting depth of the clip stack
    uint clip_depth = 0;
    // State for the "clip zero" optimization. If it's nonzero, then we are
    // currently in a clip for which the entire tile has an alpha of zero, and
    // the value is the depth after the "begin clip" of that element.
    uint clip_zero_depth = 0;
    // State for the "clip one" optimization. If bit `i` is set, then that means
    // that the clip pushed at depth `i` has an alpha of all one.
    uint clip_one_mask = 0;

    // I'm sure we can figure out how to do this with at least one fewer register...
    // Items up to rd_ix have been read from sh_elements
    uint rd_ix = 0;
    // Items up to wr_ix have been written into sh_elements
    uint wr_ix = 0;
    // Items between part_start_ix and ready_ix are ready to be transferred from sh_part_elements
    uint part_start_ix = 0;
    uint ready_ix = 0;
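
    // Invariants for the streaming merge below: rd_ix <= wr_ix <= ready_ix.
    // Elements in [rd_ix, wr_ix) are resident in sh_elements; elements in
    // [wr_ix, ready_ix) have been counted (sh_part_count holds running totals
    // starting at part_start_ix) but still live in the bin lists referenced
    // by sh_part_elements. The inner do-while refills the window until a
    // full N_TILE batch is available or the input is exhausted.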

    // Leave room for the fine rasterizer scratch allocation.
    Alloc scratch_alloc = slice_mem(cmd_alloc, 0, Alloc_size);
    cmd_ref.offset += Alloc_size;

    uint num_begin_slots = 0;
    uint begin_slot = 0;
    bool mem_ok = mem_error == NO_ERROR;
    while (true) {
        for (uint i = 0; i < N_SLICE; i++) {
            sh_bitmaps[i][th_ix] = 0;
        }

        // parallel read of input partitions
        do {
            if (ready_ix == wr_ix && partition_ix < n_partitions) {
                part_start_ix = ready_ix;
                uint count = 0;
                if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
                    uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
                    count = read_mem(conf.bin_alloc, in_ix);
                    uint offset = read_mem(conf.bin_alloc, in_ix + 1);
                    sh_part_elements[th_ix] = new_alloc(offset, count*BinInstance_size, mem_ok);
                }
                // prefix sum of counts
                for (uint i = 0; i < LG_N_PART_READ; i++) {
                    if (th_ix < N_PART_READ) {
                        sh_part_count[th_ix] = count;
                    }
                    barrier();
                    if (th_ix < N_PART_READ) {
                        if (th_ix >= (1 << i)) {
                            count += sh_part_count[th_ix - (1 << i)];
                        }
                    }
                    barrier();
                }
                if (th_ix < N_PART_READ) {
                    sh_part_count[th_ix] = part_start_ix + count;
                }
                barrier();
                ready_ix = sh_part_count[N_PART_READ - 1];
                partition_ix += N_PART_READ;
            }
            // use binary search to find element to read
            uint ix = rd_ix + th_ix;
            if (ix >= wr_ix && ix < ready_ix && mem_ok) {
                uint part_ix = 0;
                for (uint i = 0; i < LG_N_PART_READ; i++) {
                    uint probe = part_ix + ((N_PART_READ / 2) >> i);
                    if (ix >= sh_part_count[probe - 1]) {
                        part_ix = probe;
                    }
                }
                ix -= part_ix > 0 ? sh_part_count[part_ix - 1] : part_start_ix;
                Alloc bin_alloc = sh_part_elements[part_ix];
                BinInstanceRef inst_ref = BinInstanceRef(bin_alloc.offset);
                BinInstance inst = BinInstance_read(bin_alloc, BinInstance_index(inst_ref, ix));
                sh_elements[th_ix] = inst.element_ix;
            }
            barrier();

            wr_ix = min(rd_ix + N_TILE, ready_ix);
        } while (wr_ix - rd_ix < N_TILE && (wr_ix < ready_ix || partition_ix < n_partitions));

        // We've done the merge and filled the buffer.
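
        // A worked example of the scan + search above, shrunk to four
        // partitions for illustration (the real N_PART_READ is larger): if
        // the partitions contribute counts [3, 1, 0, 2] for this bin and
        // part_start_ix is 0, the inclusive scan leaves sh_part_count =
        // [3, 4, 4, 6]. A thread fetching element ix = 4 walks the probes,
        // settles on part_ix = 3 (since 4 >= sh_part_count[2] = 4), and
        // reads local index 4 - sh_part_count[2] = 0 from that partition's
        // bin list.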

        // Read one element, compute coverage.
        uint tag = Annotated_Nop;
        uint element_ix;
        AnnotatedRef ref;
        if (th_ix + rd_ix < wr_ix) {
            element_ix = sh_elements[th_ix];
            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
            tag = Annotated_tag(conf.anno_alloc, ref).tag;
        }

        // Bounding box of element in pixel coordinates.
        uint tile_count;
        switch (tag) {
        case Annotated_Color:
        case Annotated_Image:
        case Annotated_BeginClip:
        case Annotated_EndClip:
            // We have one "path" for each element, even if the element isn't
            // actually a path (currently EndClip, but images etc in the future).
            uint path_ix = element_ix;
            Path path = Path_read(conf.tile_alloc, PathRef(conf.tile_alloc.offset + path_ix * Path_size));
            uint stride = path.bbox.z - path.bbox.x;
            sh_tile_stride[th_ix] = stride;
            int dx = int(path.bbox.x) - int(bin_tile_x);
            int dy = int(path.bbox.y) - int(bin_tile_y);
            int x0 = clamp(dx, 0, N_TILE_X);
            int y0 = clamp(dy, 0, N_TILE_Y);
            int x1 = clamp(int(path.bbox.z) - int(bin_tile_x), 0, N_TILE_X);
            int y1 = clamp(int(path.bbox.w) - int(bin_tile_y), 0, N_TILE_Y);
            sh_tile_width[th_ix] = uint(x1 - x0);
            sh_tile_x0[th_ix] = x0;
            sh_tile_y0[th_ix] = y0;
            tile_count = uint(x1 - x0) * uint(y1 - y0);
            // base relative to bin
            uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
            sh_tile_base[th_ix] = base;
            Alloc path_alloc = new_alloc(path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
            write_tile_alloc(th_ix, path_alloc);
            break;
        default:
            tile_count = 0;
            break;
        }

        // Prefix sum of sh_tile_count
        sh_tile_count[th_ix] = tile_count;
        for (uint i = 0; i < LG_N_TILE; i++) {
            barrier();
            if (th_ix >= (1 << i)) {
                tile_count += sh_tile_count[th_ix - (1 << i)];
            }
            barrier();
            sh_tile_count[th_ix] = tile_count;
        }
        barrier();
        uint total_tile_count = sh_tile_count[N_TILE - 1];
        for (uint ix = th_ix; ix < total_tile_count; ix += N_TILE) {
            // Binary search to find the element owning this tile slot.
            uint el_ix = 0;
            for (uint i = 0; i < LG_N_TILE; i++) {
                uint probe = el_ix + ((N_TILE / 2) >> i);
                if (ix >= sh_tile_count[probe - 1]) {
                    el_ix = probe;
                }
            }
            AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + sh_elements[el_ix] * Annotated_size);
            uint tag = Annotated_tag(conf.anno_alloc, ref).tag;
            uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
            uint width = sh_tile_width[el_ix];
            uint x = sh_tile_x0[el_ix] + seq_ix % width;
            uint y = sh_tile_y0[el_ix] + seq_ix / width;
            bool include_tile = false;
            if (tag == Annotated_BeginClip || tag == Annotated_EndClip) {
                include_tile = true;
            } else if (mem_ok) {
                Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok), TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
                // Include the path in the tile if either
                // - the tile contains at least one segment (tile offset non-zero), or
                // - the tile is completely covered (backdrop non-zero).
                include_tile = tile.tile.offset != 0 || tile.backdrop != 0;
            }
            if (include_tile) {
                uint el_slice = el_ix / 32;
                uint el_mask = 1 << (el_ix & 31);
                atomicOr(sh_bitmaps[el_slice][y * N_TILE_X + x], el_mask);
            }
        }

        barrier();
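
        // sh_bitmaps is a transposed coverage matrix: for the tile with
        // linear index t within the bin, bit b of sh_bitmaps[s][t] is set
        // exactly when the element at local index s * 32 + b (into
        // sh_elements) touches that tile. For example, local element 37
        // lands in slice 1 with mask 1 << 5. Walking the slices upward and
        // extracting bits LSB-first therefore replays the elements for a
        // tile in their original submission (paint) order.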

        // Output non-segment elements for this tile. The thread does a
        // sequential walk through the non-segment elements.
        uint slice_ix = 0;
        uint bitmap = sh_bitmaps[0][th_ix];
        while (mem_ok) {
            if (bitmap == 0) {
                slice_ix++;
                if (slice_ix == N_SLICE) {
                    break;
                }
                bitmap = sh_bitmaps[slice_ix][th_ix];
                if (bitmap == 0) {
                    continue;
                }
            }
            uint element_ref_ix = slice_ix * 32 + findLSB(bitmap);
            uint element_ix = sh_elements[element_ref_ix];

            // Clear the lowest set bit.
            bitmap &= bitmap - 1;

            // At this point, we read the element again from global memory.
            // If that turns out to be expensive, maybe we can pack it into
            // shared memory (or perhaps just the tag).
            ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
            AnnotatedTag tag = Annotated_tag(conf.anno_alloc, ref);

            if (clip_zero_depth == 0) {
                switch (tag.tag) {
                case Annotated_Color:
                    Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    AnnoColor fill = Annotated_Color_read(conf.anno_alloc, ref);
                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                        break;
                    }
                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill.linewidth);
                    Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(fill.rgba_color));
                    cmd_ref.offset += 4 + CmdColor_size;
                    break;
                case Annotated_Image:
                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    AnnoImage fill_img = Annotated_Image_read(conf.anno_alloc, ref);
                    if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                        break;
                    }
                    write_fill(cmd_alloc, cmd_ref, tag.flags, tile, fill_img.linewidth);
                    Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(fill_img.index, fill_img.offset));
                    cmd_ref.offset += 4 + CmdImage_size;
                    break;
                case Annotated_BeginClip:
                    tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok), TileRef(sh_tile_base[element_ref_ix]
                        + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                    if (tile.tile.offset == 0 && tile.backdrop == 0) {
                        clip_zero_depth = clip_depth + 1;
                    } else if (tile.tile.offset == 0 && clip_depth < 32) {
                        clip_one_mask |= (1 << clip_depth);
                    } else {
                        AnnoBeginClip begin_clip = Annotated_BeginClip_read(conf.anno_alloc, ref);
                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                            break;
                        }
                        write_fill(cmd_alloc, cmd_ref, tag.flags, tile, begin_clip.linewidth);
                        Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                        cmd_ref.offset += 4;
                        if (clip_depth < 32) {
                            clip_one_mask &= ~(1 << clip_depth);
                        }
                        begin_slot++;
                        num_begin_slots = max(num_begin_slots, begin_slot);
                    }
                    clip_depth++;
                    break;
                case Annotated_EndClip:
                    clip_depth--;
                    if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
                        if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
                            break;
                        }
                        Cmd_Solid_write(cmd_alloc, cmd_ref);
                        cmd_ref.offset += 4;
                        begin_slot--;
                        Cmd_EndClip_write(cmd_alloc, cmd_ref);
                        cmd_ref.offset += 4;
                    }
                    break;
                }
            } else {
                // In "clip zero" state, suppress all drawing
                switch (tag.tag) {
                case Annotated_BeginClip:
                    clip_depth++;
                    break;
                case Annotated_EndClip:
                    if (clip_depth == clip_zero_depth) {
                        clip_zero_depth = 0;
                    }
                    clip_depth--;
                    break;
                }
            }
        }
        barrier();

        rd_ix += N_TILE;
        if (rd_ix >= ready_ix && partition_ix >= n_partitions) break;
    }
    if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
        Cmd_End_write(cmd_alloc, cmd_ref);
        if (num_begin_slots > 0) {
            // Write scratch allocation: one state per BeginClip per rasterizer chunk.
            uint scratch_size = num_begin_slots * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
            MallocResult scratch = malloc(scratch_size);
            // Ignore scratch.failed: this shader doesn't use the allocation,
            // and kernel4 checks for memory overflow before using it.
            alloc_write(scratch_alloc, scratch_alloc.offset, scratch.alloc);
        }
    }
}
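
// The per-tile output consumed by the fine rasterizer (kernel4), as implied
// by the code above: each tile's stream starts at
// conf.ptcl_alloc + this_tile_ix * PTCL_INITIAL_ALLOC, with the first
// Alloc_size bytes reserved for the clip scratch Alloc, followed by the
// command list and a terminating Cmd_End; overflow chunks are chained with
// Cmd_Jump. Roughly:
//
//   [ scratch Alloc | Cmd ... Cmd | Cmd_Jump ] -> [ Cmd ... Cmd | Cmd_End ]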