// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The binning stage of the pipeline.
//
// Each workgroup processes N_TILE paths.
// Each thread processes one path and calculates a N_TILE_X x N_TILE_Y coverage mask
// based on the path bounding box to bin the paths.

#version 450
#extension GL_GOOGLE_include_directive : enable

// NOTE(review): N_TILE, N_TILE_X, N_TILE_Y, TILE_WIDTH_PX, TILE_HEIGHT_PX and
// the Config type are not defined in this file; presumably they come from
// these two headers - confirm.
#include "mem.h"
#include "setup.h"

// One-dimensional workgroup of N_TILE invocations.
layout(local_size_x = N_TILE, local_size_y = 1) in;

// Read-only configuration block, bound at set 0 / binding 1.
layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

// Included after ConfigBuf is declared; keep this order.
// NOTE(review): presumably these headers depend on declarations above - confirm.
#include "annotated.h"
#include "bins.h"

// Scale factors useful for converting coordinates to bins: multiplying a
// coordinate by SX (resp. SY) divides it by the extent of one bin, i.e.
// N_TILE_X * TILE_WIDTH_PX (resp. N_TILE_Y * TILE_HEIGHT_PX).
#define SX (1.0 / float(N_TILE_X * TILE_WIDTH_PX))
#define SY (1.0 / float(N_TILE_Y * TILE_HEIGHT_PX))

// Constant not available in GLSL. Also consider uintBitsToFloat(0x7f800000)
// NOTE(review): does not appear to be referenced by main() in this shader.
#define INFINITY (1.0 / 0.0)

// Note: cudaraster has N_TILE + 1 to cut down on bank conflicts.
// Workgroup-shared scratch state.
//
// bitmaps[s][b]: bit k is set when the element handled by invocation
// (s * 32 + k) of this workgroup covers bin b.
// Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
shared uint bitmaps[N_SLICE][N_TILE];
// count[s][b]: running total of set bits in bitmaps[0..s][b] (inclusive).
shared uint count[N_SLICE][N_TILE];
// sh_chunk_alloc[b]: output chunk for bin b; only written when bin b is non-empty.
shared Alloc sh_chunk_alloc[N_TILE];
// Set by any invocation whose malloc reported failure.
shared bool sh_alloc_failed;

// Entry point of the binning stage.
//
// Phase 1: each invocation loads one annotated element, converts its bounding
//          box to a clamped range of bins, and sets its own bit in the shared
//          bitmap of every bin in that range.
// Phase 2: each invocation then acts for the bin whose index equals its local
//          id: it counts the elements covering that bin, allocates an output
//          chunk, and writes the (count, chunk offset) pair to conf.bin_alloc.
// Phase 3: each invocation walks its element's bin range again and writes the
//          element index into the slot of each bin's chunk that corresponds
//          to its rank among the bits set for that bin.
void main() {
    uint my_n_elements = conf.n_elements;
    uint my_partition = gl_WorkGroupID.x;
    uint local_ix = gl_LocalInvocationID.x;

    // Clear this invocation's column of the bitmaps and reset the failure flag
    // before any invocation starts setting bits.
    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][local_ix] = 0u;
    }
    if (local_ix == 0u) {
        sh_alloc_failed = false;
    }
    barrier();

    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + local_ix;
    AnnotatedRef ref = AnnotatedRef(conf.anno_alloc.offset + element_ix * Annotated_size);
    // Invocations past the end of the element list keep the Nop tag and
    // therefore an empty bin range.
    uint tag = Annotated_Nop;
    if (element_ix < my_n_elements) {
        tag = Annotated_tag(conf.anno_alloc, ref).tag;
    }
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    switch (tag) {
    case Annotated_Color:
    case Annotated_Image:
    case Annotated_BeginClip:
    case Annotated_EndClip:
        // Note: we take advantage of the fact that these drawing elements
        // have the bbox at the same place in their layout.
        AnnoEndClip clip = Annotated_EndClip_read(conf.anno_alloc, ref);
        x0 = int(floor(clip.bbox.x * SX));
        y0 = int(floor(clip.bbox.y * SY));
        x1 = int(ceil(clip.bbox.z * SX));
        y1 = int(ceil(clip.bbox.w * SY));
        break;
    }

    // At this point, we run an iterator over the coverage area,
    // trying to keep divergence low.
    // Right now, it's just a bbox, but we'll get finer with
    // segments.
    uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1)/N_TILE_X;
    uint height_in_bins = (conf.height_in_tiles + N_TILE_Y - 1)/N_TILE_Y;
    // Clamp to the bin grid; the upper bounds are clamped against the lower
    // ones so that x0 <= x1 and y0 <= y1 always hold.
    x0 = clamp(x0, 0, int(width_in_bins));
    x1 = clamp(x1, x0, int(width_in_bins));
    y0 = clamp(y0, 0, int(height_in_bins));
    y1 = clamp(y1, y0, int(height_in_bins));
    // A zero-width range must also be zero-height: the flattened loops below
    // wrap only when x reaches x1 exactly, which would never happen if they
    // were entered with x0 == x1.
    if (x0 == x1) y1 = y0;
    int x = x0, y = y0;
    // Slice and bit position of this invocation within the sliced bitmaps.
    // Unsigned literals keep the shift for lane 31 out of the sign bit and
    // avoid relying on implicit int -> uint conversion.
    uint my_slice = local_ix / 32;
    uint my_mask = 1u << (local_ix & 31u);
    while (y < y1) {
        atomicOr(bitmaps[my_slice][y * width_in_bins + x], my_mask);
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }

    // All coverage bits must be set before any bin is counted.
    barrier();
    // Allocate output segments.
    // From here until the next barrier, local_ix is used as a bin index.
    uint element_count = 0;
    for (uint i = 0; i < N_SLICE; i++) {
        element_count += bitCount(bitmaps[i][local_ix]);
        count[i][local_ix] = element_count;
    }
    // element_count is number of elements covering bin for this invocation.
    // Placeholder alloc for an empty bin; its offset is what gets written to
    // the bin header below in that case.
    Alloc chunk_alloc = new_alloc(0, 0, true);
    if (element_count != 0) {
        // TODO: aggregate atomic adds (subgroup is probably fastest)
        MallocResult chunk = malloc(element_count * BinInstance_size);
        chunk_alloc = chunk.alloc;
        sh_chunk_alloc[local_ix] = chunk_alloc;
        if (chunk.failed) {
            sh_alloc_failed = true;
        }
    }
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
    // Two words are written per (partition, bin) pair: element count, then
    // chunk offset. element_ix equals my_partition * N_TILE + local_ix, the
    // same linear index the header table uses.
    // NOTE(review): the >> 2 presumably converts a byte offset into a word
    // index - confirm against write_mem.
    uint out_ix = (conf.bin_alloc.offset >> 2) + element_ix * 2;
    write_mem(conf.bin_alloc, out_ix, element_count);
    write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);

    // Publish count[], sh_chunk_alloc[] and sh_alloc_failed to the workgroup.
    // No barrier follows this point, so returning early here is safe.
    barrier();
    if (sh_alloc_failed || mem_error != NO_ERROR) {
        return;
    }

    // Use similar strategy as Laine & Karras paper; loop over bbox of bins
    // touched by this element
    x = x0;
    y = y0;
    while (y < y1) {
        uint bin_ix = y * width_in_bins + x;
        uint out_mask = bitmaps[my_slice][bin_ix];
        if ((out_mask & my_mask) != 0) {
            // Rank of this element inside the bin: set bits below ours in our
            // own slice, plus the running total of all earlier slices.
            uint idx = bitCount(out_mask & (my_mask - 1));
            if (my_slice > 0) {
                idx += count[my_slice - 1][bin_ix];
            }
            Alloc out_alloc = sh_chunk_alloc[bin_ix];
            uint out_offset = out_alloc.offset + idx * BinInstance_size;
            BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
        }
        x++;
        if (x == x1) {
            x = x0;
            y++;
        }
    }
}