// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense

// The element processing stage, first in the pipeline.
//
// This stage is primarily about applying transforms and computing bounding
// boxes. It is organized as a scan over the input elements, producing
// annotated output elements.

#version 450
#extension GL_GOOGLE_include_directive : enable

#include "mem.h"
#include "setup.h"

// Each invocation processes N_ROWS consecutive elements, so one workgroup
// covers a partition of PARTITION_SIZE elements.
#define N_ROWS 4
#define WG_SIZE 32
#define LG_WG_SIZE 5
#define PARTITION_SIZE (WG_SIZE * N_ROWS)

layout(local_size_x = WG_SIZE, local_size_y = 1) in;

layout(set = 0, binding = 1) readonly buffer ConfigBuf {
    Config conf;
};

layout(set = 0, binding = 2) readonly buffer SceneBuf {
    uint[] scene;
};

// It would be better to use the Vulkan memory model than
// "volatile" but shooting for compatibility here rather
// than doing things right.
layout(set = 0, binding = 3) volatile buffer StateBuf {
    uint part_counter;
    uint[] state;
};

#include "scene.h"
#include "state.h"
#include "annotated.h"
#include "pathseg.h"
#include "tile.h"

// Per-partition record in StateBuf: a 4-byte flag word followed by the
// partition's aggregate State and its inclusive-prefix State.
#define StateBuf_stride (4 + 2 * State_size)

// Byte offset of the aggregate State for the given partition.
StateRef state_aggregate_ref(uint partition_ix) {
    return StateRef(4 + partition_ix * StateBuf_stride);
}

// Byte offset of the inclusive-prefix State for the given partition.
StateRef state_prefix_ref(uint partition_ix) {
    return StateRef(4 + partition_ix * StateBuf_stride + State_size);
}

// Index (in uints) of the publication flag word for the given partition.
uint state_flag_index(uint partition_ix) {
    return partition_ix * (StateBuf_stride / 4);
}

// These correspond to X, A, P respectively in the prefix sum paper.
#define FLAG_NOT_READY 0
#define FLAG_AGGREGATE_READY 1
#define FLAG_PREFIX_READY 2

#define FLAG_SET_LINEWIDTH 1
#define FLAG_SET_BBOX 2
#define FLAG_RESET_BBOX 4
#define FLAG_SET_FILL_MODE 8
// Fill modes take up the next bit. Non-zero fill is 0, stroke is 1.
#define LG_FILL_MODE 4
#define FILL_MODE_BITS 1
#define FILL_MODE_MASK (FILL_MODE_BITS << LG_FILL_MODE)

// Combine two scan States: apply a's transform to b's bbox, compose the
// affine transforms, and merge flags and counters.
//
// This is almost like a monoid (the interaction between transformation and
// bounding boxes is approximate).
State combine_state(State a, State b) {
    State c;
    // Transform b's bbox by a's matrix/translation; min/max per component
    // handles negative matrix entries.
    c.bbox.x = min(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + min(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.y = min(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + min(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    c.bbox.z = max(a.mat.x * b.bbox.x, a.mat.x * b.bbox.z) + max(a.mat.z * b.bbox.y, a.mat.z * b.bbox.w) + a.translate.x;
    c.bbox.w = max(a.mat.y * b.bbox.x, a.mat.y * b.bbox.z) + max(a.mat.w * b.bbox.y, a.mat.w * b.bbox.w) + a.translate.y;
    // An empty b bbox (z <= x and w <= y) contributes nothing; union with
    // a's bbox unless a requested a reset.
    if ((a.flags & FLAG_RESET_BBOX) == 0 && b.bbox.z <= b.bbox.x && b.bbox.w <= b.bbox.y) {
        c.bbox = a.bbox;
    } else if ((a.flags & FLAG_RESET_BBOX) == 0 && (b.flags & FLAG_SET_BBOX) == 0 &&
        (a.bbox.z > a.bbox.x || a.bbox.w > a.bbox.y))
    {
        c.bbox.xy = min(a.bbox.xy, c.bbox.xy);
        c.bbox.zw = max(a.bbox.zw, c.bbox.zw);
    }
    // It would be more concise to cast to matrix types; ah well.
    c.mat.x = a.mat.x * b.mat.x + a.mat.z * b.mat.y;
    c.mat.y = a.mat.y * b.mat.x + a.mat.w * b.mat.y;
    c.mat.z = a.mat.x * b.mat.z + a.mat.z * b.mat.w;
    c.mat.w = a.mat.y * b.mat.z + a.mat.w * b.mat.w;
    c.translate.x = a.mat.x * b.translate.x + a.mat.z * b.translate.y + a.translate.x;
    c.translate.y = a.mat.y * b.translate.x + a.mat.w * b.translate.y + a.translate.y;
    c.linewidth = (b.flags & FLAG_SET_LINEWIDTH) == 0 ? a.linewidth : b.linewidth;
    c.flags = (a.flags & (FLAG_SET_LINEWIDTH | FLAG_SET_BBOX | FLAG_SET_FILL_MODE)) | b.flags;
    // A reset on a becomes a "bbox was set" for the combination (4 >> 1 == 2).
    c.flags |= (a.flags & FLAG_RESET_BBOX) >> 1;
    uint fill_mode = (b.flags & FLAG_SET_FILL_MODE) == 0 ? a.flags : b.flags;
    fill_mode &= FILL_MODE_MASK;
    c.flags = (c.flags & ~FILL_MODE_MASK) | fill_mode;
    c.path_count = a.path_count + b.path_count;
    c.pathseg_count = a.pathseg_count + b.pathseg_count;
    c.trans_count = a.trans_count + b.trans_count;
    return c;
}

// Map a single scene element to its scan State (the "map" of the scan).
State map_element(ElementRef ref) {
    // TODO: it would *probably* be more efficient to make the memory read patterns less
    // divergent, though it would be more wasted memory.
    uint tag = Element_tag(ref).tag;
    State c;
    c.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    c.mat = vec4(1.0, 0.0, 0.0, 1.0);
    c.translate = vec2(0.0, 0.0);
    c.linewidth = 1.0; // TODO should be 0.0
    c.flags = 0;
    c.path_count = 0;
    c.pathseg_count = 0;
    c.trans_count = 0;
    switch (tag) {
    case Element_Line:
        LineSeg line = Element_Line_read(ref);
        c.bbox.xy = min(line.p0, line.p1);
        c.bbox.zw = max(line.p0, line.p1);
        c.pathseg_count = 1;
        break;
    case Element_Quad:
        QuadSeg quad = Element_Quad_read(ref);
        c.bbox.xy = min(min(quad.p0, quad.p1), quad.p2);
        c.bbox.zw = max(max(quad.p0, quad.p1), quad.p2);
        c.pathseg_count = 1;
        break;
    case Element_Cubic:
        CubicSeg cubic = Element_Cubic_read(ref);
        c.bbox.xy = min(min(cubic.p0, cubic.p1), min(cubic.p2, cubic.p3));
        c.bbox.zw = max(max(cubic.p0, cubic.p1), max(cubic.p2, cubic.p3));
        c.pathseg_count = 1;
        break;
    case Element_FillColor:
    case Element_FillImage:
    case Element_BeginClip:
        // A fill/clip terminates the current path; the next path starts
        // accumulating a fresh bbox.
        c.flags = FLAG_RESET_BBOX;
        c.path_count = 1;
        break;
    case Element_EndClip:
        c.path_count = 1;
        break;
    case Element_SetLineWidth:
        SetLineWidth lw = Element_SetLineWidth_read(ref);
        c.linewidth = lw.width;
        c.flags = FLAG_SET_LINEWIDTH;
        break;
    case Element_Transform:
        Transform t = Element_Transform_read(ref);
        c.mat = t.mat;
        c.translate = t.translate;
        c.trans_count = 1;
        break;
    case Element_SetFillMode:
        SetFillMode fm = Element_SetFillMode_read(ref);
        c.flags = FLAG_SET_FILL_MODE | (fm.fill_mode << LG_FILL_MODE);
        break;
    }
    return c;
}

// Get the bounding box of a circle transformed by the matrix into an ellipse.
vec2 get_linewidth(State st) {
    // See https://www.iquilezles.org/www/articles/ellipses/ellipses.htm
    return 0.5 * st.linewidth * vec2(length(st.mat.xz), length(st.mat.yw));
}

shared State sh_state[WG_SIZE];

shared uint sh_part_ix;
shared State sh_prefix;

void main() {
    State th_state[N_ROWS];
    // Determine partition to process by atomic counter (described in Section
    // 4.4 of prefix sum paper).
    if (gl_LocalInvocationID.x == 0) {
        sh_part_ix = atomicAdd(part_counter, 1);
    }
    barrier();
    uint part_ix = sh_part_ix;

    uint ix = part_ix * PARTITION_SIZE + gl_LocalInvocationID.x * N_ROWS;
    ElementRef ref = ElementRef(ix * Element_size);

    // Sequential scan over this thread's N_ROWS elements.
    th_state[0] = map_element(ref);
    for (uint i = 1; i < N_ROWS; i++) {
        // discussion question: would it be faster to load using more coherent patterns
        // into thread memory? This is kinda strided.
        th_state[i] = combine_state(th_state[i - 1], map_element(Element_index(ref, i)));
    }
    // Hillis-Steele scan of per-thread aggregates across the workgroup.
    State agg = th_state[N_ROWS - 1];
    sh_state[gl_LocalInvocationID.x] = agg;
    for (uint i = 0; i < LG_WG_SIZE; i++) {
        barrier();
        if (gl_LocalInvocationID.x >= (1 << i)) {
            State other = sh_state[gl_LocalInvocationID.x - (1 << i)];
            agg = combine_state(other, agg);
        }
        barrier();
        sh_state[gl_LocalInvocationID.x] = agg;
    }

    // Identity element of the scan.
    State exclusive;
    exclusive.bbox = vec4(0.0, 0.0, 0.0, 0.0);
    exclusive.mat = vec4(1.0, 0.0, 0.0, 1.0);
    exclusive.translate = vec2(0.0, 0.0);
    exclusive.linewidth = 1.0; // TODO should be 0.0
    exclusive.flags = 0;
    exclusive.path_count = 0;
    exclusive.pathseg_count = 0;
    exclusive.trans_count = 0;

    // Publish aggregate for this partition
    if (gl_LocalInvocationID.x == WG_SIZE - 1) {
        // Note: with memory model, we'd want to generate the atomic store version of this.
        State_write(state_aggregate_ref(part_ix), agg);
        uint flag = FLAG_AGGREGATE_READY;
        memoryBarrierBuffer();
        if (part_ix == 0) {
            State_write(state_prefix_ref(part_ix), agg);
            flag = FLAG_PREFIX_READY;
        }
        state[state_flag_index(part_ix)] = flag;
        if (part_ix != 0) {
            // step 4 of paper: decoupled lookback
            uint look_back_ix = part_ix - 1;

            State their_agg;
            uint their_ix = 0;
            while (true) {
                flag = state[state_flag_index(look_back_ix)];
                if (flag == FLAG_PREFIX_READY) {
                    State their_prefix = State_read(state_prefix_ref(look_back_ix));
                    exclusive = combine_state(their_prefix, exclusive);
                    break;
                } else if (flag == FLAG_AGGREGATE_READY) {
                    their_agg = State_read(state_aggregate_ref(look_back_ix));
                    exclusive = combine_state(their_agg, exclusive);
                    look_back_ix--;
                    their_ix = 0;
                    continue;
                }
                // else spin

                // Unfortunately there's no guarantee of forward progress of other
                // workgroups, so compute a bit of the aggregate before trying again.
                // In the worst case, spinning stops when the aggregate is complete.
                ElementRef ref = ElementRef((look_back_ix * PARTITION_SIZE + their_ix) * Element_size);
                State s = map_element(ref);
                if (their_ix == 0) {
                    their_agg = s;
                } else {
                    their_agg = combine_state(their_agg, s);
                }
                their_ix++;
                if (their_ix == PARTITION_SIZE) {
                    exclusive = combine_state(their_agg, exclusive);
                    if (look_back_ix == 0) {
                        break;
                    }
                    look_back_ix--;
                    their_ix = 0;
                }
            }

            // step 5 of paper: compute inclusive prefix
            State inclusive_prefix = combine_state(exclusive, agg);
            sh_prefix = exclusive;
            State_write(state_prefix_ref(part_ix), inclusive_prefix);
            memoryBarrierBuffer();
            flag = FLAG_PREFIX_READY;
            state[state_flag_index(part_ix)] = flag;
        }
    }
    barrier();
    if (part_ix != 0) {
        exclusive = sh_prefix;
    }

    // Exclusive prefix for this thread's first element.
    State row = exclusive;
    if (gl_LocalInvocationID.x > 0) {
        State other = sh_state[gl_LocalInvocationID.x - 1];
        row = combine_state(row, other);
    }
    for (uint i = 0; i < N_ROWS; i++) {
        State st = combine_state(row, th_state[i]);

        // Here we read again from the original scene. There may be
        // gains to be had from stashing in shared memory or possibly
        // registers (though register pressure is an issue).
        ElementRef this_ref = Element_index(ref, i);
        ElementTag tag = Element_tag(this_ref);
        uint fill_mode = fill_mode_from_flags(st.flags >> LG_FILL_MODE);
        bool is_stroke = fill_mode == MODE_STROKE;
        switch (tag.tag) {
        case Element_Line:
            LineSeg line = Element_Line_read(this_ref);
            // Degree-elevate the line to a cubic so all path segments are uniform.
            PathCubic path_cubic;
            path_cubic.p0 = line.p0;
            path_cubic.p1 = mix(line.p0, line.p1, 1.0 / 3.0);
            path_cubic.p2 = mix(line.p1, line.p0, 1.0 / 3.0);
            path_cubic.p3 = line.p1;
            path_cubic.path_ix = st.path_count;
            path_cubic.trans_ix = st.trans_count;
            if (is_stroke) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            PathSegRef path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
            break;
        case Element_Quad:
            QuadSeg quad = Element_Quad_read(this_ref);
            // Degree-elevate the quadratic to a cubic.
            path_cubic.p0 = quad.p0;
            path_cubic.p1 = mix(quad.p1, quad.p0, 1.0 / 3.0);
            path_cubic.p2 = mix(quad.p1, quad.p2, 1.0 / 3.0);
            path_cubic.p3 = quad.p2;
            path_cubic.path_ix = st.path_count;
            path_cubic.trans_ix = st.trans_count;
            if (is_stroke) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
            break;
        case Element_Cubic:
            CubicSeg cubic = Element_Cubic_read(this_ref);
            path_cubic.p0 = cubic.p0;
            path_cubic.p1 = cubic.p1;
            path_cubic.p2 = cubic.p2;
            path_cubic.p3 = cubic.p3;
            path_cubic.path_ix = st.path_count;
            path_cubic.trans_ix = st.trans_count;
            if (is_stroke) {
                path_cubic.stroke = get_linewidth(st);
            } else {
                path_cubic.stroke = vec2(0.0);
            }
            path_out_ref = PathSegRef(conf.pathseg_alloc.offset + (st.pathseg_count - 1) * PathSeg_size);
            PathSeg_Cubic_write(conf.pathseg_alloc, path_out_ref, fill_mode, path_cubic);
            break;
        case Element_FillColor:
            FillColor fill = Element_FillColor_read(this_ref);
            AnnoColor anno_fill;
            anno_fill.rgba_color = fill.rgba_color;
            if (is_stroke) {
                vec2 lw = get_linewidth(st);
                anno_fill.bbox = st.bbox + vec4(-lw, lw);
                anno_fill.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
            } else {
                anno_fill.bbox = st.bbox;
                anno_fill.linewidth = 0.0;
            }
            AnnotatedRef out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_Color_write(conf.anno_alloc, out_ref, fill_mode, anno_fill);
            break;
        case Element_FillImage:
            FillImage fill_img = Element_FillImage_read(this_ref);
            AnnoImage anno_img;
            anno_img.index = fill_img.index;
            anno_img.offset = fill_img.offset;
            if (is_stroke) {
                vec2 lw = get_linewidth(st);
                anno_img.bbox = st.bbox + vec4(-lw, lw);
                anno_img.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
            } else {
                anno_img.bbox = st.bbox;
                anno_img.linewidth = 0.0;
            }
            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_Image_write(conf.anno_alloc, out_ref, fill_mode, anno_img);
            break;
        case Element_BeginClip:
            Clip begin_clip = Element_BeginClip_read(this_ref);
            AnnoBeginClip anno_begin_clip;
            // This is the absolute bbox, it's been transformed during encoding.
            anno_begin_clip.bbox = begin_clip.bbox;
            if (is_stroke) {
                vec2 lw = get_linewidth(st);
                anno_begin_clip.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
            } else {
                // Bug fix: previously assigned to anno_fill.linewidth (the
                // FillColor case's variable), leaving anno_begin_clip.linewidth
                // uninitialized for non-stroke clips.
                anno_begin_clip.linewidth = 0.0;
            }
            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_BeginClip_write(conf.anno_alloc, out_ref, fill_mode, anno_begin_clip);
            break;
        case Element_EndClip:
            Clip end_clip = Element_EndClip_read(this_ref);
            // This bbox is expected to be the same as the begin one.
            AnnoEndClip anno_end_clip = AnnoEndClip(end_clip.bbox);
            out_ref = AnnotatedRef(conf.anno_alloc.offset + (st.path_count - 1) * Annotated_size);
            Annotated_EndClip_write(conf.anno_alloc, out_ref, anno_end_clip);
            break;
        case Element_Transform:
            TransformSeg transform = TransformSeg(st.mat, st.translate);
            TransformSegRef trans_ref = TransformSegRef(conf.trans_alloc.offset + (st.trans_count - 1) * TransformSeg_size);
            TransformSeg_write(conf.trans_alloc, trans_ref, transform);
            break;
        }
    }
}