tinygo.org/x/drivers@v0.27.1-0.20240509133757-7dbca2a54349/ws2812/gen-ws2812.go (about) 1 //go:build none 2 3 package main 4 5 import ( 6 "bytes" 7 "flag" 8 "fmt" 9 "math" 10 "os" 11 "strconv" 12 "strings" 13 ) 14 15 // This file generates assembly to precisely time the WS2812 protocol for 16 // various chips. Just add a new frequency below and run `go generate` to add 17 // the new assembly implementation - no fiddly timings to calculate and no nops 18 // to count! 19 // 20 // Right now this is specific to Cortex-M chips and assume the following things: 21 // - Arithmetic operations (shift, add, sub) take up 1 clock cycle. 22 // - The nop instruction also takes up 1 clock cycle. 23 // - Store instructions (to the GPIO pins) take up 2 clock cycles. 24 // - Branch instructions can take up 1 to 3 clock cycles. On the Cortex-M0, this 25 // depends on whether the branch is taken or not. On the M4, the documentation 26 // is less clear but it appears the instruction is still 1 to 3 cycles 27 // (possibly including some branch prediction). 28 // It is certainly possible to extend this to other architectures, such as AVR 29 // and RISC-V if needed. 30 // 31 // Here are two important resources. For the timings: 32 // https://wp.josh.com/2014/05/13/ws2812-neopixels-are-not-so-finicky-once-you-get-to-know-them/ 33 // For the assembly (more or less): 34 // https://cpldcpu.wordpress.com/2014/01/19/light_ws2812-library-v2-0/ 35 // The timings deviate a little bit from the code here, but so far the timings 36 // from wp.josh.com seem to be fine for the ws2812. 37 38 // Architecture implementation. Describes the template and the timings of the 39 // blocks of instructions so that most code can remain architecture-independent. 40 type architectureImpl struct { 41 buildTag string 42 minBaseCyclesT0H int 43 maxBaseCyclesT0H int 44 minBaseCyclesT1H int 45 maxBaseCyclesT1H int 46 minBaseCyclesTLD int 47 valueTemplate string // template for how to pass the 'c' byte to assembly 48 template string // assembly template 49 } 50 51 var architectures = map[string]architectureImpl{ 52 "cortexm": { 53 // Assume that a branch is 1 to 3 cycles, no matter whether it's taken 54 // or not. This is a rather conservative estimate, for Cortex-M+ for 55 // example the instruction cycles are precisely known. 56 buildTag: "cortexm", 57 minBaseCyclesT0H: 1 + 1 + 2, // shift + branch (not taken) + store 58 maxBaseCyclesT0H: 1 + 3 + 2, // shift + branch (not taken) + store 59 minBaseCyclesT1H: 1 + 1 + 2, // shift + branch (taken) + store 60 maxBaseCyclesT1H: 1 + 3 + 2, // shift + branch (taken) + store 61 minBaseCyclesTLD: 1 + 2 + 2, // subtraction + branch x2 + store (in next cycle) 62 valueTemplate: "(uint32_t)c << 24", 63 template: ` 64 1: @ send_bit 65 str %[maskSet], %[portSet] @ [2] T0H and T0L start here 66 @DELAY1 67 lsls %[value], #1 @ [1] 68 bcs.n 2f @ [1/3] skip_store 69 str %[maskClear], %[portClear] @ [2] T0H -> T0L transition 70 2: @ skip_store 71 @DELAY2 72 str %[maskClear], %[portClear] @ [2] T1H -> T1L transition 73 @DELAY3 74 subs %[i], #1 @ [1] 75 beq.n 3f @ [1/3] end 76 b 1b @ [1/3] send_bit 77 3: @ end 78 `, 79 }, 80 "tinygoriscv": { 81 // Largely based on the SiFive FE310 CPU: 82 // - stores are 1 cycle 83 // - branches are 1 or 3 cycles, depending on branch prediction 84 // - ALU operations are 1 cycle (as on most CPUs) 85 // Hopefully this generalizes to other chips. 86 buildTag: "tinygo.riscv32", 87 minBaseCyclesT0H: 1 + 1 + 1, // shift + branch (not taken) + store 88 maxBaseCyclesT0H: 1 + 3 + 1, // shift + branch (not taken) + store 89 minBaseCyclesT1H: 1 + 1 + 1, // shift + branch (taken) + store 90 maxBaseCyclesT1H: 1 + 3 + 1, // shift + branch (taken) + store 91 minBaseCyclesTLD: 1 + 1 + 1, // subtraction + branch + store (in next cycle) 92 valueTemplate: "(uint32_t)c << 23", 93 template: ` 94 1: // send_bit 95 sw %[maskSet], %[portSet] // [1] T0H and T0L start here 96 @DELAY1 97 slli %[value], %[value], 1 // [1] shift value left by 1 98 bltz %[value], 2f // [1/3] skip_store 99 sw %[maskClear], %[portClear] // [1] T0H -> T0L transition 100 2: // skip_store 101 @DELAY2 102 sw %[maskClear], %[portClear] // [1] T1H -> T1L transition 103 @DELAY3 104 addi %[i], %[i], -1 // [1] 105 bnez %[i], 1b // [1/3] send_bit 106 `, 107 }, 108 } 109 110 func writeCAssembly(f *os.File, arch string, megahertz int) error { 111 cycleTimeNS := 1 / float64(megahertz) 112 // These timings are taken from the table "Updated simplified timing 113 // constraints for NeoPixel strings" at: 114 // https://wp.josh.com/2014/05/13/ws2812-neopixels-are-not-so-finicky-once-you-get-to-know-them/ 115 // Here is a copy: 116 // Symbol Parameter Min Typical Max Units 117 // T0H 0 code, high voltage time 200 350 500 ns 118 // T1H 1 code, high voltage time 550 700 5500 ns 119 // TLD data, low voltage time 450 600 5000 ns 120 // TLL latch, low voltage time 6000 ns 121 // The equivalent table for WS2811 LEDs would be the following: 122 // Symbol Parameter Min Typical Max Units 123 // T0H 0 code, high voltage time 350 500 650 ns 124 // T1H 1 code, high voltage time 1050 1200 5500 ns 125 // TLD data, low voltage time 1150 1300 5000 ns 126 // TLL latch, low voltage time 6000 ns 127 // Combining the two (min and max) leads to the following table: 128 // Symbol Parameter Min Typical Max Units 129 // T0H 0 code, high voltage time 350 - 500 ns 130 // T1H 1 code, high voltage time 1050 - 5500 ns 131 // TLD data, low voltage time 1150 - 5000 ns 132 // TLL latch, low voltage time 6000 ns 133 // These comined timings are used so that the ws2812 package is compatible 134 // with both WS2812 and with WS2811 chips. 135 // T0H is the time the pin should be high to send a "0" bit. 136 // T1H is the time the pin should be high to send a "1" bit. 137 // TLD is the time the pin should be low between bits. 138 // TLL is the time the pin should be low to apply (latch) the new colors. 139 minCyclesT0H := int(math.Ceil(0.350 / cycleTimeNS)) 140 maxCyclesT0H := int(math.Floor(0.500 / cycleTimeNS)) 141 minCyclesT1H := int(math.Ceil(1.050 / cycleTimeNS)) 142 maxCyclesT1H := int(math.Floor(5.500 / cycleTimeNS)) 143 minCyclesTLD := int(math.Ceil(1.150 / cycleTimeNS)) 144 145 // The assembly template looks something like this: 146 // 1: @ send_bit 147 // str {maskSet}, {portSet} @ [2] T0H and T0L start here 148 // ...delay 1 149 // lsls {value}, #1 @ [1] 150 // bcs.n 2f @ [1/3] skip_store 151 // str {maskClear}, {portClear} @ [2] T0H -> T0L transition 152 // 2: @ skip_store 153 // ...delay 2 154 // str {maskClear}, {portClear} @ [2] T1H -> T1L transition 155 // ...delay 3 156 // subs {i}, #1 @ [1] 157 // bne.n 1b @ [1/3] send_bit 158 // 159 // We need to calculate the number of nop instructions in the three delays. 160 archImpl, ok := architectures[arch] 161 if !ok { 162 return fmt.Errorf("unknown architecture: %s", arch) 163 } 164 165 // Determine number of nops for delay1. This is primarily based on the T0H 166 // delay, which is relatively short (<500ns). 167 delay1 := minCyclesT0H - archImpl.minBaseCyclesT0H 168 if delay1 < 0 { 169 // The minCyclesT0H constraint could not be satisfied. Don't insert 170 // nops, in the hope that it isn't too long. 171 delay1 = 0 172 } 173 if delay1+archImpl.maxBaseCyclesT0H > maxCyclesT0H { 174 return fmt.Errorf("MCU appears to be too slow to satisfy minimum requirements for the T0H signal") 175 } 176 actualMinCyclesT0H := archImpl.minBaseCyclesT0H + delay1 177 actualMaxCyclesT0H := archImpl.maxBaseCyclesT0H + delay1 178 actualMinNanosecondsT0H := float64(actualMinCyclesT0H) / float64(megahertz) * 1000 179 actualMaxNanosecondsT0H := float64(actualMaxCyclesT0H) / float64(megahertz) * 1000 180 181 // Determine number of nops for delay2. This is delay1 plus some extra time 182 // so that the pulse is long enough for T1H. 183 minBaseCyclesT1H := delay1 + archImpl.minBaseCyclesT1H // delay1 + asssembly cycles 184 maxBaseCyclesT1H := delay1 + archImpl.maxBaseCyclesT1H // delay1 + asssembly cycles 185 delay2 := minCyclesT1H - minBaseCyclesT1H 186 if delay2 < 0 { 187 delay2 = 0 188 } 189 if delay2+maxBaseCyclesT1H > maxCyclesT1H { 190 // Unlikely, we have 5500ns for this operation. 191 return fmt.Errorf("MCU appears to be too slow to satisfy minimum requirements for the T1H signal") 192 } 193 actualMinCyclesT1H := minBaseCyclesT1H + delay2 194 actualMaxCyclesT1H := maxBaseCyclesT1H + delay2 195 actualMinNanosecondsT1H := float64(actualMinCyclesT1H) / float64(megahertz) * 1000 196 actualMaxNanosecondsT1H := float64(actualMaxCyclesT1H) / float64(megahertz) * 1000 197 198 // Determine number of nops for delay3. This is based on the TLD delay, the 199 // time between two high pulses. 200 delay3 := minCyclesTLD - archImpl.minBaseCyclesTLD 201 if delay3 < 0 { 202 delay3 = 0 203 } 204 actualMinCyclesTLD := archImpl.minBaseCyclesTLD + delay3 205 actualMinNanosecondsTLD := float64(actualMinCyclesTLD) / float64(megahertz) * 1000 206 207 // Create the Go function in a buffer. Using a buffer here to be able to 208 // ignore I/O errors. 209 buf := &bytes.Buffer{} 210 fmt.Fprintf(buf, "\n") 211 fmt.Fprintf(buf, "__attribute__((always_inline))\nvoid ws2812_writeByte%d(char c, uint32_t *portSet, uint32_t *portClear, uint32_t maskSet, uint32_t maskClear) {\n", megahertz) 212 fmt.Fprintf(buf, " // Timings:\n") 213 fmt.Fprintf(buf, " // T0H: %2d - %2d cycles or %.1fns - %.1fns\n", actualMinCyclesT0H, actualMaxCyclesT0H, actualMinNanosecondsT0H, actualMaxNanosecondsT0H) 214 fmt.Fprintf(buf, " // T1H: %2d - %2d cycles or %.1fns - %.1fns\n", actualMinCyclesT1H, actualMaxCyclesT1H, actualMinNanosecondsT1H, actualMaxNanosecondsT1H) 215 fmt.Fprintf(buf, " // TLD: %2d - cycles or %.1fns -\n", actualMinCyclesTLD, actualMinNanosecondsTLD) 216 fmt.Fprintf(buf, " uint32_t value = %s;\n", archImpl.valueTemplate) 217 asm := archImpl.template 218 asm = strings.TrimSpace(asm) 219 asm = strings.ReplaceAll(asm, " @DELAY1\n", strings.Repeat(" nop\n", delay1)) 220 asm = strings.ReplaceAll(asm, " @DELAY2\n", strings.Repeat(" nop\n", delay2)) 221 asm = strings.ReplaceAll(asm, " @DELAY3\n", strings.Repeat(" nop\n", delay3)) 222 asm = strings.ReplaceAll(asm, "\n", "\n\t") 223 fmt.Fprintf(buf, " char i = 8;\n") 224 fmt.Fprintf(buf, " __asm__ __volatile__(\n") 225 for _, line := range strings.Split(asm, "\n") { 226 fmt.Fprintf(buf, "\t\t%#v\n", line+"\n") 227 } 228 // Note: [value] and [i] must be input+output operands because they modify 229 // the value. 230 fmt.Fprintf(buf, ` : [value]"+r"(value), 231 [i]"+r"(i) 232 : [maskSet]"r"(maskSet), 233 [portSet]"m"(*portSet), 234 [maskClear]"r"(maskClear), 235 [portClear]"m"(*portClear)); 236 } 237 `) 238 239 // Now write the buffer contents (with the assembly function) to a file. 240 _, err := f.Write(buf.Bytes()) 241 return err 242 } 243 244 func writeGoWrapper(f *os.File, arch string, megahertz int) error { 245 // Create the Go function in a buffer. Using a buffer here to be able to 246 // ignore I/O errors. 247 buf := &bytes.Buffer{} 248 fmt.Fprintf(buf, "\n") 249 fmt.Fprintf(buf, "func (d Device) writeByte%d(c byte) {\n", megahertz) 250 fmt.Fprintf(buf, " portSet, maskSet := d.Pin.PortMaskSet()\n") 251 fmt.Fprintf(buf, " portClear, maskClear := d.Pin.PortMaskClear()\n") 252 fmt.Fprintf(buf, "\n") 253 fmt.Fprintf(buf, " mask := interrupt.Disable()\n") 254 fmt.Fprintf(buf, " C.ws2812_writeByte%d(C.char(c), (*C.uint32_t)(unsafe.Pointer(portSet)), (*C.uint32_t)(unsafe.Pointer(portClear)), C.uint32_t(maskSet), C.uint32_t(maskClear))\n", megahertz) 255 buf.WriteString(` 256 interrupt.Restore(mask) 257 } 258 `) 259 260 // Now write the buffer contents (with the assembly function) to a file. 261 _, err := f.Write(buf.Bytes()) 262 return err 263 } 264 265 func main() { 266 arch := flag.String("arch", "cortexm", "architecture to output to") 267 flag.Parse() 268 269 // Remaining parameters are all clock frequencies. 270 var clockFrequencies []int 271 for _, s := range flag.Args() { 272 freq, err := strconv.Atoi(s) 273 if err != nil { 274 fmt.Fprintln(os.Stderr, "cannot parse frequency:", s) 275 os.Exit(1) 276 } 277 clockFrequencies = append(clockFrequencies, freq) 278 } 279 280 f, err := os.Create("ws2812-asm_" + *arch + ".go") 281 if err != nil { 282 fmt.Fprintln(os.Stderr, "could not generate WS2812 assembly code:", err) 283 os.Exit(1) 284 } 285 defer f.Close() 286 fmt.Fprintln(f, "//go:build", architectures[*arch].buildTag) 287 f.WriteString(` 288 package ws2812 289 290 // Warning: autogenerated file. Instead of modifying this file, change 291 // gen-ws2812.go and run "go generate". 292 293 import "runtime/interrupt" 294 import "unsafe" 295 296 /* 297 #include <stdint.h> 298 `) 299 for _, megahertz := range clockFrequencies { 300 err := writeCAssembly(f, *arch, megahertz) 301 if err != nil { 302 fmt.Fprintf(os.Stderr, "could not generate WS2812 assembly code for %s and %dMHz: %s\n", *arch, megahertz, err) 303 os.Exit(1) 304 } 305 } 306 f.WriteString(`*/ 307 import "C" 308 `) 309 for _, megahertz := range clockFrequencies { 310 err := writeGoWrapper(f, *arch, megahertz) 311 if err != nil { 312 fmt.Fprintf(os.Stderr, "could not generate Go wrapper: %w\n", err) 313 os.Exit(1) 314 } 315 } 316 }