tinygo.org/x/drivers@v0.27.1-0.20240509133757-7dbca2a54349/ws2812/gen-ws2812.go (about)

     1  //go:build none
     2  
     3  package main
     4  
     5  import (
     6  	"bytes"
     7  	"flag"
     8  	"fmt"
     9  	"math"
    10  	"os"
    11  	"strconv"
    12  	"strings"
    13  )
    14  
    15  // This file generates assembly to precisely time the WS2812 protocol for
    16  // various chips. Just add a new frequency below and run `go generate` to add
    17  // the new assembly implementation - no fiddly timings to calculate and no nops
    18  // to count!
    19  //
// For Cortex-M chips, the generated code assumes the following things:
// - Arithmetic operations (shift, add, sub) take up 1 clock cycle.
// - The nop instruction also takes up 1 clock cycle.
// - Store instructions (to the GPIO pins) take up 2 clock cycles.
// - Branch instructions can take up 1 to 3 clock cycles. On the Cortex-M0, this
//   depends on whether the branch is taken or not. On the M4, the documentation
//   is less clear but it appears the instruction is still 1 to 3 cycles
//   (possibly including some branch prediction).
// The RISC-V implementation makes its own assumptions, which are listed next to
// its entry in the architectures map below. It is certainly possible to extend
// this to other architectures, such as AVR, if needed.
    30  //
    31  // Here are two important resources. For the timings:
    32  // https://wp.josh.com/2014/05/13/ws2812-neopixels-are-not-so-finicky-once-you-get-to-know-them/
    33  // For the assembly (more or less):
    34  // https://cpldcpu.wordpress.com/2014/01/19/light_ws2812-library-v2-0/
    35  // The timings deviate a little bit from the code here, but so far the timings
    36  // from wp.josh.com seem to be fine for the ws2812.
    37  
    38  // Architecture implementation. Describes the template and the timings of the
    39  // blocks of instructions so that most code can remain architecture-independent.
    40  type architectureImpl struct {
    41  	buildTag         string
    42  	minBaseCyclesT0H int
    43  	maxBaseCyclesT0H int
    44  	minBaseCyclesT1H int
    45  	maxBaseCyclesT1H int
    46  	minBaseCyclesTLD int
    47  	valueTemplate    string // template for how to pass the 'c' byte to assembly
    48  	template         string // assembly template
    49  }
    50  
    51  var architectures = map[string]architectureImpl{
    52  	"cortexm": {
    53  		// Assume that a branch is 1 to 3 cycles, no matter whether it's taken
    54  		// or not. This is a rather conservative estimate, for Cortex-M+ for
    55  		// example the instruction cycles are precisely known.
    56  		buildTag:         "cortexm",
    57  		minBaseCyclesT0H: 1 + 1 + 2, // shift + branch (not taken) + store
    58  		maxBaseCyclesT0H: 1 + 3 + 2, // shift + branch (not taken) + store
    59  		minBaseCyclesT1H: 1 + 1 + 2, // shift + branch (taken) + store
    60  		maxBaseCyclesT1H: 1 + 3 + 2, // shift + branch (taken) + store
    61  		minBaseCyclesTLD: 1 + 2 + 2, // subtraction + branch x2 + store (in next cycle)
    62  		valueTemplate:    "(uint32_t)c << 24",
    63  		template: `
    64  1: @ send_bit
    65    str   %[maskSet], %[portSet]     @ [2]   T0H and T0L start here
    66    @DELAY1
    67    lsls  %[value], #1               @ [1]
    68    bcs.n 2f                         @ [1/3] skip_store
    69    str   %[maskClear], %[portClear] @ [2]   T0H -> T0L transition
    70  2: @ skip_store
    71    @DELAY2
    72    str   %[maskClear], %[portClear] @ [2]   T1H -> T1L transition
    73    @DELAY3
    74    subs  %[i], #1                   @ [1]
    75    beq.n 3f                         @ [1/3] end
    76    b     1b                         @ [1/3] send_bit
    77  3: @ end
    78  `,
    79  	},
    80  	"tinygoriscv": {
    81  		// Largely based on the SiFive FE310 CPU:
    82  		// - stores are 1 cycle
    83  		// - branches are 1 or 3 cycles, depending on branch prediction
    84  		// - ALU operations are 1 cycle (as on most CPUs)
    85  		// Hopefully this generalizes to other chips.
    86  		buildTag:         "tinygo.riscv32",
    87  		minBaseCyclesT0H: 1 + 1 + 1, // shift + branch (not taken) + store
    88  		maxBaseCyclesT0H: 1 + 3 + 1, // shift + branch (not taken) + store
    89  		minBaseCyclesT1H: 1 + 1 + 1, // shift + branch (taken) + store
    90  		maxBaseCyclesT1H: 1 + 3 + 1, // shift + branch (taken) + store
    91  		minBaseCyclesTLD: 1 + 1 + 1, // subtraction + branch + store (in next cycle)
    92  		valueTemplate:    "(uint32_t)c << 23",
    93  		template: `
    94  1: // send_bit
    95    sw    %[maskSet], %[portSet]     // [1]   T0H and T0L start here
    96    @DELAY1
    97    slli  %[value], %[value], 1      // [1]   shift value left by 1
    98    bltz  %[value], 2f               // [1/3] skip_store
    99    sw    %[maskClear], %[portClear] // [1]   T0H -> T0L transition
   100  2: // skip_store
   101    @DELAY2
   102    sw    %[maskClear], %[portClear] // [1]   T1H -> T1L transition
   103    @DELAY3
   104    addi  %[i], %[i], -1             // [1]
   105    bnez  %[i], 1b                   // [1/3] send_bit
   106  `,
   107  	},
   108  }
   109  
   110  func writeCAssembly(f *os.File, arch string, megahertz int) error {
   111  	cycleTimeNS := 1 / float64(megahertz)
   112  	// These timings are taken from the table "Updated simplified timing
   113  	// constraints for NeoPixel strings" at:
   114  	// https://wp.josh.com/2014/05/13/ws2812-neopixels-are-not-so-finicky-once-you-get-to-know-them/
   115  	// Here is a copy:
   116  	//   Symbol   Parameter                    Min   Typical    Max   Units
   117  	//   T0H      0 code, high voltage time    200       350    500   ns
   118  	//   T1H      1 code, high voltage time    550       700   5500   ns
   119  	//   TLD      data, low voltage time       450       600   5000   ns
   120  	//   TLL      latch, low voltage time     6000                    ns
   121  	// The equivalent table for WS2811 LEDs would be the following:
   122  	//   Symbol   Parameter                    Min   Typical    Max   Units
   123  	//   T0H      0 code, high voltage time    350       500    650   ns
   124  	//   T1H      1 code, high voltage time   1050      1200   5500   ns
   125  	//   TLD      data, low voltage time      1150      1300   5000   ns
   126  	//   TLL      latch, low voltage time     6000                    ns
   127  	// Combining the two (min and max) leads to the following table:
   128  	//   Symbol   Parameter                    Min   Typical    Max   Units
   129  	//   T0H      0 code, high voltage time    350         -    500   ns
   130  	//   T1H      1 code, high voltage time   1050         -   5500   ns
   131  	//   TLD      data, low voltage time      1150         -   5000   ns
   132  	//   TLL      latch, low voltage time     6000                    ns
   133  	// These comined timings are used so that the ws2812 package is compatible
   134  	// with both WS2812 and with WS2811 chips.
   135  	// T0H is the time the pin should be high to send a "0" bit.
   136  	// T1H is the time the pin should be high to send a "1" bit.
   137  	// TLD is the time the pin should be low between bits.
   138  	// TLL is the time the pin should be low to apply (latch) the new colors.
   139  	minCyclesT0H := int(math.Ceil(0.350 / cycleTimeNS))
   140  	maxCyclesT0H := int(math.Floor(0.500 / cycleTimeNS))
   141  	minCyclesT1H := int(math.Ceil(1.050 / cycleTimeNS))
   142  	maxCyclesT1H := int(math.Floor(5.500 / cycleTimeNS))
   143  	minCyclesTLD := int(math.Ceil(1.150 / cycleTimeNS))
   144  
   145  	// The assembly template looks something like this:
   146  	// 1: @ send_bit
   147  	//   str   {maskSet}, {portSet}     @ [2]   T0H and T0L start here
   148  	//   ...delay 1
   149  	//   lsls  {value}, #1              @ [1]
   150  	//   bcs.n 2f                       @ [1/3] skip_store
   151  	//   str   {maskClear}, {portClear} @ [2]   T0H -> T0L transition
   152  	// 2: @ skip_store
   153  	//   ...delay 2
   154  	//   str   {maskClear}, {portClear} @ [2]   T1H -> T1L transition
   155  	//   ...delay 3
   156  	//   subs  {i}, #1                  @ [1]
   157  	//   bne.n 1b                       @ [1/3] send_bit
   158  	//
   159  	// We need to calculate the number of nop instructions in the three delays.
   160  	archImpl, ok := architectures[arch]
   161  	if !ok {
   162  		return fmt.Errorf("unknown architecture: %s", arch)
   163  	}
   164  
   165  	// Determine number of nops for delay1. This is primarily based on the T0H
   166  	// delay, which is relatively short (<500ns).
   167  	delay1 := minCyclesT0H - archImpl.minBaseCyclesT0H
   168  	if delay1 < 0 {
   169  		// The minCyclesT0H constraint could not be satisfied. Don't insert
   170  		// nops, in the hope that it isn't too long.
   171  		delay1 = 0
   172  	}
   173  	if delay1+archImpl.maxBaseCyclesT0H > maxCyclesT0H {
   174  		return fmt.Errorf("MCU appears to be too slow to satisfy minimum requirements for the T0H signal")
   175  	}
   176  	actualMinCyclesT0H := archImpl.minBaseCyclesT0H + delay1
   177  	actualMaxCyclesT0H := archImpl.maxBaseCyclesT0H + delay1
   178  	actualMinNanosecondsT0H := float64(actualMinCyclesT0H) / float64(megahertz) * 1000
   179  	actualMaxNanosecondsT0H := float64(actualMaxCyclesT0H) / float64(megahertz) * 1000
   180  
   181  	// Determine number of nops for delay2. This is delay1 plus some extra time
   182  	// so that the pulse is long enough for T1H.
   183  	minBaseCyclesT1H := delay1 + archImpl.minBaseCyclesT1H // delay1 + asssembly cycles
   184  	maxBaseCyclesT1H := delay1 + archImpl.maxBaseCyclesT1H // delay1 + asssembly cycles
   185  	delay2 := minCyclesT1H - minBaseCyclesT1H
   186  	if delay2 < 0 {
   187  		delay2 = 0
   188  	}
   189  	if delay2+maxBaseCyclesT1H > maxCyclesT1H {
   190  		// Unlikely, we have 5500ns for this operation.
   191  		return fmt.Errorf("MCU appears to be too slow to satisfy minimum requirements for the T1H signal")
   192  	}
   193  	actualMinCyclesT1H := minBaseCyclesT1H + delay2
   194  	actualMaxCyclesT1H := maxBaseCyclesT1H + delay2
   195  	actualMinNanosecondsT1H := float64(actualMinCyclesT1H) / float64(megahertz) * 1000
   196  	actualMaxNanosecondsT1H := float64(actualMaxCyclesT1H) / float64(megahertz) * 1000
   197  
   198  	// Determine number of nops for delay3. This is based on the TLD delay, the
   199  	// time between two high pulses.
   200  	delay3 := minCyclesTLD - archImpl.minBaseCyclesTLD
   201  	if delay3 < 0 {
   202  		delay3 = 0
   203  	}
   204  	actualMinCyclesTLD := archImpl.minBaseCyclesTLD + delay3
   205  	actualMinNanosecondsTLD := float64(actualMinCyclesTLD) / float64(megahertz) * 1000
   206  
   207  	// Create the Go function in a buffer. Using a buffer here to be able to
   208  	// ignore I/O errors.
   209  	buf := &bytes.Buffer{}
   210  	fmt.Fprintf(buf, "\n")
   211  	fmt.Fprintf(buf, "__attribute__((always_inline))\nvoid ws2812_writeByte%d(char c, uint32_t *portSet, uint32_t *portClear, uint32_t maskSet, uint32_t maskClear) {\n", megahertz)
   212  	fmt.Fprintf(buf, "	// Timings:\n")
   213  	fmt.Fprintf(buf, "	// T0H: %2d - %2d cycles or %.1fns - %.1fns\n", actualMinCyclesT0H, actualMaxCyclesT0H, actualMinNanosecondsT0H, actualMaxNanosecondsT0H)
   214  	fmt.Fprintf(buf, "	// T1H: %2d - %2d cycles or %.1fns - %.1fns\n", actualMinCyclesT1H, actualMaxCyclesT1H, actualMinNanosecondsT1H, actualMaxNanosecondsT1H)
   215  	fmt.Fprintf(buf, "	// TLD: %2d -    cycles or %.1fns -\n", actualMinCyclesTLD, actualMinNanosecondsTLD)
   216  	fmt.Fprintf(buf, "	uint32_t value = %s;\n", archImpl.valueTemplate)
   217  	asm := archImpl.template
   218  	asm = strings.TrimSpace(asm)
   219  	asm = strings.ReplaceAll(asm, "  @DELAY1\n", strings.Repeat("  nop\n", delay1))
   220  	asm = strings.ReplaceAll(asm, "  @DELAY2\n", strings.Repeat("  nop\n", delay2))
   221  	asm = strings.ReplaceAll(asm, "  @DELAY3\n", strings.Repeat("  nop\n", delay3))
   222  	asm = strings.ReplaceAll(asm, "\n", "\n\t")
   223  	fmt.Fprintf(buf, "	char i = 8;\n")
   224  	fmt.Fprintf(buf, "	__asm__ __volatile__(\n")
   225  	for _, line := range strings.Split(asm, "\n") {
   226  		fmt.Fprintf(buf, "\t\t%#v\n", line+"\n")
   227  	}
   228  	// Note: [value] and [i] must be input+output operands because they modify
   229  	// the value.
   230  	fmt.Fprintf(buf, `	: [value]"+r"(value),
   231  	  [i]"+r"(i)
   232  	: [maskSet]"r"(maskSet),
   233  	  [portSet]"m"(*portSet),
   234  	  [maskClear]"r"(maskClear),
   235  	  [portClear]"m"(*portClear));
   236  }
   237  `)
   238  
   239  	// Now write the buffer contents (with the assembly function) to a file.
   240  	_, err := f.Write(buf.Bytes())
   241  	return err
   242  }
   243  
   244  func writeGoWrapper(f *os.File, arch string, megahertz int) error {
   245  	// Create the Go function in a buffer. Using a buffer here to be able to
   246  	// ignore I/O errors.
   247  	buf := &bytes.Buffer{}
   248  	fmt.Fprintf(buf, "\n")
   249  	fmt.Fprintf(buf, "func (d Device) writeByte%d(c byte) {\n", megahertz)
   250  	fmt.Fprintf(buf, "	portSet, maskSet := d.Pin.PortMaskSet()\n")
   251  	fmt.Fprintf(buf, "	portClear, maskClear := d.Pin.PortMaskClear()\n")
   252  	fmt.Fprintf(buf, "\n")
   253  	fmt.Fprintf(buf, "	mask := interrupt.Disable()\n")
   254  	fmt.Fprintf(buf, "	C.ws2812_writeByte%d(C.char(c), (*C.uint32_t)(unsafe.Pointer(portSet)), (*C.uint32_t)(unsafe.Pointer(portClear)), C.uint32_t(maskSet), C.uint32_t(maskClear))\n", megahertz)
   255  	buf.WriteString(`
   256  	interrupt.Restore(mask)
   257  }
   258  `)
   259  
   260  	// Now write the buffer contents (with the assembly function) to a file.
   261  	_, err := f.Write(buf.Bytes())
   262  	return err
   263  }
   264  
   265  func main() {
   266  	arch := flag.String("arch", "cortexm", "architecture to output to")
   267  	flag.Parse()
   268  
   269  	// Remaining parameters are all clock frequencies.
   270  	var clockFrequencies []int
   271  	for _, s := range flag.Args() {
   272  		freq, err := strconv.Atoi(s)
   273  		if err != nil {
   274  			fmt.Fprintln(os.Stderr, "cannot parse frequency:", s)
   275  			os.Exit(1)
   276  		}
   277  		clockFrequencies = append(clockFrequencies, freq)
   278  	}
   279  
   280  	f, err := os.Create("ws2812-asm_" + *arch + ".go")
   281  	if err != nil {
   282  		fmt.Fprintln(os.Stderr, "could not generate WS2812 assembly code:", err)
   283  		os.Exit(1)
   284  	}
   285  	defer f.Close()
   286  	fmt.Fprintln(f, "//go:build", architectures[*arch].buildTag)
   287  	f.WriteString(`
   288  package ws2812
   289  
   290  // Warning: autogenerated file. Instead of modifying this file, change
   291  // gen-ws2812.go and run "go generate".
   292  
   293  import "runtime/interrupt"
   294  import "unsafe"
   295  
   296  /*
   297  #include <stdint.h>
   298  `)
   299  	for _, megahertz := range clockFrequencies {
   300  		err := writeCAssembly(f, *arch, megahertz)
   301  		if err != nil {
   302  			fmt.Fprintf(os.Stderr, "could not generate WS2812 assembly code for %s and %dMHz: %s\n", *arch, megahertz, err)
   303  			os.Exit(1)
   304  		}
   305  	}
   306  	f.WriteString(`*/
   307  import "C"
   308  `)
   309  	for _, megahertz := range clockFrequencies {
   310  		err := writeGoWrapper(f, *arch, megahertz)
   311  		if err != nil {
   312  			fmt.Fprintf(os.Stderr, "could not generate Go wrapper: %w\n", err)
   313  			os.Exit(1)
   314  		}
   315  	}
   316  }