golang.org/x/arch@v0.17.0/arm/armspec/spec.go (about) 1 // Copyright 2014 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Armspec reads the “ARM Architecture Reference Manual” 6 // to collect instruction encoding details and writes those details to standard output 7 // in JSON format. 8 // 9 // # Warning Warning Warning 10 // 11 // This program is unfinished. It is being published in this incomplete form 12 // for interested readers, but do not expect it to be runnable or useful. 13 package main 14 15 import ( 16 "bufio" 17 "bytes" 18 "encoding/json" 19 "fmt" 20 "log" 21 "math" 22 "os" 23 "regexp" 24 "sort" 25 "strconv" 26 "strings" 27 28 "rsc.io/pdf" 29 ) 30 31 type Inst struct { 32 Name string 33 ID string 34 Bits string 35 Arch string 36 Syntax []string 37 Code string 38 } 39 40 const debugPage = 0 41 42 var stdout *bufio.Writer 43 44 func main() { 45 log.SetFlags(0) 46 log.SetPrefix("armspec: ") 47 48 if len(os.Args) != 2 { 49 fmt.Fprintf(os.Stderr, "usage: armspec file.pdf\n") 50 os.Exit(2) 51 } 52 53 f, err := pdf.Open(os.Args[1]) 54 if err != nil { 55 log.Fatal(err) 56 } 57 58 // Find instruction set reference in outline, to build instruction list. 59 instList := instHeadings(f.Outline()) 60 if len(instList) < 200 { 61 log.Fatalf("only found %d instructions in table of contents", len(instList)) 62 } 63 64 stdout = bufio.NewWriter(os.Stdout) 65 fmt.Fprintf(stdout, "[") 66 numTable := 0 67 defer stdout.Flush() 68 69 // Scan document looking for instructions. 70 // Must find exactly the ones in the outline. 71 n := f.NumPage() 72 PageLoop: 73 for pageNum := 1; pageNum <= n; pageNum++ { 74 if debugPage > 0 && pageNum != debugPage { 75 continue 76 } 77 if pageNum > 1127 { 78 break 79 } 80 p := f.Page(pageNum) 81 name, table := parsePage(pageNum, p) 82 if name == "" { 83 continue 84 } 85 if len(table) < 1 { 86 if false { 87 fmt.Fprintf(os.Stderr, "no encodings for instruction %q (page %d)\n", name, pageNum) 88 } 89 continue 90 } 91 for _, inst := range table { 92 if numTable > 0 { 93 fmt.Fprintf(stdout, ",") 94 } 95 numTable++ 96 js, _ := json.Marshal(inst) 97 fmt.Fprintf(stdout, "\n%s", jsFix.Replace(string(js))) 98 } 99 for j, headline := range instList { 100 if name == headline { 101 instList[j] = "" 102 continue PageLoop 103 } 104 } 105 fmt.Fprintf(os.Stderr, "unexpected instruction %q (page %d)\n", name, pageNum) 106 } 107 108 fmt.Fprintf(stdout, "\n]\n") 109 stdout.Flush() 110 111 if debugPage == 0 { 112 for _, headline := range instList { 113 if headline != "" { 114 switch headline { 115 default: 116 fmt.Fprintf(os.Stderr, "missing instruction %q\n", headline) 117 case "CHKA": // ThumbEE 118 case "CPS": // system instruction 119 case "CPY": // synonym for MOV 120 case "ENTERX": // ThumbEE 121 case "F* (former VFP instruction mnemonics)": // synonyms 122 case "HB, HBL, HBLP, HBP": // ThumbEE 123 case "LEAVEX": // ThumbEE 124 case "MOV (shifted register)": // pseudo instruction for ASR, LSL, LSR, ROR, and RRX 125 case "NEG": // synonym for RSB 126 case "RFE": // system instruction 127 case "SMC (previously SMI)": // system instruction 128 case "SRS": // system instruction 129 case "SUBS PC, LR and related instructions": // system instruction 130 case "VAND (immediate)": // pseudo instruction 131 case "VCLE (register)": // pseudo instruction 132 case "VCLT (register)": // pseudo instruction 133 case "VORN (immediate)": // pseudo instruction 134 } 135 } 136 } 137 } 138 } 139 140 func instHeadings(outline pdf.Outline) []string { 141 return appendInstHeadings(outline, nil) 142 } 143 144 var instRE = regexp.MustCompile(`A[\d.]+ Alphabetical list of instructions`) 145 var childRE = regexp.MustCompile(`A[\d.]+ (.+)`) 146 var sectionRE = regexp.MustCompile(`^A[\d.]+$`) 147 var bitRE = regexp.MustCompile(`^( |[01]|\([01]\))*$`) 148 149 func appendInstHeadings(outline pdf.Outline, list []string) []string { 150 if instRE.MatchString(outline.Title) { 151 for _, child := range outline.Child { 152 m := childRE.FindStringSubmatch(child.Title) 153 if m == nil { 154 fmt.Fprintf(os.Stderr, "cannot parse section title: %s\n", child.Title) 155 continue 156 } 157 list = append(list, m[1]) 158 } 159 } 160 for _, child := range outline.Child { 161 list = appendInstHeadings(child, list) 162 } 163 return list 164 } 165 166 const inch = 72.0 167 168 func parsePage(num int, p pdf.Page) (name string, table []Inst) { 169 content := p.Content() 170 171 var text []pdf.Text 172 for _, t := range content.Text { 173 if match(t, "Times-Roman", 7.2, "") { 174 t.FontSize = 9 175 } 176 if match(t, "Times-Roman", 6.72, "") && '0' <= t.S[0] && t.S[0] <= '9' { 177 t.S = string([]rune("⁰¹²³⁴⁵⁶⁷⁸⁹")[t.S[0]-'0']) 178 t.FontSize = 9 179 t.Y -= 2.28 180 } 181 if t.Font == "Gen_Arial" { 182 continue 183 } 184 text = append(text, t) 185 } 186 187 text = findWords(text) 188 189 for i, t := range text { 190 if t.Font == "Times" { 191 t.Font = "Times-Roman" 192 text[i] = t 193 } 194 } 195 196 if debugPage > 0 { 197 for _, t := range text { 198 fmt.Println(t) 199 } 200 for _, r := range content.Rect { 201 fmt.Println(r) 202 } 203 } 204 205 // Remove text we should ignore. 206 out := text[:0] 207 skip := false 208 for _, t := range text { 209 // skip page footer 210 if match(t, "Helvetica", 8, "A") || match(t, "Helvetica", 8, "ARM DDI") || match(t, "Helvetica-Oblique", 8, "Copyright") { 211 continue 212 } 213 // skip section header and body text 214 if match(t, "Helvetica-Bold", 12, "") && (sectionRE.MatchString(t.S) || t.S == "Alphabetical list of instructions") { 215 skip = true 216 continue 217 } 218 if skip && match(t, "Times-Roman", 9, "") { 219 continue 220 } 221 skip = false 222 out = append(out, t) 223 } 224 text = out 225 226 // Page header must say Instruction Details. 227 if len(text) == 0 || !match(text[0], "Helvetica-Oblique", 8, "Instruction Details") && !match(text[0], "Times-Roman", 9, "Instruction Details") { 228 return "", nil 229 } 230 text = text[1:] 231 232 isSection := func(text []pdf.Text, i int) int { 233 if i+2 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && sectionRE.MatchString(text[i].S) && match(text[i+1], "Helvetica-Bold", 10, "") { 234 return 2 235 } 236 if i+1 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && childRE.MatchString(text[i].S) { 237 return 1 238 } 239 return 0 240 } 241 242 // Skip dummy headlines and sections. 243 for d := isSection(text, 0); d != 0; d = isSection(text, 0) { 244 i := d 245 for i < len(text) && !match(text[i], "Helvetica-Bold", 9, "Encoding") && !match(text[i], "Helvetica-Bold", 10, "") { 246 i++ 247 } 248 if isSection(text, i) == 0 { 249 break 250 } 251 text = text[i:] 252 } 253 254 // Next line is headline. Can wrap to multiple lines. 255 d := isSection(text, 0) 256 if d == 0 { 257 if debugPage > 0 { 258 fmt.Printf("non-inst-headline: %v\n", text[0]) 259 } 260 checkNoEncodings(num, text) 261 return "", nil 262 } 263 if d == 2 { 264 name = text[1].S 265 text = text[2:] 266 } else if d == 1 { 267 m := childRE.FindStringSubmatch(text[0].S) 268 name = m[1] 269 text = text[1:] 270 } 271 for len(text) > 0 && match(text[0], "Helvetica-Bold", 10, "") { 272 name += " " + text[0].S 273 text = text[1:] 274 } 275 276 // Skip description. 277 for len(text) > 0 && (match(text[0], "Times-Roman", 9, "") || match(text[0], "LucidaSansTypewriteX", 6.48, "") || match(text[0], "Times-Bold", 10, "Note")) { 278 text = text[1:] 279 } 280 281 // Encodings follow. 282 warned := false 283 for i := 0; i < len(text); { 284 if match(text[i], "Helvetica-Bold", 10, "Assembler syntax") || 285 match(text[i], "Helvetica-Bold", 9, "Modified operation in ThumbEE") || 286 match(text[i], "Helvetica-Bold", 9, "Unallocated memory hints") || 287 match(text[i], "Helvetica-Bold", 9, "Related encodings") || 288 match(text[i], "Times-Roman", 9, "Figure A") || 289 match(text[i], "Helvetica-Bold", 9, "Table A") || 290 match(text[i], "Helvetica-Bold", 9, "VFP Instructions") || 291 match(text[i], "Helvetica-Bold", 9, "VFP instructions") || 292 match(text[i], "Helvetica-Bold", 9, "VFP vectors") || 293 match(text[i], "Helvetica-Bold", 9, "FLDMX") || 294 match(text[i], "Helvetica-Bold", 9, "FSTMX") || 295 match(text[i], "Helvetica-Bold", 9, "Advanced SIMD and VFP") { 296 checkNoEncodings(num, text[i:]) 297 break 298 } 299 if match(text[i], "Helvetica-Bold", 9, "Figure A") { 300 y := text[i].Y 301 i++ 302 for i < len(text) && math.Abs(text[i].Y-y) < 2 { 303 i++ 304 } 305 continue 306 } 307 if !match(text[i], "Helvetica-Bold", 9, "Encoding") { 308 if !warned { 309 warned = true 310 fmt.Fprintln(os.Stderr, "page", num, ": unexpected:", text[i]) 311 } 312 i++ 313 continue 314 } 315 inst := Inst{ 316 Name: name, 317 } 318 enc := text[i].S 319 x := text[i].X 320 i++ 321 // Possible subarchitecture notes. 322 for i < len(text) && text[i].X > x+36 { 323 if inst.Arch != "" { 324 inst.Arch += " " 325 } 326 inst.Arch += text[i].S 327 i++ 328 } 329 // Encoding syntaxes. 330 for i < len(text) && (match(text[i], "LucidaSansTypewriteX", 6.48, "") || text[i].X > x+36) { 331 if text[i].X < x+0.25*inch { 332 inst.Syntax = append(inst.Syntax, text[i].S) 333 } else { 334 s := inst.Syntax[len(inst.Syntax)-1] 335 if !strings.Contains(s, "\t") { 336 s += "\t" 337 } else { 338 s += " " 339 } 340 s += text[i].S 341 inst.Syntax[len(inst.Syntax)-1] = s 342 } 343 i++ 344 } 345 346 var bits, abits, aenc string 347 bits, i = readBitBox(inst.Name, inst.Syntax, content, text, i) 348 if strings.Contains(enc, " / ") { 349 if i < len(text) && match(text[i], "Times-Roman", 8, "") { 350 abits, i = readBitBox(inst.Name, inst.Syntax, content, text, i) 351 } else { 352 abits = bits 353 } 354 slash := strings.Index(enc, " / ") 355 aenc = "Encoding " + enc[slash+len(" / "):] 356 enc = enc[:slash] 357 } 358 359 // pseudocode 360 y0 := -1 * inch 361 tab := 0.0 362 for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") { 363 t := text[i] 364 i++ 365 if math.Abs(t.Y-y0) < 3 { 366 // same line as last fragment, probably just two spaces 367 inst.Code += " " + t.S 368 continue 369 } 370 if inst.Code != "" { 371 inst.Code += "\n" 372 } 373 if t.X > x+0.1*inch { 374 if tab == 0 { 375 tab = t.X - x 376 } 377 inst.Code += strings.Repeat("\t", int((t.X-x)/tab+0.5)) 378 } else { 379 tab = 0 380 } 381 inst.Code += t.S 382 y0 = t.Y 383 } 384 385 inst.ID = strings.TrimPrefix(enc, "Encoding ") 386 inst.Bits = bits 387 table = append(table, inst) 388 if abits != "" { 389 inst.ID = strings.TrimPrefix(aenc, "Encoding ") 390 inst.Bits = abits 391 table = append(table, inst) 392 } 393 394 } 395 return name, table 396 } 397 398 func readBitBox(name string, syntax []string, content pdf.Content, text []pdf.Text, i int) (string, int) { 399 // bit headings 400 y2 := 0.0 401 x1 := 0.0 402 x2 := 0.0 403 for i < len(text) && match(text[i], "Times-Roman", 8, "") { 404 if y2 == 0 { 405 y2 = text[i].Y 406 } 407 if x1 == 0 { 408 x1 = text[i].X 409 } 410 i++ 411 } 412 // bit fields in box 413 y1 := 0.0 414 dy1 := 0.0 415 for i < len(text) && match(text[i], "Times-Roman", 9, "") { 416 if x2 < text[i].X+text[i].W { 417 x2 = text[i].X + text[i].W 418 } 419 y1 = text[i].Y 420 dy1 = text[i].FontSize 421 i++ 422 } 423 424 if debugPage > 0 { 425 fmt.Println("encoding box", x1, y1, x2, y2) 426 } 427 428 // Find lines (thin rectangles) separating bit fields. 429 var bottom, top pdf.Rect 430 const ( 431 yMargin = 0.25 * 72 432 xMargin = 2 * 72 433 ) 434 for _, r := range content.Rect { 435 if r.Max.Y-r.Min.Y < 2 && x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin { 436 if y1-yMargin < r.Min.Y && r.Min.Y < y1 { 437 bottom = r 438 } 439 if y1+dy1 < r.Min.Y && r.Min.Y < y2 { 440 top = r 441 } 442 } 443 } 444 445 if debugPage > 0 { 446 fmt.Println("top", top, "bottom", bottom) 447 } 448 449 const ε = 0.1 * 72 450 var bars []pdf.Rect 451 for _, r := range content.Rect { 452 if r.Max.X-r.Min.X < 2 && math.Abs(r.Min.Y-bottom.Min.Y) < ε && math.Abs(r.Max.Y-top.Min.Y) < ε { 453 bars = append(bars, r) 454 } 455 } 456 sort.Sort(RectHorizontal(bars)) 457 458 // There are 16-bit and 32-bit encodings. 459 // In practice, they are about 2.65 and 5.3 inches wide, respectively. 460 // Use 4 inches as a cutoff. 461 nbit := 32 462 dx := top.Max.X - top.Min.X 463 if top.Max.X-top.Min.X < 4*72 { 464 nbit = 16 465 } 466 467 total := 0 468 var buf bytes.Buffer 469 for i := 0; i < len(bars)-1; i++ { 470 if i > 0 { 471 fmt.Fprintf(&buf, "|") 472 } 473 var sub []pdf.Text 474 x1, x2 := bars[i].Min.X, bars[i+1].Min.X 475 for _, t := range content.Text { 476 tx := t.X + t.W/2 477 ty := t.Y + t.FontSize/2 478 if x1 < tx && tx < x2 && y1 < ty && ty < y2 { 479 sub = append(sub, t) 480 } 481 } 482 var str []string 483 for _, t := range findWords(sub) { 484 str = append(str, t.S) 485 } 486 s := strings.Join(str, " ") 487 s = strings.Replace(s, ")(", ") (", -1) 488 n := len(strings.Fields(s)) 489 b := int(float64(nbit)*(x2-x1)/dx + 0.5) 490 if n == b { 491 for j, f := range strings.Fields(s) { 492 if j > 0 { 493 fmt.Fprintf(&buf, "|") 494 } 495 fmt.Fprintf(&buf, "%s", f) 496 } 497 } else { 498 if n != 1 { 499 fmt.Fprintf(os.Stderr, "%s - %s - multi-field %d-bit encoding: %s\n", name, syntax, n, s) 500 } 501 fmt.Fprintf(&buf, "%s:%d", s, b) 502 } 503 total += b 504 } 505 506 if total != nbit || total == 0 { 507 fmt.Fprintf(os.Stderr, "%s - %s - %d-bit encoding\n", name, syntax, total) 508 } 509 return buf.String(), i 510 } 511 512 type RectHorizontal []pdf.Rect 513 514 func (x RectHorizontal) Swap(i, j int) { x[i], x[j] = x[j], x[i] } 515 func (x RectHorizontal) Less(i, j int) bool { return x[i].Min.X < x[j].Min.X } 516 func (x RectHorizontal) Len() int { return len(x) } 517 518 func checkNoEncodings(num int, text []pdf.Text) { 519 for _, t := range text { 520 if match(t, "Helvetica-Bold", 9, "Encoding") { 521 fmt.Fprintf(os.Stderr, "page %d: unexpected encoding: %s\n", num, t.S) 522 } 523 } 524 } 525 526 func match(t pdf.Text, font string, size float64, substr string) bool { 527 return t.Font == font && math.Abs(t.FontSize-size) < 0.1 && strings.Contains(t.S, substr) 528 } 529 530 func findWords(chars []pdf.Text) (words []pdf.Text) { 531 // Sort by Y coordinate and normalize. 532 const nudge = 1 533 sort.Sort(pdf.TextVertical(chars)) 534 old := -100000.0 535 for i, c := range chars { 536 if c.Y != old && math.Abs(old-c.Y) < nudge { 537 chars[i].Y = old 538 } else { 539 old = c.Y 540 } 541 } 542 543 // Sort by Y coordinate, breaking ties with X. 544 // This will bring letters in a single word together. 545 sort.Sort(pdf.TextVertical(chars)) 546 547 // Loop over chars. 548 for i := 0; i < len(chars); { 549 // Find all chars on line. 550 j := i + 1 551 for j < len(chars) && chars[j].Y == chars[i].Y { 552 j++ 553 } 554 var end float64 555 // Split line into words (really, phrases). 556 for k := i; k < j; { 557 ck := &chars[k] 558 s := ck.S 559 end = ck.X + ck.W 560 charSpace := ck.FontSize / 6 561 wordSpace := ck.FontSize * 2 / 3 562 l := k + 1 563 for l < j { 564 // Grow word. 565 cl := &chars[l] 566 if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+charSpace { 567 s += cl.S 568 end = cl.X + cl.W 569 l++ 570 continue 571 } 572 // Add space to phrase before next word. 573 if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+wordSpace { 574 s += " " + cl.S 575 end = cl.X + cl.W 576 l++ 577 continue 578 } 579 break 580 } 581 f := ck.Font 582 f = strings.TrimSuffix(f, ",Italic") 583 f = strings.TrimSuffix(f, "-Italic") 584 words = append(words, pdf.Text{ 585 Font: f, 586 FontSize: ck.FontSize, 587 X: ck.X, 588 Y: ck.Y, 589 W: end - ck.X, 590 S: s, 591 }) 592 k = l 593 } 594 i = j 595 } 596 597 return words 598 } 599 600 func sameFont(f1, f2 string) bool { 601 f1 = strings.TrimSuffix(f1, ",Italic") 602 f1 = strings.TrimSuffix(f1, "-Italic") 603 f2 = strings.TrimSuffix(f1, ",Italic") 604 f2 = strings.TrimSuffix(f1, "-Italic") 605 return strings.TrimSuffix(f1, ",Italic") == strings.TrimSuffix(f2, ",Italic") || f1 == "Symbol" || f2 == "Symbol" || f1 == "TimesNewRoman" || f2 == "TimesNewRoman" 606 } 607 608 var jsFix = strings.NewReplacer( 609 // `\u003c`, `<`, 610 // `\u003e`, `>`, 611 // `\u0026`, `&`, 612 // `\u0009`, `\t`, 613 ) 614 615 func printTable(name string, table []Inst) { 616 _ = strconv.Atoi 617 }