golang.org/x/arch@v0.17.0/arm64/arm64spec/spec.go (about) 1 // Copyright 2017 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // arm64spec reads the ``ARMv8-A Reference Manual'' 6 // to collect instruction encoding details and writes those 7 // details to standard output in JSON format. 8 // usage: arm64spec file.pdf 9 10 package main 11 12 import ( 13 "bufio" 14 "bytes" 15 "encoding/json" 16 "fmt" 17 "log" 18 "math" 19 "os" 20 "regexp" 21 "sort" 22 "strconv" 23 "strings" 24 25 "rsc.io/pdf" 26 ) 27 28 type Inst struct { 29 Name string 30 Bits string 31 Arch string 32 Syntax string 33 Code string 34 Alias string 35 } 36 37 const debugPage = 0 38 39 var stdout *bufio.Writer 40 41 func check(e error) { 42 if e != nil { 43 panic(e) 44 } 45 } 46 47 func main() { 48 log.SetFlags(0) 49 log.SetPrefix("arm64spec: ") 50 51 if len(os.Args) != 2 { 52 fmt.Fprintf(os.Stderr, "usage: arm64spec file.pdf\n") 53 os.Exit(2) 54 } 55 f, err := pdf.Open(os.Args[1]) 56 if err != nil { 57 log.Fatal(err) 58 } 59 60 // Find instruction set reference in outline, to build instruction list. 61 instList := instHeadings(f.Outline()) 62 if debugPage == 0 { 63 fmt.Println("the number of instructions:", len(instList)) 64 } 65 if len(instList) < 200 { 66 log.Fatalf("only found %d instructions in table of contents", len(instList)) 67 } 68 69 file, err := os.Create("inst.json") 70 check(err) 71 w := bufio.NewWriter(file) 72 _, err = w.WriteString("[") 73 check(err) 74 numTable := 0 75 defer w.Flush() 76 defer file.Close() 77 78 // Scan document looking for instructions. 79 // Must find exactly the ones in the outline. 80 n := f.NumPage() 81 PageLoop: 82 for pageNum := 435; pageNum <= n; pageNum++ { 83 if debugPage > 0 && pageNum != debugPage { 84 continue 85 } 86 if pageNum == 770 { 87 continue 88 } 89 if pageNum > 1495 { 90 break 91 } 92 p := f.Page(pageNum) 93 name, table := parsePage(pageNum, p, f) 94 if name == "" { 95 continue 96 } 97 if len(table) < 1 { 98 if false { 99 fmt.Fprintf(os.Stderr, "no encodings for instruction %q (page %d)\n", name, pageNum) 100 } 101 continue 102 } 103 for _, inst := range table { 104 if numTable > 0 { 105 _, err = w.WriteString(jsFix.Replace(",")) 106 check(err) 107 _, err = w.WriteString("\n") 108 check(err) 109 } 110 numTable++ 111 js, _ := json.Marshal(inst) 112 _, err = w.WriteString(jsFix.Replace(string(js))) 113 check(err) 114 } 115 for j, headline := range instList { 116 if name == headline { 117 instList[j] = "" 118 continue PageLoop 119 } 120 } 121 fmt.Fprintf(os.Stderr, "unexpected instruction %q (page %d)\n", name, pageNum) 122 } 123 124 _, err = w.WriteString("\n]\n") 125 check(err) 126 w.Flush() 127 128 if debugPage == 0 { 129 for _, headline := range instList { 130 if headline != "" { 131 fmt.Fprintf(os.Stderr, "missing instruction %q\n", headline) 132 } 133 } 134 } 135 } 136 137 func instHeadings(outline pdf.Outline) []string { 138 return appendInstHeadings(outline, nil) 139 } 140 141 var instRE = regexp.MustCompile(`C[\d.]+ Alphabetical list of A64 base instructions`) 142 var instRE_A = regexp.MustCompile(`C[\d.]+ Alphabetical list of A64 floating-point and Advanced SIMD instructions`) 143 var childRE = regexp.MustCompile(`C[\d.]+ (.+)`) 144 var sectionRE = regexp.MustCompile(`^C[\d.]+$`) 145 var bitRE = regexp.MustCompile(`^( |[01]|\([01]\))*$`) 146 var IMMRE = regexp.MustCompile(`^imm[\d]+$`) 147 148 func appendInstHeadings(outline pdf.Outline, list []string) []string { 149 if instRE.MatchString(outline.Title) || instRE_A.MatchString(outline.Title) { 150 for _, child := range outline.Child { 151 m := childRE.FindStringSubmatch(child.Title) 152 if m == nil { 153 fmt.Fprintf(os.Stderr, "cannot parse section title: %s\n", child.Title) 154 continue 155 } 156 list = append(list, m[1]) 157 } 158 } 159 for _, child := range outline.Child { 160 list = appendInstHeadings(child, list) 161 } 162 return list 163 } 164 165 const inch = 72.0 166 167 func parsePage(num int, p pdf.Page, f *pdf.Reader) (name string, table []Inst) { 168 content := p.Content() 169 var text []pdf.Text 170 CrossTwoPage := true 171 for _, t := range content.Text { 172 text = append(text, t) 173 } 174 text = findWords(text) 175 if !(instRE.MatchString(text[1].S) || instRE_A.MatchString(text[1].S)) || len(text) == 0 || !sectionRE.MatchString(text[2].S) { 176 return "", nil 177 } 178 // Check whether the content crosses the page. 179 for _, t := range text { 180 if match(t, "Arial,Bold", 10, "Assembler symbols") { 181 CrossTwoPage = false 182 break 183 } 184 } 185 // Deal with cross page issue. To the next page content. 186 var Ncontent pdf.Content 187 Npagebox := false 188 CrossThreePage := false 189 Noffset := "" 190 if CrossTwoPage == true { 191 Np := f.Page(num + 1) 192 Ncontent = Np.Content() 193 var Ntext []pdf.Text 194 for _, t := range Ncontent.Text { 195 Ntext = append(Ntext, t) 196 } 197 Ntext = findWords(Ntext) 198 if len(Ntext) == 0 || sectionRE.MatchString(Ntext[2].S) { 199 Ntext = text[:0] 200 } else { 201 for _, t := range Ntext { 202 if match(t, "Arial,Bold", 10, "offset") { 203 Noffset = t.S 204 Npagebox = true 205 } 206 // This istruction cross three pages. 207 if match(t, "Arial,Bold", 10, "Assembler symbols") { 208 CrossThreePage = false 209 } else { 210 CrossThreePage = true 211 } 212 text = append(text, t) 213 } 214 } 215 } 216 if CrossThreePage == true { 217 NNp := f.Page(num + 2) 218 NNcontent := NNp.Content() 219 var NNtext []pdf.Text 220 for _, t := range NNcontent.Text { 221 NNtext = append(NNtext, t) 222 } 223 NNtext = findWords(NNtext) 224 if len(NNtext) == 0 || sectionRE.MatchString(NNtext[2].S) { 225 NNtext = text[:0] 226 } else { 227 for _, t := range NNtext { 228 text = append(text, t) 229 } 230 } 231 } 232 // Get alias and remove text we should ignore. 233 out := text[:0] 234 alias := "" 235 for _, t := range text { 236 if strings.Contains(t.S, "instruction is used by the alias") || strings.Contains(t.S, "instruction is an alias of") { 237 alias_t := strings.SplitAfter(t.S, ".") 238 alias = alias_t[0] 239 } 240 // Skip page footer 241 if match(t, "Arial-ItalicMT", 8, "") || match(t, "ArialMT", 8, "") { 242 if debugPage > 0 { 243 fmt.Println("==the skip page footer is:==", t) 244 } 245 continue 246 } 247 // Skip the body text 248 if match(t, "TimesNewRoman", 9, "") || match(t, "TimesNewRomanPS-ItalicMT", 9, "") { 249 if debugPage > 0 { 250 fmt.Println("==the skip body text is:==", t) 251 } 252 continue 253 } 254 out = append(out, t) 255 } 256 text = out 257 // Page header must be child title. 258 if len(text) == 0 || !sectionRE.MatchString(text[0].S) { 259 return "", nil 260 } 261 262 name = text[1].S 263 inst := Inst{ 264 Name: name, 265 Alias: alias, 266 } 267 text = text[2:] 268 // Skip body text before bits. 269 OffsetMark := false 270 k := 0 271 for k = 0; k < len(text); { 272 if !match(text[k], "Arial", 8, "31") { 273 k++ 274 } else { 275 break 276 } 277 } 278 // Check offset. 279 if k > 0 && match(text[k-1], "Arial,Bold", 10, "") { 280 OffsetMark = true 281 text = text[k-1:] 282 } else { 283 text = text[k:] 284 } 285 // Encodings follow. 286 BitMark := false 287 bits := "" 288 // Find bits. 289 for i := 0; i < len(text); { 290 inst.Bits = "" 291 offset := "" 292 abits := "" 293 // Read bits only one time. 294 if OffsetMark == true { 295 for i < len(text) && !match(text[i], "Arial", 8, "") { 296 i++ 297 } 298 if i < len(text) { 299 offset = text[i-1].S 300 BitMark = false 301 bits = "" 302 } else { 303 break 304 } 305 } 306 if BitMark == false { 307 if Npagebox == true && Noffset == offset { 308 bits, i = readBitBox(name, Ncontent, text, i) 309 } else { 310 bits, i = readBitBox(name, content, text, i) 311 } 312 BitMark = true 313 // Every time, get "then SEE" after get bits. 314 enc := false 315 if i < len(text)-1 { 316 m := i 317 for m < len(text)-1 && !match(text[m], "Arial-BoldItalicMT", 9, "encoding") { 318 m++ 319 } 320 if match(text[m], "Arial-BoldItalicMT", 9, "encoding") && m < len(text) { 321 enc = true 322 m = m + 1 323 } 324 if enc == true { 325 for m < len(text) && !match(text[m], "Arial,Bold", 10, "") && match(text[m], "LucidaSansTypewriteX", 6.48, "") { 326 if strings.Contains(text[m].S, "then SEE") { 327 inst.Code = text[m].S 328 break 329 } else { 330 m++ 331 } 332 } 333 } 334 } 335 } 336 337 // Possible subarchitecture notes. 338 ArchLoop: 339 for i < len(text) { 340 if !match(text[i], "Arial-BoldItalicMT", 9, "variant") || match(text[i], "Arial-BoldItalicMT", 9, "encoding") { 341 i++ 342 continue 343 } 344 inst.Arch = "" 345 inst.Arch += offset 346 inst.Arch += " " 347 inst.Arch += text[i].S 348 inst.Arch = strings.TrimSpace(inst.Arch) 349 i++ 350 // Encoding syntaxes. 351 sign := "" 352 SynMark := false 353 for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") && SynMark == false { 354 if (strings.Contains(text[i].S, "==") || strings.Contains(text[i].S, "!=")) && SynMark == false { 355 sign = text[i].S 356 i++ 357 continue 358 } 359 // Avoid "equivalent to" another syntax. 360 if SynMark == false { 361 SynMark = true 362 inst.Syntax = "" 363 inst.Syntax = text[i].S 364 i++ 365 } 366 } 367 abits = bits 368 // Analyse and replace some bits value.eg, sf==1 369 if strings.Contains(sign, "&&") { 370 split := strings.Split(sign, "&&") 371 for k := 0; k < len(split); { 372 if strings.Contains(split[k], "==") && !strings.Contains(split[k], "!") { 373 tmp := strings.Split(split[k], "==") 374 prefix := strings.TrimSpace(tmp[0]) 375 value := strings.TrimSpace(tmp[1]) 376 if strings.Contains(bits, prefix) && !strings.Contains(value, "x") { 377 abits = strings.Replace(abits, prefix, value, -1) 378 } 379 } 380 k++ 381 } 382 } else if strings.Contains(sign, "==") && !strings.Contains(sign, "!") { 383 split := strings.Split(sign, "==") 384 prefix := strings.TrimSpace(split[0]) 385 value := strings.TrimSpace(split[1]) 386 if strings.Contains(bits, prefix) && !strings.Contains(value, "x") { 387 abits = strings.Replace(abits, prefix, value, -1) 388 } 389 } 390 // Deal with syntax contains {2} 391 if strings.Contains(inst.Syntax, "{2}") { 392 if !strings.Contains(abits, "Q") { 393 fmt.Fprintf(os.Stderr, "instruction%s - syntax%s: is wrong!!\n", name, inst.Syntax) 394 } 395 syn := inst.Syntax 396 bits := abits 397 for i := 0; i < 2; { 398 if i == 0 { 399 inst.Bits = strings.Replace(bits, "Q", "0", -1) 400 inst.Syntax = strings.Replace(syn, "{2}", "", -1) 401 table = append(table, inst) 402 } 403 if i == 1 { 404 inst.Bits = strings.Replace(bits, "Q", "1", -1) 405 inst.Syntax = strings.Replace(syn, "{2}", "2", -1) 406 table = append(table, inst) 407 } 408 i++ 409 } 410 } else { 411 inst.Bits = abits 412 table = append(table, inst) 413 } 414 415 if OffsetMark == true && i < len(text) && match(text[i], "Arial-BoldItalicMT", 9, "variant") && !match(text[i], "Arial-BoldItalicMT", 9, "encoding") { 416 continue ArchLoop 417 } else { 418 break 419 } 420 } 421 } 422 return name, table 423 } 424 425 func readBitBox(name string, content pdf.Content, text []pdf.Text, i int) (string, int) { 426 // Bits headings 427 y3 := 0.0 428 x1 := 0.0 429 for i < len(text) && match(text[i], "Arial", 8, "") { 430 if y3 == 0 { 431 y3 = text[i].Y 432 } 433 if x1 == 0 { 434 x1 = text[i].X 435 } 436 if text[i].Y != y3 { 437 break 438 } 439 i++ 440 } 441 // Bits fields in box 442 x2 := 0.0 443 y2 := 0.0 444 dy1 := 0.0 445 for i < len(text) && match(text[i], "Arial", 8, "") { 446 if x2 < text[i].X+text[i].W { 447 x2 = text[i].X + text[i].W 448 } 449 if y2 == 0 { 450 y2 = text[i].Y 451 } 452 if text[i].Y != y2 { 453 break 454 } 455 dy1 = text[i].FontSize 456 i++ 457 } 458 // Bits fields below box 459 x3 := 0.0 460 y1 := 0.0 461 for i < len(text) && match(text[i], "Arial", 8, "") { 462 if x3 < text[i].X+text[i].W { 463 x3 = text[i].X + text[i].W 464 } 465 y1 = text[i].Y 466 if text[i].Y != y1 { 467 break 468 } 469 i++ 470 } 471 //no bits fields below box 472 below_flag := true 473 if y1 == 0.0 { 474 below_flag = false 475 y1 = y2 476 } 477 // Encoding box 478 if debugPage > 0 { 479 fmt.Println("encoding box", x1, y3, x2, y1) 480 } 481 482 // Find lines (thin rectangles) separating bit fields. 483 var bottom, top pdf.Rect 484 const ( 485 yMargin = 0.25 * 72 486 xMargin = 2 * 72 487 ) 488 cont := 0 489 if below_flag == true { 490 for _, r := range content.Rect { 491 cont = cont + 1 492 if x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin { 493 if y1-yMargin < r.Min.Y && r.Min.Y < y2-dy1 { 494 bottom = r 495 } 496 if y2+dy1 < r.Min.Y && r.Min.Y < y3+yMargin { 497 top = r 498 } 499 } 500 } 501 } else { 502 for _, r := range content.Rect { 503 cont = cont + 1 504 if x1-xMargin < r.Min.X && r.Min.X < x1 && x2 < r.Max.X && r.Max.X < x2+xMargin { 505 if y1-yMargin-dy1 < r.Min.Y && r.Min.Y < y3-dy1 { 506 bottom = r 507 } 508 if y2+dy1 < r.Min.Y && r.Min.Y < y3+yMargin { 509 top = r 510 } 511 } 512 } 513 } 514 515 if debugPage > 0 { 516 fmt.Println("top", top, "bottom", bottom, "content.Rect number", cont) 517 } 518 519 const ε = 0.5 * 72 520 cont_1 := 0 521 var bars []pdf.Rect 522 for _, r := range content.Rect { 523 if math.Abs(r.Min.X-r.Max.X) < bottom.Max.X-bottom.Min.X-(ε/2) && math.Abs(r.Min.Y-bottom.Min.Y) < ε && math.Abs(r.Max.Y-top.Min.Y) < ε { 524 cont_1 = cont_1 + 1 525 bars = append(bars, r) 526 } 527 } 528 sort.Sort(RectHorizontal(bars)) 529 if debugPage > 0 { 530 fmt.Println("==bars number==", cont_1) 531 } 532 533 // There are 16-bit and 32-bit encodings. 534 // In practice, they are about 2.65 and 5.3 inches wide, respectively. 535 // Use 4 inches as a cutoff. 536 nbit := 32 537 dx := top.Max.X - top.Min.X 538 if top.Max.X-top.Min.X < 4*72 { 539 nbit = 16 540 } 541 542 total := 0 543 var buf bytes.Buffer 544 for i := 0; i < len(bars); i++ { 545 if i > 0 { 546 fmt.Fprintf(&buf, "|") 547 } 548 var sub []pdf.Text 549 x1, x2 := bars[i].Min.X, bars[i].Max.X 550 for _, t := range content.Text { 551 tx := t.X + t.W/2 552 ty := t.Y 553 if x1 < tx && tx < x2 && y2-dy1 < ty && ty < y2+dy1 { 554 sub = append(sub, t) 555 } 556 } 557 var str []string 558 for _, t := range findWords(sub) { 559 str = append(str, t.S) 560 } 561 s := strings.Join(str, " ") 562 s = strings.Replace(s, ")(", ") (", -1) 563 564 // If bits contain "!" or "x", be replaced by the bits below it. 565 if strings.Contains(s, "!") || strings.Contains(s, "x") { 566 var sub1 []pdf.Text 567 for _, t := range content.Text { 568 tx := t.X + t.W/2 569 ty := t.Y 570 if x1 < tx && tx < x2 && y1-dy1 < ty && ty < y1+dy1 { 571 sub1 = append(sub1, t) 572 } 573 574 } 575 var str1 []string 576 for _, t := range findWords(sub1) { 577 str1 = append(str1, t.S) 578 } 579 s = strings.Join(str1, " ") 580 s = strings.Replace(s, ")(", ") (", -1) 581 } 582 583 n := len(strings.Fields(s)) 584 585 var b int 586 if IMMRE.MatchString(s) { 587 bitNum := strings.TrimPrefix(s, "imm") 588 b, _ = strconv.Atoi(bitNum) 589 } else if s == "immhi" { 590 b = 19 591 } else { 592 b = int(float64(nbit)*(x2-x1)/dx + 0.5) 593 } 594 if n == b { 595 for k, f := range strings.Fields(s) { 596 if k > 0 { 597 fmt.Fprintf(&buf, "|") 598 } 599 fmt.Fprintf(&buf, "%s", f) 600 } 601 } else { 602 if n != 1 { 603 fmt.Fprintf(os.Stderr, "%s - multi-field %d-bit encoding: %s\n", name, n, s) 604 } 605 fmt.Fprintf(&buf, "%s:%d", s, b) 606 } 607 total += b 608 } 609 610 if total != nbit || total == 0 { 611 fmt.Fprintf(os.Stderr, "%s - %d-bit encoding\n", name, total) 612 } 613 return buf.String(), i 614 } 615 616 type RectHorizontal []pdf.Rect 617 618 func (x RectHorizontal) Swap(i, j int) { x[i], x[j] = x[j], x[i] } 619 func (x RectHorizontal) Less(i, j int) bool { return x[i].Min.X < x[j].Min.X } 620 func (x RectHorizontal) Len() int { return len(x) } 621 622 func checkNoEncodings(num int, text []pdf.Text) { 623 for _, t := range text { 624 if match(t, "Helvetica-Bold", 9, "Encoding") { 625 fmt.Fprintf(os.Stderr, "page %d: unexpected encoding: %s\n", num, t.S) 626 } 627 } 628 } 629 630 func match(t pdf.Text, font string, size float64, substr string) bool { 631 return t.Font == font && math.Abs(t.FontSize-size) < 0.1 && strings.Contains(t.S, substr) 632 } 633 634 func findWords(chars []pdf.Text) (words []pdf.Text) { 635 // Sort by Y coordinate and normalize. 636 const nudge = 1 637 sort.Sort(pdf.TextVertical(chars)) 638 old := -100000.0 639 for i, c := range chars { 640 if c.Y != old && math.Abs(old-c.Y) < nudge { 641 chars[i].Y = old 642 } else { 643 old = c.Y 644 } 645 } 646 647 // Sort by Y coordinate, breaking ties with X. 648 // This will bring letters in a single word together. 649 sort.Sort(pdf.TextVertical(chars)) 650 651 // Loop over chars. 652 for i := 0; i < len(chars); { 653 // Find all chars on line. 654 j := i + 1 655 for j < len(chars) && chars[j].Y == chars[i].Y { 656 j++ 657 } 658 var end float64 659 // Split line into words (really, phrases). 660 for k := i; k < j; { 661 ck := &chars[k] 662 s := ck.S 663 end = ck.X + ck.W 664 charSpace := ck.FontSize / 6 665 wordSpace := ck.FontSize * 2 / 3 666 l := k + 1 667 for l < j { 668 // Grow word. 669 cl := &chars[l] 670 if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+charSpace { 671 s += cl.S 672 end = cl.X + cl.W 673 l++ 674 continue 675 } 676 // Add space to phrase before next word. 677 if sameFont(cl.Font, ck.Font) && math.Abs(cl.FontSize-ck.FontSize) < 0.1 && cl.X <= end+wordSpace { 678 s += " " + cl.S 679 end = cl.X + cl.W 680 l++ 681 continue 682 } 683 break 684 } 685 f := ck.Font 686 f = strings.TrimSuffix(f, ",Italic") 687 f = strings.TrimSuffix(f, "-Italic") 688 words = append(words, pdf.Text{ 689 Font: f, 690 FontSize: ck.FontSize, 691 X: ck.X, 692 Y: ck.Y, 693 W: end - ck.X, 694 S: s, 695 }) 696 k = l 697 } 698 i = j 699 } 700 701 return words 702 } 703 704 func sameFont(f1, f2 string) bool { 705 f1 = strings.TrimSuffix(f1, ",Italic") 706 f1 = strings.TrimSuffix(f1, "-Italic") 707 f2 = strings.TrimSuffix(f1, ",Italic") 708 f2 = strings.TrimSuffix(f1, "-Italic") 709 return strings.TrimSuffix(f1, ",Italic") == strings.TrimSuffix(f2, ",Italic") || f1 == "Symbol" || f2 == "Symbol" || f1 == "TimesNewRoman" || f2 == "TimesNewRoman" 710 } 711 712 var jsFix = strings.NewReplacer( 713 `\u003c`, `<`, 714 `\u003e`, `>`, 715 `\u0026`, `&`, 716 `\u0009`, `\t`, 717 ) 718 719 func printTable(name string, table []Inst) { 720 _ = strconv.Atoi 721 }