golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/gorebuild/io.go (about)

     1  // Copyright 2023 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package main
     6  
     7  import (
     8  	"archive/tar"
     9  	"archive/zip"
    10  	"bufio"
    11  	"bytes"
    12  	"compress/gzip"
    13  	"crypto/sha256"
    14  	"encoding/json"
    15  	"fmt"
    16  	"hash/crc32"
    17  	"io"
    18  	"io/fs"
    19  	"net/http"
    20  	"os"
    21  	"path/filepath"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  )
    26  
    27  // SHA256 returns the hexadecimal SHA256 hash of data.
    28  func SHA256(data []byte) string {
    29  	sum := sha256.Sum256(data)
    30  	return fmt.Sprintf("%x", sum[:])
    31  }
    32  
    33  // Get returns the content at the named URL.
    34  func Get(log *Log, url string) (data []byte, err error) {
    35  	defer func() {
    36  		if err != nil && log != nil {
    37  			log.Printf("%s", err)
    38  		}
    39  	}()
    40  
    41  	resp, err := http.Get(url)
    42  	if err != nil {
    43  		return nil, err
    44  	}
    45  	defer resp.Body.Close()
    46  	if resp.StatusCode != 200 {
    47  		return nil, fmt.Errorf("get %s: %s", url, resp.Status)
    48  	}
    49  	data, err = io.ReadAll(resp.Body)
    50  	if err != nil {
    51  		return nil, fmt.Errorf("get %s: %s", url, err)
    52  	}
    53  	if log != nil {
    54  		log.Printf("downloaded %s", url)
    55  	}
    56  	return data, nil
    57  }
    58  
    59  // GerritTarGz returns a .tar.gz file corresponding to the named repo and ref on Go's Gerrit server.
    60  func GerritTarGz(log *Log, repo, ref string) ([]byte, error) {
    61  	return Get(log, "https://go.googlesource.com/"+repo+"/+archive/"+ref+".tar.gz")
    62  }
    63  
// A DLRelease is the JSON for a release, returned by go.dev/dl.
// DLReleases decodes the release list into a slice of these.
type DLRelease struct {
	Version string    `json:"version"` // decoded from the "version" key; presumably a name like "go1.21.0" (confirm against go.dev/dl)
	Stable  bool      `json:"stable"`  // decoded from the "stable" key
	Files   []*DLFile `json:"files"`   // decoded from the "files" key: the files belonging to this release
}
    70  
// A DLFile is the JSON for a file, returned by go.dev/dl.
// Each field is decoded from the JSON key named in its struct tag.
type DLFile struct {
	Name    string `json:"filename"` // decoded from the "filename" key
	GOOS    string `json:"os"`       // decoded from the "os" key
	GOARCH  string `json:"arch"`     // decoded from the "arch" key; DLReleases rewrites "armv6l" to "arm"
	Version string `json:"version"`  // decoded from the "version" key
	SHA256  string `json:"sha256"`   // decoded from the "sha256" key; presumably the hex SHA256 of the file (confirm against go.dev/dl)
	Size    int64  `json:"size"`     // decoded from the "size" key; presumably the file size in bytes (confirm against go.dev/dl)
	Kind    string `json:"kind"`     // "archive", "installer", "source"
}
    81  
    82  // DLReleases returns the release list from go.dev/dl.
    83  func DLReleases(log *Log) ([]*DLRelease, error) {
    84  	var all []*DLRelease
    85  	data, err := Get(log, "https://go.dev/dl/?mode=json&include=all")
    86  	if err != nil {
    87  		return nil, err
    88  	}
    89  	if err := json.Unmarshal(data, &all); err != nil {
    90  		return nil, fmt.Errorf("unmarshaling releases JSON: %v", err)
    91  	}
    92  
    93  	for _, r := range all {
    94  		for _, f := range r.Files {
    95  			if f.GOARCH == "armv6l" {
    96  				f.GOARCH = "arm"
    97  			}
    98  		}
    99  	}
   100  	return all, nil
   101  }
   102  
   103  // OpenTarGz returns a tar.Reader for the given tgz data.
   104  func OpenTarGz(tgz []byte) (*tar.Reader, error) {
   105  	zr, err := gzip.NewReader(bytes.NewReader(tgz))
   106  	if err != nil {
   107  		return nil, err
   108  	}
   109  	return tar.NewReader(zr), nil
   110  }
   111  
   112  // UnpackTarGz unpacks the given tgz data into the named directory.
   113  // On error the directory may contain partial contents.
   114  func UnpackTarGz(dir string, tgz []byte) error {
   115  	if err := os.MkdirAll(dir, 0777); err != nil {
   116  		return err
   117  	}
   118  	tr, err := OpenTarGz(tgz)
   119  	if err != nil {
   120  		return err
   121  	}
   122  	for {
   123  		hdr, err := tr.Next()
   124  		if err != nil {
   125  			if err == io.EOF {
   126  				break
   127  			}
   128  			return err
   129  		}
   130  		if hdr.Typeflag == tar.TypeDir {
   131  			// Ignore directories entirely
   132  			continue
   133  		}
   134  		name := filepath.FromSlash(hdr.Name)
   135  		if name != filepath.Clean(name) || strings.HasPrefix(name, "..") || filepath.IsAbs(name) {
   136  			return fmt.Errorf("invalid name in tgz: %#q", hdr.Name)
   137  		}
   138  		targ := filepath.Join(dir, name)
   139  		if err := os.MkdirAll(filepath.Dir(targ), 0777); err != nil {
   140  			return err
   141  		}
   142  		f, err := os.OpenFile(targ, os.O_CREATE|os.O_WRONLY, fs.FileMode(hdr.Mode&0777))
   143  		if err != nil {
   144  			return err
   145  		}
   146  		if _, err := io.Copy(f, tr); err != nil {
   147  			f.Close()
   148  			return err
   149  		}
   150  		if err := f.Close(); err != nil {
   151  			return err
   152  		}
   153  	}
   154  	return nil
   155  }
   156  
   157  // OpenZip returns a zip.Reader for the given zip data.
   158  func OpenZip(zipdata []byte) (*zip.Reader, error) {
   159  	return zip.NewReader(bytes.NewReader(zipdata), int64(len(zipdata)))
   160  }
   161  
   162  // UnpackZip unpacks the given zip data into the named directory.
   163  // On error the directory may contain partial contents.
   164  func UnpackZip(dir string, zipdata []byte) error {
   165  	if err := os.MkdirAll(dir, 0777); err != nil {
   166  		return err
   167  	}
   168  	zr, err := OpenZip(zipdata)
   169  	if err != nil {
   170  		return err
   171  	}
   172  	for _, zf := range zr.File {
   173  		if strings.HasSuffix(zf.Name, "/") {
   174  			// Ignore directories entirely
   175  			continue
   176  		}
   177  		name := filepath.FromSlash(zf.Name)
   178  		if name != filepath.Clean(name) || strings.HasPrefix(name, "..") || filepath.IsAbs(name) {
   179  			return fmt.Errorf("invalid name in zip: %#q", zf.Name)
   180  		}
   181  		targ := filepath.Join(dir, name)
   182  		if err := os.MkdirAll(filepath.Dir(targ), 0777); err != nil {
   183  			return err
   184  		}
   185  		f, err := os.OpenFile(targ, os.O_CREATE|os.O_WRONLY, 0666)
   186  		if err != nil {
   187  			return err
   188  		}
   189  		zr, err := zf.Open()
   190  		if err != nil {
   191  			f.Close()
   192  			return err
   193  		}
   194  		_, err = io.Copy(f, zr)
   195  		zr.Close()
   196  		if err != nil {
   197  			f.Close()
   198  			return err
   199  		}
   200  		if err := f.Close(); err != nil {
   201  			return err
   202  		}
   203  	}
   204  	return nil
   205  }
   206  
// A Fixer is a transformation on file content applied during indexing.
// It lets us edit away permitted differences between files, such as code
// signatures that cannot be reproduced without the signing keys.
//
// The indexing functions call a Fixer with the log, the name of the file
// within the archive, and the file's content; the returned bytes are what
// gets hashed and measured in place of the original content.
type Fixer = func(*Log, string, []byte) []byte
   211  
// A TarFile summarizes a single file in a tar archive:
// it records the exact header and the SHA256 of the content.
// When IndexTarGz applies a Fixer, Header.Size and SHA256
// describe the fixed content rather than the stored content.
type TarFile struct {
	tar.Header        // header as read from the archive (Size adjusted after a fix)
	SHA256     string // hexadecimal SHA256 of the (possibly fixed) content
}
   218  
// A ZipFile summarizes a single file in a zip archive:
// it records the exact header and the SHA256 of the content.
// When IndexZip applies a Fixer, the header's CRC32, UncompressedSize,
// and UncompressedSize64 fields and the SHA256 describe the fixed content
// rather than the stored content.
type ZipFile struct {
	zip.FileHeader        // header as read from the archive (checksum and sizes adjusted after a fix)
	SHA256         string // hexadecimal SHA256 of the (possibly fixed) content
}
   225  
// A CpioFile represents a single file in a CPIO archive,
// as recorded by IndexCpioGz.
type CpioFile struct {
	Name   string      // entry name, without the trailing NUL stored in the archive
	Mode   fs.FileMode // low nine permission bits of the entry's mode field
	Size   int64       // length of the content in bytes, after any Fixer was applied
	SHA256 string      // hexadecimal SHA256 of the (possibly fixed) content
}
   233  
   234  // IndexTarGz parses tgz as a gzip-compressed tar file and returns an index of its content.
   235  // If fix is non-nil, it is applied to file content before indexing.
   236  // This lets us strip code signatures that cannot be reproduced.
   237  func IndexTarGz(log *Log, tgz []byte, fix Fixer) map[string]*TarFile {
   238  	tr, err := OpenTarGz(tgz)
   239  	if err != nil {
   240  		log.Printf("%v", err)
   241  		return nil
   242  	}
   243  	ix := make(map[string]*TarFile)
   244  	for {
   245  		hdr, err := tr.Next()
   246  		if err != nil {
   247  			if err == io.EOF {
   248  				break
   249  			}
   250  			log.Printf("reading tgz: %v", err)
   251  			return nil
   252  		}
   253  		if hdr.Typeflag == tar.TypeDir {
   254  			// Ignore directories entirely
   255  			continue
   256  		}
   257  		data, err := io.ReadAll(tr)
   258  		if err != nil {
   259  			log.Printf("reading %s from tgz: %v", hdr.Name, err)
   260  			return nil
   261  		}
   262  		if fix != nil {
   263  			data = fix(log, hdr.Name, data)
   264  			hdr.Size = int64(len(data))
   265  		}
   266  		ix[hdr.Name] = &TarFile{*hdr, SHA256(data)}
   267  	}
   268  	return ix
   269  }
   270  
   271  // IndexZip parses zipdata as a zip archive and returns an index of its content.
   272  // If fix is non-nil, it is applied to file content before indexing.
   273  // This lets us strip code signatures that cannot be reproduced.
   274  func IndexZip(log *Log, zipdata []byte, fix Fixer) map[string]*ZipFile {
   275  	zr, err := zip.NewReader(bytes.NewReader(zipdata), int64(len(zipdata)))
   276  	if err != nil {
   277  		log.Printf("%v", err)
   278  		return nil
   279  	}
   280  	ix := make(map[string]*ZipFile)
   281  	for _, hdr := range zr.File {
   282  		if strings.HasSuffix(hdr.Name, "/") {
   283  			// Ignore directories entirely
   284  			continue
   285  		}
   286  		rc, err := hdr.Open()
   287  		if err != nil {
   288  			log.Printf("%v", err)
   289  			return nil
   290  		}
   291  		data, err := io.ReadAll(rc)
   292  		rc.Close()
   293  		if err != nil {
   294  			log.Printf("%v", err)
   295  			return nil
   296  		}
   297  		if fix != nil {
   298  			data = fix(log, hdr.Name, data)
   299  			hdr.CRC32 = crc32.ChecksumIEEE(data)
   300  			hdr.UncompressedSize = uint32(len(data))
   301  			hdr.UncompressedSize64 = uint64(len(data))
   302  		}
   303  		ix[hdr.Name] = &ZipFile{hdr.FileHeader, SHA256(data)}
   304  	}
   305  	return ix
   306  }
   307  
   308  // IndexCpioGz parses data as a gzip-compressed cpio file and returns an index of its content.
   309  // If fix is non-nil, it is applied to file content before indexing.
   310  // This lets us strip code signatures that cannot be reproduced.
   311  func IndexCpioGz(log *Log, data []byte, fix Fixer) map[string]*CpioFile {
   312  	zr, err := gzip.NewReader(bytes.NewReader(data))
   313  	if err != nil {
   314  		log.Printf("%v", err)
   315  		return nil
   316  	}
   317  	br := bufio.NewReader(zr)
   318  
   319  	const hdrSize = 76
   320  
   321  	ix := make(map[string]*CpioFile)
   322  	hdr := make([]byte, hdrSize)
   323  	for {
   324  		_, err := io.ReadFull(br, hdr)
   325  		if err != nil {
   326  			if err == io.EOF {
   327  				break
   328  			}
   329  			log.Printf("reading archive: %v", err)
   330  			return nil
   331  		}
   332  
   333  		// https://www.mkssoftware.com/docs/man4/cpio.4.asp
   334  		//
   335  		//	hdr[0:6] "070707"
   336  		//	hdr[6:12] device number (all numbers '0'-padded octal)
   337  		//	hdr[12:18] inode number
   338  		//	hdr[18:24] mode
   339  		//	hdr[24:30] uid
   340  		//	hdr[30:36] gid
   341  		//	hdr[36:42] nlink
   342  		//	hdr[42:48] rdev
   343  		//	hdr[48:59] mtime
   344  		//	hdr[59:65] name length
   345  		//	hdr[65:76] file size
   346  
   347  		if !allOctal(hdr[:]) || string(hdr[:6]) != "070707" {
   348  			log.Printf("reading archive: malformed entry")
   349  			return nil
   350  		}
   351  		mode, _ := strconv.ParseInt(string(hdr[18:24]), 8, 64)
   352  		nameLen, _ := strconv.ParseInt(string(hdr[59:65]), 8, 64)
   353  		size, _ := strconv.ParseInt(string(hdr[65:76]), 8, 64)
   354  		nameBuf := make([]byte, nameLen)
   355  		if _, err := io.ReadFull(br, nameBuf); err != nil {
   356  			log.Printf("reading archive: %v", err)
   357  			return nil
   358  		}
   359  		if nameLen == 0 || nameBuf[nameLen-1] != 0 {
   360  			log.Printf("reading archive: malformed entry")
   361  			return nil
   362  		}
   363  		name := string(nameBuf[:nameLen-1])
   364  
   365  		// The MKS cpio page says "TRAILER!!"
   366  		// but the Apple pkg files use "TRAILER!!!".
   367  		if name == "TRAILER!!!" {
   368  			break
   369  		}
   370  
   371  		fmode := fs.FileMode(mode & 0777)
   372  		if mode&040000 != 0 {
   373  			fmode |= fs.ModeDir
   374  		}
   375  
   376  		data, err := io.ReadAll(io.LimitReader(br, size))
   377  		if err != nil {
   378  			log.Printf("reading archive: %v", err)
   379  			return nil
   380  		}
   381  		if size != int64(len(data)) {
   382  			log.Printf("reading archive: short file")
   383  			return nil
   384  		}
   385  
   386  		if fmode&fs.ModeDir != 0 {
   387  			continue
   388  		}
   389  
   390  		if fix != nil {
   391  			data = fix(log, name, data)
   392  			size = int64(len(data))
   393  		}
   394  		ix[name] = &CpioFile{name, fmode, size, SHA256(data)}
   395  	}
   396  	return ix
   397  }
   398  
   399  // allOctal reports whether x is entirely ASCII octal digits.
   400  func allOctal(x []byte) bool {
   401  	for _, b := range x {
   402  		if b < '0' || '7' < b {
   403  			return false
   404  		}
   405  	}
   406  	return true
   407  }
   408  
   409  // DiffArchive diffs the archives 'rebuild' and 'posted' based on their indexes.
   410  // It reports to log any files that appear only in one or the other.
   411  // For files that appear in both, DiffArchive calls check, which should
   412  // log any differences found and report whether the files match.
   413  // It reports whether the archives match.
   414  // If either of rebuild or posted is nil, DiffArchive returns false without logging,
   415  // assuming that the code that returned the nil archive took care of reporting the problem.
   416  func DiffArchive[File1, File2 any](log *Log,
   417  	rebuilt map[string]File1, posted map[string]File2,
   418  	check func(*Log, File1, File2) bool) bool {
   419  
   420  	if rebuilt == nil || posted == nil {
   421  		return false
   422  	}
   423  
   424  	// Build list of all names; will have duplicates.
   425  	var names []string
   426  	for name := range rebuilt {
   427  		names = append(names, name)
   428  	}
   429  	for name := range posted {
   430  		names = append(names, name)
   431  	}
   432  	sort.Strings(names)
   433  
   434  	match := true
   435  	for _, name := range names {
   436  		fr, okr := rebuilt[name]
   437  		fp, okp := posted[name]
   438  		if !okr && !okp { // duplicate name
   439  			continue
   440  		}
   441  		if !okp {
   442  			log.Printf("%s: missing from posted archive", name)
   443  			match = false
   444  			continue
   445  		}
   446  		if !okr {
   447  			log.Printf("%s: unexpected file in posted archive", name)
   448  			match = false
   449  			continue
   450  		}
   451  		delete(rebuilt, name)
   452  		delete(posted, name)
   453  
   454  		if !check(log, fr, fp) {
   455  			match = false
   456  		}
   457  	}
   458  	return match
   459  }
   460  
   461  // DiffTarGz diffs the tgz files rebuilt and posted, reporting any differences to log
   462  // and applying fix to files before comparing them.
   463  // It reports whether the archives match.
   464  func DiffTarGz(log *Log, rebuilt, posted []byte, fix Fixer) bool {
   465  	n := 0
   466  	check := func(log *Log, rebuilt, posted *TarFile) bool {
   467  		match := true
   468  		name := rebuilt.Name
   469  		field := func(what string, rebuilt, posted any) {
   470  			if posted != rebuilt {
   471  				if n++; n <= 100 {
   472  					log.Printf("%s: rebuilt %s = %v, posted = %v", name, what, rebuilt, posted)
   473  				} else if n == 101 {
   474  					log.Printf("eliding additional diffs ...")
   475  				}
   476  				match = false
   477  			}
   478  		}
   479  		r := rebuilt
   480  		p := posted
   481  		field("typeflag", r.Typeflag, p.Typeflag)
   482  		field("linkname", r.Linkname, p.Linkname)
   483  		field("mode", r.Mode, p.Mode)
   484  		field("uid", r.Uid, p.Uid)
   485  		field("gid", r.Gid, p.Gid)
   486  		field("uname", r.Uname, p.Uname)
   487  		field("gname", r.Gname, p.Gname)
   488  		field("mtime", r.ModTime, p.ModTime)
   489  		field("atime", r.AccessTime, p.AccessTime)
   490  		field("ctime", r.ChangeTime, p.ChangeTime)
   491  		field("devmajor", r.Devmajor, p.Devmajor)
   492  		field("devminor", r.Devminor, p.Devminor)
   493  		for k, vhdr := range r.PAXRecords {
   494  			field("PAX:"+k, vhdr, p.PAXRecords[k])
   495  		}
   496  		for k, vf := range p.PAXRecords {
   497  			if vhdr, ok := r.PAXRecords[k]; !ok {
   498  				field("PAX:"+k, vhdr, vf)
   499  			}
   500  		}
   501  		field("format", r.Format, p.Format)
   502  		field("size", r.Size, p.Size)
   503  		field("content", r.SHA256, p.SHA256)
   504  		return match
   505  	}
   506  
   507  	return DiffArchive(log, IndexTarGz(log, rebuilt, fix), IndexTarGz(log, posted, fix), check)
   508  }
   509  
   510  // DiffZip diffs the zip files rebuilt and posted, reporting any differences to log
   511  // and applying fix to files before comparing them.
   512  // It reports whether the archives match.
   513  func DiffZip(log *Log, rebuilt, posted []byte, fix Fixer) bool {
   514  	n := 0
   515  	check := func(log *Log, rebuilt, posted *ZipFile) bool {
   516  		match := true
   517  		name := rebuilt.Name
   518  		field := func(what string, rebuilt, posted any) {
   519  			if posted != rebuilt {
   520  				if n++; n <= 100 {
   521  					log.Printf("%s: rebuilt %s = %v, posted = %v", name, what, rebuilt, posted)
   522  				} else if n == 101 {
   523  					log.Printf("eliding additional diffs ...")
   524  				}
   525  				match = false
   526  			}
   527  		}
   528  		r := rebuilt
   529  		p := posted
   530  
   531  		field("comment", r.Comment, p.Comment)
   532  		field("nonutf8", r.NonUTF8, p.NonUTF8)
   533  		field("creatorversion", r.CreatorVersion, p.CreatorVersion)
   534  		field("readerversion", r.ReaderVersion, p.ReaderVersion)
   535  		field("flags", r.Flags, p.Flags)
   536  		field("method", r.Method, p.Method)
   537  		// Older versions of Go produce unequal Modified times in archive/zip,
   538  		// presumably due to some kind of archive/zip parsing error,
   539  		// or perhaps due to the Extra field being doubled below.
   540  		// The problem does not happen with Go 1.20.
   541  		// To allow people to use older Go versions to run gorebuild,
   542  		// we only check the actual time instant, not the location, in Modified.
   543  		field("modifiedUnix", r.Modified.UnixNano(), p.Modified.UnixNano())
   544  		field("mtime", r.ModifiedTime, p.ModifiedTime)
   545  		field("mdate", r.ModifiedDate, p.ModifiedDate)
   546  		if len(p.Extra) == 2*len(r.Extra) && string(p.Extra) == string(r.Extra)+string(r.Extra) {
   547  			// Mac signing rewrites the zip file, which ends up doubling
   548  			// the Extra field due to go.dev/issue/61572.
   549  			// Allow that.
   550  		} else {
   551  			field("extra", fmt.Sprintf("%x", r.Extra), fmt.Sprintf("%x", p.Extra))
   552  		}
   553  		field("crc32", r.CRC32, p.CRC32)
   554  		field("xattrs", r.ExternalAttrs, p.ExternalAttrs)
   555  		field("usize32", r.UncompressedSize, p.UncompressedSize)
   556  		field("usize64", r.UncompressedSize64, p.UncompressedSize64)
   557  		field("content", r.SHA256, p.SHA256)
   558  		return match
   559  	}
   560  
   561  	return DiffArchive(log, IndexZip(log, rebuilt, fix), IndexZip(log, posted, fix), check)
   562  }