github.com/maruel/nin@v0.0.0-20220112143044-f35891e3ce7e/deps_log.go (about)

     1  // Copyright 2012 Google Inc. All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package nin
    16  
    17  import (
    18  	"bufio"
    19  	"bytes"
    20  	"encoding/binary"
    21  	"errors"
    22  	"fmt"
    23  	"io/ioutil"
    24  	"os"
    25  )
    26  
// Deps is the reading (startup-time) struct.
//
// It holds the dependency information recorded for a single output.
type Deps struct {
	// MTime is the timestamp stored alongside the dependency list.
	MTime TimeStamp
	// Nodes is the recorded list of input nodes.
	Nodes []*Node
}
    32  
    33  // NewDeps returns an initialized Deps.
    34  func NewDeps(mtime TimeStamp, nodeCount int) *Deps {
    35  	return &Deps{
    36  		MTime: mtime,
    37  		Nodes: make([]*Node, nodeCount),
    38  	}
    39  }
    40  
// DepsLog represents a .ninja_deps log file to accelerate incremental build.
//
// As build commands run they can output extra dependency information
// (e.g. header dependencies for C source) dynamically. DepsLog collects
// that information at build time and uses it for subsequent builds.
//
// The on-disk format is based on two primary design constraints:
//
// - it must be written to as a stream (during the build, which may be
// interrupted);
//
// - it can be read all at once on startup. (Alternative designs, where
// it contains indexing information, were considered and discarded as
// too complicated to implement; if the file is small then reading it
// fully on startup is acceptable.)
//
// Here are some stats from the Windows Chrome dependency files, to
// help guide the design space. The total text in the files sums to
// 90mb so some compression is warranted to keep load-time fast.
// There's about 10k files worth of dependencies that reference about
// 40k total paths totalling 2mb of unique strings.
//
// Based on these stats, here's the current design.
//
// The file is structured as version header followed by a sequence of records.
// Each record is either a path string or a dependency list.
// Numbering the path strings in file order gives them dense integer ids.
// A dependency list maps an output id to a list of input ids.
//
// Concretely, a record is:
//    four bytes record length, high bit indicates record type
//      (but max record sizes are capped at 512kB)
//    path records contain the string name of the path, followed by up to 3
//      padding bytes to align on 4 byte boundaries, followed by the
//      one's complement of the expected index of the record (to detect
//      concurrent writes of multiple ninja processes to the log).
//    dependency records are an array of 4-byte integers
//      [output path id,
//       output path mtime (lower 4 bytes), output path mtime (upper 4 bytes),
//       input path id, input path id...]
//      (The mtime is compared against the on-disk output path mtime
//      to verify the stored data is up-to-date.)
// If two records reference the same output the latter one in the file
// wins, allowing updates to just be appended to the file.  A separate
// repacking step can run occasionally to remove dead records.
type DepsLog struct {
	// Maps id -> Node.
	Nodes []*Node
	// Maps id -> Deps of that id.
	Deps []*Deps

	// filePath is the pending log path set by OpenForWrite; it is cleared by
	// openForWriteIfNeeded once the file has actually been opened.
	filePath string
	// file is the open log handle, or nil while the log is not open.
	file *os.File
	// buf buffers writes to file; it is flushed after every record.
	buf *bufio.Writer
	// needsRecompaction is set by Load when the log holds too many dead
	// records; OpenForWrite then calls Recompact before writing.
	needsRecompaction bool
}
    97  
// The version is stored as 4 bytes after the signature and also serves as a
// byte order mark. Signature and version combined are 16 bytes long.
const (
	// depsLogFileSignature is the 12-byte text marker that starts the file.
	depsLogFileSignature = "# ninjadeps\n"
	// depsLogCurrentVersion is the only format version Load accepts.
	depsLogCurrentVersion = uint32(4)
)
   104  
// Record size is currently limited to less than the full 32 bit, due to
// internal buffers having to have this size.
//
// The limit applies to the record payload, i.e. it excludes the 4-byte size
// prefix; the write buffer is allocated with maxRecordSize+1 bytes.
const maxRecordSize = (1 << 19) - 1
   108  
   109  // OpenForWrite prepares writing to the log file without actually opening it -
   110  // that will happen when/if it's needed.
   111  func (d *DepsLog) OpenForWrite(path string) error {
   112  	if d.needsRecompaction {
   113  		if err := d.Recompact(path); err != nil {
   114  			return err
   115  		}
   116  	}
   117  
   118  	if d.file != nil {
   119  		panic("M-A")
   120  	}
   121  	// we don't actually open the file right now, but will do
   122  	// so on the first write attempt
   123  	d.filePath = path
   124  	return nil
   125  }
   126  
   127  func (d *DepsLog) recordDeps(node *Node, mtime TimeStamp, nodes []*Node) error {
   128  	nodeCount := len(nodes)
   129  	// Track whether there's any new data to be recorded.
   130  	madeChange := false
   131  
   132  	// Assign ids to all nodes that are missing one.
   133  	if node.ID < 0 {
   134  		if err := d.recordID(node); err != nil {
   135  			return err
   136  		}
   137  		madeChange = true
   138  	}
   139  	for i := 0; i < nodeCount; i++ {
   140  		if nodes[i].ID < 0 {
   141  			if err := d.recordID(nodes[i]); err != nil {
   142  				return err
   143  			}
   144  			madeChange = true
   145  		}
   146  	}
   147  
   148  	// See if the new data is different than the existing data, if any.
   149  	if !madeChange {
   150  		deps := d.GetDeps(node)
   151  		if deps == nil || deps.MTime != mtime || len(deps.Nodes) != nodeCount {
   152  			madeChange = true
   153  		} else {
   154  			for i := 0; i < nodeCount; i++ {
   155  				if deps.Nodes[i] != nodes[i] {
   156  					madeChange = true
   157  					break
   158  				}
   159  			}
   160  		}
   161  	}
   162  
   163  	// Don't write anything if there's no new info.
   164  	if !madeChange {
   165  		return nil
   166  	}
   167  
   168  	// Update on-disk representation.
   169  	size := uint32(4 * (1 + 2 + nodeCount))
   170  	if size > maxRecordSize {
   171  		return errors.New("too many dependencies")
   172  	}
   173  	if err := d.openForWriteIfNeeded(); err != nil {
   174  		return err
   175  	}
   176  	size |= 0x80000000 // Deps record: set high bit.
   177  
   178  	if err := binary.Write(d.buf, binary.LittleEndian, size); err != nil {
   179  		return err
   180  	}
   181  	if err := binary.Write(d.buf, binary.LittleEndian, uint32(node.ID)); err != nil {
   182  		return err
   183  	}
   184  	if err := binary.Write(d.buf, binary.LittleEndian, uint64(mtime)); err != nil {
   185  		return err
   186  	}
   187  	for i := 0; i < nodeCount; i++ {
   188  		if err := binary.Write(d.buf, binary.LittleEndian, uint32(nodes[i].ID)); err != nil {
   189  			return err
   190  		}
   191  	}
   192  	if err := d.buf.Flush(); err != nil {
   193  		return err
   194  	}
   195  
   196  	// Update in-memory representation.
   197  	deps := NewDeps(mtime, nodeCount)
   198  	for i := 0; i < nodeCount; i++ {
   199  		deps.Nodes[i] = nodes[i]
   200  	}
   201  	d.updateDeps(node.ID, deps)
   202  	return nil
   203  }
   204  
   205  // Close closes the file handle.
   206  func (d *DepsLog) Close() error {
   207  	// create the file even if nothing has been recorded
   208  	if err := d.openForWriteIfNeeded(); err != nil {
   209  		return err
   210  	}
   211  	var err error
   212  	if d.file != nil {
   213  		if err2 := d.buf.Flush(); err2 != nil {
   214  			err = err2
   215  		}
   216  		if err2 := d.file.Close(); err2 != nil {
   217  			err = err2
   218  		}
   219  	}
   220  	d.buf = nil
   221  	d.file = nil
   222  	return err
   223  }
   224  
   225  // Load loads a .ninja_deps to accelerate incremental build.
   226  //
   227  // Note: For version differences, this should migrate to the new format.
   228  // But the v1 format could sometimes (rarely) end up with invalid data, so
   229  // don't migrate v1 to v3 to force a rebuild. (v2 only existed for a few days,
   230  // and there was no release with it, so pretend that it never happened.)
   231  //
   232  // Warning: the whole file content is kept alive.
   233  //
   234  // TODO(maruel): Make it an option so that when used as a library it doesn't
   235  // become a memory bloat. This is especially important when recompacting.
   236  func (d *DepsLog) Load(path string, state *State) (LoadStatus, error) {
   237  	defer metricRecord(".ninja_deps load")()
   238  	// Read the file all at once. The drawback is that it will fail hard on 32
   239  	// bits OS on large builds. This should be rare in 2022. For small builds, it
   240  	// will be fine (and faster).
   241  	data, err := ioutil.ReadFile(path)
   242  	if err != nil {
   243  		if os.IsNotExist(err) {
   244  			return LoadNotFound, err
   245  		}
   246  		return LoadError, err
   247  	}
   248  
   249  	// Validate header.
   250  	validHeader := false
   251  	version := uint32(0)
   252  	if len(data) >= len(depsLogFileSignature)+4 && unsafeString(data[:len(depsLogFileSignature)]) == depsLogFileSignature {
   253  		version = binary.LittleEndian.Uint32(data[len(depsLogFileSignature):])
   254  		validHeader = version == depsLogCurrentVersion
   255  	}
   256  	if !validHeader {
   257  		// Don't report this as a failure.  An empty deps log will cause
   258  		// us to rebuild the outputs anyway.
   259  		_ = os.Remove(path)
   260  		if version == 1 {
   261  			return LoadSuccess, errors.New("deps log version change; rebuilding")
   262  		}
   263  		l := bytes.IndexByte(data[:], 0)
   264  		if l <= 0 {
   265  			return LoadSuccess, errors.New("bad deps log signature or version; starting over")
   266  		}
   267  		return LoadSuccess, fmt.Errorf("bad deps log signature %q or version %d; starting over", data[:l], version)
   268  	}
   269  
   270  	// Skip the header.
   271  	// TODO(maruel): Calculate if it is faster to do "data = data[4:8]" or use
   272  	// "data[offset+4:offset+8]".
   273  	// Offset is kept to keep the last successful read, to truncate in case of
   274  	// failure.
   275  	offset := int64(len(depsLogFileSignature) + 4)
   276  	data = data[offset:]
   277  	uniqueDepRecordCount := 0
   278  	totalDepRecordCount := 0
   279  	for len(data) != 0 {
   280  		// A minimal record is size (4 bytes) plus one of:
   281  		// - content (>=4 + checksum(4)); CanonicalizePath() rejects empty paths.
   282  		// - (id(4)+mtime(8)+nodes(4x) >12) for deps node.
   283  		if len(data) < 12 {
   284  			err = fmt.Errorf("premature end of file after %d bytes", int(offset)+len(data))
   285  			break
   286  		}
   287  		size := binary.LittleEndian.Uint32(data[:4])
   288  		// Skip |size|. Only bump offset after a successful read down below.
   289  		isDeps := size&0x80000000 != 0
   290  		size = size & ^uint32(0x80000000)
   291  		data = data[4:]
   292  		if len(data) < int(size) {
   293  			err = fmt.Errorf("premature end of file after %d bytes", int(offset)+len(data)+4)
   294  			break
   295  		}
   296  		if size%4 != 0 || size < 8 || size > maxRecordSize {
   297  			// It'd be nice to do a check for "size < 12" instead. The likelihood of
   298  			// a path with 3 characters or less is very small.
   299  			err = fmt.Errorf("record size %d is out of bounds", size)
   300  			break
   301  		}
   302  		if isDeps {
   303  			if size < 12 {
   304  				err = errors.New("record size is too small for deps")
   305  				break
   306  			}
   307  			outID := int32(binary.LittleEndian.Uint32(data[:4]))
   308  			if outID < 0 || outID >= 0x1000000 {
   309  				// That's a lot of nodes.
   310  				err = errors.New("record deps id is out of bounds")
   311  				break
   312  			}
   313  			mtime := TimeStamp(binary.LittleEndian.Uint64(data[4:12]))
   314  			depsCount := int(size-12) / 4
   315  
   316  			// TODO(maruel): Redesign to reduce bound checks.
   317  			deps := NewDeps(mtime, depsCount)
   318  			x := 12
   319  			for i := 0; i < depsCount; i++ {
   320  				v := binary.LittleEndian.Uint32(data[x : x+4])
   321  				if int(v) >= len(d.Nodes) || d.Nodes[v] == nil {
   322  					err = errors.New("record deps node id is out of bounds")
   323  					break
   324  				}
   325  				deps.Nodes[i] = d.Nodes[v]
   326  				x += 4
   327  			}
   328  
   329  			totalDepRecordCount++
   330  			if !d.updateDeps(outID, deps) {
   331  				uniqueDepRecordCount++
   332  			}
   333  		} else {
   334  			pathSize := size - 4
   335  			// There can be up to 3 bytes of padding.
   336  			if data[pathSize-1] == '\x00' {
   337  				pathSize--
   338  				if data[pathSize-1] == '\x00' {
   339  					pathSize--
   340  					if data[pathSize-1] == '\x00' {
   341  						pathSize--
   342  					}
   343  				}
   344  			}
   345  
   346  			// TODO(maruel): We need to differentiate if we are using the GC or not.
   347  			// When the GC is disabled, #YOLO, the buffer will never go away anyway
   348  			// so better to leverage it!
   349  			subpath := unsafeString(data[:pathSize])
   350  			// Here we make a copy, because we do not want to keep a reference to the
   351  			// read buffer.
   352  			// subpath := string(data[:pathSize])
   353  
   354  			// It is not necessary to pass in a correct slashBits here. It will
   355  			// either be a Node that's in the manifest (in which case it will already
   356  			// have a correct slashBits that GetNode will look up), or it is an
   357  			// implicit dependency from a .d which does not affect the build command
   358  			// (and so need not have its slashes maintained).
   359  			node := state.GetNode(subpath, 0)
   360  
   361  			// Check that the expected index matches the actual index. This can only
   362  			// happen if two ninja processes write to the same deps log concurrently.
   363  			// (This uses unary complement to make the checksum look less like a
   364  			// dependency record entry.)
   365  			checksum := binary.LittleEndian.Uint32(data[size-4 : size])
   366  			expectedID := ^checksum
   367  			id := int32(len(d.Nodes))
   368  			if id != int32(expectedID) {
   369  				err = errors.New("node id checksum is invalid")
   370  				break
   371  			}
   372  			if node.ID >= 0 {
   373  				err = errors.New("node is duplicate")
   374  				break
   375  			}
   376  			node.ID = id
   377  			d.Nodes = append(d.Nodes, node)
   378  		}
   379  		// Register the successful read.
   380  		data = data[size:]
   381  		offset += int64(size) + 4
   382  	}
   383  
   384  	if err != nil {
   385  		// An error occurred while loading; try to recover by truncating the
   386  		// file to the last fully-read record.
   387  		if err2 := os.Truncate(path, offset); err2 != nil {
   388  			return LoadError, fmt.Errorf("truncating failed while parsing error %q: %w", err, err2)
   389  		}
   390  
   391  		// The truncate succeeded; we'll just report the load error as a
   392  		// warning because the build can proceed.
   393  		err = errors.New(err.Error() + "; recovering")
   394  		return LoadSuccess, err
   395  	}
   396  
   397  	// Rebuild the log if there are too many dead records.
   398  	const minCompactionEntryCount = 1000
   399  	kCompactionRatio := 3
   400  	if totalDepRecordCount > minCompactionEntryCount && totalDepRecordCount > uniqueDepRecordCount*kCompactionRatio {
   401  		d.needsRecompaction = true
   402  	}
   403  	return LoadSuccess, nil
   404  }
   405  
   406  // GetDeps returns the Deps for this node ID.
   407  //
   408  // Silently ignore invalid node ID.
   409  func (d *DepsLog) GetDeps(node *Node) *Deps {
   410  	// Abort if the node has no id (never referenced in the deps) or if
   411  	// there's no deps recorded for the node.
   412  	if node.ID < 0 || int(node.ID) >= len(d.Deps) {
   413  		return nil
   414  	}
   415  	return d.Deps[node.ID]
   416  }
   417  
   418  // GetFirstReverseDepsNode returns something?
   419  //
   420  // TODO(maruel): Understand better.
   421  func (d *DepsLog) GetFirstReverseDepsNode(node *Node) *Node {
   422  	for id := 0; id < len(d.Deps); id++ {
   423  		deps := d.Deps[id]
   424  		if deps == nil {
   425  			continue
   426  		}
   427  		for _, n := range deps.Nodes {
   428  			if n == node {
   429  				return d.Nodes[id]
   430  			}
   431  		}
   432  	}
   433  	return nil
   434  }
   435  
   436  // Recompact rewrites the known log entries, throwing away old data.
   437  func (d *DepsLog) Recompact(path string) error {
   438  	defer metricRecord(".ninja_deps recompact")()
   439  
   440  	if err := d.Close(); err != nil {
   441  		return err
   442  	}
   443  	tempPath := path + ".recompact"
   444  
   445  	// OpenForWrite() opens for append.  Make sure it's not appending to a
   446  	// left-over file from a previous recompaction attempt that crashed somehow.
   447  	if err := os.Remove(tempPath); err != nil && !os.IsNotExist(err) {
   448  		return err
   449  	}
   450  
   451  	// Create a new temporary log to regenerate everything.
   452  	newLog := DepsLog{}
   453  	if err := newLog.OpenForWrite(tempPath); err != nil {
   454  		return err
   455  	}
   456  
   457  	// Clear all known ids so that new ones can be reassigned.  The new indices
   458  	// will refer to the ordering in newLog, not in the current log.
   459  	for _, i := range d.Nodes {
   460  		i.ID = -1
   461  	}
   462  
   463  	// Write out all deps again.
   464  	for oldID := 0; oldID < len(d.Deps); oldID++ {
   465  		deps := d.Deps[oldID]
   466  		if deps == nil { // If nodes[oldID] is a leaf, it has no deps.
   467  			continue
   468  		}
   469  
   470  		if !d.IsDepsEntryLiveFor(d.Nodes[oldID]) {
   471  			continue
   472  		}
   473  
   474  		if err := newLog.recordDeps(d.Nodes[oldID], deps.MTime, deps.Nodes); err != nil {
   475  			_ = newLog.Close()
   476  			return err
   477  		}
   478  	}
   479  
   480  	if err := newLog.Close(); err != nil {
   481  		return err
   482  	}
   483  
   484  	// All nodes now have ids that refer to newLog, so steal its data.
   485  	d.Deps = newLog.Deps
   486  	d.Nodes = newLog.Nodes
   487  
   488  	if err := os.Remove(path); err != nil {
   489  		return err
   490  	}
   491  	return os.Rename(tempPath, path)
   492  }
   493  
   494  // IsDepsEntryLiveFor returns if the deps entry for a node is still reachable
   495  // from the manifest.
   496  //
   497  // The deps log can contain deps entries for files that were built in the
   498  // past but are no longer part of the manifest.  This function returns if
   499  // this is the case for a given node.  This function is slow, don't call
   500  // it from code that runs on every build.
   501  func (d *DepsLog) IsDepsEntryLiveFor(node *Node) bool {
   502  	// Skip entries that don't have in-edges or whose edges don't have a
   503  	// "deps" attribute. They were in the deps log from previous builds, but
   504  	// the the files they were for were removed from the build and their deps
   505  	// entries are no longer needed.
   506  	// (Without the check for "deps", a chain of two or more nodes that each
   507  	// had deps wouldn't be collected in a single recompaction.)
   508  	return node.InEdge != nil && node.InEdge.GetBinding("deps") != ""
   509  }
   510  
   511  // Updates the in-memory representation.  Takes ownership of |deps|.
   512  // Returns true if a prior deps record was deleted.
   513  func (d *DepsLog) updateDeps(outID int32, deps *Deps) bool {
   514  	if n := int(outID) + 1 - len(d.Deps); n > 0 {
   515  		d.Deps = append(d.Deps, make([]*Deps, n)...)
   516  	}
   517  	existed := d.Deps[outID] != nil
   518  	d.Deps[outID] = deps
   519  	return existed
   520  }
   521  
// zeroBytes is the source of the NUL padding bytes that recordID appends to
// path records to align them on a 4 byte boundary.
var zeroBytes [4]byte
   523  
   524  // Write a node name record, assigning it an id.
   525  func (d *DepsLog) recordID(node *Node) error {
   526  	if node.Path == "" {
   527  		return errors.New("node.Path is empty")
   528  	}
   529  	pathSize := len(node.Path)
   530  	padding := (4 - pathSize%4) % 4 // Pad path to 4 byte boundary.
   531  
   532  	size := uint32(pathSize + padding + 4)
   533  	if size > maxRecordSize {
   534  		return errors.New("node.Path is too long")
   535  	}
   536  	if err := d.openForWriteIfNeeded(); err != nil {
   537  		return nil
   538  	}
   539  	if err := binary.Write(d.buf, binary.LittleEndian, size); err != nil {
   540  		return nil
   541  	}
   542  	if _, err := d.buf.WriteString(node.Path); err != nil {
   543  		return nil
   544  	}
   545  	if padding != 0 {
   546  		if _, err := d.buf.Write(zeroBytes[:padding]); err != nil {
   547  			return nil
   548  		}
   549  	}
   550  	id := int32(len(d.Nodes))
   551  	checksum := ^uint32(id)
   552  	if err := binary.Write(d.buf, binary.LittleEndian, checksum); err != nil {
   553  		return nil
   554  	}
   555  	if err := d.buf.Flush(); err != nil {
   556  		return nil
   557  	}
   558  	node.ID = id
   559  	d.Nodes = append(d.Nodes, node)
   560  	return nil
   561  }
   562  
   563  // openForWriteIfNeeded should be called before using file.
   564  func (d *DepsLog) openForWriteIfNeeded() error {
   565  	if d.filePath == "" {
   566  		return nil
   567  	}
   568  	if d.file != nil {
   569  		panic("surprising state")
   570  	}
   571  	var err error
   572  	d.file, err = os.OpenFile(d.filePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o666)
   573  	if err != nil {
   574  		return err
   575  	}
   576  	// Set the buffer size large and flush the file buffer after every record to
   577  	// make sure records aren't written partially.
   578  	d.buf = bufio.NewWriterSize(d.file, maxRecordSize+1)
   579  
   580  	// Opening a file in append mode doesn't set the file pointer to the file's
   581  	// end on Windows. Do that explicitly.
   582  	offset, err := d.file.Seek(0, os.SEEK_END)
   583  	if err != nil {
   584  		return err
   585  	}
   586  
   587  	if offset == 0 {
   588  		if _, err = d.buf.WriteString(depsLogFileSignature); err != nil {
   589  			return err
   590  		}
   591  		if err = binary.Write(d.buf, binary.LittleEndian, depsLogCurrentVersion); err != nil {
   592  			return err
   593  		}
   594  	}
   595  	if err = d.buf.Flush(); err != nil {
   596  		return err
   597  	}
   598  	d.filePath = ""
   599  	return nil
   600  }