github.com/quay/claircore@v1.5.28/rpm/bdb/bdb.go

package bdb

import (
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"

	"github.com/quay/zlog"
)

// PackageDB is the "pkgdb" a.k.a. "Packages", the raw package data.
type PackageDB struct {
	r   io.ReaderAt
	ord binary.ByteOrder
	m   hashmeta
}

// Parse closes over the provided [io.ReaderAt] and readies the provided PackageDB.
func (db *PackageDB) Parse(r io.ReaderAt) error {
	const (
		hashmagic   = 0x00061561
		hashmagicBE = 0x61150600
	)
	db.ord = binary.LittleEndian
Again:
	pg := io.NewSectionReader(r, 0, 512)
	if err := binary.Read(pg, db.ord, &db.m); err != nil {
		return err
	}
	if db.m.Magic == hashmagicBE {
		// Swap, try again.
		db.ord = binary.BigEndian
		goto Again
	}

	if db.m.Magic != hashmagic {
		return fmt.Errorf("bdb: nonsense magic: %08x", db.m.Magic)
	}
	if db.m.Type != pagetypeHashMeta {
		return fmt.Errorf("bdb: nonsense page type: %08x", db.m.Type)
	}
	if db.m.EncryptAlg != 0 { // none
		return errors.New("bdb: database encryption not supported")
	}
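	// Valid page sizes are powers of two from 512 to 65536 bytes,
	// i.e. (1<<0)*512 through (1<<7)*512.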
	ok := false
	for i := 0; i < 8; i++ {
		var sz uint32 = (1 << i) * 512
		if db.m.PageSize == sz {
			ok = true
			break
		}
	}
	if !ok {
		return fmt.Errorf("bdb: nonsense page size: %d", db.m.PageSize)
	}

	db.r = r
	return nil
}
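
// A minimal usage sketch (not part of the original source; the file name and
// error handling are illustrative assumptions):
//
//	f, err := os.Open("Packages") // hypothetical path to an rpm pkgdb
//	if err != nil {
//		return err
//	}
//	defer f.Close()
//	var db PackageDB
//	if err := db.Parse(f); err != nil {
//		return err
//	}
//	hdrs, err := db.AllHeaders(context.Background())
//	// Each element of hdrs is an io.ReaderAt over one raw RPM header.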

/*
Some terminology:

- LSN:
  Log Sequence Number -- used by the write-ahead log to detect stale pages
  during recovery. This package ignores it.

Note that the page type always falls in byte 25 -- very clever.
Don't freak out if it looks like the first page is read multiple ways; it is.

See also: libdb's src/dbinc/db_page.h
*/

// meta is the generic metadata, aka DBMETA in C.
type meta struct {
	LSN         [8]byte  /* 00-07: LSN. */
	PageNo      uint32   /* 08-11: Current page number. */
	Magic       uint32   /* 12-15: Magic number. */
	Version     uint32   /* 16-19: Version. */
	PageSize    uint32   /* 20-23: Pagesize. */
	EncryptAlg  byte     /*    24: Encryption algorithm. */
	Type        byte     /*    25: Page type. */
	Metaflags   byte     /*    26: Meta-only flags. */
	_           byte     /*    27: Unused. */
	Free        uint32   /* 28-31: Free list page number. */
	LastPageNo  uint32   /* 32-35: Page number of last page in db. */
	NParts      uint32   /* 36-39: Number of partitions. */
	KeyCount    uint32   /* 40-43: Cached key count. */
	RecordCount uint32   /* 44-47: Cached record count. */
	Flags       uint32   /* 48-51: Flags: unique to each AM. */
	UID         [20]byte /* 52-71: Unique file ID. */
}

// Pagetype numbers:
const (
	pagetypeHashMeta     = 8
	pagetypeHashUnsorted = 2
	pagetypeHash         = 13
	pagetypeHashOffIndex = 3
	pagetypeOverflow     = 7
	pagetypeKeyData      = 1 // Disused, we never examine the keys.
)

// Serialized sizes:
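// These are the on-disk byte counts of the hashpage and hashoffpage structs
// below, used when slicing data out of pages.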
const (
	hashpageSize    = 26
	hashoffpageSize = 12
)

// Hash database metadata, aka HMETA in C.
type hashmeta struct {
	meta                   /* 00-71: Generic meta-data page header. */
	MaxBucket   uint32     /* 72-75: ID of Maximum bucket in use */
	HighMask    uint32     /* 76-79: Modulo mask into table */
	LowMask     uint32     /* 80-83: Modulo mask into table lower half */
	FillFactor  uint32     /* 84-87: Fill factor */
	NElem       uint32     /* 88-91: Number of keys in hash table */
	HashCharKey uint32     /* 92-95: Value of hash(CHARKEY) */
	_           [32]uint32 /* 96-223: Spare pages for overflow */
	_           [59]uint32 /* 224-459: Unused space */
	CryptoMagic uint32     /* 460-463: Crypto magic number */
	_           [3]uint32  /* 464-475: Trash space - Do not use */
	// The comments don't line up, but the numbers come from the source, so...
	IV       [16]byte /* 476-495: Crypto IV */
	Checksum [20]byte /* 496-511: Page chksum */
}
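
// Note that the fields above total exactly 512 bytes, which is why Parse
// reads a 512-byte section into a hashmeta.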

// Hash page header, aka PAGE in C.
//
// Also shared with btree databases, which are unimplemented here.
// A page of [meta.PageSize] bytes has this struct at position 0. Structured
// data is populated backwards from the end of the page; binary data starts
// immediately after this header.
type hashpage struct {
	LSN            [8]byte /* 00-07: Log sequence number. */
	PageNo         uint32  /* 08-11: Current page number. */
	PrevPageNo     uint32  /* 12-15: Previous page number. */
	NextPageNo     uint32  /* 16-19: Next page number. */
	Entries        uint16  /* 20-21: Number of items on the page. */
	HighFreeOffset uint16  /* 22-23: High free byte page offset. */
	Level          byte    /*    24: Btree tree level. */
	Type           byte    /*    25: Page type. */
}
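
// A rough sketch of the layout just described (not from the original source):
//
//	+---------------------+-------------------+------+--------------------+
//	| hashpage (26 bytes) | entry offsets ... | free | ... item data      |
//	|                     | (grow forward)    |      | (grows backward)   |
//	+---------------------+-------------------+------+--------------------+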

// Hash page entries.
//
// This data structure doesn't appear directly in the C source, but open a file
// in a hex editor and it's apparent. The comments mention that "For hash and
// btree leaf pages, index items are paired, e.g., inp[0] is the key for
// inp[1]'s data." I think this is just a codification of that.
//
// We never bother looking up the key. If access to a single, specific header
// were needed, the code would have to handle it then.
type hashentry struct {
	Key  uint16
	Data uint16
}

// Hash offpage header, aka HOFFPAGE in C.
//
// This stores the data on how to extract "overflow"/"offpage" data.
type hashoffpage struct {
	Type   byte    /*    00: Page type and delete flag. */
	_      [3]byte /* 01-03: Padding, unused. */
	PageNo uint32  /* 04-07: Offpage page number. */
	Length uint32  /* 08-11: Total length of item. */
}
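
// An offpage item is reconstructed by starting at PageNo and following each
// overflow page's NextPageNo until it's zero, concatenating the data regions;
// Length is the total byte count expected. AllHeaders performs this walk.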

// AllHeaders returns ReaderAts for all RPM headers in the PackageDB.
func (db *PackageDB) AllHeaders(ctx context.Context) ([]io.ReaderAt, error) {
	var ret []io.ReaderAt
	pageSz := int64(db.m.PageSize)
	for n, lim := int64(0), int64(db.m.LastPageNo)+1; n < lim; n++ {
		pg := io.NewSectionReader(db.r, n*pageSz, pageSz)
		var h hashpage
		if err := binary.Read(pg, db.ord, &h); err != nil {
			return nil, fmt.Errorf("bdb: error reading hashpage: %w", err)
		}
		if h.Type != pagetypeHashUnsorted && h.Type != pagetypeHash {
			continue
		}
		if h.Entries%2 != 0 {
			return nil, errors.New("bdb: odd number of entries")
		}

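		// Entries come in key/data pairs (see hashentry), so there are
		// Entries/2 of them.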
		ent := make([]hashentry, int(h.Entries)/2)
		for i := range ent {
			if err := binary.Read(pg, db.ord, &ent[i]); err != nil {
				return nil, fmt.Errorf("bdb: error reading hash entry: %w", err)
			}
		}

		k := []byte{0x00}
		for _, e := range ent {
			off := int64(e.Data)
			// First, check what kind of hash entry this is.
			view := io.NewSectionReader(pg, off, hashoffpageSize)
			if _, err := view.ReadAt(k, 0); err != nil {
				return nil, fmt.Errorf("bdb: error peeking page type: %w", err)
			}
			if k[0] != pagetypeHashOffIndex {
				continue
			}
			// Read the page header, now that we know it's correct.
			var offpg hashoffpage
			if err := binary.Read(view, db.ord, &offpg); err != nil {
				return nil, fmt.Errorf("bdb: error reading hashoffpage: %w", err)
			}
			var r rope
			for n := offpg.PageNo; n != 0; {
				off := pageSz * int64(n)
				pg := io.NewSectionReader(db.r, off, pageSz)
				var h hashpage
				if err := binary.Read(pg, db.ord, &h); err != nil {
					return nil, fmt.Errorf("bdb: error reading hashpage: %w", err)
				}
				if h.Type != pagetypeOverflow {
					// A non-overflow page means the chain is corrupt; a bare
					// "continue" here would spin forever on the same page.
					return nil, fmt.Errorf("bdb: unexpected page type %d in overflow chain", h.Type)
				}
				off += hashpageSize

				var data *io.SectionReader
				if h.NextPageNo == 0 {
					// If this is the last page, only read to the end.
					data = io.NewSectionReader(db.r, off, int64(h.HighFreeOffset))
				} else {
					data = io.NewSectionReader(db.r, off, pageSz-hashpageSize)
				}
				if err := r.add(data); err != nil {
					return nil, fmt.Errorf("bdb: error adding to rope: %w", err)
				}
				n = h.NextPageNo
			}
			// Double-check we'll read the intended amount.
			if got, want := r.Size(), int64(offpg.Length); got != want {
				zlog.Info(ctx).
					Int64("got", got).
					Int64("want", want).
					Msg("bdb: expected data length botch")
			}
			ret = append(ret, &r)
		}
	}
	return ret, nil
}

// Validate is currently here to fulfil an interface.
func (db *PackageDB) Validate(_ context.Context) error {
	return nil
}

// rope provides an [io.ReaderAt] over an ordered slice of [io.SectionReader].
//
// It's much simpler than a real rope because it's append-only.
type rope struct {
	rd  []*io.SectionReader
	off []int64
}

var _ io.ReaderAt = (*rope)(nil)
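
// For example (an illustrative sketch, not from the original source): a rope
// over two 4-byte segments has off = [0, 4]; ReadAt(b, 6) starts at byte 2 of
// the second segment, and a read spanning offset 3 drains the tail of the
// first segment, then continues from the start of the second.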

// ReadAt implements [io.ReaderAt].
func (r *rope) ReadAt(b []byte, off int64) (int, error) {
	// Find the starting segment: the last one whose offset is at or before
	// "off".
	idx := 0
	for i, roff := range r.off {
		if roff > off {
			break
		}
		idx = i
	}

	// Read as many segments as needed:
	n := 0
	rdoff := off - r.off[idx] // offset into the reader at "idx"
	for {
		rn, err := r.rd[idx].ReadAt(b[n:], rdoff)
		n += rn
		switch {
		case errors.Is(err, nil):
		case errors.Is(err, io.EOF):
			idx++
			if idx != len(r.rd) {
				rdoff = 0 // Reading from the start, now that we're on the next one.
				continue
			}
			fallthrough
		default:
			return n, err
		}
		if n == len(b) {
			break
		}
	}
	return n, nil
}

// Size reports the total size of data that can be read from this rope.
func (r *rope) Size() (s int64) {
	for _, rd := range r.rd {
		s += rd.Size()
	}
	return s
}

// add appends the provided [io.SectionReader].
func (r *rope) add(rd *io.SectionReader) error {
	var off int64
	for _, seg := range r.rd {
		off += seg.Size()
	}
	r.rd = append(r.rd, rd)
	r.off = append(r.off, off)
	return nil
}