github.com/google/trillian-examples@v0.0.0-20240520080811-0d40d35cef0e/experimental/batchmap/sumdb/build/map.go

// Copyright 2020 Google LLC. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// map constructs a verifiable map from the modules in the Go SumDB.
package main

import (
	"context"
	"database/sql"
	"encoding/json"
	"flag"
	"fmt"
	"reflect"

	"github.com/apache/beam/sdks/v2/go/pkg/beam"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/io/databaseio"
	beamlog "github.com/apache/beam/sdks/v2/go/pkg/beam/log"
	"github.com/apache/beam/sdks/v2/go/pkg/beam/x/beamx"

	"github.com/golang/glog"

	"github.com/google/trillian/experimental/batchmap"

	"github.com/google/trillian-examples/experimental/batchmap/sumdb/build/pipeline"
	"github.com/google/trillian-examples/experimental/batchmap/sumdb/mapdb"

	_ "github.com/mattn/go-sqlite3"
)

var (
	sumDBString       = flag.String("sum_db", "", "The path of the SQLite file generated by sumdbaudit, e.g. ~/sum.db.")
	mapDBString       = flag.String("map_db", "", "Output database where the map tiles will be written.")
	treeID            = flag.Int64("tree_id", 12345, "The ID of the tree. Used as a salt in hashing.")
	prefixStrata      = flag.Int("prefix_strata", 2, "The number of 8-bit strata before the final stratum.")
	count             = flag.Int64("count", -1, "The total number of entries starting from the beginning of the SumDB to use, or -1 to use all.")
	batchSize         = flag.Int("write_batch_size", 250, "Number of tiles to write per batch.")
	incrementalUpdate = flag.Bool("incremental_update", false, "If set, the map tiles from the previous revision will be updated with the delta; otherwise the map is built from scratch each time.")
	buildVersionList  = flag.Bool("build_version_list", false, "If set, the map will also contain a mapping from each module to a log committing to its list of versions.")
)
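
// A typical invocation might look like the following (the paths are
// illustrative; beamx also wires up a --runner flag for selecting the Beam
// runner, defaulting to the direct runner):
//
//	go run . --sum_db=$HOME/sum.db --map_db=$HOME/map.db --prefix_strata=2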

func init() {
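	// Beam serializes DoFns so they can be shipped to pipeline workers, so
	// the concrete types and functions used in the ParDo calls below must
	// be registered up front.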
	beam.RegisterType(reflect.TypeOf((*tileToDBRowFn)(nil)).Elem())
	beam.RegisterFunction(tileFromDBRowFn)

	beam.RegisterType(reflect.TypeOf((*logToDBRowFn)(nil)).Elem())
}

func main() {
	flag.Parse()
	beam.Init()

	// Connect to where we will read from and write to.
	sumDB, err := newSumDBMirrorFromFlags()
	if err != nil {
		glog.Exitf("Failed to initialize from local SumDB: %v", err)
	}
	mapDB, rev, err := sinkFromFlags()
	if err != nil {
		glog.Exitf("Failed to initialize Map DB: %v", err)
	}

	pb := pipeline.NewMapBuilder(sumDB, *treeID, *prefixStrata, *buildVersionList)

	beamlog.SetLogger(&BeamGLogger{InfoLogAtVerbosity: 2})
	p, s := beam.NewPipelineWithRoot()

	var tiles, logs beam.PCollection
	var inputLogMetadata pipeline.InputLogMetadata
	if *incrementalUpdate {
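		// Incremental mode: read the tiles of the last map revision back
		// out of the map DB and update them with only the log entries
		// committed to since that revision's checkpoint.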
		lastMapRev, golden, startID, err := mapDB.LatestRevision()
		if err != nil {
			glog.Exitf("Failed to get LatestRevision: %v", err)
		}
		tileRows := databaseio.Query(s, "sqlite3", *mapDBString, fmt.Sprintf("SELECT * FROM tiles WHERE revision=%d", lastMapRev), reflect.TypeOf(MapTile{}))
		lastTiles := beam.ParDo(s, tileFromDBRowFn, tileRows)

		tiles, inputLogMetadata, err = pb.Update(s, lastTiles, pipeline.InputLogMetadata{
			Checkpoint: golden,
			Entries:    startID,
		}, *count)
		if err != nil {
			glog.Exitf("Failed to build Update pipeline: %v", err)
		}
	} else {
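		// From-scratch mode: build the whole map from the first --count
		// entries of the SumDB mirror (all of them if -1).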
		tiles, logs, inputLogMetadata, err = pb.Create(s, *count)
		if err != nil {
			glog.Exitf("Failed to build Create pipeline: %v", err)
		}
	}
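
	// Stamp each output tile with the new revision and write the rows to
	// the tiles table in batches.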
	tileRows := beam.ParDo(s.Scope("convertoutput"), &tileToDBRowFn{Revision: rev}, tiles)
	databaseio.WriteWithBatchSize(s.Scope("sink"), *batchSize, "sqlite3", *mapDBString, "tiles", []string{}, tileRows)

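	// Note: the logs PCollection is only populated by the Create path
	// above; pb.Update does not emit version logs.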
	if *buildVersionList {
		logRows := beam.ParDo(s, &logToDBRowFn{rev}, logs)
		databaseio.WriteWithBatchSize(s.Scope("sinkLogs"), *batchSize, "sqlite3", *mapDBString, "logs", []string{}, logRows)
	}

	// Everything above only constructs the pipeline; this executes it.
	if err := beamx.Run(context.Background(), p); err != nil {
		glog.Exitf("Failed to execute job: %v", err)
	}

	if err := mapDB.WriteRevision(rev, inputLogMetadata.Checkpoint, inputLogMetadata.Entries); err != nil {
		glog.Exitf("Failed to finalize map revision %d: %v", rev, err)
	}
}

func sinkFromFlags() (*mapdb.TileDB, int, error) {
	if len(*mapDBString) == 0 {
		return nil, 0, fmt.Errorf("missing flag: map_db")
	}

	tiledb, err := mapdb.NewTileDB(*mapDBString)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to open map DB at %q: %v", *mapDBString, err)
	}
	if err := tiledb.Init(); err != nil {
		return nil, 0, fmt.Errorf("failed to Init map DB at %q: %v", *mapDBString, err)
	}

	var rev int
	if rev, err = tiledb.NextWriteRevision(); err != nil {
		return nil, 0, fmt.Errorf("failed to query for next write revision: %v", err)
	}
	return tiledb, rev, nil
}

// LogDBRow adapts ModuleVersionLog to the schema format of the Map database to allow for databaseio writing.
type LogDBRow struct {
	Revision int
	Module   string
	Leaves   []byte
}

// logToDBRowFn converts a ModuleVersionLog into a LogDBRow, stamping it with
// the map revision being written.
type logToDBRowFn struct {
	Revision int
}

func (fn *logToDBRowFn) ProcessElement(ctx context.Context, l *pipeline.ModuleVersionLog) (LogDBRow, error) {
	bs, err := json.Marshal(l.Versions)
	if err != nil {
		return LogDBRow{}, err
	}
	return LogDBRow{
		Revision: fn.Revision,
		Module:   l.Module,
		Leaves:   bs,
	}, nil
}

// MapTile is the schema format of the Map database to allow for databaseio writing.
type MapTile struct {
	Revision int
	Path     []byte
	Tile     []byte
}
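
// databaseio maps exported struct fields to identically named columns, so
// the tiles table is expected to provide Revision, Path, and Tile columns.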

// tileToDBRowFn converts a batchmap.Tile into a MapTile row at the given
// map revision.
type tileToDBRowFn struct {
	Revision int
}

func (fn *tileToDBRowFn) ProcessElement(ctx context.Context, t *batchmap.Tile) (MapTile, error) {
	bs, err := json.Marshal(t)
	if err != nil {
		return MapTile{}, err
	}
	return MapTile{
		Revision: fn.Revision,
		Path:     t.Path,
		Tile:     bs,
	}, nil
}

// tileFromDBRowFn parses a previously written MapTile row back into a
// batchmap.Tile.
func tileFromDBRowFn(t MapTile) (*batchmap.Tile, error) {
	var res batchmap.Tile
	if err := json.Unmarshal(t.Tile, &res); err != nil {
		return nil, err
	}
	return &res, nil
}
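
// Together, tileToDBRowFn and tileFromDBRowFn give tiles a JSON round-trip
// through the database, which is what allows an incremental run to seed
// itself from the previous revision's rows.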

// sumDBMirror reads the local SQLite mirror of the SumDB log that was
// populated by sumdbaudit.
type sumDBMirror struct {
	dbString string
	db       *sql.DB
}

func newSumDBMirrorFromFlags() (*sumDBMirror, error) {
	if len(*sumDBString) == 0 {
		return nil, fmt.Errorf("missing flag: sum_db")
	}
	db, err := sql.Open("sqlite3", *sumDBString)
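	// Note: sql.Open only validates its arguments; any connection problem
	// will surface on the first query rather than here.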
	return &sumDBMirror{
		dbString: *sumDBString,
		db:       db,
	}, err
}

// Head gets the latest checkpoint (STH) of the mirrored log and the total
// number of entries available to process.
func (m *sumDBMirror) Head() ([]byte, int64, error) {
	var cp []byte
	var leafCount int64

	if err := m.db.QueryRow("SELECT checkpoint FROM checkpoints ORDER BY datetime DESC LIMIT 1").Scan(&cp); err != nil {
		return nil, 0, err
	}
	if err := m.db.QueryRow("SELECT COUNT(*) FROM leafMetadata").Scan(&leafCount); err != nil {
		return nil, 0, err
	}
	return cp, leafCount, nil
}

// Entries returns a PCollection of Metadata, containing entries in range [start, end).
func (m *sumDBMirror) Entries(s beam.Scope, start, end int64) beam.PCollection {
	return databaseio.Query(s, "sqlite3", m.dbString, fmt.Sprintf("SELECT * FROM leafMetadata WHERE id >= %d AND id < %d", start, end), reflect.TypeOf(pipeline.Metadata{}))
}

// BeamGLogger allows Beam to log via the glog mechanism.
// This is used to allow the very verbose logging output from Beam to be switched off.
type BeamGLogger struct {
	InfoLogAtVerbosity glog.Level
}

// Log routes a Beam log message to glog at a verbosity appropriate to its
// severity.
func (l *BeamGLogger) Log(ctx context.Context, sev beamlog.Severity, _ int, msg string) {
	switch sev {
	case beamlog.SevDebug:
		glog.V(3).Info(msg)
	case beamlog.SevInfo:
		glog.V(l.InfoLogAtVerbosity).Info(msg)
	case beamlog.SevError:
		glog.Error(msg)
	case beamlog.SevWarn:
		glog.Warning(msg)
	default:
		glog.V(5).Infof("?? %s", msg)
	}
}