github.com/google/trillian-examples@v0.0.0-20240520080811-0d40d35cef0e/experimental/batchmap/ctmap/internal/pipeline/pipeline.go (about)

     1  // Copyright 2021 Google LLC. All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package pipeline contains Beam pipeline library functions for the CT
    16  // verifiable map.
    17  package pipeline
    18  
    19  import (
    20  	"context"
    21  	"crypto"
    22  	"fmt"
    23  	"reflect"
    24  
    25  	"github.com/apache/beam/sdks/v2/go/pkg/beam"
    26  	"github.com/golang/glog"
    27  	ct "github.com/google/certificate-transparency-go"
    28  	"github.com/google/certificate-transparency-go/tls"
    29  	"github.com/google/certificate-transparency-go/x509"
    30  	"github.com/google/trillian/experimental/batchmap"
    31  )
    32  
// Beam metrics counters reported by this package. All of them are created
// with the "ctmap" namespace and are incremented from rawLeafToDomainEntries:
//
//   - cntCertsWithNoDomain ("certs-zero-domains") is incremented for each
//     parsed certificate that has no DNS names.
//   - cntPrecerts ("precerts") is incremented for each leaf whose entry type
//     is a precertificate.
//   - cntLeavesProcessed ("leaves-processed") is incremented once for every
//     leaf handed to rawLeafToDomainEntries, before any parsing is attempted.
var (
	cntCertsWithNoDomain = beam.NewCounter("ctmap", "certs-zero-domains")
	cntPrecerts          = beam.NewCounter("ctmap", "precerts")
	cntLeavesProcessed   = beam.NewCounter("ctmap", "leaves-processed")
)
    38  
    39  func init() {
    40  	beam.RegisterType(reflect.TypeOf((*domainEntry)(nil)).Elem())
    41  }
    42  
// InputLog allows access to entries from the log.
//
// It is the data source a MapBuilder reads from: Head is consulted to find
// out how many entries can be used, and Entries is then called to read them.
type InputLog interface {
	// Head returns the metadata of available entries: a checkpoint, which
	// this package treats as opaque bytes and copies into
	// InputLogMetadata.Checkpoint, and the count of entries available.
	Head(ctx context.Context) (checkpoint []byte, count int64, err error)

	// Entries returns a PCollection of InputLogLeaf, containing entries in range [start, end).
	Entries(s beam.Scope, start, end int64) beam.PCollection
}
    51  
// InputLogLeaf is a leaf in an input log, with its sequence index and data.
//
// Seq is the index of the leaf in the log; it becomes the Index of every
// domainEntry derived from the leaf. Data is the raw leaf, which
// rawLeafToDomainEntries decodes as a TLS-serialized ct.MerkleTreeLeaf.
type InputLogLeaf struct {
	Seq  int64
	Data []byte
}
    57  
// InputLogMetadata describes the provenance information of the input
// log to be passed around atomically.
//
// Checkpoint holds the checkpoint bytes returned by InputLog.Head, and
// Entries is the number of log entries that were used to build the map.
type InputLogMetadata struct {
	Checkpoint []byte
	Entries    int64
}
    64  
// Result is returned on successful run of the pipeline. It primarily
// exists to name the output and aid readability, as PCollections are untyped
// in code, so having them as named fields at least aids a little.
//
// Metadata records the checkpoint of the input log and the number of
// entries from it that the outputs were built from.
type Result struct {
	// MapTiles is a PCollection of *batchmap.Tile.
	MapTiles beam.PCollection
	// DomainCertIndexLogs is a PCollection of *DomainCertIndexLog.
	DomainCertIndexLogs beam.PCollection
	Metadata            InputLogMetadata
}
    75  
// MapBuilder contains the static configuration for a map, and allows
// maps at different log sizes to be built using its methods.
//
// source is the log that entries are read from. treeID is passed to both
// MakeDomainLogs and batchmap.Create, and prefixStrata is passed to
// batchmap.Create, each time a map is built.
type MapBuilder struct {
	source       InputLog
	treeID       int64
	prefixStrata int
}
    83  
    84  // NewMapBuilder returns a MapBuilder for a map with the given configuration.
    85  func NewMapBuilder(source InputLog, treeID int64, prefixStrata int) MapBuilder {
    86  	return MapBuilder{
    87  		source:       source,
    88  		treeID:       treeID,
    89  		prefixStrata: prefixStrata,
    90  	}
    91  }
    92  
    93  // Create builds a map from scratch, using the first `size` entries in the
    94  // input log. If there aren't enough entries then it will fail.
    95  func (b *MapBuilder) Create(ctx context.Context, s beam.Scope, size int64) (Result, error) {
    96  	var r Result
    97  
    98  	endID, golden, err := b.getLogEnd(ctx, size)
    99  	if err != nil {
   100  		return r, err
   101  	}
   102  
   103  	// TODO(mhutchinson): Find a better hack to parallize data source.
   104  	batchSize := size/10 + 1
   105  
   106  	rawLeaves := make([]beam.PCollection, 0, endID/batchSize)
   107  	for i := int64(0); i < endID; i += batchSize {
   108  		end := i + batchSize
   109  		if end > endID {
   110  			end = endID
   111  		}
   112  		rawLeaves = append(rawLeaves, b.source.Entries(s.Scope("source"), i, end))
   113  	}
   114  	domains := beam.ParDo(s.Scope("keyByDomain"), rawLeafToDomainEntries, beam.Flatten(s, rawLeaves...))
   115  
   116  	entries, logs := MakeDomainLogs(s.Scope("MakeDomainLogs"), b.treeID, domains)
   117  
   118  	glog.Infof("Creating new map revision from range [0, %d)", endID)
   119  	tiles, err := batchmap.Create(s, entries, b.treeID, crypto.SHA256, b.prefixStrata)
   120  
   121  	return Result{
   122  		MapTiles:            tiles,
   123  		DomainCertIndexLogs: logs,
   124  		Metadata: InputLogMetadata{
   125  			Checkpoint: golden,
   126  			Entries:    endID,
   127  		},
   128  	}, err
   129  }
   130  
   131  func (b *MapBuilder) getLogEnd(ctx context.Context, requiredEntries int64) (int64, []byte, error) {
   132  	golden, totalLeaves, err := b.source.Head(ctx)
   133  	if err != nil {
   134  		return 0, nil, fmt.Errorf("failed to get Head of input log: %v", err)
   135  	}
   136  
   137  	if requiredEntries < 0 {
   138  		return totalLeaves, golden, nil
   139  	}
   140  
   141  	if totalLeaves < requiredEntries {
   142  		return 0, nil, fmt.Errorf("wanted %d leaves but only %d available", requiredEntries, totalLeaves)
   143  	}
   144  
   145  	return requiredEntries, golden, nil
   146  }
   147  
   148  func rawLeafToDomainEntries(ctx context.Context, rawLeaf InputLogLeaf, emit func(*domainEntry)) error {
   149  	cntLeavesProcessed.Inc(ctx, 1)
   150  	var leaf ct.MerkleTreeLeaf
   151  	if rest, err := tls.Unmarshal(rawLeaf.Data, &leaf); err != nil {
   152  		return fmt.Errorf("failed to unmarshal MerkleTreeLeaf: %v", err)
   153  	} else if len(rest) > 0 {
   154  		return fmt.Errorf("MerkleTreeLeaf: trailing data %d bytes", len(rest))
   155  	}
   156  
   157  	var cert *x509.Certificate
   158  	var err error
   159  	switch eType := leaf.TimestampedEntry.EntryType; eType {
   160  	case ct.X509LogEntryType:
   161  		cert, err = leaf.X509Certificate()
   162  		if x509.IsFatal(err) {
   163  			return fmt.Errorf("failed to parse certificate: %v", err)
   164  		}
   165  
   166  	case ct.PrecertLogEntryType:
   167  		cntPrecerts.Inc(ctx, 1)
   168  		cert, err = leaf.Precertificate()
   169  		if x509.IsFatal(err) {
   170  			return fmt.Errorf("failed to parse precertificate: %v", err)
   171  		}
   172  
   173  	default:
   174  		return fmt.Errorf("unknown entry type: %v", eType)
   175  	}
   176  
   177  	if len(cert.DNSNames) == 0 {
   178  		cntCertsWithNoDomain.Inc(ctx, 1)
   179  	}
   180  	for _, n := range cert.DNSNames {
   181  		emit(&domainEntry{
   182  			Index:   uint64(rawLeaf.Seq),
   183  			DNSName: n,
   184  		})
   185  	}
   186  	return nil
   187  }
   188  
// domainEntry associates a single DNS name taken from a certificate with
// the sequence index of the log leaf that the certificate was found in.
// One domainEntry is emitted by rawLeafToDomainEntries per DNS name.
type domainEntry struct {
	Index   uint64
	DNSName string
}