github.com/google/trillian-examples@v0.0.0-20240520080811-0d40d35cef0e/experimental/batchmap/ctmap/internal/pipeline/pipeline.go (about) 1 // Copyright 2021 Google LLC. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package pipeline contains Beam pipeline library functions for the CT 16 // verifiable map. 17 package pipeline 18 19 import ( 20 "context" 21 "crypto" 22 "fmt" 23 "reflect" 24 25 "github.com/apache/beam/sdks/v2/go/pkg/beam" 26 "github.com/golang/glog" 27 ct "github.com/google/certificate-transparency-go" 28 "github.com/google/certificate-transparency-go/tls" 29 "github.com/google/certificate-transparency-go/x509" 30 "github.com/google/trillian/experimental/batchmap" 31 ) 32 33 var ( 34 cntCertsWithNoDomain = beam.NewCounter("ctmap", "certs-zero-domains") 35 cntPrecerts = beam.NewCounter("ctmap", "precerts") 36 cntLeavesProcessed = beam.NewCounter("ctmap", "leaves-processed") 37 ) 38 39 func init() { 40 beam.RegisterType(reflect.TypeOf((*domainEntry)(nil)).Elem()) 41 } 42 43 // InputLog allows access to entries from the log. 44 type InputLog interface { 45 // Head returns the metadata of available entries. 46 Head(ctx context.Context) (checkpoint []byte, count int64, err error) 47 48 // Entries returns a PCollection of InputLogLeaf, containing entries in range [start, end). 49 Entries(s beam.Scope, start, end int64) beam.PCollection 50 } 51 52 // InputLogLeaf is a leaf in an input log, with its sequence index and data. 53 type InputLogLeaf struct { 54 Seq int64 55 Data []byte 56 } 57 58 // InputLogMetadata describes the provenance information of the input 59 // log to be passed around atomically. 60 type InputLogMetadata struct { 61 Checkpoint []byte 62 Entries int64 63 } 64 65 // Result is returned on successful run of the pipeline. It primarily 66 // exists to name the output and aid readability, as PCollections are untyped 67 // in code, so having them as named fields at least aids a little. 68 type Result struct { 69 // MapTiles is a PCollection of *batchmap.Tile. 70 MapTiles beam.PCollection 71 // DomainCounts is a PCollection of *DomainCertIndexLog. 72 DomainCertIndexLogs beam.PCollection 73 Metadata InputLogMetadata 74 } 75 76 // MapBuilder contains the static configuration for a map, and allows 77 // maps at different log sizes to be built using its methods. 78 type MapBuilder struct { 79 source InputLog 80 treeID int64 81 prefixStrata int 82 } 83 84 // NewMapBuilder returns a MapBuilder for a map with the given configuration. 85 func NewMapBuilder(source InputLog, treeID int64, prefixStrata int) MapBuilder { 86 return MapBuilder{ 87 source: source, 88 treeID: treeID, 89 prefixStrata: prefixStrata, 90 } 91 } 92 93 // Create builds a map from scratch, using the first `size` entries in the 94 // input log. If there aren't enough entries then it will fail. 95 func (b *MapBuilder) Create(ctx context.Context, s beam.Scope, size int64) (Result, error) { 96 var r Result 97 98 endID, golden, err := b.getLogEnd(ctx, size) 99 if err != nil { 100 return r, err 101 } 102 103 // TODO(mhutchinson): Find a better hack to parallize data source. 104 batchSize := size/10 + 1 105 106 rawLeaves := make([]beam.PCollection, 0, endID/batchSize) 107 for i := int64(0); i < endID; i += batchSize { 108 end := i + batchSize 109 if end > endID { 110 end = endID 111 } 112 rawLeaves = append(rawLeaves, b.source.Entries(s.Scope("source"), i, end)) 113 } 114 domains := beam.ParDo(s.Scope("keyByDomain"), rawLeafToDomainEntries, beam.Flatten(s, rawLeaves...)) 115 116 entries, logs := MakeDomainLogs(s.Scope("MakeDomainLogs"), b.treeID, domains) 117 118 glog.Infof("Creating new map revision from range [0, %d)", endID) 119 tiles, err := batchmap.Create(s, entries, b.treeID, crypto.SHA256, b.prefixStrata) 120 121 return Result{ 122 MapTiles: tiles, 123 DomainCertIndexLogs: logs, 124 Metadata: InputLogMetadata{ 125 Checkpoint: golden, 126 Entries: endID, 127 }, 128 }, err 129 } 130 131 func (b *MapBuilder) getLogEnd(ctx context.Context, requiredEntries int64) (int64, []byte, error) { 132 golden, totalLeaves, err := b.source.Head(ctx) 133 if err != nil { 134 return 0, nil, fmt.Errorf("failed to get Head of input log: %v", err) 135 } 136 137 if requiredEntries < 0 { 138 return totalLeaves, golden, nil 139 } 140 141 if totalLeaves < requiredEntries { 142 return 0, nil, fmt.Errorf("wanted %d leaves but only %d available", requiredEntries, totalLeaves) 143 } 144 145 return requiredEntries, golden, nil 146 } 147 148 func rawLeafToDomainEntries(ctx context.Context, rawLeaf InputLogLeaf, emit func(*domainEntry)) error { 149 cntLeavesProcessed.Inc(ctx, 1) 150 var leaf ct.MerkleTreeLeaf 151 if rest, err := tls.Unmarshal(rawLeaf.Data, &leaf); err != nil { 152 return fmt.Errorf("failed to unmarshal MerkleTreeLeaf: %v", err) 153 } else if len(rest) > 0 { 154 return fmt.Errorf("MerkleTreeLeaf: trailing data %d bytes", len(rest)) 155 } 156 157 var cert *x509.Certificate 158 var err error 159 switch eType := leaf.TimestampedEntry.EntryType; eType { 160 case ct.X509LogEntryType: 161 cert, err = leaf.X509Certificate() 162 if x509.IsFatal(err) { 163 return fmt.Errorf("failed to parse certificate: %v", err) 164 } 165 166 case ct.PrecertLogEntryType: 167 cntPrecerts.Inc(ctx, 1) 168 cert, err = leaf.Precertificate() 169 if x509.IsFatal(err) { 170 return fmt.Errorf("failed to parse precertificate: %v", err) 171 } 172 173 default: 174 return fmt.Errorf("unknown entry type: %v", eType) 175 } 176 177 if len(cert.DNSNames) == 0 { 178 cntCertsWithNoDomain.Inc(ctx, 1) 179 } 180 for _, n := range cert.DNSNames { 181 emit(&domainEntry{ 182 Index: uint64(rawLeaf.Seq), 183 DNSName: n, 184 }) 185 } 186 return nil 187 } 188 189 type domainEntry struct { 190 Index uint64 191 DNSName string 192 }