github.com/letsencrypt/boulder@v0.20251208.0/cmd/bad-key-revoker/main.go (about)

     1  package notmain
     2  
     3  import (
     4  	"context"
     5  	"flag"
     6  	"fmt"
     7  	"os"
     8  	"time"
     9  
    10  	"github.com/jmhodges/clock"
    11  	"github.com/prometheus/client_golang/prometheus"
    12  	"github.com/prometheus/client_golang/prometheus/promauto"
    13  	"google.golang.org/grpc"
    14  	"google.golang.org/protobuf/types/known/emptypb"
    15  
    16  	"github.com/letsencrypt/boulder/cmd"
    17  	"github.com/letsencrypt/boulder/config"
    18  	"github.com/letsencrypt/boulder/core"
    19  	"github.com/letsencrypt/boulder/db"
    20  	bgrpc "github.com/letsencrypt/boulder/grpc"
    21  	blog "github.com/letsencrypt/boulder/log"
    22  	rapb "github.com/letsencrypt/boulder/ra/proto"
    23  	"github.com/letsencrypt/boulder/revocation"
    24  	"github.com/letsencrypt/boulder/sa"
    25  )
    26  
// blockedKeysGaugeLimit caps the number of unchecked blockedKeys rows that
// countUncheckedKeys will count, and is therefore the largest value the
// bad_keys_to_process gauge can report.
const blockedKeysGaugeLimit = 1000
    28  
// revoker narrows an RA gRPC client down to the single method that
// bad-key-revoker needs. Depending on this small interface rather than the
// full client makes testing significantly simpler.
type revoker interface {
	AdministrativelyRevokeCertificate(ctx context.Context, in *rapb.AdministrativelyRevokeCertificateRequest, opts ...grpc.CallOption) (*emptypb.Empty, error)
}
    35  
// badKeyRevoker holds the dependencies and state needed to process rows of
// the blockedKeys table.
//
//   - dbMap is the database handle used for every query and update.
//   - maxRevocations is the largest number of certificates findUnrevoked will
//     return for one key hash before erroring out.
//   - serialBatchSize is the LIMIT applied to each keyHashToSerial query.
//   - raClient is used to administratively revoke certificates.
//   - logger receives informational and audit log lines.
//   - clk supplies the current time for queries and performs backoff sleeps.
//   - backoffIntervalBase, backoffIntervalMax and backoffFactor are passed to
//     core.RetryBackoff along with backoffTicker to compute each sleep.
//   - backoffTicker counts consecutive backoffs; backoffReset zeroes it.
//   - maxExpectedReplicationLag makes the blockedKeys queries skip rows that
//     were added more recently than this duration before now.
//   - keysToProcess, keysProcessed and certsRevoked are Prometheus metrics
//     for the backlog size, rows processed (by state) and certificates
//     revoked, respectively.
type badKeyRevoker struct {
	dbMap                     *db.WrappedMap
	maxRevocations            int
	serialBatchSize           int
	raClient                  revoker
	logger                    blog.Logger
	clk                       clock.Clock
	backoffIntervalBase       time.Duration
	backoffIntervalMax        time.Duration
	backoffFactor             float64
	backoffTicker             int
	maxExpectedReplicationLag time.Duration
	keysToProcess             prometheus.Gauge
	keysProcessed             *prometheus.CounterVec
	certsRevoked              prometheus.Counter
}
    52  
    53  // uncheckedBlockedKey represents a row in the blockedKeys table
    54  type uncheckedBlockedKey struct {
    55  	KeyHash   []byte
    56  	RevokedBy int64
    57  }
    58  
    59  func (ubk uncheckedBlockedKey) String() string {
    60  	return fmt.Sprintf("[revokedBy: %d, keyHash: %x]",
    61  		ubk.RevokedBy, ubk.KeyHash)
    62  }
    63  
    64  func (bkr *badKeyRevoker) countUncheckedKeys(ctx context.Context) (int, error) {
    65  	var count int
    66  	err := bkr.dbMap.SelectOne(
    67  		ctx,
    68  		&count,
    69  		`SELECT COUNT(*)
    70  		FROM (SELECT 1 FROM blockedKeys
    71  		WHERE extantCertificatesChecked = false AND added < ? - INTERVAL ? SECOND
    72  		LIMIT ?) AS a`,
    73  		bkr.clk.Now(),
    74  		bkr.maxExpectedReplicationLag.Seconds(),
    75  		blockedKeysGaugeLimit,
    76  	)
    77  	return count, err
    78  }
    79  
    80  func (bkr *badKeyRevoker) selectUncheckedKey(ctx context.Context) (uncheckedBlockedKey, error) {
    81  	var row uncheckedBlockedKey
    82  	err := bkr.dbMap.SelectOne(
    83  		ctx,
    84  		&row,
    85  		`SELECT keyHash, revokedBy
    86  		FROM blockedKeys
    87  		WHERE extantCertificatesChecked = false AND added < ? - INTERVAL ? SECOND
    88  		LIMIT 1`,
    89  		bkr.clk.Now(),
    90  		bkr.maxExpectedReplicationLag.Seconds(),
    91  	)
    92  	return row, err
    93  }
    94  
// unrevokedCertificate represents a yet to be revoked certificate. It is the
// destination for the certificateStatus/precertificates join performed by
// findUnrevoked, which selects the id, serial, registrationID, der, status
// and isExpired columns.
type unrevokedCertificate struct {
	ID             int
	Serial         string
	DER            []byte
	RegistrationID int64
	Status         core.OCSPStatus
	IsExpired      bool
}
   104  
   105  func (uc unrevokedCertificate) String() string {
   106  	return fmt.Sprintf("id=%d serial=%s regID=%d status=%s expired=%t",
   107  		uc.ID, uc.Serial, uc.RegistrationID, uc.Status, uc.IsExpired)
   108  }
   109  
   110  // findUnrevoked looks for all unexpired, currently valid certificates which have a specific SPKI hash,
   111  // by looking first at the keyHashToSerial table and then the certificateStatus and certificates tables.
   112  // If the number of certificates it finds is larger than bkr.maxRevocations it'll error out.
   113  func (bkr *badKeyRevoker) findUnrevoked(ctx context.Context, unchecked uncheckedBlockedKey) ([]unrevokedCertificate, error) {
   114  	var unrevokedCerts []unrevokedCertificate
   115  	initialID := 0
   116  	for {
   117  		var batch []struct {
   118  			ID         int
   119  			CertSerial string
   120  		}
   121  		_, err := bkr.dbMap.Select(
   122  			ctx,
   123  			&batch,
   124  			"SELECT id, certSerial FROM keyHashToSerial WHERE keyHash = ? AND id > ? AND certNotAfter > ? ORDER BY id LIMIT ?",
   125  			unchecked.KeyHash,
   126  			initialID,
   127  			bkr.clk.Now(),
   128  			bkr.serialBatchSize,
   129  		)
   130  		if err != nil {
   131  			return nil, err
   132  		}
   133  		if len(batch) == 0 {
   134  			break
   135  		}
   136  		initialID = batch[len(batch)-1].ID
   137  		for _, serial := range batch {
   138  			var unrevokedCert unrevokedCertificate
   139  			// NOTE: This has a `LIMIT 1` because the certificateStatus and precertificates
   140  			// tables do not have a UNIQUE KEY on serial (for partitioning reasons). So it's
   141  			// possible we could get multiple results for a single serial number, but they
   142  			// would be duplicates.
   143  			err = bkr.dbMap.SelectOne(
   144  				ctx,
   145  				&unrevokedCert,
   146  				`SELECT cs.id, cs.serial, c.registrationID, c.der, cs.status, cs.isExpired
   147  				FROM certificateStatus AS cs
   148  				JOIN precertificates AS c
   149  				ON cs.serial = c.serial
   150  				WHERE cs.serial = ?
   151  				LIMIT 1`,
   152  				serial.CertSerial,
   153  			)
   154  			if err != nil {
   155  				return nil, err
   156  			}
   157  			if unrevokedCert.IsExpired || unrevokedCert.Status == core.OCSPStatusRevoked {
   158  				continue
   159  			}
   160  			unrevokedCerts = append(unrevokedCerts, unrevokedCert)
   161  		}
   162  	}
   163  	if len(unrevokedCerts) > bkr.maxRevocations {
   164  		return nil, fmt.Errorf("too many certificates to revoke associated with %x: got %d, max %d", unchecked.KeyHash, len(unrevokedCerts), bkr.maxRevocations)
   165  	}
   166  	return unrevokedCerts, nil
   167  }
   168  
   169  // markRowChecked updates a row in the blockedKeys table to mark a keyHash
   170  // as having been checked for extant unrevoked certificates.
   171  func (bkr *badKeyRevoker) markRowChecked(ctx context.Context, unchecked uncheckedBlockedKey) error {
   172  	_, err := bkr.dbMap.ExecContext(ctx, "UPDATE blockedKeys SET extantCertificatesChecked = true WHERE keyHash = ?", unchecked.KeyHash)
   173  	return err
   174  }
   175  
   176  // revokeCerts revokes all the provided certificates. It uses reason
   177  // keyCompromise and includes note indicating that they were revoked by
   178  // bad-key-revoker.
   179  func (bkr *badKeyRevoker) revokeCerts(certs []unrevokedCertificate) error {
   180  	for _, cert := range certs {
   181  		_, err := bkr.raClient.AdministrativelyRevokeCertificate(context.Background(), &rapb.AdministrativelyRevokeCertificateRequest{
   182  			Cert:      cert.DER,
   183  			Serial:    cert.Serial,
   184  			Code:      int64(revocation.KeyCompromise),
   185  			AdminName: "bad-key-revoker",
   186  		})
   187  		if err != nil {
   188  			return err
   189  		}
   190  		bkr.certsRevoked.Inc()
   191  	}
   192  	return nil
   193  }
   194  
   195  // invoke exits early and returns true if there is no work to be done.
   196  // Otherwise, it processes a single key in the blockedKeys table and returns false.
   197  func (bkr *badKeyRevoker) invoke(ctx context.Context) (bool, error) {
   198  	// Gather a count of rows to be processed.
   199  	uncheckedCount, err := bkr.countUncheckedKeys(ctx)
   200  	if err != nil {
   201  		return false, err
   202  	}
   203  
   204  	// Set the gauge to the number of rows to be processed (max:
   205  	// blockedKeysGaugeLimit).
   206  	bkr.keysToProcess.Set(float64(uncheckedCount))
   207  
   208  	if uncheckedCount >= blockedKeysGaugeLimit {
   209  		bkr.logger.AuditInfof("found >= %d unchecked blocked keys left to process", uncheckedCount)
   210  	} else {
   211  		bkr.logger.AuditInfof("found %d unchecked blocked keys left to process", uncheckedCount)
   212  	}
   213  
   214  	// select a row to process
   215  	unchecked, err := bkr.selectUncheckedKey(ctx)
   216  	if err != nil {
   217  		if db.IsNoRows(err) {
   218  			return true, nil
   219  		}
   220  		return false, err
   221  	}
   222  	bkr.logger.AuditInfo(fmt.Sprintf("found unchecked block key to work on: %s", unchecked))
   223  
   224  	// select all unrevoked, unexpired serials associated with the blocked key hash
   225  	unrevokedCerts, err := bkr.findUnrevoked(ctx, unchecked)
   226  	if err != nil {
   227  		bkr.logger.AuditInfo(fmt.Sprintf("finding unrevoked certificates related to %s: %s",
   228  			unchecked, err))
   229  		return false, err
   230  	}
   231  	if len(unrevokedCerts) == 0 {
   232  		bkr.logger.AuditInfo(fmt.Sprintf("found no certificates that need revoking related to %s, marking row as checked", unchecked))
   233  		// mark row as checked
   234  		err = bkr.markRowChecked(ctx, unchecked)
   235  		if err != nil {
   236  			return false, err
   237  		}
   238  		return false, nil
   239  	}
   240  
   241  	var serials []string
   242  	for _, cert := range unrevokedCerts {
   243  		serials = append(serials, cert.Serial)
   244  	}
   245  	bkr.logger.AuditInfo(fmt.Sprintf("revoking serials %v for key with hash %x", serials, unchecked.KeyHash))
   246  
   247  	// revoke each certificate
   248  	err = bkr.revokeCerts(unrevokedCerts)
   249  	if err != nil {
   250  		return false, err
   251  	}
   252  
   253  	// mark the key as checked
   254  	err = bkr.markRowChecked(ctx, unchecked)
   255  	if err != nil {
   256  		return false, err
   257  	}
   258  	return false, nil
   259  }
   260  
// Config is the structure that the bad-key-revoker configuration file is
// read into.
type Config struct {
	BadKeyRevoker struct {
		DB        cmd.DBConfig
		DebugAddr string `validate:"omitempty,hostname_port"`

		TLS       cmd.TLSConfig
		RAService *cmd.GRPCClientConfig

		// MaximumRevocations specifies the maximum number of certificates associated with
		// a key hash that bad-key-revoker will attempt to revoke. If the number of certificates
		// is higher than MaximumRevocations bad-key-revoker will error out and refuse to
		// progress until this is addressed.
		MaximumRevocations int `validate:"gte=0"`

		// FindCertificatesBatchSize specifies the maximum number of serials to select from the
		// keyHashToSerial table at once.
		FindCertificatesBatchSize int `validate:"required"`

		// Interval specifies the base duration of the backoff that
		// bad-key-revoker sleeps between attempts to find blockedKeys
		// rows to process when there is an error or no work to do. If
		// unset, main uses 1 second.
		Interval config.Duration `validate:"-"`

		// BackoffIntervalMax specifies a maximum duration the backoff
		// algorithm will wait before retrying in the event of error
		// or no work to do. If unset, main uses 60 seconds.
		BackoffIntervalMax config.Duration `validate:"-"`

		// MaxExpectedReplicationLag specifies the minimum duration
		// bad-key-revoker should wait before searching for certificates
		// matching a blockedKeys row. This should be just slightly greater than
		// the database's maximum replication lag, and always well under 24
		// hours. If unset, main uses 22 seconds.
		MaxExpectedReplicationLag config.Duration `validate:"-"`
	}

	Syslog        cmd.SyslogConfig
	OpenTelemetry cmd.OpenTelemetryConfig
}
   300  
   301  func main() {
   302  	debugAddr := flag.String("debug-addr", "", "Debug server address override")
   303  	configPath := flag.String("config", "", "File path to the configuration file for this service")
   304  	flag.Parse()
   305  
   306  	if *configPath == "" {
   307  		flag.Usage()
   308  		os.Exit(1)
   309  	}
   310  	var config Config
   311  	err := cmd.ReadConfigFile(*configPath, &config)
   312  	cmd.FailOnError(err, "Failed reading config file")
   313  
   314  	if *debugAddr != "" {
   315  		config.BadKeyRevoker.DebugAddr = *debugAddr
   316  	}
   317  
   318  	stats, logger, oTelShutdown := cmd.StatsAndLogging(config.Syslog, config.OpenTelemetry, config.BadKeyRevoker.DebugAddr)
   319  	defer oTelShutdown(context.Background())
   320  	logger.Info(cmd.VersionString())
   321  	clk := clock.New()
   322  
   323  	keysToProcess := promauto.With(stats).NewGauge(prometheus.GaugeOpts{
   324  		Name: "bad_keys_to_process",
   325  		Help: fmt.Sprintf("A gauge of blockedKeys rows to process (max: %d)", blockedKeysGaugeLimit),
   326  	})
   327  	keysProcessed := promauto.With(stats).NewCounterVec(prometheus.CounterOpts{
   328  		Name: "bad_keys_processed",
   329  		Help: "A counter of blockedKeys rows processed labelled by processing state",
   330  	}, []string{"state"})
   331  	certsRevoked := promauto.With(stats).NewCounter(prometheus.CounterOpts{
   332  		Name: "bad_keys_certs_revoked",
   333  		Help: "A counter of certificates associated with rows in blockedKeys that have been revoked",
   334  	})
   335  
   336  	dbMap, err := sa.InitWrappedDb(config.BadKeyRevoker.DB, stats, logger)
   337  	cmd.FailOnError(err, "While initializing dbMap")
   338  
   339  	tlsConfig, err := config.BadKeyRevoker.TLS.Load(stats)
   340  	cmd.FailOnError(err, "TLS config")
   341  
   342  	conn, err := bgrpc.ClientSetup(config.BadKeyRevoker.RAService, tlsConfig, stats, clk)
   343  	cmd.FailOnError(err, "Failed to load credentials and create gRPC connection to RA")
   344  	rac := rapb.NewRegistrationAuthorityClient(conn)
   345  
   346  	bkr := &badKeyRevoker{
   347  		dbMap:                     dbMap,
   348  		maxRevocations:            config.BadKeyRevoker.MaximumRevocations,
   349  		serialBatchSize:           config.BadKeyRevoker.FindCertificatesBatchSize,
   350  		raClient:                  rac,
   351  		logger:                    logger,
   352  		clk:                       clk,
   353  		backoffIntervalMax:        config.BadKeyRevoker.BackoffIntervalMax.Duration,
   354  		backoffIntervalBase:       config.BadKeyRevoker.Interval.Duration,
   355  		backoffFactor:             1.3,
   356  		maxExpectedReplicationLag: config.BadKeyRevoker.MaxExpectedReplicationLag.Duration,
   357  		keysToProcess:             keysToProcess,
   358  		keysProcessed:             keysProcessed,
   359  		certsRevoked:              certsRevoked,
   360  	}
   361  
   362  	// If `BackoffIntervalMax` was not set via the config, set it to 60
   363  	// seconds. This will avoid a tight loop on error but not be an
   364  	// excessive delay if the config value was not deliberately set.
   365  	if bkr.backoffIntervalMax == 0 {
   366  		bkr.backoffIntervalMax = time.Second * 60
   367  	}
   368  
   369  	// If `Interval` was not set via the config then set
   370  	// `bkr.backoffIntervalBase` to a default 1 second.
   371  	if bkr.backoffIntervalBase == 0 {
   372  		bkr.backoffIntervalBase = time.Second
   373  	}
   374  
   375  	// If `MaxExpectedReplicationLag` was not set via the config, then set
   376  	// `bkr.maxExpectedReplicationLag` to a default 22 seconds. This is based on
   377  	// ProxySQL's max_replication_lag for bad-key-revoker (10s), times two, plus
   378  	// two seconds.
   379  	if bkr.maxExpectedReplicationLag == 0 {
   380  		bkr.maxExpectedReplicationLag = time.Second * 22
   381  	}
   382  
   383  	// Run bad-key-revoker in a loop. Backoff if no work or errors.
   384  	for {
   385  		noWork, err := bkr.invoke(context.Background())
   386  		if err != nil {
   387  			keysProcessed.WithLabelValues("error").Inc()
   388  			logger.AuditErrf("failed to process blockedKeys row: %s", err)
   389  			// Calculate and sleep for a backoff interval
   390  			bkr.backoff()
   391  			continue
   392  		}
   393  		if noWork {
   394  			logger.Info("no work to do")
   395  			// Calculate and sleep for a backoff interval
   396  			bkr.backoff()
   397  		} else {
   398  			keysProcessed.WithLabelValues("success").Inc()
   399  			// Successfully processed, reset backoff.
   400  			bkr.backoffReset()
   401  		}
   402  	}
   403  }
   404  
   405  // backoff increments the backoffTicker, calls core.RetryBackoff to
   406  // calculate a new backoff duration, then logs the backoff and sleeps for
   407  // the calculated duration.
   408  func (bkr *badKeyRevoker) backoff() {
   409  	bkr.backoffTicker++
   410  	backoffDur := core.RetryBackoff(
   411  		bkr.backoffTicker,
   412  		bkr.backoffIntervalBase,
   413  		bkr.backoffIntervalMax,
   414  		bkr.backoffFactor,
   415  	)
   416  	bkr.logger.Infof("backoff trying again in %.2f seconds", backoffDur.Seconds())
   417  	bkr.clk.Sleep(backoffDur)
   418  }
   419  
// backoffReset sets the backoff ticker back to zero, so that the next call to
// backoff starts counting again from the first step.
func (bkr *badKeyRevoker) backoffReset() {
	bkr.backoffTicker = 0
}
   424  
// init registers this command with the cmd package under the name
// "bad-key-revoker", together with its main function and a validator for its
// Config type.
func init() {
	cmd.RegisterCommand("bad-key-revoker", main, &cmd.ConfigValidator{Config: &Config{}})
}