github.com/mholt/caddy-l4@v0.0.0-20241104153248-ec8fae209322/modules/l4proxy/healthchecks.go (about)

     1  // Copyright 2020 Matthew Holt
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package l4proxy
    16  
    17  import (
    18  	"fmt"
    19  	"log"
    20  	"net"
    21  	"runtime/debug"
    22  	"time"
    23  
    24  	"github.com/caddyserver/caddy/v2"
    25  	"go.uber.org/zap"
    26  )
    27  
// HealthChecks configures active and passive health checks.
// Both kinds are optional; a nil pointer is omitted from the
// JSON encoding.
type HealthChecks struct {
	// Active health checks run in the background on a timer.
	// ActiveHealthChecks exposes port, interval and timeout
	// settings (there is no path option for these checks).
	// NOTE(review): presumably any non-nil config object enables
	// active checks — confirm against the handler's provisioning code.
	Active *ActiveHealthChecks `json:"active,omitempty"`

	// Passive health checks monitor proxied connections for errors or timeouts.
	// To minimally enable passive health checks, specify at least an empty
	// config object.
	Passive *PassiveHealthChecks `json:"passive,omitempty"`
}
    40  
// ActiveHealthChecks holds configuration related to active health
// checks (that is, health checks which occur independently in a
// background goroutine). A check is a plain connection attempt to
// the peer; see doActiveHealthCheck.
type ActiveHealthChecks struct {
	// The port to use (if different from the upstream's dial
	// address) for health checks. Only values > 0 take effect;
	// otherwise the peer's own address is dialed unchanged.
	Port int `json:"port,omitempty"`

	// How frequently to perform active health checks (default 30s).
	// This value is used directly as the ticker period in
	// activeHealthChecker.
	// NOTE(review): the 30s default is not applied in this file;
	// presumably it is set during provisioning — confirm.
	Interval caddy.Duration `json:"interval,omitempty"`

	// How long to wait for a connection to be established with
	// peer before considering it unhealthy (default 5s). The value
	// is passed as-is to net.DialTimeout.
	// NOTE(review): the 5s default is not applied in this file;
	// presumably it is set during provisioning — confirm, since a
	// zero timeout would mean the dial itself imposes no limit.
	Timeout caddy.Duration `json:"timeout,omitempty"`

	// logger receives the "host is up" / "host is down" messages and
	// health check errors. It is dereferenced without a nil check, so
	// it must be set before checks start (it is not assigned in this file).
	logger *zap.Logger
}
    58  
// PassiveHealthChecks holds configuration related to passive
// health checks (that is, health checks which occur during
// the normal flow of connection proxying).
//
// NOTE(review): the code that enforces these settings is not in
// this file — verify the exact semantics against the proxy handler.
type PassiveHealthChecks struct {
	// How long to remember a failed connection to a backend. A
	// duration > 0 enables passive health checking. Default 0.
	FailDuration caddy.Duration `json:"fail_duration,omitempty"`

	// The number of failed connections within the FailDuration window to
	// consider a backend as "down". Must be >= 1; default is 1. Requires
	// that FailDuration be > 0.
	MaxFails int `json:"max_fails,omitempty"`

	// Limits the number of simultaneous connections to a backend by
	// marking the backend as "down" if it has this many or more
	// concurrent connections.
	UnhealthyConnectionCount int `json:"unhealthy_connection_count,omitempty"`

	// logger for passive health check events; it is neither assigned
	// nor used in this file.
	logger *zap.Logger
}
    79  
    80  // activeHealthChecker runs active health checks on a
    81  // regular basis and blocks until
    82  // h.HealthChecks.Active.stopChan is closed.
    83  func (h *Handler) activeHealthChecker() {
    84  	defer func() {
    85  		if err := recover(); err != nil {
    86  			log.Printf("[PANIC] active health checks: %v\n%s", err, debug.Stack())
    87  		}
    88  	}()
    89  	ticker := time.NewTicker(time.Duration(h.HealthChecks.Active.Interval))
    90  	h.doActiveHealthCheckForAllHosts()
    91  	for {
    92  		select {
    93  		case <-ticker.C:
    94  			h.doActiveHealthCheckForAllHosts()
    95  		case <-h.ctx.Done():
    96  			ticker.Stop()
    97  			return
    98  		}
    99  	}
   100  }
   101  
   102  // doActiveHealthCheckForAllHosts immediately performs a
   103  // health checks for all upstream hosts configured by h.
   104  func (h *Handler) doActiveHealthCheckForAllHosts() {
   105  	for _, upstream := range h.Upstreams {
   106  		go func(upstream *Upstream) {
   107  			defer func() {
   108  				if err := recover(); err != nil {
   109  					log.Printf("[PANIC] active health check: %v\n%s", err, debug.Stack())
   110  				}
   111  			}()
   112  
   113  			for _, p := range upstream.peers {
   114  				err := h.doActiveHealthCheck(p)
   115  				if err != nil {
   116  					h.HealthChecks.Active.logger.Error("active health check failed",
   117  						zap.String("peer", p.address.String()),
   118  						zap.Error(err))
   119  				}
   120  			}
   121  		}(upstream)
   122  	}
   123  }
   124  
   125  // doActiveHealthCheck performs a health check to host which
   126  // can be reached at address hostAddr. The health status of
   127  // the host will be updated according to whether it passes
   128  // the health check. An error is returned only if the health
   129  // check fails to occur or if marking the host's health status
   130  // fails.
   131  func (h *Handler) doActiveHealthCheck(p *peer) error {
   132  	addr := p.address
   133  
   134  	// adjust the port, if configured to be different
   135  	if h.HealthChecks.Active.Port > 0 {
   136  		addr.StartPort = uint(h.HealthChecks.Active.Port)
   137  		addr.EndPort = addr.StartPort
   138  	}
   139  
   140  	hostPort := addr.JoinHostPort(0)
   141  	timeout := time.Duration(h.HealthChecks.Active.Timeout)
   142  
   143  	conn, err := net.DialTimeout(addr.Network, hostPort, timeout)
   144  	if err != nil {
   145  		h.HealthChecks.Active.logger.Info("host is down",
   146  			zap.String("address", addr.String()),
   147  			zap.Duration("timeout", timeout),
   148  			zap.Error(err))
   149  		_, err2 := p.setHealthy(false)
   150  		if err2 != nil {
   151  			return fmt.Errorf("marking unhealthy: %v (original error: %v)", err2, err)
   152  		}
   153  		return nil
   154  	}
   155  	_ = conn.Close()
   156  
   157  	// connection succeeded, so mark as healthy
   158  	swapped, err := p.setHealthy(true)
   159  	if swapped {
   160  		h.HealthChecks.Active.logger.Info("host is up", zap.String("address", addr.String()))
   161  	}
   162  	if err != nil {
   163  		return fmt.Errorf("marking healthy: %v", err)
   164  	}
   165  
   166  	return nil
   167  }