
---
layout: default
title: Examples
---

<div class="row">
<div class="col-sm-3" >
  <div data-spy="affix" data-offset-top="0" data-offset-bottom="0" markdown="1">

 * Some TOC
 {:toc}

  </div>
</div>

{% raw %}

<div class="doc-body col-sm-9" markdown="1">

<p class="title h1">Examples</p>

## Basic Alerts

### Combine metrics

<p class="h4">Rule</p>
In this alert we use the slim (session limit) metric and compare it to the current number of sessions on each frontend. Because Bosun can combine metrics, we can easily compute a utilization percentage even though that metric doesn't exist in haproxy's CSV stats. The host_based macro and generic template used here are defined in the sections below.

~~~
alert haproxy_session_limit {
    macro = host_based
    template = generic
    $notes = This alert monitors the percentage of sessions against the session limit in haproxy (maxconn) and alerts when we are getting close to that limit and will need to raise that limit. This alert was created due to a socket outage we experienced for that reason
    $current_sessions = max(q("sum:haproxy.frontend.scur{host=*,pxname=*,tier=*}", "5m", ""))
    $session_limit = max(q("sum:haproxy.frontend.slim{host=*,pxname=*,tier=*}", "5m", ""))
    $q = ($current_sessions / $session_limit) * 100
    warn = $q > 80
    crit = $q > 95
}
~~~

### *Consistently* in a certain state
Some metrics represent booleans (0 for false, 1 for true). If we take a time series and run min over it, a result greater than 0 means the metric has been true for the entire duration. So the following lets us know if puppet has been left disabled for more than 24 hours:

<p class="h4">Rule</p>

~~~
alert puppet.left.disabled {
    macro = host_based
    template = generic
    $notes = More often than not, if puppet has been consistently disabled for more than 24 hours someone forgot to re-enable it
    $oquery = "avg:24h-min:puppet.disabled{host=*}"
    $q = min(q($oquery, "24h", ""))
    warn = $q > 0
}
~~~
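
The same pattern works in the other direction: running max over a boolean series tells you whether it has been true at any point in the window. A minimal sketch, assuming a hypothetical 0/1 metric called maintenance.mode:

~~~
alert maintenance.mode.seen {
    macro = host_based
    template = generic
    $notes = Hypothetical example: warn if a host has reported maintenance mode at any point in the last 24 hours
    $q = max(q("sum:maintenance.mode{host=*}", "24h", ""))
    warn = $q > 0
}
~~~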

### Macro that establishes contacts based on host name
This is an example of one of our basic alerts at Stack Exchange. We have an IT and an SRE team, so for host based alerts we use our macro and lookup functionality to notify the appropriate team for each host. Macros reduce duplication in alert definitions. The lookup table is like a case statement that lets you change values based on the instance of the alert. The generic template is meant for when warn and crit use basically the same expression with different thresholds. Templates can include other templates, so we build reusable components that we may want to include in other alerts.

<p class="h4">Rule</p>

~~~
lookup host_base_contact {
    entry host=nyhq-|-int|den-*|lon-* {
        main_contact = it
        chat_contact = it-chat
    }
    entry host=* {
        main_contact = default
    }
}

macro host_based {
    warnNotification = lookup("host_base_contact", "main_contact")
    critNotification = lookup("host_base_contact", "main_contact")
    warnNotification = lookup("host_base_contact", "chat_contact")
    critNotification = lookup("host_base_contact", "chat_contact")
}

alert os.low.memory {
    macro = host_based
    template = generic
    $notes = In Linux, Buffers and Cache are considered "Free Memory"
    #Unit string shows up in the subject of the "Generic" template
    $unit_string = % Free Memory
    $q = avg(q("avg:os.mem.percent_free{host=*}", $default_time, ""))
    crit = $q <= .5
    warn = $q < 5
    squelch = host=sql|devsearch
}
~~~
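
The macro and alert above assume a few definitions that live elsewhere in a real config: the notifications named it, it-chat, and default, and the global variable $default_time. A minimal sketch of what those could look like (the addresses and the time window are hypothetical):

~~~
# Hypothetical supporting definitions assumed by the example above.
$default_time = "5m"

notification it {
    email = it-team@example.com
}

notification it-chat {
    email = it-chatroom@example.com
}

notification default {
    email = sre-team@example.com
}
~~~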

<p class="h4">Template</p>

~~~

template generic {
    body = `{{template "header" .}}
    {{template "def" .}}

    {{template "tags" .}}

    {{template "computation" .}}`
    subject = {{.Last.Status}}: {{replace .Alert.Name "." " " -1}}: {{.Eval .Alert.Vars.q | printf "%.2f"}}{{if .Alert.Vars.unit_string}}{{.Alert.Vars.unit_string}}{{end}} on {{.Group.host}}
}

template def {
    body = `<p><strong>Alert definition:</strong>
    <table>
        <tr>
            <td>Name:</td>
            <td>{{replace .Alert.Name "." " " -1}}</td></tr>
        <tr>
            <td>Warn:</td>
            <td>{{.Alert.Warn}}</td></tr>
        <tr>
            <td>Crit:</td>
            <td>{{.Alert.Crit}}</td></tr>
    </table>`
}

template tags {
    body = `<p><strong>Tags</strong>

    <table>
        {{range $k, $v := .Group}}
            {{if eq $k "host"}}
                <tr><td>{{$k}}</td><td><a href="{{$.HostView $v}}">{{$v}}</a></td></tr>
            {{else}}
                <tr><td>{{$k}}</td><td>{{$v}}</td></tr>
            {{end}}
        {{end}}
    </table>`
}

template computation {
    body = `
    <p><strong>Computation</strong>

    <table>
        {{range .Computations}}
            <tr><td><a href="{{$.Expr .Text}}">{{.Text}}</a></td><td>{{.Value}}</td></tr>
        {{end}}
    </table>`
}

template header {
    body = `<p><a href="{{.Ack}}">Acknowledge alert</a>
    <p><a href="{{.Rule}}">View the Rule + Template in Bosun's Rule Page</a>
    {{if .Alert.Vars.notes}}
    <p>Notes: {{.Alert.Vars.notes}}
    {{end}}
    {{if .Group.host}}
    <p><a href="https://status.stackexchange.com/dashboard/node?node={{.Group.host}}">View Host {{.Group.host}} in Opserver</a>
    {{end}}
    `
}

~~~

### Graphite: Verify available cluster cpu capacity

This rule checks the current cpu capacity (averaged per core over the last 5 minutes) for each host in a cluster.

We warn if more than 80% of systems have less than 20% capacity available, or if there is less than 200% in total in the cluster, irrespective of how many hosts there are and where the spare capacity is. Critical uses 90% and 50% respectively.

By checking a value that scales with the size of the cluster as well as one that puts an absolute lower limit, we cover more ground. This is of course specific to the context of the environment and usage patterns. Note that via groupByNode() the time series returned by Graphite contain only the hostname.

To provide a bit more context to the operator, we also plot the last 10 hours on a graph in the alert.
![Notification for per-host and aggregate idle cpu](public/cpu_idle_graph_graphite.png)

<p class="h4">Rule</p>

~~~
alert os.cpu.idle.web {
    template = os_cpu_idle_web
    $expr = "groupByNode(collectd.dc1-web*.cpu.*.cpu.idle,1,'avg')"
    $idle_series_by_host = graphite($expr, "5m", "", "host")
    $idle_series_by_host_historical = graphite($expr, "10h", "", "host")
    $idle_by_host = avg($idle_series_by_host)
    $num_hosts = len(t($idle_by_host, ""))
    $idle_total = sum(t($idle_by_host, ""))

    $hosts_low = t($idle_by_host < 20, "")
    $num_hosts_low = sum($hosts_low)
    $ratio_hosts_low = ($num_hosts_low / $num_hosts)

    warn = ($ratio_hosts_low > 0.8) || ($idle_total < 200)
    crit = ($ratio_hosts_low > 0.9) || ($idle_total < 50)
}
~~~

<p class="h4">Template</p>

~~~
template os_cpu_idle_web {
    body = `<a href="{{.Ack}}">Acknowledge alert</a>
    <br>
    <br>
    <b>Cpu idle by host:</b>
    <table>
    {{range $f := .EvalAll .Alert.Vars.idle_by_host}}
        <tr><td>{{ $f.Group.host}}</td>
        {{if lt $f.Value 20.0}}
            <td style="color: red;">
        {{else}}
            <td style="color: green;">
        {{end}}
        {{ $f.Value | printf "%.0f" }}</td></tr>
    {{end}}
    <tr><td><b>Total</b></td><td>{{.Eval .Alert.Vars.idle_total | printf "%.0f" }}</td></tr>
    </table>
    <br>
    {{.Graph .Alert.Vars.idle_series_by_host_historical}}
`
    subject = {{.Last.Status}}: {{.Alert.Name}} : {{.Eval .Alert.Vars.idle_total | printf "%.0f"}}% cpu idle in cluster. {{.Eval .Alert.Vars.num_hosts_low}} of {{.Eval .Alert.Vars.num_hosts}} hosts have insufficient cpu idle
}
~~~

## Forecasting Alerts

### Forecast Disk space
This alert mixes thresholds and forecasting to trigger alerts based on disk space. This can be very useful because it can warn about a situation that will result in running out of disk space before it is too late to fix the issue. The forecast is combined with a threshold based check in a single alert because a good general rule is to try to eliminate duplicate notifications / alerts on the same object. These are applied and tuned by the operator and are not auto-magic.

Once we have string support for lookup tables, the duration that the forecast acts on can be tuned per host when relevant (some disks will have longer or shorter periodicity).

The forecastlr function returns the number of seconds until the specified value will be reached according to a linear regression. It is a pretty naive way of forecasting, but it has been effective. Also, there is no reason we can't extend Bosun to include more advanced forecasting functions.

![Forecast Notification Image](public/disk_forecast_notification.png)

<p class="h4">Rule</p>

~~~
lookup disk_space {
    entry host=ny-omni01,disk=E {
        warn_percent_free = 2
        crit_percent_free = 0
    }
    entry host=*,disk=* {
        warn_percent_free = 10
        crit_percent_free = 0
    }
}

alert os.diskspace {
    macro = host_based
    $notes = This alert triggers when there are issues detected in disk capacity. Two methods are used. The first is a traditional percentage based threshold. This alert also uses a linear regression to attempt to predict the amount of time until we run out of space. Forecasting is a hard problem, in particular to apply generically, so there is a lot of room for improvement here, but this is a start
    template = diskspace
    $filter = host=*,disk=*

    ##Forecast Section
    #Downsampling avg on opentsdb side will save the linear regression a lot of work
    $days_to_zero = (forecastlr(q("avg:6h-avg:os.disk.fs.percent_free{$filter}", "7d", ""), 0) / 60 / 60 / 24)
    #Threshold can be higher here once we support string lookups in lookup tables https://github.com/bosun-monitor/bosun/issues/268
    $warn_days = $days_to_zero > 0 && $days_to_zero < 7
    $crit_days = $days_to_zero > 0 && $days_to_zero < 1

    ##Percent Free Section
    $pf_time = "5m"
    $percent_free = avg(q("avg:os.disk.fs.percent_free{host=*,disk=*}", $pf_time, ""))
    $used = avg(q("avg:os.disk.fs.space_used{host=*,disk=*}", $pf_time, ""))
    $total = avg(q("avg:os.disk.fs.space_total{host=*,disk=*}", $pf_time, ""))
    $warn_percent = $percent_free < lookup("disk_space", "warn_percent_free")
    #Linux stops root from writing at less than 5%
    $crit_percent = $percent_free < lookup("disk_space", "crit_percent_free")
    #For graph (long time)
    $percent_free_graph = q("avg:1h-min:os.disk.fs.percent_free{host=*,disk=*}", "4d", "")

    ##Main Logic
    warn = $warn_percent || $warn_days
    crit = $crit_percent || $crit_days

    ##Options
    squelch = $disk_squelch
    ignoreUnknown = true
    #This is needed because disks go away when the forecast doesn't
    unjoinedOk = true

}
~~~
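
The squelch option above references $disk_squelch, which is not defined in this example; it would be a global variable listing the tag sets to silence. A minimal sketch (the host names and disk are hypothetical):

~~~
# Hypothetical global variable assumed by the squelch option above.
$disk_squelch = host=ny-back01|ny-back02,disk=D
~~~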

<p class="h4">Template</p>

~~~
template diskspace {
    body = `{{template "header" .}}
    <p>Host: <a href="{{.HostView .Group.host | short }}">{{.Group.host}}</a>
    <br>Disk: {{.Group.disk}}

    <p>Percent Free: {{.Eval .Alert.Vars.percent_free | printf "%.2f"}}%
    <br>Used: {{.Eval .Alert.Vars.used | bytes}}
    <br>Total: {{.Eval .Alert.Vars.total | bytes}}
    <br>Est. {{.Eval .Alert.Vars.days_to_zero | printf "%.2f"}} days remain until 0% free space
    {{/* .Graph .Alert.Vars.percent_free_graph */}}
    {{printf "q(\"avg:1h-min:os.disk.fs.percent_free{host=%s,disk=%s}\", \"7d\", \"\")" .Group.host .Group.disk | .Graph}}
    `
    subject = {{.Last.Status}}: Diskspace: ({{.Alert.Vars.used | .Eval | bytes}}/{{.Alert.Vars.total | .Eval | bytes}}) {{.Alert.Vars.percent_free | .Eval | printf "%.2f"}}% Free on {{.Group.host}}:{{.Group.disk}} (Est. {{.Eval .Alert.Vars.days_to_zero | printf "%.2f"}} days remain)
}
~~~

## Anomalous Alerts
The idea of an anomalous alert in Bosun is that a deviation from the norm can be detected without having to set static thresholds for everything. These can be very useful when the amount of data makes it infeasible to set thresholds manually. From what I have seen and been told, attempts to fully automate this are noisy. So Bosun doesn't just have an "anomalous" function; rather, you can query history and make various comparisons against that data.

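A minimal sketch of that pattern, before the full examples below (the metric name and thresholds are hypothetical):

~~~
# These lines would live inside an alert definition. band() collects the same
# one-hour window from each of the previous 4 weeks; we then compare the
# current hour against the historical median and spread.
$metric = "sum:web.hits{host=*}"
$history = band($metric, "1h", "7d", 4)
$past_median = percentile($history, .5)
$past_dev = dev($history)
$current = avg(q($metric, "1h", ""))
warn = $current < $past_median - 3 * $past_dev
~~~
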
### Anomalous response per route
At Stack Exchange we send the web route that was hit to haproxy so that it gets logged (it is removed before the response is sent to the client). With over a thousand routes, static thresholds are not feasible. So this alert looks at history using the band function and compares it to current performance. An alert is then triggered if the route makes up more than 1% of our total hits and has gotten slower or faster by more than 10 milliseconds.

![Anomalous Route Performance Notification](public/anom_route_notification.png)

<p class="h4">Rule</p>

~~~
alert slower.route.performance {
    template = route.performance
    $notes = Response time is based on HAProxy's Tr value. This is the web server response time (time elapsed between the moment the TCP connection was established to the web server and the moment it sent its complete response header)
    $duration = "1d"
    $route=*
    $metric = "sum:10m-avg:haproxy.logs.route_tr_median{route=$route}"
    $route_hit_metric = "sum:10m-avg:rate{counter,,1}:haproxy.logs.hits_by_route{route=$route}"
    $total_hit_metric = "sum:10m-avg:rate{counter,,1}:haproxy.logs.hits_by_route"
    $route_hits = change($route_hit_metric, $duration, "")
    $total_hits = change($total_hit_metric, $duration, "")
    $hit_percent = $route_hits / $total_hits * 100
    $current_hitcount = len(q($metric, $duration, ""))
    $period = "7d"
    $lookback = 4
    $history = band($metric, $duration, $period, $lookback)
    $past_dev = dev($history)
    $past_median = percentile($history, .5)
    $current_median = percentile(q($metric, $duration, ""), .5)
    $diff = $current_median - $past_median
    warn = $current_median > ($past_median + $past_dev*2) && abs($diff) > 10 && $hit_percent > 1
    warnNotification = default
    ignoreUnknown = true
}
~~~

<p class="h4">Template</p>
~~~
template route.performance {
    body = `{{template "header" .}}
    <br>
    <br><span style="font-weight: bold;">Route: {{.Group.route}}</span>
    <br>Past Median: {{.Eval .Alert.Vars.past_median | printf "%.2f ms"}}
    <br>Current Median: {{.Eval .Alert.Vars.current_median | printf "%.2f ms"}}
    <br>Difference: {{.Eval .Alert.Vars.diff | printf "%.2f ms"}}
    <br>Route Hits: {{.Eval .Alert.Vars.route_hits | printf "%.2f hits"}}
    <br>Total Hits: {{.Eval .Alert.Vars.total_hits | printf "%.2f hits"}}
    <br>Route Hit Percentage of Total: {{.Eval .Alert.Vars.hit_percent | printf "%.2f"}}%
`
    subject = {{.Last.Status}}: Median Response Time Change of {{.Eval .Alert.Vars.diff | printf "%.2f ms"}} (Current: {{.Eval .Alert.Vars.current_median | printf "%.2f ms"}} Past: {{.Eval .Alert.Vars.past_median | printf "%.2f ms"}}) on {{.Group.route}}
}
~~~

### Graphite: Anomalous traffic volume per country
At Vimeo, we use statsdaemon to track web requests/s. Here we use metrics that describe the web traffic on a per-server basis (Graphite sums them into one series for us via the sum function), as well as a set of metrics that are already aggregated across all servers but broken down per country.

In the alert we leverage the banding functionality to get the same timeframe from past weeks. We then verify that the median of the total web traffic for this period is not significantly (20% or more) lower than the median of past periods. On a per-country basis, we verify that the current median is not 3 or more standard deviations below the past median. Note also that we count how many countries have issues and use that to decide whether the alert is critical or a warning.

Note that the screenshot below has been modified; the countries and values are not accurate.
![Anomalous web traffic Notification](public/anom_traffic_vol_graphite.png)

<p class="h4">Rule</p>
~~~
alert requests_by_country {
    template = requests_by_country
    $r = "transformNull(sum(stats.dc1*.request.web),0)"
    $r_hist = graphiteBand($r, "60m", "7d", "", 1)
    $r_now = graphite($r, "60m", "", "")
    $r_hist_med = median($r_hist)
    $r_now_med = median($r_now)

    $rbc = "aliasByNode(transformNull(stats._sum_dc1.request_by_country.*,0),3)"
    $rbc_hist = graphiteBand($rbc, "60m", "7d", "country", 1)
    $rbc_now = graphite($rbc, "60m", "", "country")
    $rbc_hist_med = median($rbc_hist)
    $rbc_hist_dev = dev($rbc_hist)
    $rbc_now_med = median($rbc_now)
    $rbc_now_dev = dev($rbc_now)

    $rbc_med_diff = ($rbc_now_med - $rbc_hist_med)/$rbc_hist_dev
    $rbc_med_bad = $rbc_med_diff < -3

    $r_strength = $r_now_med/$r_hist_med
    $rbc_med_issues = sum(t($rbc_med_bad,""))
    warn = $rbc_med_issues > 0
    crit = $rbc_med_issues > 10 || $r_strength < 0.8
    warnNotification = web
    critNotification = web
}
~~~

<p class="h4">Template</p>
~~~
template requests_by_country {
    body = `
    <a href="{{.Ack}}">Acknowledge alert</a>
    <br/>
    <h2>Web requests global</h2>
    <table>
    <tr><td>Last week</td><td>{{.Eval .Alert.Vars.r_hist_med | printf "%.0f"}}</td></tr>
    {{if lt (.Eval .Alert.Vars.r_strength) 0.7}}
        <tr style="color:red;">
    {{else}}
        {{if lt (.Eval .Alert.Vars.r_strength) 1.0}}
            <tr>
        {{else}}
            <tr style="color:green;">
        {{end}}
    {{end}}
    <td>Now</td><td>{{.Eval .Alert.Vars.r_now_med | printf "%.0f"}}</td></tr>
    </table>
    <br><a href="http://graphexplorer/index/statsd request.web sum by n1 from -1week||">GE graph</a>
    <br>
    <br>
    <h2>Web requests per country</h2>
    <br><a href="http://graphexplorer/index/request by country _sum_dc1">GE graph</a>
    <br>median diff lower than -3 is bad
    <br>bad values are marked in red.
    <table style="border-spacing: 10px;">
    <tr>
        <th>Country</th>
        <th>hist (med +- dev)</th>
        <th>now (med +- dev)</th>
        <th>med diff (in devs)</th>
    </tr>
    {{range $r := .LeftJoin .Alert.Vars.rbc_hist_med .Alert.Vars.rbc_hist_dev .Alert.Vars.rbc_now_med .Alert.Vars.rbc_now_dev .Alert.Vars.rbc_med_diff .Alert.Vars.rbc_med_bad}}
        <tr>
            <td>{{(index $r 0).Group.country}}</td>
            <td>{{ (index $r 0).Value | printf "%.0f"}} +- {{ (index $r 1).Value | printf "%.0f"}}</td>
            <td>{{ (index $r 2).Value | printf "%.0f"}} +- {{ (index $r 3).Value | printf "%.0f"}}</td>
            {{ if gt (index $r 5).Value 0.0}}
                <td style="color:red;">{{ (index $r 4).Value | printf "%.0f"}}</td>
            {{else}}
                <td style="color:green;">{{ (index $r 4).Value | printf "%.0f"}}</td>
            {{end}}
            <td><a href="http://graphexplorer/index/request by country _sum_dc1 n3={{(index $r 0).Group.country}}||">GE graph</a></td>
        </tr>
    {{end}}
    </table>
    `
    subject = `
    {{.Last.Status}}: {{.Alert.Name}} :
    Global traffic volume {{.Eval .Alert.Vars.r_strength | printf "%.3f"}}
    | {{.Eval .Alert.Vars.rbc_med_issues | printf "%.0f"}} Countries with dropped median:
    {{range $r := .EvalAll .Alert.Vars.rbc_med_bad}}
        {{ if gt $r.Value 0.0}}
            {{$r.Group.country}}
        {{end}}
    {{end}}`
}
~~~


## Alerts with Notification Breakdowns
A common pattern is to trigger an alert on a scope that covers multiple metrics
(or hosts, services, etc.), but then to send more detailed information in the
notification. This is useful when you notice that certain failures tend to go
together.

### Linux TCP Stack Alert
When alerting on issues with the Linux TCP stack on a host, you probably don't
want N alerts about all TCP stats, but just one alert that shows breakdowns:

![](public/tcp_notification.png)

<p class="h4">Rule</p>

~~~
# The host_based macro (defined in an earlier example) makes it so IT and SRE
# are notified for their respective systems when an alert is host based.

lookup linux_tcp {
    entry host=ny-tsdb03 {
        backlog_drop_threshold = 500
    }
    entry host=* {
        backlog_drop_threshold = 100
    }
}

alert linux.tcp {
    macro = host_based
    $notes = `
        This alert checks for various errors or possible issues in the
        TCP stack of a Linux host. Since these tend to happen at the
        same time, combining them into a single alert reduces
        notification noise.`
    template = linux.tcp
    $time = 10m
    $abort_failed = change("sum:rate{counter,,1}:linux.net.stat.tcp.abortfailed{host=*}", "$time", "")
    $abort_mem = change("sum:rate{counter,,1}:linux.net.stat.tcp.abortonmemory{host=*}", "$time", "")
    $ofo_pruned = change("sum:rate{counter,,1}:linux.net.stat.tcp.ofopruned{host=*}", "$time", "")
    $rcv_pruned = change("sum:rate{counter,,1}:linux.net.stat.tcp.rcvpruned{host=*}", "$time", "")
    $backlog_drop = change("sum:rate{counter,,1}:linux.net.stat.tcp.backlogdrop{host=*}", "$time", "")
    $syncookies_sent = change("sum:rate{counter,,1}:linux.net.stat.tcp.syncookiessent{host=*}", "$time", "")
    $total_err = $abort_failed + $ofo_pruned + $rcv_pruned + $backlog_drop + $syncookies_sent
    warn = $abort_failed || $ofo_pruned > 100 || $rcv_pruned > 100 || $backlog_drop > lookup("linux_tcp", "backlog_drop_threshold") || $syncookies_sent
}
~~~

<p class="h4">Template Def</p>

~~~
template linux.tcp {
    body = `
        {{template "header" .}}
        <table>
            {{/* TODO: Reference what each stat means */}}
            <tr><th>Stat</th><th>Count in the last {{.Alert.Vars.time}}</th></tr>
            <tr><td>TCP Abort Failed</td><td>{{.Eval .Alert.Vars.abort_failed | printf "%.2f"}}</td></tr>
            <tr><td>Out Of Order Pruned</td><td>{{.Eval .Alert.Vars.ofo_pruned | printf "%.2f"}}</td></tr>
            <tr><td>Receive Pruned</td><td>{{.Eval .Alert.Vars.rcv_pruned | printf "%.2f"}}</td></tr>
            <tr><td>Backlog Dropped</td><td>{{.Eval .Alert.Vars.backlog_drop | printf "%.2f"}}</td></tr>
            <tr><td>Syn Cookies Sent</td><td>{{.Eval .Alert.Vars.syncookies_sent | printf "%.2f"}}</td></tr>
            <tr><td>TOTAL Of Above</td><td>{{.Eval .Alert.Vars.total_err | printf "%.2f"}}</td></tr>
        </table>`
    subject = {{.Last.Status}}: {{.Eval .Alert.Vars.total_err | printf "%.2f"}} tcp errors on {{.Group.host}}
}
~~~

### Dell Hardware Alert
There is no need for N hardware alerts when hardware is bad; just send one notification with the information needed. Metadata (generally string information) can be reported to Bosun, and scollector reports it. Therefore in the notification we can include things like the model and the service tag.

![](public/hw_notification.png)

<p class="h4">Rule</p>
~~~
alert hardware {
    macro = host_based
    template = hardware
    $time = "30m"
    $notes = This alert triggers on omreport's "system" status, which *should* be a rollup of the entire system state. So it is possible to see an "Overall System status" of bad even though the breakdowns below are all "Ok". If this is the case, look directly in OMSA using the link below

    #By Component
    $power = max(q("sum:hw.ps{host=*,id=*}", $time, ""))
    $battery = max(q("sum:hw.storage.battery{host=*,id=*}", $time, ""))
    $controller = max(q("sum:hw.storage.controller{host=*,id=*}", $time, ""))
    $enclosure = max(q("sum:hw.storage.enclosure{host=*,id=*}", $time, ""))
    $physical_disk = max(q("sum:hw.storage.pdisk{host=*,id=*}", $time, ""))
    $virtual_disk = max(q("sum:hw.storage.vdisk{host=*,id=*}", $time, ""))
    #I believe the system should report a non-zero status if anything is bad
    #(omreport system), so everything else is for notification purposes. This works out
    #because not everything has all these components
    $system = max(q("sum:hw.system{host=*,component=*}", $time, ""))

    #Component Summary Per Host
    $s_power = sum(t($power, "host"))
    $s_battery = sum(t($battery, "host"))

    warn = $system
}
~~~

<p class="h4">Template</p>
~~~
template hardware {
    body = `
    <p>Overall System status is {{if gt .Value 0.0}} <span style="color: red;">Bad</span>
              {{else}} <span style="color: green;">Ok</span>
              {{end}}</p>
    {{template "header" .}}
    <p><a href="https://{{.Group.host}}:1311/OMSALogin?msgStatus=null">OMSA Login for host {{.Group.host}}</a>
    <h3>General Host Info</h3>
    {{ with .GetMeta "" "svctag" (printf "host=%s" .Group.host) }}
        Service Tag: <a href="http://www.dell.com/support/home/us/en/04/product-support/servicetag/{{.}}">{{.}}</a>
    {{ end }}

    {{ with .GetMeta "" "model" (printf "host=%s" .Group.host) }}
        <br>Model: {{.}}
    {{ end }}

    {{ with .GetMeta "" "version" (printf "host=%s" .Group.host) }}
        <br>OS: {{.}}
    {{ end }}

    <h3>Power Supplies</h3>
    <table>
    <tr><th>Power Supply Id</th><th>Status</th></tr>
    {{range $r := .EvalAll .Alert.Vars.power}}
        {{if eq $r.Group.host $.Group.host}}
            <tr>
              <td>{{$r.Group.id}}</td>
              {{if gt $r.Value 0.0}} <td style="color: red;">Bad</td>
              {{else}} <td style="color: green;">Ok</td>
              {{end}}
            </tr>
        {{end}}
    {{end}}
    </table>

    <h3>Batteries</h3>
    <table>
    <tr><th>Battery Id</th><th>Status</th></tr>
    {{range $r := .EvalAll .Alert.Vars.battery}}
        {{if eq $r.Group.host $.Group.host}}
            <tr>
              <td>{{$r.Group.id}}</td>
              {{if gt $r.Value 0.0}} <td style="color: red;">Bad</td>
              {{else}} <td style="color: green;">Ok</td>
              {{end}}
            </tr>
        {{end}}
    {{end}}
    </table>

    <h3>Controllers</h3>
    <table>
    <tr><th>Controller Id</th><th>Status</th></tr>
    {{range $r := .EvalAll .Alert.Vars.controller}}
        {{if eq $r.Group.host $.Group.host}}
            <tr>
              <td>{{$r.Group.id}}</td>
              {{if gt $r.Value 0.0}} <td style="color: red;">Bad</td>
              {{else}} <td style="color: green;">Ok</td>
              {{end}}
            </tr>
        {{end}}
    {{end}}
    </table>

    <h3>Enclosures</h3>
    <table>
    <tr><th>Enclosure Id</th><th>Status</th></tr>
    {{range $r := .EvalAll .Alert.Vars.enclosure}}
        {{if eq $r.Group.host $.Group.host}}
            <tr>
              <td>{{$r.Group.id}}</td>
              {{if gt $r.Value 0.0}} <td style="color: red;">Bad</td>
              {{else}} <td style="color: green;">Ok</td>
              {{end}}
            </tr>
        {{end}}
    {{end}}
    </table>

    <h3>Physical Disks</h3>
    <table>
    <tr><th>Physical Disk Id</th><th>Status</th></tr>
    {{range $r := .EvalAll .Alert.Vars.physical_disk}}
        {{if eq $r.Group.host $.Group.host}}
            <tr>
              <td>{{$r.Group.id}}</td>
              {{if gt $r.Value 0.0}} <td style="color: red;">Bad</td>
              {{else}} <td style="color: green;">Ok</td>
              {{end}}
            </tr>
        {{end}}
    {{end}}
    </table>

    <h3>Virtual Disks</h3>
    <table>
    <tr><th>Virtual Disk Id</th><th>Status</th></tr>
    {{range $r := .EvalAll .Alert.Vars.virtual_disk}}
        {{if eq $r.Group.host $.Group.host}}
            <tr>
              <td>{{$r.Group.id}}</td>
              {{if gt $r.Value 0.0}} <td style="color: red;">Bad</td>
              {{else}} <td style="color: green;">Ok</td>
              {{end}}
            </tr>
        {{end}}
    {{end}}
    </table>
    `
    subject = {{.Last.Status}}: {{replace .Alert.Name "." " " -1}}: on {{.Group.host}}
}
~~~

### Backup -- Advanced Grouping

This shows the state of backups based on multiple conditions and multiple metrics. It also simulates a left join operation by substituting NaN values with numbers via the nv function in the rule, and by using the LeftJoin template function. This stretches Bosun when it comes to grouping. Generally you might want to capture this sort of logic in your collector when going to these extremes, but it shows that you are not limited to that:

![](public/netbackup_notification.png)

<p class="h4">Rule</p>

~~~
alert netbackup {
    template = netbackup
    $tagset = {class=*,client=*,schedule=*}
    #Turn seconds into days
    $attempt_age = max(q("sum:netbackup.backup.attempt_age$tagset", "10m", "")) / 60 / 60 / 24
    $job_frequency = max(q("sum:netbackup.backup.frequency$tagset", "10m", "")) / 60 / 60 / 24
    $job_status = max(q("sum:netbackup.backup.status$tagset", "10m", ""))
    #Add 1/4 a day to the frequency as some buffer
    $not_run_in_time = nv($attempt_age, 1e9) > nv($job_frequency+.25, 1)
    $problems = $not_run_in_time || nv($job_status, 1)
    $summary = sum(t($problems, ""))
    warn = $summary
}
~~~

<p class="h4">Template Def</p>

~~~
template netbackup {
    subject = `{{.Last.Status}}: Netbackup has {{.Eval .Alert.Vars.summary}} Problematic Backups`
    body = `
        {{template "header" .}}
        <p><a href="http://www.symantec.com/business/support/index?page=content&id=DOC5181">Symantec Reference Guide for Status Codes (PDF)</a>
        <p><a href="https://ny-back02.ds.stackexchange.com/opscenter/">View Backups in Opscenter</a>
        <h3>Status of all Tape Backups</h3>
        <table>
        <tr><th>Client</th><th>Policy</th><th>Schedule</th><th>Frequency</th><th>Attempt Age</th><th>Status</th></tr>
    {{ range $v := .LeftJoin .Alert.Vars.job_frequency .Alert.Vars.attempt_age .Alert.Vars.job_status}}
        {{ $freq := index $v 0 }}
        {{ $age := index $v 1 }}
        {{ $status := index $v 2 }}
        <tr>
            <td><a href="https://status.stackexchange.com/dashboard/node?node={{$freq.Group.client | short}}">{{$freq.Group.client | short}}</a></td>
            <td>{{$freq.Group.class}}</td>
            <td>{{$freq.Group.schedule}}</td>
            <td>{{$freq.Value}}</td>
            <td {{if gt $age.Value $freq.Value}} style="color: red;" {{end}}>{{$age.Value | printf "%.2f"}}</td>
            <td {{if gt $status.Value 0.0}} style="color: red;" {{end}}>{{$status.Value}}</td>
        </tr>
    {{end}}
    </table>`
}

~~~

## Conditional Alerts

### Swapping notification unless there is a high exim mail queue
This alert makes it so that swapping won't trigger a notification if there is a high exim mail queue on the host that is swapping. All operators (such as &&) perform a join, so nv is used to make a host with no exim mailq data behave as if the mail queue were high.

<p class="h4">Rule</p>

~~~
alert linux.swapping {
    macro = host_based
    template = generic
    $notes = This alert does not trigger for our mail servers when the mail queue is high
    #nv makes it so that if mailq doesn't exist for the host, the NaN that gets returned is replaced with a 1 (true)
    $mail_q = nv(max(q("sum:exim.mailq_count{host=*}", "2h", "") > 5000), 1)
    $metric = "sum:rate{counter,,1}:linux.mem.pswp{host=*,direction=in}"
    $q = (median(q($metric, "2h", "")) > 1) && ! $mail_q
    warn = $q
    squelch = host=ny-devsearch*|ny-git01
}
~~~

{% endraw %}
</div>
</div>