github.com/filecoin-project/bacalhau@v0.3.23-0.20230228154132-45c989550ace/ops/aws/canary/lib/canary-stack.ts (about)

     1  import * as cdk from 'aws-cdk-lib';
     2  import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch';
     3  import * as cloudwatchActions from 'aws-cdk-lib/aws-cloudwatch-actions';
     4  import * as events from 'aws-cdk-lib/aws-events';
     5  import * as targets from 'aws-cdk-lib/aws-events-targets';
     6  import * as iam from 'aws-cdk-lib/aws-iam';
     7  import * as lambda from 'aws-cdk-lib/aws-lambda';
     8  import * as lambdaSources from 'aws-cdk-lib/aws-lambda-event-sources';
     9  import * as sns from 'aws-cdk-lib/aws-sns';
    10  import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager';
    11  import {CanaryConfig} from "./config";
    12  import {Size} from "aws-cdk-lib";
    13  
    14  export interface ScenarioProps {
    15      readonly action: string;
    16      readonly timeoutMinutes: number;
    17      readonly rateMinutes: number;
    18      readonly memorySize: number;
    19      readonly storageSize: number;
    20      readonly evaluationPeriods: number;
    21      readonly datapointsToAlarm: number;
    22      readonly availabilityThreshold: number;
    23      readonly doAlarm: boolean;
    24      readonly logLevel: string
    25  }
    26  
    27  const DEFAULT_SCENARIO_PROPS: ScenarioProps = {
    28      action: '_',
    29      timeoutMinutes: 1,
    30      rateMinutes: 2,
    31      memorySize: 256,
    32      storageSize: 512,
    33      evaluationPeriods: 5,
    34      datapointsToAlarm: 3,
    35      availabilityThreshold: 95,
    36      doAlarm: true,
    37      logLevel: 'DEBUG',
    38  }
    39  
    40  export class CanaryStack extends cdk.Stack {
    41      public readonly lambdaCode: lambda.CfnParametersCode;
    42      private readonly config: CanaryConfig;
    43      private readonly dashboard: cloudwatch.Dashboard
    44      private readonly snsAlarmTopic: sns.ITopic
    45  
    46      constructor(app: cdk.App, id: string, props: cdk.StackProps, config: CanaryConfig) {
    47          super(app, id, props)
    48  
    49          this.config = config;
    50          this.lambdaCode = lambda.Code.fromCfnParameters();
    51          this.dashboard = new cloudwatch.Dashboard(this, "Dashboard", {
    52              dashboardName: "BacalhauCanary" + this.config.envTitle
    53          });
    54          this.snsAlarmTopic = new sns.Topic(this, 'AlarmTopic');
    55  
    56          this.createLambdaAlarmSlackHandlerFunc()
    57          this.createLambdaScenarioFunc({ ...DEFAULT_SCENARIO_PROPS, ...{action: "list"}});
    58          this.createLambdaScenarioFunc({ ...DEFAULT_SCENARIO_PROPS, ...{action: "submit"}});
    59          this.createLambdaScenarioFunc({ ...DEFAULT_SCENARIO_PROPS, ...{action: "submitAndGet", memorySize: 1024}});
    60          this.createLambdaScenarioFunc({ ...DEFAULT_SCENARIO_PROPS, ...{action: "submitAndDescribe"}});
    61          this.createLambdaScenarioFunc({ ...DEFAULT_SCENARIO_PROPS, ...{action: "submitWithConcurrency"}});
    62          this.createLambdaScenarioFunc({ ...DEFAULT_SCENARIO_PROPS, ...{action: "submitWithConcurrencyOwnedNodes"}});
    63          this.createLambdaScenarioFunc({ ...DEFAULT_SCENARIO_PROPS, ...{
    64                  action: "submitDockerIPFSJobAndGet", timeoutMinutes: 5, rateMinutes: 5, memorySize: 5120, storageSize: 5012,
    65                  datapointsToAlarm: 4, evaluationPeriods: 6, doAlarm: false}});
    66  
    67          if (config.createOperators) {
    68              this.createOperatorGroup()
    69          }
    70      }
    71  
    72      // Create a lambda function that handles alarms and sends a slack notification
    73      private createLambdaAlarmSlackHandlerFunc() : lambda.Function {
    74          const slackSecretes = new secretsmanager.Secret(this, 'SlackWebhooksSecret', {
    75              description: 'Slack webhook URLs',
    76              secretObjectValue: {
    77                  webhookUrl: cdk.SecretValue.unsafePlainText('https://...'),
    78              },
    79          });
    80  
    81          const func = new lambda.Function(this,  'AlarmHandlerFunction', {
    82              code: this.lambdaCode,
    83              handler: 'alarm_handler',
    84              runtime: lambda.Runtime.GO_1_X,
    85              timeout: cdk.Duration.minutes(1),
    86              environment: {
    87                  'DASHBOARD_URL': this.config.dashboardPublicUrl,
    88                  'SLACK_SECRET_NAME': slackSecretes.secretName,
    89              }
    90          });
    91          func.addEventSource(new lambdaSources.SnsEventSource(this.snsAlarmTopic));
    92          slackSecretes.grantRead(func);
    93          return func;
    94      }
    95  
    96      // Create a lambda function that triggers test scenarios
    97      private createLambdaScenarioFunc(props: ScenarioProps) : lambda.Function {
    98          const actionTitle = props.action.charAt(0).toUpperCase() + props.action.slice(1)
    99          const func = new lambda.Function(this, actionTitle + 'Function', {
   100              code: this.lambdaCode,
   101              handler: 'scenario_handler',
   102              runtime: lambda.Runtime.GO_1_X,
   103              timeout: cdk.Duration.minutes(props.timeoutMinutes),
   104              memorySize: props.memorySize,
   105              ephemeralStorageSize: Size.mebibytes(props.storageSize),
   106              retryAttempts: 0,
   107              environment: {
   108                  'BACALHAU_DIR': '/tmp', //bacalhau uses $HOME to store configs by default, which doesn't exist in lambda
   109                  'LOG_LEVEL': props.logLevel,
   110                  'BACALHAU_ENVIRONMENT': this.config.bacalhauEnvironment,
   111              }
   112          });
   113  
   114          // EventBridge rules
   115          const rule = new events.Rule(this, actionTitle + 'EventRule', {
   116              schedule: events.Schedule.rate(cdk.Duration.minutes(props.rateMinutes)),
   117          });
   118  
   119          rule.addTarget(new targets.LambdaFunction(func, {
   120              event: events.RuleTargetInput.fromObject({action: props.action}),
   121              retryAttempts: 0,
   122              maxEventAge: cdk.Duration.minutes(1),
   123          }));
   124  
   125          this.addDashboardWidgets(actionTitle, func);
   126          this.createAlarm(props, func)
   127          return func;
   128      }
   129  
   130      private addDashboardWidgets(actionTitle: string, func: lambda.Function) {
   131          // Create Title for Dashboard
   132          this.dashboard.addWidgets(new cloudwatch.TextWidget({
   133              markdown: '## ' + actionTitle,
   134              height: 1,
   135              width: 24
   136          }))
   137  
   138          // Create CloudWatch Dashboard Widgets: Errors, Invocations, Duration, Throttles
   139          this.dashboard.addWidgets(
   140              new cloudwatch.GraphWidget({
   141                  title: "Invocations",
   142                  left: [func.metricInvocations()],
   143                  width: 8
   144              }),
   145              new cloudwatch.GraphWidget({
   146                  title: "Duration",
   147                  left: [func.metricDuration({label: "[avg: ${AVG}ms, max: ${MAX}ms] Duration"})],
   148                  width: 8
   149              }),
   150              new cloudwatch.GraphWidget({
   151                  title: "Error count and success rate (%)",
   152                  left: [func.metricErrors()],
   153                  right: [this.getAvailabilityMetric(func)],
   154                  rightYAxis: {min: 0, max: 100},
   155                  width: 8
   156              })
   157          )
   158      }
   159  
   160      private getAvailabilityMetric(func: lambda.Function) : cloudwatch.MathExpression {
   161          return new cloudwatch.MathExpression({
   162              expression: "100 - 100 * errors / MAX([errors, invocations])",
   163              label: "[avg: ${AVG}] Success rate",
   164              usingMetrics: {
   165                  errors: func.metricErrors(),
   166                  invocations: func.metricInvocations()
   167              }
   168          })
   169      }
   170  
   171      private createAlarm(props: ScenarioProps, func: lambda.Function) {
   172          const actionTitle = props.action.charAt(0).toUpperCase() + props.action.slice(1)
   173          const availabilityMetric = this.getAvailabilityMetric(func)
   174          const alarm = availabilityMetric.createAlarm(this, actionTitle + "Alarm", {
   175              alarmDescription: actionTitle + ' ' + this.config.envTitle + ' Availability',
   176              threshold: props.availabilityThreshold,
   177              comparisonOperator: cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
   178              evaluationPeriods: props.evaluationPeriods,
   179              datapointsToAlarm: props.datapointsToAlarm,
   180              treatMissingData: cloudwatch.TreatMissingData.BREACHING,
   181          });
   182  
   183          if (props.doAlarm) {
   184              alarm.addAlarmAction(new cloudwatchActions.SnsAction(this.snsAlarmTopic));
   185              alarm.addOkAction(new cloudwatchActions.SnsAction(this.snsAlarmTopic));
   186          }
   187      }
   188  
   189      private createOperatorGroup() {
   190          const group = new iam.Group(this, 'OperatorGroup', {
   191              groupName: 'BacalhauCanaryOperators-' + this.config.envTitle
   192          })
   193  
   194          // add managed policies
   195          group.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('CloudWatchReadOnlyAccess'))
   196          group.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('AWSCloudFormationReadOnlyAccess'))
   197          group.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('AWSLambda_ReadOnlyAccess'))
   198          group.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('AmazonEventBridgeReadOnlyAccess'))
   199          group.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('AmazonEventBridgeSchemasReadOnlyAccess'))
   200          group.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('AWSCodePipeline_ReadOnlyAccess'))
   201          group.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('AWSCodeBuildReadOnlyAccess'))
   202          group.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('AWSCodeDeployReadOnlyAccess'))
   203          group.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('AWSCodeCommitReadOnly'))
   204          group.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName('IAMUserChangePassword'))
   205  
   206          // Create users and add them to the group
   207          const users = [
   208              'kai.davenport',
   209              'luke.marsden',
   210              'enrico.rotundo',
   211          ]
   212  
   213          const initialPassword = new secretsmanager.Secret(this, 'CanaryOperatorsInitialPassword', {
   214              description: 'Canary Operators Initial Password',
   215          });
   216  
   217          users.forEach(username => {
   218              new iam.User(this, 'OperatorUser' + username, {
   219                  userName: username,
   220                  password: initialPassword.secretValue,
   221                  passwordResetRequired: true,
   222                  groups: [group]
   223              })
   224          })
   225      }
   226  }