import * as cdk from 'aws-cdk-lib';
import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch';
import * as cloudwatchActions from 'aws-cdk-lib/aws-cloudwatch-actions';
import * as events from 'aws-cdk-lib/aws-events';
import * as targets from 'aws-cdk-lib/aws-events-targets';
import * as iam from 'aws-cdk-lib/aws-iam';
import * as lambda from 'aws-cdk-lib/aws-lambda';
import * as lambdaSources from 'aws-cdk-lib/aws-lambda-event-sources';
import * as sns from 'aws-cdk-lib/aws-sns';
import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager';
import {CanaryConfig} from "./config";
import {Size} from "aws-cdk-lib";

/**
 * Settings for one canary scenario: the Lambda that runs it, the schedule
 * that triggers it, and the availability alarm that watches it.
 */
export interface ScenarioProps {
    /** Scenario name; sent to the Lambda as the event payload `{action}` and used to derive construct IDs. */
    readonly action: string;
    /** Lambda timeout, in minutes. */
    readonly timeoutMinutes: number;
    /** Interval between scheduled invocations, in minutes. */
    readonly rateMinutes: number;
    /** Value passed to the Lambda's `memorySize`. */
    readonly memorySize: number;
    /** Lambda ephemeral storage, in MiB. */
    readonly storageSize: number;
    /** Number of periods the alarm evaluates. */
    readonly evaluationPeriods: number;
    /** Number of breaching datapoints, within the evaluated periods, that trigger the alarm. */
    readonly datapointsToAlarm: number;
    /** The alarm breaches when the success rate (0-100) drops below this value. */
    readonly availabilityThreshold: number;
    /** When true, alarm and OK transitions are published to the stack's SNS alarm topic. */
    readonly doAlarm: boolean;
    /** Value of the `LOG_LEVEL` environment variable given to the Lambda. */
    readonly logLevel: string;
}

/** Baseline scenario settings; individual scenarios override a subset of these. */
const DEFAULT_SCENARIO_PROPS: ScenarioProps = {
    action: '_',
    timeoutMinutes: 1,
    rateMinutes: 2,
    memorySize: 256,
    storageSize: 512,
    evaluationPeriods: 5,
    datapointsToAlarm: 3,
    availabilityThreshold: 95,
    doAlarm: true,
    logLevel: 'DEBUG',
};

/** Per-scenario overrides: `action` is mandatory, everything else falls back to the defaults. */
type ScenarioOverrides = Partial<ScenarioProps> & Pick<ScenarioProps, 'action'>;

/**
 * Scenarios deployed by the stack. Order matters: each scenario appends its
 * widgets to the dashboard as it is created, so this is also the dashboard order.
 */
const SCENARIOS: readonly ScenarioOverrides[] = [
    {action: "list"},
    {action: "submit"},
    {action: "submitAndGet", memorySize: 1024},
    {action: "submitAndDescribe"},
    {action: "submitWithConcurrency"},
    {action: "submitWithConcurrencyOwnedNodes"},
    {
        // NOTE(review): storageSize 5012 may be a typo for 5120 (cf. memorySize) - left unchanged, confirm.
        action: "submitDockerIPFSJobAndGet", timeoutMinutes: 5, rateMinutes: 5, memorySize: 5120, storageSize: 5012,
        datapointsToAlarm: 4, evaluationPeriods: 6, doAlarm: false,
    },
];

/** AWS managed policies attached to the operator group, in attachment order. */
const OPERATOR_MANAGED_POLICIES: readonly string[] = [
    'CloudWatchReadOnlyAccess',
    'AWSCloudFormationReadOnlyAccess',
    'AWSLambda_ReadOnlyAccess',
    'AmazonEventBridgeReadOnlyAccess',
    'AmazonEventBridgeSchemasReadOnlyAccess',
    'AWSCodePipeline_ReadOnlyAccess',
    'AWSCodeBuildReadOnlyAccess',
    'AWSCodeDeployReadOnlyAccess',
    'AWSCodeCommitReadOnly',
    'IAMUserChangePassword',
];

/**
 * Upper-cases the first character of an action name. The result prefixes the
 * construct IDs of the scenario's function, rule and alarm, so all of them
 * must derive it the same way.
 */
function toActionTitle(action: string): string {
    return action.charAt(0).toUpperCase() + action.slice(1);
}

/**
 * Stack that deploys the canary: one scheduled Lambda per scenario, a shared
 * CloudWatch dashboard, availability alarms, an SNS-triggered alarm handler
 * Lambda and, optionally, an IAM group with operator users.
 */
export class CanaryStack extends cdk.Stack {
    /** Lambda code supplied through CloudFormation parameters; shared by every function in the stack. */
    public readonly lambdaCode: lambda.CfnParametersCode;
    private readonly config: CanaryConfig;
    private readonly dashboard: cloudwatch.Dashboard;
    private readonly snsAlarmTopic: sns.ITopic;

    constructor(app: cdk.App, id: string, props: cdk.StackProps, config: CanaryConfig) {
        super(app, id, props);

        this.config = config;
        this.lambdaCode = lambda.Code.fromCfnParameters();
        this.dashboard = new cloudwatch.Dashboard(this, "Dashboard", {
            dashboardName: "BacalhauCanary" + this.config.envTitle,
        });
        this.snsAlarmTopic = new sns.Topic(this, 'AlarmTopic');

        this.createLambdaAlarmSlackHandlerFunc();
        for (const overrides of SCENARIOS) {
            this.createLambdaScenarioFunc({...DEFAULT_SCENARIO_PROPS, ...overrides});
        }

        if (config.createOperators) {
            this.createOperatorGroup();
        }
    }

    /**
     * Creates the Lambda subscribed to the SNS alarm topic, together with the
     * secret holding the Slack webhook URL it is allowed to read.
     *
     * @returns the alarm handler function
     */
    private createLambdaAlarmSlackHandlerFunc(): lambda.Function {
        // The secret is created with a placeholder webhook URL.
        const slackSecret = new secretsmanager.Secret(this, 'SlackWebhooksSecret', {
            description: 'Slack webhook URLs',
            secretObjectValue: {
                webhookUrl: cdk.SecretValue.unsafePlainText('https://...'),
            },
        });

        const handler = new lambda.Function(this, 'AlarmHandlerFunction', {
            code: this.lambdaCode,
            handler: 'alarm_handler',
            runtime: lambda.Runtime.GO_1_X,
            timeout: cdk.Duration.minutes(1),
            environment: {
                'DASHBOARD_URL': this.config.dashboardPublicUrl,
                'SLACK_SECRET_NAME': slackSecret.secretName,
            },
        });
        handler.addEventSource(new lambdaSources.SnsEventSource(this.snsAlarmTopic));
        slackSecret.grantRead(handler);
        return handler;
    }

    /**
     * Creates everything for one scenario: the Lambda, the EventBridge rule
     * that invokes it on a fixed rate, its dashboard widgets and its alarm.
     *
     * @param props fully resolved scenario settings
     * @returns the scenario function
     */
    private createLambdaScenarioFunc(props: ScenarioProps): lambda.Function {
        const actionTitle = toActionTitle(props.action);
        const func = new lambda.Function(this, actionTitle + 'Function', {
            code: this.lambdaCode,
            handler: 'scenario_handler',
            runtime: lambda.Runtime.GO_1_X,
            timeout: cdk.Duration.minutes(props.timeoutMinutes),
            memorySize: props.memorySize,
            ephemeralStorageSize: Size.mebibytes(props.storageSize),
            retryAttempts: 0,
            environment: {
                'BACALHAU_DIR': '/tmp', // bacalhau uses $HOME to store configs by default, which doesn't exist in lambda
                'LOG_LEVEL': props.logLevel,
                'BACALHAU_ENVIRONMENT': this.config.bacalhauEnvironment,
            },
        });

        // EventBridge rule: invoke the function on a fixed rate, with no retries.
        const rule = new events.Rule(this, actionTitle + 'EventRule', {
            schedule: events.Schedule.rate(cdk.Duration.minutes(props.rateMinutes)),
        });
        rule.addTarget(new targets.LambdaFunction(func, {
            event: events.RuleTargetInput.fromObject({action: props.action}),
            retryAttempts: 0,
            maxEventAge: cdk.Duration.minutes(1),
        }));

        this.addDashboardWidgets(actionTitle, func);
        this.createAlarm(props, func);
        return func;
    }

    /**
     * Appends a title row and a row of three graphs (invocations, duration,
     * errors with success rate) for `func` to the shared dashboard.
     *
     * @param actionTitle heading shown above the graphs
     * @param func function whose metrics are plotted
     */
    private addDashboardWidgets(actionTitle: string, func: lambda.Function) {
        // Title row spanning the full dashboard width.
        this.dashboard.addWidgets(new cloudwatch.TextWidget({
            markdown: '## ' + actionTitle,
            height: 1,
            width: 24,
        }));

        // Graph row: Invocations, Duration, Errors + success rate.
        this.dashboard.addWidgets(
            new cloudwatch.GraphWidget({
                title: "Invocations",
                left: [func.metricInvocations()],
                width: 8,
            }),
            new cloudwatch.GraphWidget({
                title: "Duration",
                left: [func.metricDuration({label: "[avg: ${AVG}ms, max: ${MAX}ms] Duration"})],
                width: 8,
            }),
            new cloudwatch.GraphWidget({
                title: "Error count and success rate (%)",
                left: [func.metricErrors()],
                right: [this.getAvailabilityMetric(func)],
                rightYAxis: {min: 0, max: 100},
                width: 8,
            }),
        );
    }

    /**
     * Builds the success-rate expression (0-100) for `func` from its error
     * and invocation metrics.
     *
     * @param func function to measure
     * @returns the math expression metric
     */
    private getAvailabilityMetric(func: lambda.Function): cloudwatch.MathExpression {
        return new cloudwatch.MathExpression({
            expression: "100 - 100 * errors / MAX([errors, invocations])",
            label: "[avg: ${AVG}] Success rate",
            usingMetrics: {
                errors: func.metricErrors(),
                invocations: func.metricInvocations(),
            },
        });
    }

    /**
     * Creates the availability alarm for a scenario. The alarm breaches when
     * the success rate is below `props.availabilityThreshold`; missing data
     * counts as breaching. SNS notifications are attached only when
     * `props.doAlarm` is set.
     *
     * @param props scenario settings supplying the alarm parameters
     * @param func function whose availability is monitored
     */
    private createAlarm(props: ScenarioProps, func: lambda.Function) {
        const actionTitle = toActionTitle(props.action);
        const alarm = this.getAvailabilityMetric(func).createAlarm(this, actionTitle + "Alarm", {
            alarmDescription: actionTitle + ' ' + this.config.envTitle + ' Availability',
            threshold: props.availabilityThreshold,
            comparisonOperator: cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
            evaluationPeriods: props.evaluationPeriods,
            datapointsToAlarm: props.datapointsToAlarm,
            treatMissingData: cloudwatch.TreatMissingData.BREACHING,
        });

        if (!props.doAlarm) {
            // The alarm still exists, but nothing is notified.
            return;
        }
        alarm.addAlarmAction(new cloudwatchActions.SnsAction(this.snsAlarmTopic));
        alarm.addOkAction(new cloudwatchActions.SnsAction(this.snsAlarmTopic));
    }

    /**
     * Creates the operators IAM group with its managed policies, plus one IAM
     * user per operator. All users share one generated initial password and
     * must reset it.
     */
    private createOperatorGroup() {
        const group = new iam.Group(this, 'OperatorGroup', {
            groupName: 'BacalhauCanaryOperators-' + this.config.envTitle,
        });

        // Attach the managed policies.
        for (const policyName of OPERATOR_MANAGED_POLICIES) {
            group.addManagedPolicy(iam.ManagedPolicy.fromAwsManagedPolicyName(policyName));
        }

        // Create users and add them to the group.
        const users = [
            'kai.davenport',
            'luke.marsden',
            'enrico.rotundo',
        ];

        const initialPassword = new secretsmanager.Secret(this, 'CanaryOperatorsInitialPassword', {
            description: 'Canary Operators Initial Password',
        });

        for (const username of users) {
            new iam.User(this, 'OperatorUser' + username, {
                userName: username,
                password: initialPassword.secretValue,
                passwordResetRequired: true,
                groups: [group],
            });
        }
    }
}