Add CloudWatch anomaly-detection alarms to the events component.

What this patch does:
- Bumps terraform 1.12.0 -> 1.14.3 (root and events component .tool-versions)
  and trivy 0.61.0 -> 0.69.2 (root .tool-versions).
- Adds four aws_cloudwatch_metric_alarm resources: the Sum of the "Ingestion"
  and "Invocations" metrics in AWS/Events, per EventBusName, for the control
  plane and data plane buses. Each is created only when
  var.enable_event_anomaly_detection is true and alarms when m1 leaves
  ANOMALY_DETECTION_BAND(m1, var.event_anomaly_band_width) in either direction.
- Adds one output per alarm ({ arn, name }, or null when detection is disabled),
  the four input variables, and the matching README rows.

Changes made during review:
- variables.tf: added validation blocks so event_anomaly_period enforces the
  minimum stated in its own description (and stays a multiple of 60), and
  event_anomaly_evaluation_periods is a whole number >= 1. The hunk header was
  updated to match (+130,32 -> +130,44); the post-image hash on that file's
  "index" line is consequently stale.
- Restored one-patch-line-per-line layout. Terraform indentation/alignment was
  re-derived from terraform fmt conventions; apply with --ignore-whitespace or
  regenerate from the branch if context lines fail to match.

Open review notes:
- NOTE(review): README.md hunks were reconstructed from text with inline HTML
  removed. The event_target_arns type cell is re-joined with <pre>/<br/>;
  anchor tags and the three trailing context lines of the second README hunk
  (@@ -43,7 +47,11 @@) were not visible and are NOT restored. Re-run
  terraform-docs rather than applying the README hunks as-is.
- NOTE(review): none of the alarms set alarm_actions / ok_actions. Confirm a
  consumer of the new outputs wires up notification; otherwise the alarms only
  change state silently.
- NOTE(review): treat_missing_data = "notBreaching" treats periods with no
  datapoints as healthy. If the bus publishes no datapoints when traffic stops
  entirely, the lower band will never be breached by a full outage - confirm
  this is intended.
- NOTE(review): confirm that "Ingestion" is a metric actually published in the
  AWS/Events namespace with an EventBusName-only dimension; an alarm on a
  non-existent metric never leaves INSUFFICIENT_DATA.

diff --git a/.tool-versions b/.tool-versions
index 14550dc..ef1c7b1 100644
--- a/.tool-versions
+++ b/.tool-versions
@@ -3,9 +3,9 @@
 gitleaks 8.24.0
 jq 1.6
 nodejs 22.16.0
 pre-commit 3.6.0
-terraform 1.12.0
+terraform 1.14.3
 terraform-docs 0.19.0
-trivy 0.61.0
+trivy 0.69.2
 vale 3.6.0
 python 3.13.2
diff --git a/infrastructure/terraform/components/events/.tool-versions b/infrastructure/terraform/components/events/.tool-versions
index 82cdb4d..52428de 100644
--- a/infrastructure/terraform/components/events/.tool-versions
+++ b/infrastructure/terraform/components/events/.tool-versions
@@ -1 +1 @@
-terraform 1.12.0
+terraform 1.14.3
diff --git a/infrastructure/terraform/components/events/README.md b/infrastructure/terraform/components/events/README.md
index cbfaf27..b1cc46d 100644
--- a/infrastructure/terraform/components/events/README.md
+++ b/infrastructure/terraform/components/events/README.md
@@ -15,7 +15,11 @@
 | [aws\_account\_id](#input\_aws\_account\_id) | The AWS Account ID (numeric) | `string` | n/a | yes |
 | [component](#input\_component) | The variable encapsulating the name of this component | `string` | `"events"` | no |
 | [default\_tags](#input\_default\_tags) | A map of default tags to apply to all taggable resources within the component | `map(string)` | `{}` | no |
+| [enable\_event\_anomaly\_detection](#input\_enable\_event\_anomaly\_detection) | Enable CloudWatch anomaly detection alarms for event bus traffic. Applies to both data and control plane ingestion and invocations. | `bool` | `true` | no |
 | [environment](#input\_environment) | The name of the tfscaffold environment | `string` | n/a | yes |
+| [event\_anomaly\_band\_width](#input\_event\_anomaly\_band\_width) | The width of the anomaly detection band. Higher values (e.g. 4-6) reduce sensitivity and noise, lower values (e.g. 2-3) increase sensitivity. Recommended: 2-4. | `number` | `3` | no |
+| [event\_anomaly\_evaluation\_periods](#input\_event\_anomaly\_evaluation\_periods) | Number of evaluation periods for the anomaly alarm. Each period is defined by event\_anomaly\_period. | `number` | `2` | no |
+| [event\_anomaly\_period](#input\_event\_anomaly\_period) | The period in seconds over which the specified statistic is applied for anomaly detection. Minimum 300 seconds (5 minutes). Recommended: 300-600. | `number` | `300` | no |
 | [event\_publisher\_account\_ids](#input\_event\_publisher\_account\_ids) | An object representing account id's of event publishers | `list(any)` | `[]` | no |
 | [event\_target\_arns](#input\_event\_target\_arns) | A map of event target ARNs keyed by name | <pre>object({<br/>  sms_nudge = string<br/>  notify_core_sns_topic = optional(string, null)<br/>  supplier_api_sns_topic = optional(string, null)<br/>  app_response = optional(string, null)<br/>  client_callbacks = optional(string, null)<br/>})</pre> | n/a | yes |
 | [force\_lambda\_code\_deploy](#input\_force\_lambda\_code\_deploy) | If the lambda package in s3 has the same commit id tag as the terraform build branch, the lambda will not update automatically. Set to True if making changes to Lambda code from on the same commit for example during development | `bool` | `false` | no |
@@ -43,7 +47,11 @@
 | Name | Description |
 |------|-------------|
 | [control\_plane\_event\_bus](#output\_control\_plane\_event\_bus) | n/a |
+| [control\_plane\_ingestion\_anomaly\_alarm](#output\_control\_plane\_ingestion\_anomaly\_alarm) | Control plane ingestion anomaly detection alarm details |
+| [control\_plane\_invocations\_anomaly\_alarm](#output\_control\_plane\_invocations\_anomaly\_alarm) | Control plane invocations anomaly detection alarm details |
 | [data\_plane\_event\_bus](#output\_data\_plane\_event\_bus) | n/a |
+| [data\_plane\_ingestion\_anomaly\_alarm](#output\_data\_plane\_ingestion\_anomaly\_alarm) | Data plane ingestion anomaly detection alarm details |
+| [data\_plane\_invocations\_anomaly\_alarm](#output\_data\_plane\_invocations\_anomaly\_alarm) | Data plane invocations anomaly detection alarm details |
diff --git a/infrastructure/terraform/components/events/cloudwatch_metric_alarm_control_plane_ingestion_anomaly.tf b/infrastructure/terraform/components/events/cloudwatch_metric_alarm_control_plane_ingestion_anomaly.tf
new file mode 100644
index 0000000..8119c8f
--- /dev/null
+++ b/infrastructure/terraform/components/events/cloudwatch_metric_alarm_control_plane_ingestion_anomaly.tf
@@ -0,0 +1,40 @@
+resource "aws_cloudwatch_metric_alarm" "control_plane_ingestion_anomaly" {
+  count = var.enable_event_anomaly_detection ? 1 : 0
+
+  alarm_name          = "${local.csi}-control-plane-ingestion-anomaly"
+  alarm_description   = "RELIABILITY: Detects anomalous patterns in events ingested to the control plane event bus"
+  comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
+  evaluation_periods  = var.event_anomaly_evaluation_periods
+  threshold_metric_id = "ad1"
+  treat_missing_data  = "notBreaching"
+
+  metric_query {
+    id          = "m1"
+    return_data = true
+
+    metric {
+      metric_name = "Ingestion"
+      namespace   = "AWS/Events"
+      period      = var.event_anomaly_period
+      stat        = "Sum"
+
+      dimensions = {
+        EventBusName = aws_cloudwatch_event_bus.control_plane.name
+      }
+    }
+  }
+
+  metric_query {
+    id          = "ad1"
+    expression  = "ANOMALY_DETECTION_BAND(m1, ${var.event_anomaly_band_width})"
+    label       = "Ingestion (expected)"
+    return_data = true
+  }
+
+  tags = merge(
+    local.default_tags,
+    {
+      Name = "${local.csi}-control-plane-ingestion-anomaly"
+    }
+  )
+}
diff --git a/infrastructure/terraform/components/events/cloudwatch_metric_alarm_control_plane_invocations_anomaly.tf b/infrastructure/terraform/components/events/cloudwatch_metric_alarm_control_plane_invocations_anomaly.tf
new file mode 100644
index 0000000..b30913a
--- /dev/null
+++ b/infrastructure/terraform/components/events/cloudwatch_metric_alarm_control_plane_invocations_anomaly.tf
@@ -0,0 +1,40 @@
+resource "aws_cloudwatch_metric_alarm" "control_plane_invocations_anomaly" {
+  count = var.enable_event_anomaly_detection ? 1 : 0
+
+  alarm_name          = "${local.csi}-control-plane-invocations-anomaly"
+  alarm_description   = "RELIABILITY: Detects anomalous patterns in events delivered from the control plane event bus to targets"
+  comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
+  evaluation_periods  = var.event_anomaly_evaluation_periods
+  threshold_metric_id = "ad1"
+  treat_missing_data  = "notBreaching"
+
+  metric_query {
+    id          = "m1"
+    return_data = true
+
+    metric {
+      metric_name = "Invocations"
+      namespace   = "AWS/Events"
+      period      = var.event_anomaly_period
+      stat        = "Sum"
+
+      dimensions = {
+        EventBusName = aws_cloudwatch_event_bus.control_plane.name
+      }
+    }
+  }
+
+  metric_query {
+    id          = "ad1"
+    expression  = "ANOMALY_DETECTION_BAND(m1, ${var.event_anomaly_band_width})"
+    label       = "Invocations (expected)"
+    return_data = true
+  }
+
+  tags = merge(
+    local.default_tags,
+    {
+      Name = "${local.csi}-control-plane-invocations-anomaly"
+    }
+  )
+}
diff --git a/infrastructure/terraform/components/events/cloudwatch_metric_alarm_data_plane_ingestion_anomaly.tf b/infrastructure/terraform/components/events/cloudwatch_metric_alarm_data_plane_ingestion_anomaly.tf
new file mode 100644
index 0000000..5eed057
--- /dev/null
+++ b/infrastructure/terraform/components/events/cloudwatch_metric_alarm_data_plane_ingestion_anomaly.tf
@@ -0,0 +1,40 @@
+resource "aws_cloudwatch_metric_alarm" "data_plane_ingestion_anomaly" {
+  count = var.enable_event_anomaly_detection ? 1 : 0
+
+  alarm_name          = "${local.csi}-data-plane-ingestion-anomaly"
+  alarm_description   = "RELIABILITY: Detects anomalous patterns in events ingested to the data plane event bus"
+  comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
+  evaluation_periods  = var.event_anomaly_evaluation_periods
+  threshold_metric_id = "ad1"
+  treat_missing_data  = "notBreaching"
+
+  metric_query {
+    id          = "m1"
+    return_data = true
+
+    metric {
+      metric_name = "Ingestion"
+      namespace   = "AWS/Events"
+      period      = var.event_anomaly_period
+      stat        = "Sum"
+
+      dimensions = {
+        EventBusName = aws_cloudwatch_event_bus.data_plane.name
+      }
+    }
+  }
+
+  metric_query {
+    id          = "ad1"
+    expression  = "ANOMALY_DETECTION_BAND(m1, ${var.event_anomaly_band_width})"
+    label       = "Ingestion (expected)"
+    return_data = true
+  }
+
+  tags = merge(
+    local.default_tags,
+    {
+      Name = "${local.csi}-data-plane-ingestion-anomaly"
+    }
+  )
+}
diff --git a/infrastructure/terraform/components/events/cloudwatch_metric_alarm_data_plane_invocations_anomaly.tf b/infrastructure/terraform/components/events/cloudwatch_metric_alarm_data_plane_invocations_anomaly.tf
new file mode 100644
index 0000000..d029fe6
--- /dev/null
+++ b/infrastructure/terraform/components/events/cloudwatch_metric_alarm_data_plane_invocations_anomaly.tf
@@ -0,0 +1,40 @@
+resource "aws_cloudwatch_metric_alarm" "data_plane_invocations_anomaly" {
+  count = var.enable_event_anomaly_detection ? 1 : 0
+
+  alarm_name          = "${local.csi}-data-plane-invocations-anomaly"
+  alarm_description   = "RELIABILITY: Detects anomalous patterns in events delivered from the data plane event bus to targets"
+  comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
+  evaluation_periods  = var.event_anomaly_evaluation_periods
+  threshold_metric_id = "ad1"
+  treat_missing_data  = "notBreaching"
+
+  metric_query {
+    id          = "m1"
+    return_data = true
+
+    metric {
+      metric_name = "Invocations"
+      namespace   = "AWS/Events"
+      period      = var.event_anomaly_period
+      stat        = "Sum"
+
+      dimensions = {
+        EventBusName = aws_cloudwatch_event_bus.data_plane.name
+      }
+    }
+  }
+
+  metric_query {
+    id          = "ad1"
+    expression  = "ANOMALY_DETECTION_BAND(m1, ${var.event_anomaly_band_width})"
+    label       = "Invocations (expected)"
+    return_data = true
+  }
+
+  tags = merge(
+    local.default_tags,
+    {
+      Name = "${local.csi}-data-plane-invocations-anomaly"
+    }
+  )
+}
diff --git a/infrastructure/terraform/components/events/outputs.tf b/infrastructure/terraform/components/events/outputs.tf
index e90c7aa..fb678ab 100644
--- a/infrastructure/terraform/components/events/outputs.tf
+++ b/infrastructure/terraform/components/events/outputs.tf
@@ -11,3 +11,35 @@ output "data_plane_event_bus" {
     arn = aws_cloudwatch_event_bus.data_plane.arn
   }
 }
+
+output "data_plane_ingestion_anomaly_alarm" {
+  description = "Data plane ingestion anomaly detection alarm details"
+  value = var.enable_event_anomaly_detection ? {
+    arn  = aws_cloudwatch_metric_alarm.data_plane_ingestion_anomaly[0].arn
+    name = aws_cloudwatch_metric_alarm.data_plane_ingestion_anomaly[0].alarm_name
+  } : null
+}
+
+output "data_plane_invocations_anomaly_alarm" {
+  description = "Data plane invocations anomaly detection alarm details"
+  value = var.enable_event_anomaly_detection ? {
+    arn  = aws_cloudwatch_metric_alarm.data_plane_invocations_anomaly[0].arn
+    name = aws_cloudwatch_metric_alarm.data_plane_invocations_anomaly[0].alarm_name
+  } : null
+}
+
+output "control_plane_ingestion_anomaly_alarm" {
+  description = "Control plane ingestion anomaly detection alarm details"
+  value = var.enable_event_anomaly_detection ? {
+    arn  = aws_cloudwatch_metric_alarm.control_plane_ingestion_anomaly[0].arn
+    name = aws_cloudwatch_metric_alarm.control_plane_ingestion_anomaly[0].alarm_name
+  } : null
+}
+
+output "control_plane_invocations_anomaly_alarm" {
+  description = "Control plane invocations anomaly detection alarm details"
+  value = var.enable_event_anomaly_detection ? {
+    arn  = aws_cloudwatch_metric_alarm.control_plane_invocations_anomaly[0].arn
+    name = aws_cloudwatch_metric_alarm.control_plane_invocations_anomaly[0].alarm_name
+  } : null
+}
diff --git a/infrastructure/terraform/components/events/variables.tf b/infrastructure/terraform/components/events/variables.tf
index d2c7eef..99863c7 100644
--- a/infrastructure/terraform/components/events/variables.tf
+++ b/infrastructure/terraform/components/events/variables.tf
@@ -130,3 +130,44 @@ variable "notify_core_sns_kms_arn" {
   type    = string
   default = null
 }
+
+variable "enable_event_anomaly_detection" {
+  type        = bool
+  description = "Enable CloudWatch anomaly detection alarms for event bus traffic. Applies to both data and control plane ingestion and invocations."
+  default     = true
+}
+
+variable "event_anomaly_evaluation_periods" {
+  type        = number
+  description = "Number of evaluation periods for the anomaly alarm. Each period is defined by event_anomaly_period."
+  default     = 2
+
+  # Reject zero, negative and fractional values at plan time instead of at the CloudWatch API.
+  validation {
+    condition     = var.event_anomaly_evaluation_periods >= 1 && floor(var.event_anomaly_evaluation_periods) == var.event_anomaly_evaluation_periods
+    error_message = "Evaluation periods must be a whole number greater than or equal to 1."
+  }
+}
+
+variable "event_anomaly_period" {
+  type        = number
+  description = "The period in seconds over which the specified statistic is applied for anomaly detection. Minimum 300 seconds (5 minutes). Recommended: 300-600."
+  default     = 300
+
+  # Enforce the minimum promised in the description; periods of a minute or more must also be multiples of 60.
+  validation {
+    condition     = var.event_anomaly_period >= 300 && var.event_anomaly_period % 60 == 0
+    error_message = "Period must be at least 300 seconds and a multiple of 60."
+  }
+}
+
+variable "event_anomaly_band_width" {
+  type        = number
+  description = "The width of the anomaly detection band. Higher values (e.g. 4-6) reduce sensitivity and noise, lower values (e.g. 2-3) increase sensitivity. Recommended: 2-4."
+  default     = 3
+
+  validation {
+    condition     = var.event_anomaly_band_width >= 2 && var.event_anomaly_band_width <= 10
+    error_message = "Band width must be between 2 and 10"
+  }
+}