diff --git a/.gitignore b/.gitignore index ff7161e57..ed34f7842 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ node_modules # draw.io temp files .$*.bkp -.$*.dtmp \ No newline at end of file +.$*.dtmp +venv \ No newline at end of file diff --git a/metrics-collector/.ceignore b/metrics-collector/.ceignore new file mode 100644 index 000000000..a5ed506b1 --- /dev/null +++ b/metrics-collector/.ceignore @@ -0,0 +1,2 @@ +images/ +setup/ \ No newline at end of file diff --git a/metrics-collector/Dockerfile b/metrics-collector/Dockerfile index 111dd9481..83a2322c6 100644 --- a/metrics-collector/Dockerfile +++ b/metrics-collector/Dockerfile @@ -1,11 +1,51 @@ -FROM quay.io/projectquay/golang:1.23 AS build-env +# Stage 1: Build Go binary +FROM quay.io/projectquay/golang:1.25 AS go-builder WORKDIR /go/src/app -COPY . . - +COPY go.mod go.sum ./ RUN go mod download -RUN CGO_ENABLED=0 go build -o /go/bin/app main.go +COPY main.go ./ +RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o app main.go + +# Stage 2: Download and extract Prometheus +FROM busybox:1.36-glibc AS prometheus-downloader +ARG PROMETHEUS_VERSION=3.10.0 +ARG TARGETARCH=amd64 + +WORKDIR /tmp +RUN wget https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}.tar.gz && \ + tar xzf prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}.tar.gz && \ + mv prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}/prometheus /prometheus + +# Stage 3: Get CA certificates +FROM alpine:latest AS certs +RUN apk --no-cache add ca-certificates + +# Stage 4: Runtime image +FROM busybox:1.36-glibc + +# Copy CA certificates for TLS verification +COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt + +# Copy Go binary +COPY --from=go-builder /go/src/app/app /app + +# Copy Prometheus binary +COPY --from=prometheus-downloader /prometheus /bin/prometheus + +# Copy configuration and scripts +COPY 
prometheus.yml.template /etc/prometheus/prometheus.yml.template +COPY start.sh /start.sh +RUN chmod +x /start.sh + +# Create necessary directories with proper permissions +RUN mkdir -p /tmp/agent-data && \ + mkdir -p /etc/secrets && \ + chmod 777 /tmp/agent-data + +# Set SSL certificate path environment variable +ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt + +# Use non-root user +USER 1000:1000 -# Copy the exe into a smaller base image -FROM gcr.io/distroless/static-debian12 -COPY --from=build-env /go/bin/app / -CMD ["/app"] +ENTRYPOINT ["/start.sh"] diff --git a/metrics-collector/README.md b/metrics-collector/README.md index 9abb6844a..02b3cd47d 100644 --- a/metrics-collector/README.md +++ b/metrics-collector/README.md @@ -1,59 +1,132 @@ # IBM Cloud Code Engine - Metrics Collector -Code Engine job that demonstrates how to collect resource metrics (CPU, memory and disk usage) of running Code Engine apps, jobs, and builds +Code Engine job that demonstrates how to collect resource metrics (CPU, memory and disk usage) of running Code Engine apps, jobs, and builds. Those metrics can either be rendered + +in **IBM Cloud Monitoring** (see [instructions](#send-metrics-to-ibm-cloud-monitoring)) + +![](./images/monitoring-dashboard-ce-component-resources.png) + +or in **IBM Cloud Logs** (see [instructions](#ibm-cloud-logs-setup)) ![Dashboard overview](./images/icl-dashboard-overview.png) -## Installation -### Capture metrics every n seconds +## Send metrics to IBM Cloud Monitoring -* Create Code Engine job template -``` -$ ibmcloud ce job create \ - --name metrics-collector \ - --src . \ - --mode daemon \ - --cpu 0.25 \ - --memory 0.5G \ - --wait -``` +### How It Works -* Submit a daemon job that collects metrics in an endless loop. The daemon job queries the Metrics API every 30 seconds -``` -$ ibmcloud ce jobrun submit \ - --job metrics-collector \ - --env INTERVAL=30 +![](./images/metrics-collector.overview.png) + +1. 
The metrics collector exposes Prometheus metrics on `localhost:9100/metrics` +2. The embedded Prometheus agent scrapes these metrics every 30 seconds +3. The agent also discovers and scrapes pods with the `codeengine.cloud.ibm.com/userMetricsScrape: 'true'` annotation +4. All metrics are forwarded to IBM Cloud Monitoring via remote write +5. If either the collector or Prometheus agent crashes, the container exits with a non-zero code to trigger a restart + +### Setup Instructions + +**Step 1:** You need an IBM Cloud Monitoring instance +```bash +REGION= +MONITORING_INSTANCE_NAME="" +MONITORING_INSTANCE_GUID=$(ibmcloud resource service-instance "$MONITORING_INSTANCE_NAME" -o JSON|jq -r '.[0].guid') +echo "MONITORING_INSTANCE_GUID: '$MONITORING_INSTANCE_GUID'" ``` +**Step 2:** The collector must run in a Code Engine project +```bash +# Create new Code Engine project +ibmcloud ce project create --name +# Select an existing Code Engine project +ibmcloud ce project select --name +``` -### Capture metrics every n minutes +**Step 3:** Create a secret with your IBM Cloud Monitoring API token +```bash +# Obtain the Monitoring API token of the IBM Cloud Monitoring instance +# using the IAM access token of the current IBM CLI Session +MONITORING_INSTANCE_MONITORING_API_KEY=$(curl --silent -X GET https://$REGION.monitoring.cloud.ibm.com/api/token -H "Authorization: $(ibmcloud iam oauth-tokens --output JSON|jq -r '.iam_token')" -H "IBMInstanceID: $MONITORING_INSTANCE_GUID" -H "content-type: application/json"|jq -r '.token.key') -* Create Code Engine job template +# Create a Code Engine secret that stores the Monitoring API Key +ibmcloud ce secret create \ + --name monitoring-apikey \ + --from-literal monitoring-apikey=$MONITORING_INSTANCE_MONITORING_API_KEY ``` -$ ibmcloud ce job create \ + +**Step 4:** Create your metrics-collector job with the required configuration +```bash +ibmcloud ce job create \ --name metrics-collector \ - --src . \ - --mode task \ + --src "." 
\ + --mode daemon \ --cpu 0.25 \ --memory 0.5G \ - --wait + --service-account reader \ + --build-size xlarge \ + --env INTERVAL=30 \ + --env METRICS_ENABLED=true \ + --env METRICS_REMOTE_WRITE_FQDN=ingest.prws.private.${REGION}.monitoring.cloud.ibm.com \ + --env CE_PROJECT_NAME="$(ibmcloud ce proj current --output json|jq -r '.name')" \ + --mount-secret /etc/secrets=monitoring-apikey ``` -* Submit a Code Engine cron subscription that triggers the metrics collector every minute to query the Metrics API +**Step 5:** Submit a daemon job run +```bash +ibmcloud ce jobrun submit \ + --job metrics-collector ``` -$ ibmcloud ce subscription cron create \ - --name collect-metrics-every-minute \ - --destination-type job \ - --destination metrics-collector \ - --schedule '*/1 * * * *' + +**Step 6:** Import the "IBM Cloud Code Engine - Component Resource Overview" dashboard +```bash +# Load the most recent dashboard configuration +CE_MONITORING_DASHBOARD=$(curl -sL https://raw.githubusercontent.com/IBM/CodeEngine/main/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json) + +# Import the dashboard +curl -X POST https://$REGION.monitoring.cloud.ibm.com/api/v3/dashboards \ + -H "Authorization: $(ibmcloud iam oauth-tokens --output JSON|jq -r '.iam_token')" \ + -H "IBMInstanceID: $MONITORING_INSTANCE_GUID" \ + -H "Content-Type: application/json" \ + -d "{\"dashboard\": $CE_MONITORING_DASHBOARD}" + ``` -## Configuration +**Note:** A more elaborate approach to manage custom Cloud Monitoring dashboards can be found [here](setup/ibm-cloud-monitoring/README.md) + -Per default the metrics collector collects memory and CPU statistics, like `usage`, `current` and `configured`. +#### Exposed Metrics -One can use the environment variable `COLLECT_DISKUSAGE=true` to also collect the amount of disk space that is used. 
Please note, the metrics collector can only calculate the overall file size stored in the pods filesystem which includes files that are part of the container image, the epheremal storage as well as mounted COS buckets. Hence, this metric cannot be used to calculate the ephemeral storage usage. +The following Prometheus metrics are exposed as gauges: + +Container Metrics: +- **`ibm_codeengine_instance_cpu_usage_millicores`**: Current CPU usage in millicores +- **`ibm_codeengine_instance_cpu_limit_millicores`**: Configured CPU limit in millicores +- **`ibm_codeengine_instance_memory_usage_bytes`**: Current memory usage in bytes +- **`ibm_codeengine_instance_memory_limit_bytes`**: Configured memory limit in bytes +- **`ibm_codeengine_instance_ephemeral_storage_usage_bytes`**: Current ephemeral storage usage in bytes (if `COLLECT_DISKUSAGE=true`) + +The following 3 metrics are used to monitor the collector itself: +- **`ibm_codeengine_collector_collection_duration_seconds`**: Time taken to collect metrics in seconds (if `METRICS_INTERNAL_STATS=true`) +- **`ibm_codeengine_collector_last_collection_timestamp_seconds`**: Unix timestamp of last successful collection (if `METRICS_INTERNAL_STATS=true`) +- **`ibm_codeengine_collector_collection_errors_total`**: Total number of collection errors (counter) (if `METRICS_INTERNAL_STATS=true`) + +#### Metric Labels + +All container metrics include the following labels: +- `ibm_codeengine_instance_name`: Name of the pod instance +- `ibm_codeengine_component_type`: Type of component (`app`, `job`, or `build`) +- `ibm_codeengine_component_name`: Name of the Code Engine component + +#### Example Metrics Output + +```prometheus +# HELP ibm_codeengine_instance_cpu_usage_millicores Current CPU usage in millicores +# TYPE ibm_codeengine_instance_cpu_usage_millicores gauge 
+ibm_codeengine_instance_cpu_usage_millicores{ibm_codeengine_instance_name="myapp-00001-deployment-abc123",ibm_codeengine_component_type="app",ibm_codeengine_component_name="myapp"} 250 + +# HELP ibm_codeengine_instance_memory_usage_bytes Current memory usage in bytes +# TYPE ibm_codeengine_instance_memory_usage_bytes gauge +ibm_codeengine_instance_memory_usage_bytes{ibm_codeengine_instance_name="myapp-00001-deployment-abc123",ibm_codeengine_component_type="app",ibm_codeengine_component_name="myapp"} 134217728 +``` ## IBM Cloud Logs setup @@ -71,7 +144,7 @@ Follow the steps below to create a custom dashboard in your IBM Cloud Logs insta ![New dashboard](./images/icl-dashboard-new.png) -* In the "Import" modal, select the file [./setup/dashboard-code_engine_resource_consumption_metrics.json](./setup/dashboard-code_engine_resource_consumption_metrics.json) located in this repository, and click "Import" +* In the "Import" modal, select the file [./setup/ibm-cloud-logs/dashboard-code_engine_resource_consumption_metrics.json](./setup/ibm-cloud-logs/dashboard-code_engine_resource_consumption_metrics.json) located in this repository, and click "Import" ![Import modal](./images/icl-dashboard-import.png) @@ -117,8 +190,6 @@ app:"codeengine" AND message.metric:"instance-resources" * In the top-right corner, select `1-line` as view mode -![View](./images/icl-logs-view-mode.png) - * In the graph title it says "**Count** all grouped by **Severity**". Click on `Severity` and select `message.component_name` instead. 
Furthermore, select `Max` as aggregation metric and choose `message.memory.usage` as aggregation field ![Graph](./images/icl-logs-view-graph.png) @@ -132,54 +203,16 @@ app:"codeengine" AND message.metric:"instance-resources" ![Logs overview](./images/icl-logs-view-overview.png) -## IBM Log Analysis setup (deprecated) - -### Log lines - -Along with a human readable message, like `Captured metrics of app instance 'load-generator-00001-deployment-677d5b7754-ktcf6': 3m vCPU, 109 MB memory, 50 MB ephemeral storage`, each log line passes specific resource utilization details in a structured way allowing to apply advanced filters on them. - -E.g. -- `cpu.usage:>80`: Filter for all log lines that noticed a CPU utilization of 80% or higher -- `memory.current:>1000`: Filter for all log lines that noticed an instance that used 1GB or higher of memory -- `component_type:app`: Filter only for app instances. Possible values are `app`, `job`, and `build` -- `component_name:`: Filter for all instances of a specific app, job, or build -- `name:`: Filter for a specific instance - -![IBM Cloud Logs](./images/ibm-cloud-logs--loglines.png) - -### Log graphs - -Best is to create IBM Cloud Logs Board, in order to visualize the CPU and Memory usage per Code Engine component. - -1. In your log instance navigate to Boards -1. Give it a proper name, enter `metric:instance-resources` as query and submit by clicking `Add Graph` -![New Board](./images/new-board.png) -1. Now the graph shows the overall amount of logs captured for the specified query per time interval -![Count of metrics log lines ](./images/count-of-metrics-lines.png) -1. Click on the filter icon above the graph and put in `metric:instance-resources AND component_name:` -1. Switch the metric of the Graph to `Maximums` -1. Below the graph Add a new plot`cpu.usage` as field and choose `ANY` as field values -![Configure Graph plots](./images/configure-plots.png) -1. Add another plot for the field `memory.usage` and values `ANY` -1. 
Finally delete the plot `metrics:instance-resources` and adjust the plot colors to your likings -![Resource Usage graph](./images/resource-usage-graph.png) -1. The usage graph above renders the utilization in % of the CPU and Memory - -#### Add CPU utilization -1. Duplicate the graph, change its name to CPU and replace its plots with `cpu.configured` and `cpu.current`. -- The resulting graph will render the actual CPU usage compared to the configured limit. The the unit is milli vCPUs (1000 -> 1 vCPU). -![](./images/cpu-utilization.png) - - -#### Add memory utilization -1. Duplicate the graph, change its name to Memory and replace its plots with `memory.configured` and `memory.current`. -1. The resulting graph will render the actual memory usage compared to the configured limit. The the unit is MB (1000 -> 1 GB). -![](./images/memory-utilization.png) - +### Troubleshooting & Configuration +If the container fails to start with `METRICS_ENABLED=true`, check the logs for: +- Missing `/etc/secrets/monitoring-apikey` file +- Missing or wrong `METRICS_REMOTE_WRITE_FQDN` environment variable -#### Add disk utilization -1. Duplicate the graph or create a new one, change its name to "Disk usage" and replace its plots with `disk_usage.current`. -1. The resulting graph will render the actual disk usage. While this does not allow to identify the usage of disk space compared with the configured ephemeral storage limit, this graph gives an impression on whether the disk usage is growing over time. The the unit is MB (1000 -> 1 GB). -![](./images/disk-utilization.png) +#### Environment Variables +- **`INTERVAL`** (default: `30`): Collection interval in seconds (minimum 30 seconds). Controls how frequently metrics are collected from the Kubernetes API endpoint in daemon mode. +- **`COLLECT_DISKUSAGE`** (default: `false`): Set to `true` to collect disk space usage. 
Note: The metrics collector calculates the overall file size stored in the pod's filesystem, which includes files from the container image, ephemeral storage, and mounted COS buckets. This metric cannot be used to calculate ephemeral storage usage alone. +- **`METRICS_ENABLED`** (default: `false`): Set to `true` to enable the HTTP metrics server. When disabled, the collector still runs and logs metrics to stdout but does not expose the HTTP endpoint. +- **`METRICS_REMOTE_WRITE_FQDN`**: IBM Cloud Monitoring ingestion endpoint FQDN (required when `METRICS_ENABLED=true`) +- **`METRICS_PORT`** (default: `9100`): HTTP server port for the Prometheus metrics endpoint. Only used when `METRICS_ENABLED=true` in daemon mode. diff --git a/metrics-collector/docs/metrics-collector.drawio b/metrics-collector/docs/metrics-collector.drawio new file mode 100644 index 000000000..8c61e9815 --- /dev/null +++ b/metrics-collector/docs/metrics-collector.drawio @@ -0,0 +1,199 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/metrics-collector/go.mod b/metrics-collector/go.mod index b816d4e37..e358afe92 100644 --- a/metrics-collector/go.mod +++ b/metrics-collector/go.mod @@ -1,6 +1,6 @@ module metrics-collector -go 1.23.0 +go 1.25.0 require ( k8s.io/api v0.30.1 @@ -31,7 +31,7 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect - golang.org/x/net v0.38.0 // indirect + golang.org/x/net v0.23.0 // indirect golang.org/x/oauth2 v0.27.0 // indirect golang.org/x/sys v0.31.0 // 
indirect golang.org/x/term v0.30.0 // indirect @@ -42,7 +42,7 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/klog/v2 v2.120.1 // indirect - k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f // indirect + k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect diff --git a/metrics-collector/go.sum b/metrics-collector/go.sum index 6f6389a87..f2758c0dc 100644 --- a/metrics-collector/go.sum +++ b/metrics-collector/go.sum @@ -14,8 +14,7 @@ github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDsl github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= -github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= -github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= @@ -28,8 +27,8 @@ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof 
v0.0.0-20240424215950-a892ee059fd6 h1:k7nVchz72niMH6YLQNvHSdIE7iqsQxK1P41mySCvssg= -github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= @@ -58,10 +57,10 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= -github.com/onsi/ginkgo/v2 v2.17.2 h1:7eMhcy3GimbsA3hEnVKdw/PQM9XN9krpKVXsZdph0/g= -github.com/onsi/ginkgo/v2 v2.17.2/go.mod h1:nP2DPOQoNsQmsVyv5rDA8JkXQoCs6goXIvr/PRJ1eCc= -github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= -github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= +github.com/onsi/ginkgo/v2 v2.15.0 h1:79HwNRBAZHOEwrczrgSOPy+eFTTlIGELKy5as+ClttY= +github.com/onsi/ginkgo/v2 v2.15.0/go.mod h1:HlxMHtYF57y6Dpf+mc5529KKmSq9h2FpCF+/ZkwUxKM= +github.com/onsi/gomega v1.31.0 h1:54UJxxj6cPInHS3a35wm6BK/F9nHYueZ1NVujHDrnXE= +github.com/onsi/gomega v1.31.0/go.mod h1:DW9aCi7U6Yi40wNVAvT6kzFnEVEI5n3DloYBiKiT6zk= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 
github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= @@ -83,8 +82,8 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -133,8 +132,8 @@ k8s.io/client-go v0.30.1 h1:uC/Ir6A3R46wdkgCV3vbLyNOYyCJ8oZnjtJGKfytl/Q= k8s.io/client-go v0.30.1/go.mod h1:wrAqLNs2trwiCH/wxxmT/x3hKVH9PuV0GGW0oDoHVqc= k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= k8s.io/klog/v2 v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f h1:0LQagt0gDpKqvIkAMPaRGcXawNMouPECM1+F9BVxEaM= -k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f/go.mod h1:S9tOR0FxgyusSNR+MboCuiDpVWkAifZvaYI1Q2ubgro= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= k8s.io/kubectl v0.30.1 h1:sHFIRI3oP0FFZmBAVEE8ErjnTyXDPkBcvO88mH9RjuY= k8s.io/kubectl v0.30.1/go.mod h1:7j+L0Cc38RYEcx+WH3y44jRBe1Q1jxdGPKkX0h4iDq0= 
k8s.io/metrics v0.30.1 h1:PeA9cP0kxVtaC8Wkzp4sTkr7YSkd9R0UYP6cCHOOY1M= diff --git a/metrics-collector/images/configure-plots.png b/metrics-collector/images/configure-plots.png deleted file mode 100644 index ceffd258b..000000000 Binary files a/metrics-collector/images/configure-plots.png and /dev/null differ diff --git a/metrics-collector/images/count-of-metrics-lines.png b/metrics-collector/images/count-of-metrics-lines.png deleted file mode 100644 index e274f48b1..000000000 Binary files a/metrics-collector/images/count-of-metrics-lines.png and /dev/null differ diff --git a/metrics-collector/images/cpu-utilization.png b/metrics-collector/images/cpu-utilization.png deleted file mode 100644 index 4329d5714..000000000 Binary files a/metrics-collector/images/cpu-utilization.png and /dev/null differ diff --git a/metrics-collector/images/ibm-cloud-logs--loglines.png b/metrics-collector/images/ibm-cloud-logs--loglines.png deleted file mode 100644 index e865c935c..000000000 Binary files a/metrics-collector/images/ibm-cloud-logs--loglines.png and /dev/null differ diff --git a/metrics-collector/images/memory-utilization.png b/metrics-collector/images/memory-utilization.png deleted file mode 100644 index d1b533810..000000000 Binary files a/metrics-collector/images/memory-utilization.png and /dev/null differ diff --git a/metrics-collector/images/metrics-collector.overview.png b/metrics-collector/images/metrics-collector.overview.png new file mode 100644 index 000000000..89535b90c Binary files /dev/null and b/metrics-collector/images/metrics-collector.overview.png differ diff --git a/metrics-collector/images/monitoring-dashboard-ce-component-resources.png b/metrics-collector/images/monitoring-dashboard-ce-component-resources.png new file mode 100644 index 000000000..c6b2a282f Binary files /dev/null and b/metrics-collector/images/monitoring-dashboard-ce-component-resources.png differ diff --git a/metrics-collector/images/new-board.png b/metrics-collector/images/new-board.png 
deleted file mode 100644 index b95e182e5..000000000 Binary files a/metrics-collector/images/new-board.png and /dev/null differ diff --git a/metrics-collector/images/resource-usage-graph.png b/metrics-collector/images/resource-usage-graph.png deleted file mode 100644 index 47eb80be4..000000000 Binary files a/metrics-collector/images/resource-usage-graph.png and /dev/null differ diff --git a/metrics-collector/main.go b/metrics-collector/main.go index 63a2ba867..2bb5c1db8 100644 --- a/metrics-collector/main.go +++ b/metrics-collector/main.go @@ -5,10 +5,14 @@ import ( "context" "encoding/json" "fmt" + "net/http" "os" + "os/signal" "strconv" "strings" "sync" + "sync/atomic" + "syscall" "time" v1 "k8s.io/api/core/v1" @@ -23,13 +27,180 @@ import ( metricsv "k8s.io/metrics/pkg/client/clientset/versioned" ) -func main() { +// MetricsCache holds the latest collected metrics in a thread-safe manner +type MetricsCache struct { + mu sync.RWMutex + metrics []InstanceResourceStats + namespace string + lastUpdate time.Time + collectionCount int64 + errorCount int64 +} + +// CollectorStats tracks collector performance metrics +type CollectorStats struct { + lastCollectionDuration atomic.Int64 // in milliseconds + lastCollectionTime atomic.Int64 // unix timestamp + totalErrors atomic.Int64 +} + +var ( + metricsCache = &MetricsCache{} + collectorStats = &CollectorStats{} +) + +// setupHTTPHandlers configures the HTTP routes +func setupHTTPHandlers() http.Handler { + mux := http.NewServeMux() + mux.HandleFunc("/metrics", metricsHandler) + return mux +} + +// metricsHandler serves Prometheus-formatted metrics +func metricsHandler(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + metricsCache.mu.RLock() + metrics := metricsCache.metrics + namespace := metricsCache.namespace + lastUpdate := metricsCache.lastUpdate + metricsCache.mu.RUnlock() + + // Set content type for 
Prometheus + w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + + // Write Prometheus metrics + output := formatPrometheusMetrics(metrics, namespace, lastUpdate) + w.Write([]byte(output)) +} + +// formatPrometheusMetrics converts metrics to Prometheus text format +func formatPrometheusMetrics(metrics []InstanceResourceStats, namespace string, lastUpdate time.Time) string { + var sb strings.Builder + + // Helper function to escape label values + escapeLabelValue := func(s string) string { + s = strings.ReplaceAll(s, "\\", "\\\\") + s = strings.ReplaceAll(s, "\"", "\\\"") + s = strings.ReplaceAll(s, "\n", "\\n") + return s + } + + // Write container CPU usage metrics + sb.WriteString("# HELP ibm_codeengine_instance_cpu_usage_millicores Current CPU usage in millicores\n") + sb.WriteString("# TYPE ibm_codeengine_instance_cpu_usage_millicores gauge\n") + for _, m := range metrics { + labels := fmt.Sprintf("ibm_codeengine_instance_name=\"%s\",ibm_codeengine_component_type=\"%s\",ibm_codeengine_component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_cpu_usage_millicores{%s} %d\n", labels, m.Cpu.Current)) + } + sb.WriteString("\n") + + // Write container CPU limit metrics + sb.WriteString("# HELP ibm_codeengine_instance_cpu_limit_millicores Configured CPU limit in millicores\n") + sb.WriteString("# TYPE ibm_codeengine_instance_cpu_limit_millicores gauge\n") + for _, m := range metrics { + if m.Cpu.Configured > 0 { + labels := fmt.Sprintf("ibm_codeengine_instance_name=\"%s\",ibm_codeengine_component_type=\"%s\",ibm_codeengine_component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_cpu_limit_millicores{%s} %d\n", labels, m.Cpu.Configured)) + } + } + sb.WriteString("\n") + + // Write container memory usage 
metrics + sb.WriteString("# HELP ibm_codeengine_instance_memory_usage_bytes Current memory usage in bytes\n") + sb.WriteString("# TYPE ibm_codeengine_instance_memory_usage_bytes gauge\n") + for _, m := range metrics { + labels := fmt.Sprintf("ibm_codeengine_instance_name=\"%s\",ibm_codeengine_component_type=\"%s\",ibm_codeengine_component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + // Convert MB to bytes + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_memory_usage_bytes{%s} %d\n", labels, m.Memory.Current*1000*1000)) + } + sb.WriteString("\n") + + // Write container memory limit metrics + sb.WriteString("# HELP ibm_codeengine_instance_memory_limit_bytes Configured memory limit in bytes\n") + sb.WriteString("# TYPE ibm_codeengine_instance_memory_limit_bytes gauge\n") + for _, m := range metrics { + if m.Memory.Configured > 0 { + labels := fmt.Sprintf("ibm_codeengine_instance_name=\"%s\",ibm_codeengine_component_type=\"%s\",ibm_codeengine_component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + // Convert MB to bytes + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_memory_limit_bytes{%s} %d\n", labels, m.Memory.Configured*1000*1000)) + } + } + sb.WriteString("\n") + + // Write container ephemeral storage usage metrics (if available) + hasStorageMetrics := false + for _, m := range metrics { + if m.DiskUsage.Current > 0 { + hasStorageMetrics = true + break + } + } + + if hasStorageMetrics { + sb.WriteString("# HELP ibm_codeengine_instance_ephemeral_storage_usage_bytes Current ephemeral storage usage in bytes\n") + sb.WriteString("# TYPE ibm_codeengine_instance_ephemeral_storage_usage_bytes gauge\n") + for _, m := range metrics { + if m.DiskUsage.Current > 0 { + labels := fmt.Sprintf("ibm_codeengine_instance_name=\"%s\",ibm_codeengine_component_type=\"%s\",ibm_codeengine_component_name=\"%s\"", + 
escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + // Convert MB to bytes + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_ephemeral_storage_usage_bytes{%s} %d\n", labels, m.DiskUsage.Current*1000*1000)) + } + } + sb.WriteString("\n") + } + + if os.Getenv("METRICS_INTERNAL_STATS") == "true" { + // Write collector self-monitoring metrics + sb.WriteString("# HELP codeengine_collector_collection_duration_seconds Time taken to collect metrics in seconds\n") + sb.WriteString("# TYPE codeengine_collector_collection_duration_seconds gauge\n") + durationMs := collectorStats.lastCollectionDuration.Load() + sb.WriteString(fmt.Sprintf("codeengine_collector_collection_duration_seconds %.3f\n", float64(durationMs)/1000.0)) + sb.WriteString("\n") + + sb.WriteString("# HELP codeengine_collector_last_collection_timestamp_seconds Unix timestamp of last successful collection\n") + sb.WriteString("# TYPE codeengine_collector_last_collection_timestamp_seconds gauge\n") + lastCollectionTime := collectorStats.lastCollectionTime.Load() + sb.WriteString(fmt.Sprintf("codeengine_collector_last_collection_timestamp_seconds %d\n", lastCollectionTime)) + sb.WriteString("\n") + + sb.WriteString("# HELP codeengine_collector_collection_errors_total Total number of collection errors\n") + sb.WriteString("# TYPE codeengine_collector_collection_errors_total counter\n") + totalErrors := collectorStats.totalErrors.Load() + sb.WriteString(fmt.Sprintf("codeengine_collector_collection_errors_total %d\n", totalErrors)) + sb.WriteString("\n") + } + + return sb.String() +} +func main() { jobMode := os.Getenv("JOB_MODE") // In task mode, collect the resource metrics once if jobMode == "task" { - collectInstanceMetrics() + if err := collectInstanceMetrics(metricsCache); err != nil { + fmt.Printf("Error collecting metrics: %v\n", err) + os.Exit(1) + } return } @@ -42,11 +213,105 @@ func main() { } } - // In daemon mode, collect resource metrics in an 
endless loop - for { - collectInstanceMetrics() - time.Sleep(time.Duration(sleepDuration) * time.Second) + // Check if HTTP metrics server should be enabled + metricsEnabled := os.Getenv("METRICS_ENABLED") == "true" + + // Get metrics port configuration + metricsPort := "9100" + if port := os.Getenv("METRICS_PORT"); port != "" { + metricsPort = port + } + + // Create context for graceful shutdown + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Setup signal handling for graceful shutdown + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + // Start HTTP server only if METRICS_ENABLED=true + var server *http.Server + var serverErrors chan error + + if metricsEnabled { + server = &http.Server{ + Addr: ":" + metricsPort, + Handler: setupHTTPHandlers(), + } + + // Start HTTP server in a goroutine + serverErrors = make(chan error, 1) + go func() { + fmt.Printf("Starting HTTP metrics server on port %s\n", metricsPort) + if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + serverErrors <- fmt.Errorf("HTTP server error: %w", err) + } + }() + } else { + fmt.Println("HTTP metrics server disabled (METRICS_ENABLED not set to 'true')") + } + + // Start metrics collection loop in a goroutine + collectionDone := make(chan struct{}) + go func() { + defer close(collectionDone) + ticker := time.NewTicker(time.Duration(sleepDuration) * time.Second) + defer ticker.Stop() + + // Collect metrics immediately on startup + if err := collectInstanceMetrics(metricsCache); err != nil { + fmt.Printf("Error collecting metrics: %v\n", err) + collectorStats.totalErrors.Add(1) + } + + for { + select { + case <-ctx.Done(): + fmt.Println("Stopping metrics collection...") + return + case <-ticker.C: + if err := collectInstanceMetrics(metricsCache); err != nil { + fmt.Printf("Error collecting metrics: %v\n", err) + collectorStats.totalErrors.Add(1) + } + } + } + }() + + // Wait for shutdown 
signal or server error + if metricsEnabled { + select { + case sig := <-sigChan: + fmt.Printf("\nReceived signal %v, initiating graceful shutdown...\n", sig) + case err := <-serverErrors: + fmt.Printf("Server error: %v\n", err) + } + } else { + // If server is not running, just wait for signal + sig := <-sigChan + fmt.Printf("\nReceived signal %v, initiating graceful shutdown...\n", sig) + } + + // Cancel context to stop metrics collection + cancel() + + // Shutdown HTTP server with timeout (only if it was started) + if metricsEnabled && server != nil { + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer shutdownCancel() + + if err := server.Shutdown(shutdownCtx); err != nil { + fmt.Printf("HTTP server shutdown error: %v\n", err) + } else { + fmt.Println("HTTP server stopped gracefully") + } } + + // Wait for metrics collection to finish + <-collectionDone + fmt.Println("Metrics collection stopped") + fmt.Println("Shutdown complete") } type ComponentType int64 @@ -88,28 +353,129 @@ type InstanceResourceStats struct { Message string `json:"message"` } +// buildPodMap creates a map of pod names to pod objects for O(1) lookup +func buildPodMap(pods *[]v1.Pod) map[string]*v1.Pod { + podMap := make(map[string]*v1.Pod, len(*pods)) + for i := range *pods { + podMap[(*pods)[i].Name] = &(*pods)[i] + } + return podMap +} + +// extractComponentMetadata extracts component type, name, and parent from pod metric labels +func extractComponentMetadata(podMetric *v1beta1.PodMetrics) (componentType ComponentType, componentName, parent string) { + componentType = determineComponentType(podMetric) + + switch componentType { + case Job: + if jobName, ok := podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-definition-name"]; ok { + componentName = jobName + } else if jobRunName, ok := podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-run"]; ok { + componentName = jobRunName + } else { + componentName = "standalone" + } + 
parent = podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-run"] + case App: + componentName = podMetric.ObjectMeta.Labels["serving.knative.dev/service"] + parent = podMetric.ObjectMeta.Labels["serving.knative.dev/revision"] + case Build: + if buildName, ok := podMetric.ObjectMeta.Labels["build.shipwright.io/name"]; ok { + componentName = buildName + } else if buildRunName, ok := podMetric.ObjectMeta.Labels["buildrun.shipwright.io/name"]; ok { + componentName = buildRunName + } else { + componentName = "standalone" + } + parent = podMetric.ObjectMeta.Labels["buildrun.shipwright.io/name"] + default: + componentName = "unknown" + } + + return +} + +// processMetric processes a single pod metric and outputs the JSON log line +func processMetric( + podMetric *v1beta1.PodMetrics, + podMap map[string]*v1.Pod, + clientset *kubernetes.Clientset, + namespace string, + config *rest.Config, +) *InstanceResourceStats { + // Extract component metadata + componentType, componentName, parent := extractComponentMetadata(podMetric) + + // Determine the actual CPU and memory usage + cpuCurrent := podMetric.Containers[0].Usage.Cpu().ToDec().AsApproximateFloat64() * 1000 + memoryCurrent := podMetric.Containers[0].Usage.Memory().ToDec().AsApproximateFloat64() / 1000 / 1000 + + stats := InstanceResourceStats{ + Metric: "instance-resources", + Name: podMetric.Name, + Parent: parent, + ComponentType: componentType.String(), + ComponentName: componentName, + Cpu: ResourceStats{ + Current: int64(cpuCurrent), + }, + Memory: ResourceStats{ + Current: int64(memoryCurrent), + }, + } + + // Gather the configured resource limits and calculate the usage (in percent) + pod := podMap[podMetric.Name] + if pod != nil { + userContainerName := getUserContainerName(componentType, pod) + + // determine the actual disk usage + storageCurrent := obtainDiskUsage(clientset, namespace, podMetric.Name, userContainerName, config) + stats.DiskUsage.Current = int64(storageCurrent) + + // extract memory and 
cpu limits + cpu, memory := getCpuAndMemoryLimits(userContainerName, pod) + + cpuLimit := cpu.ToDec().AsApproximateFloat64() * 1000 + stats.Cpu.Configured = int64(cpuLimit) + stats.Cpu.Usage = int64((cpuCurrent / cpuLimit) * 100) + + memoryLimit := memory.ToDec().AsApproximateFloat64() / 1000 / 1000 + stats.Memory.Configured = int64(memoryLimit) + stats.Memory.Usage = int64(memoryCurrent / memoryLimit * 100) + } + + // Compose the log line message + stats.Message = "Captured metrics of " + stats.ComponentType + " instance '" + stats.Name + "': " + fmt.Sprintf("%d", stats.Cpu.Current) + "m vCPU, " + fmt.Sprintf("%d", stats.Memory.Current) + " MB memory, " + fmt.Sprintf("%d", stats.DiskUsage.Current) + " MB disk usage" + + // Write the stringified JSON struct and make use of IBM Cloud Logs built-in parsing mechanism, + // which allows to annotate log lines by providing a JSON object instead of a simple string + fmt.Println(ToJSONString(&stats)) + + return &stats +} + // Helper function that retrieves all pods and all pod metrics // this function creates a structured log line for each pod for which the kube metrics api provides a metric -func collectInstanceMetrics() { - +func collectInstanceMetrics(cache *MetricsCache) error { startTime := time.Now() fmt.Println("Start to capture pod metrics ...") config, err := rest.InClusterConfig() if err != nil { - panic(err.Error()) + return fmt.Errorf("failed to get cluster config: %w", err) } // obtain the kube namespace related to this Code Engine project nsBytes, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace") if err != nil { - panic(err.Error()) + return fmt.Errorf("failed to read namespace: %w", err) } namespace := string(nsBytes) coreClientset, err := kubernetes.NewForConfig(config) if err != nil { - panic(err.Error()) + return fmt.Errorf("failed to create clientset: %w", err) } // fetches all pods @@ -118,96 +484,53 @@ func collectInstanceMetrics() { // fetch all pod metrics podMetrics := 
getAllPodMetrics(namespace, config) + // Build pod map for O(1) lookup + podMap := buildPodMap(pods) + + // Collect metrics into a slice + var collectedMetrics []InstanceResourceStats + var metricsMu sync.Mutex + + // Use semaphore to limit concurrent goroutines + const maxConcurrency = 20 + sem := make(chan struct{}, maxConcurrency) var wg sync.WaitGroup for _, metric := range *podMetrics { wg.Add(1) + sem <- struct{}{} // Acquire semaphore go func(podMetric *v1beta1.PodMetrics) { defer wg.Done() + defer func() { <-sem }() // Release semaphore - // Determine the component type (either app, job, build or unknown) - componentType := determineComponentType(podMetric) - - // Determine the component name - var componentName string - var parent string - switch componentType { - case Job: - if val, ok := podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-definition-name"]; ok { - componentName = val - } else { - componentName = "standalone" - } - parent = podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-run"] - case App: - componentName = podMetric.ObjectMeta.Labels["serving.knative.dev/service"] - parent = podMetric.ObjectMeta.Labels["serving.knative.dev/revision"] - case Build: - if val, ok := podMetric.ObjectMeta.Labels["build.shipwright.io/name"]; ok { - componentName = val - } else { - componentName = "standalone" - } - - parent = podMetric.ObjectMeta.Labels["buildrun.shipwright.io/name"] - default: - componentName = "unknown" - } - - // Determine the actual CPU and memory usage - cpuCurrent := podMetric.Containers[0].Usage.Cpu().ToDec().AsApproximateFloat64() * 1000 - memoryCurrent := podMetric.Containers[0].Usage.Memory().ToDec().AsApproximateFloat64() / 1000 / 1000 - - stats := InstanceResourceStats{ - Metric: "instance-resources", - Name: podMetric.Name, - Parent: parent, - ComponentType: componentType.String(), - ComponentName: componentName, - Cpu: ResourceStats{ - Current: int64(cpuCurrent), - }, - Memory: ResourceStats{ - Current: 
int64(memoryCurrent), - }, - } - - // Gather the configured resource limits and calculate the usage (in percent) - pod := getPod(podMetric.Name, pods) - if pod != nil { - - userContainerName := getUserContainerName(componentType, pod) - - // determine the actual disk usage - storageCurrent := obtainDiskUsage(coreClientset, namespace, podMetric.Name, userContainerName, config) - stats.DiskUsage.Current = int64(storageCurrent) - - // extract memory and cpu limits - cpu, memory := getCpuAndMemoryLimits(userContainerName, pod) - - cpuLimit := cpu.ToDec().AsApproximateFloat64() * 1000 - stats.Cpu.Configured = int64(cpuLimit) - stats.Cpu.Usage = int64((cpuCurrent / cpuLimit) * 100) - - memoryLimit := memory.ToDec().AsApproximateFloat64() / 1000 / 1000 - stats.Memory.Configured = int64(memoryLimit) - stats.Memory.Usage = int64(memoryCurrent / memoryLimit * 100) + stats := processMetric(podMetric, podMap, coreClientset, namespace, config) + if stats != nil { + metricsMu.Lock() + collectedMetrics = append(collectedMetrics, *stats) + metricsMu.Unlock() } - - // Compose the log line message - stats.Message = "Captured metrics of " + stats.ComponentType + " instance '" + stats.Name + "': " + fmt.Sprintf("%d", stats.Cpu.Current) + "m vCPU, " + fmt.Sprintf("%d", stats.Memory.Current) + " MB memory, " + fmt.Sprintf("%d", stats.DiskUsage.Current) + " MB disk usage" - - // Write the stringified JSON struct and make use of IBM Cloud Logs built-in parsing mechanism, - // which allows to annotate log lines by providing a JSON object instead of a simple string - fmt.Println(ToJSONString(&stats)) - }(&metric) } wg.Wait() - fmt.Println("Captured pod metrics in " + strconv.FormatInt(time.Since(startTime).Milliseconds(), 10) + " ms") + duration := time.Since(startTime) + fmt.Println("Captured pod metrics in " + strconv.FormatInt(duration.Milliseconds(), 10) + " ms") + + // Update cache with collected metrics + cache.mu.Lock() + cache.metrics = collectedMetrics + cache.namespace = namespace 
+ cache.lastUpdate = time.Now() + cache.collectionCount++ + cache.mu.Unlock() + + // Update collector statistics + collectorStats.lastCollectionDuration.Store(duration.Milliseconds()) + collectorStats.lastCollectionTime.Store(time.Now().Unix()) + + return nil } // Helper function to determine the component type @@ -224,16 +547,6 @@ func determineComponentType(podMetric *v1beta1.PodMetrics) ComponentType { return Unknown } -// Helper function to obtain a pod by its name from a slice of pods -func getPod(name string, pods *[]v1.Pod) *v1.Pod { - for _, pod := range *pods { - if pod.Name == name { - return &pod - } - } - return nil -} - // Helper function to retrieve all pods from the Kube API func getAllPods(coreClientset *kubernetes.Clientset, namespace string, config *rest.Config) *[]v1.Pod { @@ -374,10 +687,17 @@ func getUserContainerName(componentType ComponentType, pod *v1.Pod) string { return "user-container" } - if componentType == Job || componentType == Build { + if componentType == Job { return pod.Spec.Containers[0].Name } + // builds are using two containers: + // a quite small 'step-source-default' + // and the 'step-build-and-push' which does the heavy lifting + if componentType == Build && len(pod.Spec.Containers) > 1 { + return pod.Spec.Containers[1].Name + } + // for kube-native deployments, we pick the first container return pod.Spec.Containers[0].Name } diff --git a/metrics-collector/prometheus.yml.template b/metrics-collector/prometheus.yml.template new file mode 100644 index 000000000..b2e515f8f --- /dev/null +++ b/metrics-collector/prometheus.yml.template @@ -0,0 +1,101 @@ +global: + scrape_interval: 30s + external_labels: + ibm_codeengine_project_name: '${CE_PROJECT_NAME}' + +scrape_configs: + - job_name: 'codeengine-metrics-project' + static_configs: + - targets: ['localhost:9100'] + relabel_configs: + # Add project name label + - source_labels: [job] + action: replace + regex: (.+) + replacement: '${CE_PROJECT_NAME}' + target_label: 
ibm_codeengine_project_name + + - job_name: 'codeengine-metrics-user' + fallback_scrape_protocol: PrometheusText0.0.4 + kubernetes_sd_configs: + - api_server: 'https://172.21.0.1' + role: pod + namespaces: + names: + - ${CE_SUBDOMAIN} + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + # only scrape when annotation codeengine.cloud.ibm.com/userMetricsScrape: 'true' is set + - source_labels: [__meta_kubernetes_pod_annotation_codeengine_cloud_ibm_com_userMetricsScrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_codeengine_cloud_ibm_com_userMetricsPath] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_codeengine_cloud_ibm_com_userMetricsPort] + action: replace + regex: (.+):(?:\d+);(\d+) + replacement: ${1}:${2} + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + # rename important meta data labels + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: ibm_codeengine_namespace + - action: replace + replacement: '${CE_PROJECT_NAME}' + target_label: ibm_codeengine_project_name + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: ibm_codeengine_instance_name + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_service] + action: replace + target_label: ibm_codeengine_component_name + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_configuration] + action: replace + regex: (.+) + replacement: app + target_label: ibm_codeengine_component_type + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_revision] + action: replace + target_label: ibm_codeengine_subcomponent_name + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_revisionUID] + action: replace + regex: (.+) + 
replacement: app_revision + target_label: ibm_codeengine_subcomponent_type + # drop codeengine, istio, and knative labels + - action: labeldrop + regex: "codeengine_cloud_ibm_com_(.+)" + - action: labeldrop + regex: "security_istio_io_(.+)" + - action: labeldrop + regex: "service_istio_io_(.+)" + - action: labeldrop + regex: "serving_knative_dev_(.+)" + # drop default prometheus labels + metric_relabel_configs: + - action: labeldrop + regex: "instance" + - action: labeldrop + regex: "job" + - action: labeldrop + regex: "pod_template_hash" + - action: labeldrop + regex: "app" +# +# Define IBM Cloud Monitoring as the remote write target +# +remote_write: +- url: https://${METRICS_REMOTE_WRITE_FQDN}/prometheus/remote/write + authorization: + credentials_file: "/etc/secrets/monitoring-apikey" + write_relabel_configs: + # Dropping scrape metrics (e.g. scrape_duration_seconds) + - source_labels: [__name__] + regex: 'scrape_duration_seconds|scrape_samples_scraped|scrape_series_added|scrape_samples_post_metric_relabeling' + action: drop \ No newline at end of file diff --git a/metrics-collector/setup/ibm-cloud-monitoring/README.md b/metrics-collector/setup/ibm-cloud-monitoring/README.md new file mode 100644 index 000000000..4a4619fc7 --- /dev/null +++ b/metrics-collector/setup/ibm-cloud-monitoring/README.md @@ -0,0 +1,243 @@ +# IBM Cloud Monitoring Dashboard Setup + +This directory contains tools and dashboards for IBM Cloud Monitoring (Sysdig) integration. + +## Files + +- **`import_dashboard.py`**: Python script to create or update Sysdig dashboards +- **`code-engine-component-resource-overview.json`**: Dashboard configuration for Code Engine resource monitoring + +## Prerequisites + +1. **Python 3.6+** installed on your system + +2. **IBM Cloud Account** with: + - An IBM Cloud Monitoring (Sysdig) instance + - An IBM Cloud IAM API key with access to the Monitoring instance + - The Monitoring instance ID (GUID) + +3. 
**Metrics Data**: The dashboard expects metrics from the Code Engine metrics collector to be available in your Sysdig instance + +### Getting Your IBM Cloud Credentials + +**IBM Cloud IAM API Key:** +1. Log in to [IBM Cloud Console](https://cloud.ibm.com) +2. Go to **Manage** > **Access (IAM)** > **API keys** +3. Click **Create an IBM Cloud API key** +4. Give it a name and description +5. Copy and save the API key securely + +**Monitoring Instance ID:** +1. Navigate to your IBM Cloud Monitoring instance +2. Click on **Overview** or **Settings** +3. Copy the **Instance ID** (GUID format: `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`) + +**Region:** +- Note the region where your Monitoring instance is deployed (e.g., `us-south`, `eu-de`) + +## Setup + +### Using a Virtual Environment (Recommended) + +It's recommended to use a Python virtual environment to isolate dependencies: + +```bash +# Navigate to the setup directory +cd setup/ibm-cloud-monitoring + +# Create a virtual environment +python3 -m venv venv + +# Activate the virtual environment +# On macOS/Linux: +source venv/bin/activate +# On Windows: +# venv\Scripts\activate + +# Install required dependencies +pip install requests + +# You should now see (venv) in your terminal prompt +``` + +When you're done, deactivate the virtual environment: +```bash +deactivate +``` + +### Global Installation (Alternative) + +If you prefer to install dependencies globally: + +```bash +pip install requests +# or +pip3 install requests +``` + +## Usage + +### Import or Update Dashboard + +```bash +python import_dashboard.py \ + --iam-api-key YOUR_IBM_CLOUD_IAM_API_KEY \ + --instance-id YOUR_MONITORING_INSTANCE_ID \ + --region us-south \ + --dashboard code-engine-component-resource-overview.json +``` + +### Using Environment Variables + +```bash +export IBM_CLOUD_IAM_API_KEY=YOUR_IBM_CLOUD_IAM_API_KEY +export SYSDIG_INSTANCE_ID=YOUR_MONITORING_INSTANCE_ID +export SYSDIG_REGION=us-south +python import_dashboard.py --dashboard 
code-engine-component-resource-overview.json
+```
+
+### Supported Regions
+
+- `au-syd` - Australia (Sydney)
+- `br-sao` - Brazil (São Paulo)
+- `ca-tor` - Canada (Toronto)
+- `eu-de` - EU Central (Frankfurt)
+- `eu-es` - EU Spain (Madrid)
+- `eu-gb` - EU GB (London)
+- `jp-osa` - Japan (Osaka)
+- `jp-tok` - Japan (Tokyo)
+- `us-east` - US East (Washington DC)
+- `us-south` - US South (Dallas)
+
+## Dashboard: Code Engine Component Resource Overview
+
+The `code-engine-component-resource-overview.json` dashboard provides comprehensive monitoring of Code Engine resources:
+
+### Panels
+
+1. **CPU Usage vs Limit (per Pod)** - Compares live CPU usage to configured limits
+2. **CPU Utilization % (per App)** - CPU percentage by component
+3. **Memory Usage vs Limit (per Pod)** - Compares memory usage to limits
+4. **Memory Utilization % (per App)** - Memory percentage by component
+5. **CPU Utilization % (per Namespace)** - Namespace-level CPU monitoring
+6. **Memory Utilization % (per Namespace)** - Namespace-level memory monitoring
+7. **CPU Utilization % (per Revision/Parent)** - Revision-level CPU tracking
+8. **Memory Utilization % (per Revision/Parent)** - Revision-level memory tracking
+9. **Top Pods by CPU** - Top 10 CPU consumers
+10. **Top Pods by Memory** - Top 10 memory consumers
+11. **Cluster CPU Utilization (%)** - Global CPU percentage
+12. **Cluster Memory Utilization (%)** - Global memory percentage
+
+### Required Metrics
+
+The dashboard uses the following Prometheus metrics:
+
+- `ibm_codeengine_instance_cpu_usage_millicores`
+- `ibm_codeengine_instance_cpu_limit_millicores`
+- `ibm_codeengine_instance_memory_usage_bytes`
+- `ibm_codeengine_instance_memory_limit_bytes`
+
+These metrics are exposed by the Code Engine metrics collector when running with `METRICS_ENABLED=true`.
+ +## Script Features + +The `import_dashboard.py` script: + +- ✅ Creates new dashboards if they don't exist +- ✅ Updates existing dashboards with the same name +- ✅ Validates API credentials and region +- ✅ Provides clear error messages +- ✅ Displays dashboard URL after creation/update + +## Troubleshooting + +### Authentication Errors + +If you get authentication errors, verify: +- Your IBM Cloud IAM API key is correct and not expired +- The IAM API key has permissions to access the Monitoring instance +- The Monitoring instance ID is correct +- You're using the correct region where the instance is deployed + +### Dashboard Not Showing Data + +If the dashboard shows no data: +- Verify the metrics collector is running with `METRICS_ENABLED=true` +- Check that metrics are being sent to IBM Cloud Monitoring +- Ensure the Prometheus remote write configuration is correct +- Wait a few minutes for data to appear (initial scrape interval) + +### Import Errors + +If the import fails: +- Check that the JSON file is valid +- Ensure you have network connectivity to IBM Cloud +- Verify the region endpoint is accessible + +## Example: Complete Setup with Virtual Environment + +```bash +# 1. Navigate to the setup directory +cd setup/ibm-cloud-monitoring + +# 2. Create and activate virtual environment +python3 -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# 3. Install dependencies +pip install requests + +# 4. Set environment variables +export IBM_CLOUD_IAM_API_KEY=your-iam-api-key-here +export SYSDIG_INSTANCE_ID=your-instance-id-here +export SYSDIG_REGION=us-south + +# 5. Import the dashboard +python import_dashboard.py --dashboard code-engine-component-resource-overview.json + +# Output: +# Loading dashboard configuration from 'code-engine-component-resource-overview.json'... +# Obtaining IBM Cloud IAM access token... +# ✓ IAM access token obtained successfully +# Checking if dashboard 'IBM Code Engine - Container Resource Overview' exists... 
+# Dashboard 'IBM Code Engine - Container Resource Overview' not found. Creating new dashboard... +# ✓ Dashboard 'IBM Code Engine - Container Resource Overview' created successfully (ID: 12345)! +# +# Dashboard URL: https://us-south.monitoring.cloud.ibm.com/#/dashboards/12345 +# +# ✓ Operation completed successfully! + +# 6. Deactivate virtual environment when done +deactivate +``` + +## Example: Quick Run (Without Virtual Environment) + +```bash +# 1. Install dependencies globally +pip3 install requests + +# 2. Run the script +cd setup/ibm-cloud-monitoring +python3 import_dashboard.py \ + --iam-api-key your-iam-api-key-here \ + --instance-id your-instance-id-here \ + --region us-south \ + --dashboard code-engine-component-resource-overview.json +``` + +## Customizing Dashboards + +To customize the dashboard: + +1. Edit `code-engine-component-resource-overview.json` +2. Modify panel queries, layouts, or add new panels +3. Run the import script to update the dashboard + +The script will detect the existing dashboard by name and update it with your changes. 
+ +## Additional Resources + +- [IBM Cloud Monitoring Documentation](https://cloud.ibm.com/docs/monitoring) +- [Sysdig Dashboard API](https://docs.sysdig.com/en/docs/developer-tools/sysdig-rest-api-conventions/) +- [PromQL Query Language](https://prometheus.io/docs/prometheus/latest/querying/basics/) \ No newline at end of file diff --git a/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json b/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json new file mode 100644 index 000000000..e7cfd2263 --- /dev/null +++ b/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json @@ -0,0 +1,1069 @@ +{ + "name": "IBM Cloud Code Engine - Component Resource Overview", + "panels": [ + { + "id": 1, + "type": "text", + "name": "Dashboard Overview", + "description": "", + "nullValueDisplayText": null, + "links": null, + "markdownSource": "Monitor vCPU, and Memory usage across Code Engine components.\n\n**Use the scope filters above to narrow by:**\n- `ibm_codeengine_component_type` (app, job, build)\n- `ibm_codeengine_component_name` (specific app/job/build name)", + "transparentBackground": false, + "panelTitleVisible": true, + "textAutosized": false + }, + { + "id": 5, + "type": "advancedTimechart", + "name": "vCPU Utilization % (by Component)", + "description": "vCPU usage as percentage of limit, grouped by ibm_codeengine_component_name", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum by (ibm_codeengine_component_name) (ibm_codeengine_instance_cpu_usage_millicores{$__scope}) / sum by (ibm_codeengine_component_name) (ibm_codeengine_instance_cpu_limit_millicores{$__scope})) * 100", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": "{{ibm_codeengine_component_name}}", + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 
1, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "%", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", + "maxInputFormat": "0-100", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "%", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", + "maxInputFormat": "0-100", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 80.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + "value": 0.0, + "inputFormat": "0-100", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 6, + "type": "advancedTimechart", + "name": "Memory Utilization % (by Component)", + "description": "Memory usage as percentage of limit, grouped by ibm_codeengine_component_name", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum by (ibm_codeengine_component_name) (ibm_codeengine_instance_memory_usage_bytes{$__scope}) / sum by (ibm_codeengine_component_name) (ibm_codeengine_instance_memory_limit_bytes{$__scope})) * 100", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": "{{ibm_codeengine_component_name}}", + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + 
"nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", + "maxInputFormat": "0-100", + "scale": "linear" + }, + "right": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 80.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + "value": 30.0, + "inputFormat": "0-100", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 8, + "type": "advancedNumber", + "name": "Total vCPU Utilization in %", + "description": "Overall vCPU utilization across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum(ibm_codeengine_instance_cpu_usage_millicores{$__scope}) / sum(ibm_codeengine_instance_cpu_limit_millicores{$__scope})) * 100", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + 
"useDefaults": null, + "values": [ + { + "severity": "medium", + "value": 90.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + "value": 30.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "none", + "value": 1.0, + "inputFormat": "0-100", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 9, + "type": "advancedNumber", + "name": "Total Memory Utilization in %", + "description": "Overall memory utilization across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum(ibm_codeengine_instance_memory_usage_bytes{$__scope}) / sum(ibm_codeengine_instance_memory_limit_bytes{$__scope})) * 100", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "medium", + "value": 80.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + "value": 30.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "none", + "value": 1.0, + "inputFormat": "0-100", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 11, + "type": "advancedNumber", + "name": "Total Available vCPUs", + "description": "Overall number of available vCPUs across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_cpu_limit_millicores{$__scope}) / 1000", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": 
"lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "info", + "value": 80.0, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 12, + "type": "advancedNumber", + "name": "Total Available Memory", + "description": "Overall amount of available memory across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_memory_limit_bytes{$__scope})", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "byte", + "inputFormat": "B", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 13, + "type": "advancedNumber", + "name": "Total Used vCPUs", + "description": "Actual number of vCPUs that are used across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_cpu_usage_millicores{$__scope}) / 1000", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + 
"delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "info", + "value": 80.0, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 14, + "type": "advancedNumber", + "name": "Total Used Memory", + "description": "Actual amount of memory used across all instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_memory_usage_bytes{$__scope})", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "byte", + "inputFormat": "B", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 2, + "type": "advancedTimechart", + "name": "vCPU Usage (per Instance)", + "description": "Current vCPU usage per instance", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "ibm_codeengine_instance_cpu_usage_millicores{$__scope}", + "enabled": true, + "displayInfo": { + "displayName": "CPU Usage", + "timeSeriesDisplayNameTemplate": "{{ibm_codeengine_component_name}}/{{ibm_codeengine_instance_name}} usage", + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, 
+ "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", + "maxInputFormat": "0-100", + "scale": "linear" + }, + "right": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 3, + "type": "advancedTimechart", + "name": "Memory Usage (per Instance)", + "description": "Current memory usage per instance", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "ibm_codeengine_instance_memory_usage_bytes{$__scope}", + "enabled": true, + "displayInfo": { + "displayName": "Memory Usage", + "timeSeriesDisplayNameTemplate": "{{ibm_codeengine_component_name}}/{{ibm_codeengine_instance_name}} usage", + "type": "lines" + }, + "format": { + "unit": "byte", + "inputFormat": "B", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "B", + "maxInputFormat": "B", + "scale": "linear" + }, + "right": { + "enabled": false, + 
"displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 15, + "type": "advancedNumber", + "name": "Components with Running Instances", + "description": "Number of Code Engine components that do have running instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "count(count(\n ibm_codeengine_instance_cpu_limit_millicores{$__scope}\n)by (ibm_codeengine_component_name))", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 16, + "type": "advancedNumber", + "name": "Running Instances", + "description": "Number of running Code Engine instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(count(\n ibm_codeengine_instance_cpu_limit_millicores{$__scope}\n)by (ibm_codeengine_component_name))", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + 
"useDefaults": null, + "values": [ + { + "severity": "info", + "value": 80.0, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + } + ], + "scopeExpressionList": [ + { + "operand": "ibm_codeengine_project_name", + "operator": "in", + "displayName": "", + "value": [], + "descriptor": { + "documentId": "ibm_codeengine_project_name", + "id": "ibm_codeengine_project_name", + "metricType": "tag", + "type": "string", + "scale": 0.0, + "name": "ibm_codeengine_project_name", + "description": "ibm_codeengine_project_name", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "scopes": [], + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "aggregationForGroup": "none", + "hidden": false, + "experimental": false, + "deferred": false, + "identity": false, + "canMonitor": false, + "canGroupBy": false, + "canFilter": true, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "publicId": "ibm_codeengine_project_name", + "heuristic": false, + "documentType": "metric", + "segment": false, + "documentTimestamp": 1772465428162 + }, + "variable": true, + "isVariable": true + }, + { + "operand": "ibm_codeengine_component_type", + "operator": "in", + "displayName": "", + "value": [], + "descriptor": { + "documentId": "ibm_codeengine_component_type", + "id": "ibm_codeengine_component_type", + "metricType": "tag", + "type": "string", + "scale": 0.0, + "name": 
"ibm_codeengine_component_type", + "description": "ibm_codeengine_component_type", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "scopes": [], + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "aggregationForGroup": "none", + "hidden": false, + "experimental": false, + "deferred": false, + "identity": false, + "canMonitor": false, + "canGroupBy": false, + "canFilter": true, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "publicId": "ibm_codeengine_component_type", + "heuristic": false, + "documentType": "metric", + "segment": false, + "documentTimestamp": 1772465428162 + }, + "variable": true, + "isVariable": true + }, + { + "operand": "ibm_codeengine_component_name", + "operator": "in", + "displayName": "", + "value": [], + "descriptor": { + "documentId": "ibm_codeengine_component_name", + "id": "ibm_codeengine_component_name", + "metricType": "tag", + "type": "string", + "scale": 0.0, + "name": "ibm_codeengine_component_name", + "description": "ibm_codeengine_component_name", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + 
"mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "scopes": [], + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "aggregationForGroup": "none", + "hidden": false, + "experimental": false, + "deferred": false, + "identity": false, + "canMonitor": false, + "canGroupBy": false, + "canFilter": true, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "publicId": "ibm_codeengine_component_name", + "heuristic": false, + "documentType": "metric", + "segment": false, + "documentTimestamp": 1772465428162 + }, + "variable": true, + "isVariable": true + } + ], + "eventDisplaySettings": { + "enabled": true, + "queryParams": { + "severities": [], + "alertStatuses": [], + "categories": [], + "filter": "", + "teamScope": false + } + }, + "shared": true, + "public": false, + "description": "Overview of Code Engine instance resource usage: CPU/memory current vs limits, with filtering by ibm_codeengine_component_type and ibm_codeengine_component_name", + "layout": [ + { + "panelId": 1, + "x": 0, + "y": 0, + "w": 7, + "h": 4 + }, + { + "panelId": 5, + "x": 0, + "y": 4, + "w": 12, + "h": 6 + }, + { + "panelId": 6, + "x": 0, + "y": 10, + "w": 12, + "h": 5 + }, + { + "panelId": 8, + "x": 21, + "y": 0, + "w": 3, + "h": 2 + }, + { + "panelId": 9, + "x": 21, + "y": 2, + "w": 3, + "h": 2 + }, + { + "panelId": 11, + "x": 14, + "y": 0, + "w": 4, + "h": 2 + }, + { + "panelId": 12, + "x": 14, + "y": 2, + "w": 4, + "h": 2 + }, + { + "panelId": 13, + "x": 18, + "y": 0, + "w": 3, + "h": 2 + }, + { + "panelId": 14, + "x": 18, + "y": 2, + "w": 3, + "h": 2 + }, + { + "panelId": 2, + "x": 12, + "y": 4, + "w": 12, + "h": 6 + }, + { + "panelId": 3, + "x": 12, + "y": 10, + "w": 12, + "h": 5 + }, + { + "panelId": 15, + "x": 7, + "y": 0, + "w": 4, + "h": 4 + }, + { + "panelId": 16, + "x": 11, + "y": 0, + "w": 3, + "h": 4 + } + ], + "schema": 3 +} \ No newline at 
#!/usr/bin/env python3
"""
IBM Cloud Monitoring Dashboard Export Script

Exports Sysdig dashboards from IBM Cloud Monitoring using IBM Cloud IAM
authentication: an IBM Cloud IAM API key is exchanged for an access token,
which is then used to retrieve the dashboard via the Sysdig REST API.

Usage:
    python export_dashboard.py --iam-api-key <key> --instance-id <guid> \
        --region <region> --name <dashboard-name>

Environment Variables:
    IBM_CLOUD_IAM_API_KEY: IBM Cloud IAM API key (alternative to --iam-api-key)
    SYSDIG_INSTANCE_ID: IBM Cloud Monitoring instance ID (alternative to --instance-id)
    SYSDIG_REGION: IBM Cloud Monitoring region (alternative to --region)

Example:
    python export_dashboard.py \\
        --iam-api-key YOUR_IAM_API_KEY \\
        --instance-id YOUR_INSTANCE_ID \\
        --region us-south \\
        --name "IBM Cloud Code Engine - Component Resource Overview"
"""

import argparse
import json
import os
import sys
from datetime import datetime, timezone
from typing import Dict, Optional

try:
    import requests
except ImportError:
    print("Error: 'requests' module not found. Install it with: pip install requests")
    sys.exit(1)


class IBMCloudIAMAuth:
    """Handles IBM Cloud IAM authentication."""

    IAM_TOKEN_URL = "https://iam.cloud.ibm.com/identity/token"

    def __init__(self, iam_api_key: str):
        """
        Initialize IBM Cloud IAM authentication.

        Args:
            iam_api_key: IBM Cloud IAM API key
        """
        self.iam_api_key = iam_api_key
        self._access_token: Optional[str] = None
        self._token_expiry = 0

    def get_access_token(self) -> str:
        """
        Get an IBM Cloud IAM access token, reusing a previously fetched one.

        IAM tokens are valid for about an hour, which comfortably covers a
        single script run, so the token obtained on the first call is cached
        and reused instead of requesting a fresh token for every API call.

        Returns:
            IAM access token

        Raises:
            requests.exceptions.RequestException: if the token request fails
            ValueError: if the IAM response contains no access token
        """
        # Reuse the cached token; without this, every _get_headers() call
        # (list + get = two calls per export) performed a full token exchange.
        if self._access_token:
            return self._access_token

        print("Obtaining IBM Cloud IAM access token...")

        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept": "application/json"
        }

        data = {
            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
            "apikey": self.iam_api_key
        }

        try:
            response = requests.post(
                self.IAM_TOKEN_URL,
                headers=headers,
                data=data,
                timeout=30
            )
            response.raise_for_status()
            token_data = response.json()
            self._access_token = token_data.get("access_token")

            if not self._access_token:
                raise ValueError("No access token in IAM response")

            print("✓ IAM access token obtained successfully")
            return self._access_token

        except requests.exceptions.RequestException as e:
            print(f"Error obtaining IAM token: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f"Response: {e.response.text}")
            raise


class SysdigDashboardManager:
    """Manages Sysdig dashboard operations via REST API with IBM Cloud IAM authentication."""

    # IBM Cloud Monitoring regional endpoints
    REGION_ENDPOINTS = {
        "us-south": "https://us-south.monitoring.cloud.ibm.com",
        "us-east": "https://us-east.monitoring.cloud.ibm.com",
        "eu-de": "https://eu-de.monitoring.cloud.ibm.com",
        "eu-es": "https://eu-es.monitoring.cloud.ibm.com",
        "eu-gb": "https://eu-gb.monitoring.cloud.ibm.com",
        "jp-tok": "https://jp-tok.monitoring.cloud.ibm.com",
        "au-syd": "https://au-syd.monitoring.cloud.ibm.com",
        "jp-osa": "https://jp-osa.monitoring.cloud.ibm.com",
        "ca-tor": "https://ca-tor.monitoring.cloud.ibm.com",
        "br-sao": "https://br-sao.monitoring.cloud.ibm.com",
    }

    def __init__(self, iam_auth: IBMCloudIAMAuth, instance_id: str, region: str):
        """
        Initialize the Sysdig Dashboard Manager.

        Args:
            iam_auth: IBM Cloud IAM authentication handler
            instance_id: IBM Cloud Monitoring instance ID (GUID)
            region: IBM Cloud region (e.g., 'us-south', 'eu-de')

        Raises:
            ValueError: if the region is not a known IBM Cloud Monitoring region
        """
        if region not in self.REGION_ENDPOINTS:
            raise ValueError(
                f"Invalid region '{region}'. Valid regions: {', '.join(self.REGION_ENDPOINTS.keys())}"
            )

        self.iam_auth = iam_auth
        self.instance_id = instance_id
        self.region = region
        self.base_url = self.REGION_ENDPOINTS[region]

    def _get_headers(self) -> Dict[str, str]:
        """
        Get HTTP headers with IAM authentication.

        The IBMInstanceID header routes the request to the correct Monitoring
        instance behind the regional endpoint.

        Returns:
            Dictionary of HTTP headers
        """
        access_token = self.iam_auth.get_access_token()
        return {
            "Authorization": f"Bearer {access_token}",
            "IBMInstanceID": self.instance_id,
            "Content-Type": "application/json",
        }

    def list_dashboards(self) -> list:
        """
        List all dashboards in the Sysdig instance.

        Best-effort: returns an empty list (after printing the error) if the
        request fails, so callers can treat "error" like "nothing found".

        Returns:
            List of dashboard summary objects
        """
        url = f"{self.base_url}/api/v3/dashboards"

        try:
            response = requests.get(url, headers=self._get_headers(), timeout=30)
            response.raise_for_status()
            data = response.json()
            return data.get("dashboards", [])
        except requests.exceptions.RequestException as e:
            print(f"Error listing dashboards: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f"Response: {e.response.text}")
            return []

    def find_dashboard_by_name(self, name: str) -> Optional[Dict]:
        """
        Find a dashboard by its (exact) name.

        Args:
            name: Dashboard name to search for

        Returns:
            Dashboard summary object if found, None otherwise
        """
        dashboards = self.list_dashboards()
        for dashboard in dashboards:
            # Single quotes inside the f-string: nested double quotes here are
            # a SyntaxError on every Python version before 3.12.
            print(f"dashboard: {dashboard.get('name')}, id: '{dashboard.get('id')}'")
            if dashboard.get("name") == name:
                return dashboard
        return None

    def get_dashboard(self, dashboard_id: int) -> Dict:
        """
        Get a dashboard by its ID.

        Args:
            dashboard_id: ID of the dashboard to retrieve

        Returns:
            Full dashboard response object (includes the 'dashboard' wrapper)

        Raises:
            requests.exceptions.RequestException: if the request fails
        """
        url = f"{self.base_url}/api/v3/dashboards/{dashboard_id}"

        try:
            response = requests.get(url, headers=self._get_headers(), timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error retrieving dashboard: {e}")
            if hasattr(e, 'response') and e.response is not None:
                print(f"Response: {e.response.text}")
            raise

    def export_dashboard_by_name(self, name: str, output_dir: str = ".") -> str:
        """
        Export a dashboard by its name to a JSON file.

        Args:
            name: Dashboard name to export
            output_dir: Directory to save the exported file (default: current directory)

        Returns:
            Path to the exported file

        Raises:
            ValueError: if no dashboard with that name exists or it has no ID
        """
        print(f"Searching for dashboard '{name}'...")
        dashboard_summary = self.find_dashboard_by_name(name)

        if not dashboard_summary:
            raise ValueError(f"Dashboard '{name}' not found")

        dashboard_id = dashboard_summary.get("id")
        if dashboard_id is None:
            raise ValueError(f"Dashboard '{name}' found but has no ID")

        print(f"✓ Dashboard found (ID: {dashboard_id})")
        print("Retrieving full dashboard configuration...")

        # Get the full dashboard configuration and unwrap the 'dashboard' key
        # so the saved file can be fed straight back to the import script.
        dashboard_data = self.get_dashboard(dashboard_id)
        dashboard_config = dashboard_data.get("dashboard", {})

        # Build a filesystem-safe filename: keep alphanumerics/-/_,
        # replace everything else (and spaces) with underscores.
        # datetime.utcnow() is deprecated since Python 3.12; use an aware UTC time.
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        safe_name = "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in name)
        safe_name = safe_name.replace(' ', '_').lower()
        filename = f"{safe_name}_{timestamp}.json"
        filepath = os.path.join(output_dir, filename)

        # Save to file; explicit UTF-8 so non-ASCII dashboard content does not
        # depend on the platform's default encoding.
        print(f"Saving dashboard to '{filepath}'...")
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(dashboard_config, f, indent=2)

        print("✓ Dashboard exported successfully!")
        return filepath


def main():
    """Main entry point for the script."""
    parser = argparse.ArgumentParser(
        description="Export IBM Cloud Monitoring (Sysdig) dashboards using IBM Cloud IAM authentication",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Using command-line arguments
  python export_dashboard.py \\
    --iam-api-key YOUR_IAM_KEY \\
    --instance-id YOUR_INSTANCE_ID \\
    --region us-south \\
    --name "IBM Code Engine - Container Resource Overview"

  # Using environment variables
  export IBM_CLOUD_IAM_API_KEY=YOUR_IAM_KEY
  export SYSDIG_INSTANCE_ID=YOUR_INSTANCE_ID
  export SYSDIG_REGION=us-south
  python export_dashboard.py --name "My Dashboard"

  # Export to specific directory
  python export_dashboard.py \\
    --name "My Dashboard" \\
    --output-dir ./exports

Supported Regions:
  us-south, us-east, eu-de, eu-es, eu-gb, jp-tok, au-syd, jp-osa, ca-tor, br-sao

How to get your Instance ID:
  1. Go to IBM Cloud Console
  2. Navigate to your Monitoring instance
  3. Click on "Overview" or "Settings"
  4. Copy the Instance ID (GUID format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx)
        """
    )

    parser.add_argument(
        "--iam-api-key",
        help="IBM Cloud IAM API key (or set IBM_CLOUD_IAM_API_KEY env var)",
        default=os.environ.get("IBM_CLOUD_IAM_API_KEY")
    )

    parser.add_argument(
        "--instance-id",
        help="IBM Cloud Monitoring instance ID/GUID (or set SYSDIG_INSTANCE_ID env var)",
        default=os.environ.get("SYSDIG_INSTANCE_ID")
    )

    parser.add_argument(
        "--region",
        help="IBM Cloud region (or set SYSDIG_REGION env var)",
        default=os.environ.get("SYSDIG_REGION")
    )

    parser.add_argument(
        "--name",
        required=True,
        help="Name of the dashboard to export"
    )

    parser.add_argument(
        "--output-dir",
        default=".",
        help="Directory to save the exported dashboard (default: current directory)"
    )

    parser.add_argument(
        "--list",
        action="store_true",
        help="List all available dashboards and exit"
    )

    args = parser.parse_args()

    # Validate required arguments (argparse defaults come from env vars, so
    # they cannot be marked required=True).
    if not args.iam_api_key:
        print("Error: IAM API key is required. Provide via --iam-api-key or IBM_CLOUD_IAM_API_KEY environment variable")
        sys.exit(1)

    if not args.instance_id:
        print("Error: Instance ID is required. Provide via --instance-id or SYSDIG_INSTANCE_ID environment variable")
        sys.exit(1)

    if not args.region:
        print("Error: Region is required. Provide via --region or SYSDIG_REGION environment variable")
        sys.exit(1)

    # Initialize IAM authentication and dashboard manager
    try:
        iam_auth = IBMCloudIAMAuth(args.iam_api_key)
        manager = SysdigDashboardManager(iam_auth, args.instance_id, args.region)

        # List dashboards if requested
        if args.list:
            print("Listing all dashboards...")
            dashboards = manager.list_dashboards()
            if not dashboards:
                print("No dashboards found")
            else:
                print(f"\nFound {len(dashboards)} dashboard(s):\n")
                for i, dashboard in enumerate(dashboards, 1):
                    name = dashboard.get("name", "Unnamed")
                    dashboard_id = dashboard.get("id", "N/A")
                    print(f"{i}. {name} (ID: {dashboard_id})")
            sys.exit(0)

        # Create output directory if it doesn't exist
        if args.output_dir != "." and not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
            print(f"Created output directory: {args.output_dir}")

        # Export the dashboard
        filepath = manager.export_dashboard_by_name(args.name, args.output_dir)

        print("\n✓ Export completed successfully!")
        print(f"  File: {filepath}")

    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
+ +Usage: + python import_dashboard.py --iam-api-key --instance-id --region --dashboard + +Environment Variables: + IBM_CLOUD_IAM_API_KEY: IBM Cloud IAM API key (alternative to --iam-api-key) + SYSDIG_INSTANCE_ID: IBM Cloud Monitoring instance ID (alternative to --instance-id) + SYSDIG_REGION: IBM Cloud Monitoring region (alternative to --region) + +Example: + python import_dashboard.py \\ + --iam-api-key YOUR_IAM_API_KEY \\ + --instance-id YOUR_INSTANCE_ID \\ + --region us-south \\ + --dashboard code-engine-component-resource-overview.json +""" + +import argparse +import json +import os +import sys +from typing import Dict, Optional + +try: + import requests +except ImportError: + print("Error: 'requests' module not found. Install it with: pip install requests") + sys.exit(1) + + +class IBMCloudIAMAuth: + """Handles IBM Cloud IAM authentication.""" + + IAM_TOKEN_URL = "https://iam.cloud.ibm.com/identity/token" + + def __init__(self, iam_api_key: str): + """ + Initialize IBM Cloud IAM authentication. + + Args: + iam_api_key: IBM Cloud IAM API key + """ + self.iam_api_key = iam_api_key + self._access_token = None + self._token_expiry = 0 + + def get_access_token(self) -> str: + """ + Get an IBM Cloud IAM access token. 
+ + Returns: + IAM access token + """ + print("Obtaining IBM Cloud IAM access token...") + + headers = { + "Content-Type": "application/x-www-form-urlencoded", + "Accept": "application/json" + } + + data = { + "grant_type": "urn:ibm:params:oauth:grant-type:apikey", + "apikey": self.iam_api_key + } + + try: + response = requests.post( + self.IAM_TOKEN_URL, + headers=headers, + data=data, + timeout=30 + ) + response.raise_for_status() + token_data = response.json() + self._access_token = token_data.get("access_token") + + if not self._access_token: + raise ValueError("No access token in IAM response") + + print("✓ IAM access token obtained successfully") + return self._access_token + + except requests.exceptions.RequestException as e: + print(f"Error obtaining IAM token: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + raise + + +class SysdigDashboardManager: + """Manages Sysdig dashboard creation and updates via REST API with IBM Cloud IAM authentication.""" + + # IBM Cloud Monitoring regional endpoints + REGION_ENDPOINTS = { + "us-south": "https://us-south.monitoring.cloud.ibm.com", + "us-east": "https://us-east.monitoring.cloud.ibm.com", + "eu-de": "https://eu-de.monitoring.cloud.ibm.com", + "eu-es": "https://eu-es.monitoring.cloud.ibm.com", + "eu-gb": "https://eu-gb.monitoring.cloud.ibm.com", + "jp-tok": "https://jp-tok.monitoring.cloud.ibm.com", + "au-syd": "https://au-syd.monitoring.cloud.ibm.com", + "jp-osa": "https://jp-osa.monitoring.cloud.ibm.com", + "ca-tor": "https://ca-tor.monitoring.cloud.ibm.com", + "br-sao": "https://br-sao.monitoring.cloud.ibm.com", + } + + def __init__(self, iam_auth: IBMCloudIAMAuth, instance_id: str, region: str): + """ + Initialize the Sysdig Dashboard Manager. 
+ + Args: + iam_auth: IBM Cloud IAM authentication handler + instance_id: IBM Cloud Monitoring instance ID (GUID) + region: IBM Cloud region (e.g., 'us-south', 'eu-de') + """ + if region not in self.REGION_ENDPOINTS: + raise ValueError( + f"Invalid region '{region}'. Valid regions: {', '.join(self.REGION_ENDPOINTS.keys())}" + ) + + self.iam_auth = iam_auth + self.instance_id = instance_id + self.region = region + self.base_url = self.REGION_ENDPOINTS[region] + + def _get_headers(self) -> Dict[str, str]: + """ + Get HTTP headers with IAM authentication. + + Returns: + Dictionary of HTTP headers + """ + access_token = self.iam_auth.get_access_token() + return { + "Authorization": f"Bearer {access_token}", + "IBMInstanceID": self.instance_id, + "Content-Type": "application/json", + } + + def list_dashboards(self) -> list: + """ + List all dashboards in the Sysdig instance. + + Returns: + List of dashboard objects + """ + url = f"{self.base_url}/api/v3/dashboards" + + try: + response = requests.get(url, headers=self._get_headers(), timeout=30) + response.raise_for_status() + data = response.json() + return data.get("dashboards", []) + except requests.exceptions.RequestException as e: + print(f"Error listing dashboards: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + return [] + + def find_dashboard_by_name(self, name: str) -> Optional[Dict]: + """ + Find a dashboard by its name. + + Args: + name: Dashboard name to search for + + Returns: + Dashboard object if found, None otherwise + """ + dashboards = self.list_dashboards() + for dashboard in dashboards: + if dashboard.get("name") == name: + return dashboard + return None + + def create_dashboard(self, dashboard_config: Dict) -> Dict: + """ + Create a new dashboard. 
+ + Args: + dashboard_config: Dashboard configuration dictionary + + Returns: + Created dashboard object + """ + url = f"{self.base_url}/api/v3/dashboards" + + try: + response = requests.post( + url, + headers=self._get_headers(), + json={"dashboard": dashboard_config}, + timeout=30 + ) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + print(f"Error creating dashboard: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + raise + + def update_dashboard(self, dashboard_id: int, dashboard_config: Dict) -> Dict: + """ + Update an existing dashboard. + + Args: + dashboard_id: ID of the dashboard to update + dashboard_config: New dashboard configuration + + Returns: + Updated dashboard object + """ + url = f"{self.base_url}/api/v3/dashboards/{dashboard_id}" + + try: + response = requests.put( + url, + headers=self._get_headers(), + json={"dashboard": dashboard_config}, + timeout=30 + ) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + print(f"Error updating dashboard: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + raise + + def import_or_update_dashboard(self, dashboard_config: Dict) -> Dict: + """ + Import a dashboard or update it if it already exists. 
+ + Args: + dashboard_config: Dashboard configuration dictionary + + Returns: + Dashboard object (created or updated) + """ + dashboard_name = dashboard_config.get("name") + if not dashboard_name: + raise ValueError("Dashboard configuration must include a 'name' field") + + print(f"Checking if dashboard '{dashboard_name}' exists...") + existing_dashboard = self.find_dashboard_by_name(dashboard_name) + + if existing_dashboard: + dashboard_id = existing_dashboard.get("id") + if dashboard_id is None: + raise ValueError(f"Dashboard '{dashboard_name}' found but has no ID") + print(f"Dashboard '{dashboard_name}' found (ID: {dashboard_id}). Updating...") + result = self.update_dashboard(dashboard_id, dashboard_config) + print(f"✓ Dashboard '{dashboard_name}' updated successfully!") + return result + else: + print(f"Dashboard '{dashboard_name}' not found. Creating new dashboard...") + result = self.create_dashboard(dashboard_config) + dashboard_id = result.get("dashboard", {}).get("id") + print(f"✓ Dashboard '{dashboard_name}' created successfully (ID: {dashboard_id})!") + return result + + +def load_dashboard_config(file_path: str) -> Dict: + """ + Load dashboard configuration from a JSON file. 
+ + Args: + file_path: Path to the JSON file + + Returns: + Dashboard configuration dictionary + """ + try: + with open(file_path, 'r') as f: + return json.load(f) + except FileNotFoundError: + print(f"Error: Dashboard file '{file_path}' not found") + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in dashboard file: {e}") + sys.exit(1) + + +def main(): + """Main entry point for the script.""" + parser = argparse.ArgumentParser( + description="Import or update IBM Cloud Monitoring (Sysdig) dashboards using IBM Cloud IAM authentication", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Using command-line arguments + python import_dashboard.py \\ + --iam-api-key YOUR_IAM_KEY \\ + --instance-id YOUR_INSTANCE_ID \\ + --region us-south \\ + --dashboard code-engine-component-resource-overview.json + + # Using environment variables + export IBM_CLOUD_IAM_API_KEY=YOUR_IAM_KEY + export SYSDIG_INSTANCE_ID=YOUR_INSTANCE_ID + export SYSDIG_REGION=us-south + python import_dashboard.py --dashboard code-engine-component-resource-overview.json + +Supported Regions: + us-south, us-east, eu-de, eu-gb, jp-tok, au-syd, jp-osa, ca-tor, br-sao + +How to get your Instance ID: + 1. Go to IBM Cloud Console + 2. Navigate to your Monitoring instance + 3. Click on "Overview" or "Settings" + 4. 
Copy the Instance ID (GUID format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx) + """ + ) + + parser.add_argument( + "--iam-api-key", + help="IBM Cloud IAM API key (or set IBM_CLOUD_IAM_API_KEY env var)", + default=os.environ.get("IBM_CLOUD_IAM_API_KEY") + ) + + parser.add_argument( + "--instance-id", + help="IBM Cloud Monitoring instance ID/GUID (or set SYSDIG_INSTANCE_ID env var)", + default=os.environ.get("SYSDIG_INSTANCE_ID") + ) + + parser.add_argument( + "--region", + help="IBM Cloud region (or set SYSDIG_REGION env var)", + default=os.environ.get("SYSDIG_REGION") + ) + + parser.add_argument( + "--dashboard", + required=True, + help="Path to dashboard JSON file" + ) + + args = parser.parse_args() + + # Validate required arguments + if not args.iam_api_key: + print("Error: IAM API key is required. Provide via --iam-api-key or IBM_CLOUD_IAM_API_KEY environment variable") + sys.exit(1) + + if not args.instance_id: + print("Error: Instance ID is required. Provide via --instance-id or SYSDIG_INSTANCE_ID environment variable") + sys.exit(1) + + if not args.region: + print("Error: Region is required. 
Provide via --region or SYSDIG_REGION environment variable") + sys.exit(1) + + # Load dashboard configuration + print(f"Loading dashboard configuration from '{args.dashboard}'...") + dashboard_config = load_dashboard_config(args.dashboard) + + # Initialize IAM authentication and dashboard manager + try: + iam_auth = IBMCloudIAMAuth(args.iam_api_key) + manager = SysdigDashboardManager(iam_auth, args.instance_id, args.region) + result = manager.import_or_update_dashboard(dashboard_config) + + # Print dashboard URL + dashboard_id = result.get("dashboard", {}).get("id") + if dashboard_id: + dashboard_url = f"{manager.base_url}/#/dashboards/{dashboard_id}" + print(f"\nDashboard URL: {dashboard_url}") + + print("\n✓ Operation completed successfully!") + + except ValueError as e: + print(f"Error: {e}") + sys.exit(1) + except Exception as e: + print(f"Unexpected error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/metrics-collector/start.sh b/metrics-collector/start.sh new file mode 100644 index 000000000..de2ded51b --- /dev/null +++ b/metrics-collector/start.sh @@ -0,0 +1,98 @@ +#!/bin/sh +set -e + +echo "Starting Code Engine Metrics Collector..." + +# Check if METRICS_ENABLED is set to true +if [ "$METRICS_ENABLED" = "true" ]; then + echo "Prometheus metrics export enabled" + + # Check if monitoring API key secret is mounted + if [ ! 
-f "/etc/secrets/monitoring-apikey" ]; then + echo "ERROR: Prometheus agent requires /etc/secrets/monitoring-apikey to be mounted" + echo "Please create a secret with your IBM Cloud Monitoring API key and mount it at /etc/secrets/monitoring-apikey" + echo "Example:" + echo " ibmcloud ce secret create --name monitoring-apikey --from-literal monitoring-apikey=YOUR_API_KEY" + echo " ibmcloud ce job update --name metrics-collector --mount-secret /etc/secrets=monitoring-apikey" + exit 1 + fi + + # Check required environment variables + if [ -z "$CE_SUBDOMAIN" ]; then + echo "ERROR: CE_SUBDOMAIN environment variable is required when METRICS_ENABLED=true" + exit 1 + fi + + if [ -z "$METRICS_REMOTE_WRITE_FQDN" ]; then + echo "ERROR: METRICS_REMOTE_WRITE_FQDN environment variable is required when METRICS_ENABLED=true" + exit 1 + fi + + if [ -z "$CE_PROJECT_NAME" ]; then + CE_PROJECT_NAME="default" + fi + + # Generate prometheus.yml from template with environment variable substitution + echo "Generating Prometheus configuration..." + sed -e "s/\${CE_SUBDOMAIN}/$CE_SUBDOMAIN/g" \ + -e "s/\${CE_PROJECT_NAME}/$CE_PROJECT_NAME/g" \ + -e "s/\${METRICS_REMOTE_WRITE_FQDN}/$METRICS_REMOTE_WRITE_FQDN/g" \ + /etc/prometheus/prometheus.yml.template > /tmp/prometheus.yml + + echo "Starting Prometheus agent..." + /bin/prometheus --config.file=/tmp/prometheus.yml --agent --storage.agent.path=/tmp/agent-data --log.level info --log.format json 2>&1 & + PROMETHEUS_PID=$! + echo "Prometheus agent started with PID $PROMETHEUS_PID" + + # Give Prometheus a moment to start and check if it's actually running + sleep 2 + if ! kill -0 "$PROMETHEUS_PID" 2>/dev/null; then + echo "ERROR: Prometheus agent failed to start" + exit 1 + fi +else + echo "Prometheus metrics export disabled (METRICS_ENABLED not set to 'true')" +fi + +# Start the metrics collector +echo "Starting metrics collector..." +/app & +APP_PID=$! 
+echo "Metrics collector started with PID $APP_PID" + +# Function to handle shutdown +shutdown() { + echo "Shutting down..." + if [ -n "$APP_PID" ]; then + kill -TERM "$APP_PID" 2>/dev/null || true + fi + if [ -n "$PROMETHEUS_PID" ]; then + kill -TERM "$PROMETHEUS_PID" 2>/dev/null || true + fi + wait + exit 0 +} + +# Trap signals +trap shutdown SIGTERM SIGINT + +# Monitor processes +while true; do + # Check if app is still running + if ! kill -0 "$APP_PID" 2>/dev/null; then + echo "ERROR: Metrics collector process died unexpectedly" + if [ -n "$APP_PID" ]; then + kill -TERM "$APP_PID" 2>/dev/null || true + fi + exit 1 + fi + + # Check if Prometheus is still running (only if it was started) + if [ "$METRICS_ENABLED" = "true" ] && ! kill -0 "$PROMETHEUS_PID" 2>/dev/null; then + echo "ERROR: Prometheus agent process died unexpectedly" + kill -TERM "$PROMETHEUS_PID" 2>/dev/null || true + exit 1 + fi + + sleep 5 +done diff --git a/network-test-app/Dockerfile b/network-test-app/Dockerfile index de42fb623..f7224cc66 100644 --- a/network-test-app/Dockerfile +++ b/network-test-app/Dockerfile @@ -1,10 +1,19 @@ -FROM icr.io/codeengine/node:22-alpine -RUN apk -U upgrade +# Download dependencies in builder stage +FROM registry.access.redhat.com/ubi9/nodejs-24:latest AS builder -WORKDIR /network-test-app +COPY --chown=${CNB_USER_ID}:${CNB_GROUP_ID} package.json /app/ +WORKDIR /app +RUN npm i --omit=dev -COPY *.js *.json /network-test-app/ -RUN npm install +# Use a small distroless image for as runtime image +FROM gcr.io/distroless/nodejs24 -ENTRYPOINT [ "node", "app.js" ] \ No newline at end of file +COPY --chown=1001:0 --from=builder /app/node_modules /app/node_modules +COPY --chown=1001:0 . 
/app/ + +USER 1001:0 +WORKDIR /app +EXPOSE 8080 + +CMD ["app.mjs"] diff --git a/network-test-app/README.md b/network-test-app/README.md index d1a421394..afc7a95c5 100644 --- a/network-test-app/README.md +++ b/network-test-app/README.md @@ -1,20 +1,212 @@ # Network Connectivity Test App -This sample is intended to help users debug connectivity issues for IBM Cloud Services. You can use this app to help isolate network connection issues between your own code and a working app. +This application helps debug connectivity issues for IBM Cloud Services and provides comprehensive monitoring through Prometheus metrics. It includes outbound HTTP call simulation, database connectivity testing, and compute-intensive workload simulation. -- - - +## Features -This sample includes a `build` script which will build the container image and push the image to `icr.io/codeengine/network-test-app`. The customer should: -- Pull the image `icr.io/codeengine/network-test-app` -- Deploy the image as an application -- Make an HTTP request to your application, and observe the response +- **Outbound HTTP Calls**: Configurable endpoints that simulate delays and error responses to httpbin.org-compatible backends +- **Database Testing**: PostgreSQL connectivity verification with instrumented queries +- **Prometheus Metrics**: Comprehensive instrumentation for requests, outbound calls, database operations, and compute workloads +- **Load Testing**: Included shell script for generating realistic traffic patterns -## Configuring the Service Credentials for the App +## Quick Start -This app works by attempting to connect your Code Engine project to another IBM Cloud service; in order to do this properly, it must consume service credentials that should be configured by creating a `service binding` between the customer's project and the service they wish to connect to. 
+### Deploy to IBM Cloud Code Engine -For more information about how to create a service binding, see [Working with service bindings to integrate IBM Cloud services with Code Engine](https://cloud.ibm.com/docs/codeengine?topic=codeengine-service-binding). +Deploy the application with recommended resource settings: -### Example: Databases for PostgreSQL -If the app is attempting to connect to a postgres instance, then after creating a service binding for the instance the app will contain the credentials for the postgres instance in the form of an environment variable `DATABASES_FOR_POSTGRESQL_CONNECTION`. -- **Without this environment variable properly configured, the app will NOT be able to connect to postgres** \ No newline at end of file +```bash +ibmcloud ce application create \ + --name network-test-app \ + --src "." \ + --memory 0.5G \ + --cpu 0.25 \ + --port 8080 +``` + +The `--concurrency 5` setting limits each instance to handle a maximum of 5 concurrent requests, ensuring stable performance given the compute-intensive operations. + +To configure environment variables during deployment: + +```bash +ibmcloud ce application create \ + --name network-test-app \ + --src "." 
\ + --memory 0.5G \ + --cpu 0.25 \ + --env HTTPBIN_BASE_URL=https://httpbin.org \ + --env METRICS_COLLECT_NODE_METRICS_ENABLED=true +``` + +Update an existing application: + +```bash +ibmcloud ce application update \ + --name network-test-app \ + --env HTTPBIN_BASE_URL=https://custom-backend.example.com +``` + +### Run Locally + +Pull and run with Docker: +```bash +docker pull icr.io/codeengine/network-test-app +docker run -p 8080:8080 -p 2112:2112 icr.io/codeengine/network-test-app +``` + +Or run from source: +```bash +npm install +node app.mjs +``` + +The application exposes two servers: +- Main application: `http://localhost:8080` +- Metrics endpoint: `http://localhost:2112/metrics` + +## Configuration + +### Environment Variables + +- `PORT`: Application server port (default: 8080) +- `HTTPBIN_BASE_URL`: Backend URL for outbound calls (default: `https://httpbin.org`) +- `METRICS_NAME_PREFIX`: Prefix for all Prometheus metrics (default: `mymetrics_`) +- `METRICS_COLLECT_NODE_METRICS_ENABLED`: Enable Node.js runtime metrics (set to "true") +- `DATABASES_FOR_POSTGRESQL_CONNECTION`: PostgreSQL connection credentials (JSON format) + +### Deploying httpbin Backend + +To deploy your own httpbin instance on IBM Cloud Code Engine instead of using the public service, use the following command with an image from a registry other than docker.io: + +```bash +ibmcloud ce application update \ + --name httpbin \ + --src https://github.com/mark-sivill/httpbin \ + --memory 0.5G \ + --cpu 0.25 \ + --min-scale 1 \ + --max-scale 3 \ + --concurrency 100 \ + --port 9000 +``` + +After deployment, get the application URL: + +```bash +ibmcloud ce application get --name httpbin --output url +``` + +Then configure the network-test-app to use your httpbin instance: + +```bash +ibmcloud ce application update \ + --name network-test-app \ + --env HTTPBIN_BASE_URL=https://httpbin.your-project.us-south.codeengine.appdomain.cloud +``` + +The httpbin image from GitHub Container Registry 
(ghcr.io) is the official Postman-maintained implementation that works well in Code Engine environments. + +### Service Bindings + +For database connectivity, create a Code Engine service binding between your project and the IBM Cloud service. See [Working with service bindings](https://cloud.ibm.com/docs/codeengine?topic=codeengine-service-binding) for details. + +## API Endpoints + +- `GET /` - Health check +- `GET /test-db` - Test PostgreSQL connectivity +- `GET /outbound/delay` - Outbound call with random delay (0-2s) and 5% error rate +- `GET /outbound/get` - Simple outbound GET request +- `POST /outbound/post` - Outbound POST request +- `GET /outbound/status/:code` - Request specific HTTP status code + +All outbound endpoints include simulated compute-intensive data processing (0-3s duration, 40-80% CPU intensity). + +## Metrics + +The application exposes Prometheus metrics at `/metrics` (port 2112). All metric names are prefixed with a configurable value set via the `METRICS_NAME_PREFIX` environment variable (default: `mymetrics_node_`). 
+ +Enable custom metrics scraping: + +```bash +ibmcloud ce project select --name your-project-name --kubecfg + +kubectl patch kservice "network-test-app" --type=json -p='[ + {"op":"add","path":"/spec/template/metadata/annotations/codeengine.cloud.ibm.com~1userMetricsScrape","value":"true"}, + {"op":"add","path":"/spec/template/metadata/annotations/codeengine.cloud.ibm.com~1userMetricsPath","value":"/metrics"}, + {"op":"add","path":"/spec/template/metadata/annotations/codeengine.cloud.ibm.com~1userMetricsPort","value":"2112"} +]' +``` + +Once custom metrics scraping is enabled (see asset [metrics-collector](../metrics-collector/README.md)), the following command can be used to import the "My custom Code Engine Metrics" dashboard into IBM Cloud Monitoring: + +```bash +# Load the custom metric dashboard configuration +CE_CUSTOM_METRICS_DASHBOARD=$(curl -sL https://raw.githubusercontent.com/IBM/CodeEngine/main/network-test-app/my-custom-code-engine-metrics-dashboard.json) + +# Import the dashboard +curl -X POST https://$REGION.monitoring.cloud.ibm.com/api/v3/dashboards \ + -H "Authorization: $(ibmcloud iam oauth-tokens --output JSON|jq -r '.iam_token')" \ + -H "IBMInstanceID: $MONITORING_INSTANCE_GUID" \ + -H "Content-Type: application/json" \ + -d "{\"dashboard\": $CE_CUSTOM_METRICS_DASHBOARD}" +``` + +To customize the prefix, set the environment variable when starting the application: + +```bash +METRICS_NAME_PREFIX=myapp_ node app.mjs +``` + +On Code Engine set the environment variable in the application configuration: + +```bash +ibmcloud ce app update "network-test-app" --env METRICS_NAME_PREFIX=myapp_ +``` + +Following metrics are emitted by the network-test-app: + +**Request Metrics** +- `mymetrics_node_requests_total`: Total requests by method and path + +**Outbound Call Metrics** +- `mymetrics_node_outbound_request_duration_seconds`: Histogram of outbound request durations +- `mymetrics_node_outbound_requests_total`: Total outbound requests by target, method, 
and status + +**Database Metrics** +- `mymetrics_node_db_query_duration_seconds`: Histogram of query durations by operation and table +- `mymetrics_node_db_queries_total`: Total queries by operation, table, and status +- `mymetrics_node_db_connections_active`: Active database connections gauge + +**Compute Metrics** +- `mymetrics_node_compute_duration_seconds`: Histogram of compute operation durations + + + +## Load Testing + +Generate test traffic using the included script: + +```bash +# Local testing +./load-test.sh + +# IBM Cloud Code Engine deployment +TARGET_URL=https://your-app.example.com ./load-test.sh + +# Custom configuration +TARGET_URL=https://your-app.example.com DURATION=120 CONCURRENT_REQUESTS=10 ./load-test.sh +``` + +Configuration options: +- `TARGET_URL`: Application endpoint (default: http://localhost:8080) +- `DURATION`: Test duration in seconds (default: 60) +- `CONCURRENT_REQUESTS`: Number of concurrent workers (default: 5) + +## Building + +Build and push the container image: +```bash +./build +``` + +This builds the image and pushes it to `icr.io/codeengine/network-test-app`. 
diff --git a/network-test-app/app.js b/network-test-app/app.js deleted file mode 100644 index b50427e90..000000000 --- a/network-test-app/app.js +++ /dev/null @@ -1,71 +0,0 @@ -const { Client } = require("pg"); -const express = require("express"); -const app = express() -const timeoutMs = 15000 // timeout in 15 seconds -const port = process.env.PORT; - -app.get("/", async (request, response) => { - pgServiceCredentials = process.env.DATABASES_FOR_POSTGRESQL_CONNECTION - if(!!pgServiceCredentials){ - /* - Postgres service credentials have been configured properly, - continue with attempting to connect to service - */ - try { - // Use env variables loaded from service binding to connect to our postgres instance - console.log("Connecting to PostgreSQL instance..."); - - postgresSetup = JSON.parse(pgServiceCredentials); - cli = postgresSetup.cli; - postgres = postgresSetup.postgres; - cert = Buffer.from(postgres.certificate.certificate_base64, 'base64').toString('utf8'); - - const client = new Client({ - user: postgres.authentication.username, - password: cli.environment.PGPASSWORD, - host: postgres.hosts[0].hostname, - database: postgres.database, - port: postgres.hosts[0].port, - statement_timeout: timeoutMs, - query_timeout: timeoutMs, - lock_timeout: timeoutMs, - application_name: "network-test-app", - connectionTimeoutMillis: timeoutMs, - ssl: { - ca: cert, - rejectUnauthorized: true, - }, - }); - await client.connect(); - - // Run a simple command to verify that we connected to the postgres instance - console.log("List tables"); - result = await client.query("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE';"); - console.log(result) - await client.end() - response.status(200).send("Successfully connected to postgres instance"); - } catch (err) { - console.error("Failed to connect to PostgreSQL instance", err); - response.status(500).send("Could not connect to postgres instance:", err); - } - } else { - 
response.status(500).send("Could not connect to postgres instance: no postgres instance configured"); - } - - -}) - -const server = app.listen(port, async () => { - console.log('listening on localhost', port) -}) - -process.on('SIGTERM', () => { - console.info('SIGTERM signal received.'); - server.close(() => { - console.log('Http server closed.'); - }); -}); - - - - diff --git a/network-test-app/app.mjs b/network-test-app/app.mjs new file mode 100644 index 000000000..835e73a37 --- /dev/null +++ b/network-test-app/app.mjs @@ -0,0 +1,370 @@ +import express from "express"; +import promClient from "prom-client"; +import { closeDbClient, getDbClient } from "./utils/db.mjs"; + +// ==================================== +// Configuration +// ==================================== +const HTTPBIN_BASE_URL = process.env.HTTPBIN_BASE_URL || "https://httpbin.org"; + +// ==================================== +// Initialize Prometheus metrics +// ==================================== +const METRICS_NAME_PREFIX = process.env.METRICS_NAME_PREFIX || "mymetrics_node_"; +// Create a registry to register the metrics +const register = new promClient.Registry(); + +// Create a custom counter metric with path label +// Note: For high-cardinality paths, consider using a Histogram instead to track +// request duration distribution, or a Gauge to track active requests. 
+// Histogram example: new promClient.Histogram({ +// name: `${METRICS_NAME_PREFIX}request_duration_seconds`, +// help: "Request duration in seconds", +// labelNames: ["method", "path", "status_code"], +// buckets: [0.1, 0.5, 1, 2, 5] +// }); +const counter = new promClient.Counter({ + name: `${METRICS_NAME_PREFIX}requests_total`, + help: "Total number of requests", + labelNames: ["method", "path"], +}); +register.registerMetric(counter); + +// Outbound HTTP call metrics +const outboundCallDuration = new promClient.Histogram({ + name: `${METRICS_NAME_PREFIX}outbound_request_duration_seconds`, + help: "Duration of outbound HTTP requests in seconds", + labelNames: ["target", "method", "status_code"], + buckets: [0.1, 0.5, 1, 2, 5, 10], +}); +register.registerMetric(outboundCallDuration); + +const outboundCallTotal = new promClient.Counter({ + name: `${METRICS_NAME_PREFIX}outbound_requests_total`, + help: "Total number of outbound HTTP requests", + labelNames: ["target", "method", "status_code"], +}); +register.registerMetric(outboundCallTotal); + +// Database operation metrics +const dbQueryDuration = new promClient.Histogram({ + name: `${METRICS_NAME_PREFIX}db_query_duration_seconds`, + help: "Duration of database queries in seconds", + labelNames: ["operation", "table", "status"], + buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5], +}); +register.registerMetric(dbQueryDuration); + +const dbQueryTotal = new promClient.Counter({ + name: `${METRICS_NAME_PREFIX}db_queries_total`, + help: "Total number of database queries", + labelNames: ["operation", "table", "status"], +}); +register.registerMetric(dbQueryTotal); + +const dbConnectionsActive = new promClient.Gauge({ + name: `${METRICS_NAME_PREFIX}db_connections_active`, + help: "Number of active database connections", +}); +register.registerMetric(dbConnectionsActive); + +// Compute operation metrics +const computeDuration = new promClient.Histogram({ + name: `${METRICS_NAME_PREFIX}compute_duration_seconds`, + help: "Duration 
of compute-intensive operations in seconds", + labelNames: ["operation"], + buckets: [0.5, 1, 2, 3, 5], +}); +register.registerMetric(computeDuration); + +if (process.env.METRICS_COLLECT_NODE_METRICS_ENABLED === "true") { + promClient.collectDefaultMetrics({ register, prefix: METRICS_NAME_PREFIX }); +} + +// ==================================== +// Helper Functions +// ==================================== + +// Simulate compute-intensive operation +function simulateCompute(durationSeconds, cpuIntensity) { + const startTime = Date.now(); + const endTime = startTime + durationSeconds * 1000; + + // CPU-intensive loop based on intensity (40-80%) + while (Date.now() < endTime) { + // Perform some CPU work + const workIterations = Math.floor(cpuIntensity * 1000); + for (let i = 0; i < workIterations; i++) { + Math.sqrt(Math.random() * 1000000); + } + // Small sleep to control CPU usage + const sleepTime = (100 - cpuIntensity) / 10; + const sleepEnd = Date.now() + sleepTime; + while (Date.now() < sleepEnd) { + // Busy wait for precise timing + } + } +} + +// Make outbound HTTP call with metrics +async function makeOutboundCall(endpoint, method = "GET") { + const url = `${HTTPBIN_BASE_URL}${endpoint}`; + const startTime = Date.now(); + + try { + const response = await fetch(url, { method }); + const duration = (Date.now() - startTime) / 1000; + const statusCode = response.status.toString(); + + // Record metrics + outboundCallDuration.observe({ target: HTTPBIN_BASE_URL, method, status_code: statusCode }, duration); + outboundCallTotal.inc({ target: HTTPBIN_BASE_URL, method, status_code: statusCode }); + + return { success: true, status: response.status, duration, data: await response.text() }; + } catch (error) { + const duration = (Date.now() - startTime) / 1000; + + // Record error metrics + outboundCallDuration.observe({ target: HTTPBIN_BASE_URL, method, status_code: "error" }, duration); + outboundCallTotal.inc({ target: HTTPBIN_BASE_URL, method, status_code: "error" 
}); + + return { success: false, error: error.message, duration }; + } +} + +// Instrumented DB query wrapper +async function executeDbQuery(dbClient, query, operation, table) { + const startTime = Date.now(); + let status = "success"; + + try { + const result = await dbClient.query(query); + const duration = (Date.now() - startTime) / 1000; + + dbQueryDuration.observe({ operation, table, status }, duration); + dbQueryTotal.inc({ operation, table, status }); + + return result; + } catch (error) { + status = "error"; + const duration = (Date.now() - startTime) / 1000; + + dbQueryDuration.observe({ operation, table, status }, duration); + dbQueryTotal.inc({ operation, table, status }); + + throw error; + } +} + +// ====================================== +// Initialize Express app +// ====================================== +const app = express(); +app.use(express.json()); +const router = express.Router(); +app.use("/", router); + +// Middleware to count requests with path +router.use((req, res, next) => { + counter.inc({ method: req.method, path: req.path }); + next(); +}); + +router.get("/", (req, res) => { + res.send(`app '${process.env.CE_APP || "network-test-app"}' is ready!`); +}); + +router.get("/test-db", async (request, response) => { + const dbClient = await getDbClient(); + if (!dbClient) { + return response.status(500).send("Could not connect to postgres instance: no postgres instance configured"); + } + + try { + // Update connection gauge + dbConnectionsActive.inc(); + + // Run a simple command to verify that we connected to the postgres instance + console.log("List tables"); + const result = await executeDbQuery( + dbClient, + "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE';", + "SELECT", + "INFORMATION_SCHEMA.TABLES", + ); + console.log(`Received the following query result: ${JSON.stringify(result)}`); + response.status(200).send("Successfully connected to postgres instance"); + } catch (err) { + console.error("Failed 
to connect to PostgreSQL instance", err); + response.status(500).send(`Could not connect to postgres instance: '${err.message}'`); + } finally { + dbConnectionsActive.dec(); + } +}); + +// ==================================== +// Outbound call endpoints +// ==================================== + +router.get("/outbound/delay", async (req, res) => { + try { + // Random delay between 0-2 seconds + const delay = Math.random() * 2; + + // 5% error rate + const shouldError = Math.random() < 0.05; + + if (shouldError) { + // Simulate error by calling status/500 + const result = await makeOutboundCall("/status/500", "GET"); + + // Simulate compute-intensive data handling + const computeStart = Date.now(); + const computeDurationSec = Math.random() * 3; // 0-3 seconds + const cpuIntensity = 40 + Math.random() * 40; // 40-80% + simulateCompute(computeDurationSec, cpuIntensity); + const actualComputeDuration = (Date.now() - computeStart) / 1000; + computeDuration.observe({ operation: "data_processing" }, actualComputeDuration); + + return res.status(500).json({ + message: "Simulated error response", + delay, + outboundCall: result, + computeTime: actualComputeDuration, + cpuIntensity: `${cpuIntensity.toFixed(1)}%`, + }); + } + + // Normal flow with delay + const result = await makeOutboundCall(`/delay/${delay.toFixed(1)}`, "GET"); + + // Simulate compute-intensive data handling + const computeStart = Date.now(); + const computeDurationSec = Math.random() * 3; // 0-3 seconds + const cpuIntensity = 40 + Math.random() * 40; // 40-80% + simulateCompute(computeDurationSec, cpuIntensity); + const actualComputeDuration = (Date.now() - computeStart) / 1000; + computeDuration.observe({ operation: "data_processing" }, actualComputeDuration); + + res.status(200).json({ + message: "Outbound call completed", + delay, + outboundCall: result, + computeTime: actualComputeDuration, + cpuIntensity: `${cpuIntensity.toFixed(1)}%`, + }); + } catch (error) { + res.status(500).json({ error: 
error.message }); + } +}); + +router.get("/outbound/status/:code", async (req, res) => { + try { + const statusCode = req.params.code; + const result = await makeOutboundCall(`/status/${statusCode}`, "GET"); + + // Simulate compute-intensive data handling + const computeStart = Date.now(); + const computeDurationSec = Math.random() * 3; + const cpuIntensity = 40 + Math.random() * 40; + simulateCompute(computeDurationSec, cpuIntensity); + const actualComputeDuration = (Date.now() - computeStart) / 1000; + computeDuration.observe({ operation: "data_processing" }, actualComputeDuration); + + res.status(200).json({ + message: "Outbound call completed", + requestedStatus: statusCode, + outboundCall: result, + computeTime: actualComputeDuration, + cpuIntensity: `${cpuIntensity.toFixed(1)}%`, + }); + } catch (error) { + res.status(500).json({ error: error.message }); + } +}); + +router.get("/outbound/get", async (req, res) => { + try { + const result = await makeOutboundCall("/get", "GET"); + + // Simulate compute-intensive data handling + const computeStart = Date.now(); + const computeDurationSec = Math.random() * 3; + const cpuIntensity = 40 + Math.random() * 40; + simulateCompute(computeDurationSec, cpuIntensity); + const actualComputeDuration = (Date.now() - computeStart) / 1000; + computeDuration.observe({ operation: "data_processing" }, actualComputeDuration); + + res.status(200).json({ + message: "Outbound GET call completed", + outboundCall: result, + computeTime: actualComputeDuration, + cpuIntensity: `${cpuIntensity.toFixed(1)}%`, + }); + } catch (error) { + res.status(500).json({ error: error.message }); + } +}); + +router.post("/outbound/post", async (req, res) => { + try { + const result = await makeOutboundCall("/post", "POST"); + + // Simulate compute-intensive data handling + const computeStart = Date.now(); + const computeDurationSec = Math.random() * 3; + const cpuIntensity = 40 + Math.random() * 40; + simulateCompute(computeDurationSec, cpuIntensity); 
+ const actualComputeDuration = (Date.now() - computeStart) / 1000; + computeDuration.observe({ operation: "data_processing" }, actualComputeDuration); + + res.status(200).json({ + message: "Outbound POST call completed", + outboundCall: result, + computeTime: actualComputeDuration, + cpuIntensity: `${cpuIntensity.toFixed(1)}%`, + }); + } catch (error) { + res.status(500).json({ error: error.message }); + } +}); + +// ====================================== +// Start the http server +// ====================================== +const port = process.env.PORT || 8080; +const server = app.listen(port, async () => { + console.log(`Application server is running at http://localhost:${port}`); + console.log(`Configured httpbin backend: ${HTTPBIN_BASE_URL}`); +}); + +// ====================================== +// Metrics server +// ====================================== +const metricsApp = express(); +const metricsPort = 2112; +// Expose metrics endpoint +metricsApp.get("/metrics", async (req, res) => { + res.set("Content-Type", register.contentType); + res.end(await register.metrics()); +}); +// Start the metrics server +const metricsServer = metricsApp.listen(metricsPort, () => { + console.log(`Metrics server is running at http://localhost:${metricsPort}`); +}); + +// ====================================== +// Handle shutdown signals +// ====================================== +process.on("SIGTERM", async () => { + console.info("SIGTERM signal received."); + await closeDbClient(); + + metricsServer.close(() => { + console.log("Metrics server closed."); + }); + + server.close(() => { + console.log("Http server closed."); + }); +}); diff --git a/network-test-app/load-test.sh b/network-test-app/load-test.sh new file mode 100755 index 000000000..0d45a32f8 --- /dev/null +++ b/network-test-app/load-test.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +# Load testing script for network-test-app +# Generates random load to produce metric data +# Press Ctrl+C to abort the test + +# Configuration 
+TARGET_URL="${TARGET_URL:-http://localhost:8080}"
+DURATION="${DURATION:-60}"
+CONCURRENT_REQUESTS="${CONCURRENT_REQUESTS:-5}"
+
+# Track worker PIDs for cleanup
+WORKER_PIDS=()
+INTERRUPTED=false
+
+# Cleanup function
+cleanup() {
+    if [ "$INTERRUPTED" = false ]; then
+        INTERRUPTED=true
+        echo ""
+        echo "Aborting load test..."
+
+        # Kill all worker processes
+        for pid in "${WORKER_PIDS[@]}"; do
+            kill "$pid" 2>/dev/null
+        done
+
+        # Wait for all processes to terminate
+        wait 2>/dev/null
+
+        echo "Load test aborted!"
+        echo "View metrics at: ${TARGET_URL%:*}:2112/metrics"
+        exit 0
+    fi
+}
+
+# Set up signal handlers for graceful shutdown
+trap cleanup SIGINT SIGTERM
+
+echo "Load Testing Configuration:"
+echo " Target URL: $TARGET_URL"
+echo " Duration: ${DURATION}s"
+echo " Concurrent requests: $CONCURRENT_REQUESTS"
+echo ""
+echo "Press Ctrl+C to abort the test"
+echo ""
+
+# Endpoints to test
+ENDPOINTS=(
+    "/"
+    "/outbound/delay"
+    "/outbound/get"
+    "/outbound/post"
+    "/outbound/status/200"
+    "/outbound/status/404"
+    "/test-db"
+)
+
+# Function to make a random request
+make_request() {
+    local endpoint=${ENDPOINTS[$RANDOM % ${#ENDPOINTS[@]}]}
+    local method="GET"
+
+    # POST endpoint
+    if [[ "$endpoint" == "/outbound/post" ]]; then
+        method="POST"
+    fi
+
+    local start_time=$(date +%s%N)
+    local response_code=$(curl -s -o /dev/null -w "%{http_code}" -X "$method" "${TARGET_URL}${endpoint}" 2>/dev/null)
+    local end_time=$(date +%s%N)
+    local duration=$(( (end_time - start_time) / 1000000 ))
+
+    echo "[$(date +%H:%M:%S)] $method $endpoint -> $response_code (${duration}ms)"
+}
+
+# Function to run load test worker
+run_worker() {
+    local worker_id=$1
+    local end_time=$(($(date +%s) + DURATION))
+
+    while [ $(date +%s) -lt $end_time ] && [ "$INTERRUPTED" = false ]; do
+        make_request
+        # Random sleep between requests (0.5-2 seconds)
+        sleep $(awk -v min=0.5 -v max=2 'BEGIN{srand(); print min+rand()*(max-min)}') 2>/dev/null || break
+    done
+}
+
+# Start concurrent workers
+echo 
"Starting load test..." +for i in $(seq 1 $CONCURRENT_REQUESTS); do + run_worker $i & + WORKER_PIDS+=($!) +done + +# Wait for all workers to complete +wait + +# Check if we completed normally or were interrupted +if [ "$INTERRUPTED" = false ]; then + echo "" + echo "Load test completed!" + echo "View metrics at: ${TARGET_URL%:*}:2112/metrics" +fi diff --git a/network-test-app/my-custom-code-engine-metrics-dashboard.json b/network-test-app/my-custom-code-engine-metrics-dashboard.json new file mode 100644 index 000000000..43475341e --- /dev/null +++ b/network-test-app/my-custom-code-engine-metrics-dashboard.json @@ -0,0 +1,792 @@ +{ + "name": "My custom Code Engine Metrics", + "panels": [ + { + "id": 1, + "type": "text", + "name": "Dashboard Overview", + "description": "", + "nullValueDisplayText": null, + "links": null, + "markdownSource": "Monitor custom application metrics for Code Engine applications.\n\n**Metrics included:**\n- Request rates by method and path\n- Outbound HTTP request performance\n- Database connection monitoring\n- Compute operation duration", + "transparentBackground": false, + "panelTitleVisible": true, + "textAutosized": false + }, + { + "id": 2, + "type": "advancedTimechart", + "name": "Request Rate (by Path)", + "description": "Rate of incoming requests grouped by path", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "rate(mymetrics_node_requests_total{$__scope}[5m])", + "enabled": true, + "displayInfo": { + "displayName": "Request Rate", + "timeSeriesDisplayNameTemplate": "{{method}} {{path}}", + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 2, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + 
"showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 3, + "type": "advancedNumber", + "name": "Total Requests", + "description": "Total number of requests received", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(mymetrics_node_requests_total{$__scope})", + "enabled": true, + "displayInfo": { + "displayName": "Total Requests", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 4, + "type": "advancedTimechart", + "name": "Outbound Request Duration (P95)", + "description": "95th percentile of outbound HTTP request duration", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "histogram_quantile(0.95, rate(mymetrics_node_outbound_request_duration_seconds_bucket{$__scope}[5m]))", + "enabled": true, + "displayInfo": { + "displayName": "P95 Duration", + "timeSeriesDisplayNameTemplate": 
"{{target}} {{method}}", + "type": "lines" + }, + "format": { + "unit": "relativeTime", + "inputFormat": "s", + "displayFormat": "auto", + "decimals": 3, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 2.0, + "inputFormat": "s", + "displayText": "" + }, + { + "severity": "medium", + "value": 1.0, + "inputFormat": "s", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 5, + "type": "advancedTimechart", + "name": "Outbound Request Duration (Average)", + "description": "Average duration of outbound HTTP requests", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "rate(mymetrics_node_outbound_request_duration_seconds_sum{$__scope}[5m]) / rate(mymetrics_node_outbound_request_duration_seconds_count{$__scope}[5m])", + "enabled": true, + "displayInfo": { + "displayName": "Avg Duration", + "timeSeriesDisplayNameTemplate": "{{target}} {{method}}", + "type": "lines" + }, + "format": { + "unit": "relativeTime", + "inputFormat": "s", + "displayFormat": "auto", + 
"decimals": 3, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 6, + "type": "advancedTimechart", + "name": "Outbound Request Rate", + "description": "Rate of outbound HTTP requests by target and status", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "rate(mymetrics_node_outbound_requests_total{$__scope}[5m])", + "enabled": true, + "displayInfo": { + "displayName": "Request Rate", + "timeSeriesDisplayNameTemplate": "{{target}} {{method}} ({{status_code}})", + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 2, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + 
"bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 7, + "type": "advancedNumber", + "name": "Active DB Connections", + "description": "Current number of active database connections", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "mymetrics_node_db_connections_active{$__scope}", + "enabled": true, + "displayInfo": { + "displayName": "Active Connections", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 80, + "inputFormat": "1", + "displayText": "" + }, + { + "severity": "medium", + "value": 50, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 8, + "type": "advancedTimechart", + "name": "Database Connections Over Time", + "description": "Active database connections timeline", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "mymetrics_node_db_connections_active{$__scope}", + "enabled": true, + "displayInfo": { + "displayName": "Active Connections", 
+ "timeSeriesDisplayNameTemplate": "DB Connections", + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "number", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 80, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 9, + "type": "advancedTimechart", + "name": "Compute Operation Duration (P95)", + "description": "95th percentile of compute operation duration", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "histogram_quantile(0.95, rate(mymetrics_node_compute_duration_seconds_bucket{$__scope}[5m]))", + "enabled": true, + "displayInfo": { + "displayName": "P95 Duration", + "timeSeriesDisplayNameTemplate": "{{operation}}", + "type": "lines" + }, + "format": { + "unit": "relativeTime", + "inputFormat": "s", + "displayFormat": "auto", + "decimals": 3, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + 
"enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 5.0, + "inputFormat": "s", + "displayText": "" + }, + { + "severity": "medium", + "value": 3.0, + "inputFormat": "s", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 10, + "type": "advancedTimechart", + "name": "Compute Operation Duration (Average)", + "description": "Average duration of compute operations", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "rate(mymetrics_node_compute_duration_seconds_sum{$__scope}[5m]) / rate(mymetrics_node_compute_duration_seconds_count{$__scope}[5m])", + "enabled": true, + "displayInfo": { + "displayName": "Avg Duration", + "timeSeriesDisplayNameTemplate": "{{operation}}", + "type": "lines" + }, + "format": { + "unit": "relativeTime", + "inputFormat": "s", + "displayFormat": "auto", + "decimals": 3, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + 
"showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "s", + "displayFormat": "auto", + "decimals": null, + "minValue": 0, + "maxValue": null, + "minInputFormat": "s", + "maxInputFormat": "s", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + } + ], + "scopeExpressionList": [], + "eventDisplaySettings": { + "enabled": true, + "queryParams": { + "severities": [], + "alertStatuses": [], + "categories": [], + "filter": "", + "teamScope": false + } + }, + "shared": true, + "public": false, + "description": "Custom dashboard for monitoring application metrics including request rates, outbound HTTP performance, database connections, and compute operations", + "layout": [ + { + "panelId": 1, + "x": 0, + "y": 0, + "w": 12, + "h": 3 + }, + { + "panelId": 3, + "x": 12, + "y": 0, + "w": 6, + "h": 3 + }, + { + "panelId": 7, + "x": 18, + "y": 0, + "w": 6, + "h": 3 + }, + { + "panelId": 2, + "x": 0, + "y": 3, + "w": 12, + "h": 6 + }, + { + "panelId": 6, + "x": 12, + "y": 3, + "w": 12, + "h": 6 + }, + { + "panelId": 4, + "x": 0, + "y": 9, + "w": 12, + "h": 6 + }, + { + "panelId": 5, + "x": 12, + "y": 9, + "w": 12, + "h": 6 + }, + { + "panelId": 8, + "x": 0, + "y": 15, + "w": 12, + "h": 6 + }, + { + "panelId": 9, + "x": 0, + "y": 21, + "w": 12, + "h": 6 + }, + { + "panelId": 10, + "x": 12, + "y": 21, + "w": 12, + "h": 6 + } + ], + "schema": 3 +} diff --git a/network-test-app/package-lock.json b/network-test-app/package-lock.json index 46b4a8c16..d5302ca88 100644 --- 
a/network-test-app/package-lock.json +++ b/network-test-app/package-lock.json @@ -9,51 +9,61 @@ "version": "1.0.0", "license": "ISC", "dependencies": { - "express": "^4.22.1", - "pg": "^8.13.0" + "express": "^5.2.1", + "pg": "^8.20.0", + "prom-client": "^15.1.3" + } + }, + "node_modules/@opentelemetry/api": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", + "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", + "license": "Apache-2.0", + "engines": { + "node": ">=8.0.0" } }, "node_modules/accepts": { - "version": "1.3.8", - "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.8.tgz", - "integrity": "sha512-PYAthTa2m2VKxuvSD3DPC/Gy+U+sOA1LAuT8mkmRuvw+NACSaeXEQ+NHcVF7rONl6qcaxV3Uuemwawk+7+SJLw==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/accepts/-/accepts-2.0.0.tgz", + "integrity": "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==", "license": "MIT", "dependencies": { - "mime-types": "~2.1.34", - "negotiator": "0.6.3" + "mime-types": "^3.0.0", + "negotiator": "^1.0.0" }, "engines": { "node": ">= 0.6" } }, - "node_modules/array-flatten": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", - "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==", + "node_modules/bintrees": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/bintrees/-/bintrees-1.0.2.tgz", + "integrity": "sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==", "license": "MIT" }, "node_modules/body-parser": { - "version": "1.20.3", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.3.tgz", - "integrity": "sha512-7rAxByjUMqQ3/bHJy7D6OGXvx/MMc4IqBn/X0fcM1QUcAItpZrBEYhWGem+tzXH90c+G01ypMcYJBO9Y30203g==", + "version": "2.2.2", + 
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.2.tgz", + "integrity": "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==", "license": "MIT", "dependencies": { - "bytes": "3.1.2", - "content-type": "~1.0.5", - "debug": "2.6.9", - "depd": "2.0.0", - "destroy": "1.2.0", - "http-errors": "2.0.0", - "iconv-lite": "0.4.24", - "on-finished": "2.4.1", - "qs": "6.13.0", - "raw-body": "2.5.2", - "type-is": "~1.6.18", - "unpipe": "1.0.0" + "bytes": "^3.1.2", + "content-type": "^1.0.5", + "debug": "^4.4.3", + "http-errors": "^2.0.0", + "iconv-lite": "^0.7.0", + "on-finished": "^2.4.1", + "qs": "^6.14.1", + "raw-body": "^3.0.1", + "type-is": "^2.0.1" }, "engines": { - "node": ">= 0.8", - "npm": "1.2.8000 || >= 1.4.16" + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/bytes": { @@ -95,15 +105,16 @@ } }, "node_modules/content-disposition": { - "version": "0.5.4", - "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.4.tgz", - "integrity": "sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.0.1.tgz", + "integrity": "sha512-oIXISMynqSqm241k6kcQ5UwttDILMK4BiurCfGEREw6+X9jkkpEe5T9FZaApyLGGOnFuyMWZpdolTXMtvEJ08Q==", "license": "MIT", - "dependencies": { - "safe-buffer": "5.2.1" - }, "engines": { - "node": ">= 0.6" + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/content-type": { @@ -125,18 +136,29 @@ } }, "node_modules/cookie-signature": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz", - "integrity": "sha512-QADzlaHc8icV8I7vbaJXJwod9HWYp8uCqf1xa4OfNu1T7JVxQIrUgOWtHdNDtPiywmFbiS12VjotIXLrKM3orQ==", - "license": 
"MIT" + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.2.2.tgz", + "integrity": "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg==", + "license": "MIT", + "engines": { + "node": ">=6.6.0" + } }, "node_modules/debug": { - "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", "license": "MIT", "dependencies": { - "ms": "2.0.0" + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } } }, "node_modules/depd": { @@ -148,16 +170,6 @@ "node": ">= 0.8" } }, - "node_modules/destroy": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.2.0.tgz", - "integrity": "sha512-2sJGJTaXIIaR1w4iJSNoN0hnMY7Gpc/n8D4qSCJw8QqFWXf7cuAgnEHxBpweaVcPevC2l3KpjYCx3NypQQgaJg==", - "license": "MIT", - "engines": { - "node": ">= 0.8", - "npm": "1.2.8000 || >= 1.4.16" - } - }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -233,82 +245,67 @@ } }, "node_modules/express": { - "version": "4.22.1", - "resolved": "https://registry.npmjs.org/express/-/express-4.22.1.tgz", - "integrity": "sha512-F2X8g9P1X7uCPZMA3MVf9wcTqlyNp7IhH5qPCI0izhaOIYXaW9L535tGA3qmjRzpH+bZczqq7hVKxTR4NWnu+g==", - "license": "MIT", - "dependencies": { - "accepts": "~1.3.8", - "array-flatten": "1.1.1", - "body-parser": "~1.20.3", - "content-disposition": "~0.5.4", - "content-type": "~1.0.4", - "cookie": "~0.7.1", - "cookie-signature": "~1.0.6", - "debug": "2.6.9", - "depd": "2.0.0", - "encodeurl": 
"~2.0.0", - "escape-html": "~1.0.3", - "etag": "~1.8.1", - "finalhandler": "~1.3.1", - "fresh": "~0.5.2", - "http-errors": "~2.0.0", - "merge-descriptors": "1.0.3", - "methods": "~1.1.2", - "on-finished": "~2.4.1", - "parseurl": "~1.3.3", - "path-to-regexp": "~0.1.12", - "proxy-addr": "~2.0.7", - "qs": "~6.14.0", - "range-parser": "~1.2.1", - "safe-buffer": "5.2.1", - "send": "~0.19.0", - "serve-static": "~1.16.2", - "setprototypeof": "1.2.0", - "statuses": "~2.0.1", - "type-is": "~1.6.18", - "utils-merge": "1.0.1", - "vary": "~1.1.2" - }, - "engines": { - "node": ">= 0.10.0" + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/express/-/express-5.2.1.tgz", + "integrity": "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==", + "license": "MIT", + "dependencies": { + "accepts": "^2.0.0", + "body-parser": "^2.2.1", + "content-disposition": "^1.0.0", + "content-type": "^1.0.5", + "cookie": "^0.7.1", + "cookie-signature": "^1.2.1", + "debug": "^4.4.0", + "depd": "^2.0.0", + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "etag": "^1.8.1", + "finalhandler": "^2.1.0", + "fresh": "^2.0.0", + "http-errors": "^2.0.0", + "merge-descriptors": "^2.0.0", + "mime-types": "^3.0.0", + "on-finished": "^2.4.1", + "once": "^1.4.0", + "parseurl": "^1.3.3", + "proxy-addr": "^2.0.7", + "qs": "^6.14.0", + "range-parser": "^1.2.1", + "router": "^2.2.0", + "send": "^1.1.0", + "serve-static": "^2.2.0", + "statuses": "^2.0.1", + "type-is": "^2.0.1", + "vary": "^1.1.2" + }, + "engines": { + "node": ">= 18" }, "funding": { "type": "opencollective", "url": "https://opencollective.com/express" } }, - "node_modules/express/node_modules/qs": { - "version": "6.14.2", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.14.2.tgz", - "integrity": "sha512-V/yCWTTF7VJ9hIh18Ugr2zhJMP01MY7c5kh4J870L7imm6/DIzBsNLTXzMwUA3yZ5b/KBqLx8Kp3uRvd7xSe3Q==", - "license": "BSD-3-Clause", - "dependencies": { - "side-channel": "^1.1.0" - }, - "engines": { - 
"node": ">=0.6" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/finalhandler": { - "version": "1.3.1", - "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-1.3.1.tgz", - "integrity": "sha512-6BN9trH7bp3qvnrRyzsBz+g3lZxTNZTbVO2EV1CS0WIcDbawYVdYvGflME/9QP0h0pYlCDBCTjYa9nZzMDpyxQ==", + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-2.1.1.tgz", + "integrity": "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA==", "license": "MIT", "dependencies": { - "debug": "2.6.9", - "encodeurl": "~2.0.0", - "escape-html": "~1.0.3", - "on-finished": "2.4.1", - "parseurl": "~1.3.3", - "statuses": "2.0.1", - "unpipe": "~1.0.0" + "debug": "^4.4.0", + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "on-finished": "^2.4.1", + "parseurl": "^1.3.3", + "statuses": "^2.0.1" }, "engines": { - "node": ">= 0.8" + "node": ">= 18.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/forwarded": { @@ -321,12 +318,12 @@ } }, "node_modules/fresh": { - "version": "0.5.2", - "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz", - "integrity": "sha512-zJ2mQYM18rEFOudeV4GShTGIQ7RbzA7ozbU9I/XBpm7kqgMywgmylMwXHxZJmkVoYkna9d2pVXVXPdYTP9ej8Q==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/fresh/-/fresh-2.0.0.tgz", + "integrity": "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==", "license": "MIT", "engines": { - "node": ">= 0.6" + "node": ">= 0.8" } }, "node_modules/function-bind": { @@ -412,31 +409,39 @@ } }, "node_modules/http-errors": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz", - "integrity": "sha512-FtwrG/euBzaEjYeRqOgly7G0qviiXoJWnvEH2Z1plBdXgbyjv34pHTSb9zoeHMyDy33+DWy5Wt9Wo+TURtOYSQ==", + "version": "2.0.1", + "resolved": 
"https://registry.npmjs.org/http-errors/-/http-errors-2.0.1.tgz", + "integrity": "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==", "license": "MIT", "dependencies": { - "depd": "2.0.0", - "inherits": "2.0.4", - "setprototypeof": "1.2.0", - "statuses": "2.0.1", - "toidentifier": "1.0.1" + "depd": "~2.0.0", + "inherits": "~2.0.4", + "setprototypeof": "~1.2.0", + "statuses": "~2.0.2", + "toidentifier": "~1.0.1" }, "engines": { "node": ">= 0.8" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/iconv-lite": { - "version": "0.4.24", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", - "integrity": "sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==", + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz", + "integrity": "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==", "license": "MIT", "dependencies": { - "safer-buffer": ">= 2.1.2 < 3" + "safer-buffer": ">= 2.1.2 < 3.0.0" }, "engines": { "node": ">=0.10.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/inherits": { @@ -454,6 +459,12 @@ "node": ">= 0.10" } }, + "node_modules/is-promise": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/is-promise/-/is-promise-4.0.0.tgz", + "integrity": "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==", + "license": "MIT" + }, "node_modules/math-intrinsics": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", @@ -464,75 +475,61 @@ } }, "node_modules/media-typer": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz", - "integrity": 
"sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/media-typer/-/media-typer-1.1.0.tgz", + "integrity": "sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==", "license": "MIT", "engines": { - "node": ">= 0.6" + "node": ">= 0.8" } }, "node_modules/merge-descriptors": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.3.tgz", - "integrity": "sha512-gaNvAS7TZ897/rVaZ0nMtAyxNyi/pdbjbAwUpFQpN70GqnVfOiXpeUUMKRBmzXaSQ8DdTX4/0ms62r2K+hE6mQ==", - "license": "MIT", - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/methods": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz", - "integrity": "sha512-iclAHeNqNm68zFtnZ0e+1L2yUIdvzNoauKU4WBA3VvH/vPFieF7qfRlwUZU+DA9P9bPXIS90ulxoUoCH23sV2w==", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-2.0.0.tgz", + "integrity": "sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g==", "license": "MIT", "engines": { - "node": ">= 0.6" - } - }, - "node_modules/mime": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", - "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==", - "license": "MIT", - "bin": { - "mime": "cli.js" + "node": ">=18" }, - "engines": { - "node": ">=4" + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, "node_modules/mime-db": { - "version": "1.52.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", - "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "version": "1.54.0", + "resolved": 
"https://registry.npmjs.org/mime-db/-/mime-db-1.54.0.tgz", + "integrity": "sha512-aU5EJuIN2WDemCcAp2vFBfp/m4EAhWJnUNSSw0ixs7/kXbd6Pg64EmwJkNdFhB8aWt1sH2CTXrLxo/iAGV3oPQ==", "license": "MIT", "engines": { "node": ">= 0.6" } }, "node_modules/mime-types": { - "version": "2.1.35", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", - "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-3.0.2.tgz", + "integrity": "sha512-Lbgzdk0h4juoQ9fCKXW4by0UJqj+nOOrI9MJ1sSj4nI8aI2eo1qmvQEie4VD1glsS250n15LsWsYtCugiStS5A==", "license": "MIT", "dependencies": { - "mime-db": "1.52.0" + "mime-db": "^1.54.0" }, "engines": { - "node": ">= 0.6" + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/ms": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", - "integrity": "sha512-Tpp60P6IUJDTuOq/5Z8cdskzJujfwqfOTkrwIwj7IRISpnkJnT6SyJ4PCPnGMoFjC9ddhal5KVIYtAt97ix05A==", + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "license": "MIT" }, "node_modules/negotiator": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.3.tgz", - "integrity": "sha512-+EUsqGPLsM+j/zdChZjsnX51g4XrHFOIXwfnCVPGlQk/k5giakcKsuxCObBRu6DSm9opw/O6slWbJdghQM4bBg==", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-1.0.0.tgz", + "integrity": "sha512-8Ofs/AUQh8MaEcrlq5xOX0CQ9ypTF5dl78mjlMNfOK08fzpgTHQRQPBxcPlEtIw0yRpws+Zo/3r+5WRby7u3Gg==", "license": "MIT", "engines": { "node": ">= 0.6" @@ -562,6 +559,15 @@ "node": ">= 0.8" } }, + "node_modules/once": { + "version": "1.4.0", + "resolved": 
"https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "license": "ISC", + "dependencies": { + "wrappy": "1" + } + }, "node_modules/parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", @@ -572,28 +578,33 @@ } }, "node_modules/path-to-regexp": { - "version": "0.1.12", - "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.12.tgz", - "integrity": "sha512-RA1GjUVMnvYFxuqovrEqZoxxW5NUZqbwKtYz/Tt7nXerk0LbLblQmrsgdeOxV5SFHf0UDggjS/bSeOZwt1pmEQ==", - "license": "MIT" + "version": "8.3.0", + "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-8.3.0.tgz", + "integrity": "sha512-7jdwVIRtsP8MYpdXSwOS0YdD0Du+qOoF/AEPIt88PcCFrZCzx41oxku1jD88hZBwbNUIEfpqvuhjFaMAqMTWnA==", + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } }, "node_modules/pg": { - "version": "8.13.0", - "resolved": "https://registry.npmjs.org/pg/-/pg-8.13.0.tgz", - "integrity": "sha512-34wkUTh3SxTClfoHB3pQ7bIMvw9dpFU1audQQeZG837fmHfHpr14n/AELVDoOYVDW2h5RDWU78tFjkD+erSBsw==", + "version": "8.20.0", + "resolved": "https://registry.npmjs.org/pg/-/pg-8.20.0.tgz", + "integrity": "sha512-ldhMxz2r8fl/6QkXnBD3CR9/xg694oT6DZQ2s6c/RI28OjtSOpxnPrUCGOBJ46RCUxcWdx3p6kw/xnDHjKvaRA==", "license": "MIT", + "peer": true, "dependencies": { - "pg-connection-string": "^2.7.0", - "pg-pool": "^3.7.0", - "pg-protocol": "^1.7.0", - "pg-types": "^2.1.0", - "pgpass": "1.x" + "pg-connection-string": "^2.12.0", + "pg-pool": "^3.13.0", + "pg-protocol": "^1.13.0", + "pg-types": "2.2.0", + "pgpass": "1.0.5" }, "engines": { - "node": ">= 8.0.0" + "node": ">= 16.0.0" }, "optionalDependencies": { - "pg-cloudflare": "^1.1.1" + "pg-cloudflare": "^1.3.0" }, "peerDependencies": { "pg-native": ">=3.0.1" @@ -605,16 +616,16 @@ } }, "node_modules/pg-cloudflare": { - "version": "1.1.1", - 
"resolved": "https://registry.npmjs.org/pg-cloudflare/-/pg-cloudflare-1.1.1.tgz", - "integrity": "sha512-xWPagP/4B6BgFO+EKz3JONXv3YDgvkbVrGw2mTo3D6tVDQRh1e7cqVGvyR3BE+eQgAvx1XhW/iEASj4/jCWl3Q==", + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/pg-cloudflare/-/pg-cloudflare-1.3.0.tgz", + "integrity": "sha512-6lswVVSztmHiRtD6I8hw4qP/nDm1EJbKMRhf3HCYaqud7frGysPv7FYJ5noZQdhQtN2xJnimfMtvQq21pdbzyQ==", "license": "MIT", "optional": true }, "node_modules/pg-connection-string": { - "version": "2.7.0", - "resolved": "https://registry.npmjs.org/pg-connection-string/-/pg-connection-string-2.7.0.tgz", - "integrity": "sha512-PI2W9mv53rXJQEOb8xNR8lH7Hr+EKa6oJa38zsK0S/ky2er16ios1wLKhZyxzD7jUReiWokc9WK5nxSnC7W1TA==", + "version": "2.12.0", + "resolved": "https://registry.npmjs.org/pg-connection-string/-/pg-connection-string-2.12.0.tgz", + "integrity": "sha512-U7qg+bpswf3Cs5xLzRqbXbQl85ng0mfSV/J0nnA31MCLgvEaAo7CIhmeyrmJpOr7o+zm0rXK+hNnT5l9RHkCkQ==", "license": "MIT" }, "node_modules/pg-int8": { @@ -627,18 +638,18 @@ } }, "node_modules/pg-pool": { - "version": "3.7.0", - "resolved": "https://registry.npmjs.org/pg-pool/-/pg-pool-3.7.0.tgz", - "integrity": "sha512-ZOBQForurqh4zZWjrgSwwAtzJ7QiRX0ovFkZr2klsen3Nm0aoh33Ls0fzfv3imeH/nw/O27cjdz5kzYJfeGp/g==", + "version": "3.13.0", + "resolved": "https://registry.npmjs.org/pg-pool/-/pg-pool-3.13.0.tgz", + "integrity": "sha512-gB+R+Xud1gLFuRD/QgOIgGOBE2KCQPaPwkzBBGC9oG69pHTkhQeIuejVIk3/cnDyX39av2AxomQiyPT13WKHQA==", "license": "MIT", "peerDependencies": { "pg": ">=8.0" } }, "node_modules/pg-protocol": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.7.0.tgz", - "integrity": "sha512-hTK/mE36i8fDDhgDFjy6xNOG+LCorxLG3WO17tku+ij6sVHXh1jQUJ8hYAnRhNla4QVD2H8er/FOjc/+EgC6yQ==", + "version": "1.13.0", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.13.0.tgz", + "integrity": 
"sha512-zzdvXfS6v89r6v7OcFCHfHlyG/wvry1ALxZo4LqgUoy7W9xhBDMaqOuMiF3qEV45VqsN6rdlcehHrfDtlCPc8w==", "license": "MIT" }, "node_modules/pg-types": { @@ -705,6 +716,19 @@ "node": ">=0.10.0" } }, + "node_modules/prom-client": { + "version": "15.1.3", + "resolved": "https://registry.npmjs.org/prom-client/-/prom-client-15.1.3.tgz", + "integrity": "sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.4.0", + "tdigest": "^0.1.1" + }, + "engines": { + "node": "^16 || ^18 || >=20" + } + }, "node_modules/proxy-addr": { "version": "2.0.7", "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", @@ -719,12 +743,12 @@ } }, "node_modules/qs": { - "version": "6.13.0", - "resolved": "https://registry.npmjs.org/qs/-/qs-6.13.0.tgz", - "integrity": "sha512-+38qI9SOr8tfZ4QmJNplMUxqjbe7LKvvZgWdExBOmd+egZTtjLB67Gu0HRX3u/XOq7UU2Nx6nsjvS16Z9uwfpg==", + "version": "6.15.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.15.0.tgz", + "integrity": "sha512-mAZTtNCeetKMH+pSjrb76NAM8V9a05I9aBZOHztWy/UqcJdQYNsf59vrRKWnojAT9Y+GbIvoTBC++CPHqpDBhQ==", "license": "BSD-3-Clause", "dependencies": { - "side-channel": "^1.0.6" + "side-channel": "^1.1.0" }, "engines": { "node": ">=0.6" @@ -743,39 +767,35 @@ } }, "node_modules/raw-body": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.2.tgz", - "integrity": "sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-3.0.2.tgz", + "integrity": "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==", "license": "MIT", "dependencies": { - "bytes": "3.1.2", - "http-errors": "2.0.0", - "iconv-lite": "0.4.24", - "unpipe": "1.0.0" + "bytes": "~3.1.2", + "http-errors": "~2.0.1", + "iconv-lite": "~0.7.0", + "unpipe": 
"~1.0.0" }, "engines": { - "node": ">= 0.8" + "node": ">= 0.10" } }, - "node_modules/safe-buffer": { - "version": "5.2.1", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", - "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/feross" - }, - { - "type": "patreon", - "url": "https://www.patreon.com/feross" - }, - { - "type": "consulting", - "url": "https://feross.org/support" - } - ], - "license": "MIT" + "node_modules/router": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/router/-/router-2.2.0.tgz", + "integrity": "sha512-nLTrUKm2UyiL7rlhapu/Zl45FwNgkZGaCpZbIHajDYgwlJCOzLSk+cIPAnsEqV955GjILJnKbdQC1nVPz+gAYQ==", + "license": "MIT", + "dependencies": { + "debug": "^4.4.0", + "depd": "^2.0.0", + "is-promise": "^4.0.0", + "parseurl": "^1.3.3", + "path-to-regexp": "^8.0.0" + }, + "engines": { + "node": ">= 18" + } }, "node_modules/safer-buffer": { "version": "2.1.2", @@ -784,57 +804,48 @@ "license": "MIT" }, "node_modules/send": { - "version": "0.19.0", - "resolved": "https://registry.npmjs.org/send/-/send-0.19.0.tgz", - "integrity": "sha512-dW41u5VfLXu8SJh5bwRmyYUbAoSB3c9uQh6L8h/KtsFREPWpbX1lrljJo186Jc4nmci/sGUZ9a0a0J2zgfq2hw==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/send/-/send-1.2.1.tgz", + "integrity": "sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ==", "license": "MIT", "dependencies": { - "debug": "2.6.9", - "depd": "2.0.0", - "destroy": "1.2.0", - "encodeurl": "~1.0.2", - "escape-html": "~1.0.3", - "etag": "~1.8.1", - "fresh": "0.5.2", - "http-errors": "2.0.0", - "mime": "1.6.0", - "ms": "2.1.3", - "on-finished": "2.4.1", - "range-parser": "~1.2.1", - "statuses": "2.0.1" + "debug": "^4.4.3", + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "etag": "^1.8.1", + "fresh": "^2.0.0", + "http-errors": 
"^2.0.1", + "mime-types": "^3.0.2", + "ms": "^2.1.3", + "on-finished": "^2.4.1", + "range-parser": "^1.2.1", + "statuses": "^2.0.2" }, "engines": { - "node": ">= 0.8.0" - } - }, - "node_modules/send/node_modules/encodeurl": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", - "integrity": "sha512-TPJXq8JqFaVYm2CWmPvnP2Iyo4ZSM7/QKcSmuMLDObfpH5fi7RUGmd/rTDf+rut/saiDiQEeVTNgAmJEdAOx0w==", - "license": "MIT", - "engines": { - "node": ">= 0.8" + "node": ">= 18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, - "node_modules/send/node_modules/ms": { - "version": "2.1.3", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", - "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", - "license": "MIT" - }, "node_modules/serve-static": { - "version": "1.16.2", - "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.16.2.tgz", - "integrity": "sha512-VqpjJZKadQB/PEbEwvFdO43Ax5dFBZ2UECszz8bQ7pi7wt//PWe1P6MN7eCnjsatYtBT6EuiClbjSWP2WrIoTw==", + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-2.2.1.tgz", + "integrity": "sha512-xRXBn0pPqQTVQiC8wyQrKs2MOlX24zQ0POGaj0kultvoOCstBQM5yvOhAVSUwOMjQtTvsPWoNCHfPGwaaQJhTw==", "license": "MIT", "dependencies": { - "encodeurl": "~2.0.0", - "escape-html": "~1.0.3", - "parseurl": "~1.3.3", - "send": "0.19.0" + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "parseurl": "^1.3.3", + "send": "^1.2.0" }, "engines": { - "node": ">= 0.8.0" + "node": ">= 18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/setprototypeof": { @@ -925,14 +936,23 @@ } }, "node_modules/statuses": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.1.tgz", - "integrity": 
"sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.2.tgz", + "integrity": "sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==", "license": "MIT", "engines": { "node": ">= 0.8" } }, + "node_modules/tdigest": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/tdigest/-/tdigest-0.1.2.tgz", + "integrity": "sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==", + "license": "MIT", + "dependencies": { + "bintrees": "1.0.2" + } + }, "node_modules/toidentifier": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz", @@ -943,13 +963,14 @@ } }, "node_modules/type-is": { - "version": "1.6.18", - "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz", - "integrity": "sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==", + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.0.1.tgz", + "integrity": "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==", "license": "MIT", "dependencies": { - "media-typer": "0.3.0", - "mime-types": "~2.1.24" + "content-type": "^1.0.5", + "media-typer": "^1.1.0", + "mime-types": "^3.0.0" }, "engines": { "node": ">= 0.6" @@ -964,15 +985,6 @@ "node": ">= 0.8" } }, - "node_modules/utils-merge": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz", - "integrity": "sha512-pMZTvIkT1d+TFGvDOqodOclx0QWkkgi6Tdoa8gC8ffGAAqz9pzPTZWAybbsHHoED/ztMtkv/VoYTYyShUn81hA==", - "license": "MIT", - "engines": { - "node": ">= 0.4.0" - } - }, "node_modules/vary": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", @@ -982,6 +994,12 @@ "node": ">= 0.8" } }, + 
"node_modules/wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", + "license": "ISC" + }, "node_modules/xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", diff --git a/network-test-app/package.json b/network-test-app/package.json index d56cac095..d61adb683 100644 --- a/network-test-app/package.json +++ b/network-test-app/package.json @@ -2,7 +2,7 @@ "name": "network-test-app", "version": "1.0.0", "description": "App to test connectivity to IBM Cloud Services", - "main": "app.js", + "main": "app.mjs", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" }, @@ -18,7 +18,8 @@ "author": "", "license": "ISC", "dependencies": { - "express": "^4.22.1", - "pg": "^8.13.0" + "express": "^5.2.1", + "pg": "^8.20.0", + "prom-client": "^15.1.3" } } diff --git a/network-test-app/utils/db.mjs b/network-test-app/utils/db.mjs new file mode 100644 index 000000000..f33008349 --- /dev/null +++ b/network-test-app/utils/db.mjs @@ -0,0 +1,56 @@ +import { Client } from "pg"; + +const pgServiceCredentials = process.env.DATABASES_FOR_POSTGRESQL_CONNECTION; +const pgTimeoutMs = 15000; // timeout in 15 seconds +let _dbClient = null; + +export async function getDbClient() { + if (!pgServiceCredentials) { + return undefined; + } + + if (_dbClient) { + return _dbClient; + } + + // Use env variables loaded from service binding to connect to our postgres instance + console.log("Connecting to PostgreSQL instance..."); + postgresSetup = JSON.parse(pgServiceCredentials); + cli = postgresSetup.cli; + postgres = postgresSetup.postgres; + cert = Buffer.from(postgres.certificate.certificate_base64, "base64").toString("utf8"); + + // Define the client + const client = new Client({ + user: postgres.authentication.username, + password: cli.environment.PGPASSWORD, + host: 
postgres.hosts[0].hostname, + database: postgres.database, + port: postgres.hosts[0].port, + statement_timeout: pgTimeoutMs, + query_timeout: pgTimeoutMs, + lock_timeout: pgTimeoutMs, + application_name: "network-test-app", + connectionTimeoutMillis: pgTimeoutMs, + ssl: { + ca: cert, + rejectUnauthorized: true, + }, + }); + + // Initiate the connection + _dbClient = await client.connect(); + + return _dbClient; +} + +export async function closeDbClient() { + try { + if (_dbClient) { + await _dbClient.end(); + console.log("DB connection closed."); + } + } catch (e) { + console.error("Failed to close DB connection."); + } +} diff --git a/private-path-to-vpc-vsi/ce-app/Dockerfile b/private-path-to-vpc-vsi/ce-app/Dockerfile index 7ae1e0829..93565d9e5 100644 --- a/private-path-to-vpc-vsi/ce-app/Dockerfile +++ b/private-path-to-vpc-vsi/ce-app/Dockerfile @@ -1,10 +1,10 @@ -FROM quay.io/projectquay/golang:1.23 AS build-env +FROM quay.io/projectquay/golang:1.25 AS build-env WORKDIR /go/src/app COPY . . RUN CGO_ENABLED=0 go build -o /go/bin/app . # Copy the exe into a smaller base image -FROM gcr.io/distroless/static-debian12 +FROM gcr.io/distroless/static-debian13 COPY --from=build-env /go/bin/app / ENTRYPOINT ["/app"] diff --git a/private-path-to-vpc-vsi/ce-app/go.mod b/private-path-to-vpc-vsi/ce-app/go.mod index bff2499af..f65b725d1 100644 --- a/private-path-to-vpc-vsi/ce-app/go.mod +++ b/private-path-to-vpc-vsi/ce-app/go.mod @@ -1,5 +1,5 @@ module github.com/IBM/CodeEngine/ce-private-path -go 1.23.0 +go 1.25 require github.com/lib/pq v1.10.9 diff --git a/satellite-connector-to-vpc-vsi/ce-app/Dockerfile b/satellite-connector-to-vpc-vsi/ce-app/Dockerfile index 7ae1e0829..93565d9e5 100644 --- a/satellite-connector-to-vpc-vsi/ce-app/Dockerfile +++ b/satellite-connector-to-vpc-vsi/ce-app/Dockerfile @@ -1,10 +1,10 @@ -FROM quay.io/projectquay/golang:1.23 AS build-env +FROM quay.io/projectquay/golang:1.25 AS build-env WORKDIR /go/src/app COPY . . 
RUN CGO_ENABLED=0 go build -o /go/bin/app . # Copy the exe into a smaller base image -FROM gcr.io/distroless/static-debian12 +FROM gcr.io/distroless/static-debian13 COPY --from=build-env /go/bin/app / ENTRYPOINT ["/app"] diff --git a/satellite-connector-to-vpc-vsi/ce-app/go.mod b/satellite-connector-to-vpc-vsi/ce-app/go.mod index 06d27fb97..4511f6fe9 100644 --- a/satellite-connector-to-vpc-vsi/ce-app/go.mod +++ b/satellite-connector-to-vpc-vsi/ce-app/go.mod @@ -1,5 +1,5 @@ module github.com/IBM/CodeEngine/ce-satellite-connector -go 1.21.0 +go 1.25 require github.com/lib/pq v1.10.9