Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
03f1d4b
autoscaling wip
breardon2011 Mar 26, 2026
dd18a69
Merge branch 'main' into autoscaling-etc
breardon2011 Mar 30, 2026
95c1121
add disk pressure, add goldensnap versioning system
breardon2011 Mar 31, 2026
9410e8b
workng wip
breardon2011 Apr 3, 2026
4bdfad7
more tests
breardon2011 Apr 4, 2026
e8f0a0c
ha infra, and fixes
breardon2011 Apr 4, 2026
fdaa7d7
update deployment
breardon2011 Apr 4, 2026
cee12ac
update for test repo
breardon2011 Apr 4, 2026
6f922c8
more tests
breardon2011 Apr 6, 2026
fbe162a
fix: SSH key handling in deploy workflow
breardon2011 Apr 6, 2026
74664b7
fix for key
breardon2011 Apr 6, 2026
ace7ae6
fix: mkdir ssh dir before writing key
breardon2011 Apr 6, 2026
6c9421f
use b64 key
breardon2011 Apr 6, 2026
5ea88ee
change deployment
breardon2011 Apr 6, 2026
d5ab564
trigger: test Packer build pipeline
breardon2011 Apr 6, 2026
907282c
fix: Packer build — package rootfs context, fail on missing image
breardon2011 Apr 7, 2026
56463e6
re login because step takes too long
breardon2011 Apr 7, 2026
bc72e78
fix: smoke test retry + Packer Azure re-login
breardon2011 Apr 7, 2026
8deb8ca
use gallery
breardon2011 Apr 7, 2026
e0653db
draining bug for control plane
breardon2011 Apr 7, 2026
07fa499
fixes
breardon2011 Apr 7, 2026
1094a34
fix: CommittedMemoryMB propagation, drain/destroy, scale-down thrashi…
breardon2011 Apr 7, 2026
59e14be
backups until managed
breardon2011 Apr 7, 2026
1dc394d
test and demo adjustment
breardon2011 Apr 8, 2026
8321226
edit test
breardon2011 Apr 8, 2026
2248573
query actual size
breardon2011 Apr 8, 2026
56a283b
upto date stats
breardon2011 Apr 8, 2026
6d7f6ff
memory fixes
breardon2011 Apr 9, 2026
a2f2652
Merge main into autoscaling-etc
breardon2011 Apr 10, 2026
de1e86e
merge main
breardon2011 Apr 14, 2026
1a70bef
fix migration files
breardon2011 Apr 14, 2026
b2aad6d
fix migration list in store.go
breardon2011 Apr 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 149 additions & 0 deletions .github/workflows/build-worker-ami.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
name: Build Worker Image

# Rebuild the worker image on pushes that touch anything baked into it,
# or on demand from the Actions UI.
on:
  push:
    branches:
      - main
      - autoscaling-etc
    paths:
      # Go sources and module files behind the worker/agent binaries
      - "cmd/worker/**"
      - "cmd/agent/**"
      - "internal/**"
      - "proto/**"
      # Rootfs, cloud and Packer build inputs
      - "deploy/firecracker/rootfs/**"
      - "deploy/azure/**"
      - "deploy/ec2/build-rootfs-docker.sh"
      - "deploy/packer/**"
      - "scripts/claude-agent-wrapper/**"
      - "go.mod"
      - "go.sum"
      # This workflow itself
      - ".github/workflows/build-worker-ami.yml"
  workflow_dispatch:

# Azure coordinates exported to every step's shell environment.
env:
  AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
  AZURE_RESOURCE_GROUP: ${{ secrets.AZURE_RESOURCE_GROUP }}
  # The repository variable wins; 'eastus2' is used when it is unset or empty.
  AZURE_LOCATION: ${{ vars.AZURE_LOCATION || 'eastus2' }}

jobs:
# Single job: compile the Go binaries, bake an Azure image with Packer,
# then publish the resulting image ID.
build-image:
name: Build Worker Image (Azure)
runs-on: ubuntu-latest
# id-token: write allows the job to request a GitHub OIDC token; the
# azure/login steps below pass only client/tenant/subscription IDs and no
# client secret. contents: read is what actions/checkout needs.
permissions:
id-token: write
contents: read
steps:
# Check out the repository at the triggering commit.
- uses: actions/checkout@v4

# Install the Go toolchain used by the build step below.
- uses: actions/setup-go@v5
  with:
    go-version: "1.23"

- name: Build binaries (amd64)
  # Cross-compilation settings shared by both `go build` invocations:
  # cgo disabled, Linux target, amd64 architecture.
  env:
    CGO_ENABLED: "0"
    GOOS: linux
    GOARCH: amd64
  run: |
    # The short commit SHA is the version string stamped into both binaries.
    # Writing it to GITHUB_ENV makes $VERSION available to later steps.
    VERSION="$(git rev-parse --short HEAD)"
    echo "VERSION=$VERSION" >> "$GITHUB_ENV"

    # Worker binary: sets both main.WorkerVersion and main.AgentVersion.
    go build \
      -ldflags "-X main.WorkerVersion=$VERSION -X main.AgentVersion=$VERSION" \
      -o bin/opensandbox-worker ./cmd/worker/

    # Agent binary: sets main.Version.
    go build \
      -ldflags "-X main.Version=$VERSION" \
      -o bin/osb-agent ./cmd/agent/

- name: Package rootfs context
  run: |
    # Bundle the rootfs sources, the rootfs build script and the agent wrapper
    # into one gzip-compressed archive under /tmp.
    # NOTE(review): presumably consumed by the Packer template; confirm there.
    tar --create --gzip --file /tmp/packer-rootfs-ctx.tar.gz \
      deploy/firecracker/rootfs/ \
      deploy/ec2/build-rootfs-docker.sh \
      scripts/claude-agent-wrapper/

# Authenticate the Azure CLI for the Packer build. Only IDs are supplied,
# no client secret, so this relies on the job's id-token permission.
- name: Azure Login
  uses: azure/login@v2
  with:
    subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
    tenant-id: ${{ secrets.AZURE_TENANT_ID }}
    client-id: ${{ secrets.AZURE_CLIENT_ID }}

- name: Setup Packer
  # Pinned to a release tag instead of `@main`: a moving branch lets any
  # upstream push change what runs in a job that holds Azure credentials.
  # Bump deliberately (or pin to a full commit SHA for stricter guarantees).
  uses: hashicorp/setup-packer@v3.1.0

# Download the plugins declared by the template before building.
- name: Packer init
  run: packer init deploy/packer/worker-ami.pkr.hcl

- name: Build image
  run: |
    # Without an explicit `shell:`, GitHub runs this script with `bash -e`
    # and no pipefail, so `packer build | tee` would return tee's status and
    # a failed build would go unnoticed. Make the pipeline fail with packer.
    set -o pipefail

    # Use run number as patch version for gallery (must be integer)
    # NOTE(review): run_number does not change when a run is re-run, so a
    # re-run targets the same gallery version as the original attempt.
    PATCH=${{ github.run_number }}

    # Build the image; keep a copy of the output for the failure path below.
    packer build \
      -var "worker_version=$VERSION" \
      -var "agent_version=$VERSION" \
      -var "subscription_id=$AZURE_SUBSCRIPTION_ID" \
      -var "resource_group=$AZURE_RESOURCE_GROUP" \
      -var "location=$AZURE_LOCATION" \
      -var "image_version_patch=$PATCH" \
      deploy/packer/worker-ami.pkr.hcl | tee /tmp/packer-output.txt

    # Use the gallery image ID (NVMe-compatible for v6 VMs)
    GALLERY_IMAGE_ID="/subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/${AZURE_RESOURCE_GROUP}/providers/Microsoft.Compute/galleries/opensandbox_gallery/images/osb-worker/versions/1.0.${PATCH}"

    # Verify it exists
    if ! az sig image-version show \
      --resource-group "$AZURE_RESOURCE_GROUP" \
      --gallery-name opensandbox_gallery \
      --gallery-image-definition osb-worker \
      --gallery-image-version "1.0.${PATCH}" -o none 2>/dev/null; then
      echo "ERROR: Gallery image version 1.0.${PATCH} not found after build"
      cat /tmp/packer-output.txt
      exit 1
    fi

    # Hand the image ID to the following steps via the job environment.
    echo "IMAGE_ID=$GALLERY_IMAGE_ID" >> "$GITHUB_ENV"
    echo "Built gallery image: $GALLERY_IMAGE_ID (version=1.0.${PATCH})"

# Log in again after the image build, with the same inputs as the first login.
# NOTE(review): presumably needed because the first login's credentials
# expire during the long Packer step; confirm.
- name: Azure Re-Login
  uses: azure/login@v2
  with:
    subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
    tenant-id: ${{ secrets.AZURE_TENANT_ID }}
    client-id: ${{ secrets.AZURE_CLIENT_ID }}

- name: Store image ID in Key Vault
  env:
    KEY_VAULT_NAME: ${{ secrets.AZURE_KEY_VAULT_NAME }}
  run: |
    # Write one secret (name $1, value $2) into the configured vault.
    publish_secret() {
      az keyvault secret set \
        --vault-name "$KEY_VAULT_NAME" \
        --name "$1" \
        --value "$2"
    }

    # IMAGE_ID and VERSION were exported to GITHUB_ENV by earlier steps.
    publish_secret "worker-image-id" "$IMAGE_ID"
    publish_secret "worker-image-version" "$VERSION"

    echo "Published image ID to Key Vault"

- name: Cleanup old images
  run: |
    # Managed images tagged opensandbox-role=worker, sorted by name descending.
    # The first five in that order are kept; everything after is deleted.
    # NOTE(review): this targets managed images (`az image`), while the build
    # step publishes gallery image versions; confirm those are pruned elsewhere.
    IMAGES=$(az image list \
      --resource-group "$AZURE_RESOURCE_GROUP" \
      --query "[?tags.\"opensandbox-role\"=='worker'] | sort_by(@, &name) | reverse(@)" \
      --output json)

    STALE=$(jq -r '.[5:][] | .name' <<< "$IMAGES")

    if [ -n "$STALE" ]; then
      for name in $STALE; do
        echo "Deleting old image: $name"
        # Best effort: a failed delete must not fail the build.
        az image delete --resource-group "$AZURE_RESOURCE_GROUP" --name "$name" || true
      done
    else
      echo "No old images to clean up"
    fi

- name: Summary
  run: |
    # Append a short Markdown report to the run's step summary.
    cat >> "$GITHUB_STEP_SUMMARY" <<EOF
    ## Worker Image Build Complete
    - **Image:** $IMAGE_ID
    - **Version:** $VERSION
    - **Location:** $AZURE_LOCATION
    EOF
164 changes: 142 additions & 22 deletions .github/workflows/deploy-server.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: Deploy Control Plane

on:
push:
branches: [main]
branches: [main, autoscaling-etc]
paths:
- 'cmd/server/**'
- 'internal/**'
Expand All @@ -18,8 +18,11 @@ env:

jobs:
deploy:
name: Build & Deploy Server
name: Blue-Green Deploy
runs-on: ubuntu-latest
permissions:
id-token: write
contents: read
steps:
- uses: actions/checkout@v4

Expand All @@ -32,7 +35,12 @@ jobs:
node-version: '20'

- name: Build server binary
run: CGO_ENABLED=0 GOOS=linux GOARCH=${{ env.GOARCH }} go build -o bin/opensandbox-server ./cmd/server/
run: |
VERSION=$(git rev-parse --short HEAD)
echo "VERSION=$VERSION" >> $GITHUB_ENV
CGO_ENABLED=0 GOOS=linux GOARCH=${{ env.GOARCH }} go build \
-ldflags "-X main.Version=$VERSION" \
-o bin/opensandbox-server ./cmd/server/

- name: Build web dashboard
run: cd web && npm ci && npm run build
Expand All @@ -43,28 +51,140 @@ jobs:
- name: Configure SSH
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/deploy.pem
echo "$SSH_KEY" | base64 -d > ~/.ssh/deploy.pem
chmod 600 ~/.ssh/deploy.pem
ssh-keyscan -H ${{ secrets.SERVER_IP }} >> ~/.ssh/known_hosts
echo -e "Host *\n StrictHostKeyChecking no\n UserKnownHostsFile /dev/null" >> ~/.ssh/config
env:
SSH_KEY: ${{ secrets.SSH_PRIVATE_KEY_B64 }}

- name: Azure Login
uses: azure/login@v2
with:
client-id: ${{ secrets.AZURE_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

- name: Upload artifacts
- name: Discover control planes
run: |
scp -i ~/.ssh/deploy.pem bin/opensandbox-server ${{ env.SSH_USER }}@${{ secrets.SERVER_IP }}:/tmp/opensandbox-server
scp -i ~/.ssh/deploy.pem bin/web-dist.tar.gz ${{ env.SSH_USER }}@${{ secrets.SERVER_IP }}:/tmp/web-dist.tar.gz
CPS=$(az vm list \
--resource-group ${{ secrets.AZURE_RESOURCE_GROUP }} \
--show-details \
--query "[?contains(name,'controlplane')].publicIps" \
-o tsv 2>/dev/null | tr '\n' ',' | sed 's/,$//')
echo "Discovered CPs: $CPS"
echo "CP_IPS=$CPS" >> $GITHUB_ENV

- name: Install and restart
if [ -z "$CPS" ]; then
echo "ERROR: No control plane VMs found"
exit 1
fi

- name: Deploy to all control planes
run: |
ssh -i ~/.ssh/deploy.pem ${{ env.SSH_USER }}@${{ secrets.SERVER_IP }} '
sudo mv /tmp/opensandbox-server /usr/local/bin/opensandbox-server
sudo chmod +x /usr/local/bin/opensandbox-server
sudo mkdir -p /opt/opensandbox/web
sudo tar xzf /tmp/web-dist.tar.gz -C /opt/opensandbox/web
rm /tmp/web-dist.tar.gz
sudo systemctl restart opensandbox-server
'

- name: Verify deployment
# CP_IPS discovered from Azure — comma-separated list
IFS=',' read -ra CPS <<< "$CP_IPS"

if [ ${#CPS[@]} -eq 0 ]; then
echo "ERROR: No control planes found"
exit 1
fi

deploy_one() {
local IP=$1
local LABEL=$2
echo ""
echo "=== Deploying to $LABEL ($IP) ==="

scp -i ~/.ssh/deploy.pem bin/opensandbox-server bin/web-dist.tar.gz \
${{ env.SSH_USER }}@${IP}:/tmp/

ssh -i ~/.ssh/deploy.pem ${{ env.SSH_USER }}@${IP} '
sudo systemctl stop opensandbox-server
sleep 1
sudo rm -f /usr/local/bin/opensandbox-server
sudo cp /tmp/opensandbox-server /usr/local/bin/opensandbox-server
sudo chmod +x /usr/local/bin/opensandbox-server
sudo mkdir -p /opt/opensandbox/web
sudo tar xzf /tmp/web-dist.tar.gz -C /opt/opensandbox/web
rm -f /tmp/opensandbox-server /tmp/web-dist.tar.gz
sudo systemctl start opensandbox-server
'

echo "Waiting for $LABEL to be ready..."
for i in $(seq 1 20); do
STATUS=$(curl -sf --max-time 3 http://${IP}:8080/readyz 2>/dev/null | jq -r .status 2>/dev/null)
if [ "$STATUS" = "ready" ]; then
echo "$LABEL ready"
return 0
fi
sleep 3
done
echo "WARNING: $LABEL not ready after 60s"
return 1
}

# Deploy followers first (all except the first IP)
for i in $(seq $((${#CPS[@]}-1)) -1 1); do
deploy_one "${CPS[$i]}" "CP$((i+1))"
done

# Deploy leader last (first IP)
deploy_one "${CPS[0]}" "CP1"

- name: Smoke test
run: |
# End-to-end check: create a sandbox through one control plane, run a
# command in it through another, then delete the sandbox.
# CP_IPS is the comma-separated IP list exported by the discovery step.
IFS=',' read -ra CPS <<< "$CP_IPS"
# The first IP is treated as the leader, matching the deploy step's order.
# NOTE(review): nothing here verifies that it actually holds leadership.
LEADER="${CPS[0]}"
# Pick a follower for cross-CP test (or use leader if only 1 CP)
if [ ${#CPS[@]} -gt 1 ]; then
FOLLOWER="${CPS[1]}"
else
FOLLOWER="$LEADER"
fi

# Create sandbox via leader
# curl -sf is silent and fails on HTTP errors, so on failure jq receives no
# input and SB is empty; a response without sandboxID yields "null".
SB=$(curl -sf --max-time 30 \
-H "Content-Type: application/json" \
-H "X-API-Key: ${{ secrets.API_KEY }}" \
-X POST "http://${LEADER}:8080/api/sandboxes" \
-d '{"timeout":60}' | jq -r .sandboxID)

if [ -z "$SB" ] || [ "$SB" = "null" ]; then
echo "Smoke test FAILED: could not create sandbox"
exit 1
fi
echo "Created sandbox: $SB"

# Exec via follower with retry
# Up to 3 attempts with a 5s pause before each; the trailing `|| echo ""`
# turns a failed attempt into an empty RESULT rather than a non-zero status.
RESULT=""
for attempt in 1 2 3; do
sleep 5
RESULT=$(curl -sf --max-time 15 \
-H "Content-Type: application/json" \
-H "X-API-Key: ${{ secrets.API_KEY }}" \
-X POST "http://${FOLLOWER}:8080/api/sandboxes/$SB/exec/run" \
-d '{"cmd":"echo","args":["deploy-ok"],"timeout":10}' | jq -r .stdout 2>/dev/null || echo "")
echo "Attempt $attempt: '$RESULT'"
echo "$RESULT" | grep -q "deploy-ok" && break
done

# Best-effort cleanup, run whether or not the exec succeeded; a failed
# delete does not fail the step.
curl -sf --max-time 10 \
-H "X-API-Key: ${{ secrets.API_KEY }}" \
-X DELETE "http://${LEADER}:8080/api/sandboxes/$SB" || true

# Pass only if the last RESULT contains the echoed marker.
if echo "$RESULT" | grep -q "deploy-ok"; then
echo "Smoke test PASSED (create on $LEADER, exec on $FOLLOWER)"
else
echo "Smoke test FAILED: exec returned '$RESULT'"
exit 1
fi

- name: Summary
run: |
sleep 3
ssh -i ~/.ssh/deploy.pem ${{ env.SSH_USER }}@${{ secrets.SERVER_IP }} 'sudo systemctl is-active opensandbox-server'
echo "Server deployed successfully!"
IFS=',' read -ra CPS <<< "$CP_IPS"
echo "## Control Plane Deploy Complete" >> $GITHUB_STEP_SUMMARY
echo "- **Version:** $VERSION" >> $GITHUB_STEP_SUMMARY
for i in "${!CPS[@]}"; do
echo "- **CP$((i+1)):** ${CPS[$i]} ✅" >> $GITHUB_STEP_SUMMARY
done
echo "- **Smoke test:** Passed" >> $GITHUB_STEP_SUMMARY
Loading
Loading