Files
2026-04-28 10:05:39 +08:00

161 lines
6.7 KiB
Bash

#!/usr/bin/env bash
# Run the CubeSandbox one-click installer, then run quickcheck.sh.
# Called by cube-install.service (Type=oneshot) after docker.service and
# cube-xfs-mount.service are both active.
set -euo pipefail
log() { printf '[cube-install] %s\n' "$*"; }
err() { printf '[cube-install] ERROR: %s\n' "$*" >&2; }
INSTALL_PREFIX="/usr/local/services/cubetoolbox"
QUICKCHECK="${INSTALL_PREFIX}/scripts/one-click/quickcheck.sh"
UP_SCRIPT="${INSTALL_PREFIX}/scripts/one-click/up-with-deps.sh"
MIRROR="${CUBE_MIRROR:-cn}"
INSTALLER_URL_CN="https://cnb.cool/CubeSandbox/CubeSandbox/-/git/raw/master/deploy/one-click/online-install.sh"
INSTALLER_URL_GH="https://github.com/tencentcloud/CubeSandbox/raw/master/deploy/one-click/online-install.sh"
# /dev/kvm sanity — required by the MicroVM hypervisor.
if [ ! -c /dev/kvm ]; then
err "/dev/kvm is not available inside the container."
err "Ensure the compose stack passes --device /dev/kvm and nested virt is enabled on the host."
exit 1
fi
log "KVM device present: $(ls -l /dev/kvm)"
# Wait for dockerd (started by docker.service) to be ready before install.sh
# tries to pull MySQL / Redis / CubeProxy images.
log "Waiting for docker daemon ..."
for i in $(seq 1 60); do
if docker info >/dev/null 2>&1; then
log "docker ready."
break
fi
sleep 2
done
if ! docker info >/dev/null 2>&1; then
err "docker daemon not ready after 120 s"
exit 1
fi
# Redirect TMPDIR to the 50 GB XFS volume.
# /tmp is only 256 MB (tmpfs) and mounted noexec — both cause install failures:
# - curl: (23) Failure writing output to destination (out of space)
# - extracted scripts fail to execute (noexec mount flag)
mkdir -p /data/tmp
export TMPDIR=/data/tmp
log "TMPDIR set to $TMPDIR ($(df -h /data/tmp | awk 'NR==2{print $4}') free)"
# Set CAROOT so mkcert can find / create the local CA directory on every boot.
# Without this, up-cube-proxy.sh calls `mkcert -install` which exits with:
# "ERROR: failed to find the default CA location"
# Because up-with-deps.sh runs under set -euo pipefail, that failure aborts
# the entire script before any compute services (network-agent, CubeAPI, etc.)
# are started. Persisting the CA on /data (named volume) means the cert is
# re-used across container restarts rather than regenerated each time.
export CAROOT=/data/mkcert-ca
mkdir -p "$CAROOT"
log "CAROOT set to $CAROOT"
# Run the upstream one-click installer on first boot; on subsequent boots
# just re-launch all services via up-with-deps.sh.
if [ -x "$QUICKCHECK" ] && [ "${CUBE_FORCE_REINSTALL:-0}" != "1" ]; then
log "CubeSandbox already installed at $INSTALL_PREFIX — starting services."
if [ ! -x "$UP_SCRIPT" ]; then
err "up-with-deps.sh not found at $UP_SCRIPT — reinstall required"
exit 1
fi
ONE_CLICK_TOOLBOX_ROOT="$INSTALL_PREFIX" \
ONE_CLICK_RUNTIME_ENV_FILE="${INSTALL_PREFIX}/.one-click.env" \
bash "$UP_SCRIPT" \
|| log "WARNING: up-with-deps.sh exited non-zero; services may still be starting"
else
log "Running CubeSandbox one-click installer (mirror=$MIRROR) ..."
if [ "$MIRROR" = "cn" ]; then
curl -fsSL "$INSTALLER_URL_CN" | MIRROR=cn bash
else
curl -fsSL "$INSTALLER_URL_GH" | bash
fi
fi
# Run quickcheck.sh with retries — network-agent initialises 500 tap interfaces
# which takes ~2 minutes; we retry every 30 s for up to 10 minutes.
QUICKCHECK_PASSED=0
if [ -x "$QUICKCHECK" ]; then
log "Running quickcheck.sh (retrying up to 10 min for network-agent tap init) ..."
for i in $(seq 1 20); do
if ONE_CLICK_TOOLBOX_ROOT="$INSTALL_PREFIX" \
ONE_CLICK_RUNTIME_ENV_FILE="${INSTALL_PREFIX}/.one-click.env" \
"$QUICKCHECK" 2>&1; then
QUICKCHECK_PASSED=1
break
fi
log "quickcheck attempt $i/20 failed — retrying in 30 s ..."
sleep 30
done
else
err "quickcheck.sh not found at $QUICKCHECK — install may have failed."
exit 1
fi
if [ "$QUICKCHECK_PASSED" != "1" ]; then
err "quickcheck.sh never passed after 20 attempts — CubeSandbox is unhealthy."
exit 1
fi
# Ensure containerd-shim-cube-rs is on Cubelet's clean PATH.
# up.sh/up-with-deps.sh launch Cubelet with:
# PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# Cubelet resolves runtime shims from that PATH, so it cannot find
# containerd-shim-cube-rs unless it is symlinked into one of those dirs.
# We create the symlink unconditionally on every boot (both after fresh
# install and after the restart path) so Cubelet can start sandboxes.
SHIM_SRC="${INSTALL_PREFIX}/cube-shim/bin/containerd-shim-cube-rs"
SHIM_DST="/usr/local/bin/containerd-shim-cube-rs"
if [ -x "$SHIM_SRC" ]; then
ln -sf "$SHIM_SRC" "$SHIM_DST"
log "containerd-shim-cube-rs linked: $SHIM_DST -> $SHIM_SRC"
else
log "WARNING: $SHIM_SRC not found — Cubelet will not be able to start MicroVMs"
fi
# Restart Cubelet now that network-agent is confirmed ready.
# On first startup the Cubelet process begins before network-agent has finished
# initialising its 500 TAP interfaces (~2 min). This causes the
# io.cubelet.images-service.v1 plugin to fail with:
# "network-agent health check failed ... context deadline exceeded"
# leaving the gRPC cubelet.services.images.v1.Images service unregistered.
# When CubeMaster later tries to distribute a template artifact to the node it
# gets back gRPC Unimplemented and the build fails.
# Restarting Cubelet here — after quickcheck has confirmed network-agent is up —
# allows the images-service plugin to load successfully on the second boot.
CUBELET_BIN="${INSTALL_PREFIX}/Cubelet/bin/cubelet"
CUBELET_CFG="${INSTALL_PREFIX}/Cubelet/config/config.toml"
CUBELET_DYN="${INSTALL_PREFIX}/Cubelet/dynamicconf/conf.yaml"
CUBELET_LOG="/data/log/Cubelet/Cubelet-req.log"
if [ -x "$CUBELET_BIN" ]; then
log "Restarting Cubelet so images-service plugin loads against ready network-agent ..."
pkill -f "${CUBELET_BIN}" 2>/dev/null || true
sleep 2
mkdir -p "$(dirname "$CUBELET_LOG")"
nohup "$CUBELET_BIN" \
--config "$CUBELET_CFG" \
--dynamic-conf-path "$CUBELET_DYN" \
>>"$CUBELET_LOG" 2>&1 &
CUBELET_PID=$!
log "Cubelet restarted (PID ${CUBELET_PID}) — waiting 10 s for boot ..."
sleep 10
if kill -0 "$CUBELET_PID" 2>/dev/null; then
log "Cubelet is running."
else
log "WARNING: Cubelet PID ${CUBELET_PID} exited — check ${CUBELET_LOG}."
fi
fi
log "==================== CubeSandbox is up ===================="
log " CubeAPI: http://127.0.0.1:3000/health"
log " CubeMaster: http://127.0.0.1:8089/notify/health"
log " network-agent http://127.0.0.1:19090/healthz"
log " Logs: /data/log/{CubeAPI,CubeMaster,Cubelet}/"
log "==========================================================="