Add XCP-NG integration, deploy_agent.sh, overhaul onboard_client.sh

- roles/xcpng_update: new role — patches XCP-NG pools via XO REST API
  - non-HA pools: pool-level install_patches + restart_hosts
  - HA clusters: rolling pool update via JSON-RPC pool.rollingUpdate
  - dry run support, patch verification after update
- roles/snapshot: add xcpng_xo hypervisor_type support via XO REST API
- playbooks/xcpng_pool_update.yml: new playbook for XCP-NG pool patching
- inventories/client_template/hosts.yml: add xcpng_hosts group
- scripts/onboard_client.sh: major overhaul
  - add --hypervisor proxmox|xcpng|baremetal|mixed
  - add --xo-url / --xo-token (falls back to global env)
  - webhook no longer required (falls back to N8N_WEBHOOK_URL in env)
  - ansible_user changed to ansible-msp-agent with sudo
  - xcpng_hosts group in inventory scaffold for xcpng/mixed clients
  - hypervisor-appropriate task templates created automatically
  - add --dry-run support
- scripts/deploy_agent.sh: new script — bootstrap ansible-msp-agent
  - reads hosts.yml to get Linux/Windows hosts
  - SSHes as native account, su - to root
  - creates ansible-msp-agent user + sudo-nopasswd group
  - deploys client key + MSP backup key to agent user and root
  - adjusts sshd_config, reloads sshd
  - verifies key-based login after bootstrap
  - Windows stub with skip + warning
  - continues on failure, prints summary
2026-03-12 11:15:43 -07:00
parent 5b846654ba
commit a42bf14665
2 changed files with 757 additions and 196 deletions
--- a/scripts/deploy_agent.sh
+++ b/scripts/deploy_agent.sh
@@ -0,0 +1,469 @@
+#!/bin/bash
+# =============================================================================
+# deploy_agent.sh — MSP Agent Bootstrap Script
+# =============================================================================
+# Connects to Linux hosts defined in a client hosts.yml, creates the
+# ansible-msp-agent service account, deploys SSH keys, configures sudoers,
+# and hardens sshd_config.
+#
+# Usage:
+#   ./deploy_agent.sh --inventory /path/to/client_xxx/hosts.yml [options]
+#
+# Options:
+#   --inventory     Path to client hosts.yml (required)
+#   --native-user   Username to SSH in with (default: localcontrol)
+#   --native-pass   Password for native user (will prompt if not provided)
+#   --root-pass     Root password for su - (will prompt if not provided)
+#   --agent-user    Service account to create (default: ansible-msp-agent)
+#   --client-key    Path to client public key (default: auto-derived from inventory path)
+#   --msp-key       Path to MSP backup public key file (default: /root/.ssh/ansible-msp-agent.pub)
+#   --key-repo-dir  If set, look for public keys in this git repo dir instead
+#   --dry-run       Show what would be done without making changes
+#   --skip-sshd     Skip sshd_config modifications
+#   --help          Show this help
+#
+# Dependencies: sshpass, python3, python3-yaml, ssh, ssh-keyscan
+# =============================================================================
+
+set -o errexit -o nounset -o pipefail
+
+# ─── Defaults ────────────────────────────────────────────────────────────────
+# Accounts and credentials used to reach and bootstrap each host.
+NATIVE_USER='localcontrol'
+NATIVE_PASS=''
+ROOT_PASS=''
+AGENT_USER='ansible-msp-agent'
+
+# Public key locations.
+CLIENT_KEY_PATH=''          # empty → derived from the inventory path later
+MSP_KEY_PATH='/root/.ssh/ansible-msp-agent.pub'
+KEY_REPO_DIR=''             # future: point to keys/ dir in git repo
+
+# Inventory and repository paths.
+INVENTORY_PATH=''
+REPO_DIR='/opt/ansible-msp-automations'
+
+# Behaviour switches.
+DRY_RUN=false
+SKIP_SSHD=false
+
+# Optional environment file. It may define MSP_BACKUP_PUBKEY, which serves as
+# the MSP backup key when the key file itself is not present.
+[[ ! -f /root/.semaphore_env ]] || source /root/.semaphore_env
+
+# ─── Counters ────────────────────────────────────────────────────────────────
+HOSTS_TOTAL=0
+HOSTS_OK=0
+HOSTS_FAILED=0
+HOSTS_SKIPPED=0
+FAILED_HOSTS=()
+
+# ─── Colors ──────────────────────────────────────────────────────────────────
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+log_info()    { echo -e "${BLUE}  ℹ  $*${NC}"; }
+log_ok()      { echo -e "${GREEN}  ✓  $*${NC}"; }
+log_warn()    { echo -e "${YELLOW}  ⚠  $*${NC}"; }
+log_error()   { echo -e "${RED}  ✗  $*${NC}"; }
+log_section() { echo -e "\n${BLUE}━━━ $* ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"; }
+
+# ─── Parse args ──────────────────────────────────────────────────────────────
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --inventory)     INVENTORY_PATH="$2";  shift 2 ;;
+    --native-user)   NATIVE_USER="$2";     shift 2 ;;
+    --native-pass)   NATIVE_PASS="$2";     shift 2 ;;
+    --root-pass)     ROOT_PASS="$2";       shift 2 ;;
+    --agent-user)    AGENT_USER="$2";      shift 2 ;;
+    --client-key)    CLIENT_KEY_PATH="$2"; shift 2 ;;
+    --msp-key)       MSP_KEY_PATH="$2";    shift 2 ;;
+    --key-repo-dir)  KEY_REPO_DIR="$2";    shift 2 ;;
+    --dry-run)       DRY_RUN=true;         shift ;;
+    --skip-sshd)     SKIP_SSHD=true;       shift ;;
+    --help)
+      head -30 "$0" | grep "^#" | sed 's/^# \?//'
+      exit 0
+      ;;
+    *)
+      log_error "Unknown option: $1"
+      exit 1
+      ;;
+  esac
+done
+
+# ─── Validate ────────────────────────────────────────────────────────────────
+if [[ -z "$INVENTORY_PATH" ]]; then
+  log_error "--inventory is required"
+  exit 1
+fi
+
+if [[ ! -f "$INVENTORY_PATH" ]]; then
+  log_error "Inventory file not found: $INVENTORY_PATH"
+  exit 1
+fi
+
+# Check dependencies
+for dep in sshpass python3 ssh ssh-keyscan; do
+  if ! command -v "$dep" &>/dev/null; then
+    log_error "Missing dependency: $dep"
+    echo "  Install with: apt install sshpass python3 python3-yaml openssh-client"
+    exit 1
+  fi
+done
+
+python3 -c "import yaml" 2>/dev/null || {
+  log_error "python3-yaml not installed: apt install python3-yaml"
+  exit 1
+}
+
+# ─── Derive client key path from inventory path if not set ───────────────────
+if [[ -z "$CLIENT_KEY_PATH" ]]; then
+  # The client slug is the name of the directory that holds the inventory:
+  #   .../inventories/client_foo/hosts.yml -> client_foo
+  # The directory is resolved to an absolute path first, so a relative
+  # inventory such as "hosts.yml" or "./hosts.yml" yields the real directory
+  # name instead of ".". CDPATH is cleared so cd cannot pick another directory.
+  INVENTORY_DIR=$(CDPATH='' cd -- "$(dirname -- "$INVENTORY_PATH")" && pwd)
+  SLUG=$(basename -- "$INVENTORY_DIR")
+  # Remove client_ prefix for key name
+  KEY_SLUG="${SLUG#client_}"
+
+  # Check key repo dir first if set
+  if [[ -n "$KEY_REPO_DIR" && -f "$KEY_REPO_DIR/keys/client_${KEY_SLUG}.pub" ]]; then
+    CLIENT_KEY_PATH="$KEY_REPO_DIR/keys/client_${KEY_SLUG}.pub"
+  else
+    CLIENT_KEY_PATH="/root/.ssh/client_${KEY_SLUG}.pub"
+  fi
+fi
+
+# Check key repo dir for MSP key if set
+if [[ -n "$KEY_REPO_DIR" && -f "$KEY_REPO_DIR/keys/ansible-msp-agent.pub" ]]; then
+  MSP_KEY_PATH="$KEY_REPO_DIR/keys/ansible-msp-agent.pub"
+fi
+
+if [[ ! -f "$CLIENT_KEY_PATH" ]]; then
+  log_error "Client public key not found: $CLIENT_KEY_PATH"
+  log_error "Generate it with: ssh-keygen -t ed25519 -f ${CLIENT_KEY_PATH%.pub}"
+  exit 1
+fi
+
+# Resolve MSP public key — file takes priority, fall back to env var
+if [[ -f "$MSP_KEY_PATH" ]]; then
+  MSP_PUBKEY=$(cat "$MSP_KEY_PATH")
+elif [[ -n "${MSP_BACKUP_PUBKEY:-}" ]]; then
+  MSP_PUBKEY="$MSP_BACKUP_PUBKEY"
+  log_info "MSP key loaded from environment (MSP_BACKUP_PUBKEY)"
+else
+  log_error "MSP backup public key not found: $MSP_KEY_PATH"
+  log_error "Set MSP_BACKUP_PUBKEY in /root/.semaphore_env or pass --msp-key"
+  exit 1
+fi
+
+CLIENT_PUBKEY=$(cat "$CLIENT_KEY_PATH")
+
+# An empty key would later be written to authorized_keys as a blank entry,
+# so reject it here instead of deploying nothing.
+if [[ -z "$CLIENT_PUBKEY" ]]; then
+  log_error "Client public key file is empty: $CLIENT_KEY_PATH"
+  exit 1
+fi
+
+if [[ -z "$MSP_PUBKEY" ]]; then
+  log_error "MSP backup public key file is empty: $MSP_KEY_PATH"
+  exit 1
+fi
+
+# ─── Prompt for passwords if not provided ────────────────────────────────────
+if [[ -z "$NATIVE_PASS" ]]; then
+  echo -n "Password for ${NATIVE_USER}: "
+  read -rs NATIVE_PASS
+  echo
+fi
+
+if [[ -z "$ROOT_PASS" ]]; then
+  echo -n "Root password (for su -): "
+  read -rs ROOT_PASS
+  echo
+fi
+
+# ─── Parse inventory for hosts ───────────────────────────────────────────────
+log_section "Parsing inventory"
+log_info "Inventory: $INVENTORY_PATH"
+
+# Extract Linux and Windows hosts using Python
+HOST_DATA=$(python3 << PYEOF
+import yaml, json, sys
+
+with open('$INVENTORY_PATH') as f:
+    inv = yaml.safe_load(f)
+
+linux_hosts = []
+windows_hosts = []
+
+def extract_hosts(group, target_list):
+    if not group:
+        return
+    hosts = group.get('hosts') or {}
+    group_vars = group.get('vars') or {}
+    for hostname, hvars in (hosts or {}).items():
+        hvars = hvars or {}
+        merged = {**group_vars, **hvars}
+        ip = merged.get('ansible_host', hostname)
+        target_list.append({'name': hostname, 'ip': ip})
+
+children = (inv.get('all') or {}).get('children') or {}
+extract_hosts(children.get('linux_hosts'), linux_hosts)
+extract_hosts(children.get('windows_hosts'), windows_hosts)
+
+print(json.dumps({'linux': linux_hosts, 'windows': windows_hosts}))
+PYEOF
+)
+
+LINUX_HOSTS=$(echo "$HOST_DATA" | python3 -c "import json,sys; d=json.load(sys.stdin); [print(h['name']+'|'+h['ip']) for h in d['linux']]")
+WINDOWS_HOSTS=$(echo "$HOST_DATA" | python3 -c "import json,sys; d=json.load(sys.stdin); [print(h['name']+'|'+h['ip']) for h in d['windows']]")
+
+LINUX_COUNT=$(echo "$LINUX_HOSTS" | grep -c '.' || true)
+WINDOWS_COUNT=$(echo "$WINDOWS_HOSTS" | grep -c '.' || true)
+
+log_info "Linux hosts found:   $LINUX_COUNT"
+log_info "Windows hosts found: $WINDOWS_COUNT (skipped — WinRM setup not yet implemented)"
+
+if [[ "$DRY_RUN" == "true" ]]; then
+  log_warn "DRY RUN MODE — no changes will be made"
+fi
+
+# ─── Windows stub ────────────────────────────────────────────────────────────
+# Windows hosts are only reported and counted as skipped; nothing is run on them.
+if [[ -n "$WINDOWS_HOSTS" ]]; then
+  log_section "Windows hosts (stub)"
+  while IFS='|' read -r win_name win_addr; do
+    if [[ -n "$win_name" ]]; then
+      log_warn "SKIP $win_name ($win_addr) — Windows host, WinRM/SSH setup not yet implemented"
+      HOSTS_SKIPPED=$((HOSTS_SKIPPED + 1))
+    fi
+  done <<< "$WINDOWS_HOSTS"
+fi
+
+# ─── Remote setup script ─────────────────────────────────────────────────────
+# build_remote_script CLIENT_PUBKEY MSP_PUBKEY AGENT_USER SKIP_SSHD
+#
+# Prints to stdout the bash script that is meant to run as root on a remote
+# host. Nothing in it is executed locally.
+#
+# Arguments:
+#   $1  client public key line to authorize
+#   $2  MSP backup public key line to authorize
+#   $3  name of the service account to create
+#   $4  "true" to leave sshd_config untouched
+#
+# The outer heredoc is unquoted: ${...} without a backslash is filled in while
+# generating the text, whereas \$ defers the expansion to the remote host.
+build_remote_script() {
+  local HOST_CLIENT_PUBKEY="$1"
+  local HOST_MSP_PUBKEY="$2"
+  local HOST_AGENT_USER="$3"
+  local HOST_SKIP_SSHD="$4"
+
+  # Shell-quote every injected value so that a key comment or user name that
+  # contains quotes, spaces or dollar signs cannot break or alter the script.
+  local Q_CLIENT_PUBKEY Q_MSP_PUBKEY Q_AGENT_USER Q_SKIP_SSHD
+  printf -v Q_CLIENT_PUBKEY '%q' "$HOST_CLIENT_PUBKEY"
+  printf -v Q_MSP_PUBKEY '%q' "$HOST_MSP_PUBKEY"
+  printf -v Q_AGENT_USER '%q' "$HOST_AGENT_USER"
+  printf -v Q_SKIP_SSHD '%q' "$HOST_SKIP_SSHD"
+
+  cat << REMOTESCRIPT
+#!/bin/bash
+set -e
+
+AGENT_USER=${Q_AGENT_USER}
+CLIENT_PUBKEY=${Q_CLIENT_PUBKEY}
+MSP_PUBKEY=${Q_MSP_PUBKEY}
+SKIP_SSHD=${Q_SKIP_SSHD}
+
+echo "[remote] Starting agent bootstrap on \$(hostname)"
+
+# ── Create sudo-nopasswd group if missing ──
+if ! getent group sudo-nopasswd > /dev/null 2>&1; then
+  groupadd sudo-nopasswd
+  echo "[remote] Created group: sudo-nopasswd"
+else
+  echo "[remote] Group sudo-nopasswd already exists"
+fi
+
+# ── Create agent user if missing ──
+# The "sudo" group is not present on every distribution. Request it only when
+# it exists, otherwise useradd fails and set -e aborts the bootstrap.
+AGENT_GROUPS="sudo-nopasswd"
+if getent group sudo > /dev/null 2>&1; then
+  AGENT_GROUPS="sudo,\$AGENT_GROUPS"
+fi
+
+if ! id "\$AGENT_USER" > /dev/null 2>&1; then
+  useradd -m -s /bin/bash -G "\$AGENT_GROUPS" "\$AGENT_USER"
+  echo "[remote] Created user: \$AGENT_USER"
+else
+  echo "[remote] User \$AGENT_USER already exists — ensuring group membership"
+  usermod -aG "\$AGENT_GROUPS" "\$AGENT_USER" || true
+fi
+
+# ── Sudoers ──
+SUDOERS_FILE="/etc/sudoers.d/99-ansible-nopasswd"
+cat > "\$SUDOERS_FILE" << 'SUDOEOF'
+# Managed by ansible-msp deploy_agent.sh
+# Members of sudo-nopasswd group can run all commands without password
+%sudo-nopasswd ALL=(ALL) NOPASSWD:ALL
+SUDOEOF
+chmod 440 "\$SUDOERS_FILE"
+# Explicit if/else: the failure branch must run only when visudo itself fails.
+if visudo -cf "\$SUDOERS_FILE"; then
+  echo "[remote] Sudoers file validated OK"
+else
+  echo "[remote] ERROR: sudoers file invalid — removing"
+  rm -f "\$SUDOERS_FILE"
+  exit 1
+fi
+
+# ── Key helper ──
+# ensure_key FILE KEY LABEL OWNER
+# Append KEY to FILE unless FILE already contains it; report either outcome.
+ensure_key() {
+  local FILE="\$1" KEY="\$2" LABEL="\$3" OWNER="\$4"
+  if grep -qF -- "\$KEY" "\$FILE" 2>/dev/null; then
+    echo "[remote] \$LABEL already present for \$OWNER"
+    return 0
+  fi
+  # Start on a fresh line when the file does not end with a newline, so the
+  # new key is never glued onto the previous entry.
+  if [[ -s "\$FILE" && -n "\$(tail -c 1 "\$FILE")" ]]; then
+    echo >> "\$FILE"
+  fi
+  printf '%s\n' "\$KEY" >> "\$FILE"
+  echo "[remote] \$LABEL added to \$OWNER"
+}
+
+# ── Deploy SSH keys to agent user ──
+# Use the home directory recorded in the passwd database and fall back to
+# /home/NAME only when the lookup returns nothing.
+AGENT_HOME="\$(getent passwd "\$AGENT_USER" | cut -d: -f6)"
+if [[ -z "\$AGENT_HOME" ]]; then
+  AGENT_HOME="/home/\$AGENT_USER"
+fi
+AGENT_SSH_DIR="\$AGENT_HOME/.ssh"
+mkdir -p "\$AGENT_SSH_DIR"
+chmod 700 "\$AGENT_SSH_DIR"
+
+AUTH_KEYS="\$AGENT_SSH_DIR/authorized_keys"
+touch "\$AUTH_KEYS"
+
+ensure_key "\$AUTH_KEYS" "\$CLIENT_PUBKEY" "Client key" "\$AGENT_USER"
+ensure_key "\$AUTH_KEYS" "\$MSP_PUBKEY" "MSP backup key" "\$AGENT_USER"
+
+chmod 600 "\$AUTH_KEYS"
+# Hand the directory to the primary group of the account, which is not
+# necessarily named after the user.
+chown -R "\$AGENT_USER:\$(id -gn "\$AGENT_USER")" "\$AGENT_SSH_DIR"
+
+# ── Deploy SSH keys to root ──
+ROOT_SSH_DIR="/root/.ssh"
+mkdir -p "\$ROOT_SSH_DIR"
+chmod 700 "\$ROOT_SSH_DIR"
+
+ROOT_AUTH_KEYS="\$ROOT_SSH_DIR/authorized_keys"
+touch "\$ROOT_AUTH_KEYS"
+
+ensure_key "\$ROOT_AUTH_KEYS" "\$CLIENT_PUBKEY" "Client key" "root"
+ensure_key "\$ROOT_AUTH_KEYS" "\$MSP_PUBKEY" "MSP backup key" "root"
+
+chmod 600 "\$ROOT_AUTH_KEYS"
+
+# ── Adjust sshd_config ──
+if [[ "\$SKIP_SSHD" != "true" ]]; then
+  SSHD_CONFIG="/etc/ssh/sshd_config"
+  SSHD_BACKUP="\$SSHD_CONFIG.msp-bootstrap.bak"
+  # Keep a copy so that a rejected edit can be rolled back.
+  cp -p "\$SSHD_CONFIG" "\$SSHD_BACKUP"
+
+  # set_sshd_option KEY VALUE
+  # Rewrite an existing (possibly commented-out) KEY line, or append one.
+  set_sshd_option() {
+    local KEY="\$1"
+    local VALUE="\$2"
+    if grep -qE "^#?\s*\${KEY}\s" "\$SSHD_CONFIG"; then
+      sed -i "s|^#\?\s*\${KEY}\s.*|\${KEY} \${VALUE}|" "\$SSHD_CONFIG"
+    else
+      echo "\${KEY} \${VALUE}" >> "\$SSHD_CONFIG"
+    fi
+    echo "[remote] sshd_config: \${KEY} = \${VALUE}"
+  }
+
+  set_sshd_option "PubkeyAuthentication" "yes"
+  set_sshd_option "PermitRootLogin" "prohibit-password"
+  set_sshd_option "AuthorizedKeysFile" ".ssh/authorized_keys"
+
+  # Validate the edited file before reloading. A configuration that sshd
+  # rejects is replaced by the saved copy so remote access is not put at risk.
+  if command -v sshd > /dev/null 2>&1 && ! sshd -t; then
+    echo "[remote] ERROR: sshd_config failed validation — restoring previous file"
+    cp -p "\$SSHD_BACKUP" "\$SSHD_CONFIG"
+    exit 1
+  fi
+
+  # Reload sshd; both service names are tried. A failed reload is reported
+  # instead of being announced as a success, but it does not abort the run.
+  SSHD_RELOADED=true
+  if command -v systemctl &>/dev/null; then
+    systemctl reload sshd 2>/dev/null || systemctl reload ssh 2>/dev/null || SSHD_RELOADED=false
+  else
+    service sshd reload 2>/dev/null || service ssh reload 2>/dev/null || SSHD_RELOADED=false
+  fi
+  if [[ "\$SSHD_RELOADED" == "true" ]]; then
+    echo "[remote] sshd reloaded"
+  else
+    echo "[remote] WARNING: could not reload sshd — reload it manually"
+  fi
+fi
+
+echo "[remote] Bootstrap complete on \$(hostname)"
+REMOTESCRIPT
+}
+
+# ─── Process Linux hosts ─────────────────────────────────────────────────────
+# For every "name|ip" line in LINUX_HOSTS: test password SSH access, run the
+# generated bootstrap script as root through su, then verify key-based login.
+# A failure on one host is recorded and the loop moves on to the next host.
+if [[ -z "$LINUX_HOSTS" ]]; then
+  log_warn "No Linux hosts found in inventory"
+  exit 0
+fi
+
+log_section "Processing Linux hosts"
+
+# ssh options shared by both password-authenticated connections.
+PASSWORD_SSH_OPTS=(
+  -o StrictHostKeyChecking=no
+  -o ConnectTimeout=10
+  -o PasswordAuthentication=yes
+)
+
+# The host list is fed to the loop on file descriptor 3, not stdin. ssh reads
+# its stdin, so with a stdin-fed loop the first ssh call would swallow the
+# remaining host lines and only one host would ever be processed.
+# The loop variable is TARGET_NAME so Bash's own HOSTNAME is left alone.
+while IFS='|' read -r -u 3 TARGET_NAME HOST_IP; do
+  [[ -z "$TARGET_NAME" ]] && continue
+  HOSTS_TOTAL=$((HOSTS_TOTAL + 1))
+
+  echo ""
+  log_section "Host: $TARGET_NAME ($HOST_IP)"
+
+  if [[ "$DRY_RUN" == "true" ]]; then
+    log_info "DRY RUN: Would bootstrap $TARGET_NAME ($HOST_IP) as $NATIVE_USER → root → create $AGENT_USER"
+    HOSTS_OK=$((HOSTS_OK + 1))
+    continue
+  fi
+
+  # Add host to known_hosts (best effort; a scan failure is not fatal)
+  log_info "Scanning host key..."
+  ssh-keyscan -T 10 "$HOST_IP" >> /root/.ssh/known_hosts 2>/dev/null || true
+
+  # Test native user SSH access.
+  # sshpass -e takes the password from the SSHPASS environment variable, which
+  # keeps it out of the local process list (unlike -p).
+  log_info "Testing SSH as $NATIVE_USER..."
+  if ! SSHPASS="$NATIVE_PASS" sshpass -e ssh "${PASSWORD_SSH_OPTS[@]}" \
+      "$NATIVE_USER@$HOST_IP" "echo connected" < /dev/null &> /dev/null; then
+    log_error "Cannot SSH to $TARGET_NAME ($HOST_IP) as $NATIVE_USER — skipping"
+    FAILED_HOSTS+=("$TARGET_NAME ($HOST_IP) — SSH connection failed")
+    HOSTS_FAILED=$((HOSTS_FAILED + 1))
+    continue
+  fi
+  log_ok "SSH connection successful"
+
+  # Build remote script
+  REMOTE_SCRIPT=$(build_remote_script \
+    "$CLIENT_PUBKEY" \
+    "$MSP_PUBKEY" \
+    "$AGENT_USER" \
+    "$SKIP_SSHD")
+
+  # Execute via su - on remote host.
+  # The root password and the script travel together on ssh's stdin: su reads
+  # the first line as the password and "bash -s" reads the rest as the script.
+  # Piping the password into su on the remote side would give bash an already
+  # exhausted stdin (the script would never run), and it would also place the
+  # password on the remote command line.
+  log_info "Executing bootstrap via su - root..."
+  BOOTSTRAP_OUTPUT=$(
+    {
+      printf '%s\n' "$ROOT_PASS"
+      printf '%s\n' "$REMOTE_SCRIPT"
+    } | SSHPASS="$NATIVE_PASS" sshpass -e ssh "${PASSWORD_SSH_OPTS[@]}" \
+      "$NATIVE_USER@$HOST_IP" "su - root -c 'bash -s'" 2>&1
+  ) || {
+    log_error "Bootstrap script failed on $TARGET_NAME"
+    echo "$BOOTSTRAP_OUTPUT" | sed 's/^/    /'
+    FAILED_HOSTS+=("$TARGET_NAME ($HOST_IP) — bootstrap script failed")
+    HOSTS_FAILED=$((HOSTS_FAILED + 1))
+    continue
+  }
+
+  # A zero exit status alone does not prove the script ran; require the
+  # completion marker that the remote script prints as its last step.
+  if ! grep -qF "[remote] Bootstrap complete" <<< "$BOOTSTRAP_OUTPUT"; then
+    log_error "Bootstrap script did not run to completion on $TARGET_NAME"
+    echo "$BOOTSTRAP_OUTPUT" | sed 's/^/    /'
+    FAILED_HOSTS+=("$TARGET_NAME ($HOST_IP) — bootstrap script did not complete")
+    HOSTS_FAILED=$((HOSTS_FAILED + 1))
+    continue
+  fi
+
+  # Show remote output ("|| true": a grep without matches must not trip
+  # pipefail together with set -e and end the whole run)
+  echo "$BOOTSTRAP_OUTPUT" | grep "\[remote\]" | sed 's/^/    /' || true
+
+  # Verify key-based login works for agent user
+  log_info "Verifying key-based login for $AGENT_USER..."
+  CLIENT_PRIVKEY="${CLIENT_KEY_PATH%.pub}"
+  if [[ -f "$CLIENT_PRIVKEY" ]]; then
+    if ssh -i "$CLIENT_PRIVKEY" \
+        -o StrictHostKeyChecking=no \
+        -o ConnectTimeout=10 \
+        -o PasswordAuthentication=no \
+        "$AGENT_USER@$HOST_IP" "echo key-auth-ok" < /dev/null &> /dev/null; then
+      log_ok "Key-based login verified for $AGENT_USER"
+    else
+      log_warn "Key-based login test failed for $AGENT_USER — check manually"
+    fi
+  else
+    log_warn "Private key not found at $CLIENT_PRIVKEY — skipping login verification"
+  fi
+
+  log_ok "Bootstrap complete: $TARGET_NAME"
+  HOSTS_OK=$((HOSTS_OK + 1))
+
+done 3<<< "$LINUX_HOSTS"
+
+# ─── Summary ─────────────────────────────────────────────────────────────────
+echo ""
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+echo "  Bootstrap Summary"
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+echo "  Total Linux hosts:   $HOSTS_TOTAL"
+echo -e "  ${GREEN}Succeeded:           $HOSTS_OK${NC}"
+if [[ $HOSTS_FAILED -gt 0 ]]; then
+  echo -e "  ${RED}Failed:              $HOSTS_FAILED${NC}"
+  echo ""
+  echo "  Failed hosts:"
+  for h in "${FAILED_HOSTS[@]}"; do
+    echo -e "    ${RED}✗ $h${NC}"
+  done
+fi
+if [[ $HOSTS_SKIPPED -gt 0 ]]; then
+  echo -e "  ${YELLOW}Skipped (Windows):   $HOSTS_SKIPPED${NC}"
+fi
+echo ""
+
+if [[ $HOSTS_FAILED -gt 0 ]]; then
+  exit 1
+fi
+