diff --git a/playbooks/proxmox_upgrade.yml b/playbooks/proxmox_upgrade.yml
new file mode 100644
index 0000000..e6f9e66
--- /dev/null
+++ b/playbooks/proxmox_upgrade.yml
@@ -0,0 +1,45 @@
+---
+# =============================================================================
+# proxmox_upgrade.yml
+# =============================================================================
+# Rolling Proxmox cluster upgrade playbook.
+# Runs once (run_once) on the first available host of the play — every node in
+# upgrade_order is then handled via API calls and delegation inside the role.
+#
+# Usage:
+#   ansible-playbook playbooks/proxmox_upgrade.yml \
+#     -i inventories/client_local_eng/hypervisor_hosts.yml
+#
+# Override migration behaviour:
+#   -e migration_bulk=true
+#   -e live_migrate_fallback=skip
+#   -e migration_restore=true
+#
+# Dry run (check mode — no changes):
+#   --check
+# =============================================================================
+
+- name: Proxmox Rolling Upgrade
+  hosts: proxmox_cluster
+  gather_facts: true
+  # Deliberately no `serial: 1`: run_once is evaluated once per serial batch,
+  # so pairing the two would repeat the whole rolling upgrade for every host
+  # in the group. The role already walks upgrade_order one node at a time.
+  run_once: true
+
+  pre_tasks:
+    - name: Confirm upgrade_order is defined
+      ansible.builtin.fail:
+        msg: "upgrade_order must be defined in hypervisor_hosts.yml"
+      when: upgrade_order is not defined or upgrade_order | length == 0
+
+    - name: Log upgrade targets
+      ansible.builtin.debug:
+        msg: >-
+          Proxmox upgrade starting for {{ client_name }} ({{ client_id }})
+          Nodes: {{ upgrade_order | join(', ') }}
+          API: https://{{ api_host }}:{{ api_port }}
+
+  roles:
+    - proxmox_upgrade
+
diff --git a/roles/proxmox_upgrade/defaults/main.yml b/roles/proxmox_upgrade/defaults/main.yml
new file mode 100644
index 0000000..558fcdd
--- /dev/null
+++ b/roles/proxmox_upgrade/defaults/main.yml
@@ -0,0 +1,36 @@
+---
+# =============================================================================
+# proxmox_upgrade — defaults
+# =============================================================================
+
+# Migration behaviour
+migration_bulk: false            # true = all VMs at once, false = one at a time
+migration_restore: false         # true = migrate VMs back to original node after upgrade
+live_migrate_fallback: shutdown  # migrate | shutdown | skip
+
+# Shutdown timeout in seconds before forcing off
+vm_shutdown_timeout: 120
+
+# How long to wait for a VM to start after cold migration
+vm_start_timeout: 120
+
+# How long to wait for node to rejoin cluster after reboot
+node_rejoin_timeout: 600
+
+# How long to wait for CEPH to recover after node rejoins
+ceph_recover_timeout: 300
+
+# apt upgrade options
+apt_upgrade_cmd: "DEBIAN_FRONTEND=noninteractive apt-get dist-upgrade -y"
+apt_autoremove: true
+
+# Tags on VMs/LXCs to never migrate. Proxmox reports tags as one delimited
+# string; the role accepts ';', ',' or spaces as separators.
+migrate_exclude_tags:
+  - nomigrate
+  - pinned
+
+# Verify the Proxmox API TLS certificate on every uri call. false preserves
+# the previous hard-coded behaviour; set to true once the CA is trusted.
+api_validate_certs: false
+
diff --git a/roles/proxmox_upgrade/readme.md b/roles/proxmox_upgrade/readme.md
new file mode 100644
index 0000000..f5b03be
--- /dev/null
+++ b/roles/proxmox_upgrade/readme.md
@@ -0,0 +1,12 @@
+roles/proxmox_upgrade/
+  defaults/main.yml        ← all vars with defaults
+  tasks/
+    main.yml               ← entry point, calls preflight then loops nodes
+    node_upgrade.yml       ← per-node: backup → drain → upgrade → restore
+    preflight.yml          ← cluster health check, abort if unhealthy
+    drain.yml              ← classify guests, trigger migrations
+    migrate_guest.yml      ← single guest migration with fallback logic
+    upgrade.yml            ← apt dist-upgrade, reboot, wait for rejoin
+    restore.yml            ← optional migrate-back
+playbooks/proxmox_upgrade.yml
+
diff --git a/roles/proxmox_upgrade/tasks/drain.yml b/roles/proxmox_upgrade/tasks/drain.yml
new file mode 100644
index 0000000..54efc10
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/drain.yml
@@ -0,0 +1,197 @@
+---
+# =============================================================================
+# proxmox_upgrade — drain.yml
+# Migrate all VMs/LXCs off a node before upgrading it
+# Uses Proxmox API — runs delegate_to: localhost
+# =============================================================================
+
+# ── Get all guests on this node ───────────────────────────────────────────────
+- name: Drain | Get all VMs on node {{ current_node }}
+  ansible.builtin.uri:
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/qemu"
+    method: GET
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+    validate_certs: "{{ api_validate_certs | bool }}"
+  register: node_vms
+  delegate_to: localhost
+
+- name: Drain | Get all LXCs on node {{ current_node }}
+  ansible.builtin.uri:
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/lxc"
+    method: GET
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+    validate_certs: "{{ api_validate_certs | bool }}"
+  register: node_lxcs
+  delegate_to: localhost
+
+- name: Drain | Get available target nodes
+  ansible.builtin.uri:
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
+    method: GET
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+    validate_certs: "{{ api_validate_certs | bool }}"
+  register: all_nodes
+  delegate_to: localhost
+
+- name: Drain | Build target node list (exclude current node)
+  ansible.builtin.set_fact:
+    migration_targets: >-
+      {{ all_nodes.json.data
+      | selectattr('status', 'equalto', 'online')
+      | rejectattr('node', 'equalto', current_node)
+      | map(attribute='node')
+      | list }}
+  delegate_to: localhost
+
+- name: Drain | Fail if no migration targets available
+  ansible.builtin.fail:
+    msg: "No online nodes available to migrate guests to. Cannot drain {{ current_node }}."
+  when: migration_targets | length == 0
+  delegate_to: localhost
+
+# ── Classify VMs — live migratable vs needs fallback ─────────────────────────
+- name: Drain | Get VM configs to check migratability
+  ansible.builtin.uri:
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/qemu/{{ item.vmid }}/config"
+    method: GET
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+    validate_certs: "{{ api_validate_certs | bool }}"
+  register: vm_configs
+  loop: "{{ node_vms.json.data }}"
+  delegate_to: localhost
+
+# Proxmox hands tags back as one delimited string (';' from the API; ',' and
+# spaces are tolerated too), so separators are normalised before splitting.
+# LXCs are always planned without a fallback.
+- name: Drain | Build guest migration plan
+  ansible.builtin.set_fact:
+    migration_plan: >-
+      {%- set plan = [] -%}
+      {%- for vm in node_vms.json.data -%}
+      {%- set cfg = vm_configs.results[loop.index0].json.data -%}
+      {%- set tags = (vm.tags | default('') | regex_replace('[;, ]+', ',')).split(',') | map('trim') | list -%}
+      {%- set excluded = tags | select('in', migrate_exclude_tags) | list | length > 0 -%}
+      {%- set has_passthrough = 'hostpci0' in cfg or 'usb0' in cfg -%}
+      {%- set has_local_disk = not (shared_storage | bool) -%}
+      {%- set has_local_cdrom = cfg.values() | select('string') | select('match', '.*local.*\\.iso.*') | list | length > 0 -%}
+      {%- set needs_fallback = has_passthrough or has_local_disk or has_local_cdrom -%}
+      {%- if not excluded -%}
+      {%- set _ = plan.append({
+        'vmid': vm.vmid,
+        'name': vm.name,
+        'type': 'qemu',
+        'status': vm.status,
+        'needs_fallback': needs_fallback,
+        'fallback_reason': ('passthrough' if has_passthrough else ('local_disk' if has_local_disk else ('local_cdrom' if has_local_cdrom else '')))
+      }) -%}
+      {%- endif -%}
+      {%- endfor -%}
+      {%- for lxc in node_lxcs.json.data -%}
+      {%- set tags = (lxc.tags | default('') | regex_replace('[;, ]+', ',')).split(',') | map('trim') | list -%}
+      {%- set excluded = tags | select('in', migrate_exclude_tags) | list | length > 0 -%}
+      {%- if not excluded -%}
+      {%- set _ = plan.append({
+        'vmid': lxc.vmid,
+        'name': lxc.name,
+        'type': 'lxc',
+        'status': lxc.status,
+        'needs_fallback': false,
+        'fallback_reason': ''
+      }) -%}
+      {%- endif -%}
+      {%- endfor -%}
+      {{ plan }}
+  delegate_to: localhost
+
+- name: Drain | Log migration plan
+  ansible.builtin.debug:
+    msg: >-
+      Migration plan for {{ current_node }}:
+      {% for g in migration_plan %}
+      - {{ g.type | upper }} {{ g.vmid }} ({{ g.name }}) [{{ g.status }}]
+      {% if g.needs_fallback %} ⚠ needs fallback ({{ g.fallback_reason }}) — action: {{ live_migrate_fallback }}{% endif %}
+      {% endfor %}
+  delegate_to: localhost
+
+# ── Warn about guests that cannot be live migrated (nothing is aborted) ──────
+- name: Drain | Warn about non-migratable guests
+  ansible.builtin.debug:
+    msg: >-
+      WARNING — {{ item.type | upper }} {{ item.vmid }} ({{ item.name }})
+      cannot be live migrated ({{ item.fallback_reason }}).
+      live_migrate_fallback={{ live_migrate_fallback }} —
+      {% if live_migrate_fallback == 'skip' %}
+      THIS VM WILL GO DOWN DURING NODE REBOOT.
+      {% elif live_migrate_fallback == 'shutdown' %}
+      Will be shut down, cold migrated, and restarted.
+      {% else %}
+      Will attempt live migrate anyway (may fail).
+      {% endif %}
+  loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
+  delegate_to: localhost
+
+# ── Perform migrations ────────────────────────────────────────────────────────
+# loop_var is only valid under loop_control. With fallback=skip the fallback
+# guests are left out of the loop entirely; otherwise they are handled last.
+- name: Drain | Migrate guests (sequential)
+  when: not migration_bulk | bool
+  ansible.builtin.include_tasks: migrate_guest.yml
+  loop: >-
+    {{ (migration_plan | rejectattr('needs_fallback') | list)
+       + ([] if live_migrate_fallback == 'skip'
+          else (migration_plan | selectattr('needs_fallback') | list)) }}
+  loop_control:
+    loop_var: guest
+
+- name: Drain | Migrate guests (bulk — fire all at once)
+  when: migration_bulk | bool
+  block:
+    - name: Drain | Bulk | Trigger all live migrations simultaneously
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
+        method: POST
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        body_format: json
+        body:
+          target: "{{ migration_targets | first }}"
+          online: "{{ 1 if not guest.needs_fallback else 0 }}"
+        validate_certs: "{{ api_validate_certs | bool }}"
+      register: bulk_migration_tasks
+      loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}"
+      loop_control:
+        loop_var: guest
+      delegate_to: localhost
+
+    - name: Drain | Bulk | Wait for all migrations to complete
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/tasks/{{ item.json.data }}/status"
+        method: GET
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: "{{ api_validate_certs | bool }}"
+      register: task_status
+      until: task_status.json.data.status == 'stopped'
+      retries: 60
+      delay: 10
+      loop: "{{ bulk_migration_tasks.results }}"
+      delegate_to: localhost
+
+    - name: Drain | Bulk | Check all migrations succeeded
+      ansible.builtin.fail:
+        msg: "Migration task failed for VMID {{ item.item.guest.vmid | default('unknown') }} — exitstatus: {{ item.json.data.exitstatus }}"
+      loop: "{{ task_status.results }}"
+      when: item.json.data.exitstatus != 'OK'
+      delegate_to: localhost
+
+    - name: Drain | Bulk | Handle fallback guests sequentially
+      ansible.builtin.include_tasks: migrate_guest.yml
+      loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
+      loop_control:
+        loop_var: guest
+      when: live_migrate_fallback != 'skip'
+
diff --git a/roles/proxmox_upgrade/tasks/main.yml b/roles/proxmox_upgrade/tasks/main.yml
new file mode 100644
index 0000000..48a7c6c
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/main.yml
@@ -0,0 +1,35 @@
+---
+# =============================================================================
+# proxmox_upgrade — main.yml
+# Orchestrates rolling Proxmox cluster upgrade
+# Runs on the play host; API calls are delegated to localhost
+# =============================================================================
+
+- name: Proxmox Upgrade | Start
+  ansible.builtin.debug:
+    msg: >-
+      Starting Proxmox rolling upgrade for {{ client_name }}
+      — {{ upgrade_order | length }} nodes in order: {{ upgrade_order | join(' → ') }}
+      — migration_bulk={{ migration_bulk }}
+      — live_migrate_fallback={{ live_migrate_fallback }}
+      — migration_restore={{ migration_restore }}
+      — ceph_enabled={{ ceph_enabled }}
+
+# ── Cluster health preflight ──────────────────────────────────────────────────
+- name: Proxmox Upgrade | Cluster preflight
+  ansible.builtin.include_tasks: preflight.yml
+
+# ── Rolling upgrade — one node at a time ─────────────────────────────────────
+# loop_var has to live under loop_control; it is not a task-level keyword.
+- name: Proxmox Upgrade | Rolling upgrade loop
+  ansible.builtin.include_tasks: node_upgrade.yml
+  loop: "{{ upgrade_order }}"
+  loop_control:
+    loop_var: current_node
+
+- name: Proxmox Upgrade | Complete
+  ansible.builtin.debug:
+    msg: >-
+      Proxmox rolling upgrade complete for {{ client_name }}
+      — all {{ upgrade_order | length }} nodes upgraded successfully
+
diff --git a/roles/proxmox_upgrade/tasks/migrate_guest.yml b/roles/proxmox_upgrade/tasks/migrate_guest.yml
new file mode 100644
index 0000000..9e446d1
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/migrate_guest.yml
@@ -0,0 +1,119 @@
+---
+# =============================================================================
+# proxmox_upgrade — migrate_guest.yml
+# Handles migration of a single VM or LXC
+# Called with loop_var: guest
+# guest = { vmid, name, type, status, needs_fallback, fallback_reason }
+# =============================================================================
+
+- name: "Migrate | {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — skip check"
+  ansible.builtin.debug:
+    msg: "SKIPPING {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — live_migrate_fallback=skip, will go down during reboot"
+  when: guest.needs_fallback and live_migrate_fallback == 'skip'
+  delegate_to: localhost
+
+- name: "Migrate | {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }})"
+  when: not (guest.needs_fallback and live_migrate_fallback == 'skip')
+  block:
+    # ── Cold migration: shutdown first ───────────────────────────────────────
+    - name: "Migrate | {{ guest.vmid }} | Shutdown for cold migration"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/shutdown"
+        method: POST
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        body_format: json
+        body:
+          timeout: "{{ vm_shutdown_timeout }}"
+          forceStop: 1
+        validate_certs: "{{ api_validate_certs | bool }}"
+      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
+      delegate_to: localhost
+
+    - name: "Migrate | {{ guest.vmid }} | Wait for shutdown"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/current"
+        method: GET
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: "{{ api_validate_certs | bool }}"
+      register: vm_status
+      until: vm_status.json.data.status == 'stopped'
+      retries: "{{ (vm_shutdown_timeout | int / 5) | int }}"
+      delay: 5
+      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
+      delegate_to: localhost
+
+    # ── Trigger migration ─────────────────────────────────────────────────────
+    # NOTE(review): `online: 1` is also sent for LXC guests — confirm the
+    # container migrate endpoint accepts it (it may require `restart` instead).
+    - name: "Migrate | {{ guest.vmid }} | Trigger migration to {{ migration_targets | first }}"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
+        method: POST
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        body_format: json
+        body:
+          target: "{{ migration_targets | first }}"
+          online: "{{ 0 if (guest.needs_fallback and live_migrate_fallback == 'shutdown') else 1 }}"
+        validate_certs: "{{ api_validate_certs | bool }}"
+      register: migration_task
+      delegate_to: localhost
+
+    # ── Wait for migration to complete ────────────────────────────────────────
+    - name: "Migrate | {{ guest.vmid }} | Wait for migration task to complete"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/tasks/{{ migration_task.json.data }}/status"
+        method: GET
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: "{{ api_validate_certs | bool }}"
+      register: task_status
+      until: task_status.json.data.status == 'stopped'
+      retries: 60
+      delay: 10
+      delegate_to: localhost
+
+    - name: "Migrate | {{ guest.vmid }} | Verify migration succeeded"
+      ansible.builtin.fail:
+        msg: "Migration of {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) failed — {{ task_status.json.data.exitstatus }}"
+      when: task_status.json.data.exitstatus != 'OK'
+      delegate_to: localhost
+
+    # ── Cold migration: restart on target ────────────────────────────────────
+    - name: "Migrate | {{ guest.vmid }} | Start on target node after cold migration"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/start"
+        method: POST
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: "{{ api_validate_certs | bool }}"
+      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
+      delegate_to: localhost
+
+    - name: "Migrate | {{ guest.vmid }} | Wait for VM to start on target"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/current"
+        method: GET
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: "{{ api_validate_certs | bool }}"
+      register: vm_start_status
+      until: vm_start_status.json.data.status == 'running'
+      retries: "{{ (vm_start_timeout | int / 5) | int }}"
+      delay: 5
+      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
+      delegate_to: localhost
+
+    - name: "Migrate | {{ guest.vmid }} ({{ guest.name }}) | Migration complete"
+      ansible.builtin.debug:
+        msg: >-
+          {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }})
+          {% if guest.needs_fallback and live_migrate_fallback == 'shutdown' %}
+          cold migrated to {{ migration_targets | first }} and restarted
+          {% else %}
+          live migrated to {{ migration_targets | first }}
+          {% endif %}
+      delegate_to: localhost
+
diff --git a/roles/proxmox_upgrade/tasks/node_upgrade.yml b/roles/proxmox_upgrade/tasks/node_upgrade.yml
new file mode 100644
index 0000000..e5d4a1f
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/node_upgrade.yml
@@ -0,0 +1,49 @@
+---
+# =============================================================================
+# proxmox_upgrade — node_upgrade.yml
+# Per-node upgrade sequence: backup → drain → upgrade → restore
+# Called with loop_var: current_node
+# =============================================================================
+
+- name: "Node {{ current_node }} | Start"
+  ansible.builtin.debug:
+    msg: "━━━ Starting upgrade of node {{ current_node }} ━━━"
+
+# ── Step 1: Backup config ─────────────────────────────────────────────────────
+# delegate_to is not accepted directly on include_role/include_tasks; it is
+# pushed down to the included tasks through `apply` instead.
+- name: "Node {{ current_node }} | Step 1 — Backup config"
+  ansible.builtin.include_role:
+    name: hypervisor_backup_config
+    apply:
+      delegate_to: "{{ current_node }}"
+  vars:
+    pve_config_git_commit_message: "[{{ client_id }}] {{ current_node }} pre-upgrade config backup {{ ansible_date_time.date }}"
+
+# ── Step 2: Drain node ────────────────────────────────────────────────────────
+- name: "Node {{ current_node }} | Step 2 — Drain (migrate guests off node)"
+  ansible.builtin.include_tasks: drain.yml
+  when: cluster_mode == 'cluster'
+
+- name: "Node {{ current_node }} | Step 2 — Single node mode, skipping drain"
+  ansible.builtin.debug:
+    msg: "cluster_mode=single — skipping guest migration"
+  when: cluster_mode == 'single'
+
+# ── Step 3: Upgrade ───────────────────────────────────────────────────────────
+# Tasks in upgrade.yml that set their own delegate_to (the API poll) keep it.
+- name: "Node {{ current_node }} | Step 3 — Upgrade packages"
+  ansible.builtin.include_tasks:
+    file: upgrade.yml
+    apply:
+      delegate_to: "{{ current_node }}"
+
+# ── Step 4: Restore ───────────────────────────────────────────────────────────
+- name: "Node {{ current_node }} | Step 4 — Restore guests"
+  ansible.builtin.include_tasks: restore.yml
+  when: cluster_mode == 'cluster'
+
+- name: "Node {{ current_node }} | Complete"
+  ansible.builtin.debug:
+    msg: "━━━ Node {{ current_node }} upgrade complete ━━━"
+
diff --git a/roles/proxmox_upgrade/tasks/preflight.yml b/roles/proxmox_upgrade/tasks/preflight.yml
new file mode 100644
index 0000000..4f598ff
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/preflight.yml
@@ -0,0 +1,64 @@
+---
+# =============================================================================
+# proxmox_upgrade — preflight.yml
+# Check cluster health before starting any upgrade work
+# Shell checks run on the play host; API checks are delegated to localhost
+# =============================================================================
+
+- name: Preflight | Check all cluster nodes are online
+  ansible.builtin.shell: |
+    pvecm status 2>/dev/null | grep -E "^Nodes|Quorate"
+  register: pvecm_status
+  changed_when: false
+
+- name: Preflight | Get cluster node status via API
+  ansible.builtin.uri:
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
+    method: GET
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+    validate_certs: "{{ api_validate_certs | bool }}"
+  register: cluster_nodes
+  delegate_to: localhost
+
+- name: Preflight | Check all nodes are online
+  ansible.builtin.fail:
+    msg: >
+      Cluster health check FAILED — node {{ item.node }} is {{ item.status }}.
+      Aborting upgrade to prevent data loss. Investigate before retrying.
+  loop: "{{ cluster_nodes.json.data }}"
+  when: item.status != 'online'
+  delegate_to: localhost
+
+- name: Preflight | Check quorum via pvecm
+  ansible.builtin.shell: |
+    pvecm status 2>/dev/null | grep -i "quorate" | grep -i "yes"
+  register: quorum_check
+  changed_when: false
+  failed_when: quorum_check.rc != 0
+
+- name: Preflight | Check CEPH health
+  when: ceph_enabled | bool
+  block:
+    - name: Preflight | Get CEPH health status
+      ansible.builtin.shell: |
+        ceph health 2>/dev/null
+      register: ceph_health
+      changed_when: false
+
+    - name: Preflight | Abort if CEPH is not healthy
+      ansible.builtin.fail:
+        msg: >
+          CEPH health check FAILED — status: {{ ceph_health.stdout }}.
+          Aborting upgrade. Resolve CEPH issues before retrying.
+      when: "'HEALTH_OK' not in ceph_health.stdout and 'HEALTH_WARN' not in ceph_health.stdout"
+
+    - name: Preflight | Warn if CEPH has warnings
+      ansible.builtin.debug:
+        msg: "WARNING — CEPH has warnings: {{ ceph_health.stdout }}. Proceeding but monitor closely."
+      when: "'HEALTH_WARN' in ceph_health.stdout"
+
+- name: Preflight | Cluster health check passed
+  ansible.builtin.debug:
+    msg: "Cluster health check passed — all nodes online, quorum OK{{ ', CEPH checked' if ceph_enabled | bool else '' }}"
+
diff --git a/roles/proxmox_upgrade/tasks/restore.yml b/roles/proxmox_upgrade/tasks/restore.yml
new file mode 100644
index 0000000..474bd00
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/restore.yml
@@ -0,0 +1,65 @@
+---
+# =============================================================================
+# proxmox_upgrade — restore.yml
+# Optionally migrate guests back to their original node after upgrade
+# Only runs if migration_restore: true
+# =============================================================================
+
+- name: Restore | Skip restore
+  ansible.builtin.debug:
+    msg: "migration_restore=false — leaving guests on their current nodes"
+  when: not migration_restore | bool
+
+- name: Restore | Migrate guests back to {{ current_node }}
+  when: migration_restore | bool
+  block:
+    # Only guests that did not need a fallback are moved back by the loop
+    # below, so the announced count covers exactly that set.
+    - name: Restore | Get guests currently on other nodes that originated from {{ current_node }}
+      ansible.builtin.debug:
+        msg: >-
+          Restoring {{ migration_plan | rejectattr('needs_fallback') | list | length }}
+          guests back to {{ current_node }}
+
+    - name: Restore | Migrate each guest back
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
+        method: POST
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        body_format: json
+        body:
+          target: "{{ current_node }}"
+          online: "{{ 0 if (guest.needs_fallback and live_migrate_fallback == 'shutdown') else 1 }}"
+        validate_certs: "{{ api_validate_certs | bool }}"
+      register: restore_task
+      loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}"
+      loop_control:
+        loop_var: guest
+      delegate_to: localhost
+
+    - name: Restore | Wait for all restore migrations to complete
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/tasks/{{ item.json.data }}/status"
+        method: GET
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: "{{ api_validate_certs | bool }}"
+      register: restore_status
+      until: restore_status.json.data.status == 'stopped'
+      retries: 60
+      delay: 10
+      loop: "{{ restore_task.results }}"
+      delegate_to: localhost
+
+    - name: Restore | Check all restores succeeded
+      ansible.builtin.fail:
+        msg: "Restore migration failed — {{ item.json.data.exitstatus }}"
+      loop: "{{ restore_status.results }}"
+      when: item.json.data.exitstatus != 'OK'
+      delegate_to: localhost
+
+    - name: Restore | Complete
+      ansible.builtin.debug:
+        msg: "All guests restored to {{ current_node }}"
+
diff --git a/roles/proxmox_upgrade/tasks/upgrade.yml b/roles/proxmox_upgrade/tasks/upgrade.yml
new file mode 100644
index 0000000..4a0dac9
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/upgrade.yml
@@ -0,0 +1,99 @@
+---
+# =============================================================================
+# proxmox_upgrade — upgrade.yml
+# Run apt dist-upgrade and reboot, wait for node to rejoin cluster
+# =============================================================================
+
+- name: Upgrade | Set CEPH noout flag before upgrade
+  ansible.builtin.shell: ceph osd set noout
+  when: ceph_enabled | bool
+  changed_when: true
+
+- name: Upgrade | Run apt update
+  ansible.builtin.shell: apt-get update -q
+  changed_when: false
+
+# Anchor the match at the start of a line: a plain substring test for
+# '0 upgraded' also matches '10 upgraded', '20 upgraded', and so on.
+- name: Upgrade | Run apt dist-upgrade
+  ansible.builtin.shell: "{{ apt_upgrade_cmd }}"
+  register: apt_upgrade_result
+  changed_when: "apt_upgrade_result.stdout is not search('^0 upgraded', multiline=true)"
+
+- name: Upgrade | Log packages upgraded
+  ansible.builtin.debug:
+    msg: "{{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('No output') }}"
+
+- name: Upgrade | Run apt autoremove
+  ansible.builtin.shell: DEBIAN_FRONTEND=noninteractive apt-get autoremove -y
+  when: apt_autoremove | bool
+  changed_when: false
+
+# NOTE(review): assumes the node creates /var/run/reboot-required after a
+# kernel or core upgrade — confirm, otherwise the reboot is always skipped.
+- name: Upgrade | Check if reboot is required
+  ansible.builtin.stat:
+    path: /var/run/reboot-required
+  register: reboot_required
+
+- name: Upgrade | Log reboot status
+  ansible.builtin.debug:
+    msg: "{{ 'Reboot required — rebooting node' if reboot_required.stat.exists else 'No reboot required — skipping reboot' }}"
+
+- name: Upgrade | Reboot node
+  ansible.builtin.reboot:
+    reboot_timeout: "{{ node_rejoin_timeout }}"
+    msg: "Rebooting for Proxmox upgrade"
+    pre_reboot_delay: 5
+    post_reboot_delay: 30
+  when: reboot_required.stat.exists
+
+# ── Wait for node to rejoin cluster ──────────────────────────────────────────
+- name: Upgrade | Wait for node to appear online in cluster
+  ansible.builtin.uri:
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
+    method: GET
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+    validate_certs: "{{ api_validate_certs | bool }}"
+  register: nodes_status
+  until: >-
+    nodes_status.json.data
+    | selectattr('node', 'equalto', current_node)
+    | selectattr('status', 'equalto', 'online')
+    | list | length > 0
+  retries: "{{ (node_rejoin_timeout | int / 10) | int }}"
+  delay: 10
+  delegate_to: localhost
+
+- name: Upgrade | Node {{ current_node }} back online
+  ansible.builtin.debug:
+    msg: "Node {{ current_node }} has rejoined the cluster"
+
+# ── CEPH recovery wait ────────────────────────────────────────────────────────
+- name: Upgrade | Wait for CEPH to recover
+  when: ceph_enabled | bool
+  block:
+    - name: Upgrade | CEPH | Wait for HEALTH_OK or HEALTH_WARN
+      ansible.builtin.shell: ceph health
+      register: ceph_health_post
+      until: "'HEALTH_OK' in ceph_health_post.stdout or 'HEALTH_WARN' in ceph_health_post.stdout"
+      retries: "{{ (ceph_recover_timeout | int / 10) | int }}"
+      delay: 10
+      changed_when: false
+
+    - name: Upgrade | CEPH | Clear noout flag
+      ansible.builtin.shell: ceph osd unset noout
+      changed_when: true
+
+    - name: Upgrade | CEPH | Log recovery status
+      ansible.builtin.debug:
+        msg: "CEPH recovered: {{ ceph_health_post.stdout }}"
+
+- name: Upgrade | Node {{ current_node }} upgrade complete
+  ansible.builtin.debug:
+    msg: >-
+      Node {{ current_node }} upgrade complete —
+      {{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('packages updated') }}
+      {{ '— rebooted' if reboot_required.stat.exists else '— no reboot needed' }}
+