feat: proxmox_upgrade role and playbook

This commit is contained in:
Semaphore
2026-03-14 14:05:40 -07:00
parent df7614f417
commit e0a5ff298a
10 changed files with 687 additions and 0 deletions

View File

@@ -0,0 +1,31 @@
---
# =============================================================================
# proxmox_upgrade — defaults
# =============================================================================
# ── Guest migration behaviour ────────────────────────────────────────────────
# false = migrate guests one at a time, true = start all migrations at once.
migration_bulk: false
# true = migrate guests back to their original node after it has been upgraded.
migration_restore: false
# What to do with guests that cannot be live migrated:
#   migrate  - attempt a live migration anyway (may fail)
#   shutdown - shut the guest down, cold migrate it, then start it again
#   skip     - leave the guest in place; it goes down with the node reboot
live_migrate_fallback: shutdown

# ── Timeouts (seconds) ───────────────────────────────────────────────────────
# Grace period for a guest shutdown before it is forced off.
vm_shutdown_timeout: 120
# How long to wait for a guest to report "running" after a cold migration.
vm_start_timeout: 120
# How long to wait for a node to rejoin the cluster after its reboot.
node_rejoin_timeout: 600
# How long to wait for CEPH to recover once the node has rejoined.
ceph_recover_timeout: 300

# ── Package upgrade ──────────────────────────────────────────────────────────
# Full shell command line used for the upgrade step.
apt_upgrade_cmd: "DEBIAN_FRONTEND=noninteractive apt-get dist-upgrade -y"
# true = run "apt-get autoremove" after the upgrade.
apt_autoremove: true

# ── Migration exclusions ─────────────────────────────────────────────────────
# Guests (VMs/LXCs) carrying any of these tags are never migrated.
# NOTE(review): Proxmox reports a guest's tags as one delimited string;
# confirm which delimiter (';' or ',') the API returns — TODO verify.
migrate_exclude_tags:
  - nomigrate
  - pinned

View File

@@ -0,0 +1,12 @@
roles/proxmox_upgrade/
defaults/main.yml ← all vars with defaults
tasks/
main.yml ← entry point, calls preflight then loops nodes
node_upgrade.yml ← per-node: backup → drain → upgrade → restore
preflight.yml ← cluster health check, abort if unhealthy
drain.yml ← classify guests, trigger migrations
migrate_guest.yml ← single guest migration with fallback logic
upgrade.yml ← apt dist-upgrade, reboot, wait for rejoin
restore.yml ← optional migrate-back
playbooks/proxmox_upgrade.yml

View File

@@ -0,0 +1,186 @@
---
# =============================================================================
# proxmox_upgrade — drain.yml
# Migrate all VMs/LXCs off a node before upgrading it
# Uses Proxmox API — runs delegate_to: localhost
# =============================================================================
# ── Get all guests on this node ───────────────────────────────────────────────
# List the QEMU VMs currently placed on the node that is being drained.
- name: Drain | Get all VMs on node {{ current_node }}
  delegate_to: localhost
  register: node_vms
  ansible.builtin.uri:
    method: GET
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/qemu"
    validate_certs: false
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"

# Same query for LXC containers on the node.
- name: Drain | Get all LXCs on node {{ current_node }}
  delegate_to: localhost
  register: node_lxcs
  ansible.builtin.uri:
    method: GET
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/lxc"
    validate_certs: false
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"

# Cluster-wide node list; filtered below into candidate migration targets.
- name: Drain | Get available target nodes
  delegate_to: localhost
  register: all_nodes
  ansible.builtin.uri:
    method: GET
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
    validate_certs: false
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"

# Targets are the names of all nodes that are online, minus the node
# being drained.
- name: Drain | Build target node list (exclude current node)
  delegate_to: localhost
  ansible.builtin.set_fact:
    migration_targets: >-
      {{ all_nodes.json.data
         | selectattr('status', 'equalto', 'online')
         | rejectattr('node', 'equalto', current_node)
         | map(attribute='node')
         | list }}

# Without at least one other online node there is nowhere to put the guests.
- name: Drain | Fail if no migration targets available
  delegate_to: localhost
  when: migration_targets | length == 0
  ansible.builtin.fail:
    msg: "No online nodes available to migrate guests to. Cannot drain {{ current_node }}."
# ── Classify VMs — live migratable vs needs fallback ─────────────────────────
# Fetch the config of every VM on the node. The plan builder below inspects
# these configs for passthrough devices and ISO images on local storage.
- name: Drain | Get VM configs to check migratability
  ansible.builtin.uri:
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/qemu/{{ item.vmid }}/config"
    method: GET
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
    validate_certs: false
  register: vm_configs
  loop: "{{ node_vms.json.data }}"
  delegate_to: localhost

# Build migration_plan: one dict per guest that is not excluded by tag, shaped
#   { vmid, name, type, status, needs_fallback, fallback_reason }
#
# - Tags: the tag string is split on ';', ',' and spaces, because Proxmox
#   reports a guest's tags as a single ';'-delimited string while
#   accepting ',' on input. Splitting on ',' alone would treat "a;pinned" as
#   one tag and silently ignore the exclusion.
# - A VM needs a fallback when it has a PCI/USB passthrough device, when
#   shared_storage is false, or when a config value references an ISO on
#   storage whose name contains "local".
# - shared_storage goes through "| bool" so a string value such as "false"
#   (e.g. from --extra-vars) is interpreted correctly.
# - LXCs are always planned with needs_fallback=false.
# - vm_configs.results is indexed with loop.index0 because it was registered
#   from a loop over the same node_vms.json.data list, in the same order.
- name: Drain | Build guest migration plan
  ansible.builtin.set_fact:
    migration_plan: >-
      {%- set plan = [] -%}
      {%- for vm in node_vms.json.data -%}
      {%- set cfg = vm_configs.results[loop.index0].json.data -%}
      {%- set tags = (vm.tags | default('')).replace(';', ',').replace(' ', ',').split(',') | map('trim') | list -%}
      {%- set excluded = tags | select('in', migrate_exclude_tags) | list | length > 0 -%}
      {%- set has_passthrough = 'hostpci0' in cfg or 'usb0' in cfg -%}
      {%- set has_local_disk = not (shared_storage | bool) -%}
      {%- set has_local_cdrom = cfg.values() | select('string') | select('match', '.*local.*\\.iso.*') | list | length > 0 -%}
      {%- set needs_fallback = has_passthrough or has_local_disk or has_local_cdrom -%}
      {%- if not excluded -%}
      {%- set _ = plan.append({
        'vmid': vm.vmid,
        'name': vm.name,
        'type': 'qemu',
        'status': vm.status,
        'needs_fallback': needs_fallback,
        'fallback_reason': ('passthrough' if has_passthrough else ('local_disk' if has_local_disk else ('local_cdrom' if has_local_cdrom else '')))
      }) -%}
      {%- endif -%}
      {%- endfor -%}
      {%- for lxc in node_lxcs.json.data -%}
      {%- set tags = (lxc.tags | default('')).replace(';', ',').replace(' ', ',').split(',') | map('trim') | list -%}
      {%- set excluded = tags | select('in', migrate_exclude_tags) | list | length > 0 -%}
      {%- if not excluded -%}
      {%- set _ = plan.append({
        'vmid': lxc.vmid,
        'name': lxc.name,
        'type': 'lxc',
        'status': lxc.status,
        'needs_fallback': false,
        'fallback_reason': ''
      }) -%}
      {%- endif -%}
      {%- endfor -%}
      {{ plan }}
  delegate_to: localhost
# Print the complete plan, flagging every guest that needs a fallback and
# the fallback action that is configured.
- name: Drain | Log migration plan
  delegate_to: localhost
  ansible.builtin.debug:
    msg: >-
      Migration plan for {{ current_node }}:
      {% for entry in migration_plan %}
      - {{ entry.type | upper }} {{ entry.vmid }} ({{ entry.name }}) [{{ entry.status }}]
      {% if entry.needs_fallback %} ⚠ needs fallback ({{ entry.fallback_reason }}) — action: {{ live_migrate_fallback }}{% endif %}
      {% endfor %}

# ── Warn about guests that cannot be live migrated ────────────────────────────
# Informational only: emits one message per fallback guest describing what the
# configured live_migrate_fallback will do to it. Nothing is aborted here.
- name: Drain | Warn about non-migratable guests
  delegate_to: localhost
  loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
  ansible.builtin.debug:
    msg: >-
      WARNING — {{ item.type | upper }} {{ item.vmid }} ({{ item.name }})
      cannot be live migrated ({{ item.fallback_reason }}).
      live_migrate_fallback={{ live_migrate_fallback }} —
      {% if live_migrate_fallback == 'skip' %}
      THIS VM WILL GO DOWN DURING NODE REBOOT.
      {% elif live_migrate_fallback == 'shutdown' %}
      Will be shut down, cold migrated, and restarted.
      {% else %}
      Will attempt live migrate anyway (may fail).
      {% endif %}
# ── Perform migrations ────────────────────────────────────────────────────────
# Sequential mode: include migrate_guest.yml once per guest.
# Order: guests that can be live migrated first, then the guests needing a
# fallback — the latter are left out entirely when live_migrate_fallback is
# 'skip'.
#
# Fixes relative to the previous version:
# - loop_var must sit under loop_control; as a bare task keyword it is
#   rejected on include_tasks as an invalid attribute.
# - The former rejectattr('nonexistent') trick evaluated an undefined
#   attribute, which raises under Ansible's strict undefined handling as soon
#   as one fallback guest exists. An explicit conditional replaces it.
- name: Drain | Migrate guests (sequential)
  when: not migration_bulk | bool
  ansible.builtin.include_tasks: migrate_guest.yml
  loop: >-
    {{ (migration_plan | rejectattr('needs_fallback') | list)
       + ((migration_plan | selectattr('needs_fallback') | list)
          if live_migrate_fallback != 'skip' else []) }}
  loop_control:
    loop_var: guest
# Bulk mode: start the migration of every guest that does not need a fallback
# in one pass, then poll all resulting Proxmox tasks, then handle fallback
# guests one by one via migrate_guest.yml.
#
# Notes:
# - loop_var is declared under loop_control (it is not a valid task keyword).
# - QEMU guests are migrated with online=1. Containers are migrated with
#   restart=1 instead, because Proxmox does not implement live migration for
#   LXC and refuses online=1 for a running container.
# - Every guest is sent to the first entry of migration_targets.
- name: Drain | Migrate guests (bulk — fire all at once)
  when: migration_bulk | bool
  block:
    - name: Drain | Bulk | Trigger all live migrations simultaneously
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
        method: POST
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        body_format: json
        body: >-
          {{ {'target': migration_targets | first}
             | combine({'restart': 1} if guest.type == 'lxc' else {'online': 1}) }}
        validate_certs: false
      register: bulk_migration_tasks
      loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}"
      loop_control:
        loop_var: guest
      delegate_to: localhost

    # item.json.data is the task ID returned by the migrate call above.
    # Each task is polled up to 60 times, 10 seconds apart.
    - name: Drain | Bulk | Wait for all migrations to complete
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/tasks/{{ item.json.data }}/status"
        method: GET
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      register: task_status
      until: task_status.json.data.status == 'stopped'
      retries: 60
      delay: 10
      loop: "{{ bulk_migration_tasks.results }}"
      delegate_to: localhost

    # item.item is the matching trigger result, which carries the guest it
    # was started for, so the failure message can name the VMID.
    - name: Drain | Bulk | Check all migrations succeeded
      ansible.builtin.fail:
        msg: "Migration task failed for VMID {{ item.item.guest.vmid }} — exitstatus: {{ item.json.data.exitstatus }}"
      loop: "{{ task_status.results }}"
      when: item.json.data.exitstatus != 'OK'
      delegate_to: localhost

    - name: Drain | Bulk | Handle fallback guests sequentially
      ansible.builtin.include_tasks: migrate_guest.yml
      loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
      loop_control:
        loop_var: guest
      when: live_migrate_fallback != 'skip'

View File

@@ -0,0 +1,33 @@
---
# =============================================================================
# proxmox_upgrade — main.yml
# Orchestrates rolling Proxmox cluster upgrade
# Runs on the first node in upgrade_order, delegates API calls to localhost
# =============================================================================
# Announce the run and echo the settings that control it.
- name: Proxmox Upgrade | Start
  ansible.builtin.debug:
    msg: >-
      Starting Proxmox rolling upgrade for {{ client_name }}
      — {{ upgrade_order | length }} nodes in order: {{ upgrade_order | join(' → ') }}
      — migration_bulk={{ migration_bulk }}
      — live_migrate_fallback={{ live_migrate_fallback }}
      — migration_restore={{ migration_restore }}
      — ceph_enabled={{ ceph_enabled }}

# ── Cluster health preflight ──────────────────────────────────────────────────
- name: Proxmox Upgrade | Cluster preflight
  ansible.builtin.include_tasks: preflight.yml

# ── Rolling upgrade — one node at a time ─────────────────────────────────────
# node_upgrade.yml is included once per entry of upgrade_order, with the node
# name exposed as current_node. loop_var has to be declared under
# loop_control; as a bare task keyword it is rejected on include_tasks.
- name: Proxmox Upgrade | Rolling upgrade loop
  ansible.builtin.include_tasks: node_upgrade.yml
  loop: "{{ upgrade_order }}"
  loop_control:
    loop_var: current_node

- name: Proxmox Upgrade | Complete
  ansible.builtin.debug:
    msg: >-
      Proxmox rolling upgrade complete for {{ client_name }}
      — all {{ upgrade_order | length }} nodes upgraded successfully

View File

@@ -0,0 +1,117 @@
---
# =============================================================================
# proxmox_upgrade — migrate_guest.yml
# Handles migration of a single VM or LXC
# Called with loop_var: guest
# guest = { vmid, name, type, status, needs_fallback, fallback_reason }
# =============================================================================
# Emitted only for a fallback guest when the configured fallback is 'skip':
# the guest is left on the node and will go down with the reboot.
- name: "Migrate | {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — skip check"
  delegate_to: localhost
  when: guest.needs_fallback and live_migrate_fallback == 'skip'
  ansible.builtin.debug:
    msg: "SKIPPING {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — live_migrate_fallback=skip, will go down during reboot"
# Migrate one guest from current_node to the first entry of migration_targets.
#
# Paths through this block:
# - Cold migration (guest.needs_fallback and live_migrate_fallback ==
#   'shutdown'): shut the guest down if it was running, migrate with online=0,
#   then start it on the target and wait until it reports "running".
# - QEMU otherwise: migrate with online=1.
# - LXC: migrate with restart=1. Proxmox does not implement live migration for
#   containers and refuses online=1 for a running container, so a restart
#   migration is requested instead.
#
# The whole block is skipped for fallback guests when the fallback is 'skip'.
- name: "Migrate | {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }})"
  when: not (guest.needs_fallback and live_migrate_fallback == 'skip')
  block:
    # ── Cold migration: shutdown first ───────────────────────────────────────
    # forceStop asks Proxmox to stop the guest if it has not shut down within
    # the given timeout.
    - name: "Migrate | {{ guest.vmid }} | Shutdown for cold migration"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/shutdown"
        method: POST
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        body_format: json
        body:
          timeout: "{{ vm_shutdown_timeout }}"
          forceStop: 1
        validate_certs: false
      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
      delegate_to: localhost

    # Polls every 5 seconds; the retry count is vm_shutdown_timeout / 5.
    - name: "Migrate | {{ guest.vmid }} | Wait for shutdown"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/current"
        method: GET
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      register: vm_status
      until: vm_status.json.data.status == 'stopped'
      retries: "{{ (vm_shutdown_timeout | int / 5) | int }}"
      delay: 5
      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
      delegate_to: localhost

    # ── Trigger migration ─────────────────────────────────────────────────────
    # The request body depends on the guest type: 'restart' for containers,
    # 'online' (0 for a cold migration, 1 otherwise) for QEMU VMs.
    - name: "Migrate | {{ guest.vmid }} | Trigger migration to {{ migration_targets | first }}"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
        method: POST
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        body_format: json
        body: >-
          {{ {'target': migration_targets | first}
             | combine({'restart': 1} if guest.type == 'lxc'
                       else {'online': 0 if (guest.needs_fallback and live_migrate_fallback == 'shutdown') else 1}) }}
        validate_certs: false
      register: migration_task
      delegate_to: localhost

    # ── Wait for migration to complete ────────────────────────────────────────
    # migration_task.json.data is the task ID returned by the migrate call.
    - name: "Migrate | {{ guest.vmid }} | Wait for migration task to complete"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/tasks/{{ migration_task.json.data }}/status"
        method: GET
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      register: task_status
      until: task_status.json.data.status == 'stopped'
      retries: 60
      delay: 10
      delegate_to: localhost

    - name: "Migrate | {{ guest.vmid }} | Verify migration succeeded"
      ansible.builtin.fail:
        msg: "Migration of {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) failed — {{ task_status.json.data.exitstatus }}"
      when: task_status.json.data.exitstatus != 'OK'
      delegate_to: localhost

    # ── Cold migration: restart on target ────────────────────────────────────
    # Only guests that were running before the cold migration are started.
    - name: "Migrate | {{ guest.vmid }} | Start on target node after cold migration"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/start"
        method: POST
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
      delegate_to: localhost

    # Polls every 5 seconds; the retry count is vm_start_timeout / 5.
    - name: "Migrate | {{ guest.vmid }} | Wait for VM to start on target"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/current"
        method: GET
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      register: vm_start_status
      until: vm_start_status.json.data.status == 'running'
      retries: "{{ (vm_start_timeout | int / 5) | int }}"
      delay: 5
      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
      delegate_to: localhost

    - name: "Migrate | {{ guest.vmid }} ({{ guest.name }}) | Migration complete"
      ansible.builtin.debug:
        msg: >-
          {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }})
          {% if guest.needs_fallback and live_migrate_fallback == 'shutdown' %}
          cold migrated to {{ migration_targets | first }} and restarted
          {% elif guest.type == 'lxc' %}
          restart migrated to {{ migration_targets | first }}
          {% else %}
          live migrated to {{ migration_targets | first }}
          {% endif %}
      delegate_to: localhost

View File

@@ -0,0 +1,43 @@
---
# =============================================================================
# proxmox_upgrade — node_upgrade.yml
# Per-node upgrade sequence: backup → drain → upgrade → restore
# Called with loop_var: current_node
# =============================================================================
# Per-node sequence: config backup → drain → package upgrade → restore.
#
# delegate_to is not accepted as a keyword on include_role / include_tasks
# themselves; it is passed through "apply" so it is attached to the tasks that
# get included. Included tasks that set their own delegate_to (for example
# API calls delegated to localhost) keep their own value.
- name: "Node {{ current_node }} | Start"
  ansible.builtin.debug:
    msg: "━━━ Starting upgrade of node {{ current_node }} ━━━"

# ── Step 1: Backup config ─────────────────────────────────────────────────────
- name: "Node {{ current_node }} | Step 1 — Backup config"
  ansible.builtin.include_role:
    name: hypervisor_backup_config
    apply:
      delegate_to: "{{ current_node }}"
  vars:
    pve_config_git_commit_message: "[{{ client_id }}] {{ current_node }} pre-upgrade config backup {{ ansible_date_time.date }}"

# ── Step 2: Drain node ────────────────────────────────────────────────────────
# Guests are only migrated away in cluster mode.
- name: "Node {{ current_node }} | Step 2 — Drain (migrate guests off node)"
  ansible.builtin.include_tasks: drain.yml
  when: cluster_mode == 'cluster'

- name: "Node {{ current_node }} | Step 2 — Single node mode, skipping drain"
  ansible.builtin.debug:
    msg: "cluster_mode=single — skipping guest migration"
  when: cluster_mode == 'single'

# ── Step 3: Upgrade ───────────────────────────────────────────────────────────
- name: "Node {{ current_node }} | Step 3 — Upgrade packages"
  ansible.builtin.include_tasks:
    file: upgrade.yml
    apply:
      delegate_to: "{{ current_node }}"

# ── Step 4: Restore ───────────────────────────────────────────────────────────
- name: "Node {{ current_node }} | Step 4 — Restore guests"
  ansible.builtin.include_tasks: restore.yml
  when: cluster_mode == 'cluster'

- name: "Node {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: "━━━ Node {{ current_node }} upgrade complete ━━━"

View File

@@ -0,0 +1,64 @@
---
# =============================================================================
# proxmox_upgrade — preflight.yml
# Check cluster health before starting any upgrade work
# Runs delegate_to: first node in upgrade_order
# =============================================================================
# Capture the "Nodes" and "Quorate" lines of `pvecm status`. The task fails
# when grep matches nothing, because the pipeline then exits non-zero.
- name: Preflight | Check all cluster nodes are online
  register: pvecm_status
  changed_when: false
  ansible.builtin.shell: |
    pvecm status 2>/dev/null | grep -E "^Nodes|Quorate"

# Ask the API for the status of every node in the cluster.
- name: Preflight | Get cluster node status via API
  delegate_to: localhost
  register: cluster_nodes
  ansible.builtin.uri:
    method: GET
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
    validate_certs: false
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"

# Any node whose status is not "online" aborts the run.
- name: Preflight | Check all nodes are online
  delegate_to: localhost
  loop: "{{ cluster_nodes.json.data }}"
  when: item.status != 'online'
  ansible.builtin.fail:
    msg: >
      Cluster health check FAILED — node {{ item.node }} is {{ item.status }}.
      Aborting upgrade to prevent data loss. Investigate before retrying.

# Succeeds only if `pvecm status` prints a "quorate" line containing "yes"
# (both greps are case-insensitive).
- name: Preflight | Check quorum via pvecm
  register: quorum_check
  changed_when: false
  failed_when: quorum_check.rc != 0
  ansible.builtin.shell: |
    pvecm status 2>/dev/null | grep -i "quorate" | grep -i "yes"
# CEPH checks run only when ceph_enabled is true.
# HEALTH_OK and HEALTH_WARN both allow the upgrade to continue (a warning is
# logged for the latter); any other output aborts the run.
- name: Preflight | Check CEPH health
  when: ceph_enabled | bool
  block:
    - name: Preflight | Get CEPH health status
      ansible.builtin.shell: |
        ceph health 2>/dev/null
      register: ceph_health
      changed_when: false

    - name: Preflight | Abort if CEPH is not healthy
      ansible.builtin.fail:
        msg: >
          CEPH health check FAILED — status: {{ ceph_health.stdout }}.
          Aborting upgrade. Resolve CEPH issues before retrying.
      when: "'HEALTH_OK' not in ceph_health.stdout and 'HEALTH_WARN' not in ceph_health.stdout"

    - name: Preflight | Warn if CEPH has warnings
      ansible.builtin.debug:
        msg: "WARNING — CEPH has warnings: {{ ceph_health.stdout }}. Proceeding but monitor closely."
      when: "'HEALTH_WARN' in ceph_health.stdout"

# ceph_enabled goes through "| bool" here as well, so that a string value such
# as "false" does not make the summary claim that CEPH was checked.
- name: Preflight | Cluster health check passed
  ansible.builtin.debug:
    msg: "Cluster health check passed — all nodes online, quorum OK{{ ', CEPH checked' if (ceph_enabled | bool) else '' }}"

View File

@@ -0,0 +1,63 @@
---
# =============================================================================
# proxmox_upgrade — restore.yml
# Optionally migrate guests back to their original node after upgrade
# Only runs if migration_restore: true
# =============================================================================
# Logged when migrate-back is disabled; guests stay wherever they are now.
- name: Restore | Skip restore
  when: not migration_restore | bool
  ansible.builtin.debug:
    msg: "migration_restore=false — leaving guests on their current nodes"
# Migrate guests back to current_node once it has been upgraded.
#
# Scope: only guests planned without a fallback are moved back; guests that
# needed a fallback are left on the node they were moved to. The announcement
# below therefore counts exactly the guests the loop iterates over (it used to
# add fallback guests that were never restored).
#
# Request details:
# - loop_var is declared under loop_control (it is not a valid task keyword).
# - QEMU guests are migrated with online=1; containers with restart=1, because
#   Proxmox does not implement live migration for LXC.
# - Guests are fetched from the first entry of migration_targets, which is
#   where the drain step sent them.
- name: Restore | Migrate guests back to {{ current_node }}
  when: migration_restore | bool
  block:
    - name: Restore | Get guests currently on other nodes that originated from {{ current_node }}
      ansible.builtin.debug:
        msg: >-
          Restoring {{ migration_plan | rejectattr('needs_fallback') | list | length }}
          guests back to {{ current_node }}
          (guests that needed a migration fallback are not moved back)

    - name: Restore | Migrate each guest back
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
        method: POST
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        body_format: json
        body: >-
          {{ {'target': current_node}
             | combine({'restart': 1} if guest.type == 'lxc' else {'online': 1}) }}
        validate_certs: false
      register: restore_task
      loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}"
      loop_control:
        loop_var: guest
      delegate_to: localhost

    # item.json.data is the task ID returned by the migrate call above.
    # Each task is polled up to 60 times, 10 seconds apart.
    - name: Restore | Wait for all restore migrations to complete
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/tasks/{{ item.json.data }}/status"
        method: GET
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      register: restore_status
      until: restore_status.json.data.status == 'stopped'
      retries: 60
      delay: 10
      loop: "{{ restore_task.results }}"
      delegate_to: localhost

    # item.item is the matching trigger result, which carries the guest it
    # was started for, so the failure message can name the VMID.
    - name: Restore | Check all restores succeeded
      ansible.builtin.fail:
        msg: "Restore migration failed for VMID {{ item.item.guest.vmid }} — {{ item.json.data.exitstatus }}"
      loop: "{{ restore_status.results }}"
      when: item.json.data.exitstatus != 'OK'
      delegate_to: localhost

    - name: Restore | Complete
      ansible.builtin.debug:
        msg: "All guests restored to {{ current_node }}"

View File

@@ -0,0 +1,95 @@
---
# =============================================================================
# proxmox_upgrade — upgrade.yml
# Run apt dist-upgrade and reboot, wait for node to rejoin cluster
# =============================================================================
# Set the CEPH "noout" flag before the node is upgraded; it is cleared again
# by the CEPH recovery step once the node is back.
- name: Upgrade | Set CEPH noout flag before upgrade
  ansible.builtin.shell: ceph osd set noout
  when: ceph_enabled | bool
  changed_when: true

- name: Upgrade | Run apt update
  ansible.builtin.shell: apt-get update -q
  changed_when: false

# apt_upgrade_cmd is a complete shell command line (it may start with an
# environment assignment), hence the shell module.
#
# changed_when: apt's summary line begins with the number of upgraded
# packages. A plain substring test for '0 upgraded' also matches
# "10 upgraded", "20 upgraded", ... and would report such runs as unchanged.
# The "match" test is anchored at the start of each line, so only a genuine
# "0 upgraded, ..." summary counts as "nothing changed".
- name: Upgrade | Run apt dist-upgrade
  ansible.builtin.shell: "{{ apt_upgrade_cmd }}"
  register: apt_upgrade_result
  changed_when: >-
    apt_upgrade_result.stdout_lines
    | select('match', '0 upgraded,')
    | list | length == 0

# Show the first output line mentioning "upgraded", if there is one.
- name: Upgrade | Log packages upgraded
  ansible.builtin.debug:
    msg: "{{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('No output') }}"

- name: Upgrade | Run apt autoremove
  ansible.builtin.shell: DEBIAN_FRONTEND=noninteractive apt-get autoremove -y
  when: apt_autoremove | bool
  changed_when: false
# The presence of /var/run/reboot-required decides whether the node reboots.
- name: Upgrade | Check if reboot is required
  register: reboot_required
  ansible.builtin.stat:
    path: /var/run/reboot-required

- name: Upgrade | Log reboot status
  ansible.builtin.debug:
    msg: "{{ 'Reboot required — rebooting node' if reboot_required.stat.exists else 'No reboot required — skipping reboot' }}"

# Reboot and wait up to node_rejoin_timeout seconds for the node to return.
- name: Upgrade | Reboot node
  when: reboot_required.stat.exists
  ansible.builtin.reboot:
    msg: "Rebooting for Proxmox upgrade"
    pre_reboot_delay: 5
    post_reboot_delay: 30
    reboot_timeout: "{{ node_rejoin_timeout }}"

# ── Wait for node to rejoin cluster ──────────────────────────────────────────
# Poll the node list every 10 seconds until current_node is reported online;
# the retry count is node_rejoin_timeout / 10.
- name: Upgrade | Wait for node to appear online in cluster
  delegate_to: localhost
  register: nodes_status
  ansible.builtin.uri:
    method: GET
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
    validate_certs: false
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
  until: >-
    nodes_status.json.data
    | selectattr('node', 'equalto', current_node)
    | selectattr('status', 'equalto', 'online')
    | list | length > 0
  delay: 10
  retries: "{{ (node_rejoin_timeout | int / 10) | int }}"

- name: Upgrade | Node {{ current_node }} back online
  ansible.builtin.debug:
    msg: "Node {{ current_node }} has rejoined the cluster"
# ── CEPH recovery wait ────────────────────────────────────────────────────────
# When CEPH is enabled: wait until `ceph health` reports HEALTH_OK or
# HEALTH_WARN, then clear the noout flag and log the final status.
- name: Upgrade | Wait for CEPH to recover
  when: ceph_enabled | bool
  block:
    # Polled every 10 seconds; the retry count is ceph_recover_timeout / 10.
    - name: Upgrade | CEPH | Wait for HEALTH_OK or HEALTH_WARN
      register: ceph_health_post
      changed_when: false
      ansible.builtin.shell: ceph health
      until: "'HEALTH_OK' in ceph_health_post.stdout or 'HEALTH_WARN' in ceph_health_post.stdout"
      delay: 10
      retries: "{{ (ceph_recover_timeout | int / 10) | int }}"

    - name: Upgrade | CEPH | Clear noout flag
      changed_when: true
      ansible.builtin.shell: ceph osd unset noout

    - name: Upgrade | CEPH | Log recovery status
      ansible.builtin.debug:
        msg: "CEPH recovered: {{ ceph_health_post.stdout }}"

# Closing summary: the apt result line (if any) plus whether a reboot happened.
- name: Upgrade | Node {{ current_node }} upgrade complete
  ansible.builtin.debug:
    msg: >-
      Node {{ current_node }} upgrade complete —
      {{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('packages updated') }}
      {{ '— rebooted' if reboot_required.stat.exists else '— no reboot needed' }}