testing new proxmox logic

2026-03-15 15:48:59 -07:00
parent 347a85b09d
commit ea2f00c098
34 changed files with 2391 additions and 21 deletions
--- a/roles/proxmox_ceph/defaults/main.yml
+++ b/roles/proxmox_ceph/defaults/main.yml
@@ -0,0 +1,18 @@
+---
+# =============================================================================
+# proxmox_ceph — defaults
+# Tunables for the proxmox_ceph role: which action to run and how the
+# health-polling tasks behave.
+# =============================================================================
+
+# Action: set_noout | clear_noout | status | check_health
+# The tasks compare this value literally; 'status' only logs cluster health.
+ceph_action: status
+
+# Health check settings
+# NOTE(review): ceph_health_timeout is not referenced by this role's tasks —
+# polling is bounded by ceph_health_retries x ceph_health_delay instead.
+ceph_health_timeout: 300        # seconds to wait for HEALTH_OK
+ceph_health_retries: 30
+ceph_health_delay: 10
+
+# Abort upgrade if CEPH is in error state
+ceph_abort_on_error: true
+
+# Warn but continue if CEPH has warnings
+ceph_warn_on_warning: true
--- a/roles/proxmox_ceph/meta/main.yml
+++ b/roles/proxmox_ceph/meta/main.yml
@@ -0,0 +1,11 @@
+---
+# Ansible Galaxy metadata for the proxmox_ceph role.
+galaxy_info:
+  role_name: proxmox_ceph
+  author: ansible-msp
+  description: 'MSP Proxmox automation — proxmox_ceph'
+  # Quoted so the version is a string, not a float.
+  min_ansible_version: '2.15'
+  platforms:
+    - name: Debian
+      versions: [bookworm]
+# This role declares no dependencies.
+dependencies: []
--- a/roles/proxmox_ceph/tasks/main.yml
+++ b/roles/proxmox_ceph/tasks/main.yml
@@ -0,0 +1,140 @@
+---
+# =============================================================================
+# proxmox_ceph — tasks
+# Manages CEPH noout flag and health checks during maintenance.
+# Skips gracefully if CEPH is not configured on this cluster.
+# =============================================================================
+
+# ── Detect CEPH ───────────────────────────────────────────────────────────────
+# Probe once with `ceph status`; a non-zero exit is treated as "no CEPH here"
+# rather than as a task failure.
+- name: "CEPH | Detect if CEPH is configured"
+  ansible.builtin.command:
+    cmd: ceph status
+  run_once: true
+  register: ceph_detect
+  failed_when: false
+  changed_when: false
+
+# Every later task in this role gates on this boolean.
+- name: "CEPH | Set CEPH enabled fact"
+  run_once: true
+  ansible.builtin.set_fact:
+    ceph_is_enabled: "{{ ceph_detect.rc == 0 }}"
+
+- name: "CEPH | Skip — CEPH not configured on this cluster"
+  run_once: true
+  when: not ceph_is_enabled
+  ansible.builtin.debug:
+    msg: "CEPH is not configured on this cluster — skipping all CEPH tasks."
+
+# ── CEPH status ───────────────────────────────────────────────────────────────
+# Fetch the JSON status once and keep the parsed document for the health
+# gates that follow.
+- name: "CEPH | Get cluster status"
+  when: ceph_is_enabled
+  run_once: true
+  ansible.builtin.command:
+    cmd: ceph status --format json
+  register: ceph_status_raw
+  changed_when: false
+
+- name: "CEPH | Parse status"
+  when: ceph_is_enabled
+  run_once: true
+  ansible.builtin.set_fact:
+    ceph_status: "{{ ceph_status_raw.stdout | from_json }}"
+
+# Only the plain 'status' action prints the health line.
+- name: "CEPH | Log health"
+  when:
+    - ceph_is_enabled
+    - ceph_action == 'status'
+  run_once: true
+  ansible.builtin.debug:
+    msg: "CEPH health: {{ ceph_status.health.status }}"
+
+# ── Health check ──────────────────────────────────────────────────────────────
+# Hard stop on HEALTH_ERR unless the caller opted out via ceph_abort_on_error.
+# These gates run for every ceph_action, not only 'status'.
+- name: "CEPH | Check health | Abort if HEALTH_ERR"
+  run_once: true
+  when:
+    - ceph_is_enabled
+    - ceph_abort_on_error
+    - ceph_status.health.status == 'HEALTH_ERR'
+  ansible.builtin.fail:
+    msg: >-
+      CEPH is in HEALTH_ERR state — aborting to prevent data loss.
+      Run 'ceph status' to investigate. Set ceph_abort_on_error=false to override.
+
+# HEALTH_WARN only produces a message listing the names of the active checks.
+- name: "CEPH | Check health | Warn on HEALTH_WARN"
+  run_once: true
+  when:
+    - ceph_is_enabled
+    - ceph_warn_on_warning
+    - ceph_status.health.status == 'HEALTH_WARN'
+  ansible.builtin.debug:
+    msg: >-
+      WARNING: CEPH is in HEALTH_WARN state.
+      Proceeding — set ceph_warn_on_warning=false to suppress this message.
+      Checks: {{ ceph_status.health.checks | default({}) | dict2items | map(attribute='key') | list }}
+
+# ── Set noout ─────────────────────────────────────────────────────────────────
+# Set the cluster-wide noout flag before maintenance. The task always reports
+# "changed"; it does not check whether the flag was already set.
+- name: "CEPH | Set noout flag"
+  when:
+    - ceph_is_enabled
+    - ceph_action == 'set_noout'
+  run_once: true
+  ansible.builtin.command:
+    cmd: ceph osd set noout
+  changed_when: true
+
+- name: "CEPH | Confirm noout set"
+  when:
+    - ceph_is_enabled
+    - ceph_action == 'set_noout'
+  run_once: true
+  ansible.builtin.debug:
+    msg: "✓ CEPH noout flag SET — OSDs will not be marked out during maintenance."
+
+# ── Clear noout ───────────────────────────────────────────────────────────────
+# Remove the noout flag once maintenance on the node is finished.
+- name: "CEPH | Clear noout flag"
+  ansible.builtin.command: ceph osd unset noout
+  changed_when: true
+  run_once: true
+  when:
+    - ceph_is_enabled
+    - ceph_action == 'clear_noout'
+
+# Poll until the cluster reports HEALTH_OK or HEALTH_WARN (HEALTH_WARN is
+# accepted despite the task name).
+# The rc guard stops a failed `ceph status` (empty or non-JSON stdout) from
+# being fed to from_json inside `until`; such attempts are simply retried.
+- name: "CEPH | Wait for HEALTH_OK after clearing noout"
+  ansible.builtin.command: ceph status --format json
+  register: ceph_recovery_check
+  changed_when: false
+  until: >-
+    (ceph_recovery_check.rc | default(1)) == 0
+    and (ceph_recovery_check.stdout | from_json).health.status in ['HEALTH_OK', 'HEALTH_WARN']
+  retries: "{{ ceph_health_retries }}"
+  delay: "{{ ceph_health_delay }}"
+  run_once: true
+  when:
+    - ceph_is_enabled
+    - ceph_action == 'clear_noout'
+
+# Report the health value from the last successful poll.
+- name: "CEPH | Log recovery status"
+  ansible.builtin.debug:
+    msg: >-
+      ✓ CEPH noout CLEARED —
+      health: {{ (ceph_recovery_check.stdout | from_json).health.status }}
+  when:
+    - ceph_is_enabled
+    - ceph_action == 'clear_noout'
+  run_once: true
+
+# ── check_health action ───────────────────────────────────────────────────────
+# Poll until the cluster reports HEALTH_OK or HEALTH_WARN.
+# The rc guard stops a failed `ceph status` (empty or non-JSON stdout) from
+# being fed to from_json inside `until`; such attempts are simply retried.
+- name: "CEPH | Wait for healthy state"
+  ansible.builtin.command: ceph status --format json
+  register: ceph_health_wait
+  changed_when: false
+  until: >-
+    (ceph_health_wait.rc | default(1)) == 0
+    and (ceph_health_wait.stdout | from_json).health.status in ['HEALTH_OK', 'HEALTH_WARN']
+  retries: "{{ ceph_health_retries }}"
+  delay: "{{ ceph_health_delay }}"
+  run_once: true
+  when:
+    - ceph_is_enabled
+    - ceph_action == 'check_health'
+
+# Report the health value from the last successful poll.
+- name: "CEPH | Health check result"
+  ansible.builtin.debug:
+    msg: "CEPH health: {{ (ceph_health_wait.stdout | from_json).health.status }}"
+  when:
+    - ceph_is_enabled
+    - ceph_action == 'check_health'
+  run_once: true
--- a/roles/proxmox_config_backup/defaults/main.yml
+++ b/roles/proxmox_config_backup/defaults/main.yml
@@ -0,0 +1,33 @@
+---
+# =============================================================================
+# proxmox_config_backup — defaults
+# =============================================================================
+
+# Backup destination: local | sftp
+# git destination removed pending secure implementation (TODO: encrypt secrets)
+backup_destination: local
+
+# Local backup settings
+# backup_local_dir is created on the node with mode 0700.
+# backup_local_keep is the number of newest tarballs per node kept by rotation.
+backup_local_dir: /var/backups/proxmox-config
+backup_local_keep: 10
+
+# SFTP settings (used when backup_destination: sftp)
+# host and user are mandatory for sftp; backup_sftp_key is an optional
+# identity file path passed to `sftp -i` when non-empty.
+backup_sftp_host: ""
+backup_sftp_user: ""
+backup_sftp_key: ""
+backup_sftp_remote_dir: "/backups/proxmox"
+
+# What to include in the backup tarball
+backup_paths_proxmox:
+  - /etc/pve
+  - /etc/network/interfaces
+  - /etc/hosts
+  - /etc/hostname
+  - /etc/apt/sources.list
+  - /etc/apt/sources.list.d
+
+# NOTE(review): backup_paths_xcpng is not referenced by this role's tasks,
+# which always use backup_paths_proxmox — confirm whether it is used elsewhere.
+backup_paths_xcpng:
+  - /etc/xcp-ng
+  - /etc/network/interfaces
+  - /etc/hosts
+  - /etc/hostname
--- a/roles/proxmox_config_backup/meta/main.yml
+++ b/roles/proxmox_config_backup/meta/main.yml
@@ -0,0 +1,11 @@
+---
+# Ansible Galaxy metadata for the proxmox_config_backup role.
+galaxy_info:
+  role_name: proxmox_config_backup
+  author: ansible-msp
+  description: 'MSP Proxmox automation — proxmox_config_backup'
+  # Quoted so the version is a string, not a float.
+  min_ansible_version: '2.15'
+  platforms:
+    - name: Debian
+      versions: [bookworm]
+# This role declares no dependencies.
+dependencies: []
--- a/roles/proxmox_config_backup/tasks/main.yml
+++ b/roles/proxmox_config_backup/tasks/main.yml
@@ -0,0 +1,113 @@
+---
+# =============================================================================
+# proxmox_config_backup — tasks
+# Creates a tarball of critical Proxmox config files and stores it locally
+# or transfers via SFTP. Git destination is a TODO pending secure handling
+# of sensitive files (SSL keys, shadow.cfg, etc).
+#
+# Required vars:
+#   current_node — the node being backed up (for filename)
+#   client_id    — client identifier (for filename)
+# =============================================================================
+
+# The filename needs ansible_date_time; gather it only if it is missing.
+- name: "Backup | {{ current_node }} | Gather date/time facts"
+  when: ansible_date_time is not defined
+  ansible.builtin.setup:
+    gather_subset: [date_time]
+
+# Name pattern: proxmox_<client>_<node>_<YYYY-MM-DD>. Only the date is used,
+# so a second run on the same day reuses the same name.
+- name: "Backup | {{ current_node }} | Set backup filename"
+  ansible.builtin.set_fact:
+    backup_filename: "proxmox_{{ client_id | lower | replace('-', '_') }}_{{ current_node }}_{{ ansible_date_time.date }}"
+
+- name: "Backup | {{ current_node }} | Set backup paths"
+  ansible.builtin.set_fact:
+    backup_paths: "{{ backup_paths_proxmox }}"
+
+# ── Create tarball on node ────────────────────────────────────────────────────
+# Archive the configured paths into /tmp on the node.
+# - umask 077: the archive holds /etc/pve content and must not be readable by
+#   other users while it sits in /tmp.
+# - --ignore-failed-read is kept so paths that are missing or unreadable on
+#   this node are reported as warnings (per the GNU tar manual) rather than
+#   failing the backup.
+# - tar errors are no longer discarded: stderr stays visible, and any exit
+#   status above 1 fails the task. Status 1 ("some files differ", e.g. a file
+#   changed while being read) is tolerated.
+- name: "Backup | {{ current_node }} | Create config tarball"
+  ansible.builtin.shell: |
+    umask 077
+    tar czf {{ ('/tmp/' ~ backup_filename ~ '.tar.gz') | quote }} \
+      --ignore-failed-read \
+      --dereference \
+      {{ backup_paths | map('quote') | join(' ') }}
+  changed_when: true
+  register: backup_tarball
+  failed_when: backup_tarball.rc > 1
+
+# ── Local backup ──────────────────────────────────────────────────────────────
+# Local destination: keep the tarball in a root-only directory on the node.
+- name: "Backup | {{ current_node }} | Local | Ensure backup dir exists"
+  when: backup_destination == 'local'
+  ansible.builtin.file:
+    state: directory
+    path: "{{ backup_local_dir }}"
+    mode: "0700"
+
+# Despite the task name this is a copy; the /tmp original is removed by the
+# final cleanup task of this role.
+- name: "Backup | {{ current_node }} | Local | Move tarball to backup dir"
+  when: backup_destination == 'local'
+  ansible.builtin.copy:
+    remote_src: true
+    src: "/tmp/{{ backup_filename }}.tar.gz"
+    dest: "{{ backup_local_dir }}/{{ backup_filename }}.tar.gz"
+    mode: "0600"
+
+# Keep the newest backup_local_keep tarballs for this node and delete the rest.
+# The task reports "not changed" even when files are removed.
+- name: "Backup | {{ current_node }} | Local | Rotate old backups"
+  when: backup_destination == 'local'
+  ansible.builtin.shell: |
+    ls -1t {{ backup_local_dir }}/proxmox_*_{{ current_node }}_*.tar.gz 2>/dev/null \
+      | tail -n +{{ (backup_local_keep | int) + 1 }} \
+      | xargs -r rm -f
+  changed_when: false
+
+- name: "Backup | {{ current_node }} | Local | Log result"
+  when: backup_destination == 'local'
+  ansible.builtin.debug:
+    msg: "✓ Config backed up locally: {{ backup_local_dir }}/{{ backup_filename }}.tar.gz"
+
+# ── SFTP backup ───────────────────────────────────────────────────────────────
+# SFTP destination: host and user are mandatory, the key is optional.
+- name: "Backup | {{ current_node }} | SFTP | Validate required vars"
+  when:
+    - backup_destination == 'sftp'
+    - backup_sftp_host == '' or backup_sftp_user == ''
+  ansible.builtin.fail:
+    msg: "SFTP backup requires backup_sftp_host and backup_sftp_user to be set."
+
+# Pull the tarball from the node to the same /tmp path on the controller.
+- name: "Backup | {{ current_node }} | SFTP | Fetch tarball to controller"
+  when: backup_destination == 'sftp'
+  ansible.builtin.fetch:
+    flat: true
+    src: "/tmp/{{ backup_filename }}.tar.gz"
+    dest: "/tmp/{{ backup_filename }}.tar.gz"
+
+# Upload the fetched tarball from the controller with the sftp CLI.
+# `-b -` reads the commands from stdin as a batch file: per sftp(1) the
+# client then aborts on the first failing command (cd/put) and returns a
+# non-zero status, so a failed upload fails this task instead of passing.
+# NOTE(review): StrictHostKeyChecking=no accepts any host key for a transfer
+# that carries cluster secrets — consider pinning the host key.
+- name: "Backup | {{ current_node }} | SFTP | Transfer to remote host"
+  ansible.builtin.shell: |
+    sftp_opts="-o StrictHostKeyChecking=no -o BatchMode=yes"
+    {% if backup_sftp_key != '' %}
+    sftp_opts="$sftp_opts -i {{ backup_sftp_key }}"
+    {% endif %}
+    sftp -b - $sftp_opts {{ backup_sftp_user }}@{{ backup_sftp_host }} << EOF
+    cd {{ backup_sftp_remote_dir }}
+    put /tmp/{{ backup_filename }}.tar.gz
+    EOF
+  delegate_to: localhost
+  changed_when: true
+  when: backup_destination == 'sftp'
+
+# Remove the copy that the fetch task left on the controller.
+- name: "Backup | {{ current_node }} | SFTP | Clean up local temp tarball"
+  when: backup_destination == 'sftp'
+  delegate_to: localhost
+  ansible.builtin.file:
+    state: absent
+    path: "/tmp/{{ backup_filename }}.tar.gz"
+
+- name: "Backup | {{ current_node }} | SFTP | Log result"
+  when: backup_destination == 'sftp'
+  ansible.builtin.debug:
+    msg: "✓ Config backed up via SFTP: {{ backup_sftp_host }}:{{ backup_sftp_remote_dir }}/{{ backup_filename }}.tar.gz"
+
+# ── Cleanup ───────────────────────────────────────────────────────────────────
+# Runs for every destination. NOTE(review): it is skipped if an earlier task
+# fails, which leaves the tarball in /tmp on the node.
+- name: "Backup | {{ current_node }} | Clean up temp tarball on node"
+  ansible.builtin.file:
+    state: absent
+    path: "/tmp/{{ backup_filename }}.tar.gz"
--- a/roles/proxmox_drain/defaults/main.yml
+++ b/roles/proxmox_drain/defaults/main.yml
@@ -0,0 +1,34 @@
+---
+# =============================================================================
+# proxmox_drain — defaults
+# =============================================================================
+
+# Target selection strategy: resources | explicit
+# resources = pick node with most available mem+cpu
+# explicit   = use drain_target_node variable
+drain_target_strategy: resources
+
+# Explicit target node (only used when drain_target_strategy: explicit)
+drain_target_node: ""
+
+# Resource weighting for target scoring (must sum to 1.0)
+# The scoring task multiplies free memory (bytes) by the mem weight and the
+# free CPU fraction (scaled by 1e9) by the cpu weight.
+drain_score_mem_weight: 0.6
+drain_score_cpu_weight: 0.4
+
+# Migration behaviour
+# NOTE(review): drain_online and drain_shutdown_fallback are not referenced by
+# this role's tasks/main.yml — confirm whether they are consumed elsewhere.
+# drain_vm_shutdown_timeout is passed to `pct migrate --timeout`.
+drain_online: true               # live migrate running VMs
+drain_shutdown_fallback: true    # shutdown VM if live migrate fails
+drain_vm_shutdown_timeout: 120   # seconds to wait for graceful shutdown
+drain_lxc_restart: true          # restart LXC after migration
+
+# State file — written to Semaphore host for restore mode
+# NOTE(review): the default lives under /tmp, which may not survive a reboot
+# of that host — confirm this is acceptable for restore.
+drain_state_dir: "/tmp/proxmox_drain_state"
+
+# Filtering — skip VMs matching these tags (comma-separated string in PVE)
+# The list is matched against each guest's `tags` string.
+# NOTE(review): PVE appears to separate tags with ';' — confirm.
+drain_exclude_tags:
+  - nomigrate
+  - pinned
+
+# API connection (inherited from inventory)
+# validate_certs: false disables TLS certificate verification for API calls.
+api_port: 8006
+validate_certs: false
--- a/roles/proxmox_drain/meta/main.yml
+++ b/roles/proxmox_drain/meta/main.yml
@@ -0,0 +1,11 @@
+---
+# Ansible Galaxy metadata for the proxmox_drain role.
+galaxy_info:
+  role_name: proxmox_drain
+  author: ansible-msp
+  description: 'MSP Proxmox automation — proxmox_drain'
+  # Quoted so the version is a string, not a float.
+  min_ansible_version: '2.15'
+  platforms:
+    - name: Debian
+      versions: [bookworm]
+# This role declares no dependencies.
+dependencies: []
--- a/roles/proxmox_drain/tasks/main.yml
+++ b/roles/proxmox_drain/tasks/main.yml
@@ -0,0 +1,217 @@
+---
+# =============================================================================
+# proxmox_drain — tasks
+# Migrates all VMs/LXCs off current_node to the best available target.
+# Writes a state file so proxmox_restore can return VMs to origin.
+#
+# Required vars:
+#   current_node   — the node being drained
+# =============================================================================
+
+# ── Discover guests on this node ──────────────────────────────────────────────
+# Ask the Proxmox API (from the controller) for the guests on current_node.
+- name: "Drain | {{ current_node }} | Discover guests"
+  delegate_to: localhost
+  register: drain_node_guests
+  community.proxmox.proxmox_vm_info:
+    node: "{{ current_node }}"
+    api_host: "{{ api_host }}"
+    api_port: "{{ api_port }}"
+    api_user: "{{ api_user }}"
+    api_token_id: "{{ api_token_id }}"
+    api_token_secret: "{{ api_token_secret }}"
+    validate_certs: "{{ validate_certs }}"
+
+# Split by guest type; templates are dropped from the KVM list.
+- name: "Drain | {{ current_node }} | Separate KVM and LXC guests"
+  delegate_to: localhost
+  ansible.builtin.set_fact:
+    drain_kvm_guests: "{{ drain_node_guests.proxmox_vms | selectattr('type', 'equalto', 'qemu') | rejectattr('template', 'equalto', true) | list }}"
+    drain_lxc_guests: "{{ drain_node_guests.proxmox_vms | selectattr('type', 'equalto', 'lxc') | list }}"
+
+# Drop guests that carry any tag listed in drain_exclude_tags.
+# Guests without a `tags` attribute are always kept and come first.
+# drain_exclude_regex matches whole tags only (delimited by start/end of the
+# string or by ';', ',' or space), so 'pinned' no longer excludes 'unpinned'.
+# With an empty drain_exclude_tags the pattern '(?!)' never matches, so no
+# guest is excluded; the previous empty pattern matched every tagged guest.
+- name: "Drain | {{ current_node }} | Filter excluded tags"
+  vars:
+    drain_exclude_regex: >-
+      {{ ('(^|[;, ])(' ~ (drain_exclude_tags | map('regex_escape') | join('|')) ~ ')($|[;, ])')
+         if (drain_exclude_tags | length > 0) else '(?!)' }}
+  ansible.builtin.set_fact:
+    drain_kvm_guests: >-
+      {{ (drain_kvm_guests
+          | rejectattr('tags', 'defined')
+          | list)
+         + (drain_kvm_guests
+            | selectattr('tags', 'defined')
+            | rejectattr('tags', 'search', drain_exclude_regex)
+            | list) }}
+    drain_lxc_guests: >-
+      {{ (drain_lxc_guests
+          | rejectattr('tags', 'defined')
+          | list)
+         + (drain_lxc_guests
+            | selectattr('tags', 'defined')
+            | rejectattr('tags', 'search', drain_exclude_regex)
+            | list) }}
+  delegate_to: localhost
+
+# Summarise what is about to be migrated.
+- name: "Drain | {{ current_node }} | Log guest inventory"
+  ansible.builtin.debug:
+    msg: >-
+      {{ current_node }} has
+      {{ drain_kvm_guests | length }} KVM guest(s) and
+      {{ drain_lxc_guests | length }} LXC guest(s) to migrate.
+      VMIDs: {{ (drain_kvm_guests + drain_lxc_guests) | map(attribute='vmid') | list }}
+
+# ── Skip if nothing to migrate ────────────────────────────────────────────────
+- name: "Drain | {{ current_node }} | Skip — no guests to migrate"
+  ansible.builtin.debug:
+    msg: "Node {{ current_node }} has no guests — skipping drain."
+  when:
+    - drain_kvm_guests | length == 0
+    - drain_lxc_guests | length == 0
+
+# end_host (not end_play): an empty node must only stop processing for the
+# host that evaluated this condition. end_play ends the play for every host.
+# NOTE(review): this still skips any later tasks the play has for this host —
+# confirm the caller does not schedule work after the drain role.
+- name: "Drain | {{ current_node }} | End play if no guests"
+  ansible.builtin.meta: end_host
+  when:
+    - drain_kvm_guests | length == 0
+    - drain_lxc_guests | length == 0
+
+# ── Select migration target ───────────────────────────────────────────────────
+# Fetch per-node resource usage from the API (resources strategy only).
+- name: "Drain | {{ current_node }} | Get all node resource info"
+  community.proxmox.proxmox_node_info:
+    api_host: "{{ api_host }}"
+    api_user: "{{ api_user }}"
+    api_token_id: "{{ api_token_id }}"
+    api_token_secret: "{{ api_token_secret }}"
+    api_port: "{{ api_port }}"
+    validate_certs: "{{ validate_certs }}"
+  register: drain_all_nodes
+  delegate_to: localhost
+  when: drain_target_strategy == 'resources'
+
+# Build a list of {node, score, free_mem, free_cpu} for every online node
+# other than current_node, best score first.
+# Every Jinja tag uses whitespace control ({%- ... -%}) so the template renders
+# to nothing but the list literal. Without it the folded block emits leading
+# whitespace and the result stays a string instead of becoming a list.
+# score = free_mem(bytes) * mem_weight + free_cpu(fraction) * 1e9 * cpu_weight
+- name: "Drain | {{ current_node }} | Score nodes by available resources"
+  ansible.builtin.set_fact:
+    drain_scored_nodes: >-
+      {%- set candidates = [] -%}
+      {%- for node in drain_all_nodes.proxmox_nodes -%}
+        {%- if node.status == 'online' and node.node != current_node -%}
+          {%- set free_mem = node.maxmem - node.mem -%}
+          {%- set free_cpu = 1.0 - (node.cpu | default(0)) -%}
+          {%- set score = (free_mem * drain_score_mem_weight | float) + (free_cpu * 1000000000 * drain_score_cpu_weight | float) -%}
+          {%- set _ = candidates.append({'node': node.node, 'score': score, 'free_mem': free_mem, 'free_cpu': free_cpu}) -%}
+        {%- endif -%}
+      {%- endfor -%}
+      {{ candidates | sort(attribute='score', reverse=true) }}
+  delegate_to: localhost
+  when: drain_target_strategy == 'resources'
+
+# Pick the best-scored node. With no candidates, fall back to a placeholder
+# with an empty node name so the fail task below reports the problem; the
+# previous `(… | first).node` raised an undefined-variable error instead.
+- name: "Drain | {{ current_node }} | Set migration target (resources)"
+  vars:
+    drain_no_candidate:
+      node: ""
+      free_mem: 0
+      free_cpu: 0.0
+  ansible.builtin.set_fact:
+    drain_resolved_target: "{{ drain_scored_nodes | first | default(drain_no_candidate) }}"
+    drain_target: "{{ (drain_scored_nodes | first | default(drain_no_candidate)).node }}"
+  delegate_to: localhost
+  when: drain_target_strategy == 'resources'
+
+- name: "Drain | {{ current_node }} | Set migration target (explicit)"
+  ansible.builtin.set_fact:
+    drain_target: "{{ drain_target_node }}"
+  delegate_to: localhost
+  when: drain_target_strategy == 'explicit'
+
+# default('') also covers an unknown drain_target_strategy, where neither
+# task above ran and drain_target is undefined.
+- name: "Drain | {{ current_node }} | Fail if no target available"
+  ansible.builtin.fail:
+    msg: >-
+      No valid migration target found for node {{ current_node }}.
+      All other nodes may be offline or no nodes configured.
+  when: drain_target | default('') == ''
+  delegate_to: localhost
+
+# Free-resource figures exist only for the resources strategy.
+- name: "Drain | {{ current_node }} | Log migration target"
+  ansible.builtin.debug:
+    msg: >-
+      Migration target for {{ current_node }}: {{ drain_target }}
+      {% if drain_target_strategy == 'resources' %}
+      (free_mem={{ (drain_resolved_target.free_mem / 1073741824) | round(1) }}GB,
+       free_cpu={{ (drain_resolved_target.free_cpu * 100) | round(1) }}%)
+      {% endif %}
+
+# ── Write state file for restore ──────────────────────────────────────────────
+# The state directory lives on the controller (delegate_to: localhost).
+- name: "Drain | {{ current_node }} | Ensure state directory exists"
+  ansible.builtin.file:
+    path: "{{ drain_state_dir }}"
+    state: directory
+    mode: '0700'
+  delegate_to: localhost
+
+# Record every guest about to be migrated, tagged with its origin node, as
+# <node>_<timestamp>.json.
+# The file is always written: if date/time facts were not gathered, the
+# timestamp falls back to now() in the same format. Previously the task was
+# silently skipped in that case and the drain left no state to restore from.
+- name: "Drain | {{ current_node }} | Write VM origin state"
+  ansible.builtin.copy:
+    content: >-
+      {{ (drain_kvm_guests + drain_lxc_guests)
+         | map('combine', {'origin_node': current_node})
+         | list
+         | to_nice_json }}
+    dest: "{{ drain_state_dir }}/{{ current_node }}_{{ (ansible_date_time | default({})).iso8601_basic_short | default(now(fmt='%Y%m%dT%H%M%S')) }}.json"
+    mode: '0600'
+  delegate_to: localhost
+
+# ── Migrate KVM guests ────────────────────────────────────────────────────────
+# Migrate each KVM guest with `qm migrate`; running guests get --online.
+# A non-zero exit for any guest fails the task.
+- name: "Drain | {{ current_node }} | KVM | Live migrate to {{ drain_target }}"
+  loop: "{{ drain_kvm_guests }}"
+  loop_control:
+    label: "{{ item.name }} (VMID {{ item.vmid }}) — {{ item.status }}"
+  ansible.builtin.command: >
+    qm migrate {{ item.vmid }} {{ drain_target }}
+    {% if item.status == 'running' %}--online{% endif %}
+    --with-local-disks 0
+  register: drain_kvm_results
+  changed_when: true
+  failed_when:
+    - drain_kvm_results.rc is defined
+    - drain_kvm_results.rc != 0
+
+# Re-query the target node's guest list.
+# NOTE(review): drain_verify_guests is registered but never inspected in this
+# file, so this step does not actually assert that the guests arrived.
+- name: "Drain | {{ current_node }} | KVM | Verify guests moved"
+  when: drain_kvm_guests | length > 0
+  delegate_to: localhost
+  register: drain_verify_guests
+  community.proxmox.proxmox_vm_info:
+    node: "{{ drain_target }}"
+    api_host: "{{ api_host }}"
+    api_port: "{{ api_port }}"
+    api_user: "{{ api_user }}"
+    api_token_id: "{{ api_token_id }}"
+    api_token_secret: "{{ api_token_secret }}"
+    validate_certs: "{{ validate_certs }}"
+
+- name: "Drain | {{ current_node }} | KVM | Log migration results"
+  when: drain_kvm_guests | length > 0
+  ansible.builtin.debug:
+    msg: >-
+      KVM migrations complete —
+      {{ drain_kvm_guests | length }} guest(s) moved to {{ drain_target }}.
+
+# ── Migrate LXC guests ────────────────────────────────────────────────────────
+# Migrate each container with `pct migrate`. --restart is added when
+# drain_lxc_restart is true; --timeout is always passed.
+- name: "Drain | {{ current_node }} | LXC | Migrate to {{ drain_target }}"
+  loop: "{{ drain_lxc_guests }}"
+  loop_control:
+    label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }}) — {{ item.status }}"
+  ansible.builtin.command: >
+    pct migrate {{ item.vmid }} {{ drain_target }}
+    {% if drain_lxc_restart %}--restart{% endif %}
+    --timeout {{ drain_vm_shutdown_timeout }}
+  register: drain_lxc_results
+  changed_when: true
+  failed_when:
+    - drain_lxc_results.rc is defined
+    - drain_lxc_results.rc != 0
+
+- name: "Drain | {{ current_node }} | LXC | Log migration results"
+  when: drain_lxc_guests | length > 0
+  ansible.builtin.debug:
+    msg: >-
+      LXC migrations complete —
+      {{ drain_lxc_guests | length }} container(s) moved to {{ drain_target }}.
+
+# ── Final summary ─────────────────────────────────────────────────────────────
+# Reached only when every migration command above exited 0.
+- name: "Drain | {{ current_node }} | Complete"
+  ansible.builtin.debug:
+    msg: >-
+      ✓ Node {{ current_node }} drained —
+      {{ drain_kvm_guests | length }} KVM +
+      {{ drain_lxc_guests | length }} LXC guests migrated to {{ drain_target }}.
--- a/roles/proxmox_ha/defaults/main.yml
+++ b/roles/proxmox_ha/defaults/main.yml
@@ -0,0 +1,17 @@
+---
+# =============================================================================
+# proxmox_ha — defaults
+# =============================================================================
+
+# Action: status | disable | enable | migrate
+# In this role's tasks, 'disable' puts current_node INTO HA maintenance mode
+# and 'enable' takes it out again.
+# NOTE(review): no task handles 'migrate' — confirm whether it is planned.
+ha_action: status
+
+# Node to disable/enable HA management for (used with disable/enable)
+# Default: current_node (set by caller)
+
+# Timeout waiting for HA manager to acknowledge
+# Converted to a retry count as ha_timeout / 5 (the wait polls every 5 s).
+ha_timeout: 60
+
+# API connection (inherited from inventory)
+# NOTE(review): this role's tasks only call ha-manager on the node; these two
+# values are not referenced there.
+api_port: 8006
+validate_certs: false
--- a/roles/proxmox_ha/meta/main.yml
+++ b/roles/proxmox_ha/meta/main.yml
@@ -0,0 +1,11 @@
+---
+# Ansible Galaxy metadata for the proxmox_ha role.
+galaxy_info:
+  role_name: proxmox_ha
+  author: ansible-msp
+  description: 'MSP Proxmox automation — proxmox_ha'
+  # Quoted so the version is a string, not a float.
+  min_ansible_version: '2.15'
+  platforms:
+    - name: Debian
+      versions: [bookworm]
+# This role declares no dependencies.
+dependencies: []
--- a/roles/proxmox_ha/tasks/main.yml
+++ b/roles/proxmox_ha/tasks/main.yml
@@ -0,0 +1,96 @@
+---
+# =============================================================================
+# proxmox_ha — tasks
+# Reports HA status and toggles HA node maintenance mode via ha-manager.
+# Proxmox HA is self-managing during migrations — this role handles
+# cases where you need to explicitly pause or resume HA for a node.
+# =============================================================================
+
+# ── Detect HA ─────────────────────────────────────────────────────────────────
+# HA counts as configured when `ha-manager status` exits 0 and prints output.
+- name: "HA | Detect if HA is configured"
+  ansible.builtin.command:
+    cmd: ha-manager status
+  run_once: true
+  register: ha_detect
+  failed_when: false
+  changed_when: false
+
+- name: "HA | Set HA enabled fact"
+  run_once: true
+  ansible.builtin.set_fact:
+    ha_is_enabled: "{{ ha_detect.rc == 0 and ha_detect.stdout != '' }}"
+
+- name: "HA | Skip — HA not configured"
+  run_once: true
+  when: not ha_is_enabled
+  ansible.builtin.debug:
+    msg: "HA is not configured on this cluster — skipping."
+
+# ── HA status ─────────────────────────────────────────────────────────────────
+# The 'status' action re-reads and prints the ha-manager status lines.
+- name: "HA | Get status"
+  run_once: true
+  when:
+    - ha_is_enabled
+    - ha_action == 'status'
+  ansible.builtin.command:
+    cmd: ha-manager status
+  register: ha_status
+  changed_when: false
+
+- name: "HA | Log status"
+  run_once: true
+  when:
+    - ha_is_enabled
+    - ha_action == 'status'
+  ansible.builtin.debug:
+    msg: "{{ ha_status.stdout_lines }}"
+
+# ── Put node in maintenance mode ──────────────────────────────────────────────
+# Proxmox uses node maintenance mode via ha-manager to gracefully migrate
+# HA-managed VMs before maintenance. This is the correct HA-aware drain.
+# Note the naming: ha_action == 'disable' ENABLES maintenance mode.
+- name: "HA | Enable maintenance mode for {{ current_node }}"
+  ansible.builtin.command: >
+    ha-manager crm-command node-maintenance enable {{ current_node }}
+  changed_when: true
+  run_once: true
+  when:
+    - ha_is_enabled
+    - ha_action == 'disable'
+
+# Wait until the status output shows current_node itself in maintenance, i.e.
+# "<node> (maintenance…" preceded by a space. The earlier check accepted the
+# word "maintenance" anywhere, so a different node already in maintenance
+# satisfied it at once.
+# Character classes ([(]) are used instead of backslash escapes to avoid
+# escape handling inside the Jinja string literal.
+# NOTE(review): assumes status lines look like "lrm <node> (maintenance mode,
+# …)" — confirm on the target PVE version.
+- name: "HA | Wait for {{ current_node }} maintenance mode to be acknowledged"
+  ansible.builtin.command: ha-manager status
+  register: ha_maintenance_check
+  changed_when: false
+  until: >-
+    ha_maintenance_check.stdout is search(
+      ' ' ~ (current_node | regex_escape) ~ ' [(]maintenance'
+    )
+  retries: "{{ (ha_timeout / 5) | int }}"
+  delay: 5
+  run_once: true
+  when:
+    - ha_is_enabled
+    - ha_action == 'disable'
+
+- name: "HA | Maintenance mode enabled for {{ current_node }}"
+  ansible.builtin.debug:
+    msg: "✓ HA maintenance mode enabled for {{ current_node }} — HA will not restart VMs on this node."
+  when:
+    - ha_is_enabled
+    - ha_action == 'disable'
+  run_once: true
+
+# ── Resume HA management ──────────────────────────────────────────────────────
+# ha_action == 'enable' takes the node OUT of maintenance mode. Unlike the
+# 'disable' path there is no wait for acknowledgement.
+- name: "HA | Disable maintenance mode for {{ current_node }}"
+  run_once: true
+  when:
+    - ha_is_enabled
+    - ha_action == 'enable'
+  ansible.builtin.command: >
+    ha-manager crm-command node-maintenance disable {{ current_node }}
+  changed_when: true
+
+- name: "HA | Maintenance mode disabled for {{ current_node }}"
+  run_once: true
+  when:
+    - ha_is_enabled
+    - ha_action == 'enable'
+  ansible.builtin.debug:
+    msg: "✓ HA management resumed for {{ current_node }}."
--- a/roles/proxmox_preflight/defaults/main.yml
+++ b/roles/proxmox_preflight/defaults/main.yml
@@ -0,0 +1,18 @@
+---
+# =============================================================================
+# proxmox_preflight — defaults
+# =============================================================================
+
+# Minimum number of nodes that must be online before proceeding
+# Compared against the count of nodes the API reports as 'online'.
+preflight_min_nodes_online: 1
+
+# Abort if any node is offline (set false to warn only)
+preflight_abort_on_offline_node: true
+
+# Quorum check via pvecm (SSH)
+# When false, the `pvecm status` quorum task is skipped for cluster members.
+preflight_check_quorum: true
+
+# API connection (inherited from inventory)
+# api_host, api_port, api_user, api_token_id, api_token_secret
+# validate_certs: false disables TLS certificate verification for API calls.
+api_port: 8006
+validate_certs: false
--- a/roles/proxmox_preflight/meta/main.yml
+++ b/roles/proxmox_preflight/meta/main.yml
@@ -0,0 +1,11 @@
+---
+# Ansible Galaxy metadata for the proxmox_preflight role.
+galaxy_info:
+  role_name: proxmox_preflight
+  author: ansible-msp
+  description: 'MSP Proxmox automation — proxmox_preflight'
+  # Quoted so the version is a string, not a float.
+  min_ansible_version: '2.15'
+  platforms:
+    - name: Debian
+      versions: [bookworm]
+# This role declares no dependencies.
+dependencies: []
--- a/roles/proxmox_preflight/tasks/main.yml
+++ b/roles/proxmox_preflight/tasks/main.yml
@@ -0,0 +1,113 @@
+---
+# =============================================================================
+# proxmox_preflight — tasks
+# Determines: standalone vs cluster, quorum, and per-node online status.
+# Sets facts: proxmox_is_cluster, proxmox_cluster_nodes, proxmox_node_count,
+#             proxmox_online_nodes, proxmox_offline_nodes
+# =============================================================================
+
+# ── Detect standalone vs cluster ──────────────────────────────────────────────
+# `pvecm status` exiting 0 is taken to mean the node is a cluster member; any
+# other exit is treated as standalone, not as a failure.
+- name: "Preflight | Detect cluster membership"
+  ansible.builtin.command:
+    cmd: pvecm status
+  register: pvecm_status
+  failed_when: false
+  changed_when: false
+
+- name: "Preflight | Set cluster mode fact"
+  delegate_to: localhost
+  ansible.builtin.set_fact:
+    proxmox_is_cluster: "{{ pvecm_status.rc == 0 }}"
+
+- name: "Preflight | Log topology"
+  ansible.builtin.debug:
+    msg: >-
+      Node {{ inventory_hostname }} is running in
+      {{ 'CLUSTER' if proxmox_is_cluster else 'STANDALONE' }} mode.
+
+# ── Standalone path ───────────────────────────────────────────────────────────
+# A standalone node only has to answer an Ansible ping.
+- name: "Preflight | Standalone | Verify host is reachable"
+  when: not proxmox_is_cluster
+  ansible.builtin.ping:
+
+- name: "Preflight | Standalone | Health check passed"
+  when: not proxmox_is_cluster
+  ansible.builtin.debug:
+    msg: "Standalone node {{ inventory_hostname }} is reachable — preflight passed."
+
+# ── Cluster path ──────────────────────────────────────────────────────────────
+# Fail unless `pvecm status` reports the "Quorate:" field as "Yes".
+# The earlier test ('Quorate' in stdout) could never fail: the field label
+# "Quorate:" is printed whether the value is Yes or No.
+- name: "Preflight | Cluster | Check quorum"
+  ansible.builtin.command: pvecm status
+  register: quorum_check
+  changed_when: false
+  failed_when: quorum_check.stdout is not search('Quorate:[ ]+Yes')
+  when: proxmox_is_cluster and preflight_check_quorum
+  run_once: true
+
+# Query the node list once through the API, from the controller.
+- name: "Preflight | Cluster | Get all node info via API"
+  when: proxmox_is_cluster
+  run_once: true
+  delegate_to: localhost
+  register: proxmox_all_nodes
+  community.proxmox.proxmox_node_info:
+    api_host: "{{ api_host }}"
+    api_port: "{{ api_port }}"
+    api_user: "{{ api_user }}"
+    api_token_id: "{{ api_token_id }}"
+    api_token_secret: "{{ api_token_secret }}"
+    validate_certs: "{{ validate_certs }}"
+
+# Derive the full, online and offline node lists from the API answer.
+- name: "Preflight | Cluster | Set node list facts"
+  when: proxmox_is_cluster
+  run_once: true
+  delegate_to: localhost
+  ansible.builtin.set_fact:
+    proxmox_cluster_nodes: "{{ proxmox_all_nodes.proxmox_nodes }}"
+    proxmox_node_count: "{{ proxmox_all_nodes.proxmox_nodes | length }}"
+    proxmox_online_nodes: "{{ proxmox_all_nodes.proxmox_nodes | selectattr('status', 'equalto', 'online') | list }}"
+    proxmox_offline_nodes: "{{ proxmox_all_nodes.proxmox_nodes | rejectattr('status', 'equalto', 'online') | list }}"
+
+# Offline nodes are always reported; whether they abort is decided below.
+- name: "Preflight | Cluster | Warn about offline nodes"
+  run_once: true
+  when:
+    - proxmox_is_cluster
+    - proxmox_offline_nodes | length > 0
+  ansible.builtin.debug:
+    msg: >-
+      WARNING: The following nodes are offline:
+      {{ proxmox_offline_nodes | map(attribute='node') | list }}
+
+- name: "Preflight | Cluster | Abort if offline nodes detected"
+  run_once: true
+  when:
+    - proxmox_is_cluster
+    - preflight_abort_on_offline_node
+    - proxmox_offline_nodes | length > 0
+  ansible.builtin.fail:
+    msg: >-
+      Preflight failed — {{ proxmox_offline_nodes | length }} node(s) are offline:
+      {{ proxmox_offline_nodes | map(attribute='node') | list }}.
+      Set preflight_abort_on_offline_node=false to proceed anyway.
+
+# Independent of the abort switch: enforce the configured minimum.
+- name: "Preflight | Cluster | Verify minimum online node count"
+  run_once: true
+  when:
+    - proxmox_is_cluster
+    - proxmox_online_nodes | length < preflight_min_nodes_online | int
+  ansible.builtin.fail:
+    msg: >-
+      Only {{ proxmox_online_nodes | length }} node(s) online.
+      Minimum required: {{ preflight_min_nodes_online }}.
+
+# Final success line for cluster members. The quorum wording now reflects
+# whether the quorum check ran; it used to say "quorum confirmed" even when
+# preflight_check_quorum was false.
+- name: "Preflight | Cluster | Health check passed"
+  ansible.builtin.debug:
+    msg: >-
+      Cluster preflight OK —
+      {{ proxmox_online_nodes | length }}/{{ proxmox_node_count }} nodes online,
+      {{ 'quorum confirmed' if preflight_check_quorum else 'quorum check skipped' }}.
+  when: proxmox_is_cluster
+  run_once: true
--- a/roles/proxmox_restore/defaults/main.yml
+++ b/roles/proxmox_restore/defaults/main.yml
@@ -0,0 +1,17 @@
+---
+# =============================================================================
+# proxmox_restore — defaults
+# =============================================================================
+
+# State file directory (must match drain_state_dir).
+# The role searches this directory on the Ansible controller (the find task is
+# delegated to localhost) for files named "<current_node>_*.json".
+restore_state_dir: "/tmp/proxmox_drain_state"
+
+# If true, delete the state file after successful restore
+restore_cleanup_state_file: true
+
+# Timeout waiting for VM to start on restored node.
+# NOTE(review): no task in this role's tasks/main.yml reads this value; the LXC
+# migrate command uses a literal 120 instead. Confirm whether it should.
+restore_vm_start_timeout: 120
+
+# API connection (inherited from inventory).
+# Both values are passed to the community.proxmox module call in this role.
+api_port: 8006
+validate_certs: false
--- a/roles/proxmox_restore/meta/main.yml
+++ b/roles/proxmox_restore/meta/main.yml
@@ -0,0 +1,11 @@
+---
+# Role metadata for proxmox_restore; the role declares no role dependencies.
+dependencies: []
+galaxy_info:
+  role_name: proxmox_restore
+  description: "MSP Proxmox automation — proxmox_restore"
+  author: ansible-msp
+  min_ansible_version: "2.15"
+  platforms:
+    - name: Debian
+      versions:
+        - bookworm
--- a/roles/proxmox_restore/tasks/main.yml
+++ b/roles/proxmox_restore/tasks/main.yml
@@ -0,0 +1,112 @@
+---
+# =============================================================================
+# proxmox_restore — tasks
+# Returns VMs to their origin nodes using state written by proxmox_drain.
+#
+# Required vars:
+#   current_node — the node whose VMs should be restored
+# Set by this role:
+#   restore_state_file — path to the newest JSON state file found for current_node
+# =============================================================================
+
+# ── Find state file ───────────────────────────────────────────────────────────
+# Search the controller-side state directory for drain state files belonging
+# to current_node (file name pattern "<current_node>_*.json").
+- name: "Restore | {{ current_node }} | Find state files"
+  delegate_to: localhost
+  register: restore_found_files
+  ansible.builtin.find:
+    file_type: file
+    paths: "{{ restore_state_dir }}"
+    patterns: "{{ current_node }}_*.json"
+
+# Tell the operator why nothing is restored when the search came back empty.
+- name: "Restore | {{ current_node }} | No state files found — skipping"
+  when: restore_found_files.files | length == 0
+  ansible.builtin.debug:
+    msg: >-
+      No drain state files found for {{ current_node }} in {{ restore_state_dir }}.
+      Skipping restore.
+
+# Everything from here on needs a drain state file. The tasks are wrapped in a
+# conditional block rather than guarded by `meta: end_play`: end_play ends the
+# play for all hosts, so a node that simply had nothing to restore would also
+# skip every role or task scheduled after this one.
+- name: "Restore | {{ current_node }} | Restore guests from drain state"
+  when: restore_found_files.files | length > 0
+  block:
+    # Several state files may exist for one node; the newest (by mtime) wins.
+    - name: "Restore | {{ current_node }} | Use most recent state file"
+      ansible.builtin.set_fact:
+        restore_state_file: >-
+          {{ (restore_found_files.files | sort(attribute='mtime') | last).path }}
+      delegate_to: localhost
+
+    - name: "Restore | {{ current_node }} | Load state file"
+      ansible.builtin.slurp:
+        src: "{{ restore_state_file }}"
+      register: restore_state_raw
+      delegate_to: localhost
+
+    # slurp returns base64; decode and parse the JSON list of guests.
+    - name: "Restore | {{ current_node }} | Parse VM origin list"
+      ansible.builtin.set_fact:
+        restore_vm_list: "{{ restore_state_raw.content | b64decode | from_json }}"
+      delegate_to: localhost
+
+    - name: "Restore | {{ current_node }} | Log restore plan"
+      ansible.builtin.debug:
+        msg: >-
+          Restoring {{ restore_vm_list | length }} guest(s) to {{ current_node }}:
+          {{ restore_vm_list | map(attribute='vmid') | list }}
+
+    # ── Get current VM locations ──────────────────────────────────────────────
+    - name: "Restore | {{ current_node }} | Get current VM locations"
+      community.proxmox.proxmox_vm_info:
+        api_host: "{{ api_host }}"
+        api_user: "{{ api_user }}"
+        api_token_id: "{{ api_token_id }}"
+        api_token_secret: "{{ api_token_secret }}"
+        api_port: "{{ api_port }}"
+        validate_certs: "{{ validate_certs }}"
+      register: restore_all_vms
+      delegate_to: localhost
+
+    # ── Migrate KVM guests back ───────────────────────────────────────────────
+    # current_location is resolved per loop item from the API result; guests
+    # already on current_node are skipped. --online is added only for guests
+    # whose recorded status is 'running'.
+    # NOTE(review): this command is not delegated. `qm migrate` presumably has
+    # to run on the node that currently hosts the guest — confirm which host
+    # this role executes on.
+    - name: "Restore | {{ current_node }} | KVM | Migrate back"
+      ansible.builtin.command: >
+        qm migrate {{ item.vmid }} {{ current_node }}
+        {% if item.status == 'running' %}--online{% endif %}
+        --with-local-disks 0
+      loop: "{{ restore_vm_list | selectattr('type', 'equalto', 'qemu') | list }}"
+      loop_control:
+        # Fall back to the VMID when the state entry has no name (same as LXC).
+        label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }})"
+      changed_when: true
+      vars:
+        current_location: >-
+          {{ restore_all_vms.proxmox_vms
+             | selectattr('vmid', 'equalto', item.vmid)
+             | map(attribute='node')
+             | first
+             | default('unknown') }}
+      when: current_location != current_node
+
+    # ── Migrate LXC guests back ───────────────────────────────────────────────
+    # Containers are moved with --restart; the same current_location lookup
+    # skips guests that are already on current_node.
+    - name: "Restore | {{ current_node }} | LXC | Migrate back"
+      ansible.builtin.command: >
+        pct migrate {{ item.vmid }} {{ current_node }} --restart --timeout 120
+      loop: "{{ restore_vm_list | selectattr('type', 'equalto', 'lxc') | list }}"
+      loop_control:
+        label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }})"
+      changed_when: true
+      vars:
+        current_location: >-
+          {{ restore_all_vms.proxmox_vms
+             | selectattr('vmid', 'equalto', item.vmid)
+             | map(attribute='node')
+             | first
+             | default('unknown') }}
+      when: current_location != current_node
+
+    # ── Cleanup ───────────────────────────────────────────────────────────────
+    # `| bool` so a string override such as "false" really disables cleanup.
+    - name: "Restore | {{ current_node }} | Remove state file"
+      ansible.builtin.file:
+        path: "{{ restore_state_file }}"
+        state: absent
+      delegate_to: localhost
+      when: restore_cleanup_state_file | bool
+
+    - name: "Restore | {{ current_node }} | Complete"
+      ansible.builtin.debug:
+        msg: "✓ Restore complete — {{ restore_vm_list | length }} guest(s) returned to {{ current_node }}."
--- a/roles/proxmox_status/defaults/main.yml
+++ b/roles/proxmox_status/defaults/main.yml
@@ -0,0 +1,20 @@
+---
+# =============================================================================
+# proxmox_status — defaults
+# =============================================================================
+
+# Include VM inventory in report (gates the per-node VM query and the
+# VM distribution summary).
+status_include_vms: true
+
+# Include storage status.
+# NOTE(review): no task in this role's tasks/main.yml reads this flag —
+# confirm whether a storage section is still to be added.
+status_include_storage: true
+
+# Include CEPH status (skipped gracefully if not configured)
+status_include_ceph: true
+
+# Include HA status (skipped gracefully if not configured)
+status_include_ha: true
+
+# API connection (inherited from inventory).
+# Both values are passed to the community.proxmox module calls in this role.
+api_port: 8006
+validate_certs: false
--- a/roles/proxmox_status/meta/main.yml
+++ b/roles/proxmox_status/meta/main.yml
@@ -0,0 +1,11 @@
+---
+# Role metadata for proxmox_status; the role declares no role dependencies.
+dependencies: []
+galaxy_info:
+  role_name: proxmox_status
+  description: "MSP Proxmox automation — proxmox_status"
+  author: ansible-msp
+  min_ansible_version: "2.15"
+  platforms:
+    - name: Debian
+      versions:
+        - bookworm
--- a/roles/proxmox_status/tasks/main.yml
+++ b/roles/proxmox_status/tasks/main.yml
@@ -0,0 +1,127 @@
+---
+# =============================================================================
+# proxmox_status — tasks
+# Produces a cluster health report: nodes, VMs, storage, CEPH, HA.
+# =============================================================================
+
+# ── Node info ─────────────────────────────────────────────────────────────────
+# Query the Proxmox API once, from the controller, for the list of cluster
+# nodes; the result feeds the node summary and the per-node VM query.
+- name: "Status | Get cluster node info"
+  delegate_to: localhost
+  run_once: true
+  register: status_nodes
+  community.proxmox.proxmox_node_info:
+    api_host: "{{ api_host }}"
+    api_port: "{{ api_port }}"
+    api_user: "{{ api_user }}"
+    api_token_id: "{{ api_token_id }}"
+    api_token_secret: "{{ api_token_secret }}"
+    validate_certs: "{{ validate_certs }}"
+
+# Print one row per node, sorted by node name: status, version, CPU load in
+# percent and memory used/total in GiB (bytes / 1073741824).
+# Column padding uses the printf-style `format` filter because neither Jinja2
+# nor Ansible ships an `ljust` filter; using one fails templating with
+# "No filter named 'ljust'".
+- name: "Status | Node summary"
+  ansible.builtin.debug:
+    msg: >-
+      ┌─ NODE SUMMARY ─────────────────────────────
+      {% for node in status_nodes.proxmox_nodes | sort(attribute='node') %}
+      │ {{ '%-20s' | format(node.node) }}
+        status={{ '%-8s' | format(node.status) }}
+        ver={{ node.version.version | default('?') }}
+        cpu={{ (node.cpu | default(0) * 100) | round(1) }}%
+        mem={{ ((node.mem | default(0)) / 1073741824) | round(1) }}GB /
+            {{ ((node.maxmem | default(0)) / 1073741824) | round(1) }}GB
+      {% endfor %}
+      └────────────────────────────────────────────
+  run_once: true
+
+# ── VM inventory ──────────────────────────────────────────────────────────────
+# One API query per online node, issued from the controller; the results are
+# collected in status_vms_per_node.results (one entry per node).
+- name: "Status | Get VM info for each node"
+  when: status_include_vms
+  delegate_to: localhost
+  run_once: true
+  register: status_vms_per_node
+  loop: "{{ status_nodes.proxmox_nodes | selectattr('status', 'equalto', 'online') | list }}"
+  loop_control:
+    label: "{{ item.node }}"
+  community.proxmox.proxmox_vm_info:
+    node: "{{ item.node }}"
+    api_host: "{{ api_host }}"
+    api_port: "{{ api_port }}"
+    api_user: "{{ api_user }}"
+    api_token_id: "{{ api_token_id }}"
+    api_token_secret: "{{ api_token_secret }}"
+    validate_certs: "{{ validate_certs }}"
+
+# Per-node guest counts (total / running / stopped) plus a cluster-wide total,
+# built from the looped query results in status_vms_per_node.
+# Node names are padded with the printf-style `format` filter; there is no
+# `ljust` filter in Jinja2 or Ansible, so the previous expression could not
+# be templated.
+- name: "Status | VM distribution summary"
+  ansible.builtin.debug:
+    msg: >-
+      ┌─ VM DISTRIBUTION ──────────────────────────
+      {% for result in status_vms_per_node.results %}
+      │ {{ '%-20s' | format(result.item.node) }}
+        total={{ result.proxmox_vms | length }}
+        running={{ result.proxmox_vms | selectattr('status', 'equalto', 'running') | list | length }}
+        stopped={{ result.proxmox_vms | selectattr('status', 'equalto', 'stopped') | list | length }}
+      {% endfor %}
+      │ Total VMs: {{ status_vms_per_node.results | map(attribute='proxmox_vms') | flatten | length }}
+      └────────────────────────────────────────────
+  run_once: true
+  # `| bool` so a string override such as "false" really disables the section.
+  when: status_include_vms | bool
+
+# ── CEPH status ───────────────────────────────────────────────────────────────
+# Ask ceph for its status as JSON. A non-zero exit is tolerated
+# (failed_when: false) and reported as "not configured" by the summary below.
+- name: "Status | CEPH status"
+  when: status_include_ceph
+  run_once: true
+  register: status_ceph
+  changed_when: false
+  failed_when: false
+  ansible.builtin.command: ceph status --format json
+
+# Render health and OSD counters; the JSON is parsed once into `ceph` and only
+# when the command succeeded.
+- name: "Status | CEPH summary"
+  when: status_include_ceph
+  run_once: true
+  ansible.builtin.debug:
+    msg: >-
+      ┌─ CEPH STATUS ───────────────────────────────
+      {% if status_ceph.rc == 0 %}{% set ceph = status_ceph.stdout | from_json %}
+      │ Health:   {{ ceph.health.status }}
+      │ OSDs:     {{ ceph.osdmap.num_osds }} total,
+                  {{ ceph.osdmap.num_up_osds }} up,
+                  {{ ceph.osdmap.num_in_osds }} in
+      {% else %}
+      │ CEPH not configured or not reachable.
+      {% endif %}
+      └────────────────────────────────────────────
+
+# ── HA status ─────────────────────────────────────────────────────────────────
+# Capture `ha-manager status`; a failing command is tolerated and handled by
+# the summary task.
+- name: "Status | HA status"
+  when: status_include_ha
+  run_once: true
+  register: status_ha
+  changed_when: false
+  failed_when: false
+  ansible.builtin.command: ha-manager status
+
+# Echo the command output line by line, or a placeholder when the command
+# failed or printed nothing.
+- name: "Status | HA summary"
+  when: status_include_ha
+  run_once: true
+  ansible.builtin.debug:
+    msg: >-
+      ┌─ HA STATUS ─────────────────────────────────
+      {% if status_ha.rc == 0 and status_ha.stdout | length > 0 %}
+      {{ status_ha.stdout_lines | join('\n      ') }}
+      {% else %}
+      │ HA not configured.
+      {% endif %}
+      └────────────────────────────────────────────
+
+# ── Pending updates per node ─────────────────────────────────────────────────
+# Runs on every host (no run_once): refresh the package index, then count the
+# "Inst" lines of a simulated dist-upgrade. Never reported as a change.
+- name: "Status | Check for available updates on each node"
+  register: status_updates_available
+  changed_when: false
+  ansible.builtin.shell: |
+    apt-get -q update > /dev/null 2>&1
+    apt-get -s dist-upgrade 2>/dev/null | grep "^Inst " | wc -l
+
+# One line per host: the pending package count and the PVE version taken from
+# the ansible_local.pve_version fact ('unknown' when that fact is absent).
+- name: "Status | Update availability per node"
+  ansible.builtin.debug:
+    msg: >-
+      {{ inventory_hostname }}: {{ status_updates_available.stdout | trim }} package(s) available for upgrade
+      (PVE {{ ansible_local.pve_version | default('unknown') }})
--- a/roles/proxmox_upgrade_node/defaults/main.yml
+++ b/roles/proxmox_upgrade_node/defaults/main.yml
@@ -0,0 +1,21 @@
+---
+# =============================================================================
+# proxmox_upgrade_node — defaults
+# =============================================================================
+
+# Reboot behaviour
+upgrade_reboot_if_required: true         # reboot if /var/run/reboot-required exists
+upgrade_reboot_force: false              # reboot even if not required
+upgrade_reboot_timeout: 600             # seconds to wait for node to come back
+# NOTE(review): upgrade_node_rejoin_timeout is not read by this role's
+# tasks/main.yml; the rejoin wait is bounded by retries x delay below
+# (30 x 10 s with these defaults).
+upgrade_node_rejoin_timeout: 300        # seconds to wait for cluster rejoin
+upgrade_node_rejoin_retries: 30
+upgrade_node_rejoin_delay: 10
+
+# apt options (passed to ansible.builtin.apt by the update and dist-upgrade tasks)
+upgrade_apt_update_cache: true
+upgrade_apt_autoremove: true
+upgrade_apt_cache_valid_time: 3600
+
+# API connection (inherited from inventory).
+# Both values are passed to the community.proxmox module call in this role.
+api_port: 8006
+validate_certs: false
--- a/roles/proxmox_upgrade_node/meta/main.yml
+++ b/roles/proxmox_upgrade_node/meta/main.yml
@@ -0,0 +1,11 @@
+---
+# Role metadata for proxmox_upgrade_node; the role declares no role dependencies.
+dependencies: []
+galaxy_info:
+  role_name: proxmox_upgrade_node
+  description: "MSP Proxmox automation — proxmox_upgrade_node"
+  author: ansible-msp
+  min_ansible_version: "2.15"
+  platforms:
+    - name: Debian
+      versions:
+        - bookworm
--- a/roles/proxmox_upgrade_node/tasks/main.yml
+++ b/roles/proxmox_upgrade_node/tasks/main.yml
@@ -0,0 +1,85 @@
+---
+# =============================================================================
+# proxmox_upgrade_node — tasks
+# Runs apt dist-upgrade on a single node, reboots if required,
+# and waits for the node to rejoin the cluster.
+#
+# Required vars:
+#   current_node — the node being upgraded (used for logging)
+# =============================================================================
+
+# Refresh the apt package index, honouring the configured cache age; the
+# refresh itself is never reported as a change.
+- name: "Upgrade | {{ current_node }} | apt-get update"
+  changed_when: false
+  ansible.builtin.apt:
+    cache_valid_time: "{{ upgrade_apt_cache_valid_time }}"
+    update_cache: "{{ upgrade_apt_update_cache }}"
+
+# Full distribution upgrade; the result is kept so later tasks can report
+# whether any package actually changed.
+- name: "Upgrade | {{ current_node }} | apt dist-upgrade"
+  register: upgrade_apt_result
+  ansible.builtin.apt:
+    upgrade: dist
+    autoclean: true
+    autoremove: "{{ upgrade_apt_autoremove }}"
+
+# Show the last line of the apt output, or a placeholder when there is none.
+- name: "Upgrade | {{ current_node }} | Log upgraded packages"
+  ansible.builtin.debug:
+    msg: "{{ upgrade_apt_result.stdout_lines | last | default('No output') }}"
+
+# Look for the reboot-required marker file on the node.
+- name: "Upgrade | {{ current_node }} | Check if reboot required"
+  register: upgrade_reboot_required_file
+  ansible.builtin.stat:
+    path: /var/run/reboot-required
+
+# Decide once whether this run really reboots the node:
+#   - upgrade_reboot_force always triggers a reboot;
+#   - the reboot-required marker counts only while upgrade_reboot_if_required
+#     is enabled.
+# Later tasks (rejoin wait, final messages) key off upgrade_needs_reboot, so it
+# has to reflect the actual decision. Previously it was true whenever the
+# marker existed, which made the role wait for a rejoin and report
+# "(rebooted)" without rebooting, and kept upgrade_reboot_force from working
+# unless upgrade_reboot_if_required was also true.
+- name: "Upgrade | {{ current_node }} | Set reboot needed fact"
+  ansible.builtin.set_fact:
+    upgrade_needs_reboot: >-
+      {{ (upgrade_reboot_force | bool)
+         or (upgrade_reboot_required_file.stat.exists
+             and (upgrade_reboot_if_required | bool)) }}
+
+- name: "Upgrade | {{ current_node }} | Reboot node"
+  ansible.builtin.reboot:
+    reboot_timeout: "{{ upgrade_reboot_timeout }}"
+    msg: "Ansible controlled reboot for Proxmox upgrade"
+    pre_reboot_delay: 5
+    post_reboot_delay: 15
+  when: upgrade_needs_reboot | bool
+
+# Distinguish "nothing pending" from "pending, but reboots are disabled".
+- name: "Upgrade | {{ current_node }} | Skip reboot (not required)"
+  ansible.builtin.debug:
+    msg: >-
+      {{ ('Reboot pending on ' ~ current_node ~ ' but upgrade_reboot_if_required is false — skipping.')
+         if upgrade_reboot_required_file.stat.exists
+         else ('No reboot required on ' ~ current_node ~ ' — skipping.') }}
+  when: not (upgrade_needs_reboot | bool)
+
+# ── Wait for cluster rejoin ───────────────────────────────────────────────────
+# Poll the Proxmox API from the controller until current_node is listed with
+# status 'online', for at most upgrade_node_rejoin_retries attempts spaced
+# upgrade_node_rejoin_delay seconds apart.
+# A failed API attempt returns no `proxmox_nodes` key; default([]) keeps the
+# `until` expression valid in that case so the task retries instead of
+# aborting on an undefined attribute.
+- name: "Upgrade | {{ current_node }} | Wait for node to rejoin cluster"
+  community.proxmox.proxmox_node_info:
+    api_host: "{{ api_host }}"
+    api_user: "{{ api_user }}"
+    api_token_id: "{{ api_token_id }}"
+    api_token_secret: "{{ api_token_secret }}"
+    api_port: "{{ api_port }}"
+    validate_certs: "{{ validate_certs }}"
+  register: upgrade_rejoin_check
+  delegate_to: localhost
+  until: >-
+    upgrade_rejoin_check.proxmox_nodes | default([])
+    | selectattr('node', 'equalto', current_node)
+    | selectattr('status', 'equalto', 'online')
+    | list
+    | length > 0
+  retries: "{{ upgrade_node_rejoin_retries }}"
+  delay: "{{ upgrade_node_rejoin_delay }}"
+  when: upgrade_needs_reboot | bool
+
+# Confirmation line; the suffix depends on upgrade_needs_reboot.
+- name: "Upgrade | {{ current_node }} | Node back online"
+  ansible.builtin.debug:
+    msg: >-
+      ✓ Node {{ current_node }} has rejoined the cluster
+      {% if upgrade_needs_reboot %}(after reboot){% else %}(no reboot needed){% endif %}.
+
+# Closing banner summarising whether packages changed and whether a reboot
+# was flagged.
+- name: "Upgrade | {{ current_node }} | Complete"
+  ansible.builtin.debug:
+    msg: >-
+      ━━━ Upgrade complete: {{ current_node }}
+      {{ '(packages updated)' if upgrade_apt_result.changed else '(already up to date)' }}
+      {{ '(rebooted)' if upgrade_needs_reboot else '(no reboot)' }} ━━━