testing new proxmox logic

2026-03-15 15:48:59 -07:00
parent 347a85b09d
commit ea2f00c098
34 changed files with 2391 additions and 21 deletions
--- a/playbooks/proxmox_ceph.yml
+++ b/playbooks/proxmox_ceph.yml
@@ -0,0 +1,45 @@
 ---
 # =============================================================================
 # proxmox_ceph.yml
 # CEPH management playbook for Proxmox clusters.
 #
 # Actions:
 #   status       — report current CEPH health and OSD state
 #   set_noout    — set noout flag before node maintenance
 #   clear_noout  — clear noout flag and wait for HEALTH_OK after maintenance
 #   check_health — wait for CEPH to reach HEALTH_OK or HEALTH_WARN
 #
 # Usage:
 #   # Check current status
 #   ansible-playbook proxmox_ceph.yml -e "ceph_action=status"
 #
 #   # Set noout before maintenance
 #   ansible-playbook proxmox_ceph.yml -e "ceph_action=set_noout"
 #
 #   # Clear noout after node comes back online
 #   ansible-playbook proxmox_ceph.yml -e "ceph_action=clear_noout"
 # =============================================================================
 - name: "Proxmox | CEPH Management"
   hosts: proxmox_cluster
   gather_facts: false
   # Defaults to the read-only action; override with -e "ceph_action=<action>".
   vars:
     ceph_action: status
   pre_tasks:
     # Reject unsupported actions once, before the role is applied.
     - name: "CEPH | Validate action"
       run_once: true
       when: ceph_action not in ['status', 'set_noout', 'clear_noout', 'check_health']
       ansible.builtin.fail:
         msg: >-
           Invalid ceph_action '{{ ceph_action }}'.
           Must be one of: status, set_noout, clear_noout, check_health.
     # client_name is optional; the message falls back to 'cluster'.
     - name: "CEPH | Log action"
       run_once: true
       ansible.builtin.debug:
         msg: "CEPH action: {{ ceph_action }} on {{ client_name | default('cluster') }}"
   roles:
     - proxmox_ceph
--- a/playbooks/proxmox_config_backup.yml
+++ b/playbooks/proxmox_config_backup.yml
@@ -0,0 +1,49 @@
 ---
 # =============================================================================
 # proxmox_config_backup.yml
 # Backs up critical Proxmox configuration files from all nodes.
 #
 # Captures:
 #   /etc/pve           — cluster config, VM configs, storage, users, certs
 #   /etc/network       — network interfaces
 #   /etc/hosts         — hostname resolution
 #   /etc/hostname      — node name
 #   /etc/apt/          — apt sources (so repos can be restored)
 #
 # NOTE: /etc/pve contains sensitive files (SSL keys, shadow.cfg, API tokens).
 #       Local and SFTP destinations are supported. Git destination is a
 #       TODO pending a secure encryption strategy for sensitive files.
 #
 # Usage:
 #   # Backup all nodes (local)
 #   ansible-playbook proxmox_config_backup.yml
 #
 #   # Backup to SFTP
 #   ansible-playbook proxmox_config_backup.yml \
 #     -e "backup_destination=sftp backup_sftp_host=backup.example.com backup_sftp_user=ansible"
 #
 #   # Backup a single node
 #   ansible-playbook proxmox_config_backup.yml --limit pm-node-01
 # =============================================================================
 - name: "Proxmox | Config Backup"
   hosts: proxmox_cluster
   gather_facts: true
   serial: 1               # Back up one node at a time to avoid SFTP conflicts
   # Defaults for the backup destination; override with -e or inventory vars.
   vars:
     backup_destination: local
     backup_local_dir: /var/backups/proxmox-config
     backup_local_keep: 10
   tasks:
     # The role is included once per host, with current_node set to that host.
     - name: "Backup | Run config backup for {{ inventory_hostname }}"
       ansible.builtin.include_role:
         name: proxmox_config_backup
       vars:
         current_node: "{{ inventory_hostname }}"
     # With serial: 1 every host is its own batch, and run_once fires once per
     # batch — so run_once would print this after every node. Gate on the last
     # host targeted by the play so the message appears only at the very end.
     - name: "Backup | All nodes complete"
       ansible.builtin.debug:
         msg: "✓ Config backup complete for all nodes in {{ client_name | default('cluster') }}."
       when: inventory_hostname == (ansible_play_hosts_all | last)
--- a/playbooks/proxmox_ha.yml
+++ b/playbooks/proxmox_ha.yml
@@ -0,0 +1,50 @@
 ---
 # =============================================================================
 # proxmox_ha.yml
 # HA group membership and maintenance mode management.
 #
 # Actions:
 #   status   — show current HA status for all nodes and services
 #   disable  — put a node into HA maintenance mode (VMs migrate away)
 #   enable   — take a node out of HA maintenance mode (resume normal HA)
 #
 # Usage:
 #   # Check HA status
 #   ansible-playbook proxmox_ha.yml -e "ha_action=status"
 #
 #   # Put node into maintenance before work
 #   ansible-playbook proxmox_ha.yml -e "ha_action=disable ha_target_node=pm-node-01"
 #
 #   # Resume HA after work is complete
 #   ansible-playbook proxmox_ha.yml -e "ha_action=enable ha_target_node=pm-node-01"
 # =============================================================================
 - name: "Proxmox | HA Management"
   hosts: proxmox_cluster
   gather_facts: false
   vars:
     ha_action: status
     # Defaults to the host currently being processed.
     # NOTE(review): with this default and no --limit, disable/enable are applied
     # with every host of the play as target — confirm that is intended.
     ha_target_node: "{{ inventory_hostname }}"
   pre_tasks:
     # Reject unsupported actions once, before the role is applied.
     - name: "HA | Validate action"
       run_once: true
       when: ha_action not in ['status', 'disable', 'enable']
       ansible.builtin.fail:
         msg: >-
           Invalid ha_action '{{ ha_action }}'.
           Must be one of: status, disable, enable.
     # The node is only mentioned for actions that act on a specific node.
     - name: "HA | Log action"
       run_once: true
       ansible.builtin.debug:
         msg: >-
           HA {{ ha_action }} —
           client={{ client_name | default('Unknown') }}
           {% if ha_action in ['disable', 'enable'] %}node={{ ha_target_node }}{% endif %}
   roles:
     # The role receives the selected node as current_node.
     - role: proxmox_ha
       vars:
         current_node: "{{ ha_target_node }}"
--- a/playbooks/proxmox_migrate_vms.yml
+++ b/playbooks/proxmox_migrate_vms.yml
@@ -0,0 +1,371 @@
 ---
 # =============================================================================
 # proxmox_migrate_vms.yml
 # Flexible VM migration playbook supporting four modes:
 #
 #   drain     — move all VMs off a specific node (pre-maintenance)
 #   rebalance — redistribute VMs evenly across all online nodes by resources
 #   restore   — return VMs to their origin nodes using a drain state file
 #   targeted  — migrate specific VMIDs or tagged VMs to a specified target
 #
 # Usage examples:
 #   # Drain a node before maintenance
 #   ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=drain migrate_source_node=pm-node-01"
 #
 #   # Rebalance the cluster
 #   ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=rebalance"
 #
 #   # Restore VMs to origin after maintenance
 #   ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=restore migrate_source_node=pm-node-01"
 #
 #   # Migrate specific VMIDs to a target node
 #   # (lists must be passed as JSON — key=value extra vars are always strings)
 #   ansible-playbook proxmox_migrate_vms.yml -e '{"migrate_mode": "targeted", "migrate_vmids": [100, 101], "migrate_target_node": "pm-node-02"}'
 #
 #   # Migrate VMs by tag
 #   # (lists must be passed as JSON — key=value extra vars are always strings)
 #   ansible-playbook proxmox_migrate_vms.yml -e '{"migrate_mode": "targeted", "migrate_tags": ["win11"], "migrate_target_node": "pm-node-02"}'
 # =============================================================================
 - name: "Proxmox | Migrate VMs"
   hosts: proxmox_cluster
   run_once: true
   gather_facts: true
   vars:
     # --- Mode selection: drain | rebalance | restore | targeted ---------------
     migrate_mode: drain
     # --- Node selection --------------------------------------------------------
     # Source node: mandatory for drain and restore.
     migrate_source_node: ""
     # Target node: optional; when empty a target is chosen by resources.
     migrate_target_node: ""
     # --- Targeted mode filters -------------------------------------------------
     migrate_vmids: []             # VMIDs to migrate
     migrate_tags: []              # tags to match
     # --- Rebalance -------------------------------------------------------------
     # Percentage-point margin used when deciding whether a node is overloaded.
     rebalance_threshold_pct: 10
     # --- Variables handed to the drain / restore roles --------------------------
     drain_target_strategy: "{{ 'explicit' if migrate_target_node != '' else 'resources' }}"
     drain_target_node: "{{ migrate_target_node }}"
     drain_state_dir: "/tmp/proxmox_drain_state"
     restore_state_dir: "/tmp/proxmox_drain_state"
   pre_tasks:
     # Input validation: each check aborts the play with a specific message.
     - name: "Migrate | Validate mode"
       when: migrate_mode not in ['drain', 'rebalance', 'restore', 'targeted']
       ansible.builtin.fail:
         msg: >-
           Invalid migrate_mode '{{ migrate_mode }}'.
           Must be one of: drain, rebalance, restore, targeted.
     - name: "Migrate | Validate drain — source node required"
       when:
         - migrate_mode == 'drain'
         - migrate_source_node == ''
       ansible.builtin.fail:
         msg: "migrate_source_node is required for drain mode."
     - name: "Migrate | Validate restore — source node required"
       when:
         - migrate_mode == 'restore'
         - migrate_source_node == ''
       ansible.builtin.fail:
         msg: "migrate_source_node is required for restore mode."
     # Targeted mode needs at least one selector (VMIDs or tags).
     - name: "Migrate | Validate targeted — VMIDs or tags required"
       when:
         - migrate_mode == 'targeted'
         - migrate_vmids | length == 0
         - migrate_tags | length == 0
       ansible.builtin.fail:
         msg: "migrate_vmids or migrate_tags must be set for targeted mode."
     # Summary line; optional fields are printed only when they are set.
     - name: "Migrate | Log operation"
       ansible.builtin.debug:
         msg: >-
           Proxmox VM migration —
           client={{ client_name | default('Unknown') }}
           mode={{ migrate_mode }}
           {% if migrate_source_node != '' %}source={{ migrate_source_node }}{% endif %}
           {% if migrate_target_node != '' %}target={{ migrate_target_node }}{% endif %}
           {% if migrate_vmids | length > 0 %}vmids={{ migrate_vmids }}{% endif %}
           {% if migrate_tags | length > 0 %}tags={{ migrate_tags }}{% endif %}
   roles:
     - proxmox_preflight
  tasks:
    # ── DRAIN mode ─────────────────────────────────────────────────────────────
    - name: "Migrate | DRAIN mode"
      ansible.builtin.include_role:
        name: proxmox_drain
      vars:
        current_node: "{{ migrate_source_node }}"
      when: migrate_mode == 'drain'
    # ── RESTORE mode ───────────────────────────────────────────────────────────
    - name: "Migrate | RESTORE mode"
      ansible.builtin.include_role:
        name: proxmox_restore
      vars:
        current_node: "{{ migrate_source_node }}"
      when: migrate_mode == 'restore'
    # ── REBALANCE mode ─────────────────────────────────────────────────────────
    - name: "Migrate | REBALANCE | Get all node info"
      community.proxmox.proxmox_node_info:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port | default(8006) }}"
        validate_certs: "{{ validate_certs | default(false) }}"
      register: rebalance_nodes
      delegate_to: localhost
      when: migrate_mode == 'rebalance'
    - name: "Migrate | REBALANCE | Get all VM info per node"
      community.proxmox.proxmox_vm_info:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port | default(8006) }}"
        validate_certs: "{{ validate_certs | default(false) }}"
        node: "{{ item.node }}"
      loop: >-
        {{ rebalance_nodes.proxmox_nodes
           | selectattr('status', 'equalto', 'online')
           | list }}
      loop_control:
        label: "{{ item.node }}"
      register: rebalance_vms_per_node
      delegate_to: localhost
      when: migrate_mode == 'rebalance'
    - name: "Migrate | REBALANCE | Calculate node loads"
      ansible.builtin.set_fact:
        rebalance_node_loads: >-
          {% set loads = [] %}
          {% for result in rebalance_vms_per_node.results %}
            {% set node_name = result.item.node %}
            {% set node_info = rebalance_nodes.proxmox_nodes
               | selectattr('node', 'equalto', node_name)
               | first %}
            {% set vm_mem = result.proxmox_vms
               | map(attribute='mem')
               | map('default', 0)
               | sum %}
            {% set free_mem = node_info.maxmem - node_info.mem %}
            {% set load_pct = (node_info.mem / node_info.maxmem * 100) | round(1) %}
            {% set _ = loads.append({
                 'node': node_name,
                 'used_mem': node_info.mem,
                 'max_mem': node_info.maxmem,
                 'free_mem': free_mem,
                 'load_pct': load_pct,
                 'vm_count': result.proxmox_vms | rejectattr('template', 'equalto', true) | list | length,
                 'vms': result.proxmox_vms | rejectattr('template', 'equalto', true) | list
               }) %}
          {% endfor %}
          {{ loads | sort(attribute='load_pct', reverse=true) }}
      delegate_to: localhost
      when: migrate_mode == 'rebalance'
    - name: "Migrate | REBALANCE | Log current distribution"
      ansible.builtin.debug:
        msg: >-
          Current cluster load:
          {% for n in rebalance_node_loads %}
          {{ n.node }}: {{ n.load_pct }}% memory used, {{ n.vm_count }} VMs
          {% endfor %}
      when: migrate_mode == 'rebalance'
    - name: "Migrate | REBALANCE | Build migration plan"
      ansible.builtin.set_fact:
        rebalance_migrations: >-
          {% set moves = [] %}
          {% set loads = rebalance_node_loads | list %}
          {% set total_mem = loads | map(attribute='used_mem') | sum %}
          {% set avg_mem = total_mem / loads | length %}
          {% for vm in (loads | map(attribute='vms') | flatten
                        | rejectattr('status', 'equalto', 'stopped')
                        | list) %}
            {% set src_node = vm.node %}
            {% set src_info = loads | selectattr('node', 'equalto', src_node) | first %}
            {% if src_info.load_pct | float > (avg_mem / src_info.max_mem * 100 + rebalance_threshold_pct) %}
              {% set target = loads
                 | rejectattr('node', 'equalto', src_node)
                 | sort(attribute='load_pct')
                 | first %}
              {% if target.load_pct | float < src_info.load_pct | float - rebalance_threshold_pct %}
                {% set _ = moves.append({
                     'vmid': vm.vmid,
                     'name': vm.name,
                     'type': vm.type,
                     'status': vm.status,
                     'from': src_node,
                     'to': target.node
                   }) %}
              {% endif %}
            {% endif %}
          {% endfor %}
          {{ moves }}
      delegate_to: localhost
      when: migrate_mode == 'rebalance'
    - name: "Migrate | REBALANCE | Log migration plan"
      ansible.builtin.debug:
        msg: >-
          Rebalance plan ({{ rebalance_migrations | length }} migration(s)):
          {% if rebalance_migrations | length == 0 %}
          Cluster is already balanced within {{ rebalance_threshold_pct }}% threshold — no migrations needed.
          {% else %}
          {% for m in rebalance_migrations %}
          {{ m.name }} (VMID {{ m.vmid }}) {{ m.from }} → {{ m.to }}
          {% endfor %}
          {% endif %}
      when: migrate_mode == 'rebalance'
    - name: "Migrate | REBALANCE | Execute KVM migrations"
      ansible.builtin.command: >
        qm migrate {{ item.vmid }} {{ item.to }}
        {% if item.status == 'running' %}--online{% endif %}
        --with-local-disks 0
      loop: "{{ rebalance_migrations | selectattr('type', 'equalto', 'qemu') | list }}"
      loop_control:
        label: "{{ item.name }} ({{ item.from }} → {{ item.to }})"
      changed_when: true
      delegate_to: "{{ item.from }}"
      when:
        - migrate_mode == 'rebalance'
        - rebalance_migrations | length > 0
    - name: "Migrate | REBALANCE | Execute LXC migrations"
      ansible.builtin.command: >
        pct migrate {{ item.vmid }} {{ item.to }} --restart --timeout 120
      loop: "{{ rebalance_migrations | selectattr('type', 'equalto', 'lxc') | list }}"
      loop_control:
        label: "{{ item.name | default(item.vmid) }} ({{ item.from }} → {{ item.to }})"
      changed_when: true
      delegate_to: "{{ item.from }}"
      when:
        - migrate_mode == 'rebalance'
        - rebalance_migrations | length > 0
    - name: "Migrate | REBALANCE | Complete"
      ansible.builtin.debug:
        msg: >-
          ✓ Rebalance complete —
          {{ rebalance_migrations | length }} VM(s) redistributed.
      when: migrate_mode == 'rebalance'
    # ── TARGETED mode ──────────────────────────────────────────────────────────
    - name: "Migrate | TARGETED | Get all VMs"
      community.proxmox.proxmox_vm_info:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port | default(8006) }}"
        validate_certs: "{{ validate_certs | default(false) }}"
      register: targeted_all_vms
      delegate_to: localhost
      when: migrate_mode == 'targeted'
    - name: "Migrate | TARGETED | Filter VMs by VMID"
      ansible.builtin.set_fact:
        targeted_vms: >-
          {{ targeted_all_vms.proxmox_vms
             | selectattr('vmid', 'in', migrate_vmids)
             | list }}
      delegate_to: localhost
      when:
        - migrate_mode == 'targeted'
        - migrate_vmids | length > 0
    - name: "Migrate | TARGETED | Filter VMs by tag"
      ansible.builtin.set_fact:
        targeted_vms: >-
          {{ targeted_all_vms.proxmox_vms
             | selectattr('tags', 'defined')
             | selectattr('tags', 'search', migrate_tags | join('|'))
             | list }}
      delegate_to: localhost
      when:
        - migrate_mode == 'targeted'
        - migrate_tags | length > 0
        - migrate_vmids | length == 0
    - name: "Migrate | TARGETED | Resolve target node"
      ansible.builtin.set_fact:
        targeted_resolved_target: "{{ migrate_target_node }}"
      when:
        - migrate_mode == 'targeted'
        - migrate_target_node != ''
    - name: "Migrate | TARGETED | Auto-select target by resources"
      block:
        - name: "Migrate | TARGETED | Get node resources"
          community.proxmox.proxmox_node_info:
            api_host: "{{ api_host }}"
            api_user: "{{ api_user }}"
            api_token_id: "{{ api_token_id }}"
            api_token_secret: "{{ api_token_secret }}"
            api_port: "{{ api_port | default(8006) }}"
            validate_certs: "{{ validate_certs | default(false) }}"
          register: targeted_nodes
          delegate_to: localhost
        - name: "Migrate | TARGETED | Pick best target"
          ansible.builtin.set_fact:
            targeted_resolved_target: >-
              {{ (targeted_nodes.proxmox_nodes
                  | selectattr('status', 'equalto', 'online')
                  | sort(attribute='mem')
                  | first).node }}
          delegate_to: localhost
      when:
        - migrate_mode == 'targeted'
        - migrate_target_node == ''
    - name: "Migrate | TARGETED | Log plan"
      ansible.builtin.debug:
        msg: >-
          Targeted migration: {{ targeted_vms | length }} VM(s) → {{ targeted_resolved_target }}
          VMIDs: {{ targeted_vms | map(attribute='vmid') | list }}
      when: migrate_mode == 'targeted'
    - name: "Migrate | TARGETED | Migrate KVM VMs"
      ansible.builtin.command: >
        qm migrate {{ item.vmid }} {{ targeted_resolved_target }}
        {% if item.status == 'running' %}--online{% endif %}
        --with-local-disks 0
      loop: "{{ targeted_vms | selectattr('type', 'equalto', 'qemu') | list }}"
      loop_control:
        label: "{{ item.name }} (VMID {{ item.vmid }}) → {{ targeted_resolved_target }}"
      changed_when: true
      delegate_to: "{{ item.node }}"
      when: migrate_mode == 'targeted'
    - name: "Migrate | TARGETED | Migrate LXC containers"
      ansible.builtin.command: >
        pct migrate {{ item.vmid }} {{ targeted_resolved_target }} --restart --timeout 120
      loop: "{{ targeted_vms | selectattr('type', 'equalto', 'lxc') | list }}"
      loop_control:
        label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }}) → {{ targeted_resolved_target }}"
      changed_when: true
      delegate_to: "{{ item.node }}"
      when: migrate_mode == 'targeted'
    - name: "Migrate | TARGETED | Complete"
      ansible.builtin.debug:
        msg: >-
          ✓ Targeted migration complete —
          {{ targeted_vms | length }} VM(s) moved to {{ targeted_resolved_target }}.
      when: migrate_mode == 'targeted'
--- a/playbooks/proxmox_reboot.yml
+++ b/playbooks/proxmox_reboot.yml
@@ -0,0 +1,75 @@
 ---
 # =============================================================================
 # proxmox_reboot.yml
 # Controlled rolling reboot of Proxmox cluster nodes.
 # Drains guests before rebooting, waits for rejoin, optionally restores.
 #
 # Use cases:
 #   - Apply kernel updates that require a reboot
 #   - Scheduled maintenance reboots
 #   - Hardware changes requiring a restart
 #
 # Variables:
 #   reboot_order          — ordered list of nodes to reboot (default: upgrade_order)
 #   reboot_reason         — logged message explaining the reboot
 #   migration_restore     — return VMs to origin after reboot (default: false)
 #   drain_target_strategy — resources | explicit (default: resources)
 #
 # Usage:
 #   # Rolling reboot all nodes
 #   ansible-playbook proxmox_reboot.yml
 #
 #   # Reboot a single node
 #   # (lists must be passed as JSON — key=value extra vars are always strings)
 #   ansible-playbook proxmox_reboot.yml -e '{"reboot_order": ["pm-node-02"]}'
 #
 #   # Reboot and restore VMs to origin
 #   ansible-playbook proxmox_reboot.yml -e "migration_restore=true"
 # =============================================================================
 - name: "Proxmox | Controlled Rolling Reboot"
   hosts: proxmox_cluster
   run_once: true
   gather_facts: true
   vars:
     # Node order: upgrade_order when defined, otherwise the sorted group members.
     reboot_order: "{{ upgrade_order | default(groups['proxmox_cluster'] | sort) }}"
     reboot_reason: "Scheduled maintenance reboot"
     migration_restore: false
     # Timing knobs (seconds / attempts).
     reboot_timeout: 600
     node_rejoin_timeout: 300
     node_rejoin_retries: 30
     node_rejoin_delay: 10
   pre_tasks:
     # One-line summary of what is about to happen.
     - name: "Reboot | Log operation"
       ansible.builtin.debug:
         msg: >-
           Proxmox rolling reboot —
           client={{ client_name | default('Unknown') }}
           nodes={{ reboot_order | join(', ') }}
           reason={{ reboot_reason }}
           restore={{ migration_restore }}
   roles:
     - proxmox_preflight
   tasks:
     # Cluster: the per-node task file is included once per entry of
     # reboot_order, with the node name exposed as current_node.
     # NOTE(review): proxmox_is_cluster is not set in this playbook —
     # presumably provided by the proxmox_preflight role; confirm.
     - name: "Reboot | Rolling reboot — cluster mode"
       when: proxmox_is_cluster
       loop: "{{ reboot_order }}"
       loop_control:
         loop_var: current_node
         label: "{{ current_node }}"
       ansible.builtin.include_tasks: tasks/proxmox_reboot_node_loop.yml
     # Standalone: reboot the host directly.
     - name: "Reboot | Standalone | Reboot node"
       when: not proxmox_is_cluster
       ansible.builtin.reboot:
         msg: "{{ reboot_reason }}"
         reboot_timeout: "{{ reboot_timeout }}"
         pre_reboot_delay: 5
         post_reboot_delay: 15
     - name: "Reboot | Complete"
       ansible.builtin.debug:
         msg: "✓ Rolling reboot complete for {{ client_name | default('cluster') }}."
--- a/playbooks/proxmox_snapshot.yml
+++ b/playbooks/proxmox_snapshot.yml
@@ -0,0 +1,298 @@
 ---
 # =============================================================================
 # proxmox_snapshot.yml
 # Pre/post maintenance VM snapshot management.
 #
 # Actions:
 #   create  — snapshot all running VMs across the cluster before maintenance
 #   verify  — verify snapshots exist and are readable
 #   cleanup — remove snapshots older than snapshot_max_age_hours
 #   rollback — rollback a specific VMID to its most recent automation snapshot
 #
 # Snapshots are named with a consistent prefix for easy identification and cleanup:
 #   auto_pre_<date>_<time>
 #
 # Usage:
 #   # Snapshot all running VMs before upgrade
 #   ansible-playbook proxmox_snapshot.yml -e "snapshot_action=create"
 #
 #   # Verify snapshots exist
 #   ansible-playbook proxmox_snapshot.yml -e "snapshot_action=verify"
 #
 #   # Clean up snapshots older than 48 hours
 #   ansible-playbook proxmox_snapshot.yml -e "snapshot_action=cleanup snapshot_max_age_hours=48"
 #
 #   # Rollback a specific VM
 #   ansible-playbook proxmox_snapshot.yml -e "snapshot_action=rollback snapshot_rollback_vmid=100"
 # =============================================================================
 - name: "Proxmox | VM Snapshot Management"
   hosts: proxmox_cluster
   gather_facts: true
   run_once: true
   vars:
     snapshot_action: create
     snapshot_prefix: "auto_pre"
     snapshot_description: "Pre-maintenance snapshot — managed by ansible-msp"
     snapshot_max_age_hours: 72
     snapshot_include_ram: false         # include RAM state in snapshot (slower, more disk)
     snapshot_target_vmids: []           # empty = all running VMs
     snapshot_exclude_tags:
       - nosnap
       - nosnapshot
     snapshot_rollback_vmid: ""          # required for rollback action
     # API connection defaults.
     # A play var must not reference itself ("{{ api_port | default(...) }}"
     # fails with a recursive template loop), so the inventory-level value is
     # read through hostvars, which does not include play vars.
     api_port: "{{ hostvars[inventory_hostname]['api_port'] | default(8006) }}"
     validate_certs: "{{ hostvars[inventory_hostname]['validate_certs'] | default(false) }}"
   pre_tasks:
     - name: "Snapshot | Validate action"
       ansible.builtin.fail:
         msg: >-
           Invalid snapshot_action '{{ snapshot_action }}'.
           Must be one of: create, verify, cleanup, rollback.
       when: snapshot_action not in ['create', 'verify', 'cleanup', 'rollback']
     - name: "Snapshot | Validate rollback — VMID required"
       ansible.builtin.fail:
         msg: "snapshot_rollback_vmid is required for rollback action."
       when:
         - snapshot_action == 'rollback'
         - snapshot_rollback_vmid == ''
     # Name format: <prefix>_<YYYYMMDD>_<HHMM>, built from the gathered
     # date/time facts.
     - name: "Snapshot | Set snapshot name"
       ansible.builtin.set_fact:
         snapshot_name: "{{ snapshot_prefix }}_{{ ansible_date_time.date | replace('-','') }}_{{ ansible_date_time.hour }}{{ ansible_date_time.minute }}"
       when: snapshot_action == 'create'
     # Summary line; action-specific fields are printed only for that action.
     - name: "Snapshot | Log operation"
       ansible.builtin.debug:
         msg: >-
           Snapshot {{ snapshot_action }} —
           client={{ client_name | default('Unknown') }}
           {% if snapshot_action == 'create' %}name={{ snapshot_name }}{% endif %}
           {% if snapshot_action == 'cleanup' %}max_age={{ snapshot_max_age_hours }}h{% endif %}
           {% if snapshot_action == 'rollback' %}vmid={{ snapshot_rollback_vmid }}{% endif %}
   roles:
     - role: proxmox_preflight
  tasks:
    # ── Get all VMs ────────────────────────────────────────────────────────────
    - name: "Snapshot | Get all node info"
      community.proxmox.proxmox_node_info:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port }}"
        validate_certs: "{{ validate_certs }}"
      register: snapshot_nodes
      delegate_to: localhost
    - name: "Snapshot | Get all VMs per node"
      community.proxmox.proxmox_vm_info:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port }}"
        validate_certs: "{{ validate_certs }}"
        node: "{{ item.node }}"
      loop: >-
        {{ snapshot_nodes.proxmox_nodes
           | selectattr('status', 'equalto', 'online')
           | list }}
      loop_control:
        label: "{{ item.node }}"
      register: snapshot_vms_per_node
      delegate_to: localhost
    - name: "Snapshot | Build VM list"
      ansible.builtin.set_fact:
        snapshot_all_vms: >-
          {{ snapshot_vms_per_node.results
             | map(attribute='proxmox_vms')
             | flatten
             | rejectattr('template', 'equalto', true)
             | selectattr('type', 'equalto', 'qemu')
             | list }}
      delegate_to: localhost
    - name: "Snapshot | Filter by VMID list"
      ansible.builtin.set_fact:
        snapshot_target_vms: >-
          {{ snapshot_all_vms
             | selectattr('vmid', 'in', snapshot_target_vmids)
             | list }}
      when: snapshot_target_vmids | length > 0
      delegate_to: localhost
    - name: "Snapshot | Filter running VMs (no VMID filter)"
      ansible.builtin.set_fact:
        snapshot_target_vms: >-
          {{ snapshot_all_vms
             | selectattr('status', 'equalto', 'running')
             | rejectattr('tags', 'defined')
             | list
             + snapshot_all_vms
             | selectattr('status', 'equalto', 'running')
             | selectattr('tags', 'defined')
             | rejectattr('tags', 'search', snapshot_exclude_tags | join('|'))
             | list }}
      when: snapshot_target_vmids | length == 0
      delegate_to: localhost
    # ── CREATE ─────────────────────────────────────────────────────────────────
    - name: "Snapshot | CREATE | Log plan"
      ansible.builtin.debug:
        msg: >-
          Creating snapshot '{{ snapshot_name }}' for
          {{ snapshot_target_vms | length }} VM(s):
          {{ snapshot_target_vms | map(attribute='name') | list }}
      when: snapshot_action == 'create'
    - name: "Snapshot | CREATE | Take snapshots"
      community.proxmox.proxmox_snap:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port }}"
        validate_certs: "{{ validate_certs }}"
        vmid: "{{ item.vmid }}"
        snapname: "{{ snapshot_name }}"
        description: "{{ snapshot_description }}"
        vmstate: "{{ snapshot_include_ram }}"
        state: present
      loop: "{{ snapshot_target_vms }}"
      loop_control:
        label: "{{ item.name }} (VMID {{ item.vmid }}) on {{ item.node }}"
      delegate_to: localhost
      when: snapshot_action == 'create'
    - name: "Snapshot | CREATE | Complete"
      ansible.builtin.debug:
        msg: "✓ Snapshots created: '{{ snapshot_name }}' on {{ snapshot_target_vms | length }} VM(s)."
      when: snapshot_action == 'create'
    # ── VERIFY ─────────────────────────────────────────────────────────────────
    - name: "Snapshot | VERIFY | Check snapshots exist"
      ansible.builtin.command: >
        qm listsnapshot {{ item.vmid }}
      loop: "{{ snapshot_target_vms }}"
      loop_control:
        label: "{{ item.name }} (VMID {{ item.vmid }})"
      register: snapshot_verify_results
      changed_when: false
      delegate_to: "{{ item.node }}"
      when: snapshot_action == 'verify'
    - name: "Snapshot | VERIFY | Report"
      ansible.builtin.debug:
        msg: >-
          {{ item.item.name }} (VMID {{ item.item.vmid }}):
          {{ 'HAS snapshot' if snapshot_prefix in item.stdout else 'NO automation snapshot found' }}
      loop: "{{ snapshot_verify_results.results | default([]) }}"
      loop_control:
        label: "{{ item.item.name | default(item.item.vmid) }}"
      when: snapshot_action == 'verify'
    # ── CLEANUP ────────────────────────────────────────────────────────────────
    - name: "Snapshot | CLEANUP | Remove old snapshots"
      ansible.builtin.shell: |
        cutoff=$(date -d "{{ snapshot_max_age_hours }} hours ago" +%s)
        for snap in $(qm listsnapshot {{ item.vmid }} 2>/dev/null | grep "{{ snapshot_prefix }}" | awk '{print $2}'); do
          snap_date=$(echo $snap | sed 's/{{ snapshot_prefix }}_//' | sed 's/_[0-9]*$//')
          snap_epoch=$(date -d "${snap_date:0:4}-${snap_date:4:2}-${snap_date:6:2}" +%s 2>/dev/null || echo 0)
          if [ "$snap_epoch" -lt "$cutoff" ]; then
            echo "Removing snapshot: $snap from VMID {{ item.vmid }}"
            qm delsnapshot {{ item.vmid }} $snap
          fi
        done
      loop: "{{ snapshot_target_vms }}"
      loop_control:
        label: "{{ item.name }} (VMID {{ item.vmid }})"
      changed_when: true
      register: snapshot_cleanup_result
      delegate_to: "{{ item.node }}"
      when: snapshot_action == 'cleanup'
    - name: "Snapshot | CLEANUP | Complete"
      ansible.builtin.debug:
        msg: "✓ Snapshot cleanup complete — removed snapshots older than {{ snapshot_max_age_hours }} hours."
      when: snapshot_action == 'cleanup'
    # ── ROLLBACK ───────────────────────────────────────────────────────────────
    - name: "Snapshot | ROLLBACK | Find most recent automation snapshot"
      ansible.builtin.shell: >
        qm listsnapshot {{ snapshot_rollback_vmid }} 2>/dev/null
        | grep "{{ snapshot_prefix }}"
        | awk '{print $2}'
        | sort -r
        | head -1
      register: snapshot_rollback_name
      changed_when: false
      delegate_to: >-
        {{ (snapshot_all_vms
            | selectattr('vmid', 'equalto', snapshot_rollback_vmid | int)
            | map(attribute='node')
            | first) }}
      when: snapshot_action == 'rollback'
    - name: "Snapshot | ROLLBACK | Fail if no snapshot found"
      ansible.builtin.fail:
        msg: >-
          No automation snapshot found for VMID {{ snapshot_rollback_vmid }}.
          Run snapshot_action=create first.
      when:
        - snapshot_action == 'rollback'
        - snapshot_rollback_name.stdout | trim == ''
    - name: "Snapshot | ROLLBACK | Stop VM before rollback"
      community.proxmox.proxmox_kvm:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port }}"
        validate_certs: "{{ validate_certs }}"
        vmid: "{{ snapshot_rollback_vmid }}"
        state: stopped
        force: true
        timeout: 60
      delegate_to: localhost
      when: snapshot_action == 'rollback'
    - name: "Snapshot | ROLLBACK | Execute rollback"
      ansible.builtin.command: >
        qm rollback {{ snapshot_rollback_vmid }} {{ snapshot_rollback_name.stdout | trim }}
      changed_when: true
      delegate_to: >-
        {{ (snapshot_all_vms
            | selectattr('vmid', 'equalto', snapshot_rollback_vmid | int)
            | map(attribute='node')
            | first) }}
      when: snapshot_action == 'rollback'
    - name: "Snapshot | ROLLBACK | Start VM after rollback"
      community.proxmox.proxmox_kvm:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port }}"
        validate_certs: "{{ validate_certs }}"
        vmid: "{{ snapshot_rollback_vmid }}"
        state: started
      delegate_to: localhost
      when: snapshot_action == 'rollback'
    - name: "Snapshot | ROLLBACK | Complete"
      ansible.builtin.debug:
        msg: >-
          ✓ VMID {{ snapshot_rollback_vmid }} rolled back to
          '{{ snapshot_rollback_name.stdout | trim }}'.
      when: snapshot_action == 'rollback'
--- a/playbooks/proxmox_status.yml
+++ b/playbooks/proxmox_status.yml
@@ -0,0 +1,23 @@
 ---
 # =============================================================================
 # proxmox_status.yml
 # Cluster health report — nodes, VMs, storage, CEPH, HA, updates.
 # Safe to run at any time with no side effects.
 #
 # Usage:
 #   ansible-playbook proxmox_status.yml
 #   ansible-playbook proxmox_status.yml -e "status_include_ceph=false"
 # =============================================================================
 - name: "Proxmox | Cluster Status Report"
  hosts: proxmox_cluster
  gather_facts: true
  vars:
    status_include_vms: true
    status_include_storage: true
    status_include_ceph: true
    status_include_ha: true
  roles:
    - role: proxmox_status
--- a/playbooks/proxmox_upgrade.yml
+++ b/playbooks/proxmox_upgrade.yml
@@ -1,43 +1,83 @@
 ---
 # =============================================================================
 # proxmox_upgrade.yml
-# =============================================================================
+# Rolling Proxmox upgrade orchestrator.
 # Rolling Proxmox cluster upgrade playbook.
 # Runs on the first node in upgrade_order — all other nodes are handled
 # via API calls and delegate_to from within the role.
 #
-# Usage:
+# Workflow per node (cluster mode):
-#   ansible-playbook playbooks/proxmox_upgrade.yml \
+#   1. Backup config
-#     -i inventories/client_local_eng/hypervisor_hosts.yml
+#   2. Set CEPH noout (if CEPH enabled)
 #   3. Enable HA maintenance mode
 #   4. Drain guests to best available node
 #   5. apt dist-upgrade
 #   6. Reboot if required, wait for rejoin
 #   7. Clear CEPH noout
 #   8. Disable HA maintenance mode
 #   9. Restore guests (if migration_restore=true)
 #
-# Override migration behaviour:
+# Standalone mode skips all cluster/HA/CEPH/drain steps.
 #   -e migration_bulk=true
 #   -e live_migrate_fallback=skip
 #   -e migration_restore=true
 #
-# Dry run (check mode — no changes):
+# Variables (set in inventory or pass with -e):
-#   --check
+#   upgrade_order           — ordered list of nodes to upgrade (cluster only)
 #   migration_restore       — return VMs to origin node after upgrade (default: false)
 #   drain_target_strategy   — resources | explicit (default: resources)
 #   backup_destination      — local | sftp (default: local)
 # =============================================================================
- name: Proxmox Rolling Upgrade
+- name: "Proxmox Rolling Upgrade"
  hosts: proxmox_cluster
  gather_facts: true
-  serial: 1
+  run_once: true        # Play runs once — loops over nodes internally
-  run_once: true
+  serial: 1             # Safety: only one Ansible host processes at a time
  vars:
    migration_restore: false
    upgrade_order: "{{ groups['proxmox_cluster'] | sort }}"
  pre_tasks:
-    - name: Confirm upgrade_order is defined
+    - name: "Upgrade | Confirm upgrade_order is defined"
      ansible.builtin.fail:
-        msg: "upgrade_order must be defined in hypervisor_hosts.yml"
+        msg: "upgrade_order must be defined in inventory or passed with -e"
      when: upgrade_order is not defined or upgrade_order | length == 0
-    - name: Log upgrade targets
+    - name: "Upgrade | Log targets"
      ansible.builtin.debug:
        msg: >-
-          Proxmox upgrade starting for {{ client_name }} ({{ client_id }})
+          Proxmox upgrade starting for {{ client_name | default('Unknown') }}
          ({{ client_id | default('?') }})
          Nodes: {{ upgrade_order | join(', ') }}
          API: https://{{ api_host }}:{{ api_port }}
  roles:
-    - proxmox_upgrade
+    - role: proxmox_preflight
  tasks:
    # ── Cluster: loop through each node ────────────────────────────────────────
    - name: "Upgrade | Rolling upgrade — cluster mode"
      ansible.builtin.include_tasks: tasks/proxmox_upgrade_node_loop.yml
      loop: "{{ upgrade_order }}"
      loop_control:
        loop_var: current_node
        label: "{{ current_node }}"
      when: proxmox_is_cluster
    # ── Standalone: upgrade this host directly ────────────────────────────────
    - name: "Upgrade | Standalone | Backup config"
      ansible.builtin.include_role:
        name: proxmox_config_backup
      vars:
        current_node: "{{ inventory_hostname }}"
      when: not proxmox_is_cluster
    - name: "Upgrade | Standalone | Run upgrade"
      ansible.builtin.include_role:
        name: proxmox_upgrade_node
      vars:
        current_node: "{{ inventory_hostname }}"
      when: not proxmox_is_cluster
    - name: "Upgrade | Complete"
      ansible.builtin.debug:
        msg: >-
          ✓ Proxmox upgrade complete for
          {{ client_name | default('Unknown') }} —
          {{ upgrade_order | length }} node(s) upgraded.
--- a/playbooks/tasks/proxmox_reboot_node_loop.yml
+++ b/playbooks/tasks/proxmox_reboot_node_loop.yml
@@ -0,0 +1,82 @@
 ---
 # =============================================================================
 # proxmox_reboot_node_loop.yml
 # Called once per node by proxmox_reboot.yml
 # =============================================================================
 - name: "Reboot | {{ current_node }} | Start"
  ansible.builtin.debug:
    msg: "━━━ Starting reboot of node {{ current_node }} ━━━"
 - name: "Reboot | {{ current_node }} | Set CEPH noout"
  ansible.builtin.include_role:
    name: proxmox_ceph
  vars:
    ceph_action: set_noout
 - name: "Reboot | {{ current_node }} | Enable HA maintenance"
  ansible.builtin.include_role:
    name: proxmox_ha
  vars:
    ha_action: disable
 - name: "Reboot | {{ current_node }} | Drain guests"
  ansible.builtin.include_role:
    name: proxmox_drain
 - name: "Reboot | {{ current_node }} | Reboot"
  ansible.builtin.reboot:
    reboot_timeout: "{{ reboot_timeout }}"
    msg: "{{ reboot_reason }}"
    pre_reboot_delay: 5
    post_reboot_delay: 15
  delegate_to: "{{ current_node }}"
 # Poll the Proxmox API until current_node reports status "online" again.
 # While the node (or the API endpoint itself) is still coming back, the module
 # call can fail and return a result without a `proxmox_nodes` key. Defaulting
 # to an empty list keeps the `until` expression evaluable, so the retry loop
 # keeps polling instead of aborting on an undefined-variable error.
 - name: "Reboot | {{ current_node }} | Wait for cluster rejoin"
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    api_port: "{{ api_port | default(8006) }}"
    validate_certs: "{{ validate_certs | default(false) }}"
  register: reboot_rejoin_check
  delegate_to: localhost
  until: >-
    reboot_rejoin_check.proxmox_nodes | default([])
    | selectattr('node', 'equalto', current_node)
    | selectattr('status', 'equalto', 'online')
    | list
    | length > 0
  retries: "{{ node_rejoin_retries }}"
  delay: "{{ node_rejoin_delay }}"
 - name: "Reboot | {{ current_node }} | Back online"
  ansible.builtin.debug:
    msg: "✓ Node {{ current_node }} has rejoined the cluster after reboot."
 - name: "Reboot | {{ current_node }} | Clear CEPH noout"
  ansible.builtin.include_role:
    name: proxmox_ceph
  vars:
    ceph_action: clear_noout
 - name: "Reboot | {{ current_node }} | Disable HA maintenance"
  ansible.builtin.include_role:
    name: proxmox_ha
  vars:
    ha_action: enable
 - name: "Reboot | {{ current_node }} | Restore guests"
  ansible.builtin.include_role:
    name: proxmox_restore
  when: migration_restore | bool
 - name: "Reboot | {{ current_node }} | Skip restore"
  ansible.builtin.debug:
    msg: "migration_restore=false — guests remain on their current nodes."
  when: not migration_restore | bool
 - name: "Reboot | {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: "━━━ Reboot complete: {{ current_node }} ━━━"
--- a/playbooks/tasks/proxmox_upgrade_node_loop.yml
+++ b/playbooks/tasks/proxmox_upgrade_node_loop.yml
@@ -0,0 +1,68 @@
 ---
 # =============================================================================
 # proxmox_upgrade_node_loop.yml
 # Called once per node by proxmox_upgrade.yml.
 # Handles the full per-node upgrade pipeline in cluster mode.
 # =============================================================================
 - name: "Node {{ current_node }} | Start"
  ansible.builtin.debug:
    msg: "━━━ Starting upgrade of node {{ current_node }} ━━━"
 # ── Step 1: Backup config ─────────────────────────────────────────────────────
 - name: "Node {{ current_node }} | Step 1 — Backup config"
  ansible.builtin.include_role:
    name: proxmox_config_backup
 # ── Step 2: CEPH noout ────────────────────────────────────────────────────────
 - name: "Node {{ current_node }} | Step 2 — Set CEPH noout"
  ansible.builtin.include_role:
    name: proxmox_ceph
  vars:
    ceph_action: set_noout
 # ── Step 3: HA maintenance mode ───────────────────────────────────────────────
 - name: "Node {{ current_node }} | Step 3 — Enable HA maintenance"
  ansible.builtin.include_role:
    name: proxmox_ha
  vars:
    ha_action: disable
 # ── Step 4: Drain guests ──────────────────────────────────────────────────────
 - name: "Node {{ current_node }} | Step 4 — Drain guests"
  ansible.builtin.include_role:
    name: proxmox_drain
 # ── Step 5: Upgrade ───────────────────────────────────────────────────────────
 - name: "Node {{ current_node }} | Step 5 — Upgrade packages"
  ansible.builtin.include_role:
    name: proxmox_upgrade_node
 # ── Step 6: Clear CEPH noout ──────────────────────────────────────────────────
 - name: "Node {{ current_node }} | Step 6 — Clear CEPH noout"
  ansible.builtin.include_role:
    name: proxmox_ceph
  vars:
    ceph_action: clear_noout
 # ── Step 7: Resume HA management ─────────────────────────────────────────────
 - name: "Node {{ current_node }} | Step 7 — Disable HA maintenance"
  ansible.builtin.include_role:
    name: proxmox_ha
  vars:
    ha_action: enable
 # ── Step 8: Restore guests (optional) ────────────────────────────────────────
 - name: "Node {{ current_node }} | Step 8 — Restore guests"
  ansible.builtin.include_role:
    name: proxmox_restore
  when: migration_restore | bool
 - name: "Node {{ current_node }} | Skip restore"
  ansible.builtin.debug:
    msg: "migration_restore=false — guests remain on their current nodes."
  when: not migration_restore | bool
 - name: "Node {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: "━━━ Node {{ current_node }} upgrade complete ━━━"
--- a/roles/proxmox_ceph/defaults/main.yml
+++ b/roles/proxmox_ceph/defaults/main.yml
@@ -0,0 +1,18 @@
 ---
 # =============================================================================
 # proxmox_ceph — defaults
 # =============================================================================
 # Action: set_noout | clear_noout | status | check_health
 ceph_action: status
 # Health check settings
 ceph_health_timeout: 300        # seconds to wait for HEALTH_OK
 ceph_health_retries: 30
 ceph_health_delay: 10
 # Abort upgrade if CEPH is in error state
 ceph_abort_on_error: true
 # Warn but continue if CEPH has warnings
 ceph_warn_on_warning: true
--- a/roles/proxmox_ceph/meta/main.yml
+++ b/roles/proxmox_ceph/meta/main.yml
@@ -0,0 +1,11 @@
 ---
 galaxy_info:
  role_name: proxmox_ceph
  author: ansible-msp
  description: "MSP Proxmox automation — proxmox_ceph"
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions:
        - bookworm
 dependencies: []
--- a/roles/proxmox_ceph/tasks/main.yml
+++ b/roles/proxmox_ceph/tasks/main.yml
@@ -0,0 +1,140 @@
 ---
 # =============================================================================
 # proxmox_ceph — tasks
 # Manages CEPH noout flag and health checks during maintenance.
 # Skips gracefully if CEPH is not configured on this cluster.
 # =============================================================================
 # ── Detect CEPH ───────────────────────────────────────────────────────────────
 - name: "CEPH | Detect if CEPH is configured"
  ansible.builtin.command: ceph status
  register: ceph_detect
  changed_when: false
  failed_when: false
  run_once: true
 - name: "CEPH | Set CEPH enabled fact"
  ansible.builtin.set_fact:
    ceph_is_enabled: "{{ ceph_detect.rc == 0 }}"
  run_once: true
 - name: "CEPH | Skip — CEPH not configured on this cluster"
  ansible.builtin.debug:
    msg: "CEPH is not configured on this cluster — skipping all CEPH tasks."
  when: not ceph_is_enabled
  run_once: true
 # ── CEPH status ───────────────────────────────────────────────────────────────
 - name: "CEPH | Get cluster status"
  ansible.builtin.command: ceph status --format json
  register: ceph_status_raw
  changed_when: false
  run_once: true
  when: ceph_is_enabled
 - name: "CEPH | Parse status"
  ansible.builtin.set_fact:
    ceph_status: "{{ ceph_status_raw.stdout | from_json }}"
  run_once: true
  when: ceph_is_enabled
 - name: "CEPH | Log health"
  ansible.builtin.debug:
    msg: "CEPH health: {{ ceph_status.health.status }}"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'status'
 # ── Health check ──────────────────────────────────────────────────────────────
 # Abort when the parsed `ceph status` reports HEALTH_ERR.
 # `| bool` is required on the toggle: a value passed as
 # `-e ceph_abort_on_error=false` arrives as the non-empty string "false",
 # which a bare conditional treats as true, so the override advertised in the
 # message below would otherwise have no effect.
 - name: "CEPH | Check health | Abort if HEALTH_ERR"
  ansible.builtin.fail:
    msg: >-
      CEPH is in HEALTH_ERR state — aborting to prevent data loss.
      Run 'ceph status' to investigate. Set ceph_abort_on_error=false to override.
  when:
    - ceph_is_enabled
    - ceph_abort_on_error | bool
    - ceph_status.health.status == 'HEALTH_ERR'
  run_once: true
 # Print a warning (and the names of the failing health checks) on HEALTH_WARN,
 # then carry on. Same `| bool` reasoning as above for the suppress toggle.
 - name: "CEPH | Check health | Warn on HEALTH_WARN"
  ansible.builtin.debug:
    msg: >-
      WARNING: CEPH is in HEALTH_WARN state.
      Proceeding — set ceph_warn_on_warning=false to suppress this message.
      Checks: {{ ceph_status.health.checks | default({}) | dict2items | map(attribute='key') | list }}
  when:
    - ceph_is_enabled
    - ceph_warn_on_warning | bool
    - ceph_status.health.status == 'HEALTH_WARN'
  run_once: true
 # ── Set noout ─────────────────────────────────────────────────────────────────
 - name: "CEPH | Set noout flag"
  ansible.builtin.command: ceph osd set noout
  changed_when: true
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'set_noout'
 - name: "CEPH | Confirm noout set"
  ansible.builtin.debug:
    msg: "✓ CEPH noout flag SET — OSDs will not be marked out during maintenance."
  when:
    - ceph_is_enabled
    - ceph_action == 'set_noout'
  run_once: true
 # ── Clear noout ───────────────────────────────────────────────────────────────
 - name: "CEPH | Clear noout flag"
  ansible.builtin.command: ceph osd unset noout
  changed_when: true
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'clear_noout'
 - name: "CEPH | Wait for HEALTH_OK after clearing noout"
  ansible.builtin.command: ceph status --format json
  register: ceph_recovery_check
  changed_when: false
  until: "(ceph_recovery_check.stdout | from_json).health.status in ['HEALTH_OK', 'HEALTH_WARN']"
  retries: "{{ ceph_health_retries }}"
  delay: "{{ ceph_health_delay }}"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'clear_noout'
 - name: "CEPH | Log recovery status"
  ansible.builtin.debug:
    msg: >-
      ✓ CEPH noout CLEARED —
      health: {{ (ceph_recovery_check.stdout | from_json).health.status }}
  when:
    - ceph_is_enabled
    - ceph_action == 'clear_noout'
  run_once: true
 # ── check_health action ───────────────────────────────────────────────────────
 - name: "CEPH | Wait for healthy state"
  ansible.builtin.command: ceph status --format json
  register: ceph_health_wait
  changed_when: false
  until: "(ceph_health_wait.stdout | from_json).health.status in ['HEALTH_OK', 'HEALTH_WARN']"
  retries: "{{ ceph_health_retries }}"
  delay: "{{ ceph_health_delay }}"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'check_health'
 - name: "CEPH | Health check result"
  ansible.builtin.debug:
    msg: "CEPH health: {{ (ceph_health_wait.stdout | from_json).health.status }}"
  when:
    - ceph_is_enabled
    - ceph_action == 'check_health'
  run_once: true
--- a/roles/proxmox_config_backup/defaults/main.yml
+++ b/roles/proxmox_config_backup/defaults/main.yml
@@ -0,0 +1,33 @@
 ---
 # =============================================================================
 # proxmox_config_backup — defaults
 # =============================================================================
 # Backup destination: local | sftp
 # git destination removed pending secure implementation (TODO: encrypt secrets)
 backup_destination: local
 # Local backup settings
 backup_local_dir: /var/backups/proxmox-config
 backup_local_keep: 10
 # SFTP settings (used when backup_destination: sftp)
 backup_sftp_host: ""
 backup_sftp_user: ""
 backup_sftp_key: ""
 backup_sftp_remote_dir: "/backups/proxmox"
 # What to include in the backup tarball
 backup_paths_proxmox:
  - /etc/pve
  - /etc/network/interfaces
  - /etc/hosts
  - /etc/hostname
  - /etc/apt/sources.list
  - /etc/apt/sources.list.d
 backup_paths_xcpng:
  - /etc/xcp-ng
  - /etc/network/interfaces
  - /etc/hosts
  - /etc/hostname
--- a/roles/proxmox_config_backup/meta/main.yml
+++ b/roles/proxmox_config_backup/meta/main.yml
@@ -0,0 +1,11 @@
 ---
 galaxy_info:
  role_name: proxmox_config_backup
  author: ansible-msp
  description: "MSP Proxmox automation — proxmox_config_backup"
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions:
        - bookworm
 dependencies: []
--- a/roles/proxmox_config_backup/tasks/main.yml
+++ b/roles/proxmox_config_backup/tasks/main.yml
@@ -0,0 +1,113 @@
 ---
 # =============================================================================
 # proxmox_config_backup — tasks
 # Creates a tarball of critical Proxmox config files and stores it locally
 # or transfers via SFTP. Git destination is a TODO pending secure handling
 # of sensitive files (SSL keys, shadow.cfg, etc).
 #
 # Required vars:
 #   current_node — the node being backed up (for filename)
 #   client_id    — client identifier (for filename)
 # =============================================================================
 - name: "Backup | {{ current_node }} | Gather date/time facts"
  ansible.builtin.setup:
    gather_subset:
      - date_time
  when: ansible_date_time is not defined
 - name: "Backup | {{ current_node }} | Set backup filename"
  ansible.builtin.set_fact:
    backup_filename: >-
      proxmox_{{ client_id | lower | replace('-', '_') }}_{{ current_node }}_{{ ansible_date_time.date }}
 - name: "Backup | {{ current_node }} | Set backup paths"
  ansible.builtin.set_fact:
    backup_paths: "{{ backup_paths_proxmox }}"
 # ── Create tarball on node ────────────────────────────────────────────────────
 # Build a gzip tarball of backup_paths under /tmp on the host this task runs
 # on. The archive contains /etc/pve (keys and credentials), so:
 #   - `umask 077` makes the new file owner-only instead of world-readable;
 #   - `rm -f` first ensures a stale file with looser permissions under the
 #     same predictable name is not simply truncated and reused.
 # Archive creation stays best-effort (`|| true`), as before: unreadable or
 # missing paths must not abort the run, and the following copy/fetch task
 # fails if no archive was produced at all.
 # NOTE(review): this runs on the play host, not on current_node — confirm the
 # caller targets/delegates to the node whose name ends up in the filename.
 - name: "Backup | {{ current_node }} | Create config tarball"
  ansible.builtin.shell: |
    umask 077
    rm -f /tmp/{{ backup_filename }}.tar.gz
    tar czf /tmp/{{ backup_filename }}.tar.gz \
      --ignore-failed-read \
      --dereference \
      {{ backup_paths | join(' ') }} 2>/dev/null || true
    echo "done"
  changed_when: true
  register: backup_tarball
 # ── Local backup ──────────────────────────────────────────────────────────────
 - name: "Backup | {{ current_node }} | Local | Ensure backup dir exists"
  ansible.builtin.file:
    path: "{{ backup_local_dir }}"
    state: directory
    mode: '0700'
  when: backup_destination == 'local'
 - name: "Backup | {{ current_node }} | Local | Move tarball to backup dir"
  ansible.builtin.copy:
    src: "/tmp/{{ backup_filename }}.tar.gz"
    dest: "{{ backup_local_dir }}/{{ backup_filename }}.tar.gz"
    remote_src: true
    mode: '0600'
  when: backup_destination == 'local'
 - name: "Backup | {{ current_node }} | Local | Rotate old backups"
  ansible.builtin.shell: |
    ls -1t {{ backup_local_dir }}/proxmox_*_{{ current_node }}_*.tar.gz 2>/dev/null \
      | tail -n +{{ (backup_local_keep | int) + 1 }} \
      | xargs -r rm -f
  changed_when: false
  when: backup_destination == 'local'
 - name: "Backup | {{ current_node }} | Local | Log result"
  ansible.builtin.debug:
    msg: "✓ Config backed up locally: {{ backup_local_dir }}/{{ backup_filename }}.tar.gz"
  when: backup_destination == 'local'
 # ── SFTP backup ───────────────────────────────────────────────────────────────
 - name: "Backup | {{ current_node }} | SFTP | Validate required vars"
  ansible.builtin.fail:
    msg: "SFTP backup requires backup_sftp_host and backup_sftp_user to be set."
  when:
    - backup_destination == 'sftp'
    - backup_sftp_host == '' or backup_sftp_user == ''
 - name: "Backup | {{ current_node }} | SFTP | Fetch tarball to controller"
  ansible.builtin.fetch:
    src: "/tmp/{{ backup_filename }}.tar.gz"
    dest: "/tmp/{{ backup_filename }}.tar.gz"
    flat: true
  when: backup_destination == 'sftp'
 # Upload the fetched tarball from the controller to the SFTP host.
 # `-b -` puts sftp in batch mode reading commands from stdin: it then aborts
 # with a non-zero exit status as soon as `cd` or `put` fails. Without it,
 # sftp ignores failed commands from a heredoc and exits 0, so a failed upload
 # would look successful and the local copy would be deleted afterwards.
 # NOTE(review): StrictHostKeyChecking=no disables host-key verification for
 # the backup target; consider pinning the key in known_hosts instead.
 - name: "Backup | {{ current_node }} | SFTP | Transfer to remote host"
  ansible.builtin.shell: |
    sftp_opts="-o StrictHostKeyChecking=no -o BatchMode=yes"
    {% if backup_sftp_key != '' %}
    sftp_opts="$sftp_opts -i {{ backup_sftp_key }}"
    {% endif %}
    sftp -b - $sftp_opts {{ backup_sftp_user }}@{{ backup_sftp_host }} << EOF
    cd {{ backup_sftp_remote_dir }}
    put /tmp/{{ backup_filename }}.tar.gz
    EOF
  delegate_to: localhost
  changed_when: true
  when: backup_destination == 'sftp'
 - name: "Backup | {{ current_node }} | SFTP | Clean up local temp tarball"
  ansible.builtin.file:
    path: "/tmp/{{ backup_filename }}.tar.gz"
    state: absent
  delegate_to: localhost
  when: backup_destination == 'sftp'
 - name: "Backup | {{ current_node }} | SFTP | Log result"
  ansible.builtin.debug:
    msg: "✓ Config backed up via SFTP: {{ backup_sftp_host }}:{{ backup_sftp_remote_dir }}/{{ backup_filename }}.tar.gz"
  when: backup_destination == 'sftp'
 # ── Cleanup ───────────────────────────────────────────────────────────────────
 - name: "Backup | {{ current_node }} | Clean up temp tarball on node"
  ansible.builtin.file:
    path: "/tmp/{{ backup_filename }}.tar.gz"
    state: absent
--- a/roles/proxmox_drain/defaults/main.yml
+++ b/roles/proxmox_drain/defaults/main.yml
@@ -0,0 +1,34 @@
 ---
 # =============================================================================
 # proxmox_drain — defaults
 # =============================================================================
 # Target selection strategy: resources | explicit
 # resources = pick node with most available mem+cpu
 # explicit   = use drain_target_node variable
 drain_target_strategy: resources
 # Explicit target node (only used when drain_target_strategy: explicit)
 drain_target_node: ""
 # Resource weighting for target scoring (must sum to 1.0)
 drain_score_mem_weight: 0.6
 drain_score_cpu_weight: 0.4
 # Migration behaviour
 drain_online: true               # live migrate running VMs
 drain_shutdown_fallback: true    # shutdown VM if live migrate fails
 drain_vm_shutdown_timeout: 120   # seconds to wait for graceful shutdown
 drain_lxc_restart: true          # restart LXC after migration
 # State file — written to Semaphore host for restore mode
 drain_state_dir: "/tmp/proxmox_drain_state"
 # Filtering — skip VMs matching these tags (comma-separated string in PVE)
 drain_exclude_tags:
  - nomigrate
  - pinned
 # API connection (inherited from inventory)
 api_port: 8006
 validate_certs: false
--- a/roles/proxmox_drain/meta/main.yml
+++ b/roles/proxmox_drain/meta/main.yml
@@ -0,0 +1,11 @@
 ---
 galaxy_info:
  role_name: proxmox_drain
  author: ansible-msp
  description: "MSP Proxmox automation — proxmox_drain"
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions:
        - bookworm
 dependencies: []
--- a/roles/proxmox_drain/tasks/main.yml
+++ b/roles/proxmox_drain/tasks/main.yml
@@ -0,0 +1,217 @@
 ---
 # =============================================================================
 # proxmox_drain — tasks
 # Migrates all VMs/LXCs off current_node to the best available target.
 # Writes a state file so proxmox_restore can return VMs to origin.
 #
 # Required vars:
 #   current_node   — the node being drained
 # =============================================================================
 # ── Discover guests on this node ──────────────────────────────────────────────
 - name: "Drain | {{ current_node }} | Discover guests"
  community.proxmox.proxmox_vm_info:
    api_host: "{{ api_host }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    api_port: "{{ api_port }}"
    validate_certs: "{{ validate_certs }}"
    node: "{{ current_node }}"
  register: drain_node_guests
  delegate_to: localhost
 - name: "Drain | {{ current_node }} | Separate KVM and LXC guests"
  ansible.builtin.set_fact:
    drain_kvm_guests: >-
      {{ drain_node_guests.proxmox_vms
         | selectattr('type', 'equalto', 'qemu')
         | rejectattr('template', 'equalto', true)
         | list }}
    drain_lxc_guests: >-
      {{ drain_node_guests.proxmox_vms
         | selectattr('type', 'equalto', 'lxc')
         | list }}
  delegate_to: localhost
 # Remove guests that carry any tag listed in drain_exclude_tags.
 # Guests without a `tags` attribute are always kept; tagged guests are kept
 # unless one of their tags matches.
 #   - The pattern is anchored on tag delimiters (start/end of string, ';',
 #     ',' or space), so excluding "pinned" no longer also excludes a guest
 #     tagged "unpinned".
 #   - Each tag is regex-escaped before being joined into the alternation.
 #   - The whole task is skipped when the exclude list is empty; an empty
 #     pattern would otherwise match — and drop — every tagged guest.
 - name: "Drain | {{ current_node }} | Filter excluded tags"
  ansible.builtin.set_fact:
    drain_kvm_guests: >-
      {{ (drain_kvm_guests
          | rejectattr('tags', 'defined')
          | list)
         + (drain_kvm_guests
            | selectattr('tags', 'defined')
            | rejectattr('tags', 'search', drain_exclude_regex)
            | list) }}
    drain_lxc_guests: >-
      {{ (drain_lxc_guests
          | rejectattr('tags', 'defined')
          | list)
         + (drain_lxc_guests
            | selectattr('tags', 'defined')
            | rejectattr('tags', 'search', drain_exclude_regex)
            | list) }}
  vars:
    drain_exclude_regex: >-
      (^|[;, ])({{ drain_exclude_tags | map('regex_escape') | join('|') }})($|[;, ])
  delegate_to: localhost
  when: drain_exclude_tags | default([]) | length > 0
 - name: "Drain | {{ current_node }} | Log guest inventory"
  ansible.builtin.debug:
    msg: >-
      {{ current_node }} has
      {{ drain_kvm_guests | length }} KVM guest(s) and
      {{ drain_lxc_guests | length }} LXC guest(s) to migrate.
      VMIDs: {{ (drain_kvm_guests + drain_lxc_guests) | map(attribute='vmid') | list }}
 # ── Skip if nothing to migrate ────────────────────────────────────────────────
 # Report an empty node. Deliberately NOT followed by `meta: end_play`: this
 # role is included from per-node loops, and ending the play here would
 # silently cancel the remaining steps for this node and for every node still
 # queued. The migration tasks further down loop over the (empty) guest
 # lists, so they do nothing for an empty node.
 - name: "Drain | {{ current_node }} | Skip — no guests to migrate"
  ansible.builtin.debug:
    msg: "Node {{ current_node }} has no guests — skipping drain."
  when:
    - drain_kvm_guests | length == 0
    - drain_lxc_guests | length == 0
 # ── Select migration target ───────────────────────────────────────────────────
 - name: "Drain | {{ current_node }} | Get all node resource info"
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    api_port: "{{ api_port }}"
    validate_certs: "{{ validate_certs }}"
  register: drain_all_nodes
  delegate_to: localhost
  when: drain_target_strategy == 'resources'
 # Build a list of candidate target nodes, best score first.
 # Candidates are every node reported "online" other than current_node.
 #   free_mem = maxmem - mem
 #   free_cpu = 1.0 - cpu            (cpu defaults to 0 when absent)
 #   score    = free_mem * mem_weight + free_cpu * 1e9 * cpu_weight
 # Every Jinja statement uses whitespace control ({%- ... -%}) so the rendered
 # value is exactly the list literal. Any leading whitespace would leave the
 # result as a plain string instead of a list, breaking `| first` downstream.
 - name: "Drain | {{ current_node }} | Score nodes by available resources"
  ansible.builtin.set_fact:
    drain_scored_nodes: >-
      {%- set candidates = [] -%}
      {%- for node in drain_all_nodes.proxmox_nodes -%}
      {%- if node.status == 'online' and node.node != current_node -%}
      {%- set free_mem = node.maxmem - node.mem -%}
      {%- set free_cpu = 1.0 - (node.cpu | default(0)) -%}
      {%- set score = (free_mem * drain_score_mem_weight | float) + (free_cpu * 1000000000 * drain_score_cpu_weight | float) -%}
      {%- set _ = candidates.append({'node': node.node, 'score': score, 'free_mem': free_mem, 'free_cpu': free_cpu}) -%}
      {%- endif -%}
      {%- endfor -%}
      {{ candidates | sort(attribute='score', reverse=true) }}
  delegate_to: localhost
  when: drain_target_strategy == 'resources'
 # Take the top-scored candidate as the migration target.
 # When the candidate list is empty, fall back to {'node': ''} so that
 # drain_target becomes an empty string instead of raising an undefined
 # error on `.node`.
 - name: "Drain | {{ current_node }} | Set migration target (resources)"
  ansible.builtin.set_fact:
    drain_resolved_target: "{{ drain_scored_nodes | first | default({'node': ''}) }}"
    drain_target: "{{ (drain_scored_nodes | first | default({'node': ''})).node }}"
  delegate_to: localhost
  when: drain_target_strategy == 'resources'
 - name: "Drain | {{ current_node }} | Set migration target (explicit)"
  ansible.builtin.set_fact:
    drain_target: "{{ drain_target_node }}"
  delegate_to: localhost
  when: drain_target_strategy == 'explicit'
 - name: "Drain | {{ current_node }} | Fail if no target available"
  ansible.builtin.fail:
    msg: >-
      No valid migration target found for node {{ current_node }}.
      All other nodes may be offline or no nodes configured.
  when: drain_target == ''
  delegate_to: localhost
 - name: "Drain | {{ current_node }} | Log migration target"
  ansible.builtin.debug:
    msg: >-
      Migration target for {{ current_node }}: {{ drain_target }}
      {% if drain_target_strategy == 'resources' %}
      (free_mem={{ (drain_resolved_target.free_mem / 1073741824) | round(1) }}GB,
       free_cpu={{ (drain_resolved_target.free_cpu * 100) | round(1) }}%)
      {% endif %}
 # ── Write state file for restore ──────────────────────────────────────────────
 - name: "Drain | {{ current_node }} | Ensure state directory exists"
  ansible.builtin.file:
    path: "{{ drain_state_dir }}"
    state: directory
    mode: '0700'
  delegate_to: localhost
 # Record every guest about to be migrated, tagged with origin_node, as JSON
 # in drain_state_dir on the controller.
 # The filename timestamp comes from gathered facts when available and falls
 # back to the controller clock in the same YYYYMMDDTHHMMSS layout. The file
 # is therefore always written, rather than being silently skipped when facts
 # were not gathered.
 - name: "Drain | {{ current_node }} | Write VM origin state"
  ansible.builtin.copy:
    content: >-
      {{ (drain_kvm_guests + drain_lxc_guests)
         | map('combine', {'origin_node': current_node})
         | list
         | to_nice_json }}
    dest: "{{ drain_state_dir }}/{{ current_node }}_{{ drain_state_stamp }}.json"
    mode: '0600'
  vars:
    drain_state_stamp: >-
      {{ (ansible_date_time | default({})).iso8601_basic_short
         | default(now(fmt='%Y%m%dT%H%M%S')) }}
  delegate_to: localhost
 # ── Migrate KVM guests ────────────────────────────────────────────────────────
 # Migrate each KVM guest to drain_target with `qm migrate`; running guests
 # get `--online`. A non-zero exit for any guest fails the task.
 # `qm` acts on guests of the local node, so the command is delegated to
 # current_node (the node being drained) rather than run on whichever host
 # the play happens to execute on.
 # NOTE(review): assumes current_node is a reachable inventory hostname, as
 # the reboot task's delegate_to already does — confirm for your inventory.
 - name: "Drain | {{ current_node }} | KVM | Live migrate to {{ drain_target }}"
  ansible.builtin.command: >
    qm migrate {{ item.vmid }} {{ drain_target }}
    {% if item.status == 'running' %}--online{% endif %}
    --with-local-disks 0
  loop: "{{ drain_kvm_guests }}"
  loop_control:
    label: "{{ item.name }} (VMID {{ item.vmid }}) — {{ item.status }}"
  delegate_to: "{{ current_node }}"
  changed_when: true
  register: drain_kvm_results
  failed_when: drain_kvm_results.rc is defined and drain_kvm_results.rc != 0
 - name: "Drain | {{ current_node }} | KVM | Verify guests moved"
  community.proxmox.proxmox_vm_info:
    api_host: "{{ api_host }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    api_port: "{{ api_port }}"
    validate_certs: "{{ validate_certs }}"
    node: "{{ drain_target }}"
  register: drain_verify_guests
  delegate_to: localhost
  when: drain_kvm_guests | length > 0
 - name: "Drain | {{ current_node }} | KVM | Log migration results"
  ansible.builtin.debug:
    msg: >-
      KVM migrations complete —
      {{ drain_kvm_guests | length }} guest(s) moved to {{ drain_target }}.
  when: drain_kvm_guests | length > 0
 # ── Migrate LXC guests ────────────────────────────────────────────────────────
 # Migrate each LXC container to drain_target with `pct migrate`, adding
 # `--restart` when drain_lxc_restart is set. A non-zero exit for any
 # container fails the task.
 # `pct` acts on containers of the local node, so the command is delegated to
 # current_node (the node being drained) rather than run on the play host.
 # NOTE(review): assumes current_node is a reachable inventory hostname, as
 # the reboot task's delegate_to already does — confirm for your inventory.
 - name: "Drain | {{ current_node }} | LXC | Migrate to {{ drain_target }}"
  ansible.builtin.command: >
    pct migrate {{ item.vmid }} {{ drain_target }}
    {% if drain_lxc_restart %}--restart{% endif %}
    --timeout {{ drain_vm_shutdown_timeout }}
  loop: "{{ drain_lxc_guests }}"
  loop_control:
    label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }}) — {{ item.status }}"
  delegate_to: "{{ current_node }}"
  changed_when: true
  register: drain_lxc_results
  failed_when: drain_lxc_results.rc is defined and drain_lxc_results.rc != 0
 - name: "Drain | {{ current_node }} | LXC | Log migration results"
  ansible.builtin.debug:
    msg: >-
      LXC migrations complete —
      {{ drain_lxc_guests | length }} container(s) moved to {{ drain_target }}.
  when: drain_lxc_guests | length > 0
 # ── Final summary ─────────────────────────────────────────────────────────────
 - name: "Drain | {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: >-
      ✓ Node {{ current_node }} drained —
      {{ drain_kvm_guests | length }} KVM +
      {{ drain_lxc_guests | length }} LXC guests migrated to {{ drain_target }}.
--- a/roles/proxmox_ha/defaults/main.yml
+++ b/roles/proxmox_ha/defaults/main.yml
@@ -0,0 +1,17 @@
 ---
 # =============================================================================
 # proxmox_ha — defaults
 # =============================================================================
 # Action: status | disable | enable
 #   status  — print `ha-manager status`
 #   disable — put current_node into HA maintenance mode
 #   enable  — take current_node out of HA maintenance mode
 # (tasks/main.yml has no `migrate` branch, so that value would do nothing.)
 ha_action: status
 # Node to disable/enable HA management for (used with disable/enable):
 # read from `current_node`, which the caller must set — this file defines
 # no default for it.
 # Timeout (seconds) waiting for HA manager to acknowledge maintenance mode;
 # the tasks poll every 5 seconds, so this is divided by 5 to get the retries.
 ha_timeout: 60
 # API connection (inherited from inventory)
 api_port: 8006
 validate_certs: false
--- a/roles/proxmox_ha/meta/main.yml
+++ b/roles/proxmox_ha/meta/main.yml
@@ -0,0 +1,11 @@
 ---
 # Ansible Galaxy metadata for the proxmox_ha role.
 galaxy_info:
  role_name: proxmox_ha
  author: ansible-msp
  description: "MSP Proxmox automation — proxmox_ha"
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions:
        - bookworm
 # This role pulls in no other roles.
 dependencies: []
--- a/roles/proxmox_ha/tasks/main.yml
+++ b/roles/proxmox_ha/tasks/main.yml
@@ -0,0 +1,96 @@
 ---
 # =============================================================================
 # proxmox_ha — tasks
 # Manages Proxmox HA group membership and maintenance mode.
 # Proxmox HA is self-managing during migrations — this role handles
 # cases where you need to explicitly pause or resume HA for a node.
 # =============================================================================
 # ── Detect HA ─────────────────────────────────────────────────────────────────
 # Probe once with `ha-manager status`. A non-zero exit is tolerated so the
 # role can report "not configured" instead of failing.
 - name: "HA | Detect if HA is configured"
  run_once: true
  ansible.builtin.command: ha-manager status
  register: ha_detect
  failed_when: false
  changed_when: false
 # HA counts as enabled only when the probe exited 0 and printed something.
 - name: "HA | Set HA enabled fact"
  run_once: true
  ansible.builtin.set_fact:
    ha_is_enabled: "{{ ha_detect.rc == 0 and ha_detect.stdout != '' }}"
 # Informational only: every later task is individually gated on ha_is_enabled.
 - name: "HA | Skip — HA not configured"
  run_once: true
  when: not ha_is_enabled
  ansible.builtin.debug:
    msg: "HA is not configured on this cluster — skipping."
 # ── HA status ─────────────────────────────────────────────────────────────────
 # Read-only branch (ha_action == 'status'): fetch the HA manager report once
 # and print it line by line.
 - name: "HA | Get status"
  run_once: true
  when: ha_is_enabled and ha_action == 'status'
  ansible.builtin.command: ha-manager status
  changed_when: false
  register: ha_status
 - name: "HA | Log status"
  run_once: true
  when: ha_is_enabled and ha_action == 'status'
  ansible.builtin.debug:
    msg: "{{ ha_status.stdout_lines }}"
 # ── Put node in maintenance mode ──────────────────────────────────────────────
 # Proxmox uses node maintenance mode via ha-manager to gracefully migrate
 # HA-managed VMs before maintenance. This is the correct HA-aware drain.
 # Runs only for ha_action == 'disable', and only once per play.
 - name: "HA | Enable maintenance mode for {{ current_node }}"
  run_once: true
  when: ha_is_enabled and ha_action == 'disable'
  ansible.builtin.command: >
    ha-manager crm-command node-maintenance enable {{ current_node }}
  # The CRM command is a state change request, so always report it as changed.
  changed_when: true
 # Poll `ha-manager status` until a line that names current_node also mentions
 # maintenance. Matching per line (instead of searching the whole output for
 # the word "maintenance") keeps another node that is already in maintenance
 # from satisfying the wait for this one.
 # Retries = ha_timeout / 5 because the poll interval is 5 seconds.
 - name: "HA | Wait for {{ current_node }} maintenance mode to be acknowledged"
  ansible.builtin.command: ha-manager status
  register: ha_maintenance_check
  changed_when: false
  # The node name must be a whole space-delimited token, so that e.g. "pve1"
  # does not match a line about "pve10".
  until: >-
    ha_maintenance_check.stdout_lines | default([])
    | select('search', '(^| )' ~ (current_node | regex_escape) ~ ' .*maintenance')
    | list | length > 0
  retries: "{{ (ha_timeout / 5) | int }}"
  delay: 5
  run_once: true
  when:
    - ha_is_enabled
    - ha_action == 'disable'
 # Confirmation message for the 'disable' action.
 - name: "HA | Maintenance mode enabled for {{ current_node }}"
  run_once: true
  when: ha_is_enabled and ha_action == 'disable'
  ansible.builtin.debug:
    msg: "✓ HA maintenance mode enabled for {{ current_node }} — HA will not restart VMs on this node."
 # ── Resume HA management ──────────────────────────────────────────────────────
 # 'enable' action: take current_node out of maintenance mode again.
 # Unlike the 'disable' path, no acknowledgement poll follows this command.
 - name: "HA | Disable maintenance mode for {{ current_node }}"
  run_once: true
  when: ha_is_enabled and ha_action == 'enable'
  ansible.builtin.command: >
    ha-manager crm-command node-maintenance disable {{ current_node }}
  changed_when: true
 - name: "HA | Maintenance mode disabled for {{ current_node }}"
  run_once: true
  when: ha_is_enabled and ha_action == 'enable'
  ansible.builtin.debug:
    msg: "✓ HA management resumed for {{ current_node }}."
--- a/roles/proxmox_preflight/defaults/main.yml
+++ b/roles/proxmox_preflight/defaults/main.yml
@@ -0,0 +1,18 @@
 ---
 # =============================================================================
 # proxmox_preflight — defaults
 # =============================================================================
 # Minimum number of nodes that must be online before proceeding
 # (cluster mode only; the play fails when fewer nodes report 'online').
 preflight_min_nodes_online: 1
 # Abort if any node is offline (set false to warn only)
 preflight_abort_on_offline_node: true
 # Quorum check via pvecm (SSH): runs `pvecm status` on the node itself,
 # not through the API. Skipped when false.
 preflight_check_quorum: true
 # API connection (inherited from inventory)
 # api_host, api_port, api_user, api_token_id, api_token_secret
 # Only api_port and validate_certs get a default here; the others must be
 # supplied by the inventory or the caller.
 api_port: 8006
 validate_certs: false
--- a/roles/proxmox_preflight/meta/main.yml
+++ b/roles/proxmox_preflight/meta/main.yml
@@ -0,0 +1,11 @@
 ---
 # Ansible Galaxy metadata for the proxmox_preflight role.
 galaxy_info:
  role_name: proxmox_preflight
  author: ansible-msp
  description: "MSP Proxmox automation — proxmox_preflight"
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions:
        - bookworm
 # This role pulls in no other roles.
 dependencies: []
--- a/roles/proxmox_preflight/tasks/main.yml
+++ b/roles/proxmox_preflight/tasks/main.yml
@@ -0,0 +1,113 @@
 ---
 # =============================================================================
 # proxmox_preflight — tasks
 # Determines: standalone vs cluster, node health, quorum, CEPH state
 # Sets facts: proxmox_is_cluster, proxmox_cluster_nodes, proxmox_node_count
 # =============================================================================
 # ── Detect standalone vs cluster ──────────────────────────────────────────────
 # `pvecm status` is used purely as a probe: only its exit code matters, so the
 # command is never allowed to fail the play or report a change.
 - name: "Preflight | Detect cluster membership"
  ansible.builtin.command: pvecm status
  failed_when: false
  changed_when: false
  register: pvecm_status
 # A zero exit from `pvecm status` marks the node as a cluster member.
 # No delegate_to here: set_fact runs on the controller and, without
 # delegate_facts, stores the fact on the inventory host either way, so
 # delegating only suggested (wrongly) that the fact belongs to localhost.
 - name: "Preflight | Set cluster mode fact"
  ansible.builtin.set_fact:
    proxmox_is_cluster: "{{ pvecm_status.rc == 0 }}"
 # Announce which of the two preflight paths this host will take.
 - name: "Preflight | Log topology"
  ansible.builtin.debug:
    msg: >-
      Node {{ inventory_hostname }} is running in
      {{ 'CLUSTER' if proxmox_is_cluster else 'STANDALONE' }} mode.
 # ── Standalone path ───────────────────────────────────────────────────────────
 # For a standalone node the only health criterion is that Ansible can reach it.
 - name: "Preflight | Standalone | Verify host is reachable"
  when: not proxmox_is_cluster
  ansible.builtin.ping:
 - name: "Preflight | Standalone | Health check passed"
  when: not proxmox_is_cluster
  ansible.builtin.debug:
    msg: "Standalone node {{ inventory_hostname }} is reachable — preflight passed."
 # ── Cluster path ──────────────────────────────────────────────────────────────
 # Fail unless `pvecm status` reports the cluster as quorate.
 # The field label "Quorate:" is printed whether the value is Yes or No, so a
 # plain substring test for 'Quorate' can never fail; the value itself has to
 # be matched. A failed command yields no such line and therefore also fails.
 - name: "Preflight | Cluster | Check quorum"
  ansible.builtin.command: pvecm status
  register: quorum_check
  changed_when: false
  failed_when: >-
    quorum_check.stdout is not search('Quorate:\\s+Yes')
  when: proxmox_is_cluster and preflight_check_quorum
  run_once: true
 # Ask the Proxmox API (from the controller, once per play) for all nodes.
 - name: "Preflight | Cluster | Get all node info via API"
  when: proxmox_is_cluster
  run_once: true
  delegate_to: localhost
  register: proxmox_all_nodes
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
 # Derive the node facts used by the rest of the role from the API result:
 #   proxmox_cluster_nodes — every node entry returned by the API
 #   proxmox_node_count    — number of those entries
 #   proxmox_online_nodes  — entries whose status is 'online'
 #   proxmox_offline_nodes — every other entry
 # No delegate_to here: set_fact runs on the controller and, without
 # delegate_facts, stores facts on the play hosts either way.
 - name: "Preflight | Cluster | Set node list facts"
  ansible.builtin.set_fact:
    proxmox_cluster_nodes: "{{ proxmox_all_nodes.proxmox_nodes }}"
    proxmox_node_count: "{{ proxmox_all_nodes.proxmox_nodes | length }}"
    proxmox_online_nodes: >-
      {{ proxmox_all_nodes.proxmox_nodes
         | selectattr('status', 'equalto', 'online')
         | list }}
    proxmox_offline_nodes: >-
      {{ proxmox_all_nodes.proxmox_nodes
         | rejectattr('status', 'equalto', 'online')
         | list }}
  run_once: true
  when: proxmox_is_cluster
 # Offline nodes are always reported; whether they also abort the play is
 # decided by preflight_abort_on_offline_node in the task after this one.
 - name: "Preflight | Cluster | Warn about offline nodes"
  run_once: true
  when: proxmox_is_cluster and (proxmox_offline_nodes | length > 0)
  ansible.builtin.debug:
    msg: >-
      WARNING: The following nodes are offline:
      {{ proxmox_offline_nodes | map(attribute='node') | list }}
 - name: "Preflight | Cluster | Abort if offline nodes detected"
  run_once: true
  when: >-
    proxmox_is_cluster
    and preflight_abort_on_offline_node
    and (proxmox_offline_nodes | length > 0)
  ansible.builtin.fail:
    msg: >-
      Preflight failed — {{ proxmox_offline_nodes | length }} node(s) are offline:
      {{ proxmox_offline_nodes | map(attribute='node') | list }}.
      Set preflight_abort_on_offline_node=false to proceed anyway.
 # Hard floor on the number of online nodes; independent of the offline-node
 # abort switch above.
 - name: "Preflight | Cluster | Verify minimum online node count"
  run_once: true
  when: >-
    proxmox_is_cluster
    and (proxmox_online_nodes | length < preflight_min_nodes_online | int)
  ansible.builtin.fail:
    msg: >-
      Only {{ proxmox_online_nodes | length }} node(s) online.
      Minimum required: {{ preflight_min_nodes_online }}.
 # Final cluster-mode summary. The quorum wording follows
 # preflight_check_quorum so the log never claims a check that was skipped.
 - name: "Preflight | Cluster | Health check passed"
  ansible.builtin.debug:
    msg: >-
      Cluster preflight OK —
      {{ proxmox_online_nodes | length }}/{{ proxmox_node_count }} nodes online,
      {{ 'quorum confirmed' if preflight_check_quorum | bool else 'quorum check skipped' }}.
  when: proxmox_is_cluster
  run_once: true
--- a/roles/proxmox_restore/defaults/main.yml
+++ b/roles/proxmox_restore/defaults/main.yml
@@ -0,0 +1,17 @@
 ---
 # =============================================================================
 # proxmox_restore — defaults
 # =============================================================================
 # State file directory (must match drain_state_dir).
 # Searched on the Ansible controller for files named <current_node>_*.json.
 restore_state_dir: "/tmp/proxmox_drain_state"
 # If true, delete the state file after successful restore
 restore_cleanup_state_file: true
 # Timeout waiting for VM to start on restored node
 # NOTE(review): no task visible in this role's tasks/main.yml reads this
 # variable — confirm whether it is still needed.
 restore_vm_start_timeout: 120
 # API connection (inherited from inventory)
 api_port: 8006
 validate_certs: false
--- a/roles/proxmox_restore/meta/main.yml
+++ b/roles/proxmox_restore/meta/main.yml
@@ -0,0 +1,11 @@
 ---
 # Ansible Galaxy metadata for the proxmox_restore role.
 galaxy_info:
  role_name: proxmox_restore
  author: ansible-msp
  description: "MSP Proxmox automation — proxmox_restore"
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions:
        - bookworm
 # This role pulls in no other roles.
 dependencies: []
--- a/roles/proxmox_restore/tasks/main.yml
+++ b/roles/proxmox_restore/tasks/main.yml
@@ -0,0 +1,112 @@
 ---
 # =============================================================================
 # proxmox_restore — tasks
 # Returns VMs to their origin nodes using state written by proxmox_drain.
 #
 # Required vars:
 #   current_node — the node whose VMs should be restored
 #   restore_state_file — path to the JSON state file (set by caller or discovered)
 # =============================================================================
 # ── Find state file ───────────────────────────────────────────────────────────
 # Look on the controller for drain state files belonging to current_node.
 - name: "Restore | {{ current_node }} | Find state files"
  delegate_to: localhost
  register: restore_found_files
  ansible.builtin.find:
    paths: "{{ restore_state_dir }}"
    file_type: file
    patterns: "{{ current_node }}_*.json"
 - name: "Restore | {{ current_node }} | No state files found — skipping"
  when: restore_found_files.files | length == 0
  ansible.builtin.debug:
    msg: >-
      No drain state files found for {{ current_node }} in {{ restore_state_dir }}.
      Skipping restore.
 # NOTE(review): end_play ends the whole play, not just this role — confirm
 # that is intended if the role is included from a multi-node workflow.
 - name: "Restore | {{ current_node }} | End if no state files"
  when: restore_found_files.files | length == 0
  ansible.builtin.meta: end_play
 # Pick the newest state file (by mtime), read it on the controller and parse
 # its JSON into restore_vm_list.
 # The two set_fact tasks carry no delegate_to: set_fact runs on the controller
 # and, without delegate_facts, stores the fact on the inventory host either
 # way. The slurp task keeps it because the file lives on the controller.
 - name: "Restore | {{ current_node }} | Use most recent state file"
  ansible.builtin.set_fact:
    restore_state_file: >-
      {{ (restore_found_files.files | sort(attribute='mtime') | last).path }}
 - name: "Restore | {{ current_node }} | Load state file"
  ansible.builtin.slurp:
    src: "{{ restore_state_file }}"
  register: restore_state_raw
  delegate_to: localhost
 # slurp returns the file base64-encoded, hence b64decode before from_json.
 - name: "Restore | {{ current_node }} | Parse VM origin list"
  ansible.builtin.set_fact:
    restore_vm_list: "{{ restore_state_raw.content | b64decode | from_json }}"
 # Show which VMIDs the state file asks to bring back.
 - name: "Restore | {{ current_node }} | Log restore plan"
  ansible.builtin.debug:
    msg: >-
      Restoring {{ restore_vm_list | length }} guest(s) to {{ current_node }}:
      {{ restore_vm_list | map(attribute='vmid') | list }}
 # ── Get current VM locations ──────────────────────────────────────────────────
 # Guest snapshot from the API with no node filter; the migrate tasks use it
 # to skip guests that already sit on current_node.
 - name: "Restore | {{ current_node }} | Get current VM locations"
  delegate_to: localhost
  register: restore_all_vms
  community.proxmox.proxmox_vm_info:
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
 # ── Migrate KVM guests back ───────────────────────────────────────────────────
 # One `qm migrate` per qemu entry of the state file. --online is added when
 # the recorded status is 'running'.
 # current_location is resolved per item from the API snapshot and falls back
 # to 'unknown' when the VMID is not found; guests already on current_node are
 # skipped.
 # The label falls back to the VMID when the state entry has no name, matching
 # the LXC task in this file.
 - name: "Restore | {{ current_node }} | KVM | Migrate back"
  ansible.builtin.command: >
    qm migrate {{ item.vmid }} {{ current_node }}
    {% if item.status == 'running' %}--online{% endif %}
    --with-local-disks 0
  loop: "{{ restore_vm_list | selectattr('type', 'equalto', 'qemu') | list }}"
  loop_control:
    label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }})"
  changed_when: true
  vars:
    current_location: >-
      {{ restore_all_vms.proxmox_vms
         | selectattr('vmid', 'equalto', item.vmid)
         | map(attribute='node')
         | first
         | default('unknown') }}
  when: current_location != current_node
 # ── Migrate LXC guests back ───────────────────────────────────────────────────
 # One restart-mode `pct migrate` per lxc entry of the state file, with a fixed
 # 120 second timeout. Containers already on current_node are skipped.
 - name: "Restore | {{ current_node }} | LXC | Migrate back"
  vars:
    # Node currently hosting the container, or 'unknown' if the API snapshot
    # has no guest with this VMID.
    current_location: >-
      {{ restore_all_vms.proxmox_vms
         | selectattr('vmid', 'equalto', item.vmid)
         | map(attribute='node')
         | first
         | default('unknown') }}
  when: current_location != current_node
  loop: "{{ restore_vm_list | selectattr('type', 'equalto', 'lxc') | list }}"
  loop_control:
    label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }})"
  ansible.builtin.command: >
    pct migrate {{ item.vmid }} {{ current_node }} --restart --timeout 120
  changed_when: true
 # ── Cleanup ───────────────────────────────────────────────────────────────────
 # The state file lives on the controller; it is deleted only when
 # restore_cleanup_state_file is truthy.
 - name: "Restore | {{ current_node }} | Remove state file"
  when: restore_cleanup_state_file
  delegate_to: localhost
  ansible.builtin.file:
    state: absent
    path: "{{ restore_state_file }}"
 # Closing message: number of guests listed in the state file.
 - name: "Restore | {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: "✓ Restore complete — {{ restore_vm_list | length }} guest(s) returned to {{ current_node }}."
--- a/roles/proxmox_status/defaults/main.yml
+++ b/roles/proxmox_status/defaults/main.yml
@@ -0,0 +1,20 @@
 ---
 # =============================================================================
 # proxmox_status — defaults
 # =============================================================================
 # Include VM inventory in report (per-node guest counts via the API)
 status_include_vms: true
 # Include storage status
 # NOTE(review): no storage task is visible in this role's tasks/main.yml —
 # confirm whether this switch is consumed anywhere.
 status_include_storage: true
 # Include CEPH status (skipped gracefully if not configured):
 # a failing `ceph status` is reported as "not configured or not reachable".
 status_include_ceph: true
 # Include HA status (skipped gracefully if not configured):
 # a failing or empty `ha-manager status` is reported as "HA not configured".
 status_include_ha: true
 # API connection (inherited from inventory)
 api_port: 8006
 validate_certs: false
--- a/roles/proxmox_status/meta/main.yml
+++ b/roles/proxmox_status/meta/main.yml
@@ -0,0 +1,11 @@
 ---
 # Ansible Galaxy metadata for the proxmox_status role.
 galaxy_info:
  role_name: proxmox_status
  author: ansible-msp
  description: "MSP Proxmox automation — proxmox_status"
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions:
        - bookworm
 # This role pulls in no other roles.
 dependencies: []
--- a/roles/proxmox_status/tasks/main.yml
+++ b/roles/proxmox_status/tasks/main.yml
@@ -0,0 +1,127 @@
 ---
 # =============================================================================
 # proxmox_status — tasks
 # Produces a cluster health report: nodes, VMs, storage, CEPH, HA.
 # =============================================================================
 # ── Node info ─────────────────────────────────────────────────────────────────
 # Single API call from the controller; the result feeds the node summary and
 # the per-node VM query.
 - name: "Status | Get cluster node info"
  run_once: true
  delegate_to: localhost
  register: status_nodes
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
 # One line per node, sorted by name: status, version, CPU load as a
 # percentage and memory converted from bytes to GiB (÷ 1073741824).
 # Columns are padded with the built-in `format` filter ('%-20s' / '%-8s' =
 # left-justify to 20 / 8 characters); `ljust` is not a filter shipped by
 # Jinja2 or ansible-core, so using it fails the template.
 - name: "Status | Node summary"
  ansible.builtin.debug:
    msg: >-
      ┌─ NODE SUMMARY ─────────────────────────────
      {% for node in status_nodes.proxmox_nodes | sort(attribute='node') %}
      │ {{ '%-20s' | format(node.node) }}
        status={{ '%-8s' | format(node.status) }}
        ver={{ node.version.version | default('?') }}
        cpu={{ (node.cpu | default(0) * 100) | round(1) }}%
        mem={{ ((node.mem | default(0)) / 1073741824) | round(1) }}GB /
            {{ ((node.maxmem | default(0)) / 1073741824) | round(1) }}GB
      {% endfor %}
      └────────────────────────────────────────────
  run_once: true
 # ── VM inventory ──────────────────────────────────────────────────────────────
 # Query guests node by node, restricted to nodes the API reports as online.
 # Each loop result keeps its node entry in `.item`.
 - name: "Status | Get VM info for each node"
  when: status_include_vms
  run_once: true
  delegate_to: localhost
  loop: "{{ status_nodes.proxmox_nodes | selectattr('status', 'equalto', 'online') | list }}"
  loop_control:
    label: "{{ item.node }}"
  register: status_vms_per_node
  community.proxmox.proxmox_vm_info:
    node: "{{ item.node }}"
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
 # Per-node guest counts (total / running / stopped) plus a cluster-wide total.
 # The node column is padded with the built-in `format` filter ('%-20s' =
 # left-justify to 20 characters); `ljust` is not a filter shipped by Jinja2
 # or ansible-core, so using it fails the template.
 - name: "Status | VM distribution summary"
  ansible.builtin.debug:
    msg: >-
      ┌─ VM DISTRIBUTION ──────────────────────────
      {% for result in status_vms_per_node.results %}
      │ {{ '%-20s' | format(result.item.node) }}
        total={{ result.proxmox_vms | length }}
        running={{ result.proxmox_vms | selectattr('status', 'equalto', 'running') | list | length }}
        stopped={{ result.proxmox_vms | selectattr('status', 'equalto', 'stopped') | list | length }}
      {% endfor %}
      │ Total VMs: {{ status_vms_per_node.results | map(attribute='proxmox_vms') | flatten | length }}
      └────────────────────────────────────────────
  run_once: true
  when: status_include_vms
 # ── CEPH status ───────────────────────────────────────────────────────────────
 # `ceph status` may fail freely: the summary task branches on the exit code
 # and prints a "not configured" line instead of aborting.
 - name: "Status | CEPH status"
  when: status_include_ceph
  run_once: true
  ansible.builtin.command: ceph status --format json
  failed_when: false
  changed_when: false
  register: status_ceph
 # On success the JSON output is parsed for the health status and OSD counters.
 - name: "Status | CEPH summary"
  when: status_include_ceph
  run_once: true
  ansible.builtin.debug:
    msg: >-
      ┌─ CEPH STATUS ───────────────────────────────
      {% if status_ceph.rc == 0 %}
      │ Health:   {{ (status_ceph.stdout | from_json).health.status }}
      │ OSDs:     {{ (status_ceph.stdout | from_json).osdmap.num_osds }} total,
                  {{ (status_ceph.stdout | from_json).osdmap.num_up_osds }} up,
                  {{ (status_ceph.stdout | from_json).osdmap.num_in_osds }} in
      {% else %}
      │ CEPH not configured or not reachable.
      {% endif %}
      └────────────────────────────────────────────
 # ── HA status ─────────────────────────────────────────────────────────────────
 # Same tolerant pattern as the CEPH section: the probe never fails, and the
 # summary decides what to print from the exit code and output.
 - name: "Status | HA status"
  when: status_include_ha
  run_once: true
  ansible.builtin.command: ha-manager status
  failed_when: false
  changed_when: false
  register: status_ha
 # Prints the raw HA manager lines when the probe succeeded with output.
 - name: "Status | HA summary"
  when: status_include_ha
  run_once: true
  ansible.builtin.debug:
    msg: >-
      ┌─ HA STATUS ─────────────────────────────────
      {% if status_ha.rc == 0 and status_ha.stdout != '' %}
      {{ status_ha.stdout_lines | join('\n      ') }}
      {% else %}
      │ HA not configured.
      {% endif %}
      └────────────────────────────────────────────
 # ── PVE versions ─────────────────────────────────────────────────────────────
 # Runs on every host (no run_once): refresh the apt index quietly, then count
 # the "Inst" lines of a simulated dist-upgrade. Nothing is installed.
 - name: "Status | Check for available updates on each node"
  register: status_updates_available
  changed_when: false
  ansible.builtin.shell: |
    apt-get -q update > /dev/null 2>&1
    apt-get -s dist-upgrade 2>/dev/null | grep "^Inst " | wc -l
 # The PVE version is read from the ansible_local.pve_version fact and shown
 # as 'unknown' when that fact is absent.
 - name: "Status | Update availability per node"
  ansible.builtin.debug:
    msg: >-
      {{ inventory_hostname }}: {{ status_updates_available.stdout | trim }} package(s) available for upgrade
      (PVE {{ ansible_local.pve_version | default('unknown') }})
--- a/roles/proxmox_upgrade_node/defaults/main.yml
+++ b/roles/proxmox_upgrade_node/defaults/main.yml
@@ -0,0 +1,21 @@
 ---
 # =============================================================================
 # proxmox_upgrade_node — defaults
 # =============================================================================
 # Reboot behaviour
 upgrade_reboot_if_required: true         # reboot if /var/run/reboot-required exists
 upgrade_reboot_force: false              # reboot even if not required
 upgrade_reboot_timeout: 600             # seconds to wait for node to come back
 # NOTE(review): upgrade_node_rejoin_timeout is not read by the tasks visible
 # in this role; the rejoin wait is bounded by retries × delay below
 # (30 × 10 s) — confirm whether it is still needed.
 upgrade_node_rejoin_timeout: 300        # seconds to wait for cluster rejoin
 upgrade_node_rejoin_retries: 30
 upgrade_node_rejoin_delay: 10
 # apt options
 upgrade_apt_update_cache: true
 upgrade_apt_autoremove: true
 # Passed to the apt module as cache_valid_time (seconds).
 upgrade_apt_cache_valid_time: 3600
 # API connection (inherited from inventory)
 api_port: 8006
 validate_certs: false
--- a/roles/proxmox_upgrade_node/meta/main.yml
+++ b/roles/proxmox_upgrade_node/meta/main.yml
@@ -0,0 +1,11 @@
 ---
 # Ansible Galaxy metadata for the proxmox_upgrade_node role.
 galaxy_info:
  role_name: proxmox_upgrade_node
  author: ansible-msp
  description: "MSP Proxmox automation — proxmox_upgrade_node"
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions:
        - bookworm
 # This role pulls in no other roles.
 dependencies: []
--- a/roles/proxmox_upgrade_node/tasks/main.yml
+++ b/roles/proxmox_upgrade_node/tasks/main.yml
@@ -0,0 +1,85 @@
 ---
 # =============================================================================
 # proxmox_upgrade_node — tasks
 # Runs apt dist-upgrade on a single node, reboots if required,
 # and waits for the node to rejoin the cluster.
 #
 # Required vars:
 #   current_node — the node being upgraded (used for logging)
 # =============================================================================
 # Refresh the package index; a cache refresh is never reported as a change.
 - name: "Upgrade | {{ current_node }} | apt-get update"
  changed_when: false
  ansible.builtin.apt:
    cache_valid_time: "{{ upgrade_apt_cache_valid_time }}"
    update_cache: "{{ upgrade_apt_update_cache }}"
 # Full dist-upgrade with autoclean; autoremove follows upgrade_apt_autoremove.
 - name: "Upgrade | {{ current_node }} | apt dist-upgrade"
  register: upgrade_apt_result
  ansible.builtin.apt:
    upgrade: dist
    autoclean: true
    autoremove: "{{ upgrade_apt_autoremove }}"
 # Only the last line of apt's output is logged.
 - name: "Upgrade | {{ current_node }} | Log upgraded packages"
  ansible.builtin.debug:
    msg: "{{ upgrade_apt_result.stdout_lines | last | default('No output') }}"
 # The presence of this marker file is what the reboot decision is based on.
 - name: "Upgrade | {{ current_node }} | Check if reboot required"
  register: upgrade_reboot_required_file
  ansible.builtin.stat:
    path: /var/run/reboot-required
 # Decide whether this run will actually reboot the node:
 #   - upgrade_reboot_force reboots unconditionally ("even if not required");
 #   - otherwise a reboot happens only when the marker file exists AND
 #     upgrade_reboot_if_required allows acting on it.
 # Folding upgrade_reboot_if_required into the fact makes upgrade_needs_reboot
 # mean "a reboot is performed", so later tasks that read it (rejoin wait,
 # summary messages) no longer report a reboot that was suppressed.
 # With the default settings the outcome is unchanged.
 - name: "Upgrade | {{ current_node }} | Set reboot needed fact"
  ansible.builtin.set_fact:
    upgrade_needs_reboot: >-
      {{ (upgrade_reboot_force | bool)
      or ((upgrade_reboot_if_required | bool) and upgrade_reboot_required_file.stat.exists) }}
 # Waits up to upgrade_reboot_timeout seconds for the node to come back.
 - name: "Upgrade | {{ current_node }} | Reboot node"
  ansible.builtin.reboot:
    reboot_timeout: "{{ upgrade_reboot_timeout }}"
    msg: "Ansible controlled reboot for Proxmox upgrade"
    pre_reboot_delay: 5
    post_reboot_delay: 15
  when: upgrade_needs_reboot | bool
 # Counterpart of the reboot task: logged when upgrade_needs_reboot is false.
 - name: "Upgrade | {{ current_node }} | Skip reboot (not required)"
  when: not upgrade_needs_reboot
  ansible.builtin.debug:
    msg: "No reboot required on {{ current_node }} — skipping."
 # ── Wait for cluster rejoin ───────────────────────────────────────────────────
 # Poll the API from the controller until current_node is listed with status
 # 'online'. Upper bound: upgrade_node_rejoin_retries attempts spaced
 # upgrade_node_rejoin_delay seconds apart. Skipped when no reboot was needed.
 - name: "Upgrade | {{ current_node }} | Wait for node to rejoin cluster"
  when: upgrade_needs_reboot
  delegate_to: localhost
  register: upgrade_rejoin_check
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
  retries: "{{ upgrade_node_rejoin_retries }}"
  delay: "{{ upgrade_node_rejoin_delay }}"
  until: >-
    upgrade_rejoin_check.proxmox_nodes
    | selectattr('node', 'equalto', current_node)
    | selectattr('status', 'equalto', 'online')
    | list
    | length > 0
 # Two closing messages: the rejoin confirmation, then a one-line summary that
 # reflects whether packages changed and whether a reboot was needed.
 - name: "Upgrade | {{ current_node }} | Node back online"
  ansible.builtin.debug:
    msg: >-
      ✓ Node {{ current_node }} has rejoined the cluster
      {{ '(after reboot)' if upgrade_needs_reboot else '(no reboot needed)' }}.
 # upgrade_apt_result.changed comes from the dist-upgrade task.
 - name: "Upgrade | {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: >-
      ━━━ Upgrade complete: {{ current_node }}
      {% if upgrade_apt_result.changed %}(packages updated){% else %}(already up to date){% endif %}
      {% if upgrade_needs_reboot %}(rebooted){% else %}(no reboot){% endif %} ━━━