refactor: proxmox_upgrade use inline community.proxmox for API calls.

2026-03-14 15:23:47 -07:00
parent 464fba619f
commit 9bb8e97c82
9 changed files with 274 additions and 384 deletions
--- a/requirements.yml
+++ b/requirements.yml
@@ -0,0 +1,6 @@
+---
+collections:
+  - name: community.proxmox
+    version: ">=1.6.0"
+  - name: community.general
+    version: ">=8.0.0"
--- a/roles/proxmox_upgrade/defaults/main.yml
+++ b/roles/proxmox_upgrade/defaults/main.yml
@@ -4,27 +4,23 @@
 # =============================================================================

 # Migration behaviour
-migration_bulk: false               # true = all VMs at once, false = one at a time
-migration_restore: false            # true = migrate VMs back to original node after upgrade
-live_migrate_fallback: shutdown     # migrate | shutdown | skip
+migration_bulk: false               # true = fire all migrations at once, false = one at a time
+migration_restore: false            # true = migrate guests back to original node after upgrade
+live_migrate_fallback: shutdown     # shutdown | skip
+#   shutdown: shutdown VM, cold migrate, restart on target
+#   skip: leave VM on node (it will go down during reboot — use with caution)

-# Shutdown timeout in seconds before forcing off
-vm_shutdown_timeout: 120
+# Timeouts (seconds)
+vm_shutdown_timeout: 120            # graceful shutdown before force-off
+vm_start_timeout: 120               # wait for VM to start after cold migration
+lxc_migrate_timeout: 300            # pct migrate --restart timeout
+node_rejoin_timeout: 600            # wait for node to rejoin cluster after reboot
+ceph_recover_timeout: 300           # wait for CEPH to recover after node rejoins

-# How long to wait for a VM to start after cold migration
-vm_start_timeout: 120
-
-# How long to wait for node to rejoin cluster after reboot
-node_rejoin_timeout: 600
-
-# How long to wait for CEPH to recover after node rejoins
-ceph_recover_timeout: 300
-
-# apt upgrade options
-apt_upgrade_cmd: "DEBIAN_FRONTEND=noninteractive apt-get dist-upgrade -y"
+# Upgrade options
 apt_autoremove: true

-# Tags on VMs/LXCs to never migrate (comma separated in Proxmox)
+# VM/LXC tags that prevent migration (leave on node, warn)
 migrate_exclude_tags:
  - nomigrate
  - pinned
--- a/roles/proxmox_upgrade/tasks/drain.yml
+++ b/roles/proxmox_upgrade/tasks/drain.yml
@@ -1,225 +1,167 @@
 ---
 # =============================================================================
 # proxmox_upgrade — drain.yml
-# Migrate all VMs/LXCs off a node before upgrading it
-# Uses inline Python for API calls to avoid cross-task variable scope issues
+# Migrate all VMs/LXCs off current_node before upgrading
+# KVM: community.proxmox.proxmox_kvm (API, delegate_to: localhost)
+# LXC: pct migrate (SSH on source node)
 # =============================================================================

-# ── Build migration plan via API ──────────────────────────────────────────────
- name: "Drain | Build migration plan for {{ current_node }}"
-  ansible.builtin.shell: |
-    python3 << 'PYEOF'
-    import urllib.request, urllib.error, json, ssl
-
-    ctx = ssl.create_default_context()
-    ctx.check_hostname = False
-    ctx.verify_mode = ssl.CERT_NONE
-
-    api_base = "https://{{ api_host }}:{{ api_port }}/api2/json"
-    headers  = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"}
-    node     = "{{ current_node }}"
-    shared   = {{ shared_storage | lower | replace("true", "True") | replace("false", "False") }}
-    exclude_tags = {{ migrate_exclude_tags | to_json }}
-
-    def api_get(path):
-        req = urllib.request.Request(f"{api_base}{path}", headers=headers)
-        with urllib.request.urlopen(req, context=ctx) as r:
-            return json.loads(r.read())["data"]
-
-    # Get all online nodes except current
-    all_nodes = api_get("/nodes")
-    targets = [n["node"] for n in all_nodes if n["status"] == "online" and n["node"] != node]
-    if not targets:
-        print(json.dumps({"error": f"No online nodes available to migrate guests to from {node}"}))
-        exit(1)
-
-    # Get VMs and LXCs on this node
-    vms  = api_get(f"/nodes/{node}/qemu")
-    lxcs = api_get(f"/nodes/{node}/lxc")
-
-    plan = []
-
-    for vm in vms:
-        tags = [t.strip() for t in (vm.get("tags") or "").split(",") if t.strip()]
-        if any(t in exclude_tags for t in tags):
-            continue
-        cfg = api_get(f"/nodes/{node}/qemu/{vm['vmid']}/config")
-        has_passthrough = any(k.startswith("hostpci") or k.startswith("usb") for k in cfg)
-        has_local_cdrom = any(
-            isinstance(v, str) and "local" in v and ".iso" in v
-            for v in cfg.values()
-        )
-        needs_fallback = has_passthrough or not shared or has_local_cdrom
-        reason = "passthrough" if has_passthrough else ("local_disk" if not shared else ("local_cdrom" if has_local_cdrom else ""))
-        plan.append({
-            "vmid": vm["vmid"],
-            "name": vm.get("name", str(vm["vmid"])),
-            "type": "qemu",
-            "status": vm["status"],
-            "needs_fallback": needs_fallback,
-            "fallback_reason": reason
-        })
-
-    for lxc in lxcs:
-        tags = [t.strip() for t in (lxc.get("tags") or "").split(",") if t.strip()]
-        if any(t in exclude_tags for t in tags):
-            continue
-        plan.append({
-            "vmid": lxc["vmid"],
-            "name": lxc.get("name", str(lxc["vmid"])),
-            "type": "lxc",
-            "status": lxc["status"],
-            "needs_fallback": False,
-            "fallback_reason": ""
-        })
-
-    print(json.dumps({"plan": plan, "targets": targets}))
-    PYEOF
-  register: drain_plan_raw
+# ── Discover guests on this node ──────────────────────────────────────────────
+- name: "Drain | Get all guests on {{ current_node }}"
+  community.proxmox.proxmox_vm_info:
+    api_host: "{{ api_host }}"
+    api_token_id: "{{ api_token_id }}"
+    api_token_secret: "{{ api_token_secret }}"
+    api_port: "{{ api_port }}"
+    node: "{{ current_node }}"
+  register: node_guests
  delegate_to: localhost
-  changed_when: false

- name: "Drain | Parse migration plan"
+- name: "Drain | Get available target nodes"
+  community.proxmox.proxmox_node_info:
+    api_host: "{{ api_host }}"
+    api_token_id: "{{ api_token_id }}"
+    api_token_secret: "{{ api_token_secret }}"
+    api_port: "{{ api_port }}"
+  register: all_nodes_info
+  delegate_to: localhost
+
+- name: "Drain | Set migration target"
  ansible.builtin.set_fact:
-    drain_data: "{{ drain_plan_raw.stdout | from_json }}"
+    # Pick the first online node other than current_node. `first` on an empty
+    # list is undefined and would abort this task with a templating error, so
+    # default to '' and let the explicit fail task that follows report it.
+    migration_target: >-
+      {{ all_nodes_info.proxmox_nodes
+         | selectattr('status', 'equalto', 'online')
+         | rejectattr('node', 'equalto', current_node)
+         | map(attribute='node')
+         | list
+         | first
+         | default('') }}
  delegate_to: localhost

- name: "Drain | Fail if error building plan"
+- name: "Drain | Fail if no migration target available"
  ansible.builtin.fail:
-    msg: "{{ drain_data.error }}"
-  when: drain_data.error is defined
+    msg: "No online nodes available to migrate guests to. Cannot drain {{ current_node }}."
+  when: migration_target is not defined or migration_target == ''
  delegate_to: localhost

- name: "Drain | Set migration plan and targets"
+- name: "Drain | Build KVM migration list"
  ansible.builtin.set_fact:
-    migration_plan: "{{ drain_data.plan }}"
-    migration_targets: "{{ drain_data.targets }}"
+    # QEMU guests to migrate: those with no tags at all, plus tagged guests
+    # whose tag string contains none of migrate_exclude_tags.
+    # ('intersect' is a filter, not a test, so it cannot be used in rejectattr.)
+    kvm_guests: >-
+      {{ (node_guests.proxmox_vms
+          | selectattr('type', 'equalto', 'qemu')
+          | rejectattr('tags', 'defined')
+          | list)
+         + (node_guests.proxmox_vms
+            | selectattr('type', 'equalto', 'qemu')
+            | selectattr('tags', 'defined')
+            | rejectattr('tags', 'search', drain_exclude_tag_regex)
+            | list) }}
  delegate_to: localhost
+  vars:
+    # Matches any excluded tag as a whole token. Assumes the API reports tags
+    # as one string separated by ';', ',' or space — TODO confirm.
+    # '(?!)' never matches, so an empty exclude list excludes nothing.
+    drain_exclude_tag_regex: >-
+      (^|[;, ])({{ migrate_exclude_tags | map('regex_escape') | join('|') | default('(?!)', true) }})([;, ]|$)

- name: "Drain | Log migration plan for {{ current_node }}"
+- name: "Drain | Build LXC migration list"
+  ansible.builtin.set_fact:
+    # LXC guests to migrate: those with no tags at all, plus tagged guests
+    # whose tag string contains none of migrate_exclude_tags.
+    # ('intersect' is a filter, not a test, so it cannot be used in rejectattr.)
+    lxc_guests: >-
+      {{ (node_guests.proxmox_vms
+          | selectattr('type', 'equalto', 'lxc')
+          | rejectattr('tags', 'defined')
+          | list)
+         + (node_guests.proxmox_vms
+            | selectattr('type', 'equalto', 'lxc')
+            | selectattr('tags', 'defined')
+            | rejectattr('tags', 'search', drain_exclude_tag_regex)
+            | list) }}
+  delegate_to: localhost
+  vars:
+    # Matches any excluded tag as a whole token. Assumes the API reports tags
+    # as one string separated by ';', ',' or space — TODO confirm.
+    # '(?!)' never matches, so an empty exclude list excludes nothing.
+    drain_exclude_tag_regex: >-
+      (^|[;, ])({{ migrate_exclude_tags | map('regex_escape') | join('|') | default('(?!)', true) }})([;, ]|$)
+
+- name: "Drain | Log migration plan"
  ansible.builtin.debug:
    msg: >-
-      Migration plan for {{ current_node }} ({{ migration_plan | length }} guests → {{ migration_targets | first }}):
-      {% for g in migration_plan %}
-      - {{ g.type | upper }} {{ g.vmid }} ({{ g.name }}) [{{ g.status }}]{% if g.needs_fallback %} ⚠ fallback={{ live_migrate_fallback }} reason={{ g.fallback_reason }}{% endif %}
-
-      {% endfor %}
+      Drain plan for {{ current_node }} → {{ migration_target }}:
+      KVM: {{ kvm_guests | map(attribute='vmid') | list }}
+      LXC: {{ lxc_guests | map(attribute='vmid') | list }}
  delegate_to: localhost

- name: "Drain | Warn about non-live-migratable guests"
-  ansible.builtin.debug:
-    msg: >-
-      WARNING — {{ item.type | upper }} {{ item.vmid }} ({{ item.name }})
-      cannot be live migrated ({{ item.fallback_reason }}).
-      {% if live_migrate_fallback == 'skip' %}THIS VM WILL GO DOWN DURING NODE REBOOT.
-      {% elif live_migrate_fallback == 'shutdown' %}Will be shut down, cold migrated, and restarted.
-      {% else %}Will attempt live migrate anyway (may fail).{% endif %}
-  loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
-  loop_control:
-    loop_var: item
+# ── KVM migrations ────────────────────────────────────────────────────────────
+- name: "Drain | Migrate KVM guests"
+  when: kvm_guests | length > 0
+  block:
+    - name: "Drain | KVM | Live migrate (sequential)"
+      community.proxmox.proxmox_kvm:
+        api_host: "{{ api_host }}"
+        api_token_id: "{{ api_token_id }}"
+        api_token_secret: "{{ api_token_secret }}"
+        api_port: "{{ api_port }}"
+        node: "{{ current_node }}"
+        vmid: "{{ item.vmid }}"
+        migrate: true
+        target_node: "{{ migration_target }}"
+        online: "{{ true if item.status == 'running' and not item.get('needs_fallback', false) else false }}"
+        timeout: "{{ vm_shutdown_timeout }}"
+      loop: "{{ kvm_guests }}"
      delegate_to: localhost
-
-# ── Sequential migrations ─────────────────────────────────────────────────────
- name: "Drain | Migrate guests sequentially"
      when: not migration_bulk | bool
-  block:
-    - name: "Drain | Sequential | Migrate live-migratable guests"
-      include_tasks: migrate_guest.yml
-      loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}"
-      loop_control:
-        loop_var: guest

-    - name: "Drain | Sequential | Handle fallback guests"
-      include_tasks: migrate_guest.yml
-      loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
-      loop_control:
-        loop_var: guest
-      when: live_migrate_fallback != 'skip'
-
-# ── Bulk migrations ───────────────────────────────────────────────────────────
- name: "Drain | Migrate guests in bulk"
-  when: migration_bulk | bool
-  block:
-    - name: "Drain | Bulk | Trigger all live migrations"
-      ansible.builtin.shell: |
-        python3 << 'PYEOF'
-        import urllib.request, json, ssl
-
-        ctx = ssl.create_default_context()
-        ctx.check_hostname = False
-        ctx.verify_mode = ssl.CERT_NONE
-
-        api_base = "https://{{ api_host }}:{{ api_port }}/api2/json"
-        headers  = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"}
-        node     = "{{ current_node }}"
-        target   = "{{ migration_targets | first }}"
-        plan     = {{ migration_plan | rejectattr('needs_fallback') | list | to_json }}
-
-        task_ids = []
-        for guest in plan:
-            gtype = "qemu" if guest["type"] == "qemu" else "lxc"
-            url = f"{api_base}/nodes/{node}/{gtype}/{guest['vmid']}/migrate"
-            body = json.dumps({"target": target, "online": 1}).encode()
-            req = urllib.request.Request(url, data=body, headers={**headers, "Content-Type": "application/json"}, method="POST")
-            with urllib.request.urlopen(req, context=ctx) as r:
-                task_id = json.loads(r.read())["data"]
-                task_ids.append({"vmid": guest["vmid"], "name": guest["name"], "task": task_id})
-                print(f"Triggered migration: {guest['type'].upper()} {guest['vmid']} ({guest['name']}) → {target} task={task_id}")
-
-        print(json.dumps({"task_ids": task_ids}))
-        PYEOF
-      register: bulk_trigger_raw
+    # Starts one asynchronous migration job per KVM guest (poll: 0) and records
+    # the job handles in kvm_bulk_jobs for the wait task that follows.
+    - name: "Drain | KVM | Bulk migrate (fire and wait)"
+      community.proxmox.proxmox_kvm:
+        api_host: "{{ api_host }}"
+        api_token_id: "{{ api_token_id }}"
+        api_token_secret: "{{ api_token_secret }}"
+        api_port: "{{ api_port }}"
+        node: "{{ current_node }}"
+        vmid: "{{ item.vmid }}"
+        migrate: true
+        target_node: "{{ migration_target }}"
+        online: "{{ true if item.status == 'running' else false }}"
+        timeout: "{{ vm_shutdown_timeout }}"
+      loop: "{{ kvm_guests }}"
      delegate_to: localhost
+      # Cast first: a value passed as `-e vm_shutdown_timeout=120` is a string,
+      # and multiplying a string repeats it ("120120") instead of doubling it.
+      async: "{{ (vm_shutdown_timeout | int) * 2 }}"
+      poll: 0
+      register: kvm_bulk_jobs
+      when: migration_bulk | bool
+
+    - name: "Drain | KVM | Wait for bulk migrations to complete"
+      ansible.builtin.async_status:
+        jid: "{{ item.ansible_job_id }}"
+      register: kvm_job_result
+      until: kvm_job_result.finished
+      retries: 60
+      delay: 10
+      loop: "{{ kvm_bulk_jobs.results }}"
+      delegate_to: localhost
+      when: migration_bulk | bool
+
+# ── LXC migrations ────────────────────────────────────────────────────────────
+- name: "Drain | Migrate LXC guests"
+  when: lxc_guests | length > 0
+  block:
+    - name: "Drain | LXC | Warn about restart requirement"
+      ansible.builtin.debug:
+        msg: >-
+          LXC {{ item.vmid }} ({{ item.name | default('unknown') }}) will be
+          stopped, migrated to {{ migration_target }}, and restarted
+          (LXC live migration is not supported by Proxmox).
+      loop: "{{ lxc_guests | selectattr('status', 'equalto', 'running') | list }}"
+      delegate_to: localhost
+
+    - name: "Drain | LXC | Warn about skipped containers"
+      ansible.builtin.debug:
+        msg: >-
+          WARNING — LXC {{ item.vmid }} ({{ item.name | default('unknown') }})
+          live_migrate_fallback=skip — this container WILL GO DOWN during node reboot.
+      loop: "{{ lxc_guests | selectattr('status', 'equalto', 'running') | list }}"
+      when: live_migrate_fallback == 'skip'
+      delegate_to: localhost
+
+    # Moves each container to migration_target; running containers are
+    # migrated with --restart. Skipped entirely when live_migrate_fallback=skip.
+    - name: "Drain | LXC | Migrate via pct migrate --restart"
+      ansible.builtin.command: >
+        pct migrate {{ item.vmid }} {{ migration_target }}
+        {{ '--restart' if item.status == 'running' else '' }}
+        --timeout {{ lxc_migrate_timeout }}
+      loop: "{{ lxc_guests }}"
+      when: live_migrate_fallback != 'skip'
+      register: lxc_migrate_result
      changed_when: true
+      # Run on the node that currently hosts the containers, mirroring the
+      # delegate_to used by the LXC restore task in restore.yml.
+      # NOTE(review): assumes Proxmox node names are valid inventory hosts — confirm.
+      delegate_to: "{{ current_node }}"

-    - name: "Drain | Bulk | Wait for all migration tasks to complete"
-      ansible.builtin.shell: |
-        python3 << 'PYEOF'
-        import urllib.request, json, ssl, time
+    - name: "Drain | LXC | Log migration results"
+      ansible.builtin.debug:
+        msg: "LXC {{ item.item.vmid }} migrated to {{ migration_target }}"
+      loop: "{{ lxc_migrate_result.results }}"
+      when:
+        - live_migrate_fallback != 'skip'
+        - item.rc == 0

-        ctx = ssl.create_default_context()
-        ctx.check_hostname = False
-        ctx.verify_mode = ssl.CERT_NONE
-
-        api_base = "https://{{ api_host }}:{{ api_port }}/api2/json"
-        headers  = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"}
-        node     = "{{ current_node }}"
-
-        lines = """{{ bulk_trigger_raw.stdout }}""".strip().split('\n')
-        last_line = [l for l in lines if l.startswith('{')][-1]
-        task_ids = json.loads(last_line)["task_ids"]
-
-        failed = []
-        for t in task_ids:
-            retries = 60
-            while retries > 0:
-                url = f"{api_base}/nodes/{node}/tasks/{t['task']}/status"
-                req = urllib.request.Request(url, headers=headers)
-                with urllib.request.urlopen(req, context=ctx) as r:
-                    status = json.loads(r.read())["data"]
-                if status["status"] == "stopped":
-                    if status.get("exitstatus") != "OK":
-                        failed.append(f"{t['name']} ({t['vmid']}): {status.get('exitstatus')}")
-                    else:
-                        print(f"OK: {t['name']} ({t['vmid']}) migrated successfully")
-                    break
-                time.sleep(10)
-                retries -= 1
-            else:
-                failed.append(f"{t['name']} ({t['vmid']}): timed out")
-
-        if failed:
-            print("FAILED: " + ", ".join(failed))
-            exit(1)
-        print("All bulk migrations completed successfully")
-        PYEOF
-      register: bulk_wait_result
-      delegate_to: localhost
-      changed_when: false
-
-    - name: "Drain | Bulk | Handle fallback guests sequentially"
-      include_tasks: migrate_guest.yml
-      loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
-      loop_control:
-        loop_var: guest
-      when: live_migrate_fallback != 'skip'
+- name: "Drain | {{ current_node }} drained successfully"
+  ansible.builtin.debug:
+    msg: >-
+      Node {{ current_node }} drained —
+      {{ kvm_guests | length }} KVM + {{ lxc_guests | length }} LXC guests
+      migrated to {{ migration_target }}
--- a/roles/proxmox_upgrade/tasks/main.yml
+++ b/roles/proxmox_upgrade/tasks/main.yml
@@ -8,18 +8,16 @@
  ansible.builtin.debug:
    msg: >-
      Starting Proxmox rolling upgrade for {{ client_name }}
-      — {{ upgrade_order | length }} nodes in order: {{ upgrade_order | join(' → ') }}
+      — {{ upgrade_order | length }} nodes: {{ upgrade_order | join(' → ') }}
      — migration_bulk={{ migration_bulk }}
      — live_migrate_fallback={{ live_migrate_fallback }}
      — migration_restore={{ migration_restore }}
      — ceph_enabled={{ ceph_enabled }}

-# ── Cluster health preflight ──────────────────────────────────────────────────
 - name: Proxmox Upgrade | Cluster preflight
  include_tasks: preflight.yml

-# ── Rolling upgrade — one node at a time ─────────────────────────────────────
- name: Proxmox Upgrade | Rolling upgrade loop
+- name: Proxmox Upgrade | Rolling upgrade
  include_tasks: node_upgrade.yml
  loop: "{{ upgrade_order }}"
  loop_control:
@@ -27,6 +25,4 @@

 - name: Proxmox Upgrade | Complete
  ansible.builtin.debug:
-    msg: >-
-      Proxmox rolling upgrade complete for {{ client_name }}
-      — all {{ upgrade_order | length }} nodes upgraded successfully
+    msg: "Proxmox rolling upgrade complete for {{ client_name }} — {{ upgrade_order | length }} nodes upgraded"
--- a/roles/proxmox_upgrade/tasks/node_upgrade.yml
+++ b/roles/proxmox_upgrade/tasks/node_upgrade.yml
@@ -1,8 +1,8 @@
 ---
 # =============================================================================
 # proxmox_upgrade — node_upgrade.yml
-# Per-node upgrade sequence: backup → drain → upgrade → restore
-# Called with loop_var: current_node
+# Per-node sequence: backup → drain → upgrade → restore
+# Called via loop with loop_var: current_node
 # =============================================================================

 - name: "Node {{ current_node }} | Start"
@@ -16,18 +16,18 @@
  vars:
    pve_config_git_commit_message: "[{{ client_id }}] {{ current_node }} pre-upgrade config backup {{ ansible_date_time.date }}"

-# ── Step 2: Drain node ────────────────────────────────────────────────────────
- name: "Node {{ current_node }} | Step 2 — Drain (migrate guests off node)"
+# ── Step 2: Drain ─────────────────────────────────────────────────────────────
+- name: "Node {{ current_node }} | Step 2 — Drain guests"
  include_tasks: drain.yml
  when: cluster_mode == 'cluster'

- name: "Node {{ current_node }} | Step 2 — Single node mode, skipping drain"
+- name: "Node {{ current_node }} | Step 2 — Single node, skipping drain"
  ansible.builtin.debug:
    msg: "cluster_mode=single — skipping guest migration"
  when: cluster_mode == 'single'

 # ── Step 3: Upgrade ───────────────────────────────────────────────────────────
- name: "Node {{ current_node }} | Step 3 — Upgrade packages"
+- name: "Node {{ current_node }} | Step 3 — Upgrade"
  include_tasks: upgrade.yml

 # ── Step 4: Restore ───────────────────────────────────────────────────────────
--- a/roles/proxmox_upgrade/tasks/preflight.yml
+++ b/roles/proxmox_upgrade/tasks/preflight.yml
@@ -1,8 +1,9 @@
 ---
 # =============================================================================
 # proxmox_upgrade — preflight.yml
-# Check cluster health before starting any upgrade work
-# All API checks done in single tasks on localhost to avoid variable scope issues
+# Cluster health check before starting any upgrade work
+# pvecm runs on the node directly (SSH) — no delegation needed
+# API node check runs delegate_to: localhost via community.proxmox
 # =============================================================================

 - name: Preflight | Check quorum via pvecm
@@ -11,65 +12,55 @@
  register: quorum_check
  changed_when: false
  failed_when: quorum_check.rc != 0
+  run_once: true

- name: Preflight | Check all cluster nodes online via API
-  ansible.builtin.shell: |
-    python3 << 'PYEOF'
-    import urllib.request, urllib.error, json, ssl
-
-    ctx = ssl.create_default_context()
-    ctx.check_hostname = False
-    ctx.verify_mode = ssl.CERT_NONE
-
-    req = urllib.request.Request(
-        "https://{{ api_host }}:{{ api_port }}/api2/json/nodes",
-        headers={"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"}
-    )
-    with urllib.request.urlopen(req, context=ctx) as r:
-        data = json.loads(r.read())["data"]
-
-    offline = [n for n in data if n["status"] != "online"]
-    if offline:
-        print("FAIL: " + ", ".join(f"{n['node']}={n['status']}" for n in offline))
-        exit(1)
-    else:
-        print("OK: " + ", ".join(f"{n['node']}={n['status']}" for n in data))
-        exit(0)
-    PYEOF
-  register: node_check
-  changed_when: false
-  failed_when: node_check.rc != 0
+- name: Preflight | Get all cluster nodes via API
+  community.proxmox.proxmox_node_info:
+    api_host: "{{ api_host }}"
+    api_token_id: "{{ api_token_id }}"
+    api_token_secret: "{{ api_token_secret }}"
+    api_port: "{{ api_port }}"
+  register: cluster_node_info
  delegate_to: localhost
  run_once: true

- name: Preflight | Log node status
-  ansible.builtin.debug:
-    msg: "{{ node_check.stdout }}"
+- name: Preflight | Fail if any node is offline
+  ansible.builtin.fail:
+    msg: >
+      Cluster health check FAILED — node {{ item.node }} is {{ item.status }}.
+      Aborting upgrade to prevent data loss. Investigate before retrying.
+  loop: "{{ cluster_node_info.proxmox_nodes }}"
+  when: item.status != 'online'
  delegate_to: localhost
  run_once: true

 - name: Preflight | Check CEPH health
  when: ceph_enabled | bool
  block:
-    - name: Preflight | Get CEPH health status
-      ansible.builtin.shell: ceph health 2>/dev/null
+    # Read-only health probe whose stdout is inspected by the following tasks.
+    # No shell features are used, so run it as a plain command.
+    - name: Preflight | Get CEPH health
+      ansible.builtin.command: ceph health
      register: ceph_health
      changed_when: false
+      run_once: true

    - name: Preflight | Abort if CEPH is in error state
      ansible.builtin.fail:
        msg: >
-          CEPH health check FAILED — status: {{ ceph_health.stdout }}.
-          Aborting upgrade. Resolve CEPH issues before retrying.
+          CEPH health check FAILED — {{ ceph_health.stdout }}.
+          Resolve CEPH issues before retrying.
      when: "'HEALTH_OK' not in ceph_health.stdout and 'HEALTH_WARN' not in ceph_health.stdout"
+      run_once: true

    - name: Preflight | Warn if CEPH has warnings
      ansible.builtin.debug:
        msg: "WARNING — CEPH has warnings: {{ ceph_health.stdout }}. Proceeding but monitor closely."
      when: "'HEALTH_WARN' in ceph_health.stdout"
+      run_once: true

 - name: Preflight | Cluster health check passed
  ansible.builtin.debug:
-    msg: "Cluster health check passed — all nodes online, quorum OK{{ ', CEPH checked' if ceph_enabled else '' }}"
+    msg: >-
+      Cluster health OK — {{ cluster_node_info.proxmox_nodes | length }} nodes online,
+      quorum confirmed{{ ', CEPH checked' if ceph_enabled else '' }}
  delegate_to: localhost
  run_once: true
--- a/roles/proxmox_upgrade/tasks/proxmox_upgrade.yml
+++ b/roles/proxmox_upgrade/tasks/proxmox_upgrade.yml
@@ -1,22 +1,19 @@
 ---
 # =============================================================================
-# proxmox_upgrade.yml
+# proxmox_upgrade.yml — Rolling Proxmox cluster upgrade
 # =============================================================================
-# Rolling Proxmox cluster upgrade playbook.
-# Runs on the first node in upgrade_order — all other nodes are handled
-# via API calls and delegate_to from within the role.
+# Requires: community.proxmox collection + proxmoxer>=2.0, requests on Semaphore
+#   ansible-galaxy collection install community.proxmox
+#   pip install proxmoxer requests --break-system-packages
 #
 # Usage:
 #   ansible-playbook playbooks/proxmox_upgrade.yml \
 #     -i inventories/client_local_eng/hypervisor_hosts.yml
 #
-# Override migration behaviour:
+# Overrides:
 #   -e migration_bulk=true
 #   -e live_migrate_fallback=skip
 #   -e migration_restore=true
-#
-# Dry run (check mode — no changes):
-#   --check
 # =============================================================================

 - name: Proxmox Rolling Upgrade
@@ -26,15 +23,15 @@
  run_once: true

  pre_tasks:
-    - name: Confirm upgrade_order is defined
+    - name: Validate upgrade_order is defined
      ansible.builtin.fail:
        msg: "upgrade_order must be defined in hypervisor_hosts.yml"
      when: upgrade_order is not defined or upgrade_order | length == 0

-    - name: Log upgrade targets
+    - name: Log upgrade plan
      ansible.builtin.debug:
        msg: >-
-          Proxmox upgrade starting for {{ client_name }} ({{ client_id }})
+          Proxmox upgrade: {{ client_name }} ({{ client_id }})
          Nodes: {{ upgrade_order | join(', ') }}
          API: https://{{ api_host }}:{{ api_port }}

--- a/roles/proxmox_upgrade/tasks/restore.yml
+++ b/roles/proxmox_upgrade/tasks/restore.yml
@@ -1,81 +1,43 @@
 ---
 # =============================================================================
 # proxmox_upgrade — restore.yml
-# Optionally migrate guests back to their original node after upgrade
+# Optionally migrate guests back to original node after upgrade
 # Only runs if migration_restore: true
 # =============================================================================

 - name: "Restore | Skip — migration_restore=false"
  ansible.builtin.debug:
-    msg: "migration_restore=false — leaving guests on their current nodes"
+    # migration_target is only set by drain.yml, which is skipped when
+    # cluster_mode=single; fall back to generic wording instead of failing
+    # on an undefined variable.
+    msg: "migration_restore=false — guests remain on {{ migration_target | default('their current node') }}"
  when: not migration_restore | bool
-  delegate_to: localhost

 - name: "Restore | Migrate guests back to {{ current_node }}"
  when: migration_restore | bool
  block:
-    - name: "Restore | Migrate all guests back to {{ current_node }}"
-      ansible.builtin.shell: |
-        python3 << 'PYEOF'
-        import urllib.request, json, ssl, time
-
-        ctx = ssl.create_default_context()
-        ctx.check_hostname = False
-        ctx.verify_mode = ssl.CERT_NONE
-
-        api_base = "https://{{ api_host }}:{{ api_port }}/api2/json"
-        headers  = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"}
-        node     = "{{ current_node }}"
-        source   = "{{ migration_targets | first }}"
-        plan     = {{ migration_plan | to_json }}
-        fallback = "{{ live_migrate_fallback }}"
-
-        def api_req(path, method="GET", body=None):
-            url = f"{api_base}{path}"
-            data = json.dumps(body).encode() if body else None
-            hdrs = {**headers}
-            if data:
-                hdrs["Content-Type"] = "application/json"
-            req = urllib.request.Request(url, data=data, headers=hdrs, method=method)
-            with urllib.request.urlopen(req, context=ctx) as r:
-                return json.loads(r.read())["data"]
-
-        task_ids = []
-        for guest in plan:
-            if guest["needs_fallback"] and fallback == "skip":
-                print(f"SKIP restore: {guest['type'].upper()} {guest['vmid']} ({guest['name']}) — was skipped during drain")
-                continue
-            gtype = guest["type"]
-            online = 0 if (guest["needs_fallback"] and fallback == "shutdown") else 1
-            print(f"Restoring {gtype.upper()} {guest['vmid']} ({guest['name']}) → {node} (online={online})...")
-            task_id = api_req(f"/nodes/{source}/{gtype}/{guest['vmid']}/migrate", "POST",
-                              {"target": node, "online": online})
-            task_ids.append({"vmid": guest["vmid"], "name": guest["name"], "task": task_id, "type": gtype})
-
-        failed = []
-        for t in task_ids:
-            for _ in range(60):
-                status = api_req(f"/nodes/{source}/tasks/{t['task']}/status")
-                if status["status"] == "stopped":
-                    if status.get("exitstatus") != "OK":
-                        failed.append(f"{t['name']} ({t['vmid']}): {status.get('exitstatus')}")
-                    else:
-                        print(f"OK: {t['name']} ({t['vmid']}) restored to {node}")
-                    break
-                time.sleep(10)
-            else:
-                failed.append(f"{t['name']} ({t['vmid']}): timed out")
-
-        if failed:
-            print("FAILED restores: " + ", ".join(failed))
-            exit(1)
-        print(f"All guests restored to {node}")
-        PYEOF
-      register: restore_result
+    - name: "Restore | KVM | Migrate back to {{ current_node }}"
+      community.proxmox.proxmox_kvm:
+        api_host: "{{ api_host }}"
+        api_token_id: "{{ api_token_id }}"
+        api_token_secret: "{{ api_token_secret }}"
+        api_port: "{{ api_port }}"
+        node: "{{ migration_target }}"
+        vmid: "{{ item.vmid }}"
+        migrate: true
+        target_node: "{{ current_node }}"
+        online: "{{ true if item.status == 'running' else false }}"
+        timeout: "{{ vm_shutdown_timeout }}"
+      loop: "{{ kvm_guests | default([]) }}"
      delegate_to: localhost
+
+    - name: "Restore | LXC | Migrate back to {{ current_node }}"
+      ansible.builtin.command: >
+        pct migrate {{ item.vmid }} {{ current_node }}
+        {{ '--restart' if item.status == 'running' else '' }}
+        --timeout {{ lxc_migrate_timeout }}
+      loop: "{{ lxc_guests | default([]) }}"
+      when: live_migrate_fallback != 'skip'
      changed_when: true
+      delegate_to: "{{ migration_target }}"

-    - name: "Restore | Log result"
+    - name: "Restore | Complete"
      ansible.builtin.debug:
-        msg: "{{ restore_result.stdout_lines }}"
-      delegate_to: localhost
+        msg: "All guests restored to {{ current_node }}"
--- a/roles/proxmox_upgrade/tasks/upgrade.yml
+++ b/roles/proxmox_upgrade/tasks/upgrade.yml
@@ -1,42 +1,36 @@
 ---
 # =============================================================================
 # proxmox_upgrade — upgrade.yml
-# Run apt dist-upgrade and reboot, wait for node to rejoin cluster
+# apt dist-upgrade, reboot, wait for node to rejoin cluster
+# Runs on the node via SSH; only the cluster rejoin check is delegated to localhost
 # =============================================================================

- name: Upgrade | Set CEPH noout flag before upgrade
-  ansible.builtin.shell: ceph osd set noout
+- name: "Upgrade | {{ current_node }} | Set CEPH noout flag"
+  ansible.builtin.command: ceph osd set noout
  when: ceph_enabled | bool
  changed_when: true

- name: Upgrade | Run apt update
-  ansible.builtin.shell: apt-get update -q
+- name: "Upgrade | {{ current_node }} | apt-get update"
+  ansible.builtin.apt:
+    update_cache: true
  changed_when: false

- name: Upgrade | Run apt dist-upgrade
-  ansible.builtin.shell: "{{ apt_upgrade_cmd }}"
-  register: apt_upgrade_result
-  changed_when: "'0 upgraded' not in apt_upgrade_result.stdout"
+- name: "Upgrade | {{ current_node }} | apt dist-upgrade"
+  ansible.builtin.apt:
+    upgrade: dist
+    autoremove: "{{ apt_autoremove | bool }}"
+  register: apt_result

- name: Upgrade | Log packages upgraded
+- name: "Upgrade | {{ current_node }} | Log upgraded packages"
  ansible.builtin.debug:
-    msg: "{{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('No output') }}"
+    msg: "{{ apt_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('apt dist-upgrade complete') }}"

- name: Upgrade | Run apt autoremove
-  ansible.builtin.shell: DEBIAN_FRONTEND=noninteractive apt-get autoremove -y
-  when: apt_autoremove | bool
-  changed_when: false
-
- name: Upgrade | Check if reboot is required
+- name: "Upgrade | {{ current_node }} | Check if reboot required"
  ansible.builtin.stat:
    path: /var/run/reboot-required
  register: reboot_required

- name: Upgrade | Log reboot status
-  ansible.builtin.debug:
-    msg: "{{ 'Reboot required — rebooting node' if reboot_required.stat.exists else 'No reboot required — skipping reboot' }}"
-
- name: Upgrade | Reboot node
+- name: "Upgrade | {{ current_node }} | Reboot node"
  ansible.builtin.reboot:
    reboot_timeout: "{{ node_rejoin_timeout }}"
    msg: "Rebooting for Proxmox upgrade"
@@ -44,51 +38,57 @@
    post_reboot_delay: 30
  when: reboot_required.stat.exists

+- name: "Upgrade | {{ current_node }} | Skip reboot (not required)"
+  ansible.builtin.debug:
+    msg: "No reboot required — skipping"
+  when: not reboot_required.stat.exists
+
 # ── Wait for node to rejoin cluster ──────────────────────────────────────────
- name: Upgrade | Wait for node to appear online in cluster
-  ansible.builtin.uri:
-    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
-    method: GET
-    headers:
-      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
-    validate_certs: false
-  register: nodes_status
+- name: "Upgrade | {{ current_node }} | Wait for node to rejoin cluster"
+  community.proxmox.proxmox_node_info:
+    api_host: "{{ api_host }}"
+    api_token_id: "{{ api_token_id }}"
+    api_token_secret: "{{ api_token_secret }}"
+    api_port: "{{ api_port }}"
+  register: rejoin_check
  until: >-
-    nodes_status.json.data
+    rejoin_check.proxmox_nodes
    | selectattr('node', 'equalto', current_node)
    | selectattr('status', 'equalto', 'online')
    | list | length > 0
  retries: "{{ (node_rejoin_timeout | int / 10) | int }}"
  delay: 10
  delegate_to: localhost
+  when: reboot_required.stat.exists

-- name: Upgrade | Node {{ current_node }} back online
+- name: "Upgrade | {{ current_node }} | Node back online"
  ansible.builtin.debug:
-    msg: "Node {{ current_node }} has rejoined the cluster"
+    msg: "Node {{ current_node }} {{ 'has rejoined the cluster' if reboot_required.stat.exists else 'was not rebooted — rejoin check skipped' }}"

-# ── CEPH recovery wait ────────────────────────────────────────────────────────
- name: Upgrade | Wait for CEPH to recover
+# ── CEPH recovery ─────────────────────────────────────────────────────────────
+- name: "Upgrade | {{ current_node }} | Wait for CEPH to recover"
  when: ceph_enabled | bool
  block:
-    - name: Upgrade | CEPH | Wait for HEALTH_OK or HEALTH_WARN
-      ansible.builtin.shell: ceph health
+    - name: "Upgrade | CEPH | Wait for healthy status"
+      ansible.builtin.command: ceph health
      register: ceph_health_post
-      until: "'HEALTH_OK' in ceph_health_post.stdout or 'HEALTH_WARN' in ceph_health_post.stdout"
+      until: >-
+        'HEALTH_OK' in ceph_health_post.stdout or
+        'HEALTH_WARN' in ceph_health_post.stdout
      retries: "{{ (ceph_recover_timeout | int / 10) | int }}"
      delay: 10
      changed_when: false

-    - name: Upgrade | CEPH | Clear noout flag
-      ansible.builtin.shell: ceph osd unset noout
+    - name: "Upgrade | CEPH | Clear noout flag"
+      ansible.builtin.command: ceph osd unset noout
      changed_when: true

-    - name: Upgrade | CEPH | Log recovery status
+    - name: "Upgrade | CEPH | Status"
      ansible.builtin.debug:
        msg: "CEPH recovered: {{ ceph_health_post.stdout }}"

- name: Upgrade | Node {{ current_node }} upgrade complete
+- name: "Upgrade | {{ current_node }} | Upgrade complete"
  ansible.builtin.debug:
    msg: >-
-      Node {{ current_node }} upgrade complete —
-      {{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('packages updated') }}
+      Node {{ current_node }} upgrade complete
      {{ '— rebooted' if reboot_required.stat.exists else '— no reboot needed' }}