From 9bb8e97c82f63b6d3f73661713a037a2183830c1 Mon Sep 17 00:00:00 2001 From: Semaphore Date: Sat, 14 Mar 2026 15:23:47 -0700 Subject: [PATCH] refactor: proxmox_upgrade use inline community.proxmox for API calls. --- requirements.yml | 6 + roles/proxmox_upgrade/defaults/main.yml | 30 +- roles/proxmox_upgrade/tasks/drain.yml | 342 ++++++++---------- roles/proxmox_upgrade/tasks/main.yml | 10 +- roles/proxmox_upgrade/tasks/node_upgrade.yml | 12 +- roles/proxmox_upgrade/tasks/preflight.yml | 65 ++-- .../proxmox_upgrade/tasks/proxmox_upgrade.yml | 19 +- roles/proxmox_upgrade/tasks/restore.yml | 90 ++--- roles/proxmox_upgrade/tasks/upgrade.yml | 84 ++--- 9 files changed, 274 insertions(+), 384 deletions(-) create mode 100644 requirements.yml diff --git a/requirements.yml b/requirements.yml new file mode 100644 index 0000000..b8c5a99 --- /dev/null +++ b/requirements.yml @@ -0,0 +1,6 @@ +--- +collections: + - name: community.proxmox + version: ">=1.6.0" + - name: community.general + version: ">=8.0.0" diff --git a/roles/proxmox_upgrade/defaults/main.yml b/roles/proxmox_upgrade/defaults/main.yml index c5faacd..c2d1674 100644 --- a/roles/proxmox_upgrade/defaults/main.yml +++ b/roles/proxmox_upgrade/defaults/main.yml @@ -4,27 +4,23 @@ # ============================================================================= # Migration behaviour -migration_bulk: false # true = all VMs at once, false = one at a time -migration_restore: false # true = migrate VMs back to original node after upgrade -live_migrate_fallback: shutdown # migrate | shutdown | skip +migration_bulk: false # true = fire all migrations at once, false = one at a time +migration_restore: false # true = migrate guests back to original node after upgrade +live_migrate_fallback: shutdown # shutdown | skip +# shutdown: shutdown VM, cold migrate, restart on target +# skip: leave VM on node (it will go down during reboot — use with caution) -# Shutdown timeout in seconds before forcing off -vm_shutdown_timeout: 120 +# 
Timeouts (seconds) +vm_shutdown_timeout: 120 # graceful shutdown before force-off +vm_start_timeout: 120 # wait for VM to start after cold migration +lxc_migrate_timeout: 300 # pct migrate --restart timeout +node_rejoin_timeout: 600 # wait for node to rejoin cluster after reboot +ceph_recover_timeout: 300 # wait for CEPH to recover after node rejoins -# How long to wait for a VM to start after cold migration -vm_start_timeout: 120 - -# How long to wait for node to rejoin cluster after reboot -node_rejoin_timeout: 600 - -# How long to wait for CEPH to recover after node rejoins -ceph_recover_timeout: 300 - -# apt upgrade options -apt_upgrade_cmd: "DEBIAN_FRONTEND=noninteractive apt-get dist-upgrade -y" +# Upgrade options apt_autoremove: true -# Tags on VMs/LXCs to never migrate (comma separated in Proxmox) +# VM/LXC tags that prevent migration (leave on node, warn) migrate_exclude_tags: - nomigrate - pinned diff --git a/roles/proxmox_upgrade/tasks/drain.yml b/roles/proxmox_upgrade/tasks/drain.yml index 327bee7..9a70bb4 100644 --- a/roles/proxmox_upgrade/tasks/drain.yml +++ b/roles/proxmox_upgrade/tasks/drain.yml @@ -1,225 +1,167 @@ --- # ============================================================================= # proxmox_upgrade — drain.yml -# Migrate all VMs/LXCs off a node before upgrading it -# Uses inline Python for API calls to avoid cross-task variable scope issues +# Migrate all VMs/LXCs off current_node before upgrading +# KVM: community.proxmox.proxmox_kvm (API, delegate_to: localhost) +# LXC: pct migrate (SSH on source node) # ============================================================================= -# ── Build migration plan via API ────────────────────────────────────────────── -- name: "Drain | Build migration plan for {{ current_node }}" - ansible.builtin.shell: | - python3 << 'PYEOF' - import urllib.request, urllib.error, json, ssl - - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - - api_base 
= "https://{{ api_host }}:{{ api_port }}/api2/json" - headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} - node = "{{ current_node }}" - shared = {{ shared_storage | lower | replace("true", "True") | replace("false", "False") }} - exclude_tags = {{ migrate_exclude_tags | to_json }} - - def api_get(path): - req = urllib.request.Request(f"{api_base}{path}", headers=headers) - with urllib.request.urlopen(req, context=ctx) as r: - return json.loads(r.read())["data"] - - # Get all online nodes except current - all_nodes = api_get("/nodes") - targets = [n["node"] for n in all_nodes if n["status"] == "online" and n["node"] != node] - if not targets: - print(json.dumps({"error": f"No online nodes available to migrate guests to from {node}"})) - exit(1) - - # Get VMs and LXCs on this node - vms = api_get(f"/nodes/{node}/qemu") - lxcs = api_get(f"/nodes/{node}/lxc") - - plan = [] - - for vm in vms: - tags = [t.strip() for t in (vm.get("tags") or "").split(",") if t.strip()] - if any(t in exclude_tags for t in tags): - continue - cfg = api_get(f"/nodes/{node}/qemu/{vm['vmid']}/config") - has_passthrough = any(k.startswith("hostpci") or k.startswith("usb") for k in cfg) - has_local_cdrom = any( - isinstance(v, str) and "local" in v and ".iso" in v - for v in cfg.values() - ) - needs_fallback = has_passthrough or not shared or has_local_cdrom - reason = "passthrough" if has_passthrough else ("local_disk" if not shared else ("local_cdrom" if has_local_cdrom else "")) - plan.append({ - "vmid": vm["vmid"], - "name": vm.get("name", str(vm["vmid"])), - "type": "qemu", - "status": vm["status"], - "needs_fallback": needs_fallback, - "fallback_reason": reason - }) - - for lxc in lxcs: - tags = [t.strip() for t in (lxc.get("tags") or "").split(",") if t.strip()] - if any(t in exclude_tags for t in tags): - continue - plan.append({ - "vmid": lxc["vmid"], - "name": lxc.get("name", str(lxc["vmid"])), - "type": "lxc", - "status": lxc["status"], - 
"needs_fallback": False, - "fallback_reason": "" - }) - - print(json.dumps({"plan": plan, "targets": targets})) - PYEOF - register: drain_plan_raw +# ── Discover guests on this node ────────────────────────────────────────────── +- name: "Drain | Get all guests on {{ current_node }}" + community.proxmox.proxmox_vm_info: + api_host: "{{ api_host }}" + api_token_id: "{{ api_token_id }}" + api_token_secret: "{{ api_token_secret }}" + api_port: "{{ api_port }}" + node: "{{ current_node }}" + register: node_guests delegate_to: localhost - changed_when: false -- name: "Drain | Parse migration plan" +- name: "Drain | Get available target nodes" + community.proxmox.proxmox_node_info: + api_host: "{{ api_host }}" + api_token_id: "{{ api_token_id }}" + api_token_secret: "{{ api_token_secret }}" + api_port: "{{ api_port }}" + register: all_nodes_info + delegate_to: localhost + +- name: "Drain | Set migration target" ansible.builtin.set_fact: - drain_data: "{{ drain_plan_raw.stdout | from_json }}" + migration_target: >- + {{ all_nodes_info.proxmox_nodes + | selectattr('status', 'equalto', 'online') + | rejectattr('node', 'equalto', current_node) + | map(attribute='node') + | list + | first }} delegate_to: localhost -- name: "Drain | Fail if error building plan" +- name: "Drain | Fail if no migration target available" ansible.builtin.fail: - msg: "{{ drain_data.error }}" - when: drain_data.error is defined + msg: "No online nodes available to migrate guests to. Cannot drain {{ current_node }}." 
+ when: migration_target is not defined or migration_target == '' delegate_to: localhost -- name: "Drain | Set migration plan and targets" +- name: "Drain | Build KVM migration list" ansible.builtin.set_fact: - migration_plan: "{{ drain_data.plan }}" - migration_targets: "{{ drain_data.targets }}" + kvm_guests: >- + {{ node_guests.proxmox_vms + | selectattr('type', 'equalto', 'qemu') + | rejectattr('vmid', 'in', (node_guests.proxmox_vms | selectattr('tags', 'defined') | selectattr('tags', 'search', '(?<![A-Za-z0-9_.+-])(' ~ (migrate_exclude_tags | map('regex_escape') | join('|')) ~ ')(?![A-Za-z0-9_.+-])') | map(attribute='vmid') | list) if migrate_exclude_tags else []) + | list }} delegate_to: localhost -- name: "Drain | Log migration plan for {{ current_node }}" +- name: "Drain | Build LXC migration list" + ansible.builtin.set_fact: + lxc_guests: >- + {{ node_guests.proxmox_vms + | selectattr('type', 'equalto', 'lxc') + | rejectattr('vmid', 'in', (node_guests.proxmox_vms | selectattr('tags', 'defined') | selectattr('tags', 'search', '(?<![A-Za-z0-9_.+-])(' ~ (migrate_exclude_tags | map('regex_escape') | join('|')) ~ ')(?![A-Za-z0-9_.+-])') | map(attribute='vmid') | list) if migrate_exclude_tags else []) + | list }} + delegate_to: localhost + +- name: "Drain | Log migration plan" ansible.builtin.debug: msg: >- - Migration plan for {{ current_node }} ({{ migration_plan | length }} guests → {{ migration_targets | first }}): - {% for g in migration_plan %} - - {{ g.type | upper }} {{ g.vmid }} ({{ g.name }}) [{{ g.status }}]{% if g.needs_fallback %} ⚠ fallback={{ live_migrate_fallback }} reason={{ g.fallback_reason }}{% endif %} - - {% endfor %} + Drain plan for {{ current_node }} → {{ migration_target }}: + KVM: {{ kvm_guests | map(attribute='vmid') | list }} + LXC: {{ lxc_guests | map(attribute='vmid') | list }} delegate_to: localhost -- name: "Drain | Warn about non-live-migratable guests" - ansible.builtin.debug: - msg: >- - WARNING — {{ item.type | upper }} {{ item.vmid }} ({{ item.name }}) - cannot be live migrated ({{ item.fallback_reason }}). - {% if live_migrate_fallback == 'skip' %}THIS VM WILL GO DOWN DURING NODE REBOOT. - {% elif live_migrate_fallback == 'shutdown' %}Will be shut down, cold migrated, and restarted. 
- {% else %}Will attempt live migrate anyway (may fail).{% endif %} - loop: "{{ migration_plan | selectattr('needs_fallback') | list }}" - loop_control: - loop_var: item - delegate_to: localhost - -# ── Sequential migrations ───────────────────────────────────────────────────── -- name: "Drain | Migrate guests sequentially" - when: not migration_bulk | bool +# ── KVM migrations ──────────────────────────────────────────────────────────── +- name: "Drain | Migrate KVM guests" + when: kvm_guests | length > 0 block: - - name: "Drain | Sequential | Migrate live-migratable guests" - include_tasks: migrate_guest.yml - loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}" - loop_control: - loop_var: guest - - - name: "Drain | Sequential | Handle fallback guests" - include_tasks: migrate_guest.yml - loop: "{{ migration_plan | selectattr('needs_fallback') | list }}" - loop_control: - loop_var: guest - when: live_migrate_fallback != 'skip' - -# ── Bulk migrations ─────────────────────────────────────────────────────────── -- name: "Drain | Migrate guests in bulk" - when: migration_bulk | bool - block: - - name: "Drain | Bulk | Trigger all live migrations" - ansible.builtin.shell: | - python3 << 'PYEOF' - import urllib.request, json, ssl - - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - - api_base = "https://{{ api_host }}:{{ api_port }}/api2/json" - headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} - node = "{{ current_node }}" - target = "{{ migration_targets | first }}" - plan = {{ migration_plan | rejectattr('needs_fallback') | list | to_json }} - - task_ids = [] - for guest in plan: - gtype = "qemu" if guest["type"] == "qemu" else "lxc" - url = f"{api_base}/nodes/{node}/{gtype}/{guest['vmid']}/migrate" - body = json.dumps({"target": target, "online": 1}).encode() - req = urllib.request.Request(url, data=body, headers={**headers, "Content-Type": "application/json"}, 
method="POST") - with urllib.request.urlopen(req, context=ctx) as r: - task_id = json.loads(r.read())["data"] - task_ids.append({"vmid": guest["vmid"], "name": guest["name"], "task": task_id}) - print(f"Triggered migration: {guest['type'].upper()} {guest['vmid']} ({guest['name']}) → {target} task={task_id}") - - print(json.dumps({"task_ids": task_ids})) - PYEOF - register: bulk_trigger_raw + - name: "Drain | KVM | Live migrate (sequential)" + community.proxmox.proxmox_kvm: + api_host: "{{ api_host }}" + api_token_id: "{{ api_token_id }}" + api_token_secret: "{{ api_token_secret }}" + api_port: "{{ api_port }}" + node: "{{ current_node }}" + vmid: "{{ item.vmid }}" + migrate: true + target_node: "{{ migration_target }}" + online: "{{ true if item.status == 'running' and not item.get('needs_fallback', false) else false }}" + timeout: "{{ vm_shutdown_timeout }}" + loop: "{{ kvm_guests }}" delegate_to: localhost + when: not migration_bulk | bool + + - name: "Drain | KVM | Bulk migrate (fire and wait)" + community.proxmox.proxmox_kvm: + api_host: "{{ api_host }}" + api_token_id: "{{ api_token_id }}" + api_token_secret: "{{ api_token_secret }}" + api_port: "{{ api_port }}" + node: "{{ current_node }}" + vmid: "{{ item.vmid }}" + migrate: true + target_node: "{{ migration_target }}" + online: "{{ true if item.status == 'running' else false }}" + timeout: "{{ vm_shutdown_timeout }}" + loop: "{{ kvm_guests }}" + delegate_to: localhost + async: "{{ vm_shutdown_timeout * 2 }}" + poll: 0 + register: kvm_bulk_jobs + when: migration_bulk | bool + + - name: "Drain | KVM | Wait for bulk migrations to complete" + ansible.builtin.async_status: + jid: "{{ item.ansible_job_id }}" + register: kvm_job_result + until: kvm_job_result.finished + retries: 60 + delay: 10 + loop: "{{ kvm_bulk_jobs.results }}" + delegate_to: localhost + when: migration_bulk | bool + +# ── LXC migrations ──────────────────────────────────────────────────────────── +- name: "Drain | Migrate LXC guests" + when: 
lxc_guests | length > 0 + block: + - name: "Drain | LXC | Warn about restart requirement" + ansible.builtin.debug: + msg: >- + LXC {{ item.vmid }} ({{ item.name | default('unknown') }}) will be + stopped, migrated to {{ migration_target }}, and restarted + (LXC live migration is not supported by Proxmox). + loop: "{{ lxc_guests | selectattr('status', 'equalto', 'running') | list }}" + delegate_to: localhost + + - name: "Drain | LXC | Warn about skipped containers" + ansible.builtin.debug: + msg: >- + WARNING — LXC {{ item.vmid }} ({{ item.name | default('unknown') }}) + live_migrate_fallback=skip — this container WILL GO DOWN during node reboot. + loop: "{{ lxc_guests | selectattr('status', 'equalto', 'running') | list }}" + when: live_migrate_fallback == 'skip' + delegate_to: localhost + + - name: "Drain | LXC | Migrate via pct migrate --restart" + ansible.builtin.command: > + pct migrate {{ item.vmid }} {{ migration_target }} + {{ '--restart' if item.status == 'running' else '' }} + --timeout {{ lxc_migrate_timeout }} + loop: "{{ lxc_guests }}" + when: live_migrate_fallback != 'skip' + register: lxc_migrate_result changed_when: true - - name: "Drain | Bulk | Wait for all migration tasks to complete" - ansible.builtin.shell: | - python3 << 'PYEOF' - import urllib.request, json, ssl, time + - name: "Drain | LXC | Log migration results" + ansible.builtin.debug: + msg: "LXC {{ item.item.vmid }} migrated to {{ migration_target }}" + loop: "{{ lxc_migrate_result.results }}" + when: + - live_migrate_fallback != 'skip' + - item.rc == 0 - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - - api_base = "https://{{ api_host }}:{{ api_port }}/api2/json" - headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} - node = "{{ current_node }}" - - lines = """{{ bulk_trigger_raw.stdout }}""".strip().split('\n') - last_line = [l for l in lines if l.startswith('{')][-1] - task_ids = 
json.loads(last_line)["task_ids"] - - failed = [] - for t in task_ids: - retries = 60 - while retries > 0: - url = f"{api_base}/nodes/{node}/tasks/{t['task']}/status" - req = urllib.request.Request(url, headers=headers) - with urllib.request.urlopen(req, context=ctx) as r: - status = json.loads(r.read())["data"] - if status["status"] == "stopped": - if status.get("exitstatus") != "OK": - failed.append(f"{t['name']} ({t['vmid']}): {status.get('exitstatus')}") - else: - print(f"OK: {t['name']} ({t['vmid']}) migrated successfully") - break - time.sleep(10) - retries -= 1 - else: - failed.append(f"{t['name']} ({t['vmid']}): timed out") - - if failed: - print("FAILED: " + ", ".join(failed)) - exit(1) - print("All bulk migrations completed successfully") - PYEOF - register: bulk_wait_result - delegate_to: localhost - changed_when: false - - - name: "Drain | Bulk | Handle fallback guests sequentially" - include_tasks: migrate_guest.yml - loop: "{{ migration_plan | selectattr('needs_fallback') | list }}" - loop_control: - loop_var: guest - when: live_migrate_fallback != 'skip' \ No newline at end of file +- name: "Drain | {{ current_node }} drained successfully" + ansible.builtin.debug: + msg: >- + Node {{ current_node }} drained — + {{ kvm_guests | length }} KVM + {{ lxc_guests | length }} LXC guests + migrated to {{ migration_target }} diff --git a/roles/proxmox_upgrade/tasks/main.yml b/roles/proxmox_upgrade/tasks/main.yml index 3ca52de..54c9f60 100644 --- a/roles/proxmox_upgrade/tasks/main.yml +++ b/roles/proxmox_upgrade/tasks/main.yml @@ -8,18 +8,16 @@ ansible.builtin.debug: msg: >- Starting Proxmox rolling upgrade for {{ client_name }} - — {{ upgrade_order | length }} nodes in order: {{ upgrade_order | join(' → ') }} + — {{ upgrade_order | length }} nodes: {{ upgrade_order | join(' → ') }} — migration_bulk={{ migration_bulk }} — live_migrate_fallback={{ live_migrate_fallback }} — migration_restore={{ migration_restore }} — ceph_enabled={{ ceph_enabled }} -# ── Cluster 
health preflight ────────────────────────────────────────────────── - name: Proxmox Upgrade | Cluster preflight include_tasks: preflight.yml -# ── Rolling upgrade — one node at a time ───────────────────────────────────── -- name: Proxmox Upgrade | Rolling upgrade loop +- name: Proxmox Upgrade | Rolling upgrade include_tasks: node_upgrade.yml loop: "{{ upgrade_order }}" loop_control: @@ -27,6 +25,4 @@ - name: Proxmox Upgrade | Complete ansible.builtin.debug: - msg: >- - Proxmox rolling upgrade complete for {{ client_name }} - — all {{ upgrade_order | length }} nodes upgraded successfully + msg: "Proxmox rolling upgrade complete for {{ client_name }} — {{ upgrade_order | length }} nodes upgraded" diff --git a/roles/proxmox_upgrade/tasks/node_upgrade.yml b/roles/proxmox_upgrade/tasks/node_upgrade.yml index 089cc79..113dfa6 100644 --- a/roles/proxmox_upgrade/tasks/node_upgrade.yml +++ b/roles/proxmox_upgrade/tasks/node_upgrade.yml @@ -1,8 +1,8 @@ --- # ============================================================================= # proxmox_upgrade — node_upgrade.yml -# Per-node upgrade sequence: backup → drain → upgrade → restore -# Called with loop_var: current_node +# Per-node sequence: backup → drain → upgrade → restore +# Called via loop with loop_var: current_node # ============================================================================= - name: "Node {{ current_node }} | Start" @@ -16,18 +16,18 @@ vars: pve_config_git_commit_message: "[{{ client_id }}] {{ current_node }} pre-upgrade config backup {{ ansible_date_time.date }}" -# ── Step 2: Drain node ──────────────────────────────────────────────────────── -- name: "Node {{ current_node }} | Step 2 — Drain (migrate guests off node)" +# ── Step 2: Drain ───────────────────────────────────────────────────────────── +- name: "Node {{ current_node }} | Step 2 — Drain guests" include_tasks: drain.yml when: cluster_mode == 'cluster' -- name: "Node {{ current_node }} | Step 2 — Single node mode, skipping drain" +- 
name: "Node {{ current_node }} | Step 2 — Single node, skipping drain" ansible.builtin.debug: msg: "cluster_mode=single — skipping guest migration" when: cluster_mode == 'single' # ── Step 3: Upgrade ─────────────────────────────────────────────────────────── -- name: "Node {{ current_node }} | Step 3 — Upgrade packages" +- name: "Node {{ current_node }} | Step 3 — Upgrade" include_tasks: upgrade.yml # ── Step 4: Restore ─────────────────────────────────────────────────────────── diff --git a/roles/proxmox_upgrade/tasks/preflight.yml b/roles/proxmox_upgrade/tasks/preflight.yml index a858abe..5ddbe4a 100644 --- a/roles/proxmox_upgrade/tasks/preflight.yml +++ b/roles/proxmox_upgrade/tasks/preflight.yml @@ -1,8 +1,9 @@ --- # ============================================================================= # proxmox_upgrade — preflight.yml -# Check cluster health before starting any upgrade work -# All API checks done in single tasks on localhost to avoid variable scope issues +# Cluster health check before starting any upgrade work +# pvecm runs on the node directly (SSH) — no delegation needed +# API node check runs delegate_to: localhost via community.proxmox # ============================================================================= - name: Preflight | Check quorum via pvecm @@ -11,65 +12,55 @@ register: quorum_check changed_when: false failed_when: quorum_check.rc != 0 + run_once: true -- name: Preflight | Check all cluster nodes online via API - ansible.builtin.shell: | - python3 << 'PYEOF' - import urllib.request, urllib.error, json, ssl - - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - - req = urllib.request.Request( - "https://{{ api_host }}:{{ api_port }}/api2/json/nodes", - headers={"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} - ) - with urllib.request.urlopen(req, context=ctx) as r: - data = json.loads(r.read())["data"] - - offline = [n for n in data if n["status"] != 
"online"] - if offline: - print("FAIL: " + ", ".join(f"{n['node']}={n['status']}" for n in offline)) - exit(1) - else: - print("OK: " + ", ".join(f"{n['node']}={n['status']}" for n in data)) - exit(0) - PYEOF - register: node_check - changed_when: false - failed_when: node_check.rc != 0 +- name: Preflight | Get all cluster nodes via API + community.proxmox.proxmox_node_info: + api_host: "{{ api_host }}" + api_token_id: "{{ api_token_id }}" + api_token_secret: "{{ api_token_secret }}" + api_port: "{{ api_port }}" + register: cluster_node_info delegate_to: localhost run_once: true -- name: Preflight | Log node status - ansible.builtin.debug: - msg: "{{ node_check.stdout }}" +- name: Preflight | Fail if any node is offline + ansible.builtin.fail: + msg: > + Cluster health check FAILED — node {{ item.node }} is {{ item.status }}. + Aborting upgrade to prevent data loss. Investigate before retrying. + loop: "{{ cluster_node_info.proxmox_nodes }}" + when: item.status != 'online' delegate_to: localhost run_once: true - name: Preflight | Check CEPH health when: ceph_enabled | bool block: - - name: Preflight | Get CEPH health status - ansible.builtin.shell: ceph health 2>/dev/null + - name: Preflight | Get CEPH health + ansible.builtin.shell: ceph health register: ceph_health changed_when: false + run_once: true - name: Preflight | Abort if CEPH is in error state ansible.builtin.fail: msg: > - CEPH health check FAILED — status: {{ ceph_health.stdout }}. - Aborting upgrade. Resolve CEPH issues before retrying. + CEPH health check FAILED — {{ ceph_health.stdout }}. + Resolve CEPH issues before retrying. when: "'HEALTH_OK' not in ceph_health.stdout and 'HEALTH_WARN' not in ceph_health.stdout" + run_once: true - name: Preflight | Warn if CEPH has warnings ansible.builtin.debug: msg: "WARNING — CEPH has warnings: {{ ceph_health.stdout }}. Proceeding but monitor closely." 
when: "'HEALTH_WARN' in ceph_health.stdout" + run_once: true - name: Preflight | Cluster health check passed ansible.builtin.debug: - msg: "Cluster health check passed — all nodes online, quorum OK{{ ', CEPH checked' if ceph_enabled else '' }}" + msg: >- + Cluster health OK — {{ cluster_node_info.proxmox_nodes | length }} nodes online, + quorum confirmed{{ ', CEPH checked' if ceph_enabled else '' }} delegate_to: localhost run_once: true diff --git a/roles/proxmox_upgrade/tasks/proxmox_upgrade.yml b/roles/proxmox_upgrade/tasks/proxmox_upgrade.yml index 0febe22..88a9d7c 100644 --- a/roles/proxmox_upgrade/tasks/proxmox_upgrade.yml +++ b/roles/proxmox_upgrade/tasks/proxmox_upgrade.yml @@ -1,22 +1,19 @@ --- # ============================================================================= -# proxmox_upgrade.yml +# proxmox_upgrade.yml — Rolling Proxmox cluster upgrade # ============================================================================= -# Rolling Proxmox cluster upgrade playbook. -# Runs on the first node in upgrade_order — all other nodes are handled -# via API calls and delegate_to from within the role. 
+# Requires: community.proxmox collection + proxmoxer>=2.0, requests on Semaphore +# ansible-galaxy collection install community.proxmox +# pip install proxmoxer requests --break-system-packages # # Usage: # ansible-playbook playbooks/proxmox_upgrade.yml \ # -i inventories/client_local_eng/hypervisor_hosts.yml # -# Override migration behaviour: +# Overrides: # -e migration_bulk=true # -e live_migrate_fallback=skip # -e migration_restore=true -# -# Dry run (check mode — no changes): -# --check # ============================================================================= - name: Proxmox Rolling Upgrade @@ -26,15 +23,15 @@ run_once: true pre_tasks: - - name: Confirm upgrade_order is defined + - name: Validate upgrade_order is defined ansible.builtin.fail: msg: "upgrade_order must be defined in hypervisor_hosts.yml" when: upgrade_order is not defined or upgrade_order | length == 0 - - name: Log upgrade targets + - name: Log upgrade plan ansible.builtin.debug: msg: >- - Proxmox upgrade starting for {{ client_name }} ({{ client_id }}) + Proxmox upgrade: {{ client_name }} ({{ client_id }}) Nodes: {{ upgrade_order | join(', ') }} API: https://{{ api_host }}:{{ api_port }} diff --git a/roles/proxmox_upgrade/tasks/restore.yml b/roles/proxmox_upgrade/tasks/restore.yml index 65e927d..9584db4 100644 --- a/roles/proxmox_upgrade/tasks/restore.yml +++ b/roles/proxmox_upgrade/tasks/restore.yml @@ -1,81 +1,43 @@ --- # ============================================================================= # proxmox_upgrade — restore.yml -# Optionally migrate guests back to their original node after upgrade +# Optionally migrate guests back to original node after upgrade # Only runs if migration_restore: true # ============================================================================= - name: "Restore | Skip — migration_restore=false" ansible.builtin.debug: - msg: "migration_restore=false — leaving guests on their current nodes" + msg: "migration_restore=false — guests remain on {{ 
migration_target | default('their current nodes') }}" when: not migration_restore | bool - delegate_to: localhost - name: "Restore | Migrate guests back to {{ current_node }}" when: migration_restore | bool block: - - name: "Restore | Migrate all guests back to {{ current_node }}" - ansible.builtin.shell: | - python3 << 'PYEOF' - import urllib.request, json, ssl, time - - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - - api_base = "https://{{ api_host }}:{{ api_port }}/api2/json" - headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} - node = "{{ current_node }}" - source = "{{ migration_targets | first }}" - plan = {{ migration_plan | to_json }} - fallback = "{{ live_migrate_fallback }}" - - def api_req(path, method="GET", body=None): - url = f"{api_base}{path}" - data = json.dumps(body).encode() if body else None - hdrs = {**headers} - if data: - hdrs["Content-Type"] = "application/json" - req = urllib.request.Request(url, data=data, headers=hdrs, method=method) - with urllib.request.urlopen(req, context=ctx) as r: - return json.loads(r.read())["data"] - - task_ids = [] - for guest in plan: - if guest["needs_fallback"] and fallback == "skip": - print(f"SKIP restore: {guest['type'].upper()} {guest['vmid']} ({guest['name']}) — was skipped during drain") - continue - gtype = guest["type"] - online = 0 if (guest["needs_fallback"] and fallback == "shutdown") else 1 - print(f"Restoring {gtype.upper()} {guest['vmid']} ({guest['name']}) → {node} (online={online})...") - task_id = api_req(f"/nodes/{source}/{gtype}/{guest['vmid']}/migrate", "POST", - {"target": node, "online": online}) - task_ids.append({"vmid": guest["vmid"], "name": guest["name"], "task": task_id, "type": gtype}) - - failed = [] - for t in task_ids: - for _ in range(60): - status = api_req(f"/nodes/{source}/tasks/{t['task']}/status") - if status["status"] == "stopped": - if status.get("exitstatus") != "OK": - failed.append(f"{t['name']} ({t['vmid']}): 
{status.get('exitstatus')}") - else: - print(f"OK: {t['name']} ({t['vmid']}) restored to {node}") - break - time.sleep(10) - else: - failed.append(f"{t['name']} ({t['vmid']}): timed out") - - if failed: - print("FAILED restores: " + ", ".join(failed)) - exit(1) - print(f"All guests restored to {node}") - PYEOF - register: restore_result + - name: "Restore | KVM | Migrate back to {{ current_node }}" + community.proxmox.proxmox_kvm: + api_host: "{{ api_host }}" + api_token_id: "{{ api_token_id }}" + api_token_secret: "{{ api_token_secret }}" + api_port: "{{ api_port }}" + node: "{{ migration_target }}" + vmid: "{{ item.vmid }}" + migrate: true + target_node: "{{ current_node }}" + online: "{{ true if item.status == 'running' else false }}" + timeout: "{{ vm_shutdown_timeout }}" + loop: "{{ kvm_guests | default([]) }}" delegate_to: localhost + + - name: "Restore | LXC | Migrate back to {{ current_node }}" + ansible.builtin.command: > + pct migrate {{ item.vmid }} {{ current_node }} + {{ '--restart' if item.status == 'running' else '' }} + --timeout {{ lxc_migrate_timeout }} + loop: "{{ lxc_guests | default([]) }}" + when: live_migrate_fallback != 'skip' changed_when: true + delegate_to: "{{ migration_target }}" - - name: "Restore | Log result" + - name: "Restore | Complete" ansible.builtin.debug: - msg: "{{ restore_result.stdout_lines }}" - delegate_to: localhost + msg: "All guests restored to {{ current_node }}" diff --git a/roles/proxmox_upgrade/tasks/upgrade.yml b/roles/proxmox_upgrade/tasks/upgrade.yml index 14de0d4..6ba2e78 100644 --- a/roles/proxmox_upgrade/tasks/upgrade.yml +++ b/roles/proxmox_upgrade/tasks/upgrade.yml @@ -1,42 +1,36 @@ --- # ============================================================================= # proxmox_upgrade — upgrade.yml -# Run apt dist-upgrade and reboot, wait for node to rejoin cluster +# apt dist-upgrade, reboot, wait for node to rejoin cluster +# Runs directly on the node via SSH — no delegation # 
============================================================================= -- name: Upgrade | Set CEPH noout flag before upgrade - ansible.builtin.shell: ceph osd set noout +- name: "Upgrade | {{ current_node }} | Set CEPH noout flag" + ansible.builtin.command: ceph osd set noout when: ceph_enabled | bool changed_when: true -- name: Upgrade | Run apt update - ansible.builtin.shell: apt-get update -q +- name: "Upgrade | {{ current_node }} | apt-get update" + ansible.builtin.apt: + update_cache: true changed_when: false -- name: Upgrade | Run apt dist-upgrade - ansible.builtin.shell: "{{ apt_upgrade_cmd }}" - register: apt_upgrade_result - changed_when: "'0 upgraded' not in apt_upgrade_result.stdout" +- name: "Upgrade | {{ current_node }} | apt dist-upgrade" + ansible.builtin.apt: + upgrade: dist + autoremove: "{{ apt_autoremove | bool }}" + register: apt_result -- name: Upgrade | Log packages upgraded +- name: "Upgrade | {{ current_node }} | Log upgraded packages" ansible.builtin.debug: - msg: "{{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('No output') }}" + msg: "{{ apt_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('apt dist-upgrade complete') }}" -- name: Upgrade | Run apt autoremove - ansible.builtin.shell: DEBIAN_FRONTEND=noninteractive apt-get autoremove -y - when: apt_autoremove | bool - changed_when: false - -- name: Upgrade | Check if reboot is required +- name: "Upgrade | {{ current_node }} | Check if reboot required" ansible.builtin.stat: path: /var/run/reboot-required register: reboot_required -- name: Upgrade | Log reboot status - ansible.builtin.debug: - msg: "{{ 'Reboot required — rebooting node' if reboot_required.stat.exists else 'No reboot required — skipping reboot' }}" - -- name: Upgrade | Reboot node +- name: "Upgrade | {{ current_node }} | Reboot node" ansible.builtin.reboot: reboot_timeout: "{{ node_rejoin_timeout }}" msg: "Rebooting for Proxmox upgrade" @@ 
-44,51 +38,57 @@ post_reboot_delay: 30 when: reboot_required.stat.exists +- name: "Upgrade | {{ current_node }} | Skip reboot (not required)" + ansible.builtin.debug: + msg: "No reboot required — skipping" + when: not reboot_required.stat.exists + # ── Wait for node to rejoin cluster ────────────────────────────────────────── -- name: Upgrade | Wait for node to appear online in cluster - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes" - method: GET - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - validate_certs: false - register: nodes_status +- name: "Upgrade | {{ current_node }} | Wait for node to rejoin cluster" + community.proxmox.proxmox_node_info: + api_host: "{{ api_host }}" + api_token_id: "{{ api_token_id }}" + api_token_secret: "{{ api_token_secret }}" + api_port: "{{ api_port }}" + register: rejoin_check until: >- - nodes_status.json.data + rejoin_check.proxmox_nodes | selectattr('node', 'equalto', current_node) | selectattr('status', 'equalto', 'online') | list | length > 0 retries: "{{ (node_rejoin_timeout | int / 10) | int }}" delay: 10 delegate_to: localhost + when: reboot_required.stat.exists -- name: Upgrade | Node {{ current_node }} back online +- name: "Upgrade | {{ current_node }} | Node back online" ansible.builtin.debug: msg: "Node {{ current_node }} has rejoined the cluster" -# ── CEPH recovery wait ──────────────────────────────────────────────────────── -- name: Upgrade | Wait for CEPH to recover +# ── CEPH recovery ───────────────────────────────────────────────────────────── +- name: "Upgrade | {{ current_node }} | Wait for CEPH to recover" when: ceph_enabled | bool block: - - name: Upgrade | CEPH | Wait for HEALTH_OK or HEALTH_WARN + - name: "Upgrade | CEPH | Wait for healthy status" ansible.builtin.shell: ceph health register: ceph_health_post - until: "'HEALTH_OK' in ceph_health_post.stdout or 'HEALTH_WARN' in ceph_health_post.stdout" + until: >- + 'HEALTH_OK' in 
ceph_health_post.stdout or + 'HEALTH_WARN' in ceph_health_post.stdout retries: "{{ (ceph_recover_timeout | int / 10) | int }}" delay: 10 changed_when: false - - name: Upgrade | CEPH | Clear noout flag - ansible.builtin.shell: ceph osd unset noout + - name: "Upgrade | CEPH | Clear noout flag" + ansible.builtin.command: ceph osd unset noout changed_when: true - - name: Upgrade | CEPH | Log recovery status + - name: "Upgrade | CEPH | Status" ansible.builtin.debug: msg: "CEPH recovered: {{ ceph_health_post.stdout }}" -- name: Upgrade | Node {{ current_node }} upgrade complete +- name: "Upgrade | {{ current_node }} | Upgrade complete" ansible.builtin.debug: msg: >- - Node {{ current_node }} upgrade complete — - {{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('packages updated') }} + Node {{ current_node }} upgrade complete {{ '— rebooted' if reboot_required.stat.exists else '— no reboot needed' }}