refactor: proxmox_upgrade uses inline community.proxmox modules for API calls.

This commit is contained in:
Semaphore
2026-03-14 15:23:47 -07:00
parent 464fba619f
commit 9bb8e97c82
9 changed files with 274 additions and 384 deletions

6
requirements.yml Normal file
View File

@@ -0,0 +1,6 @@
---
# Ansible Galaxy collection requirements.
# Install with: ansible-galaxy collection install -r requirements.yml
collections:
  - name: community.proxmox
    version: ">=1.6.0"
  - name: community.general
    version: ">=8.0.0"

View File

@@ -4,27 +4,23 @@
# ============================================================================= # =============================================================================
# Migration behaviour # Migration behaviour
migration_bulk: false # true = all VMs at once, false = one at a time migration_bulk: false # true = fire all migrations at once, false = one at a time
migration_restore: false # true = migrate VMs back to original node after upgrade migration_restore: false # true = migrate guests back to original node after upgrade
live_migrate_fallback: shutdown # migrate | shutdown | skip live_migrate_fallback: shutdown # shutdown | skip
# shutdown: shut down the VM, cold-migrate it, then restart it on the target
# skip: leave VM on node (it will go down during reboot — use with caution)
# Shutdown timeout in seconds before forcing off # Timeouts (seconds)
vm_shutdown_timeout: 120 vm_shutdown_timeout: 120 # graceful shutdown before force-off
vm_start_timeout: 120 # wait for VM to start after cold migration
lxc_migrate_timeout: 300 # pct migrate --restart timeout
node_rejoin_timeout: 600 # wait for node to rejoin cluster after reboot
ceph_recover_timeout: 300 # wait for CEPH to recover after node rejoins
# How long to wait for a VM to start after cold migration # Upgrade options
vm_start_timeout: 120
# How long to wait for node to rejoin cluster after reboot
node_rejoin_timeout: 600
# How long to wait for CEPH to recover after node rejoins
ceph_recover_timeout: 300
# apt upgrade options
apt_upgrade_cmd: "DEBIAN_FRONTEND=noninteractive apt-get dist-upgrade -y"
apt_autoremove: true apt_autoremove: true
# Tags on VMs/LXCs to never migrate (comma separated in Proxmox) # VM/LXC tags that prevent migration (leave on node, warn)
migrate_exclude_tags: migrate_exclude_tags:
- nomigrate - nomigrate
- pinned - pinned

View File

@@ -1,225 +1,167 @@
--- ---
# ============================================================================= # =============================================================================
# proxmox_upgrade — drain.yml # proxmox_upgrade — drain.yml
# Migrate all VMs/LXCs off a node before upgrading it # Migrate all VMs/LXCs off current_node before upgrading
# Uses inline Python for API calls to avoid cross-task variable scope issues # KVM: community.proxmox.proxmox_kvm (API, delegate_to: localhost)
# LXC: pct migrate (SSH on source node)
# ============================================================================= # =============================================================================
# ── Build migration plan via API ────────────────────────────────────────────── # ── Discover guests on this node ──────────────────────────────────────────────
- name: "Drain | Build migration plan for {{ current_node }}" - name: "Drain | Get all guests on {{ current_node }}"
ansible.builtin.shell: | community.proxmox.proxmox_vm_info:
python3 << 'PYEOF' api_host: "{{ api_host }}"
import urllib.request, urllib.error, json, ssl api_token_id: "{{ api_token_id }}"
api_token_secret: "{{ api_token_secret }}"
ctx = ssl.create_default_context() api_port: "{{ api_port }}"
ctx.check_hostname = False node: "{{ current_node }}"
ctx.verify_mode = ssl.CERT_NONE register: node_guests
api_base = "https://{{ api_host }}:{{ api_port }}/api2/json"
headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"}
node = "{{ current_node }}"
shared = {{ shared_storage | lower | replace("true", "True") | replace("false", "False") }}
exclude_tags = {{ migrate_exclude_tags | to_json }}
def api_get(path):
req = urllib.request.Request(f"{api_base}{path}", headers=headers)
with urllib.request.urlopen(req, context=ctx) as r:
return json.loads(r.read())["data"]
# Get all online nodes except current
all_nodes = api_get("/nodes")
targets = [n["node"] for n in all_nodes if n["status"] == "online" and n["node"] != node]
if not targets:
print(json.dumps({"error": f"No online nodes available to migrate guests to from {node}"}))
exit(1)
# Get VMs and LXCs on this node
vms = api_get(f"/nodes/{node}/qemu")
lxcs = api_get(f"/nodes/{node}/lxc")
plan = []
for vm in vms:
tags = [t.strip() for t in (vm.get("tags") or "").split(",") if t.strip()]
if any(t in exclude_tags for t in tags):
continue
cfg = api_get(f"/nodes/{node}/qemu/{vm['vmid']}/config")
has_passthrough = any(k.startswith("hostpci") or k.startswith("usb") for k in cfg)
has_local_cdrom = any(
isinstance(v, str) and "local" in v and ".iso" in v
for v in cfg.values()
)
needs_fallback = has_passthrough or not shared or has_local_cdrom
reason = "passthrough" if has_passthrough else ("local_disk" if not shared else ("local_cdrom" if has_local_cdrom else ""))
plan.append({
"vmid": vm["vmid"],
"name": vm.get("name", str(vm["vmid"])),
"type": "qemu",
"status": vm["status"],
"needs_fallback": needs_fallback,
"fallback_reason": reason
})
for lxc in lxcs:
tags = [t.strip() for t in (lxc.get("tags") or "").split(",") if t.strip()]
if any(t in exclude_tags for t in tags):
continue
plan.append({
"vmid": lxc["vmid"],
"name": lxc.get("name", str(lxc["vmid"])),
"type": "lxc",
"status": lxc["status"],
"needs_fallback": False,
"fallback_reason": ""
})
print(json.dumps({"plan": plan, "targets": targets}))
PYEOF
register: drain_plan_raw
delegate_to: localhost delegate_to: localhost
changed_when: false
- name: "Drain | Parse migration plan" - name: "Drain | Get available target nodes"
community.proxmox.proxmox_node_info:
api_host: "{{ api_host }}"
api_token_id: "{{ api_token_id }}"
api_token_secret: "{{ api_token_secret }}"
api_port: "{{ api_port }}"
register: all_nodes_info
delegate_to: localhost
- name: "Drain | Set migration target"
ansible.builtin.set_fact: ansible.builtin.set_fact:
drain_data: "{{ drain_plan_raw.stdout | from_json }}" migration_target: >-
{{ all_nodes_info.proxmox_nodes
| selectattr('status', 'equalto', 'online')
| rejectattr('node', 'equalto', current_node)
| map(attribute='node')
| list
| first }}
delegate_to: localhost delegate_to: localhost
- name: "Drain | Fail if error building plan" - name: "Drain | Fail if no migration target available"
ansible.builtin.fail: ansible.builtin.fail:
msg: "{{ drain_data.error }}" msg: "No online nodes available to migrate guests to. Cannot drain {{ current_node }}."
when: drain_data.error is defined when: migration_target is not defined or migration_target == ''
delegate_to: localhost delegate_to: localhost
- name: "Drain | Set migration plan and targets" - name: "Drain | Build KVM migration list"
ansible.builtin.set_fact: ansible.builtin.set_fact:
migration_plan: "{{ drain_data.plan }}" kvm_guests: >-
migration_targets: "{{ drain_data.targets }}" {{ node_guests.proxmox_vms
| selectattr('type', 'equalto', 'qemu')
| rejectattr('tags', 'intersect', migrate_exclude_tags)
| list }}
delegate_to: localhost delegate_to: localhost
- name: "Drain | Log migration plan for {{ current_node }}" - name: "Drain | Build LXC migration list"
ansible.builtin.set_fact:
lxc_guests: >-
{{ node_guests.proxmox_vms
| selectattr('type', 'equalto', 'lxc')
| rejectattr('tags', 'intersect', migrate_exclude_tags)
| list }}
delegate_to: localhost
- name: "Drain | Log migration plan"
ansible.builtin.debug: ansible.builtin.debug:
msg: >- msg: >-
Migration plan for {{ current_node }} ({{ migration_plan | length }} guests → {{ migration_targets | first }}): Drain plan for {{ current_node }} → {{ migration_target }}:
{% for g in migration_plan %} KVM: {{ kvm_guests | map(attribute='vmid') | list }}
- {{ g.type | upper }} {{ g.vmid }} ({{ g.name }}) [{{ g.status }}]{% if g.needs_fallback %} ⚠ fallback={{ live_migrate_fallback }} reason={{ g.fallback_reason }}{% endif %} LXC: {{ lxc_guests | map(attribute='vmid') | list }}
{% endfor %}
delegate_to: localhost delegate_to: localhost
- name: "Drain | Warn about non-live-migratable guests" # ── KVM migrations ────────────────────────────────────────────────────────────
ansible.builtin.debug: - name: "Drain | Migrate KVM guests"
msg: >- when: kvm_guests | length > 0
WARNING — {{ item.type | upper }} {{ item.vmid }} ({{ item.name }})
cannot be live migrated ({{ item.fallback_reason }}).
{% if live_migrate_fallback == 'skip' %}THIS VM WILL GO DOWN DURING NODE REBOOT.
{% elif live_migrate_fallback == 'shutdown' %}Will be shut down, cold migrated, and restarted.
{% else %}Will attempt live migrate anyway (may fail).{% endif %}
loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
loop_control:
loop_var: item
delegate_to: localhost
# ── Sequential migrations ─────────────────────────────────────────────────────
- name: "Drain | Migrate guests sequentially"
when: not migration_bulk | bool
block: block:
- name: "Drain | Sequential | Migrate live-migratable guests" - name: "Drain | KVM | Live migrate (sequential)"
include_tasks: migrate_guest.yml community.proxmox.proxmox_kvm:
loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}" api_host: "{{ api_host }}"
loop_control: api_token_id: "{{ api_token_id }}"
loop_var: guest api_token_secret: "{{ api_token_secret }}"
api_port: "{{ api_port }}"
- name: "Drain | Sequential | Handle fallback guests" node: "{{ current_node }}"
include_tasks: migrate_guest.yml vmid: "{{ item.vmid }}"
loop: "{{ migration_plan | selectattr('needs_fallback') | list }}" migrate: true
loop_control: target_node: "{{ migration_target }}"
loop_var: guest online: "{{ true if item.status == 'running' and not item.get('needs_fallback', false) else false }}"
when: live_migrate_fallback != 'skip' timeout: "{{ vm_shutdown_timeout }}"
loop: "{{ kvm_guests }}"
# ── Bulk migrations ───────────────────────────────────────────────────────────
- name: "Drain | Migrate guests in bulk"
when: migration_bulk | bool
block:
- name: "Drain | Bulk | Trigger all live migrations"
ansible.builtin.shell: |
python3 << 'PYEOF'
import urllib.request, json, ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
api_base = "https://{{ api_host }}:{{ api_port }}/api2/json"
headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"}
node = "{{ current_node }}"
target = "{{ migration_targets | first }}"
plan = {{ migration_plan | rejectattr('needs_fallback') | list | to_json }}
task_ids = []
for guest in plan:
gtype = "qemu" if guest["type"] == "qemu" else "lxc"
url = f"{api_base}/nodes/{node}/{gtype}/{guest['vmid']}/migrate"
body = json.dumps({"target": target, "online": 1}).encode()
req = urllib.request.Request(url, data=body, headers={**headers, "Content-Type": "application/json"}, method="POST")
with urllib.request.urlopen(req, context=ctx) as r:
task_id = json.loads(r.read())["data"]
task_ids.append({"vmid": guest["vmid"], "name": guest["name"], "task": task_id})
print(f"Triggered migration: {guest['type'].upper()} {guest['vmid']} ({guest['name']}) → {target} task={task_id}")
print(json.dumps({"task_ids": task_ids}))
PYEOF
register: bulk_trigger_raw
delegate_to: localhost delegate_to: localhost
when: not migration_bulk | bool
# Bulk variant of the KVM drain: one proxmox_kvm migrate call per guest in
# kvm_guests is started in the background (poll: 0) and the job handles are
# registered in kvm_bulk_jobs so a following async_status task can wait on
# them. Only runs when migration_bulk is true.
# NOTE(review): confirm against the installed community.proxmox.proxmox_kvm
# documentation that `target_node` and `online` are accepted options and that
# `node` names the source (not the destination) when `migrate: true` — TODO verify.
- name: "Drain | KVM | Bulk migrate (fire and wait)"
  community.proxmox.proxmox_kvm:
    api_host: "{{ api_host }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    api_port: "{{ api_port }}"
    node: "{{ current_node }}"
    vmid: "{{ item.vmid }}"
    migrate: true
    target_node: "{{ migration_target }}"
    # Live-migrate only guests that are currently running.
    online: "{{ item.status == 'running' }}"
    timeout: "{{ vm_shutdown_timeout | int }}"
  loop: "{{ kvm_guests }}"
  delegate_to: localhost
  # Cast before multiplying: a value supplied with `-e key=value` is a string,
  # and "120" * 2 would otherwise render as "120120" instead of 240.
  async: "{{ (vm_shutdown_timeout | int) * 2 }}"
  poll: 0
  register: kvm_bulk_jobs
  when: migration_bulk | bool
# Polls every background job recorded in kvm_bulk_jobs until it reports
# finished: at most 60 checks, 10 seconds apart. Skipped unless
# migration_bulk is true.
- name: "Drain | KVM | Wait for bulk migrations to complete"
  when: migration_bulk | bool
  delegate_to: localhost
  loop: "{{ kvm_bulk_jobs.results }}"
  ansible.builtin.async_status:
    jid: "{{ item.ansible_job_id }}"
  register: kvm_job_result
  until: kvm_job_result.finished
  delay: 10
  retries: 60
# ── LXC migrations ────────────────────────────────────────────────────────────
- name: "Drain | Migrate LXC guests"
when: lxc_guests | length > 0
block:
# Announces, for each running container, that migrating it implies a
# stop/start cycle. Suppressed when live_migrate_fallback is 'skip': in that
# mode the pct migrate task does not run, so claiming the container "will be
# ... migrated" would contradict the skip warning.
- name: "Drain | LXC | Warn about restart requirement"
  ansible.builtin.debug:
    msg: >-
      LXC {{ item.vmid }} ({{ item.name | default('unknown') }}) will be
      stopped, migrated to {{ migration_target }}, and restarted
      (LXC live migration is not supported by Proxmox).
  loop: "{{ lxc_guests | selectattr('status', 'equalto', 'running') | list }}"
  when: live_migrate_fallback != 'skip'
  delegate_to: localhost
# Emits one warning per running container when live_migrate_fallback is
# 'skip', i.e. when the containers are left on the node being upgraded.
- name: "Drain | LXC | Warn about skipped containers"
  when: live_migrate_fallback == 'skip'
  delegate_to: localhost
  loop: "{{ lxc_guests | selectattr('status', 'equalto', 'running') | list }}"
  ansible.builtin.debug:
    msg: >-
      WARNING — LXC {{ item.vmid }} ({{ item.name | default('unknown') }})
      live_migrate_fallback=skip — this container WILL GO DOWN during node reboot.
- name: "Drain | LXC | Migrate via pct migrate --restart"
ansible.builtin.command: >
pct migrate {{ item.vmid }} {{ migration_target }}
{{ '--restart' if item.status == 'running' else '' }}
--timeout {{ lxc_migrate_timeout }}
loop: "{{ lxc_guests }}"
when: live_migrate_fallback != 'skip'
register: lxc_migrate_result
changed_when: true changed_when: true
- name: "Drain | Bulk | Wait for all migration tasks to complete" - name: "Drain | LXC | Log migration results"
ansible.builtin.shell: | ansible.builtin.debug:
python3 << 'PYEOF' msg: "LXC {{ item.item.vmid }} migrated to {{ migration_target }}"
import urllib.request, json, ssl, time loop: "{{ lxc_migrate_result.results }}"
when:
- live_migrate_fallback != 'skip'
- item.rc == 0
ctx = ssl.create_default_context() - name: "Drain | {{ current_node }} drained successfully"
ctx.check_hostname = False ansible.builtin.debug:
ctx.verify_mode = ssl.CERT_NONE msg: >-
Node {{ current_node }} drained —
api_base = "https://{{ api_host }}:{{ api_port }}/api2/json" {{ kvm_guests | length }} KVM + {{ lxc_guests | length }} LXC guests
headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} migrated to {{ migration_target }}
node = "{{ current_node }}"
lines = """{{ bulk_trigger_raw.stdout }}""".strip().split('\n')
last_line = [l for l in lines if l.startswith('{')][-1]
task_ids = json.loads(last_line)["task_ids"]
failed = []
for t in task_ids:
retries = 60
while retries > 0:
url = f"{api_base}/nodes/{node}/tasks/{t['task']}/status"
req = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(req, context=ctx) as r:
status = json.loads(r.read())["data"]
if status["status"] == "stopped":
if status.get("exitstatus") != "OK":
failed.append(f"{t['name']} ({t['vmid']}): {status.get('exitstatus')}")
else:
print(f"OK: {t['name']} ({t['vmid']}) migrated successfully")
break
time.sleep(10)
retries -= 1
else:
failed.append(f"{t['name']} ({t['vmid']}): timed out")
if failed:
print("FAILED: " + ", ".join(failed))
exit(1)
print("All bulk migrations completed successfully")
PYEOF
register: bulk_wait_result
delegate_to: localhost
changed_when: false
- name: "Drain | Bulk | Handle fallback guests sequentially"
include_tasks: migrate_guest.yml
loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
loop_control:
loop_var: guest
when: live_migrate_fallback != 'skip'

View File

@@ -8,18 +8,16 @@
ansible.builtin.debug: ansible.builtin.debug:
msg: >- msg: >-
Starting Proxmox rolling upgrade for {{ client_name }} Starting Proxmox rolling upgrade for {{ client_name }}
— {{ upgrade_order | length }} nodes in order: {{ upgrade_order | join(' → ') }} — {{ upgrade_order | length }} nodes: {{ upgrade_order | join(' → ') }}
— migration_bulk={{ migration_bulk }} — migration_bulk={{ migration_bulk }}
— live_migrate_fallback={{ live_migrate_fallback }} — live_migrate_fallback={{ live_migrate_fallback }}
— migration_restore={{ migration_restore }} — migration_restore={{ migration_restore }}
— ceph_enabled={{ ceph_enabled }} — ceph_enabled={{ ceph_enabled }}
# ── Cluster health preflight ──────────────────────────────────────────────────
- name: Proxmox Upgrade | Cluster preflight - name: Proxmox Upgrade | Cluster preflight
include_tasks: preflight.yml include_tasks: preflight.yml
# ── Rolling upgrade — one node at a time ───────────────────────────────────── - name: Proxmox Upgrade | Rolling upgrade
- name: Proxmox Upgrade | Rolling upgrade loop
include_tasks: node_upgrade.yml include_tasks: node_upgrade.yml
loop: "{{ upgrade_order }}" loop: "{{ upgrade_order }}"
loop_control: loop_control:
@@ -27,6 +25,4 @@
- name: Proxmox Upgrade | Complete - name: Proxmox Upgrade | Complete
ansible.builtin.debug: ansible.builtin.debug:
msg: >- msg: "Proxmox rolling upgrade complete for {{ client_name }} — {{ upgrade_order | length }} nodes upgraded"
Proxmox rolling upgrade complete for {{ client_name }}
— all {{ upgrade_order | length }} nodes upgraded successfully

View File

@@ -1,8 +1,8 @@
--- ---
# ============================================================================= # =============================================================================
# proxmox_upgrade — node_upgrade.yml # proxmox_upgrade — node_upgrade.yml
# Per-node upgrade sequence: backup → drain → upgrade → restore # Per-node sequence: backup → drain → upgrade → restore
# Called with loop_var: current_node # Called via loop with loop_var: current_node
# ============================================================================= # =============================================================================
- name: "Node {{ current_node }} | Start" - name: "Node {{ current_node }} | Start"
@@ -16,18 +16,18 @@
vars: vars:
pve_config_git_commit_message: "[{{ client_id }}] {{ current_node }} pre-upgrade config backup {{ ansible_date_time.date }}" pve_config_git_commit_message: "[{{ client_id }}] {{ current_node }} pre-upgrade config backup {{ ansible_date_time.date }}"
# ── Step 2: Drain node ──────────────────────────────────────────────────────── # ── Step 2: Drain ─────────────────────────────────────────────────────────────
- name: "Node {{ current_node }} | Step 2 — Drain (migrate guests off node)" - name: "Node {{ current_node }} | Step 2 — Drain guests"
include_tasks: drain.yml include_tasks: drain.yml
when: cluster_mode == 'cluster' when: cluster_mode == 'cluster'
- name: "Node {{ current_node }} | Step 2 — Single node mode, skipping drain" - name: "Node {{ current_node }} | Step 2 — Single node, skipping drain"
ansible.builtin.debug: ansible.builtin.debug:
msg: "cluster_mode=single — skipping guest migration" msg: "cluster_mode=single — skipping guest migration"
when: cluster_mode == 'single' when: cluster_mode == 'single'
# ── Step 3: Upgrade ─────────────────────────────────────────────────────────── # ── Step 3: Upgrade ───────────────────────────────────────────────────────────
- name: "Node {{ current_node }} | Step 3 — Upgrade packages" - name: "Node {{ current_node }} | Step 3 — Upgrade"
include_tasks: upgrade.yml include_tasks: upgrade.yml
# ── Step 4: Restore ─────────────────────────────────────────────────────────── # ── Step 4: Restore ───────────────────────────────────────────────────────────

View File

@@ -1,8 +1,9 @@
--- ---
# ============================================================================= # =============================================================================
# proxmox_upgrade — preflight.yml # proxmox_upgrade — preflight.yml
# Check cluster health before starting any upgrade work # Cluster health check before starting any upgrade work
# All API checks done in single tasks on localhost to avoid variable scope issues # pvecm runs on the node directly (SSH) — no delegation needed
# API node check runs on localhost (delegate_to: localhost) via community.proxmox
# ============================================================================= # =============================================================================
- name: Preflight | Check quorum via pvecm - name: Preflight | Check quorum via pvecm
@@ -11,65 +12,55 @@
register: quorum_check register: quorum_check
changed_when: false changed_when: false
failed_when: quorum_check.rc != 0 failed_when: quorum_check.rc != 0
run_once: true
- name: Preflight | Check all cluster nodes online via API - name: Preflight | Get all cluster nodes via API
ansible.builtin.shell: | community.proxmox.proxmox_node_info:
python3 << 'PYEOF' api_host: "{{ api_host }}"
import urllib.request, urllib.error, json, ssl api_token_id: "{{ api_token_id }}"
api_token_secret: "{{ api_token_secret }}"
ctx = ssl.create_default_context() api_port: "{{ api_port }}"
ctx.check_hostname = False register: cluster_node_info
ctx.verify_mode = ssl.CERT_NONE
req = urllib.request.Request(
"https://{{ api_host }}:{{ api_port }}/api2/json/nodes",
headers={"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"}
)
with urllib.request.urlopen(req, context=ctx) as r:
data = json.loads(r.read())["data"]
offline = [n for n in data if n["status"] != "online"]
if offline:
print("FAIL: " + ", ".join(f"{n['node']}={n['status']}" for n in offline))
exit(1)
else:
print("OK: " + ", ".join(f"{n['node']}={n['status']}" for n in data))
exit(0)
PYEOF
register: node_check
changed_when: false
failed_when: node_check.rc != 0
delegate_to: localhost delegate_to: localhost
run_once: true run_once: true
- name: Preflight | Log node status - name: Preflight | Fail if any node is offline
ansible.builtin.debug: ansible.builtin.fail:
msg: "{{ node_check.stdout }}" msg: >
Cluster health check FAILED — node {{ item.node }} is {{ item.status }}.
Aborting upgrade to prevent data loss. Investigate before retrying.
loop: "{{ cluster_node_info.proxmox_nodes }}"
when: item.status != 'online'
delegate_to: localhost delegate_to: localhost
run_once: true run_once: true
- name: Preflight | Check CEPH health - name: Preflight | Check CEPH health
when: ceph_enabled | bool when: ceph_enabled | bool
block: block:
- name: Preflight | Get CEPH health status - name: Preflight | Get CEPH health
ansible.builtin.shell: ceph health 2>/dev/null ansible.builtin.shell: ceph health
register: ceph_health register: ceph_health
changed_when: false changed_when: false
run_once: true
- name: Preflight | Abort if CEPH is in error state - name: Preflight | Abort if CEPH is in error state
ansible.builtin.fail: ansible.builtin.fail:
msg: > msg: >
CEPH health check FAILED — status: {{ ceph_health.stdout }}. CEPH health check FAILED — {{ ceph_health.stdout }}.
Aborting upgrade. Resolve CEPH issues before retrying. Resolve CEPH issues before retrying.
when: "'HEALTH_OK' not in ceph_health.stdout and 'HEALTH_WARN' not in ceph_health.stdout" when: "'HEALTH_OK' not in ceph_health.stdout and 'HEALTH_WARN' not in ceph_health.stdout"
run_once: true
- name: Preflight | Warn if CEPH has warnings - name: Preflight | Warn if CEPH has warnings
ansible.builtin.debug: ansible.builtin.debug:
msg: "WARNING — CEPH has warnings: {{ ceph_health.stdout }}. Proceeding but monitor closely." msg: "WARNING — CEPH has warnings: {{ ceph_health.stdout }}. Proceeding but monitor closely."
when: "'HEALTH_WARN' in ceph_health.stdout" when: "'HEALTH_WARN' in ceph_health.stdout"
run_once: true
- name: Preflight | Cluster health check passed - name: Preflight | Cluster health check passed
ansible.builtin.debug: ansible.builtin.debug:
msg: "Cluster health check passed — all nodes online, quorum OK{{ ', CEPH checked' if ceph_enabled else '' }}" msg: >-
Cluster health OK — {{ cluster_node_info.proxmox_nodes | length }} nodes online,
quorum confirmed{{ ', CEPH checked' if ceph_enabled else '' }}
delegate_to: localhost delegate_to: localhost
run_once: true run_once: true

View File

@@ -1,22 +1,19 @@
--- ---
# ============================================================================= # =============================================================================
# proxmox_upgrade.yml # proxmox_upgrade.yml — Rolling Proxmox cluster upgrade
# ============================================================================= # =============================================================================
# Rolling Proxmox cluster upgrade playbook. # Requires: community.proxmox collection + proxmoxer>=2.0, requests on Semaphore
# Runs on the first node in upgrade_order — all other nodes are handled # ansible-galaxy collection install community.proxmox
# via API calls and delegate_to from within the role. # pip install proxmoxer requests --break-system-packages
# #
# Usage: # Usage:
# ansible-playbook playbooks/proxmox_upgrade.yml \ # ansible-playbook playbooks/proxmox_upgrade.yml \
# -i inventories/client_local_eng/hypervisor_hosts.yml # -i inventories/client_local_eng/hypervisor_hosts.yml
# #
# Override migration behaviour: # Overrides:
# -e migration_bulk=true # -e migration_bulk=true
# -e live_migrate_fallback=skip # -e live_migrate_fallback=skip
# -e migration_restore=true # -e migration_restore=true
#
# Dry run (check mode — no changes):
# --check
# ============================================================================= # =============================================================================
- name: Proxmox Rolling Upgrade - name: Proxmox Rolling Upgrade
@@ -26,15 +23,15 @@
run_once: true run_once: true
pre_tasks: pre_tasks:
- name: Confirm upgrade_order is defined - name: Validate upgrade_order is defined
ansible.builtin.fail: ansible.builtin.fail:
msg: "upgrade_order must be defined in hypervisor_hosts.yml" msg: "upgrade_order must be defined in hypervisor_hosts.yml"
when: upgrade_order is not defined or upgrade_order | length == 0 when: upgrade_order is not defined or upgrade_order | length == 0
- name: Log upgrade targets - name: Log upgrade plan
ansible.builtin.debug: ansible.builtin.debug:
msg: >- msg: >-
Proxmox upgrade starting for {{ client_name }} ({{ client_id }}) Proxmox upgrade: {{ client_name }} ({{ client_id }})
Nodes: {{ upgrade_order | join(', ') }} Nodes: {{ upgrade_order | join(', ') }}
API: https://{{ api_host }}:{{ api_port }} API: https://{{ api_host }}:{{ api_port }}

View File

@@ -1,81 +1,43 @@
--- ---
# ============================================================================= # =============================================================================
# proxmox_upgrade — restore.yml # proxmox_upgrade — restore.yml
# Optionally migrate guests back to their original node after upgrade # Optionally migrate guests back to original node after upgrade
# Only runs if migration_restore: true # Only runs if migration_restore: true
# ============================================================================= # =============================================================================
- name: "Restore | Skip — migration_restore=false" - name: "Restore | Skip — migration_restore=false"
ansible.builtin.debug: ansible.builtin.debug:
msg: "migration_restore=false — leaving guests on their current nodes" msg: "migration_restore=false — guests remain on {{ migration_target }}"
when: not migration_restore | bool when: not migration_restore | bool
delegate_to: localhost
- name: "Restore | Migrate guests back to {{ current_node }}" - name: "Restore | Migrate guests back to {{ current_node }}"
when: migration_restore | bool when: migration_restore | bool
block: block:
- name: "Restore | Migrate all guests back to {{ current_node }}" - name: "Restore | KVM | Migrate back to {{ current_node }}"
ansible.builtin.shell: | community.proxmox.proxmox_kvm:
python3 << 'PYEOF' api_host: "{{ api_host }}"
import urllib.request, json, ssl, time api_token_id: "{{ api_token_id }}"
api_token_secret: "{{ api_token_secret }}"
ctx = ssl.create_default_context() api_port: "{{ api_port }}"
ctx.check_hostname = False node: "{{ migration_target }}"
ctx.verify_mode = ssl.CERT_NONE vmid: "{{ item.vmid }}"
migrate: true
api_base = "https://{{ api_host }}:{{ api_port }}/api2/json" target_node: "{{ current_node }}"
headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} online: "{{ true if item.status == 'running' else false }}"
node = "{{ current_node }}" timeout: "{{ vm_shutdown_timeout }}"
source = "{{ migration_targets | first }}" loop: "{{ kvm_guests | default([]) }}"
plan = {{ migration_plan | to_json }}
fallback = "{{ live_migrate_fallback }}"
def api_req(path, method="GET", body=None):
url = f"{api_base}{path}"
data = json.dumps(body).encode() if body else None
hdrs = {**headers}
if data:
hdrs["Content-Type"] = "application/json"
req = urllib.request.Request(url, data=data, headers=hdrs, method=method)
with urllib.request.urlopen(req, context=ctx) as r:
return json.loads(r.read())["data"]
task_ids = []
for guest in plan:
if guest["needs_fallback"] and fallback == "skip":
print(f"SKIP restore: {guest['type'].upper()} {guest['vmid']} ({guest['name']}) — was skipped during drain")
continue
gtype = guest["type"]
online = 0 if (guest["needs_fallback"] and fallback == "shutdown") else 1
print(f"Restoring {gtype.upper()} {guest['vmid']} ({guest['name']}) → {node} (online={online})...")
task_id = api_req(f"/nodes/{source}/{gtype}/{guest['vmid']}/migrate", "POST",
{"target": node, "online": online})
task_ids.append({"vmid": guest["vmid"], "name": guest["name"], "task": task_id, "type": gtype})
failed = []
for t in task_ids:
for _ in range(60):
status = api_req(f"/nodes/{source}/tasks/{t['task']}/status")
if status["status"] == "stopped":
if status.get("exitstatus") != "OK":
failed.append(f"{t['name']} ({t['vmid']}): {status.get('exitstatus')}")
else:
print(f"OK: {t['name']} ({t['vmid']}) restored to {node}")
break
time.sleep(10)
else:
failed.append(f"{t['name']} ({t['vmid']}): timed out")
if failed:
print("FAILED restores: " + ", ".join(failed))
exit(1)
print(f"All guests restored to {node}")
PYEOF
register: restore_result
delegate_to: localhost delegate_to: localhost
- name: "Restore | LXC | Migrate back to {{ current_node }}"
ansible.builtin.command: >
pct migrate {{ item.vmid }} {{ current_node }}
{{ '--restart' if item.status == 'running' else '' }}
--timeout {{ lxc_migrate_timeout }}
loop: "{{ lxc_guests | default([]) }}"
when: live_migrate_fallback != 'skip'
changed_when: true changed_when: true
delegate_to: "{{ migration_target }}"
- name: "Restore | Log result" - name: "Restore | Complete"
ansible.builtin.debug: ansible.builtin.debug:
msg: "{{ restore_result.stdout_lines }}" msg: "All guests restored to {{ current_node }}"
delegate_to: localhost

View File

@@ -1,42 +1,36 @@
--- ---
# ============================================================================= # =============================================================================
# proxmox_upgrade — upgrade.yml # proxmox_upgrade — upgrade.yml
# Run apt dist-upgrade and reboot, wait for node to rejoin cluster # apt dist-upgrade, reboot, wait for node to rejoin cluster
# Runs directly on the node via SSH — no delegation
# ============================================================================= # =============================================================================
# Sets the CEPH `noout` flag by running `ceph osd set noout`, only when
# ceph_enabled is truthy; the task is always reported as changed.
# Old | new pair: the right column replaces the shell module with command and
# adds current_node to the task name.
- name: Upgrade | Set CEPH noout flag before upgrade - name: "Upgrade | {{ current_node }} | Set CEPH noout flag"
ansible.builtin.shell: ceph osd set noout ansible.builtin.command: ceph osd set noout
when: ceph_enabled | bool when: ceph_enabled | bool
changed_when: true changed_when: true
# Refreshes the apt package index; never reported as changed.
# Old | new pair: the left column shells out to `apt-get update -q`, the right
# column uses ansible.builtin.apt with update_cache: true.
- name: Upgrade | Run apt update - name: "Upgrade | {{ current_node }} | apt-get update"
ansible.builtin.shell: apt-get update -q ansible.builtin.apt:
update_cache: true
changed_when: false changed_when: false
# Performs the distribution upgrade.
# Left column (old): runs the apt_upgrade_cmd string through the shell module,
# registers apt_upgrade_result, and reports changed unless stdout contains
# '0 upgraded'.
# Right column (new): ansible.builtin.apt with upgrade: dist, autoremove driven
# by apt_autoremove, result registered as apt_result.
- name: Upgrade | Run apt dist-upgrade - name: "Upgrade | {{ current_node }} | apt dist-upgrade"
ansible.builtin.shell: "{{ apt_upgrade_cmd }}" ansible.builtin.apt:
register: apt_upgrade_result upgrade: dist
changed_when: "'0 upgraded' not in apt_upgrade_result.stdout" autoremove: "{{ apt_autoremove | bool }}"
register: apt_result
# Prints the first stdout line of the upgrade result that matches
# '.*upgraded.*', falling back to a default message when no line matches.
# Old | new pair: the left column reads apt_upgrade_result, the right column
# reads apt_result.
# NOTE(review): the right column relies on apt_result.stdout_lines being
# present — confirm the apt module returns stdout for `upgrade: dist`.
- name: Upgrade | Log packages upgraded - name: "Upgrade | {{ current_node }} | Log upgraded packages"
ansible.builtin.debug: ansible.builtin.debug:
msg: "{{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('No output') }}" msg: "{{ apt_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('apt dist-upgrade complete') }}"
# Two old tasks and one new task share these rows.
# Left column (old): a separate `apt-get autoremove -y` shell task gated on
# apt_autoremove, followed by the old "Check if reboot is required" task name.
# Right column (new): the autoremove task has no counterpart here; only the
# reboot check is shown.
# Both versions stat /var/run/reboot-required and register the result as
# reboot_required.
- name: Upgrade | Run apt autoremove - name: "Upgrade | {{ current_node }} | Check if reboot required"
ansible.builtin.shell: DEBIAN_FRONTEND=noninteractive apt-get autoremove -y
when: apt_autoremove | bool
changed_when: false
- name: Upgrade | Check if reboot is required
ansible.builtin.stat: ansible.builtin.stat:
path: /var/run/reboot-required path: /var/run/reboot-required
register: reboot_required register: reboot_required
# Left column (old): a debug task announcing whether a reboot will happen,
# followed by the old "Reboot node" task name.
# Right column (new): only the reboot task is shown.
# The reboot task uses ansible.builtin.reboot with reboot_timeout taken from
# node_rejoin_timeout and post_reboot_delay of 30, and runs only when
# reboot_required.stat.exists is true.
# The `@@ … @@` row is a diff hunk header: lines between the two hunks are not
# shown in this rendering.
- name: Upgrade | Log reboot status - name: "Upgrade | {{ current_node }} | Reboot node"
ansible.builtin.debug:
msg: "{{ 'Reboot required — rebooting node' if reboot_required.stat.exists else 'No reboot required — skipping reboot' }}"
- name: Upgrade | Reboot node
ansible.builtin.reboot: ansible.builtin.reboot:
reboot_timeout: "{{ node_rejoin_timeout }}" reboot_timeout: "{{ node_rejoin_timeout }}"
msg: "Rebooting for Proxmox upgrade" msg: "Rebooting for Proxmox upgrade"
@@ -44,51 +38,57 @@
post_reboot_delay: 30 post_reboot_delay: 30
when: reboot_required.stat.exists when: reboot_required.stat.exists
# Informational only: emitted when the registered reboot_required stat result
# reports that the marker file does not exist.
- name: "Upgrade | {{ current_node }} | Skip reboot (not required)"
  when: not reboot_required.stat.exists
  ansible.builtin.debug:
    msg: "No reboot required — skipping"
# ── Wait for node to rejoin cluster ────────────────────────────────────────── # ── Wait for node to rejoin cluster ──────────────────────────────────────────
# Polls the Proxmox API from localhost until current_node is listed with
# status 'online'. Retries node_rejoin_timeout / 10 times, 10 seconds apart.
# Left column (old): ansible.builtin.uri GET on /api2/json/nodes with a
# PVEAPIToken Authorization header and validate_certs: false; the result is
# read from nodes_status.json.data.
# Right column (new): community.proxmox.proxmox_node_info with the same
# api_host / api_port / token variables; the result is read from
# rejoin_check.proxmox_nodes.
# The trailing `when: reboot_required.stat.exists` has no left-hand counterpart,
# so it appears to exist only in the new version.
- name: Upgrade | Wait for node to appear online in cluster - name: "Upgrade | {{ current_node }} | Wait for node to rejoin cluster"
ansible.builtin.uri: community.proxmox.proxmox_node_info:
url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes" api_host: "{{ api_host }}"
method: GET api_token_id: "{{ api_token_id }}"
headers: api_token_secret: "{{ api_token_secret }}"
Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" api_port: "{{ api_port }}"
validate_certs: false register: rejoin_check
register: nodes_status
until: >- until: >-
nodes_status.json.data rejoin_check.proxmox_nodes
| selectattr('node', 'equalto', current_node) | selectattr('node', 'equalto', current_node)
| selectattr('status', 'equalto', 'online') | selectattr('status', 'equalto', 'online')
| list | length > 0 | list | length > 0
retries: "{{ (node_rejoin_timeout | int / 10) | int }}" retries: "{{ (node_rejoin_timeout | int / 10) | int }}"
delay: 10 delay: 10
delegate_to: localhost delegate_to: localhost
when: reboot_required.stat.exists
# Prints a confirmation that current_node has rejoined the cluster.
# NOTE(review): no `when:` is visible on this task, so the message presumably
# prints even when the reboot and the rejoin wait were skipped — confirm this
# is intended.
- name: Upgrade | Node {{ current_node }} back online - name: "Upgrade | {{ current_node }} | Node back online"
ansible.builtin.debug: ansible.builtin.debug:
msg: "Node {{ current_node }} has rejoined the cluster" msg: "Node {{ current_node }} has rejoined the cluster"
# ── CEPH recovery wait ──────────────────────────────────────────────────────── # ── CEPH recovery ─────────────────────────────────────────────────────────────
# CEPH recovery block, executed only when ceph_enabled is truthy.
#   1. Runs `ceph health` until stdout contains HEALTH_OK or HEALTH_WARN,
#      retrying ceph_recover_timeout / 10 times, 10 seconds apart.
#   2. Clears the noout flag with `ceph osd unset noout`.
#   3. Prints the last health output.
# Old | new pair: the new column renames the tasks, folds the `until`
# expression over two lines, and uses command instead of shell for the unset.
# NOTE(review): the unset sits after the wait with no rescue/always section
# visible here, so if the wait exhausts its retries noout presumably stays
# set — confirm whether that is acceptable.
- name: Upgrade | Wait for CEPH to recover - name: "Upgrade | {{ current_node }} | Wait for CEPH to recover"
when: ceph_enabled | bool when: ceph_enabled | bool
block: block:
- name: Upgrade | CEPH | Wait for HEALTH_OK or HEALTH_WARN - name: "Upgrade | CEPH | Wait for healthy status"
ansible.builtin.shell: ceph health ansible.builtin.shell: ceph health
register: ceph_health_post register: ceph_health_post
until: "'HEALTH_OK' in ceph_health_post.stdout or 'HEALTH_WARN' in ceph_health_post.stdout" until: >-
'HEALTH_OK' in ceph_health_post.stdout or
'HEALTH_WARN' in ceph_health_post.stdout
retries: "{{ (ceph_recover_timeout | int / 10) | int }}" retries: "{{ (ceph_recover_timeout | int / 10) | int }}"
delay: 10 delay: 10
changed_when: false changed_when: false
- name: Upgrade | CEPH | Clear noout flag - name: "Upgrade | CEPH | Clear noout flag"
ansible.builtin.shell: ceph osd unset noout ansible.builtin.command: ceph osd unset noout
changed_when: true changed_when: true
- name: Upgrade | CEPH | Log recovery status - name: "Upgrade | CEPH | Status"
ansible.builtin.debug: ansible.builtin.debug:
msg: "CEPH recovered: {{ ceph_health_post.stdout }}" msg: "CEPH recovered: {{ ceph_health_post.stdout }}"
# Closing summary for the node: states that the upgrade finished and whether a
# reboot happened, based on reboot_required.stat.exists.
# The row that interpolates apt_upgrade_result.stdout_lines has no right-hand
# counterpart, so the new version appears to drop the package summary from
# this message.
- name: Upgrade | Node {{ current_node }} upgrade complete - name: "Upgrade | {{ current_node }} | Upgrade complete"
ansible.builtin.debug: ansible.builtin.debug:
msg: >- msg: >-
Node {{ current_node }} upgrade complete Node {{ current_node }} upgrade complete
{{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('packages updated') }}
{{ '— rebooted' if reboot_required.stat.exists else '— no reboot needed' }} {{ '— rebooted' if reboot_required.stat.exists else '— no reboot needed' }}