Files
ansible-msp-automations/roles/proxmox_upgrade/tasks/drain.yml

226 lines
9.3 KiB
YAML

---
# =============================================================================
# proxmox_upgrade — drain.yml
# Migrate all VMs/LXCs off a node before upgrading it
# Uses inline Python for API calls to avoid cross-task variable scope issues
# =============================================================================
# ── Build migration plan via API ──────────────────────────────────────────────
# Queries the Proxmox API from the controller and prints exactly one JSON
# document on stdout:
#   {"plan": [...], "targets": [...]}  on success
#   {"error": "..."}                   when no other online node exists
# The result is registered as drain_plan_raw and parsed by the following tasks.
# The API token secret is passed through the environment so it does not appear
# in the rendered command text that Ansible echoes in task results.
- name: "Drain | Build migration plan for {{ current_node }}"
  ansible.builtin.shell: |
    python3 << 'PYEOF'
    import json
    import os
    import re
    import ssl
    import sys
    import urllib.request

    # TLS verification is switched off for the API connection.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    api_base = "https://{{ api_host }}:{{ api_port }}/api2/json"
    token = "{{ api_token_id }}=" + os.environ["PVE_API_TOKEN_SECRET"]
    headers = {"Authorization": "PVEAPIToken=" + token}
    node = "{{ current_node }}"
    # Render a real Python literal (True/False); a lower-cased boolean would
    # be an undefined name in Python.
    shared = {{ shared_storage | bool | ternary('True', 'False') }}
    exclude_tags = {{ migrate_exclude_tags | to_json }}


    def api_get(path):
        """GET an API path and return the 'data' member of the JSON reply."""
        req = urllib.request.Request(f"{api_base}{path}", headers=headers)
        with urllib.request.urlopen(req, context=ctx) as r:
            return json.loads(r.read())["data"]


    def is_excluded(guest):
        """True when any of the guest's tags is listed in exclude_tags."""
        # Proxmox reports tags as one string; accept ';', ',' and whitespace
        # as separators so multi-tag guests are matched correctly.
        tags = [t for t in re.split(r"[;,\s]+", guest.get("tags") or "") if t]
        return any(t in exclude_tags for t in tags)


    # Candidate targets: every online node except the one being drained.
    all_nodes = api_get("/nodes")
    targets = [n["node"] for n in all_nodes if n["status"] == "online" and n["node"] != node]
    if not targets:
        print(json.dumps({"error": f"No online nodes available to migrate guests to from {node}"}))
        # Exit 0 on purpose: a non-zero rc would fail this task before the
        # error JSON can be parsed and reported by the follow-up fail task.
        sys.exit(0)

    plan = []

    for vm in api_get(f"/nodes/{node}/qemu"):
        if is_excluded(vm):
            continue
        cfg = api_get(f"/nodes/{node}/qemu/{vm['vmid']}/config")
        has_passthrough = any(k.startswith("hostpci") or k.startswith("usb") for k in cfg)
        has_local_cdrom = any(
            isinstance(v, str) and "local" in v and ".iso" in v
            for v in cfg.values()
        )
        # Reason priority: passthrough, then non-shared storage, then local ISO.
        if has_passthrough:
            reason = "passthrough"
        elif not shared:
            reason = "local_disk"
        elif has_local_cdrom:
            reason = "local_cdrom"
        else:
            reason = ""
        plan.append({
            "vmid": vm["vmid"],
            "name": vm.get("name", str(vm["vmid"])),
            "type": "qemu",
            "status": vm["status"],
            "needs_fallback": bool(reason),
            "fallback_reason": reason
        })

    for lxc in api_get(f"/nodes/{node}/lxc"):
        if is_excluded(lxc):
            continue
        plan.append({
            "vmid": lxc["vmid"],
            "name": lxc.get("name", str(lxc["vmid"])),
            "type": "lxc",
            "status": lxc["status"],
            "needs_fallback": False,
            "fallback_reason": ""
        })

    print(json.dumps({"plan": plan, "targets": targets}))
    PYEOF
  environment:
    PVE_API_TOKEN_SECRET: "{{ api_token_secret }}"
  register: drain_plan_raw
  delegate_to: localhost
  changed_when: false
# Decode the JSON text captured in drain_plan_raw.stdout into drain_data.
- name: "Drain | Parse migration plan"
  delegate_to: localhost
  ansible.builtin.set_fact:
    drain_data: "{{ drain_plan_raw['stdout'] | from_json }}"
# Abort with the script's own message when the parsed plan carries an
# "error" key.
- name: "Drain | Fail if error building plan"
  delegate_to: localhost
  when: drain_data['error'] is defined
  ansible.builtin.fail:
    msg: "{{ drain_data['error'] }}"
# Expose the two members of drain_data under the names the later tasks use.
- name: "Drain | Set migration plan and targets"
  delegate_to: localhost
  ansible.builtin.set_fact:
    migration_targets: "{{ drain_data['targets'] }}"
    migration_plan: "{{ drain_data['plan'] }}"
# Prints a summary of the plan: a header with the guest count and the first
# entry of migration_targets, then one entry per guest (type, vmid, name,
# status). Guests with needs_fallback set are additionally annotated with the
# configured live_migrate_fallback mode and their fallback_reason.
- name: "Drain | Log migration plan for {{ current_node }}"
  ansible.builtin.debug:
    msg: >-
      Migration plan for {{ current_node }} ({{ migration_plan | length }} guests → {{ migration_targets | first }}):
      {% for g in migration_plan %}
      - {{ g.type | upper }} {{ g.vmid }} ({{ g.name }}) [{{ g.status }}]{% if g.needs_fallback %} ⚠ fallback={{ live_migrate_fallback }} reason={{ g.fallback_reason }}{% endif %}
      {% endfor %}
  delegate_to: localhost
# Emits one warning per guest whose needs_fallback flag is set. The wording
# depends on live_migrate_fallback: 'skip' (guest stays and goes down with the
# node), 'shutdown' (shut down, cold migrated, restarted), or any other value
# (a live migration is attempted anyway).
- name: "Drain | Warn about non-live-migratable guests"
  ansible.builtin.debug:
    msg: >-
      WARNING — {{ item.type | upper }} {{ item.vmid }} ({{ item.name }})
      cannot be live migrated ({{ item.fallback_reason }}).
      {% if live_migrate_fallback == 'skip' %}THIS VM WILL GO DOWN DURING NODE REBOOT.
      {% elif live_migrate_fallback == 'shutdown' %}Will be shut down, cold migrated, and restarted.
      {% else %}Will attempt live migrate anyway (may fail).{% endif %}
  loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
  loop_control:
    # 'item' is already the default loop variable; it is spelled out here.
    loop_var: item
  delegate_to: localhost
# ── Sequential migrations ─────────────────────────────────────────────────────
# Sequential mode (migration_bulk is false): migrate_guest.yml is included once
# per guest, with the current plan entry available as `guest`. Guests that do
# not need a fallback go first; fallback guests follow unless
# live_migrate_fallback is 'skip'.
- name: "Drain | Migrate guests sequentially"
  when: not (migration_bulk | bool)
  block:
    - name: "Drain | Sequential | Migrate live-migratable guests"
      ansible.builtin.include_tasks:
        file: migrate_guest.yml
      loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}"
      loop_control:
        loop_var: guest

    - name: "Drain | Sequential | Handle fallback guests"
      when: live_migrate_fallback != 'skip'
      ansible.builtin.include_tasks:
        file: migrate_guest.yml
      loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
      loop_control:
        loop_var: guest
# ── Bulk migrations ───────────────────────────────────────────────────────────
# Bulk mode (migration_bulk is true):
#   1. start a migration for every guest that needs no fallback, all towards
#      the first entry of migration_targets, and collect the returned task ids;
#   2. poll each task on the source node until it stops (60 polls, 10 s apart)
#      and fail if any task did not end with exit status "OK" or timed out;
#   3. hand the fallback guests to migrate_guest.yml one by one, unless
#      live_migrate_fallback is 'skip'.
# The API token secret is passed through the environment so it does not appear
# in the rendered command text that Ansible echoes in task results.
- name: "Drain | Migrate guests in bulk"
  when: migration_bulk | bool
  block:
    - name: "Drain | Bulk | Trigger all live migrations"
      ansible.builtin.shell: |
        python3 << 'PYEOF'
        import json
        import os
        import ssl
        import urllib.request

        # TLS verification is switched off for the API connection.
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

        api_base = "https://{{ api_host }}:{{ api_port }}/api2/json"
        token = "{{ api_token_id }}=" + os.environ["PVE_API_TOKEN_SECRET"]
        headers = {"Authorization": "PVEAPIToken=" + token}
        node = "{{ current_node }}"
        target = "{{ migration_targets | first }}"
        # The plan is embedded as a JSON *string* and decoded at runtime:
        # pasting raw JSON into Python source breaks on true/false/null.
        plan = json.loads({{ migration_plan | rejectattr('needs_fallback') | list | to_json | to_json }})

        task_ids = []
        for guest in plan:
            gtype = "qemu" if guest["type"] == "qemu" else "lxc"
            params = {"target": target}
            if gtype == "qemu":
                params["online"] = 1
            elif guest["status"] == "running":
                # Containers cannot be live migrated; a running container
                # has to be moved in restart mode.
                params["restart"] = 1
            url = f"{api_base}/nodes/{node}/{gtype}/{guest['vmid']}/migrate"
            body = json.dumps(params).encode()
            req = urllib.request.Request(url, data=body, headers={**headers, "Content-Type": "application/json"}, method="POST")
            with urllib.request.urlopen(req, context=ctx) as r:
                task_id = json.loads(r.read())["data"]
            task_ids.append({"vmid": guest["vmid"], "name": guest["name"], "task": task_id})
            print(f"Triggered migration: {guest['type'].upper()} {guest['vmid']} ({guest['name']}) → {target} task={task_id}")

        # Last stdout line is the machine-readable result for the wait task.
        print(json.dumps({"task_ids": task_ids}))
        PYEOF
      environment:
        PVE_API_TOKEN_SECRET: "{{ api_token_secret }}"
      register: bulk_trigger_raw
      delegate_to: localhost
      # Only report a change when at least one migration was requested.
      changed_when: migration_plan | rejectattr('needs_fallback') | list | length > 0

    - name: "Drain | Bulk | Wait for all migration tasks to complete"
      ansible.builtin.shell: |
        python3 << 'PYEOF'
        import json
        import os
        import ssl
        import sys
        import time
        import urllib.request

        # TLS verification is switched off for the API connection.
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

        api_base = "https://{{ api_host }}:{{ api_port }}/api2/json"
        token = "{{ api_token_id }}=" + os.environ["PVE_API_TOKEN_SECRET"]
        headers = {"Authorization": "PVEAPIToken=" + token}
        node = "{{ current_node }}"

        # The trigger output is embedded as an escaped string literal so that
        # quotes or backslashes in guest names cannot break this script.
        raw = {{ bulk_trigger_raw.stdout | to_json }}
        json_lines = [l for l in raw.strip().split("\n") if l.startswith("{")]
        if not json_lines:
            print("FAILED: no task list found in trigger output")
            sys.exit(1)
        task_ids = json.loads(json_lines[-1])["task_ids"]

        failed = []
        for t in task_ids:
            retries = 60
            while retries > 0:
                url = f"{api_base}/nodes/{node}/tasks/{t['task']}/status"
                req = urllib.request.Request(url, headers=headers)
                with urllib.request.urlopen(req, context=ctx) as r:
                    status = json.loads(r.read())["data"]
                if status["status"] == "stopped":
                    if status.get("exitstatus") != "OK":
                        failed.append(f"{t['name']} ({t['vmid']}): {status.get('exitstatus')}")
                    else:
                        print(f"OK: {t['name']} ({t['vmid']}) migrated successfully")
                    break
                time.sleep(10)
                retries -= 1
            else:
                # Loop ran out of retries without seeing the task stop.
                failed.append(f"{t['name']} ({t['vmid']}): timed out")

        if failed:
            print("FAILED: " + ", ".join(failed))
            sys.exit(1)
        print("All bulk migrations completed successfully")
        PYEOF
      environment:
        PVE_API_TOKEN_SECRET: "{{ api_token_secret }}"
      register: bulk_wait_result
      delegate_to: localhost
      changed_when: false

    - name: "Drain | Bulk | Handle fallback guests sequentially"
      ansible.builtin.include_tasks: migrate_guest.yml
      loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
      loop_control:
        loop_var: guest
      when: live_migrate_fallback != 'skip'