diff --git a/roles/proxmox_upgrade/tasks/drain.yml b/roles/proxmox_upgrade/tasks/drain.yml index 659d0e4..eab55e9 100644 --- a/roles/proxmox_upgrade/tasks/drain.yml +++ b/roles/proxmox_upgrade/tasks/drain.yml @@ -2,185 +2,222 @@ # ============================================================================= # proxmox_upgrade — drain.yml # Migrate all VMs/LXCs off a node before upgrading it -# Uses Proxmox API — runs delegate_to: localhost +# Uses inline Python for API calls to avoid cross-task variable scope issues # ============================================================================= -# ── Get all guests on this node ─────────────────────────────────────────────── -- name: Drain | Get all VMs on node {{ current_node }} - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/qemu" - method: GET - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - validate_certs: false - register: node_vms - delegate_to: localhost +# ── Build migration plan via API ────────────────────────────────────────────── +- name: "Drain | Build migration plan for {{ current_node }}" + ansible.builtin.shell: | + python3 << 'PYEOF' + import urllib.request, urllib.error, json, ssl -- name: Drain | Get all LXCs on node {{ current_node }} - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/lxc" - method: GET - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - validate_certs: false - register: node_lxcs - delegate_to: localhost + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE -- name: Drain | Get available target nodes - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes" - method: GET - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - validate_certs: false - register: all_nodes - delegate_to: localhost + api_base = "https://{{ api_host }}:{{ api_port }}/api2/json" + headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} + node = "{{ current_node }}" + shared = {{ shared_storage | lower }} + exclude_tags = {{ migrate_exclude_tags | to_json }} -- name: Drain | Build target node list (exclude current node) + def api_get(path): + req = urllib.request.Request(f"{api_base}{path}", headers=headers) + with urllib.request.urlopen(req, context=ctx) as r: + return json.loads(r.read())["data"] + + # Get all online nodes except current + all_nodes = api_get("/nodes") + targets = [n["node"] for n in all_nodes if n["status"] == "online" and n["node"] != node] + if not targets: + print(json.dumps({"error": f"No online nodes available to migrate guests to from {node}"})) + exit(1) + + # Get VMs and LXCs on this node + vms = api_get(f"/nodes/{node}/qemu") + lxcs = api_get(f"/nodes/{node}/lxc") + + plan = [] + + for vm in vms: + tags = [t.strip() for t in (vm.get("tags") or "").split(",") if t.strip()] + if any(t in exclude_tags for t in tags): + continue + cfg = api_get(f"/nodes/{node}/qemu/{vm['vmid']}/config") + has_passthrough = any(k.startswith("hostpci") or k.startswith("usb") for k in cfg) + has_local_cdrom = any( + isinstance(v, str) and "local" in v and ".iso" in v + for v in cfg.values() + ) + needs_fallback = has_passthrough or not shared or has_local_cdrom + reason = "passthrough" if has_passthrough else ("local_disk" if not shared else ("local_cdrom" if has_local_cdrom else "")) + plan.append({ + "vmid": vm["vmid"], + "name": vm.get("name", str(vm["vmid"])), + "type": "qemu", + "status": vm["status"], + "needs_fallback": needs_fallback, + "fallback_reason": reason + }) + + for lxc in lxcs: + tags = [t.strip() for t in (lxc.get("tags") or "").split(",") if t.strip()] + if any(t in exclude_tags for t in tags): + continue + plan.append({ + "vmid": lxc["vmid"], + "name": lxc.get("name", str(lxc["vmid"])), + "type": "lxc", + "status": lxc["status"], + "needs_fallback": False, + "fallback_reason": "" + }) + + print(json.dumps({"plan": plan, "targets": targets})) + PYEOF + register: drain_plan_raw + delegate_to: localhost + changed_when: false + +- name: "Drain | Parse migration plan" ansible.builtin.set_fact: - migration_targets: >- - {{ all_nodes.json.data - | selectattr('status', 'equalto', 'online') - | rejectattr('node', 'equalto', current_node) - | map(attribute='node') - | list }} + drain_data: "{{ drain_plan_raw.stdout | from_json }}" delegate_to: localhost -- name: Drain | Fail if no migration targets available +- name: "Drain | Fail if error building plan" ansible.builtin.fail: - msg: "No online nodes available to migrate guests to. Cannot drain {{ current_node }}." - when: migration_targets | length == 0 + msg: "{{ drain_data.error }}" + when: drain_data.error is defined delegate_to: localhost -# ── Classify VMs — live migratable vs needs fallback ───────────────────────── -- name: Drain | Get VM configs to check migratability - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/qemu/{{ item.vmid }}/config" - method: GET - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - validate_certs: false - register: vm_configs - loop: "{{ node_vms.json.data }}" - delegate_to: localhost - -- name: Drain | Build guest migration plan +- name: "Drain | Set migration plan and targets" ansible.builtin.set_fact: - migration_plan: >- - {%- set plan = [] -%} - {%- for vm in node_vms.json.data -%} - {%- set cfg = vm_configs.results[loop.index0].json.data -%} - {%- set tags = (vm.tags | default('')) .split(',') | map('trim') | list -%} - {%- set excluded = tags | select('in', migrate_exclude_tags) | list | length > 0 -%} - {%- set has_passthrough = 'hostpci0' in cfg or 'usb0' in cfg -%} - {%- set has_local_disk = shared_storage == false -%} - {%- set has_local_cdrom = cfg.values() | select('string') | select('match', '.*local.*\\.iso.*') | list | length > 0 -%} - {%- set needs_fallback = has_passthrough or has_local_disk or has_local_cdrom -%} - {%- if not excluded -%} - {%- set _ = plan.append({ - 'vmid': vm.vmid, - 'name': vm.name, - 'type': 'qemu', - 'status': vm.status, - 'needs_fallback': needs_fallback, - 'fallback_reason': ('passthrough' if has_passthrough else ('local_disk' if has_local_disk else ('local_cdrom' if has_local_cdrom else ''))) - }) -%} - {%- endif -%} - {%- endfor -%} - {%- for lxc in node_lxcs.json.data -%} - {%- set tags = (lxc.tags | default('')) .split(',') | map('trim') | list -%} - {%- set excluded = tags | select('in', migrate_exclude_tags) | list | length > 0 -%} - {%- if not excluded -%} - {%- set _ = plan.append({ - 'vmid': lxc.vmid, - 'name': lxc.name, - 'type': 'lxc', - 'status': lxc.status, - 'needs_fallback': false, - 'fallback_reason': '' - }) -%} - {%- endif -%} - {%- endfor -%} - {{ plan }} + migration_plan: "{{ drain_data.plan }}" + migration_targets: "{{ drain_data.targets }}" delegate_to: localhost -- name: Drain | Log migration plan +- name: "Drain | Log migration plan for {{ current_node }}" ansible.builtin.debug: msg: >- - Migration plan for {{ current_node }}: + Migration plan for {{ current_node }} ({{ migration_plan | length }} guests → {{ migration_targets | first }}): {% for g in migration_plan %} - - {{ g.type | upper }} {{ g.vmid }} ({{ g.name }}) [{{ g.status }}] - {% if g.needs_fallback %} ⚠ needs fallback ({{ g.fallback_reason }}) — action: {{ live_migrate_fallback }}{% endif %} + - {{ g.type | upper }} {{ g.vmid }} ({{ g.name }}) [{{ g.status }}]{% if g.needs_fallback %} ⚠ fallback={{ live_migrate_fallback }} reason={{ g.fallback_reason }}{% endif %} + {% endfor %} delegate_to: localhost -# ── Abort if any guests need fallback and live_migrate_fallback is 'migrate' ── -- name: Drain | Warn about non-migratable guests +- name: "Drain | Warn about non-live-migratable guests" ansible.builtin.debug: msg: >- WARNING — {{ item.type | upper }} {{ item.vmid }} ({{ item.name }}) cannot be live migrated ({{ item.fallback_reason }}). - live_migrate_fallback={{ live_migrate_fallback }} — - {% if live_migrate_fallback == 'skip' %} - THIS VM WILL GO DOWN DURING NODE REBOOT. - {% elif live_migrate_fallback == 'shutdown' %} - Will be shut down, cold migrated, and restarted. - {% else %} - Will attempt live migrate anyway (may fail). - {% endif %} + {% if live_migrate_fallback == 'skip' %}THIS VM WILL GO DOWN DURING NODE REBOOT. + {% elif live_migrate_fallback == 'shutdown' %}Will be shut down, cold migrated, and restarted. + {% else %}Will attempt live migrate anyway (may fail).{% endif %} loop: "{{ migration_plan | selectattr('needs_fallback') | list }}" + loop_control: + loop_var: item delegate_to: localhost -# ── Perform migrations ──────────────────────────────────────────────────────── -- name: Drain | Migrate guests (sequential) +# ── Sequential migrations ───────────────────────────────────────────────────── +- name: "Drain | Migrate guests sequentially" when: not migration_bulk | bool - include_tasks: migrate_guest.yml - loop: "{{ migration_plan | rejectattr('needs_fallback') | list + migration_plan | selectattr('needs_fallback') | rejectattr('needs_fallback' if live_migrate_fallback == 'skip' else 'nonexistent') | list }}" - loop_control: - loop_var: guest - -- name: Drain | Migrate guests (bulk — fire all at once) - when: migration_bulk | bool block: - - name: Drain | Bulk | Trigger all live migrations simultaneously - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate" - method: POST - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - body_format: json - body: - target: "{{ migration_targets | first }}" - online: "{{ 1 if not guest.needs_fallback else 0 }}" - validate_certs: false - register: bulk_migration_tasks + - name: "Drain | Sequential | Migrate live-migratable guests" + include_tasks: migrate_guest.yml loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}" loop_control: loop_var: guest - delegate_to: localhost - - name: Drain | Bulk | Wait for all migrations to complete - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/tasks/{{ item.json.data }}/status" - method: GET - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - validate_certs: false - register: task_status - until: task_status.json.data.status == 'stopped' - retries: 60 - delay: 10 - loop: "{{ bulk_migration_tasks.results }}" - delegate_to: localhost - - - name: Drain | Bulk | Check all migrations succeeded - ansible.builtin.fail: - msg: "Migration task failed for VMID — exitstatus: {{ item.json.data.exitstatus }}" - loop: "{{ task_status.results }}" - when: item.json.data.exitstatus != 'OK' - delegate_to: localhost - - - name: Drain | Bulk | Handle fallback guests sequentially + - name: "Drain | Sequential | Handle fallback guests" + include_tasks: migrate_guest.yml + loop: "{{ migration_plan | selectattr('needs_fallback') | list }}" + loop_control: + loop_var: guest + when: live_migrate_fallback != 'skip' + +# ── Bulk migrations ─────────────────────────────────────────────────────────── +- name: "Drain | Migrate guests in bulk" + when: migration_bulk | bool + block: + - name: "Drain | Bulk | Trigger all live migrations" + ansible.builtin.shell: | + python3 << 'PYEOF' + import urllib.request, json, ssl + + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + api_base = "https://{{ api_host }}:{{ api_port }}/api2/json" + headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} + node = "{{ current_node }}" + target = "{{ migration_targets | first }}" + plan = {{ migration_plan | rejectattr('needs_fallback') | list | to_json }} + + task_ids = [] + for guest in plan: + gtype = "qemu" if guest["type"] == "qemu" else "lxc" + url = f"{api_base}/nodes/{node}/{gtype}/{guest['vmid']}/migrate" + body = json.dumps({"target": target, "online": 1}).encode() + req = urllib.request.Request(url, data=body, headers={**headers, "Content-Type": "application/json"}, method="POST") + with urllib.request.urlopen(req, context=ctx) as r: + task_id = json.loads(r.read())["data"] + task_ids.append({"vmid": guest["vmid"], "name": guest["name"], "task": task_id}) + print(f"Triggered migration: {guest['type'].upper()} {guest['vmid']} ({guest['name']}) → {target} task={task_id}") + + print(json.dumps({"task_ids": task_ids})) + PYEOF + register: bulk_trigger_raw + delegate_to: localhost + changed_when: true + + - name: "Drain | Bulk | Wait for all migration tasks to complete" + ansible.builtin.shell: | + python3 << 'PYEOF' + import urllib.request, json, ssl, time + + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + api_base = "https://{{ api_host }}:{{ api_port }}/api2/json" + headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} + node = "{{ current_node }}" + + lines = """{{ bulk_trigger_raw.stdout }}""".strip().split('\n') + last_line = [l for l in lines if l.startswith('{')][-1] + task_ids = json.loads(last_line)["task_ids"] + + failed = [] + for t in task_ids: + retries = 60 + while retries > 0: + url = f"{api_base}/nodes/{node}/tasks/{t['task']}/status" + req = urllib.request.Request(url, headers=headers) + with urllib.request.urlopen(req, context=ctx) as r: + status = json.loads(r.read())["data"] + if status["status"] == "stopped": + if status.get("exitstatus") != "OK": + failed.append(f"{t['name']} ({t['vmid']}): {status.get('exitstatus')}") + else: + print(f"OK: {t['name']} ({t['vmid']}) migrated successfully") + break + time.sleep(10) + retries -= 1 + else: + failed.append(f"{t['name']} ({t['vmid']}): timed out") + + if failed: + print("FAILED: " + ", ".join(failed)) + exit(1) + print("All bulk migrations completed successfully") + PYEOF + register: bulk_wait_result + delegate_to: localhost + changed_when: false + + - name: "Drain | Bulk | Handle fallback guests sequentially" include_tasks: migrate_guest.yml loop: "{{ migration_plan | selectattr('needs_fallback') | list }}" loop_control: diff --git a/roles/proxmox_upgrade/tasks/migrate_guest.yml b/roles/proxmox_upgrade/tasks/migrate_guest.yml index fd18500..1b89c51 100644 --- a/roles/proxmox_upgrade/tasks/migrate_guest.yml +++ b/roles/proxmox_upgrade/tasks/migrate_guest.yml @@ -6,111 +6,103 @@ # guest = { vmid, name, type, status, needs_fallback, fallback_reason } # ============================================================================= -- name: "Migrate | {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — skip check" +- name: "Migrate | {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — skipping (live_migrate_fallback=skip)" ansible.builtin.debug: - msg: "SKIPPING {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — live_migrate_fallback=skip, will go down during reboot" + msg: "SKIPPING {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — will go down during reboot" when: guest.needs_fallback and live_migrate_fallback == 'skip' delegate_to: localhost - name: "Migrate | {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }})" when: not (guest.needs_fallback and live_migrate_fallback == 'skip') block: - # ── Cold migration: shutdown first ─────────────────────────────────────── - - name: "Migrate | {{ guest.vmid }} | Shutdown for cold migration" - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/shutdown" - method: POST - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - body_format: json - body: - timeout: "{{ vm_shutdown_timeout }}" - forceStop: 1 - validate_certs: false - when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running' - delegate_to: localhost + - name: "Migrate | {{ guest.vmid }} | Execute migration" + ansible.builtin.shell: | + python3 << 'PYEOF' + import urllib.request, json, ssl, time - - name: "Migrate | {{ guest.vmid }} | Wait for shutdown" - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/current" - method: GET - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - validate_certs: false - register: vm_status - until: vm_status.json.data.status == 'stopped' - retries: "{{ (vm_shutdown_timeout | int / 5) | int }}" - delay: 5 - when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running' - delegate_to: localhost + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE - # ── Trigger migration ───────────────────────────────────────────────────── - - name: "Migrate | {{ guest.vmid }} | Trigger migration to {{ migration_targets | first }}" - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate" - method: POST - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - body_format: json - body: - target: "{{ migration_targets | first }}" - online: "{{ 0 if (guest.needs_fallback and live_migrate_fallback == 'shutdown') else 1 }}" - validate_certs: false - register: migration_task - delegate_to: localhost + api_base = "https://{{ api_host }}:{{ api_port }}/api2/json" + headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} + node = "{{ current_node }}" + target = "{{ migration_targets | first }}" + vmid = {{ guest.vmid }} + gtype = "{{ guest.type }}" + name = "{{ guest.name }}" + status = "{{ guest.status }}" + needs_fallback = {{ guest.needs_fallback | lower }} + fallback = "{{ live_migrate_fallback }}" + shutdown_timeout = {{ vm_shutdown_timeout }} + start_timeout = {{ vm_start_timeout }} - # ── Wait for migration to complete ──────────────────────────────────────── - - name: "Migrate | {{ guest.vmid }} | Wait for migration task to complete" - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/tasks/{{ migration_task.json.data }}/status" - method: GET - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - validate_certs: false - register: task_status - until: task_status.json.data.status == 'stopped' - retries: 60 - delay: 10 - delegate_to: localhost + def api_req(path, method="GET", body=None): + url = f"{api_base}{path}" + data = json.dumps(body).encode() if body else None + hdrs = {**headers} + if data: + hdrs["Content-Type"] = "application/json" + req = urllib.request.Request(url, data=data, headers=hdrs, method=method) + with urllib.request.urlopen(req, context=ctx) as r: + return json.loads(r.read())["data"] - - name: "Migrate | {{ guest.vmid }} | Verify migration succeeded" - ansible.builtin.fail: - msg: "Migration of {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) failed — {{ task_status.json.data.exitstatus }}" - when: task_status.json.data.exitstatus != 'OK' - delegate_to: localhost + # ── Cold migration: shutdown first ──────────────────────────────────── + if needs_fallback and fallback == "shutdown" and status == "running": + print(f"Shutting down {gtype.upper()} {vmid} ({name})...") + api_req(f"/nodes/{node}/{gtype}/{vmid}/status/shutdown", "POST", + {"timeout": shutdown_timeout, "forceStop": 1}) + # Wait for stop + for _ in range(shutdown_timeout // 5): + s = api_req(f"/nodes/{node}/{gtype}/{vmid}/status/current") + if s["status"] == "stopped": + print(f" {vmid} stopped") + break + time.sleep(5) + else: + print(f"ERROR: {vmid} did not stop within {shutdown_timeout}s") + exit(1) - # ── Cold migration: restart on target ──────────────────────────────────── - - name: "Migrate | {{ guest.vmid }} | Start on target node after cold migration" - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/start" - method: POST - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - validate_certs: false - when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running' - delegate_to: localhost + # ── Trigger migration ───────────────────────────────────────────────── + online = 0 if (needs_fallback and fallback == "shutdown") else 1 + print(f"Migrating {gtype.upper()} {vmid} ({name}) → {target} (online={online})...") + task_id = api_req(f"/nodes/{node}/{gtype}/{vmid}/migrate", "POST", + {"target": target, "online": online}) - - name: "Migrate | {{ guest.vmid }} | Wait for VM to start on target" - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/current" - method: GET - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - validate_certs: false - register: vm_start_status - until: vm_start_status.json.data.status == 'running' - retries: "{{ (vm_start_timeout | int / 5) | int }}" - delay: 5 - when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running' - delegate_to: localhost + # ── Wait for migration task ─────────────────────────────────────────── + for _ in range(60): + t = api_req(f"/nodes/{node}/tasks/{task_id}/status") + if t["status"] == "stopped": + if t.get("exitstatus") != "OK": + print(f"ERROR: migration failed — {t.get('exitstatus')}") + exit(1) + print(f" Migration complete: {t.get('exitstatus')}") + break + time.sleep(10) + else: + print(f"ERROR: migration task timed out") + exit(1) - - name: "Migrate | {{ guest.vmid }} ({{ guest.name }}) | Migration complete" + # ── Cold migration: restart on target ───────────────────────────────── + if needs_fallback and fallback == "shutdown" and status == "running": + print(f"Starting {vmid} on {target}...") + api_req(f"/nodes/{target}/{gtype}/{vmid}/status/start", "POST") + for _ in range(start_timeout // 5): + s = api_req(f"/nodes/{target}/{gtype}/{vmid}/status/current") + if s["status"] == "running": + print(f" {vmid} running on {target}") + break + time.sleep(5) + else: + print(f"WARNING: {vmid} did not start within {start_timeout}s — check manually") + + print(f"Done: {gtype.upper()} {vmid} ({name}) → {target}") + PYEOF + register: migrate_result + delegate_to: localhost + changed_when: true + + - name: "Migrate | {{ guest.vmid }} ({{ guest.name }}) | Log result" ansible.builtin.debug: - msg: >- - {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) - {% if guest.needs_fallback and live_migrate_fallback == 'shutdown' %} - cold migrated to {{ migration_targets | first }} and restarted - {% else %} - live migrated to {{ migration_targets | first }} - {% endif %} + msg: "{{ migrate_result.stdout_lines }}" delegate_to: localhost diff --git a/roles/proxmox_upgrade/tasks/restore.yml b/roles/proxmox_upgrade/tasks/restore.yml index 93c01fd..65e927d 100644 --- a/roles/proxmox_upgrade/tasks/restore.yml +++ b/roles/proxmox_upgrade/tasks/restore.yml @@ -5,59 +5,77 @@ # Only runs if migration_restore: true # ============================================================================= -- name: Restore | Skip restore +- name: "Restore | Skip — migration_restore=false" ansible.builtin.debug: msg: "migration_restore=false — leaving guests on their current nodes" when: not migration_restore | bool + delegate_to: localhost -- name: Restore | Migrate guests back to {{ current_node }} +- name: "Restore | Migrate guests back to {{ current_node }}" when: migration_restore | bool block: - - name: Restore | Get guests currently on other nodes that originated from {{ current_node }} + - name: "Restore | Migrate all guests back to {{ current_node }}" + ansible.builtin.shell: | + python3 << 'PYEOF' + import urllib.request, json, ssl, time + + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + api_base = "https://{{ api_host }}:{{ api_port }}/api2/json" + headers = {"Authorization": "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"} + node = "{{ current_node }}" + source = "{{ migration_targets | first }}" + plan = {{ migration_plan | to_json }} + fallback = "{{ live_migrate_fallback }}" + + def api_req(path, method="GET", body=None): + url = f"{api_base}{path}" + data = json.dumps(body).encode() if body else None + hdrs = {**headers} + if data: + hdrs["Content-Type"] = "application/json" + req = urllib.request.Request(url, data=data, headers=hdrs, method=method) + with urllib.request.urlopen(req, context=ctx) as r: + return json.loads(r.read())["data"] + + task_ids = [] + for guest in plan: + if guest["needs_fallback"] and fallback == "skip": + print(f"SKIP restore: {guest['type'].upper()} {guest['vmid']} ({guest['name']}) — was skipped during drain") + continue + gtype = guest["type"] + online = 0 if (guest["needs_fallback"] and fallback == "shutdown") else 1 + print(f"Restoring {gtype.upper()} {guest['vmid']} ({guest['name']}) → {node} (online={online})...") + task_id = api_req(f"/nodes/{source}/{gtype}/{guest['vmid']}/migrate", "POST", + {"target": node, "online": online}) + task_ids.append({"vmid": guest["vmid"], "name": guest["name"], "task": task_id, "type": gtype}) + + failed = [] + for t in task_ids: + for _ in range(60): + status = api_req(f"/nodes/{source}/tasks/{t['task']}/status") + if status["status"] == "stopped": + if status.get("exitstatus") != "OK": + failed.append(f"{t['name']} ({t['vmid']}): {status.get('exitstatus')}") + else: + print(f"OK: {t['name']} ({t['vmid']}) restored to {node}") + break + time.sleep(10) + else: + failed.append(f"{t['name']} ({t['vmid']}): timed out") + + if failed: + print("FAILED restores: " + ", ".join(failed)) + exit(1) + print(f"All guests restored to {node}") + PYEOF + register: restore_result + delegate_to: localhost + changed_when: true + + - name: "Restore | Log result" ansible.builtin.debug: - msg: >- - Restoring {{ migration_plan | rejectattr('needs_fallback') | list | length + - (migration_plan | selectattr('needs_fallback') | list | length if live_migrate_fallback != 'skip' else 0) }} - guests back to {{ current_node }} - - - name: Restore | Migrate each guest back - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate" - method: POST - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - body_format: json - body: - target: "{{ current_node }}" - online: "{{ 0 if (guest.needs_fallback and live_migrate_fallback == 'shutdown') else 1 }}" - validate_certs: false - register: restore_task - loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}" - loop_control: - loop_var: guest + msg: "{{ restore_result.stdout_lines }}" delegate_to: localhost - - - name: Restore | Wait for all restore migrations to complete - ansible.builtin.uri: - url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/tasks/{{ item.json.data }}/status" - method: GET - headers: - Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}" - validate_certs: false - register: restore_status - until: restore_status.json.data.status == 'stopped' - retries: 60 - delay: 10 - loop: "{{ restore_task.results }}" - delegate_to: localhost - - - name: Restore | Check all restores succeeded - ansible.builtin.fail: - msg: "Restore migration failed — {{ item.json.data.exitstatus }}" - loop: "{{ restore_status.results }}" - when: item.json.data.exitstatus != 'OK' - delegate_to: localhost - - - name: Restore | Complete - ansible.builtin.debug: - msg: "All guests restored to {{ current_node }}"