feat: proxmox_upgrade role and playbook
This commit is contained in:
43
playbooks/proxmox_upgrade.yml
Normal file
43
playbooks/proxmox_upgrade.yml
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
---
# =============================================================================
# proxmox_upgrade.yml
# =============================================================================
# Rolling Proxmox cluster upgrade playbook.
#
# The play body executes exactly once, on the first host of the play
# (expected to be the first node in upgrade_order — confirm the inventory
# ordering). Every other node is handled via API calls and delegated tasks
# from within the role.
#
# Usage:
#   ansible-playbook playbooks/proxmox_upgrade.yml \
#     -i inventories/client_local_eng/hypervisor_hosts.yml
#
# Override migration behaviour:
#   -e migration_bulk=true
#   -e live_migrate_fallback=skip
#   -e migration_restore=true
#
# Dry run (check mode — no changes):
#   --check
# =============================================================================

- name: Proxmox Rolling Upgrade
  hosts: proxmox_cluster
  gather_facts: true
  # Deliberately no `serial` here. With `serial: 1`, `run_once` fires once per
  # batch (i.e. once per host), which would repeat the complete rolling upgrade
  # for every member of proxmox_cluster. The role itself walks upgrade_order
  # one node at a time.
  run_once: true

  pre_tasks:
    # Guard: the role loops over upgrade_order, so it must exist and be
    # non-empty before anything else happens.
    - name: Confirm upgrade_order is defined
      ansible.builtin.fail:
        msg: "upgrade_order must be defined in hypervisor_hosts.yml"
      when: upgrade_order is not defined or upgrade_order | length == 0

    - name: Log upgrade targets
      ansible.builtin.debug:
        msg: >-
          Proxmox upgrade starting for {{ client_name }} ({{ client_id }})
          Nodes: {{ upgrade_order | join(', ') }}
          API: https://{{ api_host }}:{{ api_port }}

  roles:
    - proxmox_upgrade
|
||||||
|
|
||||||
31
roles/proxmox_upgrade/defaults/main.yml
Normal file
31
roles/proxmox_upgrade/defaults/main.yml
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
---
# =============================================================================
# proxmox_upgrade — defaults
# =============================================================================

# ── Guest migration ──────────────────────────────────────────────────────────

# true  = migrate all guests at once
# false = migrate guests one at a time
migration_bulk: false

# true = migrate guests back to their original node once it has been upgraded
migration_restore: false

# What to do with a guest that cannot be live migrated.
# One of: migrate | shutdown | skip
live_migrate_fallback: "shutdown"

# ── Timeouts (seconds) ───────────────────────────────────────────────────────

# Shutdown timeout before forcing the guest off
vm_shutdown_timeout: 120

# How long to wait for a guest to start after a cold migration
vm_start_timeout: 120

# How long to wait for a node to rejoin the cluster after reboot
node_rejoin_timeout: 600

# How long to wait for CEPH to recover after the node rejoins
ceph_recover_timeout: 300

# ── Package upgrade ──────────────────────────────────────────────────────────

# Command line executed for the package upgrade
apt_upgrade_cmd: "DEBIAN_FRONTEND=noninteractive apt-get dist-upgrade -y"

# true = run apt-get autoremove after the upgrade
apt_autoremove: true

# ── Exclusions ───────────────────────────────────────────────────────────────

# Guests (VMs/LXCs) carrying any of these Proxmox tags are never migrated.
# NOTE(review): Proxmox hands tags back as one delimited string — confirm the
# delimiter your PVE release uses matches what drain.yml splits on.
migrate_exclude_tags:
  - nomigrate
  - pinned
|
||||||
|
|
||||||
12
roles/proxmox_upgrade/readme.md
Normal file
12
roles/proxmox_upgrade/readme.md
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
roles/proxmox_upgrade/
|
||||||
|
defaults/main.yml ← all vars with defaults
|
||||||
|
tasks/
|
||||||
|
main.yml ← entry point, calls preflight then loops nodes
|
||||||
|
node_upgrade.yml ← per-node: backup → drain → upgrade → restore
|
||||||
|
preflight.yml ← cluster health check, abort if unhealthy
|
||||||
|
drain.yml ← classify guests, trigger migrations
|
||||||
|
migrate_guest.yml ← single guest migration with fallback logic
|
||||||
|
upgrade.yml ← apt dist-upgrade, reboot, wait for rejoin
|
||||||
|
restore.yml ← optional migrate-back
|
||||||
|
playbooks/proxmox_upgrade.yml
|
||||||
|
|
||||||
186
roles/proxmox_upgrade/tasks/drain.yml
Normal file
186
roles/proxmox_upgrade/tasks/drain.yml
Normal file
@@ -0,0 +1,186 @@
|
|||||||
|
---
# =============================================================================
# proxmox_upgrade — drain.yml
# Empties a node of its VMs/LXCs before that node is upgraded.
# Everything goes through the Proxmox API, so tasks are delegated to localhost.
# =============================================================================

# ── Discover guests on this node and candidate target nodes ──────────────────

# QEMU guests currently hosted on the node being drained.
- name: Drain | Get all VMs on node {{ current_node }}
  delegate_to: localhost
  register: node_vms
  ansible.builtin.uri:
    method: GET
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/qemu"
    validate_certs: false
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"

# LXC guests currently hosted on the node being drained.
- name: Drain | Get all LXCs on node {{ current_node }}
  delegate_to: localhost
  register: node_lxcs
  ansible.builtin.uri:
    method: GET
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/lxc"
    validate_certs: false
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"

# Cluster-wide node list; filtered below into the set of migration targets.
- name: Drain | Get available target nodes
  delegate_to: localhost
  register: all_nodes
  ansible.builtin.uri:
    method: GET
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
    validate_certs: false
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"

# Targets = names of every node reported online, minus the node being drained.
- name: Drain | Build target node list (exclude current node)
  delegate_to: localhost
  ansible.builtin.set_fact:
    migration_targets: >-
      {{ all_nodes.json.data
         | rejectattr('node', 'equalto', current_node)
         | selectattr('status', 'equalto', 'online')
         | map(attribute='node')
         | list }}

# Without at least one online peer there is nowhere to put the guests.
- name: Drain | Fail if no migration targets available
  delegate_to: localhost
  when: migration_targets | length == 0
  ansible.builtin.fail:
    msg: "No online nodes available to migrate guests to. Cannot drain {{ current_node }}."
|
||||||
|
|
||||||
|
# ── Classify VMs — live migratable vs needs fallback ─────────────────────────

# One config lookup per VM; results stay in the same order as
# node_vms.json.data, which the plan template below relies on.
- name: Drain | Get VM configs to check migratability
  ansible.builtin.uri:
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/qemu/{{ item.vmid }}/config"
    method: GET
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
    validate_certs: false
  register: vm_configs
  loop: "{{ node_vms.json.data }}"
  loop_control:
    label: "{{ item.vmid }}"
  delegate_to: localhost

# Produces migration_plan: a list of
#   { vmid, name, type, status, needs_fallback, fallback_reason }
# Guests carrying any tag from migrate_exclude_tags are left out entirely.
# A VM needs a fallback when it has PCI/USB passthrough at any index, when
# shared_storage is false, or when a config value looks like a local ISO.
# LXCs are never flagged as needing a fallback.
- name: Drain | Build guest migration plan
  ansible.builtin.set_fact:
    migration_plan: >-
      {%- set plan = [] -%}
      {%- for vm in node_vms.json.data -%}
      {%- set cfg = vm_configs.results[loop.index0].json.data -%}
      {#- Proxmox returns tags ';'-delimited; ',' is still accepted for older data -#}
      {%- set tags = ((vm.tags | default('', true)) | replace(';', ',')).split(',') | map('trim') | reject('equalto', '') | list -%}
      {%- set excluded = tags | select('in', migrate_exclude_tags) | list | length > 0 -%}
      {%- set has_passthrough = cfg.keys() | select('match', '^(hostpci|usb)[0-9]+$') | list | length > 0 -%}
      {%- set has_local_disk = shared_storage == false -%}
      {%- set has_local_cdrom = cfg.values() | select('string') | select('match', '.*local.*\\.iso.*') | list | length > 0 -%}
      {%- set needs_fallback = has_passthrough or has_local_disk or has_local_cdrom -%}
      {%- if not excluded -%}
      {%- set _ = plan.append({
        'vmid': vm.vmid,
        'name': vm.name | default(''),
        'type': 'qemu',
        'status': vm.status,
        'needs_fallback': needs_fallback,
        'fallback_reason': ('passthrough' if has_passthrough else ('local_disk' if has_local_disk else ('local_cdrom' if has_local_cdrom else '')))
      }) -%}
      {%- endif -%}
      {%- endfor -%}
      {%- for lxc in node_lxcs.json.data -%}
      {%- set tags = ((lxc.tags | default('', true)) | replace(';', ',')).split(',') | map('trim') | reject('equalto', '') | list -%}
      {%- set excluded = tags | select('in', migrate_exclude_tags) | list | length > 0 -%}
      {%- if not excluded -%}
      {%- set _ = plan.append({
        'vmid': lxc.vmid,
        'name': lxc.name | default(''),
        'type': 'lxc',
        'status': lxc.status,
        'needs_fallback': false,
        'fallback_reason': ''
      }) -%}
      {%- endif -%}
      {%- endfor -%}
      {{ plan }}
  delegate_to: localhost

- name: Drain | Log migration plan
  ansible.builtin.debug:
    msg: >-
      Migration plan for {{ current_node }}:
      {% for g in migration_plan %}
      - {{ g.type | upper }} {{ g.vmid }} ({{ g.name }}) [{{ g.status }}]
      {% if g.needs_fallback %} ⚠ needs fallback ({{ g.fallback_reason }}) — action: {{ live_migrate_fallback }}{% endif %}
      {% endfor %}
  delegate_to: localhost

# ── Warn about guests that cannot be live migrated ───────────────────────────
# Informational only: nothing aborts here. The message spells out what the
# configured live_migrate_fallback will do to each affected guest.
- name: Drain | Warn about non-migratable guests
  ansible.builtin.debug:
    msg: >-
      WARNING — {{ item.type | upper }} {{ item.vmid }} ({{ item.name }})
      cannot be live migrated ({{ item.fallback_reason }}).
      live_migrate_fallback={{ live_migrate_fallback }} —
      {% if live_migrate_fallback == 'skip' %}
      THIS VM WILL GO DOWN DURING NODE REBOOT.
      {% elif live_migrate_fallback == 'shutdown' %}
      Will be shut down, cold migrated, and restarted.
      {% else %}
      Will attempt live migrate anyway (may fail).
      {% endif %}
  loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
  loop_control:
    label: "{{ item.vmid }}"
  delegate_to: localhost
|
||||||
|
|
||||||
|
# ── Perform migrations ────────────────────────────────────────────────────────

# Sequential mode: guests that can live migrate go first, then the guests that
# need a fallback — unless live_migrate_fallback is 'skip', in which case the
# fallback guests are left on the node.
- name: Drain | Migrate guests (sequential)
  when: not migration_bulk | bool
  ansible.builtin.include_tasks: migrate_guest.yml
  loop: >-
    {{ (migration_plan | rejectattr('needs_fallback') | list)
       + ((migration_plan | selectattr('needs_fallback') | list)
          if live_migrate_fallback != 'skip' else []) }}
  loop_control:
    loop_var: guest

# Bulk mode: start every live-migratable guest's migration, then wait for the
# resulting tasks, then deal with fallback guests one by one.
- name: Drain | Migrate guests (bulk — fire all at once)
  when: migration_bulk | bool
  block:
    # LXC containers cannot be live migrated; a running container has to be
    # moved with `restart`, so the request body depends on the guest type.
    - name: Drain | Bulk | Trigger all live migrations simultaneously
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
        method: POST
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        body_format: json
        body: >-
          {{ {'target': migration_targets | first}
             | combine({'restart': 1} if guest.type == 'lxc'
                       else {'online': (0 if guest.needs_fallback else 1)}) }}
        validate_certs: false
      register: bulk_migration_tasks
      loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}"
      loop_control:
        loop_var: guest
        label: "{{ guest.vmid }}"
      delegate_to: localhost

    # item.json.data is the task id (UPID) returned by the migrate call.
    # Polls every 10 s, up to 60 times per task.
    - name: Drain | Bulk | Wait for all migrations to complete
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/tasks/{{ item.json.data }}/status"
        method: GET
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      register: task_status
      until: task_status.json.data.status == 'stopped'
      retries: 60
      delay: 10
      loop: "{{ bulk_migration_tasks.results }}"
      loop_control:
        label: "{{ item.guest.vmid }}"
      delegate_to: localhost

    # item.item is the matching entry of bulk_migration_tasks.results, which
    # carries the original guest under its loop variable name.
    - name: Drain | Bulk | Check all migrations succeeded
      ansible.builtin.fail:
        msg: "Migration task failed for VMID {{ item.item.guest.vmid }} — exitstatus: {{ item.json.data.exitstatus }}"
      loop: "{{ task_status.results }}"
      loop_control:
        label: "{{ item.item.guest.vmid }}"
      when: item.json.data.exitstatus != 'OK'
      delegate_to: localhost

    - name: Drain | Bulk | Handle fallback guests sequentially
      ansible.builtin.include_tasks: migrate_guest.yml
      loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
      loop_control:
        loop_var: guest
      when: live_migrate_fallback != 'skip'
|
||||||
|
|
||||||
33
roles/proxmox_upgrade/tasks/main.yml
Normal file
33
roles/proxmox_upgrade/tasks/main.yml
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
---
# =============================================================================
# proxmox_upgrade — main.yml
# Orchestrates the rolling Proxmox cluster upgrade:
#   1. log the effective settings
#   2. run the cluster preflight (preflight.yml)
#   3. run node_upgrade.yml once per entry of upgrade_order, in order
# Runs on the first node in upgrade_order, delegates API calls to localhost
# =============================================================================

- name: Proxmox Upgrade | Start
  ansible.builtin.debug:
    msg: >-
      Starting Proxmox rolling upgrade for {{ client_name }}
      — {{ upgrade_order | length }} nodes in order: {{ upgrade_order | join(' → ') }}
      — migration_bulk={{ migration_bulk }}
      — live_migrate_fallback={{ live_migrate_fallback }}
      — migration_restore={{ migration_restore }}
      — ceph_enabled={{ ceph_enabled }}

# ── Cluster health preflight ──────────────────────────────────────────────────
- name: Proxmox Upgrade | Cluster preflight
  ansible.builtin.include_tasks: preflight.yml

# ── Rolling upgrade — one node at a time ─────────────────────────────────────
# The included files read the node name from `current_node`, so the loop
# variable has to be renamed through loop_control (a bare `loop_var:` task
# keyword is rejected by Ansible).
- name: Proxmox Upgrade | Rolling upgrade loop
  ansible.builtin.include_tasks: node_upgrade.yml
  loop: "{{ upgrade_order }}"
  loop_control:
    loop_var: current_node

- name: Proxmox Upgrade | Complete
  ansible.builtin.debug:
    msg: >-
      Proxmox rolling upgrade complete for {{ client_name }}
      — all {{ upgrade_order | length }} nodes upgraded successfully
|
||||||
|
|
||||||
117
roles/proxmox_upgrade/tasks/migrate_guest.yml
Normal file
117
roles/proxmox_upgrade/tasks/migrate_guest.yml
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
---
# =============================================================================
# proxmox_upgrade — migrate_guest.yml
# Handles migration of a single VM or LXC from current_node to the first entry
# of migration_targets.
# Called with loop_var: guest
# guest = { vmid, name, type, status, needs_fallback, fallback_reason }
#
# Paths through this file:
#   - needs_fallback + live_migrate_fallback=skip     → log only, guest stays
#   - needs_fallback + live_migrate_fallback=shutdown → shutdown, offline
#                                                       migrate, start on target
#   - LXC                                             → restart migration
#   - everything else                                 → live (online) migration
# =============================================================================

- name: "Migrate | {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — skip check"
  ansible.builtin.debug:
    msg: "SKIPPING {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — live_migrate_fallback=skip, will go down during reboot"
  when: guest.needs_fallback and live_migrate_fallback == 'skip'
  delegate_to: localhost

- name: "Migrate | {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }})"
  when: not (guest.needs_fallback and live_migrate_fallback == 'skip')
  block:
    # ── Cold migration: shutdown first ───────────────────────────────────────
    # Only for guests that were running when the plan was built.
    - name: "Migrate | {{ guest.vmid }} | Shutdown for cold migration"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/shutdown"
        method: POST
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        body_format: json
        body:
          timeout: "{{ vm_shutdown_timeout }}"
          forceStop: 1
        validate_certs: false
      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
      delegate_to: localhost

    # Polls every 5 s for roughly vm_shutdown_timeout seconds in total.
    - name: "Migrate | {{ guest.vmid }} | Wait for shutdown"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/current"
        method: GET
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      register: vm_status
      until: vm_status.json.data.status == 'stopped'
      retries: "{{ (vm_shutdown_timeout | int / 5) | int }}"
      delay: 5
      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
      delegate_to: localhost

    # ── Trigger migration ─────────────────────────────────────────────────────
    # LXC containers cannot be live migrated; a running container has to be
    # moved with `restart`. QEMU guests use online=1 unless they were shut
    # down above for a cold migration.
    - name: "Migrate | {{ guest.vmid }} | Trigger migration to {{ migration_targets | first }}"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
        method: POST
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        body_format: json
        body: >-
          {{ {'target': migration_targets | first}
             | combine({'restart': 1} if guest.type == 'lxc'
                       else {'online': (0 if (guest.needs_fallback and live_migrate_fallback == 'shutdown') else 1)}) }}
        validate_certs: false
      register: migration_task
      delegate_to: localhost

    # ── Wait for migration to complete ────────────────────────────────────────
    # migration_task.json.data is the task id returned by the migrate call.
    # Polls every 10 s, up to 60 times.
    - name: "Migrate | {{ guest.vmid }} | Wait for migration task to complete"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/tasks/{{ migration_task.json.data }}/status"
        method: GET
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      register: task_status
      until: task_status.json.data.status == 'stopped'
      retries: 60
      delay: 10
      delegate_to: localhost

    - name: "Migrate | {{ guest.vmid }} | Verify migration succeeded"
      ansible.builtin.fail:
        msg: "Migration of {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) failed — {{ task_status.json.data.exitstatus }}"
      when: task_status.json.data.exitstatus != 'OK'
      delegate_to: localhost

    # ── Cold migration: restart on target ────────────────────────────────────
    # Mirrors the shutdown condition: only guests that were running before.
    - name: "Migrate | {{ guest.vmid }} | Start on target node after cold migration"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/start"
        method: POST
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
      delegate_to: localhost

    # Polls every 5 s for roughly vm_start_timeout seconds in total.
    - name: "Migrate | {{ guest.vmid }} | Wait for VM to start on target"
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/current"
        method: GET
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      register: vm_start_status
      until: vm_start_status.json.data.status == 'running'
      retries: "{{ (vm_start_timeout | int / 5) | int }}"
      delay: 5
      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
      delegate_to: localhost

    - name: "Migrate | {{ guest.vmid }} ({{ guest.name }}) | Migration complete"
      ansible.builtin.debug:
        msg: >-
          {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }})
          {% if guest.needs_fallback and live_migrate_fallback == 'shutdown' %}
          cold migrated to {{ migration_targets | first }} and restarted
          {% elif guest.type == 'lxc' %}
          restart migrated to {{ migration_targets | first }}
          {% else %}
          live migrated to {{ migration_targets | first }}
          {% endif %}
      delegate_to: localhost
|
||||||
|
|
||||||
43
roles/proxmox_upgrade/tasks/node_upgrade.yml
Normal file
43
roles/proxmox_upgrade/tasks/node_upgrade.yml
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
---
# =============================================================================
# proxmox_upgrade — node_upgrade.yml
# Per-node upgrade sequence: backup → drain → upgrade → restore
# Called with loop_var: current_node
#
# `delegate_to` is not accepted directly on include_role / include_tasks, so
# the delegation to the node being upgraded is pushed down onto the included
# tasks with `apply:`. Included tasks that set their own delegate_to
# (e.g. the API calls delegated to localhost) keep it.
# =============================================================================

- name: "Node {{ current_node }} | Start"
  ansible.builtin.debug:
    msg: "━━━ Starting upgrade of node {{ current_node }} ━━━"

# ── Step 1: Backup config ─────────────────────────────────────────────────────
- name: "Node {{ current_node }} | Step 1 — Backup config"
  ansible.builtin.include_role:
    name: hypervisor_backup_config
    apply:
      delegate_to: "{{ current_node }}"
  vars:
    pve_config_git_commit_message: "[{{ client_id }}] {{ current_node }} pre-upgrade config backup {{ ansible_date_time.date }}"

# ── Step 2: Drain node ────────────────────────────────────────────────────────
# NOTE(review): any cluster_mode other than 'cluster' skips the drain without
# a message (only 'single' is logged) — confirm no other values are in use.
- name: "Node {{ current_node }} | Step 2 — Drain (migrate guests off node)"
  ansible.builtin.include_tasks: drain.yml
  when: cluster_mode == 'cluster'

- name: "Node {{ current_node }} | Step 2 — Single node mode, skipping drain"
  ansible.builtin.debug:
    msg: "cluster_mode=single — skipping guest migration"
  when: cluster_mode == 'single'

# ── Step 3: Upgrade ───────────────────────────────────────────────────────────
- name: "Node {{ current_node }} | Step 3 — Upgrade packages"
  ansible.builtin.include_tasks:
    file: upgrade.yml
    apply:
      delegate_to: "{{ current_node }}"

# ── Step 4: Restore ───────────────────────────────────────────────────────────
- name: "Node {{ current_node }} | Step 4 — Restore guests"
  ansible.builtin.include_tasks: restore.yml
  when: cluster_mode == 'cluster'

- name: "Node {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: "━━━ Node {{ current_node }} upgrade complete ━━━"
|
||||||
|
|
||||||
64
roles/proxmox_upgrade/tasks/preflight.yml
Normal file
64
roles/proxmox_upgrade/tasks/preflight.yml
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
---
# =============================================================================
# proxmox_upgrade — preflight.yml
# Verifies cluster health before any upgrade work begins.
# Shell checks run on the play host (first node in upgrade_order); API checks
# are delegated to localhost.
# =============================================================================

# Fails when neither a "Nodes" nor a "Quorate" line is present in the output
# (grep exits non-zero). The registered result is not read again in this file.
- name: Preflight | Check all cluster nodes are online
  register: pvecm_status
  changed_when: false
  ansible.builtin.shell:
    cmd: |
      pvecm status 2>/dev/null | grep -E "^Nodes|Quorate"

- name: Preflight | Get cluster node status via API
  delegate_to: localhost
  register: cluster_nodes
  ansible.builtin.uri:
    method: GET
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
    validate_certs: false
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"

# Stops at the first node the API reports as anything other than online.
- name: Preflight | Check all nodes are online
  delegate_to: localhost
  loop: "{{ cluster_nodes.json.data }}"
  when: item.status != 'online'
  ansible.builtin.fail:
    msg: >
      Cluster health check FAILED — node {{ item.node }} is {{ item.status }}.
      Aborting upgrade to prevent data loss. Investigate before retrying.

# The grep chain only exits 0 when a "quorate ... yes" line exists.
- name: Preflight | Check quorum via pvecm
  register: quorum_check
  failed_when: quorum_check.rc != 0
  changed_when: false
  ansible.builtin.shell:
    cmd: |
      pvecm status 2>/dev/null | grep -i "quorate" | grep -i "yes"

# CEPH checks are skipped entirely unless ceph_enabled is true.
- name: Preflight | Check CEPH health
  when: ceph_enabled | bool
  block:
    - name: Preflight | Get CEPH health status
      register: ceph_health
      changed_when: false
      ansible.builtin.shell:
        cmd: |
          ceph health 2>/dev/null

    # Only HEALTH_OK and HEALTH_WARN are accepted; anything else aborts.
    - name: Preflight | Abort if CEPH is not healthy
      when: "'HEALTH_OK' not in ceph_health.stdout and 'HEALTH_WARN' not in ceph_health.stdout"
      ansible.builtin.fail:
        msg: >
          CEPH health check FAILED — status: {{ ceph_health.stdout }}.
          Aborting upgrade. Resolve CEPH issues before retrying.

    - name: Preflight | Warn if CEPH has warnings
      when: "'HEALTH_WARN' in ceph_health.stdout"
      ansible.builtin.debug:
        msg: "WARNING — CEPH has warnings: {{ ceph_health.stdout }}. Proceeding but monitor closely."

- name: Preflight | Cluster health check passed
  ansible.builtin.debug:
    msg: "Cluster health check passed — all nodes online, quorum OK{{ ', CEPH checked' if ceph_enabled else '' }}"
|
||||||
|
|
||||||
63
roles/proxmox_upgrade/tasks/restore.yml
Normal file
63
roles/proxmox_upgrade/tasks/restore.yml
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
---
# =============================================================================
# proxmox_upgrade — restore.yml
# Optionally migrate guests back to their original node after upgrade
# Only runs if migration_restore: true
#
# Relies on migration_plan and migration_targets as set by drain.yml. Only
# guests that did NOT need a fallback are moved back; they are expected on the
# first entry of migration_targets, which is where the drain sent them.
# =============================================================================

- name: Restore | Skip restore
  ansible.builtin.debug:
    msg: "migration_restore=false — leaving guests on their current nodes"
  when: not migration_restore | bool

- name: Restore | Migrate guests back to {{ current_node }}
  when: migration_restore | bool
  vars:
    # Single source of truth for both the log line and the loop, so the
    # reported count always matches what is actually migrated.
    restore_guests: "{{ migration_plan | rejectattr('needs_fallback') | list }}"
  block:
    - name: Restore | Get guests currently on other nodes that originated from {{ current_node }}
      ansible.builtin.debug:
        msg: >-
          Restoring {{ restore_guests | length }}
          guests back to {{ current_node }}
          (guests that needed a fallback are left where they are)

    # LXC containers cannot be live migrated; a running container has to be
    # moved with `restart`, so the request body depends on the guest type.
    - name: Restore | Migrate each guest back
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
        method: POST
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        body_format: json
        body: >-
          {{ {'target': current_node}
             | combine({'restart': 1} if guest.type == 'lxc'
                       else {'online': (0 if (guest.needs_fallback and live_migrate_fallback == 'shutdown') else 1)}) }}
        validate_certs: false
      register: restore_task
      loop: "{{ restore_guests }}"
      loop_control:
        loop_var: guest
        label: "{{ guest.vmid }}"
      delegate_to: localhost

    # item.json.data is the task id returned by the migrate call above.
    # Polls every 10 s, up to 60 times per task.
    - name: Restore | Wait for all restore migrations to complete
      ansible.builtin.uri:
        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/tasks/{{ item.json.data }}/status"
        method: GET
        headers:
          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
        validate_certs: false
      register: restore_status
      until: restore_status.json.data.status == 'stopped'
      retries: 60
      delay: 10
      loop: "{{ restore_task.results }}"
      loop_control:
        label: "{{ item.guest.vmid }}"
      delegate_to: localhost

    - name: Restore | Check all restores succeeded
      ansible.builtin.fail:
        msg: "Restore migration failed — {{ item.json.data.exitstatus }}"
      loop: "{{ restore_status.results }}"
      loop_control:
        label: "{{ item.item.guest.vmid }}"
      when: item.json.data.exitstatus != 'OK'
      delegate_to: localhost

    # Kept inside the block so it is never printed when the restore is skipped.
    - name: Restore | Complete
      ansible.builtin.debug:
        msg: "All guests restored to {{ current_node }}"
|
||||||
|
|
||||||
95
roles/proxmox_upgrade/tasks/upgrade.yml
Normal file
95
roles/proxmox_upgrade/tasks/upgrade.yml
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
---
# =============================================================================
# proxmox_upgrade — upgrade.yml
# Run apt dist-upgrade and reboot, wait for node to rejoin cluster
#
# Sequence: set CEPH noout (if ceph_enabled) → apt update → upgrade →
# optional autoremove → reboot when /var/run/reboot-required exists →
# wait for the node to show online via the API → wait for CEPH and clear noout.
# NOTE(review): noout is only cleared by the last block of this file, so a
# failure earlier in the file leaves the flag set.
# =============================================================================

- name: Upgrade | Set CEPH noout flag before upgrade
  ansible.builtin.shell: ceph osd set noout
  when: ceph_enabled | bool
  changed_when: true

- name: Upgrade | Run apt update
  ansible.builtin.shell: apt-get update -q
  changed_when: false

# apt prints a summary line such as "0 upgraded, 0 newly installed, ...".
# The check is anchored to the start of a line: a plain substring test for
# '0 upgraded' would also match "10 upgraded" / "20 upgraded" and wrongly
# report those runs as unchanged.
- name: Upgrade | Run apt dist-upgrade
  ansible.builtin.shell: "{{ apt_upgrade_cmd }}"
  register: apt_upgrade_result
  changed_when: "apt_upgrade_result.stdout is not search('^0 upgraded', multiline=true)"

- name: Upgrade | Log packages upgraded
  ansible.builtin.debug:
    msg: "{{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('No output') }}"

- name: Upgrade | Run apt autoremove
  ansible.builtin.shell: DEBIAN_FRONTEND=noninteractive apt-get autoremove -y
  when: apt_autoremove | bool
  changed_when: false

# The reboot decision is driven purely by the presence of this marker file.
- name: Upgrade | Check if reboot is required
  ansible.builtin.stat:
    path: /var/run/reboot-required
  register: reboot_required

- name: Upgrade | Log reboot status
  ansible.builtin.debug:
    msg: "{{ 'Reboot required — rebooting node' if reboot_required.stat.exists else 'No reboot required — skipping reboot' }}"

- name: Upgrade | Reboot node
  ansible.builtin.reboot:
    reboot_timeout: "{{ node_rejoin_timeout }}"
    msg: "Rebooting for Proxmox upgrade"
    pre_reboot_delay: 5
    post_reboot_delay: 30
  when: reboot_required.stat.exists

# ── Wait for node to rejoin cluster ──────────────────────────────────────────
# Polls the API every 10 s for roughly node_rejoin_timeout seconds until the
# node list contains current_node with status 'online'.
- name: Upgrade | Wait for node to appear online in cluster
  ansible.builtin.uri:
    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
    method: GET
    headers:
      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
    validate_certs: false
  register: nodes_status
  until: >-
    nodes_status.json.data
    | selectattr('node', 'equalto', current_node)
    | selectattr('status', 'equalto', 'online')
    | list | length > 0
  retries: "{{ (node_rejoin_timeout | int / 10) | int }}"
  delay: 10
  delegate_to: localhost

- name: Upgrade | Node {{ current_node }} back online
  ansible.builtin.debug:
    msg: "Node {{ current_node }} has rejoined the cluster"

# ── CEPH recovery wait ────────────────────────────────────────────────────────
- name: Upgrade | Wait for CEPH to recover
  when: ceph_enabled | bool
  block:
    # Polls every 10 s for roughly ceph_recover_timeout seconds in total.
    - name: Upgrade | CEPH | Wait for HEALTH_OK or HEALTH_WARN
      ansible.builtin.shell: ceph health
      register: ceph_health_post
      until: "'HEALTH_OK' in ceph_health_post.stdout or 'HEALTH_WARN' in ceph_health_post.stdout"
      retries: "{{ (ceph_recover_timeout | int / 10) | int }}"
      delay: 10
      changed_when: false

    - name: Upgrade | CEPH | Clear noout flag
      ansible.builtin.shell: ceph osd unset noout
      changed_when: true

    - name: Upgrade | CEPH | Log recovery status
      ansible.builtin.debug:
        msg: "CEPH recovered: {{ ceph_health_post.stdout }}"

- name: Upgrade | Node {{ current_node }} upgrade complete
  ansible.builtin.debug:
    msg: >-
      Node {{ current_node }} upgrade complete —
      {{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('packages updated') }}
      {{ '— rebooted' if reboot_required.stat.exists else '— no reboot needed' }}
|
||||||
|
|
||||||
Reference in New Issue
Block a user