diff --git a/playbooks/proxmox_upgrade.yml b/playbooks/proxmox_upgrade.yml
new file mode 100644
index 0000000..e6f9e66
--- /dev/null
+++ b/playbooks/proxmox_upgrade.yml
@@ -0,0 +1,43 @@
+---
+# =============================================================================
+# proxmox_upgrade.yml
+# =============================================================================
+# Rolling Proxmox cluster upgrade playbook.
+# Runs on the first node in upgrade_order — all other nodes are handled
+# via API calls and delegate_to from within the role.
+#
+# Usage:
+#   ansible-playbook playbooks/proxmox_upgrade.yml \
+#     -i inventories/client_local_eng/hypervisor_hosts.yml
+#
+# Override migration behaviour:
+#   -e migration_bulk=true
+#   -e live_migrate_fallback=skip
+#   -e migration_restore=true
+#
+# Dry run (check mode — no changes):
+#   --check
+# =============================================================================
+
+# Single play that drives the whole rolling upgrade from one controlling host.
+# All per-node work happens inside the role (API calls from localhost and
+# delegation to the node being upgraded), so the role must execute only once.
+- name: Proxmox Rolling Upgrade
+  hosts: proxmox_cluster
+  gather_facts: true
+  # Deliberately no `serial`: run_once is evaluated per serial batch, so
+  # pairing it with `serial: 1` would repeat the complete rolling upgrade
+  # once for every host in the group.
+  run_once: true
+
+  pre_tasks:
+    # Stop early when the inventory does not say which nodes to upgrade.
+    - name: Confirm upgrade_order is defined
+      ansible.builtin.fail:
+        msg: "upgrade_order must be defined in hypervisor_hosts.yml"
+      when: upgrade_order is not defined or upgrade_order | length == 0
+
+    - name: Log upgrade targets
+      ansible.builtin.debug:
+        msg: >-
+          Proxmox upgrade starting for {{ client_name }} ({{ client_id }})
+          Nodes: {{ upgrade_order | join(', ') }}
+          API: https://{{ api_host }}:{{ api_port }}
+
+  roles:
+    - proxmox_upgrade
+
diff --git a/roles/proxmox_upgrade/defaults/main.yml b/roles/proxmox_upgrade/defaults/main.yml
new file mode 100644
index 0000000..558fcdd
--- /dev/null
+++ b/roles/proxmox_upgrade/defaults/main.yml
@@ -0,0 +1,31 @@
+---
+# =============================================================================
+# proxmox_upgrade — defaults
+# =============================================================================
+
+# ── Guest migration behaviour ────────────────────────────────────────────────
+# true = trigger all live migrations at once, false = one guest at a time
+migration_bulk: false
+# true = migrate guests back to the node after it has been upgraded
+migration_restore: false
+# Handling of guests that cannot be live migrated: migrate | shutdown | skip
+live_migrate_fallback: shutdown
+
+# ── Timeouts (seconds) ───────────────────────────────────────────────────────
+# Sent as the shutdown timeout for cold migrations and used to size the
+# "wait for shutdown" polling loop
+vm_shutdown_timeout: 120
+
+# Upper bound for a cold-migrated guest to report "running" on its target
+vm_start_timeout: 120
+
+# Reboot timeout and upper bound for the node to show up online again
+node_rejoin_timeout: 600
+
+# Upper bound for CEPH to report HEALTH_OK / HEALTH_WARN after the upgrade
+ceph_recover_timeout: 300
+
+# ── Package upgrade ──────────────────────────────────────────────────────────
+# Shell command executed for the distribution upgrade
+apt_upgrade_cmd: "DEBIAN_FRONTEND=noninteractive apt-get dist-upgrade -y"
+# true = run apt-get autoremove after the upgrade
+apt_autoremove: true
+
+# ── Exclusions ───────────────────────────────────────────────────────────────
+# Guests carrying any of these tags are left out of the migration plan
+migrate_exclude_tags:
+  - nomigrate
+  - pinned
+
diff --git a/roles/proxmox_upgrade/readme.md b/roles/proxmox_upgrade/readme.md
new file mode 100644
index 0000000..f5b03be
--- /dev/null
+++ b/roles/proxmox_upgrade/readme.md
@@ -0,0 +1,12 @@
+roles/proxmox_upgrade/
+  defaults/main.yml       ← all vars with defaults
+  tasks/
+    main.yml              ← entry point, calls preflight then loops nodes
+    node_upgrade.yml      ← per-node: backup → drain → upgrade → restore
+    preflight.yml         ← cluster health check, abort if unhealthy
+    drain.yml             ← classify guests, trigger migrations
+    migrate_guest.yml     ← single guest migration with fallback logic
+    upgrade.yml           ← apt dist-upgrade, reboot, wait for rejoin
+    restore.yml           ← optional migrate-back
+playbooks/proxmox_upgrade.yml
+
diff --git a/roles/proxmox_upgrade/tasks/drain.yml b/roles/proxmox_upgrade/tasks/drain.yml
new file mode 100644
index 0000000..54efc10
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/drain.yml
@@ -0,0 +1,186 @@
+---
+# =============================================================================
+# proxmox_upgrade — drain.yml
+# Migrate all VMs/LXCs off a node before upgrading it
+# Uses Proxmox API — runs delegate_to: localhost
+# =============================================================================
+
+# ── Inventory of the node: its guests and the possible migration targets ─────
+# The three lookups below are read-only GET requests against the Proxmox API,
+# issued from the control machine.
+- name: Drain | Get all VMs on node {{ current_node }}
+  delegate_to: localhost
+  register: node_vms
+  ansible.builtin.uri:
+    method: GET
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/qemu"
+    validate_certs: false
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+
+- name: Drain | Get all LXCs on node {{ current_node }}
+  delegate_to: localhost
+  register: node_lxcs
+  ansible.builtin.uri:
+    method: GET
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/lxc"
+    validate_certs: false
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+
+- name: Drain | Get available target nodes
+  delegate_to: localhost
+  register: all_nodes
+  ansible.builtin.uri:
+    method: GET
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
+    validate_certs: false
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+
+# Candidate targets = every node reported online, minus the node being drained.
+- name: Drain | Build target node list (exclude current node)
+  delegate_to: localhost
+  ansible.builtin.set_fact:
+    migration_targets: >-
+      {{ all_nodes.json.data
+         | selectattr('status', 'equalto', 'online')
+         | rejectattr('node', 'equalto', current_node)
+         | map(attribute='node')
+         | list }}
+
+# Without at least one other online node there is nowhere to move guests to.
+- name: Drain | Fail if no migration targets available
+  delegate_to: localhost
+  when: migration_targets | length == 0
+  ansible.builtin.fail:
+    msg: "No online nodes available to migrate guests to. Cannot drain {{ current_node }}."
+
+# ── Per-VM configuration, input for the live-migratable / fallback split ─────
+# One GET per VM returned by the qemu listing; results stay in listing order.
+- name: Drain | Get VM configs to check migratability
+  delegate_to: localhost
+  register: vm_configs
+  loop: "{{ node_vms.json.data }}"
+  ansible.builtin.uri:
+    method: GET
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/qemu/{{ item.vmid }}/config"
+    validate_certs: false
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+
+# Builds `migration_plan`: one entry per guest on the node that is not excluded
+# by tag. Each entry has vmid, name, type (qemu|lxc), status, needs_fallback
+# and fallback_reason.
+#   * Tags are split on ';', ',' and spaces, so a multi-tag value is matched
+#     against migrate_exclude_tags whichever separator the API returns.
+#   * A VM needs a fallback when it has any hostpciN/usbN device (any index),
+#     when shared_storage is false, or when a config value references a local
+#     ISO image.
+#   * shared_storage goes through `| bool` so a string such as "false" passed
+#     with -e is interpreted correctly.
+#   * LXCs are never flagged as needing a fallback.
+- name: Drain | Build guest migration plan
+  ansible.builtin.set_fact:
+    migration_plan: >-
+      {%- set plan = [] -%}
+      {%- for vm in node_vms.json.data -%}
+        {%- set cfg = vm_configs.results[loop.index0].json.data -%}
+        {%- set tags = (vm.tags | default('', true) | regex_replace('[;, ]+', ',')).split(',') | map('trim') | list -%}
+        {%- set excluded = tags | select('in', migrate_exclude_tags) | list | length > 0 -%}
+        {%- set has_passthrough = cfg.keys() | select('match', '^(hostpci|usb)[0-9]+$') | list | length > 0 -%}
+        {%- set has_local_disk = not (shared_storage | bool) -%}
+        {%- set has_local_cdrom = cfg.values() | select('string') | select('match', '.*local.*\\.iso.*') | list | length > 0 -%}
+        {%- set needs_fallback = has_passthrough or has_local_disk or has_local_cdrom -%}
+        {%- if not excluded -%}
+          {%- set _ = plan.append({
+            'vmid': vm.vmid,
+            'name': vm.name,
+            'type': 'qemu',
+            'status': vm.status,
+            'needs_fallback': needs_fallback,
+            'fallback_reason': ('passthrough' if has_passthrough else ('local_disk' if has_local_disk else ('local_cdrom' if has_local_cdrom else '')))
+          }) -%}
+        {%- endif -%}
+      {%- endfor -%}
+      {%- for lxc in node_lxcs.json.data -%}
+        {%- set tags = (lxc.tags | default('', true) | regex_replace('[;, ]+', ',')).split(',') | map('trim') | list -%}
+        {%- set excluded = tags | select('in', migrate_exclude_tags) | list | length > 0 -%}
+        {%- if not excluded -%}
+          {%- set _ = plan.append({
+            'vmid': lxc.vmid,
+            'name': lxc.name,
+            'type': 'lxc',
+            'status': lxc.status,
+            'needs_fallback': false,
+            'fallback_reason': ''
+          }) -%}
+        {%- endif -%}
+      {%- endfor -%}
+      {{ plan }}
+  delegate_to: localhost
+
+# Prints every planned guest with its type, id, name and status; guests that
+# need a fallback are annotated with the reason and the configured action.
+- name: Drain | Log migration plan
+  ansible.builtin.debug:
+    msg: >-
+      Migration plan for {{ current_node }}:
+      {% for g in migration_plan %}
+      - {{ g.type | upper }} {{ g.vmid }} ({{ g.name }}) [{{ g.status }}]
+        {% if g.needs_fallback %} ⚠ needs fallback ({{ g.fallback_reason }}) — action: {{ live_migrate_fallback }}{% endif %}
+      {% endfor %}
+  delegate_to: localhost
+
+# ── Warn (does not abort) about guests that cannot be live migrated ──────────
+# One message per fallback guest, describing what the configured
+# live_migrate_fallback value means for it. This task never fails the run.
+- name: Drain | Warn about non-migratable guests
+  ansible.builtin.debug:
+    msg: >-
+      WARNING — {{ item.type | upper }} {{ item.vmid }} ({{ item.name }})
+      cannot be live migrated ({{ item.fallback_reason }}).
+      live_migrate_fallback={{ live_migrate_fallback }} —
+      {% if live_migrate_fallback == 'skip' %}
+      THIS VM WILL GO DOWN DURING NODE REBOOT.
+      {% elif live_migrate_fallback == 'shutdown' %}
+      Will be shut down, cold migrated, and restarted.
+      {% else %}
+      Will attempt live migrate anyway (may fail).
+      {% endif %}
+  loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
+  delegate_to: localhost
+
+# ── Perform migrations ────────────────────────────────────────────────────────
+# Sequential mode: live-migratable guests first, then the guests that need a
+# fallback. Fallback guests are left out entirely when
+# live_migrate_fallback is 'skip'.
+# The loop variable name must be declared under loop_control; `loop_var` is not
+# a task-level keyword.
+- name: Drain | Migrate guests (sequential)
+  when: not migration_bulk | bool
+  ansible.builtin.include_tasks: migrate_guest.yml
+  loop: >-
+    {{ (migration_plan | rejectattr('needs_fallback') | list)
+       + ([] if live_migrate_fallback == 'skip'
+          else (migration_plan | selectattr('needs_fallback') | list)) }}
+  loop_control:
+    loop_var: guest
+
+# Bulk mode: start the migration of every guest that needs no fallback, poll
+# all resulting tasks until they stop, fail on any non-OK exit status, then
+# process the fallback guests one by one through migrate_guest.yml.
+- name: Drain | Migrate guests (bulk — fire all at once)
+  when: migration_bulk | bool
+  block:
+    # VMs are migrated online. Containers are sent in restart mode because
+    # Proxmox does not implement live migration for LXC.
+    - name: Drain | Bulk | Trigger all live migrations simultaneously
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
+        method: POST
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        body_format: json
+        body: >-
+          {{ {'target': migration_targets | first}
+             | combine({'restart': 1} if guest.type == 'lxc' else {'online': 1}) }}
+        validate_certs: false
+      register: bulk_migration_tasks
+      loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}"
+      loop_control:
+        loop_var: guest
+      delegate_to: localhost
+
+    # item.json.data is the task id returned by the migrate call above.
+    # Polls each task for up to 60 x 10 seconds.
+    - name: Drain | Bulk | Wait for all migrations to complete
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/tasks/{{ item.json.data }}/status"
+        method: GET
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: false
+      register: task_status
+      until: task_status.json.data.status == 'stopped'
+      retries: 60
+      delay: 10
+      loop: "{{ bulk_migration_tasks.results }}"
+      delegate_to: localhost
+
+    # item.item is the trigger result this status belongs to; its `guest`
+    # entry carries the guest that was migrated.
+    - name: Drain | Bulk | Check all migrations succeeded
+      ansible.builtin.fail:
+        msg: "Migration task failed for VMID {{ item.item.guest.vmid | default('unknown') }} — exitstatus: {{ item.json.data.exitstatus }}"
+      loop: "{{ task_status.results }}"
+      when: item.json.data.exitstatus != 'OK'
+      delegate_to: localhost
+
+    - name: Drain | Bulk | Handle fallback guests sequentially
+      ansible.builtin.include_tasks: migrate_guest.yml
+      loop: "{{ migration_plan | selectattr('needs_fallback') | list }}"
+      loop_control:
+        loop_var: guest
+      when: live_migrate_fallback != 'skip'
+
diff --git a/roles/proxmox_upgrade/tasks/main.yml b/roles/proxmox_upgrade/tasks/main.yml
new file mode 100644
index 0000000..48a7c6c
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/main.yml
@@ -0,0 +1,33 @@
+---
+# =============================================================================
+# proxmox_upgrade — main.yml
+# Orchestrates rolling Proxmox cluster upgrade
+# Runs on the first node in upgrade_order, delegates API calls to localhost
+# =============================================================================
+
+# Announce the run together with the settings that shape it.
+- name: Proxmox Upgrade | Start
+  ansible.builtin.debug:
+    msg: >-
+      Starting Proxmox rolling upgrade for {{ client_name }}
+      — {{ upgrade_order | length }} nodes in order: {{ upgrade_order | join(' → ') }}
+      — migration_bulk={{ migration_bulk }}
+      — live_migrate_fallback={{ live_migrate_fallback }}
+      — migration_restore={{ migration_restore }}
+      — ceph_enabled={{ ceph_enabled }}
+
+# ── Health gate: nothing is touched until preflight.yml has passed ───────────
+- name: Proxmox Upgrade | Cluster preflight
+  include_tasks:
+    file: preflight.yml
+
+# ── Rolling upgrade — one node at a time ─────────────────────────────────────
+# node_upgrade.yml runs once per entry of upgrade_order, with the node name
+# exposed as `current_node`. The loop variable name has to be declared under
+# loop_control; `loop_var` is not a task-level keyword.
+- name: Proxmox Upgrade | Rolling upgrade loop
+  ansible.builtin.include_tasks: node_upgrade.yml
+  loop: "{{ upgrade_order }}"
+  loop_control:
+    loop_var: current_node
+
+# Closing message; only reached when every node in the loop finished.
+- name: Proxmox Upgrade | Complete
+  ansible.builtin.debug:
+    msg: >-
+      Proxmox rolling upgrade complete for {{ client_name }}
+      — all {{ upgrade_order | length }} nodes upgraded successfully
+
diff --git a/roles/proxmox_upgrade/tasks/migrate_guest.yml b/roles/proxmox_upgrade/tasks/migrate_guest.yml
new file mode 100644
index 0000000..9e446d1
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/migrate_guest.yml
@@ -0,0 +1,117 @@
+---
+# =============================================================================
+# proxmox_upgrade — migrate_guest.yml
+# Handles migration of a single VM or LXC
+# Called with loop_var: guest
+# guest = { vmid, name, type, status, needs_fallback, fallback_reason }
+# =============================================================================
+
+# Notice for guests left on the node: they need a fallback and the configured
+# fallback is 'skip'.
+- name: "Migrate | {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — skip check"
+  delegate_to: localhost
+  when: guest.needs_fallback and live_migrate_fallback == 'skip'
+  ansible.builtin.debug:
+    msg: "SKIPPING {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) — live_migrate_fallback=skip, will go down during reboot"
+
+# Migrates one guest from current_node to the first entry of migration_targets.
+#   * Cold path (guest needs a fallback and live_migrate_fallback is
+#     'shutdown'): shut down a running guest, migrate it offline, start it on
+#     the target and wait until it reports "running".
+#   * VMs otherwise: online migration.
+#   * Containers: restart-mode migration, because Proxmox does not implement
+#     live migration for LXC and rejects online=1 for a running container.
+# The block is skipped when the guest needs a fallback and the fallback is
+# 'skip'.
+- name: "Migrate | {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }})"
+  when: not (guest.needs_fallback and live_migrate_fallback == 'skip')
+  block:
+    # ── Cold migration: shutdown first ───────────────────────────────────────
+    # forceStop makes Proxmox stop the guest once the shutdown timeout expires.
+    - name: "Migrate | {{ guest.vmid }} | Shutdown for cold migration"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/shutdown"
+        method: POST
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        body_format: json
+        body:
+          timeout: "{{ vm_shutdown_timeout }}"
+          forceStop: 1
+        validate_certs: false
+      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
+      delegate_to: localhost
+
+    # Polls every 5 seconds for roughly vm_shutdown_timeout seconds in total.
+    - name: "Migrate | {{ guest.vmid }} | Wait for shutdown"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/current"
+        method: GET
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: false
+      register: vm_status
+      until: vm_status.json.data.status == 'stopped'
+      retries: "{{ (vm_shutdown_timeout | int / 5) | int }}"
+      delay: 5
+      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
+      delegate_to: localhost
+
+    # ── Trigger migration ─────────────────────────────────────────────────────
+    # The request body depends on the guest type: containers get restart=1,
+    # VMs get online=0 on the cold path and online=1 otherwise.
+    - name: "Migrate | {{ guest.vmid }} | Trigger migration to {{ migration_targets | first }}"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
+        method: POST
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        body_format: json
+        body: >-
+          {{ {'target': migration_targets | first}
+             | combine({'restart': 1} if guest.type == 'lxc'
+                       else {'online': 0 if (guest.needs_fallback and live_migrate_fallback == 'shutdown') else 1}) }}
+        validate_certs: false
+      register: migration_task
+      delegate_to: localhost
+
+    # ── Wait for migration to complete ────────────────────────────────────────
+    # migration_task.json.data is the task id returned by the migrate call.
+    # Polls for up to 60 x 10 seconds.
+    - name: "Migrate | {{ guest.vmid }} | Wait for migration task to complete"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ current_node }}/tasks/{{ migration_task.json.data }}/status"
+        method: GET
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: false
+      register: task_status
+      until: task_status.json.data.status == 'stopped'
+      retries: 60
+      delay: 10
+      delegate_to: localhost
+
+    - name: "Migrate | {{ guest.vmid }} | Verify migration succeeded"
+      ansible.builtin.fail:
+        msg: "Migration of {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }}) failed — {{ task_status.json.data.exitstatus }}"
+      when: task_status.json.data.exitstatus != 'OK'
+      delegate_to: localhost
+
+    # ── Cold migration: restart on target ────────────────────────────────────
+    # Only guests that were running before the cold migration are started.
+    - name: "Migrate | {{ guest.vmid }} | Start on target node after cold migration"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/start"
+        method: POST
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: false
+      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
+      delegate_to: localhost
+
+    # Polls every 5 seconds for roughly vm_start_timeout seconds in total.
+    - name: "Migrate | {{ guest.vmid }} | Wait for VM to start on target"
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/status/current"
+        method: GET
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: false
+      register: vm_start_status
+      until: vm_start_status.json.data.status == 'running'
+      retries: "{{ (vm_start_timeout | int / 5) | int }}"
+      delay: 5
+      when: guest.needs_fallback and live_migrate_fallback == 'shutdown' and guest.status == 'running'
+      delegate_to: localhost
+
+    - name: "Migrate | {{ guest.vmid }} ({{ guest.name }}) | Migration complete"
+      ansible.builtin.debug:
+        msg: >-
+          {{ guest.type | upper }} {{ guest.vmid }} ({{ guest.name }})
+          {% if guest.needs_fallback and live_migrate_fallback == 'shutdown' %}
+          cold migrated to {{ migration_targets | first }} and restarted
+          {% elif guest.type == 'lxc' %}
+          restart migrated to {{ migration_targets | first }}
+          {% else %}
+          live migrated to {{ migration_targets | first }}
+          {% endif %}
+      delegate_to: localhost
+
diff --git a/roles/proxmox_upgrade/tasks/node_upgrade.yml b/roles/proxmox_upgrade/tasks/node_upgrade.yml
new file mode 100644
index 0000000..e5d4a1f
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/node_upgrade.yml
@@ -0,0 +1,43 @@
+---
+# =============================================================================
+# proxmox_upgrade — node_upgrade.yml
+# Per-node upgrade sequence: backup → drain → upgrade → restore
+# Called with loop_var: current_node
+# =============================================================================
+
+- name: "Node {{ current_node }} | Start"
+  ansible.builtin.debug:
+    msg: "━━━ Starting upgrade of node {{ current_node }} ━━━"
+
+# ── Step 1: Backup config ─────────────────────────────────────────────────────
+# delegate_to is not accepted directly on include_role. It is handed to the
+# role's tasks through `apply`, so the backup runs on the node being upgraded.
+- name: "Node {{ current_node }} | Step 1 — Backup config"
+  ansible.builtin.include_role:
+    name: hypervisor_backup_config
+    apply:
+      delegate_to: "{{ current_node }}"
+  vars:
+    pve_config_git_commit_message: "[{{ client_id }}] {{ current_node }} pre-upgrade config backup {{ ansible_date_time.date }}"
+
+# ── Step 2: Drain node ────────────────────────────────────────────────────────
+# Guests are only moved away in cluster mode; single-node mode just logs that
+# the drain is skipped.
+- name: "Node {{ current_node }} | Step 2 — Drain (migrate guests off node)"
+  when: cluster_mode == 'cluster'
+  include_tasks:
+    file: drain.yml
+
+- name: "Node {{ current_node }} | Step 2 — Single node mode, skipping drain"
+  when: cluster_mode == 'single'
+  ansible.builtin.debug:
+    msg: "cluster_mode=single — skipping guest migration"
+
+# ── Step 3: Upgrade ───────────────────────────────────────────────────────────
+# delegate_to is not accepted directly on include_tasks. It is applied to the
+# included tasks instead, so they run on the node being upgraded. Tasks in
+# upgrade.yml that set their own delegate_to keep it.
+- name: "Node {{ current_node }} | Step 3 — Upgrade packages"
+  ansible.builtin.include_tasks:
+    file: upgrade.yml
+    apply:
+      delegate_to: "{{ current_node }}"
+
+# ── Step 4: Restore ───────────────────────────────────────────────────────────
+# Included in cluster mode only; restore.yml itself checks migration_restore.
+- name: "Node {{ current_node }} | Step 4 — Restore guests"
+  when: cluster_mode == 'cluster'
+  include_tasks:
+    file: restore.yml
+
+- name: "Node {{ current_node }} | Complete"
+  ansible.builtin.debug:
+    msg: "━━━ Node {{ current_node }} upgrade complete ━━━"
+
diff --git a/roles/proxmox_upgrade/tasks/preflight.yml b/roles/proxmox_upgrade/tasks/preflight.yml
new file mode 100644
index 0000000..4f598ff
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/preflight.yml
@@ -0,0 +1,64 @@
+---
+# =============================================================================
+# proxmox_upgrade — preflight.yml
+# Check cluster health before starting any upgrade work
+# Runs delegate_to: first node in upgrade_order
+# =============================================================================
+
+# Captures the "Nodes" / "Quorate" lines of `pvecm status`. The task fails
+# when grep finds neither line.
+- name: Preflight | Check all cluster nodes are online
+  register: pvecm_status
+  changed_when: false
+  ansible.builtin.shell:
+    cmd: |
+      pvecm status 2>/dev/null | grep -E "^Nodes|Quorate"
+
+# Node list as seen by the API, fetched from the control machine.
+- name: Preflight | Get cluster node status via API
+  delegate_to: localhost
+  register: cluster_nodes
+  ansible.builtin.uri:
+    method: GET
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
+    validate_certs: false
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+
+# Any node whose status is not 'online' aborts the run.
+- name: Preflight | Check all nodes are online
+  delegate_to: localhost
+  loop: "{{ cluster_nodes.json.data }}"
+  when: item.status != 'online'
+  ansible.builtin.fail:
+    msg: >
+      Cluster health check FAILED — node {{ item.node }} is {{ item.status }}.
+      Aborting upgrade to prevent data loss. Investigate before retrying.
+
+# Passes only when `pvecm status` prints a "quorate" line containing "yes".
+- name: Preflight | Check quorum via pvecm
+  register: quorum_check
+  changed_when: false
+  failed_when: quorum_check.rc != 0
+  ansible.builtin.shell:
+    cmd: |
+      pvecm status 2>/dev/null | grep -i "quorate" | grep -i "yes"
+
+# CEPH gate, evaluated only when ceph_enabled is true: HEALTH_OK passes,
+# HEALTH_WARN passes with a warning, anything else aborts.
+- name: Preflight | Check CEPH health
+  when: ceph_enabled | bool
+  block:
+    - name: Preflight | Get CEPH health status
+      register: ceph_health
+      changed_when: false
+      ansible.builtin.shell:
+        cmd: |
+          ceph health 2>/dev/null
+
+    - name: Preflight | Abort if CEPH is not healthy
+      when: "'HEALTH_OK' not in ceph_health.stdout and 'HEALTH_WARN' not in ceph_health.stdout"
+      ansible.builtin.fail:
+        msg: >
+          CEPH health check FAILED — status: {{ ceph_health.stdout }}.
+          Aborting upgrade. Resolve CEPH issues before retrying.
+
+    - name: Preflight | Warn if CEPH has warnings
+      when: "'HEALTH_WARN' in ceph_health.stdout"
+      ansible.builtin.debug:
+        msg: "WARNING — CEPH has warnings: {{ ceph_health.stdout }}. Proceeding but monitor closely."
+
+# Summary line. ceph_enabled goes through `| bool`, like every other use of it
+# in this role, so a string value such as "false" does not report
+# "CEPH checked" when the CEPH block was skipped.
+- name: Preflight | Cluster health check passed
+  ansible.builtin.debug:
+    msg: "Cluster health check passed — all nodes online, quorum OK{{ ', CEPH checked' if ceph_enabled | bool else '' }}"
+
diff --git a/roles/proxmox_upgrade/tasks/restore.yml b/roles/proxmox_upgrade/tasks/restore.yml
new file mode 100644
index 0000000..474bd00
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/restore.yml
@@ -0,0 +1,63 @@
+---
+# =============================================================================
+# proxmox_upgrade — restore.yml
+# Optionally migrate guests back to their original node after upgrade
+# Only runs if migration_restore: true
+# =============================================================================
+
+# Logged when migration_restore is off and guests stay where they were moved.
+- name: Restore | Skip restore
+  when: not migration_restore | bool
+  ansible.builtin.debug:
+    msg: "migration_restore=false — leaving guests on their current nodes"
+
+# Moves guests back from the first migration target to current_node once the
+# node has been upgraded. Only guests that needed no fallback are moved back;
+# guests handled through the fallback path stay on the target node.
+- name: Restore | Migrate guests back to {{ current_node }}
+  when: migration_restore | bool
+  block:
+    # The count matches the loop of the migrate task below.
+    - name: Restore | Get guests currently on other nodes that originated from {{ current_node }}
+      ansible.builtin.debug:
+        msg: >-
+          Restoring {{ migration_plan | rejectattr('needs_fallback') | list | length }}
+          guests back to {{ current_node }}
+          (guests that needed a fallback migration are left on their current node)
+
+    # VMs are migrated online. Containers are sent in restart mode because
+    # Proxmox does not implement live migration for LXC. The loop variable
+    # name is declared under loop_control, the only place Ansible accepts it.
+    - name: Restore | Migrate each guest back
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/{{ 'qemu' if guest.type == 'qemu' else 'lxc' }}/{{ guest.vmid }}/migrate"
+        method: POST
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        body_format: json
+        body: >-
+          {{ {'target': current_node}
+             | combine({'restart': 1} if guest.type == 'lxc' else {'online': 1}) }}
+        validate_certs: false
+      register: restore_task
+      loop: "{{ migration_plan | rejectattr('needs_fallback') | list }}"
+      loop_control:
+        loop_var: guest
+      delegate_to: localhost
+
+    # item.json.data is the task id returned by the migrate call above.
+    # Polls each task for up to 60 x 10 seconds.
+    - name: Restore | Wait for all restore migrations to complete
+      ansible.builtin.uri:
+        url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes/{{ migration_targets | first }}/tasks/{{ item.json.data }}/status"
+        method: GET
+        headers:
+          Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+        validate_certs: false
+      register: restore_status
+      until: restore_status.json.data.status == 'stopped'
+      retries: 60
+      delay: 10
+      loop: "{{ restore_task.results }}"
+      delegate_to: localhost
+
+    - name: Restore | Check all restores succeeded
+      ansible.builtin.fail:
+        msg: "Restore migration failed — {{ item.json.data.exitstatus }}"
+      loop: "{{ restore_status.results }}"
+      when: item.json.data.exitstatus != 'OK'
+      delegate_to: localhost
+
+    - name: Restore | Complete
+      ansible.builtin.debug:
+        msg: "All guests restored to {{ current_node }}"
+
diff --git a/roles/proxmox_upgrade/tasks/upgrade.yml b/roles/proxmox_upgrade/tasks/upgrade.yml
new file mode 100644
index 0000000..4a0dac9
--- /dev/null
+++ b/roles/proxmox_upgrade/tasks/upgrade.yml
@@ -0,0 +1,95 @@
+---
+# =============================================================================
+# proxmox_upgrade — upgrade.yml
+# Run apt dist-upgrade and reboot, wait for node to rejoin cluster
+# =============================================================================
+
+# Sets the CEPH noout flag before packages are touched (CEPH clusters only).
+# The flag is cleared again in the CEPH recovery block further down.
+- name: Upgrade | Set CEPH noout flag before upgrade
+  when: ceph_enabled | bool
+  changed_when: true
+  ansible.builtin.shell:
+    cmd: ceph osd set noout
+
+# Refresh the package index; reported as unchanged.
+- name: Upgrade | Run apt update
+  ansible.builtin.shell: apt-get update -q
+  changed_when: false
+
+# Runs the configured upgrade command. The task counts as changed unless apt
+# reports exactly "0 upgraded". The pattern requires a non-digit (or the start
+# of the output) before the 0, so "10 upgraded" or "20 upgraded" is not
+# mistaken for "nothing upgraded".
+- name: Upgrade | Run apt dist-upgrade
+  ansible.builtin.shell: "{{ apt_upgrade_cmd }}"
+  register: apt_upgrade_result
+  changed_when: "apt_upgrade_result.stdout is not search('(^|[^0-9])0 upgraded')"
+
+# First output line mentioning "upgraded", i.e. the apt summary line.
+- name: Upgrade | Log packages upgraded
+  ansible.builtin.debug:
+    msg: "{{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('No output') }}"
+
+- name: Upgrade | Run apt autoremove
+  ansible.builtin.shell: DEBIAN_FRONTEND=noninteractive apt-get autoremove -y
+  when: apt_autoremove | bool
+  changed_when: false
+
+# The reboot decision rests solely on the presence of this marker file.
+# NOTE(review): confirm that Proxmox nodes actually create
+# /var/run/reboot-required after kernel updates; if they do not, the node is
+# never rebooted by this playbook.
+- name: Upgrade | Check if reboot is required
+  register: reboot_required
+  ansible.builtin.stat:
+    path: /var/run/reboot-required
+
+- name: Upgrade | Log reboot status
+  ansible.builtin.debug:
+    msg: "{{ 'Reboot required — rebooting node' if reboot_required.stat.exists else 'No reboot required — skipping reboot' }}"
+
+# node_rejoin_timeout doubles as the reboot timeout.
+- name: Upgrade | Reboot node
+  when: reboot_required.stat.exists
+  ansible.builtin.reboot:
+    msg: "Rebooting for Proxmox upgrade"
+    pre_reboot_delay: 5
+    post_reboot_delay: 30
+    reboot_timeout: "{{ node_rejoin_timeout }}"
+
+# ── Wait for node to rejoin cluster ──────────────────────────────────────────
+# Polls the node list from the control machine every 10 seconds, for roughly
+# node_rejoin_timeout seconds, until current_node is reported online.
+- name: Upgrade | Wait for node to appear online in cluster
+  delegate_to: localhost
+  register: nodes_status
+  delay: 10
+  retries: "{{ (node_rejoin_timeout | int / 10) | int }}"
+  until: >-
+    nodes_status.json.data
+    | selectattr('node', 'equalto', current_node)
+    | selectattr('status', 'equalto', 'online')
+    | list | length > 0
+  ansible.builtin.uri:
+    method: GET
+    url: "https://{{ api_host }}:{{ api_port }}/api2/json/nodes"
+    validate_certs: false
+    headers:
+      Authorization: "PVEAPIToken={{ api_token_id }}={{ api_token_secret }}"
+
+- name: Upgrade | Node {{ current_node }} back online
+  ansible.builtin.debug:
+    msg: "Node {{ current_node }} has rejoined the cluster"
+
+# ── CEPH recovery wait ────────────────────────────────────────────────────────
+# CEPH clusters only: poll `ceph health` every 10 seconds, for roughly
+# ceph_recover_timeout seconds, until it reports HEALTH_OK or HEALTH_WARN.
+# Then clear the noout flag and log the final status.
+- name: Upgrade | Wait for CEPH to recover
+  when: ceph_enabled | bool
+  block:
+    - name: Upgrade | CEPH | Wait for HEALTH_OK or HEALTH_WARN
+      register: ceph_health_post
+      changed_when: false
+      delay: 10
+      retries: "{{ (ceph_recover_timeout | int / 10) | int }}"
+      until: "'HEALTH_OK' in ceph_health_post.stdout or 'HEALTH_WARN' in ceph_health_post.stdout"
+      ansible.builtin.shell:
+        cmd: ceph health
+
+    - name: Upgrade | CEPH | Clear noout flag
+      changed_when: true
+      ansible.builtin.shell:
+        cmd: ceph osd unset noout
+
+    - name: Upgrade | CEPH | Log recovery status
+      ansible.builtin.debug:
+        msg: "CEPH recovered: {{ ceph_health_post.stdout }}"
+
+# Per-node summary: the apt summary line plus whether a reboot took place.
+- name: Upgrade | Node {{ current_node }} upgrade complete
+  ansible.builtin.debug:
+    msg: >-
+      Node {{ current_node }} upgrade complete —
+      {{ apt_upgrade_result.stdout_lines | select('match', '.*upgraded.*') | list | first | default('packages updated') }}
+      {{ '— rebooted' if reboot_required.stat.exists else '— no reboot needed' }}
+