diff --git a/playbooks/proxmox_ceph.yml b/playbooks/proxmox_ceph.yml
new file mode 100644
index 0000000..be7f6bf
--- /dev/null
+++ b/playbooks/proxmox_ceph.yml
@@ -0,0 +1,48 @@
+---
+# =============================================================================
+# proxmox_ceph.yml
+# CEPH management playbook for Proxmox clusters.
+#
+# Actions (selected with ceph_action; the play defaults to status):
+#   status       — report current CEPH health and OSD state
+#   set_noout    — set noout flag before node maintenance
+#   clear_noout  — clear noout flag and wait for HEALTH_OK after maintenance
+#   check_health — wait for CEPH to reach HEALTH_OK or HEALTH_WARN
+#
+# Usage:
+#   # Check current status
+#   ansible-playbook proxmox_ceph.yml -e "ceph_action=status"
+#
+#   # Set noout before maintenance
+#   ansible-playbook proxmox_ceph.yml -e "ceph_action=set_noout"
+#
+#   # Clear noout after node comes back online
+#   ansible-playbook proxmox_ceph.yml -e "ceph_action=clear_noout"
+#
+#   # Wait until CEPH reports HEALTH_OK or HEALTH_WARN
+#   ansible-playbook proxmox_ceph.yml -e "ceph_action=check_health"
+# =============================================================================
+
+- name: "Proxmox | CEPH Management"
+  hosts: proxmox_cluster
+  gather_facts: false
+
+  vars:
+    ceph_action: status
+
+  pre_tasks:
+    - name: "CEPH | Validate action"
+      run_once: true
+      when: ceph_action not in ['status', 'set_noout', 'clear_noout', 'check_health']
+      ansible.builtin.fail:
+        msg: >-
+          Invalid ceph_action '{{ ceph_action }}'.
+          Must be one of: status, set_noout, clear_noout, check_health.
+
+    - name: "CEPH | Log action"
+      run_once: true
+      ansible.builtin.debug:
+        msg: "CEPH action: {{ ceph_action }} on {{ client_name | default('cluster') }}"
+
+  roles:
+    - role: proxmox_ceph
diff --git a/playbooks/proxmox_config_backup.yml b/playbooks/proxmox_config_backup.yml
new file mode 100644
index 0000000..db4461a
--- /dev/null
+++ b/playbooks/proxmox_config_backup.yml
@@ -0,0 +1,49 @@
+---
+# =============================================================================
+# proxmox_config_backup.yml
+# Backs up critical Proxmox configuration files from all nodes.
+#
+# Captures:
+#   /etc/pve       — cluster config, VM configs, storage, users, certs
+#   /etc/network   — network interfaces
+#   /etc/hosts     — hostname resolution
+#   /etc/hostname  — node name
+#   /etc/apt/      — apt sources (so repos can be restored)
+#
+# NOTE: /etc/pve contains sensitive files (SSL keys, shadow.cfg, API tokens).
+#       Local and SFTP destinations are supported. Git destination is a
+#       TODO pending a secure encryption strategy for sensitive files.
+#
+# Usage:
+#   # Backup all nodes (local)
+#   ansible-playbook proxmox_config_backup.yml
+#
+#   # Backup to SFTP
+#   ansible-playbook proxmox_config_backup.yml \
+#     -e "backup_destination=sftp backup_sftp_host=backup.example.com backup_sftp_user=ansible"
+#
+#   # Backup a single node
+#   ansible-playbook proxmox_config_backup.yml --limit pm-node-01
+# =============================================================================
+
+- name: "Proxmox | Config Backup"
+  hosts: proxmox_cluster
+  gather_facts: true
+  serial: 1  # Back up one node at a time to avoid SFTP conflicts
+
+  vars:
+    backup_destination: local
+    backup_local_dir: /var/backups/proxmox-config
+    backup_local_keep: 10
+
+  tasks:
+    - name: "Backup | Run config backup for {{ inventory_hostname }}"
+      ansible.builtin.include_role:
+        name: proxmox_config_backup
+      vars:
+        current_node: "{{ inventory_hostname }}"
+
+    - name: "Backup | All nodes complete"
+      ansible.builtin.debug:
+        msg: "✓ Config backup complete for all nodes in {{ client_name | default('cluster') }}."
+      when: inventory_hostname == (ansible_play_hosts_all | last)  # run_once would fire once per serial batch
diff --git a/playbooks/proxmox_ha.yml b/playbooks/proxmox_ha.yml
new file mode 100644
index 0000000..eb38dfd
--- /dev/null
+++ b/playbooks/proxmox_ha.yml
@@ -0,0 +1,50 @@
+---
+# =============================================================================
+# proxmox_ha.yml
+# HA group membership and maintenance mode management.
+#
+# Actions:
+#   status  — show current HA status for all nodes and services
+#   disable — put a node into HA maintenance mode (VMs migrate away)
+#   enable  — take a node out of HA maintenance mode (resume normal HA)
+#
+# Usage:
+#   # Check HA status
+#   ansible-playbook proxmox_ha.yml -e "ha_action=status"
+#
+#   # Put node into maintenance before work
+#   ansible-playbook proxmox_ha.yml -e "ha_action=disable ha_target_node=pm-node-01"
+#
+#   # Resume HA after work is complete
+#   ansible-playbook proxmox_ha.yml -e "ha_action=enable ha_target_node=pm-node-01"
+# =============================================================================
+
+- name: "Proxmox | HA Management"
+  hosts: proxmox_cluster
+  gather_facts: false
+
+  vars:
+    ha_action: status
+    ha_target_node: "{{ inventory_hostname }}"
+
+  pre_tasks:
+    - name: "HA | Validate action"
+      ansible.builtin.fail:
+        msg: >-
+          Invalid ha_action '{{ ha_action }}'.
+          Must be one of: status, disable, enable.
+      when: ha_action not in ['status', 'disable', 'enable']
+      run_once: true
+
+    - name: "HA | Log action"
+      ansible.builtin.debug:
+        msg: >-
+          HA {{ ha_action }} —
+          client={{ client_name | default('Unknown') }}
+          {% if ha_action in ['disable', 'enable'] %}node={{ ha_target_node }}{% endif %}
+      run_once: true
+
+  roles:
+    - role: proxmox_ha
+      vars:
+        current_node: "{{ ha_target_node }}"
diff --git a/playbooks/proxmox_migrate_vms.yml b/playbooks/proxmox_migrate_vms.yml
new file mode 100644
index 0000000..e5ee792
--- /dev/null
+++ b/playbooks/proxmox_migrate_vms.yml
@@ -0,0 +1,399 @@
+---
+# =============================================================================
+# proxmox_migrate_vms.yml
+# Flexible VM migration playbook supporting four modes:
+#
+#   drain     — move all VMs off a specific node (pre-maintenance)
+#   rebalance — redistribute VMs evenly across all online nodes by resources
+#   restore   — return VMs to their origin nodes using a drain state file
+#   targeted  — migrate specific VMIDs or tagged VMs to a specified target
+#
+# NOTE: list values (migrate_vmids, migrate_tags) must be passed as JSON extra
+#       vars; key=value extra vars always arrive as plain strings.
+#
+# Usage examples:
+#   # Drain a node before maintenance
+#   ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=drain migrate_source_node=pm-node-01"
+#
+#   # Rebalance the cluster
+#   ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=rebalance"
+#
+#   # Restore VMs to origin after maintenance
+#   ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=restore migrate_source_node=pm-node-01"
+#
+#   # Migrate specific VMIDs to a target node
+#   ansible-playbook proxmox_migrate_vms.yml \
+#     -e '{"migrate_mode": "targeted", "migrate_vmids": [100, 101], "migrate_target_node": "pm-node-02"}'
+#
+#   # Migrate VMs by tag
+#   ansible-playbook proxmox_migrate_vms.yml \
+#     -e '{"migrate_mode": "targeted", "migrate_tags": ["win11"], "migrate_target_node": "pm-node-02"}'
+# =============================================================================
+
+- name: "Proxmox | Migrate VMs"
+  hosts: proxmox_cluster
+  gather_facts: true
+  run_once: true
+
+  vars:
+    # Mode: drain | rebalance | restore | targeted
+    migrate_mode: drain
+
+    # Source node (required for drain and restore modes)
+    migrate_source_node: ""
+
+    # Target node (required for targeted mode, optional for drain)
+    migrate_target_node: ""
+
+    # Targeted mode filters
+    migrate_vmids: []  # list of VMIDs to migrate
+    migrate_tags: []  # list of tags to match
+
+    # Rebalance threshold — don't migrate if imbalance is below this % of total memory
+    rebalance_threshold_pct: 10
+
+    # Shared drain role vars
+    drain_target_strategy: "{{ 'explicit' if migrate_target_node != '' else 'resources' }}"
+    drain_target_node: "{{ migrate_target_node }}"
+    drain_state_dir: "/tmp/proxmox_drain_state"
+
+    # Restore vars
+    restore_state_dir: "/tmp/proxmox_drain_state"
+
+  pre_tasks:
+    - name: "Migrate | Validate mode"
+      ansible.builtin.fail:
+        msg: >-
+          Invalid migrate_mode '{{ migrate_mode }}'.
+          Must be one of: drain, rebalance, restore, targeted.
+      when: migrate_mode not in ['drain', 'rebalance', 'restore', 'targeted']
+
+    - name: "Migrate | Validate drain — source node required"
+      ansible.builtin.fail:
+        msg: "migrate_source_node is required for drain mode."
+      when:
+        - migrate_mode == 'drain'
+        - migrate_source_node == ''
+
+    - name: "Migrate | Validate restore — source node required"
+      ansible.builtin.fail:
+        msg: "migrate_source_node is required for restore mode."
+      when:
+        - migrate_mode == 'restore'
+        - migrate_source_node == ''
+
+    - name: "Migrate | Validate targeted — VMIDs or tags required"
+      ansible.builtin.fail:
+        msg: "migrate_vmids or migrate_tags must be set for targeted mode."
+      when:
+        - migrate_mode == 'targeted'
+        - migrate_vmids | length == 0
+        - migrate_tags | length == 0
+
+    # key=value extra vars arrive as strings, which the filters below would treat per character.
+    - name: "Migrate | Validate targeted — filters must be lists"
+      ansible.builtin.fail:
+        msg: "migrate_vmids and migrate_tags must be lists — pass them as JSON extra vars."
+      when:
+        - migrate_mode == 'targeted'
+        - migrate_vmids is string or migrate_tags is string
+
+    - name: "Migrate | Log operation"
+      ansible.builtin.debug:
+        msg: >-
+          Proxmox VM migration —
+          client={{ client_name | default('Unknown') }}
+          mode={{ migrate_mode }}
+          {% if migrate_source_node != '' %}source={{ migrate_source_node }}{% endif %}
+          {% if migrate_target_node != '' %}target={{ migrate_target_node }}{% endif %}
+          {% if migrate_vmids | length > 0 %}vmids={{ migrate_vmids }}{% endif %}
+          {% if migrate_tags | length > 0 %}tags={{ migrate_tags }}{% endif %}
+
+  roles:
+    - role: proxmox_preflight
+
+  tasks:
+    # ── DRAIN mode ─────────────────────────────────────────────────────────────
+    - name: "Migrate | DRAIN mode"
+      ansible.builtin.include_role:
+        name: proxmox_drain
+      vars:
+        current_node: "{{ migrate_source_node }}"
+      when: migrate_mode == 'drain'
+
+    # ── RESTORE mode ───────────────────────────────────────────────────────────
+    - name: "Migrate | RESTORE mode"
+      ansible.builtin.include_role:
+        name: proxmox_restore
+      vars:
+        current_node: "{{ migrate_source_node }}"
+      when: migrate_mode == 'restore'
+
+    # ── REBALANCE mode ─────────────────────────────────────────────────────────
+    - name: "Migrate | REBALANCE | Get all node info"
+      community.proxmox.proxmox_node_info:
+        api_host: "{{ api_host }}"
+        api_user: "{{ api_user }}"
+        api_token_id: "{{ api_token_id }}"
+        api_token_secret: "{{ api_token_secret }}"
+        api_port: "{{ api_port | default(8006) }}"
+        validate_certs: "{{ validate_certs | default(false) }}"
+      register: rebalance_nodes
+      delegate_to: localhost
+      when: migrate_mode == 'rebalance'
+
+    - name: "Migrate | REBALANCE | Get all VM info per node"
+      community.proxmox.proxmox_vm_info:
+        api_host: "{{ api_host }}"
+        api_user: "{{ api_user }}"
+        api_token_id: "{{ api_token_id }}"
+        api_token_secret: "{{ api_token_secret }}"
+        api_port: "{{ api_port | default(8006) }}"
+        validate_certs: "{{ validate_certs | default(false) }}"
+        node: "{{ item.node }}"
+      loop: >-
+        {{ rebalance_nodes.proxmox_nodes
+           | selectattr('status', 'equalto', 'online')
+           | list }}
+      loop_control:
+        label: "{{ item.node }}"
+      register: rebalance_vms_per_node
+      delegate_to: localhost
+      when: migrate_mode == 'rebalance'
+
+    # Whitespace control ({%- ... -%}) leaves only the final expression in the output,
+    # so the fact is stored as a list instead of a whitespace-prefixed string.
+    - name: "Migrate | REBALANCE | Calculate node loads"
+      ansible.builtin.set_fact:
+        rebalance_node_loads: >-
+          {%- set loads = [] -%}
+          {%- for result in rebalance_vms_per_node.results -%}
+            {%- set node_name = result.item.node -%}
+            {%- set node_info = rebalance_nodes.proxmox_nodes
+                                | selectattr('node', 'equalto', node_name)
+                                | first -%}
+            {%- set guests = result.proxmox_vms | rejectattr('template', 'equalto', true) | list -%}
+            {%- set _ = loads.append({
+              'node': node_name,
+              'used_mem': node_info.mem,
+              'max_mem': node_info.maxmem,
+              'free_mem': node_info.maxmem - node_info.mem,
+              'load_pct': (node_info.mem / node_info.maxmem * 100) | round(1),
+              'vm_count': guests | length,
+              'vms': guests
+            }) -%}
+          {%- endfor -%}
+          {{ loads | sort(attribute='load_pct', reverse=true) }}
+      delegate_to: localhost
+      when: migrate_mode == 'rebalance'
+
+    - name: "Migrate | REBALANCE | Log current distribution"
+      ansible.builtin.debug:
+        msg: >-
+          Current cluster load:
+          {% for n in rebalance_node_loads %}
+          {{ n.node }}: {{ n.load_pct }}% memory used, {{ n.vm_count }} VMs
+          {% endfor %}
+      when: migrate_mode == 'rebalance'
+
+    # Greedy plan: a guest only moves while its source is above average + threshold and
+    # the least-loaded other node is more than threshold points below it. Projected usage
+    # is updated after every planned move so a single target is not flooded.
+    - name: "Migrate | REBALANCE | Build migration plan"
+      ansible.builtin.set_fact:
+        rebalance_migrations: >-
+          {%- set moves = [] -%}
+          {%- set loads = rebalance_node_loads | list -%}
+          {%- set avg_mem = (loads | map(attribute='used_mem') | sum) / (loads | length) -%}
+          {%- set threshold = rebalance_threshold_pct | float -%}
+          {%- set used = {} -%}
+          {%- for n in loads -%}
+            {%- set _ = used.update({n.node: n.used_mem}) -%}
+          {%- endfor -%}
+          {%- for vm in (loads | map(attribute='vms') | flatten
+                        | rejectattr('status', 'equalto', 'stopped')
+                        | list) -%}
+            {%- set src = loads | selectattr('node', 'equalto', vm.node) | first -%}
+            {%- set src_pct = used[src.node] / src.max_mem * 100 -%}
+            {%- if src_pct > avg_mem / src.max_mem * 100 + threshold -%}
+              {%- set ranked = [] -%}
+              {%- for n in loads | rejectattr('node', 'equalto', src.node) -%}
+                {%- set _ = ranked.append({'node': n.node, 'pct': used[n.node] / n.max_mem * 100}) -%}
+              {%- endfor -%}
+              {%- set target = ranked | sort(attribute='pct') | first | default({}) -%}
+              {%- if target and target.pct < src_pct - threshold -%}
+                {%- set vm_mem = vm.mem | default(0) -%}
+                {%- set _ = used.update({src.node: used[src.node] - vm_mem}) -%}
+                {%- set _ = used.update({target.node: used[target.node] + vm_mem}) -%}
+                {%- set _ = moves.append({
+                  'vmid': vm.vmid,
+                  'name': vm.name,
+                  'type': vm.type,
+                  'status': vm.status,
+                  'from': src.node,
+                  'to': target.node
+                }) -%}
+              {%- endif -%}
+            {%- endif -%}
+          {%- endfor -%}
+          {{ moves }}
+      delegate_to: localhost
+      when: migrate_mode == 'rebalance'
+
+    - name: "Migrate | REBALANCE | Log migration plan"
+      ansible.builtin.debug:
+        msg: >-
+          Rebalance plan ({{ rebalance_migrations | length }} migration(s)):
+          {% if rebalance_migrations | length == 0 %}
+          Cluster is already balanced within {{ rebalance_threshold_pct }}% threshold — no migrations needed.
+          {% else %}
+          {% for m in rebalance_migrations %}
+          {{ m.name }} (VMID {{ m.vmid }}) {{ m.from }} → {{ m.to }}
+          {% endfor %}
+          {% endif %}
+      when: migrate_mode == 'rebalance'
+
+    - name: "Migrate | REBALANCE | Execute KVM migrations"
+      ansible.builtin.command: >
+        qm migrate {{ item.vmid }} {{ item.to }}
+        {% if item.status == 'running' %}--online{% endif %}
+        --with-local-disks 0
+      loop: "{{ rebalance_migrations | selectattr('type', 'equalto', 'qemu') | list }}"
+      loop_control:
+        label: "{{ item.name }} ({{ item.from }} → {{ item.to }})"
+      changed_when: true
+      delegate_to: "{{ item.from }}"
+      when:
+        - migrate_mode == 'rebalance'
+        - rebalance_migrations | length > 0
+
+    - name: "Migrate | REBALANCE | Execute LXC migrations"
+      ansible.builtin.command: >
+        pct migrate {{ item.vmid }} {{ item.to }} --restart --timeout 120
+      loop: "{{ rebalance_migrations | selectattr('type', 'equalto', 'lxc') | list }}"
+      loop_control:
+        label: "{{ item.name | default(item.vmid) }} ({{ item.from }} → {{ item.to }})"
+      changed_when: true
+      delegate_to: "{{ item.from }}"
+      when:
+        - migrate_mode == 'rebalance'
+        - rebalance_migrations | length > 0
+
+    - name: "Migrate | REBALANCE | Complete"
+      ansible.builtin.debug:
+        msg: >-
+          ✓ Rebalance complete —
+          {{ rebalance_migrations | length }} VM(s) redistributed.
+      when: migrate_mode == 'rebalance'
+
+    # ── TARGETED mode ──────────────────────────────────────────────────────────
+    - name: "Migrate | TARGETED | Get all VMs"
+      community.proxmox.proxmox_vm_info:
+        api_host: "{{ api_host }}"
+        api_user: "{{ api_user }}"
+        api_token_id: "{{ api_token_id }}"
+        api_token_secret: "{{ api_token_secret }}"
+        api_port: "{{ api_port | default(8006) }}"
+        validate_certs: "{{ validate_certs | default(false) }}"
+      register: targeted_all_vms
+      delegate_to: localhost
+      when: migrate_mode == 'targeted'
+
+    - name: "Migrate | TARGETED | Filter VMs by VMID"
+      ansible.builtin.set_fact:
+        targeted_vms: >-
+          {{ targeted_all_vms.proxmox_vms
+             | selectattr('vmid', 'in', migrate_vmids)
+             | list }}
+      delegate_to: localhost
+      when:
+        - migrate_mode == 'targeted'
+        - migrate_vmids | length > 0
+
+    # Whole-tag match so 'win' does not select 'win11'; assumes tags is a ';', ',' or space separated string.
+    - name: "Migrate | TARGETED | Filter VMs by tag"
+      ansible.builtin.set_fact:
+        targeted_vms: >-
+          {{ targeted_all_vms.proxmox_vms
+             | selectattr('tags', 'defined')
+             | selectattr('tags', 'search', '(^|[;, ])(' ~ (migrate_tags | map('regex_escape') | join('|')) ~ ')([;, ]|$)')
+             | list }}
+      delegate_to: localhost
+      when:
+        - migrate_mode == 'targeted'
+        - migrate_tags | length > 0
+        - migrate_vmids | length == 0
+
+    - name: "Migrate | TARGETED | Resolve target node"
+      ansible.builtin.set_fact:
+        targeted_resolved_target: "{{ migrate_target_node }}"
+      when:
+        - migrate_mode == 'targeted'
+        - migrate_target_node != ''
+
+    - name: "Migrate | TARGETED | Auto-select target by resources"
+      block:
+        - name: "Migrate | TARGETED | Get node resources"
+          community.proxmox.proxmox_node_info:
+            api_host: "{{ api_host }}"
+            api_user: "{{ api_user }}"
+            api_token_id: "{{ api_token_id }}"
+            api_token_secret: "{{ api_token_secret }}"
+            api_port: "{{ api_port | default(8006) }}"
+            validate_certs: "{{ validate_certs | default(false) }}"
+          register: targeted_nodes
+          delegate_to: localhost
+
+        - name: "Migrate | TARGETED | Pick best target"
+          ansible.builtin.set_fact:
+            targeted_resolved_target: >-
+              {{ (targeted_nodes.proxmox_nodes
+                  | selectattr('status', 'equalto', 'online')
+                  | sort(attribute='mem')
+                  | first).node }}
+          delegate_to: localhost
+      when:
+        - migrate_mode == 'targeted'
+        - migrate_target_node == ''
+
+    # A guest already on the destination cannot be migrated to it, so drop it from the plan.
+    - name: "Migrate | TARGETED | Skip guests already on the target node"
+      ansible.builtin.set_fact:
+        targeted_vms: "{{ targeted_vms | rejectattr('node', 'equalto', targeted_resolved_target) | list }}"
+      when: migrate_mode == 'targeted'
+
+    - name: "Migrate | TARGETED | Log plan"
+      ansible.builtin.debug:
+        msg: >-
+          Targeted migration: {{ targeted_vms | length }} VM(s) → {{ targeted_resolved_target }}
+          VMIDs: {{ targeted_vms | map(attribute='vmid') | list }}
+      when: migrate_mode == 'targeted'
+
+    - name: "Migrate | TARGETED | Migrate KVM VMs"
+      ansible.builtin.command: >
+        qm migrate {{ item.vmid }} {{ targeted_resolved_target }}
+        {% if item.status == 'running' %}--online{% endif %}
+        --with-local-disks 0
+      loop: "{{ targeted_vms | selectattr('type', 'equalto', 'qemu') | list }}"
+      loop_control:
+        label: "{{ item.name }} (VMID {{ item.vmid }}) → {{ targeted_resolved_target }}"
+      changed_when: true
+      delegate_to: "{{ item.node }}"
+      when: migrate_mode == 'targeted'
+
+    - name: "Migrate | TARGETED | Migrate LXC containers"
+      ansible.builtin.command: >
+        pct migrate {{ item.vmid }} {{ targeted_resolved_target }} --restart --timeout 120
+      loop: "{{ targeted_vms | selectattr('type', 'equalto', 'lxc') | list }}"
+      loop_control:
+        label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }}) → {{ targeted_resolved_target }}"
+      changed_when: true
+      delegate_to: "{{ item.node }}"
+      when: migrate_mode == 'targeted'
+
+    - name: "Migrate | TARGETED | Complete"
+      ansible.builtin.debug:
+        msg: >-
+          ✓ Targeted migration complete —
+          {{ targeted_vms | length }} VM(s) moved to {{ targeted_resolved_target }}.
+      when: migrate_mode == 'targeted'
diff --git a/playbooks/proxmox_reboot.yml b/playbooks/proxmox_reboot.yml
new file mode 100644
index 0000000..816f536
--- /dev/null
+++ b/playbooks/proxmox_reboot.yml
@@ -0,0 +1,75 @@
+---
+# =============================================================================
+# proxmox_reboot.yml
+# Controlled rolling reboot of Proxmox cluster nodes.
+# Drains guests before rebooting, waits for rejoin, optionally restores.
+#
+# Use cases:
+#   - Apply kernel updates that require a reboot
+#   - Scheduled maintenance reboots
+#   - Hardware changes requiring a restart
+#
+# Variables:
+#   reboot_order          — ordered list of nodes to reboot (default: upgrade_order, else sorted proxmox_cluster)
+#   reboot_reason         — logged message explaining the reboot
+#   migration_restore     — return VMs to origin after reboot (default: false)
+#   drain_target_strategy — resources | explicit (default: resources)
+#
+# Usage (lists and booleans must be passed as JSON; key=value extra vars are strings):
+#   # Rolling reboot all nodes
+#   ansible-playbook proxmox_reboot.yml
+#
+#   # Reboot a single node
+#   ansible-playbook proxmox_reboot.yml -e '{"reboot_order": ["pm-node-02"]}'
+#
+#   # Reboot and restore VMs to origin
+#   ansible-playbook proxmox_reboot.yml -e '{"migration_restore": true}'
+# =============================================================================
+
+- name: "Proxmox | Controlled Rolling Reboot"
+  hosts: proxmox_cluster
+  gather_facts: true
+  run_once: true
+
+  vars:
+    reboot_order: "{{ upgrade_order | default(groups['proxmox_cluster'] | sort) }}"
+    reboot_reason: "Scheduled maintenance reboot"
+    migration_restore: false
+    reboot_timeout: 600
+    node_rejoin_timeout: 300
+    node_rejoin_retries: 30
+    node_rejoin_delay: 10
+
+  pre_tasks:
+    - name: "Reboot | Log operation"
+      ansible.builtin.debug:
+        msg: >-
+          Proxmox rolling reboot —
+          client={{ client_name | default('Unknown') }}
+          nodes={{ reboot_order | join(', ') }}
+          reason={{ reboot_reason }}
+          restore={{ migration_restore }}
+
+  roles:
+    - role: proxmox_preflight
+
+  tasks:
+    - name: "Reboot | Rolling reboot — cluster mode"
+      ansible.builtin.include_tasks: tasks/proxmox_reboot_node_loop.yml
+      loop: "{{ reboot_order }}"  # must be a real list; a string is rejected by loop
+      loop_control:
+        loop_var: current_node
+        label: "{{ current_node }}"
+      when: proxmox_is_cluster
+
+    - name: "Reboot | Standalone | Reboot node"
+      ansible.builtin.reboot:
+        reboot_timeout: "{{ reboot_timeout }}"
+        msg: "{{ reboot_reason }}"
+        pre_reboot_delay: 5
+        post_reboot_delay: 15
+      when: not proxmox_is_cluster
+
+    - name: "Reboot | Complete"
+      ansible.builtin.debug:
+        msg: "✓ Rolling reboot complete for {{ client_name | default('cluster') }}."
diff --git a/playbooks/proxmox_snapshot.yml b/playbooks/proxmox_snapshot.yml
new file mode 100644
index 0000000..8ecc7ed
--- /dev/null
+++ b/playbooks/proxmox_snapshot.yml
@@ -0,0 +1,298 @@
+---
+# =============================================================================
+# proxmox_snapshot.yml
+# Pre/post maintenance VM snapshot management.
+#
+# Actions:
+#   create   — snapshot all running VMs across the cluster before maintenance
+#   verify   — verify snapshots exist and are readable
+#   cleanup  — remove snapshots older than snapshot_max_age_hours
+#   rollback — rollback a specific VMID to its most recent automation snapshot
+#
+# Snapshots are named with a consistent prefix for easy identification and cleanup:
+#   auto_pre__