testing new proxmox logic

This commit is contained in:
Semaphore
2026-03-15 15:48:59 -07:00
parent 347a85b09d
commit ea2f00c098
34 changed files with 2391 additions and 21 deletions

View File

@@ -0,0 +1,45 @@
---
# =============================================================================
# proxmox_ceph.yml
# CEPH management playbook for Proxmox clusters.
#
# Actions:
# status — report current CEPH health and OSD state
# set_noout — set noout flag before node maintenance
# clear_noout — clear noout flag and wait for HEALTH_OK after maintenance
# check_health — wait for CEPH to reach HEALTH_OK or HEALTH_WARN
#
# Usage:
# # Check current status
# ansible-playbook proxmox_ceph.yml -e "ceph_action=status"
#
# # Set noout before maintenance
# ansible-playbook proxmox_ceph.yml -e "ceph_action=set_noout"
#
# # Clear noout after node comes back online
# ansible-playbook proxmox_ceph.yml -e "ceph_action=clear_noout"
# =============================================================================
# Play: run a single CEPH maintenance action, selected by `ceph_action`,
# through the proxmox_ceph role on the proxmox_cluster hosts.
- name: "Proxmox | CEPH Management"
  hosts: proxmox_cluster
  gather_facts: false

  vars:
    # Default action; override with -e "ceph_action=<action>".
    ceph_action: status

  pre_tasks:
    # Stop early on an unknown action so the role never sees it.
    - name: "CEPH | Validate action"
      run_once: true
      when: ceph_action not in ['status', 'set_noout', 'clear_noout', 'check_health']
      ansible.builtin.fail:
        msg: >-
          Invalid ceph_action '{{ ceph_action }}'.
          Must be one of: status, set_noout, clear_noout, check_health.

    # One log line naming the action; client_name is optional.
    - name: "CEPH | Log action"
      run_once: true
      ansible.builtin.debug:
        msg: "CEPH action: {{ ceph_action }} on {{ client_name | default('cluster') }}"

  roles:
    - role: proxmox_ceph

View File

@@ -0,0 +1,49 @@
---
# =============================================================================
# proxmox_config_backup.yml
# Backs up critical Proxmox configuration files from all nodes.
#
# Captures:
# /etc/pve — cluster config, VM configs, storage, users, certs
# /etc/network — network interfaces
# /etc/hosts — hostname resolution
# /etc/hostname — node name
# /etc/apt/ — apt sources (so repos can be restored)
#
# NOTE: /etc/pve contains sensitive files (SSL keys, shadow.cfg, API tokens).
# Local and SFTP destinations are supported. Git destination is a
# TODO pending a secure encryption strategy for sensitive files.
#
# Usage:
# # Backup all nodes (local)
# ansible-playbook proxmox_config_backup.yml
#
# # Backup to SFTP
# ansible-playbook proxmox_config_backup.yml \
# -e "backup_destination=sftp backup_sftp_host=backup.example.com backup_sftp_user=ansible"
#
# # Backup a single node
# ansible-playbook proxmox_config_backup.yml --limit pm-node-01
# =============================================================================
# Play: back up Proxmox configuration from every targeted node, one node at a
# time, by including the proxmox_config_backup role with `current_node` set to
# the host being processed.
- name: "Proxmox | Config Backup"
  hosts: proxmox_cluster
  gather_facts: true
  serial: 1  # Back up one node at a time to avoid SFTP conflicts
  vars:
    backup_destination: local
    backup_local_dir: /var/backups/proxmox-config
    backup_local_keep: 10
  tasks:
    - name: "Backup | Run config backup for {{ inventory_hostname }}"
      ansible.builtin.include_role:
        name: proxmox_config_backup
      vars:
        current_node: "{{ inventory_hostname }}"

    # With `serial: 1` every host is its own batch and `run_once` fires once
    # per batch, so a run_once summary would be printed after each node.
    # Gate on the last targeted host instead so the message appears only
    # after the final node has been processed.
    - name: "Backup | All nodes complete"
      ansible.builtin.debug:
        msg: "✓ Config backup complete for all nodes in {{ client_name | default('cluster') }}."
      when: inventory_hostname == (ansible_play_hosts_all | last)

50
playbooks/proxmox_ha.yml Normal file
View File

@@ -0,0 +1,50 @@
---
# =============================================================================
# proxmox_ha.yml
# HA group membership and maintenance mode management.
#
# Actions:
# status — show current HA status for all nodes and services
# disable — put a node into HA maintenance mode (VMs migrate away)
# enable — take a node out of HA maintenance mode (resume normal HA)
#
# Usage:
# # Check HA status
# ansible-playbook proxmox_ha.yml -e "ha_action=status"
#
# # Put node into maintenance before work
# ansible-playbook proxmox_ha.yml -e "ha_action=disable ha_target_node=pm-node-01"
#
# # Resume HA after work is complete
# ansible-playbook proxmox_ha.yml -e "ha_action=enable ha_target_node=pm-node-01"
# =============================================================================
# Play: run one HA action (`ha_action`) through the proxmox_ha role.
# The role receives the node to act on as `current_node`.
- name: "Proxmox | HA Management"
  hosts: proxmox_cluster
  gather_facts: false

  vars:
    ha_action: status
    # Falls back to the host being processed when no target is passed.
    ha_target_node: "{{ inventory_hostname }}"

  pre_tasks:
    # Reject anything other than the three supported actions.
    - name: "HA | Validate action"
      run_once: true
      when: ha_action not in ['status', 'disable', 'enable']
      ansible.builtin.fail:
        msg: >-
          Invalid ha_action '{{ ha_action }}'.
          Must be one of: status, disable, enable.

    # The node is only included in the log line for disable/enable.
    - name: "HA | Log action"
      run_once: true
      ansible.builtin.debug:
        msg: >-
          HA {{ ha_action }} —
          client={{ client_name | default('Unknown') }}
          {% if ha_action in ['disable', 'enable'] %}node={{ ha_target_node }}{% endif %}

  roles:
    - role: proxmox_ha
      vars:
        current_node: "{{ ha_target_node }}"

View File

@@ -0,0 +1,371 @@
---
# =============================================================================
# proxmox_migrate_vms.yml
# Flexible VM migration playbook supporting four modes:
#
# drain — move all VMs off a specific node (pre-maintenance)
# rebalance — redistribute VMs evenly across all online nodes by resources
# restore — return VMs to their origin nodes using a drain state file
# targeted — migrate specific VMIDs or tagged VMs to a specified target
#
# Usage examples:
# # Drain a node before maintenance
# ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=drain migrate_source_node=pm-node-01"
#
# # Rebalance the cluster
# ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=rebalance"
#
# # Restore VMs to origin after maintenance
# ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=restore migrate_source_node=pm-node-01"
#
# # Migrate specific VMIDs to a target node
# (lists must be passed as JSON — key=value extra vars are always strings)
# ansible-playbook proxmox_migrate_vms.yml -e '{"migrate_mode": "targeted", "migrate_vmids": [100, 101], "migrate_target_node": "pm-node-02"}'
#
# # Migrate VMs by tag
# (lists must be passed as JSON — key=value extra vars are always strings)
# ansible-playbook proxmox_migrate_vms.yml -e '{"migrate_mode": "targeted", "migrate_tags": ["win11"], "migrate_target_node": "pm-node-02"}'
# =============================================================================
# Play: migrate guests according to `migrate_mode`.
#   drain / restore — delegated to the proxmox_drain / proxmox_restore roles
#   rebalance       — plan and execute moves from high-memory-load nodes to
#                     the least-loaded node
#   targeted        — move the VMs selected by VMID or tag to one target node
# The play runs once (play-level run_once); API calls are delegated to
# localhost and qm/pct commands to the node that currently hosts the guest.
- name: "Proxmox | Migrate VMs"
  hosts: proxmox_cluster
  gather_facts: true
  run_once: true
  vars:
    # Mode: drain | rebalance | restore | targeted
    migrate_mode: drain
    # Source node (required for drain and restore modes)
    migrate_source_node: ""
    # Target node (required for targeted mode, optional for drain)
    migrate_target_node: ""
    # Targeted mode filters
    migrate_vmids: []  # list of VMIDs to migrate
    migrate_tags: []   # list of tags to match
    # Rebalance threshold — don't migrate if imbalance is below this % of total memory
    rebalance_threshold_pct: 10
    # Shared drain role vars
    drain_target_strategy: "{{ 'explicit' if migrate_target_node != '' else 'resources' }}"
    drain_target_node: "{{ migrate_target_node }}"
    drain_state_dir: "/tmp/proxmox_drain_state"
    # Restore vars
    restore_state_dir: "/tmp/proxmox_drain_state"

  pre_tasks:
    - name: "Migrate | Validate mode"
      ansible.builtin.fail:
        msg: >-
          Invalid migrate_mode '{{ migrate_mode }}'.
          Must be one of: drain, rebalance, restore, targeted.
      when: migrate_mode not in ['drain', 'rebalance', 'restore', 'targeted']

    - name: "Migrate | Validate drain — source node required"
      ansible.builtin.fail:
        msg: "migrate_source_node is required for drain mode."
      when:
        - migrate_mode == 'drain'
        - migrate_source_node == ''

    - name: "Migrate | Validate restore — source node required"
      ansible.builtin.fail:
        msg: "migrate_source_node is required for restore mode."
      when:
        - migrate_mode == 'restore'
        - migrate_source_node == ''

    - name: "Migrate | Validate targeted — VMIDs or tags required"
      ansible.builtin.fail:
        msg: "migrate_vmids or migrate_tags must be set for targeted mode."
      when:
        - migrate_mode == 'targeted'
        - migrate_vmids | length == 0
        - migrate_tags | length == 0

    - name: "Migrate | Log operation"
      ansible.builtin.debug:
        msg: >-
          Proxmox VM migration —
          client={{ client_name | default('Unknown') }}
          mode={{ migrate_mode }}
          {% if migrate_source_node != '' %}source={{ migrate_source_node }}{% endif %}
          {% if migrate_target_node != '' %}target={{ migrate_target_node }}{% endif %}
          {% if migrate_vmids | length > 0 %}vmids={{ migrate_vmids }}{% endif %}
          {% if migrate_tags | length > 0 %}tags={{ migrate_tags }}{% endif %}

  roles:
    - role: proxmox_preflight

  tasks:
    # ── DRAIN mode ─────────────────────────────────────────────────────────────
    - name: "Migrate | DRAIN mode"
      ansible.builtin.include_role:
        name: proxmox_drain
      vars:
        current_node: "{{ migrate_source_node }}"
      when: migrate_mode == 'drain'

    # ── RESTORE mode ───────────────────────────────────────────────────────────
    - name: "Migrate | RESTORE mode"
      ansible.builtin.include_role:
        name: proxmox_restore
      vars:
        current_node: "{{ migrate_source_node }}"
      when: migrate_mode == 'restore'

    # ── REBALANCE mode ─────────────────────────────────────────────────────────
    - name: "Migrate | REBALANCE | Get all node info"
      community.proxmox.proxmox_node_info:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port | default(8006) }}"
        validate_certs: "{{ validate_certs | default(false) }}"
      register: rebalance_nodes
      delegate_to: localhost
      when: migrate_mode == 'rebalance'

    - name: "Migrate | REBALANCE | Get all VM info per node"
      community.proxmox.proxmox_vm_info:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port | default(8006) }}"
        validate_certs: "{{ validate_certs | default(false) }}"
        node: "{{ item.node }}"
      loop: >-
        {{ rebalance_nodes.proxmox_nodes
           | selectattr('status', 'equalto', 'online')
           | list }}
      loop_control:
        label: "{{ item.node }}"
      register: rebalance_vms_per_node
      delegate_to: localhost
      when: migrate_mode == 'rebalance'

    # Builds one record per online node (memory figures come from the node
    # info, guest list from the per-node VM query, templates excluded), sorted
    # by memory load, highest first.
    # The {%- ... -%} whitespace control makes the rendered value begin with
    # the list literal itself, so it is converted back into a list rather than
    # being kept as a whitespace-prefixed string.
    - name: "Migrate | REBALANCE | Calculate node loads"
      ansible.builtin.set_fact:
        rebalance_node_loads: >-
          {%- set loads = [] -%}
          {%- for result in rebalance_vms_per_node.results -%}
          {%- set node_name = result.item.node -%}
          {%- set node_info = rebalance_nodes.proxmox_nodes
                | selectattr('node', 'equalto', node_name)
                | first -%}
          {%- set guests = result.proxmox_vms
                | rejectattr('template', 'equalto', true)
                | list -%}
          {%- set _ = loads.append({
                'node': node_name,
                'used_mem': node_info.mem,
                'max_mem': node_info.maxmem,
                'free_mem': node_info.maxmem - node_info.mem,
                'load_pct': (node_info.mem / node_info.maxmem * 100) | round(1),
                'vm_count': guests | length,
                'vms': guests
              }) -%}
          {%- endfor -%}
          {{ loads | sort(attribute='load_pct', reverse=true) }}
      delegate_to: localhost
      when: migrate_mode == 'rebalance'

    - name: "Migrate | REBALANCE | Log current distribution"
      ansible.builtin.debug:
        msg: >-
          Current cluster load:
          {% for n in rebalance_node_loads %}
          {{ n.node }}: {{ n.load_pct }}% memory used, {{ n.vm_count }} VMs
          {% endfor %}
      when: migrate_mode == 'rebalance'

    # A non-stopped guest is planned for migration when its node's load exceeds
    # the cluster average (expressed against that node's capacity) by more than
    # rebalance_threshold_pct, and the least-loaded other node is at least that
    # much lighter.
    # NOTE(review): node loads are not updated as moves are planned, so every
    # eligible guest on an overloaded node is sent to the same least-loaded
    # node. This can over-correct on clusters with many guests — confirm this
    # is acceptable or track projected load per node.
    - name: "Migrate | REBALANCE | Build migration plan"
      ansible.builtin.set_fact:
        rebalance_migrations: >-
          {%- set moves = [] -%}
          {%- set loads = rebalance_node_loads | list -%}
          {%- set total_mem = loads | map(attribute='used_mem') | sum -%}
          {%- set avg_mem = total_mem / loads | length -%}
          {%- for vm in (loads | map(attribute='vms') | flatten
                | rejectattr('status', 'equalto', 'stopped')
                | list) -%}
          {%- set src_node = vm.node -%}
          {%- set src_info = loads | selectattr('node', 'equalto', src_node) | first -%}
          {%- if src_info.load_pct | float > (avg_mem / src_info.max_mem * 100 + rebalance_threshold_pct) -%}
          {%- set target = loads
                | rejectattr('node', 'equalto', src_node)
                | sort(attribute='load_pct')
                | first -%}
          {%- if target.load_pct | float < src_info.load_pct | float - rebalance_threshold_pct -%}
          {%- set _ = moves.append({
                'vmid': vm.vmid,
                'name': vm.name,
                'type': vm.type,
                'status': vm.status,
                'from': src_node,
                'to': target.node
              }) -%}
          {%- endif -%}
          {%- endif -%}
          {%- endfor -%}
          {{ moves }}
      delegate_to: localhost
      when: migrate_mode == 'rebalance'

    - name: "Migrate | REBALANCE | Log migration plan"
      ansible.builtin.debug:
        msg: >-
          Rebalance plan ({{ rebalance_migrations | length }} migration(s)):
          {% if rebalance_migrations | length == 0 %}
          Cluster is already balanced within {{ rebalance_threshold_pct }}% threshold — no migrations needed.
          {% else %}
          {% for m in rebalance_migrations %}
          {{ m.name }} (VMID {{ m.vmid }}) {{ m.from }} → {{ m.to }}
          {% endfor %}
          {% endif %}
      when: migrate_mode == 'rebalance'

    # qm runs on the node that currently hosts the guest; --online is added
    # only for running guests.
    - name: "Migrate | REBALANCE | Execute KVM migrations"
      ansible.builtin.command: >
        qm migrate {{ item.vmid }} {{ item.to }}
        {% if item.status == 'running' %}--online{% endif %}
        --with-local-disks 0
      loop: "{{ rebalance_migrations | selectattr('type', 'equalto', 'qemu') | list }}"
      loop_control:
        label: "{{ item.name }} ({{ item.from }} → {{ item.to }})"
      changed_when: true
      delegate_to: "{{ item.from }}"
      when:
        - migrate_mode == 'rebalance'
        - rebalance_migrations | length > 0

    - name: "Migrate | REBALANCE | Execute LXC migrations"
      ansible.builtin.command: >
        pct migrate {{ item.vmid }} {{ item.to }} --restart --timeout 120
      loop: "{{ rebalance_migrations | selectattr('type', 'equalto', 'lxc') | list }}"
      loop_control:
        label: "{{ item.name | default(item.vmid) }} ({{ item.from }} → {{ item.to }})"
      changed_when: true
      delegate_to: "{{ item.from }}"
      when:
        - migrate_mode == 'rebalance'
        - rebalance_migrations | length > 0

    - name: "Migrate | REBALANCE | Complete"
      ansible.builtin.debug:
        msg: >-
          ✓ Rebalance complete —
          {{ rebalance_migrations | length }} VM(s) redistributed.
      when: migrate_mode == 'rebalance'

    # ── TARGETED mode ──────────────────────────────────────────────────────────
    - name: "Migrate | TARGETED | Get all VMs"
      community.proxmox.proxmox_vm_info:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port | default(8006) }}"
        validate_certs: "{{ validate_certs | default(false) }}"
      register: targeted_all_vms
      delegate_to: localhost
      when: migrate_mode == 'targeted'

    - name: "Migrate | TARGETED | Filter VMs by VMID"
      ansible.builtin.set_fact:
        targeted_vms: >-
          {{ targeted_all_vms.proxmox_vms
             | selectattr('vmid', 'in', migrate_vmids)
             | list }}
      delegate_to: localhost
      when:
        - migrate_mode == 'targeted'
        - migrate_vmids | length > 0

    # The tag filter is only used when no VMIDs were given.
    # Tags are matched as whole, regex-escaped tokens so that e.g. 'win1' does
    # not select guests tagged 'win11'.
    # NOTE(review): assumes `tags` is a single string delimited by ';', ','
    # or spaces — confirm against the module's return format.
    - name: "Migrate | TARGETED | Filter VMs by tag"
      ansible.builtin.set_fact:
        targeted_vms: >-
          {{ targeted_all_vms.proxmox_vms
             | selectattr('tags', 'defined')
             | selectattr('tags', 'search', targeted_tag_pattern)
             | list }}
      vars:
        targeted_tag_pattern: "(^|[;, ])({{ migrate_tags | map('string') | map('regex_escape') | join('|') }})($|[;, ])"
      delegate_to: localhost
      when:
        - migrate_mode == 'targeted'
        - migrate_tags | length > 0
        - migrate_vmids | length == 0

    - name: "Migrate | TARGETED | Resolve target node"
      ansible.builtin.set_fact:
        targeted_resolved_target: "{{ migrate_target_node }}"
      when:
        - migrate_mode == 'targeted'
        - migrate_target_node != ''

    # No explicit target: pick the online node with the lowest `mem` value.
    - name: "Migrate | TARGETED | Auto-select target by resources"
      block:
        - name: "Migrate | TARGETED | Get node resources"
          community.proxmox.proxmox_node_info:
            api_host: "{{ api_host }}"
            api_user: "{{ api_user }}"
            api_token_id: "{{ api_token_id }}"
            api_token_secret: "{{ api_token_secret }}"
            api_port: "{{ api_port | default(8006) }}"
            validate_certs: "{{ validate_certs | default(false) }}"
          register: targeted_nodes
          delegate_to: localhost

        - name: "Migrate | TARGETED | Pick best target"
          ansible.builtin.set_fact:
            targeted_resolved_target: >-
              {{ (targeted_nodes.proxmox_nodes
                  | selectattr('status', 'equalto', 'online')
                  | sort(attribute='mem')
                  | first).node }}
          delegate_to: localhost
      when:
        - migrate_mode == 'targeted'
        - migrate_target_node == ''

    # A guest that already lives on the target (likely when the target is
    # auto-selected) cannot be migrated to its own node; drop it so the
    # migrate commands below do not fail on it.
    - name: "Migrate | TARGETED | Skip VMs already on the target node"
      ansible.builtin.set_fact:
        targeted_vms: >-
          {{ targeted_vms
             | rejectattr('node', 'equalto', targeted_resolved_target)
             | list }}
      when: migrate_mode == 'targeted'

    - name: "Migrate | TARGETED | Log plan"
      ansible.builtin.debug:
        msg: >-
          Targeted migration: {{ targeted_vms | length }} VM(s) → {{ targeted_resolved_target }}
          VMIDs: {{ targeted_vms | map(attribute='vmid') | list }}
      when: migrate_mode == 'targeted'

    - name: "Migrate | TARGETED | Migrate KVM VMs"
      ansible.builtin.command: >
        qm migrate {{ item.vmid }} {{ targeted_resolved_target }}
        {% if item.status == 'running' %}--online{% endif %}
        --with-local-disks 0
      loop: "{{ targeted_vms | selectattr('type', 'equalto', 'qemu') | list }}"
      loop_control:
        label: "{{ item.name }} (VMID {{ item.vmid }}) → {{ targeted_resolved_target }}"
      changed_when: true
      delegate_to: "{{ item.node }}"
      when: migrate_mode == 'targeted'

    - name: "Migrate | TARGETED | Migrate LXC containers"
      ansible.builtin.command: >
        pct migrate {{ item.vmid }} {{ targeted_resolved_target }} --restart --timeout 120
      loop: "{{ targeted_vms | selectattr('type', 'equalto', 'lxc') | list }}"
      loop_control:
        label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }}) → {{ targeted_resolved_target }}"
      changed_when: true
      delegate_to: "{{ item.node }}"
      when: migrate_mode == 'targeted'

    - name: "Migrate | TARGETED | Complete"
      ansible.builtin.debug:
        msg: >-
          ✓ Targeted migration complete —
          {{ targeted_vms | length }} VM(s) moved to {{ targeted_resolved_target }}.
      when: migrate_mode == 'targeted'

View File

@@ -0,0 +1,75 @@
---
# =============================================================================
# proxmox_reboot.yml
# Controlled rolling reboot of Proxmox cluster nodes.
# Drains guests before rebooting, waits for rejoin, optionally restores.
#
# Use cases:
# - Apply kernel updates that require a reboot
# - Scheduled maintenance reboots
# - Hardware changes requiring a restart
#
# Variables:
# reboot_order — ordered list of nodes to reboot (default: upgrade_order)
# reboot_reason — logged message explaining the reboot
# migration_restore — return VMs to origin after reboot (default: false)
# drain_target_strategy — resources | explicit (default: resources)
#
# Usage:
# # Rolling reboot all nodes
# ansible-playbook proxmox_reboot.yml
#
# # Reboot a single node
# (lists must be passed as JSON — key=value extra vars are always strings)
# ansible-playbook proxmox_reboot.yml -e '{"reboot_order": ["pm-node-02"]}'
#
# # Reboot and restore VMs to origin
# ansible-playbook proxmox_reboot.yml -e "migration_restore=true"
# =============================================================================
# Play: reboot Proxmox nodes in a controlled order.
# In cluster mode each entry of `reboot_order` is handed to
# tasks/proxmox_reboot_node_loop.yml as `current_node`; otherwise the host is
# rebooted directly. `proxmox_is_cluster` is not set in this play — presumably
# provided by the proxmox_preflight role (verify).
- name: "Proxmox | Controlled Rolling Reboot"
  hosts: proxmox_cluster
  gather_facts: true
  run_once: true

  vars:
    # Node order: upgrade_order when defined, else the sorted group members.
    reboot_order: "{{ upgrade_order | default(groups['proxmox_cluster'] | sort) }}"
    reboot_reason: "Scheduled maintenance reboot"
    migration_restore: false
    # Timing knobs consumed by the reboot task and the per-node task file.
    reboot_timeout: 600
    node_rejoin_timeout: 300
    node_rejoin_retries: 30
    node_rejoin_delay: 10

  pre_tasks:
    - name: "Reboot | Log operation"
      ansible.builtin.debug:
        msg: >-
          Proxmox rolling reboot —
          client={{ client_name | default('Unknown') }}
          nodes={{ reboot_order | join(', ') }}
          reason={{ reboot_reason }}
          restore={{ migration_restore }}

  roles:
    - role: proxmox_preflight

  tasks:
    # Cluster: one pass of the per-node task file for every listed node.
    - name: "Reboot | Rolling reboot — cluster mode"
      when: proxmox_is_cluster
      ansible.builtin.include_tasks: tasks/proxmox_reboot_node_loop.yml
      loop: "{{ reboot_order }}"
      loop_control:
        label: "{{ current_node }}"
        loop_var: current_node

    # Standalone: nothing to drain, reboot the host itself.
    - name: "Reboot | Standalone | Reboot node"
      when: not proxmox_is_cluster
      ansible.builtin.reboot:
        msg: "{{ reboot_reason }}"
        reboot_timeout: "{{ reboot_timeout }}"
        pre_reboot_delay: 5
        post_reboot_delay: 15

    - name: "Reboot | Complete"
      ansible.builtin.debug:
        msg: "✓ Rolling reboot complete for {{ client_name | default('cluster') }}."

View File

@@ -0,0 +1,298 @@
---
# =============================================================================
# proxmox_snapshot.yml
# Pre/post maintenance VM snapshot management.
#
# Actions:
# create — snapshot all running VMs across the cluster before maintenance
# verify — verify snapshots exist and are readable
# cleanup — remove snapshots older than snapshot_max_age_hours
# rollback — rollback a specific VMID to its most recent automation snapshot
#
# Snapshots are named with a consistent prefix for easy identification and cleanup:
# auto_pre_<date>_<time>
#
# Usage:
# # Snapshot all running VMs before upgrade
# ansible-playbook proxmox_snapshot.yml -e "snapshot_action=create"
#
# # Verify snapshots exist
# ansible-playbook proxmox_snapshot.yml -e "snapshot_action=verify"
#
# # Clean up snapshots older than 48 hours
# ansible-playbook proxmox_snapshot.yml -e "snapshot_action=cleanup snapshot_max_age_hours=48"
#
# # Rollback a specific VM
# ansible-playbook proxmox_snapshot.yml -e "snapshot_action=rollback snapshot_rollback_vmid=100"
# =============================================================================
# Play: create, verify, clean up or roll back automation snapshots of QEMU
# guests. The guest list is built from the Proxmox API (delegated to
# localhost); qm commands are delegated to the node hosting each guest.
# Automation snapshots are named <snapshot_prefix>_<YYYYMMDD>_<HHMM>.
- name: "Proxmox | VM Snapshot Management"
  hosts: proxmox_cluster
  gather_facts: true
  run_once: true
  vars:
    snapshot_action: create
    snapshot_prefix: "auto_pre"
    snapshot_description: "Pre-maintenance snapshot — managed by ansible-msp"
    snapshot_max_age_hours: 72
    snapshot_include_ram: false  # include RAM state in snapshot (slower, more disk)
    snapshot_target_vmids: []    # empty = all running VMs
    snapshot_exclude_tags:
      - nosnap
      - nosnapshot
    snapshot_rollback_vmid: ""   # required for rollback action
    # API connection: api_port / validate_certs are deliberately NOT redefined
    # here. A play var written in terms of itself
    # (api_port: "{{ api_port | default(8006) }}") is a recursive template;
    # the defaults are applied where the values are used instead.

  pre_tasks:
    - name: "Snapshot | Validate action"
      ansible.builtin.fail:
        msg: >-
          Invalid snapshot_action '{{ snapshot_action }}'.
          Must be one of: create, verify, cleanup, rollback.
      when: snapshot_action not in ['create', 'verify', 'cleanup', 'rollback']

    - name: "Snapshot | Validate rollback — VMID required"
      ansible.builtin.fail:
        msg: "snapshot_rollback_vmid is required for rollback action."
      when:
        - snapshot_action == 'rollback'
        - snapshot_rollback_vmid == ''

    # Name = prefix + date (dashes removed) + hour and minute from gathered facts.
    - name: "Snapshot | Set snapshot name"
      ansible.builtin.set_fact:
        snapshot_name: "{{ snapshot_prefix }}_{{ ansible_date_time.date | replace('-','') }}_{{ ansible_date_time.hour }}{{ ansible_date_time.minute }}"
      when: snapshot_action == 'create'

    - name: "Snapshot | Log operation"
      ansible.builtin.debug:
        msg: >-
          Snapshot {{ snapshot_action }} —
          client={{ client_name | default('Unknown') }}
          {% if snapshot_action == 'create' %}name={{ snapshot_name }}{% endif %}
          {% if snapshot_action == 'cleanup' %}max_age={{ snapshot_max_age_hours }}h{% endif %}
          {% if snapshot_action == 'rollback' %}vmid={{ snapshot_rollback_vmid }}{% endif %}

  roles:
    - role: proxmox_preflight

  tasks:
    # ── Get all VMs ────────────────────────────────────────────────────────────
    - name: "Snapshot | Get all node info"
      community.proxmox.proxmox_node_info:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port | default(8006) }}"
        validate_certs: "{{ validate_certs | default(false) }}"
      register: snapshot_nodes
      delegate_to: localhost

    - name: "Snapshot | Get all VMs per node"
      community.proxmox.proxmox_vm_info:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port | default(8006) }}"
        validate_certs: "{{ validate_certs | default(false) }}"
        node: "{{ item.node }}"
      loop: >-
        {{ snapshot_nodes.proxmox_nodes
           | selectattr('status', 'equalto', 'online')
           | list }}
      loop_control:
        label: "{{ item.node }}"
      register: snapshot_vms_per_node
      delegate_to: localhost

    # Only non-template QEMU guests are considered from here on.
    - name: "Snapshot | Build VM list"
      ansible.builtin.set_fact:
        snapshot_all_vms: >-
          {{ snapshot_vms_per_node.results
             | map(attribute='proxmox_vms')
             | flatten
             | rejectattr('template', 'equalto', true)
             | selectattr('type', 'equalto', 'qemu')
             | list }}
      delegate_to: localhost

    - name: "Snapshot | Filter by VMID list"
      ansible.builtin.set_fact:
        snapshot_target_vms: >-
          {{ snapshot_all_vms
             | selectattr('vmid', 'in', snapshot_target_vmids)
             | list }}
      when: snapshot_target_vmids | length > 0
      delegate_to: localhost

    # Running guests without tags, plus running guests whose tags do not match
    # any entry of snapshot_exclude_tags.
    - name: "Snapshot | Filter running VMs (no VMID filter)"
      ansible.builtin.set_fact:
        snapshot_target_vms: >-
          {{ (snapshot_all_vms
              | selectattr('status', 'equalto', 'running')
              | rejectattr('tags', 'defined')
              | list)
             + (snapshot_all_vms
                | selectattr('status', 'equalto', 'running')
                | selectattr('tags', 'defined')
                | rejectattr('tags', 'search', snapshot_exclude_tags | join('|'))
                | list) }}
      when: snapshot_target_vmids | length == 0
      delegate_to: localhost

    # ── CREATE ─────────────────────────────────────────────────────────────────
    - name: "Snapshot | CREATE | Log plan"
      ansible.builtin.debug:
        msg: >-
          Creating snapshot '{{ snapshot_name }}' for
          {{ snapshot_target_vms | length }} VM(s):
          {{ snapshot_target_vms | map(attribute='name') | list }}
      when: snapshot_action == 'create'

    - name: "Snapshot | CREATE | Take snapshots"
      community.proxmox.proxmox_snap:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port | default(8006) }}"
        validate_certs: "{{ validate_certs | default(false) }}"
        vmid: "{{ item.vmid }}"
        snapname: "{{ snapshot_name }}"
        description: "{{ snapshot_description }}"
        vmstate: "{{ snapshot_include_ram }}"
        state: present
      loop: "{{ snapshot_target_vms }}"
      loop_control:
        label: "{{ item.name }} (VMID {{ item.vmid }}) on {{ item.node }}"
      delegate_to: localhost
      when: snapshot_action == 'create'

    - name: "Snapshot | CREATE | Complete"
      ansible.builtin.debug:
        msg: "✓ Snapshots created: '{{ snapshot_name }}' on {{ snapshot_target_vms | length }} VM(s)."
      when: snapshot_action == 'create'

    # ── VERIFY ─────────────────────────────────────────────────────────────────
    - name: "Snapshot | VERIFY | Check snapshots exist"
      ansible.builtin.command: >
        qm listsnapshot {{ item.vmid }}
      loop: "{{ snapshot_target_vms }}"
      loop_control:
        label: "{{ item.name }} (VMID {{ item.vmid }})"
      register: snapshot_verify_results
      changed_when: false
      delegate_to: "{{ item.node }}"
      when: snapshot_action == 'verify'

    # A guest "has" a snapshot when the prefix appears anywhere in the listing.
    - name: "Snapshot | VERIFY | Report"
      ansible.builtin.debug:
        msg: >-
          {{ item.item.name }} (VMID {{ item.item.vmid }}):
          {{ 'HAS snapshot' if snapshot_prefix in item.stdout else 'NO automation snapshot found' }}
      loop: "{{ snapshot_verify_results.results | default([]) }}"
      loop_control:
        label: "{{ item.item.name | default(item.item.vmid) }}"
      when: snapshot_action == 'verify'

    # ── CLEANUP ────────────────────────────────────────────────────────────────
    # Deletes snapshots whose name-encoded timestamp is older than the cutoff.
    #  - runs under bash: the ${var:offset:length} expansions are not POSIX sh
    #  - names that are not exactly <prefix>_<YYYYMMDD>_<HHMM>, or whose
    #    timestamp cannot be parsed, are skipped rather than treated as
    #    infinitely old and deleted
    #  - the HHMM part is included so an hours-based cutoff is honoured
    #  - `changed` is reported only when something was actually removed
    - name: "Snapshot | CLEANUP | Remove old snapshots"
      ansible.builtin.shell: |
        cutoff=$(date -d "{{ snapshot_max_age_hours }} hours ago" +%s)
        prefix="{{ snapshot_prefix }}_"
        for snap in $(qm listsnapshot {{ item.vmid }} 2>/dev/null | grep "{{ snapshot_prefix }}" | awk '{print $2}'); do
          stamp="${snap#"$prefix"}"
          if [ "$stamp" = "$snap" ] || ! [[ "$stamp" =~ ^[0-9]{8}_[0-9]{4}$ ]]; then
            continue
          fi
          snap_epoch=$(date -d "${stamp:0:4}-${stamp:4:2}-${stamp:6:2} ${stamp:9:2}:${stamp:11:2}" +%s 2>/dev/null) || continue
          if [ "$snap_epoch" -lt "$cutoff" ]; then
            echo "Removing snapshot: $snap from VMID {{ item.vmid }}"
            qm delsnapshot {{ item.vmid }} "$snap"
          fi
        done
      args:
        executable: /bin/bash
      loop: "{{ snapshot_target_vms }}"
      loop_control:
        label: "{{ item.name }} (VMID {{ item.vmid }})"
      register: snapshot_cleanup_result
      changed_when: "'Removing snapshot' in snapshot_cleanup_result.stdout"
      delegate_to: "{{ item.node }}"
      when: snapshot_action == 'cleanup'

    - name: "Snapshot | CLEANUP | Complete"
      ansible.builtin.debug:
        msg: "✓ Snapshot cleanup complete — removed snapshots older than {{ snapshot_max_age_hours }} hours."
      when: snapshot_action == 'cleanup'

    # ── ROLLBACK ───────────────────────────────────────────────────────────────
    # Resolve the hosting node once; empty string when the VMID is not in the
    # guest list built above.
    - name: "Snapshot | ROLLBACK | Locate VM"
      ansible.builtin.set_fact:
        snapshot_rollback_node: >-
          {{ snapshot_all_vms
             | selectattr('vmid', 'equalto', snapshot_rollback_vmid | int)
             | map(attribute='node')
             | first
             | default('') }}
      when: snapshot_action == 'rollback'

    - name: "Snapshot | ROLLBACK | Fail if VM not found"
      ansible.builtin.fail:
        msg: >-
          VMID {{ snapshot_rollback_vmid }} was not found among the QEMU guests
          on the online nodes.
      when:
        - snapshot_action == 'rollback'
        - snapshot_rollback_node == ''

    # Names sort chronologically because of the fixed timestamp format, so the
    # first entry of a reverse sort is the newest automation snapshot.
    - name: "Snapshot | ROLLBACK | Find most recent automation snapshot"
      ansible.builtin.shell: >
        qm listsnapshot {{ snapshot_rollback_vmid }} 2>/dev/null
        | grep "{{ snapshot_prefix }}"
        | awk '{print $2}'
        | sort -r
        | head -1
      register: snapshot_rollback_name
      changed_when: false
      delegate_to: "{{ snapshot_rollback_node | default(inventory_hostname) }}"
      when: snapshot_action == 'rollback'

    - name: "Snapshot | ROLLBACK | Fail if no snapshot found"
      ansible.builtin.fail:
        msg: >-
          No automation snapshot found for VMID {{ snapshot_rollback_vmid }}.
          Run snapshot_action=create first.
      when:
        - snapshot_action == 'rollback'
        - snapshot_rollback_name.stdout | trim == ''

    - name: "Snapshot | ROLLBACK | Stop VM before rollback"
      community.proxmox.proxmox_kvm:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port | default(8006) }}"
        validate_certs: "{{ validate_certs | default(false) }}"
        vmid: "{{ snapshot_rollback_vmid }}"
        state: stopped
        force: true
        timeout: 60
      delegate_to: localhost
      when: snapshot_action == 'rollback'

    - name: "Snapshot | ROLLBACK | Execute rollback"
      ansible.builtin.command: >
        qm rollback {{ snapshot_rollback_vmid }} {{ snapshot_rollback_name.stdout | trim }}
      changed_when: true
      delegate_to: "{{ snapshot_rollback_node | default(inventory_hostname) }}"
      when: snapshot_action == 'rollback'

    - name: "Snapshot | ROLLBACK | Start VM after rollback"
      community.proxmox.proxmox_kvm:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port | default(8006) }}"
        validate_certs: "{{ validate_certs | default(false) }}"
        vmid: "{{ snapshot_rollback_vmid }}"
        state: started
      delegate_to: localhost
      when: snapshot_action == 'rollback'

    - name: "Snapshot | ROLLBACK | Complete"
      ansible.builtin.debug:
        msg: >-
          ✓ VMID {{ snapshot_rollback_vmid }} rolled back to
          '{{ snapshot_rollback_name.stdout | trim }}'.
      when: snapshot_action == 'rollback'

View File

@@ -0,0 +1,23 @@
---
# =============================================================================
# proxmox_status.yml
# Cluster health report — nodes, VMs, storage, CEPH, HA, updates.
# Safe to run at any time with no side effects.
#
# Usage:
# ansible-playbook proxmox_status.yml
# ansible-playbook proxmox_status.yml -e "status_include_ceph=false"
# =============================================================================
# Play: run the proxmox_status role against the cluster hosts.
# The status_include_* switches are passed to the role as play vars; all
# default to true and can be overridden with -e.
- name: "Proxmox | Cluster Status Report"
  hosts: proxmox_cluster
  gather_facts: true

  vars:
    status_include_vms: true
    status_include_storage: true
    status_include_ceph: true
    status_include_ha: true

  roles:
    - role: proxmox_status

View File

@@ -1,43 +1,83 @@
---
# =============================================================================
# proxmox_upgrade.yml
# =============================================================================
# Rolling Proxmox cluster upgrade playbook.
# Runs on the first node in upgrade_order — all other nodes are handled
# via API calls and delegate_to from within the role.
# Rolling Proxmox upgrade orchestrator.
#
# Usage:
# ansible-playbook playbooks/proxmox_upgrade.yml \
# -i inventories/client_local_eng/hypervisor_hosts.yml
# Workflow per node (cluster mode):
# 1. Backup config
# 2. Set CEPH noout (if CEPH enabled)
# 3. Enable HA maintenance mode
# 4. Drain guests to best available node
# 5. apt dist-upgrade
# 6. Reboot if required, wait for rejoin
# 7. Clear CEPH noout
# 8. Disable HA maintenance mode
# 9. Restore guests (if migration_restore=true)
#
# Override migration behaviour:
# -e migration_bulk=true
# -e live_migrate_fallback=skip
# -e migration_restore=true
# Standalone mode skips all cluster/HA/CEPH/drain steps.
#
# Dry run (check mode — no changes):
# --check
# Variables (set in inventory or pass with -e):
# upgrade_order — ordered list of nodes to upgrade (cluster only)
# migration_restore — return VMs to origin node after upgrade (default: false)
# drain_target_strategy — resources | explicit (default: resources)
# backup_destination — local | sftp (default: local)
# =============================================================================
# Play: rolling upgrade orchestrator.
# Cluster mode hands each entry of `upgrade_order` to
# tasks/proxmox_upgrade_node_loop.yml as `current_node`; standalone mode backs
# up and upgrades the host directly. `proxmox_is_cluster` is not set in this
# play — presumably provided by the proxmox_preflight role (verify).
- name: "Proxmox Rolling Upgrade"
  hosts: proxmox_cluster
  gather_facts: true
  # Play runs once — loops over nodes internally.
  # `serial` is intentionally not set: with `serial: 1` every host forms its
  # own batch and run_once applies per batch, which would repeat the whole
  # node loop once per host.
  run_once: true

  pre_tasks:
    # Defaults are applied with set_fact instead of play vars: play vars take
    # precedence over inventory variables, so defining upgrade_order or
    # migration_restore under `vars:` would silently override the inventory
    # values this playbook is documented to accept.
    - name: "Upgrade | Resolve defaults"
      ansible.builtin.set_fact:
        upgrade_order: "{{ upgrade_order | default(groups['proxmox_cluster'] | sort) }}"
        migration_restore: "{{ migration_restore | default(false) | bool }}"

    - name: "Upgrade | Confirm upgrade_order is defined"
      ansible.builtin.fail:
        msg: "upgrade_order must be defined in inventory or passed with -e"
      when: upgrade_order is not defined or upgrade_order | length == 0

    - name: "Upgrade | Log targets"
      ansible.builtin.debug:
        msg: >-
          Proxmox upgrade starting for {{ client_name | default('Unknown') }}
          ({{ client_id | default('?') }})
          Nodes: {{ upgrade_order | join(', ') }}
          API: https://{{ api_host }}:{{ api_port | default(8006) }}

  roles:
    - role: proxmox_preflight

  tasks:
    # ── Cluster: loop through each node ────────────────────────────────────────
    - name: "Upgrade | Rolling upgrade — cluster mode"
      ansible.builtin.include_tasks: tasks/proxmox_upgrade_node_loop.yml
      loop: "{{ upgrade_order }}"
      loop_control:
        loop_var: current_node
        label: "{{ current_node }}"
      when: proxmox_is_cluster

    # ── Standalone: upgrade this host directly ────────────────────────────────
    - name: "Upgrade | Standalone | Backup config"
      ansible.builtin.include_role:
        name: proxmox_config_backup
      vars:
        current_node: "{{ inventory_hostname }}"
      when: not proxmox_is_cluster

    - name: "Upgrade | Standalone | Run upgrade"
      ansible.builtin.include_role:
        name: proxmox_upgrade_node
      vars:
        current_node: "{{ inventory_hostname }}"
      when: not proxmox_is_cluster

    - name: "Upgrade | Complete"
      ansible.builtin.debug:
        msg: >-
          ✓ Proxmox upgrade complete for
          {{ client_name | default('Unknown') }} —
          {{ upgrade_order | length }} node(s) upgraded.

View File

@@ -0,0 +1,82 @@
---
# =============================================================================
# proxmox_reboot_node_loop.yml
# Called once per node by proxmox_reboot.yml
#
# Variables read:
#   current_node                          — node being rebooted
#   reboot_timeout, reboot_reason         — passed to ansible.builtin.reboot
#   api_host, api_user, api_token_id,
#   api_token_secret, api_port,
#   validate_certs                        — Proxmox API access for rejoin check
#   node_rejoin_retries, node_rejoin_delay — rejoin polling budget
#   migration_restore                     — whether to run proxmox_restore
# =============================================================================

- name: "Reboot | {{ current_node }} | Start"
  ansible.builtin.debug:
    msg: "━━━ Starting reboot of node {{ current_node }} ━━━"

# ── Prepare: noout, HA, drain ─────────────────────────────────────────────────
- name: "Reboot | {{ current_node }} | Set CEPH noout"
  ansible.builtin.include_role:
    name: proxmox_ceph
  vars:
    ceph_action: set_noout

# NOTE(review): "enable maintenance" maps to ha_action=disable; the exact
# meaning is defined by the proxmox_ha role — confirm there.
- name: "Reboot | {{ current_node }} | Enable HA maintenance"
  ansible.builtin.include_role:
    name: proxmox_ha
  vars:
    ha_action: disable

- name: "Reboot | {{ current_node }} | Drain guests"
  ansible.builtin.include_role:
    name: proxmox_drain

# ── Reboot and wait ───────────────────────────────────────────────────────────
# Delegated so the reboot hits current_node rather than the host that is
# driving the loop.
- name: "Reboot | {{ current_node }} | Reboot"
  ansible.builtin.reboot:
    reboot_timeout: "{{ reboot_timeout }}"
    msg: "{{ reboot_reason }}"
    pre_reboot_delay: 5
    post_reboot_delay: 15
  delegate_to: "{{ current_node }}"

# Poll the API from the controller until current_node is listed as online.
# default([]) keeps the retry loop alive when a poll fails outright (e.g. the
# API is not answering yet) and the result therefore has no proxmox_nodes
# key; without it the undefined attribute would abort the task immediately.
- name: "Reboot | {{ current_node }} | Wait for cluster rejoin"
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    api_port: "{{ api_port | default(8006) }}"
    validate_certs: "{{ validate_certs | default(false) }}"
  register: reboot_rejoin_check
  delegate_to: localhost
  until: >-
    reboot_rejoin_check.proxmox_nodes
    | default([])
    | selectattr('node', 'equalto', current_node)
    | selectattr('status', 'equalto', 'online')
    | list
    | length > 0
  retries: "{{ node_rejoin_retries }}"
  delay: "{{ node_rejoin_delay }}"

- name: "Reboot | {{ current_node }} | Back online"
  ansible.builtin.debug:
    msg: "✓ Node {{ current_node }} has rejoined the cluster after reboot."

# ── Undo preparation ──────────────────────────────────────────────────────────
- name: "Reboot | {{ current_node }} | Clear CEPH noout"
  ansible.builtin.include_role:
    name: proxmox_ceph
  vars:
    ceph_action: clear_noout

- name: "Reboot | {{ current_node }} | Disable HA maintenance"
  ansible.builtin.include_role:
    name: proxmox_ha
  vars:
    ha_action: enable

# ── Optional guest restore ────────────────────────────────────────────────────
# Exactly one of the next two tasks runs, depending on migration_restore.
- name: "Reboot | {{ current_node }} | Restore guests"
  ansible.builtin.include_role:
    name: proxmox_restore
  when: migration_restore | bool

- name: "Reboot | {{ current_node }} | Skip restore"
  ansible.builtin.debug:
    msg: "migration_restore=false — guests remain on their current nodes."
  when: not migration_restore | bool

- name: "Reboot | {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: "━━━ Reboot complete: {{ current_node }} ━━━"

View File

@@ -0,0 +1,68 @@
---
# =============================================================================
# proxmox_upgrade_node_loop.yml
# Included by proxmox_upgrade.yml, one invocation per node.
# Runs the complete cluster-mode upgrade sequence for the node named in
# current_node; migration_restore decides whether guests are moved back.
# =============================================================================

- name: "Node {{ current_node }} | Start"
  ansible.builtin.debug:
    msg: "━━━ Starting upgrade of node {{ current_node }} ━━━"

# ---- 1. Configuration backup ------------------------------------------------
- name: "Node {{ current_node }} | Step 1 — Backup config"
  ansible.builtin.include_role:
    name: proxmox_config_backup

# ---- 2. CEPH: set noout -----------------------------------------------------
- name: "Node {{ current_node }} | Step 2 — Set CEPH noout"
  vars:
    ceph_action: set_noout
  ansible.builtin.include_role:
    name: proxmox_ceph

# ---- 3. HA: enter maintenance ------------------------------------------------
- name: "Node {{ current_node }} | Step 3 — Enable HA maintenance"
  vars:
    ha_action: disable
  ansible.builtin.include_role:
    name: proxmox_ha

# ---- 4. Move guests off the node ---------------------------------------------
- name: "Node {{ current_node }} | Step 4 — Drain guests"
  ansible.builtin.include_role:
    name: proxmox_drain

# ---- 5. Package upgrade ------------------------------------------------------
- name: "Node {{ current_node }} | Step 5 — Upgrade packages"
  ansible.builtin.include_role:
    name: proxmox_upgrade_node

# ---- 6. CEPH: clear noout ----------------------------------------------------
- name: "Node {{ current_node }} | Step 6 — Clear CEPH noout"
  vars:
    ceph_action: clear_noout
  ansible.builtin.include_role:
    name: proxmox_ceph

# ---- 7. HA: leave maintenance ------------------------------------------------
- name: "Node {{ current_node }} | Step 7 — Disable HA maintenance"
  vars:
    ha_action: enable
  ansible.builtin.include_role:
    name: proxmox_ha

# ---- 8. Guest restore (only when migration_restore is truthy) ----------------
- name: "Node {{ current_node }} | Step 8 — Restore guests"
  when: migration_restore | bool
  ansible.builtin.include_role:
    name: proxmox_restore

# Counterpart of step 8: just records that the restore was skipped.
- name: "Node {{ current_node }} | Skip restore"
  when: not migration_restore | bool
  ansible.builtin.debug:
    msg: "migration_restore=false — guests remain on their current nodes."

- name: "Node {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: "━━━ Node {{ current_node }} upgrade complete ━━━"