testing new proxmox logic
This commit is contained in:
45
playbooks/proxmox_ceph.yml
Normal file
45
playbooks/proxmox_ceph.yml
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_ceph.yml
|
||||||
|
# CEPH management playbook for Proxmox clusters.
|
||||||
|
#
|
||||||
|
# Actions:
|
||||||
|
# status — report current CEPH health and OSD state
|
||||||
|
# set_noout — set noout flag before node maintenance
|
||||||
|
# clear_noout — clear noout flag and wait for HEALTH_OK after maintenance
|
||||||
|
# check_health — wait for CEPH to reach HEALTH_OK or HEALTH_WARN
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# # Check current status
|
||||||
|
# ansible-playbook proxmox_ceph.yml -e "ceph_action=status"
|
||||||
|
#
|
||||||
|
# # Set noout before maintenance
|
||||||
|
# ansible-playbook proxmox_ceph.yml -e "ceph_action=set_noout"
|
||||||
|
#
|
||||||
|
# # Clear noout after node comes back online
|
||||||
|
# ansible-playbook proxmox_ceph.yml -e "ceph_action=clear_noout"
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
- name: "Proxmox | CEPH Management"
|
||||||
|
hosts: proxmox_cluster
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
vars:
|
||||||
|
ceph_action: status
|
||||||
|
|
||||||
|
pre_tasks:
|
||||||
|
- name: "CEPH | Validate action"
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: >-
|
||||||
|
Invalid ceph_action '{{ ceph_action }}'.
|
||||||
|
Must be one of: status, set_noout, clear_noout, check_health.
|
||||||
|
when: ceph_action not in ['status', 'set_noout', 'clear_noout', 'check_health']
|
||||||
|
run_once: true
|
||||||
|
|
||||||
|
- name: "CEPH | Log action"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "CEPH action: {{ ceph_action }} on {{ client_name | default('cluster') }}"
|
||||||
|
run_once: true
|
||||||
|
|
||||||
|
roles:
|
||||||
|
- role: proxmox_ceph
|
||||||
49
playbooks/proxmox_config_backup.yml
Normal file
49
playbooks/proxmox_config_backup.yml
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_config_backup.yml
|
||||||
|
# Backs up critical Proxmox configuration files from all nodes.
|
||||||
|
#
|
||||||
|
# Captures:
|
||||||
|
# /etc/pve — cluster config, VM configs, storage, users, certs
|
||||||
|
# /etc/network — network interfaces
|
||||||
|
# /etc/hosts — hostname resolution
|
||||||
|
# /etc/hostname — node name
|
||||||
|
# /etc/apt/ — apt sources (so repos can be restored)
|
||||||
|
#
|
||||||
|
# NOTE: /etc/pve contains sensitive files (SSL keys, shadow.cfg, API tokens).
|
||||||
|
# Local and SFTP destinations are supported. Git destination is a
|
||||||
|
# TODO pending a secure encryption strategy for sensitive files.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# # Backup all nodes (local)
|
||||||
|
# ansible-playbook proxmox_config_backup.yml
|
||||||
|
#
|
||||||
|
# # Backup to SFTP
|
||||||
|
# ansible-playbook proxmox_config_backup.yml \
|
||||||
|
# -e "backup_destination=sftp backup_sftp_host=backup.example.com backup_sftp_user=ansible"
|
||||||
|
#
|
||||||
|
# # Backup a single node
|
||||||
|
# ansible-playbook proxmox_config_backup.yml --limit pm-node-01
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
- name: "Proxmox | Config Backup"
|
||||||
|
hosts: proxmox_cluster
|
||||||
|
gather_facts: true
|
||||||
|
serial: 1 # Back up one node at a time to avoid SFTP conflicts
|
||||||
|
|
||||||
|
vars:
|
||||||
|
backup_destination: local
|
||||||
|
backup_local_dir: /var/backups/proxmox-config
|
||||||
|
backup_local_keep: 10
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: "Backup | Run config backup for {{ inventory_hostname }}"
|
||||||
|
ansible.builtin.include_role:
|
||||||
|
name: proxmox_config_backup
|
||||||
|
vars:
|
||||||
|
current_node: "{{ inventory_hostname }}"
|
||||||
|
|
||||||
|
# Announce completion once, after the final node has been processed.
# This play runs with `serial: 1`, and `run_once` is evaluated once per serial
# batch, so it would print the "all nodes" message after every single node.
# Gating on the last host of the play prints it only at the very end.
- name: "Backup | All nodes complete"
  ansible.builtin.debug:
    msg: "✓ Config backup complete for all nodes in {{ client_name | default('cluster') }}."
  when: inventory_hostname == (ansible_play_hosts_all | last)
|
||||||
50
playbooks/proxmox_ha.yml
Normal file
50
playbooks/proxmox_ha.yml
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_ha.yml
|
||||||
|
# HA group membership and maintenance mode management.
|
||||||
|
#
|
||||||
|
# Actions:
|
||||||
|
# status — show current HA status for all nodes and services
|
||||||
|
# disable — put a node into HA maintenance mode (VMs migrate away)
|
||||||
|
# enable — take a node out of HA maintenance mode (resume normal HA)
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# # Check HA status
|
||||||
|
# ansible-playbook proxmox_ha.yml -e "ha_action=status"
|
||||||
|
#
|
||||||
|
# # Put node into maintenance before work
|
||||||
|
# ansible-playbook proxmox_ha.yml -e "ha_action=disable ha_target_node=pm-node-01"
|
||||||
|
#
|
||||||
|
# # Resume HA after work is complete
|
||||||
|
# ansible-playbook proxmox_ha.yml -e "ha_action=enable ha_target_node=pm-node-01"
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
- name: "Proxmox | HA Management"
|
||||||
|
hosts: proxmox_cluster
|
||||||
|
gather_facts: false
|
||||||
|
|
||||||
|
vars:
|
||||||
|
ha_action: status
|
||||||
|
ha_target_node: "{{ inventory_hostname }}"
|
||||||
|
|
||||||
|
pre_tasks:
|
||||||
|
- name: "HA | Validate action"
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: >-
|
||||||
|
Invalid ha_action '{{ ha_action }}'.
|
||||||
|
Must be one of: status, disable, enable.
|
||||||
|
when: ha_action not in ['status', 'disable', 'enable']
|
||||||
|
run_once: true
|
||||||
|
|
||||||
|
- name: "HA | Log action"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: >-
|
||||||
|
HA {{ ha_action }} —
|
||||||
|
client={{ client_name | default('Unknown') }}
|
||||||
|
{% if ha_action in ['disable', 'enable'] %}node={{ ha_target_node }}{% endif %}
|
||||||
|
run_once: true
|
||||||
|
|
||||||
|
roles:
|
||||||
|
- role: proxmox_ha
|
||||||
|
vars:
|
||||||
|
current_node: "{{ ha_target_node }}"
|
||||||
371
playbooks/proxmox_migrate_vms.yml
Normal file
371
playbooks/proxmox_migrate_vms.yml
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_migrate_vms.yml
|
||||||
|
# Flexible VM migration playbook supporting four modes:
|
||||||
|
#
|
||||||
|
# drain — move all VMs off a specific node (pre-maintenance)
|
||||||
|
# rebalance — redistribute VMs evenly across all online nodes by resources
|
||||||
|
# restore — return VMs to their origin nodes using a drain state file
|
||||||
|
# targeted — migrate specific VMIDs or tagged VMs to a specified target
|
||||||
|
#
|
||||||
|
# Usage examples:
|
||||||
|
# # Drain a node before maintenance
|
||||||
|
# ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=drain migrate_source_node=pm-node-01"
|
||||||
|
#
|
||||||
|
# # Rebalance the cluster
|
||||||
|
# ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=rebalance"
|
||||||
|
#
|
||||||
|
# # Restore VMs to origin after maintenance
|
||||||
|
# ansible-playbook proxmox_migrate_vms.yml -e "migrate_mode=restore migrate_source_node=pm-node-01"
|
||||||
|
#
|
||||||
|
# # Migrate specific VMIDs to a target node
|
||||||
|
# ansible-playbook proxmox_migrate_vms.yml -e '{"migrate_mode": "targeted", "migrate_vmids": [100, 101], "migrate_target_node": "pm-node-02"}'
|
||||||
|
#
|
||||||
|
# # Migrate VMs by tag
|
||||||
|
# ansible-playbook proxmox_migrate_vms.yml -e '{"migrate_mode": "targeted", "migrate_tags": ["win11"], "migrate_target_node": "pm-node-02"}'
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
- name: "Proxmox | Migrate VMs"
|
||||||
|
hosts: proxmox_cluster
|
||||||
|
gather_facts: true
|
||||||
|
run_once: true
|
||||||
|
|
||||||
|
vars:
|
||||||
|
# Mode: drain | rebalance | restore | targeted
|
||||||
|
migrate_mode: drain
|
||||||
|
|
||||||
|
# Source node (required for drain and restore modes)
|
||||||
|
migrate_source_node: ""
|
||||||
|
|
||||||
|
# Target node (optional — when empty, drain and targeted modes pick a target by resources)
|
||||||
|
migrate_target_node: ""
|
||||||
|
|
||||||
|
# Targeted mode filters
|
||||||
|
migrate_vmids: [] # list of VMIDs to migrate
|
||||||
|
migrate_tags: [] # list of tags to match
|
||||||
|
|
||||||
|
# Rebalance threshold — don't migrate if imbalance is below this % of total memory
|
||||||
|
rebalance_threshold_pct: 10
|
||||||
|
|
||||||
|
# Shared drain role vars
|
||||||
|
drain_target_strategy: "{{ 'explicit' if migrate_target_node != '' else 'resources' }}"
|
||||||
|
drain_target_node: "{{ migrate_target_node }}"
|
||||||
|
drain_state_dir: "/tmp/proxmox_drain_state"
|
||||||
|
|
||||||
|
# Restore vars
|
||||||
|
restore_state_dir: "/tmp/proxmox_drain_state"
|
||||||
|
|
||||||
|
pre_tasks:
|
||||||
|
- name: "Migrate | Validate mode"
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: >-
|
||||||
|
Invalid migrate_mode '{{ migrate_mode }}'.
|
||||||
|
Must be one of: drain, rebalance, restore, targeted.
|
||||||
|
when: migrate_mode not in ['drain', 'rebalance', 'restore', 'targeted']
|
||||||
|
|
||||||
|
- name: "Migrate | Validate drain — source node required"
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: "migrate_source_node is required for drain mode."
|
||||||
|
when:
|
||||||
|
- migrate_mode == 'drain'
|
||||||
|
- migrate_source_node == ''
|
||||||
|
|
||||||
|
- name: "Migrate | Validate restore — source node required"
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: "migrate_source_node is required for restore mode."
|
||||||
|
when:
|
||||||
|
- migrate_mode == 'restore'
|
||||||
|
- migrate_source_node == ''
|
||||||
|
|
||||||
|
- name: "Migrate | Validate targeted — VMIDs or tags required"
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: "migrate_vmids or migrate_tags must be set for targeted mode."
|
||||||
|
when:
|
||||||
|
- migrate_mode == 'targeted'
|
||||||
|
- migrate_vmids | length == 0
|
||||||
|
- migrate_tags | length == 0
|
||||||
|
|
||||||
|
- name: "Migrate | Log operation"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: >-
|
||||||
|
Proxmox VM migration —
|
||||||
|
client={{ client_name | default('Unknown') }}
|
||||||
|
mode={{ migrate_mode }}
|
||||||
|
{% if migrate_source_node != '' %}source={{ migrate_source_node }}{% endif %}
|
||||||
|
{% if migrate_target_node != '' %}target={{ migrate_target_node }}{% endif %}
|
||||||
|
{% if migrate_vmids | length > 0 %}vmids={{ migrate_vmids }}{% endif %}
|
||||||
|
{% if migrate_tags | length > 0 %}tags={{ migrate_tags }}{% endif %}
|
||||||
|
|
||||||
|
roles:
|
||||||
|
- role: proxmox_preflight
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
# ── DRAIN mode ─────────────────────────────────────────────────────────────
|
||||||
|
- name: "Migrate | DRAIN mode"
|
||||||
|
ansible.builtin.include_role:
|
||||||
|
name: proxmox_drain
|
||||||
|
vars:
|
||||||
|
current_node: "{{ migrate_source_node }}"
|
||||||
|
when: migrate_mode == 'drain'
|
||||||
|
|
||||||
|
# ── RESTORE mode ───────────────────────────────────────────────────────────
|
||||||
|
- name: "Migrate | RESTORE mode"
|
||||||
|
ansible.builtin.include_role:
|
||||||
|
name: proxmox_restore
|
||||||
|
vars:
|
||||||
|
current_node: "{{ migrate_source_node }}"
|
||||||
|
when: migrate_mode == 'restore'
|
||||||
|
|
||||||
|
# ── REBALANCE mode ─────────────────────────────────────────────────────────
|
||||||
|
- name: "Migrate | REBALANCE | Get all node info"
|
||||||
|
community.proxmox.proxmox_node_info:
|
||||||
|
api_host: "{{ api_host }}"
|
||||||
|
api_user: "{{ api_user }}"
|
||||||
|
api_token_id: "{{ api_token_id }}"
|
||||||
|
api_token_secret: "{{ api_token_secret }}"
|
||||||
|
api_port: "{{ api_port | default(8006) }}"
|
||||||
|
validate_certs: "{{ validate_certs | default(false) }}"
|
||||||
|
register: rebalance_nodes
|
||||||
|
delegate_to: localhost
|
||||||
|
when: migrate_mode == 'rebalance'
|
||||||
|
|
||||||
|
- name: "Migrate | REBALANCE | Get all VM info per node"
|
||||||
|
community.proxmox.proxmox_vm_info:
|
||||||
|
api_host: "{{ api_host }}"
|
||||||
|
api_user: "{{ api_user }}"
|
||||||
|
api_token_id: "{{ api_token_id }}"
|
||||||
|
api_token_secret: "{{ api_token_secret }}"
|
||||||
|
api_port: "{{ api_port | default(8006) }}"
|
||||||
|
validate_certs: "{{ validate_certs | default(false) }}"
|
||||||
|
node: "{{ item.node }}"
|
||||||
|
loop: >-
|
||||||
|
{{ rebalance_nodes.proxmox_nodes
|
||||||
|
| selectattr('status', 'equalto', 'online')
|
||||||
|
| list }}
|
||||||
|
loop_control:
|
||||||
|
label: "{{ item.node }}"
|
||||||
|
register: rebalance_vms_per_node
|
||||||
|
delegate_to: localhost
|
||||||
|
when: migrate_mode == 'rebalance'
|
||||||
|
|
||||||
|
# Build one load summary per queried node, sorted most-loaded first.
# Each entry holds: node, used_mem, max_mem, free_mem, load_pct (node memory in
# use as a percentage, rounded to one decimal), vm_count and vms. Guests flagged
# as templates are excluded from vm_count and vms.
- name: "Migrate | REBALANCE | Calculate node loads"
  ansible.builtin.set_fact:
    # Whitespace control ({%- ... -%}) strips the padding the folded scalar would
    # otherwise emit around the statements, so the rendered value is just the
    # list expression and can be parsed as a list rather than a padded string.
    rebalance_node_loads: >-
      {%- set loads = [] -%}
      {%- for result in rebalance_vms_per_node.results -%}
      {%-   set node_name = result.item.node -%}
      {%-   set node_info = rebalance_nodes.proxmox_nodes
              | selectattr('node', 'equalto', node_name)
              | first -%}
      {%-   set guests = result.proxmox_vms
              | rejectattr('template', 'equalto', true)
              | list -%}
      {%-   set _ = loads.append({
              'node': node_name,
              'used_mem': node_info.mem,
              'max_mem': node_info.maxmem,
              'free_mem': node_info.maxmem - node_info.mem,
              'load_pct': (node_info.mem / node_info.maxmem * 100) | round(1),
              'vm_count': guests | length,
              'vms': guests
            }) -%}
      {%- endfor -%}
      {{ loads | sort(attribute='load_pct', reverse=true) }}
  delegate_to: localhost
  when: migrate_mode == 'rebalance'
|
||||||
|
|
||||||
|
- name: "Migrate | REBALANCE | Log current distribution"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: >-
|
||||||
|
Current cluster load:
|
||||||
|
{% for n in rebalance_node_loads %}
|
||||||
|
{{ n.node }}: {{ n.load_pct }}% memory used, {{ n.vm_count }} VMs
|
||||||
|
{% endfor %}
|
||||||
|
when: migrate_mode == 'rebalance'
|
||||||
|
|
||||||
|
# Plan guest moves from over-threshold nodes to the least-loaded other node.
# Result: a list of dicts with vmid, name, type, status, from and to.
#
# A node counts as over-threshold when its memory load exceeds the cluster
# average (expressed as a percentage of that node's memory) plus
# rebalance_threshold_pct. A move is planned only while the best target is at
# least rebalance_threshold_pct below the source.
#
# Loads are tracked as *projected* values and updated after every planned move.
# Without that, every running guest on an overloaded node would be sent to the
# same least-loaded node, which only relocates the imbalance.
# NOTE(review): the projection uses each guest's `mem` field (0 if absent) —
# confirm that field reflects the memory that follows the guest on migration.
- name: "Migrate | REBALANCE | Build migration plan"
  ansible.builtin.set_fact:
    # Whitespace control keeps the rendered value to the bare list expression.
    rebalance_migrations: >-
      {%- set moves = [] -%}
      {%- set threshold = rebalance_threshold_pct | float -%}
      {%- set avg_mem = (rebalance_node_loads | map(attribute='used_mem') | sum)
            / ([rebalance_node_loads | length, 1] | max) -%}
      {%- set projected = {} -%}
      {%- for n in rebalance_node_loads -%}
      {%-   set _ = projected.update({n.node: {'used': n.used_mem, 'max': n.max_mem}}) -%}
      {%- endfor -%}
      {%- for src in rebalance_node_loads -%}
      {%-   for vm in src.vms | rejectattr('status', 'equalto', 'stopped') | list -%}
      {%-     set src_pct = projected[src.node]['used'] / projected[src.node]['max'] * 100 -%}
      {%-     if src_pct > (avg_mem / src.max_mem * 100 + threshold) -%}
      {%-       set best = namespace(node='', pct=0) -%}
      {%-       for name, usage in projected.items() -%}
      {%-         set pct = usage['used'] / usage['max'] * 100 -%}
      {%-         if name != src.node and (best.node == '' or pct < best.pct) -%}
      {%-           set best.node = name -%}
      {%-           set best.pct = pct -%}
      {%-         endif -%}
      {%-       endfor -%}
      {%-       if best.node != '' and best.pct < (src_pct - threshold) -%}
      {%-         set _ = moves.append({
                    'vmid': vm.vmid,
                    'name': vm.name,
                    'type': vm.type,
                    'status': vm.status,
                    'from': src.node,
                    'to': best.node
                  }) -%}
      {%-         set vm_mem = vm.mem | default(0) | float -%}
      {%-         set _ = projected[src.node].update({'used': projected[src.node]['used'] - vm_mem}) -%}
      {%-         set _ = projected[best.node].update({'used': projected[best.node]['used'] + vm_mem}) -%}
      {%-       endif -%}
      {%-     endif -%}
      {%-   endfor -%}
      {%- endfor -%}
      {{ moves }}
  delegate_to: localhost
  when: migrate_mode == 'rebalance'
|
||||||
|
|
||||||
|
- name: "Migrate | REBALANCE | Log migration plan"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: >-
|
||||||
|
Rebalance plan ({{ rebalance_migrations | length }} migration(s)):
|
||||||
|
{% if rebalance_migrations | length == 0 %}
|
||||||
|
Cluster is already balanced within {{ rebalance_threshold_pct }}% threshold — no migrations needed.
|
||||||
|
{% else %}
|
||||||
|
{% for m in rebalance_migrations %}
|
||||||
|
{{ m.name }} (VMID {{ m.vmid }}) {{ m.from }} → {{ m.to }}
|
||||||
|
{% endfor %}
|
||||||
|
{% endif %}
|
||||||
|
when: migrate_mode == 'rebalance'
|
||||||
|
|
||||||
|
- name: "Migrate | REBALANCE | Execute KVM migrations"
|
||||||
|
ansible.builtin.command: >
|
||||||
|
qm migrate {{ item.vmid }} {{ item.to }}
|
||||||
|
{% if item.status == 'running' %}--online{% endif %}
|
||||||
|
--with-local-disks 0
|
||||||
|
loop: "{{ rebalance_migrations | selectattr('type', 'equalto', 'qemu') | list }}"
|
||||||
|
loop_control:
|
||||||
|
label: "{{ item.name }} ({{ item.from }} → {{ item.to }})"
|
||||||
|
changed_when: true
|
||||||
|
delegate_to: "{{ item.from }}"
|
||||||
|
when:
|
||||||
|
- migrate_mode == 'rebalance'
|
||||||
|
- rebalance_migrations | length > 0
|
||||||
|
|
||||||
|
- name: "Migrate | REBALANCE | Execute LXC migrations"
|
||||||
|
ansible.builtin.command: >
|
||||||
|
pct migrate {{ item.vmid }} {{ item.to }} --restart --timeout 120
|
||||||
|
loop: "{{ rebalance_migrations | selectattr('type', 'equalto', 'lxc') | list }}"
|
||||||
|
loop_control:
|
||||||
|
label: "{{ item.name | default(item.vmid) }} ({{ item.from }} → {{ item.to }})"
|
||||||
|
changed_when: true
|
||||||
|
delegate_to: "{{ item.from }}"
|
||||||
|
when:
|
||||||
|
- migrate_mode == 'rebalance'
|
||||||
|
- rebalance_migrations | length > 0
|
||||||
|
|
||||||
|
- name: "Migrate | REBALANCE | Complete"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: >-
|
||||||
|
✓ Rebalance complete —
|
||||||
|
{{ rebalance_migrations | length }} VM(s) redistributed.
|
||||||
|
when: migrate_mode == 'rebalance'
|
||||||
|
|
||||||
|
# ── TARGETED mode ──────────────────────────────────────────────────────────
|
||||||
|
- name: "Migrate | TARGETED | Get all VMs"
|
||||||
|
community.proxmox.proxmox_vm_info:
|
||||||
|
api_host: "{{ api_host }}"
|
||||||
|
api_user: "{{ api_user }}"
|
||||||
|
api_token_id: "{{ api_token_id }}"
|
||||||
|
api_token_secret: "{{ api_token_secret }}"
|
||||||
|
api_port: "{{ api_port | default(8006) }}"
|
||||||
|
validate_certs: "{{ validate_certs | default(false) }}"
|
||||||
|
register: targeted_all_vms
|
||||||
|
delegate_to: localhost
|
||||||
|
when: migrate_mode == 'targeted'
|
||||||
|
|
||||||
|
- name: "Migrate | TARGETED | Filter VMs by VMID"
|
||||||
|
ansible.builtin.set_fact:
|
||||||
|
targeted_vms: >-
|
||||||
|
{{ targeted_all_vms.proxmox_vms
|
||||||
|
| selectattr('vmid', 'in', migrate_vmids)
|
||||||
|
| list }}
|
||||||
|
delegate_to: localhost
|
||||||
|
when:
|
||||||
|
- migrate_mode == 'targeted'
|
||||||
|
- migrate_vmids | length > 0
|
||||||
|
|
||||||
|
# Select the guests that carry at least one of the requested tags.
# Runs only in targeted mode when tags are given and no explicit VMID list is.
- name: "Migrate | TARGETED | Filter VMs by tag"
  ansible.builtin.set_fact:
    # Each requested tag is regex-escaped and anchored on tag separators, so
    # "win1" no longer matches a guest tagged "win11" and characters such as
    # "." or "+" in a tag are taken literally.
    # NOTE(review): assumes `tags` is a single string delimited by ';', ',' or
    # spaces — confirm against the proxmox_vm_info output in use.
    targeted_vms: >-
      {{ targeted_all_vms.proxmox_vms
         | selectattr('tags', 'defined')
         | selectattr('tags', 'search',
                      '(^|[;, ])(' ~ (migrate_tags | map('regex_escape') | join('|')) ~ ')($|[;, ])')
         | list }}
  delegate_to: localhost
  when:
    - migrate_mode == 'targeted'
    - migrate_tags | length > 0
    - migrate_vmids | length == 0
|
||||||
|
|
||||||
|
- name: "Migrate | TARGETED | Resolve target node"
|
||||||
|
ansible.builtin.set_fact:
|
||||||
|
targeted_resolved_target: "{{ migrate_target_node }}"
|
||||||
|
when:
|
||||||
|
- migrate_mode == 'targeted'
|
||||||
|
- migrate_target_node != ''
|
||||||
|
|
||||||
|
- name: "Migrate | TARGETED | Auto-select target by resources"
|
||||||
|
block:
|
||||||
|
- name: "Migrate | TARGETED | Get node resources"
|
||||||
|
community.proxmox.proxmox_node_info:
|
||||||
|
api_host: "{{ api_host }}"
|
||||||
|
api_user: "{{ api_user }}"
|
||||||
|
api_token_id: "{{ api_token_id }}"
|
||||||
|
api_token_secret: "{{ api_token_secret }}"
|
||||||
|
api_port: "{{ api_port | default(8006) }}"
|
||||||
|
validate_certs: "{{ validate_certs | default(false) }}"
|
||||||
|
register: targeted_nodes
|
||||||
|
delegate_to: localhost
|
||||||
|
|
||||||
|
- name: "Migrate | TARGETED | Pick best target"
|
||||||
|
ansible.builtin.set_fact:
|
||||||
|
targeted_resolved_target: >-
|
||||||
|
{{ (targeted_nodes.proxmox_nodes
|
||||||
|
| selectattr('status', 'equalto', 'online')
|
||||||
|
| sort(attribute='mem')
|
||||||
|
| first).node }}
|
||||||
|
delegate_to: localhost
|
||||||
|
when:
|
||||||
|
- migrate_mode == 'targeted'
|
||||||
|
- migrate_target_node == ''
|
||||||
|
|
||||||
|
- name: "Migrate | TARGETED | Log plan"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: >-
|
||||||
|
Targeted migration: {{ targeted_vms | length }} VM(s) → {{ targeted_resolved_target }}
|
||||||
|
VMIDs: {{ targeted_vms | map(attribute='vmid') | list }}
|
||||||
|
when: migrate_mode == 'targeted'
|
||||||
|
|
||||||
|
- name: "Migrate | TARGETED | Migrate KVM VMs"
|
||||||
|
ansible.builtin.command: >
|
||||||
|
qm migrate {{ item.vmid }} {{ targeted_resolved_target }}
|
||||||
|
{% if item.status == 'running' %}--online{% endif %}
|
||||||
|
--with-local-disks 0
|
||||||
|
loop: "{{ targeted_vms | selectattr('type', 'equalto', 'qemu') | list }}"
|
||||||
|
loop_control:
|
||||||
|
label: "{{ item.name }} (VMID {{ item.vmid }}) → {{ targeted_resolved_target }}"
|
||||||
|
changed_when: true
|
||||||
|
delegate_to: "{{ item.node }}"
|
||||||
|
when: migrate_mode == 'targeted'
|
||||||
|
|
||||||
|
- name: "Migrate | TARGETED | Migrate LXC containers"
|
||||||
|
ansible.builtin.command: >
|
||||||
|
pct migrate {{ item.vmid }} {{ targeted_resolved_target }} --restart --timeout 120
|
||||||
|
loop: "{{ targeted_vms | selectattr('type', 'equalto', 'lxc') | list }}"
|
||||||
|
loop_control:
|
||||||
|
label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }}) → {{ targeted_resolved_target }}"
|
||||||
|
changed_when: true
|
||||||
|
delegate_to: "{{ item.node }}"
|
||||||
|
when: migrate_mode == 'targeted'
|
||||||
|
|
||||||
|
- name: "Migrate | TARGETED | Complete"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: >-
|
||||||
|
✓ Targeted migration complete —
|
||||||
|
{{ targeted_vms | length }} VM(s) moved to {{ targeted_resolved_target }}.
|
||||||
|
when: migrate_mode == 'targeted'
|
||||||
75
playbooks/proxmox_reboot.yml
Normal file
75
playbooks/proxmox_reboot.yml
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_reboot.yml
|
||||||
|
# Controlled rolling reboot of Proxmox cluster nodes.
|
||||||
|
# Drains guests before rebooting, waits for rejoin, optionally restores.
|
||||||
|
#
|
||||||
|
# Use cases:
|
||||||
|
# - Apply kernel updates that require a reboot
|
||||||
|
# - Scheduled maintenance reboots
|
||||||
|
# - Hardware changes requiring a restart
|
||||||
|
#
|
||||||
|
# Variables:
|
||||||
|
# reboot_order — ordered list of nodes to reboot (default: upgrade_order)
|
||||||
|
# reboot_reason — logged message explaining the reboot
|
||||||
|
# migration_restore — return VMs to origin after reboot (default: false)
|
||||||
|
# drain_target_strategy — resources | explicit (default: resources)
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# # Rolling reboot all nodes
|
||||||
|
# ansible-playbook proxmox_reboot.yml
|
||||||
|
#
|
||||||
|
# # Reboot a single node
|
||||||
|
# ansible-playbook proxmox_reboot.yml -e '{"reboot_order": ["pm-node-02"]}'
|
||||||
|
#
|
||||||
|
# # Reboot and restore VMs to origin
|
||||||
|
# ansible-playbook proxmox_reboot.yml -e "migration_restore=true"
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
- name: "Proxmox | Controlled Rolling Reboot"
|
||||||
|
hosts: proxmox_cluster
|
||||||
|
gather_facts: true
|
||||||
|
run_once: true
|
||||||
|
|
||||||
|
vars:
|
||||||
|
reboot_order: "{{ upgrade_order | default(groups['proxmox_cluster'] | sort) }}"
|
||||||
|
reboot_reason: "Scheduled maintenance reboot"
|
||||||
|
migration_restore: false
|
||||||
|
reboot_timeout: 600
|
||||||
|
node_rejoin_timeout: 300
|
||||||
|
node_rejoin_retries: 30
|
||||||
|
node_rejoin_delay: 10
|
||||||
|
|
||||||
|
pre_tasks:
|
||||||
|
- name: "Reboot | Log operation"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: >-
|
||||||
|
Proxmox rolling reboot —
|
||||||
|
client={{ client_name | default('Unknown') }}
|
||||||
|
nodes={{ reboot_order | join(', ') }}
|
||||||
|
reason={{ reboot_reason }}
|
||||||
|
restore={{ migration_restore }}
|
||||||
|
|
||||||
|
roles:
|
||||||
|
- role: proxmox_preflight
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- name: "Reboot | Rolling reboot — cluster mode"
|
||||||
|
ansible.builtin.include_tasks: tasks/proxmox_reboot_node_loop.yml
|
||||||
|
loop: "{{ reboot_order }}"
|
||||||
|
loop_control:
|
||||||
|
loop_var: current_node
|
||||||
|
label: "{{ current_node }}"
|
||||||
|
when: proxmox_is_cluster
|
||||||
|
|
||||||
|
- name: "Reboot | Standalone | Reboot node"
|
||||||
|
ansible.builtin.reboot:
|
||||||
|
reboot_timeout: "{{ reboot_timeout }}"
|
||||||
|
msg: "{{ reboot_reason }}"
|
||||||
|
pre_reboot_delay: 5
|
||||||
|
post_reboot_delay: 15
|
||||||
|
when: not proxmox_is_cluster
|
||||||
|
|
||||||
|
- name: "Reboot | Complete"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: "✓ Rolling reboot complete for {{ client_name | default('cluster') }}."
|
||||||
298
playbooks/proxmox_snapshot.yml
Normal file
298
playbooks/proxmox_snapshot.yml
Normal file
@@ -0,0 +1,298 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_snapshot.yml
|
||||||
|
# Pre/post maintenance VM snapshot management.
|
||||||
|
#
|
||||||
|
# Actions:
|
||||||
|
# create — snapshot all running VMs across the cluster before maintenance
|
||||||
|
# verify — verify snapshots exist and are readable
|
||||||
|
# cleanup — remove snapshots older than snapshot_max_age_hours
|
||||||
|
# rollback — rollback a specific VMID to its most recent automation snapshot
|
||||||
|
#
|
||||||
|
# Snapshots are named with a consistent prefix for easy identification and cleanup:
|
||||||
|
# auto_pre_<date>_<time>
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# # Snapshot all running VMs before upgrade
|
||||||
|
# ansible-playbook proxmox_snapshot.yml -e "snapshot_action=create"
|
||||||
|
#
|
||||||
|
# # Verify snapshots exist
|
||||||
|
# ansible-playbook proxmox_snapshot.yml -e "snapshot_action=verify"
|
||||||
|
#
|
||||||
|
# # Clean up snapshots older than 48 hours
|
||||||
|
# ansible-playbook proxmox_snapshot.yml -e "snapshot_action=cleanup snapshot_max_age_hours=48"
|
||||||
|
#
|
||||||
|
# # Rollback a specific VM
|
||||||
|
# ansible-playbook proxmox_snapshot.yml -e "snapshot_action=rollback snapshot_rollback_vmid=100"
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Snapshot management play. run_once: one host of proxmox_cluster drives all
# API calls; node-local qm commands are delegated per VM by the tasks below.
- name: "Proxmox | VM Snapshot Management"
  hosts: proxmox_cluster
  gather_facts: true
  run_once: true

  vars:
    snapshot_action: create
    snapshot_prefix: "auto_pre"
    snapshot_description: "Pre-maintenance snapshot — managed by ansible-msp"
    snapshot_max_age_hours: 72
    snapshot_include_ram: false   # include RAM state in snapshot (slower, more disk)
    snapshot_target_vmids: []     # empty = all running VMs
    snapshot_exclude_tags:
      - nosnap
      - nosnapshot
    snapshot_rollback_vmid: ""    # required for rollback action

    # API connection
    # A play var written as "{{ api_port | default(8006) }}" refers to itself
    # and fails with "recursive loop detected in template string". Reading the
    # inventory-level value through hostvars (which does not contain play
    # vars) keeps the same default without the self-reference.
    api_port: "{{ hostvars[inventory_hostname]['api_port'] | default(8006) }}"
    validate_certs: "{{ hostvars[inventory_hostname]['validate_certs'] | default(false) }}"
|
||||||
|
|
||||||
|
pre_tasks:
|
||||||
|
# Reject unknown actions before any API call is made.
- name: "Snapshot | Validate action"
  ansible.builtin.fail:
    msg: >-
      Invalid snapshot_action '{{ snapshot_action }}'.
      Must be one of: create, verify, cleanup, rollback.
  when: snapshot_action not in ['create', 'verify', 'cleanup', 'rollback']

# Rollback needs a usable VMID. The rollback tasks compare it with "| int", so
# an empty or non-numeric value would become 0 and match no VM; refuse it here
# with a clear message instead.
- name: "Snapshot | Validate rollback — VMID required"
  ansible.builtin.fail:
    msg: >-
      snapshot_rollback_vmid is required for rollback action
      and must be a numeric VMID (got '{{ snapshot_rollback_vmid }}').
  when:
    - snapshot_action == 'rollback'
    - (snapshot_rollback_vmid | string | trim) is not match('^[1-9][0-9]*$')

# Name format: <prefix>_<YYYYMMDD>_<HHMM>, built from the gathered date facts.
- name: "Snapshot | Set snapshot name"
  ansible.builtin.set_fact:
    snapshot_name: "{{ snapshot_prefix }}_{{ ansible_date_time.date | replace('-','') }}_{{ ansible_date_time.hour }}{{ ansible_date_time.minute }}"
  when: snapshot_action == 'create'

# Summary line; only the detail relevant to the chosen action is rendered.
- name: "Snapshot | Log operation"
  ansible.builtin.debug:
    msg: >-
      Snapshot {{ snapshot_action }} —
      client={{ client_name | default('Unknown') }}
      {% if snapshot_action == 'create' %}name={{ snapshot_name }}{% endif %}
      {% if snapshot_action == 'cleanup' %}max_age={{ snapshot_max_age_hours }}h{% endif %}
      {% if snapshot_action == 'rollback' %}vmid={{ snapshot_rollback_vmid }}{% endif %}
|
||||||
|
|
||||||
|
roles:
|
||||||
|
- role: proxmox_preflight
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
# ── Get all VMs ────────────────────────────────────────────────────────────
|
||||||
|
# Query the Proxmox API from the controller for the list of nodes.
- name: "Snapshot | Get all node info"
  delegate_to: localhost
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
  register: snapshot_nodes

# One VM query per node whose status is 'online'.
- name: "Snapshot | Get all VMs per node"
  delegate_to: localhost
  community.proxmox.proxmox_vm_info:
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
    node: "{{ item.node }}"
  loop: "{{ snapshot_nodes.proxmox_nodes | selectattr('status', 'equalto', 'online') | list }}"
  loop_control:
    label: "{{ item.node }}"
  register: snapshot_vms_per_node

# Merge the per-node results into one flat list, dropping templates and
# keeping only entries whose type is 'qemu'.
- name: "Snapshot | Build VM list"
  delegate_to: localhost
  ansible.builtin.set_fact:
    snapshot_all_vms: >-
      {{ snapshot_vms_per_node.results | map(attribute='proxmox_vms') | flatten
         | rejectattr('template', 'equalto', true)
         | selectattr('type', 'equalto', 'qemu')
         | list }}
|
||||||
|
|
||||||
|
# Explicit VMID list: take exactly those VMs. The requested IDs are coerced to
# integers because the rollback tasks in this play also compare vmid against
# "| int" values; IDs supplied as strings would otherwise never match.
- name: "Snapshot | Filter by VMID list"
  ansible.builtin.set_fact:
    snapshot_target_vms: >-
      {{ snapshot_all_vms
         | selectattr('vmid', 'in', snapshot_target_vmids | map('int') | list)
         | list }}
  when: snapshot_target_vmids | length > 0
  delegate_to: localhost

# No VMID list: every running VM, minus those carrying an excluded tag.
- name: "Snapshot | Filter running VMs (no VMID filter)"
  vars:
    # Matches an excluded tag only as a whole tag (delimited by start/end of
    # the string or by ';', ',' or space), so 'nosnap' no longer excludes a VM
    # tagged e.g. 'nosnapper'. With an empty exclude list the pattern '(?!)'
    # never matches; the previous join('|') produced '' there, which matches
    # everything and dropped every tagged VM.
    # NOTE(review): assumes 'tags' is a delimited string — confirm against the
    # proxmox_vm_info return value.
    snapshot_exclude_tag_regex: >-
      {{ ('(^|[;, ])(' ~ (snapshot_exclude_tags | map('regex_escape') | join('|')) ~ ')($|[;, ])')
         if (snapshot_exclude_tags | length > 0) else '(?!)' }}
    snapshot_running_vms: "{{ snapshot_all_vms | selectattr('status', 'equalto', 'running') | list }}"
  ansible.builtin.set_fact:
    # Three groups, concatenated: VMs without a tags attribute, VMs whose tags
    # value is empty, and tagged VMs that carry none of the excluded tags.
    snapshot_target_vms: >-
      {{ (snapshot_running_vms | rejectattr('tags', 'defined') | list)
         + (snapshot_running_vms | selectattr('tags', 'defined') | rejectattr('tags') | list)
         + (snapshot_running_vms
            | selectattr('tags', 'defined')
            | selectattr('tags')
            | rejectattr('tags', 'search', snapshot_exclude_tag_regex)
            | list) }}
  when: snapshot_target_vmids | length == 0
  delegate_to: localhost
|
||||||
|
|
||||||
|
# ── CREATE ─────────────────────────────────────────────────────────────────
|
||||||
|
# Announce which VMs are about to be snapshotted.
- name: "Snapshot | CREATE | Log plan"
  when: snapshot_action == 'create'
  ansible.builtin.debug:
    msg: >-
      Creating snapshot '{{ snapshot_name }}' for {{ snapshot_target_vms | length }} VM(s):
      {{ snapshot_target_vms | map(attribute='name') | list }}

# One API call per target VM, issued from the controller.
- name: "Snapshot | CREATE | Take snapshots"
  when: snapshot_action == 'create'
  delegate_to: localhost
  community.proxmox.proxmox_snap:
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
    state: present
    vmid: "{{ item.vmid }}"
    snapname: "{{ snapshot_name }}"
    description: "{{ snapshot_description }}"
    vmstate: "{{ snapshot_include_ram }}"
  loop: "{{ snapshot_target_vms }}"
  loop_control:
    label: "{{ item.name }} (VMID {{ item.vmid }}) on {{ item.node }}"

- name: "Snapshot | CREATE | Complete"
  when: snapshot_action == 'create'
  ansible.builtin.debug:
    msg: "✓ Snapshots created: '{{ snapshot_name }}' on {{ snapshot_target_vms | length }} VM(s)."
|
||||||
|
|
||||||
|
# ── VERIFY ─────────────────────────────────────────────────────────────────
|
||||||
|
# List each target VM's snapshots on the node that hosts it (read-only).
- name: "Snapshot | VERIFY | Check snapshots exist"
  when: snapshot_action == 'verify'
  delegate_to: "{{ item.node }}"
  ansible.builtin.command:
    cmd: "qm listsnapshot {{ item.vmid }}"
  changed_when: false
  register: snapshot_verify_results
  loop: "{{ snapshot_target_vms }}"
  loop_control:
    label: "{{ item.name }} (VMID {{ item.vmid }})"

# A VM counts as covered when snapshot_prefix occurs anywhere in the listing.
- name: "Snapshot | VERIFY | Report"
  when: snapshot_action == 'verify'
  ansible.builtin.debug:
    msg: >-
      {{ item.item.name }} (VMID {{ item.item.vmid }}):
      {{ 'HAS snapshot' if snapshot_prefix in item.stdout else 'NO automation snapshot found' }}
  loop: "{{ snapshot_verify_results.results | default([]) }}"
  loop_control:
    label: "{{ item.item.name | default(item.item.vmid) }}"
|
||||||
|
|
||||||
|
# ── CLEANUP ────────────────────────────────────────────────────────────────
|
||||||
|
# Delete automation snapshots older than snapshot_max_age_hours, per VM, on the
# node hosting the VM.
#  - Runs under bash: the script uses ${var:offset:length} and [[ =~ ]], which
#    the shell module's default /bin/sh does not support on Debian (dash).
#  - Only names of the exact form <prefix>_<YYYYMMDD>_<HHMM> are considered; a
#    name that cannot be parsed is skipped instead of being treated as epoch 0
#    (which made it look infinitely old and got it deleted).
#  - Age is computed from date AND time, so a snapshot is not aged by up to a
#    day through truncation to midnight.
#  - A failed delete does not stop the loop but makes the task fail at the end.
- name: "Snapshot | CLEANUP | Remove old snapshots"
  ansible.builtin.shell: |
    set -u
    rc=0
    cutoff=$(date -d "{{ snapshot_max_age_hours }} hours ago" +%s)
    for snap in $(qm listsnapshot {{ item.vmid }} 2>/dev/null | grep "{{ snapshot_prefix }}" | awk '{print $2}'); do
      if [[ ! "$snap" =~ ^{{ snapshot_prefix }}_([0-9]{8})_([0-9]{4})$ ]]; then
        echo "Skipping snapshot with unexpected name: $snap"
        continue
      fi
      d=${BASH_REMATCH[1]}
      t=${BASH_REMATCH[2]}
      if ! snap_epoch=$(date -d "${d:0:4}-${d:4:2}-${d:6:2} ${t:0:2}:${t:2:2}" +%s 2>/dev/null); then
        echo "Skipping snapshot with unparsable timestamp: $snap"
        continue
      fi
      if [ "$snap_epoch" -lt "$cutoff" ]; then
        echo "Removing snapshot: $snap from VMID {{ item.vmid }}"
        qm delsnapshot {{ item.vmid }} "$snap" || rc=1
      fi
    done
    exit $rc
  args:
    executable: /bin/bash
  loop: "{{ snapshot_target_vms }}"
  loop_control:
    label: "{{ item.name }} (VMID {{ item.vmid }})"
  register: snapshot_cleanup_result
  # Report a change only for VMs where a removal was actually attempted.
  changed_when: "'Removing snapshot' in snapshot_cleanup_result.stdout"
  delegate_to: "{{ item.node }}"
  when: snapshot_action == 'cleanup'

- name: "Snapshot | CLEANUP | Complete"
  ansible.builtin.debug:
    msg: "✓ Snapshot cleanup complete — removed snapshots older than {{ snapshot_max_age_hours }} hours."
  when: snapshot_action == 'cleanup'
|
||||||
|
|
||||||
|
# ── ROLLBACK ───────────────────────────────────────────────────────────────
|
||||||
|
# Resolve once which node hosts the VM. snapshot_all_vms only contains
# non-template QEMU guests from nodes reported as online, so an unknown VMID
# yields an empty string here instead of a "sequence was empty" template error
# inside delegate_to.
- name: "Snapshot | ROLLBACK | Resolve node hosting the VM"
  ansible.builtin.set_fact:
    snapshot_rollback_node: >-
      {{ snapshot_all_vms
         | selectattr('vmid', 'equalto', snapshot_rollback_vmid | int)
         | map(attribute='node')
         | first
         | default('') }}
  when: snapshot_action == 'rollback'

- name: "Snapshot | ROLLBACK | Fail if VM not found"
  ansible.builtin.fail:
    msg: >-
      VMID {{ snapshot_rollback_vmid }} was not found among the QEMU VMs
      on online nodes — cannot roll back.
  when:
    - snapshot_action == 'rollback'
    - snapshot_rollback_node | length == 0

# Newest automation snapshot = last name in reverse lexical order; the
# <YYYYMMDD>_<HHMM> suffix makes lexical order equal chronological order.
- name: "Snapshot | ROLLBACK | Find most recent automation snapshot"
  ansible.builtin.shell: >
    qm listsnapshot {{ snapshot_rollback_vmid }} 2>/dev/null
    | grep "{{ snapshot_prefix }}"
    | awk '{print $2}'
    | sort -r
    | head -1
  register: snapshot_rollback_name
  changed_when: false
  # The default keeps this expression resolvable on non-rollback runs, where
  # the task is skipped and snapshot_rollback_node is never set.
  delegate_to: "{{ snapshot_rollback_node | default(inventory_hostname) }}"
  when: snapshot_action == 'rollback'

- name: "Snapshot | ROLLBACK | Fail if no snapshot found"
  ansible.builtin.fail:
    msg: >-
      No automation snapshot found for VMID {{ snapshot_rollback_vmid }}.
      Run snapshot_action=create first.
  when:
    - snapshot_action == 'rollback'
    - snapshot_rollback_name.stdout | trim == ''

# The VM is force-stopped through the API before the rollback is executed.
- name: "Snapshot | ROLLBACK | Stop VM before rollback"
  community.proxmox.proxmox_kvm:
    api_host: "{{ api_host }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    api_port: "{{ api_port }}"
    validate_certs: "{{ validate_certs }}"
    vmid: "{{ snapshot_rollback_vmid }}"
    state: stopped
    force: true
    timeout: 60
  delegate_to: localhost
  when: snapshot_action == 'rollback'

- name: "Snapshot | ROLLBACK | Execute rollback"
  ansible.builtin.command: >
    qm rollback {{ snapshot_rollback_vmid }} {{ snapshot_rollback_name.stdout | trim }}
  changed_when: true
  delegate_to: "{{ snapshot_rollback_node | default(inventory_hostname) }}"
  when: snapshot_action == 'rollback'

- name: "Snapshot | ROLLBACK | Start VM after rollback"
  community.proxmox.proxmox_kvm:
    api_host: "{{ api_host }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    api_port: "{{ api_port }}"
    validate_certs: "{{ validate_certs }}"
    vmid: "{{ snapshot_rollback_vmid }}"
    state: started
  delegate_to: localhost
  when: snapshot_action == 'rollback'

- name: "Snapshot | ROLLBACK | Complete"
  ansible.builtin.debug:
    msg: >-
      ✓ VMID {{ snapshot_rollback_vmid }} rolled back to
      '{{ snapshot_rollback_name.stdout | trim }}'.
  when: snapshot_action == 'rollback'
|
||||||
23
playbooks/proxmox_status.yml
Normal file
23
playbooks/proxmox_status.yml
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_status.yml
|
||||||
|
# Cluster health report — nodes, VMs, storage, CEPH, HA, updates.
|
||||||
|
# Safe to run at any time with no side effects.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# ansible-playbook proxmox_status.yml
|
||||||
|
# ansible-playbook proxmox_status.yml -e "status_include_ceph=false"
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Reporting play: applies the proxmox_status role to every host in the group.
- name: "Proxmox | Cluster Status Report"
  hosts: proxmox_cluster
  gather_facts: true

  roles:
    - role: proxmox_status

  vars:
    # Report section toggles, all on by default (override with -e).
    # NOTE(review): presumably read by the proxmox_status role — the role is
    # not visible here, confirm there.
    status_include_vms: true
    status_include_storage: true
    status_include_ceph: true
    status_include_ha: true
|
||||||
@@ -1,43 +1,83 @@
|
|||||||
---
|
---
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# proxmox_upgrade.yml
|
# proxmox_upgrade.yml
|
||||||
# =============================================================================
|
# Rolling Proxmox upgrade orchestrator.
|
||||||
# Rolling Proxmox cluster upgrade playbook.
|
|
||||||
# Runs on the first node in upgrade_order — all other nodes are handled
|
|
||||||
# via API calls and delegate_to from within the role.
|
|
||||||
#
|
#
|
||||||
# Usage:
|
# Workflow per node (cluster mode):
|
||||||
# ansible-playbook playbooks/proxmox_upgrade.yml \
|
# 1. Backup config
|
||||||
# -i inventories/client_local_eng/hypervisor_hosts.yml
|
# 2. Set CEPH noout (if CEPH enabled)
|
||||||
|
# 3. Enable HA maintenance mode
|
||||||
|
# 4. Drain guests to best available node
|
||||||
|
# 5. apt dist-upgrade
|
||||||
|
# 6. Reboot if required, wait for rejoin
|
||||||
|
# 7. Clear CEPH noout
|
||||||
|
# 8. Disable HA maintenance mode
|
||||||
|
# 9. Restore guests (if migration_restore=true)
|
||||||
#
|
#
|
||||||
# Override migration behaviour:
|
# Standalone mode skips all cluster/HA/CEPH/drain steps.
|
||||||
# -e migration_bulk=true
|
|
||||||
# -e live_migrate_fallback=skip
|
|
||||||
# -e migration_restore=true
|
|
||||||
#
|
#
|
||||||
# Dry run (check mode — no changes):
|
# Variables (set in inventory or pass with -e):
|
||||||
# --check
|
# upgrade_order — ordered list of nodes to upgrade (cluster only)
|
||||||
|
# migration_restore — return VMs to origin node after upgrade (default: false)
|
||||||
|
# drain_target_strategy — resources | explicit (default: resources)
|
||||||
|
# backup_destination — local | sftp (default: local)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
- name: Proxmox Rolling Upgrade
|
- name: "Proxmox Rolling Upgrade"
|
||||||
hosts: proxmox_cluster
|
hosts: proxmox_cluster
|
||||||
gather_facts: true
|
gather_facts: true
|
||||||
serial: 1
|
run_once: true # Play runs once — loops over nodes internally
|
||||||
run_once: true
|
# serial is deliberately not set: run_once is applied once per serial batch, so
# "serial: 1" would make every inventory host repeat the full node loop.
|
||||||
|
|
||||||
|
# Play vars take precedence over inventory variables, so plain defaults here
# would silently override an upgrade_order / migration_restore set in the
# inventory (which the header documents as supported). The inventory value is
# therefore read through hostvars — which does not include play vars, so there
# is no self-reference — and the default only applies when nothing is set.
vars:
  migration_restore: "{{ hostvars[inventory_hostname]['migration_restore'] | default(false) }}"
  upgrade_order: "{{ hostvars[inventory_hostname]['upgrade_order'] | default(groups['proxmox_cluster'] | sort) }}"
|
||||||
|
|
||||||
pre_tasks:
|
pre_tasks:
|
||||||
- name: Confirm upgrade_order is defined
|
- name: "Upgrade | Confirm upgrade_order is defined"
|
||||||
ansible.builtin.fail:
|
ansible.builtin.fail:
|
||||||
msg: "upgrade_order must be defined in hypervisor_hosts.yml"
|
msg: "upgrade_order must be defined in inventory or passed with -e"
|
||||||
when: upgrade_order is not defined or upgrade_order | length == 0
|
when: upgrade_order is not defined or upgrade_order | length == 0
|
||||||
|
|
||||||
- name: Log upgrade targets
|
- name: "Upgrade | Log targets"
|
||||||
ansible.builtin.debug:
|
ansible.builtin.debug:
|
||||||
msg: >-
|
msg: >-
|
||||||
Proxmox upgrade starting for {{ client_name }} ({{ client_id }})
|
Proxmox upgrade starting for {{ client_name | default('Unknown') }}
|
||||||
|
({{ client_id | default('?') }})
|
||||||
Nodes: {{ upgrade_order | join(', ') }}
|
Nodes: {{ upgrade_order | join(', ') }}
|
||||||
API: https://{{ api_host }}:{{ api_port }}
|
API: https://{{ api_host }}:{{ api_port }}
|
||||||
|
|
||||||
roles:
|
roles:
|
||||||
- proxmox_upgrade
|
- role: proxmox_preflight
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
# ── Cluster: loop through each node ────────────────────────────────────────
|
||||||
|
- name: "Upgrade | Rolling upgrade — cluster mode"
|
||||||
|
ansible.builtin.include_tasks: tasks/proxmox_upgrade_node_loop.yml
|
||||||
|
loop: "{{ upgrade_order }}"
|
||||||
|
loop_control:
|
||||||
|
loop_var: current_node
|
||||||
|
label: "{{ current_node }}"
|
||||||
|
when: proxmox_is_cluster
|
||||||
|
|
||||||
|
# ── Standalone: upgrade this host directly ────────────────────────────────
|
||||||
|
- name: "Upgrade | Standalone | Backup config"
|
||||||
|
ansible.builtin.include_role:
|
||||||
|
name: proxmox_config_backup
|
||||||
|
vars:
|
||||||
|
current_node: "{{ inventory_hostname }}"
|
||||||
|
when: not proxmox_is_cluster
|
||||||
|
|
||||||
|
- name: "Upgrade | Standalone | Run upgrade"
|
||||||
|
ansible.builtin.include_role:
|
||||||
|
name: proxmox_upgrade_node
|
||||||
|
vars:
|
||||||
|
current_node: "{{ inventory_hostname }}"
|
||||||
|
when: not proxmox_is_cluster
|
||||||
|
|
||||||
|
- name: "Upgrade | Complete"
|
||||||
|
ansible.builtin.debug:
|
||||||
|
msg: >-
|
||||||
|
✓ Proxmox upgrade complete for
|
||||||
|
{{ client_name | default('Unknown') }} —
|
||||||
|
{{ upgrade_order | length }} node(s) upgraded.
|
||||||
|
|||||||
82
playbooks/tasks/proxmox_reboot_node_loop.yml
Normal file
82
playbooks/tasks/proxmox_reboot_node_loop.yml
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_reboot_node_loop.yml
|
||||||
|
# Called once per node by proxmox_reboot.yml
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Banner for the node currently being processed.
- name: "Reboot | {{ current_node }} | Start"
  ansible.builtin.debug:
    msg: "━━━ Starting reboot of node {{ current_node }} ━━━"

# proxmox_ceph role with the set_noout action.
- name: "Reboot | {{ current_node }} | Set CEPH noout"
  vars:
    ceph_action: set_noout
  ansible.builtin.include_role:
    name: proxmox_ceph

# proxmox_ha role with ha_action=disable.
- name: "Reboot | {{ current_node }} | Enable HA maintenance"
  vars:
    ha_action: disable
  ansible.builtin.include_role:
    name: proxmox_ha

- name: "Reboot | {{ current_node }} | Drain guests"
  ansible.builtin.include_role:
    name: proxmox_drain

# The reboot itself is executed on the node being processed.
- name: "Reboot | {{ current_node }} | Reboot"
  delegate_to: "{{ current_node }}"
  ansible.builtin.reboot:
    msg: "{{ reboot_reason }}"
    reboot_timeout: "{{ reboot_timeout }}"
    pre_reboot_delay: 5
    post_reboot_delay: 15
|
||||||
|
|
||||||
|
# Poll the API until current_node is reported with status 'online'.
# default([]) matters: when the API call itself fails (for example the API
# endpoint is the node that just rebooted and is not answering yet) the result
# has no proxmox_nodes key; without the default the until expression raises an
# undefined-variable error and aborts instead of simply retrying.
- name: "Reboot | {{ current_node }} | Wait for cluster rejoin"
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    api_port: "{{ api_port | default(8006) }}"
    validate_certs: "{{ validate_certs | default(false) }}"
  register: reboot_rejoin_check
  delegate_to: localhost
  until: >-
    reboot_rejoin_check.proxmox_nodes | default([])
    | selectattr('node', 'equalto', current_node)
    | selectattr('status', 'equalto', 'online')
    | list
    | length > 0
  retries: "{{ node_rejoin_retries }}"
  delay: "{{ node_rejoin_delay }}"
|
||||||
|
|
||||||
|
- name: "Reboot | {{ current_node }} | Back online"
  ansible.builtin.debug:
    msg: "✓ Node {{ current_node }} has rejoined the cluster after reboot."

# proxmox_ceph role with the clear_noout action.
- name: "Reboot | {{ current_node }} | Clear CEPH noout"
  vars:
    ceph_action: clear_noout
  ansible.builtin.include_role:
    name: proxmox_ceph

# proxmox_ha role with ha_action=enable.
- name: "Reboot | {{ current_node }} | Disable HA maintenance"
  vars:
    ha_action: enable
  ansible.builtin.include_role:
    name: proxmox_ha

# Exactly one of the next two tasks runs, depending on migration_restore.
- name: "Reboot | {{ current_node }} | Restore guests"
  when: migration_restore | bool
  ansible.builtin.include_role:
    name: proxmox_restore

- name: "Reboot | {{ current_node }} | Skip restore"
  when: not migration_restore | bool
  ansible.builtin.debug:
    msg: "migration_restore=false — guests remain on their current nodes."

- name: "Reboot | {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: "━━━ Reboot complete: {{ current_node }} ━━━"
|
||||||
68
playbooks/tasks/proxmox_upgrade_node_loop.yml
Normal file
68
playbooks/tasks/proxmox_upgrade_node_loop.yml
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_upgrade_node_loop.yml
|
||||||
|
# Called once per node by proxmox_upgrade.yml.
|
||||||
|
# Handles the full per-node upgrade pipeline in cluster mode.
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Per-node upgrade pipeline; current_node is the loop variable set by the
# including playbook.
- name: "Node {{ current_node }} | Start"
  ansible.builtin.debug:
    msg: "━━━ Starting upgrade of node {{ current_node }} ━━━"

# Step 1 — configuration backup
- name: "Node {{ current_node }} | Step 1 — Backup config"
  ansible.builtin.include_role:
    name: proxmox_config_backup

# Step 2 — proxmox_ceph role, set_noout action
- name: "Node {{ current_node }} | Step 2 — Set CEPH noout"
  vars:
    ceph_action: set_noout
  ansible.builtin.include_role:
    name: proxmox_ceph

# Step 3 — proxmox_ha role, ha_action=disable
- name: "Node {{ current_node }} | Step 3 — Enable HA maintenance"
  vars:
    ha_action: disable
  ansible.builtin.include_role:
    name: proxmox_ha

# Step 4 — move guests off the node
- name: "Node {{ current_node }} | Step 4 — Drain guests"
  ansible.builtin.include_role:
    name: proxmox_drain

# Step 5 — package upgrade
- name: "Node {{ current_node }} | Step 5 — Upgrade packages"
  ansible.builtin.include_role:
    name: proxmox_upgrade_node

# Step 6 — proxmox_ceph role, clear_noout action
- name: "Node {{ current_node }} | Step 6 — Clear CEPH noout"
  vars:
    ceph_action: clear_noout
  ansible.builtin.include_role:
    name: proxmox_ceph

# Step 7 — proxmox_ha role, ha_action=enable
- name: "Node {{ current_node }} | Step 7 — Disable HA maintenance"
  vars:
    ha_action: enable
  ansible.builtin.include_role:
    name: proxmox_ha

# Step 8 — optional restore; exactly one of the next two tasks runs
- name: "Node {{ current_node }} | Step 8 — Restore guests"
  when: migration_restore | bool
  ansible.builtin.include_role:
    name: proxmox_restore

- name: "Node {{ current_node }} | Skip restore"
  when: not migration_restore | bool
  ansible.builtin.debug:
    msg: "migration_restore=false — guests remain on their current nodes."

- name: "Node {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: "━━━ Node {{ current_node }} upgrade complete ━━━"
|
||||||
18
roles/proxmox_ceph/defaults/main.yml
Normal file
18
roles/proxmox_ceph/defaults/main.yml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_ceph — defaults
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Which operation the role performs:
#   status | set_noout | clear_noout | check_health
ceph_action: status

# Behaviour on an unhealthy cluster
ceph_abort_on_error: true     # fail when health is HEALTH_ERR
ceph_warn_on_warning: true    # print a warning (and continue) on HEALTH_WARN

# Polling of 'ceph status' while waiting for a healthy state
ceph_health_retries: 30
ceph_health_delay: 10
# NOTE(review): ceph_health_timeout is not referenced by the role's
# tasks/main.yml — confirm whether it is still needed.
ceph_health_timeout: 300      # seconds to wait for HEALTH_OK
|
||||||
11
roles/proxmox_ceph/meta/main.yml
Normal file
11
roles/proxmox_ceph/meta/main.yml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
|
||||||
|
# Role metadata for proxmox_ceph.
galaxy_info:
  author: ansible-msp
  role_name: proxmox_ceph
  description: "MSP Proxmox automation — proxmox_ceph"
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions: [bookworm]

# No role dependencies.
dependencies: []
|
||||||
140
roles/proxmox_ceph/tasks/main.yml
Normal file
140
roles/proxmox_ceph/tasks/main.yml
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_ceph — tasks
|
||||||
|
# Manages CEPH noout flag and health checks during maintenance.
|
||||||
|
# Skips gracefully if CEPH is not configured on this cluster.
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# ── Detect CEPH ───────────────────────────────────────────────────────────────
|
||||||
|
# Probe: run 'ceph status' and never fail, only record the result.
- name: "CEPH | Detect if CEPH is configured"
  run_once: true
  ansible.builtin.command:
    cmd: ceph status
  register: ceph_detect
  failed_when: false
  changed_when: false

# CEPH counts as enabled when the probe exited with return code 0.
- name: "CEPH | Set CEPH enabled fact"
  run_once: true
  ansible.builtin.set_fact:
    ceph_is_enabled: "{{ ceph_detect.rc == 0 }}"

- name: "CEPH | Skip — CEPH not configured on this cluster"
  run_once: true
  when: not ceph_is_enabled
  ansible.builtin.debug:
    msg: "CEPH is not configured on this cluster — skipping all CEPH tasks."
|
||||||
|
|
||||||
|
# ── CEPH status ───────────────────────────────────────────────────────────────
|
||||||
|
# Fetch the cluster status as JSON (read-only).
- name: "CEPH | Get cluster status"
  run_once: true
  when: ceph_is_enabled
  ansible.builtin.command:
    cmd: ceph status --format json
  register: ceph_status_raw
  changed_when: false

# Decode the JSON so the following tasks can read health.status and checks.
- name: "CEPH | Parse status"
  run_once: true
  when: ceph_is_enabled
  ansible.builtin.set_fact:
    ceph_status: "{{ ceph_status_raw.stdout | from_json }}"

# Printed only for the 'status' action.
- name: "CEPH | Log health"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'status'
  ansible.builtin.debug:
    msg: "CEPH health: {{ ceph_status.health.status }}"
|
||||||
|
|
||||||
|
# ── Health check ──────────────────────────────────────────────────────────────
|
||||||
|
# Hard stop on HEALTH_ERR unless ceph_abort_on_error is turned off.
- name: "CEPH | Check health | Abort if HEALTH_ERR"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_abort_on_error
    - ceph_status.health.status == 'HEALTH_ERR'
  ansible.builtin.fail:
    msg: >-
      CEPH is in HEALTH_ERR state — aborting to prevent data loss.
      Run 'ceph status' to investigate.
      Set ceph_abort_on_error=false to override.

# HEALTH_WARN only prints a message listing the names of the active checks.
- name: "CEPH | Check health | Warn on HEALTH_WARN"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_warn_on_warning
    - ceph_status.health.status == 'HEALTH_WARN'
  ansible.builtin.debug:
    msg: >-
      WARNING: CEPH is in HEALTH_WARN state. Proceeding —
      set ceph_warn_on_warning=false to suppress this message.
      Checks: {{ ceph_status.health.checks | default({}) | dict2items | map(attribute='key') | list }}
|
||||||
|
|
||||||
|
# ── Set noout ─────────────────────────────────────────────────────────────────
|
||||||
|
# Runs 'ceph osd set noout'; always reported as a change.
- name: "CEPH | Set noout flag"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'set_noout'
  ansible.builtin.command:
    cmd: ceph osd set noout
  changed_when: true

- name: "CEPH | Confirm noout set"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'set_noout'
  ansible.builtin.debug:
    msg: "✓ CEPH noout flag SET — OSDs will not be marked out during maintenance."
|
||||||
|
|
||||||
|
# ── Clear noout ───────────────────────────────────────────────────────────────
|
||||||
|
# Runs 'ceph osd unset noout'; always reported as a change.
- name: "CEPH | Clear noout flag"
  ansible.builtin.command: ceph osd unset noout
  changed_when: true
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'clear_noout'

# Poll until health is HEALTH_OK or HEALTH_WARN. Empty stdout (the command
# failed) is treated as "not healthy yet": default('{}', true) keeps from_json
# from raising on an empty string and default('') covers the missing
# health.status, so the task retries instead of aborting on a template error.
- name: "CEPH | Wait for HEALTH_OK after clearing noout"
  ansible.builtin.command: ceph status --format json
  register: ceph_recovery_check
  changed_when: false
  until: >-
    (ceph_recovery_check.stdout | default('{}', true) | from_json).health.status
    | default('') in ['HEALTH_OK', 'HEALTH_WARN']
  retries: "{{ ceph_health_retries }}"
  delay: "{{ ceph_health_delay }}"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'clear_noout'

- name: "CEPH | Log recovery status"
  ansible.builtin.debug:
    msg: >-
      ✓ CEPH noout CLEARED —
      health: {{ (ceph_recovery_check.stdout | from_json).health.status }}
  when:
    - ceph_is_enabled
    - ceph_action == 'clear_noout'
  run_once: true
|
||||||
|
|
||||||
|
# ── check_health action ───────────────────────────────────────────────────────
|
||||||
|
# check_health action: poll until health is HEALTH_OK or HEALTH_WARN.
# Empty stdout (the command failed) is treated as "not healthy yet":
# default('{}', true) keeps from_json from raising on an empty string and
# default('') covers the missing health.status, so the task retries instead of
# aborting on a template error.
- name: "CEPH | Wait for healthy state"
  ansible.builtin.command: ceph status --format json
  register: ceph_health_wait
  changed_when: false
  until: >-
    (ceph_health_wait.stdout | default('{}', true) | from_json).health.status
    | default('') in ['HEALTH_OK', 'HEALTH_WARN']
  retries: "{{ ceph_health_retries }}"
  delay: "{{ ceph_health_delay }}"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'check_health'

- name: "CEPH | Health check result"
  ansible.builtin.debug:
    msg: "CEPH health: {{ (ceph_health_wait.stdout | from_json).health.status }}"
  when:
    - ceph_is_enabled
    - ceph_action == 'check_health'
  run_once: true
|
||||||
33
roles/proxmox_config_backup/defaults/main.yml
Normal file
33
roles/proxmox_config_backup/defaults/main.yml
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
---
# =============================================================================
# proxmox_config_backup — role defaults
# =============================================================================

# Where the finished tarball ends up: "local" or "sftp".
# A git destination was removed until secrets can be encrypted (still a TODO).
backup_destination: local

# --- local destination -------------------------------------------------------
# Directory that receives the tarball.
backup_local_dir: /var/backups/proxmox-config
# Number of tarballs per node that rotation keeps.
backup_local_keep: 10

# --- sftp destination (read only when backup_destination is "sftp") ----------
backup_sftp_host: ''
backup_sftp_user: ''
# Optional identity file; when empty no -i option is passed to sftp.
backup_sftp_key: ''
backup_sftp_remote_dir: '/backups/proxmox'

# --- tarball contents --------------------------------------------------------
backup_paths_proxmox:
  - /etc/pve
  - /etc/network/interfaces
  - /etc/hosts
  - /etc/hostname
  - /etc/apt/sources.list
  - /etc/apt/sources.list.d

backup_paths_xcpng:
  - /etc/xcp-ng
  - /etc/network/interfaces
  - /etc/hosts
  - /etc/hostname
|
||||||
11
roles/proxmox_config_backup/meta/main.yml
Normal file
11
roles/proxmox_config_backup/meta/main.yml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
# Ansible Galaxy metadata for the proxmox_config_backup role.
galaxy_info:
  role_name: proxmox_config_backup
  description: 'MSP Proxmox automation — proxmox_config_backup'
  author: ansible-msp
  min_ansible_version: '2.15'
  platforms:
    - name: Debian
      versions: [bookworm]

# No role dependencies.
dependencies: []
|
||||||
113
roles/proxmox_config_backup/tasks/main.yml
Normal file
113
roles/proxmox_config_backup/tasks/main.yml
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_config_backup — tasks
|
||||||
|
# Creates a tarball of critical Proxmox config files and stores it locally
|
||||||
|
# or transfers via SFTP. Git destination is a TODO pending secure handling
|
||||||
|
# of sensitive files (SSL keys, shadow.cfg, etc).
|
||||||
|
#
|
||||||
|
# Required vars:
|
||||||
|
# current_node — the node being backed up (for filename)
|
||||||
|
# client_id — client identifier (for filename)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# ansible_date_time is needed for the backup filename; collect only that
# subset, and only if no earlier fact gathering has provided it.
- name: "Backup | {{ current_node }} | Gather date/time facts"
  when: ansible_date_time is not defined
  ansible.builtin.setup:
    gather_subset: [date_time]
|
||||||
|
|
||||||
|
# Base name (no extension): proxmox_<client_id>_<node>_<date>, with client_id
# lower-cased and its dashes turned into underscores.
- name: "Backup | {{ current_node }} | Set backup filename"
  ansible.builtin.set_fact:
    backup_filename: "proxmox_{{ client_id | lower | replace('-', '_') }}_{{ current_node }}_{{ ansible_date_time.date }}"
|
||||||
|
|
||||||
|
# The Proxmox path list is the one archived by this role.
- name: "Backup | {{ current_node }} | Set backup paths"
  ansible.builtin.set_fact:
    backup_paths: "{{ backup_paths_proxmox }}"
|
||||||
|
|
||||||
|
# ── Create tarball on node ────────────────────────────────────────────────────
|
||||||
|
# Build /tmp/<backup_filename>.tar.gz from backup_paths on the node.
# - umask 077: the archive includes /etc/pve, so it must not be readable by
#   other users while it sits in /tmp.
# - tar errors stay non-fatal (best effort, as before), but an archive that
#   is missing or lists no members now fails the task instead of being
#   reported as a successful backup.
- name: "Backup | {{ current_node }} | Create config tarball"
  ansible.builtin.shell: |
    umask 077
    tar czf /tmp/{{ backup_filename }}.tar.gz \
      --ignore-failed-read \
      --dereference \
      {{ backup_paths | join(' ') }} 2>/dev/null || true
    if ! tar tzf /tmp/{{ backup_filename }}.tar.gz 2>/dev/null | grep -q .; then
      echo "backup archive /tmp/{{ backup_filename }}.tar.gz is missing or empty" >&2
      exit 1
    fi
    echo "done"
  changed_when: true
  register: backup_tarball
|
||||||
|
|
||||||
|
# ── Local backup ──────────────────────────────────────────────────────────────
|
||||||
|
# Local destination: create the backup directory, owner-only access.
- name: "Backup | {{ current_node }} | Local | Ensure backup dir exists"
  when: backup_destination == 'local'
  ansible.builtin.file:
    state: directory
    path: "{{ backup_local_dir }}"
    mode: '0700'
|
||||||
|
|
||||||
|
# Copy the tarball from /tmp into the backup directory on the same host
# (remote_src); the /tmp copy is removed by the final cleanup task.
- name: "Backup | {{ current_node }} | Local | Move tarball to backup dir"
  when: backup_destination == 'local'
  ansible.builtin.copy:
    remote_src: true
    src: "/tmp/{{ backup_filename }}.tar.gz"
    dest: "{{ backup_local_dir }}/{{ backup_filename }}.tar.gz"
    mode: '0600'
|
||||||
|
|
||||||
|
# Keep the newest backup_local_keep tarballs for this node (ordered by
# modification time) and delete the rest. Always reports "unchanged".
- name: "Backup | {{ current_node }} | Local | Rotate old backups"
  when: backup_destination == 'local'
  changed_when: false
  ansible.builtin.shell: |
    ls -1t {{ backup_local_dir }}/proxmox_*_{{ current_node }}_*.tar.gz 2>/dev/null \
    | tail -n +{{ (backup_local_keep | int) + 1 }} \
    | xargs -r rm -f
|
||||||
|
|
||||||
|
# Report where the local backup was stored.
- name: "Backup | {{ current_node }} | Local | Log result"
  when: backup_destination == 'local'
  ansible.builtin.debug:
    msg: "✓ Config backed up locally: {{ backup_local_dir }}/{{ backup_filename }}.tar.gz"
|
||||||
|
|
||||||
|
# ── SFTP backup ───────────────────────────────────────────────────────────────
|
||||||
|
# SFTP destination: stop early when host or user has been left empty.
- name: "Backup | {{ current_node }} | SFTP | Validate required vars"
  when: backup_destination == 'sftp' and (backup_sftp_host == '' or backup_sftp_user == '')
  ansible.builtin.fail:
    msg: "SFTP backup requires backup_sftp_host and backup_sftp_user to be set."
|
||||||
|
|
||||||
|
# Pull the tarball from the node to the same /tmp path on the controller
# (flat: no per-host directory is created under dest).
- name: "Backup | {{ current_node }} | SFTP | Fetch tarball to controller"
  when: backup_destination == 'sftp'
  ansible.builtin.fetch:
    flat: true
    src: "/tmp/{{ backup_filename }}.tar.gz"
    dest: "/tmp/{{ backup_filename }}.tar.gz"
|
||||||
|
|
||||||
|
# Upload the fetched tarball from the controller to the SFTP host.
# `-b -` makes sftp read the here-document as a batch file: it aborts and
# exits non-zero as soon as `cd` or `put` fails. Without -b, commands fed on
# stdin do not fail the shell task, and the play goes on to delete the local
# copy and log success.
# NOTE(review): StrictHostKeyChecking=no accepts any host key; consider
# `accept-new` or a managed known_hosts file.
- name: "Backup | {{ current_node }} | SFTP | Transfer to remote host"
  ansible.builtin.shell: |
    sftp_opts="-o StrictHostKeyChecking=no -o BatchMode=yes"
    {% if backup_sftp_key != '' %}
    sftp_opts="$sftp_opts -i {{ backup_sftp_key }}"
    {% endif %}
    sftp -b - $sftp_opts {{ backup_sftp_user }}@{{ backup_sftp_host }} << EOF
    cd {{ backup_sftp_remote_dir }}
    put /tmp/{{ backup_filename }}.tar.gz
    EOF
  delegate_to: localhost
  changed_when: true
  when: backup_destination == 'sftp'
|
||||||
|
|
||||||
|
# Remove the controller-side copy of the tarball after the SFTP upload.
- name: "Backup | {{ current_node }} | SFTP | Clean up local temp tarball"
  when: backup_destination == 'sftp'
  delegate_to: localhost
  ansible.builtin.file:
    state: absent
    path: "/tmp/{{ backup_filename }}.tar.gz"
|
||||||
|
|
||||||
|
# Report the remote location of the uploaded backup.
- name: "Backup | {{ current_node }} | SFTP | Log result"
  when: backup_destination == 'sftp'
  ansible.builtin.debug:
    msg: "✓ Config backed up via SFTP: {{ backup_sftp_host }}:{{ backup_sftp_remote_dir }}/{{ backup_filename }}.tar.gz"
|
||||||
|
|
||||||
|
# ── Cleanup ───────────────────────────────────────────────────────────────────
|
||||||
|
# Runs for every destination: delete the working copy in /tmp on the node.
- name: "Backup | {{ current_node }} | Clean up temp tarball on node"
  ansible.builtin.file:
    state: absent
    path: "/tmp/{{ backup_filename }}.tar.gz"
|
||||||
34
roles/proxmox_drain/defaults/main.yml
Normal file
34
roles/proxmox_drain/defaults/main.yml
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
---
# =============================================================================
# proxmox_drain — role defaults
# =============================================================================

# How the migration target is chosen:
#   resources — the node with the most available memory + CPU
#   explicit  — the node named in drain_target_node
drain_target_strategy: resources

# Only consulted when drain_target_strategy is "explicit".
drain_target_node: ''

# Weights used when scoring candidate targets (must sum to 1.0).
drain_score_mem_weight: 0.6
drain_score_cpu_weight: 0.4

# Migration behaviour.
drain_online: true               # live migrate running VMs
drain_shutdown_fallback: true    # shutdown VM if live migrate fails
drain_vm_shutdown_timeout: 120   # seconds to wait for graceful shutdown
drain_lxc_restart: true          # restart LXC after migration

# Directory on the Semaphore host for the state file used by restore mode.
drain_state_dir: '/tmp/proxmox_drain_state'

# Guests whose PVE tag string matches any of these are not migrated.
drain_exclude_tags:
  - nomigrate
  - pinned

# API connection (the remaining settings are inherited from inventory).
api_port: 8006
validate_certs: false
|
||||||
11
roles/proxmox_drain/meta/main.yml
Normal file
11
roles/proxmox_drain/meta/main.yml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
# Ansible Galaxy metadata for the proxmox_drain role.
galaxy_info:
  role_name: proxmox_drain
  description: 'MSP Proxmox automation — proxmox_drain'
  author: ansible-msp
  min_ansible_version: '2.15'
  platforms:
    - name: Debian
      versions: [bookworm]

# No role dependencies.
dependencies: []
|
||||||
217
roles/proxmox_drain/tasks/main.yml
Normal file
217
roles/proxmox_drain/tasks/main.yml
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_drain — tasks
|
||||||
|
# Migrates all VMs/LXCs off current_node to the best available target.
|
||||||
|
# Writes a state file so proxmox_restore can return VMs to origin.
|
||||||
|
#
|
||||||
|
# Required vars:
|
||||||
|
# current_node — the node being drained
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# ── Discover guests on this node ──────────────────────────────────────────────
|
||||||
|
# Ask the Proxmox API (from the controller) for every guest on current_node.
- name: "Drain | {{ current_node }} | Discover guests"
  delegate_to: localhost
  register: drain_node_guests
  community.proxmox.proxmox_vm_info:
    node: "{{ current_node }}"
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
|
||||||
|
|
||||||
|
# Split the discovered guests by type; entries flagged as templates are
# dropped from the KVM list.
- name: "Drain | {{ current_node }} | Separate KVM and LXC guests"
  delegate_to: localhost
  ansible.builtin.set_fact:
    drain_lxc_guests: >-
      {{ drain_node_guests.proxmox_vms
         | selectattr('type', 'equalto', 'lxc')
         | list }}
    drain_kvm_guests: >-
      {{ drain_node_guests.proxmox_vms
         | selectattr('type', 'equalto', 'qemu')
         | rejectattr('template', 'equalto', true)
         | list }}
|
||||||
|
|
||||||
|
# Remove guests that carry one of drain_exclude_tags. Guests with no `tags`
# attribute are always kept.
# Changes from the plain join('|') pattern:
#   * an empty drain_exclude_tags used to give the regex '' which matches
#     every tagged guest; it now gives '(?!)', which matches nothing;
#   * tags are regex-escaped and must match as a whole tag (delimited by
#     start/end, ';', ',' or space), so "pinned" no longer excludes a guest
#     tagged "unpinned".
# NOTE(review): assumes `tags` is a delimited string when defined — confirm
# against the module output.
- name: "Drain | {{ current_node }} | Filter excluded tags"
  vars:
    drain_exclude_pattern: >-
      {{ ('(^|[;, ])(' ~ (drain_exclude_tags | map('regex_escape') | join('|')) ~ ')([;, ]|$)')
         if (drain_exclude_tags | length > 0)
         else '(?!)' }}
  ansible.builtin.set_fact:
    drain_kvm_guests: >-
      {{ drain_kvm_guests
         | rejectattr('tags', 'defined')
         | list
         + drain_kvm_guests
         | selectattr('tags', 'defined')
         | rejectattr('tags', 'search', drain_exclude_pattern)
         | list }}
    drain_lxc_guests: >-
      {{ drain_lxc_guests
         | rejectattr('tags', 'defined')
         | list
         + drain_lxc_guests
         | selectattr('tags', 'defined')
         | rejectattr('tags', 'search', drain_exclude_pattern)
         | list }}
  delegate_to: localhost
|
||||||
|
|
||||||
|
# Summarise how many guests of each type will move, plus their VMIDs.
- name: "Drain | {{ current_node }} | Log guest inventory"
  ansible.builtin.debug:
    msg: >-
      {{ current_node }} has
      {{ drain_kvm_guests | length }} KVM guest(s) and
      {{ drain_lxc_guests | length }} LXC guest(s) to migrate.
      VMIDs: {{ (drain_kvm_guests + drain_lxc_guests) | map(attribute='vmid') | list }}
|
||||||
|
|
||||||
|
# ── Skip if nothing to migrate ────────────────────────────────────────────────
|
||||||
|
# Informational message shown when both guest lists are empty.
- name: "Drain | {{ current_node }} | Skip — no guests to migrate"
  when: (drain_kvm_guests | length == 0) and (drain_lxc_guests | length == 0)
  ansible.builtin.debug:
    msg: "Node {{ current_node }} has no guests — skipping drain."
|
||||||
|
|
||||||
|
# Nothing to migrate: stop here.
# NOTE(review): `end_play` ends the whole play, not just this role — confirm
# the caller does not need later steps to run for a node with no guests.
- name: "Drain | {{ current_node }} | End play if no guests"
  when: (drain_kvm_guests | length == 0) and (drain_lxc_guests | length == 0)
  ansible.builtin.meta: end_play
|
||||||
|
|
||||||
|
# ── Select migration target ───────────────────────────────────────────────────
|
||||||
|
# "resources" strategy only: fetch per-node status and usage from the API.
- name: "Drain | {{ current_node }} | Get all node resource info"
  when: drain_target_strategy == 'resources'
  delegate_to: localhost
  register: drain_all_nodes
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
|
||||||
|
|
||||||
|
# Build a list of candidate targets (online nodes other than current_node),
# each with a weighted score, sorted best first:
#   score = free_mem * mem_weight + free_cpu * 1e9 * cpu_weight
# where free_mem = maxmem - mem and free_cpu = 1.0 - cpu (cpu defaults to 0).
# Every statement tag uses {%- -%} so the rendered value is exactly the list
# expression with no surrounding whitespace; that keeps the fact a real list
# regardless of how the block scalar is indented or folded.
- name: "Drain | {{ current_node }} | Score nodes by available resources"
  ansible.builtin.set_fact:
    drain_scored_nodes: >-
      {%- set candidates = [] -%}
      {%- for node in drain_all_nodes.proxmox_nodes -%}
      {%- if node.status == 'online' and node.node != current_node -%}
      {%- set free_mem = node.maxmem - node.mem -%}
      {%- set free_cpu = 1.0 - (node.cpu | default(0)) -%}
      {%- set score = (free_mem * drain_score_mem_weight | float) + (free_cpu * 1000000000 * drain_score_cpu_weight | float) -%}
      {%- set _ = candidates.append({'node': node.node, 'score': score, 'free_mem': free_mem, 'free_cpu': free_cpu}) -%}
      {%- endif -%}
      {%- endfor -%}
      {{ candidates | sort(attribute='score', reverse=true) }}
  delegate_to: localhost
  when: drain_target_strategy == 'resources'
|
||||||
|
|
||||||
|
# Take the best-scored candidate as the migration target.
# With no candidates, `first` yields an undefined value; both facts now fall
# back to a placeholder with an empty node name, so the later
# "Fail if no target available" check (drain_target == '') reports the
# problem instead of this task dying on an undefined `.node`.
- name: "Drain | {{ current_node }} | Set migration target (resources)"
  ansible.builtin.set_fact:
    drain_resolved_target: >-
      {{ drain_scored_nodes | first | default({'node': '', 'free_mem': 0, 'free_cpu': 0}) }}
    drain_target: "{{ (drain_scored_nodes | first | default({'node': ''})).node }}"
  delegate_to: localhost
  when: drain_target_strategy == 'resources'
|
||||||
|
|
||||||
|
# "explicit" strategy: the operator-supplied node is used as the target.
- name: "Drain | {{ current_node }} | Set migration target (explicit)"
  when: drain_target_strategy == 'explicit'
  delegate_to: localhost
  ansible.builtin.set_fact:
    drain_target: "{{ drain_target_node }}"
|
||||||
|
|
||||||
|
# Abort the drain when target selection produced an empty node name.
- name: "Drain | {{ current_node }} | Fail if no target available"
  when: drain_target == ''
  delegate_to: localhost
  ansible.builtin.fail:
    msg: >-
      No valid migration target found for node {{ current_node }}.
      All other nodes may be offline or no nodes configured.
|
||||||
|
|
||||||
|
# Show the chosen target; for the "resources" strategy also show its free
# memory (bytes / 1073741824, in GB) and free CPU (percent).
- name: "Drain | {{ current_node }} | Log migration target"
  ansible.builtin.debug:
    msg: >-
      Migration target for {{ current_node }}: {{ drain_target }}
      {% if drain_target_strategy == 'resources' %}
      (free_mem={{ (drain_resolved_target.free_mem / 1073741824) | round(1) }}GB,
      free_cpu={{ (drain_resolved_target.free_cpu * 100) | round(1) }}%)
      {% endif %}
|
||||||
|
|
||||||
|
# ── Write state file for restore ──────────────────────────────────────────────
|
||||||
|
# Create the state directory on the controller, owner-only access.
- name: "Drain | {{ current_node }} | Ensure state directory exists"
  delegate_to: localhost
  ansible.builtin.file:
    state: directory
    path: "{{ drain_state_dir }}"
    mode: '0700'
|
||||||
|
|
||||||
|
# Record every guest about to be migrated, each tagged with origin_node, as
# JSON on the controller so the restore role can send guests back.
# The file name carries a timestamp. It used to rely on ansible_date_time,
# and the task was skipped when that fact was missing, so guests would be
# migrated with no state file written. The stamp now falls back to the
# controller clock via now(), in the same YYYYMMDDTHHMMSS layout, and the
# file is always written.
- name: "Drain | {{ current_node }} | Write VM origin state"
  vars:
    drain_state_stamp: >-
      {{ (ansible_date_time | default({})).iso8601_basic_short
         | default(now(fmt='%Y%m%dT%H%M%S'), true) }}
  ansible.builtin.copy:
    content: >-
      {{ (drain_kvm_guests + drain_lxc_guests)
         | map('combine', {'origin_node': current_node})
         | list
         | to_nice_json }}
    dest: "{{ drain_state_dir }}/{{ current_node }}_{{ drain_state_stamp }}.json"
    mode: '0600'
  delegate_to: localhost
|
||||||
|
|
||||||
|
# ── Migrate KVM guests ────────────────────────────────────────────────────────
|
||||||
|
# Run `qm migrate` for each KVM guest; --online is added only for guests
# whose status is "running". A non-zero exit code fails the task.
- name: "Drain | {{ current_node }} | KVM | Live migrate to {{ drain_target }}"
  loop: "{{ drain_kvm_guests }}"
  loop_control:
    label: "{{ item.name }} (VMID {{ item.vmid }}) — {{ item.status }}"
  register: drain_kvm_results
  changed_when: true
  failed_when: drain_kvm_results.rc is defined and drain_kvm_results.rc != 0
  ansible.builtin.command: >
    qm migrate {{ item.vmid }} {{ drain_target }}
    {% if item.status == 'running' %}--online{% endif %}
    --with-local-disks 0
|
||||||
|
|
||||||
|
# List the guests on the target node and confirm that every migrated KVM
# VMID is among them. The result used to be registered and never inspected,
# so this step verified nothing. It now polls (6 tries, 5 s apart) and fails
# if any VMID is still missing. VMIDs are compared as strings so int/str
# differences do not matter.
- name: "Drain | {{ current_node }} | KVM | Verify guests moved"
  community.proxmox.proxmox_vm_info:
    api_host: "{{ api_host }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    api_port: "{{ api_port }}"
    validate_certs: "{{ validate_certs }}"
    node: "{{ drain_target }}"
  register: drain_verify_guests
  until: >-
    drain_kvm_guests | map(attribute='vmid') | map('string')
    | difference(drain_verify_guests.proxmox_vms | default([])
                 | map(attribute='vmid') | map('string') | list)
    | length == 0
  retries: 6
  delay: 5
  delegate_to: localhost
  when: drain_kvm_guests | length > 0
|
||||||
|
|
||||||
|
# Report the KVM migration count (skipped when there were none).
- name: "Drain | {{ current_node }} | KVM | Log migration results"
  when: drain_kvm_guests | length > 0
  ansible.builtin.debug:
    msg: >-
      KVM migrations complete —
      {{ drain_kvm_guests | length }} guest(s) moved to {{ drain_target }}.
|
||||||
|
|
||||||
|
# ── Migrate LXC guests ────────────────────────────────────────────────────────
|
||||||
|
# Run `pct migrate` for each container; --restart is passed when
# drain_lxc_restart is set, and drain_vm_shutdown_timeout is used as the
# timeout. A non-zero exit code fails the task.
- name: "Drain | {{ current_node }} | LXC | Migrate to {{ drain_target }}"
  loop: "{{ drain_lxc_guests }}"
  loop_control:
    label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }}) — {{ item.status }}"
  register: drain_lxc_results
  changed_when: true
  failed_when: drain_lxc_results.rc is defined and drain_lxc_results.rc != 0
  ansible.builtin.command: >
    pct migrate {{ item.vmid }} {{ drain_target }}
    {% if drain_lxc_restart %}--restart{% endif %}
    --timeout {{ drain_vm_shutdown_timeout }}
|
||||||
|
|
||||||
|
# Report the container migration count (skipped when there were none).
- name: "Drain | {{ current_node }} | LXC | Log migration results"
  when: drain_lxc_guests | length > 0
  ansible.builtin.debug:
    msg: >-
      LXC migrations complete —
      {{ drain_lxc_guests | length }} container(s) moved to {{ drain_target }}.
|
||||||
|
|
||||||
|
# ── Final summary ─────────────────────────────────────────────────────────────
|
||||||
|
# Closing summary for the drained node.
- name: "Drain | {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: >-
      ✓ Node {{ current_node }} drained —
      {{ drain_kvm_guests | length }} KVM +
      {{ drain_lxc_guests | length }} LXC guests migrated to {{ drain_target }}.
|
||||||
17
roles/proxmox_ha/defaults/main.yml
Normal file
17
roles/proxmox_ha/defaults/main.yml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
---
# =============================================================================
# proxmox_ha — role defaults
# =============================================================================

# Requested operation: status | disable | enable | migrate
ha_action: status

# The node acted on by disable/enable is current_node, which the caller sets;
# this role defines no default for it.

# Seconds to wait for the HA manager to acknowledge a change.
ha_timeout: 60

# API connection (the remaining settings are inherited from inventory).
api_port: 8006
validate_certs: false
|
||||||
11
roles/proxmox_ha/meta/main.yml
Normal file
11
roles/proxmox_ha/meta/main.yml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
# Ansible Galaxy metadata for the proxmox_ha role.
galaxy_info:
  role_name: proxmox_ha
  description: 'MSP Proxmox automation — proxmox_ha'
  author: ansible-msp
  min_ansible_version: '2.15'
  platforms:
    - name: Debian
      versions: [bookworm]

# No role dependencies.
dependencies: []
|
||||||
96
roles/proxmox_ha/tasks/main.yml
Normal file
96
roles/proxmox_ha/tasks/main.yml
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_ha — tasks
|
||||||
|
# Manages Proxmox HA group membership and maintenance mode.
|
||||||
|
# Proxmox HA is self-managing during migrations — this role handles
|
||||||
|
# cases where you need to explicitly pause or resume HA for a node.
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# ── Detect HA ─────────────────────────────────────────────────────────────────
|
||||||
|
# Probe `ha-manager status` once; a failing command is tolerated here and
# interpreted by the following set_fact.
- name: "HA | Detect if HA is configured"
  run_once: true
  register: ha_detect
  changed_when: false
  failed_when: false
  ansible.builtin.command:
    cmd: ha-manager status
|
||||||
|
|
||||||
|
# HA counts as enabled when the probe exited 0 and printed something.
- name: "HA | Set HA enabled fact"
  run_once: true
  ansible.builtin.set_fact:
    ha_is_enabled: "{{ ha_detect.rc == 0 and ha_detect.stdout != '' }}"
|
||||||
|
|
||||||
|
# Tell the operator why the remaining HA tasks are skipped.
- name: "HA | Skip — HA not configured"
  run_once: true
  when: not ha_is_enabled
  ansible.builtin.debug:
    msg: "HA is not configured on this cluster — skipping."
|
||||||
|
|
||||||
|
# ── HA status ─────────────────────────────────────────────────────────────────
|
||||||
|
# status action: capture the `ha-manager status` output.
- name: "HA | Get status"
  run_once: true
  when:
    - ha_is_enabled
    - ha_action == 'status'
  register: ha_status
  changed_when: false
  ansible.builtin.command:
    cmd: ha-manager status
|
||||||
|
|
||||||
|
# status action: print the captured output line by line.
- name: "HA | Log status"
  run_once: true
  when:
    - ha_is_enabled
    - ha_action == 'status'
  ansible.builtin.debug:
    msg: "{{ ha_status.stdout_lines }}"
|
||||||
|
|
||||||
|
# ── Put node in maintenance mode ──────────────────────────────────────────────
|
||||||
|
# Proxmox uses node maintenance mode via ha-manager to gracefully migrate
|
||||||
|
# HA-managed VMs before maintenance. This is the correct HA-aware drain.
|
||||||
|
# disable action: ask the CRM to put current_node into maintenance mode.
- name: "HA | Enable maintenance mode for {{ current_node }}"
  run_once: true
  when:
    - ha_is_enabled
    - ha_action == 'disable'
  changed_when: true
  ansible.builtin.command: >
    ha-manager crm-command node-maintenance enable {{ current_node }}
|
||||||
|
|
||||||
|
# disable action: poll `ha-manager status` every 5 s, for up to ha_timeout
# seconds, until a status line names current_node and mentions "maintenance".
# The old condition succeeded when the word "maintenance" appeared anywhere,
# so another node already in maintenance ended the wait at once; its second
# clause was redundant with the first.
# The pattern needs a space on both sides of the node name (so pve1 does not
# match pve10) and relies on '.' not matching newlines to stay on one line.
- name: "HA | Wait for {{ current_node }} maintenance mode to be acknowledged"
  ansible.builtin.command: ha-manager status
  register: ha_maintenance_check
  changed_when: false
  until: >-
    ha_maintenance_check.stdout
    is search(' ' ~ (current_node | regex_escape) ~ ' .*maintenance')
  retries: "{{ (ha_timeout / 5) | int }}"
  delay: 5
  run_once: true
  when:
    - ha_is_enabled
    - ha_action == 'disable'
|
||||||
|
|
||||||
|
# disable action: confirmation message.
- name: "HA | Maintenance mode enabled for {{ current_node }}"
  run_once: true
  when:
    - ha_is_enabled
    - ha_action == 'disable'
  ansible.builtin.debug:
    msg: "✓ HA maintenance mode enabled for {{ current_node }} — HA will not restart VMs on this node."
|
||||||
|
|
||||||
|
# ── Resume HA management ──────────────────────────────────────────────────────
|
||||||
|
# enable action: ask the CRM to take current_node out of maintenance mode.
- name: "HA | Disable maintenance mode for {{ current_node }}"
  run_once: true
  when:
    - ha_is_enabled
    - ha_action == 'enable'
  changed_when: true
  ansible.builtin.command: >
    ha-manager crm-command node-maintenance disable {{ current_node }}
|
||||||
|
|
||||||
|
# enable action: confirmation message.
- name: "HA | Maintenance mode disabled for {{ current_node }}"
  run_once: true
  when:
    - ha_is_enabled
    - ha_action == 'enable'
  ansible.builtin.debug:
    msg: "✓ HA management resumed for {{ current_node }}."
|
||||||
18
roles/proxmox_preflight/defaults/main.yml
Normal file
18
roles/proxmox_preflight/defaults/main.yml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
---
# =============================================================================
# proxmox_preflight — role defaults
# =============================================================================

# Fewest nodes that must be online for preflight to pass.
preflight_min_nodes_online: 1

# true  — abort when any node is offline
# false — only warn
preflight_abort_on_offline_node: true

# Run the pvecm quorum check (over SSH).
preflight_check_quorum: true

# API connection. api_host, api_user, api_token_id and api_token_secret are
# inherited from inventory; only these two have role defaults.
api_port: 8006
validate_certs: false
|
||||||
11
roles/proxmox_preflight/meta/main.yml
Normal file
11
roles/proxmox_preflight/meta/main.yml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
# Ansible Galaxy metadata for the proxmox_preflight role.
galaxy_info:
  role_name: proxmox_preflight
  description: 'MSP Proxmox automation — proxmox_preflight'
  author: ansible-msp
  min_ansible_version: '2.15'
  platforms:
    - name: Debian
      versions: [bookworm]

# No role dependencies.
dependencies: []
|
||||||
113
roles/proxmox_preflight/tasks/main.yml
Normal file
113
roles/proxmox_preflight/tasks/main.yml
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_preflight — tasks
|
||||||
|
# Determines: standalone vs cluster, node health, quorum, CEPH state
|
||||||
|
# Sets facts: proxmox_is_cluster, proxmox_cluster_nodes, proxmox_node_count
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# ── Detect standalone vs cluster ──────────────────────────────────────────────
|
||||||
|
# Probe `pvecm status`; a failure is tolerated because the exit code is what
# the next task inspects.
- name: "Preflight | Detect cluster membership"
  register: pvecm_status
  changed_when: false
  failed_when: false
  ansible.builtin.command:
    cmd: pvecm status
|
||||||
|
|
||||||
|
# Exit code 0 from the probe means the node is treated as a cluster member.
- name: "Preflight | Set cluster mode fact"
  delegate_to: localhost
  ansible.builtin.set_fact:
    proxmox_is_cluster: "{{ pvecm_status.rc == 0 }}"
|
||||||
|
|
||||||
|
# Report which mode was detected for this host.
- name: "Preflight | Log topology"
  ansible.builtin.debug:
    msg: >-
      Node {{ inventory_hostname }} is running in
      {{ 'CLUSTER' if proxmox_is_cluster else 'STANDALONE' }} mode.
|
||||||
|
|
||||||
|
# ── Standalone path ───────────────────────────────────────────────────────────
|
||||||
|
# Standalone path: an Ansible ping is the only health check.
- name: "Preflight | Standalone | Verify host is reachable"
  when: not proxmox_is_cluster
  ansible.builtin.ping:
|
||||||
|
|
||||||
|
# Standalone path: success message.
- name: "Preflight | Standalone | Health check passed"
  when: not proxmox_is_cluster
  ansible.builtin.debug:
    msg: "Standalone node {{ inventory_hostname }} is reachable — preflight passed."
|
||||||
|
|
||||||
|
# ── Cluster path ──────────────────────────────────────────────────────────────
|
||||||
|
# Cluster path: fail unless `pvecm status` reports the cluster as quorate.
# The old test only looked for the substring "Quorate", which is also present
# in a "Quorate: No" line, so an inquorate cluster passed. The check now
# requires the "Quorate:" field to read "Yes". A failed command has no such
# line and therefore still fails the task.
- name: "Preflight | Cluster | Check quorum"
  ansible.builtin.command: pvecm status
  register: quorum_check
  changed_when: false
  failed_when: quorum_check.stdout is not search('Quorate: +Yes')
  when: proxmox_is_cluster and preflight_check_quorum
  run_once: true
|
||||||
|
|
||||||
|
# Cluster path: fetch the node list once, from the controller.
- name: "Preflight | Cluster | Get all node info via API"
  when: proxmox_is_cluster
  run_once: true
  delegate_to: localhost
  register: proxmox_all_nodes
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
|
||||||
|
|
||||||
|
# Derive the full node list, its size, and the online / not-online subsets
# from the API result.
- name: "Preflight | Cluster | Set node list facts"
  when: proxmox_is_cluster
  run_once: true
  delegate_to: localhost
  ansible.builtin.set_fact:
    proxmox_node_count: "{{ proxmox_all_nodes.proxmox_nodes | length }}"
    proxmox_cluster_nodes: "{{ proxmox_all_nodes.proxmox_nodes }}"
    proxmox_offline_nodes: >-
      {{ proxmox_all_nodes.proxmox_nodes
         | rejectattr('status', 'equalto', 'online')
         | list }}
    proxmox_online_nodes: >-
      {{ proxmox_all_nodes.proxmox_nodes
         | selectattr('status', 'equalto', 'online')
         | list }}
|
||||||
|
|
||||||
|
# Name any nodes that are not online (warning only).
- name: "Preflight | Cluster | Warn about offline nodes"
  run_once: true
  when:
    - proxmox_is_cluster
    - proxmox_offline_nodes | length > 0
  ansible.builtin.debug:
    msg: >-
      WARNING: The following nodes are offline:
      {{ proxmox_offline_nodes | map(attribute='node') | list }}
|
||||||
|
|
||||||
|
# Hard stop on offline nodes unless preflight_abort_on_offline_node is false.
- name: "Preflight | Cluster | Abort if offline nodes detected"
  run_once: true
  when:
    - proxmox_is_cluster
    - preflight_abort_on_offline_node
    - proxmox_offline_nodes | length > 0
  ansible.builtin.fail:
    msg: >-
      Preflight failed — {{ proxmox_offline_nodes | length }} node(s) are offline:
      {{ proxmox_offline_nodes | map(attribute='node') | list }}.
      Set preflight_abort_on_offline_node=false to proceed anyway.
|
||||||
|
|
||||||
|
# Fail when fewer nodes are online than preflight_min_nodes_online.
- name: "Preflight | Cluster | Verify minimum online node count"
  run_once: true
  when:
    - proxmox_is_cluster
    - proxmox_online_nodes | length < preflight_min_nodes_online | int
  ansible.builtin.fail:
    msg: >-
      Only {{ proxmox_online_nodes | length }} node(s) online.
      Minimum required: {{ preflight_min_nodes_online }}.
|
||||||
|
|
||||||
|
# Cluster path: success summary with the online/total node counts.
- name: "Preflight | Cluster | Health check passed"
  run_once: true
  when: proxmox_is_cluster
  ansible.builtin.debug:
    msg: >-
      Cluster preflight OK —
      {{ proxmox_online_nodes | length }}/{{ proxmox_node_count }} nodes online,
      quorum confirmed.
|
||||||
17
roles/proxmox_restore/defaults/main.yml
Normal file
17
roles/proxmox_restore/defaults/main.yml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
---
# -----------------------------------------------------------------------------
# Default variables for the proxmox_restore role
# -----------------------------------------------------------------------------

# Directory that is searched for drain state files.
# Keep this identical to drain_state_dir.
restore_state_dir: "/tmp/proxmox_drain_state"

# When true, the state file is removed once the restore has succeeded.
restore_cleanup_state_file: true

# Timeout waiting for a VM to start on the restored node.
restore_vm_start_timeout: 120

# Proxmox API connection settings (inherited from inventory).
api_port: 8006
validate_certs: false
|
||||||
11
roles/proxmox_restore/meta/main.yml
Normal file
11
roles/proxmox_restore/meta/main.yml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
# Ansible Galaxy metadata for the proxmox_restore role.
galaxy_info:
  role_name: proxmox_restore
  author: ansible-msp
  description: "MSP Proxmox automation — proxmox_restore"
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions:
        - bookworm

# No role dependencies.
dependencies: []
|
||||||
112
roles/proxmox_restore/tasks/main.yml
Normal file
112
roles/proxmox_restore/tasks/main.yml
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
---
# =============================================================================
# proxmox_restore — tasks
# Returns VMs to their origin nodes using state written by proxmox_drain.
#
# Required vars:
#   current_node        — the node whose VMs should be restored
#
# Set by this role:
#   restore_state_file  — newest "<current_node>_*.json" file found in
#                         restore_state_dir
#
# When no state file exists for current_node, the restore steps are skipped
# and the play continues. The role deliberately does NOT call
# `meta: end_play`, which would end the play for every host rather than just
# skip this role's remaining tasks.
# =============================================================================

# ── Find state file ───────────────────────────────────────────────────────────
- name: "Restore | {{ current_node }} | Find state files"
  ansible.builtin.find:
    paths: "{{ restore_state_dir }}"
    patterns: "{{ current_node }}_*.json"
    file_type: file
  register: restore_found_files
  delegate_to: localhost

- name: "Restore | {{ current_node }} | No state files found — skipping"
  ansible.builtin.debug:
    msg: >-
      No drain state files found for {{ current_node }} in {{ restore_state_dir }}.
      Skipping restore.
  when: restore_found_files.files | length == 0

# All remaining work is guarded by a single condition so that a node without
# a state file is skipped without affecting the rest of the play.
- name: "Restore | {{ current_node }} | Restore guests from state file"
  when: restore_found_files.files | length > 0
  block:
    - name: "Restore | {{ current_node }} | Use most recent state file"
      ansible.builtin.set_fact:
        # Newest file by modification time wins.
        restore_state_file: >-
          {{ (restore_found_files.files | sort(attribute='mtime') | last).path }}
      delegate_to: localhost

    - name: "Restore | {{ current_node }} | Load state file"
      ansible.builtin.slurp:
        src: "{{ restore_state_file }}"
      register: restore_state_raw
      delegate_to: localhost

    - name: "Restore | {{ current_node }} | Parse VM origin list"
      ansible.builtin.set_fact:
        # slurp returns base64; the decoded payload is parsed as JSON.
        # Later tasks read vmid, type, status and name from each entry.
        restore_vm_list: "{{ restore_state_raw.content | b64decode | from_json }}"
      delegate_to: localhost

    - name: "Restore | {{ current_node }} | Log restore plan"
      ansible.builtin.debug:
        msg: >-
          Restoring {{ restore_vm_list | length }} guest(s) to {{ current_node }}:
          {{ restore_vm_list | map(attribute='vmid') | list }}

    # ── Get current VM locations ──────────────────────────────────────────────
    - name: "Restore | {{ current_node }} | Get current VM locations"
      community.proxmox.proxmox_vm_info:
        api_host: "{{ api_host }}"
        api_user: "{{ api_user }}"
        api_token_id: "{{ api_token_id }}"
        api_token_secret: "{{ api_token_secret }}"
        api_port: "{{ api_port }}"
        validate_certs: "{{ validate_certs }}"
      register: restore_all_vms
      delegate_to: localhost

    # ── Migrate KVM guests back ───────────────────────────────────────────────
    # NOTE(review): this command runs on the play host, not on the node that
    # currently hosts the guest. Confirm that `qm migrate` is being issued
    # where the guest actually lives (or add a delegate_to) — TODO verify.
    - name: "Restore | {{ current_node }} | KVM | Migrate back"
      ansible.builtin.command: >
        qm migrate {{ item.vmid }} {{ current_node }}
        {% if current_status == 'running' %}--online{% endif %}
        --with-local-disks 0
      loop: "{{ restore_vm_list | selectattr('type', 'equalto', 'qemu') | list }}"
      loop_control:
        label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }})"
      changed_when: true
      vars:
        # Node that hosts the guest right now; 'unknown' if the API did not
        # return it. vmid is cast to int so a state file that stored it as a
        # string still matches the API result.
        current_location: >-
          {{ restore_all_vms.proxmox_vms
             | selectattr('vmid', 'equalto', item.vmid | int)
             | map(attribute='node')
             | first
             | default('unknown') }}
        # Live power state decides whether --online is passed; the state
        # recorded at drain time is only a fallback, because the guest may
        # have been started or stopped since then.
        current_status: >-
          {{ restore_all_vms.proxmox_vms
             | selectattr('vmid', 'equalto', item.vmid | int)
             | map(attribute='status')
             | first
             | default(item.status) }}
      when: current_location != current_node

    # ── Migrate LXC guests back ───────────────────────────────────────────────
    - name: "Restore | {{ current_node }} | LXC | Migrate back"
      ansible.builtin.command: >
        pct migrate {{ item.vmid }} {{ current_node }} --restart --timeout 120
      loop: "{{ restore_vm_list | selectattr('type', 'equalto', 'lxc') | list }}"
      loop_control:
        label: "{{ item.name | default(item.vmid) }} (VMID {{ item.vmid }})"
      changed_when: true
      vars:
        current_location: >-
          {{ restore_all_vms.proxmox_vms
             | selectattr('vmid', 'equalto', item.vmid | int)
             | map(attribute='node')
             | first
             | default('unknown') }}
      when: current_location != current_node

    # ── Cleanup ───────────────────────────────────────────────────────────────
    - name: "Restore | {{ current_node }} | Remove state file"
      ansible.builtin.file:
        path: "{{ restore_state_file }}"
        state: absent
      delegate_to: localhost
      # `| bool` so that a string value such as "false" from -e is honoured.
      when: restore_cleanup_state_file | bool

    - name: "Restore | {{ current_node }} | Complete"
      ansible.builtin.debug:
        msg: "✓ Restore complete — {{ restore_vm_list | length }} guest(s) returned to {{ current_node }}."
|
||||||
20
roles/proxmox_status/defaults/main.yml
Normal file
20
roles/proxmox_status/defaults/main.yml
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_status — defaults
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Include VM inventory in report
|
||||||
|
status_include_vms: true
|
||||||
|
|
||||||
|
# Include storage status
|
||||||
|
status_include_storage: true
|
||||||
|
|
||||||
|
# Include CEPH status (skipped gracefully if not configured)
|
||||||
|
status_include_ceph: true
|
||||||
|
|
||||||
|
# Include HA status (skipped gracefully if not configured)
|
||||||
|
status_include_ha: true
|
||||||
|
|
||||||
|
# API connection (inherited from inventory)
|
||||||
|
api_port: 8006
|
||||||
|
validate_certs: false
|
||||||
11
roles/proxmox_status/meta/main.yml
Normal file
11
roles/proxmox_status/meta/main.yml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
|
||||||
|
galaxy_info:
|
||||||
|
role_name: proxmox_status
|
||||||
|
author: ansible-msp
|
||||||
|
description: "MSP Proxmox automation — proxmox_status"
|
||||||
|
min_ansible_version: "2.15"
|
||||||
|
platforms:
|
||||||
|
- name: Debian
|
||||||
|
versions:
|
||||||
|
- bookworm
|
||||||
|
dependencies: []
|
||||||
127
roles/proxmox_status/tasks/main.yml
Normal file
127
roles/proxmox_status/tasks/main.yml
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_status — tasks
|
||||||
|
# Produces a cluster health report: nodes, VMs, storage, CEPH, HA.
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# ── Node info ─────────────────────────────────────────────────────────────────
|
||||||
|
# Queries the Proxmox API once, from the controller, and stores the result
# in status_nodes for the report tasks that follow.
- name: "Status | Get cluster node info"
  delegate_to: localhost
  run_once: true
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
  register: status_nodes
|
||||||
|
|
||||||
|
# Prints one entry per node (sorted by name) with status, version, CPU load
# and memory usage in GiB (bytes / 1073741824).
#
# Column padding uses the `format` filter with printf-style widths; neither
# Jinja2 nor Ansible ships an `ljust` filter, so `| ljust(n)` would abort
# templating with "No filter named 'ljust'".
- name: "Status | Node summary"
  ansible.builtin.debug:
    msg: >-
      ┌─ NODE SUMMARY ─────────────────────────────
      {% for node in status_nodes.proxmox_nodes | sort(attribute='node') %}
      │ {{ '%-20s' | format(node.node) }}
      status={{ '%-8s' | format(node.status) }}
      ver={{ (node.version | default({})).version | default('?') }}
      cpu={{ (node.cpu | default(0) * 100) | round(1) }}%
      mem={{ ((node.mem | default(0)) / 1073741824) | round(1) }}GB /
      {{ ((node.maxmem | default(0)) / 1073741824) | round(1) }}GB
      {% endfor %}
      └────────────────────────────────────────────
  run_once: true
|
||||||
|
|
||||||
|
# ── VM inventory ──────────────────────────────────────────────────────────────
|
||||||
|
# Collects the guest list of every node reported as online, one API call per
# node, executed once from the controller. Results land in
# status_vms_per_node.results.
- name: "Status | Get VM info for each node"
  when: status_include_vms
  delegate_to: localhost
  run_once: true
  community.proxmox.proxmox_vm_info:
    api_host: "{{ api_host }}"
    api_port: "{{ api_port }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    validate_certs: "{{ validate_certs }}"
    node: "{{ item.node }}"
  loop: "{{ status_nodes.proxmox_nodes | selectattr('status', 'equalto', 'online') | list }}"
  loop_control:
    label: "{{ item.node }}"
  register: status_vms_per_node
|
||||||
|
|
||||||
|
# Per-node guest counts (total / running / stopped) plus a cluster-wide total,
# built from the looped proxmox_vm_info results.
#
# Node names are padded with the `format` filter; `ljust` is not an available
# Jinja2/Ansible filter and would make this task fail at templating time.
- name: "Status | VM distribution summary"
  ansible.builtin.debug:
    msg: >-
      ┌─ VM DISTRIBUTION ──────────────────────────
      {% for result in status_vms_per_node.results %}
      │ {{ '%-20s' | format(result.item.node) }}
      total={{ result.proxmox_vms | length }}
      running={{ result.proxmox_vms | selectattr('status', 'equalto', 'running') | list | length }}
      stopped={{ result.proxmox_vms | selectattr('status', 'equalto', 'stopped') | list | length }}
      {% endfor %}
      │ Total VMs: {{ status_vms_per_node.results | map(attribute='proxmox_vms') | flatten | length }}
      └────────────────────────────────────────────
  run_once: true
  when: status_include_vms
|
||||||
|
|
||||||
|
# ── CEPH status ───────────────────────────────────────────────────────────────
|
||||||
|
# Read-only probe: captures `ceph status` as JSON in status_ceph.
# failed_when is false, so a non-zero exit does not fail the play; the
# summary task inspects the return code instead.
- name: "Status | CEPH status"
  when: status_include_ceph
  run_once: true
  ansible.builtin.command: ceph status --format json
  register: status_ceph
  failed_when: false
  changed_when: false
|
||||||
|
|
||||||
|
# Renders CEPH health and OSD counts from the JSON captured in status_ceph.
#
# - `rc` falls back to 1 so the "not reachable" branch is taken when the
#   command task was skipped (e.g. check mode) and registered no return code.
# - The JSON document is parsed once into `ceph` instead of once per field.
- name: "Status | CEPH summary"
  ansible.builtin.debug:
    msg: >-
      ┌─ CEPH STATUS ───────────────────────────────
      {% if (status_ceph.rc | default(1)) == 0 %}
      {% set ceph = status_ceph.stdout | from_json %}
      │ Health: {{ ceph.health.status }}
      │ OSDs: {{ ceph.osdmap.num_osds }} total,
      {{ ceph.osdmap.num_up_osds }} up,
      {{ ceph.osdmap.num_in_osds }} in
      {% else %}
      │ CEPH not configured or not reachable.
      {% endif %}
      └────────────────────────────────────────────
  run_once: true
  when: status_include_ceph
|
||||||
|
|
||||||
|
# ── HA status ─────────────────────────────────────────────────────────────────
|
||||||
|
# Read-only probe: captures the output of `ha-manager status` in status_ha.
# A non-zero exit is tolerated (failed_when: false) and handled by the
# summary task.
- name: "Status | HA status"
  when: status_include_ha
  run_once: true
  ansible.builtin.command: ha-manager status
  register: status_ha
  failed_when: false
  changed_when: false
|
||||||
|
|
||||||
|
# Prints the captured `ha-manager status` output, or a fallback line when the
# command failed or produced nothing.
#
# `rc`, `stdout` and `stdout_lines` are given defaults so the fallback branch
# is used (instead of an undefined-variable error) when the command task was
# skipped, e.g. in check mode.
- name: "Status | HA summary"
  ansible.builtin.debug:
    msg: >-
      ┌─ HA STATUS ─────────────────────────────────
      {% if (status_ha.rc | default(1)) == 0 and (status_ha.stdout | default('')) != '' %}
      {{ status_ha.stdout_lines | default([]) | join('\n ') }}
      {% else %}
      │ HA not configured.
      {% endif %}
      └────────────────────────────────────────────
  run_once: true
  when: status_include_ha
|
||||||
|
|
||||||
|
# ── PVE versions ─────────────────────────────────────────────────────────────
|
||||||
|
# Refreshes the package index quietly, then counts the "Inst " lines of a
# simulated dist-upgrade. The count is the command's stdout and is kept in
# status_updates_available. Never reported as changed.
- name: "Status | Check for available updates on each node"
  changed_when: false
  register: status_updates_available
  ansible.builtin.shell: |
    apt-get -q update > /dev/null 2>&1
    apt-get -s dist-upgrade 2>/dev/null | grep "^Inst " | wc -l
|
||||||
|
|
||||||
|
# One line per host: pending package count from the previous task plus the
# PVE version taken from the ansible_local.pve_version local fact
# ('unknown' when that fact is absent).
- name: "Status | Update availability per node"
  ansible.builtin.debug:
    msg: >-
      {{ inventory_hostname }}: {{ status_updates_available.stdout | trim }} package(s) available for upgrade
      (PVE {{ ansible_local.pve_version | default('unknown') }})
|
||||||
21
roles/proxmox_upgrade_node/defaults/main.yml
Normal file
21
roles/proxmox_upgrade_node/defaults/main.yml
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
---
# -----------------------------------------------------------------------------
# Default variables for the proxmox_upgrade_node role
# -----------------------------------------------------------------------------

# --- Reboot behaviour --------------------------------------------------------
# Reboot if /var/run/reboot-required exists.
upgrade_reboot_if_required: true
# Reboot even if not required.
upgrade_reboot_force: false
# Seconds to wait for the node to come back.
upgrade_reboot_timeout: 600
# Seconds to wait for cluster rejoin.
upgrade_node_rejoin_timeout: 300
# Retry count and delay used while polling for the cluster rejoin.
upgrade_node_rejoin_retries: 30
upgrade_node_rejoin_delay: 10

# --- apt options -------------------------------------------------------------
upgrade_apt_update_cache: true
upgrade_apt_autoremove: true
upgrade_apt_cache_valid_time: 3600

# --- API connection (inherited from inventory) -------------------------------
api_port: 8006
validate_certs: false
|
||||||
11
roles/proxmox_upgrade_node/meta/main.yml
Normal file
11
roles/proxmox_upgrade_node/meta/main.yml
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
---
# Ansible Galaxy metadata for the proxmox_upgrade_node role.
galaxy_info:
  role_name: proxmox_upgrade_node
  author: ansible-msp
  description: "MSP Proxmox automation — proxmox_upgrade_node"
  min_ansible_version: "2.15"
  platforms:
    - name: Debian
      versions:
        - bookworm

# No role dependencies.
dependencies: []
|
||||||
85
roles/proxmox_upgrade_node/tasks/main.yml
Normal file
85
roles/proxmox_upgrade_node/tasks/main.yml
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
---
|
||||||
|
# =============================================================================
|
||||||
|
# proxmox_upgrade_node — tasks
|
||||||
|
# Runs apt dist-upgrade on a single node, reboots if required,
|
||||||
|
# and waits for the node to rejoin the cluster.
|
||||||
|
#
|
||||||
|
# Required vars:
|
||||||
|
# current_node — the node being upgraded (used for logging)
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Refreshes the apt package index, subject to upgrade_apt_cache_valid_time.
# Never reported as changed.
- name: "Upgrade | {{ current_node }} | apt-get update"
  changed_when: false
  ansible.builtin.apt:
    cache_valid_time: "{{ upgrade_apt_cache_valid_time }}"
    update_cache: "{{ upgrade_apt_update_cache }}"
|
||||||
|
|
||||||
|
# Runs a dist-upgrade with autoclean, and autoremove as configured.
# The module result is kept in upgrade_apt_result for the logging and
# summary tasks.
- name: "Upgrade | {{ current_node }} | apt dist-upgrade"
  register: upgrade_apt_result
  ansible.builtin.apt:
    upgrade: dist
    autoclean: true
    autoremove: "{{ upgrade_apt_autoremove }}"
|
||||||
|
|
||||||
|
# Shows the final line of the apt output.
# `default([...], true)` substitutes the placeholder both when stdout_lines
# is missing and when it is an empty list, so `last` always receives a
# non-empty list and never operates on an undefined value.
- name: "Upgrade | {{ current_node }} | Log upgraded packages"
  ansible.builtin.debug:
    msg: "{{ upgrade_apt_result.stdout_lines | default(['No output'], true) | last }}"
|
||||||
|
|
||||||
|
# Reboot decision.
#
# upgrade_needs_reboot is true only when a reboot will actually be performed:
#   upgrade_reboot_if_required AND (reboot-required file exists OR
#   upgrade_reboot_force)
# This is the same condition under which the reboot task has always run.
# Folding upgrade_reboot_if_required into the fact keeps every later task
# that reads it (rejoin wait, "(after reboot)" / "(rebooted)" messages) from
# reporting a reboot that was suppressed.
- name: "Upgrade | {{ current_node }} | Check if reboot required"
  ansible.builtin.stat:
    path: /var/run/reboot-required
  register: upgrade_reboot_required_file

- name: "Upgrade | {{ current_node }} | Set reboot needed fact"
  ansible.builtin.set_fact:
    # `| bool` on the toggles so string values passed with -e are honoured.
    upgrade_needs_reboot: >-
      {{ (upgrade_reboot_if_required | bool)
         and (upgrade_reboot_required_file.stat.exists
              or (upgrade_reboot_force | bool)) }}

- name: "Upgrade | {{ current_node }} | Reboot node"
  ansible.builtin.reboot:
    reboot_timeout: "{{ upgrade_reboot_timeout }}"
    msg: "Ansible controlled reboot for Proxmox upgrade"
    pre_reboot_delay: 5
    post_reboot_delay: 15
  when: upgrade_needs_reboot | bool

- name: "Upgrade | {{ current_node }} | Skip reboot (not required)"
  ansible.builtin.debug:
    # Distinguishes "nothing pending" from "pending but reboots disabled".
    msg: >-
      {{ ('Reboot pending on ' ~ current_node
          ~ ' but upgrade_reboot_if_required is false — skipping.')
         if upgrade_reboot_required_file.stat.exists
         else ('No reboot required on ' ~ current_node ~ ' — skipping.') }}
  when: not (upgrade_needs_reboot | bool)
|
||||||
|
|
||||||
|
# ── Wait for cluster rejoin ───────────────────────────────────────────────────
|
||||||
|
# Polls the Proxmox API from the controller until current_node is listed as
# online, trying upgrade_node_rejoin_retries times with
# upgrade_node_rejoin_delay seconds between attempts.
#
# `proxmox_nodes` is defaulted to an empty list inside `until`: while the API
# is still unreachable the module result has no such key, and an undefined
# reference there would abort the task instead of triggering another retry.
- name: "Upgrade | {{ current_node }} | Wait for node to rejoin cluster"
  community.proxmox.proxmox_node_info:
    api_host: "{{ api_host }}"
    api_user: "{{ api_user }}"
    api_token_id: "{{ api_token_id }}"
    api_token_secret: "{{ api_token_secret }}"
    api_port: "{{ api_port }}"
    validate_certs: "{{ validate_certs }}"
  register: upgrade_rejoin_check
  delegate_to: localhost
  until: >-
    upgrade_rejoin_check.proxmox_nodes | default([])
    | selectattr('node', 'equalto', current_node)
    | selectattr('status', 'equalto', 'online')
    | list
    | length > 0
  retries: "{{ upgrade_node_rejoin_retries }}"
  delay: "{{ upgrade_node_rejoin_delay }}"
  when: upgrade_needs_reboot | bool
|
||||||
|
|
||||||
|
# Status line stating that the node is in the cluster again; the suffix
# depends on upgrade_needs_reboot.
- name: "Upgrade | {{ current_node }} | Node back online"
  ansible.builtin.debug:
    msg: >-
      ✓ Node {{ current_node }} has rejoined the cluster
      {{ '(after reboot)' if upgrade_needs_reboot else '(no reboot needed)' }}.
|
||||||
|
|
||||||
|
# Final one-line summary: whether the apt task reported a change and whether
# upgrade_needs_reboot was set.
- name: "Upgrade | {{ current_node }} | Complete"
  ansible.builtin.debug:
    msg: >-
      ━━━ Upgrade complete: {{ current_node }}
      {% if upgrade_apt_result.changed %}(packages updated){% else %}(already up to date){% endif %}
      {% if upgrade_needs_reboot %}(rebooted){% else %}(no reboot){% endif %} ━━━
|
||||||
Reference in New Issue
Block a user