testing new proxmox logic

This commit is contained in:
Semaphore
2026-03-15 15:48:59 -07:00
parent 347a85b09d
commit ea2f00c098
34 changed files with 2391 additions and 21 deletions

View File

@@ -0,0 +1,18 @@
---
# =============================================================================
# proxmox_ceph — defaults
# =============================================================================

# Operation the role performs. One of:
#   set_noout | clear_noout | status | check_health
ceph_action: status

# ── Health check settings ────────────────────────────────────────────────────
# Seconds to wait for HEALTH_OK.
# NOTE(review): the role tasks reviewed alongside this file poll using
# ceph_health_retries and ceph_health_delay only — confirm where this value
# is consumed.
ceph_health_timeout: 300
# Number of polling attempts, and the delay between two attempts.
ceph_health_retries: 30
ceph_health_delay: 10

# ── Health policy ────────────────────────────────────────────────────────────
# When true, abort the upgrade if CEPH is in an error state.
ceph_abort_on_error: true
# When true, warn (but continue) if CEPH reports warnings.
ceph_warn_on_warning: true

View File

@@ -0,0 +1,11 @@
---
# Role metadata for proxmox_ceph.
galaxy_info:
  role_name: proxmox_ceph
  author: ansible-msp
  description: 'MSP Proxmox automation — proxmox_ceph'
  # Quoted so the version stays a string instead of being read as a float.
  min_ansible_version: '2.15'
  platforms:
    - name: Debian
      versions:
        - bookworm

# This role declares no role dependencies.
dependencies: []

View File

@@ -0,0 +1,140 @@
---
# =============================================================================
# proxmox_ceph — tasks
# Manages CEPH noout flag and health checks during maintenance.
# Skips gracefully if CEPH is not configured on this cluster.
# =============================================================================

# ── Detect CEPH ───────────────────────────────────────────────────────────────
# Probe with `ceph status`; exit code 0 is taken as "CEPH is configured".
- name: "CEPH | Detect if CEPH is configured"
  ansible.builtin.command:
    cmd: ceph status
  register: ceph_detect
  changed_when: false
  # A non-zero exit is a valid answer here (CEPH treated as absent), not a
  # task failure.
  failed_when: false
  # Read-only probe: run it under --check as well. Otherwise the command
  # module is skipped in check mode and ceph_detect carries no rc.
  check_mode: false
  run_once: true

- name: "CEPH | Set CEPH enabled fact"
  ansible.builtin.set_fact:
    # default(1): a result without rc is treated as "not configured".
    ceph_is_enabled: "{{ (ceph_detect.rc | default(1)) == 0 }}"
  run_once: true

- name: "CEPH | Skip — CEPH not configured on this cluster"
  ansible.builtin.debug:
    msg: "CEPH is not configured on this cluster — skipping all CEPH tasks."
  when: not ceph_is_enabled
  run_once: true
# ── CEPH status ───────────────────────────────────────────────────────────────
# Fetch the cluster status as JSON and expose it as the `ceph_status` fact,
# which the health-check tasks read.
- name: "CEPH | Get cluster status"
  ansible.builtin.command:
    cmd: ceph status --format json
  register: ceph_status_raw
  changed_when: false
  # Read-only query: run it under --check too, so that stdout exists for the
  # parse task below instead of the module being skipped.
  check_mode: false
  run_once: true
  when: ceph_is_enabled

- name: "CEPH | Parse status"
  ansible.builtin.set_fact:
    ceph_status: "{{ ceph_status_raw.stdout | from_json }}"
  run_once: true
  when: ceph_is_enabled

# Only printed for the plain `status` action.
- name: "CEPH | Log health"
  ansible.builtin.debug:
    msg: "CEPH health: {{ ceph_status.health.status }}"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'status'
# ── Health check ──────────────────────────────────────────────────────────────
# Evaluated for every ceph_action, using the parsed `ceph_status` fact.
# The `when` items are checked in order, so `ceph_status` is only read once
# ceph_is_enabled has passed.
- name: "CEPH | Check health | Abort if HEALTH_ERR"
  ansible.builtin.fail:
    msg: >-
      CEPH is in HEALTH_ERR state — aborting to prevent data loss.
      Run 'ceph status' to investigate. Set ceph_abort_on_error=false to override.
  when:
    - ceph_is_enabled
    # `| bool`: a value passed as `-e ceph_abort_on_error=false` arrives as
    # the string "false", which is truthy unless cast explicitly.
    - ceph_abort_on_error | bool
    - ceph_status.health.status == 'HEALTH_ERR'
  run_once: true

- name: "CEPH | Check health | Warn on HEALTH_WARN"
  ansible.builtin.debug:
    msg: >-
      WARNING: CEPH is in HEALTH_WARN state.
      Proceeding — set ceph_warn_on_warning=false to suppress this message.
      Checks: {{ ceph_status.health.checks | default({}) | dict2items | map(attribute='key') | list }}
  when:
    - ceph_is_enabled
    # Same string-vs-boolean cast as for ceph_abort_on_error above.
    - ceph_warn_on_warning | bool
    - ceph_status.health.status == 'HEALTH_WARN'
  run_once: true
# ── Set noout ─────────────────────────────────────────────────────────────────
# Runs only for ceph_action == 'set_noout' on a CEPH-enabled cluster.
- name: "CEPH | Set noout flag"
  ansible.builtin.command:
    cmd: ceph osd set noout
  # Always reported as changed; the command output is not inspected.
  changed_when: true
  when: ceph_is_enabled and ceph_action == 'set_noout'
  run_once: true

- name: "CEPH | Confirm noout set"
  ansible.builtin.debug:
    msg: "✓ CEPH noout flag SET — OSDs will not be marked out during maintenance."
  when: ceph_is_enabled and ceph_action == 'set_noout'
  run_once: true
# ── Clear noout ───────────────────────────────────────────────────────────────
# Runs only for ceph_action == 'clear_noout' on a CEPH-enabled cluster:
# unset the flag, poll the cluster status, then log the final health.
- name: "CEPH | Clear noout flag"
  ansible.builtin.command:
    cmd: ceph osd unset noout
  # Always reported as changed; the command output is not inspected.
  changed_when: true
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'clear_noout'

# Polls up to ceph_health_retries times, ceph_health_delay apart.
# HEALTH_WARN is accepted as well as HEALTH_OK.
- name: "CEPH | Wait for HEALTH_OK after clearing noout"
  ansible.builtin.command:
    cmd: ceph status --format json
  register: ceph_recovery_check
  changed_when: false
  # Read-only poll: run it under --check too so stdout is available.
  check_mode: false
  # Test rc first: a failed `ceph status` has no JSON on stdout, and parsing
  # it would make the condition itself error out instead of retrying.
  until: >-
    (ceph_recovery_check.rc | default(1)) == 0
    and (ceph_recovery_check.stdout | from_json).health.status
    in ['HEALTH_OK', 'HEALTH_WARN']
  retries: "{{ ceph_health_retries }}"
  delay: "{{ ceph_health_delay }}"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'clear_noout'

- name: "CEPH | Log recovery status"
  ansible.builtin.debug:
    msg: >-
      ✓ CEPH noout CLEARED —
      health: {{ (ceph_recovery_check.stdout | from_json).health.status }}
  when:
    - ceph_is_enabled
    - ceph_action == 'clear_noout'
  run_once: true
# ── check_health action ───────────────────────────────────────────────────────
# Runs only for ceph_action == 'check_health' on a CEPH-enabled cluster.
# Polls up to ceph_health_retries times, ceph_health_delay apart.
# HEALTH_WARN is accepted as well as HEALTH_OK.
- name: "CEPH | Wait for healthy state"
  ansible.builtin.command:
    cmd: ceph status --format json
  register: ceph_health_wait
  changed_when: false
  # Read-only poll: run it under --check too so stdout is available.
  check_mode: false
  # Test rc first: a failed `ceph status` has no JSON on stdout, and parsing
  # it would make the condition itself error out instead of retrying.
  until: >-
    (ceph_health_wait.rc | default(1)) == 0
    and (ceph_health_wait.stdout | from_json).health.status
    in ['HEALTH_OK', 'HEALTH_WARN']
  retries: "{{ ceph_health_retries }}"
  delay: "{{ ceph_health_delay }}"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'check_health'

- name: "CEPH | Health check result"
  ansible.builtin.debug:
    msg: "CEPH health: {{ (ceph_health_wait.stdout | from_json).health.status }}"
  when:
    - ceph_is_enabled
    - ceph_action == 'check_health'
  run_once: true