---
# =============================================================================
# proxmox_ceph — tasks
# Manages the CEPH noout flag and health checks during maintenance.
# Skips gracefully if CEPH is not configured on this cluster.
# =============================================================================

# ── Detect CEPH ───────────────────────────────────────────────────────────────
# Probe for CEPH by running `ceph status` once. The result is registered as
# `ceph_detect`; a non-zero exit is tolerated (failed_when: false) so that a
# cluster without CEPH does not fail the play.
- name: "CEPH | Detect if CEPH is configured"
  ansible.builtin.command:
    cmd: ceph status
  register: ceph_detect
  changed_when: false
  failed_when: false
  # Read-only probe: force it to run under --check as well. Otherwise the
  # command module skips execution and the registered result says nothing
  # about whether CEPH is actually present.
  check_mode: false
  run_once: true

# Turn the probe result into the `ceph_is_enabled` flag: true only when
# `ceph status` exited with 0. A probe result that carries no `rc` at all is
# treated as "not enabled" instead of raising an undefined-attribute error.
- name: "CEPH | Set CEPH enabled fact"
  ansible.builtin.set_fact:
    ceph_is_enabled: "{{ (ceph_detect.rc | default(1)) == 0 }}"
  run_once: true

# Informational message, printed once, when `ceph_is_enabled` is false.
- name: 'CEPH | Skip — CEPH not configured on this cluster'
  when:
    - not ceph_is_enabled
  run_once: true
  ansible.builtin.debug:
    msg: >-
      CEPH is not configured on this cluster — skipping all CEPH tasks.

# ── CEPH status ───────────────────────────────────────────────────────────────
# Fetch the cluster status as JSON and register it as `ceph_status_raw`.
- name: "CEPH | Get cluster status"
  ansible.builtin.command:
    cmd: ceph status --format json
  register: ceph_status_raw
  changed_when: false
  # Read-only query: run it under --check too, so `stdout` holds real JSON
  # rather than the empty output of a skipped command.
  check_mode: false
  run_once: true
  when: ceph_is_enabled

# Decode the JSON text in `ceph_status_raw.stdout` into the `ceph_status` fact.
- name: 'CEPH | Parse status'
  when:
    - ceph_is_enabled
  run_once: true
  ansible.builtin.set_fact:
    ceph_status: "{{ ceph_status_raw.stdout | from_json }}"

# Print the parsed health status; only for the `status` action.
- name: 'CEPH | Log health'
  when: ceph_is_enabled and ceph_action == 'status'
  run_once: true
  ansible.builtin.debug:
    msg: "CEPH health: {{ ceph_status.health.status }}"

# ── Health check ──────────────────────────────────────────────────────────────
# Fail the play when the parsed status reports HEALTH_ERR, unless the guard
# has been switched off through `ceph_abort_on_error`.
- name: "CEPH | Check health | Abort if HEALTH_ERR"
  ansible.builtin.fail:
    msg: >-
      CEPH is in HEALTH_ERR state — aborting to prevent data loss.
      Run 'ceph status' to investigate. Set ceph_abort_on_error=false to override.
  when:
    - ceph_is_enabled
    # `| bool` makes a string override (e.g. `-e ceph_abort_on_error=false`,
    # which arrives as the non-empty string "false") behave as a real boolean.
    - ceph_abort_on_error | bool
    - ceph_status.health.status == 'HEALTH_ERR'
  run_once: true

# Print a warning, including the names of the active health checks, when the
# parsed status reports HEALTH_WARN. The play continues after this message.
- name: "CEPH | Check health | Warn on HEALTH_WARN"
  ansible.builtin.debug:
    msg: >-
      WARNING: CEPH is in HEALTH_WARN state.
      Proceeding — set ceph_warn_on_warning=false to suppress this message.
      Checks: {{ ceph_status.health.checks | default({}) | dict2items | map(attribute='key') | list }}
  when:
    - ceph_is_enabled
    # `| bool` makes a string override (e.g. `-e ceph_warn_on_warning=false`,
    # which arrives as the non-empty string "false") behave as a real boolean.
    - ceph_warn_on_warning | bool
    - ceph_status.health.status == 'HEALTH_WARN'
  run_once: true

# ── Set noout ─────────────────────────────────────────────────────────────────
# Run `ceph osd set noout` for the `set_noout` action. The task is always
# reported as changed (changed_when: true).
- name: 'CEPH | Set noout flag'
  when: ceph_is_enabled and ceph_action == 'set_noout'
  run_once: true
  ansible.builtin.command:
    cmd: ceph osd set noout
  changed_when: true

# Confirmation message printed for the `set_noout` action.
- name: 'CEPH | Confirm noout set'
  run_once: true
  when: ceph_is_enabled and ceph_action == 'set_noout'
  ansible.builtin.debug:
    msg: >-
      ✓ CEPH noout flag SET — OSDs will not be marked out during maintenance.

# ── Clear noout ───────────────────────────────────────────────────────────────
# Run `ceph osd unset noout` for the `clear_noout` action. The task is always
# reported as changed (changed_when: true).
- name: 'CEPH | Clear noout flag'
  when: ceph_is_enabled and ceph_action == 'clear_noout'
  run_once: true
  ansible.builtin.command:
    cmd: ceph osd unset noout
  changed_when: true

# Poll `ceph status` until the reported health is HEALTH_OK or HEALTH_WARN
# (the condition accepts HEALTH_WARN as well, despite the task name), retrying
# `ceph_health_retries` times with `ceph_health_delay` seconds in between.
- name: "CEPH | Wait for HEALTH_OK after clearing noout"
  ansible.builtin.command:
    cmd: ceph status --format json
  register: ceph_recovery_check
  changed_when: false
  # Read-only poll: run it under --check too so there is real JSON to evaluate.
  check_mode: false
  until:
    # Only parse stdout when the command succeeded. A failed attempt produces
    # no JSON, and parsing it would abort the retry loop with a template error
    # instead of simply trying again.
    - "(ceph_recovery_check.rc | default(1)) == 0"
    - "(ceph_recovery_check.stdout | from_json).health.status in ['HEALTH_OK', 'HEALTH_WARN']"
  retries: "{{ ceph_health_retries }}"
  delay: "{{ ceph_health_delay }}"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'clear_noout'

# Report the health status taken from the last `ceph_recovery_check` poll.
- name: 'CEPH | Log recovery status'
  run_once: true
  when: ceph_is_enabled and ceph_action == 'clear_noout'
  ansible.builtin.debug:
    msg: >-
      ✓ CEPH noout CLEARED —
      health: {{ (ceph_recovery_check.stdout | from_json).health.status }}

# ── check_health action ───────────────────────────────────────────────────────
# Poll `ceph status` until the reported health is HEALTH_OK or HEALTH_WARN,
# retrying `ceph_health_retries` times with `ceph_health_delay` seconds in
# between.
- name: "CEPH | Wait for healthy state"
  ansible.builtin.command:
    cmd: ceph status --format json
  register: ceph_health_wait
  changed_when: false
  # Read-only poll: run it under --check too so there is real JSON to evaluate.
  check_mode: false
  until:
    # Only parse stdout when the command succeeded. A failed attempt produces
    # no JSON, and parsing it would abort the retry loop with a template error
    # instead of simply trying again.
    - "(ceph_health_wait.rc | default(1)) == 0"
    - "(ceph_health_wait.stdout | from_json).health.status in ['HEALTH_OK', 'HEALTH_WARN']"
  retries: "{{ ceph_health_retries }}"
  delay: "{{ ceph_health_delay }}"
  run_once: true
  when:
    - ceph_is_enabled
    - ceph_action == 'check_health'

# Print the health status taken from the last `ceph_health_wait` poll.
- name: 'CEPH | Health check result'
  run_once: true
  when: ceph_is_enabled and ceph_action == 'check_health'
  ansible.builtin.debug:
    msg: "CEPH health: {{ (ceph_health_wait.stdout | from_json).health.status }}"