#!/usr/bin/env bash
#
# Ceph CRUSH Root Usage Monitor (version 5.0)
#
# Discovers every distinct CRUSH "take" root referenced by the cluster's CRUSH
# rules and processes each root exactly once (several rules sharing one root
# no longer produce duplicate output).
#   - Metrics go to the node_exporter textfile collector directory.
#   - A human-readable report is appended to a per-day file in the reports
#     directory for administrators to review.
#
# Requirements: `ceph` and `python3` on PATH (no jq needed).
#
# Exit codes: 2 ceph missing, 3 python3 missing, 4/5/6 failed to fetch
#             rule dump / crush tree / osd df, 7 no take root discovered.
set -euo pipefail

# --------------------- Configuration ---------------------
TEXTFILE_DIR="/home/monitor/node_exporter_9101/textfile_collector"
REPORT_DIR="/home/monitor/node_exporter_9101/reports"
REPORT_FILE="${REPORT_DIR}/crush_usage_$(date +%Y%m%d).log"
TEXTFILE_OUT="${TEXTFILE_DIR}/ceph_crush_root_usage.prom"
readonly TEXTFILE_DIR REPORT_DIR REPORT_FILE TEXTFILE_OUT
# Deliberately NOT readonly: it is passed to Python with a `VAR=value cmd`
# prefix assignment, which bash rejects for readonly variables.
TEXTFILE_TMP="${TEXTFILE_DIR}/.ceph_crush_root_usage.prom.tmp"

# Filled in by main(); kept global so the EXIT trap can see them.
CEPH_BIN=""
PYTHON_BIN=""
RULE_JSON_FILE=""
TREE_JSON_FILE=""
DF_JSON_FILE=""

# --------------------- Logging helpers ---------------------

# Print the current local time as "YYYY-MM-DD HH:MM:SS".
timestamp() {
  date '+%Y-%m-%d %H:%M:%S'
}

# Write a timestamped message to stdout and append it to the report file.
log() {
  local msg
  msg="[$(timestamp)] $*"
  printf '%s\n' "$msg"
  printf '%s\n' "$msg" >>"$REPORT_FILE"
}

# Write a timestamped ERROR message to stderr and append it to the report file.
log_error() {
  local msg
  msg="[$(timestamp)] ERROR: $*"
  printf '%s\n' "$msg" >&2
  printf '%s\n' "$msg" >>"$REPORT_FILE"
}

# Copy stdin to stdout while appending it to the report file.
tee_report() {
  tee -a "$REPORT_FILE"
}

# --------------------- Temp file management ---------------------

# Remove the JSON scratch files and any unpublished metrics staging file.
# After a successful run the staging file has already been renamed, so the
# extra rm is a no-op; after a failed run it prevents a stale file lingering.
cleanup() {
  rm -f -- "$RULE_JSON_FILE" "$TREE_JSON_FILE" "$DF_JSON_FILE" "$TEXTFILE_TMP" \
    2>/dev/null || true
}

# --------------------- Report header ---------------------

# Print the report banner (blank line, rule, title, generation time, rule,
# blank line) to stdout.
print_report_header() {
  local rule
  rule=$(printf '=%.0s' {1..70})
  printf '\n%s\n%s\n%s\n%s\n\n' \
    "$rule" \
    "Ceph CRUSH Root OSD 使用率报告" \
    "生成时间: $(timestamp)" \
    "$rule"
}

# --------------------- Ceph data access ---------------------

# Run a ceph subcommand with JSON output and store the result in a file.
# Arguments: $1 - output file; remaining args - the ceph subcommand.
# Returns:   ceph's exit status.
ceph_dump_json() {
  local out_file=$1
  shift
  # ceph's stderr is intentionally discarded (as in the original script);
  # the caller reports failures through log_error.
  "$CEPH_BIN" "$@" --format json 2>/dev/null >"$out_file"
}

# Print each distinct take-root name found in a CRUSH rule dump, one per line,
# in first-seen order. Empty names are skipped.
# Arguments: $1 - path to the `ceph osd crush rule dump` JSON file.
discover_take_roots() {
  "$PYTHON_BIN" - "$1" <<'EOF'
import json
import sys

with open(sys.argv[1], "rb") as f:
    content = f.read()
# Tolerate a UTF-8 byte-order mark at the start of the file.
if content.startswith(b"\xef\xbb\xbf"):
    content = content[3:]
rules = json.loads(content.decode("utf-8", errors="replace"))

seen = set()
for r in rules:
    for step in r.get("steps", []):
        if step.get("op") == "take":
            root = (step.get("item_name") or "").strip()
            if root and root not in seen:
                seen.add(root)
                print(root)
            # Only the first "take" step of each rule is considered.
            break
EOF
}

# Print the per-OSD usage report for one take root to stdout and, when the
# TEXTFILE_TMP environment variable is set, append that root's max-usage
# metric line to the file it names.
# Arguments: $1 - take root name
#            $2 - path to the `ceph osd crush tree` JSON file
#            $3 - path to the `ceph osd df` JSON file
# Returns:   the Python interpreter's exit status (0 also when the root is
#            skipped because it is missing or has no usable OSD data).
# NOTE(review): rules that take a device-class shadow root (e.g. a name like
# "default~ssd") may not be present in plain `crush tree` output and would be
# reported as skipped - confirm against the target Ceph release.
report_take_root() {
  TEXTFILE_TMP="$TEXTFILE_TMP" \
    "$PYTHON_BIN" - "$1" "$2" "$3" <<'PYTHON_SCRIPT'
import json
import os
import sys
from typing import Dict, List


def die(msg: str, code: int = 1) -> None:
    """Print msg to stderr and exit with the given status."""
    print(msg, file=sys.stderr)
    sys.exit(code)


def load_json(path: str):
    """Load a JSON file, tolerating a UTF-8 BOM; exit with status 4 on failure."""
    try:
        with open(path, "rb") as f:
            content = f.read()
        if content.startswith(b"\xef\xbb\xbf"):
            content = content[3:]
        return json.loads(content.decode("utf-8", errors="replace"))
    except Exception as e:
        die(f"无法解析 JSON 文件 ({path}): {e}", 4)


def escape_label(value) -> str:
    """Escape a value for use inside a Prometheus label ("...")."""
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


if len(sys.argv) != 4:
    die(f"参数错误: 期望 3 个参数实际收到 {len(sys.argv)-1}", 1)

root_name, tree_path, df_path = sys.argv[1], sys.argv[2], sys.argv[3]
tree = load_json(tree_path)
df = load_json(df_path)

# --------------------- Locate the root in the CRUSH tree ---------------------
nodes = tree.get("nodes", [])
name_to_id = {n.get("name"): n.get("id") for n in nodes if "name" in n and "id" in n}
root_id = name_to_id.get(root_name)
if root_id is None:
    # Not expected for a root taken by a rule, but handled defensively.
    print(f"[跳过] root {root_name} 在 CRUSH tree 中不存在已跳过。")
    sys.exit(0)

id_to_node = {n.get("id"): n for n in nodes if "id" in n}


def get_osd_descendants(node_id: int) -> List[int]:
    """Return the ids of all OSD nodes reachable from node_id (pre-order).

    Iterative walk with a visited set: no recursion limit to raise, and an
    OSD reachable through more than one path is counted once.
    """
    found: List[int] = []
    visited = set()
    stack = [node_id]
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)
        node = id_to_node.get(current)
        if not node:
            continue
        if node.get("type") == "osd":
            found.append(current)
            continue
        # Reversed so children are visited in their listed order.
        stack.extend(reversed(node.get("children") or []))
    return found


osd_ids = get_osd_descendants(root_id)
if not osd_ids:
    print(f"[跳过] root {root_name} 下没有 OSD空桶已跳过。")
    sys.exit(0)

# --------------------- Collect per-OSD usage ---------------------
df_nodes = df.get("nodes", [])
df_by_id = {n.get("id"): n for n in df_nodes if isinstance(n, dict) and "id" in n}

osd_info: List[Dict] = []
missing_osds = []
for osd_id in osd_ids:
    osd_node = df_by_id.get(osd_id)
    if not osd_node:
        missing_osds.append(osd_id)
        continue
    kb = osd_node.get("kb", 0) or 0
    kb_used = osd_node.get("kb_used", 0) or 0
    name = osd_node.get("name", f"osd.{osd_id}")
    # An OSD reporting zero capacity counts as 0% instead of dividing by zero.
    usage_pct = (float(kb_used) / float(kb) * 100.0) if kb > 0 else 0.0
    osd_info.append({
        "id": osd_id,
        "name": name,
        "kb": int(kb),
        "kb_used": int(kb_used),
        "usage_pct": usage_pct,
    })

if missing_osds:
    print(f"警告: 以下 OSD 在 ceph osd df 中未找到: {missing_osds}", file=sys.stderr)

if not osd_info:
    print(f"[跳过] root {root_name} 下无可用 OSD 数据已跳过。")
    sys.exit(0)

# --------------------- Statistics ---------------------
max_osd = max(osd_info, key=lambda x: x["usage_pct"])
min_osd = min(osd_info, key=lambda x: x["usage_pct"])
avg_usage = sum(x["usage_pct"] for x in osd_info) / len(osd_info)
total_kb = sum(x["kb"] for x in osd_info)
total_kb_used = sum(x["kb_used"] for x in osd_info)
total_usage_pct = (float(total_kb_used) / float(total_kb) * 100.0) if total_kb > 0 else 0.0

# --------------------- Terminal / report output ---------------------
print(f"\n{'='*70}")
print(f"Take Root : {root_name}")
print(f"OSD Count : {len(osd_info)}")
print(f"{'='*70}\n")
print(f"{'OSD_ID':<8} {'OSD_NAME':<15} {'CAPACITY_KB':>15} {'USED_KB':>20} {'%USED':>10}")
print(f"{'-'*70}")
for osd in sorted(osd_info, key=lambda x: x["id"]):
    print(f"{osd['id']:<8} {osd['name']:<15} {osd['kb']:>15,} {osd['kb_used']:>20,} {osd['usage_pct']:>9.2f}%")
print(f"\n{'='*70}")
print("统计摘要:")
print(f" 总容量 : {total_kb:,} KB ({total_kb/1024/1024:.2f} GB)")
print(f" 已使用 : {total_kb_used:,} KB ({total_kb_used/1024/1024:.2f} GB)")
print(f" 总使用率 : {total_usage_pct:.2f}%")
print(f" 平均使用率 : {avg_usage:.2f}%")
print(f" 最大使用率 : {max_osd['usage_pct']:.2f}% (OSD {max_osd['id']} / {max_osd['name']})")
print(f" 最小使用率 : {min_osd['usage_pct']:.2f}% (OSD {min_osd['id']} / {min_osd['name']})")
print(f"{'='*70}\n")

# --------------------- Textfile collector (max usage only) ---------------------
textfile_tmp = os.environ.get("TEXTFILE_TMP")
if textfile_tmp:
    root_label = escape_label(root_name)
    osd_label = escape_label(max_osd["name"])
    line = (
        "ceph_crush_root_osd_max_usage_percent{"
        f'take_root="{root_label}",'
        f'osd_name="{osd_label}"'
        f"}} {max_osd['usage_pct']:.2f}\n"
    )
    with open(textfile_tmp, "a") as f:
        f.write(line)
    print(f"✓ 指标已写入 textfile collector: {textfile_tmp}")
else:
    print("✗ 环境变量 TEXTFILE_TMP 未设置跳过 textfile 写入", file=sys.stderr)
PYTHON_SCRIPT
}

# --------------------- Entry point ---------------------

main() {
  local -a unique_roots=()
  local root_name

  # The report directory must exist before the first log/log_error call.
  mkdir -p -- "$TEXTFILE_DIR" "$REPORT_DIR"

  # ----- Environment checks -----
  CEPH_BIN=$(command -v ceph || true)
  PYTHON_BIN=$(command -v python3 || true)
  if [[ -z "${CEPH_BIN:-}" ]]; then
    log_error "未找到 ceph 命令请确保 Ceph 已正确安装"
    exit 2
  fi
  if [[ -z "${PYTHON_BIN:-}" ]]; then
    log_error "未找到 python3请安装 Python 3"
    exit 3
  fi

  # ----- Scratch files -----
  RULE_JSON_FILE=$(mktemp /tmp/ceph_rule_json.XXXXXX)
  TREE_JSON_FILE=$(mktemp /tmp/ceph_tree_json.XXXXXX)
  DF_JSON_FILE=$(mktemp /tmp/ceph_df_json.XXXXXX)
  trap cleanup EXIT

  print_report_header | tee_report

  # ----- Fetch cluster data -----
  log "开始获取 Ceph 集群数据..."
  if ! ceph_dump_json "$RULE_JSON_FILE" osd crush rule dump; then
    log_error "获取 CRUSH rule 配置失败"
    exit 4
  fi
  if ! ceph_dump_json "$TREE_JSON_FILE" osd crush tree; then
    log_error "获取 CRUSH tree 失败"
    exit 5
  fi
  if ! ceph_dump_json "$DF_JSON_FILE" osd df; then
    log_error "获取 OSD df 信息失败"
    exit 6
  fi
  log "Ceph 数据获取完成"

  # ----- Discover unique take roots (deduplicated, order preserved) -----
  mapfile -t unique_roots < <(discover_take_roots "$RULE_JSON_FILE")
  if (( ${#unique_roots[@]} == 0 )); then
    log_error "未能从集群中发现任何 take_root请检查 ceph 权限或集群状态"
    exit 7
  fi
  log "自动发现 ${#unique_roots[@]} 个唯一 take_root: ${unique_roots[*]}"

  # ----- Start the metrics staging file -----
  cat >"$TEXTFILE_TMP" <<'PROMEOF'
# HELP ceph_crush_root_osd_max_usage_percent Maximum OSD usage percentage under CRUSH take_root
# TYPE ceph_crush_root_osd_max_usage_percent gauge
PROMEOF

  # ----- Process each unique take root -----
  for root_name in "${unique_roots[@]}"; do
    log "处理 take_root: $root_name"
    # The pipeline is the `if` condition on purpose: under `set -e` with
    # pipefail, a separate `$?` check afterwards would never run on failure
    # because the script would already have exited.
    if report_take_root "$root_name" "$TREE_JSON_FILE" "$DF_JSON_FILE" | tee_report; then
      log "take_root $root_name 处理完成"
    else
      log_error "take_root $root_name 处理失败"
    fi
    printf '\n' | tee_report
  done

  # ----- Publish metrics with a rename so node_exporter never reads a
  # partially written file -----
  mv -f -- "$TEXTFILE_TMP" "$TEXTFILE_OUT"
  log "textfile collector 已更新: $TEXTFILE_OUT"

  # ----- Prune old reports (keep the most recent 30 days); best effort -----
  find "$REPORT_DIR" -name 'crush_usage_*.log' -mtime +30 -delete 2>/dev/null || true

  log "所有 take_root 处理完成"
  log "报告已保存至: $REPORT_FILE"
}

main "$@"

# ----------------------------------------------------------------------------
# Sample run on a Ceph host (recorded output, kept for reference only):
#
# root@gm1-pub-ceph-172-16-3-107:/tmp# bash /home/monitor/node_exporter_9101/collectors/crushrule_usage_report_nojq_v5_metrics.sh
#
# ======================================================================
# Ceph CRUSH Root OSD 使用率报告
# 生成时间: 2026-04-24 15:15:09
# ======================================================================
#
# [2026-04-24 15:15:09] 开始获取 Ceph 集群数据...
# [2026-04-24 15:15:10] Ceph 数据获取完成
# [2026-04-24 15:15:10] 自动发现 2 个唯一 take_root: default ssd
# [2026-04-24 15:15:10] 处理 take_root: default
#
# ======================================================================
# Take Root : default
# OSD Count : 12
# ======================================================================
#
# OSD_ID   OSD_NAME            CAPACITY_KB              USED_KB      %USED
# ----------------------------------------------------------------------
# 0        osd.0             4,140,752,888        1,702,653,156     41.12%
# 1        osd.1             4,140,752,888        1,614,773,804     39.00%
# 2        osd.2             4,140,752,888        1,654,648,876     39.96%
# 3        osd.3             4,140,752,888        1,837,897,260     44.39%
# 4        osd.4             3,907,014,656        1,340,719,904     34.32%
# 5        osd.5             4,141,436,920        1,749,448,584     42.24%
# 6        osd.6             4,141,436,920        1,657,205,256     40.02%
# 7        osd.7             4,141,436,920        1,836,856,668     44.35%
# 8        osd.8             4,141,436,920        1,792,965,288     43.29%
# 9        osd.9             4,141,436,920        1,522,847,576     36.77%
# 10       osd.10            4,141,436,920        1,793,701,216     43.31%
# 11       osd.11            4,141,436,920        1,701,000,068     41.07%
#
# ======================================================================
# 统计摘要:
#  总容量 : 49,460,084,648 KB (47168.81 GB)
#  已使用 : 20,204,717,656 KB (19268.72 GB)
#  总使用率 : 40.85%
#  平均使用率 : 40.82%
#  最大使用率 : 44.39% (OSD 3 / osd.3)
#  最小使用率 : 34.32% (OSD 4 / osd.4)
# ======================================================================
#
# ✓ 指标已写入 textfile collector: /home/monitor/node_exporter_9101/textfile_collector/.ceph_crush_root_usage.prom.tmp
# [2026-04-24 15:15:10] take_root default 处理完成
#
# [2026-04-24 15:15:10] 处理 take_root: ssd
#
# ======================================================================
# Take Root : ssd
# OSD Count : 3
# ======================================================================
#
# OSD_ID   OSD_NAME            CAPACITY_KB              USED_KB      %USED
# ----------------------------------------------------------------------
# 12       osd.12              937,160,704            3,288,832      0.35%
# 13       osd.13              937,689,088            3,874,516      0.41%
# 14       osd.14              937,689,088            4,192,704      0.45%
#
# ======================================================================
# 统计摘要:
#  总容量 : 2,812,538,880 KB (2682.25 GB)
#  已使用 : 11,356,052 KB (10.83 GB)
#  总使用率 : 0.40%
#  平均使用率 : 0.40%
#  最大使用率 : 0.45% (OSD 14 / osd.14)
#  最小使用率 : 0.35% (OSD 12 / osd.12)
# ======================================================================
#
# ✓ 指标已写入 textfile collector: /home/monitor/node_exporter_9101/textfile_collector/.ceph_crush_root_usage.prom.tmp
# [2026-04-24 15:15:10] take_root ssd 处理完成
#
# [2026-04-24 15:15:10] textfile collector 已更新: /home/monitor/node_exporter_9101/textfile_collector/ceph_crush_root_usage.prom
# [2026-04-24 15:15:10] 所有 take_root 处理完成
# [2026-04-24 15:15:10] 报告已保存至: /home/monitor/node_exporter_9101/reports/crush_usage_20260424.log
# root@gm1-pub-ceph-172-16-3-107:/tmp#
# ----------------------------------------------------------------------------