#!/bin/bash

# Ops (operations & maintenance) diagnostics helper library.
# Pulls in the shared logging helpers (log_info / log_warn / log_error).
. vs_logger.sh

# Maximum command-line length shown when echoing executed commands
g_cmd_max_length=200

# HCI version, e.g. "6.11.0_xxx" -> "6110"
g_hci_ver=$(head -n1 /sf/version 2>/dev/null | sed 's/_.*//' | awk -F. '{print $1$2$3}')

# aSAN version, e.g. "3.0.x" -> "30"
# NOTE(review): unlike g_hci_ver, stderr is not suppressed here — confirm
# /sf/vs/version always exists on the target platform.
g_asan_ver=$(head -n1 /sf/vs/version  | awk -F.  '{print $1$2}')

# Hosts on which cluster commands are executed
g_hosts=""

# Volume master hosts
g_master_hosts=""

# Number of executed test cases
g_ntest=0

# Log-search time window (day / time / date)
g_day="today"
g_time=""
g_date=""

# All command-line arguments as passed in
g_all_cmd_args=""

# Sub-command
g_subcommand=""

# Command execution type
g_type=""

# Fixed virtual-machine ID to search for
g_vmid=""

# Wildcard suffix used when searching log files
g_suffix=""

# Number of log lines to display; defaults to 10 to keep output readable,
# capped at 1000
g_lines=10

# Log filter expression
g_log_filter=""

# Accumulated error lines, echoed once at the end as a summary
g_all_error_echo=""

# Accumulated timeout lines, echoed once at the end as a summary
g_all_timeout_echo=""

# Repeat count for sampling commands (iostat, mpstat, network drop checks, ...)
g_cnt=3

# Max duration in seconds for a single check command; runs longer than
# g_time_limit are killed. Defaults to 20s, overridable with -g.
g_time_limit=20

# Error codes
ENOENT=2 # file or directory does not exist
EINVAL=22 # invalid argument



# HCI storage-network configuration
g_hci_network_path="/cfs/vs/gfs_networking_mode.json"
# HCI cluster member information
g_hci_members="/cfs/.members"
# HCI storage-network packet-loss check directory
g_hci_ping_check="/cfs/vs/net_check/ping_check"
# Arbiter-host marker file
g_hci_arbiter_flag="/sf/cfg/vs/arbiter.flag"
# Arbiter-box marker file
g_hci_box_flag="/boot/arbiter_box.flag"


# jq lives at different paths on vs3.x and vs2.x
g_jq_path="$(which jq)"

# Log file paths / names
g_uwsgi="uwsgi/"
g_vs_scritps_log_dir="vs/scripts/"
g_volume_run_mgr_path="volume_mgr_run_task.log"
g_data_format_path="vs_diskpart.sh.log"
g_cache_format_path="vs_cache_format.sh.log"

#===================================================[start common]=====================================================#
##------------ utility functions -----------##
function p_info()
{
    # Plain (white) console message; note only the first argument is logged.
    local text="$*"
    echo -e "\033[37m${text}\033[0m"
    log_info "$1"
}

function p_trace()
{
    # Green section header preceded by a blank line; also logged.
    local text="$1"
    echo -e "\n\033[32m${text}\033[0m"
    log_info "$text"
}

function p_warn()
{
    # Yellow console warning; also logged at warn level.
    local text="$1"
    echo -e "\033[33m${text}\033[0m"
    log_warn "$text"
}

function p_timeout()
{
    # Yellow console warning that is also queued (with a literal "\n"
    # separator) into the final timeout summary g_all_timeout_echo.
    echo -e "\033[33m$1\033[0m"
    log_warn "$1"
    g_all_timeout_echo+='\n'"$1"
}

function p_error()
{
    # Red console error; also logged and queued (with a literal "\n"
    # separator) into the final error summary g_all_error_echo.
    echo -e "\033[31m$*\033[0m\n"
    # Bugfix: quote the argument so spaces/globs are not word-split by the
    # logger (previously `log_error $1`).
    log_error "$1"
    g_all_error_echo=$g_all_error_echo'\n'$1
}

# Run ${fstest} with the remaining args and compare its last output line
# against an expected pattern; prints TAP-style ok/not-ok with call-site info.
# @param $1: caller function name   @param $2: caller line number
# @param $3: expected value (extended regex, anchored as ^...$)
# @param $4..: arguments passed to ${fstest}
# Side effects: increments g_ntest; sets globals e and r (as the original did).
function expect()
{
    local filename="${0}"
    local funcname="${1}"
    local lineno="${2}"
    shift 2

    e="${1}"
    shift 1
    # $* is intentionally unquoted so the fstest arguments word-split.
    # Modernized: $(...) instead of backticks, grep -E instead of deprecated
    # egrep, arithmetic expansion instead of expr.
    r=$(${fstest} $* 2>/dev/null | tail -1)
    if echo "${r}" | grep -E '^'${e}'$' >/dev/null 2>&1; then
        p_info "ok ${g_ntest} [$filename:$funcname:$lineno]"
    else
        p_error "not ok ${g_ntest} [$filename:$funcname:$lineno] cmd=\"$*\" err=\"$r\""
    fi
    g_ntest=$((g_ntest + 1))
}

# TAP-style check: evaluates the remaining arguments as a bare `test`
# expression, e.g.  test_check "$FUNCNAME" "$LINENO" "$a" -eq "$b".
# NOTE: $* is intentionally left unquoted inside [ ] so the caller's operands
# and operators word-split into a test expression; do not quote it.
function test_check()
{
    local filename="${0}"
    local funcname="${1}"
    local lineno="${2}"
    shift 2

    if [ $* ]; then
        echo "ok ${g_ntest} [$filename:$funcname:$lineno]"
    else
        echo "not ok ${g_ntest} [$filename:$funcname:$lineno]"
    fi
    g_ntest=`expr $g_ntest + 1`
}

function test_equal()
{
    # TAP-style assertion that two strings are equal.
    # @param $1: caller name  @param $2: caller line  @param $3/$4: values
    local filename="${0}"
    local funcname="${1}"
    local lineno="${2}"
    shift 2

    value1="$1"
    value2="$2"

    local tag="[$filename:$funcname:$lineno]"
    if [ "$value1" == "$value2" ]; then
        echo "ok ${g_ntest} $tag"
    else
        echo "not ok ${g_ntest} $tag"
    fi
    g_ntest=$((g_ntest + 1))
}

function test_cmd_check()
{
    # TAP-style assertion that a command's result matches the expected value.
    # @param $1: caller name  @param $2: caller line
    # @param $3: expected value  @param $4: actual value
    local filename="${0}"
    local funcname="${1}"
    local lineno="${2}"
    shift 2

    expect="$1"
    ret="$2"

    local tag="[$filename:$funcname:$lineno]"
    if [ "$ret" == "$expect" ]; then
        echo "ok ${g_ntest} $tag"
    else
        echo "not ok ${g_ntest} $tag"
    fi
    g_ntest=$((g_ntest + 1))
}

# Run a command on every cluster host over SSH and analyze the result.
# @param $1: command line to execute remotely
# @param $2: optional SSH port
# Returns 0 when no host produced output, 1 when some host produced output
# (any output is treated as a finding), 2 when the command timed out somewhere.
function run_cmd_analyze()
{
    local ret=0
    local result=""
    local cmd="$1"
    local port="$2"

    if [ -z "$cmd" ]; then
       log_info "run cmd is null."
       return 1
    fi

    log_info "run host: $g_hosts cmd: $cmd"
    for H in $g_hosts
    do
        # Skip hosts that are not present in /etc/hosts (removed/unknown).
        grep -q ${H} /etc/hosts
        if [ $? -ne 0 ];then
            continue
        fi
        # NOTE(review): "timeout -t SECS" is BusyBox-style; GNU timeout would
        # reject -t — confirm the platform ships the BusyBox applet.
        if [ -n "$port" ]; then
            result=$(timeout -t ${g_time_limit} -s KILL /usr/bin/ssh -p "$port" root@${H} "$cmd")
        else
            result=$(timeout -t ${g_time_limit} -s KILL /usr/bin/ssh root@${H} "$cmd")
        fi
        local tmp=$?
        #[ $tmp -eq 0 ] || ret=$tmp

        # Check the timeout exit status (137 = 128 + SIGKILL).
        if [ $tmp -eq 137 ]; then
            log_warn "command $@ timed out on host: ${H}"
            p_timeout "所在主机 host: ${H}"
            ret=2
        elif [ ! -z "$result" ]; then
            # Any output is a finding; very long command lines are truncated
            # before being echoed to the console.
            if [ ${#cmd} -gt $g_cmd_max_length ]; then
                local truncated_cmd="${cmd:0:${g_cmd_max_length}}...[已省略]"
                p_info "host: ${H} cmd: $truncated_cmd\n$result\n"
            else
                p_info "host: ${H} cmd: $cmd\n$result\n"
            fi
            ret=1
        fi

        log_info "run cmd: ${H}: $cmd result: $result ret: $tmp"
    done

    return $ret
}

# Run a command on every cluster host over SSH, printing each host's output.
# Unlike run_cmd_analyze the output is informational only; no result analysis.
# @param $@: command line to execute remotely
function run_cmd()
{
    local result=""
    local cmd="$1"

    if [ -z "$cmd" ]; then
       log_info "run cmd is null."
       # Bugfix: bail out instead of ssh-ing an empty command
       # (consistent with run_cmd_analyze).
       return 1
    fi

    log_info "run host: $g_hosts cmd: $@"
    for H in $g_hosts
    do
        # Skip hosts that are not present in /etc/hosts.
        grep -q ${H} /etc/hosts
        if [ $? -ne 0 ];then
            continue
        fi

        result=$(timeout -t ${g_time_limit} -s KILL /usr/bin/ssh root@${H} "$@" 2>/dev/null)
        # Bugfix: capture the exit status; $tmp was previously logged unset.
        local tmp=$?
        local cmd_all="$@"
        if [ ${#cmd_all} -gt $g_cmd_max_length ]; then
            local truncated_cmd="${cmd_all:0:${g_cmd_max_length}}...[已省略]"
            p_info "host: ${H} cmd: $truncated_cmd\n$result\n"
        else
            p_info "host: ${H} cmd: $cmd_all\n$result\n"
        fi
        log_info "run cmd: ${H}: $@ result: $result ret: $tmp"
    done
}

# Returns 0 when this install is the EDS edition (version file matches the
# EDS marker copy), 1 otherwise.
function vs_is_eds()
{
    if ! diff /sf/etc/version /sf/etc/version.eds > /dev/null 2>&1; then
        return 1
    fi
    return 0
}

# Test-case execution statistics
g_case_success=0
g_case_failed=0
g_case_timeout=0

# Progress-bar state: spinner phase counter and accumulated "*" bar
g_bar_idx=0
g_bar_str=""
function ops_report_bar()
{
    # Render one frame of the in-place progress line (the trailing \r keeps
    # it on a single row): counters plus a spinner and a growing star bar.
    arry=("|" "/" "-" "\\")
    local frame=$((g_bar_idx % 4))
    printf "running success:$g_case_success failed:$g_case_failed timeout:$g_case_timeout %c%-20s%c\r" "${arry[$frame]}" "$g_bar_str" "${arry[$frame]}"
    g_bar_idx=$((g_bar_idx + 5))
    g_bar_str+="*"
}

function testcase_success_stats()
{
    # Count one passing case and refresh the progress bar.
    g_case_success=$((g_case_success + 1))
    ops_report_bar
}

function testcase_failed_stats()
{
    # Report a failed case: show the business impact ($1) and the suggested
    # fix ($2), then bump the failure counter.
    ops_report_bar
    p_error "检索到异常，业务影响: $1"
    p_error "建议解决方案: \n$2\n"
    g_case_failed=$((g_case_failed + 1))
}

function testcase_timeout_stats()
{
    # Count one timed-out case; long command lines are truncated to
    # g_cmd_max_length characters before being displayed.
    local shown="$1"
    g_case_timeout=$((g_case_timeout + 1))
    if [ ${#shown} -gt $g_cmd_max_length ]; then
        shown="${shown:0:${g_cmd_max_length}}...[已省略]"
    fi
    p_timeout "超时命令：$shown\n"
}

# Run a check command across the cluster; any output is treated as an error.
# @param $1: command to execute
# @param $2: business impact description
# @param $3: suggested solution
# Returns 0 on a clean run, 1 on failure/timeout/bad arguments.
function check_and_solution()
{
    local cmd="$1"
    local inf="$2"
    local solution="$3"

    if [[ -z $cmd || -z $inf || -z $solution ]]; then
        p_error "案例错误，需要提供业务影响以及解决方案！"
        return 1
    fi

    run_cmd_analyze "$cmd"
    ret=$?
    case $ret in
        1)
            testcase_failed_stats "$inf" "$solution"
            return 1
            ;;
        2)
            testcase_timeout_stats "$cmd"
            return 1
            ;;
    esac
    testcase_success_stats
    return 0
}

function check_rchecksum_efs_read_slice_logs() {
    # Correlate "efs_read_slice ... Input/output error" brick-log lines with
    # "tier rchecksum failed" lines occurring within 60 seconds afterwards;
    # a match indicates a bad sector on the reported brick VG.
    declare -A read_slice_logs
    local check_timestamp

    if [ -z "$g_day" ]; then
        g_day="today"
    fi

    local log_dir="/sf/log/${g_day}/vs/log/glusterfs/bricks"

    echo "检查efs_read_slice日志"

    # Collect efs_read_slice I/O-error lines, keyed by their timestamp.
    while IFS= read -r line; do
        # Bugfix: use grep -q (as the loop below does) so the raw matched
        # line is no longer echoed to stdout as a side effect.
        if echo "$line" | grep -q "Input/output error"; then
            check_timestamp=$(echo "$line" | cut -d ":" -f 3-5 | sed -E "s/.*\[(.*)\].*/\1/")  # extract timestamp
            read_slice_logs["$check_timestamp"]="$line"  # keyed by timestamp
        fi
    done < <(grep -rn "efs_read_slice" "$log_dir/")

    # Look for "tier rchecksum failed" entries within one minute of a read error.
    while IFS= read -r line; do
        if echo "$line" | grep -q "tier rchecksum failed"; then
            timestamp=$(echo "$line" | cut -d ":" -f 3-5 | sed -E "s/.*\[(.*)\].*/\1/")  # extract timestamp
            # Convert to whole seconds for the window comparison.
            timestamp_seconds=$(date -d "$timestamp" +%s)
            for read_ts in "${!read_slice_logs[@]}"; do
                read_ts_seconds=$(date -d "$read_ts" +%s)
                # Flag pairs that fall inside a 60-second window.
                if (( timestamp_seconds >= read_ts_seconds )) && \
                   (( timestamp_seconds < (read_ts_seconds + 60) )); then
                    brick_vg=$(echo "(${read_slice_logs[$read_ts]})" | cut -d "." -f 1 | cut -d "-" -f 5-11 | head -n 1)
                    echo -e "ERROR：校验有坏道!!! brick_vg: $brick_vg"
                fi
            done
        fi
    done < <(grep -rn "tier rchecksum failed" "$log_dir/")

    # NOTE(review): this tests read_slice_logs being empty while the message
    # talks about rchecksum logs — confirm the intended condition.
    if [[ -z "${read_slice_logs[*]}" ]]; then
        echo "环境没有匹配到tier rchecksum错误日志。"
    fi
}

function check_efs_preadv_eio() {
    # Detect data-area bad sectors: collect the unique eio sizes from
    # "ioengine_co_pwrite" error lines, then look for matching
    # "st_file_write_back" lines and report the gfid and brick VG.
    declare -A co_pwrite_preadv_logs
    declare -A st_file_write_back_logs

    if [ -z "$g_day" ]; then
        g_day="today"
    fi

    local log_dir="/sf/log/${g_day}/vs/log/glusterfs/bricks"

    echo "2.检查efs_pwrite_pread数据区非对齐读"

    # Collect error lines and extract their eio_size field.
    while IFS= read -r line; do
        # extract eio_size (field after the second '=' inside the parens)
        eio_size=$(echo "$line" | grep " E " | grep "ioengine_co_pwrite" | cut -d ":" -f 8 | cut -d "=" -f 3 | cut -d ")" -f 1)

        if [[ -n "$eio_size" ]]; then
            echo "Extracted eio_size: $eio_size"  # Debug output
            # If the eio_size is unique, record the line
            if [[ -z "${co_pwrite_preadv_logs[$eio_size]}" ]]; then
                co_pwrite_preadv_logs["$eio_size"]="$line"
            fi
        fi
    done < <(grep -rn "ioengine_co_pwrite" "$log_dir/")

    # For each unique eio_size, look for matching write-back log entries.
    for eio_size in "${!co_pwrite_preadv_logs[@]}"; do
        echo "Searching for eio_size: $eio_size in st_file_write_back logs"  # Debug output
        efs_logs=$(grep -rn "$eio_size" "$log_dir/" | grep "st_file_write_back")

        if [[ -n "$efs_logs" ]]; then
            st_file_write_back_logs["$eio_size"]="$efs_logs"
        fi
    done

    # Report findings.
    if [[ -n "${st_file_write_back_logs[*]}" ]]; then
        echo "**********匹配到的st_file_write_back_logs日志*********"
        for eio_size in "${!st_file_write_back_logs[@]}"; do
            gfid=$(echo "${co_pwrite_preadv_logs[$eio_size]}" | cut -d ":" -f 7 | cut -d "(" -f 1)
            brick_vg=$(echo "(${st_file_write_back_logs[$eio_size]})" | cut -d "." -f 1 | cut -d "-" -f 5-11 | head -n 1)
            echo -e "ERROR：有坏道!!! gfid: $gfid; brick_vg: $brick_vg"
        done
    else
        echo "环境没有st_file_write_back的日志。"
    fi
}

function check_efs_inode_extent_logs() {
    # Detect metadata-area bad sectors: collect gfids from
    # "ext_page_get_nostd ... Input/output error" lines, then report any
    # matching "efs_inode_free_extent" lines with their brick VG.
    declare -A ext_page_get_nostd_logs
    declare -A efs_inode_free_extent_logs

    if [ -z "$g_day" ]; then
        g_day="today"
    fi

    local log_dir="/sf/log/${g_day}/vs/log/glusterfs/bricks"

    echo "1.检查efs_inode_extent元数据坏道"

    # Collect error lines, keyed by their (unique) gfid.
    while IFS= read -r line; do
        gfid=$(echo "$line" | cut -d ":" -f 7 | cut -d "(" -f 1)
        if [[ -n "$gfid" && -z "${ext_page_get_nostd_logs[$gfid]}" ]]; then
            ext_page_get_nostd_logs["$gfid"]="$line"
        fi
    done < <(grep -nr "ext_page_get_nostd.*Input/output error" "$log_dir"/*)

    # For each unique gfid, look for matching efs_inode_free_extent entries.
    for gfid in "${!ext_page_get_nostd_logs[@]}"; do
        # Fix: the filter was duplicated (grep X | grep "X"); one grep suffices.
        efs_logs=$(grep -nr "$gfid" "$log_dir"/* | grep "efs_inode_free_extent")
        if [[ -n "$efs_logs" ]]; then
            efs_inode_free_extent_logs["$gfid"]="$efs_logs"
        fi
    done

    # Report findings.
    if [[ -n "${efs_inode_free_extent_logs[*]}" ]]; then
        echo "**********匹配到的efs_inode_free_extent日志*********"
        for gfid in "${!efs_inode_free_extent_logs[@]}"; do
            brick_vg=$(echo "${efs_inode_free_extent_logs[$gfid]}" | cut -d "." -f 1 | cut -d "-" -f 5-11 | head -n 1)
            echo "ERROR：有坏道!!! Gfid: $gfid; brick_vg: $brick_vg"
        done
    else
        echo "环境没有efs_inode_free_extent的日志。"
    fi
}

function remove_volume_bad_tfile() {
   # Build /sf/data/local/remove_tfile.sh with one vsts-clear-tfile.sh call
   # per broken (split-brain) T-file under the volume mount point. The find
   # error stream is parsed on purpose: failing files show up in error lines
   # with their path quoted in backticks, and those paths are what is recorded.
   if [ $g_asan_ver -ge 30 ]; then
    p_error "请在VS2.X的版本上面执行"
    return
   fi
   p_info "开始将脑裂的文件的T文件记录到/sf/data/local/remove_tfile.sh脚本中"
   echo '' > /sf/data/local/remove_tfile.sh
   chmod +x /sf/data/local/remove_tfile.sh

   volume_id="$g_volume_id"
   volume_path="/sf/data/vs/gfs/$volume_id"
   p_info "要删除的volume_id:$volume_id,挂载点$volume_path:的T文件"
   if [ ! -d "$volume_path" ]; then
    p_error "请检查是否在VS物理卷所在的节点上执行"
    # NOTE(review): execution deliberately continues here? find below will
    # only emit an error line for the missing path — confirm whether a
    # return is wanted.
   fi
   find /sf/data/vs/gfs/$volume_id -type f 2>&1| while IFS= read -r file; do
     err_file="$(echo $file |awk -F '`' '{print $2}'|awk -F ': ' '{print $1}')";
      if [ "$err_file" ];then
        # Bugfix: close the single quote around the path so each generated
        # script line is syntactically valid.
        echo "/sf/data/local/vsts-clear-tfile.sh '$err_file'" >>/sf/data/local/remove_tfile.sh
      fi;
   done
   p_info "请将vsts-clear-tfile.sh放到/sf/data/local/下，检查/sf/data/local/remove_tfile.sh内容无误后，执行删除T文件即可"
}

function check_tgtd_conn()
{
   # Dump all tgtd target/connection info (VS 3.x only). When g_output_file
   # is set, the output is written below /sf/log/today/vs/ instead of stdout.
   p_info "开始输出tgtd的所有连接信息..."
   if [ $g_asan_ver -lt 30 ]; then
       p_error "请在VS3.X的版本上面执行"
       return
   fi

   local show_cmd="/sf/vs/sbin/tgtadm -m target -o show"
   if [[ -z $g_output_file ]]; then
       vs_cluster_cmd.sh e "$show_cmd"
   else
       vs_cluster_cmd.sh e "$show_cmd" > /sf/log/today/vs/$g_output_file
   fi
   return
}

# Run one of the local bad-sector log-check functions on a remote host by
# shipping its definition over SSH, then surface any bad-sector markers.
# @param $1: target host  @param $2: name of the check function to run
function check_bad_sector_log() {
    local host="$1"
    local log_function="$2"
    local result

    # Serialize the named function and execute it remotely.
    ssh_command="bash -c '$(declare -f $log_function); $log_function'"
    result=$(timeout -t 4 -s KILL /usr/bin/ssh root@"${host}" "$ssh_command" 2>&1)

    # Check the SSH/timeout exit status.
    if [ $? -ne 0 ]; then
        p_error "在主机 ${host} 上执行${log_function}检查失败: $result"
        g_all_error_echo+=$'\n'"$result"
    else
        # Bugfix: match on the common "有坏道!!!" marker so both
        # "ERROR：有坏道!!!" and "ERROR：校验有坏道!!!" (emitted by the
        # rchecksum check) are captured; the old exact pattern missed the latter.
        error_lines=$(echo "$result" | grep "有坏道!!!")
        if [ -n "$error_lines" ]; then
            p_error "$error_lines"
            g_all_error_echo+=$'\n'"$error_lines"
        else
            echo "环境没有匹配到$log_function的日志。"
        fi
    fi
}

function check_bad_sector() {
    # Look for disk bad-sector evidence: kernel medium/I-O errors first, then
    # (for 3.0+ clusters with more than two hosts) the brick-log correlation
    # checks executed on every cluster host.
    echo "开始检查是否有坏道功能"

    local solution="内核存在磁盘介质错误(坏道/其他硬件错误)"
    check_and_solution 'zgrep -E "blk_update_request: critical medium error|blk_update_request: I/O error" /sf/log/'$g_day'/kernel.log'$g_suffix' | grep -vE "nbd" | '"$g_log_filter"' ' '同步不完成/业务挂起' $solution

    # Skip on two-host clusters and on versions below 3.0.
    vs_is_two_hosts
    if [[ $? -eq 0 || $g_asan_ver -lt 30 ]]; then
        return 0
    fi

    # Cluster host list: host-* entries minus the mgr address.
    cluster_hosts=$(cat /etc/hosts | grep host- | grep -v mgr | awk '{print $2}')
    for h in ${cluster_hosts}; do
        echo "在主机 ${h} 上检查日志..."

        # efs_inode_extent metadata bad sectors
        check_bad_sector_log "${h}" "check_efs_inode_extent_logs"

        # efs preadv EIO (unaligned data-area reads)
        check_bad_sector_log "${h}" "check_efs_preadv_eio"

        # tier rchecksum brick errors
        check_bad_sector_log "${h}" "check_rchecksum_efs_read_slice_logs"
    done
}

# Run a check command across the cluster on a given SSH port; any output is
# treated as an error.
# @param $1: command to execute
# @param $2: business impact description
# @param $3: suggested solution
# @param $4: SSH port number (1-65535)
function check_and_solution_with_port()
{
    local cmd="$1"
    local inf="$2"
    local solution="$3"
    local port="$4"
    local ret=""

    if [[ -z "$cmd" || -z "$inf" || -z "$solution" || -z "$port" ]]; then
        p_error "案例错误，需要提供业务影响以及解决方案！"
        return 1
    fi

    # Bugfix: reject the port when it is non-numeric OR out of range. The old
    # condition (!numeric && in-range) could never flag an invalid port: a
    # non-numeric value broke the -ge test and an out-of-range number passed.
    if ! [[ "$port" =~ ^[0-9]+$ ]] || [ "$port" -lt 1 ] || [ "$port" -gt 65535 ]; then
        p_error "端口号不合理，必须是数字, 且必须在 1 到 65535 之间"
        return 1
    fi

    run_cmd_analyze "$cmd" "$port"
    ret=$?

    if [ $ret -eq 1 ]; then
         testcase_failed_stats "$inf" "$solution" 
         return 1
    elif [ $ret -eq 2 ]; then
        testcase_timeout_stats "$cmd"
        return 1
    fi
    testcase_success_stats
    return 0
}

# Run a command cluster-wide without judging the result; optionally print the
# result items worth paying attention to.
# @param $1: command  @param $2: advice text (optional)
function check_and_advice()
{
    local cmd="$1"
    local advice="$2"

    if [[ -z $cmd ]]; then
        p_error "案例错误，需要提供建议方向！"
        return 1
    fi

    # $cmd is intentionally unquoted so the command line word-splits.
    run_cmd $cmd
    if [ -n "$advice" ]; then
        p_info "建议关注结果项: $2\n"
    fi
    testcase_success_stats
}

function realethtool()
{
    # Prefer the vendor-preserved /sbin/realethtool binary when present;
    # otherwise fall back to the regular /sbin/ethtool.
    # $@ is intentionally unquoted, matching the callers' usage.
    local tool=/sbin/ethtool
    if [ -f /sbin/realethtool ]; then
        tool=/sbin/realethtool
    fi
    $tool $@
}

function vs_is_two_hosts()
{
    # Return 0 for a two-replica (two-host) cluster, 1 for any other replica
    # count, 2 when the replica count cannot be determined.
    local replica_count
    replica_count=$(grep  "replica_count" /sf/cfg/vs/glusterfs/glusterd/vols/*/info 2>/dev/null| cut -d= -f2)
    [[ -z "$replica_count" ]] && return 2
    [[ $replica_count -eq 2 ]] && return 0
    return 1
}

function get_replica_count()
{
    # Echo the replica_count read from the gluster volume info files:
    # -1 when the read pipeline fails, 0 when the value is empty, otherwise
    # the count itself.
    # NOTE(review): $? reflects the last pipeline stage (cut), so the -1
    # branch is effectively unreachable when grep alone fails — confirm
    # whether PIPESTATUS was intended.
    local replica_count
    replica_count=$(grep "replica_count" /sf/cfg/vs/glusterfs/glusterd/vols/*/info 2>/dev/null| cut -d= -f2)
    if [ $? -ne 0 ]; then
        log_error "获取主机数失败"
        echo -1
        return
    fi
    if [ -z "$replica_count" ]; then
        log_error "replica_count 为空"
        echo 0
        return
    fi
    echo $replica_count
}

function is_aarch64_platform() 
{
    # Return 0 on aarch64 CPUs, 1 otherwise (or when uname itself fails).
    local platform
    platform=$(/bin/uname -m)
    if [ $? -ne 0 ]; then
        log_error "获取 CPU 架构信息失败，请检查命令是否正确。"
        return 1
    fi
    [ "$platform" = "aarch64" ]
}


#===================================================[end common]=======================================================#

#==================================================[start check]=======================================================#
function show_host_info()
{
    # Print "ip hostname" for every cluster host found in /etc/hosts
    # (localhost plus host-* entries, excluding host-mgr*), substituting this
    # machine's hostname for the localhost entry, sorted.
    local name
    name=$(hostname)

    # Select qualifying rows, dropping the row whose name equals ours.
    local rows
    rows=$(awk -v name="$name" '
        $2 == "localhost" || ($2 ~ /^host-/ && $2 !~ /^host-mgr/) {
            if ($2 != name) {
                printf "%s %s\n", $1, $2
            }
        }
    ' /etc/hosts)

    # Rewrite the localhost row to carry our hostname, then sort.
    rows=$(echo "$rows" | awk -v name="$name" '$2 == "localhost" { $2 = name } {print}' | sort)

    echo "$rows"
}

# Execute a command outside the chroot by POSTing it to the local
# hosts/cmds HTTP endpoint (port 7102); prints the command output on success.
# Returns the embedded Python script's exit status (0 ok, 1 failure).
function chroot_outside() 
{
    local outside_cmd="$1"

    # The heredoc delimiter is intentionally unquoted so $outside_cmd and
    # $g_time_limit are interpolated into the generated Python source.
    PYTHON_SCRIPT=$(cat <<EOF
import requests
import sys

cmd = '''{}'''.format('''$outside_cmd''')
url = "http://127.0.0.1:7102/hosts/cmds"

try:
    rsp_url = requests.post(url, json={"cmds": [cmd]}, timeout=$g_time_limit)
    rsp_url.raise_for_status()
    rsp = rsp_url.json()
    # 路径不存在时，不会执行输出
    if rsp["data"] is not None:
        ret = rsp["data"][0].strip()
        print(ret)
    sys.exit(0)  # 成功执行返回0
except Exception as e:
    print(e)
    sys.exit(1)  # 出现异常返回1
EOF
)
    # Run the generated Python script
    python -c "$PYTHON_SCRIPT"
    return $?  # propagate the Python script's exit status
}

# 1. Collect environment information: versions, appliance flag, upgrade
# history (incl. patches), cluster hosts and volume layout.
function get_env()
{
    p_trace "环境信息版本信息: "
    cat /sf/version 2>/dev/null; cat /sf/vs/version 
    
    # vs_sn -f succeeds only on appliance (all-in-one) hardware.
    vs_sn -f &>/dev/null; if [ $? -eq 0 ]; then echo -e "\n该环境为一体机"; else echo -e "\n该环境为非一体机"; fi
   
    p_trace "升级记录（包含补丁）: "
    if [ -e '/boot/firmware/history' ]; then
        tail -n 5 /boot/firmware/history 2>/dev/null
    else
        # No firmware history file: read version history from outside the chroot.
        chroot_outside 'cat /sf/histversion | grep -v "rpm info" | sed "/\.rpm$/d"'
    fi
    
    p_trace "主机信息: "
    # cat /etc/hosts | grep host- | grep -v host-mgr 
    show_host_info
    
    p_trace "卷信息: "
    #if [ $ver -lt 68 ]; then /sf/vs/bin/vsmgr volume hosts ; else container_exec -n vs-cp-m -c "/sf/vs/bin/vsmgr volume hosts" ; fi ; dmidecode -t2
    if [ ! -f /sf/vs/bin/vsmgr ]; then
        gluster peer status 
    else 
        /sf/vs/bin/vsmgr volume hosts
    fi
} 

# Scan the blackbox CPU-occupancy log for samples where the given core's last
# column dropped below 10 — i.e. the interrupt core was (nearly) saturated.
# Prints up to the last $show_line matching lines.
# @param $1: CPU core index to inspect
# NOTE(review): assumes the last awk column of LOG_cpuocp.txt is the idle
# percentage — confirm against the blackbox log format.
function asan_check_eth_irq()
{
    local core_idx="$1"
    local show_line=10
    local cpu_log="/sf/log/blackbox/$g_date/LOG_cpuocp.txt"

    local core_stat=$(awk -v target_core="$core_idx" '
    /^\[.*\]\*\*\*/ {
        current_time_line = $0
        next
    }

    $2 != "CPU" {
        if ($2 == target_core && $NF + 0 < 10.00) {
            print current_time_line  # 打印对应时间行
            print $0                 # 打印对应数据行
        }
    }
' "$cpu_log")

    if [[ -n "$core_stat" ]];then
        echo -e "$core_stat" | tail -n $show_line
    fi
}

# Per-NIC health check: driver/firmware info, speed/duplex, error and drop
# counters, ring-buffer sizes; then the IRQ cores serving each port and a
# blackbox scan for saturated interrupt cores.
# @param $1: whitespace-separated list of interface names
function check_eth_port()
{
    local result=""
    local eth_irq_warn="0"
    local eth_port="$1"
    local full_irqs=""
    local local_irqs=""

    if [ -z "$1" ]; then
        p_error "网口信息为空"
        return 1
    fi
    p_trace "\n查看网卡信息: " 
    lspci -nn | grep -i eth

    for eth in `echo -e "$eth_port"`
    do
        p_trace "检测网口：$eth"
        p_trace "列出网卡信息，关注网卡固件版本FW, 网卡序号（是否存在跨网卡）:"
        realethtool -i $eth
       
        # ethtool –s ethX [speed 10|100|1000] [duplex half|full]  [autoneg on|off]
        # sets port speed 10/100/1000M, half/full duplex, autonegotiation on/off
        p_trace "\n检测网口速度Speed，Duplex，CRC之类的都没问题，基本可以排除物理层面的干扰. (Duplex 必须是full, speed根据网络部署决定，eg:万兆网络即10000MB/s)"
        realethtool $eth | egrep 'Speed|Duplex'
        p_trace "\n关注网卡是否存在错误包选项:"
        realethtool -S $eth | grep err | grep -vE ": 0$"

        p_trace "\nifconfig 关注是否存在丢包：关注dropped和overruns值"
        ifconfig $eth | grep -Ei "RX packets|TX packets" | awk '$1 = $1'
        # ifconfig shows whether the overruns counter keeps growing
        p_trace "\nifconfig 监控 $eth dropped和overruns值变化: $g_cnt 次"
        for i in `seq 1 $g_cnt`; do ifconfig $eth |awk '$1 = $1' | grep RX | grep overruns; sleep 1; done
        # ring buffer can be resized, e.g.: ethtool -G eth0 rx 2048
        p_trace "\n查看网卡缓存, 可根据最大支持的网卡缓存做适当调整后观测情况."
        realethtool -g $eth
    done
    
    p_trace "\n查看网络中断核CPU核值:（关注是否该核心跑满或是否是VN的dataplate线程），通过mpstat -P 核编号 1 查看该核CPU使用情况"
    irq_eth=$(echo $eth_port | tr -s ' ' '|') 
    local_irqs=$(cat /proc/interrupts | egrep $irq_eth | awk -F: '{print $1}' | xargs -i -t cat /proc/irq/{}/smp_affinity_list)
    p_info "\n网卡 $irq_eth 对应中断核为："
    echo -e "$local_irqs"

    p_trace "\n检查最新LOG_cpuocp日志是否存在中断核跑满的情况(仅显示最近5条)\n"
    check_irqs=$(cat /proc/interrupts | egrep ${irq_eth} | awk -F: '{print $1}' | xargs -i cat /proc/irq/{}/smp_affinity_list | uniq)
    for irq in `echo -e $check_irqs`;do
        p_trace "检查中断核: $irq 是否存在跑满的情况:\n"
        result=$(asan_check_eth_irq $irq)
        if [[ -n "$result" ]];then 
            echo "$result"
            eth_irq_warn="1"
            full_irqs+="$irq "
        fi
    done

    if [[ "$eth_irq_warn" == "1" ]];then
        testcase_failed_stats "\n中断核 $full_irqs 存在跑满的情况" "检查是否需要隔离中断核"
    fi
}


function check_eds_network()
{
     # EDS path: derive this host's private-network bond from
     # cluster_vs_ip.json, then check its slave ports and show the bond state
     # on every cluster host.
     bond_name=$(cat /sf/cfg/vs/cluster_vs_ip.json | /sf/vs/bin/jq -r '.hosts[]|select(.hostname=="'$(hostname)'")|.private_network.iface' | awk -F":" '{print $1}')
     if [ -z "$bond_name" ]; then 
         return 
     fi
 
     eths=$(cat /sys/class/net/"$bond_name"/bonding/slaves)  
     check_eth_port "$eths"
     vs_cluster_cmd.sh e 'ip a | grep "'$bond_name'"' 
}

# Verify the effective bonding mode on sf_vs_bond0 matches the expected mode
# ($1) and show the mode on every cluster host.
function check_network_mode()
{
    local mode_type="$1"
    local bond_mode_file="/sys/class/net/sf_vs_bond0/bonding/mode"

    inf="当前存储网络模式生效情况和配置不一致，主机扩容或断电重启后，链路聚合丢包50%，断掉一线链路无丢包"
    solution="\n参考案例 https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=172244423"
    # Any line NOT matching the expected mode word counts as a finding.
    check_and_solution "cat $bond_mode_file|grep -wv $mode_type" "$inf" "$solution"

    p_info "虚拟存储网络模式信息如下："
    vs_cluster_cmd.sh e "cat $bond_mode_file"
}

function check_network()
{
    # Storage-network deployment check, dispatched on the bonding mode read
    # from gfs_networking_mode.json; EDS clusters take their own path.
    if vs_is_eds; then
        check_eds_network
        return $? 
    fi

    network_type=$(cat /cfs/vs/gfs_networking_mode.json 2>/dev/null|/sf/vs/bin/jq .[].type|sort -u|tr -d '"| ')
    eth_port=$(cat /cfs/vs/gfs_networking_mode.json | /sf/vs/bin/jq -r '.[]|select(.host_name=="'$(hostname)'")|.nics[]' | tr -s "\n" " ")
    p_trace "检查物理网络部署模式，当前存储网络模式：$network_type, host: $(hostname), eth: $eth_port"
    if [ "$network_type" == "private_network" ]; then # NOTE(review): elsewhere private_network maps to "no bonding"; the original comment here said link aggregation — confirm
       res=$(echo "$eth_port" | grep "eth")
       if [ $? -eq 0 ]; then
            check_eth_port "$eth_port"
            p_trace "\n检查网口情况，关注网口是否是DOWN状态."
            ip a | grep "$eth_port"
       fi 
    elif [ "$network_type" == "bonding_one" ]; then
       p_info "单交换机链路聚合，若存在两台交换机，检视对端交换机需要做堆叠或者配置M-LAG，堆叠线是否是万兆堆叠线!"
       check_eth_port "$eth_port"
       for eth in `echo -e "$eth_port"`
       do
           realethtool --show-priv-flags $eth | grep disable-fw-lldp | grep "off$" 
           [ $? -eq 0 ] && p_error "网口: $eth lldp需要关闭状态，参考: https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=15089&search=494b67536a486856754c2d7036&highlight="
       done 
       vs_cluster_cmd.sh e "ip a | grep sf_vs_bond"
       check_network_mode sf-balance-rr
    elif [ "$network_type" == "bonding_two" ]; then
       p_info "双交换机链路聚合，检视对端交换机不需要做堆叠，两台傻瓜式交换机即可!"
       check_eth_port "$eth_port"
       vs_cluster_cmd.sh e "ip a | grep sf_vs_bond"
       check_network_mode balance-rr
    elif [ "$network_type" == "standard_network" ]; then
       p_info "标准链路聚合模式, 对端交换机需要按照链路聚合模式配置聚合模式!"
       eth_port=$(echo "$eth_port" | tr -d " ")
       channel_eth=$(cat /sys/class/net/$eth_port/bonding/slaves)
       check_eth_port "$channel_eth"
       for hname in $(cat /cfs/vs/gfs_networking_mode.json|/sf/vs/bin/jq -r '.[].host_name')
       do
          channel_cmd="ip a | grep $(jq -r --arg hname $hname '.[] | select(.host_name == $hname) | .nics[0]' /cfs/vs/gfs_networking_mode.json) 2>/dev/null"
          p_info "$hname: $channel_cmd"
          timeout -t "$g_time_limit" -s KILL /usr/bin/ssh root@"$hname" "$channel_cmd"
       done
    elif [ "$network_type" == "fusion_network" ]; then
       p_info "多网合一模式，如果存在故障可联系协助VN排障..."
    else 
       echo "未知部署模式，暂不支持检测..."
    fi

 
}

# Check the storage private-network NIC speed on every host and warn when any
# link runs at only 1 GbE (best practice requires 10 GbE).
function check_network_trans_rate() {
    local cfg_file=""
    local hosts=""
	declare -A hostname_to_eths
    declare -A hostname_to_type

    if vs_is_eds; then
        cfg_file="/sf/cfg/vs/cluster_vs_ip.json"
    else
        cfg_file="/cfs/vs/gfs_networking_mode.json"
    fi
    if [ ! -e "$cfg_file" ]; then
        log_error "$cfg_file 文件不存在"
        p_error "$cfg_file 文件不存在"
        return 1
    fi
    # Collect the cluster hostnames
    if vs_is_eds; then
        hosts=($(cat "$cfg_file" | /sf/vs/bin/jq -r '.hosts[].hostname'))
    else
        hosts=($(cat "$cfg_file" | /sf/vs/bin/jq -r '.[].host_name'))
    fi
    if [ $? -ne 0 ] || [ -z "$hosts" ]; then 
        log_error "获取 $cfg_file 输出失败，请检查 jq 命令是否正确执行。"
        p_error "获取 $cfg_file 输出失败，请检查 jq 命令是否正确执行。"
        return 1
    fi
    # Resolve the NIC list of each host
    for hostname in "${hosts[@]}"; do
        if vs_is_eds; then
            eths=($(cat "$cfg_file" | /sf/vs/bin/jq -r ".hosts[] | select(.hostname == \"$hostname\") | .private_network.nics[]"))
        else
            eths=($(cat "$cfg_file" | /sf/vs/bin/jq -r ".[] | select(.host_name == \"$hostname\") | .nics[]"))
        fi
        if [ $? -ne 0 ] || [ ${#eths[@]} -eq 0 ]; then 
            log_error "获取 $cfg_file 输出失败，请检查 jq 命令是否正确执行。"
            p_error "获取 $cfg_file 输出失败，请检查 jq 命令是否正确执行。"
            return 1
        fi
        # Non-EDS: additionally record the link-aggregation type per host
        if ! vs_is_eds; then
            type=$(cat "$cfg_file" | /sf/vs/bin/jq -r ".[] | select(.host_name == \"$hostname\") | .type")
            if [ $? -ne 0 ] || [ -z "$type" ]; then 
                log_error "获取 $cfg_file 输出失败或未找到网络聚合类型，请检查 jq 命令是否正确执行。"
                p_error "获取 $cfg_file 输出失败或未找到网络聚合类型，请检查 jq 命令是否正确执行。"
                return 1
            fi
            case "$type" in
                private_network)
                    type_name="无链路聚合"
                    ;;
                bonding_one)
                    type_name="单交换机链路聚合"
                    ;;
                bonding_two)
                    type_name="双交换机链路聚合"
                    ;;
                standard_network)
                    type_name="标准链路聚合"
                    ;;
                *)
                    type_name="未知"
                    ;;
            esac
            hostname_to_type["$hostname"]="$type_name"
            # Standard link aggregation: resolve the bond's slave interfaces
            if [ "$type" == "standard_network" ]; then
                channel="${eths[0]}"
                if [ ! -f "/proc/net/bonding/$channel" ]; then
                    log_error "标准链路聚合的网口所在文件 /proc/net/bonding/$channel 不存在。"
                    p_error "标准链路聚合的网口所在文件 /proc/net/bonding/$channel 不存在。"
                    return 1
                fi
                eths=($(cat /proc/net/bonding/"$channel" | grep -oP 'Slave Interface: \K\S+'))
                if [ $? -ne 0 ] || [ ${#eths[@]} -eq 0 ]; then 
                    log_error "获取标准链路聚合的网口失败或未找到从属接口。"
                    p_error "获取标准链路聚合的网口失败或未找到从属接口。"
                    return 1
                fi
            fi
        fi
        hostname_to_eths["$hostname"]=$(printf "%s " "${eths[@]}")
    done
    
    solution="虚拟存储卷的虚拟存储通信口是千兆网络，不符合最佳实践，最佳实践要求采用万兆网络，请联系研发排查"
    inf=""

    for hostname in "${hosts[@]}"; do
        cur_eths="${hostname_to_eths["$hostname"]}"
        cur_type="${hostname_to_type["$hostname"]}"

        # Build the remote command (unquoted heredoc: locals are interpolated)
        command=$(cat <<EOF
IFS=' ' read -r -a eth_array <<< "$cur_eths"
for eth in "\${eth_array[@]}"; do
    speed=\$(/sbin/ethtool "\$eth" | grep "Speed" | awk '{print \$2}' | sed 's/Mb\/s//g')
    
    if [ \$? -ne 0 ]; then 
        log_error "获取虚拟存储通信口 \$eth 速度失败。"
        exit 1
    elif [ "\$speed" -le 1000 ]; then
        if [ -z "$cur_type" ]; then
            echo "告警: 主机 $hostname 上的网络接口 \$eth 的速度为 \$speed Mb/s"
        else
            echo "告警: 主机 $hostname 上的网络接口 \$eth 的速度为 \$speed Mb/s，链路聚合类型为：${cur_type:-未知}"
        fi
    fi
done
EOF
)
		#echo "command=$command"
		result=$(timeout -t "$g_time_limit" -s KILL /usr/bin/ssh root@"$hostname" "$command" 2>/dev/null)
		exit_status=$?
		if [ ! -z "$result" ]; then
			inf+=$'\n'"$result"
		elif [ $exit_status -eq 137 ]; then
			p_timeout "所在主机 host: $hostname"
			testcase_timeout_stats "$command"
		fi
	done
    if [ -n "$inf" ]; then
        testcase_failed_stats "$inf" "$solution"
    fi
}

function check_network_conf() {
    # HCI-only (non-EDS) storage-network configuration sanity checks.
    if vs_is_eds; then
        return
    fi

    # Some hosts configured with a storage network while others are not can
    # wedge upgrades to containerized builds before HCI 6.11.0.
    inf="检测到集群内部分主机没有配置存储网络，可能导致升级到HCI6.11.0之前的容器化版本升级卡住"
    solution="为集群内所有主机配置存储网络，或者删除没有配置存储网络的主机"
    if [ -f "${g_hci_network_path}" ] && [ $g_hci_ver -lt 6110 ]; then
        if [ "$(cat ${g_hci_members} | jq .nodelist | grep host | wc -l)" != "$(cat ${g_hci_network_path} | jq | grep host_name | wc -l)" ] ;then
            testcase_failed_stats "$inf" "$solution"
        fi
    fi

    # A missing ping_check directory breaks entering single-host maintenance mode.
    inf="检测到集群不存在${g_hci_ping_check}目录，可能导致升级或者进入主机维护模式时报错：进入单主机维护模式失败"
    solution="优先让客户开通存储授权，否则临时手动创建${g_hci_ping_check}目录"
    if [ ! -d "${g_hci_ping_check}" ]; then
        testcase_failed_stats "$inf" "$solution"
    fi
}

# 2. Probe the current health of the storage private network:
#    live flood-ping of every peer, plus historical bond-down / packet-loss /
#    latency records from the kernel log and blackbox logs.
function check_network_status()
{
    p_trace "\n探测存储私网当前网络健康状况: "
    inf="\n\t1、丢包对性能影响: \n\t1）、百分之几的丢包会对业务严重影响，业务将出现不可用情况; \n\t2）、千分之几的丢包会引发业务卡顿，甚至出现虚拟机短暂挂起; \n\t3）、万分之几的丢包会引起性能下降，可能出现明显低于标准测试性能。\
        \n\t2、时延对性能影响: \n\t1）、正常万兆网，时延50us左右的时延，且波动稳定；\n\t2）、0.几ms，性能会下降，特别是低深度；\n\t3）、几ms时延存储时延明显升高，业务出现卡顿"
    # Peer private IPs from /etc/hosts: host-* entries minus host-mgr,
    # skipping the first line (the local host).
    priv_ips=$(cat /etc/hosts | grep host- | grep host-mgr -v | awk '{print $1}'| sed 1d);
    # Flood-ping each peer (1000 packets, 2s budget) and summarize
    # loss / avg latency / max latency from the ping statistics line.
    for ip in $priv_ips
    do 
        echo -n $(grep $ip /etc/hosts|awk '{print $2}');
        echo -n " "$ip; 
        ping_out=$(ping $ip -c 1000 -i 0.001 -w 2 -f | egrep "transmitted|rtt" | tr "\n" " " | sed 's/\// /g' | awk '{printf(" %0.1f%% packet loss, latency %0.3fms, max:%0.3fms.\n", ($1-$4)*100/$4, $18, $19)}');
        # Any loss (or empty output, i.e. host unreachable) fails the case.
        if ! echo "$ping_out" | grep -q "0.0% packet loss"; then
            testcase_failed_stats "对 $ip 的 ping 测试失败，可能目标主机不可达（主机离线等）" "请检查主机 $ip 是否离线或存储网络异常"
        else
            echo "$ping_out"
        fi
    done
    p_trace "网络健康状况分析: \n$inf" 

    log_info "检索最近一天内核是否有存储bond口down记录: '$g_day'"
    # Derive the blackbox directory name (time, date digits without dashes)
    # and the grep pattern (day, YYYY-MM-DD) from the -t argument when
    # given; otherwise use today's records.
    if [ ! -z "$g_time" ]; then
        time=$(echo "$g_time" | awk '{print $1}' | tr -d '-') 
        day=$(echo "$g_time" | awk '{print $1}')
    else
        day=$(date '+%Y-%m-%d')
        time='today'  
    fi

    solution="\t1、确认认为的拔插网口操作；\n\t2、检查物理网络；\n\t3、尝试执行'asan_ops -s check -t network' 进一步检查"
    # Kernel log: storage bond interface reported down.
    check_and_solution 'zgrep -E "sf_vs_bond.*now down" /sf/log/"'$g_day'"/kernel.log'$g_suffix' 2>/dev/null| '"$g_log_filter"'' "$inf" "$solution"
    
    p_trace "\n检索最近一天blackbox是否存在丢包记录: "
    check_and_solution 'zgrep -E "loss|\-\-\-\-|\=\=\=\=\=|\*\*\*" /sf/log/blackbox/'$time'/vs_check_net_status_recor* 2>/dev/null | grep -vE " 0% packet loss" | grep loss -B1' "$inf" "$solution"
    
    check_and_solution 'egrep "'$day'|loss" /sf/log/blackbox/'$time'/LOG_vs_ping.txt 2>/dev/null| grep -vE "0% packet loss" | grep loss -B1' "$inf" "$solution"

    p_trace "\n检索最近一天blackbox是否存在延时超过10ms的记录:"
    # time=[1-9][0-9].* matches ping rtt values of 10ms or more.
    check_and_solution 'grep -E "'$day'|time=[1-9][0-9].*" /sf/log/blackbox/'$time'/LOG_vs_ping.txt | grep icmp_seq -B1 | uniq' "$inf" "$solution"

    p_trace "\n检索最近vs_check_net* blackbox是否存在延时超时10ms的记录:"
    check_and_solution 'grep -E "'$day'|time=[1-9][0-9].*" /sf/log/blackbox/'$time'/vs_check_net_status* | grep icmp_seq -B1 | uniq' "$inf" "$solution"

    # Storage network health via vs_check_peers_status error-level log lines.
    inf="存储网络异常"
    solution="请检查vs_check_peers_status服务是否重启，如果没有重启，那么存储网络异常"
    log_file="/sf/log/$g_day/vs/scripts/vs_check_peers_status.py.log"
    if [ -e "$log_file" ]; then
        error_log_cmd='zgrep -E "\] E \[" '"$log_file"''"$g_suffix"' 2>/dev/null | '"$g_log_filter"''
        check_and_solution "$error_log_cmd" "$inf" "$solution"
    fi

    # Warn when the virtual-storage communication interfaces are gigabit only.
    p_trace "\n检查存储私网是否是千兆网络： "
    check_network_trans_rate

    # Detect partially-configured storage networks and a missing
    # ping_check directory.
    check_network_conf
}

#检测页面是否存在任务残留
#function check_page_residual_tasks()
#{
#    inf="副本一致但是页面还有数据同步在进行"
#    solution="同步完成了，但页面任务残留。参考案例解决：https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=31755"
    #vs3.0以下版本会出现
#    if [ $g_asan_ver -lt 30 ]; then
        #该案例是基于副本通过一致性检测产生的现象
#        run_cmd_analyze "/sf/vs/bin/vs_localhost_checkok.js check_all; [ \"\$?\" -ne 0 ] && echo \"\$?\"; "
#        if [ $? -eq 0 ]; then
            #cluster_self_heal_dump文件均不存在才会存在页面任务残留
#            cmd_output=$(run_cmd "if [ -e \"/tmp/vs/cluster_self_heal_dump\" ]; then echo \"exists\"; else echo \"not exists\"; fi;")

#            while IFS= read -r line; do
#            if [[ "$line" == "exists" ]]; then
#                testcase_success_stats
#                return 0

#            elif [[ "$line" == "not exists" ]]; then
                #某主机不存在该文件
#                continue

#            elif [[ -z "$line" ]]; then
                #某主机离线无法判断该主机是否存在该文件
#                return 1

#            fi
#            done <<< "$cmd_output"

#            testcase_failed_stats "$inf" "$solution"
#        fi
#    fi 
#}

# Check the license serial number (ukey key_id) on HCI 6.7.0.
# Known issue: only one host in the cluster obtained a key_id, which makes
# "set aSAN serial number" fail on the UI.
function check_authorize_key_id()
{
    inf="设置基础组件(aSAN) 序列号失败:虚拟存储序列号验证失败"
    solution="只有一台主机获取到了key_id,请参考案例解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=265396531"
    file="/sf/vs/bin/vs_ukey_clnt.js"

    # Not applicable on EDS deployments.
    if vs_is_eds; then
        return 0 
    fi

    # This case only applies to HCI 6.7.0; skip every other version.
    if [ $g_hci_ver -ne 670 ]; then
        return 0
    fi

    # Ask every host whether the ukey client exists.
    # BUG FIX: the inner quotes must be escaped — unescaped they terminated
    # the outer string and the remote command was assembled without quoting.
    file_output=$(run_cmd "if [ -e \"$file\" ]; then echo \"exists\"; else echo \"not exists\"; fi")

    # Parse per-host results: abort the check if the tool is missing on a
    # host, or if a host is offline / timed out (empty line).
    while IFS= read -r line; do
        if [[ "$line" == "not exists" ]]; then
            return 1
        elif [[ -z "$line" ]]; then
            return 1
        fi
    done <<< "$file_output"

    key_id_output=$(run_cmd "$file" -i | grep -v "failed")

    # Count hosts that reported a purely numeric key_id.
    key_id_count=0
    while IFS= read -r line; do
        if [[ $line =~ ^[0-9]+$  ]]; then
            key_id_count=$((key_id_count + 1))
        fi
    done <<< "$key_id_output"

    # Exactly one key_id means only a single host is licensed — the known issue.
    if [ $key_id_count -eq 1 ]; then
        testcase_failed_stats "$inf" "$solution" 
    else
        testcase_success_stats
    fi
}

# Check whether a failed trash-directory path resolution is making the NFS
# mount point hang (requests returning -1 get swallowed).
function check_trash_directory()
{
    # Only aSAN [3.0, 3.6) is affected; fixed in 3.6.
    if [ $g_asan_ver -lt 30 ] || [ $g_asan_ver -ge 36 ]; then
        return 0
    fi
    
    volume_num=$(/sf/vs/bin/vsmgr volume list | wc -l)
    # Only single-volume environments are checked; skip multi-volume setups.
    if [[ "$volume_num" != "1" ]]; then
        return 0
    fi
    
    # Look for the failure signature in today's glusterfs NFS log on all hosts.
    trash_error_log=$(run_cmd_analyze 'zgrep "trash_sync_resolve_path_task failed" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' \
    2>/dev/null | '"$g_log_filter"'' )
    
    if [[ -z "$trash_error_log" ]];then
        return 0
    fi

    inf="trash目录解析失败直接返-1，导致nfs请求被吞掉，挂载点卡住"
    solution="参考解决方案：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=268878325"
    volume_name=$(/sf/vs/glusterfs/sbin/gluster v list)

    # Dump the trash directory xattrs on every meta brick and keep only the
    # 'dirty' entries.
    results=$(vs_cluster_cmd.sh e '/sf/vs/bin/getfattr -d -m . -e hex /sf/data/vs/local/*-meta/*/.vs/trash' --volume "$volume_name" | grep dirty)

    # Set when any dirty xattr ends with the all-ones marker.
    is_abnormal=false

    # Inspect each dirty line; the last 8 hex chars being ffffffff is the
    # failure signature.
    while IFS= read -r line; do
        if [[ "${line: -8}" == "ffffffff" ]]; then
            is_abnormal=true
            break
        fi
    done <<< "$results"

    # is_abnormal holds a command name (true/false), so it is executed
    # directly — no brackets needed.
    if $is_abnormal; then
        testcase_failed_stats "$inf" "$solution"
    else
        testcase_success_stats
    fi
}

# Check whether a host crashed today because of the sffs driver
# (a kdump/dmesg file dated today under /sf/log/kdump/ is the symptom).
function check_dmesg_file()
{   
    # Not applicable on EDS deployments.
    if vs_is_eds; then
        return 0 
    fi

    # The issue exists from HCI 6.7.0 and was fixed in 6.9.0.
    if [ $g_hci_ver -lt 670 ] || [ $g_hci_ver -ge 690 ]; then 
        return 0
    fi 

    current_date=$(date "+%Y_%m_%d")
    count_of_dmesg=$(run_cmd "ls /sf/log/kdump/ 2>/dev/null | grep \"$current_date\"| wc -l")
    is_abnormal=false
    # Parse the per-host counts.
    # BUG FIX: the empty-line case (host offline / ssh timeout) must be
    # tested FIRST — an empty string also satisfies `!= "0"`, so the old
    # ordering wrongly flagged offline hosts as abnormal instead of
    # skipping the check.  (The old `!= $'\n'` clause was unreachable:
    # `read` strips the newline.)
    while IFS= read -r line; do
        if [[ -z "$line" ]]; then
            # Host offline or timed out: skip this check entirely.
            return 1
        elif [[ "$line" != "0" ]]; then
            # Normally no kdump file dated today should exist.
            is_abnormal=true
        fi
    done <<< "$count_of_dmesg"

    inf="sffs驱动异常导致主机宕机离线"
    solution="参考案例解决https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=34422"
    if $is_abnormal; then
        # Confirm the crash signature inside today's kdump files.
        check_and_solution "grep -H 'exception.*check_sffs_lock_wrapper' /sf/log/kdump/*\"$current_date\"* 2>/dev/null" "$inf" "$solution"
    else
        return 0
    fi
}

# Verify on every cluster host that tier writeback is not disabled
# (writeback == 0 in /sf/cfg/vs/cache/tier.json); warn when it is, since
# that is only expected on all-flash deployments.
function check_tier_writeback_status()
{
    p_info "开始检查分层回写功能"

    local host writeback
    for host in $(cat /etc/hosts | grep host- | grep -v mgr | awk '{print $2}')
    do
        # jq emits "0" only when writeback is disabled; empty otherwise.
        writeback=$(timeout -t 4 -s KILL /usr/bin/ssh root@${host} cat /sf/cfg/vs/cache/tier.json | jq '.writeback | select(. == 0)')
        if [ -n "${writeback}" ] && [ "${writeback}" != "1" ]; then
            p_error "host ${host} 分层回写功能关闭,请注意检查是否为全闪环境"
        fi
    done
}

# Warn when the tier eviction threshold leaves more than 5% free space,
# i.e. the eviction watermark is below the recommended 95%.
function check_tier_free_space()
{
    local issue solution probe
    issue="检测到分层淘汰阈值低于95%，建议调整分层淘汰阈值到95%"
    solution="参考案例：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=334407007"
    # vs_tier_cli reports the free_space parameter; > 5 means < 95% watermark.
    probe='if [ "$(/sf/vs/bin/vs_tier_cli.py -c getparam -a free_space)" -gt 5 ]; then echo "分层淘汰阈值低于95%" ;fi'
    check_and_solution "$probe" "$issue" "$solution"
}

# Flag a possible tierd memory leak: more than 50000 entries in
# /proc/<tierd pid>/maps is considered abnormal.
function check_tier_maps()
{
    local issue solution probe
    issue="检测到分层maps数量异常存在内存泄露的风险"
    solution="参考案例：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=339444085"
    probe='if [ "$(cat /proc/`pidof tierd`/maps | wc -l)" -gt 50000 ]; then echo "分层maps数量异常" ;fi'
    check_and_solution "$probe" "$issue" "$solution"
}

# Check the local host's bricks for EFS space leakage by comparing the
# PREALLOC view (vs_rpc_tool prealloc_info) with the EFS view
# (efs_dbg "super statfs").  Less than 4GiB of EFS free space is treated
# as a leak.  Only the current host is inspected.
function check_efs_space_leak()
{
    local ver
    local vol_host_cnt
    local info=""
    local solution="请参考kb：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=327969753解决"

    ver=$(head -n1 /sf/vs/version )
    vol_host_cnt=$(/sf/vs/bin/vsmgr volume hosts |wc -l)

    # Only aSAN >= 3.0 with at least 3 volume hosts is relevant.
    if [[ "${ver:0:3}" < "3.0" ]] || [[ $vol_host_cnt -lt 3 ]]; then
        return
    fi

    p_info "检查EFS空间是否存在泄漏风险"

    local h_name=$(hostname)
    if [ -z "${h_name}" ]; then
        return
    fi

    # Iterate over this host's data bricks (arbiter and meta bricks excluded).
    for brickid in $(/sf/vs/glusterfs/sbin/gluster v i | grep ${h_name} | grep -v arbiter | grep -v meta | awk '{print $2}' | awk -F':' '{print $2}');
    do
        prealloc_result=$(vs_rpc_tool --cmd prealloc_info --brickid $brickid)
        real_avail_size=$(echo "$prealloc_result"| grep -w real_avail_size | awk '{print $NF}')
        if [ -z "$real_avail_size" ]; then
           p_warn "磁盘：$brickid 可能已经离线，请确认"
           continue
        fi
        # Scale down by 1024^2 — presumably bytes to MiB; TODO confirm
        # against vs_rpc_tool output units.
        real_avail_size=$(($real_avail_size/1024/1024))

        pid=$(ps auxf | grep $brickid | grep -v super | grep -v grep | awk '{print $2}')

        f_bfree=$(/sf/vs/sbin/efs_dbg -p $pid -c "super statfs" 2>/dev/null | grep -w f_bfree | awk '{print $NF}')

        # BUG FIX: efs_dbg can fail (brick process gone, empty pid), leaving
        # f_bfree empty and making the numeric test below error out.
        if [ -z "$f_bfree" ]; then
            p_warn "磁盘：$brickid 获取EFS剩余空间失败，请确认"
            continue
        fi

        grep $brickid /sf/log/blackbox/today/LOG_vs_volume_info.txt | head -n 1
        echo "剩余空间, PREALLOC: $real_avail_size, EFS: $f_bfree"

        # EFS free space below 4GiB is treated as a leak.
        if [ $f_bfree -lt 4096 ]
        then
            info="host $(hostname) 疑似EFS空间泄漏"
        fi
    done
    if [ -n "$info" ]; then
        testcase_failed_stats "仅对当前主机$(hostname)进行了EFS空间泄漏检测：$info" "$solution"
    fi
}

# Check every cluster host for tier new-data space leakage:
# tier_new_size or real_avail_size exceeding avail_size in prealloc_info
# indicates a leak.  Runs a heredoc-built script on each host over ssh.
function check_tier_new_data_leak()
{
    # Only aSAN >= 3.0 is affected.
    if [ $g_asan_ver -lt 30 ]; then
        return
    fi

    local vol_host_cnt
    vol_host_cnt=$(/sf/vs/bin/vsmgr volume hosts |wc -l)
    # Requires at least 3 volume hosts.
    if [ $vol_host_cnt -lt 3 ]; then
        return
    fi

    p_info "检查分层空间是否存在泄漏风险"

    # Build the remote script in a quoted heredoc so no escaping (and no
    # local expansion) is needed; the whole text is sent to ssh verbatim.
    local here_doc
here_doc=$(cat <<-'EOF'
    function do_check_tier_new_data_leak()
    {
        # 1. 遍历主机上所有brick, 拿到brickid
        /sf/vs/glusterfs/sbin/gluster v i | grep "$(hostname)" | grep -v arbiter | grep -v meta | awk '{print $2}' | awk -F':' '{print $2}' | while read -r brickid;
        do
            prealloc_result=$(/sf/vs/bin/vs_rpc_tool --cmd prealloc_info --brickid $brickid);
            if [ -z "$prealloc_result" ]; then
                echo "获取 prealloc_info 失败: $brickid"
                continue
            fi
            tier_new_size=$(echo "$prealloc_result" | /sf/vs/bin/jq .tier_new_size);
            real_avail_size=$(echo "$prealloc_result" | /sf/vs/bin/jq .real_avail_size);
            avail_size=$(echo "$prealloc_result" | /sf/vs/bin/jq .avail_size);
            # 2. 如果 tier_new_size 大于 avail_size 或者 real_avail_size 大于 avail_size, 报异常
            if [ "$tier_new_size" -gt "$avail_size" ] || [ "$real_avail_size" -gt "$avail_size" ]; then
                echo "brick分层空间检测异常: $brickid , tier_new_size: $tier_new_size, real_avail_size: $real_avail_size, avail_size: $avail_size \n";
            fi
        done
    }
    do_check_tier_new_data_leak
    exit
EOF
)

    local fail_cnt
    fail_cnt=0

    # Cluster host list from /etc/hosts (management entries excluded).
    cluster_hosts=$(cat /etc/hosts | grep host- | grep -v mgr | awk '{print $2}')
    for h in ${cluster_hosts}; do
        result=$(timeout -t 12 -s KILL /usr/bin/ssh root@"${h}" "$here_doc" 2>&1)

        # Check the ssh/timeout exit status of the remote run.
        if [ $? -ne 0 ]; then
            echo "在主机 ${h} 上执行 do_check_tier_new_data_leak 失败: $result"
        else
            # Any output from the remote script is a per-brick finding.
            if [ -n "$result" ]; then
                echo "$h 主机检测失败:"
                p_warn "$result"  # print the per-host findings
                fail_cnt=$((fail_cnt + 1))
            fi
        fi
    done

    if [ "$fail_cnt" -gt 0 ]; then
        testcase_failed_stats "brick分层空间检测异常" "分层new data检测异常, 参考解决方案: https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=499976289 \n"
    fi 
}

# Detect a glusterfs NFS process stuck in the stopped (T) state.
function check_nfs_T_status()
{
    local issue="glusterfs进程为T状态"
    local kb="参考案例：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=475704654"
    # Resolve the nfs pid from the process list, then query its state via
    # ps and keep it only when it is 'T'.
    local probe="ps -o state= -p \$(ps aux | grep nfs.pid | grep -vE 'grep|supervise' | awk '{print \$2}') 2>/dev/null | grep T"
    check_and_solution "$probe" "$issue" "$kb"
}

# Health check for the rpyc_daemon service.
function check_rpyc_daemon()
{
    # 1. Too many ethtool children spawned by rpyc_daemon — if they hang
    #    long enough the service stops answering (HCI >= 6.8.0 only).
    if [ $g_hci_ver -ge 680 ]; then
        local issue="rpyc_daemon服务发起的ethtool命令太多，需要进一步判断ethtool命令是否卡住，长时间卡住可能导致rpyc_daemon服务无法正常提供服务。"
        local kb="参考案例：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=478221235"
        local probe='if [ "$(ps auxf | grep -v grep | grep rpyc_daemon -A 1 | grep ethtool | wc -l)" -gt 20 ]; then echo "ethtool命令过多" ;fi'
        check_and_solution "$probe" "$issue" "$kb"
    fi
}

# Detect a broken glusterd configuration that prevents nfs from starting
# ("failed to fetch volume file" in the glusterfs NFS log).
function check_nfs_server()
{
    local log_cmd
    inf="当前nfs无法启动"
    solution="参考案例：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=328696227"
    log_cmd='zgrep "failed to fetch volume file" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' 2>/dev/null | '"$g_log_filter"''
    check_and_solution "$log_cmd" "$inf" "$solution"
}

# Check every volume master for a glusterd deadlock: run
# `gluster volume status` over ssh with a kill timeout — a deadlocked
# glusterd hangs the command until the timeout fires (non-zero status).
# Returns 0 when all masters respond, 1 otherwise.
function check_glusterd_lock()
{
    local ret=0
    local cmd="/sf/vs/glusterfs/sbin/gluster volume status  1>/dev/null"
    local inf="磁盘服务无法上线/运维操作失败"
    local solution="\t检测卷主控执行gluster volume status 失败或超时，请进一步检查的glusterd服务是否死锁，需要检查所有主机。"

    log_info "run host: $g_master_hosts cmd: ${cmd}"
    for H in $g_master_hosts
    do
        # Skip masters that are not resolvable through /etc/hosts.
        if ! grep -q "${H}" /etc/hosts; then
            continue
        fi
        result=$(timeout -t ${g_time_limit} -s KILL /usr/bin/ssh root@${H} "${cmd}")
        local tmp=$?

        # Timeout (deadlock) or ssh failure: remember the failure but keep
        # probing the remaining masters.
        if [ $tmp -ne 0 ]; then
            log_warn "command ${cmd} failed on host: ${H}"
            p_timeout "所在主机 host: ${H}"
            ret=1
        fi

        # BUG FIX: the original logged "$@" (this function's arguments,
        # which are always empty) instead of the command that was run.
        log_info "run cmd: ${H}: ${cmd} result: $result ret: $tmp"
    done

    if [ $ret -eq 1 ]; then
         testcase_failed_stats "$inf" "$solution"
         return 1
    fi
    testcase_success_stats
    return 0
}

# Check that every host sees the same number of gluster peers: collect the
# peer files from all hosts, count "hostname" entries per peer, and verify
# all counts are equal.
function check_peers_numbers()
{
    inf="检测到主机peers数量不一致"
    solution="参考kb处理: https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=328696227"

    file_output=$(run_cmd 'grep host /sf/cfg/vs/glusterfs/glusterd/peers/*')
    while IFS= read -r line; do
        if [[ -z "$line" ]]; then
            # Host offline / ssh timeout: skip this check.
            return 1
        fi    
    done <<< "$file_output"

    p_trace "检测主机peers数量是否一致\n" 

    # BUG FIX: "$file_output" must be quoted. Unquoted, echo collapsed all
    # lines into a single line, so `sort | uniq -c` always produced exactly
    # one count and the awk comparison could never detect an inconsistency.
    echo "$file_output" | grep hostname | sort | uniq -c | awk '{
    counts[NR] = $1
} END {
    first_count = counts[1]
    for (i = 2; i <= NR; i++) {
        if (counts[i] != first_count) {
            exit 1;
        }
    }
    exit 0;
}'

    if [ $? -eq 0 ];then
        p_info "主机peers数量一致"
        log_info "主机peers数量一致"
        testcase_success_stats
    else
        p_error "$inf, $solution"
        log_error "$inf, $solution"
        testcase_failed_stats "$inf" "$solution" 
    fi
}

# Verify the vs_cmd_proxy_server service: its unix socket must exist and
# the NFS log must not show rpc.statd startup failures — either symptom
# makes nfs restart in a loop.
function check_vs_cmd_proxy_server()
{
    p_trace "检查 vs_cmd_proxy_server 是否正常"
    
    local inf solution
    inf="/tmp/vs_cmd_proxy.socket 文件不存在, 可能导致nfs一直重启"
    solution="检查vs_cmd_proxy_server.js服务是否异常"
    # The proxy socket must be present.
    check_and_solution 'if [ ! -e "/tmp/vs_cmd_proxy.socket" ]; then echo "/tmp/vs_cmd_proxy.socket不存在";fi' "$inf" "$solution"

    inf="vs_cmd_proxy_server服务异常, 可能导致nfs一直重启"
    solution="按照内部KB处理: https://wiki.sangfor.com/pages/viewpage.action?pageId=90839654"
    check_nfs_log_cmd='zgrep "unable to start rpc.statd" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' 2>/dev/null | '"$g_log_filter"''
    check_and_solution "$check_nfs_log_cmd" "$inf" "$solution"
}

# Detect leftover efs_standalone processes (they leak memory over long
# runtimes).  Checked on HCI < 6.2.0 and on EDS environments.
function efs_standalone_remain()
{
    if [ $g_hci_ver -lt 620 ] || vs_is_eds; then
        local issue="efs_standalone进程残留"
        local advice="efs_standalone存在内存泄露，长时间运行会占用大量内存，请联系研发清理残留进程"
        local probe='ps auxf | grep -v grep | grep -w efs_standalone'
        check_and_solution "$probe" "$issue" "$advice"
    fi
}

# Verify (HCI >= 5.9.0) that every arbiter host still carries its cluster
# tag; a missing tag makes the arbiter host show as offline on the UI.
# The check itself is a python snippet run with the bundled interpreter;
# anything it prints is treated as the failure description.
function arbiter_cluster_flag_check() {
    if [ $g_hci_ver -ge 590 ]; then
        # The heredoc below is python source passed to `python -c`;
        # its content is runtime data and must stay exactly as-is.
        local python_code=$(cat <<EOF
# -*- coding:utf-8 -*-

from volume_mgr.arbiter_func.arbiter_host import ArbiterList, ArbiterHost
arbiter_hosts = {}
try:
    al = ArbiterList()
    if al._conf:
        arbiter_hosts = al._conf.content['hosts']
    else:
        exit(0)
except Exception as ex:
    print("获取仲裁主机信息失败，请检查Zookeeper服务是否正常")

for host in arbiter_hosts:
    try:
        ah = ArbiterHost(host)
        if not ah.has_cluster_tag():
            print("检测到仲裁主机{}集群标记不存在，可能导致仲裁主机显示离线，KB：https://wiki.sangfor.com/pages/viewpage.action?pageId=90840192".format(host))
    except Exception as ex:
        print("检测仲裁主机{}集群标记失败，请确保此检测运行在仲裁主机所在卷内主机上，并检查集群到仲裁主机的网络或者防火墙是否正常".format(host))

exit(0)
EOF
)
        # Non-empty output means at least one arbiter problem was reported.
        local inf=$(/sf/vs/bin/python -c "$python_code" 2>/dev/null)
        if [ "$inf" != "" ]; then
            testcase_failed_stats "$inf"
        fi
    fi
}

# 3. Probe whether the storage services are unhealthy.
#    Orchestrates: showmount/cluster info, nfs startup failures, glusterd
#    liveness and deadlock, peer connectivity, brick attachment, tier
#    status, and a series of case-specific sub-checks.
function check_storage_service()
{
    local ret=0

    # Only query showmount when an nfs pid file exists on this host.
    if [ -f /var/run/vs/nfs.pid ]; then
        p_info "检测存储服务集群信息(showmount): "
        vs_cluster_cmd.sh e 'showmount -e 127.0.0.1'
    fi

    # Broken glusterd config prevents nfs from starting.
    # NOTE(review): duplicates check_nfs_server above — consider unifying.
    solution="\t检测glusterd配置是否异常，nfs无法启动；参考案例：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=328696227"
    check_and_solution 'zgrep "failed to fetch volume file" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' 2>/dev/null | '"$g_log_filter"'' "当前nfs无法启动" "$solution"

    # glusterd must be running locally; $? here is pidof's exit status.
    ret=$(pidof glusterd)
    if [ $? -ne 0 ]; then
        p_error "数据面集群管理服务故障(检查卷主控:/sf/log/today/vs/log/glusterfs/glusterd_glusterfs.log)，请联系研发技术支持!"
    fi

    check_glusterd_lock

    # Peers listed as "Peer in Cluster" but not Connected indicate a
    # network/firewall or glusterd config problem.
    solution="\t1、pidof glusterd检查所有主机集群管理服务是否正常；\n\t2、检查网络是否正常(包括防火墙)；\n\t3、检查glusterd配置是否正常(/sf/cfg/vs/glusterfs/glusterd/peers/)"
    check_and_solution '/sf/vs/glusterfs/sbin/gluster peer status | grep "Peer in Cluster" | grep -v Connected' '磁盘服务无法上线/运维操作失败' "$solution"
  
    # If a brick entered silent mode, it can be re-attached quickly by
    # toggling silence off and back on:
    #   off: gluster v set `gluster v list` network.silence-reconn-mode off
    #   on:  gluster v set `gluster v list` network.silence-reconn-mode on
    inf="磁盘服务未接入存储/同步无法完成/双点故障等"
    solution="\t0、如果gluster v i 不存在该brick_id，则不关注；\n\t1、检查对应磁盘(数据盘、缓存盘)/服务(包括glusterd/防火墙)是否离线，并检查定时拉起日志(/sf/log/today/vs/scripts/vs_brick_dog.sh.log)；\
              \n\t2、检查对应主机是否故障（网络、内存、假死等）；\n\t3、以上都检查通过,即进程存在但是无法接入，可能brick暂时进入静默状态，联系研发手动关闭再开启静默"
    # Detect unattached bricks: N/A status on aSAN 2.x, UNNORMAL clients on 3.x.
    if [ $g_asan_ver -lt 30 ]; then
        /sf/vs/glusterfs/sbin/gluster volume status | grep -vwE "NFS|local|Self-heal"| grep "N/A$" -B3
    else  
        vs_rpc_tool --cmd clnt 2>/dev/null| grep "UNNORMAL" -B5 
    fi
    if [ $? -eq 0 ]; then
        testcase_failed_stats "$inf" "$solution" 
    else 
        testcase_success_stats
    fi
    
    #p_info "跨卷运行的虚拟机列表： " 
    #vs_cluster_cmd.sh e 'netstat -nlap | grep "2049.*ESTABLISHED.*kvm" -w | grep -v "127.0.0.1" -w'
   
    # Tier checks only when the tier config exists.
    if [ -f /sf/cfg/vs/cache/tier.json ]; then
        p_info "检测分层相关信息:(配置文件[配置和dump是否一致]、分层空间剩余量[<5%]) "
        vs_cluster_cmd.sh e "cat /sf/cfg/vs/cache/tier.json | /sf/vs/bin/jq .tier_devs[].dev_path; /sf/vs/bin/vs_tier_cli.py -c dump | /sf/vs/bin/jq '.ssd[]|{ssd_uuid,free_block_cnt,available_block_cnt}|.ssd_uuid, .free_block_cnt/.available_block_cnt'"
        check_tier_writeback_status
        if [ $g_hci_ver -ge 690 ];then
            check_tier_free_space
        fi
        if [ $g_hci_ver -lt 6101 ];then
            check_tier_maps
        fi
    fi

    # Versions before 6.7.0 that still ship zookeeper (aSAN 3.0 - 3.2).
    if [ $g_asan_ver -ge 30 ] && [ $g_asan_ver -lt 33 ]; then
        # Not applicable on EDS deployments.
        if vs_is_eds; then
            return 0 
        fi
        inf="卷列表页面看不到卷信息;zk的数据挂载目录文件系统只读，导致ZK服务启动失败"
        solution="\t1.迁移故障主机的虚拟机运行位置\n\t2.重启主机\n\t3.参考案例https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=188643706"
        check_and_solution "mount | grep -w ro | grep -E \"/sf/vs/vscfg.*errors\"" "$inf" "$solution"
    fi

    # Detect residual tasks on the UI (currently disabled).
#    check_page_residual_tasks

    # Check the license key id.
    check_authorize_key_id

    # Check whether trash-directory resolution failures hang the mount point.
    check_trash_directory

    # Check for host crashes caused by the sffs driver.
    check_dmesg_file

    # Check for EFS space leakage.
    check_efs_space_leak

    # Check for tier new_size leakage (EFS vs prealloc_info mismatch).
    check_tier_new_data_leak

    # Check whether the glusterfs nfs process is in the T (stopped) state.
    check_nfs_T_status

    # Check whether any glusterfsd process is in the Z (zombie) state.
    check_glusterfsd_Z_status

    # Check the rpyc_daemon service.
    check_rpyc_daemon

    # Check that every host's peer configuration agrees.
    check_peers_numbers

    # Check the vs_cmd_proxy service.
    check_vs_cmd_proxy_server

    # Check for leftover efs_standalone processes.
    efs_standalone_remain
}

# Check whether a directory's filesystem usage exceeds a threshold.
#   $1 - failure description, $2 - solution text,
#   $3 - directory to check, $4 - usage threshold in percent (optional,
#        defaults to 99 when missing or non-numeric).
function check_directory_space()
{
    inf="$1"
    solution="$2"
    directory="$3"
    thresh="$4"
    if [[ "$thresh" =~ ^[0-9]+$ ]] ; then
        # Normalize in base 10 (handles leading zeros) without forking
        # `expr`, which the original spawned for every call.
        thresh=$((10#$thresh))
    else
        thresh=99
    fi
    # Build the remote probe: df the directory (only if it exists) and
    # print a warning when the Use% column reaches the threshold.
    check_directory_space_cmd="if [ -d \"$directory\" ]; then \
                                    df \"$directory\" | awk 'NR==2 && \$5+0 >= $thresh {print \"$directory空间超过百分之$thresh\"}' 2>/dev/null;
                               fi"
    check_and_solution "$check_directory_space_cmd" "$inf" "$solution"
}

# Check the usage of well-known system directories, each with its failure
# symptom and KB link; delegates the actual df probe to
# check_directory_space (default threshold 99%, /run/lock uses 90%).
function check_directory_space_capacity()
{
    # /dev space check
    inf="1.bd读写数据失败\n\t2.数据同步任务失败"
    solution="\t1.释放/dev/的空间后，手动执行命令udevadm trigger触发生成/dev/disk/by-asan/的符号链接\
          \n\t2.参考案例：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=180916884"
    check_directory_space "$inf" "$solution" "/dev"

    # /run/shm (tmp) space check
    inf="/tmp空间满，虚拟存储服务启动异常"
    solution="参考解决方案看是否是下面两个问题：\n\t1.https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=30873\n\t2.https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=14601"
    check_directory_space "$inf" "$solution" "/run/shm"

    # /sf/log space check
    inf="/sf/log空间满，数据同步异常、虚拟机挂起"
    solution="使用lsof /sf/log/|grep vs_tierd; 1.参考解决方案：https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=16292&search=686b314f496a4a79446b794257&highlight=;2.https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=330619976"
    check_directory_space "$inf" "$solution" "/sf/log"

    # /sf/data/local space check
    inf="/sf/data/local空间满，部分主机PKG包解压异常"
    solution="参考解决方案：https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=32301&search=686b314f496a4a79446b794257&highlight="
    check_directory_space "$inf" "$solution" "/sf/data/local"


    # /var space check
    inf="/var空间满，导致服务异常,如防火墙服务等"
    solution="参考解决方案：https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=15331"
    check_directory_space "$inf" "$solution" "/var"

    # /run space check
    inf="/run空间满"
    solution="请查看/run目录下存在哪些内容"
    check_directory_space "$inf" "$solution" "/run"

    # /sf/cfg space check
    inf="/sf/cfg空间满"
    solution="请查看/sf/cfg目录下存在哪些内容"
    check_directory_space "$inf" "$solution" "/sf/cfg"

    # /sf/vs/vscfg space check
    inf="/sf/vs/vscfg空间满"
    solution="请查看/sf/vs/vscfg目录下存在哪些内容"
    check_directory_space "$inf" "$solution" "/sf/vs/vscfg"

    # /run/lock space check (lower 90% threshold)
    inf="/run/lock空间占用超过90%，空间满会导致磁盘格式化失败等问题"
    solution="请查看/run/lock目录下存在哪些内容并确认是否是kb问题：https://support.sangfor.com.cn/cases/list?product_id=33&type=1&category_id=28466"
    check_directory_space "$inf" "$solution" "/run/lock" "90"

    # Detect a negative used-space value on the storage mount point.
    # Not checked on EDS environments.
    if ! vs_is_eds; then
        # Seen on HCI 5.8.6 - 6.7.0 (aSAN 3.0 - 3.3).
        if [ $g_asan_ver -ge 30 ] && [ $g_asan_ver -le 33 ]; then   
            inf="提示虚拟存储空间不足,虚拟存储无法创建新文件"
            solution="\t1.参考案例解决https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=227132630\
                    \n\t2.预警编号YJ20230418001"
            check_mount_capacity_cmd="df | grep /sf/data/vs/gfs | awk '\$3+0 < 0 {print \"挂载点存储空间使用大小为负数\"}' 2>/dev/null"
            check_and_solution "$check_mount_capacity_cmd" "$inf" "$solution"
        fi 
    fi

    # Check whether the tier has been exhausted (cache breakthrough).
    check_tierd_free_block_cnt
}

# 5. Check storage capacity: inode and data usage of the local bricks,
#    well-known directory usage, and "No space left on device" signatures
#    in the glusterfs logs.
function check_storage_capacity()
{
    solution="检测存储容量(posix/vg), 都有可能文件多， inode(代表元数据空间, 小文件太多), data(代表数据空间不够), 关注备份文件个数，快照meta备份个数，业务文件个数等 "
    # BUG FIX: "$solution" must be quoted in every call below — unquoted it
    # word-split, so check_and_solution only received its first word.
    check_and_solution 'df -hi 2>/dev/null| grep "/sf/data/vs/local" | while read -r path total use free raito mount ; do [ ${raito//%/} -gt 95 ] && echo -e "$mount inode space warning: $raito"; done ' '业务挂起' "$solution"
    # BUG FIX: the data-space check ran `df -hi` (inode view) like the
    # inode check above; it must use `df -h` to look at data usage.
    check_and_solution 'df -h 2>/dev/null| grep "/sf/data/vs/local" | while read -r path total use free raito mount ; do [ ${raito//%/} -gt 95 ] && echo -e "$mount data space warning: $raito"; done ' '业务挂起' "$solution"

    # Per-directory capacity checks.
    check_directory_space_capacity

    # Log signature checks for ENOSPC across the glusterfs log families.
    solution="1、如果是两主机或者aSAN2.x，单个磁盘组空间满，手动平衡磁盘组; \t2、沟通删除非重要文件/迁移业务到其他集群；\t3、扩容"
    check_and_solution 'zgrep "No space left on device" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' 2>/dev/null| '"$g_log_filter"'' '业务挂起/延时飙升' "$solution"
    check_and_solution 'zgrep "No space left on device" /sf/log/vs/log/glusterfs/api/*.log'$g_suffix' -n 2>/dev/null| '"$g_log_filter"'' '业务挂起/延时飙升' "$solution"
    check_and_solution 'zgrep "No space left on device" /sf/log/'$g_day'/vs/log/glusterfs/api/gluster_tgtd.log'$g_suffix' 2>/dev/null|'"$g_log_filter"'' '业务挂起/延时飙升' "$solution"
    check_and_solution 'zgrep "No space left on device" /sf/log/'$g_day'/vs/log/glusterfs/bricks/glusterfsd_sf-data-vs-local-*.log'$g_suffix' 2>/dev/null | grep -v shm|'"$g_log_filter"'' '业务挂起/延时飙升' "$solution"
    check_and_solution 'zgrep "No space left on device" /sf/log/vs/log/glusterfs/bricks/glusterfsd_sf-data-vs-local-*.log'$g_suffix' 2>/dev/null|'"$g_log_filter"'' '业务挂起/延时飙升' "$solution"

    # Show VG usage on two-host clusters and on aSAN 2.x.
    vs_is_two_hosts
    if [[ $? -eq 0 || $g_asan_ver -lt 30 ]]; then 
        vs_cluster_cmd.sh e '/sf/vs/sbin/vgs 2>/dev/null'
    fi 
}

# Check the read/write cache services: validate that the wcache/ssdcd
# config files are intact JSON, then report any cache device whose status
# is not "ok".  Returns 1 as soon as a config file fails JSON validation.
function check_cache()
{   
    wcache_file="/sf/cfg/vs/cache/wcache.json"
    ssdcd_file="/sf/cfg/vs/cache/ssdcd.json"
    # First test that the config file exists (avoids false alarms on hosts
    # that are not volume members), then validate the JSON with jq -e.
    # stdout goes to /dev/null; stderr follows it.
    wcache_cmd="if [ -e '$wcache_file' ]; then \
            /sf/vs/bin/jq -e . '$wcache_file' > /dev/null 2>&1;\
            if [ \$? -ne 0 ]; then\
                echo \"写缓存配置文件不是json格式\";\
            fi;\
        fi"
    check_and_solution "$wcache_cmd" "写缓存配置文件异常" "请参考https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=34223解决"
    if [ $? -ne 0 ]; then
         return 1
    fi

    # Same JSON sanity check for the read-cache config.
    ssdcd_cmd="if [ -e '$ssdcd_file' ]; then \
            /sf/vs/bin/jq -e . '$ssdcd_file' > /dev/null 2>&1;\
            if [ \$? -ne 0 ]; then\
                echo \"读缓存配置文件不是json格式\";\
            fi;\
        fi"
    check_and_solution "$ssdcd_cmd" "读缓存配置文件异常" "请参考https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=34223解决"
    if [ $? -ne 0 ]; then
         return 1
    fi

    # Report write-cache devices whose status is not "ok".
    inf="写缓存服务"
    solution="'$wcache_file'写缓存配置文件Brick链接状态异常，请参考https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=34223解决"
    check_and_solution " /sf/vs/bin/jq -r '.cacheDevs[] | select(.status != \"ok\") | to_entries[] | \"\(.key): \(.value)\"' '$wcache_file' 2>/dev/null" "$inf" "$solution"

    # Report read-cache SSDs whose ssd_status is not "ok".
    inf="读缓存服务"
    solution="'$ssdcd_file'读缓存配置文件Brick链接状态异常，请参考https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=34223解决"
    check_and_solution " /sf/vs/bin/jq -r '.data.ssd_devs[] | select(.ssd_status != \"ok\") | to_entries[] | \"\(.key): \(.value)\"' '$ssdcd_file' 2>/dev/null" "$inf" "$solution"   
}

# Check that a CA file ($1, e.g. ca.key / ca.crt) exists on every cluster
# host and has the same MD5 everywhere (required for inter-host rpyc/TLS).
# Returns 0 when consistent, 1 otherwise (or when the check cannot run).
function check_ca_file_consistency()
{
    local file="$1"
    local file_output
    local md5_output 
    local most_common_md5
    local different_md5_output=""
    local inf="创建卷时，无法获取集群中所有主机的磁盘信息\n扩容主机时，无法获取集群中所有主机的磁盘信息\n恢复出厂设置，报certificate verify failed失败"
    local solution="参照 https://wiki.sangfor.com/pages/viewpage.action?pageId=90839442 解决"

    # Not applicable on EDS deployments.
    if vs_is_eds; then
        return 0 
    fi

    # Existence probe on every host (affects inter-host rpyc communication).
    # BUG FIX: the inner quotes must be escaped — unescaped they terminated
    # the outer string and the remote command was built without quoting.
    file_output=$(run_cmd "if [ -e \"$file\" ]; then echo \"exists\"; else echo \"not exists\"; fi")

    # Parse per-host results.
    while IFS= read -r line; do
        if [[ "$line" == "not exists" ]]; then
            testcase_failed_stats "$file 文件不存在" "$solution"
            return 1
        elif [[ -z "$line" ]]; then
            # Host offline / ssh timeout: skip this check.
            return 1
        fi
    done <<< "$file_output"

    # Collect each host's MD5 of the file.
    md5_output=$(vs_cluster_cmd.sh m "$file")

    # Majority MD5 value across the cluster.
    most_common_md5=$(echo "$md5_output" | grep -oE "[a-f0-9]{32}" | sort | uniq -c | sort -nr | head -1 | awk '{print $2}')

    # Pair each host header line with the md5 line(s) that follow it.
    # NOTE(review): assumes "host-..." lines always precede their md5 line
    # in vs_cluster_cmd.sh output — verify against that tool's format.
    while read -r line; do
        if echo "$line" | grep -q "host-"; then
            host=$(echo "$line" | awk '{print $1}')
        elif echo "$line" | grep -q "$most_common_md5"; then
            continue
        elif echo "$line" | grep -qE "[a-f0-9]{32}"; then
            md5=$(echo "$line" | awk '{print $1}')
            different_md5_output+="$host $md5"$'\n'
        fi
    done <<< "$md5_output"

    if [[ -z $different_md5_output ]]; then
        testcase_success_stats
        return 0
    else
        p_error "$file 文件不一致，不一致的主机和MD5值如下：\n"
        p_info "$different_md5_output"
        p_info "集群中最常见的MD5值为：$most_common_md5\n"
        testcase_failed_stats "$inf" "$solution"
        return 1
    fi
}

# Check the brick configuration files under /sf/cfg/vs/brick/ on every
# host: any file whose .status is not NORMAL (or that jq cannot parse)
# is reported.
function check_brick_config()
{
    inf="brick配置文件异常"
    solution="参考解决方案：https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=32425"
    # Originally limited to aSAN 2.8; the version gate is kept disabled.
    #if [ $g_asan_ver -ne 28 ]; then
    #    return 0
    #fi

    count_of_brick_json=$(run_cmd "ls /sf/cfg/vs/brick/ | wc -l")

    # Parse per-host counts.
    while IFS= read -r line; do
        if [[ "$line" == "0" ]]; then
            # Normally there should never be zero brick json files.
            return 1
        elif [[ -z "$line" ]]; then
            # Host offline / ssh timeout: skip this check.
            return 1
        fi
    done <<< "$count_of_brick_json"

    # Remote loop: print every brick json whose .status is not "NORMAL"
    # (or that jq fails to parse).
    check_brick_json_cmd="for f in \`ls /sf/cfg/vs/brick/*\`; do \
                            status=\$(/sf/vs/bin/jq -r .status \"\$f\" 2>/dev/null); \
                            if [ \$? -ne 0 ] || [ \"\$status\" != \"NORMAL\" ]; then \
                                echo \"\$f\"; \
                            fi; \
                        done"

    check_and_solution "$check_brick_json_cmd" "$inf" "$solution"
}

# Check the lvm.conf filter rules: an active filter line without a "part"
# rule makes disk mounting fail after a host reboot.
function check_lvm_config()
{
    inf="主机重启后磁盘挂载失败"
    solution="lvm.conf配置文件变化，缺少part规则。请参照案例修正：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=235176383"
    file="/etc/lvm/lvm.conf"

    # Existence probe on every host.
    # BUG FIX: the inner quotes must be escaped — unescaped they terminated
    # the outer string and the remote command was built without quoting.
    file_output=$(run_cmd "if [ -e \"$file\" ]; then echo \"exists\"; else echo \"not exists\"; fi")

    # Parse per-host results.
    while IFS= read -r line; do
        if [[ "$line" == "not exists" ]]; then
            testcase_failed_stats "$file 文件不存在" "$solution"
            return 1
        elif [[ -z "$line" ]]; then
            # Host offline / ssh timeout: skip this check.
            return 1
        fi
    done <<< "$file_output"

    # Active (non-comment) filter lines lacking a "part" rule are the known
    # bad configuration.  (grep reads the file directly; the original piped
    # it through a useless cat.)
    check_and_solution 'grep filter '$file' | grep -v \# | grep -v "part"' "$inf" "$solution"
}

# Check the disk configuration files under /sf/cfg/vs/disk/ on every host:
# report files with a missing disk_alias / life_ok flag, and hot-spare
# disks (STORAGE_BACKUP) that nevertheless carry partitions, i.e. were
# wrongly added to virtual storage.
function check_disk_config()
{
    inf="磁盘配置文件异常，磁盘替换报错"
    solution="参考解决方案：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=164280629"
    
    count_of_disk_json=$(run_cmd "ls /sf/cfg/vs/disk/ | wc -l")

    # Parse per-host counts.
    while IFS= read -r line; do
        if [[ "$line" == "0" ]]; then
            # Normally there should never be zero disk json files.
            return 1
        elif [[ -z "$line" ]]; then
            # Host offline / ssh timeout: skip this check.
            return 1
        fi
    done <<< "$count_of_disk_json"

    # Remote loop 1: disk json with unparsable content, empty disk_alias,
    # or life_ok == false.
    check_disk_json_cmd="for f in \`ls /sf/cfg/vs/disk/*\`; do \
                            disk_alias=\$(/sf/vs/bin/jq -r .disk_alias \"\$f\" 2>/dev/null); \
                            life_ok=\$(/sf/vs/bin/jq 'has(\"life_ok\")'  \"\$f\" 2>/dev/null);\
                            if [ \$? -ne 0 ] || [ -z \"\$disk_alias\" ] || [ \"\$life_ok\" == \"false\" ]; then \
                                echo \"\$f\"; \
                            fi; \
                        done"

    check_and_solution "$check_disk_json_cmd" "$inf" "$solution"
    # Remote loop 2: hot-spare disks (STORAGE_BACKUP) that still have
    # partition uuids, i.e. were added to virtual storage by mistake.
    inf="磁盘配置文件异常，热备盘已经加入虚拟存储，请确认并修正"
    solution="参考解决方案：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=520997942"
    check_disk_json_cmd="for f in \`ls /sf/cfg/vs/disk/*\`; do \
                            storage_type=\$(/sf/vs/bin/jq -r .storage_type \"\$f\" 2>/dev/null); \
                            part_uuid=\$(/sf/vs/bin/jq  -r .part_array[].part_uuid  \"\$f\" 2>/dev/null);\
                            if [ \$? -ne 0 ] || [[ -n \"\$part_uuid\"  &&  \"\$storage_type\" == \"STORAGE_BACKUP\" ]]; then \
                                echo \"\$f\"; \
                            fi; \
                        done"

    check_and_solution "$check_disk_json_cmd" "$inf" "$solution"
}

# Warn about VG names that appear in "gluster v i" but not in the system VGs.
function check_gluster_vgs() {
    local gluster_info=""
    local has_missing_vgs=0

    # FIX: run "gluster v i" once and cache the output. The original piped it
    # straight into awk (so $? reflected awk, never gluster failure) and then
    # re-ran gluster inside the loop for every missing VG.
    local gluster_output
    gluster_output=$(/sf/vs/glusterfs/sbin/gluster v i)
    if [ $? -ne 0 ]; then
        p_error "获取 Gluster 卷组名称失败，请检查 gluster 命令是否正确执行。"
        return 1
    fi

    # VG names recorded in gluster volume info
    local gluster_vgs
    gluster_vgs=$(echo "$gluster_output" | awk '/VG:/ {print $3}')

    # VG names actually present on the hosts (UUID-style names only)
    local system_vgs
    system_vgs=$(/sf/vs/bin/vs_cluster_cmd.sh e "/sf/vs/sbin/vgs -o vg_name --noheadings 2>/dev/null" | \
        grep -E '[A-Za-z0-9]{6}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{6}' | \
        awk 'NF' | sed 's/^[ \t]*//')
    if [ $? -ne 0 ]; then
        p_error "获取系统卷组名称失败，请检查 vs_cluster_cmd.sh 命令是否正确执行。"
        return 1
    fi

    # Every gluster VG must exist in the system VG list
    local vg
    for vg in $gluster_vgs; do
        if ! echo "$system_vgs" | grep -qw "$vg"; then
            has_missing_vgs=1
            # Use the cached output instead of re-running gluster per VG
            gluster_info+=$(echo "$gluster_output" | grep "VG: $vg")$'\n'
        fi
    done

    # Some VG exists in gluster but not on any host: likely an offline disk
    if [ $has_missing_vgs -eq 1 ]; then
        testcase_failed_stats "卷组名称在 gluster v i 中存在但在系统卷组（vgs）中不存在" "检查对应磁盘是否离线\n$gluster_info"
        return 1
    fi
}

# Verify /sf/cfg/vs/volume.json exists and is valid json with a volume_id key.
function check_volume_config_json()
{
    local problem="/sf/cfg/vs/volume.json 文件不存在 或 不是json格式"
    local advice="检查 /sf/cfg/vs/volume.json 是否正确，或是否系统盘异常"
    local probe_cmd="test -f /sf/cfg/vs/volume.json && $g_jq_path -e 'has(\"volume_id\")' /sf/cfg/vs/volume.json > /dev/null 2>&1;if [ \$? -ne 0 ];then echo '/sf/cfg/vs/volume.json异常';fi"

    check_and_solution "$probe_cmd" "$problem" "$advice"
}

# Verify the /etc/init.d/glusterd link file is present.
function check_glusterd_file_exists()
{
    local inf="/etc/init.d/glusterd 文件不存在"
    local solution="检查 /etc/init.d/glusterd 链接是否异常"
    # Renamed from the copy-pasted "check_volume_json_cmd" for clarity
    local check_glusterd_cmd="test -e /etc/init.d/glusterd;if [ \$? -ne 0 ];then echo '/etc/init.d/glusterd文件异常';fi"

    check_and_solution "$check_glusterd_cmd" "$inf" "$solution"
}

# 4. Check cluster (gluster volume) configuration for anomalies.
function check_storage_config()
{
    # The /etc/init.d/glusterd link must exist
    check_glusterd_file_exists

    output=$(/sf/vs/glusterfs/sbin/gluster v i)
    if [ $? -ne 0 ]; then
        p_error "检查glusterd服务是否启动"
        return 0
    fi
    log_info "检查存储卷配置是否存在异常： $output"

    # Self-heal options must be on (fix with: gluster v set `gluster v list` xxxxx xxx)
    echo "$output" | grep "cluster.entry-self-heal: off" -w 
    [ $? -eq 0 ] && p_error "cluster.entry-self-heal must on" 
    echo "$output" | grep "cluster.data-self-heal: off" -w 
    [ $? -eq 0 ] && p_error "cluster.data-self-heal must on" 
    echo "$output" | grep "cluster.metadata-self-heal: off" -w 
    [ $? -eq 0 ] && p_error "cluster.metadata-self-heal must on" 
    echo "$output" | grep "features.lock-heal: off" -w 
    [ $? -eq 0 ] && p_error "features.lock-heal must on" 
    vs_is_two_hosts
    if [ $? -eq 0 ]; then
        # Two-host cluster: quorum-type must be none
        echo "$output" | grep "quorum-type: none" -w 
        [ $? -ne 0 ] && p_error "quorum-type must none" 
    else 
        # Three or more hosts, or an arbiter box present: quorum-type must be auto
        echo "$output" | grep "quorum-type: none" -w 
        [ $? -eq 0 ] && p_error "quorum-type must auto"
    fi

    # Debug iolog must not be left enabled
    echo "$output" | grep "iolog: on" -w 
    [ $? -eq 0 ] && p_error "all debug iolog must off" 
    
    # Check the read/write cache service
    check_cache
    
    # ca.key must be identical across the cluster
    check_ca_file_consistency "/sf/cfg/vs/cert/ca.key"

    # ca.crt must be identical across the cluster
    check_ca_file_consistency "/sf/cfg/vs/cert/ca.crt"

    inf="亚健康检测结果配置异常，影响亚健康信息的更新"
    solution="参考案例解决：https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=30489&extra="
    # Sub-health result file, when present, must parse as valid json
    check_health_cmd="if [ -e \"/sf/cfg/vs/subhealth/disk_info.json\" ]; then \
                         /sf/vs/bin/jq -e . \"/sf/cfg/vs/subhealth/disk_info.json\" > /dev/null 2>&1;\
                         if [ \$? -ne 0 ]; then\
                            echo \"亚健康检测结果配置文件损坏\";\
                         fi;\
                      fi"
    check_and_solution "$check_health_cmd" "$inf" "$solution"

    # Brick config files
    check_brick_config

    # Host lvm.conf filter rules
    check_lvm_config
    
    # Disk config files
    check_disk_config

    # /sf/cfg/vs/lvmmgr/lvm_mgr_vg.json must match `gluster v i | grep hostname`
    check_lvm_mgr_vg

    # volume.json must exist and be valid json
    check_volume_config_json

    # Arbiter-host cluster flag consistency
    arbiter_cluster_flag_check

    # Performance checks handled elsewhere
    return 0   
}

# Check whether any CPU core is downclocked: a core is flagged when its
# Bzy_MHz falls below 70% of its TSC_MHz as reported by get_turbostat.sh.
# Not supported on EDS or on aarch64 HCI.
function check_cpu_downclocking()
{
    # EDS, or HCI on ARM: turbostat-based check is unavailable
    if vs_is_eds || is_aarch64_platform; then
        p_info "EDS 或 HCI arm 架构暂不支持 CPU 降频检查"
        return 1
    fi

    p_trace "检查 CPU 是否存在降频:"
    
    # Command string handed to check_and_solution; parses turbostat columns
    # (CPU, Bzy_MHz, TSC_MHz) via BASH_REMATCH and compares against the
    # 0.7 * TSC threshold using bc.
    # NOTE(review): the string calls p_error — it runs wherever
    # check_and_solution executes it (presumably a remote/child shell where
    # p_error may be undefined); confirm against check_and_solution.
    command='if [[ ! -x "/sf/bin/get_turbostat.sh" ]]; then
        p_error "错误: /sf/bin/get_turbostat.sh 不存在或不可执行。";
        exit 1;
    fi;
    output=$(/sf/bin/get_turbostat.sh 2>/dev/null);

    echo "$output" | while IFS= read -r line; do
        if [[ $line =~ ^[[:space:]]*([0-9]+)[[:space:]]+([0-9]+)[[:space:]]+([0-9]+\.[0-9]+)[[:space:]]+([0-9]+)[[:space:]]+([0-9]+) ]]; then
            cpu=${BASH_REMATCH[1]};
            bzy_mhz=${BASH_REMATCH[4]};
            tsc_mhz=${BASH_REMATCH[5]};
            threshold=$(echo "$tsc_mhz * 0.7" | /usr/bin/bc);

            if (( $(echo "$bzy_mhz < $threshold" | /usr/bin/bc -l) )); then
                echo "警告: CPU $cpu 降频! Bzy_MHz=$bzy_mhz, TSC_MHz=$tsc_mhz";
            fi;
        fi;
    done;'
    info="存在 CPU 降频"
    solution="请联系研发解决"
    # Non-containerized releases (<= 3.5) run directly; newer ones go through
    # the container port 22346
    if [ "$g_asan_ver" -le 35 ]; then
        check_and_solution "$command" "$info" "$solution"
    else
        check_and_solution_with_port "$command" "$info" "$solution" "22346"
    fi
}

# Check for memory downclocking by running "perf bench mem all" and flagging
# timing lines (containing "sec" but not "GB/sec", i.e. unusually slow runs).
function check_mem_downclocking()
{
    p_trace "检查内存是否降频:"
    info="存在内存降频"
    solution="请联系研发解决"

    if vs_is_eds; then
        # EDS: benchmark every host over ssh, executed outside the container
        result=""
        output=$(show_host_info)
        # First column of show_host_info is assumed to be the host address
        # — TODO confirm against show_host_info's output format
        hosts=($(echo "$output" | awk '{print $1}'))
        for host in "${hosts[@]}"; do
            command=$(cat <<EOF
/usr/bin/ssh root@${host} /usr/bin/perf bench mem all | grep -vE "GB/sec" | grep -E "sec"
EOF
)       
            ret=$(chroot_outside "$command")
            exit_status=$?
            if [ "$exit_status" -ne 0 ]; then
                p_error "${host} ${ret}"
            elif [ -n "$ret" ]; then
                # Non-empty output means suspicious timing lines were found
                echo "$command"
                echo "$ret"
                result+="\n${host} ${info}" 
            fi
        done
        # Report all abnormal hosts in a single failure record
        if [ -n "$result" ]; then
            testcase_failed_stats "$result" "$solution"
        fi        
    else
        command='/usr/bin/perf bench mem all | grep -vE "GB/sec" | grep -E "sec"' 
        check_and_solution "$command" "$info" "$solution"
    fi
}

# Count ZooKeeper leaders across the cluster; more than one leader is a
# data-consistency risk.
function check_zk()
{
    # Nothing to do when the zk status helper is absent
    if [ ! -f "/sf/vs/bin/vs_zkstatus.sh" ]; then
        return
    fi

    p_info "检查ZooKeeper leader数量是否超过1个"

    # Script shipped to each host over ssh (literal here-doc: no expansion)
    local remote_script
    remote_script=$(cat <<'EOF'
        # 执行zk状态检查命令，统计该aa主机上的leader数量
        leader_count=$(/sf/vs/bin/vs_zkstatus.sh | grep -c 'Mode: leader')
        echo "$leader_count"
EOF
)

    local total_leaders=0
    local zk_host ssh_result leader_num

    # Cluster host list: host-* entries from /etc/hosts, excluding mgr rows
    cluster_hosts=$(awk '/host-/ && !/mgr/ {print $2}' /etc/hosts)

    for zk_host in ${cluster_hosts}; do
        ssh_result=$(timeout -t 12 -s KILL /usr/bin/ssh root@"${zk_host}" "$remote_script" 2>&1)
        if [ $? -ne 0 ]; then
            # ssh failure: report it, but (as before) do not count toward leaders
            echo "在主机 ${zk_host} 上执行zk leader检查失败: $ssh_result"
        else
            # Keep only a purely numeric line to get the leader count safely
            leader_num=$(echo "$ssh_result" | grep -E '^[0-9]+$' | head -n1)
            if [[ -z "$leader_num" ]]; then
                leader_num=0
            fi
            total_leaders=$((total_leaders + leader_num))
        fi
    done

    if [ "$total_leaders" -gt 1 ]; then
        testcase_failed_stats "ZooKeeper leader数量超过限制" "检测到 $total_leaders 个ZooKeeper leader，超过1个，可能构成数据一致性风险。参考解决方案：https://wiki.sangfor.com/pages/viewpage.action?pageId=90839686"
    fi
}

# Resource overview: free memory, memory bandwidth, CPU usage, system load,
# plus CPU/memory downclocking checks.
function check_mem_cpu()
{
    p_trace "检查环境内存剩余:"
    check_and_advice 'free -h' '剩余内存、是否使用到swap等'

    # Memory bandwidth benchmark
    p_trace "内存测速: "
    check_and_advice 'perf bench mem all | grep -E "benchmark|sec"' '关注和其他主机对比速度，并关注是否存在单个主机卡慢的情况;该问题同样会导致CPU核跑满,导致业务卡慢或虚拟机挂起'

    p_trace "CPU 使用情况:"
    check_and_advice "mpstat -P ALL 1 $g_cnt| grep -E 'CPU|all'" 'CPU空闲率建议(idle)在30%以上'

    p_trace "检查系统负载:"
    check_and_advice 'cpunum=$(grep -c "model name" /proc/cpuinfo); load=$(uptime); echo -e "CPU: $cpunum load: $load"' '15min的系统负载, 是否超过CPU核数'

    # CPU downclocking
    check_cpu_downclocking

    # Memory downclocking
    check_mem_downclocking

}

# Detect repeated "doing loop" entries in the algorithm log, which indicate
# capacity imbalance without a rebalance task being launched (aSAN 3.0 only).
function check_algo_log_loop()
{
    inf="容量不平衡，但不发起平衡任务"
    solution="\t1.算法日志存在多条“doing loop”的日志，请进一步确认是否会导致不发起平衡任务\
              \n\t2.请使用命令：./asan_ops -s case -t databalance 进行检测确认"

    if [ $g_asan_ver -eq 30 ]; then

        # FIX: the old code ran `echo "grep ..." | wc -l`, which counted the
        # echoed command TEXT (always 1 line), so the threshold could never
        # trigger. Actually count the matching log lines with grep -c.
        # Checking a single host is enough: per the TD, every host reloads
        # the topology once a minute, so excessive entries show up anywhere.
        log_count=$(grep -c 'doing loop' /sf/log/today/vs/scripts/vs_rt_algo.py.log 2>/dev/null)
        # Missing log file yields empty output; treat as zero matches
        [ -z "$log_count" ] && log_count=0
        if [ "$log_count" -ge 10 ]; then
            testcase_failed_stats "$inf" "$solution" 
        else 
            testcase_success_stats
        fi
    fi
}

# Check for leftover "rebuild" flags in the algorithm topology (aSAN >= 3.0).
# A stale flag blocks data rebalancing (ignorable if the disk is offline).
function check_rebuild_flag() {
  if [ $g_asan_ver -ge 30 ]; then
    local cmd="/sf/vs/bin/da_rpc_client dump topo|grep rebuild|grep true"
    local inf="算法残留rebuild标记导致无法平衡(磁盘已离线可忽略)"
    local solution="参考案例删除rebuild标记：（https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=165527349）"
    local master_host rc
    # Query every volume-master host for a rebuild=true entry in the topology
    for master_host in $(/sf/vs/bin/vsmgr volume hosts|grep master|awk -F ' ' '{print $1}'); do
        log_info "run host: $master_host cmd: $cmd"
        timeout -t ${g_time_limit} -s KILL /usr/bin/ssh root@${master_host} "$cmd" 2>/dev/null
        rc=$?
        if [ $rc -eq 0 ]; then
            # grep matched: a rebuild flag is still set
            testcase_failed_stats "$inf" "$solution"
            return 1
        fi
        if [ $rc -eq 137 ]; then
            # 137 = killed by the timeout wrapper
            p_timeout "所在主机 host: ${master_host}"
            testcase_timeout_stats "$cmd"
            return 2
        fi
    done
    testcase_success_stats
    return 0
  fi

}

# Check one-click-inspection config presence and (>= 3.5) brick container state.
function check_env_conf() {
    local solution="检测到一键检测配置丢失，请查看集群中配置是否正常并同步到丢失主机"
    # HCI only: EDS does not ship the one-click check config
    if ! vs_is_eds; then
        check_and_solution 'test -f /sf/vs/lib/python-srv/check_env/config/env_check_config.json.default;if [ $? -ne 0 ]; then echo -e "一键检查配置文件丢失"; fi' '一键检查失效' "$solution"
    fi

    # Containerized releases: the fsd/vs-dp brick container must be Up
    if [ $g_asan_ver -ge 35 ]; then
        solution="检测到brick容器未启动，请确认是否有残留容器未删除"
        check_and_solution '/usr/local/sf/container/bin/nerdctl -n k8s.io ps -a|grep fsd|grep vs-dp|grep -v Up' '升级失败' "$solution"
    fi

}

# Check whether the cluster is in upgrade maintenance mode.
function check_upgrade_protect() {
    local inf="存在升级维护模式"
    local solution="检测到当前处于升级维护模式，请确认当前场景"
    if [ $g_asan_ver -ge 35 ]; then
        # Newer releases record the mode in upgrade_status.json; no file
        # means no stats are recorded (same as before)
        [ -f /cfs/upgrade_status.json ] || return 0
        local upgrade_mode
        upgrade_mode=$(/sf/vs/bin/jq .upgrade_mode /cfs/upgrade_status.json)
        if [[ "$upgrade_mode" == "true" ]]; then
            testcase_failed_stats "$inf" "$solution"
            return 1
        fi
        testcase_success_stats
        return 0
    fi

    # Legacy releases use bare flag files
    check_and_solution 'test -f /cfs/protect_mode;if [ $? -eq 0 ]; then echo -e "存在升级维护模式"; fi' '运维操作' "$solution"
    check_and_solution 'test -f /cfs/protect_mode_phoenix;if [ $? -eq 0 ]; then echo -e "存在升级维护模式"; fi' '运维操作' "$solution"
}

# Check for leftover legacy cold-upgrade marker directories.
function check_cold_upgrade_flag() {
    local solution="检测到存在老版本冷升级标记文件，请确认是否还处于老版本升级场景"
    local flag_dir
    for flag_dir in /sf/cfg/vs/upgrade /cfs/vs/upgrade30; do
        check_and_solution "ls $flag_dir -lh 2>/dev/null | grep -vE 'total'" '运维操作' "$solution"
    done
}

# Check for leftover legacy hot-upgrade marker files.
function check_hot_upgrade_flag() {
    local solution="检测到存在老版本热升级标记文件，请确认是否还处于老版本热升级场景"
    local marker
    for marker in /sf/cfg/vs/inpalce_hot_upgrade_flag \
                  /boot/firmware/utmp/unpack/update-v2/asan/upgrade/comp_upgrade_flag; do
        check_and_solution 'test -f '"$marker"';if [ $? -eq 0 ]; then echo -e "存在老版本热升级标记文件"; fi' '运维操作' "$solution"
    done
}

# During a VS2.8 -> VS3.x upgrade: once the data-conversion success flag
# exists, the tiering daemon must not be running — that combination risks
# data corruption.
function check_righttree_and_tierd() {
    # Only the 2.8 release is affected
    [ $g_asan_ver -eq 28 ] || return 0

    local solution="检测到存在数据转换标记, 正在进行VS2.x升级到VS3.x，且分层服务依旧存在，危险场景。
                        \n请参照KB处理：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=349425529"
    check_and_solution 'test -f /boot/firmware/conf/vs/upgrade30/*/suc/suc_dconvert_righttree.flag && pidof tierd && exit 1' '运维操作' "$solution"
}

# Check for upgrade markers of every kind (directory, flag files, maintenance
# mode, legacy cold/hot markers, 2.8 conversion state).
function check_upgrade_flag() {
    p_trace "\n检测升级标记: "

    local solution="检查到集群存在升级目录, 请确认是否升级中"
    local upgrade_dir_cmd='DIR="/boot/firmware/utmp/unpack"; [ -d "$DIR" ] && [ "$(ls -1 "$DIR" | wc -l)" -gt 0 ] && echo "升级目录存在且升级目录中存在文件"'
    check_and_solution "$upgrade_dir_cmd" "运维操作" "$solution"

    solution="检查到集群存在升级标记, 请确认是否升级中"
    check_and_solution 'test -f /cfs/vs/upgrade/maintain/control_mgr_maintain.flag;if [ $? -eq 0 ]; then echo -e "存在升级标记"; fi' '运维操作' "$solution"

    check_upgrade_protect

    check_cold_upgrade_flag

    check_hot_upgrade_flag

    check_righttree_and_tierd
}

# Check disk_group_id sanity in disk config files (aSAN > 3.0 only).
# A disk that is neither a hot spare (STORAGE_BACKUP) nor a fresh disk
# (STORAGE_NONE) must carry a valid, non-zero disk_group_id.
function check_disk_group_id() {
    if [ $g_asan_ver -gt 30 ];then
      p_trace "\n检测磁盘配置文件中磁盘组是否异常: "
      # Remote snippet (runs on every host): extract storage_type and
      # disk_group_id from each disk json with awk, then validate the pairing
      res=$(vs_cluster_cmd.sh e '
      for file in "/sf/cfg/vs/disk/"*
      do
        if [ -f "$file" ]; then
          read storage_type group  < <(awk -F ":" '\''/storage_type/ {gsub(",","", $(gsub(/[[:space:]]*/, "", $2))); stype=$2 } /disk_group_id/{gsub(",","", $(gsub(/[[:space:]]*/, "", $2))); gid=$2 } END { print stype, gid }'\'' "$file")
          if expr "$group" + 1 >/dev/null 2>&1; then
            if [ "$group" -eq 0 ] && [ "$storage_type" != "\"STORAGE_BACKUP\"" ] && [ "$storage_type" != "\"STORAGE_NONE\"" ]; then
              echo "磁盘组id为0，但是磁盘类型不是新加盘或者热备盘:err_disk_group_id:$file"
            fi
          else
            if [ "$storage_type" != "\"STORAGE_BACKUP\"" ] && [ "$storage_type" != "\"STORAGE_NONE\"" ]; then
              echo "没有磁盘组id，且磁盘类型不是新加盘或者热备盘:err_none_disk_group_id:$file"
            fi
          fi
        fi
      done
      '|grep -v  'echo' |grep -E 'err_disk_group_id|err_none_disk_group_id')
      if [ "x$res" != "x" ]; then
            p_error "$res"
            p_error "磁盘配置文件：磁盘组参数异常，可能导致卷运维故障。";
      fi
    fi
}

# check_file_too_large: look for "File too large" brick errors (files hitting
# the size limit break data synchronization). aSAN > 3.0 only.
function check_file_too_large() {
    if [ $g_asan_ver -gt 30 ];then
      p_trace "\n检测文件大小是否会导致数据同步失败："
      too_large_res=$(vs_cluster_cmd.sh e 'zgrep "File too large" /sf/log/today/vs/log/glusterfs/bricks/glusterfsd_sf-data-vs-local-*|head -n 3'| tee /dev/tty)
      # FIX: quote the variable. Unquoted, echo collapsed every line into
      # one, so " E " and "FLUSH" appearing on *different* lines could
      # combine into a false positive; quoting keeps per-line matching.
      res=$(echo "$too_large_res"|grep " E "|grep "FLUSH")
      if [ "x$res" != "x" ]; then
          p_error "文件太大可能导致数据同步失败，参考KB解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=365570222";
      fi
    fi
}

# check_offine_disk: report VS storage disks that are currently offline,
# distinguishing permanent removals from temporary ejections.
function check_offine_disk() {
    p_trace "永久拔盘列表："
    local permanent_removals
    permanent_removals=$(vs_cluster_cmd.sh e 'cat /sf/cfg/vs/never_recoverd_disks.json 2>/dev/null'| tee /dev/tty)

    p_trace "磁盘配置盘符列表："
    local dev_entries
    dev_entries=$(vs_cluster_cmd.sh e 'grep -w \"dev\": -rn /sf/cfg/vs/disk'| tee /dev/tty)

    # An empty "dev" value ("") in a disk config means that disk is offline
    if ! echo "$dev_entries" | grep -q "\"\""; then
        return
    fi

    # Offline disk found: check whether it matches a permanent-removal entry
    if echo "$permanent_removals" | grep -q "disk_id"; then
        p_error "当前有VS存储磁盘离线，且配置中存在有卡慢或只读永久拔盘的配置，请进一步确认离线的盘是否和永久隔离配置的盘是否对应上。";
        return
    fi
    p_error "有VS存储磁盘离线,有可能是卡慢临时拔盘，或brick进程异常4小时拔盘，可以尝试执行vs_hotplug.sh恢复。如果未能上线则为物理拔盘.";
}

# check_tierd_meta_data_err: check whether tiering metadata is corrupted by
# scanning tierd.log for load/generate errors.
function check_tierd_meta_data_err() {
    p_trace "检查分层元数据是否损坏"
    local load_errors
    load_errors=$(vs_cluster_cmd.sh e 'grep  -E "tier_generate_extent|tier_generate_inode|tfs_shard_load|tfs_extent_load|tfs_inode_load|tfs_brick_load" /sf/log/vs/tierd/tierd.log |grep " E " |head -n 3'| tee /dev/tty)
    # A "[" among the results indicates real error log records were found
    if echo "$load_errors" | grep -q "\["; then
        p_error "分层元数据损坏，检查一下内核日志是否有IO错误，是否硬件故障，可参考KB修复：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=315732971";
    fi
}

# check_eds_hdparm: check whether vs_blackbox.sh on EDS still issues the
# problematic vs_dump_hdparm command.
function check_eds_hdparm() {
  p_trace "检查eds的无效指令问题"
  fail_res=$(vs_cluster_cmd.sh e  "grep 'vs_dump_hdparm' /sf/vs/bin/vs_blackbox.sh|grep -v 'function'|grep -v '#'")
  res=$(echo "$fail_res" |grep "vs_dump_hdparm" |grep -v 'vs_blackbox')
  if [ "x$res" != "x" ]; then
      # FIX: report via p_error (colored output + log file + the
      # g_all_error_echo summary) like the sibling checks, instead of a
      # bare echo that bypassed logging and the final error roll-up.
      p_error "EDS上的黑盒vs_blackbox.sh还有频繁指令，可参考KB修复：https://support.sangfor.com.cn/cases/list?product_id=28&type=1&category_id=30351";
  fi
}

# Check the blackbox log for tier cache breakdown (cache free blocks
# exhausted), which raises storage latency and slows VMs.
function check_tierd_free_block_cnt() {
    # Show only the last 5 entries per host; each entry is 2 lines
    # (timestamp line + free_block_cnt line), hence tail -n 10
    local show_cnt=10 # 5 entries * 2 lines (time line + counter line)

    local inf="检测到黑盒日志存在分层缓存击穿 (每个主机只显示最后5条)"
    local solution="缓存击穿会导致存储时延升高，虚拟机卡慢。建议检查一下业务流量和存储缓存比"
    # awk pairs each "*** <time>" header with any following
    # free_block_rate == 0 or free_block_cnt < 10000 line and prints both
    local check_cmd="[ -f /sf/log/blackbox/${g_date}/LOG_vs_tier.log ] && grep -e '\*\*\* ' -e free_block_rate -e free_block_cnt /sf/log/blackbox/${g_date}/LOG_vs_tier.log | awk '/^\*+/ { time_line=\$0; next;} /\"free_block_rate\": 0/ {print time_line; print \$0} /\"free_block_cnt\": ([0-9]+)/ { if (\$2 + 0 < 10000) {print time_line; print \$0}}' | tail -n ${show_cnt}"

    check_and_solution "$check_cmd" "$inf" "$solution"
}

# Detect hosts stuck in an abnormal state caused by the /sf/cfg/vs/.members
# update-mechanism bug (members file inconsistent between hosts).
function check_host_online_status()
{
    inf="主机状态异常"
    solution="参考案例解决：https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=29884"
    # First pass: does any host's check_env_config_vsmember log contain the
    # "online value is (0)" signature?
    host_status_log_cmd='zgrep "is_vsmember_status_online.*online value is (0)" \
            /sf/log/'$g_day'/vs/scripts/check_env_config_vsmember.log'$g_suffix' 2>/dev/null && echo "exists" || echo "not exists"'

    host_status_log_output=$(run_cmd "$host_status_log_cmd")
    flag=1
    # Parse host_status_log_output line by line
    while IFS= read -r line; do
        if [[ "$line" == "exists" ]]; then
            flag=0
            break
        elif [[ "$line" == "not exists" ]]; then
            continue
        elif [[ -z "$line" ]]; then
            # A host timed out or is offline: skip this check entirely
            return 1
        fi
    done <<< "$host_status_log_output"

    # The .members update mechanism can leave hosts marked offline
    # (online == 0) inconsistently across the cluster; list such entries
    if [ $flag -eq 0 ]; then
        check_and_solution "/sf/vs/bin/jq '.nodelist[] | select(.online == 0)' /sf/cfg/vs/.members 2>/dev/null" "$inf" "$solution" 
    fi
}

# Check for glusterfs coredump files on HCI 6.10.0 and newer.
function check_6100_coredump()
{
    # Not applicable to EDS environments
    if vs_is_eds; then
        return 0
    fi

    # Only HCI 6100 and later keep coredumps in this location
    [ $g_hci_ver -ge 6100 ] || return 0

    local inf="检测高版本（HCI6100及以上）环境是否存在coredump"
    local solution="该环境存在coredump日志文件，联系研发开启开关，排查core。注意是否多卷，gluster v set \$(gluster v list) gf-enable-core on"
    local check_cmd="ls /sf/log/'$g_day'/vs/log/glusterfs/ -lh 2>/dev/null | grep -E 'coredump'"

    check_and_solution "$check_cmd" "$inf" "$solution"
}

# Check whether any brick is stuck in "silence" mode (process alive but not
# admitted into the volume).
function check_silence_brick()
{
    local inf="检测环境是否存在brick处于静默状态"
    local solution="\n\t1、检查对应磁盘(数据盘、缓存盘)/服务(包括glusterd/防火墙)是否离线，并检查定时拉起日志(/sf/log/today/vs/scripts/vs_brick_dog.sh.log);\
           \n\t2、检查对应主机是否故障（网络、内存、假死等）;\
            \n\t3、以上都检查通过,即进程存在但是无法接入,brick暂时处于静默，尝试使用关闭、开启方式快速接入brick;\
             \n\t4、关闭静默：gluster v set \$(gluster v list) network.silence-reconn-mode off;\
              \n\t5、打开静默：gluster v set \$(gluster v list) network.silence-reconn-mode on;"
    # Look for silence reconnect attempts in the glusterfs NFS log
    check_and_solution 'zgrep "silence" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' | grep "try_rpc_transport_connect" 2>/dev/null | '"$g_log_filter"'' "$inf" "$solution"
}

# Detect the "could not convert string to float" dts error seen when old
# releases are upgraded to 670 (storage-volume UI stuck at "waiting for all
# hosts to come online").
function check_wait_all_host_access() {
    inf="升级时存储卷界面进不去，卡在等待所有主机接入"
    solution="\n\t参考案例解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=183534439"

    # The symptom can appear on any upgrade from vs3.0 up to (excluding) vs3.3
    if [ $g_asan_ver -ge 30 ] && [ $g_asan_ver -lt 33 ]; then
        check_and_solution "grep 'vs_load_dts_tasks' /sf/log/today/vs/scripts/vs_load_dts_tasks.py.log 2>/dev/null | \
                            grep 'could not convert string to float' " "$inf" "$solution"
    fi
}

# 6. Scan cluster logs for known error signatures (coredumps, disconnects,
# tiering faults, shard issues, OOM, iscsi problems, keepalived, ...).
function check_error_log()
{
    log_debug "检查环境中是否存在coredump文件："
    check_and_solution "ls /sf/data/local/kdump/ /sf/data/local/dump/ -lh 2>/dev/null | grep -vE 'dump|total'" '进程重启' '1、确认是什么进程？\n2、确认coredump时间 \n3、找具体部门研发确认'
    
    check_6100_coredump

    # Bricks stuck in silence mode
    check_silence_brick
    
    log_debug "检查环境中是否存在存储客户端和存储服务端断开的日志：(可能是网络故障、磁盘故障、主机延时过高、假死等等)"
    check_and_solution 'zgrep "event = down" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' 2>/dev/null | '"$g_log_filter"'' '业务挂起/业务中断' '可能是网络故障、磁盘故障、主机延时过高、假死等等'

    # Tiering checks
    # NOTE: all $solution arguments below are now quoted — several of these
    # strings contain an ASCII space and used to word-split into extra args.
    log_debug "检查分层离散数据过多导致brick无法接入、分层无法接入：(https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=180916750)"
    solution="检查分层离散数据过多导致brick无法接入、分层无法接入：(https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=180916750)"
    check_and_solution 'zgrep -E "calc_super_block.*max = 64" /sf/log/vs/tierd/tierd.log'$g_suffix' 2>/dev/null | '"$g_log_filter"'' '业务挂起/业务中断' "$solution"

    solution="检查分层io_submit并发量过大，参考是否符合：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=189131968"
    check_and_solution 'zgrep -E "io_submit error ret" /sf/log/vs/tierd/tierd.log'$g_suffix' 2>/dev/null | '"$g_log_filter"'' '业务挂起/业务中断' "$solution"
    solution="\t1、检查是否有人执行的后台踢盘动作(参照: https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=34238);\
            \n\t2、检查缓存盘寿命是否不足（例如东芝磁盘24000H、或实际内存不足等）参照:https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=249880324"
    check_and_solution 'zgrep -E "get mem error.*node_demote_t" /sf/log/vs/tierd/tierd.log'$g_suffix' 2>/dev/null | '"$g_log_filter"'' '业务挂起/内存耗尽/服务器重启' "$solution"
    
    solution="检查是否是32GB的文件导致无法重建、迁移: (https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=34240)"
    check_and_solution 'zgrep -E "client3_3_flush_cbk.*File too large" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' 2>/dev/null | '"$g_log_filter"'' "重建/迁移无法完成" "$solution"
    # https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=34240
    solution="检查是否快照残留文件清理任务失败: (https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=324330347)"
    check_and_solution 'zgrep -E "Unknown error 222" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' 2>/dev/null | '"$g_log_filter"'' "快照残留文件清理任务无法完成" "$solution"
    inf="brick信息在分层中残留"
    solution="参考案例解决：https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=28712"
    # vs_tier_cli.py -c dump -a brickinfo | grep "none" -B2 lists bricks whose
    # state is none; any of those NOT present in `gluster v i` is a leftover
    shard_brick_cmd="gluster_output=\$(/sf/vs/glusterfs/sbin/gluster v i); \
            brick_ids=\"\$(vs_tier_cli.py -c dump -a brickinfo 2>/dev/null| grep 'none' -B2 | grep 'bi_brickid' | awk -F': ' '{print \$2}' | tr -d '\",')\"; \
            if [ -z \"\$brick_ids\" ]; then \
                exit 1; \
            fi; \
            echo \"\$brick_ids\" | while read brick_id; do \
            echo \${gluster_output} | grep -q \"\$brick_id\" || echo \"brick:\$brick_id 在分层中残留\"; done"

    check_and_solution "$shard_brick_cmd" "$inf" "$solution"

    # .vs/create directory quiesce without the bad attribute (TD2023010900468)
    solution="检查 .vs/create 目录是否存在指控并打不上bad属性，具体排查步骤可参考: https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=244510728 "
    check_and_solution 'zgrep -E "__sync_create_tmpfile.*failed" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' /sf/log/'$g_day'/vs/log/glusterfs/api/gluster_tgtd.log'$g_suffix' 2>/dev/null | '"$g_log_filter"'' '无法新建文件/业务挂起/业务中断' "$solution"

    # shard
    solution="分片可能双点故障, 或如果是残留左子树可参考: https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=271865435 "
    check_and_solution 'zgrep -E "shard_resolve_inode_task.*do lookup failed" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' /sf/log/'$g_day'/vs/log/glusterfs/api/gluster_tgtd.log'$g_suffix' 2>/dev/null | '"$g_log_filter"'' '业务挂起/业务中断' "$solution"
    solution="复合卷场景，分片路由分布到了不同的子卷上：可参考https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=199218923"
    check_and_solution 'zgrep -E "shard_resolve_inode_task.*volume not equal" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' /sf/log/'$g_day'/vs/log/glusterfs/api/gluster_tgtd.log'$g_suffix' 2>/dev/null | '"$g_log_filter"'' '业务挂起/业务中断' "$solution"

    solution="是否发生了超时60s的io-retry，需要排查具体的retry原因（可能是双点、硬件故障等）"
    check_and_solution 'zgrep need_retry -n /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' /sf/log/'$g_day'/vs/log/glusterfs/api/gluster_tgtd.log'$g_suffix' 2>/dev/null | '"$g_log_filter"'' '业务挂起/延时飙升' "$solution"

    solution="磁盘介质错误(坏道/其他硬件错误)，如果是物理磁盘，则建议更换"
    check_and_solution 'zgrep -E "blk_update_request: critical medium error|blk_update_request: I/O error" /sf/log/'$g_day'/kernel.log'$g_suffix' | grep -vE "nbd" | '"$g_log_filter"' ' '同步不完成/业务挂起' "$solution"

    solution="磁盘超时、卡慢，如果是物理磁盘，排除该盘检测副本一致性，且拔盘更换"
    check_and_solution 'zgrep -E "wait_for_completion_io_timeout.*dev_name" /sf/log/'$g_day'/kernel.log'$g_suffix' -m2' '业务挂起/业务中断' "$solution"
    
    # zk lock held by another host (check disabled; kept for reference)
#    solution="\t1./sf/vs/etc/init.d/dts-server stop停掉占了锁的主机的dts\
#              \n\t2.super_zkcli.py rm /volumes/\$volume_id/dts/dtsc.lock 删除锁\
#              \n\t3.dts会被拉起，1分钟一次，如果删除zk锁失败构造dts拉不起来再删一次，成功后把dts恢复\
#              \n\t4.参考案例：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=180104683"
#    local inf="1.克隆任务失败\n\t2.读取或更新快照信息失败\n\t3.dts的功能应该都会失效"
#    local current_date=$(date "+%Y-%m-%d")
#    check_zk_cmd="if zgrep 'error on watch lock fail' /sf/log/vs/vs_dts/vs_dts_master.log'$g_suffix' 2>/dev/null| grep -q '$current_date'; then\
#                      zgrep 'zk_lock' /sf/log/vs/vs_dts/vs_dts_master.log'$g_suffix' 2>/dev/null| \
#                      grep 'lock hold by'| grep '$current_date'| tail -n '$g_lines';\
#                  fi"
#    check_and_solution "$check_zk_cmd" "$inf" "$solution"

    # "could not convert string to float" dts error after upgrading to 670
    check_wait_all_host_access

    # Repeated "doing loop" algorithm logs blocking rebalance
    check_algo_log_loop

    # Hosts stuck offline due to the .members update-mechanism bug
    check_host_online_status

    # Missing hostinfo: replicas consistent, no sync task, yet the disk
    # management UI shows endless syncing
    inf="副本一致，没有进行中的数据同步任务，但是磁盘管理界面磁盘一直在同步"
    solution="没生成hostinfo信息，参考案例解决：https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=31159"
    current_date=$(date "+%Y-%m-%d")
    check_and_solution "zgrep 'find volume host into fail' /sf/log/vs/vs_dts/dtsc.log'$g_suffix' 2>/dev/null | \
            grep '$current_date' | tail -n '$g_lines'" "$inf" "$solution" 

    # efs zmap checksum failures (brick cannot start)
    inf="对应brick起不来,报的是zmap checksum失败"
    solution="参考案例解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=253295808"
    check_and_solution "zgrep -E 'zmap.*checksum failed' /sf/log/'$g_day'/vs/log/glusterfs/bricks/* 2>/dev/null| ${g_log_filter}" "$inf" "$solution"

    # Deadlock watchdog
    inf="环境存在死锁"
    solution="请联系研发排查解决，死锁的堆栈信息存在/sf/log/today/vs/scripts/task_watch_dog_pstack.log;参考案例：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=329726402"
    check_lock_cmd='zgrep "do_recover" /sf/log/'$g_day'/vs/scripts/task_watch_dog.log'$g_suffix' 2>/dev/null|grep "with"| '"$g_log_filter"'' 
    check_and_solution "$check_lock_cmd" "$inf" "$solution"

    # Kernel OOM
    inf="内核OOM"
    solution="请联系研发排查解决，内核日志信息存在/sf/log/${g_day}/kernel.log"

    check_oom_cmd="zgrep -E -i 'out of memory | kill' '/sf/log/${g_day}/kernel.log${g_suffix}' 2>/dev/null | ${g_log_filter}"
    check_and_solution "$check_oom_cmd" "$inf" "$solution"


    # IO errors causing connection refusal
    inf="IO错误拒绝连接"
    # FIX: restored the missing "/" after ${g_day} in the hint path below
    solution="请联系研发排查解决，IO错误拒绝连接信息存在/sf/log/${g_day}/vs/iscsi/tgtd.log或/sf/log/vs/iscsi/tgtd.log"
    log_path1="/sf/log/${g_day}/vs/iscsi/tgtd.log"
    log_path2="/sf/log/vs/iscsi/tgtd.log"

    if [ -e "$log_path1" ]; then
        check_refuse_link_cmd="zgrep 'target in error state' '${log_path1}${g_suffix}' 2>/dev/null | ${g_log_filter}"
        check_and_solution "$check_refuse_link_cmd" "$inf" "$solution"
    elif [ -e "$log_path2" ]; then
        check_refuse_link_cmd="zgrep 'target in error state' '${log_path2}${g_suffix}' 2>/dev/null | ${g_log_filter}"
        check_and_solution "$check_refuse_link_cmd" "$inf" "$solution"
    fi

    # CAW (compare-and-write) queue limit
    inf="CAW上限"
    solution="请联系研发排查解决，CAW上限信息存在/sf/log/${g_day}/vs/iscsi/tgtd.log或/sf/log/vs/iscsi/tgtd.log"

    # FIX: run the check inside each branch. Previously check_and_solution
    # was invoked unconditionally, with check_caw_cmd unset/empty when
    # neither tgtd log existed.
    if [ -e "$log_path1" ]; then
        check_caw_cmd="zgrep 'bs queue too long' '${log_path1}${g_suffix}' 2>/dev/null | ${g_log_filter}"
        check_and_solution "$check_caw_cmd" "$inf" "$solution"
    elif [ -e "$log_path2" ]; then
        check_caw_cmd="zgrep 'bs queue too long' '${log_path2}${g_suffix}' 2>/dev/null | ${g_log_filter}"
        check_and_solution "$check_caw_cmd" "$inf" "$solution"
    fi

    # Tiering failed to load a brick (allocation failure)
    inf="分层加载brick失败"
    solution="参考案例解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=339444085"
    check_and_solution "zgrep 'Cannot allocate memory' /sf/log/vs/tierd/tierd.log 2>/dev/null| ${g_log_filter}" "$inf" "$solution"

    # File creation failures in some directories on some hosts (VM backup fails)
    inf="部分主机某些目录上报创建文件失败，可能导致备份虚拟机失败"
    solution="参考案例解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=193911256"
    check_and_solution "zgrep -E 'nfs3svc_create_cbk.*No such file or directory' /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log* 2>/dev/null| ${g_log_filter}" "$inf" "$solution"

    inf="仲裁brick没有把本身的brick目录创建出来"
    solution="参考案例解决：http://docs.sangfor.org/pages/viewpage.action?pageId=383994313"
    check_and_solution 'zgrep -E "Directory.*doesn'\''t exist, exiting." /sf/log/'$g_day'/vs/log/glusterfs/bricks/glusterfsd_sf-data-vs-local-*.log'$g_suffix' 2>/dev/null|'"$g_log_filter"' ' "$inf" "$solution"

    # Snapshot merge read failures (VM hangs or LUN disconnects)
    inf="快照合并读失败，可能导致虚拟机挂起或 LUN 连接断开"
    solution="参考案例解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=306997300"
    check_and_solution "zgrep -E 'snapshot_merge_read_cbk.*snapshot merge read failed|subio is not complete' /sf/log/'$g_day'/vs/log/glusterfs/api/gluster_tgtd.log* 2>/dev/null| ${g_log_filter}" "$inf" "$solution"
    check_and_solution "zgrep -E 'snapshot_merge_read_cbk.*snapshot merge read failed|subio is not complete' /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log* 2>/dev/null| ${g_log_filter}" "$inf" "$solution"
    check_and_solution 'zgrep -i "Read failed. No bytes read and not at EOF" /sf/log/"'$g_day'"/sfvt_qemu_*.log'$g_suffix' 2>/dev/null| '"$g_log_filter"'' "$inf" "$solution"

    # VM group created and VMs moved while a host was offline: after the host
    # comes back, the directory is inaccessible / LUN cannot connect
    inf="主机离线后创建虚拟机分组并移动虚拟机到新分组，主机上线后目录无法访问或 LUN 无法连接"
    solution="参考案例解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=361623007"
    check_and_solution "zgrep -E 'afr_sh_expunge_before_rename_entrylk_cbk.*afr_sh_rename failed' /sf/log/'$g_day'/vs/log/glusterfs/api/gluster_tgtd.log* 2>/dev/null| ${g_log_filter}" "$inf" "$solution"
    check_and_solution "zgrep -E 'afr_sh_expunge_before_rename_entrylk_cbk.*afr_sh_rename failed' /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log* 2>/dev/null| ${g_log_filter}" "$inf" "$solution"

    # iscsi disconnects that fail to reconnect; fixed in 670 (td: 2024112100297)
    if [ $g_hci_ver -lt 670 ]; then
        inf="iscsi连接断开，重连失败，无法找到target"
        solution="检查是否存在iscsi target重命名后通知主机失败。研发参考td: https://td.atrust.sangfor.com/#/defect/details/2024112100297"
        log_path1="/sf/log/${g_day}/vs/iscsi/tgtd.log"
        log_path2="/sf/log/vs/iscsi/tgtd.log"
        local check_threshold=10

        if [ -e "$log_path1" ]; then
            check_log_cmd="zgrep 'Cannot find target_name' '${log_path1}' 2>/dev/null | \
            awk 'NR > $check_threshold {print; exit} END {if (NR > $check_threshold) print \"存在iscsi重连, 找不到target超过$check_threshold次\"}'"
            check_and_solution "$check_log_cmd" "$inf" "$solution"
        elif [ -e "$log_path2" ]; then
            check_log_cmd="zgrep 'Cannot find target_name' '${log_path2}' 2>/dev/null | \
            awk 'NR > $check_threshold {print; exit} END {if (NR > $check_threshold) print \"存在iscsi重连, 找不到target超过$check_threshold次\"}'"
            check_and_solution "$check_log_cmd" "$inf" "$solution"
        fi
    fi

    # Keepalived_vrrp configuration errors within the last 1-2 hours
    inf="keepalive服务异常"
    solution="参考案例解决：https://support.sangfor.com.cn/cases/list?product_id=33&type=1&category_id=32088，
    https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=505447997"
    current_hour=$(date "+%Y-%m-%d %H:")
    one_hour_ago=$(date -d "-1 hour" "+%Y-%m-%d %H:")
    check_and_solution "zgrep -E 'no match, ignoring|go out and fix your conf' /sf/log/vs/Keepalived_vrrp.log 2>/dev/null | \
            grep -E '$current_hour|$one_hour_ago' | tail -n '$g_lines'" "$inf" "$solution"

    return 0
}

#检测raid卡预警BUG
function check_raid_waring()
{
    # Check for known RAID-controller hardware advisories (pre-warning BUGs).
    # Not applicable on EDS environments.
    if vs_is_eds; then
        return 0 
    fi

    inf="检测到戴尔的H750 raid卡,会导致raid卡卡住，严重的会导致主机重启或者主机的硬盘离线"
    solution="参考预警案例YJ20220117002解决"
    # Dell H750 advisory applies to HCI 630 and 670 only.
    # ${g_hci_ver:-0}: fall back to 0 so the integer test cannot raise a
    # "unary operator expected" error when /sf/version could not be parsed
    # (the check is still skipped, as before).
    if [ "${g_hci_ver:-0}" -eq 630 ] || [ "${g_hci_ver:-0}" -eq 670 ]; then
        check_dell_raid_cmd='lspci | grep -i raid | grep -i "H750" | grep -i "Dell"'
        check_and_solution "$check_dell_raid_cmd" "$inf" "$solution"
    fi

    inf="检测到SAS3008 raid卡，该raid卡与坏道扫描功能兼容性不好，导致磁盘性能卡慢"
    solution="\t1.关闭坏道扫描\n\t2.更换RAID卡\n\t3.参考预警案例YJ20210423002"
    # SAS3008 advisory applies to HCI 600 through 630.
    if [ "${g_hci_ver:-0}" -ge 600 ] && [ "${g_hci_ver:-0}" -le 630 ]; then
        check_sas_raid_cmd='lspci | grep -i raid | grep -i "SAS3008"'
        check_and_solution "$check_sas_raid_cmd" "$inf" "$solution"
    fi
}

#检测690版本的预警
function check_690_waring()
{
    # HCI 690 advisory: i40e driver vs Intel X722 10G NIC firmware
    # incompatibility (bond create/delete/reconfigure can knock ports offline).
    # Not applicable on EDS environments.
    if vs_is_eds; then
        return 0 
    fi

    inf="HCI690版本i40e驱动与intel x722的10G网卡固件存在兼容性问题，当对x722网卡的聚合口进行创建、删除、重新配置等操作，触发固件缺陷，导致网口掉线。"
    solution="参考预警案例YJ20230904003解决"
    # ${g_hci_ver:-0} keeps the integer test from erroring out when the
    # version string is empty; the check is still skipped in that case.
    if [ "${g_hci_ver:-0}" -ge 690 ];then
        check_x722_cmd="lspci | grep 'Ethernet' | grep -E \"Intel.*X722.*10G\""
        check_and_solution "$check_x722_cmd" "$inf" "$solution"
    fi
}

#检测680版本的预警
function check_680_waring()
{
    # HCI 680/681 advisory: vn-node-agent-api fd leak breaks any operation
    # going through that service (image/patch/license uploads, ...).
    # Not applicable on EDS environments.
    if vs_is_eds; then
        return 0 
    fi

    inf="vn-node-agent-api服务fd泄漏导致调用该服务接口的相关操作都会失败，例如：上传镜像，补丁包，授权文件等。"
    solution="\t1.参考案例解决https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=34516\
              \n\t2.参考预警案例YJ20230627002"
    # ${g_hci_ver:-0} prevents a test(1) syntax error when the version
    # string is empty; the check stays skipped in that case.
    if [ "${g_hci_ver:-0}" -eq 680 ] || [ "${g_hci_ver:-0}" -eq 681 ]; then
        
        check_vn_cmd="if zgrep -q 'vn-node-agent.*error' /sf/log/today/sfvt_vtpdaemon.log'$g_suffix' 2>/dev/null; then\
                         zgrep 'Too many open files' /sf/log/today/vn/vn-node-agent-api.log'$g_suffix' 2>/dev/null| tail -n '$g_lines';\
                      fi"
        check_and_solution "$check_vn_cmd" "$inf" "$solution"
    fi
}

# 7、检测是否存在预警BUG 
function check_early_waring()
{
    # 7. Known-advisory (pre-warning BUG) sweep. Each sub-check decides for
    # itself whether the advisory applies to this platform/version.
    # (ipmi log refresh kept for reference:)
    #vs_cluster_cmd.sh e 'modprobe ipmi_msghandler; modprobe ipmi_devintf; modprobe ipmi_si; ipmitool -I open sel elist'

    # ARM builds mounting /sf/data/vs/local with "sync" -> advisory YJ20220304001.
    log_info "检查ARM版本挂载的sync预警..."
    check_and_solution 'sync=$(cat /proc/mounts | grep "/sf/data/vs/local" | grep sync | wc -l); if [ $sync -gt 0 ]; then echo "[ERR] ARM 预警:YJ20220304001 "; fi' '业务卡慢' '按照预警YJ20220304001处理'

    # RAID controller advisories (Dell H750, SAS3008).
    check_raid_waring

    # HCI 690 advisories (X722 NIC firmware).
    check_690_waring

    # HCI 680/681 advisories (vn-node-agent-api fd leak).
    check_680_waring
    return 0
}

# 8、检测防火墙问题，防火墙问题通常重启防火墙都能解决，重启命令： /sf/etc/init.d/fw.sh restart （680要进iptables容器执行）
function check_iptables()
{
    # 8. Firewall checks. Firewall problems are usually fixed by restarting it:
    #    /sf/etc/init.d/fw.sh restart (on 680 this must run inside the iptables
    #    container).
    log_info "检查挂载点是否堵包"
    solution="检测防火墙问题，如果吞吐过大则不关注，研发确认后，防火墙问题通常重启防火墙都能解决，重启命令:/sf/etc/init.d/fw.sh restart （680要进iptables容器执行）"
    #check_and_solution 'iptables -nvL|grep -E "38465" | grep -vE "0     0" | grep "M VS"; [ $? -eq 0 ] && echo "38465端口可能存在堵包"' '存储离现/挂载点卡死' "$solution"
    if ! vs_is_eds; then
        # The loopback ACCEPT rule must exist, otherwise local traffic drops.
        check_and_solution 'res=$(iptables -nvL|grep lo); if [ -z "$res" ]; then echo -e "iptable本地lo规则不存在"; fi' '存储离现/挂载点卡死' "$solution"
    fi
    #vs_cluster_cmd.sh e 'iptables -nvL|grep -E "38465|pkts" |grep 38465 -B1'

    # Build an alternation of all cluster/localhost IPs from /etc/hosts
    # (drop the header line and trailing '|').
    ips=$(cat /etc/hosts | grep -E "host-|localhost" | grep -E "localhost.local|ip6" -v | awk '{print $1}' |sed 1d| tr -s '\n' '|')
    ips=${ips%?}

    solution="检查防火墙是否被封堵，尝试清理掉防火墙规则，重启防火墙无效"
    # BUGFIX: the third argument was single-quoted ('$solution'), which passed
    # the literal text "$solution" to check_and_solution instead of its value.
    check_and_solution 'iptables -nvL | grep DROP | egrep "'$ips'"' '网络不通' "$solution"

    # Make sure the firewall is running; skipped on EDS and on vs3.5.0+.
    if ! vs_is_eds; then
        if [ $g_asan_ver -lt 35 ]; then
            inf="防火墙关闭"
            solution="检查防火墙为关闭状态，可尝试使用命令：/sf/etc/init.d/fw.sh start打开防火墙"
            check_and_solution 'iptables -nvL | grep "Chain INPUT (policy ACCEPT"' "$inf" "$solution"
        fi
    fi
}

# 检测/sf/cfg/vs/lvmmgr/lvm_mgr_vg.json 和本地的 gluster v i | grep `hostname` 结果是否一致
function check_lvm_mgr_vg()
{
    # Verify that the VG list in /sf/cfg/vs/lvmmgr/lvm_mgr_vg.json matches the
    # local bricks reported by `gluster v i | grep $(hostname)` and flag any
    # difference. Returns 1 on errors or invalid EDS layouts, 0 otherwise.
    # NOTE(review): because of the pipe into sort, $? on the next test is
    # sort's status, not jq's — a jq failure would be seen only as empty
    # output. Confirm whether that is intended.
    json_output="$(/sf/vs/bin/jq -r ".vg_list[]" /sf/cfg/vs/lvmmgr/lvm_mgr_vg.json | sort)"
    if [ $? -ne 0 ]; then
        p_error "获取 /sf/cfg/vs/lvmmgr/lvm_mgr_vg.json 输出失败，请检查 jq 命令是否正确执行。"
        return 1
    fi

    # EDS below three hosts is a non-standard deployment and is reported as a
    # failure; EDS must start from three hosts (replica_count >= 3).
    if vs_is_eds; then
        replica_count=$(get_replica_count)
        # get_replica_count signals failure with -1.
        if [ "$replica_count" -eq -1 ]; then
            p_error "获取 replica_count 失败"
            return 1
        fi

        # 0 means no volume has been created yet.
        if [ "$replica_count" -eq 0 ]; then
            p_error "检查是否建卷"
            return 1
        fi

        # 1 or 2 replicas on EDS -> non-standard deployment.
        if [ "$replica_count" -gt 0 ] && [ "$replica_count" -lt 3 ]; then
            p_error "当前环境是 EDS，且 replica_count 小于 3"
            testcase_failed_stats "eds小于3主机建卷，非标准部署，请检查，eds必须三主机起步" "请联系研发排查原因"
            return 1
        fi
    fi
    vs_is_two_hosts
    is_two_hosts=$?
    # Only two-host clusters on aSAN 2.8 or older need the comparison below.
    if { [ $is_two_hosts -ne 0 ] || [ $g_asan_ver -gt 28 ]; }; then
        return 0
    fi

    # Two-host / legacy version but lvm_mgr_vg.json is empty: abnormal.
    if [ -z "$json_output" ]; then
        testcase_failed_stats "两主机或老版本（vs2.8及以下）/sf/cfg/vs/lvmmgr/lvm_mgr_vg.json为空，异常" "请联系研发排查原因"
        return 1
    fi
    # Non-empty JSON (including clusters upgraded from legacy versions whose
    # lvm_mgr_vg.json still has content): run the full symmetric diff. The
    # whole comparison is one shell string handed to check_and_solution; any
    # output it produces marks the check as failed.
    if [ -n "$json_output" ]; then
        command='gluster_output="$(/sf/vs/glusterfs/sbin/gluster v i | grep "$(hostname)" | grep -oP "(?<=/local/)[^/]+" | grep -v meta | sort)";
        if [ $? -ne 0 ]; then
            echo "获取 gluster 输出失败，请检查 gluster 命令是否正确执行。";
            exit 1;
        fi;
        json_output="$(/sf/vs/bin/jq -r ".vg_list[]" /sf/cfg/vs/lvmmgr/lvm_mgr_vg.json | sort)";
        if [ $? -ne 0 ]; then
            echo "获取 /sf/cfg/vs/lvmmgr/lvm_mgr_vg.json 输出失败，请检查 jq 命令是否正确执行。";
            exit 1;
        fi;
        only_in_gluster=0;
        only_in_lvmmgr=0;
        gluster_unique_items=$(mktemp);
        json_unique_items=$(mktemp);

        while read -r item; do
            if ! echo "$json_output" | grep -q -F "$item"; then
                only_in_gluster=1;
                echo "$item" >> "$gluster_unique_items";
            fi;
        done <<< "$gluster_output";

        while read -r item; do
            if ! echo "$gluster_output" | grep -q -F "$item"; then
                only_in_lvmmgr=1;
                echo "$item" >> "$json_unique_items";
            fi;
        done <<< "$json_output";

        if [ $only_in_gluster -ne 0 ]; then
            echo "gluster v i | grep `hostname` 中有但 /sf/cfg/vs/lvmmgr/lvm_mgr_vg.json 中没有的内容：";
            sort "$gluster_unique_items" | uniq;
        fi;

        if [ $only_in_lvmmgr -ne 0 ]; then
            echo "/sf/cfg/vs/lvmmgr/lvm_mgr_vg.json 中有但 gluster v i | grep `hostname` 中没有的内容：";
            sort "$json_unique_items" | uniq;
        fi;

        if [ -f "$gluster_unique_items" ]; then
            rm "$gluster_unique_items";
        fi;

        if [ -f "$json_unique_items" ]; then
            rm "$json_unique_items";
        fi;'
        
        info="/sf/cfg/vs/lvmmgr/lvm_mgr_vg.json 和本地的 gluster v i | grep $(hostname) 结果不一致"
        solution="请联系研发排查原因"
        check_and_solution "$command" "$info" "$solution"        
    fi
}

# 检查磁盘
function check_disk()
{
    # Disk health sweep: RAID resets, bad sectors, known-bad SSD firmware,
    # faulted disk configs, leftover brick fault entries, tier reload loops,
    # and the 2+1 arbiter NVMe system-disk advisory.

    # RAID controller resets seen in the kernel log.
    inf="业务卡主/磁盘卡慢/主机卡死"
    solution="尝试隔离该主机，并协调平台和硬件部排查根因"
    check_and_solution 'zgrep -E "smart.*resetting scsi" /sf/log/"'$g_day'"/kernel.log'$g_suffix' 2>/dev/null| '"$g_log_filter"'' "$inf" "$solution"

    # Bad sectors reported by SMART on each configured disk (non-zero values
    # in the pending/CRC/reallocated/defect-list counters).
    inf="同步无法完成/业务挂起/缓存盘无法加载/磁盘卡慢等"
    solution="尽快协调硬件厂家是否可做替换操作, 多个主机存在坏道，请谨慎运维操作"
    check_and_solution 'for disk in `ls /sf/cfg/vs/disk`; do dev=$(/sf/vs/bin/vs_json_rw.js "/sf/cfg/vs/disk/$disk" dev);result=$(smartctl -A $dev | egrep "Current_Pending_Sector|UDMA_CRC_Error_Count|Reallocated_Sector_Ct|Elements in grown defect list:" | grep -vE " 0$"); if [ ! -z "$result" ]; then echo -e "disk: $dev 存在坏道\n$result"; fi done' "$inf" "$solution"

    # I/O errors during self-heal checksum in the gluster nfs/tgtd logs.
    check_and_solution 'zgrep -E "sh_diff_checksum_cbk.*Input/output" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' /sf/log/'$g_day'/vs/log/glusterfs/api/gluster_tgtd.log'$g_suffix' 2>/dev/null| '"$g_log_filter"'' "$inf" "$solution" 

    # Cache-disk read errors (ret = -5 / EIO) in the tier daemon log.
    solution="缓存盘坏块，可能导致缓存盘重复加载影响业务，尽快协调硬件厂家是否可做替换操作, 多个主机存在坏道，请谨慎运维操作(研发临时处理参考：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=245381944)"
    check_and_solution 'zgrep -E "tier_read_cbk.*ret = -5" /sf/log/vs/tierd/tierd.log'$g_suffix' 2>/dev/null| '"$g_log_filter"'' "$inf" "$solution" 
    # Known-bad SSD firmware revisions (matched against smartctl -i output).
    solution="固件异常，检索兼容性列表，并向硬件部门进一步确认!"
    check_and_solution 'for disk in `ls /sf/cfg/vs/disk`; do dev=$(/sf/vs/bin/vs_json_rw.js "/sf/cfg/vs/disk/$disk" dev);result=$(smartctl -i $dev | egrep "XCV10132|XCV10140|JXTE004Q|JXTE104Q|HXT7904Q"); if [ ! -z "$result" ]; then echo -e "disk: $dev SSD固件故障\n$result"; fi done' "$inf" "$solution"
    # Disk configs marked UNNORMAL/fault (excluding NONE/heal states).
    inf="磁盘故障/业务故障/存储服务离现/业务异常"
    solution="检查对应磁盘是否存在硬件故障" 
    check_and_solution 'grep -E "UNNORMAL|fault" /sf/cfg/vs/disk -nr | grep -vE "NONE|heal"' "$inf" "$solution"
    #'grep -E "fault" /sf/cfg/vs/disk -nr | grep -vE "NONE|heal"'
    # WD HA340 series firmware bug (specific firmware id V1GNW9F5).
    solution="西数HA340系列硬盘FW存在误处理bug会导致硬盘降级无法读写，并向硬件部门进一步确认!相关处理的KB:https://support.sangfor.com.cn/cases/list?product_id=33&type=1&category_id=31021"
    check_and_solution 'for disk in `ls /sf/cfg/vs/disk`; do dev=$(/sf/vs/bin/vs_json_rw.js "/sf/cfg/vs/disk/$disk" dev);result=$(smartctl -i $dev | egrep "V1GNW9F5"); if [ ! -z "$result" ]; then echo -e "disk: $dev 西数HA340系列硬盘FW故障\n$result"; fi done' "$inf" "$solution"

    # Validate the per-disk configuration files.
    check_disk_config

    # Leftover brick fault information from failed disk replacements.
    inf="替换硬盘失败，报错清理旧盘信息失败"
    solution="\t1.排查旧磁盘配置是否损坏或不存在\n\t2.排查/volumes/volume_id/dts/fault_cfg是否残留不存在的磁盘配置\n\t3.参考案例https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=179104540"
    check_and_solution 'grep "No such file or directory" /sf/log/'$g_day'/vs/scripts/volume_mgr_run_task.log 2>/dev/null' "$inf" "$solution"
    check_and_solution 'grep "load fault table fail" /sf/log/vs/vs_dts/vs_dts_master.log' "$inf" "$solution"

    # Recent tier reload activity — may indicate the tier is reloading in a
    # loop; dump brick status alongside the matching log lines.
    inf="可能分层在一直加载"
    solution="检查一下分层的状态"
    if [ -e '/sf/log/vs/tierd/tierd.log' ]; then
        check_and_solution 'output=$(zgrep -E -i "\:tier_init_dev\]|tierfs_load_cdev" /sf/log/vs/tierd/tierd.log'$g_suffix' 2>/dev/null | '"$g_log_filter"') && [ -n "$output" ] && echo "$output" && echo "分层中brick状态：" && /sf/vs/bin/vs_tier_cli.py -c dump -a brickinfo | grep status' "$inf" "$solution"
    fi

    # VGs present in `gluster v i` but missing from the system's VGs.
    check_gluster_vgs

    # 2+1 arbiter box with an NVMe system disk (pre-6.11.0): creating a 2+1
    # volume can fail with "arbiter disk not configured".
    inf="检测到2+1仲裁主机的系统盘是nvme磁盘，可能导致创建2+1卷报错，未配置仲裁盘"
    solution="协调研发按照KB处理，kb：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=374678839"
    if [ -f "${g_hci_box_flag}" ] && [ $g_hci_ver -lt 6110 ]; then
        df /sf/log/ | grep nvme -q
        if [ $? -eq 0 ]; then
            testcase_failed_stats "$inf" "$solution"
        fi
    fi
}

#检测一些进程的内存占用
function check_process_memory_usage()
{
    # Flag storage processes whose measured memory exceeds their reserved
    # budget (field 4 vs field 7 of `vsmgr mem info -f csv -m`).
    # Versions below 3.5 lack the ".real_mem (MB)" column, so skip them.
    if [ $g_asan_ver -lt 35 ]; then
        return 0
    fi

    inf="进程占用超出预留值"
    solution="请联系研发排查原因"
    # rpyc_daemon spawns children and may briefly exceed its reservation while
    # tasks run; the control-plane team confirmed it does not leak, so it is
    # deliberately excluded from the filter below.
    pro="grep -E 'glusterfs|tgtd|tierd|vs_dts'"
    # Checked processes: glusterfsd, glusterfs, glusterd, tgtd, tierd, vs_dts.
    # On all-flash volumes mem_info still lists tierd with 0 used / 0 reserved,
    # which does not affect the comparison.
    check_process_cmd="mem_info=\$(/sf/vs/bin/vsmgr mem info -f csv -m |${pro});echo \"\$mem_info\" | awk -F ' *\\| *' '{ if (\$4 > \$7) { print \$0;}}'"
    check_and_solution "$check_process_cmd" "$inf" "$solution"
}

function check_proc_mem()
{
    # 9. Memory checks for storage processes: reservation overruns plus
    # absolute RSS limits (2GB) on brick and client (fuse/nfs) processes.
    check_process_memory_usage

    inf="存储延时高/内存占用异常(超过2GB)"
    solution="联系研发继续分析内存"
    # Any brick process with RSS above 2GB: report it and dump free memory.
    check_and_solution 'for mem in $(ps aux | grep brick | grep -vE "super|vi|grep" | tr -s " " | cut -d " " -f 6); do if [ -n "$mem" ] && [ "$mem" -gt 2097152 ]; then echo "进程brick内存占用异常: $mem KB" && free -h; fi; done' "$inf" "$solution"

    # Storage client process RSS over 2GB.
    inf="存储延时高/内存占用异常(超过2GB)"
    solution="联系研发继续分析内存，尝试热重启nfs进程"

    if vs_is_eds; then
        # EDS uses a fuse client instead of nfs; check it and return — the
        # nfs check below does not apply to EDS.
        inf="存储延时高/内存占用异常(超过2GB)"
        solution="请联系研发排查原因"
        check_and_solution 'fuse_mem=$(ps aux | grep fuse|grep -vE "super|vi|grep" | tr -s '"' '"' | cut -d '"' '"' -f 6); if [ -n "$fuse_mem" ] && [ $fuse_mem -gt 2097152 ]; then echo "进程fuse内存占用异常: $fuse_mem KB"; fi' "$inf" "$solution"
        return 0
    fi

    # BUGFIX: guard against an empty $nfs_mem (no nfs.pid process), which made
    # `[ -gt ]` emit "unary operator expected" — same guard as the fuse check.
    check_and_solution 'nfs_mem=$(ps aux | grep nfs.pid|grep -vE "super|vi|grep" | tr -s '"' '"' | cut -d '"' '"' -f 6); if [ -n "$nfs_mem" ] && [ $nfs_mem -gt 2097152 ]; then echo "存储客户端进程nfs内存占用异常: $nfs_mem"; fi' "$inf" "$solution"
}

# 检查mongo配置文件
function check_mongo_config() 
{
    # Validate the mongo replica-set configuration: each member's votes and
    # priority must be consistent, and the total number of voting members
    # must be exactly 3 or 5. Reports via testcase_*_stats; returns 1 on
    # any violation, 0 otherwise.
    # NOTE(review): the embedded snippet uses "print res" (Python 2 syntax) —
    # it targets the bundled interpreter at /sf/vs/bin/python.
    local config=$(/sf/vs/bin/python -c "import json; from bson import ObjectId; from bson import json_util; \
        from volume_mgr.mongo import common as mongo_common; mongo_client = mongo_common.MongoClientOperate(); \
        conf = mongo_client.get_conf()['config']; res = json.dumps(conf, default=json_util.default); print res")
  
    local members_count=$(echo -n "$config" | /sf/vs/bin/jq '.members | length')
    local voting=0
    local inf="1.MongoDB的配置错误导致MongoDB的主控切换不了\n\t2.进入单主机维护模式失败\n\t3.访问mongo会受影响"
    local solution="\t1.mongo集群只能有3个或者5个节点进行投票，votes为非0，对应priority必须为非0(hidden节点除外)；其余节点 votes为0，对应priority必须为0;\
                    \n\t2.请输入指令：'/sf/vs/mongodb/bin/mongo_ssl mongo --host vs-rep/127.0.0.1''config=rs.conf()'确认各个节点的votes和priority\
                    \n\t3.详细步骤参照：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=227529131\
                    \n\t4.更改完配置信息，输入exit即可退出"
    local vote
    local priority
    local hidden
    for ((i=0; i<members_count; i++)); do
        vote=$(echo -n "$config"  | /sf/vs/bin/jq ".members[$i].votes")
        priority=$(echo -n "$config"  | /sf/vs/bin/jq ".members[$i].priority")
        hidden=$(echo -n "$config"  | /sf/vs/bin/jq ".members[$i].hidden")

        if [[ "$vote" -eq 0 ]]; then
            # A non-voting member must have priority 0 (jq may render the
            # value as either "0" or "0.0").
            if [[ "$priority" != "0" && "$priority" != "0.0" ]]; then
                testcase_failed_stats "$inf" "$solution"
                return 1
            fi
        else
            voting=$((voting + 1))
            # A voting, non-hidden member must not have priority 0.
            if [[ "$hidden" == "false" && ("$priority" == "0" || "$priority" == "0.0") ]]; then
                testcase_failed_stats "$inf" "$solution"
                return 1
            fi
        fi
    done

    # The voting-member count must be exactly 3 or 5.
    if [[ "$voting" -ne 3 && "$voting" -ne 5 ]]; then
        testcase_failed_stats "$inf" "$solution" 
        return 1
    fi

    testcase_success_stats
    return 0
}

# 检查mongo_hidden状态
function check_mongo_hidden()
{
    # Detect a stale mongod hidden member ("We are too stale" in its log).
    # Hidden members only exist on two-host deployments, so anything else
    # is skipped outright.
    local exit_status
    local inf="1.一键检测，或者升级前检测，提示虚拟存储服务异常\n"
    local solution="\t1.只允许在2主机，有hidden服务的环境上操作执行前再做一次环境确认：\
                    \n\t    gluster v i | grep 'Replica factor' 显示：Replica factor: (2 + 0)\
                    \n\t2.删除hidden服务所在节点的hidden数据即可恢复\
                    \n\t    mkdir -p /sf/log/backup/mongodb_hidden_bak; mv /sf/data/platform_database/vs/mongodb_hidden/* /sf/log/backup/mongodb_hidden_bak\
                    \n\t    tailf /sf/log/today/vs/mongod_hidden.* | grep 'We are too stale'\
                    \n\t3.可参照详细解决方案：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=273345197"

    vs_is_two_hosts
    exit_status=$?
    if [ $exit_status -ne 0 ]; then
        # Not a two-host cluster: the mongo_hidden process does not exist here.
        return 0
    fi

    # The hidden log only exists on the node actually hosting the hidden
    # service, so suppress the "No such file" noise from the other node.
    check_and_solution 'zgrep "We are too stale" /sf/log/today/vs/mongod_hidden.*' "$inf" "$solution" 2>&1 | \
    grep -v "zgrep: /sf/log/today/vs/mongod_hidden.*: No such file or directory"
}
# 检查mongo_nodes_status状态
function mongo_nodes_status()
{
    # For every host in g_hosts, fetch the mongo replica-set status over ssh
    # and count PRIMARY members: more than one PRIMARY (split brain) or no
    # status at all is reported as a failure.
    local inf="1.虚拟存储卷状态/磁盘状态获取失败；各种运维操作异常告警\n"
    local solution="\t1.不允许集群内出现两个PRIMARY状态，请联系研发处理\
                    \n\t2.请输入指令：'/sf/vs/mongodb/bin/mongo_ssl mongo --host vs-rep/127.0.0.1''config=rs.status()'确认各个节点的状态\
                    \n\t3.可参照详细解决方案：https://wiki.sangfor.com/pages/viewpage.action?pageId=90839413"
    local config
    local members_count
    for h in ${g_hosts}; do
        # NOTE(review): `timeout -t 12` is the busybox flag form; the bundled
        # Python snippet is Python 2 ("print res"). Kill after 12s so a hung
        # host cannot stall the whole sweep.
        config=$(timeout -t 12 -s KILL /usr/bin/ssh root@"${h}" '/sf/vs/bin/python -c "import json; from bson import ObjectId; from bson import json_util; \
            from volume_mgr.mongo import common as mongo_common; mongo_client = mongo_common.MongoClientOperate(); \
            conf = mongo_client.get_replset_status(); res = json.dumps(conf, default=json_util.default); print res"')
        if [[ $? -ne 0 || -z "$config" ]]; then
            p_info "获取主机:$h 的mongodb服务状态失败，可能执行超时，重新检测一下或手动检查"
            continue
        fi
        # Count members whose stateStr contains PRIMARY.
        # NOTE(review): `wc -l` always prints a number, so the -z branch
        # below can likely never fire — confirm the intended "no primary"
        # detection (a zero count falls into the else branch and passes).
        members_count=$(echo -n "$config" | /sf/vs/bin/jq .members[].stateStr|grep PRIMARY|wc -l)
        if [[ -z "$members_count" ]]; then
            solution="\t1.环境中mongodb状态异常,mongodb集群不存在主控，请联系研发处理
                      \n\t2.请输入指令：'/sf/vs/mongodb/bin/mongo_ssl mongo --host vs-rep/127.0.0.1''config=rs.status()'确认各个节点的状态"
            testcase_failed_stats "$inf" "$solution"
        else
            # Two or more PRIMARY members is always an error.
            if [[ "$members_count" -ge 2 ]]; then
                testcase_failed_stats "$inf" "$solution"
            fi
        fi
    done
}
# 检查mongo
function check_mongo()
{
    # 10. MongoDB health checks; they only exist on aSAN 3.3+.
    # `! [ ... -ge 33 ]` also bails out when the version string is empty,
    # matching the original skip-on-error behavior.
    if ! [ "$g_asan_ver" -ge 33 ]; then
        return 0
    fi

    # Only hosts that run a mongo replica member are checked.
    current_host=$(hostname)
    if ! vsmgr volume hosts | grep -q "^$current_host"; then
        return 1  # mongo is not deployed on this host
    fi

    check_mongo_config      # replica-set votes/priority layout
    check_mongo_hidden      # stale hidden member
    mongo_nodes_status      # PRIMARY count across the cluster
}

# 探测rpyc服务端口
function detect_rpyc_port()
{
    # Probe the local rpyc service port (18812); only meaningful on aSAN 2.8+.
    # ${g_asan_ver:-0}: an empty/unparsable version now skips the probe
    # cleanly instead of raising a test(1) syntax error.
    if [ "${g_asan_ver:-0}" -lt 28 ]; then
        return 0
    fi

    inf="检测rpyc端口异常，可能会导致存储页面无法访问等问题"
    solution="检测rpyc服务是否正常，如果服务正常可以尝试重启下防火墙。"
    check_process_cmd='nc -z 127.0.0.1 18812 -w 1;if [ $? -ne 0 ]; then echo "rpyc服务端口异常";fi'
    check_and_solution "$check_process_cmd" "$inf" "$solution"
}

# 探测rpc_bind服务端口
function detect_rpcbind_port()
{
    # Probe the local rpcbind port (111); only meaningful on aSAN 2.8+.
    # ${g_asan_ver:-0}: an empty/unparsable version now skips the probe
    # cleanly instead of raising a test(1) syntax error.
    if [ "${g_asan_ver:-0}" -lt 28 ]; then
        return 0
    fi

    inf="检测rpcbind端口异常，可能会导致热升级失败等问题"
    solution="检测rpcbind服务是否正常，如果服务正常可以尝试重启下防火墙。"
    check_process_cmd='nc -z 127.0.0.1 111 -w 1;if [ $? -ne 0 ]; then echo "rpcbind服务端口异常";fi'
    check_and_solution "$check_process_cmd" "$inf" "$solution"
}

# 探测端口
function detect_port()
{
    # 11. Service port probes: rpyc (18812) then rpcbind (111).
    p_trace "\n检测rpyc服务端口状态: "
    detect_rpyc_port

    p_trace "\n检测rpcbind服务端口状态: "
    detect_rpcbind_port
}

# 检查主机假死，只能作为参考，主机系统hang住，外部不好探测
function check_system_hang_case()
{
    # Best-effort probe for hung hosts / storage-network loss: flood-ping
    # every peer listed in /etc/hosts, then run a trivial command over ssh.
    # A hung host typically cannot be reached by ssh, so a failed echo is
    # reported as "possibly hung". Reference only — a truly hung kernel is
    # hard to detect from outside.
    p_info "开始检查是否有主机假死或者存储网离线，可以作为参考"

    local_host=$(hostname)
    cluster_hosts=$(cat /etc/hosts |grep host- |grep -v mgr |grep -v ${local_host} |awk '{print $2}')
    for h in ${cluster_hosts}; do
        # 100-packet flood ping with a 2s deadline; empty output means the
        # peer never answered (offline or heavy packet loss).
        stdout=$(ping -c 100 -w 2 -f ${h})
        if [ -z "${stdout}" ]; then
            p_error "host ${h} 存储网离线或丢包，请检查存储网"
            continue
        fi

        # ssh-based liveness probe, hard-killed after g_time_limit seconds.
        val=$(timeout -t ${g_time_limit} -s KILL /usr/bin/ssh root@${h} echo system_hang)
        [ "${val}" == "system_hang" ] || \
            p_error "host ${h} ssh执行失败,主机可能假死,请检查该主机是否正常"
    done
}

# 检测vs_dog的配置文件，/sf/cfg/vs/vs_dog/brick_info.json和/sf/cfg/vs/vs_dog/service_list.json中的brick信息是否一致
function check_vs_dog_config() {
    # Cross-check the two vs_dog configuration files on every cluster host:
    # the set of "brick-N" keys in brick_info.json must equal the set of
    # "brick-N" service names in service_list.json. Mismatches are reported
    # via testcase_failed_stats. Skipped on EDS and on HCI < 6.8.0.
    if vs_is_eds; then
        return 0
    fi
    # Only HCI 6.8.0 and later carry these files.
    if [ $g_hci_ver -lt 680 ]; then
        return 0
    fi

    p_trace "检测/sf/cfg/vs/vs_dog/配置中的brick信息是否一致"

    brick_conf_path="/sf/cfg/vs/vs_dog/brick_info.json"
    srv_list_conf_path="/sf/cfg/vs/vs_dog/service_list.json"

    # Only names of the exact form "brick-<digits>" are compared.
    brick_compile="^brick-([0-9]+)$"
    cluster_hosts=$(cat /etc/hosts |grep host- |grep -v mgr | cut -f2)
    for h in ${cluster_hosts}
    do
        bricks=()
        srv_bricks=()
        # Fetch brick_info.json over ssh (hard-killed after g_time_limit s).
        brick_conf=$(timeout -t ${g_time_limit} -s KILL /usr/bin/ssh root@${h} "cat ${brick_conf_path}")
        if [[ $? -ne 0 ]]; then
            p_error "读取主机 ${h} 的 ${brick_conf_path} 失败，请检查文件是否存在或主机是否在线"
            continue
        fi

        # Validate JSON first, then collect top-level keys matching brick-N.
        if echo "$brick_conf" | jq -e "." > /dev/null 2>&1; then
            while IFS= read -r brick_name; do
                if [[ "$brick_name" =~ $brick_compile ]]; then
                    bricks+=("$brick_name")
                fi
            done < <(echo "$brick_conf" | jq -r "keys[]")
        else
            p_error "解析主机 ${h} 的 ${brick_conf_path} 失败，请检查文件内容是否为合法的 JSON 格式"
            continue
        fi

        # Fetch service_list.json the same way.
        srv_list_conf=$(timeout -t ${g_time_limit} -s KILL /usr/bin/ssh root@${h} "cat ${srv_list_conf_path}")
        if [[ $? -ne 0 ]]; then
            p_error "读取主机 ${h} 的${srv_list_conf_path} 失败，请检查文件是否存在或主机是否在线"
            continue
        fi

        # Collect .service_name values matching brick-N.
        if echo "$srv_list_conf" | jq -e "." > /dev/null 2>&1; then
            while IFS= read -r service_name; do
                if [[ "$service_name" =~ $brick_compile ]]; then
                    srv_bricks+=("$service_name")
                fi
            done < <(echo "$srv_list_conf" | jq -r ".[].service_name")
        else
            p_error "解析主机 ${h} 的 ${srv_list_conf_path} 失败，请检查文件内容是否为合法的 JSON 格式"
            continue
        fi

        # Order-insensitive set comparison of the two brick lists.
        diff_result=$(diff <(printf "%s\n" "${srv_bricks[@]}" | sort) <(printf "%s\n" "${bricks[@]}" | sort))
        if [[ -n "$diff_result" ]]; then
            p_info "主机 ${h} ${brick_conf_path} 的brick信息: ${bricks[*]}"
            p_info "主机 ${h} ${srv_list_conf_path} 的brick信息: ${srv_bricks[*]}"
            testcase_failed_stats "主机: ${h} 检测vs_dog的配置文件失败，${brick_conf_path}和${srv_list_conf_path}中的brick信息不一致" "请按照KB处理：https://support.sangfor.com.cn/cases/list?product_id=33&type=1&category_id=30870."
        fi
    done
}

# 检查主机重启后，99vs_start_services服务是否有异常
function check_99vs_start_services_status()
{
    # Detect the post-reboot hang where 99vs_start_services is still running
    # but glusterd was killed by state-mond, leaving gluster services down.
    # Reference: https://td.atrust.sangfor.com/#/defect/details/2024112200239
    # Fixed in 680+, so newer versions are skipped.
    if [ $g_hci_ver -ge 680 ]; then
        return
    fi

    # state-mond only exists on two-host clusters; skip everything else.
    vs_is_two_hosts
    is_two_hosts=$?
    if [ $is_two_hosts -ne 0 ]; then
        return
    fi

    # Resolve "today" into a concrete YYYYMMDD for the blackbox log path.
    local today=$g_day
    if [ "$g_day" == "today" ]; then
        today=$(date +%Y%m%d)
    fi
    local yesterday=$(date -d "$today -1 day" +%Y%m%d)
    local ret=0
    inf="glusterfs服务启动异常，glusterd被state-mond killed，检查99vs_start_services是否卡住"
    solution="上升研发，手动拉一下glusterd，参考td：https://td.atrust.sangfor.com/#/defect/details/2024112200239"

    # Condition 1: the 99vs_start_services script is (still) running.
    local check_vs_start_cmd="ps aux | grep '99vs_start_services' | grep -v 'grep' > /dev/null"

    # Condition 2: neither glusterfs_nfs nor glusterd is running.
    local check_gluster_cmd="! (ps aux | grep 'glusterfs_nfs' | grep -v 'grep' | grep -v 'supervise' > /dev/null) && ! (ps aux | grep 'glusterd' | grep -v 'grep' | grep -v 'supervise' > /dev/null)"

    # Condition 3: the blackbox dmesg log (for g_day or the day before)
    # records state-mond killing glusterd with SIGKILL.
    for c_date in "$today" "$yesterday"; do
        local log_path="/sf/log/blackbox/$c_date/LOG_dmesg.txt"
        local check_state_mon_cmd="[ -f $log_path ] && grep 'hook kill(sig:9)' $log_path | grep 'state-mond' | grep glusterd | tail"
        # All three conditions combined into one command string.
        local exec_cmd="$check_vs_start_cmd && $check_gluster_cmd && $check_state_mon_cmd"
        check_and_solution "$exec_cmd" "$inf" "$solution"
        ret=$?
        # One hit is enough; stop scanning further days.
        if [ $ret -eq 1 ]; then
            break;
        fi
    done 
}

# 检查gluster状态
function check_gluster_status()
{
    # Gluster health checks. Currently limited to the post-reboot case where
    # a network problem leaves gluster stuck (glusterd killed by state-mond);
    # see td: https://td.atrust.sangfor.com/#/defect/details/2024112200239
    check_99vs_start_services_status
}

# check_glusterfsd_Z_status 检查glusterfsd进程是否为Z状态
function check_glusterfsd_Z_status()
{
    # Flag zombie (Z / defunct) glusterfsd processes.
    local inf="检查glusterfsd进程是否为Z状态"
    local solution="存在glusterfsd进程存在Z状态，参考案例解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=471473087"
    # Match glusterfsd entries in the Z state marked defunct, excluding the
    # grep/supervise noise itself.
    local zombie_check="ps axuf |grep glusterfsd| grep -vE 'grep|supervise' |grep 'Z' |grep 'defunct'"
    check_and_solution "$zombie_check" "$inf" "$solution"
}

# 检查虚拟机是否加锁失败，按照KB处理
function check_vm_lock()
{
    # Scan the day's qemu logs for image-lock failures during VM startup.
    p_info "开始检查是否是否存在加锁失败的虚拟机"

    inf="启动虚拟机过程中加镜像锁失败"
    solution="启动虚拟加锁失败,请参考kb:https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=194889998"
    # NOTE(review): 2>/dev/null is attached to the second grep, not to zgrep,
    # so "no such file" noise from zgrep is still printed — confirm whether
    # that placement is intended.
    check_and_solution 'zgrep -i "lock" /sf/log/"'$g_day'"/sfvt_qemu_*.log'$g_suffix' | grep -i failed 2>/dev/null| '"$g_log_filter"'' "$inf" "$solution"
}

# 检查VS垃圾回收站存在大量数据导致业务挂起风险，按照KB处理
function check_trash_clear()
{
    # Warn when the VS trash directory on the meta brick accumulates a large
    # number of files (2000+), which risks hanging business I/O.
    # Reference KB: pageId=349422965
    volume_name=$(/sf/vs/glusterfs/sbin/gluster v list)
    # No volumes -> nothing to check.
    if [ -z "$volume_name" ]; then
        return 1
    fi

    p_info "开始检查VS垃圾回收站存在大量数据导致业务挂起风险，可以作为参考"
    meta_bricks=($(/sf/vs/glusterfs/sbin/gluster v i| grep meta| awk '{print $2}'))
    if [ ${#meta_bricks[@]} -gt 0 ]; then
        # Entries look like "host:/path"; split into host and brick path.
        meta_brick=$(echo "${meta_bricks[0]}" | cut -d: -f2)
        host_name=$(echo "${meta_bricks[0]}" | cut -d: -f1)
        trash_file_count=$(timeout -t "$g_time_limit" -s KILL /usr/bin/ssh "$host_name" ls "$meta_brick/.vs/trash/" | wc -w)
        # ${trash_file_count:-0}: a killed/failed ssh cannot break the test.
        if [ "${trash_file_count:-0}" -gt 2000 ]; then
            p_error "VS垃圾回收站存在大量数据，达到2000+，可能导致业务挂起风险，请确认并参考kb:https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=349422965"
        fi
    fi
}

function do_check_all()
{
    # Full health sweep: every check runs in a fixed order; individual checks
    # record their own failures, so this function always walks the whole list.
    echo "start func, g_time_limit=$g_time_limit"

    get_env                     # 1. environment basics: volumes, patches, ...
    check_zk                    # zookeeper issues
    check_mem_cpu               # memory / cpu
    check_network_status        # 2. network
    check_storage_service       # 3. storage services
    check_storage_config        # 4. storage configuration
    check_storage_capacity      # 5. storage capacity
    check_error_log             # 6. error-log retrieval
    check_early_waring          # 7. known advisory BUGs
    check_iptables              # 8. firewall
    check_proc_mem              # 9. process memory anomalies
    check_mongo                 # 10. mongodb health
    detect_port                 # 11. service port probes
    check_disk                  # disks
    check_env_conf              # leftover flags / missing config
    check_rebuild_flag

    # 12. Special record check: flag any VST (tech-support) operation history
    # so follow-up engineers know the environment was touched.
    check_and_solution "ls /sf/cfg/vs/vst_audit_history 2>/dev/null" "环境存在技术支持操作记录!" "请确认是否存在其它影响或存在历史问题!"

    check_system_hang_case      # 13. host hang heuristics
    check_vm_lock               # 14. VMs that failed to take the image lock
    check_trash_clear           # 15. trash buildup on composite volumes
    check_upgrade_flag          # 16. leftover upgrade flags
    check_disk_group_id         # 17. disk-group ids in disk configs
    check_file_too_large        # 18. oversized files breaking data sync
    check_offine_disk           # 19. offline disks
    check_tierd_meta_data_err   # 20. tier metadata corruption
    check_slow_disk             # 21. slow disks
    check_vs_dog_config         # 22. vs_dog configuration
    check_arbiter_conf          # 23. arbiter configuration
    check_bad_sector            # 24. bad sectors
    check_gluster_status        # 25. gluster anomalies
    check_latency               # 26. latency
}

function ops_exec_check()
{
    # Dispatch a check run according to the requested type (g_type / -t).
    # Returns EINVAL for a missing or unknown type.
    if [ -z "$g_type" ]; then
        return $EINVAL
    fi

    case "$g_type" in
        all)
            do_check_all
            ;;
        network)
            check_network
            check_network_status
            detect_port
            ;;
        service)
            check_storage_service
            check_storage_config
            ;;
        disk)
            check_disk
            ;;
        mem)
            check_proc_mem
            vsmgr mem info -f csv
            ;;
        BUG)
            check_early_waring
            ;;
        *)
            return $EINVAL
    esac
    return 0
}
#==================================================[end check]=========================================================#


#================================================[start sample]========================================================#
function collect_storage_info()
{
    # Placeholder for periodic process sampling (currently disabled).
    # Example 2s ps sampler kept for reference:
    #sample_file="/sf/data/local/sample_ps.log"; while true; do let i=i+1;date>>"$sample_file"; ps auxf>>"$sample_file";sleep 2; echo "-------------$i--------------"; done
    return 0
}

function ops_exec_sample()
{
    # TODO: the sample subcommand is not implemented yet; succeed as a no-op.
    return 0
}

#==================================================[end sample]========================================================#


#================================================[start blackbox]========================================================#

# 函数用于转换时间格式 [年-月-日 时-分-秒]
# 函数用于转换时间格式 [年-月-日 时-分-秒]
convert_time() {
    # Normalize "[YYYY-MM-DD][,HH[:MM[:SS]]]" (space or comma between date
    # and time) into "YYYY-MM-DD HH:MM:SS"; missing time fields default to 00.
    input_time="$1"

    # Default time components.
    year_month_day=""
    hour="00"
    minute="00"
    second="00"

    # Split the date part from the time part on space or comma.
    IFS=' ,' read -r date time <<< "$input_time"

    if [ -n "$date" ]; then
        year_month_day="$date"
    fi

    if [ -n "$time" ]; then
        IFS=':' read -r hour minute second_part <<< "$time"
        
        if [[ ! -z "$minute" ]] && [[ $minute =~ ^[0-9]+$ ]]; then
            if [[ $second_part =~ ^[0-9]+$ ]]; then
                # BUGFIX: force base-10 with $((10#...)) — plain printf "%02d"
                # treats "08"/"09" as invalid octal and mangles them to 0.
                second=$(printf "%02d" "$((10#$second_part))")
            else
                second="00"
            fi
            minute=$(printf "%02d" "$((10#$minute))")
        else
            minute="00"
            second="00"
        fi
        # Zero-pad the hour as well so output is always HH:MM:SS.
        if [[ $hour =~ ^[0-9]+$ ]]; then
            hour=$(printf "%02d" "$((10#$hour))")
        fi
    fi

    # Assemble and emit the normalized timestamp.
    formatted_time="$year_month_day $hour:$minute:$second"
    
    echo "$formatted_time"
}

function parse_blackbox_iostat()
{
    # Filter the blackbox iostat log (LOG_iostat.txt) for device lines whose
    # chosen metric (rqms/iops/tps/rtps/wtps/await/rwait/wwait) exceeds a
    # threshold, within a time window starting at g_time (default: now) and
    # spanning io_time_hour hours. Parameters arrive as key=value words in
    # g_all_cmd_args; "help" or missing parameters print usage.
    local io_param=""
    local io_value=""
    local io_time_hour=5   # default: scan 5 hours forward from the start time
    if [ -z "$g_time" ]; then
        local io_start_time=$(date "+%Y-%m-%d,%H:%M")
    else
        local io_start_time=$g_time
    fi
    # Normalize to "YYYY-MM-DD HH:MM:SS" so it matches the log's time format.
    local io_convert_time=$(convert_time "$io_start_time")  # 转换成日志里的时间格式，方便匹配
    local io_g_day=$(date -d "${io_convert_time//,/ }" "+%Y%m%d") # 获取黑盒日志的目录，时间格式：2025-0x-0x
    local io_stat_log="/sf/log/blackbox/$io_g_day/LOG_iostat.txt"
    local io_end_time=""
    local print_help=0

    # Parse the key=value arguments destined for the iostat filter.
    # Exactly one metric is kept: the last matching key wins.
    for arg in $g_all_cmd_args; do
        case $arg in
            help*)
                print_help=1
                ;;
            rqms=*)
                io_param="rqms"
                io_value="${arg#*=}"
                ;;
            iops=*)
                io_param="iops"
                io_value="${arg#*=}"
                ;;
            tps=*)
                io_param="tps"  # 吞吐量，读或者写
                io_value="${arg#*=}"
                ;;
            rtps=*)
                io_param="rtps" # 读吞吐
                io_value="${arg#*=}"
                ;;
            wtps=*)
                io_param="wtps" # 写吞吐
                io_value="${arg#*=}"
                ;;
            await=*)
                io_param="await"
                io_value="${arg#*=}"
                ;;
            rwait=*)
                io_param="rwait"
                io_value="${arg#*=}"
                ;;
            wwait=*)
                io_param="wwait"
                io_value="${arg#*=}"
                ;;
            hour=*)
                io_time_hour="${arg#*=}"
                ;;
        esac
    done

    # No metric/threshold (or explicit help): print usage and stop.
    if [ -z "$io_param" ] || [ -z "$io_value" ] || [ "$print_help" -eq 1 ];then
        p_info "支持过滤参数大于某个值日志(只会过滤第二次的iostat, 确保准确性)";
        p_info "参数:";
        p_info "- rqms=xx   查找rqms 读或者写大于某个值的"
        p_info "- iops=xx   查找iops 读或者写大于某个值的"
        p_info "- tps=xxx   查找吞吐 读或者写大于某个值的，单位 kb/s"
        p_info "- rtps=xxx  查找吞吐 读大于某个值的，单位 kb/s"
        p_info "- wtps=xxx  查找吞吐 写大于某个值的，单位 kb/s"
        p_info "- await=xx  查找时延 读或者写时延大于某个值的, 单位 ms"
        p_info "- rwait=xx  查找时延 读时延大于某个值的, 单位 ms"
        p_info "- wwait=xx  查找时延 写时延大于某个值的, 单位 ms"
        p_info "- hour=5    默认是查找给定时间开始往后5个小时内的数据, 可以通过这个参数调整"
        p_info "例如: 查找某个时间内时延大于20ms的盘 ./asan_ops -s blackbox -t iostat -d 2025-06-20,00:40 await=20 hour=2"
        return 0;
    fi
    
    # End of the scan window = start time + io_time_hour hours.
    io_end_time=$(date -d "$io_convert_time $io_time_hour hours" +"%Y-%m-%d %H:%M:%S")  # 计算结束时间

    p_info "检索日志：$io_stat_log"
    p_info "检索时间：$io_convert_time - $io_end_time"

    # Print one header row so the filtered lines stay readable.
    grep Device $io_stat_log | head -n 1

    # The awk program walks the log, tracking "[YY-MM-DD hh:mm:ss]***"
    # timestamp markers, only emitting device rows from the SECOND iostat
    # sample of each interval (state_m == 2), since the first sample reports
    # since-boot averages.
    # NOTE(review): inside the program, read_t/write_t are never assigned
    # (awk treats them as 0/empty), so the `$6 > read_t || $7 > write_t`
    # pre-filter passes almost every row — confirm whether that is intended.
    awk -v start="${io_convert_time}" -v end="${io_end_time}" -v io_param="${io_param}" -v io_value="${io_value}" ' 
        BEGIN {
            # 获取当前年份的前两位
            cmd_century = "date +%Y"
            cmd_century | getline current_full_year
            close(cmd_century)
            current_century = substr(current_full_year, 1, 2)
        }

        function to_timestamp(date_str) {
            cmd = "date -d \"" date_str "\" +%s"
            cmd | getline ts
            close(cmd)
            return ts
        }

        {
            if ($0 ~ /^\[.*\]\*\*\*/) {
                log_time = substr($0, 2, 17)  # 提取时间字符串如 "[25-05-21 00:04:08]"

                # 处理日期格式：
                gsub(/-/, "/", log_time)  # 替换 "-" 为 "/"
                log_time_gsub = log_time

                # 补全年份
                year_part = substr(log_time_gsub, 1, 2)  # 提取原两位年份（如 "25"）
                new_year = current_century year_part      # 动态补全（如 "20" + "25" → "2025"）
                log_time_fixed = new_year substr(log_time_gsub, 3, length(log_time_gsub))
                
                # 转换为时间戳
                timestamp = to_timestamp(log_time_fixed)
                
                # 转换开始/结束时间
                start_ts = to_timestamp(start)
                end_ts = to_timestamp(end)

                # 判断时间标记
                if (timestamp >= start_ts) start_flag = 1
                if (timestamp > end_ts) exit
            }
        
            if (start_flag == 1) {
                if ($0 ~ /^\[.*\]\*\*\*/) {
                    state_m = 0;
                    line = $0;
                } else if (/^Device/) {
                    state_m++;
                } else if (!/^Linux/ && (/^sd/ || /^nvme/) && (state_m == 2) && ($6 > read_t || $7 > write_t)) {
                    print_line = 0
                    if (io_param == "rqms" && ($2 >= io_value || $3 >= io_value)) {print_line = 1}
                    else if (io_param == "iops" && ($4 >= io_value || $5 >= io_value)) {print_line = 1}
                    else if (io_param == "tps" && ($6 >= io_value || $7 >= io_value)) {print_line = 1}
                    else if (io_param == "rtps" && ($6 >= io_value)) {print_line = 1}
                    else if (io_param == "wtps" && ($7 >= io_value)) {print_line = 1}
                    else if (io_param == "await" && ($11 >= io_value || $12 >= io_value)) {print_line = 1}
                    else if (io_param == "rwait" && ($11 >= io_value)) {print_line = 1}
                    else if (io_param == "wwait" && ($12 >= io_value)) {print_line = 1}

                    if (print_line) {
                        if (line) {
                            print line;
                            line = 0;
                        }
                        print $0;
                    }
                }
            }
        }
    ' "$io_stat_log"

    return 0
}

# Dispatch the blackbox sub-command selected by $g_type.
# Returns EINVAL when no type was given, 0 otherwise (unknown types are
# reported via p_error but still exit 0, like the other dispatchers).
function ops_exec_blackbox()
{
    # No execution type selected -> parameter error.
    [ -z "$g_type" ] && return $EINVAL

    case "$g_type" in
        iostat)
            parse_blackbox_iostat
            return 0
            ;;
        *)
            p_error "不支持的命令执行类型: $g_type"
            ;;
    esac
    return 0
}

#==================================================[end blackbox]========================================================#


#==================================================[start case]========================================================#

# Inspect a single VM: verify it exists in the cluster VM list, resolve its
# image path, dump recent NFS-related qemu log lines and show on which host
# its processes run.
# $1 - VM id (from the command line, -i vmid)
function check_vmid()
{
    if [ -z "$1" ]; then
        p_error "请输入待检查的vmid, eg: -i vmid"
        return 1;
    fi
    p_trace "检查虚拟机: $1"

    # The VM must appear in the cluster-wide VM list.
    grep -w "$1" /cfs/.vmlist  --color | grep -Eiw "storage|name|vm_name" --color
    if [ $? -ne 0 ]; then
       p_error "虚拟机不存在!!!"
       return 1
    fi

    # Resolve the image directory; an empty result suggests a metadata
    # control problem with the image directory.
    path=$(/sf/vs/bin/vs_quick_get_vmpath_by_vmid.sh $1)
    if [ -z "$path" ]; then
        p_error "镜像文件路径转化失败, 可能镜像目录发生指控异常"
        return 1
    fi

    p_trace "虚拟机镜像路径：$path"
    ls -alht $path
    # Scan this VM's qemu log (kvm_get_apic_state noise removed) for
    # NFS-related entries within the selected day/suffix window.
    check_and_advice 'zgrep -vE "kvm_get_apic_state" /sf/log/'$g_day'/sfvt_qemu_'$1'.log'$g_suffix' 2>/dev/null | grep nfs -i | '"$g_log_filter"''

    # Show where the VM's processes are actually running in the cluster.
    vs_cluster_cmd.sh e "ps aux | grep $1| grep -vE '/sf/bin/vm_entry.sh|grep|bash'"

    # To watch latency on the VM's running host: watch qmpcmd.sh xxxxxx info iostat -r

    return 0
}

# Slow/stuck disk diagnosis: confirm the disk-checker feature exists, look
# for offline disks, RAID card info, latency records and kernel I/O errors,
# sample iostat, then run version-specific false-positive / bad-sector checks.
function check_slow_disk()
{
    if [ -f /sf/vs/bin/vs_diskchecker ]; then
        p_info "该环境存在卡慢盘检测功能"
    else
        p_error "该环境不存在卡慢盘检测功能"
    fi

#    p_trace "永久拔盘列表："
#    vs_cluster_cmd.sh e 'cat /sf/cfg/vs/never_recoverd_disks.json 2>/dev/null'
    # Check whether any disk is offline.
    check_offine_disk

    # RAID card information.
    vs_cluster_cmd.sh e 'lspci -nn | grep -i sas'

    # Collect latency (svctm) records.
    vs_cluster_cmd.sh e 'zgrep -E 'svctm' /sf/log/'$g_day'/vs/scripts/abnormal_diskstate.log'$g_suffix' 2>/dev/null | '"$g_log_filter"''

    # Records whose latency exceeded 1000ms.
    vs_cluster_cmd.sh e 'zgrep -E "svctm:1000" /sf/log/'$g_day'/vs/scripts/abnormal_diskstate.log'$g_suffix' 2>/dev/null | '"$g_log_filter"''

    # Kernel-level disk I/O errors and I/O completion timeouts (nbd excluded).
    vs_cluster_cmd.sh e 'zgrep -E "blk_update_request: critical medium error|blk_update_request: I/O error" /sf/log/'$g_day'/kernel.log'$g_suffix' | grep -vE "nbd" | '"$g_log_filter"' '
    vs_cluster_cmd.sh e 'zgrep -E "wait_for_completion_io_timeout.*dev_name" /sf/log/'$g_day'/kernel.log'$g_suffix' |'"$g_log_filter"''

    # Sample iostat $g_cnt times (default 3, adjust with -c).
    vs_cluster_cmd.sh e 'iostat -x 1 '$g_cnt' | grep -vE "dm|loop|^nb" | grep -vE "0.00    0.00    0.00   0.00   0.00$"'

    # HCI 6.10.0..6.11.0 may raise false positives; cross-check util values.
    if [ $g_hci_ver -ge 6100 -a $g_hci_ver -le 6110 ]; then
        check_and_solution 'grep -E "busyness:\\s*1\\s*rc:\\s*0,\\s*wc:\\s*0" /sf/log/'$g_day'/vs/scripts/abnormal_diskstate.log | awk -F "util:|medium_cnt:" "{if(\$(NF-1) < 0.99)print \$0}"' \
            "检测到卡慢盘可能存在误报" "参考KB：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=399053330"
    fi

    # On EDS, also check for invalid hdparm commands.
    if vs_is_eds; then
        check_eds_hdparm
    fi

    # And check for bad sectors.
    check_bad_sector
}

# Check the SSD-cache to HDD-data capacity ratio of every disk group on every
# node (aSAN 3.x only).  A ratio below 5% is reported as an error.
# Reads cluster metadata through super_zkcli.py or, on HCI >= 6.7.0 non-EDS
# setups, super_mongocli.py.
function check_diskgroup_cache_ratio() {
    # Only meaningful on aSAN 3.x.
    if [ $g_asan_ver -lt 30 ]; then
        return 0
    fi
    exec_db="super_zkcli.py"
    if ! vs_is_eds; then
      if [ $g_hci_ver -ge 670 ]; then
          exec_db="super_mongocli.py"
      fi
    fi
    p_info "开始检查VS3.X的磁盘组缓存比"
    hosts_list=$($exec_db ls  /nodes/)
    read -ra hosts <<< "$hosts_list"
    for host in "${hosts[@]}"; do
      diskgroups=$($exec_db ls /nodes/$host/diskgroups/)
      read -ra diskgroup <<< "$diskgroups"
      cache_size=0
      data_size=0
      # Node id looks like "host-<name>"; keep the part after the dash.
      host_name=$(echo $host |awk -F '-' '{print$2}')
      for group in "${diskgroup[@]}"; do
        disk_names=$($exec_db cat /nodes/$host/diskgroups/$group|grep $host_name|awk -F '"' '{print $2}')
        while IFS= read -r disk_name; do
          if [[ -z $disk_name ]];then
            continue
          fi
          disk_detail=$($exec_db cat /nodes/$host/disk/$disk_name.json)
          size=$(echo "$disk_detail" | grep "disk_size"|awk -F ' ' '{print $2}'|awk -F ',' '{print $1}')
          is_cache=$(echo "$disk_detail" |grep STORAGE_CACHE)
          # Disks flagged STORAGE_CACHE count towards cache capacity,
          # everything else towards data capacity.
          if [[ -z $is_cache ]];then
            data_size=$(echo "$data_size + $size" | bc)
          else
            cache_size=$(echo "$cache_size + $size" | bc)
          fi
        done <<< "$disk_names"
        if [[ $cache_size -eq 0 ]];then
          p_info "host:$host 的磁盘组$group 全部是SSD或者没有SSD盘"
          continue
        fi
        # Bug fix: guard against division by zero when the group holds no
        # data disks (bc would print an error and leave $ratio empty, which
        # then breaks the "< 0.05" comparison below).
        if [[ $data_size -eq 0 ]];then
          continue
        fi
        ratio=$(echo "scale=2; $cache_size / $data_size" | bc)
        p_info "host:$host 的磁盘组$group 的缓存比是$ratio"
        if (( $(echo "$ratio < 0.05" | bc -l) )); then
          p_error "host:$host 的磁盘组$group 的缓存比小于5%。建议大于10%"
        fi
      done
    done
}

# Latency diagnosis: scan client-side (nfs / tgtd), server-side (bricks) and
# tiering logs for latency records, look for >=5s process stalls in the task
# watchdog log, and warn about LVs with an excessive segment count.
function check_latency()
{
    p_trace "检查存储客户端延时:"
    vs_cluster_cmd.sh e 'zgrep -E "lantency|latency" -n /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' 2>/dev/null| grep -vE "lease|afr_inode_sh_brick_schedule|option|gf_file_latency_set_mode" |'"$g_log_filter"''
    vs_cluster_cmd.sh e 'zgrep -E "lantency|latency" /sf/log/'$g_day'/vs/log/glusterfs/api/gluster_tgtd.log'$g_suffix' 2>/dev/null| grep -vE "lease|afr_inode_sh_brick_schedule|option|gf_file_latency_set_mode"|'"$g_log_filter"''

    p_trace "检查存储服务端延时: "
    vs_cluster_cmd.sh e 'zgrep -E "update_iso_latency|latency_end|pipeline.*end.*latency|_ios_update_xlator_latency" /sf/log/'$g_day'/vs/log/glusterfs/bricks/glusterfsd_sf-data-vs-local-*.log'$g_suffix' 2>/dev/null| grep -viE "inodelk|finodelk|option|lease|gf_file_latency_set_mode"|'"$g_log_filter"' '

    p_trace "检查分层的延时情况："
    # Note: the tierd log lives under /sf/log/vs (no $g_day component).
    vs_cluster_cmd.sh e 'zgrep -E free_cbk_param /sf/log/vs/tierd/tierd.log'$g_suffix' 2>/dev/null |'"$g_log_filter"''

    p_trace "检查是否存在进程卡主超过5s："
    vs_cluster_cmd.sh e 'zgrep "pending_tick=5" /sf/log/'$g_day'/vs/scripts/task_watch_dog.log'$g_suffix' 2>/dev/null |'"$g_log_filter"''

    p_trace "检查lv的segment数量，如果数量超过1万个，可能影响性能"
    vs_cluster_cmd.sh e 'for file in `ls /etc/lvm/backup/`; do count=$(grep segment /etc/lvm/backup/$file | wc -l); if [ $count -ge 10000 ]; then echo "/etc/lvm/backup/$file segment: $count"; fi ; done '
}

# Check for files that cannot be accessed (possible double fault, split brain
# or metadata-control problems) and verify replica consistency, using the
# 2.x or 3.x tooling depending on the aSAN version.
function check_split_brain()
{
    p_trace "检查异常无法访问的故障的文件: [可能双点故障、脑裂、指控异常等]"
    if [ $g_asan_ver -lt 30 ]; then
        # 2.x: walking the mount surfaces unreadable files; then run the
        # per-host consistency checker across the cluster.
        find /sf/data/vs/gfs/ -type f 1>/dev/null
        echo -e "检查副本一致性: "
        vs_cluster_cmd.sh e '/sf/vs/bin/vs_localhost_checkok.js check_all; echo $?'
        return
    fi

    # 3.x: lookup through vs_rpc_tool, then check every brick number that
    # appears in the volume info output.
    echo "vs_rpc_tool --cmd lookup: "
    vs_rpc_tool --cmd lookup
    echo -e "检查副本一致性: "
    local brick_no
    for brick_no in $(/sf/vs/glusterfs/sbin/gluster v i | grep host- | awk -F: '{print $1}' | grep -Eo '[0-9]+'); do
        vs_rpc_tool --cmd check --brickno "$brick_no"
    done
}

#部分版本: /sf/log/today/vs/iscsi/
#iscsi创建日志: /sf/log/vs/iscsi/tgt-mgrd.log 
#iscsi接入位置：/sf/log/vs/iscsi/tgtd.log
#iscsi镜像访问日志: /sf/log/today/vs/log/glusterfs/api/gluster_tgtd.log
# Inspect an iSCSI target by tid (matching /sf/cfg/vs/tgt/target/<tid>.json):
# dump its configuration, tgtadm state, backing-store replica location and
# the health of the tgtd/keepalived services.
# $1 - target tid
function check_iscsi()
{
    if [ -z "$1" ]; then
        p_error "输入iscsi盘tid, 匹配/sf/cfg/vs/tgt/target/目录tid"
        return 1
    fi
   
    tgt_file="/sf/cfg/vs/tgt/target/$1.json" 
    if [ ! -f "$tgt_file" ]; then
        # Distinguish "moved to trash" from "never existed".
        if [ -f /sf/cfg/vs/tgt/trash/$1.json ]; then
            p_error "iscsi 已经被删除到回收站!"
        else
            p_error "iscsi 配置文件不存在！"
        fi
        return 1
    fi

    uuid=`cat $tgt_file | /sf/vs/bin/jq .uuid`
    p_trace "查看TID相关信息: $1, uuid: $uuid \n配置文件：$tgt_file"
    cat "$tgt_file" | grep -E "name|size|disk_type|volume_id|preallocate" | awk '$1=$1'
    
    p_trace "iscsi基本信息:"
    tgtadm --lld iscsi --op show --mode target | grep "Target $1:" -A49 | grep -E "SCSI ID|Nop|State:|Backing store path:|iqn\." | grep -vE "None" | awk '$1=$1'

    # Turn "Backing store path: .../vslun...@..." into a path fragment below
    # /sf/data/vs/gfs so the replica location can be looked up.
    backstore_path=$(tgtadm --lld iscsi --op show --mode target | grep "Target $1:" -A49| grep "Backing store path" | grep -v None | grep vslun|awk -F: '{print $2, $3}' | awk -F"@" '{print $1, $2}' | awk '{print $1,$3}' | tr -s ' ' '/')
    if [ ! -z $backstore_path ]; then
        iscsi_file="/sf/data/vs/gfs/$backstore_path"    
        echo -e "image path: $iscsi_file\n副本位置:"
        vs_quick_lookup.sh $iscsi_file
    fi
    p_trace "查看iscsi磁盘T端服务是否正常:(是否因为进程挂掉导致断开连接,tgtd/keepalived)"
    #vs_cluster_cmd.sh e 'ps -o lstart `pidof tgtd`'
    check_and_advice 'ps aux | grep -E "/sf/vs/sbin/tgtd|keepalived" | grep -vE "super|grep|container"'
    # grep -E "open_bs|__bs_glfs_open|bs_glfs_close"
    return 0
}

#容量不平衡，但不发起数据平衡任务
#该案例TD2021051500115上说任意一台主机都可以看到算法在一分钟一次加载拓扑
# Detect a stuck data-balance algorithm: capacity is unbalanced but no
# balance task is started.  If vs_rt_algo logs five consecutive "doing loop"
# entries roughly one minute apart, the algo is just reloading the topology
# and never launching a task -> report the known case.
function check_no_data_balance()
{
    inf="如果容量不平衡，不会发起平衡任务"
    solution="\t1.请参考案例解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=180539617"
    logfile="/sf/log/today/vs/scripts/vs_rt_algo.py.log"

    # Keep only the latest 10 'doing loop' entries.  Fix: a missing log file
    # used to leak grep's error message into the report; suppress it and
    # treat it like "no matching entries".
    logs=$(grep 'doing loop' "$logfile" 2>/dev/null | tail -n 10)

    # Timestamp of the previous entry; "invalid" marks the first iteration.
    prev_time="invalid"

    # Becomes "1" once five consecutive ~1-minute gaps were seen.
    status="0"

    # Number of consecutive gaps of roughly one minute.
    consecutive_count=0

    while IFS= read -r line; do
        # The timestamp is the leading "[...]" prefix of the log line.
        current_time=$(echo "$line" | awk -F'[]]' '{print $1}' | tr -d '[')

        if [ "$prev_time" != "invalid" ]; then
            # Gap between this entry and the previous one, in seconds.
            diff=$(($(date -d "$current_time" +%s) - $(date -d "$prev_time" +%s)))

            # <= 65s counts as "about one minute"; otherwise restart the run.
            if [ "$diff" -le 65 ]; then
                consecutive_count=$((consecutive_count + 1))
            else
                consecutive_count=1
            fi
        fi

        # Five one-minute gaps in a row: the algo is looping -> report.
        if [ "$consecutive_count" -ge 5 ]; then
            status="1"
            testcase_failed_stats "$inf" "$solution" 
            break
        fi

        prev_time="$current_time"
    done <<< "$logs"

    if [ "$status" = "0" ];then
        testcase_success_stats
    fi
}

# 2.x only: the vs_tasks worker should have exactly 5 threads; any other
# count means the data-balance feature silently died after a past fault and
# a service restart is required.
function check_vmp_no_data_balance()
{
    # Not applicable from aSAN 3.0 on.
    [ $g_asan_ver -ge 30 ] && return 0
    
    vs_cluster_cmd.sh x "[ \`ls /proc/\$(pidof vs_tasks)/task|wc -l\` -ne 5 ] && echo \"由于环境出现过异常，数据平衡功能可能已失效！ 重启服务可恢复： /sf/vs/etc/init.d/vs-tasks restart\""
}

#检测虚拟机迁移业务是否正常
# Check whether VM migration is likely to fail because more than half of the
# disks are above 98% space utilisation (HCI >= 5.8.6, non-EDS only).
function check_vm_migration()
{
    local inf="1.超过一半的磁盘空间利用率超过98%，会导致虚拟机迁移失败"
    local solution="\t1.优先释放空间（回收站、备份、快照（快照合并需要额外空间，这种不太推荐）、优先级较低的虚拟机\
                    \n\t2.同时协调客户及时扩容\
                    \n\t3.迁移时优先热迁，对比冷迁，速度快（不需要校验），可靠（能自动跳过空洞，避免迁移失败）\
                    \n\t热迁优先迁预分配的虚拟机；如果精简分配的虚拟机由于空间不足虚拟机挂起导致迁移失败，这种只能重试迁移\
                    \n\t4.参考案例：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=180111203"
    # Command that greps the migration-failure signature out of the logs.
    local cmd_to_check_logs="grep  'create group route fail' /sf/log/today/vs/log/glusterfs/glusterfs_nfs.log || 
                             grep  'create group route fail' /sf/log/today/vs/log/glusterfs/api/gluster_tgtd.log 2>/dev/null"

    # Number of disks above the 98% usage threshold.
    local count=0
    
    # EDS has no VM migration.
    if vs_is_eds; then
        return 0 
    fi

    # VM migration exists since HCI 5.8.6 (maps to aSAN 3.0.0, which
    # provides the vsmgr interface used below).
    if [ $g_hci_ver -ge 586 ]; then
        
        local json_output=$(vsmgr view list-all-disk)

        # Total number of disks reported.
        local total_disks=$(echo "$json_output" | /sf/vs/bin/jq '.total')

        # Check each disk's utilisation.
        for ((i=0; i<total_disks; i++)); do
            # Free size and total size of disk i.
            disk_free_size=$(echo "$json_output" | /sf/vs/bin/jq ".disks[$i].iostat.disk_pvs_free_size")
            disk_size=$(echo "$json_output" | /sf/vs/bin/jq ".disks[$i].iostat.disk_pvs_size")
        
            if [ $disk_size -eq 0 ]; then
                p_error "磁盘容量异常"
                return 1
            fi
            # Utilisation as a percentage.
            utilization=$(echo "($disk_size - $disk_free_size) / $disk_size * 100" | bc -l)

            # Above 98%?
            is_full=$(echo "$utilization > 98" | bc -l)
            if [ $is_full -eq 1 ]; then
                count=$((count + 1))
            fi

            # More than half of the disks above 98%: check the logs for the
            # migration-failure signature and report.
            if [ "$count" -gt $(($total_disks / 2)) ]; then
                check_and_solution "$cmd_to_check_logs" "$inf" "$solution"
                return 1
            fi
        done
    fi
}

#检测虚拟机镜像文件状态
# Check the VM image (qcow2) status recorded in sf_result.log for today's
# date; non-OK entries indicate a damaged image.
function check_vm_qcow2_status()
{
    inf="虚拟机镜像文件异常"
    solution="参考案例解决 https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=28434"
    current_date=$(date "+%Y-%m-%d")
    # Only relevant for 2.x (may appear after a 2-to-3 expansion); 3.0+ skips.
    if [ $g_asan_ver -ge 30 ]; then
        return 0
    fi
    # NOTE(review): the quoting below mixes nested double and single quotes;
    # the embedded quote characters survive into the command string and are
    # presumably consumed when check_and_solution evaluates it -- verify
    # before changing anything here.
    check_and_solution "zgrep -v OK /var/vm_check/sf_result.log'$g_suffix' 2>/dev/null| \
    grep ".*\.qcow2$"| grep '$current_date'| tail -n '$g_lines'" "$inf" "$solution"
}


#检测是否存在2扩3分层元数据转换失败
# Detect a failed tier-metadata conversion during a 2-to-3 expansion by
# chaining three log checks; only the final one (rcache too small) has a
# documented solution.
function check_is_data_dir_expand_failed()
{
    # Fixed in 6.10.0 R1; only check 6.10.0/6.10.1 and below.
    if [ $g_hci_ver -gt 6101 ]; then
        return 0
    fi
    # Observed on 6.9.0, but other versions may hit it as well.
    # 1. Look for "deal data dir failed" in volume_mgr_run_task.log.
    local log_file="/sf/log/$g_day/vs/scripts/volume_mgr_run_task.log"
    local check_cmd="[ -f $log_file ] && grep 'deal data dir failed' $log_file"
    # NOTE(review): check_cmd is passed unquoted, so run_cmd_analyze receives
    # it word-split -- presumably it re-joins "$@"; confirm before changing.
    run_cmd_analyze $check_cmd
    ret=$?
    # No host reported a data-dir expansion failure -> nothing more to do.
    if [ $ret -eq 0 ]; then
        return 0
    fi

    # 2. Check whether vs_efs_trans reported a failed transform task.
    log_file="/sf/log/$g_day/vs/vs_efs_trans.js.log"
    check_cmd="[ -f $log_file ] && grep 'tcache_worker_exit' $log_file | grep 'trans_status' | grep 'failed'"
    run_cmd_analyze $check_cmd
    ret=$?
    if [ $ret -eq 0 ]; then
        return 0
    fi

    # 3. Check tier_mt_trans for "bak dev to small" (rcache too small).
    log_file="/sf/log/$g_day/tier_mt_trans.log"
    check_cmd="[ -f $log_file ] && grep 'bak dev to small' $log_file"
    
    inf="2扩3, 检测到rcache空间太小, 分层元数据转换失败"
    solution="参照: https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=324331861"
    check_and_solution "$check_cmd" "$inf" "$solution"
}

#检测虚拟机挂机故障案例
# Aggregate diagnosis for VM hang/fault cases: runs capacity, disk, network,
# lock and migration checks in sequence.
function check_vm_fault()
{
    # Storage capacity exhausted?
    check_storage_capacity
    # Other directories full?
    check_directory_space_capacity
    # Slow/stuck disks and disk-group cache ratio.
    check_slow_disk
    check_diskgroup_cache_ratio
    # Storage network health.
    check_network
    check_network_status
    detect_port
    # VM power-on lock failures.
    check_vm_lock
    # VM migration health.
    check_vm_migration
}

#没有vs卷时，误将dmesg_redirect打开，导致升级失败
# Known upgrade failure: dmesg_redirect wrongly enabled while no vs volume
# exists makes the data-layer upgrade fail (HCI 6.7.0 only, non-EDS).
function check_dmesg_redirect()
{
    inf="虚拟存储数据层升级完成阶段，升级失败"
    solution="\t1.没有vs卷时，误将dmesg_redirect打开，导致升级失败\
              \n\t2.参考案例解决：https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=31563&page=1&extra="

    # EDS: not applicable.
    if vs_is_eds; then
        return 0 
    fi

    output=$(/sf/vs/glusterfs/sbin/gluster v i)
    if [ $g_hci_ver -eq 670 ]; then
        # Only a problem when there is no volume at all.
        if echo "$output" | grep -q "No volumes present"; then
            check_and_solution "service_ctrl list 2>/dev/null| grep dmesg_redirect | grep True" "$inf" "$solution"
        fi  
    fi
}
# Check for residual storage-networking configuration before an upgrade:
# every host listed in gfs_networking_mode.json should have a matching
# /etc/hosts entry.  If none of them do, the json file is stale residue and
# may break the upgrade.
function check_etc_hosts_upgrade()
{
   solution="\n集群中虚拟存储网络配置中残留配置文件，请检查配置/cfs/vs/gfs_networking_mode.json"
   network_ok=
   missing_hosts=
   # 2>/dev/null: a missing json file means nothing to check, not an error.
   for H in $(cat /cfs/vs/gfs_networking_mode.json 2>/dev/null|/sf/vs/bin/jq -r '.[].host_name')
   do
       if grep -q "${H}" /etc/hosts; then
          network_ok=$network_ok${H}" "
       else
          missing_hosts=$missing_hosts${H}" "
       fi
   done
   # Report only when hosts were listed but NONE of them matched /etc/hosts.
   # Bug fix: the original message interpolated $network_ok, which is always
   # empty in this branch; name the hosts that are actually missing.
   if [ -z "$network_ok" ] && [ -n "$missing_hosts" ];then
        inf="存在网络配置残留，可能影响升级,残留主机为$missing_hosts,请确认"
        testcase_failed_stats "$inf" "$solution"
   fi
}


#延伸集群升级需要先升级仲裁节点，否则会报错
# Stretch-cluster upgrades must upgrade the arbiter node first; the updater
# logs "not exists arbiter upgrade info" when this order was violated.
function check_arbiter_upgrade ()
{
    inf="延伸集群升级失败"
    solution="\n延伸集群需要先升级仲裁节点，参考案例：https://tskb.atrust.sangfor.com/forum.php?mod=viewthread&tid=31584"
    check_and_solution 'zgrep "not exists arbiter upgrade info" /sf/log/'$g_day'/vs/scripts/vs_uapi_update.py.log'$g_suffix' 2>/dev/null |'"$g_log_filter"'' "$inf"  "$solution"
}


# Stretch-cluster configuration checks:
# 1) the arbiter node must run the same aSAN version as the data nodes
#    before an upgrade can proceed;
# 2) data nodes must still have /sf/cfg/vs/volume_network.conf, otherwise
#    the arbiter shows as offline on the hosts page.
function check_arbiter_conf ()
{
    # Upgrade requires identical versions on arbiter and data nodes.
    inf="延伸集群，仲裁节点和数据节点版本不一致，检查是否没有完成仲裁节点和数据节点升级"
    solution="参考案例解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=346945164"
    # EDS: not applicable.
    if vs_is_eds; then
        return 0
    fi
    volume_name=$(/sf/vs/glusterfs/sbin/gluster v list)
    # Nothing to check without a volume.
    if [ -z "$volume_name" ]; then
        return 1
    fi
	asan_ver=$(head -n1 /sf/vs/version)
	arbiter_host=''
    # HCI >= 6.7.0 stores cluster metadata in mongo (super_mongocli.py).
    if [ $g_hci_ver -ge 670 ]; then 
        arbiter_host=$(super_mongocli.py cat  /volumes/$volume_name/volume_info.json|jq '.arbiter.hosts'|jq -r '.[0]')
		
	else
	    if [ "$asan_ver" \< "3.0.3" ]; then 
		    return 1
	    fi
	    arbiter_host=$(super_zkcli.py cat  /volumes/$volume_name/volume_info.json|jq '.arbiter.hosts'|jq -r '.[0]')
    fi 
	# Not a stretch cluster if no arbiter host is configured.
	if [[ -z "$arbiter_host" || "$arbiter_host" == "null" ]]; then
        return 1
    fi
	
    # Read the arbiter's version over ssh with a hard timeout.
    arbiter_ver=$(timeout -t "$g_time_limit" -s KILL /usr/bin/ssh $arbiter_host head -n1 /sf/vs/version)
    if [ "$asan_ver" != "$arbiter_ver" ]; then
        testcase_failed_stats "$inf" "$solution" 
    fi
    inf="主机页面仲裁节点显示离线"
    solution="延伸卷数据节点volume_network.conf文件丢失，请按照KB:https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=489790535处理"
    local stretch_hosts
    stretch_hosts=$(/sf/vs/bin/vsmgr volume hosts --volume_id $volume_name|awk '{print $1}')
    if [[ $? -ne 0 || -z "$stretch_hosts" ]];then
        return 1
    fi
    # Temporarily point g_hosts at the stretch hosts for this check only and
    # restore it afterwards -- watch this if the scenario changes later.
    local tmp_hosts
    tmp_hosts="$g_hosts"
    g_hosts="$stretch_hosts"
    check_and_solution 'test -f /sf/cfg/vs/volume_network.conf;if [ $? -ne 0 ]; then echo -e "扩容主机失败后回滚导致volume_network.conf文件丢失"; fi' "$inf" "$solution"
    g_hosts="$tmp_hosts"
}

#检测升级案例
# Aggregate diagnosis for upgrade failures: runs all known upgrade-related
# checks in sequence.
function check_upgrade_case()
{
    # dmesg_redirect wrongly enabled with no vs volume fails the upgrade.
    check_dmesg_redirect

    # Stretch cluster: arbiter node must be upgraded first.
    check_arbiter_upgrade

    # Stretch cluster: arbiter and data nodes must run the same version.
    check_arbiter_conf

    # Residual storage-networking configuration.
    check_etc_hosts_upgrade

    # Upgrade flag files.
    check_upgrade_flag

    # Mixed setups where only part of the hosts configured a storage network.
    check_network_conf

    # "could not convert string to float" seen on dts upgrades to 6.7.0.
    check_wait_all_host_access

}

#检测卷配置文件是否缺少 topo_version 字段
# Detect a failed 2-to-3 expansion caused by volume_info.json missing the
# topo_version field (aSAN >= 3.0 only).
function check_volume_config_topo_version()
{
    inf="2扩3失败，volume_info.json 缺少 topo_version 字段"
    solution="参考案例解决：https://docs.atrust.sangfor.com/pages/viewpage.action?pageId=208321304"
    volume_name=$(/sf/vs/glusterfs/sbin/gluster v i | grep "Volume Name" | awk -F': ' '{print $2}')
    # Nothing to check without a volume.
    if [ -z "$volume_name" ]; then
        return 1
    fi

    # Versions before 3.0 have no topo_version field at all.
    if [ $g_asan_ver -lt 30 ]; then
        return 1
    fi 

    # Read the volume configuration through the bundled interpreter; note
    # that `print res` is python2 syntax on purpose.
    json_output=$(/sf/vs/bin/python -c "import json; from bson import ObjectId; from bson import json_util;\
            from libcommon.libconf import VolumeConf; vol_conf = VolumeConf('$volume_name');\
            conf = vol_conf.content; res = json.dumps(conf, default=json_util.default); print res")

    status=$(echo -n "$json_output" | /sf/vs/bin/jq -r '.status')
    topo_version=$(echo -n "$json_output" | /sf/vs/bin/jq 'has("topo_version")')

    # Only an expansion that failed AND lacks topo_version is the known case.
    if [ "$status" == "expand_failed" ]; then
        
        if [ "$topo_version" == "false" ]; then
    
            testcase_failed_stats "$inf" "$solution"
        else
            
            testcase_success_stats
        fi
    fi  
}

#检测2扩3的案例
# Aggregate diagnosis for 2-to-3 expansion failures.
function check_two_expand_three_case()
{
    # volume_info.json missing the topo_version field.
    check_volume_config_topo_version

    # VM image file status.
    check_vm_qcow2_status

    # Tier metadata conversion failures during the expansion.
    check_is_data_dir_expand_failed
}

#检测2.X环境数据同步的案例
# Data-sync diagnosis for 2.x environments (returns early on aSAN >= 3.0).
function check_2X_data_sync_case()
{
    # Only for aSAN < 3.0.
    if [ $g_asan_ver -ge 30 ]; then
        return 1
    fi

    # Tier write-back feature health.
    check_tier_writeback_status

    local inf="检测数据同步任务是否异常"
    local solution="检测到写缓存服务异常 联系研发处理 https://wiki.sangfor.com/pages/viewpage.action?pageId=90839276"
    # Write-cache fatal errors in the tierd log.
    check_and_solution 'zgrep -E "get demote data ssd" /sf/log/vs/tierd/tierd.log'$g_suffix' 2>/dev/null | '"$g_log_filter"' | grep "ssd entry fatal error"' "$inf" "$solution"

    # Oversized file causing a sync failure.
    check_file_too_large

    # Capacity checks.
    check_storage_capacity
}

#检测3.X环境数据同步的案例
# Data-sync diagnosis for 3.x environments (returns early on aSAN < 3.0).
function check_3X_data_sync_case()
{
    # Only for aSAN >= 3.0.
    if [ $g_asan_ver -lt 30 ]; then
        return 1
    fi

    local inf="检测数据同步任务是否异常"
    #local solution="联系研发检测是否存在nfs锁泄露，导致数据同步可能卡住或失败 https://wiki.sangfor.com/pages/viewpage.action?pageId=90839986"

    # NFS lock-leak check disabled: this log signature alone is too weak to
    # prove a lock problem.
    #check_and_solution 'zgrep -E "but it has opened by self-heal" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' 2>/dev/null | '"$g_log_filter"' | grep "open by robber"' "$inf" "$solution"

    # Port-binding problems can stall or fail data sync.
    local solution="nfs端口绑定可能存在问题，导致数据同步可能卡住或失败 https://wiki.sangfor.com/pages/viewpage.action?pageId=90839773"
    check_and_solution 'zgrep -E "socket_connect_finish" /sf/log/'$g_day'/vs/log/glusterfs/glusterfs_nfs.log'$g_suffix' 2>/dev/null | '"$g_log_filter"' | grep "Connection timed out"' "$inf" "$solution"

    # Tier write-back feature health.
    check_tier_writeback_status

    # Tier unable to connect to a brick.
    solution="分层无法连接brick，联系研发处理 https://wiki.sangfor.com/pages/viewpage.action?pageId=90840254"
    check_and_solution 'zgrep -E "report_io_error" /sf/log/vs/tierd/tierd.log'$g_suffix' 2>/dev/null | '"$g_log_filter"' | grep "offline"' "$inf" "$solution"

    # Tier version mismatch causing handshake failures.
    solution="检测是否存在版本错误导致brick连接失败 https://wiki.sangfor.com/pages/viewpage.action?pageId=90839358"
    check_and_solution 'zgrep -E "version error, version = 0x20000, tier version = 0x32001" /sf/log/vs/tierd/tierd.log'$g_suffix' 2>/dev/null | '"$g_log_filter"' ' "$inf" "$solution"

    # Oversized file causing a sync failure.
    check_file_too_large

    # Capacity checks.
    check_storage_capacity

    # Space leak check.
    check_efs_space_leak
}

# 检测可能导致扩容，替换卷失败的原因
# Aggregate diagnosis for volume expansion / replacement failures: collect
# environment info, then work through the hardware, capacity, configuration
# and service checks known to break these operations.
function check_volume_expand_replace_case() {
    # Environment basics: volume info, patch packages, etc.
    get_env

    # Memory and CPU.
    check_mem_cpu

    # Network.
    check_network_status

    # Firewall.
    check_iptables

    # Per-process memory usage.
    check_proc_mem

    # Host freeze / fake-dead.
    check_system_hang_case

    # ca.key must be identical across the cluster.
    check_ca_file_consistency "/sf/cfg/vs/cert/ca.key"

    # ca.crt must be identical across the cluster.
    check_ca_file_consistency "/sf/cfg/vs/cert/ca.crt"

    # Directory and storage capacity.
    check_storage_capacity

    # Disks.
    check_disk
    check_slow_disk

    # =================Configuration files that can break expand/replace=================
    # lvm
    check_lvm_config

    # Disk group configuration.
    check_disk_group_id

    # /sf/cfg/vs/.members
    check_host_online_status

    # =================Services that can break expand/replace=================
    # rpyc
    check_rpyc_daemon
    detect_rpyc_port

    # mongo
    check_mongo

    # vs_cmd_proxy
    check_vs_cmd_proxy_server

    # nfs
    check_nfs_T_status
    check_nfs_server

    # glusterd
    check_peers_numbers
    check_glusterd_file_exists

    # brick: 2.x shows dead bricks as "N/A" in volume status; 3.x reports
    # "UNNORMAL" clients through vs_rpc_tool.
    check_brick_config
    if [ $g_asan_ver -lt 30 ]; then
        /sf/vs/glusterfs/sbin/gluster volume status | grep -vwE "NFS|local|Self-heal"| grep "N/A$" -B3
    else
        /sf/vs/bin/vs_rpc_tool --cmd clnt 2>/dev/null| grep "UNNORMAL" -B5
    fi
    check_glusterfsd_Z_status

    # Tiering (map check only below HCI 6.10.1).
    if [ -f /sf/cfg/vs/cache/tier.json ]; then
        if [ $g_hci_ver -lt 6101 ];then
            check_tier_maps
        fi
    fi
    check_tierd_meta_data_err

    # Read/write cache.
    check_cache

    # 2-to-3 expansion specifics.
    check_two_expand_three_case
}


#检测数据同步的案例
# Aggregate diagnosis for data-sync problems.
function check_data_sync_case()
{
    # Bad sectors.
    check_bad_sector

    # Silent bricks.
    check_silence_brick

    # Version-specific checks; each returns early when the aSAN version
    # does not match.
    check_2X_data_sync_case
    check_3X_data_sync_case
}

#检查存储异常的案例
# Storage-service focused checks: configuration, services, capacity and
# per-process memory usage.
function check_storage_service_case()
{
    p_trace "检查存储服务专项："

    check_storage_config
    check_storage_service
    check_storage_capacity
    check_proc_mem
}

# Dispatch the diagnostic case selected by $g_type.
# Returns 0 after running a known case, EINVAL for a missing or unknown type.
function ops_exec_case()
{
    # A case type is mandatory.
    if [ -z "$g_type" ]; then
        return $EINVAL
    fi

    case "$g_type" in
        slowdisk)
            check_slow_disk
            check_diskgroup_cache_ratio
            ;;
        splitbrain)
            check_split_brain
            ;;
        latency)
            check_latency
            ;;
        vm)
            check_vmid "$g_vmid"
            ;;
        iscsi)
            check_iscsi "$g_vmid"
            ;;
        vmfault)
            check_vm_fault
            ;;
        databalance)
            check_no_data_balance
            check_vmp_no_data_balance
            ;;
        upgrade)
            check_upgrade_case
            ;;
        two_expand_three)
            check_two_expand_three_case
            ;;
        system_hang)
            check_system_hang_case
            ;;
        vs_dog)
            check_vs_dog_config
            ;;
        bad_sector)
            check_bad_sector
            ;;
        remove_tfile)
            remove_volume_bad_tfile
            ;;
        check_tgtd_conn)
            check_tgtd_conn
            ;;
        storage)
            check_storage_service_case
            ;;
        data_sync)
            check_data_sync_case
            ;;
        create_volume)
            check_create_volume_case
            ;;
        volume_expand|volume_replace)
            # Both operations share the same failure analysis.
            check_volume_expand_replace_case
            ;;
        *)
            return $EINVAL
            ;;
    esac
    return 0
}
#================================================[end case]============================================================#
#================================================[start create_volume]=================================================#
# grep日志的函数
# Run grep on every host through vs_cluster_cmd.sh and print matching lines.
# $1 - log file path
# $2 - grep pattern (interpolated into the remote command string)
# $3 - extra grep options (e.g. "-nr", "-E"), placed after the file operand
# Side effect: $res is deliberately global -- other helpers in this section
# (notably is_error) read it after this function returns.
function grep_logs_by_ops() {
    local log_file="$1"
    local search_pattern="$2"
    local ops_param="$3"
    # Header lines (file name echo / CMD banner) are filtered from the output.
    res=$(vs_cluster_cmd.sh e  'grep '"$search_pattern" "$log_file" "$ops_param"' 2>/dev/null'|grep -vE "$log_file|CMD" )
    if [ -n "$res" ]; then
        echo "$res"
    fi
}

# 获取日志文件的最后几行
# Print the last 3 lines of a log file on every host, used as context when
# no error pattern was found.
# $1 - log file path
# $2 - declared as a search pattern but never used
# $3 - extra parameter appended to the remote tail command
# NOTE(review): the caller in this file invokes tail_logs "$log_file"
# "$ops_param" (two args), so $3 is always empty and the grep option string
# passed in $2 is silently dropped -- which is accidentally harmless, since
# "-nr" is not a valid tail option anyway.  Audit callers before "fixing"
# this parameter mismatch.
function tail_logs() {
    local log_file="$1"
    local search_pattern="$2"
    local ops_param="$3"
    res=$(vs_cluster_cmd.sh e  'tail -n 3 '"$log_file" "$ops_param"' 2>/dev/null'|grep -vE "$log_file|CMD" )
    if [ -n "$res" ]; then
        p_info "$log_file 没有报错。输出最后几行日志"
        p_info "$res"
    else
      p_info "$log_file 没有日志"
    fi
}

# 跳过的辅助函数
# Append grep pattern $1 to the '|'-separated skip list $2 and print the
# combined pattern on stdout.
# $1 - new pattern to add
# $2 - existing accumulated pattern list (may be empty)
function skip_failed_rule() {
    local pattern="$1"
    local acc="$2"
    [ -n "$acc" ] && pattern="$acc|$pattern"
    echo "$pattern"
}

# 异常辅助函数
# Grep pattern $2 out of the *global* $res buffer.
# NOTE(review): the first argument is bound to $detail but never used -- the
# function echoes the global $res instead.  At the current call sites $res
# happens to hold the same text as $1, and check_is_soft_raid even depends on
# this (its own argument arrives word-split because the caller passes it
# unquoted); switching to "$detail" alone would change behaviour there, so
# it is left untouched and only flagged here.
# $1 - log detail to search (currently ignored, see note above)
# $2 - pattern to grep for
function is_error() {
    local detail="$1"
    local ops="$2"
    echo "$res" |grep "$ops"

}

# 可以跳过的一些创建卷的错误日志
# Strip known-harmless error lines from the collected volume-creation
# errors: each is_* rule helper appends its skip pattern, and the combined
# pattern is inverted-grepped against the input.
# $1 - raw error lines
function skip_not_affect_create_volume_error() {
    local skip_pattern=""
    local rule
    for rule in is_cli_tier_dump is_backup_mongo_failed is_mkdir_error_path \
                is_mongo_stable is_wait_format_failed; do
        skip_pattern=$($rule "$skip_pattern")
    done
    echo "$1" | grep -vE "$skip_pattern"
}

#mkdir error path : 创建mongo目录失败。虽然这里失败了，不会影响主流程。但是如果不是已经存在。建议还是把失败打印出来。
# Skip rule: "mkdir error path :" failures when creating the mongo directory
# do not block the main flow; append the pattern to the skip list in $1.
function is_mkdir_error_path() {
    local acc="$1"
    local pattern="mkdir error path :|is exists"
    if [ -n "$acc" ]; then
        echo "$acc|$pattern"
    else
        echo "$pattern"
    fi
}
# 备份mongo配置失败。但是不影响创建卷
# Skip rule: failing to back up the mongo configuration does not affect
# volume creation; append the pattern to the skip list in $1.
function is_backup_mongo_failed() {
    local acc="$1"
    local pattern="backup file | to mongo failed"
    if [ -n "$acc" ]; then
        echo "$acc|$pattern"
    else
        echo "$pattern"
    fi
}

# 判断是否是tier dump，全闪卷会有这个报错。也不影响创建卷。在主流程被忽略了
# Skip rule: all-flash volumes always log "cli-tier-dump" errors; the main
# flow ignores them, so append the pattern to the skip list in $1.
function is_cli_tier_dump() {
    local acc="$1"
    local pattern="cli-tier-dump"
    if [ -n "$acc" ]; then
        echo "$acc|$pattern"
    else
        echo "$pattern"
    fi
}

# 忽略等待Mongo稳定的报错
# Skip rule: transient "No primary available for writes" errors while mongo
# stabilises are harmless; append the pattern to the skip list in $1.
function is_mongo_stable() {
    local acc="$1"
    local pattern="No primary available for writes"
    if [ -n "$acc" ]; then
        echo "$acc|$pattern"
    else
        echo "$pattern"
    fi
}

# 忽略掉格式化在执行中的获取失败的报错
# Skip rule: progress queries that fail while formatting is still running
# are expected; append the pattern to the skip list in $1.
function is_wait_format_failed() {
    local acc="$1"
    local pattern="get disk format progress for"
    if [ -n "$acc" ]; then
        echo "$acc|$pattern"
    else
        echo "$pattern"
    fi
}
# 确认创建卷的任务流程输出的报错
# Scan volume_mgr_run_task.log for " E " error lines, drop known-harmless
# ones, and drill into memory-reservation / formatting failures.
# Side effect: log_file / search_pattern / ops_param / res are assigned
# globally and are read by the helpers called below (is_error reads $res,
# check_data_format_failed reads $ops_param) -- do not make them local
# without auditing those helpers.
function grep_create_volume_mgr_run_task_error() {
    # Start with the task-runner log.
    p_info "开始检查创建卷的日志"
    log_file="/sf/log/$g_day/$g_vs_scritps_log_dir$g_volume_run_mgr_path"
    search_pattern="\" E \""
    ops_param="-nr"
    res=$(grep_logs_by_ops "$log_file" "$search_pattern" "$ops_param")
    # Drop error lines known not to affect volume creation.
    res=$(skip_not_affect_create_volume_error "$res")
    if [ -n "$res" ]; then
         # Locate which step of the main flow failed.
        p_error "mgr_run_task.log 检查失败:具体失败信息："
        p_info "$res"
        # Memory reservation failure?
        is_reseve_mem "$res"
        # Formatting failure?
        is_format_failed "$res"
        #todo Add more error signatures here later.
        #todo Also common on 6.8.0-6.9.0: reusing a disk previously owned by
        #todo VT or self-replaced by VS makes the VT claim request fail; the
        #todo 6.10.0 patch already improved this, so it is low priority.
    else
      tail_logs "$log_file" "$ops_param"
    fi
}

# 向VT申请内存失败，可以向VT确认就是内存
# Check the collected error lines for a failed memory reservation request to
# the VT platform ("fail to reserve mem, plus").
# $1 - error lines (note: is_error currently reads the global $res)
function is_reseve_mem() {
    local detail="$1"
    local mem_pattern="fail to reserve mem, plus"
    is_err=$(is_error "$detail" "$mem_pattern")
    if [ -z "$is_err" ]; then
        p_info "未发现申请内存流程异常"
    else
        p_info "$is_err"
        p_error "跟平台申请内存失败，请上升VS确认失败原因"
    fi
}


# 检查格式化是否失败
# Check whether disk formatting failed during volume creation and drill down
# into the concrete cause (soft raid / data disk / cache disk).
# $1 - error lines already collected from volume_mgr_run_task.log
function is_format_failed() {
    p_info "开始检查是否有格式化失败"
    local mgr_run_res="$1"
    local log_file="/sf/log/$g_day/$g_vs_scritps_log_dir$g_volume_run_mgr_path"
    local search_pattern="disk op failed"
    is_format=$(is_error "$mgr_run_res" "$search_pattern")
    if [ -n "$is_format" ]; then
        p_error "创建卷格式化任务失败:开始分析具体格式化失败原因："
        # Bug fix: quote the argument so the error text arrives as a single
        # word instead of being split at the first space.
        check_is_soft_raid "$mgr_run_res"
        local soft_raid_res=$?
        # Data-disk formatting (vs_diskpart.sh.log)?
        check_data_format_failed
        local data_format_res=$?
        # Cache-disk formatting (vs_cache_format.sh.log)?
        check_cache_format_failed
        local cache_format_res=$?
        #todo Add further formatting-failure causes here.
        format_res=$(($soft_raid_res + $cache_format_res + $data_format_res))
        if [ "$format_res" -eq 0 ]; then
          p_error "格式化没有明确报错请结合 $log_file 目录的报错上升 VS 研发"
        fi
    else
      # No "disk op failed": confirm that formatting actually ran/succeeded.
      local search_pattern="no disk need format, return True\|start disk format, task:"
      local ops_param="-E"
      is_start_format_res=$(grep_logs_by_ops "$log_file" "$search_pattern" "$ops_param")
      p_info "$is_start_format_res"
      if [ -n "$is_start_format_res" ]; then
        p_info "创建卷磁盘格式化成功"
      else
        #TODO Add a check for a stuck formatting task later.
        p_info "没有在任务中找到格式化失败的日志,留意格式化任务是否卡住"
      fi
    fi
}

# 检查分区是否失败
#"not exist, wait_partition_ready failed"  分区失败
# Report when partition creation failed ("wait_partition_ready failed") and
# run the follow-up udev / kernel disk checks.
# $1 - formatting error lines
function check_partition_failed() {
    local err_lines="$1"
    is_partition_fail=$(echo "$err_lines" | grep -E 'wait_partition_ready failed')
    if [ -n "$is_partition_fail" ]; then
            p_error "创建分区失败，进一步分析问题";
            check_is_udev_failed
            check_kernel_disk_error
            return
    fi
}

# 是否磁盘存在软raid导致的卷操作失败
# Detect a soft-raid error ("has md:") in the volume-operation output.
# Returns 1 when soft raid was detected, 0 otherwise.
# $1 - error lines.  NOTE(review): the caller passes this unquoted, so only
#      the first word arrives here; is_error ignores it and greps the global
#      $res, which masks the problem -- keep the two in sync if either
#      changes.
function check_is_soft_raid() {
    local check_res_input=$1
    soft_raid_error="has md:"
    is_soft_raid=$(is_error "$check_res_input" "$soft_raid_error")
    if [ "x$is_soft_raid" != "x" ]; then
            p_error "磁盘存在软raid导致卷操作失败。请按https://support.sangfor.com.cn/cases/list?product_id=33&type=1&category_id=32960&isOpen=true处理";
            return 1
    fi
    return 0
}

# 检查udev事件服务是否异常。是不是VT的udev事件服务异常
# Detect a broken VT udev event service: an ERROR while loading udev_data
# (JSON parse failure) prevents partition events from being mounted.
function check_is_udev_failed() {
    local uwsgi_dir="/sf/log/$g_day/vs/$g_uwsgi"
    is_udev_failed=$( grep -E 'load udev_data' -nr $uwsgi_dir | grep -E 'ERROR')
    if [ -n "$is_udev_failed" ]; then
            p_error "udev事件的服务异常json解析失败导致分区事件挂载不上，请联系VS和平台一起确认问题后清理一下事件中心的错误";
    fi
}

#检查内核是否有磁盘异常
#vs_cluster_cmd.sh e 'zgrep -E "blk_update_request: critical medium error|blk_update_request: I/O error" /sf/log/'$g_day'/kernel.log'$g_suffix' | grep -vE "nbd" | '"$g_log_filter"' '
#vs_cluster_cmd.sh e 'zgrep -E "wait_for_completion_io_timeout.*dev_name" /sf/log/'$g_day'/kernel.log'$g_suffix' |'"$g_log_filter"''
# Check kernel.log on every host for disk I/O errors and I/O timeouts;
# either one strongly suggests that a partition/format failure was caused by
# a faulty disk.
function check_kernel_disk_error() {
  # blk_update_request I/O errors (nbd devices excluded).
  ioerr_res=$(vs_cluster_cmd.sh e 'zgrep -E "blk_update_request: critical medium error|blk_update_request: I/O error" /sf/log/'$g_day'/kernel.log'$g_suffix' | grep -vE "nbd" | '"$g_log_filter"' ')
  ioerr_fail_res=$(echo "$ioerr_res" |grep "dev")
  if [ "x$ioerr_fail_res" != "x" ]; then
      echo "$ioerr_fail_res"
      p_error "磁盘io异常，请确认创建卷创建分区格式化失败是否是磁盘异常，建议排除磁盘故障后再创建";
#      exit 0
  fi
  # I/O completion timeouts.
  iotimeout_res=$(vs_cluster_cmd.sh e 'zgrep -E "wait_for_completion_io_timeout.*dev_name" /sf/log/'$g_day'/kernel.log'$g_suffix' |'"$g_log_filter"'')
  echo "$iotimeout_res"
  iotimeout_fail_res=$(echo "$iotimeout_res" |grep "/dev")
  if [ "x$iotimeout_fail_res" != "x" ]; then
      p_error "磁盘io超时，请确认创建卷创建分区格式化失败是否是磁盘异常，建议排除磁盘故障后再创建";
  fi
}

# Analyse a volume-creation failure: first rule out storage-network
# problems, then dig into the volume manager's task log for the cause.
function check_create_volume_case() {
    p_info "先检查创建卷网络是否异常"
    local net_step
    for net_step in check_network check_network_status detect_port; do
        "$net_step"
    done
    p_info "在分析创建卷异常原因"
    grep_create_volume_mgr_run_task_error
}

# Check whether formatting the data disk failed by scanning the day's
# vs_diskpart.sh.log for error-level ("E [") records.
# Returns 1 when a failure record is found (and drills into the cause),
# 0 when the format succeeded.
function check_data_format_failed() {
      p_info "开始检查是否是数据盘格式化失败"
      local log_file="/sf/log/$g_day/$g_vs_scritps_log_dir$g_data_format_path"
      local search_pattern="\"E \[\""
      local res
      res=$(grep_logs_by_ops "$log_file" "$search_pattern" "$ops_param")
      if [ -z "$res" ]; then
          p_info "数据盘格式化没有失败"
          return 0
      fi
      p_error "格式化数据盘失败，开始分析具体失败原因："
      p_info "$res"
      check_partition_failed "$res"
      return 1
}

# Check whether formatting the cache disk failed by scanning the day's
# vs_cache_format.sh.log for error-level ("E [") records.  On failure,
# inspect the usual causes: tiering daemon not running, kernel disk errors.
# Returns 1 when a failure record is found, 0 otherwise.
function check_cache_format_failed() {
      p_info "检查是否是缓存盘格式化失败"
      local log_file="/sf/log/$g_day/$g_vs_scritps_log_dir$g_cache_format_path"
      local search_pattern="\"E \[\""
      local res=$(grep_logs_by_ops "$log_file" "$search_pattern" "$ops_param")
      if [ -n "$res" ]; then
        # Fix: record the failure via p_error so it reaches the final
        # summary, consistent with check_data_format_failed.
        p_error "格式化缓存盘失败，开始分析具体失败原因："
        p_info "$res"
        # Was the failure caused by the tiering daemon not running?
        check_tierd_is_run
        check_kernel_disk_error
        return 1
      fi
      # Fix: the original message duplicated "失败" ("缓存盘格式化失败没有失败").
      p_info "缓存盘格式化没有失败"
      return 0
}

# Check whether the tiering daemon (tierd) is running on every host.
# Each host echoes the exit status of "pidof tierd"; after filtering the
# host banner, CMD echo and "0" (running) lines, anything left in $res
# means at least one host is missing the daemon.
function check_tierd_is_run() {
    res=$(vs_cluster_cmd.sh e "pidof tierd 1>/dev/null;echo \$?" |grep -vE "host-|CMD|0")
    if [ ! -z "$res" ]; then
      p_info "分层服务未启动,如果当前环境还没有存储卷可执行：vs_cluster_cmd e 'service_ctrl restart tierd'重试失败再上升研发"
    fi
}

#================================================[end create_volume]===================================================#


#==============================================[start repair]==========================================================#
# Repair the case where the running glusterfs NFS pid differs from the pid
# recorded in /var/run/vs/nfs.pid: toggle user.nfs.switch via setfattr_nfs
# so glusterfs rewrites the pid file, then re-check the consistency.
function repair_glusterfs_pid()
{
    p_trace "修复nfs pid和/var/run/vs/nfs.pid不一致的问题...\n"
    # Everything inside the quoted command must be evaluated on each remote
    # host, hence the escaped backticks and the escaped \$(...).
    vs_cluster_cmd.sh x "[ \`pidof glusterfs\` -ne \`cat \$(ps aux|grep nfs.pid|grep -vE 'supervise|grep'|sed -n 's/.*-p \([^ ]*\).*/\1/p')\` ] && (echo -e  \"\033[5;31mnfs进程pid不一致,尝试修复\033[0m\n\"; timeout -t 5 -s 9 /sf/vs/bin/setfattr_nfs 127.0.0.1 \`/sf/vs/glusterfs/sbin/gluster v list\` -n user.nfs.switch -v "new" -f /);[ 0 == 0 ]"

    p_trace "\n修复完成，查询是否一致："
    # Fix: the $(ps ...) here was previously unescaped, so the pid-file path
    # was expanded on the LOCAL host instead of on each remote host.
    vs_cluster_cmd.sh x "[ \`pidof glusterfs\` -ne \`cat \$(ps aux|grep nfs.pid|grep -vE 'supervise|grep'|sed -n 's/.*-p \([^ ]*\).*/\1/p')\` ] && echo -e  \"\033[5;31mnfs进程pid仍然不一致,请手动处理\033[0m\";[ 0 == 0 ]"
}

# Repair missing/inactive volume groups: show the lvm manager's VG config,
# try one round of VG activation and udev retrigger, then print the VG
# state again so the operator can compare.
function repair_vg()
{
    local lvm_cfg="/sf/cfg/vs/lvmmgr/lvm_mgr_vg.json"

    p_trace "查看lvm配置是否存在磁盘VG:"
    cat "$lvm_cfg"

    # One activation attempt, ignoring lock failures.
    p_trace "\n尝试激活一次VG...\n"
    vgscan --ignorelockingfailure
    vgchange -ay --ignorelockingfailure
    vgchange --refresh
    udevadm trigger

    p_trace "\n再次查看VG信息"
    /sf/vs/bin/vs_vginfo.sh
}

# Clean up leftover single-replica directories that have no gfid in the
# metadata, then remind the operator to verify them on the mount point.
function repair_dir_left()
{
    local cleaner="/sf/vs/lib/python-srv/vs_dts/core/perccenter/brick_offline_clean.py"
    p_trace "尝试清理元数据残留无gfid 的单副本目录"
    vs_cluster_cmd.sh e "/sf/vs/bin/python $cleaner"
    p_trace "清理成功，请挂载点检查被清理的目录是否存在异常"
}

# Dispatch the repair type selected via -t to its repair routine.
# Returns EINVAL when -t was omitted, 1 for an unknown type, 0 on success.
function ops_exec_repair()
{
    if [ -z "$g_type" ]; then
        return $EINVAL
    fi

    case $g_type in
        vg)            repair_vg ;;
        nfspid)        repair_glusterfs_pid ;;
        meta_dir_left) repair_dir_left ;;
        *)             return 1 ;;
    esac
    return 0
}
#====================================================[end repair]======================================================#

# Maintenance-guide subcommand.
function ops_exec_guide()
{
    # TODO: not implemented yet; succeed unconditionally for now.
    return 0
}

# Log-search subcommand.
function ops_exec_search()
{
    # TODO: not implemented yet; succeed unconditionally for now.
    return 0
}

# Map the parsed subcommand (-s) to its handler and propagate its status.
# Returns EINVAL for an unrecognized subcommand.
function ops_exec_subcommand()
{
    local handler=""

    case $g_subcommand in
        check)    handler=ops_exec_check ;;     # environment check
        case)     handler=ops_exec_case ;;      # run maintenance cases
        repair)   handler=ops_exec_repair ;;    # fault repair
        guide)    handler=ops_exec_guide ;;     # maintenance guide
        search)   handler=ops_exec_search ;;    # log search
        blackbox) handler=ops_exec_blackbox ;;  # blackbox log analysis
        sample)   handler=ops_exec_sample ;;    # sampling: tier hit rate, replica layout etc.
        *)        return $EINVAL ;;
    esac

    "$handler"
}

# Summarize this run: dump every collected error/timeout once and print the
# per-case success/failure/timeout counters.
# Returns 1 when at least one case failed, 0 otherwise (including "nothing ran").
function ops_exec_summarize()
{
    local sum=0

    # Fix: timed-out cases count as executed too; previously g_case_timeout
    # was excluded here, so a timeout-only run skipped the summary entirely
    # even though the counters below report timeouts.
    let sum=g_case_failed+g_case_success+g_case_timeout
    if [ $sum -eq 0 ]; then
        return 0
    fi
    echo -e "\n===========================总结=========================="
    # Print every accumulated error in one place.
    p_error "$g_all_error_echo"
    # Print every accumulated timeout in one place.
    p_timeout "$g_all_timeout_echo"
    p_info "本次检查共检测案例数: $(($g_case_failed + $g_case_timeout + $g_case_success)), 
    成功的案例数: $g_case_success 失败的案例数: $g_case_failed 超时的案例数：$g_case_timeout"
    if [ $g_case_failed -gt 0 ]; then
        p_error "存在案例执行失败，请检查失败案例，并根据方案恢复故障!"
        return 1
    fi
    return 0
}

# Print the asan_ops command-line help to stdout.
# %b interprets backslash escapes (\t, \n) exactly like `echo -e`.
function usage()
{
   printf '%b\n' "aSAN operation and maintenance(ops), function: environment check, auto repair fault, maintenance guide etc."
   printf '%b\n' "usage: asan_ops [OPTION]..."
   printf '%b\n' "\t -s OP: [check|repair|guide|case|search]."
   printf '%b\n' "\t -t check: OP[all|network|service|disk|mem|BUG]. eg: asan_ops -s check -t all"
   printf '%b\n' "\t\t  check storage network, check storage service/conf, check A known early warning(BUG) etc."
   printf '%b\n' "\t    repair: OP[vg|nfspid|meta_dir_left]. eg: asan_ops -s repair -t vg"
   printf '%b\n' "\t\t  repair active vg, correction nfs pid/conf not same, repair dir left etc."
   printf '%b\n' "\t    case: OP[slowdisk|storage|latency|data_sync|splitbrain|vm|iscsi|vmfault|databalance|upgrade|two_expand_three|system_hang|vs_dog|bad_sector|remove_tfile|check_tgtd_conn|create_volume|volume_expand|volume_replace]."
   printf '%b\n' "\t\t  eg: asan_ops -s case -t slowdisk."
   printf '%b\n' "\t\t  check specific case, slowdisk, latency, splitbrain etc."
   printf '%b\n' "\t    blackbox: OP[iostat]."
   printf '%b\n' "\t\t  parse specific case, iostat etc."
   printf '%b\n' "\t\t  usage: asan_ops -s blackbox -t iostat help."
   printf '%b\n' "\t    sample: OP[cache|polyhost]. eg: asan_ops -s sample -t cache"
   printf '%b\n' "\t\t  sample cache hit, polyhost/polyrep etc."
   printf '%b\n' "\t -n exec host(node), eg: asan_ops -s case -t slowdisk -n host-b4055d27cb0e,host-b4055d278a6a"
   printf '%b\n' "\t -d [date] search log in some time. eg: asan_ops -s search -d \"2023-06-26,07:37:\""
   printf '%b\n' "\t -i [vmid|iscsid] check vmid. eg: asan_ops -s case -t vm -i 1649206016202"
   printf '%b\n' "\t -l [lines] search log lines eg: asan_ops -s case -t slowdisk -l 1000"
   printf '%b\n' "\t -z search zipped log eg: asan_ops -s case -t slowdisk -l 1000 -z"
   printf '%b\n' "\t -g set a timeout for each detection order. eg: asan_ops -s check -t all -g 20"
   printf '%b\n' "\t -o output the results to a file"
}

# Parse the command line into the g_* globals and derive secondary settings
# (host list, master-host list, log filter, search date).
# Returns 1 (caller prints usage) when no arguments were given or an
# option is invalid/unknown.
function ops_parse_args()
{
    local option=

    if [ $# -lt 1 ]; then
        return 1
    fi

    # Fix: -z is a flag (search rotated/zipped logs, see usage) and must not
    # consume an argument, so it is declared "z", not "z:".  Previously
    # "asan_ops ... -z -l 1000" silently swallowed "-l" as -z's argument.
    while getopts "s:t:n:d:i:hc:l:zg:v:o:" option
    do
        case $option in
            s)
                g_subcommand="$OPTARG"
            ;;
            t)
                g_type="$OPTARG"
            ;;
            n)
                g_hosts="$OPTARG"
                if [ -n "$g_hosts" ]; then
                    # comma-separated host list -> space-separated
                    g_hosts=$(echo $g_hosts | tr -s ',' ' ')
                fi
            ;;
            z)
                # also search compressed rotated logs: e.g. kernel.log*
                g_suffix="*"
            ;;
            l)
                g_lines="$OPTARG"
            ;;
            d)
                # "YYYY-MM-DD,HH:MM:" -> "YYYY-MM-DD HH:MM:" for grep
                g_time=$(echo "$OPTARG" | sed 's/,/ /')
                # day-of-month without leading zero, used in /sf/log/<day>/
                g_day=$(echo "$OPTARG" | awk -F',' '{print $1}'|awk -F'-' '{print $3}'|sed -r 's/0*([0-9])/\1/')
            ;;
            i)
                g_vmid="$OPTARG"
            ;;
            c) # execution count, e.g. iostat/mpstat rounds, network-drop probes
                g_cnt="$OPTARG"
            ;;
            g)
                g_time_limit="$OPTARG"
            ;;
            v)
                g_volume_id="$OPTARG"
            ;;
            o) # output file path
                g_output_file="$OPTARG"
            ;;
            h|* )
                return 1
            ;;
        esac
    done

    # Default host list: every host-* entry from /etc/hosts (minus host-mgr
    # and the header line) plus ourselves.  Fix: g_hosts must be quoted here,
    # it may already hold several space-separated hosts.
    if [ -z "$g_hosts" ]; then
        g_hosts=$(cat /etc/hosts | grep host- | grep host-mgr -v | awk '{print $1}'| sed 1d) 
        g_hosts="$g_hosts $(hostname)"
    fi

    # For aSAN >= 2.8, collect the per-volume master node list.
    # ${g_asan_ver:-0} keeps the test well-formed when the version file is missing.
    if [ -z "$g_master_hosts" ] && [ "${g_asan_ver:-0}" -ge 28 ]; then
        g_master_hosts=$(/sf/vs/bin/vsmgr volume hosts | grep master|awk -F ' ' '{print $1}')
    fi
    
    # Log filter: restrict to the requested time window and cap line count.
    g_log_filter="grep \"$g_time\" | tail -n $g_lines"

    if [ ! -z "$g_time" ]; then
        g_date=$(echo "$g_time" | awk '{print $1}' | tr -d '-') 
    else
        g_date='today'  
    fi

    g_all_cmd_args="$@"
}

# Entry point: parse the arguments, dispatch the subcommand, print the run
# summary and the total elapsed time.
function main()
{
    local rc=0

    # Record the start time.
    start_time=$(date +%s)
    log_info "Call script($0 $@) start."

    # 1. Parse and validate the arguments ("$@" keeps spaced args intact).
    ops_parse_args "$@"
    if [ $? -ne 0 ]; then
        usage
        return 1
    fi

    # 2. Run the subcommand.  Fix: the status must be captured once —
    # the old `if [ $? -ne 0 ]; then if [ $? -eq $EINVAL ]` compared the
    # status of the first test (always 0), so usage was never printed
    # for an invalid subcommand.
    ops_exec_subcommand
    rc=$?
    if [ $rc -ne 0 ]; then
        if [ $rc -eq $EINVAL ]; then
            usage
            return 1
        fi

        #p_error "环境存在异常，请检索asan_ops 执行记录"
        #return 1
    fi

    # 3. Summarize the results of this run.
    ops_exec_summarize
    # Record the end time.
    end_time=$(date +%s)

    # Compute and print the elapsed wall-clock time.
    elapsed_time=$((end_time - start_time))
    echo "Script execution completed in $elapsed_time seconds."
}

# Fix: quote "$@" so arguments containing spaces reach main() unsplit.
main "$@"

