#!/sf/vs/bin/python
# -*- coding:utf-8 -*-
# =============================================================================
# Copyright © 2025 Sangfor Technologies
# All rights reserved.
#
# Author: xujimian - 23247@sangfor.com
#
# Create Time: 2025/1/10 上午10:27
#
# Filename:  disk_fault_tool
#
# Description:
#
# =============================================================================

# Tool for injecting hang (stuck I/O) faults into SCSI disks.

import logging
import argparse
import os
from libcommon.log import logger_init
import subprocess32 as subprocess


logger = logging.getLogger(__name__)

# One row per file written during fault injection:
#   (path, value written to inject the fault, value written to recover).
# A "{}" in a path is a placeholder that inject_fault/clear_fault fill with
# the device name.
_FAULT_TABLE = [
    # disk-hang fault probability
    ("/sys/kernel/debug/fail_io_timeout/probability", 100, 0),
    # number of disk-hang faults
    ("/sys/kernel/debug/fail_io_timeout/times", -1, 0),
    # kernel log printed in concise mode
    ("/sys/kernel/debug/fail_io_timeout/verbose", 1, 2),
    # directory searched for the max_medium_access_timeouts file
    ("/sys/block/{}/device/scsi_disk/", 10000000, 2),
    ("/sys/block/{}/io-timeout-fail", 1, 0),
]

# Files that must be written to inject the fault.
fault_files = [_entry[0] for _entry in _FAULT_TABLE]
# Value for each file above: first item is the fault value, second the
# recovery value.
files_value = [[_entry[1], _entry[2]] for _entry in _FAULT_TABLE]


def run_command(command, timeout=30):
    """Run ``command`` through the shell and return ``(output, exit_code)``.

    Args:
        command: Shell command line; it is executed with ``shell=True``.
        timeout: Seconds to wait before giving up on the command.

    Returns:
        A 2-tuple ``(output, code)``:
          * exit code 0      -> (decoded stdout, 0)
          * non-zero exit    -> (decoded stderr, exit code)
          * timeout expired  -> (None, 143)
          * any other error  -> (str(exception), -1)

    The function never raises; every failure is reported through the tuple.
    """
    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                shell=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return None, 143  # 143 is the sentinel callers check for a timeout
    except Exception as e:
        return str(e), -1

    code = result.returncode
    # Only the stream that is actually returned gets decoded.
    stream = result.stdout if code == 0 else result.stderr
    # Decode leniently: bytes that are not valid UTF-8 must not turn an
    # otherwise successful command into a (-1, error) result.
    return stream.decode('utf-8', 'replace'), code


def get_max_medium_access_timeouts(dir_path):
    """Find the ``max_medium_access_timeouts`` file below ``dir_path``.

    Args:
        dir_path: Directory handed to ``find`` as the search root.

    Returns:
        The path of the first match, without surrounding whitespace, or ""
        when ``find`` fails or reports no match. On failure the exit code and
        error text are printed.
    """
    res, code = run_command("find {} -name max_medium_access_timeouts".format(dir_path))
    if code != 0:
        print("error: [{}], {}".format(code, res))
        return ""

    # find prints one path per line, each terminated by a newline. Callers
    # embed the result in a shell command, so hand back exactly one clean
    # path: extra lines would otherwise be run as separate commands.
    matches = [line.strip() for line in res.splitlines() if line.strip()]
    return matches[0] if matches else ""


def disk_scan():
    """Recover the disks: clear the disk markers, then rescan.

    Runs three shell commands in order through ``run_command``. Their output
    and exit codes are not checked.
    """
    # Clear the disk read-only marker: overwrite the file with a literal "{}".
    reset_never_recovered = "echo {} > /sf/cfg/vs/never_recoverd_disks.json"

    # For every entry under /sf/cfg/vs/disk, call vs_json_rw.py with "-v 0"
    # for the never_recoverd and temporary_remove keys.
    reset_disk_flags = (
        "for disk_file in `ls /sf/cfg/vs/disk`; do "
        "   /sf/vs/bin/vs_json_rw.py -f /sf/cfg/vs/disk/$disk_file -w -k disk_warning_expand.never_recoverd -t int -v 0; "
        "   /sf/vs/bin/vs_json_rw.py -f /sf/cfg/vs/disk/$disk_file -w -k temporary_remove -t int -v 0; "
        "done; "
    )

    rescan = "disk_scan.sh"

    for shell_cmd in (reset_never_recovered, reset_disk_flags, rescan):
        run_command(shell_cmd)


def inject_fault(dev_name):
    """Inject the I/O-timeout fault into disk ``dev_name``.

    Writes the fault value (first item of each ``files_value`` pair) to every
    file listed in ``fault_files``, in order. If a write fails, the files
    already written are set back to their recovery value and the function
    returns without touching the remaining files.

    Args:
        dev_name: Block device name without the ``/dev/`` prefix, e.g. ``sda``.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    if not os.path.exists("/dev/{}".format(dev_name)):
        print("设备不存在！！！")
        return

    # Resolve the per-device paths on a local copy so the module-level
    # templates keep their "{}" placeholders for later calls.
    paths = list(fault_files)
    paths[3] = get_max_medium_access_timeouts(paths[3].format(dev_name))
    if not paths[3]:
        print("get get max_medium_access_timeouts file is error")
        return
    paths[4] = paths[4].format(dev_name)

    applied = []  # (path, recovery value) of every file written so far
    for path, (fault_value, restore_value) in zip(paths, files_value):
        res, code = run_command("echo {} > {}".format(fault_value, path))
        if code != 0:
            print("故障注入失败：[{}] {}".format(code, res))
            # A write failed part-way: roll back the ones that succeeded.
            for done_path, done_restore in applied:
                run_command("echo {} > {}".format(done_restore, done_path))
            return
        applied.append((path, restore_value))

    print("inject fault success")
    logger.info("inject {} fault is success".format(dev_name))


def clear_fault(dev_name):
    """Clear the injected fault, remove the disk and rescan the disk list.

    Writes the recovery value (second item of each ``files_value`` pair) to
    the files listed in ``fault_files``. This is best effort: a failed write
    is printed and the remaining files are still processed. When
    ``/dev/<dev_name>`` does not exist, the two per-device files are skipped
    and the disk is not removed. A disk rescan is always triggered at the end.

    Args:
        dev_name: Block device name without the ``/dev/`` prefix, e.g. ``sda``.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    device_present = os.path.exists("/dev/{}".format(dev_name))

    # Resolve the per-device paths on a local copy so the module-level
    # templates keep their "{}" placeholders for later calls.
    paths = list(fault_files)
    if device_present:
        paths[3] = get_max_medium_access_timeouts(paths[3].format(dev_name))
        if not paths[3]:
            print("get get max_medium_access_timeouts file is error")
        paths[4] = paths[4].format(dev_name)
    else:
        # No device node: there are no per-device files to restore.
        paths[3] = ""
        paths[4] = ""

    for path, (_fault_value, restore_value) in zip(paths, files_value):
        if not path:
            continue
        res, code = run_command("echo {} > {}".format(restore_value, path))
        if code != 0:
            print("恢复故障存在异常：[{}] {}".format(code, res))

    # Remove the disk.
    if device_present:
        logger.info("remove disk: {}".format(dev_name))
        print("remove disk: {}".format(dev_name))
        res, code = run_command("vs_remove_disk.py {}".format(dev_name))
        # 143 is run_command's timeout sentinel and is not reported here.
        if code not in (0, 143):
            print("error: [{}] {}".format(code, res))

    # Re-read the disk list.
    print("故障清理完成，正在重新读取磁盘列表，请耐心等待")
    disk_scan()

    print("clear fault success")


@logger_init()
def main(op, dev):
    """Dispatch the requested operation for device ``dev``.

    Args:
        op: ``"inject"`` runs ``inject_fault``; any other value runs
            ``clear_fault``.
        dev: Device name passed through to the selected function.
    """
    logger.info("op: {}, dev: {}".format(op, dev))
    handler = inject_fault if op == "inject" else clear_fault
    handler(dev)


if __name__ == '__main__':
    # Command line: <op> <dev>, where op is "inject" or "clear".
    cli = argparse.ArgumentParser()
    cli.add_argument(
        'op',
        type=str,
        choices=['inject', 'clear'],
        help="Operation to perform: inject or clear",
    )
    cli.add_argument(
        'dev',
        type=str,
        help="devname, ex: sda",
    )
    options = cli.parse_args()
    main(options.op, options.dev)

