#!/sf/vs/bin/python
# -*- coding:utf-8 -*-
"""
# Copyright @2021 Sangfor Technologies
# All rights reserved.
#
# Author: lss
#
# create: 2024-12-04
#
# Last modified:
#
# Filename: vs_efs_upgrade_handle.py
#
# Description: efs_v1升级到efs_v2版本场景，加载旧位图反序列化逻辑错误使用V2加载(6.10.0引入的问题)
#              导致借分片归还的地址跟数据物理地址重叠。从而导致升级后brick拉不起来
#
# 使用说明：
# 使用该工具之前需要确保上传efs_upgrade工具到/sf/log/目录下，恢复需要借助该工具(md5:9c544ba2167bf8dbaf32fd8bdd460232)
# 工具分为6个步骤:1.检查环境是否是该问题场景; 2.检查异常的brick并再次确认异常brick是由于该问题导致; 3.输出要操作的命令并让用户确认
#                4.异常brick禁用并停止; 5.回滚EFS升级并重新升级逻辑; 6.启用brick恢复
# 第1步 检查环境是否是该问题场景：
    1）VS为3.0.3之前且非3.0.2R2版本升级到3.7.0系列版本(3.8.0版本内已修复)
    2）匹配日志确认是该问题："zmap_page_alloc_blocks_manual.*some blocks in .* of zmap page .* are already allocated"
# 第2步 检查异常的brick并再次确认异常brick是由于该问题导致：
    1）vs_rpc_tool -c clnt --hostname `hostname` --state UNNORMAL
    2）匹配vs_upgrade_efs.sh确认是因为efs_upgrade转换失败，无法启动的："failed: .*, backup path is "
# 第3步 输出要操作的命令并让用户确认
# 第4步 异常brick禁用并停止
# 第5步 回滚EFS升级并重新升级逻辑
# 第6步 启用brick恢复
# 注：确认操作的命令是否正确，可能路径不存在
#
"""
import json
import logging
import os
import sys
import re
import argparse
from libcommon.log import logger_init
from libvs.glusterfs import Glusterfs
from libvs.utils import stddir
from libvs.utils.common import LOCALHOST


# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)
# Log file passed to the efs_upgrade tool through the EFS_LOG_TARGET environment variable.
EFS_UPGRADE_LOG_PATH = "/sf/log/vs/efs_upgrade/efs_upgrade.log"
# Path of the efs_upgrade tool.
efs_upgrade_tool = "/sf/log/efs_upgrade"
# The same log is matched twice (once under "today" and once under the dated
# directory); matches from this path are filtered out with `grep -v` to de-duplicate.
dup_log_path="/sf/log/today/vs/scripts/"


def do_cmd_std(cmd):
    """
    Run a shell command and collect its output and return code.

    :param cmd: command line; it is executed through the shell (shell=True)
    :return: tuple (outmsg, stderr, ret) where
             outmsg is the whole standard output as one stripped string,
             stderr is the list of non-empty standard-error lines,
             ret is the return code of the process
    """
    import subprocess

    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True
    )

    # TD41350: communicate() is used instead of process.wait() because wait()
    # hangs when there are too many virtual machines; the output is held in memory.
    (outmsg, errmsg) = process.communicate()

    # Callers split the output themselves, so stdout is returned as a single
    # stripped string; only stderr is returned as a list of non-empty lines.
    outmsg = outmsg.strip()
    stderr = [errline for errline in errmsg.split('\n') if errline]
    ret = process.returncode
    logger.info("run cmd:%s.ret：%s", cmd, ret)
    return (outmsg, stderr, ret)


def check_upgrade_version():
    """
    Check that the last upgrade went from an efs_v1 version to the 6.10.0 series (efs_v2).

    The version before the upgrade must be in [5.8.6, 6.0.1) and must not be
    5.9.0_R2 or 5.9.0_R5; the version after the upgrade must be in [6.10.0, 6.11.0).
    Upgrade record format:
    2023-05-25 18:57:53 update | [6.0.0_R5-001 build 2020-06-29 03:38:58] ---> [6.10.0_R1-002 build 2024-07-11 03:38:58]

    :return: True when both versions satisfy the conditions, otherwise False
    """
    version_cmd = "head -n1 /sf/version"
    hci_version, _, rc = do_cmd_std(version_cmd)
    logger.info("hci version: %s", hci_version)
    if rc or not hci_version:
        logger.error("run cmd:%s failed or return None.skip handle.", version_cmd)
        return False
    if hci_version.split('_')[0] != "6.10.0":
        logger.info("current version is not 6.10.0X.skip handle.")
        return False

    history_cmd = "grep -w update /boot/firmware/history | tail -n 1"
    history_line, _, rc = do_cmd_std(history_cmd)
    logger.info("latest upgrade history: %s", history_line)
    if rc or not history_line:
        logger.info("run cmd:%s failed or return None.skip handle.", history_cmd)
        return False

    # Whitespace-separated fields: index 4 is "[<old version>-<build no>",
    # index 9 is "[<new version>-<build no>".
    fields = history_line.split()
    old_field = fields[4]
    new_field = fields[9]
    before_version = old_field.lstrip('[').split('-')[0]
    after_version = new_field.lstrip('[').split('-')[0]
    logger.info("before upgrade version: %s, after upgrade version:%s", before_version, after_version)

    from cold_upgrade.cold_upgrade import VersionControl
    version_ctrl = VersionControl()
    after_rules = [
        {"mode": rule_mode, "cold_version": rule_version}
        for rule_mode, rule_version in (("GE", "6.10.0"), ("LT", "6.11.0"))
    ]
    before_rules = [
        {"mode": rule_mode, "cold_version": rule_version}
        for rule_mode, rule_version in (
            ("GE", "5.8.6"),
            ("LT", "6.0.1"),
            ("NE", "5.9.0_R2"),
            ("NE", "5.9.0_R5"),
        )
    ]

    if not version_ctrl.judge_version(after_version, after_rules):
        logger.info("af_ver(%s) is not in [6.10.0, 6.11.0).efs not v2.skip handle.", after_version)
        return False
    if not version_ctrl.judge_version(before_version, before_rules):
        logger.info("bef_ver(%s) is not in [5.8.6,6.0.1) or in [5.9.0_R2,5.9.0_R5].efs not v1.skip handle.", before_version)
        return False

    return True


def check_err_log():
    """
    Check whether the vs_upgrade_efs.sh logs contain one of the known error lines.

    Both the "efs upgrade failed" line and the "lv missed by the efs upgrade"
    line are searched for; one match of either kind is enough.

    :return: True when the grep pipeline succeeds and prints something, otherwise False
    """
    log_glob = "/sf/log/*/vs/scripts/vs_upgrade_efs.sh.log*"
    grep_cmd = "zgrep -E 'file_path.* not exist|failed: .*, backup path is' {}|grep -v {}".format(log_glob, dup_log_path)
    matched, _, rc = do_cmd_std(grep_cmd)
    if not rc and matched:
        return True
    logger.info("run cmd:%s failed or return None.skip handle.", grep_cmd)
    return False


def check_env():
    """
    Check whether this host matches the problem scenario.

    The volume configuration must exist, then two checks are run in order:
    1) the upgrade versions, 2) the error logs. The first check that does not
    pass, or raises, ends the function with False.

    :return: True when every check passes, otherwise False
    """
    if not os.path.exists(stddir.VSD_CFG_VOLUME):
        print("===检测环境不存在卷, 跳过处理")
        return False

    # (check function, message when the check does not pass,
    #  message when the check raises, log format for the exception)
    steps = (
        (
            check_upgrade_version,
            "===检测升级前版本跟升级后版本不符合, 跳过处理",
            "===检测升级前版本跟升级后版本异常, 请人工确认并处理",
            "check upgrade version failed:%s",
        ),
        (
            check_err_log,
            "===检测升级错误日志不符合, 跳过处理",
            "===检测升级错误日志异常, 请人工确认并处理",
            "check efs_upgrade.log err log failed:%s",
        ),
    )
    for run_check, mismatch_msg, error_msg, log_fmt in steps:
        try:
            if not run_check():
                print(mismatch_msg)
                return False
        except Exception as ex:
            print(error_msg)
            logger.info(log_fmt, ex)
            return False

    print("===检测环境升级版本跟日志符合问题场景")
    return True


def get_data_brick_nos():
    """
    Collect the brick numbers of the data bricks reported for the local host.

    :return: list of the 'brick_no' values, e.g. [10, 11, 12]
    """
    local_bricks = Glusterfs().get_data_data_bricks_by_host([LOCALHOST])
    return [item['brick_no'] for item in local_bricks]


def get_unnormal_data_brick():
    """
    Collect the abnormal data bricks of the local host.

    A brick is kept when its status is "UNNORMAL" and its number is one of the
    local data brick numbers. Any exception is logged and reported on stdout;
    whatever was collected up to that point is still returned.

    :return: dict mapping brick number to brick id, e.g. {
        11: "host-005056956551:/sf/data/vs/local/kfdTP3-jIxN-EZG6-XRDh-Pb4i-TXgY-TiPVWT/890d0d10-9e76-11ef-bef5-00505695bd65",
        12: "host-005056956551:/sf/data/vs/local/EsFqKD-1iC4-YwEa-0C13-kDSB-tt7L-DeZluE/8b2e6e9a-9e76-11ef-b9ab-005056956551"
        }
    """
    from vs_rpc.vs_rpc_api import VsRpcApi
    from collections import OrderedDict

    found = {}
    try:
        local_nos = get_data_brick_nos()
        reply = VsRpcApi().vs_get_clnt(LOCALHOST)
        client_dump = json.loads(reply, object_pairs_hook=OrderedDict)

        for entry in client_dump["brick"]:
            number = entry["no"]
            if entry["status"] != "UNNORMAL":
                continue
            if number not in local_nos:
                continue
            found[number] = entry["brickid"]
    except Exception as ex:
        logger.error("get localhost unnormal data brick failed.%s", ex)
        print("===获取异常数据brick出现异常，请人工确认")
    print("===异常数据brick: {}".format(found))
    return found


def match_efs_upgrade_fail_log():
    """
    Search the vs_upgrade_efs.sh logs for "efs_upgrade failed" lines.

    :return: list of matching log lines (empty when the command fails or prints nothing), e.g.
    ["upgrade /dev/EsFqKD-1iC4-YwEa-0C13-kDSB-tt7L-DeZluE/lv_efs failed: .*, backup path is
    /sf/log/vs/efs_upgrade/backup/efs-zmap-EsFqKD-1iC4-YwEa-0C13-kDSB-tt7L-DeZluE-2024-12-04-10-10-43.backup,
    see more details in /sf/log/vs/efs_upgrade/efs_upgrade.log"]
    """
    log_glob = "/sf/log/*/vs/scripts/vs_upgrade_efs.sh.log*"
    grep_cmd = "zgrep 'failed: .*, backup path is ' {} | grep -v {}".format(log_glob, dup_log_path)
    matched, _, rc = do_cmd_std(grep_cmd)
    print("===efs升级错误日志匹配: {}".format(matched))
    if not rc and matched:
        return matched.splitlines()

    logger.warning("run cmd:(%s) failed or return None.ret(%s)", grep_cmd, rc)
    return []


def match_efs_upgrade_miss_log():
    """
    Search the vs_upgrade_efs.sh logs for lines about an lv that the efs upgrade did not handle.

    :return: list of matching log lines (empty when the command fails or prints nothing), e.g.
    ["file_path(/sf/data/vs/local/z3rxqY-ndqj-R5gm-t1X3-5THf-Wp8M-L5vzLZ/ff0df9ac-85e6-11e8-bb0a-20040fe6c4f8/lv_efs) not exist"]
    """
    log_glob = "/sf/log/*/vs/scripts/vs_upgrade_efs.sh.log*"
    grep_cmd = "zgrep 'file_path.* not exist' {} | grep -v {}".format(log_glob, dup_log_path)
    matched, _, rc = do_cmd_std(grep_cmd)
    print("===efs升级漏处理日志匹配: {}".format(matched))
    if not rc and matched:
        return matched.splitlines()

    logger.error("run cmd:(%s) failed or return None.ret(%s)", grep_cmd, rc)
    return []


def get_brick_upgrade_fail_info():
    """
    Collect lv_path and backup_path for efs upgrades that failed inside the brick upgrade logic.

    For every "file_path(...) not exist" line returned by
    match_efs_upgrade_miss_log(), the brick log of that VG/brick is searched
    for the zmap allocation error; when it is present, the backup path is
    taken from the last "loaded, path: [" line of the same brick log.

    :return: list of (lv_path, backup_path) tuples, e.g.
    [(
       "/sf/data/vs/local/z3rxqY-ndqj-R5gm-t1X3-5THf-Wp8M-L5vzLZ/ff0df9ac-85e6-11e8-bb0a-20040fe6c4f8/lv_efs",
        "/sf/log/vs/efs_upgrade/efs-zmap-3f7f4607-dd3a-3f73-0f1a-d9c4bc9b7bfb.backup"
    )]
    This path format appears when vs_efs_upgrade.sh missed the lv and the
    brick upgrade logic upgraded efs by itself.
    """
    brick_upgrade_fail_info = []
    brick_log_path = "/sf/log/*/vs/log/glusterfs/bricks/glusterfsd_sf-data-vs-local-{}*"
    miss_log_pattern = re.compile(r'(/sf/data/vs/local/([^ /]+)/([^ /]+)/lv_efs)')
    backup_path_pattern = re.compile(r'(/sf/log/vs/efs_upgrade/[^ ]+\.backup)')
    brick_failed_pattern = "zmap_page_alloc_blocks_manual.*some blocks in .* of zmap page .* are already allocated"
    backup_path_log = r"loaded, path: \[/sf/log/vs/efs_upgrade"
    # The grep patterns contain spaces and regex metacharacters, so they must be
    # single-quoted; unquoted, the shell would split them and zgrep would treat
    # everything after the first word as file names.
    grep_cmd_fmt = "zgrep '{}' {}|grep -v {}|tail -n 1"

    for line in match_efs_upgrade_miss_log():
        match = miss_log_pattern.search(line)
        if not match:
            continue
        lv_path = match.group(1)
        vg = match.group(2)
        brick_id = match.group(3)
        # The brick log file name ends with "<vg>-<brick id>".
        formatted_brick_log_path = brick_log_path.format("{}-{}".format(vg, brick_id))

        brick_failed_cmd = grep_cmd_fmt.format(brick_failed_pattern, formatted_brick_log_path, dup_log_path)
        res, err, ret = do_cmd_std(brick_failed_cmd)
        print("===brick升级错误日志匹配: {}".format(res))
        if ret or not res:
            logger.error("run cmd:(%s) failed or return None.ret(%s)", brick_failed_cmd, ret)
            continue

        backup_path_cmd = grep_cmd_fmt.format(backup_path_log, formatted_brick_log_path, dup_log_path)
        backup_path_res, err, backup_path_ret = do_cmd_std(backup_path_cmd)
        print("===brick日志获取backup_path: {}".format(backup_path_res))
        if backup_path_ret or not backup_path_res:
            logger.error("run cmd:(%s) failed or return None.ret(%s)", backup_path_cmd, backup_path_ret)
            continue

        match = backup_path_pattern.search(backup_path_res)
        if not match:
            continue
        brick_upgrade_fail_info.append((lv_path, match.group(1)))
    return brick_upgrade_fail_info


def get_efs_upgrade_fail_info():
    """
    Extract lv_path and backup_path from the "efs_upgrade failed" log lines.

    Lines that do not contain both a /dev/.../lv_efs path and a
    /sf/log/vs/efs_upgrade/...backup path are skipped.

    :return: list of (lv_path, backup_path) tuples, e.g.
    [(
        "/dev/EsFqKD-1iC4-YwEa-0C13-kDSB-tt7L-DeZluE/lv_efs",
        "/sf/log/vs/efs_upgrade/backup/efs-zmap-EsFqKD-1iC4-YwEa-0C13-kDSB-tt7L-DeZluE-2024-12-04-10-10-43.backup"
    )]
    """
    line_regex = r'(/dev/[^ ]+/lv_efs).*?(/sf/log/vs/efs_upgrade/[^ ]+\.backup)'
    hits = [re.search(line_regex, entry) for entry in match_efs_upgrade_fail_log()]
    return [(hit.group(1), hit.group(2)) for hit in hits if hit]


def get_fail_brick_lv():
    """
    Build the list of bricks whose efs upgrade has to be retried.

    An entry is produced only when the lv appears in the upgrade failure logs
    AND an abnormal data brick lives on the same VG. Any exception is logged,
    reported on stdout and ends the script with exit code 1.

    :return: list of dicts, e.g. [{
        "brick_no": 11,
        "lv_path": "/dev/EsFqKD-1iC4-YwEa-0C13-kDSB-tt7L-DeZluE/lv_efs",
        "backup_path": "/sf/log/vs/efs_upgrade/backup/efs-zmap-EsFqKD-1iC4-YwEa-0C13-kDSB-tt7L-DeZluE-2024-12-04-10-10-43.backup"
    },
    {
        "brick_no": 10,
        "lv_path": "/sf/data/vs/local/z3rxqY-ndqj-R5gm-t1X3-5THf-Wp8M-L5vzLZ/ff0df9ac-85e6-11e8-bb0a-20040fe6c4f8/lv_efs",
        "backup_path": "/sf/log/vs/efs_upgrade/efs-zmap-3f7f4607-dd3a-3f73-0f1a-d9c4bc9b7bfb.backup"
    }]
    The second format appears when vs_efs_upgrade.sh missed the lv and the
    brick upgrade logic upgraded efs by itself.
    """
    # Position of the VG component in lv_path.split("/"), per supported path prefix.
    vg_position_by_prefix = (("/dev/", -2), ("/sf/data/vs/local/", -3))
    retry_items = []
    try:
        bad_bricks = get_unnormal_data_brick()
        from_efs_upgrade = get_efs_upgrade_fail_info()
        from_brick_upgrade = get_brick_upgrade_fail_info()
        candidates = []
        candidates.extend(from_efs_upgrade)
        candidates.extend(from_brick_upgrade)
        print("===所有匹配到efs升级异常的lv_path跟backup_path:{}".format(candidates))

        for lv_path, backup_path in candidates:
            vg_position = None
            for prefix, position in vg_position_by_prefix:
                if lv_path.startswith(prefix):
                    vg_position = position
                    break
            if vg_position is None:
                print("===lv_path({})的格式不正确，跳过处理，请手动确认.".format(lv_path))
                continue
            fail_vg = lv_path.split("/")[vg_position]

            # Only an lv that failed to upgrade AND belongs to an abnormal brick is handled.
            for brick_no, brick_id in bad_bricks.items():
                if brick_id.split("/")[-2] != fail_vg:
                    continue
                retry_items.append({
                    "brick_no": brick_no,
                    "lv_path": lv_path,
                    "backup_path": backup_path,
                })
                break
            else:
                print("==={}的brick状态正常，跳过处理，请手动确认.".format(lv_path))
    except Exception as ex:
        logger.error("get efs_upgrade_fail_list failed:%s", ex)
        print("===获取efs_upgrade升级失败信息异常，请人工确认并处理.")
        sys.exit(1)

    return retry_items


def disable_brick(brick_no):
    """
    Disable and then stop the service of the given brick through service_ctrl.

    :param brick_no: brick number used in the service name brick-<brick_no>
    :return: True when the command returns 0, otherwise False
    """
    stop_cmd = "service_ctrl disable brick-{0} && service_ctrl stop brick-{0};".format(brick_no)
    _, _, rc = do_cmd_std(stop_cmd)
    print("===停用brick-{}, 结果：{}".format(brick_no, rc))
    if not rc:
        return True
    logger.error("run cmd:%s failed", stop_cmd)
    return False


def enable_brick(brick_no):
    """
    Enable and then start the service of the given brick through service_ctrl.

    :param brick_no: brick number used in the service name brick-<brick_no>
    :return: True when the command returns 0, otherwise False
    """
    start_cmd = "service_ctrl enable brick-{0} && service_ctrl start brick-{0};".format(brick_no)
    _, _, rc = do_cmd_std(start_cmd)
    print("===启用brick-{}, 结果：{}".format(brick_no, rc))
    if not rc:
        return True
    logger.error("run cmd:%s failed", start_cmd)
    return False


def chmod_tool():
    """
    Add the execute permission to the efs_upgrade tool.

    :return: True when chmod returns 0, otherwise False
    """
    chmod_cmd = "chmod +x {}".format(efs_upgrade_tool)
    _, _, rc = do_cmd_std(chmod_cmd)
    if not rc:
        return True
    logger.error("run cmd:%s failed", chmod_cmd)
    return False


def retry_efs_upgrade(lv_path, backup_path, no_check=False):
    """
    Roll the efs upgrade of one lv back from its backup, then upgrade it again.

    Both commands are printed first. Unless no_check is set, the user must
    type "Y" to continue; any other input ends the script with exit code 1.

    :param lv_path: path of the lv, passed to the tool with -p
    :param backup_path: path of the backup, passed to the tool with -b
    :param no_check: skip the interactive confirmation when True
    :return: True when restore and upgrade both return 0, otherwise False
    """
    efs_restore_cmd = "EFS_LOG_TARGET={} {} -i ulvm -p {} -a restore -b {}".format(EFS_UPGRADE_LOG_PATH, efs_upgrade_tool, lv_path, backup_path)
    efs_upgrade_cmd = "EFS_LOG_TARGET={} {} -i ulvm -p {} -a upgrade -b {}".format(EFS_UPGRADE_LOG_PATH, efs_upgrade_tool, lv_path, backup_path)
    print("===将运行命令手工升级efs: {};\n{}".format(efs_restore_cmd, efs_upgrade_cmd))
    if not no_check:
        answer = raw_input("===请输入Y确认执行：")
        if str(answer) != "Y":
            print("输入{}不执行！".format(answer))
            sys.exit(1)

    # Restore runs first; the upgrade is attempted only after a successful restore.
    steps = (
        (efs_restore_cmd, "回滚efs命令: {}, 结果：{}"),
        (efs_upgrade_cmd, "重新升级efs命令: {} 结果：{}"),
    )
    for step_cmd, report_fmt in steps:
        _, _, rc = do_cmd_std(step_cmd)
        print(report_fmt.format(step_cmd, rc))
        if rc:
            logger.error("run cmd:%s failed", step_cmd)
            return False
    return True


def handle_efs_upgrade(efs_upgrade_fail_list, no_check=False):
    """
    Retry the efs upgrade for every entry of efs_upgrade_fail_list.

    For each entry the brick is disabled, the efs upgrade is rolled back and
    re-run, and the brick is enabled again; the first step that fails makes
    the function move on to the next entry. The script exits with code 1 when
    the tool cannot be made executable or when an exception is raised.

    :param efs_upgrade_fail_list: list of dicts with the keys "brick_no", "lv_path" and "backup_path"
    :param no_check: forwarded to retry_efs_upgrade to skip the confirmation
    :return: True when every entry went through all three steps, otherwise False
    """
    finished = 0
    try:
        if not chmod_tool():
            print("===工具添加执行权限失败！")
            sys.exit(1)
        for entry in efs_upgrade_fail_list:
            brick_no = entry["brick_no"]
            lv_path = entry["lv_path"]
            backup_path = entry["backup_path"]
            # Short-circuit evaluation keeps the order: disable, retry, enable.
            if (disable_brick(brick_no)
                    and retry_efs_upgrade(lv_path, backup_path, no_check)
                    and enable_brick(brick_no)):
                finished += 1
    except Exception as ex:
        logger.error("handle_efs_upgrade failed:%s", ex)
        print("===重试efs_upgrade升级失败，请人工确认并处理.升级失败的brick跟lv信息：{}".format(efs_upgrade_fail_list))
        sys.exit(1)

    return len(efs_upgrade_fail_list) == finished


@logger_init()
def main():
    """
    Script entry point: check the environment, find the failed bricks and retry their efs upgrade.

    :return: 0 when every brick was handled successfully, 1 when the environment
             does not match or at least one brick failed
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--no_check", action="store_true", default=False, help="是否需要确认重试efs升级")
    options = arg_parser.parse_args()
    if not check_env():
        return 1

    fail_list = get_fail_brick_lv()
    all_recovered = handle_efs_upgrade(fail_list, options.no_check)
    if not all_recovered:
        print("===脚本执行完成 : 存在某个brick执行失败!")
        return 1
    print("===脚本执行完成 : 所有步骤执行成功!")
    return 0


# Run main() only when executed as a script; its return value becomes the exit code.
if __name__ == '__main__':
    sys.exit(main())