#!/sf/vs/bin/python
# -*- coding: utf-8 -*-
import os
import sys
import re
import json
import logging
import logging.handlers
import psutil
import traceback
import argparse
import subprocess
import shlex
import socket
import time
from subprocess32 import check_call, check_output
import pylib.utils.utiltools as common

# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the interpreter-wide default string encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')


# JSON file read by get_cluster_nodes(); its 'nodelist' entry is returned.
CLUSTER_HOSTS_CFG = '/cfs/.members'
# JSON file read by check_virtual_storage_exists(); its 'volume_id' entry is checked.
VS_VOLUME_CFG = '/sf/cfg/vs/volume.json'


class Error(Exception):
    """Failure raised by this script's command helpers (cli / remote_cli)."""


def get_vs_version():
    """Return the first line of /sf/vs/version without surrounding newlines.

    The line is read by running ``head -n 1`` through a shell, so
    check_output raises if that command exits with a non-zero status.
    """
    first_line = check_output("head -n 1 /sf/vs/version", shell=True)
    return first_line.strip('\n')


def cli(cmdline, split=True):
    """Run a local command (without a shell) and return its decoded stdout.

    Args:
        cmdline: Command line string; it is tokenised with shlex.split.
        split: When True (default) the output is split on '\\n' and returned
            as a list. Note that empty output therefore yields [''] (a
            non-empty list). When False the output is returned as one string.

    Returns:
        A list of lines, or a single string when ``split`` is False.

    Raises:
        Error: if ``cmdline`` is empty, or if the command exits with a
            non-zero status (the message carries the captured stdout/stderr).
    """
    if not cmdline:
        raise Error("cli input cmd is empty")

    common.logger.info('cli {}'.format(cmdline))
    process = subprocess.Popen(shlex.split(cmdline),
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    stdoutdata, stderrdata = process.communicate()
    # Decode once. Undecodable bytes are replaced instead of raising
    # UnicodeDecodeError, which callers catching Error would not handle and
    # which would hide the real command failure.
    stdout_text = stdoutdata.decode('utf-8', 'replace')

    if process.returncode != 0:
        details = ''
        if stdout_text:
            details += 'stdout: {0:s}'.format(stdout_text)
        if stderrdata:
            details += 'stderr: {0:s}'.format(
                stderrdata.decode('utf-8', 'replace'))
        raise Error("Failed to exec {0:s}. {1:s}".format(cmdline, details))

    if split:
        return stdout_text.split('\n')
    return stdout_text


def remote_cli(host, cmdline, is_vt=False):
    """Run ``cmdline`` on ``host`` through /usr/bin/ssh as root.

    Args:
        host: Target host passed to ssh.
        cmdline: Command text appended to the ssh invocation.
        is_vt: When True and the VS version is not below '3.5.0', ssh uses
            port 22346 and the command is wrapped in single quotes.

    Returns:
        The list of output lines produced by cli(), or the empty string ""
        when cli() raised Error (the error is only logged).

    Raises:
        Error: if ``host`` or ``cmdline`` is empty.
    """
    if not (host and cmdline):
        raise Error("remote cli input host or cmd is empty")

    # NOTE(review): version strings are compared lexicographically.
    is_legacy = get_vs_version() < '3.5.0'
    if is_vt and not is_legacy:
        template = "/usr/bin/ssh -p 22346 root@{0:s} '{1:s}'"
    else:
        template = "/usr/bin/ssh root@{0:s} {1:s}"
    ssh_cmdline = template.format(host, cmdline)

    try:
        output = cli(ssh_cmdline)
    except Error as e:
        common.logger.info("Error executing remote command: {}".format(e))
        # Report failure as an empty string instead of propagating the error.
        output = ""
    return output


'''
{
"nodeid": "1452653589",
"nodename": "host-00505695bc15",
"version": 6,
"uptime": 122,
"stable_version": 1063145100,
"cluster": { "name": "Sangfor-VTP", "version": 0, "nodes": 2, "quorate": 1, "synced_flag": 3, "synced": 1},
"nodelist": {
  "host-00505695bc15": { "id": 1452653589, "online": 1, "ip": "10.175.129.41"},
  "host-005056950f6a": { "id": 1452609386, "online": 1, "ip": "10.175.129.42"}
  },
"trusted_group": 1
}
'''
def get_cluster_nodes():
    """Return the 'nodelist' entry of the JSON document at CLUSTER_HOSTS_CFG."""
    with open(CLUSTER_HOSTS_CFG, 'r') as members_file:
        members = json.load(members_file)
    return members['nodelist']


# The environment may contain several volumes, so try hosts until one succeeds.
# vs 3.0.5 and later have snapshots, so the --base argument must be added.
def get_base_gfid_vs30(file):
    """Look up the gfid of ``file`` with vs_quick_lookup.sh on cluster nodes.

    Each online node is asked in turn. The first output line that contains
    both "host-" and "/sf/data/vs/local" is used, and the text after its last
    '/' is returned as the gfid.

    Returns:
        The gfid string, or None (after printing a message) when no node
        produced a usable line.
    """
    # NOTE(review): version strings are compared lexicographically.
    if get_vs_version() >= '3.0.5':
        template = "\"/sf/vs/bin/vs_quick_lookup.sh --base {} |head -n 1\""
    else:
        template = "\"/sf/vs/bin/vs_quick_lookup.sh {} |head -n 1\""
    cmd_lookup = template.format(file)

    nodes = get_cluster_nodes()
    for node_name, node_info in nodes.items():
        if node_info["online"] == 0:
            print(common.Colored().red("node: {} 主机状态异常,请检查是否已经离线".format(node_name)))
            continue
        for line in remote_cli(node_name, cmd_lookup):
            if not line or "host-" not in line or "/sf/data/vs/local" not in line:
                continue
            pieces = line.rsplit('/', 1)  # split once, from the right
            if len(pieces) <= 1:
                continue
            gfid = pieces[-1]  # last path component
            common.logger.info("文件:{} gfid:{}".format(file, gfid))
            return gfid
    print(common.Colored().red("没有找到文件对应的gfid,请确认文件是否存在,文件路径:{}".format(file)))
    return None


'''
vs_quick_lookup.sh /sf/data/vs/gfs/d115069a_vs_vol_rep2/images/host-005056956533/centos_auto_test.vm/vm-disk-1.qcow2
host-005056959419:/sf/data/vs/local/r70m89-xxx/xxx/images/host-005056956533/centos_auto_test.vm/vm-disk-1.qcow2
host-005056956533:/sf/data/vs/local/Vm2HJm-xxx/xxx/images/host-005056956533/centos_auto_test.vm/vm-disk-1.qcow2
trusted.gfid=0x41d849e0521b42ed9d63a5e63eda515e
直到一个主机能找到gfid为止
'''
def get_base_gfid_vs28(file):
    """Look up the gfid of ``file`` via vs_quick_lookup.sh plus getfattr.

    Returns None immediately when the VS version compares greater than '3.0'.
    Otherwise each online node runs the lookup; for every output line of the
    form ``host-...:/sf/data/vs/local/...`` the named host is asked for the
    file's extended attributes, and the 8 characters following '0x' on the
    first "trusted.gfid" line are returned.

    Returns:
        The 8-character gfid prefix, or None (after printing a message) when
        nothing was found.
    """
    # NOTE(review): version strings are compared lexicographically.
    if get_vs_version() > '3.0':
        return None

    cmd_lookup = "\"/sf/vs/bin/vs_quick_lookup.sh {} |head -n 1\"".format(file)
    nodes = get_cluster_nodes()
    for node_name, node_info in nodes.items():
        if node_info["online"] == 0:
            print(common.Colored().red("node: {} 主机状态异常,请检查是否已经离线".format(node_name)))
            continue

        for line in remote_cli(node_name, cmd_lookup):
            if not (line and "host-" in line and "/sf/data/vs/local" in line):
                continue
            replica_host, replica_path = line.split(":", 1)
            xattr_lines = remote_cli(
                replica_host,
                "/sf/vs/bin/getfattr -d -m . -e hex {}".format(replica_path))
            # An empty result simply yields no iterations, so the next lookup
            # line is tried.
            for xattr in xattr_lines:
                if "trusted.gfid" not in xattr:
                    continue
                hex_start = xattr.find('0x') + 2
                gfid = xattr[hex_start:hex_start + 8]
                common.logger.info("文件:{} gfid:{}".format(file, gfid))
                return gfid
    print(common.Colored().red("没有找到文件对应的gfid,请确认文件是否存在,文件路径:{}".format(file)))
    return None


def get_base_gfid(file):
    """Resolve the gfid of ``file`` using the lookup that matches the VS version.

    Versions comparing below '3.0' use get_base_gfid_vs28; all others use
    get_base_gfid_vs30.
    """
    # NOTE(review): version strings are compared lexicographically.
    resolver = get_base_gfid_vs28 if get_vs_version() < '3.0' else get_base_gfid_vs30
    return resolver(file)


def get_nfs_path(result):
    """Extract the image path from a "Failed to lock file" log line.

    Args:
        result: Iterable of log lines (strings).

    Returns:
        For the first line that contains both "Failed to lock file" and
        "vm-disk-" and embeds ``nfs://<host>/<path>.qcow2``, the string
        ``/<path>.qcow2`` (the scheme and host part removed). None when no
        line qualifies.
    """
    nfs_pattern = re.compile(r"nfs://([^,]+\.qcow2)")
    for line in result:
        if "Failed to lock file" not in line or "vm-disk-" not in line:
            continue
        match = nfs_pattern.search(line)
        if not match:
            continue
        # Drop everything up to the first '/', i.e. the host part.
        _host, slash, remainder = match.group(1).partition('/')
        if not slash:
            # No path component after the host: skip this line instead of
            # failing with IndexError.
            continue
        return '/' + remainder
    return None

def get_vm_lock_failed_images(vmid):
    """Find the image file whose lock failed according to the VM's qemu log.

    Every online node is checked for /sf/log/today/sfvt_qemu_<vmid>.log; on
    nodes where ``ls`` succeeds, the last matching "Failed to lock file" line
    is fetched and parsed with get_nfs_path().

    Returns:
        The image path prefixed with /sf/data/vs/gfs, or None (after printing
        a message) when no node yields one.
    """
    vm_log_path = "/sf/log/today/sfvt_qemu_{}.log".format(vmid)
    exists_cmd = "ls -l {}".format(vm_log_path)
    grep_cmd = "\"grep -i 'Failed to lock file' {} | grep 'ret = -11' |grep kvm | grep -v grep | tail -n 1\"".format(vm_log_path)

    nodes = get_cluster_nodes()
    for node_name, node_info in nodes.items():
        if node_info["online"] == 0:
            print(common.Colored().red("node: {} 主机状态异常,请检查是否已经离线".format(node_name)))
            continue

        # Skip nodes where the log file cannot be listed.
        if not remote_cli(node_name, exists_cmd):
            continue
        matches = remote_cli(node_name, grep_cmd)
        if not matches:
            continue
        nfs_file = get_nfs_path(matches)
        if not nfs_file:
            continue
        image_file = "/sf/data/vs/gfs{}".format(nfs_file)
        common.logger.info("虚拟机:{} 开机加锁失败文件:{}".format(vmid, image_file))
        return image_file
    print(common.Colored().red("没有找到虚拟机加锁失败的日志,请确认虚拟机启动报错提示为[虚拟机镜像忙,正在执行其他操作]"))
    return None


def check_vs_failed_log(gfid):
    """Check whether any online node logged an afr_lk_cbk failure for ``gfid``.

    Each online node greps its glusterfs_nfs logs for ``gfid`` together with
    'afr_lk_cbk' and 'op_errno=11'.

    Returns:
        True as soon as one node returns a non-blank matching line, otherwise
        False (after printing a message).
    """
    cmd = "\"zgrep {} /sf/log/today/vs/log/glusterfs/glusterfs_nfs.log* |grep afr_lk_cbk |grep 'op_errno=11' |tail -n 1\"".format(gfid)
    nodes_cfg = get_cluster_nodes()
    for node_name, node_info in nodes_cfg.items():
        if node_info["online"] == 0:
            print(common.Colored().red("node: {} 主机状态异常,请检查是否已经离线".format(node_name)))
            continue
        result = remote_cli(node_name, cmd)
        # The pipeline ends in `tail`, so it succeeds even without a match and
        # cli() then returns [''] -- a truthy list. Only non-blank lines count
        # as evidence.
        matched = [line for line in result if line.strip()]
        if matched:
            common.logger.info("检测出主机:{} 有加锁失败日志,日志: {}".format(node_name, matched))
            return True
    print(common.Colored().red("nfs日志没有找到虚拟机加锁失败的日志,请确认虚拟机启动报错提示为[虚拟机镜像忙,正在执行其他操作]"))
    return False


def check_vm_non_multi_instances(vmid):
    """Verify that no online node has a kvm process matching ``vmid``.

    Returns:
        False (after printing guidance) as soon as one node reports such a
        process, True when none does.
    """
    ps_cmd = "ps auxf |grep {} |grep kvm |grep -v grep".format(vmid)
    nodes = get_cluster_nodes()
    for node_name, node_info in nodes.items():
        if node_info["online"] == 0:
            print(common.Colored().red("node: {} 主机状态异常,请检查是否已经离线".format(node_name)))
            continue
        if not remote_cli(node_name, ps_cmd):
            continue
        print(common.Colored().red("虚拟机:{} 正在节点:{} 上运行,请检查页面虚拟机是否已经启动成功了".format(vmid, node_name)))
        print(common.Colored().red("如果页面上显示虚拟机还是关机状态,请联系VT研发技术支持,处理虚拟机多实例运行问题"))
        return False
    common.logger.info("vmid: {} 虚拟机不存在多实例运行,继续检测...".format(vmid))
    return True


def check_vm_exists(vmid):
    """Return True when ``/sf/bin/qm-c list`` on 127.0.0.1 mentions ``vmid``.

    The command is run through remote_cli with is_vt=True. When it yields
    nothing, a message is printed and False is returned.
    """
    listing = remote_cli('127.0.0.1',
                         "/sf/bin/qm-c list | grep -w {}".format(vmid),
                         True)
    if not listing:
        print(common.Colored().red("输入的虚拟机不存在,请检查vmid:{} 输入是否有误".format(vmid)))
        return False
    return True


def check_virtual_storage_exists():
    """Return True when VS_VOLUME_CFG exists and holds a truthy 'volume_id'.

    In every other case (file missing, key missing, or value empty) a message
    is printed and False is returned. Malformed JSON still raises from
    json.load.
    """
    if os.path.exists(VS_VOLUME_CFG):
        with open(VS_VOLUME_CFG, 'r') as cfg_file:
            vs_volume_cfg = json.load(cfg_file)
        # .get(): a config without the key is reported like an empty one
        # instead of crashing with KeyError.
        if vs_volume_cfg.get("volume_id"):
            return True
    print(common.Colored().red("不存在虚拟存储或者该节点不是存储节点,请在存储节点运行"))
    return False


def check_vs_version():
    """Return True when the VS version is within the range this tool handles.

    Versions comparing >= '3.7.0' or < '2.8' are rejected: an explanation and
    the current version are printed, and False is returned.
    """
    vs_version = get_vs_version()
    # NOTE(review): version strings are compared lexicographically.
    if vs_version >= '3.7.0':
        rejection = "HCI6.10.0以及以上版本,虚拟机开机提示[虚拟机镜像忙,正在执行其他操作]的问题需要上升VS研发排查"
    elif vs_version < "2.8":
        rejection = "vs2.8以前版本没有nfs协议,请上升VS研发技术支持处理"
    else:
        return True

    print(common.Colored().red(rejection))
    print(common.Colored().red("当前环境VS版本: {}".format(vs_version)))
    return False


def restart_nfs_server(host):
    """Run the version-specific storage-client restart command on ``host``.

    Returns:
        False when the VS version falls outside ['2.8.0', '3.7.0') or when
        remote_cli reports failure (empty result); True otherwise.
    """
    print(common.Colored().fuchsia("正在重启主机:{}虚拟存储客户端,请稍后....".format(host)))
    vs_version = get_vs_version()
    # NOTE(review): version strings are compared lexicographically.
    if "2.8.0" <= vs_version < '3.0.5':
        cmd = "\"killall -9 glusterfs\""
    elif "3.0.5" <= vs_version < '3.5.0':
        cmd = "\"/sf/vs/bin/vs_update_nfs.sh\""
    elif "3.5.0" <= vs_version < '3.7.0':
        cmd = "\"/sf/vs/bin/vs_update_nfs.sh restart recover\""
    else:
        return False

    # remote_cli yields "" only when the command failed; on success it is a
    # non-empty list even if the command printed nothing.
    return bool(remote_cli(host, cmd))


def handle_nfs_lock(gfid):
    """Locate the node whose NFS statedump mentions ``gfid`` and restart it.

    Each online node runs vs_nfs_statedump.sh filtered by ``gfid``; the first
    node whose output contains the gfid is selected. The operator must then
    confirm with 'y' on stdin before restart_nfs_server() is called.

    Returns:
        True when the restart succeeded. False when no node holds the lock,
        when stdin reaches end-of-file before confirmation, or when the
        restart failed.
    """
    # Only one host is expected to produce output.
    cmd = "\"/sf/vs/bin/vs_nfs_statedump.sh |grep -C 8 {}\"".format(gfid)
    nodes_cfg = get_cluster_nodes()
    need_restart_nfs_node = ""
    for node_name in nodes_cfg:
        if nodes_cfg[node_name]["online"] == 0:
            # Format the message before colouring it, as the rest of the file does.
            print(common.Colored().red("node: {} 主机状态异常,请检查是否已经离线".format(node_name)))
            continue
        result = remote_cli(node_name, cmd)
        common.logger.info("host:{} cmd: {} result: {}".format(node_name, cmd, result))
        if any(line and gfid in line for line in result):
            need_restart_nfs_node = node_name
            break

    if not need_restart_nfs_node:
        print(common.Colored().red("没有检测到锁泄漏问题,请尝试重新开启虚拟机"))
        # Explicit False keeps the return type consistent with the other paths.
        return False

    print(common.Colored().fuchsia("检测到主机:{} 存在锁泄漏问题,导致虚拟机开机失败[虚拟机镜像忙,正在执行其他操作]".format(need_restart_nfs_node)))
    print(common.Colored().fuchsia("临时方案:重启主机:{} 上的存储客户端服务,请与客户沟通影响,会影响该主机上的虚拟机IO,大约卡住3-30秒之后自动恢复".format(need_restart_nfs_node)))
    while True:
        print(common.Colored().fuchsia("输入y确认重启主机:{} 上的存储客户端服务, 如果要退出工具请输入ctrl+c".format(need_restart_nfs_node)))
        answer = sys.stdin.readline()
        if not answer:
            # readline() returns '' at end-of-file; without this check the
            # loop would spin forever when stdin is closed.
            common.logger.info("stdin closed before restart was confirmed, aborting")
            return False
        if answer.strip().lower() == 'y':
            break
        print(common.Colored().fuchsia("输入字符错误,请重新输入"))

    if not restart_nfs_server(need_restart_nfs_node):
        print(common.Colored().red("重启主机:{} 上的存储客户端服务失败,请联系VS研发技术支持处理".format(need_restart_nfs_node)))
        return False
    print(common.Colored().fuchsia("已经重启主机:{} 上的存储客户端服务,请稍等3-30秒之后再尝试重新开虚拟机".format(need_restart_nfs_node)))
    return True


def check_vm_and_handle_lock(vmid):
    """Run the full diagnosis for ``vmid`` and handle a leaked lock if found.

    The steps run in order and stop at the first failure: storage present,
    VM exists, no running instance, supported VS version, lock-failed image
    located, gfid resolved, failure confirmed in the NFS log.

    Returns:
        False when any step fails, otherwise the result of handle_nfs_lock().
    """
    # `and` short-circuits, so a later check only runs if the earlier passed.
    preconditions_ok = (check_virtual_storage_exists()
                        and check_vm_exists(vmid)
                        and check_vm_non_multi_instances(vmid)
                        and check_vs_version())
    if not preconditions_ok:
        return False

    image_file = get_vm_lock_failed_images(vmid)
    if not image_file:
        return False

    gfid = get_base_gfid(image_file)
    if not gfid:
        return False

    if check_vs_failed_log(gfid):
        return handle_nfs_lock(gfid)
    return False


def _check_vm_and_handle_lock(vmid):
    """Run check_vm_and_handle_lock() inside a common.VsfireFlock context.

    The lock file path comes from common.get_vsfire_lock_file(). A closing
    message is printed before the result is returned.
    """
    with common.VsfireFlock(common.get_vsfire_lock_file()):
        outcome = check_vm_and_handle_lock(vmid)
        print(common.Colored().fuchsia("执行结束"))
        return outcome