#!/sf/vs/bin/python
# -*- coding:utf-8 -*-
"""
# Copyright @2021 Sangfor Technologies
# All rights reserved.
#
# Author: hwh
#
# create: 2021-1-11
#
# Last modified:
#
# Filename: vs_hot_add_ssd.py
#
# Description:
#
# 使用说明：
# 进入维护模式前请手动把虚拟机迁走
# 工具分为5个步骤:1.扩容缓存盘到新磁盘组; 2.进入维护模式; 3.给新磁盘组分配卷内的数据盘; 4.退出维护模式
# 第1步 会列出环境下空闲的ssd，需要用户手动输入，一次只能扩一块ssd
# 第2步 进入维护模式会等待平衡和重建任务结束，可以提前在页面确认。
# 第3步 列出当前主机上其他磁盘组的信息，请根据提示手动分配1个以上的数据盘到新的磁盘组中
# 第4步 退出维护模式，扩容完成
# 注：老版本配置了聚合口可能会导致退出维护模式失败，详见td http://td.sangfor.com/#/defect/details/2020120900260
#
"""
import copy
import csv
import hashlib
import json
import logging
import os
import shutil
import subprocess
import sys
import time

import StringIO
import requests
import zkapi.zk_op as zk
from libcelery.rpc_call import remote_check_output
from libcommon.config import Config
from libcommon.libconf import VolumeConf, LocalDiskConf
from libcommon.log import logger_init
from libcommon.singleproc import SingleProc
from libvs.glusterfs import Glusterfs
from libvs.glusterfs.vs_gluster import volume_mount_point_op
from libvs.utils.common import LOCALHOST, get_vs_version, get_vol_master
from libvs.volume.common import get_localhost_volume_id
from libvs.volume.common import get_localhost_volume_master
from maintain.lib.protect_precheck import check_brick_status
from maintain.lib.server_manage import wait_wcache_tier
from vs_dts.lib.module_conf import DTS_COMPRESS_ON
from zkapi.zk_op import zk_op

###################################################################
# Configuration touched by the SSD hot-expand  -  backup location
# /sf/cfg/vs/disk/*.json      before expanding: /sf/cfg/vs/bak_expand_disk/, after expanding and before modifying disk groups: /sf/cfg/vs/bak_modify_disk/
# /volumes/$volume_id/vs_vol_diskmap.json       /sf/cfg/vs/vs_vol_diskmap.json.bak
# /sf/cfg/vs/cache/wcache.json /sf/cfg/vs/cache/wcache.json.bak
#
###################################################################

# Configuration file paths
EDS_VERSION = "/sf/vs/version"
DISK_PATH = "/sf/cfg/vs/disk"
# NOTE(review): this template starts with a space; confirm zk_op tolerates it,
# otherwise reads/writes of the volume disk map will use a malformed path.
DISK_MAP_PATH = " /volumes/{}/vs_vol_diskmap.json"
WCACHE_CFG_PATH = "/sf/cfg/vs/cache/wcache.json"
TIER_CFG_PATH = "/sf/cfg/vs/cache/tier.json"
# Formatted with (host name, disk group id)
HOST_GROUP_INFO = "/nodes/{}/diskgroups/{}"

# Backup of the disk configs taken before the expand operation
DISK_EXPAND_BAK_PATH = "/sf/cfg/vs/bak_expand_disk/"
# Backup of the disk configs taken before the disk-group modification
DISK_MODIFY_BAK_PATH = "/sf/cfg/vs/bak_modify_disk/"
# File holding the expand request that is handed to `vsmgr volume expand-volume`
EXPAND_OP_DATA = "/sf/cfg/vs/hot_add_expand_data"

# File in which the per-phase status of this tool is persisted
EXPAND_CFG_PATH = '/sf/log/vs/ssd_hot_expand.json'

# Phase status values
STRING_STATUS = 'status'
STATUS_NONE = 'none'
STATUS_DOING = 'doing'
STATUS_SUCC = 'success'
STATUS_FAIL = 'fail'

# Phase names
PHASE_ADD_SSD = 'add_ssd'
PHASE_ENTRY_PRO = 'entry_protect'
PHASE_KICKOUT_SSD = 'kickout_ssd_data'
PHASE_MOD_CFG = 'modify_cfg'
PHASE_LEAVE_PRO = 'leave_protect'

# All phases; note that 'wait_brick_normal' has no PHASE_* constant above.
PHASE_NAME = ['add_ssd', 'entry_protect', 'kickout_ssd_data', 'modify_cfg', 'wait_brick_normal', 'leave_protect']

logger = logging.getLogger(__name__)

# Id of the volume being expanded; assigned in execute_add_ssd() and read by
# the other phases through `global`.
task_volume_id = ""

# Expected MD5 digest per helper file (presumably checked with verify_md5();
# the caller is outside this view).
file_md5_map = {
    "/sf/vs/bin/vs_wait_wcache_tier.sh": "b833577acfe25fc95f74ca9acaa1d6a4",
    "/sf/vs/bin/update_disk_group.py": "2e513799da61c3762f13c30d8dd0bf7c"
}

# Number of polling attempts used by execute_wait_brick_normal()
RETRY_TIME = 30

def calculate_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte pieces, so
    it is never held in memory as a whole.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            piece = stream.read(4096)
            if piece == b"":
                break
            digest.update(piece)
    return digest.hexdigest()


def verify_md5(file_path, expected_md5):
    """Tell whether the MD5 digest of ``file_path`` equals ``expected_md5``.

    :param file_path: path of the file to hash
    :param expected_md5: expected hexadecimal digest
    :return: True when the digests are equal, False otherwise
    """
    return calculate_md5(file_path) == expected_md5


def do_cmd_std(cmd):
    """
    Run ``cmd`` through the shell and collect its result.

    :param cmd: shell command line
    :return: tuple ``(outmsg, stderr, ret)`` where ``outmsg`` is the whole
             standard output as one stripped string, ``stderr`` is the list
             of non-empty standard-error lines and ``ret`` is the exit status
    """
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True
    )

    # TD41350: communicate() is used instead of wait() because wait() can
    # hang when the command produces a lot of output (many VMs); the output
    # is buffered in memory instead.
    (outmsg, errmsg) = process.communicate()

    # Pipes deliver bytes on interpreters where str is a text type; decode
    # them so the string handling below works there as well.
    if not isinstance(outmsg, str):
        outmsg = outmsg.decode('utf-8', 'replace')
    if not isinstance(errmsg, str):
        errmsg = errmsg.decode('utf-8', 'replace')

    outmsg = outmsg.strip()
    stderr = [errline for errline in errmsg.split('\n') if errline]

    return (outmsg, stderr, process.returncode)


def wait_task_finish(master, task_id):
    """
    Poll the expand-volume task file on ``master`` until the task succeeds.

    Every 30 seconds the task file is copied from ``master`` to /tmp with scp
    and loaded; its ``progress_info`` is logged.

    :param master: host from which the task file is copied
    :param task_id: id used to build the task file name
    :return: True once the task status is 'vs_success'; False after 10
             failed attempts to fetch or read the task file
    """
    global task_volume_id

    logger.info("execute expand disk, plz wait... master is {}, task id is {}".format(master, task_id))
    cmd = 'scp {}:/sf/cfg/vs/task/{}_expand_volume.json  /tmp/ '.format(master, task_id)
    fail_flag = 0
    while True:
        try:
            res, err, ret = do_cmd_std(cmd)
            if ret:
                raise Exception("run cmd {} error, reason {}".format(cmd, err))
            # cfg_load() returns None on failure; the subscript below then
            # raises and is counted as one failed attempt.
            conf = cfg_load("/tmp/{}_expand_volume.json".format(task_id))

            # Report the progress
            logger.info("task_info: {}".format(conf['progress_info']))

            if conf['progress_info']['status'] == 'vs_success':
                print("task finished")
                # NOTE(review): this compares the whole progress_info dict
                # with 100, which is always unequal; a progress field was
                # presumably intended - confirm the key name.
                if conf['progress_info'] != 100:
                    res, err, ret = do_cmd_std('vsmgr volume set-volume-task-success {}'.format(task_volume_id))
                    if ret:
                        logger.warning("set expand task success failed, reason {}".format(err))
                time.sleep(60)
                break
            if conf['progress_info']['status'] == 'failed':
                # The loop keeps polling after a failure (the early return is
                # commented out), so a retry from the web page can still be
                # picked up; fail_flag is not increased on this path.
                print("task is failed, reason: {}. please retry on the webpage".format(conf['progress_info']['message']))
                # return False
        except Exception as ex:
            logger.warning('load task conf error, reason:{}'.format(ex))
            fail_flag = fail_flag + 1
        if fail_flag >= 10:
            return False
        time.sleep(30)

    return True


def get_all_files(dir_path, file_list):
    """
    Collect the files located directly under a directory.

    Sub-directories are skipped and not descended into.

    :param dir_path: directory to list
    :param file_list: list that receives the joined path of every
                      non-directory entry (extended in place)
    :return: None
    """
    if not os.path.exists(dir_path):
        logger.debug("dir path:{} not exit".format(dir_path))
        return
    candidates = (os.path.join(dir_path, entry) for entry in os.listdir(dir_path))
    file_list.extend(path for path in candidates if not os.path.isdir(path))

def copy_dir(src, dst):
    """
    Move the files located directly under ``src`` into ``dst``.

    Each file is copied to ``dst`` and then deleted from ``src``; afterwards
    ``src`` itself is removed with os.removedirs().

    :param src: directory whose files are moved
    :param dst: destination directory
    """
    pending = []
    get_all_files(src, pending)
    for path in pending:
        shutil.copy(path, dst)
        os.remove(path)

    os.removedirs(src)


def cfg_save(obj, path):
    """
    Write a JSON-serializable object to a configuration file.

    The content is written to a temporary sibling file and then renamed over
    ``path`` so a reader never sees a half-written file.  The temporary file
    is ``<path>.tmp``: ``<path>.bak`` must not be used for staging because
    deal_with_wcache_and_tier() keeps its backup of wcache.json under that
    name before saving the same path, and staging through it would destroy
    the backup.

    :param obj: object to serialize
    :param path: destination file
    """
    tmpfile = path + '.tmp'
    with open(tmpfile, 'w') as f:
        json.dump(obj, f, indent=4)
        # Make sure the data is on disk before it replaces the old file
        f.flush()
        os.fsync(f.fileno())

    os.rename(tmpfile, path)


def cfg_load(path):
    """
    Parse a JSON configuration file.

    :param path: file to read
    :return: the decoded object, or None when the file cannot be opened or
             parsed (the error is logged)
    """
    try:
        # open() inside a with-block so the handle is always closed; the
        # former file(path) call left it open.
        with open(path) as f:
            return json.load(f)
    except Exception as exp:
        logger.error("Failure to load file, error is {}".format(str(exp)))
        return None

def create_emtpy_cfg():
    """
    Create the phase-status file with every phase in state STATUS_NONE.

    :return: the dict that was written to EXPAND_CFG_PATH, keyed by phase name
    """
    dict_cfg = {}
    for item in PHASE_NAME:
        # One status dict per phase: sharing a single dict would make
        # save_phase_status() on one phase change all of them.
        dict_cfg[item] = {STRING_STATUS: STATUS_NONE}

    logger.info("gen new dict_cfg %s", dict_cfg)
    cfg_save(dict_cfg, EXPAND_CFG_PATH)

    return dict_cfg

def save_phase_status(cfg, obj, value):
    """
    Record a new status for one phase and persist the whole configuration.

    :param cfg: full configuration, written to EXPAND_CFG_PATH
    :param obj: the phase entry whose status is updated
    :param value: new status value
    """
    obj.update({STRING_STATUS: value})
    cfg_save(cfg, EXPAND_CFG_PATH)


def bak_conf(src, dst):
    """
    Back up directory ``src`` to ``dst``, replacing any previous backup.

    :param src: directory to copy
    :param dst: backup directory; an existing one is deleted first
    """
    if os.path.exists(dst):
        # A previous backup normally still contains files; os.removedirs()
        # only deletes empty directories and would raise here.
        shutil.rmtree(dst)

    shutil.copytree(src, dst)


def _version_key(version):
    """Return the leading dotted-numeric part of ``version`` as a list of ints."""
    parts = []
    for piece in version.strip().split('.'):
        digits = ''
        for ch in piece:
            if not ch.isdigit():
                break
            digits += ch
        if not digits:
            break
        parts.append(int(digits))
        if len(digits) != len(piece):
            # A non-numeric suffix ends the numeric prefix
            break
    return parts


def judge_vs_version(tar_version):
    """
    Tell whether the installed version is at least ``tar_version``.

    Versions are compared component by component as integers, so '3.0.10' is
    newer than '3.0.3' (a plain string comparison orders them the other way
    round).  When either version has no numeric prefix the original string
    comparison is used.

    :param tar_version: version to compare against, e.g. '3.0.3'
    :return: True when the current version is >= ``tar_version``
    """
    cur_version = get_vs_version()
    try:
        cur_key = _version_key(cur_version)
        tar_key = _version_key(tar_version)
    except Exception:
        cur_key = tar_key = []

    if not cur_key or not tar_key:
        return cur_version >= tar_version

    # Pad with zeros so that '3.0' and '3.0.0' compare equal
    width = max(len(cur_key), len(tar_key))
    cur_key = cur_key + [0] * (width - len(cur_key))
    tar_key = tar_key + [0] * (width - len(tar_key))
    return cur_key >= tar_key


def delete_wcache_brick(brick):
    """
    Mark a brick invalid in the write cache by running
    vs_wcache_notify_org.js with --setInvalid.

    :param brick: brick id passed to the command
    :return: True when the command exits with status 0, False otherwise
    :raises Exception: any error raised while running the command is logged
                       and re-raised
    """
    try:
        notify_cmd = "/sf/vs/bin/vs_wcache_notify_org.js --setInvalid {}".format(brick)
        logger.info("cmd:{}".format(notify_cmd))

        out, errlines, status = do_cmd_std(notify_cmd)
        logger.info("cmd:{}, result: {}, err: {}, ret: {}".format(notify_cmd, out, errlines, status))
        return not status
    except Exception as exp:
        logger.error("delete wcache brick failed: {}".format(str(exp)))
        raise exp


def curl(cmd):
    """
    POST one command to the local service at 127.0.0.1:7102/hosts/cmds.

    :param cmd: command string, sent as the only element of "cmds"
    :return: True when the service answers with HTTP 200; False for any
             other status or when the request raises (the error is logged)
    """
    try:
        response = requests.post(
            'http://127.0.0.1:7102/hosts/cmds',
            headers={'Content-Type': 'application/json'},
            data=json.dumps({"cmds": [cmd]}),
        )
    except Exception as ex:
        logger.error("curl  cmd {} failed {}".format(cmd, ex))
        return False
    return response.status_code == 200


def storage_network_modify(ban=False):
    """
    Block or unblock the storage network interfaces with iptables.

    For every storage nic reported by HostStatus.get_stornet_info(), plus the
    interface named in /sf/cfg/vs_private_iface, an
    ``iptables INPUT -j REJECT -i <nic>`` rule is inserted (``ban=True``) or
    deleted (``ban=False``) through curl().

    :param ban: True to insert the REJECT rules, False to delete them
    :return: True when every command was accepted; False when the private
             interface file is missing, a command is rejected, or any error
             occurs (the error is logged)
    """
    # -I inserts the rule, -D deletes it
    op_param = 'I' if ban else 'D'
    private_iface_path = "/sf/cfg/vs_private_iface"
    try:
        from libvs.utils.hostinfo import HostStatus
        stornet_info = HostStatus().get_stornet_info()

        if not os.path.exists(private_iface_path):
            # Previously this fell through to an unbound variable and was
            # reported as an unrelated error; fail explicitly instead.
            print("not found /sf/cfg/vs_private_iface")
            logger.error("not found {}, storage network is left unchanged".format(private_iface_path))
            return False

        with open(private_iface_path, 'r') as iface_file:
            content = iface_file.read().strip()
        print("nics {}".format(content))

        # Work on a copy so the list owned by stornet_info is not modified
        nics = list(stornet_info['nics'])
        if content:
            nics.append(content)
        else:
            # An empty name would produce a malformed "-i" argument
            logger.warning("{} is empty, private interface skipped".format(private_iface_path))

        for eth in nics:
            cmd = "iptables -{} INPUT -j REJECT  -i {}".format(op_param, eth)
            print(cmd)
            if not curl(cmd):
                return False
    except Exception as ex:
        logger.error("resolve iptables cmd failed {}".format(ex))
        return False
    return True


def rollback_add_ssd(cfg):
    """
    Undo the add_ssd phase.

    Reverts the volume task when ``cfg`` carries a 'task_id' and a truthy
    'need_revert', then moves the files backed up in DISK_EXPAND_BAK_PATH
    back into DISK_PATH.

    :param cfg: state of the phase; 'need_revert' is removed after a
                successful revert
    :return: False when the revert command fails, True otherwise
    """
    revert_pending = 'task_id' in cfg and 'need_revert' in cfg and cfg['need_revert']
    if revert_pending:
        logger.info("volume {} need to be reverted, task_id {}".format(task_volume_id, cfg['task_id']))
        revert_cmd = "/sf/vs/bin/vsmgr volume revert-volume-task {}".format(task_volume_id)
        out, errlines, status = do_cmd_std(revert_cmd)
        if status:
            logger.error("revert expand task failed, reason :{}".format(errlines))
            return False
        cfg.pop('need_revert')

    # Restore the disk configs saved before the expand
    if os.path.isdir(DISK_EXPAND_BAK_PATH):
        copy_dir(DISK_EXPAND_BAK_PATH, DISK_PATH)
    return True


def rollback_entry_protect(cfg):
    """
    Undo the entry_protect phase by running execute_leave_protect().

    :param cfg: unused; an empty string is passed on instead
    :return: True when leaving protection succeeded, False otherwise
    """
    if execute_leave_protect(""):
        return True
    print("rollback_entry_protect failed")
    return False


def rollback_kickout_ssd_data(cfg):
    """Rollback hook of the kickout_ssd_data phase; does nothing and returns True."""
    return True

def rollback_wait_brick_normal(cfg):
    """Rollback hook of the wait_brick_normal phase; does nothing and returns True."""
    return True

def rollback_modify_cfg(cfg):
    """
    Undo the modify_cfg phase.

    Moves the files from DISK_MODIFY_BAK_PATH back into DISK_PATH, restores
    the volume disk map from its local .bak copy when 'voldisk_map_need_revert'
    is set, and re-runs deal_with_wcache_and_tier() when both 'request_disk'
    and 'need_revert' are present.

    :param cfg: state of the phase; handled markers are removed from it
    :return: False when restoring the disk map or the wcache/tier step fails,
             True otherwise
    """
    # Restore the disk configs saved before the modification
    if os.path.isdir(DISK_MODIFY_BAK_PATH):
        copy_dir(DISK_MODIFY_BAK_PATH, DISK_PATH)

    # Restore the volume disk map
    if 'voldisk_map_need_revert' in cfg:
        restore_cmd = "/sf/vs/bin/super_zkcli.py local2zk -s /sf/cfg/vs/vs_vol_diskmap.json.bak -d /volumes/{}/vs_vol_diskmap.json".format(
            task_volume_id)
        out, errlines, status = do_cmd_std(restore_cmd)
        if status:
            logger.error("rollbak vs_vol_diskmap.json failed : {}".format(errlines))
            return False
        cfg.pop('voldisk_map_need_revert')

    # Tier and write cache
    # NOTE(review): this calls the same routine as the forward path, which
    # removes bricks rather than restoring them - confirm this is intended.
    wcache_pending = 'request_disk' in cfg and 'need_revert' in cfg
    if wcache_pending:
        if not deal_with_wcache_and_tier(cfg['request_disk']):
            logger.info("rollback modify_cfg failed")
            return False
        cfg.pop('need_revert')
    return True


def rollback_leave_protect(cfg):
    """Rollback hook of the leave_protect phase; does nothing and returns True."""
    return True


def show_cur_diskgroup_info():
    """
    Build a per-disk-group summary from the disk configs in DISK_PATH.

    Disks whose storage_type is 'STORAGE_NONE' are ignored.

    :return: dict ``{group_id: {'cache': [...], 'data': [...]}}`` whose list
             items carry 'size' (disk_size divided by 1024 three times),
             'disk_id', 'uuid' and 'sn'; None when any config cannot be read
             (the error is logged)
    """
    conf_files = []
    get_all_files(DISK_PATH, conf_files)
    groups = {}
    try:
        for conf_path in conf_files:
            conf = cfg_load(conf_path)
            storage_type = conf['storage_type']
            if storage_type == 'STORAGE_NONE':
                continue
            group_id = conf['disk_group_id']
            role = 'cache' if storage_type == 'STORAGE_CACHE' else 'data'
            entry = {
                'size': conf['disk_size'] / 1024 / 1024 / 1024,
                'disk_id': conf['disk'],
                'uuid': conf['part_array'][0]['part_uuid'],
                # Only the part before the first space of the serial is kept
                'sn': conf['disk_sn'].split(' ')[0],
            }
            groups.setdefault(group_id, {'cache': [], 'data': []})[role].append(entry)
    except Exception as ex:
        logger.error("get disk group info failed {}".format(ex))
        return None
    return groups

def get_remove_disk_list(disk_group_info):
    """
    Ask the operator which data disks to move and to which disk group.

    Both prompts are repeated until the answer is valid: every disk id must
    belong to the 'data' list of some group in ``disk_group_info`` and the
    target must be an existing group id.

    :param disk_group_info: ``{group_id: {'cache': [...], 'data': [...]}}``
                            as built by show_cur_diskgroup_info()
    :return: tuple ``(target_id, request_disk)`` where ``request_disk`` is a
             list like
             ``[{'group_id': 1, 'disk_id': xxx, 'uuid': aaa, 'sn': bbb}, ...]``
    """
    while True:
        data = raw_input("请选择需要移动的disk_id，多个磁盘用逗号隔开: ")
        # Tolerate blanks around the separators
        wanted = [item.strip() for item in data.split(',')]
        request_disk = []
        invalid = []
        for r_disk in wanted:
            if any(chosen['disk_id'] == r_disk for chosen in request_disk):
                # The same disk typed twice must be moved only once
                continue
            match = None
            for group_id in disk_group_info:
                for disk in disk_group_info[group_id]['data']:
                    if disk['disk_id'] == r_disk:
                        match = {'group_id': group_id, 'disk_id': r_disk,
                                 'uuid': disk['uuid'], 'sn': disk['sn']}
                        break
                if match is not None:
                    break
            if match is None:
                invalid.append(r_disk)
            else:
                request_disk.append(match)
        if not invalid:
            break
        # Report the ids that were not found (the old message printed a
        # list that had just been emptied)
        print("存在无效的disk_id: {} ".format(", ".join(invalid)))

    while True:
        data = raw_input("请输入目标磁盘组id: ")
        try:
            target_id = int(data)
        except ValueError:
            print("请输入一个整数")
            # Ask again instead of using an unset or stale target_id
            continue
        if target_id not in disk_group_info:
            print("磁盘组ID不存在")
            continue
        break
    return target_id, request_disk


def modify_host_diskmap(target_id, request_disk):
    """
    Move disks between the localhost disk-group nodes read and written
    through zk_op (paths built from HOST_GROUP_INFO).

    The disks are first appended to the target group node, then removed from
    each group they came from.

    :param target_id: id of the group that receives the disks
    :param request_disk: list of dicts with at least 'group_id' and 'disk_id'
    :return: True on success, False when any step raises (the error is logged)
    """
    try:
        zk_handler = zk_op()
        moved_by_group = {}

        target_path = HOST_GROUP_INFO.format(LOCALHOST, target_id)
        target_group = json.loads(zk_handler.read(target_path))
        for item in request_disk:
            target_group['disks'].append(item['disk_id'])
            moved_by_group.setdefault(item['group_id'], []).append(item['disk_id'])
        zk_handler.write(target_path, json.dumps(target_group, indent=4))

        for source_id, disk_ids in moved_by_group.items():
            source_path = HOST_GROUP_INFO.format(LOCALHOST, source_id)
            source_group = json.loads(zk_handler.read(source_path))
            for disk_id in disk_ids:
                source_group['disks'].remove(disk_id)
            zk_handler.write(source_path, json.dumps(source_group, indent=4))
    except Exception as e:
        logger.error("modify host diskmap failed : {}".format(e))
        return False
    return True


def modify_diskmap(target_id, request_disk):
    """
    Update the localhost entry of the volume disk map (vs_vol_diskmap.json,
    read and written through zk_op).

    For every disk group of the first host entry whose host_name is
    LOCALHOST: a requested disk is removed when the group is the one it came
    from, otherwise appended when the group is the target.

    :param target_id: id of the group that receives the disks
    :param request_disk: list like
                         ``[{group_id: 1, disk_id: xxx, uuid: aaa}, ...]``
    :return: True on success, False when any step raises (the error is logged)
    """
    try:
        zk_handler = zk_op()
        zk_path = DISK_MAP_PATH.format(task_volume_id)
        diskmap = json.loads(zk_handler.read(zk_path))

        # Only the first matching host entry is processed
        local_host = next(
            (host for host in diskmap['hosts'] if host['host_name'] == LOCALHOST),
            None)
        if local_host is not None:
            for group in local_host['disk_groups']:
                for req in request_disk:
                    if group['id'] == req['group_id']:
                        group['disk'].remove(req['disk_id'])
                    elif group['id'] == target_id:
                        group['disk'].append(req['disk_id'])

        serialized = json.dumps(diskmap, indent=4)
        logger.info('get new diskmap {}'.format(serialized))
        zk_handler.write(zk_path, json.dumps(diskmap, indent=4))
        return True
    except Exception as ex:
        logger.error("modify vs_diskmap.json failed {}".format(ex))
        return False


def deal_with_wcache_and_tier(request_disk):
    """
    Remove the bricks of the moved disks from the write cache and the tier.

    Steps: back up wcache.json, drop every brick whose brickId contains one
    of the requested uuids (also notifying the write cache), save the file,
    restart glusterfsd/glusterd, clean the bricks from the tier and restart
    tierd.

    :param request_disk: list of dicts carrying at least 'uuid'
    :return: True on success, False when a restart fails or any step raises
             (the error is logged)
    """
    try:
        # Back up first
        shutil.copyfile(WCACHE_CFG_PATH, "{}.bak".format(WCACHE_CFG_PATH))

        # Drop the brick entries from the write cache configuration
        uuid_list = [elem['uuid'] for elem in request_disk]
        delete_brick_list = []
        wcache_cfg = cfg_load(WCACHE_CFG_PATH)
        for one_disk_group in wcache_cfg['maps']:
            new_bricks_info = []
            for brick in one_disk_group['bricks']:
                brick_id = brick['brickId']
                if not any(uuid in brick_id for uuid in uuid_list):
                    new_bricks_info.append(brick)
                    continue
                if not delete_wcache_brick(brick_id):
                    # Best effort as before, but no longer silent
                    logger.warning("set wcache brick {} invalid failed".format(brick_id))
                delete_brick_list.append(brick_id)
            one_disk_group['bricks'] = new_bricks_info
        cfg_save(wcache_cfg, WCACHE_CFG_PATH)
        print("delete wcache brick info succ :{}".format(delete_brick_list))

        if judge_vs_version('3.0.3'):
            from hot_update.server_hot_update.common_hotupgrade import kill_glusterfsd
            if not kill_glusterfsd():
                logger.error("restart glusterfsd failed")
                return False
        else:
            res, err, ret = do_cmd_std("/etc/init.d/glusterd restart")
            if ret:
                logger.error("restart glusterd failed : {}".format(err))
                return False

        # Drop the brick entries from the tier
        sys.path.insert(0, "/sf/vs/bin/")
        import vs_tier_brick_clean
        for brick_path in delete_brick_list:
            vs_tier_brick_clean.tier_delete_brick("", brick_path)

        temp, err, ret = do_cmd_std("/sf/vs/etc/init.d/tierd restart")
        if ret:
            # The old message blamed the vs_vol_diskmap.json backup
            logger.error("restart tierd failed : {}".format(err))
            return False
        print("delete tier brick info succ :{}".format(delete_brick_list))
    except Exception as ex:
        logger.error("modify tier or wcache conf failed {}".format(ex))
        return False
    return True


def get_max_diskgroup_id(host_name):
    """
    Return the next disk group id for a host: the largest id listed under
    /nodes/<host_name>/diskgroups/ plus one.

    :param host_name: host whose disk groups are listed
    :return: largest existing group id + 1
    :raises ValueError: when no numeric group id is listed
    """
    output, err, ret = do_cmd_std("/sf/vs/bin/super_zkcli.py ls /nodes/{}/diskgroups/".format(host_name))
    # Compare as integers: sorting the strings puts '10' before '2' and
    # would hand out an id that is already in use.
    group_ids = [int(token) for token in output.split() if token.isdigit()]
    if not group_ids:
        raise ValueError("no disk group found for host {}, ret = {}, err = {}".format(host_name, ret, err))
    return max(group_ids) + 1

def get_pv_uuid(dev):
    """
    Return the UUID reported by blkid for the first partition of a disk.

    :param dev: device path or name; only its basename is used
    :return: the UUID string, or None when the disk has no partition or
             blkid exits with status 2 (nothing found)
    :raises Exception: when listing the partitions fails, or blkid fails with
                       a status other than 2
    """
    dev_name = os.path.basename(dev)
    part_cmd = "timeout -t 10 /bin/lsblk -l /dev/{} | grep part | " \
               "head -1 | awk -F' ' '{{print $1}}'".format(dev_name)
    try:
        part_id = subprocess.check_output(part_cmd, shell=True).strip()
    except Exception as ex:
        logger.error("failed to get part id of %s", dev)
        raise ex
    # No partition on the disk: nothing to look up
    if not part_id:
        return None

    # UUID of the first partition
    uuid_cmd = 'timeout -t 10 /sbin/blkid -s UUID -o value /dev/{}'.format(part_id)
    try:
        return subprocess.check_output(uuid_cmd, shell=True).strip()
    except subprocess.CalledProcessError as ex:
        logger.info("not fount pv uuid, cmd: %s", uuid_cmd)
        # 2 is BLKID_EXIT_NOTFOUND: token or device not found
        if ex.returncode == 2:
            return None
        raise ex

def local_check_disk_pvs(disk_pvs):
    """
    Check locally whether a pv id is referenced by the volume configuration.

    The id is searched with ``grep -qw`` in the glusterd volume info file,
    tier.json and wcache.json (whichever of them exist).

    :param disk_pvs: pv id string to look for
    :return: 0 when the id is found in none of the files; 1 when it is found
             or when the check cannot be carried out (no volume id, no
             config file, id shorter than 38 characters, grep error)
    """
    vol_id = get_localhost_volume_id()
    if not vol_id:
        print("no vol_id")
        return 1

    check_files = [
        "/sf/cfg/vs/glusterfs/glusterd/vols/{}/info".format(vol_id),
        "/sf/cfg/vs/cache/tier.json",
        "/sf/cfg/vs/cache/wcache.json"
    ]
    args = [path for path in check_files if os.path.isfile(path)]
    # The placeholder was missing, which made logging fail to format the record
    logger.info("check config files: %s", args)
    if not args:
        logger.info("file not exits: %s", check_files)
        return 1

    if not disk_pvs or len(disk_pvs) < 38:
        # pv ids of disks used by VS are at least 38 characters long;
        # shorter ones are not checked
        logger.error("pv_id(%s)  too short, no check",
                    disk_pvs)
        return 1

    cmd = "grep -qw {} {}".format(disk_pvs, " ".join(args))
    res, _, ret = do_cmd_std(cmd)
    if ret == 1:
        # grep status 1: not present in any config file, so not in the volume
        return 0
    # Every other status is treated as a problem
    return 1

def check_disk_in_config(dev_name):
    """
    Check whether a disk is already referenced by the volume configuration.

    :param dev_name: device to check
    :return: 0 when the disk has no pv id or the id is found in no config
             file, otherwise the non-zero result of local_check_disk_pvs()
    """
    pv_uuid = get_pv_uuid(dev_name)
    if pv_uuid:
        print(pv_uuid)
        return local_check_disk_pvs(pv_uuid)

    print("{} 未获取到pv_id 该盘可能为未分区的新盘".format(dev_name))
    return 0

def get_unuse_ssd(volname):
    """
    List the unused SSDs of this host and let the operator pick one.

    The chosen disk is written to /sf/log/vs/request_cache_disk.json and is
    marked as the cache disk of a new disk group in the returned structure.
    The process exits with status 1 when an unused SSD appears to be
    referenced by the volume configuration already.

    :param volname: id of the volume, passed to `vsmgr volume get-host-disks`
    :return: tuple ``(disks, host_disk_info)`` - the command output reduced
             to the chosen SSD, and a copy of all disks of the host - or
             False when the disk information cannot be obtained or no unused
             SSD exists
    """
    localhost_name, err, ret = do_cmd_std("hostname")
    if ret != 0 or localhost_name == '':
        logger.error('get local hostname fail')
        return False

    output, err, ret = do_cmd_std("vsmgr volume get-host-disks --volume_id {} {} 3 0".format(volname, localhost_name))
    if ret != 0 or output == '':
        logger.error('have not volume, ret = {}, err = {}'.format(ret, err))
        return False

    try:
        disks = json.loads(output)
    except ValueError as ex:
        logger.error('get disk fail, disks = {}, reason: {}'.format(output, ex))
        return False
    if disks is None:
        logger.error('get disk fail, disks = {}'.format(output))
        return False

    host_disk_info = []
    for host in disks['hosts']:
        # In theory there is only one host entry
        host_disk_info = copy.deepcopy(host['disks'])
        remove_disks = []
        for disk in host['disks']:
            if disk['disk_group_id'] != 0 and disk['storage_type'] != 'STORAGE_NONE':
                # Already used by a disk group
                remove_disks.append(disk)
            elif disk['disk_type'] != 'DISK_SSD':
                remove_disks.append(disk)

        for disk in remove_disks:
            host['disks'].remove(disk)

    # These statistics are not part of the expand request
    disks.pop('data_disk_size', None)
    disks.pop('disk_num_stat', None)

    disks['volume_type'] = "normal"
    disks['volume_id'] = volname
    for host in disks['hosts']:
        for disk in host['disks']:
            if check_disk_in_config(disk['dev']):
                print("磁盘配置存在问题 {} 可能已经在卷内，人工介入确认".format(disk['dev']))
                sys.exit(1)
    print('当前未使用的SSD:')

    ssd_list = []
    for host in disks['hosts']:
        for disk in host['disks']:
            print('disk id: {}, disk_size: {} GB, dev: {}'.format(disk['disk'], disk['disk_size'] / 1024 / 1024 / 1024, disk['dev']))
            ssd_list.append(disk['disk'])

    if not ssd_list:
        # Without this guard the prompt below could never be satisfied
        logger.error('no unused ssd found on host {}'.format(localhost_name))
        print('no unused ssd found')
        return False

    print('\n')
    seldisk = ""
    while True:
        data = raw_input("请输入需要添加SSD的disk id，一次只能添加一个SSD: ")
        if data in ssd_list:
            seldisk = data
            break
        if data.strip() in ssd_list:
            # Accept an id typed with surrounding blanks
            seldisk = data.strip()
            break

        print('{} can not find, please try again'.format(data))

    # Drop the SSDs that were not chosen
    for host in disks['hosts']:
        remove_disks = []

        for disk in host['disks']:
            if seldisk != disk['disk']:
                remove_disks.append(disk)
            else:
                cache_disk = copy.deepcopy(disk)
                cache_disk['disk_sn'] = cache_disk['disk_sn'].split(' ')[0]
                print("add cache {}".format(cache_disk))
                with open('/sf/log/vs/request_cache_disk.json', 'w') as f:
                    json.dump(cache_disk, f, indent=4)
                disk['disk_group_id'] = get_max_diskgroup_id(LOCALHOST)
                disk['storage_type'] = 'STORAGE_CACHE'

        for disk in remove_disks:
            host['disks'].remove(disk)

    return disks, host_disk_info


def gen_expand_disk_conf(volume_id, disks_conf, host_disk_info):
    """
    Build the request written to EXPAND_OP_DATA for the expand-volume task.

    For every disk of ``host_disk_info`` and every disk of the first host in
    ``disks_conf``: the expand disk is added when both are the same disk,
    otherwise the host disk is added when its storage_type is not
    'STORAGE_NONE'.

    :param volume_id: not used; the module-level task_volume_id is used
    :param disks_conf: structure returned by get_unuse_ssd()
    :param host_disk_info: all disks of the host
    :return: the request dict
    """
    from libvs.utils.hostinfo import HostStatus
    vol_conf = VolumeConf(task_volume_id)
    host_ip = HostStatus().get_mgr_ip(LOCALHOST)

    request_disks = []
    host_entry = {
        "host_alias": host_ip,
        "max_disk_index": 1,
        "disks": request_disks,
        "host_ip": host_ip,
        "host_status": "on",
        "all_ssd": "no",
        "host_name": LOCALHOST,
        "max_disk_capacity": 2000398934016,
        "authorization": "yes"
    }
    request = {
        "expand_type": "disk",
        "hosts": [host_entry],
        "volume_type": vol_conf['volume_type'],
        "volume_id": task_volume_id
    }

    for known_disk in host_disk_info:
        for expand_disk in disks_conf['hosts'][0]['disks']:
            if expand_disk['disk'] == known_disk['disk']:
                request_disks.append(expand_disk)
            elif known_disk['storage_type'] != 'STORAGE_NONE':
                request_disks.append(known_disk)

    return request


def execute_add_ssd(cfg):
    """
    Run the add_ssd phase: expand the volume with one new cache SSD.

    Only blend (hybrid) volumes are handled.  The disk configs are backed up
    to DISK_EXPAND_BAK_PATH, the expand request is written to EXPAND_OP_DATA
    and started with `vsmgr volume expand-volume`, then the task is awaited.

    :param cfg: state of the phase; receives 'volname', 'task_id', 'master'
                and, when waiting for the task fails, 'need_revert'
    :return: True when the expand task finished, False otherwise
    """
    global task_volume_id

    from libvs.utils.common import get_host_of_vol
    volname = get_host_of_vol(LOCALHOST)

    # Only blend volumes are handled; all-flash volumes need nothing
    blend, err, ret = do_cmd_std("/sf/vs/bin/super_zkcli.py cat /volumes/{}/volume_info.json | grep storage_type | grep -q blend".format(volname))
    if ret != 0:
        logger.error('only run on blend volume')
        return False

    cfg['volname'] = volname
    task_volume_id = volname
    unused = get_unuse_ssd(volname)
    if not unused:
        # get_unuse_ssd() returns False on failure; unpacking it would raise
        logger.error('get unused ssd of volume {} failed'.format(volname))
        return False
    disks, host_disk_info = unused

    # Back up the disk configs
    bak_conf(DISK_PATH, DISK_EXPAND_BAK_PATH)

    # Build the expand request
    data = gen_expand_disk_conf(task_volume_id, disks, host_disk_info)
    cfg_save(data, EXPAND_OP_DATA)
    logger.info("get expand data {}".format(json.dumps(data, indent=4)))
    # Start the expand through vsmgr (the web interface call is disabled)
    # from libvs.utils.vs_get_from_vt import http_proxy_handler
    # res = http_proxy_handler('get', '/vs/vs_config/vs_expand_volume', data=data)
    res, err, ret = do_cmd_std('/sf/vs/bin/vsmgr volume expand-volume {}'.format(EXPAND_OP_DATA))
    if ret:
        logger.error("start expand-volume task error, expand param: {}, err: {}".format(EXPAND_OP_DATA, err))
        return False
    # The backup is kept on purpose
    # os.removedirs(DISK_EXPAND_BAK_PATH)
    vol_conf = VolumeConf(task_volume_id)
    if vol_conf['task']['type'] != 'expand_volume':
        logger.error('start expand_volume task failed: get vol_conf failed, current task is {} {}'.
                     format(vol_conf['task']['type'], vol_conf['task']['id']))
        return False
    cfg['task_id'] = vol_conf['task']['id']
    cfg['master'] = vol_conf['task']['master']
    print("start expand task success, expand param: {}".format(EXPAND_OP_DATA))
    if not wait_task_finish(cfg['master'], cfg['task_id']):
        cfg['need_revert'] = 1
        return False
    return True


def execute_entry_protect(cfg):
    """
    Run the entry_protect phase: put this host into maintenance mode by hand.

    In order: set performance.wcc-delay-time to 0 on the volume, refuse to
    continue while VMs run on this host, check the brick data, disable
    rebuild on the volume master, rename vs_dog.sh, unmount the volume mount
    point, stop keepalived when it is configured, wait 10 seconds and block
    the storage network.

    :param cfg: state of the phase (not read here)
    :return: True when every step succeeded, False at the first failing step;
             steps already done are not undone here
    """
    global task_volume_id

    temp, err, ret = do_cmd_std("gluster volume set {} performance.wcc-delay-time 0".format(task_volume_id))
    if ret:
        logger.error("volume set {} performance.wcc-delay-time 0 failed : {}".format(task_volume_id, err))
        return False
    print("volume set {} performance.wcc-delay-time 0 succ".format(task_volume_id))

    # Check whether any virtual machine is still running
    if check_has_running_vm():
        print("该主机上有正在运行的虚拟机，请先迁移到其他主机上")
        return False

    # Consistency check of the peer replica data: vs_rpc_tool --cmd check --brickno num --exclude yes
    # Check the kernel log for bad sectors
    # Bad sectors cannot be checked on non-appliance hosts, nor on appliances in JBOD mode
    if not check_brick_data_ok():
        return False

    # Versions below 302 (presumably 3.0.2) must enter maintenance mode manually
    vol_master = get_localhost_volume_master()
    # Disable rebuild
    temp, err, ret = do_cmd_std("/sf/vs/bin/vs_ssh root@{} /sf/vs/bin/vsmgr dts-sched config-modify Global.EnableRebuld 0 --save".format(vol_master))
    if ret:
        logger.error("set EnableRebuld 0 failed : {}".format(err))
        return False

    # Stop vs_dog by renaming its script
    try:
        os.rename("/sf/vs/bin/vs_dog.sh", "/sf/vs/bin/vs_dog.sh.bak")
    except Exception as ex:
        logger.error("rename vs_dog.sh failed {}".format(ex))
        return False

    # Unmount the NFS mount point
    if not volume_mount_point_op(task_volume_id, 'umount -lf'):
        logger.info("umount volume {} failed".format(task_volume_id))
        return False

    # Stop keepalived; a failure is only logged
    keepalived_conf = os.path.join('/sf/vs/etc', 'keepalived/keepalived.conf')
    if os.path.exists(keepalived_conf):
        cmd = "/sf/vs/etc/init.d/keepalived stop"
        temp, err, ret = do_cmd_std(cmd)
        if ret:
            logger.error("run cmd: {} failed".format(cmd))

    # Wait a few seconds before cutting the private network
    time.sleep(10)

    # Isolate the storage network
    if not storage_network_modify(ban=True):
        return False

    logger.info("enter maintain succ")

    return True


def execute_kickout_ssd_data(cfg):
    """
    Run the kickout_ssd_data phase: flush the data held on the cache SSDs.

    A kickout is requested for every tier device listed in TIER_CFG_PATH,
    then wait_wcache_tier() is called for every local disk whose
    storage_type is STORAGE_CACHE.

    :param cfg: state of the phase (not read here)
    :return: always True
    """
    #global task_volume_id
    #temp, err, ret = do_cmd_std("gluster volume set {} performance.wcc-delay-time 0".format(task_volume_id))
    #if ret:
    #    logger.error("set wcc failed : {}".format(err))
    #    return False
    #temp, err, ret = do_cmd_std("/sf/vs/bin/vs_tier_cli.py -c  setparam -a  all_bypass=1")
    #if ret:
    #    logger.error("set all_bypass failed : {}".format(err))
    #    return False
    #temp, err, ret = do_cmd_std("/sf/vs/bin/vs_tier_cli.py -c  setparam -a  all_nopromote=1")
    #if ret:
    #    logger.error("set all_nopromote failed : {}".format(err))
    #    return False

    # Wait until write cache and tier are flushed and no qcow2 file is marked
    # Versions above 302 already do this when entering maintenance mode
    # if judge_vs_version("3.0.2"):
    #    return True
    print("分层回刷比较久，如果运行窗口退出后，请加上nocheck参数，重试！！！")
    tier_devs = Config(TIER_CFG_PATH).content.get('tier_devs', [])
    kickout_tpl = "/sf/vs/bin/vs_tier_cli.py -c kickout -a ssd_uuid={} > /dev/null"
    for tier_dev in tier_devs:
        kickout_cmd = kickout_tpl.format(tier_dev["ssd_uuid"])
        print(kickout_cmd)
        # The result of the kickout request is not checked
        do_cmd_std(kickout_cmd)

    cache_disks = (
        local_disk for local_disk in LocalDiskConf.get_all_disk()
        if local_disk['storage_type'] == "STORAGE_CACHE"
    )
    for cache_disk in cache_disks:
        print("disk {} :waiting for tier kickout...".format(cache_disk["disk"]))
        wait_wcache_tier(cache_disk["disk"])
        print("kickout succ")

    return True

def execute_modify_cfg(cfg):
    """
    Run the modify_cfg phase: move data disks into the new disk group.

    The disk configs are backed up to DISK_MODIFY_BAK_PATH, the operator
    chooses the disks and the target group, the choice is saved under
    /sf/log/vs/, then the host disk-group nodes, the per-disk configs and the
    write cache / tier configuration are updated.  Finally the volume task is
    set to success.

    :param cfg: state of the phase; receives 'request_disk' and, when the
                wcache/tier step fails, 'need_revert'
    :return: True when every step succeeded, False otherwise
    """
    global task_volume_id
    # Back up the disk configs
    bak_conf(DISK_PATH, DISK_MODIFY_BAK_PATH)

    disk_group_info = show_cur_diskgroup_info()
    if disk_group_info is None:
        # show_cur_diskgroup_info() returns None when a config is unreadable
        logger.error("get current disk group info failed")
        return False

    for group_id in disk_group_info:
        if 'cache' not in disk_group_info[group_id] or len(disk_group_info[group_id]['cache']) == 0:
            continue
        print(
            '{} STORAGE_CACHE: [{}G], STORAGE_DATA: {}'.format(group_id, disk_group_info[group_id]['cache'][0]['size'],
                                                               disk_group_info[group_id]['data']))

    # Example of disk_group_info:
    # {1:
    #     {
    #     'cache':
    #         [
    #             {'sn': u'SDKDT0430066', 'uuid': u'hROwa5-qGAJ-tR3Q-ylAX-Ns7C-6Cci-SI01AL-rcache', 'disk_id': u'76a9243d8f83_SATA_STS480TFI1AP_SDKDT0430066', 'size': 447}
    #         ],
    #     'data':
    #         [
    #             {'sn': u'V1G7UM6G', 'uuid': u'mOLEw5-CstW-124Y-aCO6-605q-TI7I-87MmeE', 'disk_id': u'76a9243d8f83_SATA_HGST_HUS726T4TAV1G7UM6G', 'size': 3726},
    #             {'sn': u'V1G7XASC', 'uuid': u'1Y1uLp-IFyI-BP13-H7sI-TQn8-EDJc-D68WNt', 'disk_id': u'76a9243d8f83_SATA_HGST_HUS726T4TAV1G7XASC', 'size': 3726},
    #             {'sn': u'WD-WMC6N0J7UDM6', 'uuid': u'CLHnWW-Dxxk-KAGw-m0Gu-IefP-Ulwn-scdPl1', 'disk_id': u'76a9243d8f83_SATA_WDC_WD2005FBYZ-_WD-WMC6N0J7UDM6', 'size': 1863},
    #             {'sn': u'V6KXNRNP', 'uuid': u'B2km15-KiS9-RPCR-TKtr-Byxp-D7Bu-ivhXyR', 'disk_id': u'76a9243d8f83_SATA_HGST_HUS726T4TAV6KXNRNP', 'size': 3726}
    #         ]
    #     }
    # }
    # Ask for the target disk group and the disks to move
    target_id, request_disk = get_remove_disk_list(disk_group_info)
    # Example result:
    # 2
    # [{'group_id': 1, 'uuid': u'B2km15-KiS9-RPCR-TKtr-Byxp-D7Bu-ivhXyR', 'disk_id': '76a9243d8f83_SATA_HGST_HUS726T4TAV6KXNRNP', 'sn': u'V6KXNRNP'}]
    # Saved so the pool and disk records in the sds mongo config can be adapted
    with open('/sf/log/vs/request_disk.json', 'w') as f:
        json.dump(request_disk, f, indent=4)
    with open('/sf/log/vs/target_id.json', 'w') as f:
        json.dump(target_id, f)
    # Update /nodes/*/diskgroups
    if not modify_host_diskmap(target_id, request_disk):
        return False

    # Update the per-disk configs
    for disk in request_disk:
        disk_path = "{}/{}.json".format(DISK_PATH, disk['disk_id'])
        disk_conf = cfg_load(disk_path)
        if disk_conf is None:
            # cfg_load() returns None when the file cannot be read
            logger.error("load disk conf {} failed".format(disk_path))
            return False
        disk_conf['disk_group_id'] = target_id
        cfg_save(disk_conf, disk_path)

    cfg['request_disk'] = copy.deepcopy(request_disk)
    if not deal_with_wcache_and_tier(request_disk):
        cfg['need_revert'] = 1
        return False
    # The backup is kept on purpose
    # os.removedirs(DISK_MODIFY_BAK_PATH)
    # Everything succeeded: set the volume task state back
    resp, err, ret = do_cmd_std('vsmgr volume set-volume-task-success {}'.format(task_volume_id))
    if ret:
        # The volume id and reason were never substituted into this message
        logger.error("set volume {} task sucess failed, reason {}".format(task_volume_id, err))
        return False
    return True

def execute_wait_brick_normal(cfg):
    """Poll tierd until every brick reports status "normal".

    Runs ``vs_tier_cli.py -c dump -a brickinfo`` up to RETRY_TIME times and
    sleeps 10 seconds after every attempt that fails or still shows a
    non-normal brick.  The step is best-effort: a timeout or an unexpected
    error is logged and bypassed.

    :param cfg: phase configuration dict (not read by this step).
    :return: always True, so the overall workflow continues.
    """
    # Start pessimistic: if no dump ever succeeds (or RETRY_TIME is 0) the
    # bricks were never confirmed normal, so the bypass must be reported.
    all_normal = False
    try:
        for _ in range(RETRY_TIME):
            output, err, ret = do_cmd_std("/sf/vs/bin/vs_tier_cli.py -c dump -a brickinfo")
            if ret:
                logger.error("dump tierd brick info failed : {}".format(err))
                print("dump tierd brick info failed : {}".format(err))
                time.sleep(10)
                continue

            tier_info = json.loads(output)
            all_normal = all(
                brick_info["status"] == "normal"
                for ssd_info in tier_info["ssd"]
                for brick_info in ssd_info["brick"]
            )
            if all_normal:
                break
            logger.info("wait tierd brick normal, sleep 10s")
            print("wait tierd brick normal, sleep 10s")
            time.sleep(10)
    except Exception as e:
        # Unexpected dump layout or JSON error: log it and fall through to
        # the bypass report below.
        logger.error("wait tierd brick normal error: {}".format(e))

    if not all_normal:
        print("wait tierd brick normal timeout, bypass")
        logger.error("wait tierd brick normal timeout, bypass")

    return True

def execute_leave_protect(cfg):
    """Undo the maintenance-mode changes made on this host.

    Steps, in order: reopen the storage network, restart keepalived when its
    configuration file exists, re-enable rebuild through the volume master,
    restore vs_dog.sh from its ``.bak`` copy and remount the volume.

    :param cfg: phase configuration dict (not read by this step).
    :return: False when reopening the storage network, re-enabling rebuild
        or restoring vs_dog.sh fails; True otherwise.  A failed keepalived
        restart or volume mount is only logged.
    """
    # Counterpart of the web UI "leave maintenance mode" interface; per the
    # original note, versions below 302 have to do this manually.
    # First let storage-network traffic through again.
    storage_opened = storage_network_modify(False)
    if not storage_opened:
        return False

    # Bring keepalived back (failure is logged, not fatal).
    conf_path = os.path.join('/sf/vs/etc', 'keepalived/keepalived.conf')
    if os.path.exists(conf_path):
        restart_cmd = "/sf/vs/etc/init.d/keepalived restart"
        _, _, restart_ret = do_cmd_std(restart_cmd)
        if restart_ret:
            logger.error("run cmd: {} failed".format(restart_cmd))

    # Re-enable rebuild via the volume master.
    master_host = get_localhost_volume_master()
    enable_rebuild_cmd = (
        "/sf/vs/bin/vs_ssh root@{} /sf/vs/bin/vsmgr dts-sched config-modify Global.EnableRebuld 1 --save"
    ).format(master_host)
    _, rebuild_err, rebuild_ret = do_cmd_std(enable_rebuild_cmd)
    if rebuild_ret:
        logger.error("set EnableRebuld 1 failed : {}".format(rebuild_err))
        return False

    # Put vs_dog.sh back; a failed rename is tolerated as long as the
    # script is present afterwards.
    dog_script = "/sf/vs/bin/vs_dog.sh"
    try:
        os.rename(dog_script + ".bak", dog_script)
    except Exception as ex:
        logger.error("rename vs_dog.sh failed {}".format(ex))
        if not os.path.exists(dog_script):
            return False

    # Remount the volume (nfs); a failure here does not fail the step.
    mounted = volume_mount_point_op(task_volume_id, 'mount -lf')
    if not mounted:
        logger.info("mount volume failed")

    logger.info("exit maintain succ")
    return True


def check_brick_data_ok():
    """Check the local bricks of the task volume for damaged data.

    The kernel-log and smartctl scans are informational only: any output is
    logged and never fails the check.  The result is decided by running
    ``vs_rpc_tool --cmd check`` for every distinct brick number of this host.

    :return: True when every brick check exits 0 and reports
        ``err_files:[]``; False on the first failing brick or on any
        exception.
    """
    print("非一体机环境和一体机jbod模式坏道检查结果可能不准")

    # Informational: error lines for sd devices in today's kernel log.
    kernel_hits, kernel_err, _ = do_cmd_std('zgrep -n err /sf/log/today/kernel.log* | grep -v info |grep sd')
    if kernel_hits != "":
        logger.info("the kernel log has bad_sector warning, bad_sec: {}, err: {}".format(kernel_hits, kernel_err))

    # Informational: SMART sector/CRC attributes of every /dev/sd[a-z].
    smart_hits, smart_err, _ = do_cmd_std('for sd in $(ls /dev/sd[a-z]); do smartctl -A $sd | egrep "Current_Pending_Sector|UDMA_CRC_Error_Count|Reallocated_Sector_Ct"; done')
    if smart_hits != "":
        logger.info("smartctl info has bad_sector info, bad_sec: {}, err: {}".format(smart_hits, smart_err))

    try:
        from libvs.volume.topo_mgr import TopoInfoMgr
        host_brick_map = TopoInfoMgr(task_volume_id).get_host_brick_map(False, task_volume_id, LOCALHOST)
        # Collect each brick number once.
        brick_numbers = {
            host_brick_map[disk][brick]['brick_no']
            for disk in host_brick_map
            for brick in host_brick_map[disk]
        }
        for brick_no in brick_numbers:
            check_out, check_err, check_ret = do_cmd_std(
                "/sf/vs/bin/vs_rpc_tool --cmd check --brickno {} --exclude yes".format(brick_no))
            brick_clean = check_ret == 0 and "err_files:[]" in check_out
            if not brick_clean:
                logger.error("vs_rpc_tool --cmd check --brickno, bad_sec: {}, err: {}".format(check_out, check_err))
                return False
    except Exception as e:
        logger.error("check brick data error: {}".format(e))
        return False

    print("check_brick_data_ok 结束")
    return True


def check_has_running_vm():
    """Report whether this host may still have running virtual machines.

    :return: True when vs_get_running_vmids.sh exits non-zero (treated as
        "VMs may be running") or prints any output; False when it succeeds
        with empty output.
    """
    vmids, _, status = do_cmd_std("/sf/vs/bin/vs_get_running_vmids.sh")
    if status:
        logger.error("get running vmids failed")
        return True
    if vmids == "":
        return False
    print("running vmids is: {}".format(vmids))
    return True


def check_environment():
    """Check whether this host is in a fit state for hot-adding an SSD.

    Verifies, in order: tierd and glusterfsd are running, at least one
    vs_vol volume is listed, the number of SSDs in the tier dump equals the
    number of ``dev_path`` entries in TIER_CFG_PATH, no brick status line in
    the tier dump contains "none", and check_brick_status passes for the
    local bricks of every listed volume.  Running VMs only produce a
    warning on stdout.

    :return: True when every check passes, False otherwise.
    """
    # The tiering daemon must be running.
    tierd, err, ret = do_cmd_std('pidof tierd')
    if ret != 0 or tierd == '':
        logger.error('tierd is not exist, ret = {}, err = {}'.format(ret, err))
        return False

    # glusterfsd must be running.
    glusterfsd, err, ret = do_cmd_std('pidof glusterfsd')
    if ret != 0 or glusterfsd == '':
        logger.error('glusterfsd is not exist, ret = {}, err = {}'.format(ret, err))
        return False

    # There must be at least one volume.
    volname, err, ret = do_cmd_std("vsmgr volume list | grep vs_vol | awk '{print $1}'")
    if ret != 0 or volname == '':
        logger.error('have not volume, ret = {}, err = {}'.format(ret, err))
        return False

    # Every cache disk must be online: compare the SSD count in the tier
    # dump with the count in the tier configuration file.
    dump_cnt, err, ret = do_cmd_std("/sf/vs/bin/vs_tier_cli.py -c dump | grep ssd_uuid | wc -l")
    if ret != 0 or dump_cnt == '':
        logger.error('get online ssd count failed, ret = {}, err = {}'.format(ret, err))
        return False

    cfg_cnt, err, ret = do_cmd_std("grep dev_path {} | wc -l".format(TIER_CFG_PATH))
    if ret != 0 or cfg_cnt == '':
        logger.error('get configured ssd count failed, ret = {}, err = {}'.format(ret, err))
        return False

    if dump_cnt != cfg_cnt:
        logger.error('have ssd do not online, dump_cnt: {}, cfg_cnt: {}'.format(dump_cnt, cfg_cnt))
        return False

    # Every brick must be online: grep -q exits 0 when a status line
    # containing "none" is found.
    output, err, ret = do_cmd_std("/sf/vs/bin/vs_tier_cli.py -c dump -a brickinfo | grep status | grep -q none")
    if ret == 0:
        logger.error('have brick do not online')
        return False

    # Check for bad replicas on the local bricks of every volume.
    volname_list = volname.splitlines()
    for vol_id in volname_list:
        gfs = Glusterfs(vol_id)
        bricks = gfs.get_bricks_by_host(LOCALHOST)
        obj = check_brick_status(bricks, LOCALHOST)
        if obj["ret"] != 0:
            logger.error("check brick status failed")
            return False
        # Only hybrid volumes are handled and all-flash ones are ignored;
        # since there may be several volumes, that is not decided here.

    # Running VMs do not fail the check; the operator is only reminded to
    # migrate them before entering maintenance mode.
    if check_has_running_vm():
        print("注意：该主机上有正在运行的虚拟机，进入维护模式前请先迁移到其他主机上")

    return True

def get_eds_version():
    """Return the fourth dot-separated field of the EDS version file.

    :return: the field at index 3 of the stripped content of EDS_VERSION
        split on ".", or None when there are fewer than four fields.
    """
    with open(EDS_VERSION, 'r') as version_file:
        fields = version_file.read().strip().split('.')
        return fields[3] if len(fields) >= 4 else None
@logger_init()
def main():
    """Run the hot-add-SSD workflow and return the process exit status.

    After taking the single-instance lock and validating the script
    location, the md5 of the companion files and the EDS version, every
    phase in PHASE_NAME is executed through its ``execute_<phase>``
    function.  Phase status is persisted in EXPAND_CFG_PATH so a re-run
    skips phases already marked successful and first calls
    ``rollback_<phase>`` for a phase left in any other non-initial state.
    When all phases succeed the wcc-delay-time option is reset, the task
    files are archived with a timestamp suffix, and on 3.0.6 (before
    3.0.6R2) a leftover host rebuild fault task is cancelled.

    Command line: a first argument of ``nocheck`` skips check_environment().

    :return: 0 on success, 1 on any failure.  The value is handed to
        sys.exit(), so failures must never be reported as False (which
        would exit with status 0).
    """
    global task_volume_id

    # Keep a reference to the lock object for the lifetime of main().
    proc = SingleProc('/var/lock/vs_hot_add_ssd.lock')
    if proc.lock() != 0:
        logger.info('another vs_hot_add_ssd process is running')
        return 1

    current_script_dir = os.path.dirname(os.path.abspath(__file__))
    if current_script_dir != '/sf/vs/bin':
        print("请将脚本放到/sf/vs/bin/目录下执行")
        return 1

    # Verify the md5 of every companion file.
    for file_path, file_md5 in file_md5_map.items():
        if not os.path.exists(file_path):
            print("文件【{}】不存在，请将所有文件都放到/sf/vs/bin/目录下执行".format(file_path))
            return 1
        if not verify_md5(file_path, file_md5):
            print("当前文件【{}】，请确认文件是否被篡改".format(file_path))
            return 1

    if get_eds_version() not in ("309", "501", "502", "503"):
        print("only run on eds309 or eds501 or eds502 or eds503")
        return 1

    # NOTE(review): only 'nocheck' is honoured here; a 'rollback' first
    # argument has no effect in this function - confirm that is intended.
    dont_check = len(sys.argv) > 1 and sys.argv[1] == 'nocheck'
    print("进入维护模式后尝试继续上次步骤请加上nocheck参数，否则可能导致tier检查卡死，卡死时强制退出即可")
    # Pre-flight check of the environment.
    if not dont_check and not check_environment():
        return 1

    # Create an empty task configuration on the first run.
    if not os.path.exists(EXPAND_CFG_PATH):
        create_emtpy_cfg()

    cfg = cfg_load(EXPAND_CFG_PATH)
    if cfg is None:
        logger.error('load {} fail'.format(EXPAND_CFG_PATH))
        return 1

    # Execute the phases one by one.
    for phase_name in PHASE_NAME:
        print("Current step: {}".format(phase_name))
        dict_phase = cfg[phase_name]
        # Restore the global from the saved phase data first, because a
        # phase that already ran will not initialise it again this time.
        if 'volname' in dict_phase:
            task_volume_id = dict_phase['volname']

        # Already done: nothing to do.
        if dict_phase[STRING_STATUS] == STATUS_SUCC:
            print("pass")
            continue

        # Attempted before but not successful: roll that phase back first.
        if dict_phase[STRING_STATUS] != STATUS_NONE:
            func = getattr(sys.modules[__name__], 'rollback_{}'.format(phase_name))
            ret = func(dict_phase)
            if not ret:
                return 1

            save_phase_status(cfg, dict_phase, STATUS_NONE)

        save_phase_status(cfg, dict_phase, STATUS_DOING)

        # Diagnostic output: the module the phase handlers are looked up in.
        print(sys.modules[__name__])
        func = getattr(sys.modules[__name__], 'execute_{}'.format(phase_name))
        ret = func(dict_phase)
        if not ret:
            save_phase_status(cfg, dict_phase, STATUS_FAIL)
            return 1

        save_phase_status(cfg, dict_phase, STATUS_SUCC)
        print("success")

    _, err, ret = do_cmd_std("gluster volume reset {} performance.wcc-delay-time &>/dev/null".format(task_volume_id))
    if ret:
        logger.error("volume reset {} performance.wcc-delay-time 0 failed : {}".format(task_volume_id, err))
        return 1

    # Archive this task's backups and configuration under timestamped names.
    now_time = int(time.time())
    archive_targets = (
        (DISK_MODIFY_BAK_PATH, "/sf/cfg/vs/bak_modify_disk_{}".format(now_time)),
        (DISK_EXPAND_BAK_PATH, "/sf/cfg/vs/bak_expand_disk_{}".format(now_time)),
        (EXPAND_CFG_PATH, "{}_{}".format(EXPAND_CFG_PATH, now_time)),
        (EXPAND_OP_DATA, "{}_{}".format(EXPAND_OP_DATA, now_time)),
    )
    for src_path, dst_path in archive_targets:
        if os.path.exists(src_path):
            os.rename(src_path, dst_path)

    # Work around the 3.0.6 enter-maintenance-mode bug.
    if judge_vs_version("3.0.6") and not judge_vs_version("3.0.6R2"):
        # Find the fault_id of the leftover entry in the fault tree.
        zkclient = zk.zk_op(log=logger, compress_on=DTS_COMPRESS_ON)
        path = '/volumes/{}/dts/fmc_fault_tree.csv'.format(task_volume_id)
        fault_tree = StringIO.StringIO(zkclient.read(path))
        logger.info("volume {} \nfault {}".format(task_volume_id, fault_tree.getvalue()))
        fault_id = ''
        for row in csv.DictReader(fault_tree):
            if row["host"] == LOCALHOST and row["task_type"] == "rebuild" and row["fault_type"] == "host":
                fault_id = row['fault_id']
                break
        # Cancel the failed rebuild task on the volume master.
        if fault_id != '':
            master = get_vol_master(task_volume_id)
            cmd = "vsmgr dts-sched cancel-tasks --fault_id {}".format(fault_id)
            try:
                remote_check_output(master, cmd)
            except Exception as e:
                logger.error("vsmgr dts-sched cancel-tasks --fault_id {} failed : {}".format(fault_id, e))
                return 1

    print("all success")
    print("\033[31mEDS版本请退出chroot，在宿主机上执行“python /root/vs-env/sf/vs/bin/update_disk_group.py”！！！\033[0m")
    return 0


def unitest():
    """Manually run the workflow steps in sequence, without saved state.

    Development helper that is not called by main().  Each step receives an
    empty phase dict.  The process exits with status 1 as soon as a checked
    step fails and with status 0 after the last step.
    """
    global task_volume_id
    # Set the volume under test here when needed, e.g.:
    # task_volume_id = "e41feb25_vs_vol_rep2"
    check_environment()

    if not execute_add_ssd({}):
        print("add ssd failed")
        sys.exit(1)

    print("start entry protect")
    if not execute_entry_protect({}):
        print("enter failed")
        sys.exit(1)

    print("start kickout data")
    if not execute_kickout_ssd_data({}):
        print("kickout failed")
        sys.exit(1)

    print("start leave protect")
    if not execute_leave_protect({}):
        print("leave protect error")
        sys.exit(1)

    # The result of the last step is not checked.
    print("start modify cfg")
    execute_modify_cfg({})
    sys.exit(0)


if __name__ == '__main__':
    # Script entry point: the return value of main() becomes the exit status.
    # For manual step-by-step debugging, call unitest() here instead.
    exit_status = main()
    sys.exit(exit_status)
