#!/sf/vs/bin/python
# -*- coding:utf-8 -*-

"""
## LV坏道修复工具
"""
import os
import re
import sys
import socket
import traceback
import pylib.utils.utiltools as common


def try_bad_blocks_repair(bad_blocks_gfid, bad_blocks_lv_index, dst_lv_dev,
                          src_brick, dst_brick, offset_block, block_size, is_forced):
    """Overwrite one bad block of the destination LV with the replica's data.

    The position of the block inside the file is computed from the SHARD lines
    of the file's gfid entry on the destination brick. That file offset is then
    mapped to an LV and an in-LV offset using the SHARD lines of the same entry
    on the source brick. The copy is a single ``dd`` block piped over ssh from
    the source host to the destination host.

    Args:
        bad_blocks_gfid: dashed gfid of the file owning the LV.
        bad_blocks_lv_index: LV index of ``dst_lv_dev``; compared with field 3
            (':'-separated) of each SHARD line.
        dst_lv_dev: device path of the LV that contains the bad block.
        src_brick: brick dict (reads 'host' and 'path') holding the good copy.
        dst_brick: brick dict (reads 'host' and 'path') owning ``dst_lv_dev``.
        offset_block: number of the bad block inside ``dst_lv_dev``, counted in
            ``block_size`` units.
        block_size: block size in bytes.
        is_forced: when false, the operator is asked to confirm the dd command.

    Returns:
        0 once the dd command has been issued; -1 when the destination LV index
        or the source LV cannot be resolved from the SHARD tables, or when the
        destination range cannot be confirmed as a bad block.
    """
    shard_unit = 128 * 1024 * 1024  # field 5 of a SHARD line is counted in 128MB units
    dst_lv_offset = offset_block * block_size
    dst_file_offset = dst_lv_offset
    gfid_path = '.glusterfs/{}/{}/{}'.format(bad_blocks_gfid[0:2], bad_blocks_gfid[2:4], bad_blocks_gfid)

    # Destination side: add up the sizes of the LVs listed before the bad LV to
    # turn the in-LV offset into an offset inside the file.
    dst_local_path = os.path.join(dst_brick['path'], gfid_path)
    cmdline = '/bin/cat {}'.format(dst_local_path)
    dst_lv_found = False
    for line in common.remote_cli(dst_brick['host'], cmdline, True):
        if not line:
            continue
        fields = line.split(':')
        if fields[0] != 'SHARD':
            continue
        if int(fields[3]) == bad_blocks_lv_index:
            dst_lv_found = True
            break
        dst_file_offset += int(fields[5]) * shard_unit
    if not dst_lv_found:
        # Without the LV in the table the accumulated offset is meaningless;
        # copying from it could write unrelated data over the destination.
        common.logger.error('failed to find lv index: {} in file: {} on host: {}'.format(
            bad_blocks_lv_index, dst_local_path, dst_brick['host']))
        return -1

    # Source side: walk the LVs in order until the file offset falls inside one.
    src_lv_dev = ''
    src_file_offset = dst_file_offset
    src_local_path = os.path.join(src_brick['path'], gfid_path)
    cmdline = '/bin/cat {}'.format(src_local_path)
    for line in common.remote_cli(src_brick['host'], cmdline, True):
        if not line:
            continue
        fields = line.split(':')
        if fields[0] != 'SHARD':
            continue
        lv_size = int(fields[5]) * shard_unit
        if src_file_offset >= lv_size:
            src_file_offset -= lv_size
            continue
        disk_vg = fields[2]  # VG the space was borrowed from
        src_vg = src_brick['path'].split('/')[5]  # VG that hosts the source brick
        lv_index = int(fields[3])
        src_lv_dev = '/dev/{}/{}_{}.{}'.format(disk_vg, src_vg, bad_blocks_gfid.replace('-', ''), lv_index)
        break
    if not src_lv_dev:
        # Running dd with an empty "if=" would copy nothing yet report success.
        common.logger.error('failed to find source lv for file offset: {} in file: {} on host: {}'.format(
            dst_file_offset, src_local_path, src_brick['host']))
        return -1

    # Only repair when the destination range is confirmed to be a bad block.
    if not common.check_dev_has_badblock(dst_brick['host'], dst_lv_dev, dst_lv_offset, block_size):
        common.logger.error('failed to check badblock, dev: {}, offset: {}'.format(dst_lv_dev, dst_lv_offset))
        print(common.Colored().red('块设备: {}, 偏移: {} 无法确认该区间是真坏道, 请重试修复该坏道区间'.format(dst_lv_dev, dst_lv_offset)))
        return -1
    # NOTE(review): presumably records the block checksum before the repair - confirm
    common.calc_dev_block_md5(dst_brick['host'], dst_lv_dev, dst_lv_offset, block_size)

    # Convert the source byte offset to a block number (floor division keeps it an integer).
    src_offset_blocks = src_file_offset // block_size
    cmdline = ('/usr/bin/ssh root@{} "/bin/dd if={} bs={} count=1 skip={} iflag=direct conv=notrunc" | '
               '/usr/bin/ssh root@{} "/bin/dd of={} bs={} count=1 seek={} oflag=direct conv=notrunc"').format(
        common.get_ssh_host(src_brick['host']), src_lv_dev, block_size, src_offset_blocks,
        common.get_ssh_host(dst_brick['host']), dst_lv_dev, block_size, offset_block)

    # Ask for confirmation unless the caller already obtained it.
    if not is_forced:
        readline = '是否执行坏道修复，修复命令:{}，输入\'y\'继续, \'n\'退出'.format(cmdline)
        common.check_terminal_input(readline)

    common.logger.info('try to cmdline: {}'.format(cmdline))
    common.cli(cmdline, False)
    # NOTE(review): presumably records the block checksum after the repair - confirm
    common.calc_dev_block_md5(dst_brick['host'], dst_lv_dev, dst_lv_offset, block_size)
    return 0

def get_lv_size(brick, lv_dev):
    """Return the size of ``lv_dev`` in bytes as reported by lsblk on the brick's host.

    Args:
        brick: brick dict; only 'host' is read.
        lv_dev: device path passed to ``lsblk``.

    Returns:
        The first all-digit value found in the lsblk output as an int, or 0
        when no such line exists.
    """
    cmdline = '/bin/lsblk -b {} -o SIZE'.format(lv_dev)
    for line in common.remote_cli(brick['host'], cmdline, True):
        # lsblk may pad the SIZE column with spaces; strip so that a padded
        # number is still recognised (the 'SIZE' header is rejected by isdigit).
        text = line.strip() if line else ''
        if text.isdigit():
            return int(text)
    return 0

def _finish_lv_repair(bad_blocks_tier, cleanup_hosts, offsets_path,
                      has_force_clean_wcache, volume_name, src_brick):
    """Remove the temporary bad-block list on each host, then roll back the source brick."""
    cmdline = '/bin/rm -f {}'.format(offsets_path)
    for host in cleanup_hosts:
        common.logger.info('try to host: {}, cmdline: {}'.format(host, cmdline))
        try:
            common.remote_cli(host, cmdline)
        except Exception:
            # Deleting the temp file is best-effort; a failure here must never
            # prevent the source brick rollback below.
            common.logger.warn('failed to host: {}, cmdline: {}, got except:{}'.format(
                host, cmdline, traceback.format_exc()))
    bad_blocks_tier.post_src_brick(has_force_clean_wcache, volume_name, False, src_brick)


def lv_data_repair(lv_dev, version, volume_name, has_arbiter, replicate_num, replicate, hosts):
    """Scan one LV for bad blocks and repair each of them from the replica brick.

    Args:
        lv_dev: LV device path of the form ``/dev/<vg>/<vg>_<gfid32>.<index>``.
        version: storage version, forwarded to the bad_blocks_tier helpers.
        volume_name: name of the volume the LV belongs to.
        has_arbiter: unused here; kept for interface compatibility.
        replicate_num: unused here; kept for interface compatibility.
        replicate: replicate description forwarded to ``get_file_replicate_bricks``.
        hosts: cluster hosts.

    Returns:
        0 when every bad block was repaired, -1 otherwise.

    Raises:
        Whatever ``common.check_terminal_input`` raises when the operator
        declines the large-device prompt (that call is outside the try block).
    """
    common.logger.info('try to lv_data_repair, lv_dev: {}'.format(lv_dev))
    bad_block_size = 4096  # default bad-block granularity
    color = common.Colored()

    # LV name layout: <vg>_<gfid without dashes>.<lv index>
    bad_blocks_lv = lv_dev.split('/')[3]
    bad_blocks_vg = bad_blocks_lv.split('_')[0]
    bad_blocks_gfid = bad_blocks_lv.split('_')[1]
    bad_blocks_lv_index = int(bad_blocks_lv.split('.')[1])
    bad_blocks_gfid = '{}-{}-{}-{}-{}'.format(bad_blocks_gfid[0:8], bad_blocks_gfid[8:12], bad_blocks_gfid[12:16],
                                              bad_blocks_gfid[16:20], bad_blocks_gfid[20:32])

    online_bricks = common.get_online_bricks(volume_name)
    if common.fault_point_result() or not online_bricks:
        common.logger.error('failed to supported, cannot find online_bricks')
        return -1

    # Resolve the replica group of the file.
    import modules.bad_blocks.bad_blocks_tier as bad_blocks_tier
    replicate_bricks = bad_blocks_tier.get_file_replicate_bricks(version, bad_blocks_gfid, hosts, replicate, online_bricks)
    if not replicate_bricks:
        common.logger.error('failed to get replicate_bricks')
        return -1

    # The damaged brick is the non-arbiter brick whose path contains the LV's VG.
    bad_block_brick_id = -1
    for brick in replicate_bricks:
        if not brick['arbiter'] and bad_blocks_vg in brick['path']:
            bad_block_brick_id = brick['id']
            break
    if bad_block_brick_id == -1:
        common.logger.error('failed to get bad_block_brick_id')
        return -1
    src_brick, dst_brick = bad_blocks_tier.get_src_and_dst_brick(replicate_bricks, None, bad_block_brick_id)
    if not src_brick or not dst_brick:
        common.logger.error('failed to get src_brick: {} or dst_brick: {}'.format(src_brick, dst_brick))
        return -1

    # Warn about the business impact.
    bad_blocks_tier.check_vm_is_stopped(version, bad_blocks_gfid, volume_name, hosts)

    # Announce the device being repaired.
    lv_size = get_lv_size(dst_brick, lv_dev)
    print(color.cyan('开始修复块设备: {}(大小:{}) 上的坏道'.format(lv_dev, common.to_human_readable(lv_size))))
    biggest_lv_size = 128*1024*1024*1024  # devices of 128GB or more need R&D sign-off before continuing
    if lv_size >= biggest_lv_size:
        readline = '块设备大小大于{}，需要找研发协助确认是否可以继续处理，输入\'y\'继续，\'n\'退出'.format(common.to_human_readable(biggest_lv_size))
        common.check_terminal_input(readline)

    localhost = socket.gethostname()
    bad_blocks_offsets_path = '/root/{}_badblocks.txt'.format(common.vsfire_recovery_dir)  # default scan result file

    # Scan on the destination host; when that is a remote host, also copy the result back here.
    cmdline = '/sbin/badblocks -b {} -o {} {}'.format(bad_block_size, bad_blocks_offsets_path, lv_dev)
    if localhost != dst_brick['host']:
        cmdline += '; /usr/bin/scp -r root@{}:{} root@{}:{};'.format(
            dst_brick['host'], bad_blocks_offsets_path,
            localhost, bad_blocks_offsets_path)
    bad_blocks_offsets_path, bad_blocks_offsets = bad_blocks_tier.try_scan_or_get_badblock_offset(dst_brick['host'], cmdline, bad_blocks_offsets_path, bad_block_size)
    if not bad_blocks_offsets:
        common.logger.error('failed to get bad_blocks_offsets in file: {}'.format(bad_blocks_offsets_path))
        return -1

    result = 0
    has_force_clean_wcache = True
    try:
        # Prepare the source brick before repairing.
        has_force_clean_wcache = bad_blocks_tier.prepare_src_brick(volume_name, bad_blocks_gfid, False, src_brick)

        # The first repair needs manual confirmation because the computed
        # offsets have to be checked by the operator.
        is_forced = False
        for offset in bad_blocks_offsets:
            if try_bad_blocks_repair(bad_blocks_gfid, bad_blocks_lv_index, lv_dev,
                                     src_brick, dst_brick, offset, bad_block_size, is_forced):
                result = -1
            else:
                # One successful repair shows the computed offsets are sound,
                # so later blocks no longer need a confirmation.
                is_forced = True
    except Exception:
        # KeyboardInterrupt/SystemExit are deliberately not swallowed; the
        # finally clause still rolls the source brick back before they propagate.
        common.logger.error('failed to bad_blocks_lv: {}, got except:{}'.format(lv_dev, traceback.format_exc()))
        result = -1
    finally:
        # Always remove the temp files and roll the source brick back.
        _finish_lv_repair(bad_blocks_tier, (localhost, dst_brick['host']), bad_blocks_offsets_path,
                          has_force_clean_wcache, volume_name, src_brick)
    return result


def get_badblocks_from_log(version, hosts):
    """Collect LV names that reported read I/O errors in the brick logs.

    On every host the last 1M of each brick log is searched for
    ``read_one_shard`` lines carrying "Input/output error"; the 7th
    whitespace-separated field of such a line is taken as ``<vg>/<lv>``.

    Args:
        version: storage version; selects the log directory and grep pattern.
        hosts: hosts whose brick logs are searched.

    Returns:
        List of ``<vg>/<lv>`` strings that look like LV names. Hosts whose
        command fails with ``common.CmdError`` are skipped with a warning.
    """
    if version <= common.VS_VERSION_2_6:
        log_dir = '/sf/log/vs/glusterfs/log/glusterfs/bricks'
        grep_pattern = ':read_one_shard].*Input/output error'
    else:
        log_dir = '/sf/log/today/vs/log/glusterfs/bricks'
        grep_pattern = ':read_one_shard].*,Input/output error,'
    # Raw strings keep the shell escapes (\; and \$) literal without relying on
    # Python passing unknown escape sequences through.
    cmdline = ''.join([
        '/usr/bin/find ', log_dir, ' -type f -name "*.log" ',
        r'-exec /usr/bin/tail -c 1M {} \; | ',
        '/bin/grep "', grep_pattern, '" | ',
        r'/usr/bin/awk "{print \$7}" | /usr/bin/sort -u',
    ])

    # <vg>/<vg>_<gfid32>.<index>; the dot is escaped because later parsing
    # splits the LV name on a literal '.'.
    lv_pattern = re.compile(r"\w{6}-\w{4}-\w{4}-\w{4}-\w{4}-\w{4}-\w{6}/"
                            r"\w{6}-\w{4}-\w{4}-\w{4}-\w{4}-\w{4}-\w{6}_\w{32}\.\d+")

    badblocks_lvs = []
    for host in hosts:
        try:
            result = common.remote_cli(host, cmdline, True)
            badblocks_lvs.extend(line for line in result if lv_pattern.match(line))
        except common.CmdError:
            # A host whose logs cannot be read is ignored.
            common.logger.warn('failed to host: {}, cmdline: {}'.format(host, cmdline))
    return badblocks_lvs

def check_impact_vms(version, badblocks_lvs, volume_name, hosts):
    """Print the files and virtual machines affected by the given bad-block LVs.

    Args:
        version: storage version; from ``common.VS_VERSION_3_0`` on the route
            service resolves gfids, otherwise bad_blocks_tier does.
        badblocks_lvs: ``<vg>/<vg>_<gfid32>.<index>`` strings.
        volume_name: volume whose shards are listed (route service only).
        hosts: cluster hosts (bad_blocks_tier lookup only).

    Returns:
        None; results are printed to the terminal.
    """
    all_shards = []
    files = []
    vms_name = {}
    for lv in badblocks_lvs:
        gfid_hex = lv.split('/')[1].split('_')[1]
        gfid = '{}-{}-{}-{}-{}'.format(gfid_hex[0:8], gfid_hex[8:12], gfid_hex[12:16], gfid_hex[16:20], gfid_hex[20:32])
        if version >= common.VS_VERSION_3_0:
            import pylib.rpcservice as rpcservice
            if not all_shards:
                # Listed once and reused for every LV.
                all_shards = rpcservice.route_list_all_shards(volume_name)
            base_file_path = rpcservice.route_gfid_to_file_path(gfid, all_shards)
        else:
            import modules.bad_blocks.bad_blocks_tier as bad_blocks_tier
            base_file_path = bad_blocks_tier.get_file_path_from_gfid(gfid, hosts)
        if not base_file_path:
            # The gfid could not be mapped to a file, so there is no path to
            # look a virtual machine up with either.
            continue
        files.append(base_file_path)
        vms_info = common.files_path_to_vms_name([base_file_path])
        if vms_info:
            vmid, vm_name = next(iter(vms_info.items()))
            vms_name[vmid] = vm_name

    color = common.Colored()
    if files:
        # One file per line, de-duplicated, in a stable order.
        file_msg_print = '\n'.join(sorted(set(files)))
        print(color.red('\n找到坏道文件:'))
        print(color.cyan('{}'.format(file_msg_print)))
    if vms_name:
        # One "<id> (<name>) " entry per line.
        vm_msg_print = '\n'.join('{} ({}) '.format(vm_id, vm_name)
                                 for vm_id, vm_name in vms_name.items())
        print(color.red('\n找到坏道虚拟机:'))
        print(color.cyan('{}'.format(vm_msg_print)))


def _lv_data_repair_start(lv_dev):
    """Validate the environment and repair one given LV or every LV found in the logs.

    Args:
        lv_dev: LV device path to repair; when empty, the brick logs are
            searched for LVs with I/O errors and each of them is repaired
            after operator confirmation.

    Returns:
        0 on success, -1 when the environment is unsupported, the argument is
        not an LV device path, nothing was found, or any repair failed.

    Raises:
        common.CmdError: re-raised from a repair unless its message contains
            'Manual check cancellation'.
    """
    version = common.get_vs_version()
    if not common.is_vs_version_valid(version):
        return -1

    volume_name, hosts, replicate_num, has_arbiter, bricks, replicate = common.get_vs_cluster_info()
    if not volume_name:
        common.logger.error('failed to supported, cannot find volume name')
        return -1

    # Environments with EFS are not supported.
    if common.vs_has_efs(version, hosts):
        common.logger.error('failed to supported, version: {}'.format(version))
        return -1

    if not lv_dev:
        # No argument: discover the damaged LVs from the storage logs.
        readline = '是否搜索存储日志，获取存在坏道的块设备，输入\'y\'继续，\'n\'退出'
        common.check_terminal_input(readline)

        badblocks_lvs = get_badblocks_from_log(version, hosts)
        if not badblocks_lvs:
            print(common.Colored().red('找不到存在坏道的块设备'))
            return -1

        # Show every affected file and virtual machine.
        check_impact_vms(version, badblocks_lvs, volume_name, hosts)

        readline = '是否开始修复找到的块设备上的坏道，输入\'y\'继续，\'n\'退出'
        common.check_terminal_input(readline)

        result = 0
        for lv in badblocks_lvs:
            try:
                if lv_data_repair('/dev/{}'.format(lv), version, volume_name,
                                  has_arbiter, replicate_num, replicate, hosts):
                    result = -1
            except common.CmdError as e:
                if 'Manual check cancellation' not in str(e):
                    raise
                # The operator declined this LV; carry on with the next one.
                result = -1
        return result

    # An explicit argument must be exactly an LV device path. The dot is
    # escaped and the pattern anchored because the LV name is later split on
    # '.' and its index converted with int().
    if not re.match(r"/dev/\w{6}-\w{4}-\w{4}-\w{4}-\w{4}-\w{4}-\w{6}/"
                    r"\w{6}-\w{4}-\w{4}-\w{4}-\w{4}-\w{4}-\w{6}_\w{32}\.\d+$", lv_dev):
        common.logger.error('failed to supported, lv_dev: {}'.format(lv_dev))
        return -1

    return lv_data_repair(lv_dev, version, volume_name, has_arbiter, replicate_num, replicate, hosts)


def _lv_data_repair(lv_dev):
    """Run the LV repair while holding the vsfire lock and print the outcome.

    Args:
        lv_dev: LV device path, forwarded to ``_lv_data_repair_start``.

    Returns:
        The value returned by ``_lv_data_repair_start``.
    """
    lock_path = common.get_vsfire_lock_file()
    with common.VsfireFlock(lock_path):
        status = _lv_data_repair_start(lv_dev)
        palette = common.Colored()
        # A truthy status means failure (reported in red), otherwise success (cyan).
        banner = palette.red('执行失败') if status else palette.cyan('执行成功')
        print(banner)
        return status
