#!/usr/bin/env python3
# Copyright (C) 2022 Checkmk GmbH - License: GNU General Public License v2
# This file is part of Checkmk (https://checkmk.com). It is subject to the terms and
# conditions defined in the file COPYING, which is part of this source code package.

import glob
import os
import random
import subprocess
import sys
import time
import traceback
from typing import Any, Literal

from omdlib.utils import get_site_distributed_setup, SiteDistributedSetup

from cmk.utils.hostaddress import HostName
from cmk.utils.paths import omd_root, var_dir
from cmk.utils.render import fmt_bytes

# Command line switches, evaluated once when the module is loaded.
# "-v": print verbose messages.
# "-f": run the free space cleanup even if the free space is already above
#       the configured threshold.
opt_verbose = "-v" in sys.argv
opt_force = "-f" in sys.argv

# Site configuration file and the directories the plugin files are loaded from.
# A file in the local directory replaces a shipped file of the same name.
config_file = omd_root / "etc/diskspace.conf"
plugin_dir = omd_root / "share/diskspace"
plugin_dir_local = omd_root / "local/share/diskspace"

# Configuration variables. They are module globals on purpose: _read_config()
# executes the configuration file in this module's namespace, so assignments
# made in that file override the values below. None means "feature disabled".
# Ages are given in seconds.
min_free_bytes: int | None = None
max_file_age: int | None = None
min_file_age: int | None = None
cleanup_abandoned_host_files: int | None = None

# Initial configuration, applied by _read_config() before the configuration
# file is executed.
default_config: dict[str, int] = {
    "cleanup_abandoned_host_files": 2592000,  # 30 days in seconds
}

# Maps the plugin file name to the namespace that plugin file was executed in.
plugins: dict[str, dict[str, Any]] = {}


def _error(message: str) -> None:
    """Write *message* to stderr as a single line prefixed with ``ERROR: ``."""
    sys.stderr.write(f"ERROR: {message}\n")


def _terminate(message: str) -> None:
    """Report *message* as an error and leave the process with exit status 1."""
    _error(message)
    raise SystemExit(1)


def _log(message: str) -> None:
    """Write *message* to stdout, terminated by a newline."""
    sys.stdout.write(f"{message}\n")


def _verbose(message: str) -> None:
    """Log *message* only when verbose output ("-v") was requested."""
    if not opt_verbose:
        return
    _log(message)


def _read_config() -> None:
    """
    Load the configuration into the module globals.

    The defaults are applied first, then the configuration file is executed
    in this module's namespace so its assignments override the module level
    configuration variables. Any OSError (typically: the file does not exist)
    is ignored, every other exception terminates the program.
    """
    module_namespace = globals()
    module_namespace.update(default_config)

    try:
        source = config_file.read_text()
        exec(source, module_namespace, module_namespace)  # nosec B102 # BNS:aee528
    except OSError:
        # Ignore a non-existent config, the defaults stay in effect
        return
    except Exception as exc:
        _terminate(f"Invalid configuration: {exc}")


def _print_config() -> None:
    """
    Print the effective settings as verbose messages: one line for each of
    the three cleanup mechanisms (abandoned host files, file age, free space).
    """
    _verbose("Settings:")

    _verbose(
        "  Not cleaning up abandoned host files."
        if cleanup_abandoned_host_files is None
        else "  Cleaning up abandoned host files older than %d seconds."
        % int(cleanup_abandoned_host_files)
    )

    _verbose(
        "  Not cleaning up files by age."
        if max_file_age is None
        else "  Cleanup files that are older than %d seconds." % int(max_file_age)
    )

    # The free space based cleanup needs both settings to be configured
    if min_free_bytes is not None and min_file_age is not None:
        _verbose(
            "  Cleanup files till %s are free while not deleting files "
            "older than %d seconds" % (fmt_bytes(min_free_bytes), min_file_age or 0)
        )
    else:
        _verbose("  Not cleaning up files by free space left.")


def _resolve_paths() -> None:
    """
    Replace the "cleanup_paths" patterns of every plugin by the paths they
    match. Plugins whose patterns match nothing lose their "cleanup_paths"
    entry.
    """
    for plugin in plugins.values():
        matches: list[str] = []
        for pattern in plugin.get("cleanup_paths", []):
            # Patterns not starting with "/" are relative to the site directory
            absolute = pattern if pattern[0] == "/" else str(omd_root / pattern)

            # Expand the pattern, keeping only what glob actually finds
            matches.extend(glob.glob(absolute))

        if matches:
            plugin["cleanup_paths"] = matches
        else:
            plugin.pop("cleanup_paths", None)


def _load_plugins() -> None:
    """
    Execute all plugin files and register their namespaces in ``plugins``.

    Files from the local plugin directory replace shipped files of the same
    name. Files whose name starts with a dot are skipped. A plugin that
    raises while being executed is reported as an error, loading continues.
    Finally the path patterns of all plugins are resolved.
    """
    try:
        local_names: list[str] = [entry.name for entry in plugin_dir_local.iterdir()]
    except OSError:
        local_names = []  # the local directory is optional

    shipped_names: list[str] = [
        entry.name for entry in plugin_dir.iterdir() if entry.name not in local_names
    ]

    for base_dir, names in ((plugin_dir, shipped_names), (plugin_dir_local, local_names)):
        for name in names:
            if name.startswith("."):
                continue

            # Each plugin gets its own namespace, registered before execution
            namespace: dict[str, Any] = {}
            plugins[name] = namespace

            plugin_path = base_dir / name
            _verbose(f"Loading plugin: {plugin_path}")
            try:
                exec(plugin_path.read_text(), namespace, namespace)  # nosec B102 # BNS:aee528
            except Exception as exc:
                _error(f'Exception while loading plugin "{plugin_path}": {exc}')

    # Now transform all path patterns to absolute paths of existing files
    _resolve_paths()


def _collect_file_infos() -> None:
    """
    Record size and modification time of every cleanup path of every plugin.

    The result is stored per plugin in ``plugin["file_infos"]`` as a mapping
    of path to ``(size in bytes, mtime)``. The key is only created for
    plugins with at least one path that could be inspected.

    Paths that cannot be inspected are skipped instead of aborting the whole
    run: the globbing done earlier also returns dangling symlinks, and a file
    may have been removed by someone else in the meantime.
    """
    for plugin in plugins.values():
        for path in plugin.get("cleanup_paths", []):
            try:
                result: os.stat_result = os.stat(path)
            except OSError as e:
                _verbose(f"Skipping {path}: {e}")
                continue

            plugin.setdefault("file_infos", {})[path] = (result.st_size, result.st_mtime)


def _get_free_space():
    """
    Return the number of free bytes on the file system holding the site.

    Runs ``df -P -B1`` on the site directory and returns the value of the
    "Available" column of the first data line as an int. Returns None when
    the output contains no such line. The function is intentionally left
    without a return annotation because the callers treat the result as int.

    The device name is not inspected: it does not start with "/" for e.g.
    tmpfs, ZFS datasets or NFS mounts, and indexing into the empty trailing
    line of the output must be avoided.
    """
    # FIXME: Take possible root reserved space into account
    output = subprocess.check_output(
        [
            "df",
            "-P",
            "-B1",
            omd_root,
        ],
        encoding="utf-8",
    )

    for line in output.splitlines():
        # Columns: file system, size, used, available, capacity, mount point
        fields = line.split()
        # The header line is skipped because its fourth column is not a number
        if len(fields) >= 6 and fields[3].isdigit():
            return int(fields[3])
    return None


def _above_threshold(bytes_free: int) -> bool:
    """Tell whether *bytes_free* reaches the configured ``min_free_bytes``."""
    assert min_free_bytes is not None
    return min_free_bytes <= bytes_free


def _delete_file(path: str, reason: str) -> bool:
    """
    Delete the file *path* and log the deletion together with *reason*.

    For a path ending in ".rrd" the sibling ".info" file is deleted as well,
    if it exists. Returns True on success. Any exception is reported as an
    error naming the file that was being deleted, and False is returned.
    """
    target = path
    try:
        _log(f"Deleting file ({reason}): {target}")
        os.unlink(target)

        # Also delete any .info file which is connected to the rrd file
        if target.endswith(".rrd"):
            target = target[:-3] + "info"
            if os.path.exists(target):
                _log(f"Deleting file ({reason}): {target}")
                os.unlink(target)
    except Exception as exc:
        _error(f"Error while deleting {target}: {exc}")
        return False
    return True


def _delete_files_and_base_directory(path: str, reason: str) -> bool:
    """
    Remove all entries directly below the directory *path* and then the
    directory itself. Sub directories are not descended into: they make the
    deletion fail.

    Returns True on success. Any exception is reported as an error and
    False is returned.
    """
    try:
        _log(f"Deleting directory and files ({reason}): {path}")
        for entry in os.listdir(path):
            os.unlink(f"{path}/{entry}")
        os.rmdir(path)
    except Exception as exc:
        _error(f"Error while deleting {path}: {exc}")
        return False
    return True


def _oldest_candidate(file_infos: dict) -> str | None:
    """
    Return the path with the smallest modification time in *file_infos*
    (path -> (size, mtime)), provided it was modified more than
    ``min_file_age`` seconds ago. Otherwise, or without any file, return None.
    """
    assert min_file_age is not None
    if not file_infos:
        return None

    # Pick the entry with the smallest modification time
    oldest_path, oldest_info = min(file_infos.items(), key=lambda item: item[1][1])
    cutoff = time.time() - min_file_age
    return oldest_path if oldest_info[1] < cutoff else None


def _cleanup_host_directory_for_local_hosts(
    cleanup_hosts: set[HostName], base_path: str
) -> list[HostName]:
    """
    Collect all entries below *base_path* that are not named like one of
    *cleanup_hosts* and hand them over to the age check and deletion.
    Returns the names that have been cleaned up, or an empty list when
    *base_path* is not a directory.
    """
    if not os.path.isdir(base_path):
        return []

    unrelated_dirs: list[str] = [
        entry for entry in os.listdir(base_path) if entry not in cleanup_hosts
    ]
    return _check_threshold_and_delete(unrelated_dirs, base_path)


def _cleanup_host_directory_for_remote_hosts(cleaned_up_remote_hosts: set, base_path: str) -> list:
    """
    Collect all entries below *base_path* that are named like one of
    *cleaned_up_remote_hosts* and hand them over to the age check and
    deletion. Returns the names that have been cleaned up, or an empty list
    when *base_path* is not a directory.
    """
    if not os.path.isdir(base_path):
        return []

    matching_dirs = [entry for entry in os.listdir(base_path) if entry in cleaned_up_remote_hosts]
    return _check_threshold_and_delete(matching_dirs, base_path)


def _check_threshold_and_delete(unrelated_dirs: list[str], base_path: str) -> list:
    """
    Determine the newest modification time below each of the given
    directories. When it is older than ``cleanup_abandoned_host_files``
    seconds, delete the files of the directory and the directory itself.

    Returns the names of the directories the deletion was attempted for.
    """
    assert cleanup_abandoned_host_files is not None
    cleaned_up_hosts = []
    for name in unrelated_dirs:
        host_path = f"{base_path}/{name}"
        newest: float = _newest_modification_time_in_dir(host_path)
        if newest >= time.time() - cleanup_abandoned_host_files:
            _verbose("Found abandoned host path (but not old enough): %s" % host_path)
            continue

        # NOTE(review): the result of the deletion is not evaluated, the name
        # is reported as cleaned up either way - confirm this is intended.
        _delete_files_and_base_directory(host_path, "abandoned host")
        cleaned_up_hosts.append(name)

    return cleaned_up_hosts


def _do_automation_call(
    hosts_to_cleanup: set,
    automation_call: Literal["delete-hosts", "delete-hosts-known-remote"],
) -> None:
    """
    Run ``check_mk --automation <automation_call>`` for the given hosts.

    A non-zero exit code is reported as an error together with the combined
    stdout/stderr output of the command. No exception is raised for it.
    """
    command: list[str] = ["check_mk", "--automation", automation_call, *hosts_to_cleanup]
    command_line = " ".join(command)
    _verbose('Calling "%s"' % command_line)

    # subprocess.run() reads the pipe while waiting for the child. Waiting
    # with Popen.wait() on a piped stdout can block forever once the child
    # has written more than the pipe buffer holds.
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        check=False,
    )
    if completed.returncode != 0:
        _error(
            'Failed to execute "%s" to cleanup the host files. Exit-Code: %d, Output: %r'
            % (command_line, completed.returncode, completed.stdout)
        )


def _newest_modification_time_in_dir(dir_path: str) -> float:
    """
    Return the largest modification time of the entries directly below
    *dir_path*. The result is never smaller than 0.0, which is also the
    result for an empty directory.
    """
    mtimes = [os.stat(f"{dir_path}/{entry}").st_mtime for entry in os.listdir(dir_path)]
    return max([0.0, *mtimes])


def _get_configured_hosts() -> tuple[set, set, bool]:
    """
    Ask ``check_mk --list-hosts`` for the configured hosts (including
    offline ones).

    Returns a tuple of
    - the hosts of all sites (left empty on a distributed remote site),
    - the hosts of the local site,
    - whether this site is a distributed remote site.
    """

    def list_hosts(*extra_options: str) -> set:
        return set(
            subprocess.check_output(
                ["check_mk", "--list-hosts", *extra_options, "--include-offline"],
                encoding="utf-8",
            ).splitlines()
        )

    is_wato_remote_site = get_site_distributed_setup() == SiteDistributedSetup.DISTRIBUTED_REMOTE

    # Only sites that are not a remote site are asked for the hosts of all sites
    all_hosts: set = set() if is_wato_remote_site else list_hosts("--all-sites")
    local_site_hosts: set = list_hosts()

    return all_hosts, local_site_hosts, is_wato_remote_site


def _do_cleanup_abandoned_host_files() -> None:
    """
    Remove files of hosts that are not configured anymore:
    - Get the configured hosts (including offline ones)
    - Scan the per-host base directories for directories of unknown hosts
    - Delete those whose newest file is older than the configured age
    - Call the Check_MK automation to clean up further files of the hosts
      that files have been deleted for

    Does nothing when the feature is disabled, when the hosts cannot be
    determined or when no local host is found.
    """
    if not cleanup_abandoned_host_files:
        return

    try:
        all_hosts, local_site_hosts, is_wato_remote_site = _get_configured_hosts()
    except subprocess.CalledProcessError as exc:
        _verbose(f"Failed to get site hosts ({exc}). Skipping abandoned host files cleanup")
        return

    if not local_site_hosts:
        _verbose("Found no hosts. Be careful and not cleaning up anything.")
        return

    # A remote site only knows its own hosts, other sites know all hosts
    known_hosts = local_site_hosts if is_wato_remote_site else all_hosts

    # Base directories which contain one sub directory per host with the
    # host related files inside
    host_base_dirs: list[str] = [
        f"{var_dir}/inventory_archive",
        f"{var_dir}/rrd",
        f"{omd_root}/var/pnp4nagios/perfdata",
    ]

    removed_local: set = set()
    for base_dir in host_base_dirs:
        removed_local |= set(_cleanup_host_directory_for_local_hosts(known_hosts, base_dir))

    # Let Check_MK clean up other files of the hosts handled above
    if removed_local:
        _do_automation_call(removed_local, "delete-hosts")

    if not all_hosts:
        return

    # Hosts that still have local files but are only known on remote sites
    only_remote_hosts = all_hosts - local_site_hosts
    removed_remote: set = set()
    for base_dir in host_base_dirs:
        removed_remote |= set(_cleanup_host_directory_for_remote_hosts(only_remote_hosts, base_dir))

    if removed_remote:
        _do_automation_call(removed_remote, "delete-hosts-known-remote")


def _cleanup_aged() -> None:
    """
    Delete every collected file that was modified more than ``max_file_age``
    seconds ago and forget successfully deleted files. Afterwards report the
    free space as a verbose message. Does nothing when ``max_file_age`` is
    not configured.
    """
    if max_file_age is None:
        _verbose("Not cleaning up too old files (not enabled)")
        return

    cutoff: float = time.time() - max_file_age

    for plugin in plugins.values():
        file_infos = plugin.get("file_infos", {})
        # Iterate over a copy, entries are removed while looping
        for path, (_size, mtime) in list(file_infos.items()):
            if mtime >= cutoff:
                _verbose("Not deleting %s" % path)
                continue
            if _delete_file(path, "too old"):
                del file_infos[path]

    free_after: int = _get_free_space()
    _verbose("Free space (after file age cleanup): %s" % fmt_bytes(free_after))


def _cleanup_oldest_files() -> None:
    """
    Free space based cleanup: unless the free space reaches the threshold
    (ignored with "-f"), delete the oldest candidate file of every plugin
    once. Does nothing when ``min_free_bytes`` or ``min_file_age`` is not
    configured.
    """
    if min_free_bytes is None or min_file_age is None:
        _verbose("Not cleaning up oldest files of plugins (not enabled)")
        return

    # Check the free space against the configuration
    free_before: int = _get_free_space()
    if not opt_force and _above_threshold(free_before):
        _verbose(
            "Free space is above threshold of %s. Nothing to be done." % fmt_bytes(min_free_bytes)
        )
        return

    # The cleanup job is supposed to be scheduled at the same time on all
    # sites. A random wait of up to 10 seconds ensures that it is not always
    # the same single site that cleans up.
    sleep_sec = random.randint(0, 10000) / 1000
    _verbose("Sleeping for %0.3f seconds" % sleep_sec)
    time.sleep(sleep_sec)

    # Delete the oldest file of each plugin, given it is older than
    # min_file_age
    for plugin_name, plugin in plugins.items():
        candidate = _oldest_candidate(plugin.get("file_infos", {}))
        if candidate is None:
            continue
        _delete_file(candidate, f"{plugin_name}: my oldest")

    free_after = _get_free_space()
    _verbose("Free space (after min free space space cleanup): %s" % fmt_bytes(free_after))


def main() -> None:
    """
    Run the complete cleanup: print the settings, load the plugins and
    collect their file information, clean up abandoned host files, then
    clean up by file age and finally by free space.
    """
    _print_config()
    _load_plugins()
    _collect_file_infos()

    _do_cleanup_abandoned_host_files()

    # Report the free space of the site's volume before the file cleanups
    free_now = _get_free_space()
    _verbose(f"Free space: {fmt_bytes(free_now)}")

    _cleanup_aged()
    _cleanup_oldest_files()


if __name__ == "__main__":
    # Load the configuration first: main() reads the module globals it sets.
    _read_config()

    try:
        main()
    except SystemExit:  # pylint: disable=try-except-raise
        # Deliberate exits (e.g. from _terminate()) pass through unchanged.
        raise
    except Exception:
        # Everything else is reported with its traceback and ends the
        # program with exit status 1.
        _terminate("Unexpected exception: %s" % traceback.format_exc())
