% Conference paper (KDD 2024): FaultInsight, host fault diagnosis for data centers.
% The citation key is the exporter-generated identifier and is kept unchanged so
% existing citations of this entry continue to resolve.
% The export-generated "note" was split into eventtitle / eventdate / copyright
% so the copyright line and conference dates are no longer printed as a free-form
% note; styles that do not know these fields ignore them.
@inproceedings{7d47e90c7ce54270a5c4d919fa0278b9,
  author     = {Bi, Tingzhu and Yang, Zhang and Pan, Yicheng and Zhang, Yu and Ma, Meng and Jiang, Xinrui and Han, Linlin and Wang, Feng and Liu, Xian and Wang, Ping},
  title      = {{FaultInsight}: Interpreting Hyperscale Data Center Host Faults},
  booktitle  = {{KDD} 2024 - Proceedings of the 30th {ACM} {SIGKDD} Conference on Knowledge Discovery and Data Mining},
  series     = {Proceedings of the {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
  publisher  = {Association for Computing Machinery},
  pages      = {141--152},
  year       = {2024},
  month      = aug,
  day        = {25},
  doi        = {10.1145/3637528.3672051},
  keywords   = {causal discovery, data center, fault diagnosis},
  language   = {English},
  abstract   = {Operating and maintaining hyperscale data centers involving millions of service hosts has been an extremely intricate task to tackle for top Internet companies. Incessant system failures cost operators countless hours of browsing through performance metrics to diagnose the underlying root cause to prevent the recurrence. Although many state-of-the-art (SOTA) methods have used time-series causal discovery to construct causal relationships among anomalous metrics, they only focus on homogeneous service-level performance metrics and fail to yield useful insights on heterogeneous host-level metrics. To address the challenge, this study presents FaultInsight, a highly interpretable deep causal host fault diagnosing framework that offers diagnostic insights from various perspectives to reduce human effort in troubleshooting. We evaluate FaultInsight using dozens of incidents collected from our production environment. FaultInsight provides markedly better root cause identification accuracy than SOTA baselines in our incident dataset. It also shows outstanding advantages in terms of deployability in real production systems. Our engineers are deeply impressed by FaultInsight's ability to interpret incidents from multiple perspectives, helping them quickly understand the mechanism behind the faults.},
  eventtitle = {30th {ACM} {SIGKDD} Conference on Knowledge Discovery and Data Mining, {KDD} 2024},
  eventdate  = {2024-08-25/2024-08-29},
  copyright  = {{\textcopyright} 2024 Copyright held by the owner/author(s). Publication rights licensed to ACM.},
}