% Conference paper published in the MEDINFO 2021 proceedings volume
% (series: Studies in Health Technology and Informatics, IOS Press BV).
% The "note" field holds only the conference name and dates, because standard
% styles print "note" verbatim in the reference list. The long text that the
% export had placed in "note" under the label "Funding Information" (it reads
% like the paper's discussion rather than a funding statement - confirm
% against the publication) is preserved in "annote", and the copyright line
% in "copyright"; standard styles do not print either field.
% Percent signs inside field values are written as \% so they cannot start a
% TeX comment when a style copies the field into the bibliography.
@inproceedings{1c962085c6264458aa4b098db0f8f10c,
title = "Exploring Determinants of Longevity of Biomedical Databases",
abstract = "The maintenance of biomedical databases requires ongoing and systematic efforts in keeping them up-to-date which may affect long-term sustainability. Since research has become more reliant on publicly available biomedical data collections, it is important to understand factors affecting their longevity. The aim of this article was to explore potential determinants of biomedical database longevity. To build an analytical dataset, we used Database journal that have been created as an open access platform for presenting biological databases. A stratified analysis of all the original databases published in Database journal between 2009 and 2016 was conducted depending on their accessibility status. Overall, 35\% of 518 analyzed databases were not accessible in 2020. We showed that databases with higher citation counts from institutions with higher scientific output were significantly more likely to be currently accessible. Databases from researchers with higher h-index were more likely to be accessible. Further investigation is warranted to identify factors affecting longevity of high impact databases.",
keywords = "Biomedical databases, longevity, predictive analytics",
author = "Finkelstein, Joseph and Guarino, Jennifer and Huo, Xingyue and Borziak, Kirill and Parvanova, Irena",
note = "18th World Congress on Medical and Health Informatics: One World, One Health - Global Partnership for Digital Innovation, MEDINFO 2021; Conference date: 02-10-2021 Through 04-10-2021",
copyright = "{\textcopyright} 2022 International Medical Informatics Association (IMIA) and IOS Press.",
annote = "We analyzed characteristics of 518 publicly available online databases presented in peer-reviewed articles published between 2009 and 2016 in Database journal devoted to introduction of biomedical databases to scientific community. As of April 1, 2020, 35\% of these databases were not accessible indicating significant attrition of biomedical databases over the time. Significant differences were identified between characteristics of surviving and dead databases. The surviving databases were published by authors with significantly higher h-index from institutions with higher publication output. The surviving databases were more likely to have prior publications and were more frequently cited in PubMed. Surprisingly, government funding was not identified as a significant predictor of database longevity. A multivariate logistic regression accounting for all potential covariates demonstrated sufficient accuracy of predicting database accessibility with C-statistic of 78\%. Our results are congruent with previous report which demonstrated that in the course of 18 years out of 326 biological databases only 16.3\% remained alive and 7\% were rebranded with the remaining databases not being accessible [7]. This article positioned weaker financial support as one of the primary factors affecting database longevity which was not fully supported by our initial analysis. However, we were in an agreement that databases originating from institutions with stronger academic environments or whose core mission was aligned with supporting that database development and maintenance were more likely to have prolonged longevity. As in our work, citation count was shown to be a significant predictor of biological database longevity in a recent analysis of 1.727 biological databases [8].
Recently introduced guidelines for improved data availability and reusability entitled FAIR [9] combined with a comprehensive set of approaches to enhance reliability and trustworthiness of shared research data entitled TRUST [10] are likely to facilitate meaningful data sharing and storage [11]. Intelligent integrative informatics approaches [12] utilizing cross-linked biomedical ontologies [13], common data models [14], and core outcomes sets [15] will promote data harmonization and longevity of evolving biomedical databases [16]. Application of appropriate data exchange standards with domain-relevant content standards combined with accessible rich metadata based on applicable terminologies will catalyze effective and sustainable data sharing in the future [17]. Our pilot study was restricted to databases published in a single peer-reviewed journal and included analysis of a limited number of database characteristics. Temporal trends, compliance with FAIR and TRUST policies, database size, functionality and subject area were not addressed in this initial analysis. Nevertheless, we were able to identify significant determinants of biomedical database longevity and build sufficiently accurate predictive model.",
year = "2022",
month = jun,
day = "6",
doi = "10.3233/SHTI220047",
language = "English",
series = "Studies in Health Technology and Informatics",
publisher = "IOS Press BV",
pages = "135--139",
editor = "Otero, Paula and Scott, Philip and Martin, Susan Z. and Huesing, Elaine",
booktitle = "MEDINFO 2021",
address = "Netherlands",
}