<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e78764</article-id><article-id pub-id-type="doi">10.2196/78764</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Integrating Confidence, Difficulty, and Language Model Calibration for Better Explainability in Clinical Documents Coding: Applications of AI</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Popescu</surname><given-names>Mihai Horia</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Roitero</surname><given-names>Kevin</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Della Mea</surname><given-names>Vincenzo</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" 
rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Mathematics, Computer Science and Physics (DMIF), University of Udine</institution><addr-line>via Delle Scienze, 206</addr-line><addr-line>Udine</addr-line><country>Italy</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Rasool</surname><given-names>Abdur</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chen</surname><given-names>Chia-Mei</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mihai Horia Popescu, PhD, Department of Mathematics, Computer Science and Physics (DMIF), University of Udine, via Delle Scienze, 206, Udine, 33100, Italy, 39 0432 558400; <email>mihaihoria.popescu@uniud.it</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>all authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>22</day><month>4</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e78764</elocation-id><history><date date-type="received"><day>09</day><month>06</month><year>2025</year></date><date date-type="rev-recd"><day>29</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>29</day><month>01</month><year>2026</year></date></history><copyright-statement>&#x00A9; Mihai Horia Popescu, Kevin Roitero, Vincenzo Della Mea. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 22.4.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e78764"/><abstract><sec><title>Background</title><p>In recent years, there has been increasing interest in developing machine and deep learning models capable of annotating clinical documents with semantically relevant labels. However, the complex nature of these models often leads to significant challenges regarding interpretability and transparency.</p></sec><sec><title>Objective</title><p>This study aims to improve the interpretability of transformer models and evaluate the explainability of a deep learning&#x2013;based annotation of coded clinical documents derived from death certificates. Specifically, the focus is on interpreting and explaining model behavior and predictions by leveraging calibrated confidence, saliency maps, and measures of instance difficulty applied to textualized representations coded using the International Statistical Classification of Diseases and Related Health Problems (ICD). 
In particular, the instance difficulty approach has previously proven effective in interpreting image-based models.</p></sec><sec sec-type="methods"><title>Methods</title><p>We used disease language bidirectional encoder representations from transformers, a domain-specific bidirectional encoder representations from transformers model pretrained on ICD classification-related data, to analyze reverse-coded representations of death certificates from the US National Center for Health Statistics, covering the years 2014 to 2017 and comprising 12,919,268 records. The model inputs consist of textualized representations of ICD-coded fields derived from death certificates, obtained by mapping codes to the corresponding ICD concept titles. For this study, we extracted a subset of 400,000 certificates for training, 100,000 for testing, and 10,000 for validation. We assessed the model&#x2019;s calibration and applied a temperature scaling post-hoc calibration method to improve the reliability of its confidence scores. Additionally, we introduced mechanisms to rank instances by difficulty using Variance of Gradients scores, which also facilitate the detection of out-of-distribution cases. Saliency maps were also used to enhance interpretability by highlighting which tokens in the input text most influenced the model&#x2019;s predictions.</p></sec><sec sec-type="results"><title>Results</title><p>Experimental results on a pre&#x2013;fine-tuned model for predicting the underlying cause of death from reverse-coded death certificate representations, which already achieves high accuracy (0.990), show good out-of-the-box calibration with respect to expected calibration error (1.40), though less so for maximum calibration error (30.91). Temperature scaling further reduces expected calibration error (1.13) while significantly increasing maximum calibration error (42.17). 
We report detailed Variance of Gradients analyses at the ICD category and chapter levels, including distributions of target and input categories, and provide word-level attributions using Integrated Gradients for both correctly classified and failure cases.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study demonstrates that enhancing interpretability and explainability in deep learning models can improve their practical utility in clinical document annotation. By addressing reliability and transparency, the proposed approaches support more informed and trustworthy application of machine learning in mission-critical medical settings. The results also highlight the ongoing need to address data limitations and ensure robust performance, especially for rare or complex cases.</p></sec></abstract><kwd-group><kwd>deep learning</kwd><kwd>cause of death prediction</kwd><kwd>model confidence</kwd><kwd>instance difficulty</kwd><kwd>semantic</kwd><kwd>machine learning</kwd><kwd>saliency maps</kwd><kwd>prediction</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Over the past decade, machine learning (ML) models have been increasingly gaining trust from stakeholders due to their better and better performance. This trend is even more remarkable due to the recent advances in deep learning, which have dramatically improved neural network (NN) accuracy, gaining interest over traditional techniques. As a result, NNs have seen wide adoption in a range of applications, such as health care, object detection, speech recognition, and finance; however, they are still frequently criticized for being black boxes. To gain trust from both researchers and end users in these settings, it is beneficial to develop interpretable models. 
Sensitive domains and real-world decision-making systems require not only accuracy on familiar data distributions but also mechanisms to ensure reliability, highlight potential errors [<xref ref-type="bibr" rid="ref1">1</xref>], and identify uncertainty in out-of-distribution (OOD) scenarios [<xref ref-type="bibr" rid="ref2">2</xref>]. For example, in digital health care, the system should recognize and specify when the model suggestion is poorly confident, so the control can pass to a human doctor [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>], since for clinical scenarios, one wishes to avoid failure at all costs. Alternatively, vision models are increasingly used in safety-critical applications such as autonomous driving [<xref ref-type="bibr" rid="ref5">5</xref>], where the detection network needs to predict the presence or absence of immediate obstructions, and the car, based on the confidence of the prediction, should decide if it should rely more on the output of other sensors or the current prediction for braking. As a result, a network should provide calibrated confidence where the probability associated with the prediction should reflect its ground truth correctness likelihood. At the same time, calibration alone does not guarantee overall reliability, as a model may be well-calibrated, yet still inaccurate or biased. Nonetheless, achieving good calibration remains an essential property of trustworthy systems, as it enables downstream decision modules to interpret predictions in a consistent and meaningful way.</p><p>Calibrated confidence is a desired feature of NNs, since good confidence estimates can be used to accomplish model interpretability. Good probability estimates provide beneficial extra information to establish trustworthiness with the user [<xref ref-type="bibr" rid="ref3">3</xref>], since humans have a natural cognitive intuition for probabilities [<xref ref-type="bibr" rid="ref6">6</xref>]. 
In addition, NNs can be incorporated into other probabilistic models by using good probability estimates [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Calibrated models have also been shown to be useful for detecting OOD data [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>] and to improve model fairness [<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>While the main focus of these studies has been on improving the predictive accuracy of models, not much has been done on the interpretability of models, with less attention on its calibration. Initially, the models were not as accurate as today, but generally, NNs were producing well-calibrated probabilities on binary classification tasks [<xref ref-type="bibr" rid="ref12">12</xref>]. A few years ago, many instances of miscalibration in modern NNs were reported [<xref ref-type="bibr" rid="ref1">1</xref>], with a trend that suggests newer, larger, and more accurate models may produce poorly calibrated predictions. Recently, Minderer et al [<xref ref-type="bibr" rid="ref4">4</xref>] revisited this question with the recent state-of-the-art image classification models and suggested that recent models such as the nonconvolutional multilayer perceptrons mixer [<xref ref-type="bibr" rid="ref13">13</xref>] and vision transformers [<xref ref-type="bibr" rid="ref14">14</xref>] are among the best calibrated, notably those not using convolution.</p><p>Presenting a subset of data points that a model considers to be relatively more challenging to learn can help in reasoning about model behavior. For this task, a ranking mechanism can help achieve interpretability by ranking instances by their difficulty. 
A mechanism for such a purpose is the Variance of Gradients (VoG) [<xref ref-type="bibr" rid="ref15">15</xref>], which ranks data by difficulty and helps to identify the most challenging subset of instances.</p><p>Ranking mechanisms are valuable for identifying difficult instances and are also applied in detection domain analyses, including anomaly detection, OOD detection, and adversarial detection [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. In particular, on classification tasks, OOD detection is very important, since it determines whether an input is in-distribution (ID) or OOD. It is unrealistic to assume that all inputs encountered in real-world settings are ID; therefore, training a model that performs perfectly on all possible inputs is inherently challenging, particularly given that real-world datasets are limited in scope. However, OOD uncertainty estimation is very challenging on modern deep learning algorithms, since it can easily produce overconfident predictions also on OOD inputs [<xref ref-type="bibr" rid="ref2">2</xref>]. This phenomenon makes the distinction between ID and OOD data even more challenging when the dataset is not well-balanced. While there are different techniques to handle OOD detection, some authors focus on deriving OOD uncertainty measurements from the activation space of the NN by using model output [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>] or feature representations [<xref ref-type="bibr" rid="ref22">22</xref>]. Others instead make use of the gradient space [<xref ref-type="bibr" rid="ref2">2</xref>]; in particular, VoG was shown to be a reliable score as an OOD detection technique [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>Saliency maps based on gradients represent a powerful tool for enhancing text interpretability in NNs. 
They fall under the broader category of attribution algorithms, which aim to understand how various features, neurons, and layers contribute to a model&#x2019;s output. Saliency maps specifically focus on highlighting the parts of the input data&#x2014;such as tokens in transformer-based techniques&#x2014;that have the most significant impact on the model&#x2019;s decision. In practice, saliency maps are generated by computing the gradient of the model&#x2019;s output with respect to the input text. This process reveals how small changes in the input text would affect the output. Consequently, words or phrases that cause significant changes are considered more important for the model&#x2019;s predictions. This method aligns with human cognitive processes, as it provides a direct way to visualize which parts of the text are influencing the model&#x2019;s decisions [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>In this paper, we experiment with the proposed techniques in a medical classification task, namely, the identification of the so-called underlying cause of death (UCOD) from death certificates. The UCOD was defined by the World Health Organization (WHO) as &#x201C;I (a) the disease or injury which initiated the train of morbid events leading directly to death; or (b) the circumstances of the accident or violence which produced the fatal injury,&#x201D; together with the rules for its selection [<xref ref-type="bibr" rid="ref25">25</xref>]. In principle, this important condition is part of the stated information in the death certificate; yet, it is not always easy to identify. As of today, few rule-based systems are used for UCOD selection. ML has been shown to be very effective; however, such models are often designed as black boxes. 
Therefore, without explainability of the results, the experts are not willing to adopt such models.</p><p>In this work, we build upon a previously developed deep learning model that achieved state-of-the-art performance in UCOD identification [<xref ref-type="bibr" rid="ref26">26</xref>]. While the model demonstrates strong predictive accuracy, its reliability and interpretability have not yet been systematically examined, particularly with respect to calibration and the explainability of its predictions.</p></sec><sec id="s1-2"><title>Related Work</title><sec id="s1-2-1"><title>Model Calibration</title><p>While ML models are not always well calibrated, many techniques have been proposed to train NNs for calibration or post-hoc calibrate a model without losing the focus of improving the predictive accuracy. According to Karandikar et al [<xref ref-type="bibr" rid="ref27">27</xref>], the existing calibration approaches can be categorized into the following 3 categories. The first category explicitly rewards calibration by augmenting or replacing the primary training loss such as accuracy versus uncertainty calibration loss [<xref ref-type="bibr" rid="ref28">28</xref>], maximum mean calibration error loss [<xref ref-type="bibr" rid="ref29">29</xref>], and focal loss [<xref ref-type="bibr" rid="ref30">30</xref>]. Recently, Hui and Belkin [<xref ref-type="bibr" rid="ref31">31</xref>] have shown that using the mean square error loss would further improve the performance, and no extra loss rescaling parameters would be necessary, as was required for the cross-entropy loss. However, Karandikar et al [<xref ref-type="bibr" rid="ref27">27</xref>] suggest that, when applied across multiple primary losses, their methods outperform all these calibration-incentivizing training objectives.</p><p>A second category of methods examines model changes, such as deep ensembles, for predictive uncertainty estimation. 
In Lakshminarayanan et al [<xref ref-type="bibr" rid="ref20">20</xref>], it is shown that this approach is a strong baseline on evaluation metrics and is a simple and effective method for ensembling. This approach trains multiple copies of a network and aggregates the individual models to form a mixture distribution on which the predictions are made. Wen et al [<xref ref-type="bibr" rid="ref32">32</xref>] proposed batch ensemble, a computationally efficient ensembling method that maintains competitive accuracy and uncertainty estimates while significantly reducing training and inference costs and scaling effectively to lifelong learning scenarios. Recent work of Dusenberry et al [<xref ref-type="bibr" rid="ref33">33</xref>] has shown a prior distribution as a simple strategy for aggregating multimodal weight solutions, similar to deep ensembles.</p><p>A third category consists of post-hoc calibrating a model by rescaling the model predictions after training. The most popular technique for this category is temperature scaling [<xref ref-type="bibr" rid="ref1">1</xref>], which maximizes a single temperature parameter on held-out negative log likelihood. Regarding calibration of pretrained transformers in natural language processing, Desai and Durrett [<xref ref-type="bibr" rid="ref34">34</xref>] analyzed the calibration on 2 models, bidirectional encoder representations from transformers (BERT) [<xref ref-type="bibr" rid="ref35">35</xref>] and robustly optimized bidirectional encoder representations from transformers approach (ROBERTa) [<xref ref-type="bibr" rid="ref36">36</xref>], across 3 tasks. When used out of the box, the authors show that the pretrained models are well calibrated in-domain, but out-of-domain calibration can be as much as 3.5 times lower. The authors suggest that temperature scaling post-hoc calibration can further reduce calibration error in-domain, and using label smoothing helps calibrate posteriors out-of-domain. 
Similar to previous work [<xref ref-type="bibr" rid="ref34">34</xref>], we focus on post-hoc calibration of transformers by studying the performance of an in-domain model trained by the method disease language bidirectional encoder representations from transformers (DiLBERT) [<xref ref-type="bibr" rid="ref37">37</xref>], checking improvements by using temperature scaling.</p><p>In addressing model calibration, we specifically opt for post-hoc calibration methods, since we prioritize the preservation of the original training procedure and model integrity. This approach has been chosen to focus on transparent and nonintrusive adjustments, thus avoiding direct optimization methods that could potentially compromise model effectiveness on the task.</p><p>It is important to note that our methodology is reliant on the use of encoder-based models. This is due to the inherent compatibility of encoder-based models with the state-of-the-art calibration techniques, which rely on logit confidence scores. Consequently, our study does not incorporate models like LLaMA [<xref ref-type="bibr" rid="ref38">38</xref>] or Mistral [<xref ref-type="bibr" rid="ref39">39</xref>], as these models are causal autoregressive in nature, which does not align with our requirement for accessible and adjustable logit confidence scores. Additionally, we do not use ChatGPT as part of the experimental or modeling pipeline because of its proprietary nature and the challenges associated with conducting transparent and replicable research with such closed systems.</p></sec><sec id="s1-2-2"><title>Estimating Instance Difficulty</title><p>Presenting a subset of data points that a model considers to be relatively more challenging to learn can help in reasoning about model behavior. Techniques aimed at identifying such behavior can be categorized based on their objectives. 
Case-based reasoning not only aids interpretability [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>] but also facilitates the identification of atypical examples for human auditing [<xref ref-type="bibr" rid="ref42">42</xref>] and enables the model to refrain from classifying uncertain instances [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. Human auditing can be challenging for large datasets; therefore, prior work has focused on methods that automatically identify a subset of more challenging examples to prioritize limited human annotation and auditing resources. For such a task, an effective tool is saliency maps [<xref ref-type="bibr" rid="ref45">45</xref>]; additionally, several recent studies have focused on the problem of estimating example difficulty by using a ranking mechanism such as VoG [<xref ref-type="bibr" rid="ref15">15</xref>]. From their study, the authors suggest that the images that appear more difficult also have a higher VoG score. For each image, the algorithm is calculating the gradient of the activations with respect to the pixels over multiple checkpoints during the training process. The VoG score is calculated as the average (over pixels) of the per-pixel variance across these checkpoints. Using gradient information, Kokilepersaud et al [<xref ref-type="bibr" rid="ref46">46</xref>] have proposed Gradient Constraint, where they have been using gradient measures as a method to detect anomalies by assigning pseudoseverity labels to a large set of unlabeled optical coherence tomography scans.</p><p>Another approach to estimating example difficulty relies on prediction depth [<xref ref-type="bibr" rid="ref47">47</xref>], which provides an alternative perspective compared with gradient-based methods. 
The prediction depth represents the first hidden layers, after which the k-nearest neighbor classifier can effectively classify an example using the representation of the image in all subsequent layers. The authors show that the prediction depth is larger for examples that visually appear to be more difficult. Their investigations also reveal that the predictions are on average more accurate for validation points with small prediction depths.</p><p>In addition to methodological differences, we can categorize the approaches into 2 groups based on when they score training instances: those that use the final trained model and those that assess instances early in training. Specifically, the methods developed by Agarwal et al [<xref ref-type="bibr" rid="ref15">15</xref>] and Baldock et al [<xref ref-type="bibr" rid="ref47">47</xref>] fall into the first category, as they use the final trained network. In contrast, the methods by Paul et al [<xref ref-type="bibr" rid="ref48">48</xref>] belong to the second category, as they highlight the presence of a strong signal for estimating example difficulty very early in the training process. To this end, they propose scoring the importance of each training example based on its expected loss gradient norm (Gradient Normed score). Furthermore, the authors suggest that, within the first few epochs of training, the Gradient Normed score can be effectively approximated by the norm of the error vector (Error L2-Norm score). They conclude by demonstrating that significant fractions of the training data can be pruned without sacrificing test accuracy, as evidenced by their experiments across a variety of architectures and datasets.</p></sec><sec id="s1-2-3"><title>Gradients Techniques Used to Identify OOD Samples</title><p>Gradients can be used alternatively for more specific tasks, such as detecting adversarial, anomalous, and OOD samples. 
Igoe et al [<xref ref-type="bibr" rid="ref49">49</xref>] and Kwon et al [<xref ref-type="bibr" rid="ref50">50</xref>] proposed the use of back-propagated gradients from NNs to obtain the model-based characterization of abnormality. The authors afterward refine the previous idea, proposing an anomaly detection algorithm using the Gradient Constraint [<xref ref-type="bibr" rid="ref16">16</xref>]. In their experiment, the authors measure the cosine similarity between past normal gradients and the current input. Shifting focus to research themes associated with anomaly detection, we find its applications in OOD. Unlike OOD detection, which presupposes multiclass data with identifiable labels, in anomaly detection, we can distinguish between monolithic sets of normal and anomalous data. Considering OOD detection, the authors of [<xref ref-type="bibr" rid="ref51">51</xref>] make use of the Mahalanobis distance of the gradient to detect OOD samples. Instead, Lee et al [<xref ref-type="bibr" rid="ref17">17</xref>] propose to use back-propagated gradients to characterize anomalies in inputs seen during inference from the perspective of the model. In Grad Norm [<xref ref-type="bibr" rid="ref2">2</xref>], the authors propose a gradient-based OOD uncertainty estimation method, which is label-agnostic and where no outlier data are required. For the adversarial detection problem, in raw gradient anomaly detection, the authors analyze the temporal distribution of the entire raw gradient by their end-to-end deep learning&#x2013;based architecture. In detecting adversarial attacks by analyzing gradients, Schulze et al [<xref ref-type="bibr" rid="ref18">18</xref>] analyze the raw gradient of the last two layers of classifiers.</p></sec><sec id="s1-2-4"><title>Saliency Maps</title><p>In Malkiel et al [<xref ref-type="bibr" rid="ref24">24</xref>], a novel unsupervised method is presented to explain paragraph similarities using pretrained BERT models. 
The technique identifies and matches keywords across paragraph pairs, highlighting the crucial pairs that best explain their similarity. To evaluate the reliability of saliency maps for text analysis, Kokhlikyan et al [<xref ref-type="bibr" rid="ref52">52</xref>] compare their application with image-based models. They discover that input multipliers maintain text structural patterns across different models, leading to uniform explanations. The study also highlights that smoother NN components, such as SoftPlus instead of rectified linear units, significantly enhance the accuracy and reliability of saliency-based interpretations in text. Borgnia et al [<xref ref-type="bibr" rid="ref53">53</xref>] propose a novel saliency approach by analyzing network parameters instead of inputs to understand NN errors. The findings reveal that problematic parameters cause semantically similar misclassifications, and pruning or fine-tuning these parameters can rectify similar errors across different samples. Additionally, the authors developed a technique for linking image features to parameter malfunctions, enhancing model interpretability. The effectiveness of this method is confirmed through extensive validation, offering a fresh perspective on NN diagnostics.</p><p>Captum Insights is an open-source interpretability library designed for PyTorch [<xref ref-type="bibr" rid="ref23">23</xref>], encapsulating a variety of gradient and perturbation-based attribution methods. This tool, suitable for a range of models beyond mere classification tasks, supports multiple data types, including images, text, and more, ensuring wide applicability. Highlighting its multimodal and extendable nature, the library aims to simplify and enhance the interpretability process. 
The latter is part of the interpretability techniques applied in our study.</p></sec><sec id="s1-2-5"><title>UCOD Selection</title><p>The UCOD is the most important condition used for statistical comparison and public health data. To be able to correctly identify the UCOD in a death certificate, the certificate needs to undergo 2 different processes: first, the textual conditions need to be coded according to a standard defined by the WHO, using the International Statistical Classification of Diseases and Related Health Problems (ICD). The second process is the selection of the UCOD, according to specific rules defined by WHO [<xref ref-type="bibr" rid="ref25">25</xref>]. The selection of the UCOD is a laborious process, and there is a need for automatic or semiautomatic systems to limit errors. Therefore, there is much interest in developing support systems for such a purpose. We can identify 2 methodologies for such systems: rule-based systems and ML systems. Currently, there are 3 published rule-based systems that implement the WHO instructions to select the UCOD; other proprietary and unpublished systems might exist. The most used system is Iris [<xref ref-type="bibr" rid="ref54">54</xref>], which currently supports ICD 10th revision (ICD-10) and is being updated to support ICD 11th revision (ICD-11). Automated Classification of Medical Entities [<xref ref-type="bibr" rid="ref55">55</xref>], the oldest one, is a system developed in the United States, which can also identify the UCOD for ICD-10. The last entrant is Digital Open Rule Integrated Cause of Death Selection [<xref ref-type="bibr" rid="ref56">56</xref>], which is a new rule-based system designed and made available by WHO to identify the UCOD for ICD-11. 
Significant past work was devoted to the application of ML techniques for this specific task, reaching the state of the art over the traditional rule-based systems [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref57">57</xref>]. However, despite the accuracy achievements, such systems are still not officially acknowledged by mortality coding institutions and are not yet used in practice. This is due to the necessarily conservative approach that drives the collection of data for statistical purposes. In particular, using deep learning techniques, Falissard et al [<xref ref-type="bibr" rid="ref57">57</xref>] developed a modified Inception network, obtaining an accuracy score of 0.978 on a French death certificate dataset. Pita Ferreira et al [<xref ref-type="bibr" rid="ref58">58</xref>] conducted a sensitivity analysis of AUTOCOD, a deep learning model used to classify the UCOD from free-text certificates in Portugal, reporting an overall <italic>F</italic><sub>1</sub>-score of 0.88 at the category level and 0.94 at the block level. In our own previous work [<xref ref-type="bibr" rid="ref26">26</xref>], we have shown that by fine-tuning a transformer model, we can reach the state of the art by obtaining an accuracy score of 0.990. By comparison, reported Iris accuracy is 0.74&#x2010;0.78 from evaluations in the Netherlands [<xref ref-type="bibr" rid="ref59">59</xref>] and France [<xref ref-type="bibr" rid="ref57">57</xref>]. Falissard et al [<xref ref-type="bibr" rid="ref57">57</xref>] also reported a score of 0.92 when considering only nonrejected certificates. Similar results were obtained in the preliminary validation of Digital Open Rule Integrated Cause of Death Selection, with an accuracy of 0.78 for ICD-10, while it achieved an accuracy of 0.63 on ICD-11 [<xref ref-type="bibr" rid="ref56">56</xref>]. 
Our analysis in this paper centers on the model proposed by Della Mea et al [<xref ref-type="bibr" rid="ref26">26</xref>].</p></sec></sec><sec id="s1-3"><title>Objectives</title><p>This study aimed to evaluate the interpretability and reliability of a deep learning model for UCOD identification. Specifically, we assess whether the model is well calibrated and whether its calibration can be improved using temperature scaling post-hoc calibration. In addition, we investigate a mechanism to rank instances based on difficulty using VoG [<xref ref-type="bibr" rid="ref15">15</xref>] and explore its applicability for OOD detection in textual representations.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Definitions</title><p>This paper addresses 2 main problems that improve the interpretability of a transformer model. The first study aims to evaluate and improve the error calibration of the model by using post-hoc temperature scaling calibration, while the second study focuses on implementing a ranking mechanism that uses VoG to estimate the difficulty of predicting instances.</p><sec id="s2-1-1"><title>Disease Language Bidirectional Encoder Representations From Transformers</title><p>The language model used in this study is DiLBERT [<xref ref-type="bibr" rid="ref37">37</xref>], which is a domain-specific BERT model that has been pretrained on English disease-related corpora containing ICD-11 classification entities, Wikipedia and PubMed documents. The DiLBERT model&#x2019;s architecture mirrors that of the original BERT base model. It features 12 BERT encoders stacked together, each with 12 attention heads. The hidden size of the model (and the embedding dimension) is 768. Additionally, it can handle input sequences up to 512 tokens in length. 
The pretraining was performed using a masked language modeling task for 50 epochs on a 3.3 billion words corpus, with a batch size of 256.</p></sec><sec id="s2-1-2"><title>Posterior Calibration</title><p>A model is said to be well calibrated when the estimated confidence of the predictions is aligned with empirical likelihoods. For example, given 100 predictions, each receives a posterior probability of 0.9, it is expected that 90 predictions will be correctly classified. Formally, we consider a multiclass classification problem in which an input <inline-formula><mml:math id="ieqn1"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>x</mml:mi><mml:mo>&#x2008;</mml:mo><mml:mo>&#x2208;</mml:mo><mml:mo>&#x2008;</mml:mo><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> is observed, and a categorical output <inline-formula><mml:math id="ieqn2"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>y</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mi>Y</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="italic">k</mml:mi></mml:mrow></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> is predicted, which are random variables that follow a ground truth joint distribution <inline-formula><mml:math id="ieqn3"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mrow><mml:mi>&#x03C0;</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="italic">x</mml:mi></mml:mrow></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mrow><mml:mi 
mathvariant="italic">y</mml:mi></mml:mrow></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mi>&#x03C0;</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="italic">y</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mo>&#x2008;</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="italic">x</mml:mi></mml:mrow></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x03C0;</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="italic">x</mml:mi></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula>. Let <italic>f</italic> be an NN model, the predictor <italic>f</italic> is modeled as a function that maps every <inline-formula><mml:math id="ieqn4"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mi>x</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mi>X</mml:mi></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> to a categorical distribution over <italic>k</italic> labels, where <inline-formula><mml:math id="ieqn5"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mrow><mml:mi mathvariant="normal">f</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="italic">x</mml:mi></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:mover><mml:mrow><mml:mi mathvariant="normal">y</mml:mi></mml:mrow><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> is a class prediction, and <inline-formula><mml:math id="ieqn6"><mml:mstyle><mml:mrow><mml:mstyle 
displaystyle="false"><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> is the associated confidence, that is, the probability of correctness. Since the objective is to obtain a confidence estimate <inline-formula><mml:math id="ieqn7"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> to be calibrated, then intuitively <inline-formula><mml:math id="ieqn8"><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula> should represent a true probability. Perfect calibration can be defined as</p><disp-formula> <label>(1)</label><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>P</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mo>=</mml:mo><mml:mi>y</mml:mi><mml:mo>&#x2223;</mml:mo><mml:mrow><mml:mover><mml:mi>p</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mo>=</mml:mo><mml:mi>p</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>p</mml:mi><mml:mo>,</mml:mo><mml:mtext>&#x00A0;</mml:mtext><mml:mi mathvariant="normal">&#x2200;</mml:mi><mml:mtext>&#x00A0;</mml:mtext><mml:mi>p</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy="false">]</mml:mo></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where the probability is over the joint distribution. 
While 1 represents the ideal settings, in practice, achieving perfect calibration is impossible, since <inline-formula><mml:math id="ieqn9"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mrow><mml:mover><mml:mi>P</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> is a continuous random variable, and the probability in the equation cannot be computed using finitely many samples. This motivates the need for empirical approximations; following previous work [<xref ref-type="bibr" rid="ref1">1</xref>], one can group predictions into <italic>M</italic> interval bins, where the bins are equally sized (each of size 1/<italic>M</italic>), then calculate the accuracy per bin. Let <italic>B<sub>m</sub></italic> be the set of samples whose prediction confidence belongs to the interval <inline-formula><mml:math id="ieqn10"><mml:msub><mml:mrow><mml:mi>I</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfenced close="]" separators="|"><mml:mrow><mml:mfrac><mml:mrow><mml:mi>m</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mfrac><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:mfenced></mml:math></inline-formula> [<xref ref-type="bibr" rid="ref1">1</xref>].</p></sec><sec id="s2-1-3"><title>Expected Calibration Error</title><p>Previous work in measuring calibration suggests that a common metric often used in practice is expected calibration error (ECE) [<xref ref-type="bibr" rid="ref60">60</xref>], which is a convenient scalar summary statistic of calibration. ECE makes use of equally spaced bins and is computed as the weighted average of the difference between each bin&#x2019;s accuracy and confidence. 
Formally, the accuracy of <italic>B<sub>m</sub></italic> can be defined as</p><disp-formula><label>(2)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mi mathvariant="normal">a</mml:mi><mml:mi mathvariant="normal">c</mml:mi><mml:mi mathvariant="normal">c</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mi>m</mml:mi></mml:msub><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mn>1</mml:mn><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <inline-formula><mml:math id="ieqn11"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mrow><mml:mover><mml:mi>y</mml:mi><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> is the predicted class label for the sample <italic>i</italic>, and <italic>y</italic> is the true class label for the same sample. 
The average confidence of <italic>B</italic><sub><italic>m</italic></sub> bin can be defined as</p><disp-formula id="E3"> <label>(3)</label><mml:math id="eqn3"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>conf</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mover><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mover><mml:mrow/><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow></mml:mover><mml:mo>,</mml:mo></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <inline-formula><mml:math id="ieqn12"><mml:mstyle><mml:mrow><mml:mstyle displaystyle="false"><mml:mrow><mml:mover><mml:msub><mml:mi>p</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">^</mml:mo></mml:mover></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></inline-formula> is the confidence of the sample <italic>i</italic>. 
Then, perfect calibration can be accomplished when <inline-formula><mml:math id="ieqn13"><mml:mo>&#x2200;</mml:mo><mml:mi>m</mml:mi><mml:mi> </mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo>{</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>M</mml:mi><mml:mo>}</mml:mo></mml:math></inline-formula>, then <inline-formula><mml:math id="ieqn14"><mml:mtext>acc</mml:mtext><mml:mfenced separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:mtext>conf</mml:mtext><mml:mfenced separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfenced></mml:math></inline-formula> (ie, when accuracy and confidence of the model align perfectly across all intervals considered). Based on the accuracy and confidence formulations, ECE can be defined as</p><disp-formula id="E4"><label>(4)</label><mml:math id="eqn4"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" 
scriptlevel="0"><mml:mtext>ECE</mml:mtext><mml:mo>=</mml:mo><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>M</mml:mi></mml:mrow></mml:munderover><mml:mfrac><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow><mml:mi>n</mml:mi></mml:mfrac><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:mtext>acc</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mtext>conf</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>n</italic> is the number of samples [<xref ref-type="bibr" rid="ref1">1</xref>].</p></sec><sec id="s2-1-4"><title>Maximum Calibration Error</title><p>As reported by Guo et al [<xref ref-type="bibr" rid="ref1">1</xref>], ECE can be considered as the primary metric for measuring calibration. However, depending on specific requirements, other variant metrics may also be used, each of which emphasizes different aspects of calibration. Maximum calibration error (MCE) is one of those variants; it is based on measuring the worst-case deviation between confidence and accuracy, which can be used to assess reliability in high-risk applications. As in the case of ECE, the computation of MCE relies on an empirical approximation with equally spaced bins. 
The MCE can be defined as</p><disp-formula id="equWL2"><label>(5)</label><mml:math id="eqn5"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>MCE</mml:mtext><mml:mo>=</mml:mo><mml:munder><mml:mo movablelimits="true" form="prefix">max</mml:mo><mml:mrow><mml:mi>m</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>M</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:munder><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:mtext>acc</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mtext>conf</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Perfect calibration can be defined as for ECE when <inline-formula><mml:math id="ieqn15"><mml:mo>&#x2200;</mml:mo><mml:mi>m</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mo>{</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>M</mml:mi><mml:mo>}</mml:mo><mml:mo>,</mml:mo><mml:mo>&#x2003;</mml:mo><mml:mtext>acc</mml:mtext><mml:mfenced separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfenced><mml:mo>=</mml:mo><mml:mtext>conf</mml:mtext><mml:mfenced separators="|"><mml:mrow><mml:msub><mml:mrow><mml:mi>B</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfenced></mml:math></inline-formula>. 
When the model is perfectly calibrated, the values of MCE and ECE are both equal to 0 [<xref ref-type="bibr" rid="ref1">1</xref>].</p></sec><sec id="s2-1-5"><title>Temperature Scaling</title><p>Temperature scaling [<xref ref-type="bibr" rid="ref1">1</xref>] is a post-hoc calibration method that adjusts the confidence scores produced by a deep learning model without altering its predicted class labels. It involves a single scalar parameter <italic>T</italic> called temperature, which is used to scale the logits of a model before applying the softmax function, which then converts the produced logits into probabilities. The primary goal of temperature scaling is to improve the calibration of the model, making the predicted probabilities more reflective of the true likelihood of correctness. Let <italic>z</italic>(<italic>x</italic>)  represent the vector of logits output by the NN for an input <italic>x</italic>. Temperature scaling modifies these logits as <italic>z</italic>(<italic>x</italic>)/<italic>T</italic>, where <italic>T</italic> &#x003E; 0  is the temperature. The temperature value is typically determined on a validation set by minimizing a calibration metric, such as ECE or MCE.</p></sec><sec id="s2-1-6"><title>Variance of Gradients</title><p>VoG is used to compute a per-instance score that captures a property of great interest, that is, whether a sample is difficult (ie, challenging) for a model to classify. Such information can be leveraged during model deployment, for example, by identifying difficult instances and handling them differently, such as referring them to a human assessor. In this work, we rely on the VoG score [<xref ref-type="bibr" rid="ref15">15</xref>], originally defined for images and conveniently modified to be used in a natural language processing problem. 
In our adaptation, instead of computing gradient variance over image pixels, we compute it over tokens by taking the gradient of the final activation layer with respect to each token&#x2019;s embedding. The variance of these token-level gradients (computed per class) is used as the stratification variable. VoG is computed by considering, for a given instance, the gradient of the final activation layer with respect to each part of the input instance at the token level. The measure is computed separately for each class. In other terms, this measures the contribution of each single instance to the final class prediction [<xref ref-type="bibr" rid="ref45">45</xref>]. Formally,</p><disp-formula id="E6"><label>(6)</label><mml:math id="eqn6"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>S</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:msubsup><mml:mi>A</mml:mi><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>represents the gradient matrix of the last layer <italic>l</italic>  for the class <italic>p</italic> (ie, <inline-formula><mml:math id="ieqn16"><mml:msubsup><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>) for each part of the instance <italic>x</italic><sub><italic>i</italic></sub>  (ie, its tokens), then VoG can be simply computed by considering the average VoG score for each instance. 
To handle class imbalance, we also normalize VoG by considering the average VoG of each class.</p></sec><sec id="s2-1-7"><title>Saliency Maps</title><p>To compute feature attributions for the model&#x2019;s predictions, we used Integrated Gradients (IG). This method attributes importance to each input component (token in our case) by accumulating the gradients of the output with respect to the inputs along a path that interpolates between a baseline input <italic>x</italic><sup>baseline</sup>  and the actual input <italic>x</italic>. The method is grounded in 2 axioms, sensitivity and implementation invariance, which ensure that the resulting attributions are faithful to the behavior of the model.</p><p>Formally, for a function <italic>F</italic>: <italic>R<sup>n</sup></italic>&#x2192;<italic>R</italic>  and an input <italic>x</italic>, the attribution for its <italic>i</italic>-th  dimension is defined as:</p><disp-formula id="E7"><label>(7)</label><mml:math id="eqn7"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:msub><mml:mtext>Integrated Gradients</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mtext>&#x00A0;</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mi>x</mml:mi><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mtext>baseline</mml:mtext></mml:mrow></mml:msubsup></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:msubsup><mml:mo>&#x222B;</mml:mo><mml:mrow><mml:mrow><mml:mi>&#x03B1;</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mfrac><mml:mrow><mml:mi 
mathvariant="normal">&#x2202;</mml:mi><mml:mi>F</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mtext>baseline</mml:mtext></mml:mrow></mml:msup><mml:mo>+</mml:mo><mml:mrow><mml:mi>&#x03B1;</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mtext>baseline</mml:mtext></mml:mrow></mml:msup></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi mathvariant="normal">&#x2202;</mml:mi><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mi>d</mml:mi><mml:mrow><mml:mi>&#x03B1;</mml:mi></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>In practice, this path integral is approximated numerically (eg, via a Riemann sum or Gauss-Legendre), providing a stable estimate of each feature&#x2019;s contribution to the model&#x2019;s prediction. Further details on the theoretical properties of IG can be found in the original work [<xref ref-type="bibr" rid="ref61">61</xref>].</p><p>The IG obtained for each input token serves as the attribution scores, which we then normalize and aggregate at the word level for visualization.</p></sec></sec><sec id="s2-2"><title>Task and Statistical Analysis</title><p>This study uses a dataset consisting of coded death certificates obtained from the US National Center for Health Statistics (NCHS), which are available for statistical and analytical research purposes. The dataset spans across the years 2014&#x2010;2017 and includes 12,919,268 records. The selected data correspond to the pre&#x2013;COVID-19 pandemic period. Data from the pandemic period were deliberately excluded because the pandemic introduced temporary ICD codes and evolving UCOD selection rules, resulting in unstable and noncomparable coding practices. 
Restricting the analysis to the prepandemic period ensures consistency in coding rules and provides a stable basis for the evaluation.</p><p>Each record contains administrative data such as sex, age, and concatenated conditions from both Part 1 and Part 2 of the certificate, as well as the corresponding UCOD. An example of a death certificate is presented in <xref ref-type="table" rid="table1">Table 1</xref>. The original records are already provided in an ICD-10 coded format, as the US National Center for Health Statistics public database does not contain the original free-text narratives written by physicians for privacy and anonymization reasons.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Example of a coded death certificate.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Condition</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Part 1</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">I21.9 Acute myocardial infarction</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">I10 Hypertension</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">N19 Unspecified kidney failure</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top"><italic>&#x2014;</italic><sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top" colspan="2">Part 2</td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">Other</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Administrative data</td><td align="left" valign="top">Sex: female, age: 55 years</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Underlying cause of death</td><td align="left" valign="top">I21.9 Acute myocardial infarction</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>As UCOD selection is generally a two-step process: (1) coding the free text into ICD codes and (2) applying the mortality coding rules to select the UCOD&#x2014;this study focuses exclusively on the second step, as the first step cannot be studied with this dataset.</p><p>To enable the use of textual data, we performed reverse coding from the death certificate to its textual description, converting the administrative data into a textual format and mapping each code to its corresponding ICD title. The resulting encoding is illustrated with 3 examples in <xref ref-type="table" rid="table2">Table 2</xref>. For this study, we extracted a subset of 400,000 certificates for training, 100,000 for testing, and 10,000 for validation purposes from the total number of records in the dataset. This split was selected to ensure comparability with prior work using the same experimental setup, while also reflecting practical computational constraints, as the evaluation involved training and comparing multiple (7) alternative models. 
Extraction has been carried out with stratified sampling to have a subset matching the features of the entire dataset.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Example encoding for the death certificates as a sentence.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Every record of the death certificate as a single sentence</td><td align="left" valign="bottom">UCOD<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Female, 55 years: (Acute myocardial infarction) due to (Hypertension) due to (Unspecified kidney failure)</td><td align="left" valign="top">I21.9</td></tr><tr><td align="left" valign="top">Male, 39 years: (Malignant neoplasm of breast, unspecified or malignant neoplasm, without specification of site)</td><td align="left" valign="top">C50.9</td></tr><tr><td align="left" valign="top">Male, 40 years: (Sepsis, unspecified) due to (Necrotizing fasciitis)</td><td align="left" valign="top">M72.6</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>UCOD: underlying cause of death.</p></fn></table-wrap-foot></table-wrap><p>We have pretrained and fine-tuned DiLBERT [<xref ref-type="bibr" rid="ref37">37</xref>] on the earlier-mentioned dataset. One aspect worth mentioning is that ICD-10 is partitioned into 22 chapters according to the disease site or etiology. In addition to the overall performance of the model, we wanted to analyze the performance by disease category, based on the ICD-10 chapter organization. Thus, the accuracy of the DiLBERT model at the chapter level is presented in <xref ref-type="table" rid="table3">Table 3</xref>, along with the frequency of the target coded condition as a chapter and the frequency of the chapter considering the input conditions coded. 
These statistics consider both training and test records.</p><p>The table highlights some interesting observations. For example, Chapter XXI is not relevant for mortality and does not appear in any of the death certificates. Chapter XXII, on the other hand, is reserved for special purposes, such as emergency situations, and codes are typically added to this chapter before they are assigned to their appropriate classification in subsequent years. Notably, during the COVID-19 pandemic, a special code for the disease was added to Chapter XXII.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Accuracy by ICD-10<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> chapter (table sorted according to ICD-10 chapter field), frequency of each chapter as target and input coded conditions.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ICD-10 chapter</td><td align="left" valign="bottom">Input frequency, n (%)</td><td align="left" valign="bottom">Target frequency, n (%)</td><td align="left" valign="bottom">Accuracy @1</td></tr></thead><tbody><tr><td align="left" valign="top">I Certain infectious and parasitic diseases</td><td align="left" valign="top">12,096 (2.521)</td><td align="left" valign="top">48,384 (3.283)</td><td align="left" valign="top">0.925</td></tr><tr><td align="left" valign="top">II Neoplasms</td><td align="left" valign="top">106,126 (22.118)</td><td align="left" valign="top">148,271 (10.060)</td><td align="left" valign="top">0.986</td></tr><tr><td align="left" valign="top">III Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism</td><td align="left" valign="top">12,624 (0.857)</td><td align="left" valign="top">1,592 (0.332)</td><td align="left" valign="top">0.794</td></tr><tr><td align="left" valign="top">IV Endocrine, nutritional, and metabolic diseases</td><td align="left" valign="top">86,320 (5.857)</td><td align="left" 
valign="top">21,124 (4.402)</td><td align="left" valign="top">0.952</td></tr><tr><td align="left" valign="top">V Mental and behavioral disorders</td><td align="left" valign="top">116,318 (7.892)</td><td align="left" valign="top">23,239 (4.843)</td><td align="left" valign="top">0.976</td></tr><tr><td align="left" valign="top">VI Diseases of the nervous system</td><td align="left" valign="top">57,602 (3.908)</td><td align="left" valign="top">35,094 (7.314)</td><td align="left" valign="top">0.981</td></tr><tr><td align="left" valign="top">VII Diseases of the eye and adnexa</td><td align="left" valign="top">444 (0.030)</td><td align="left" valign="top">5 (0.001)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">VIII Diseases of the ear and mastoid process</td><td align="left" valign="top">131 (0.009)</td><td align="left" valign="top">6 (0.001)</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">IX Diseases of the circulatory system</td><td align="left" valign="top">458,620 (31.116)</td><td align="left" valign="top">148,518 (30.953)</td><td align="left" valign="top">0.978</td></tr><tr><td align="left" valign="top">X Diseases of the respiratory system</td><td align="left" valign="top">172,966 (11.735)</td><td align="left" valign="top">48,674 (10.144)</td><td align="left" valign="top">0.978</td></tr><tr><td align="left" valign="top">XI Diseases of the digestive system</td><td align="left" valign="top">48,211 (3.271)</td><td align="left" valign="top">18,119 (3.776)</td><td align="left" valign="top">0.941</td></tr><tr><td align="left" valign="top">XII Diseases of the skin and subcutaneous tissue</td><td align="left" valign="top">4,257 (0.289)</td><td align="left" valign="top">821 (0.171)</td><td align="left" valign="top">0.892</td></tr><tr><td align="left" valign="top">XIII Diseases of the musculoskeletal system and connective tissue</td><td 
align="left" valign="top">12,300 (0.835)</td><td align="left" valign="top">2,345 (0.489)</td><td align="left" valign="top">0.819</td></tr><tr><td align="left" valign="top">XIV Diseases of the genitourinary system</td><td align="left" valign="top">65,914 (4.472)</td><td align="left" valign="top">12,196 (2.542)</td><td align="left" valign="top">0.960</td></tr><tr><td align="left" valign="top">XV Pregnancy, childbirth, and the puerperium</td><td align="left" valign="top">651 (0.044)</td><td align="left" valign="top">215 (0.045)</td><td align="left" valign="top">0.581</td></tr><tr><td align="left" valign="top">XVI Certain conditions originating in the perinatal period</td><td align="left" valign="top">5,570 (0.378)</td><td align="left" valign="top">1,964 (0.409)</td><td align="left" valign="top">0.805</td></tr><tr><td align="left" valign="top">XVII Congenital malformations, deformations, and chromosomal abnormalities</td><td align="left" valign="top">3,125 (0.212)</td><td align="left" valign="top">1,644 (0.343)</td><td align="left" valign="top">0.708</td></tr><tr><td align="left" valign="top">XVIII Symptoms, signs, and abnormal clinical and laboratory findings, not elsewhere classified</td><td align="left" valign="top">91,947 (6.238)</td><td align="left" valign="top">6,142 (1.280)</td><td align="left" valign="top">0.985</td></tr><tr><td align="left" valign="top">XIX Injury, poisoning, and certain other consequences of external causes</td><td align="left" valign="top">87,478 (5.935)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">XX External causes of morbidity and mortality</td><td align="left" valign="top">52,752 (3.579)</td><td align="left" valign="top">39,906 (8.317)</td><td align="left" valign="top">0.962</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>ICD-10: International Statistical Classification of Diseases and Related Health Problems 10th 
Revision.</p></fn><fn id="table3fn2"><p><sup>b</sup>Not available.</p></fn></table-wrap-foot></table-wrap><p>An analysis of the accuracy of the DiLBERT model at the chapter level reveals that chapters with a target frequency of less than 1% have an accuracy score below 0.9, while chapter VIII, with a frequency of less than 0.01%, has an accuracy score of 0. Chapter VII also has a frequency of less than 0.01% in the training set but has no cases in the test set. Chapter XIX, which covers injury, poisoning, and certain other consequences of external causes, is unique in that it has a relatively high input frequency of 5.9%; yet, it is never selected as the UCOD by the model. This is because such conditions are considered consequences of an external cause, and the external cause itself (ie, Chapter XX) is always the cause that should be selected as the UCOD. Therefore, the frequency of external causes as a target is higher than the input frequency, as these causes of death are more likely to be selected when present in the certificate. Similarly, for neoplasms (ie, Chapter II), the input frequency is higher than the target frequency because when a malignant neoplasm is the UCOD, multiple malignant neoplasms may be stated on the same death certificate.</p><p>While there are codes that cannot be selected as the UCOD and are scattered across different chapters, there are certain conditions that are more likely to be excluded. For example, the entire Chapter XVIII (Symptoms, signs, and abnormal clinical and laboratory findings, not elsewhere classified) is part of the so-called ill-defined conditions. 
These conditions are typically not selected as the UCOD if other conditions that are not ill-defined are present on the certificate, which is why Chapter XVIII has a very small target frequency compared to the associated input frequency.</p><p>While ICD-10 contains more than 14,000 codes, not all of them can be used for mortality coding and the selection of the UCOD (eg, Z codes). It is difficult to estimate the number of codes missing from our dataset since we cannot identify all the codes that can be used for mortality coding. For instance, some codes may not be present because the distribution of diseases varies between regions, and our dataset only includes records from the United States. Furthermore, physicians may not always use the most precise condition on the death certificate, resulting in the use of residual categories (identified by the .9 code suffix in ICD-10) instead of the most precise code, as shown in <xref ref-type="table" rid="table4">Tables 4</xref> and <xref ref-type="table" rid="table5">5</xref>.</p><p>The dataset used in this study contains 2066 different codes stated as the target class and 3999 different codes stated as coded conditions on the death certificates. The top 10 UCODs with their frequency are presented in <xref ref-type="table" rid="table4">Table 4</xref>, covering approximately 36% of the total number of records. Interestingly, as shown in <xref ref-type="table" rid="table5">Table 5</xref>, the top 10 conditions stated on the death certificates differ from the selected UCODs in <xref ref-type="table" rid="table4">Table 4</xref>. 
The most commonly used conditions have a frequency of 31.2% of the total conditions stated.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Top 10 most selected underlying cause of death (UCOD).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">UCOD</td><td align="left" valign="bottom">Title</td><td align="left" valign="bottom">Values, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">I25.1</td><td align="left" valign="top">Atherosclerotic heart disease</td><td align="char" char="." valign="top">26,523 (5.527)</td></tr><tr><td align="left" valign="top">C34.9</td><td align="left" valign="top">Bronchus or lung, unspecified</td><td align="char" char="." valign="top">26,232 (5.466)</td></tr><tr><td align="left" valign="top">J44.9</td><td align="left" valign="top">Chronic obstructive pulmonary disease, unspecified</td><td align="char" char="." valign="top">21,504 (4.481)</td></tr><tr><td align="left" valign="top">I21.9</td><td align="left" valign="top">Acute myocardial infarction, unspecified</td><td align="char" char="." valign="top">20,176 (4.204)</td></tr><tr><td align="left" valign="top">G30.9</td><td align="left" valign="top">Alzheimer disease, unspecified</td><td align="char" char="." valign="top">19,598 (4.084)</td></tr><tr><td align="left" valign="top">F03</td><td align="left" valign="top">Unspecified dementia</td><td align="char" char="." valign="top">18,135 (3.779)</td></tr><tr><td align="left" valign="top">I50.0</td><td align="left" valign="top">Congestive heart failure</td><td align="char" char="." valign="top">11,882 (2.476)</td></tr><tr><td align="left" valign="top">I64</td><td align="left" valign="top">Stroke, not specified as hemorrhage or infarction</td><td align="char" char="." 
valign="top">10,757 (2.241)</td></tr><tr><td align="left" valign="top">I25.0</td><td align="left" valign="top">Atherosclerotic cardiovascular disease, so described</td><td align="char" char="." valign="top">10,406 (2.168)</td></tr><tr><td align="left" valign="top">J18.9</td><td align="left" valign="top">Pneumonia, organism unspecified</td><td align="char" char="." valign="top">7,679 (1.600)</td></tr></tbody></table></table-wrap><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Top 10 most stated conditions on the death certificate.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Condition stated</td><td align="left" valign="bottom">Title</td><td align="left" valign="bottom">Values, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">I46.9</td><td align="left" valign="top">Cardiac arrest, unspecified</td><td align="char" char="." valign="top">62,926 (4.269)</td></tr><tr><td align="left" valign="top">I10</td><td align="left" valign="top">Essential (primary) hypertension</td><td align="char" char="." valign="top">62,666 (4.251)</td></tr><tr><td align="left" valign="top">F17.9</td><td align="left" valign="top">Mental and behavioral disorders due to use of tobacco</td><td align="char" char="." valign="top">54,675 (3.709)</td></tr><tr><td align="left" valign="top">I25.1</td><td align="left" valign="top">Atherosclerotic heart disease</td><td align="char" char="." valign="top">52,057 (3.531)</td></tr><tr><td align="left" valign="top">J44.9</td><td align="left" valign="top">Chronic obstructive pulmonary disease, unspecified</td><td align="char" char="." valign="top">50,101 (3.399)</td></tr><tr><td align="left" valign="top">I50.0</td><td align="left" valign="top">Congestive heart failure</td><td align="char" char="." valign="top">49,042 (3.327)</td></tr><tr><td align="left" valign="top">F03</td><td align="left" valign="top">Unspecified dementia</td><td align="char" char="." 
valign="top">34,343 (2.330)</td></tr><tr><td align="left" valign="top">A41.9</td><td align="left" valign="top">Sepsis, unspecified</td><td align="char" char="." valign="top">33,818 (2.294)</td></tr><tr><td align="left" valign="top">J96.9</td><td align="left" valign="top">Respiratory failure, unspecified</td><td align="char" char="." valign="top">31,180 (2.115)</td></tr><tr><td align="left" valign="top">J18.9</td><td align="left" valign="top">Pneumonia, unspecified</td><td align="char" char="." valign="top">28,525 (1.935)</td></tr></tbody></table></table-wrap></sec><sec id="s2-3"><title>Ethical Considerations</title><p>This study did not involve human participants or any form of human subject research. The analyses were conducted exclusively on publicly available, anonymized death certificate data, which contain no personally identifiable information. No attempt was made to reidentify individuals, and no procedures, interventions, or interactions with human subjects were performed; therefore, none of the ethical review elements listed in the JMIR guidelines apply.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Model Calibration</title><p>The top row of <xref ref-type="fig" rid="figure1">Figure 1</xref> shows the distribution of prediction confidence, while the bottom row shows the so-called reliability diagram, which plots accuracy as a function of confidence. The reliability diagram provides a visual representation of model calibration, with perfect calibration represented by plotting the identity function (ie, <italic>x</italic>=<italic>y</italic>, represented by the dotted line in the figure). Following Guo et al [<xref ref-type="bibr" rid="ref1">1</xref>], confidence scores are grouped into bins, and for each bin, the average predicted confidence is compared with the corresponding empirical accuracy. 
The reliability diagram highlights, in red, the discrepancy between these 2 quantities, providing a visual indication of model miscalibration. Thus, deviations from the perfect diagonal indicate model miscalibration. The figure includes results for the out-of-the-box samples in the left column and samples rescaled with temperature scaling in the right column.</p><p>Overall, the histograms and reliability diagram shown in <xref ref-type="fig" rid="figure1">Figure 1</xref> demonstrate that the out-of-the-box average confidence of the predictions closely matches their accuracy, with confidence slightly higher than accuracy, indicating a small miscalibration. The bottom left panel shows that most of the bins are not well calibrated, as indicated by the high MCE of 30.91. However, the ECE is low, with an error of 1.4, suggesting that the model is well calibrated overall. This is partly due to the last bin, which is both well-calibrated and represents the highest number of samples.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Confidence histograms (top) and reliability diagrams (bottom), for out-of-the-box (left) and with temperature scaling (right). ECE: expected calibration error; MCE: maximum calibration error.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig01.png"/></fig><p>When temperature scaling is applied, the bins with high confidence become well calibrated, but those with low and medium confidence become less calibrated, as shown in the bottom right panel of <xref ref-type="fig" rid="figure1">Figure 1</xref>. The temperature scaling method reduces the ECE to 1.13, while the MCE increases to 42.17. It is important to note that, while there is an observed increase in the MCE metric, this increment is relatively small. 
More significantly, the key aspect of the calibration phase can be seen in the improved calibration scores for the instances in the right-most part of the plot, specifically those having a confidence level greater than or equal to 0.8. This enhanced calibration at higher confidence levels is a crucial factor in developing a reliable model, particularly for the most practically interesting instances, that is, those having high-confidence scores. As reported in Desai and Durrett [<xref ref-type="bibr" rid="ref34">34</xref>], pretrained models are generally well calibrated, although further improvements are possible; in this case, the out-of-the-box ECE was already very low.</p></sec><sec id="s3-2"><title>Confidence Reliability</title><p>A well-calibrated model can use its confidence as a measure of prediction reliability, which is often used to focus on chapter granularity or abstract ranges of conditions in UCOD predictions. <xref ref-type="fig" rid="figure2">Figure 2</xref> shows the error rate as a function of confidence for instances, grouped by chapter. Excluding the chapter with a 100% error rate and a confidence of 1 (VIII), we observe a correlation between confidence and error rate at the chapter level. <xref ref-type="fig" rid="figure3">Figure 3</xref> displays the scaled confidence distribution of the instances by chapter, with data quartiles and averages. 
The plot shows that half of the chapters have a very low IQR, indicating that the model is highly decisive in predicting those chapters, with quartile groups close to a confidence of 1.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Chapter-wise scaled confidence with error rate.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Class-wise scaled confidence percentiles per chapter.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig03.png"/></fig><p>Chapters III, XII, XIII, and XVI have similar, high medians compared to preceding chapters but show a different distribution, higher IQR, and long lower whiskers. Chapter XVI is the tallest box plot compared to all other chapters, with an IQR covering the entire range of confidence. Chapter XV has the lowest median, with a value around 0.3 of confidence and long upper and lower whiskers, indicating that the model&#x2019;s predictions on this chapter are more variable. Except for Chapter VIII, which is used to select UCOD, and Chapters XV and XVI, whose whiskers cover the entire range, all other chapters have predictions over the entire range of confidence as outliers.</p><p><xref ref-type="fig" rid="figure4">Figure 4</xref> shows the distribution of training sample classes in relation to scaled confidence, with color representing class accuracy from red (accuracy of 0) to green (accuracy of 1). This variation of color will be used for multiple figures to show the accuracy. 
Two regions can be identified, divided by a linear function of the training samples with a value around 10.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Class-wise confidence with training sample.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig04.png"/></fig></sec><sec id="s3-3"><title>Instance Difficulty Estimation</title><p><xref ref-type="fig" rid="figure5">Figure 5</xref> displays the mean number of conditions per class in relation to the global normalized VoG score. Each certificate may contain from 1 to 15 conditions. VoG is standardized using the global mean and SD and then clipped to the [&#x2212;1,1] range for visualization; values closer to &#x00B1;1 indicate higher gradient variability. The figure shows a correlation between the number of stated conditions and the difficulty of estimating the instance. When only a single condition is present, we observe that almost every instance score is below the average. On the other side, as the number of reported conditions increases, there is a corresponding increase in the average score. This indicates that instances with a greater number of conditions are generally associated with higher difficulty levels. A similar trend can be seen in <xref ref-type="fig" rid="figure6">Figure 6</xref>, which shows the average number of filled lines in the death certificate per class in relation to the instance difficulty VoG score for the same class, since data are most likely present in the top-left quadrant. As the average lines used per class increase, so does the difficulty. The lines displayed in the figure are simple regression lines. As an additional analysis, we computed and compared the Pearson &#x03C1; and Kendall &#x03C4; correlation coefficients between VoG and the specified conditions. 
The correlations reveal a weak positive linear correlation between the VoG scores and both the mean conditions and mean lines, which show, respectively, Pearson &#x03C1; scores of 0.23 and 0.22 and a Kendall &#x03C4; correlation coefficient of 0.06 and 0.05.</p><p>The global normalized distribution of the VoG score of instances by chapter is visually displayed in <xref ref-type="fig" rid="figure7">Figure 7</xref>. This figure shows the data quartiles and averages, indicating that the difficulty of instances is mostly low for all chapters. Notably, the median for each chapter is lower than its corresponding average for the VoG difficulty score. Additionally, the lower quartile for all chapters is proximal to the lower value score. These observations suggest that estimating the difficulty of instances is challenging for all chapters. Among the chapters, Chapter XVIII has the lowest difficulty scores, as indicated by its upper quartile, which has the lowest value among all chapters. Furthermore, Chapters V, VI, XV, and XVI have an upper quartile lower than the average difficulty score, indicating that the instances in these chapters are less difficult.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>VoG globally normalized per class-wise in relation to the average condition stated in the death certificate per class. VoG: Variance of Gradients.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>VoG globally normalized per class-wise in relation to the average lines filled in the death certificate per class. VoG: Variance of Gradients.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig06.png"/></fig><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>VoG globally normalized percentiles per chapter. 
VoG: Variance of Gradients.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig07.png"/></fig><p>We further investigated the difficulty of the instances at the class level by selecting 9 classes with at least 10 instances in the test set, sorted by error rate. <xref ref-type="fig" rid="figure8">Figure 8</xref> shows the global normalized class-wise VoG score distribution for these classes. We divided these classes into 3 groups: positive classes (K565, Y12, and X94) with mostly positive predictions, mixed classes (M348, E854, and K219) with balanced predictions between positive and negative, and negative classes (Y830, I638, and I612) with mostly negative predictions. Notably, positive classes have an upper quartile lower than the mixed classes, indicating that classes with high accuracy may have fewer difficult instances compared to mixed classes. However, estimating the difficulty of classes with mostly negative predictions is challenging due to the shadowing effect of other classes. For instance, the instances of classes I638 and I612 have very low difficulty scores, contrary to our expectations based on the behavior of class Y830.</p><p>In addition to studying the model&#x2019;s behavior with respect to the predicted class, we also explored its behavior based on the conditions stated on each certificate. <xref ref-type="fig" rid="figure9">Figure 9</xref> displays the distribution of the stated conditions as a function of the globally normalized average VoG score of death certificates containing a given stated condition. A long tail pattern is observed, where less frequently stated conditions are associated with higher average instance difficulty, while conditions with a larger number of instances show more stable difficulty estimates. 
<xref ref-type="fig" rid="figure10">Figure 10</xref>, similar to <xref ref-type="fig" rid="figure4">Figure 4</xref>, shows the distribution of the stated conditions as a function of the scaled confidence. However, the observed behavior differs from that in <xref ref-type="fig" rid="figure4">Figure 4</xref>, which displays the predicted class in relation to the scaled confidence. Notably, when considering predicted classes in the long tail of the distribution, these classes have an average accuracy of 0. In contrast, low-frequency stated conditions do not necessarily influence the prediction outcome, as some conditions may not be involved in the final UCOD decision. In fact, we observed several low-frequency conditions with high confidence that had high accuracy. However, for rare conditions, a clear distinction between the classes that are unlikely to be predicted correctly and those that are well predicted is not evident. The results suggest that the performance of the model varies based on the conditions stated on the certificate and their frequency and role in the decision process.</p><fig position="float" id="figure8"><label>Figure 8.</label><caption><p>VoG globally normalized percentiles for some classes. VoG: Variance of Gradients.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig08.png"/></fig><fig position="float" id="figure9"><label>Figure 9.</label><caption><p>VoG globally normalized per stated condition in relation to the training sample. 
VoG: Variance of Gradients.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig09.png"/></fig><fig position="float" id="figure10"><label>Figure 10.</label><caption><p>Conditions in the training sample in relation to the scaled confidence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig10.png"/></fig><p><xref ref-type="fig" rid="figure11">Figure 11</xref> shows how the global normalized VoG score varies with the minimum occurrence of each stated condition in the training data using discrete frequency ranges. Conditions with 0 occurrences in the training set exhibit the highest difficulty, with an average VoG score of approximately 0.4. When the minimum occurrence increases to 1 or 2 instances, the average VoG score remains elevated (around 0.2). For conditions appearing between 2 and 1000 times, the VoG score decreases but remains positive (&#x2248;0.1). Conditions in the higher frequency ranges (1000&#x2010;5000 and 5000&#x2010;20,000 occurrences) show VoG scores close to 0. A notable change is observed for conditions occurring more than 20,000 times, where the average VoG score becomes negative (&#x2248;&#x2013;0.2), suggesting that these instances are highly redundant and therefore easier. Overall, the results indicate that instance difficulty decreases as condition frequency increases.</p><p><xref ref-type="fig" rid="figure12">Figure 12</xref> presents the same relationship as <xref ref-type="fig" rid="figure11">Figure 11</xref> using a log<sub>2</sub>-transformed axis with locally estimated scatterplot smoothing. The trend shows a gradual decline in difficulty as condition frequency increases, with VoG values decreasing slightly up to log<sub>2</sub>&#x2248;12. Between log<sub>2</sub>&#x2248;12 and 14, the curve stabilizes. 
Beyond log<sub>2</sub>&#x2248;14, the difficulty decreases more sharply, with estimated VoG values dropping from approximately &#x2013;0.30 to &#x2013;0.50 by log<sub>2</sub>&#x2248;16. This pattern indicates a pronounced reduction in difficulty for conditions with very high training frequencies.</p><fig position="float" id="figure11"><label>Figure 11.</label><caption><p>The condition with minimum occurrence in training with the average VoG score for the range. VoG: Variance of Gradients.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig11.png"/></fig><fig position="float" id="figure12"><label>Figure 12.</label><caption><p>Minimum condition occurrence in the training data (log<sub>2</sub>-scaled) versus the corresponding VoG score. A LOESS regression line is used to smooth the trend across the full range of frequencies. LOESS: locally estimated scatterplot smoothing; VoG: Variance of Gradients.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig12.png"/></fig></sec><sec id="s3-4"><title>Saliency Maps Overview</title><p>In <xref ref-type="fig" rid="figure13">Figure 13</xref>, we present an overview of saliency maps computed using Captum. The figure includes 3 positive examples (the first 3 rows) and 3 failure cases (the last 3). For each case, the true and predicted UCOD codes are shown in the &#x201C;True Label (UCOD)&#x201D; and &#x201C;Predicted Label (UCOD)&#x201D; columns, respectively, while the corresponding input text is provided in the &#x201C;Attribution Label&#x201D; column. The &#x201C;Word Importance&#x201D; column displays the attribution assigned to each word, representing the key results from the saliency maps. 
In this visualization, red indicates a negative attribution score, white indicates a neutral score, and green indicates a positive score, with positive values highlighting words that contribute more strongly to the classification decision. The &#x201C;Attribution Score&#x201D; column reports the sum of all word-level attributions, which is generally greater than 0, although in some cases, the total may be negative. Additionally, the titles corresponding to the true and predicted labels are shown in the &#x201C;True UCOD (title)&#x201D; and &#x201C;Predicted UCOD (title)&#x201D; columns.</p><fig position="float" id="figure13"><label>Figure 13.</label><caption><p>Word importance with attribution scores computed using Layer Integrated Gradients for 3 positive cases and 3 failure cases, as analyzed using Captum. UCOD: underlying cause of death.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e78764_fig13.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In the medical domain, as in other mission-critical areas, the black-box nature of ML models limits their adoption in real-world applications. This is because users need to understand what the model is doing, why a specific result is produced, and when their intervention may be necessary due to unreliable outcomes. In the specific use case presented here, the starting point was a model that provides better accuracy compared with the standard approach, yet is considered not usable in practice.</p><p>In this study, we proposed 3 approaches to analyze and improve the reliability of the model. First, we evaluated the model&#x2019;s calibration using ECE and MCE, conducting this analysis at both the chapter level and the category level by considering the distribution of training samples and the frequency of inputs and labels. 
Next, we presented the VoG as a ranking framework to identify the most challenging instances and to support OOD analysis in this work. In this context, OOD does not refer to explicit distributional shifts between training and test data, nor to unseen UCOD labels, as the train-test split was performed using stratified sampling over the target classes. Instead, OOD is used in a task-specific sense to describe rare labels and rare or previously unseen ICD codes stated on the certificate. While certain code combinations and positional effects may influence the rule-based UCOD selection process, such cases cannot be explicitly identified in our analysis. We carried out the same chapter-level and category-level analysis for the VoG scores to better characterize such behavior. This analysis helps experts gain greater control over the model&#x2019;s behavior. Finally, we introduced saliency maps to provide word-level attribution and highlight the importance of specific input features.</p><p>The complexity of the proposed methodologies is relatively low. The training of the model proposed by Della Mea et al [<xref ref-type="bibr" rid="ref26">26</xref>] requires approximately 8 hours, while inference takes only a few minutes. Additionally, the integration of temperature scaling and the computation of the VoG can be easily implemented on the trained model. Specifically, the temperature scaling technique proposed by Guo et al [<xref ref-type="bibr" rid="ref1">1</xref>] involves dividing the logits (inputs to the softmax function) by a learned scalar parameter, simplifying the calibration process. Meanwhile, the computation of the VoG score, proposed by Agarwal et al [<xref ref-type="bibr" rid="ref15">15</xref>], is straightforward and quick to perform on the test set. 
This is because it relies solely on the vanilla gradient explanation from the model, making the process efficient and accessible, and the same goes for the saliency maps computation.</p><p>Although transformer models have been shown to be well calibrated [<xref ref-type="bibr" rid="ref34">34</xref>], we demonstrated that our model is calibrated out of the box with respect to the ECE but less so for the MCE, which reflects the worst-case deviation across bins (<xref ref-type="fig" rid="figure1">Figure 1</xref>). When applying temperature scaling, a standard post-hoc calibration method, we observed a small improvement in ECE but a considerable increase in MCE. A closer inspection indicates that this trade-off is largely driven by the extreme imbalance in the confidence distribution. Because the model achieves an accuracy of 0.990, almost all predictions fall within the highest confidence bin between 0.9 and 1.0, while the remaining bins contain only a very small number of samples. Consequently, temperature scaling improves the average calibration but amplifies errors in sparsely populated bins, which disproportionately affects the MCE. This suggests that temperature scaling may not be suitable for improving calibration when confidence bins are strongly unevenly distributed, and that the increased worst-case error should be interpreted with caution. In our specific use case, we place greater importance on maintaining good calibration in the 0.9&#x2010;1.0 confidence range, where the vast majority of predictions lie, and we treat the remaining lower-confidence cases as candidates for manual review. 
Under these conditions, the improvement in ECE obtained through recalibration can still be considered beneficial.</p><p>From the analysis conducted at the chapter level (<xref ref-type="fig" rid="figure3">Figure 3</xref>), we found that some chapters, such as XV (Pregnancy, childbirth, and the puerperium) and XVII (Congenital malformations, deformations, and chromosomal abnormalities), have overall low confidence. This is expected, as these conditions occur very infrequently in the dataset (<xref ref-type="table" rid="table3">Table 3</xref>). In practice, existing rule-based systems also reject these cases, and all certificates would be subject to manual review because they are highly sensitive. Additional attention may also be needed for Chapters III (Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism), XIII (Diseases of the musculoskeletal system and connective tissue), and XVI (Certain conditions originating in the perinatal period), which exhibit a large IQR together with a low first quartile. These patterns likewise reflect the low target frequency observed for these chapters (<xref ref-type="table" rid="table3">Table 3</xref>).</p><p><xref ref-type="fig" rid="figure2">Figure 2</xref> also presents the relationship between confidence and error rate at the chapter level. In the top-right corner, we observe a cluster of chapters with an error rate of 1, reflecting the fact that these chapters are never predicted; in particular, Chapter XIX (Injury, poisoning, and certain other consequences of external causes) cannot be selected as the UCOD. In contrast, the lower-right region shows a grouping of chapters with both high accuracy and high confidence. The remaining chapters fall outside these clusters and would be candidates for manual intervention or for targeted improvements in the training data. 
Additionally, we manually investigated specific training samples in relation to confidence (<xref ref-type="fig" rid="figure4">Figure 4</xref>) and found that classes with fewer than 15 training samples have very low accuracy, which can be visually identified in the bottom region. From the analysis at the chapter level and based on the distribution of training samples, we can clearly distinguish instances that require further improvements and, when this is not feasible, identify cases that should be flagged for manual selection by experts.</p><p><xref ref-type="fig" rid="figure5">Figures 5</xref> and <xref ref-type="fig" rid="figure6">6</xref> present the VoG score in relation to the average number of lines and the average number of conditions stated on the certificate. Although we can visually observe an increase in instance difficulty as the number of conditions and lines increases, the correlation between these features appears to be weak. <xref ref-type="fig" rid="figure7">Figure 7</xref> shows the VoG distribution at the chapter level, and <xref ref-type="fig" rid="figure8">Figure 8</xref> illustrates the VoG for a selection of individual categories. Overall, the average VoG remains low across chapters, but we observe a substantial dispersion with several high-VoG outliers. These cases, observed at both the chapter and category levels, should be taken into consideration for further investigation and may indicate instances that require additional training data or manual review.</p><p>Furthermore, we extended our investigation to the combined analysis of VoG and confidence with respect to the number of conditions stated as input in the death certificate, as shown in <xref ref-type="fig" rid="figure9">Figures 9</xref><xref ref-type="fig" rid="figure10"/><xref ref-type="fig" rid="figure11"/>-<xref ref-type="fig" rid="figure12">12</xref>. 
In particular, we observe that failure cases are distributed differently when examined in relation to the input conditions compared with the distribution based on the label frequency in the training set. This behavior is expected, since the conditions listed on the certificate are not always necessary or directly relevant for determining the UCOD. In <xref ref-type="fig" rid="figure11">Figure 11</xref>, we note that the average VoG is considerably higher when the input includes conditions that were either absent or occurred only a few times in the training data. Additionally, the difficulty decreases more sharply after a log<sub>2</sub> value of approximately 14 (<xref ref-type="fig" rid="figure12">Figure 12</xref>).</p><p>In particular, <xref ref-type="fig" rid="figure9">Figure 9</xref> shows that VoG is able, to some extent, to highlight OOD-like behavior, as reflected by a long tail of high-difficulty classes corresponding to conditions that appear only a few times in the training set. More clearly, the average VoG is highest for cases involving codes that never appear in the training data (<xref ref-type="fig" rid="figure11">Figures 11</xref> and <xref ref-type="fig" rid="figure12">12</xref>).</p><p>In <xref ref-type="fig" rid="figure13">Figure 13</xref>, we present an overview of how saliency maps can support model interpretability. Specifically, saliency maps highlight which parts of the input text are most influential for the model&#x2019;s decision-making process by identifying words or conditions that contribute most to the predicted UCOD. In the 3 positive examples, selected to illustrate common WHO-rule scenarios, we observe a close correspondence between the predicted label and the features receiving the highest attribution. In the first example, the certificate contains a single condition, a frequent real-world scenario. 
The attribution correctly emphasizes &#x201C;prostate&#x201D; more strongly than &#x201C;malignant neoplasm,&#x201D; reflecting the fact that many malignant neoplasm categories share similar wording and rely on the anatomical location for discrimination. In the second example, 2 conditions are reported on separate lines, and the first is selected as the UCOD. This is expected because atrial fibrillation and flutter (I48), when mentioned alongside &#x201C;ischemic heart disease&#x201D; (I20-I25), must be coded within the &#x201C;ischemic heart disease&#x201D; range. Accordingly, the saliency map assigns meaningful attribution not only to the first condition but also to the supporting condition needed to justify its selection. In the third example, the second condition on the first used line is selected as the UCOD, but this selection is only valid in the presence of the first condition on the same line. The saliency map reflects this rule-consistent structure by highlighting both conditions. Across these examples, we can discern which parts of the input are most decisive for the classification and how the model appears to rely on patterns aligned with underlying WHO rules.</p><p>The final 3 examples in <xref ref-type="fig" rid="figure13">Figure 13</xref> illustrate failure cases. In the first case, the model predicts &#x201C;vascular dementia, unspecified&#x201D; as the UCOD. Although this condition is less specific than the correct one, it corresponds to the condition explicitly stated on the certificate. The correct UCOD is obtained only when this condition is recoded into a more specific category based on the combination of conditions mentioned. However, such recoding patterns are absent or extremely rare in the training data, preventing the model from learning this specific transformation. As a result, the attribution pattern is logically consistent with the input representation, even though the final prediction is incorrect. 
In the second failure case, the error arises because 2 distinct ICD-10 categories share the same title, an issue present in only a few parts of the classification. The reverse-coding process used for training collapses these distinct codes into an identical textual representation, thereby removing the information required for the model to distinguish between them. As a consequence, the model cannot learn this distinction, regardless of reasoning quality. The saliency map nonetheless assigns meaningful attribution to terms such as &#x201C;driver&#x201D; and &#x201C;motor vehicle,&#x201D; which is consistent with the underlying coding logic: injuries cannot be selected as UCOD, and external causes take precedence when present. Since the injury itself is not part of the causal sequence leading to the UCOD, it is plausible that it receives little or no positive attribution. In the third case, the model assigns a positive attribution to the condition that corresponds to the true UCOD, while giving only a slightly positive attribution to the condition it incorrectly predicts. Here, the attribution appears incongruent with the predicted label, and we would have expected to see clearer importance assigned to both the last condition and the predicted UCOD. This discrepancy highlights a mismatch between attribution and prediction, which aligns with the fact that the model selects an incorrect UCOD.</p></sec><sec id="s4-2"><title>Limitations</title><p>In our analysis, we presented statistics on the ICD-10 codes actually present in the training dataset, which encompass approximately 36% of the entire ICD-10 classification. This suggests that one of the primary limitations of the trained model is the coverage of the codes in the dataset, potentially contributing to errors in our predictions on the test set. Consequently, understanding OOD data is crucial, as we attempted with our analysis.</p><p>Additionally, we consider that errors may occur due to incomplete input data. 
As described by Della Mea et al [<xref ref-type="bibr" rid="ref26">26</xref>], while the certificate input is textual, the dataset contains coded data rather than plain textual descriptions. Thus, the data used for training are derived from a reverse coding process from the code to the title of the condition. Unlike a rule-based system where coders can access the full certificate, our study evaluated only the administrative data and conditions stated in the certificate, as this was the only data available. Consequently, there are instances where the same certificate with the same input conditions may assign a UCOD with different codes, which our model cannot differentiate without access to the comprehensive death certificate knowledge used by the coders.</p><p>Our calibration analysis is based on a single post-hoc method, temperature scaling, and reveals a trade-off between average and worst-case calibration. While the model is reasonably well calibrated out-of-the-box in terms of ECE, the MCE remains higher, and applying temperature scaling further improves ECE at the cost of worsening MCE. Given the very high accuracy of the model and the strong concentration of predictions in the highest confidence bin, this behavior suggests that temperature scaling may not be ideal in settings with such imbalanced confidence distributions.</p><p>There are also limitations in how VoG and saliency maps capture instance difficulty and explainability. VoG shows only a weak correlation with simple difficulty features such as the number of conditions or lines, meaning it does not fully reflect these intuitive measures, even if it remains helpful for identifying rare or OOD instances. 
Saliency maps were illustrated through a small set of positive and negative examples, and, given that the input consists of reverse-coded representations based on standardized ICD concept titles rather than natural clinical narratives, token-level attributions reflect the informativeness of specific standardized terms rather than linguistic nuance in unstructured text. Token-level attributions cannot provide a full explanation for a task governed by complex WHO rules. They offer useful insights into which parts of the input influence the model&#x2019;s decisions but should be viewed as complementary rather than exhaustive explainability tools.</p></sec><sec id="s4-3"><title>Future Work</title><p>Future research should evaluate the model on data that differ substantially from the training distribution, such as certificates from the COVID-19 period, to assess how well confidence and VoG scores identify emerging or atypical cases. A more detailed investigation of very high-VoG instances, conducted together with domain experts, is also needed to understand why these cases are difficult and whether they can guide improved training strategies, for example, through difficulty-aware sampling or adaptive instance weighting.</p><p>Further work is required to deepen the analysis of saliency maps and to determine, through expert feedback, whether token-level attributions provide explanations that are meaningful and actionable in practice. More generally, open questions remain regarding how structural elements of the input representation influence gradient-based explanations and whether such effects reflect model internals, methodological artifacts, or data-dependent interactions. Addressing these issues will be important to improve the robustness and interpretability of attribution-based explanations. 
Finally, alternative calibration approaches that maintain low ECE without increasing MCE should be explored.</p></sec><sec id="s4-4"><title>Conclusions</title><p>This study evaluated the interpretability of a transformer-based model for UCOD identification by jointly analyzing confidence calibration, instance difficulty, and input-level explanations. Using a pre&#x2013;fine-tuned model on large-scale reverse-coded death certificate data, we showed that the model exhibits good out-of-the-box calibration in terms of ECE, while MCE remains higher. Post-hoc temperature scaling further improves average calibration but increases worst-case deviations, indicating that calibration methods should be applied cautiously in highly imbalanced confidence regimes.</p><p>VoG provides a complementary signal for ranking instance difficulty, supporting the identification of potentially OOD cases that may require expert review. Although VoG shows limited correlation with simple difficulty proxies, it contributes additional information beyond confidence scores alone. Saliency maps based on gradient attribution highlight input tokens influencing model predictions and reveal patterns consistent with established WHO coding rules in both correct and incorrect cases. Overall, these combined analyses create a structured framework for interpreting transformer-based clinical coding models.</p></sec></sec></body><back><ack><p>The authors used ChatGPT to assist with language editing, grammar refinement, and phrasing improvements during manuscript preparation. All scientific content, analyses, experimental design, and conclusions were produced by the authors.</p></ack><notes><sec><title>Funding</title><p>This study did not receive any external funding. 
No specific grant from any public, commercial, or not-for-profit funding agency was obtained for the conduct of this research.</p></sec><sec><title>Data Availability</title><p>The datasets analyzed during this study are available from the US National Center for Health Statistics Vital Statistics online portal [<xref ref-type="bibr" rid="ref62">62</xref>]. The language model used in this study (disease language bidirectional encoder representations from transformers) is publicly available on Hugging Face [<xref ref-type="bibr" rid="ref37">37</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>MHP contributed to data curation, investigation, validation, and visualization and led writing&#x2014;original draft. KR contributed to conceptualization, formal analysis, methodology, and project administration, and participated in writing&#x2014;review and editing. VDM contributed to resources, supervision, and participated in writing&#x2014;review and editing.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BERT</term><def><p>bidirectional encoder representations from transformers</p></def></def-item><def-item><term id="abb2">DiLBERT</term><def><p>disease language bidirectional encoder representations from transformers</p></def></def-item><def-item><term id="abb3">ECE</term><def><p>expected calibration error</p></def></def-item><def-item><term id="abb4">ICD</term><def><p>International Statistical Classification of Diseases and Related Health Problems</p></def></def-item><def-item><term id="abb5">ICD-10</term><def><p>International Statistical Classification of Diseases and Related Health Problems, 10th revision</p></def></def-item><def-item><term id="abb6">ICD-11</term><def><p>International Statistical Classification of Diseases and Related Health Problems, 11th revision</p></def></def-item><def-item><term 
id="abb7">ID</term><def><p>in-distribution</p></def></def-item><def-item><term id="abb8">IG</term><def><p>Integrated Gradients</p></def></def-item><def-item><term id="abb9">MCE</term><def><p>maximum calibration error</p></def></def-item><def-item><term id="abb10">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb11">NN</term><def><p>neural network</p></def></def-item><def-item><term id="abb12">OOD</term><def><p>out-of-distribution</p></def></def-item><def-item><term id="abb13">UCOD</term><def><p>underlying cause of death</p></def></def-item><def-item><term id="abb14">VoG</term><def><p>Variance of Gradients</p></def></def-item><def-item><term id="abb15">WHO</term><def><p>World Health Organization</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>C</given-names> </name><name name-style="western"><surname>Pleiss</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Weinberger</surname><given-names>KQ</given-names> </name></person-group><article-title>On calibration of modern neural networks</article-title><access-date>2026-04-15</access-date><conf-name>Proceedings of the 34th International Conference on Machine Learning</conf-name><conf-date>Aug 6-11, 2017</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/10.5555/3305381.3305518">https://dl.acm.org/doi/10.5555/3305381.3305518</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Geng</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>Y</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ranzato</surname><given-names>M</given-names> </name><name name-style="western"><surname>Beygelzimer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dauphin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Wortman Vaughan</surname><given-names>J</given-names> </name></person-group><article-title>On the importance of gradients for detecting distributional shifts in the wild</article-title><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 6-14, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2021/file/063e26c670d07bb7c4d30e6fc69fe056-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2021/file/063e26c670d07bb7c4d30e6fc69fe056-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Osl</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ohno-Machado</surname><given-names>L</given-names> </name></person-group><article-title>Calibrating predictive model estimates to support personalized medicine</article-title><source>J Am Med Inform Assoc</source><year>2012</year><volume>19</volume><issue>2</issue><fpage>263</fpage><lpage>274</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2011-000291</pub-id><pub-id pub-id-type="medline">21984587</pub-id></nlm-citation></ref><ref 
id="ref4"><label>4</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Minderer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Djolonga</surname><given-names>J</given-names> </name><name name-style="western"><surname>Romijnders</surname><given-names>R</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ranzato</surname><given-names>M</given-names> </name><name name-style="western"><surname>Beygelzimer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dauphin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Wortman Vaughan</surname><given-names>J</given-names> </name></person-group><article-title>Revisiting the calibration of modern neural networks</article-title><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 6-14, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2021/file/8420d359404024567b5aefda1231af24-Paper.pdf">https://proceedings.neurips.cc/paper/2021/file/8420d359404024567b5aefda1231af24-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Bojarski</surname><given-names>M</given-names> </name><name name-style="western"><surname>Del Testa</surname><given-names>D</given-names> </name><name name-style="western"><surname>Dworakowski</surname><given-names>D</given-names> </name><etal/></person-group><article-title>End-to-end learning for self-driving cars</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 25, 2016</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.1604.07316</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cosmides</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tooby</surname><given-names>J</given-names> </name></person-group><article-title>Are humans good intuitive statisticians after all? Rethinking some conclusions from the literature on judgment under uncertainty</article-title><source>Cognition</source><year>1996</year><month>01</month><volume>58</volume><issue>1</issue><fpage>1</fpage><lpage>73</lpage><pub-id pub-id-type="doi">10.1016/0010-0277(95)00664-8</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hannun</surname><given-names>A</given-names> </name><name name-style="western"><surname>Case</surname><given-names>C</given-names> </name><name name-style="western"><surname>Casper</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Deep Speech: scaling up end-to-end speech recognition</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 19, 2014</comment><pub-id pub-id-type="doi">10.48550/ARXIV.1412.5567</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kendall</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cipolla</surname><given-names>R</given-names> </name></person-group><article-title>Modelling uncertainty in deep learning for camera relocalization</article-title><conf-name>2016 IEEE International Conference on Robotics and Automation (ICRA)</conf-name><conf-date>May 16-21, 2016</conf-date><pub-id pub-id-type="doi">10.1109/ICRA.2016.7487679</pub-id></nlm-citation></ref><ref 
id="ref9"><label>9</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>DeVries</surname><given-names>T</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>GW</given-names> </name></person-group><article-title>Learning confidence for out-of-distribution detection in neural networks</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 13, 2018</comment><pub-id pub-id-type="doi">10.48550/ARXIV.1802.04865</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kuleshov</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ermon</surname><given-names>S</given-names> </name></person-group><article-title>Estimating uncertainty online against an adversary</article-title><conf-name>Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence (AAAI&#x2019;17)</conf-name><conf-date>Feb 4-9, 2017</conf-date><pub-id pub-id-type="doi">10.1609/aaai.v31i1.10949</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pleiss</surname><given-names>G</given-names> </name><name name-style="western"><surname>Raghavan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Kleinberg</surname><given-names>J</given-names> </name><name name-style="western"><surname>Weinberger</surname><given-names>KQ</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Guyon</surname><given-names>I</given-names> </name><name name-style="western"><surname>Luxburg</surname><given-names>U</given-names> </name><name 
name-style="western"><surname>Bengio</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wallach</surname><given-names>H</given-names> </name><name name-style="western"><surname>Fergus</surname><given-names>R</given-names> </name><name name-style="western"><surname>Vishwanathan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Garnett</surname><given-names>R</given-names> </name></person-group><article-title>On fairness and calibration</article-title><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 4-9, 2017</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2017/file/b8b9c74ac526fffbeb2d39ab038d1cd7-Paper.pdf">https://proceedings.neurips.cc/paper/2017/file/b8b9c74ac526fffbeb2d39ab038d1cd7-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Niculescu-Mizil</surname><given-names>A</given-names> </name><name name-style="western"><surname>Caruana</surname><given-names>R</given-names> </name></person-group><article-title>Predicting good probabilities with supervised learning</article-title><year>2005</year><conf-name>ICML &#x2019;05: Proceedings of the 22nd International Conference on Machine Learning</conf-name><conf-date>Aug 7-11, 2005</conf-date><pub-id pub-id-type="doi">10.1145/1102351.1102430</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Tolstikhin</surname><given-names>IO</given-names> </name><name name-style="western"><surname>Houlsby</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kolesnikov</surname><given-names>A</given-names> </name><etal/></person-group><person-group 
person-group-type="editor"><name name-style="western"><surname>Ranzato</surname><given-names>M</given-names> </name><name name-style="western"><surname>Beygelzimer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dauphin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Wortman Vaughan</surname><given-names>J</given-names> </name></person-group><article-title>MLP-Mixer: an all-MLP architecture for vision</article-title><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 6-14, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2021/file/cba0a4ee5ccd02fda0fe3f9a3e7b89fe-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2021/file/cba0a4ee5ccd02fda0fe3f9a3e7b89fe-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dosovitskiy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Beyer</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kolesnikov</surname><given-names>A</given-names> </name><etal/></person-group><article-title>An image is worth 16&#x00D7;16 words: transformers for image recognition at scale</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 3, 2021</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2010.11929</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Agarwal</surname><given-names>C</given-names> </name><name name-style="western"><surname>D&#x2019;souza</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Hooker</surname><given-names>S</given-names> </name></person-group><article-title>Estimating example difficulty using variance of gradients</article-title><conf-name>2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name><conf-date>Jun 18-24, 2022</conf-date><pub-id pub-id-type="doi">10.1109/CVPR52688.2022.01012</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kwon</surname><given-names>G</given-names> </name><name name-style="western"><surname>Prabhushankar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Temel</surname><given-names>D</given-names> </name><name name-style="western"><surname>AlRegib</surname><given-names>G</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Vedaldi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bischof</surname><given-names>H</given-names> </name><name name-style="western"><surname>Brox</surname><given-names>T</given-names> </name><name name-style="western"><surname>Frahm</surname><given-names>JM</given-names> </name></person-group><article-title>Backpropagated gradient representations for anomaly detection</article-title><conf-name>Computer Vision&#x2014;ECCV 2020: 16th European Conference, Proceedings, Part XXI</conf-name><conf-date>Aug 23-28, 2020</conf-date><pub-id pub-id-type="doi">10.1007/978-3-030-58589-1_13</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Prabhushankar</surname><given-names>M</given-names> </name><name name-style="western"><surname>AlRegib</surname><given-names>G</given-names> 
</name></person-group><article-title>Gradient-based adversarial and out-of-distribution detection</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 4, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2206.08255</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Schulze</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Sperl</surname><given-names>P</given-names> </name><name name-style="western"><surname>B&#x00F6;ttinger</surname><given-names>K</given-names> </name></person-group><article-title>DA3G: detecting adversarial attacks by analysing gradients</article-title><year>2021</year><conf-name>Computer Security&#x2014;ESORICS 2021: 26th European Symposium on Research in Computer Security</conf-name><pub-id pub-id-type="doi">10.1007/978-3-030-88418-5_27</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Schulze</surname><given-names>JP</given-names> </name><name name-style="western"><surname>Sperl</surname><given-names>P</given-names> </name><name name-style="western"><surname>R&#x0103;dutoiu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sagebiel</surname><given-names>C</given-names> </name><name name-style="western"><surname>B&#x00F6;ttinger</surname><given-names>K</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Amini</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Canu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fischer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Guns</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Novak</surname><given-names>PK</given-names> </name><name name-style="western"><surname>Tsoumakas</surname><given-names>G</given-names> </name></person-group><article-title>R2-AD2: detecting anomalies by analysing the raw gradient</article-title><source>Machine Learning and Knowledge Discovery in Databases</source><year>2023</year><publisher-name>Springer</publisher-name><fpage>209</fpage><lpage>224</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-26387-3_13</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lakshminarayanan</surname><given-names>B</given-names> </name><name name-style="western"><surname>Pritzel</surname><given-names>A</given-names> </name><name name-style="western"><surname>Blundell</surname><given-names>C</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Guyon</surname><given-names>I</given-names> </name><name name-style="western"><surname>Luxburg</surname><given-names>U</given-names> </name><name name-style="western"><surname>Bengio</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wallach</surname><given-names>H</given-names> </name><name name-style="western"><surname>Fergus</surname><given-names>R</given-names> </name><name name-style="western"><surname>Vishwanathan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Garnett</surname><given-names>R</given-names> </name></person-group><article-title>Simple and scalable predictive uncertainty estimation using deep ensembles</article-title><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 4-9, 2017</conf-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://proceedings.neurips.cc/paper/2017/file/9ef2ed4b7fd2c810847ffa5fa85bce38-Paper.pdf">https://proceedings.neurips.cc/paper/2017/file/9ef2ed4b7fd2c810847ffa5fa85bce38-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Owens</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name></person-group><article-title>Energy-based out-of-distribution detection</article-title><conf-name>Proceedings of the 34th International Conference on Neural Information Processing Systems (NeurIPS&#x2019;20)</conf-name><conf-date>Dec 6-12, 2020</conf-date></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>H</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>J</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Bengio</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wallach</surname><given-names>H</given-names> </name><name name-style="western"><surname>Larochelle</surname><given-names>H</given-names> </name><name name-style="western"><surname>Grauman</surname><given-names>K</given-names> </name><name name-style="western"><surname>Cesa-Bianchi</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Garnett</surname><given-names>R</given-names> </name></person-group><article-title>A simple unified framework for detecting out-of-distribution samples and adversarial attacks</article-title><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 3-8, 2018</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2018/file/abdeb6f575ac5c6676b747bca8d09cc2-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2018/file/abdeb6f575ac5c6676b747bca8d09cc2-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kokhlikyan</surname><given-names>N</given-names> </name><name name-style="western"><surname>Miglani</surname><given-names>V</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Captum: a unified and generic model interpretability library for PyTorch</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 16, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2009.07896</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Malkiel</surname><given-names>I</given-names> </name><name name-style="western"><surname>Ginzburg</surname><given-names>D</given-names> </name><name name-style="western"><surname>Barkan</surname><given-names>O</given-names> </name><name name-style="western"><surname>Caciularu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Weill</surname><given-names>J</given-names> </name><name name-style="western"><surname>Koenigstein</surname><given-names>N</given-names> 
</name></person-group><article-title>Interpreting BERT-based text similarity via activation and saliency maps</article-title><conf-name>WWW &#x2019;22: Proceedings of the ACM Web Conference 2022</conf-name><conf-date>Apr 25-29, 2022</conf-date><pub-id pub-id-type="doi">10.1145/3485447.3512045</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>International Statistical Classification of Diseases and Related Health Problems, 10th revision, volume 2</article-title><source>World Health Organization</source><year>2016</year><access-date>2026-03-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://icd.who.int/browse10/Content/statichtml/ICD10Volume2_en_2016.pdf">https://icd.who.int/browse10/Content/statichtml/ICD10Volume2_en_2016.pdf</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Della Mea</surname><given-names>V</given-names> </name><name name-style="western"><surname>Popescu</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Roitero</surname><given-names>K</given-names> </name></person-group><article-title>Underlying cause of death identification from death certificates using reverse coding to text and a NLP based deep learning approach</article-title><source>Inform Med Unlocked</source><year>2020</year><volume>21</volume><fpage>100456</fpage><pub-id pub-id-type="doi">10.1016/j.imu.2020.100456</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Karandikar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cain</surname><given-names>N</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>D</given-names> 
</name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ranzato</surname><given-names>M</given-names> </name><name name-style="western"><surname>Beygelzimer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dauphin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Wortman Vaughan</surname><given-names>J</given-names> </name></person-group><article-title>Soft calibration objectives for neural networks</article-title><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 6-14, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2021/file/f8905bd3df64ace64a68e154ba72f24c-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2021/file/f8905bd3df64ace64a68e154ba72f24c-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Krishnan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tickoo</surname><given-names>O</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Larochelle</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ranzato</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hadsell</surname><given-names>R</given-names> </name><name name-style="western"><surname>Balcan</surname><given-names>MF</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>H</given-names> </name></person-group><article-title>Improving model calibration with accuracy versus uncertainty 
optimization</article-title><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2020/file/d3d9446802a44259755d38e6d163e820-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kumar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sarawagi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>U</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Dy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Krause</surname><given-names>A</given-names> </name></person-group><article-title>Trainable calibration measures for neural networks from kernel mean embeddings</article-title><access-date>2026-03-20</access-date><conf-name>Proceedings of the 35th International Conference on Machine Learning (PMLR)</conf-name><conf-date>Jul 10-15, 2018</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v80/kumar18a.html">https://proceedings.mlr.press/v80/kumar18a.html</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Mukhoti</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kulharia</surname><given-names>V</given-names> </name><name name-style="western"><surname>Sanyal</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Golodetz</surname><given-names>S</given-names> </name><name name-style="western"><surname>Torr</surname><given-names>P</given-names> </name><name name-style="western"><surname>Dokania</surname><given-names>P</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Larochelle</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ranzato</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hadsell</surname><given-names>R</given-names> </name><name name-style="western"><surname>Balcan</surname><given-names>MF</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>H</given-names> </name></person-group><article-title>Calibrating deep neural networks using focal loss</article-title><year>2020</year><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 6-12, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2020/file/aeb7b30ef1d024a76f21a1d40e30c302-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2020/file/aeb7b30ef1d024a76f21a1d40e30c302-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hui</surname><given-names>L</given-names> </name><name name-style="western"><surname>Belkin</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of neural architectures trained with square loss vs cross-entropy in classification tasks</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 23, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2006.07322</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group 
person-group-type="author"><name name-style="western"><surname>Wen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ba</surname><given-names>J</given-names> </name></person-group><article-title>Batch ensemble: an alternative approach to efficient ensemble and lifelong learning</article-title><access-date>2026-04-15</access-date><conf-name>International Conference on Learning Representations (ICLR)</conf-name><conf-date>Apr 26-30, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2002.06715">https://arxiv.org/abs/2002.06715</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Dusenberry</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jerfel</surname><given-names>G</given-names> </name><name name-style="western"><surname>Wen</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Efficient and scalable Bayesian neural nets with rank-1 factors</article-title><access-date>2026-03-20</access-date><conf-name>Proceedings of the 37th International Conference on Machine Learning (PMLR)</conf-name><conf-date>Jun 13-18, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v119/dusenberry20a.html">https://proceedings.mlr.press/v119/dusenberry20a.html</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Desai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Durrett</surname><given-names>G</given-names> </name></person-group><person-group person-group-type="editor"><name 
name-style="western"><surname>Webber</surname><given-names>B</given-names> </name><name name-style="western"><surname>Cohn</surname><given-names>T</given-names> </name><name name-style="western"><surname>He</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name></person-group><article-title>Calibration of pre-trained transformers</article-title><conf-name>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name><conf-date>Nov 16-18, 2020</conf-date><pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.21</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><conf-name>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Long and Short Papers)</conf-name><conf-date>Jun 2-7, 2019</conf-date><pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ott</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><etal/></person-group><article-title>RoBERTa: a robustly optimized BERT 
pretraining approach</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 26, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1907.11692</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roitero</surname><given-names>K</given-names> </name><name name-style="western"><surname>Portelli</surname><given-names>B</given-names> </name><name name-style="western"><surname>Popescu</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Mea</surname><given-names>VD</given-names> </name></person-group><article-title>DiLBERT: cheap embeddings for disease related medical NLP</article-title><source>IEEE Access</source><year>2021</year><volume>9</volume><fpage>159714</fpage><lpage>159723</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2021.3131386</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Stone</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Llama 2: open foundation and fine-tuned chat models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 19, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.09288</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>AQ</given-names> </name><name name-style="western"><surname>Sablayrolles</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mensch</surname><given-names>A</given-names> 
</name><etal/></person-group><article-title>Mistral 7B</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 10, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.06825</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Caruana</surname><given-names>R</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Malmgren</surname><given-names>H</given-names> </name><name name-style="western"><surname>Borga</surname><given-names>M</given-names> </name><name name-style="western"><surname>Niklasson</surname><given-names>L</given-names> </name></person-group><article-title>Case-based explanation for artificial neural nets</article-title><source>Artificial Neural Networks in Medicine and Biology</source><year>2000</year><publisher-name>Springer</publisher-name><fpage>303</fpage><lpage>308</lpage><pub-id pub-id-type="doi">10.1007/978-1-4471-0513-8_46</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hooker</surname><given-names>S</given-names> </name><name name-style="western"><surname>Courville</surname><given-names>A</given-names> </name><name name-style="western"><surname>Clark</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dauphin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Frome</surname><given-names>A</given-names> </name></person-group><article-title>What do compressed deep neural networks forget</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 6, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.1911.05248</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="report"><person-group 
person-group-type="author"><name name-style="western"><surname>Krizhevsky</surname><given-names>A</given-names> </name></person-group><article-title>Learning multiple layers of features from tiny images. Technical report</article-title><year>2009</year><access-date>2026-04-06</access-date><publisher-name>University of Toronto</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf">https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf</ext-link></comment></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Cortes</surname><given-names>C</given-names> </name><name name-style="western"><surname>DeSalvo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Mohri</surname><given-names>M</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Lee</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sugiyama</surname><given-names>M</given-names> </name><name name-style="western"><surname>Luxburg</surname><given-names>U</given-names> </name><name name-style="western"><surname>Guyon</surname><given-names>I</given-names> </name><name name-style="western"><surname>Garnett</surname><given-names>R</given-names> </name></person-group><article-title>Boosting with abstention</article-title><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 5-10, 2016</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2016/file/7634ea65a4e6d9041cfd3f7de18e334a-Paper.pdf">https://proceedings.neurips.cc/paper/2016/file/7634ea65a4e6d9041cfd3f7de18e334a-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guha Roy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ren</surname><given-names>J</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Does your dermatology classifier know what it doesn&#x2019;t know? Detecting the long-tail of unseen conditions</article-title><source>Med Image Anal</source><year>2022</year><month>01</month><volume>75</volume><fpage>102274</fpage><pub-id pub-id-type="doi">10.1016/j.media.2021.102274</pub-id><pub-id pub-id-type="medline">34731777</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Simonyan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Vedaldi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zisserman</surname><given-names>A</given-names> </name></person-group><article-title>Deep inside convolutional networks: visualizing image classification models and saliency maps</article-title><year>2014</year><access-date>2026-04-15</access-date><conf-name>Proceedings of the International Conference on Learning Representations (ICLR)</conf-name><conf-date>Apr 26-30, 2014</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1312.6034">https://arxiv.org/abs/1312.6034</ext-link></comment></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kokilepersaud</surname><given-names>K</given-names> </name><name name-style="western"><surname>Prabhushankar</surname><given-names>M</given-names> </name><name name-style="western"><surname>AlRegib</surname><given-names>G</given-names> </name><name 
name-style="western"><surname>Trejo Corona</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wykoff</surname><given-names>C</given-names> </name></person-group><article-title>Gradient-based severity labeling for biomarker classification in OCT</article-title><conf-name>2022 IEEE International Conference on Image Processing (ICIP)</conf-name><conf-date>Oct 16-19, 2022</conf-date><pub-id pub-id-type="doi">10.1109/ICIP46576.2022.9897215</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Baldock</surname><given-names>R</given-names> </name><name name-style="western"><surname>Maennel</surname><given-names>H</given-names> </name><name name-style="western"><surname>Neyshabur</surname><given-names>B</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ranzato</surname><given-names>M</given-names> </name><name name-style="western"><surname>Beygelzimer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dauphin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Wortman Vaughan</surname><given-names>J</given-names> </name></person-group><article-title>Deep learning through the lens of example difficulty</article-title><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 6-14, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2021/file/5a4b25aaed25c2ee1b74de72dc03c14e-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2021/file/5a4b25aaed25c2ee1b74de72dc03c14e-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation 
citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Paul</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ganguli</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dziugaite</surname><given-names>GK</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ranzato</surname><given-names>M</given-names> </name><name name-style="western"><surname>Beygelzimer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dauphin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Wortman Vaughan</surname><given-names>J</given-names> </name></person-group><article-title>Deep learning on a data diet: finding important examples early in training</article-title><access-date>2026-03-20</access-date><conf-name>Advances in Neural Information Processing Systems</conf-name><conf-date>Dec 6-14, 2021</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2021/file/ac56f8fe9eea3e4a365f29f0f1957c55-Paper.pdf">https://proceedings.neurips.cc/paper/2021/file/ac56f8fe9eea3e4a365f29f0f1957c55-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Igoe</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chung</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Char</surname><given-names>I</given-names> </name><name name-style="western"><surname>Schneider</surname><given-names>J</given-names> </name></person-group><article-title>How useful are gradients for OOD detection really</article-title><source>arXiv</source><comment>Preprint posted online on  
May 20, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2205.10439</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kwon</surname><given-names>G</given-names> </name><name name-style="western"><surname>Prabhushankar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Temel</surname><given-names>D</given-names> </name><name name-style="western"><surname>AlRegib</surname><given-names>G</given-names> </name></person-group><article-title>Novelty detection through model-based characterization of neural networks</article-title><conf-name>2020 IEEE International Conference on Image Processing (ICIP)</conf-name><conf-date>Oct 25-28, 2020</conf-date><pub-id pub-id-type="doi">10.1109/ICIP40778.2020.9190706</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Gradient-based novelty detection boosted by self-supervised binary classification</article-title><conf-name>Proceedings of the 36th AAAI Conference on Artificial Intelligence (AAAI 2022)</conf-name><conf-date>Feb 22 to Mar 1, 2022</conf-date><pub-id pub-id-type="doi">10.1609/aaai.v36i8.20812</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kokhlikyan</surname><given-names>N</given-names> </name><name name-style="western"><surname>Miglani</surname><given-names>V</given-names> </name><name 
name-style="western"><surname>Alsallakh</surname><given-names>B</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>M</given-names> </name><name name-style="western"><surname>Reblitz-Richardson</surname><given-names>O</given-names> </name></person-group><article-title>Investigating sanity checks for saliency maps with image and text classification</article-title><source>arXiv</source><comment>Preprint posted online on Jun 8, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2106.07475</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Borgnia</surname><given-names>E</given-names> </name><name name-style="western"><surname>Goldblum</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goldstein</surname><given-names>T</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Levin</surname><given-names>R</given-names> </name><name name-style="western"><surname>Shu</surname><given-names>M</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Koyejo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Belgrave</surname><given-names>D</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>K</given-names> </name><name name-style="western"><surname>Oh</surname><given-names>A</given-names> </name></person-group><article-title>Where do models go wrong? 
Parameter-space saliency maps for explainability</article-title><conf-name>Advances in Neural Information Processing Systems 35</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><pub-id pub-id-type="doi">10.52202/068431-1135</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eckert</surname><given-names>O</given-names> </name></person-group><article-title>Electronic coding of death certificates</article-title><source>Bundesgesundheitsblatt Gesundheitsforschung Gesundheitsschutz</source><year>2019</year><month>12</month><volume>62</volume><issue>12</issue><fpage>1468</fpage><lpage>1475</lpage><pub-id pub-id-type="doi">10.1007/s00103-019-03045-2</pub-id><pub-id pub-id-type="medline">31686151</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Israel</surname><given-names>RA</given-names> </name></person-group><article-title>Automation of mortality data coding and processing in the United States of America</article-title><source>World Health Stat Q</source><year>1990</year><volume>43</volume><issue>4</issue><fpage>259</fpage><lpage>262</lpage><pub-id pub-id-type="medline">2293494</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Popescu</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Celik</surname><given-names>C</given-names> </name><name name-style="western"><surname>Della Mea</surname><given-names>V</given-names> </name><name name-style="western"><surname>Jakob</surname><given-names>R</given-names> </name></person-group><article-title>Preliminary validation of a rule-based system for mortality coding using ICD-11</article-title><source>Stud Health Technol 
Inform</source><year>2022</year><month>05</month><day>25</day><volume>294</volume><fpage>679</fpage><lpage>683</lpage><pub-id pub-id-type="doi">10.3233/SHTI220555</pub-id><pub-id pub-id-type="medline">35612175</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Falissard</surname><given-names>L</given-names> </name><name name-style="western"><surname>Morgand</surname><given-names>C</given-names> </name><name name-style="western"><surname>Roussel</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A deep artificial neural network-based model for prediction of underlying cause of death from death certificates: algorithm development and validation</article-title><source>JMIR Med Inform</source><year>2020</year><month>04</month><day>28</day><volume>8</volume><issue>4</issue><fpage>e17125</fpage><pub-id pub-id-type="doi">10.2196/17125</pub-id><pub-id pub-id-type="medline">32343252</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pita Ferreira</surname><given-names>P</given-names> </name><name name-style="western"><surname>Godinho Sim&#x00F5;es</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pinto de Carvalho</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Real-time classification of causes of death using AI: sensitivity analysis</article-title><source>JMIR AI</source><year>2023</year><month>11</month><day>22</day><volume>2</volume><fpage>e40965</fpage><pub-id pub-id-type="doi">10.2196/40965</pub-id><pub-id pub-id-type="medline">38875558</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Harteloh</surname><given-names>P</given-names> </name></person-group><article-title>The implementation of an automated coding system for cause-of-death statistics</article-title><source>Inform Health Soc Care</source><year>2020</year><month>01</month><volume>45</volume><issue>1</issue><fpage>1</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.1080/17538157.2018.1496092</pub-id><pub-id pub-id-type="medline">30125131</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Naeini</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Cooper</surname><given-names>GF</given-names> </name><name name-style="western"><surname>Hauskrecht</surname><given-names>M</given-names> </name></person-group><article-title>Obtaining well calibrated probabilities using Bayesian binning</article-title><source>Proc AAAI Conf Artif Intell</source><year>2015</year><month>01</month><volume>2015</volume><fpage>2901</fpage><lpage>2907</lpage><pub-id pub-id-type="medline">25927013</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sundararajan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Taly</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>Q</given-names> </name></person-group><article-title>Axiomatic attribution for deep networks</article-title><access-date>2026-04-15</access-date><conf-name>Proceedings of the 34th International Conference on Machine Learning (ICML 2017)</conf-name><conf-date>Aug 6-11, 2017</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1703.01365">https://arxiv.org/abs/1703.01365</ext-link></comment></nlm-citation></ref><ref 
id="ref62"><label>62</label><nlm-citation citation-type="web"><article-title>The National Vital Statistics System (NVSS)</article-title><source>US Centers for Disease Control and Prevention</source><access-date>2026-03-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/nchs/nvss/index.htm">https://www.cdc.gov/nchs/nvss/index.htm</ext-link></comment></nlm-citation></ref></ref-list></back></article>