<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e66153</article-id><article-id pub-id-type="doi">10.2196/66153</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Domain-Specific Pretraining of NorDeClin-Bidirectional Encoder Representations From Transformers for <italic>International Statistical Classification of Diseases, Tenth Revision,</italic> Code Prediction in Norwegian Clinical Texts: Model Development and Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Ngo</surname><given-names>Phuong Dinh</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Tejedor Hern&#x00E1;ndez</surname><given-names>Miguel &#x00C1;ngel</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" 
rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chomutare</surname><given-names>Taridzo</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Budrionis</surname><given-names>Andrius</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Svenning</surname><given-names>Therese Olsen</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Torsvik</surname><given-names>Torbj&#x00F8;rn</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lamproudis</surname><given-names>Anastasios</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dalianis</surname><given-names>Hercules</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff5">5</xref></contrib></contrib-group><aff id="aff1"><institution>Norwegian Centre for E-health Research, University Hospital of Northern Norway</institution><addr-line>P.O. 
Box 35, N-9038</addr-line><addr-line>Troms&#x00F8;</addr-line><country>Norway</country></aff><aff id="aff2"><institution>Department of Physics and Technology, Faculty of Sciences and Technology, UiT The Arctic University of Norway</institution><addr-line>Troms&#x00F8;</addr-line><country>Norway</country></aff><aff id="aff3"><institution>Department of Mathematics and Statistics, Faculty of Sciences and Technology, UiT The Arctic University of Norway</institution><addr-line>Troms&#x00F8;</addr-line><country>Norway</country></aff><aff id="aff4"><institution>Department of Computer Sciences, Faculty of Sciences and Technology, UiT The Arctic University of Norway</institution><addr-line>Troms&#x00F8;</addr-line><country>Norway</country></aff><aff id="aff5"><institution>Department of Computer and Systems Sciences, Stockholm University</institution><addr-line>Kista</addr-line><country>Sweden</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Emam</surname><given-names>Khaled El</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Yoon</surname><given-names>Hong-Jun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kwon</surname><given-names>Sunjae</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Triep</surname><given-names>Karen</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Miguel &#x00C1;ngel Tejedor Hern&#x00E1;ndez, PhD, Norwegian Centre for E-health Research, University Hospital of Northern Norway, P.O. 
Box 35, N-9038, Troms&#x00F8;, Norway, 47 92699162; <email>Miguel.Angel.Tejedor.Hernandez@ehealthresearch.no</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>25</day><month>8</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e66153</elocation-id><history><date date-type="received"><day>05</day><month>09</month><year>2024</year></date><date date-type="rev-recd"><day>02</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>26</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; Phuong Dinh Ngo, Miguel &#x00C1;ngel Tejedor Hern&#x00E1;ndez, Taridzo Chomutare, Andrius Budrionis, Therese Olsen Svenning, Torbj&#x00F8;rn Torsvik, Anastasios Lamproudis, Hercules Dalianis. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 25.8.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e66153"/><abstract><sec><title>Background</title><p>Accurately assigning <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>) codes is critical for clinical documentation, reimbursement processes, epidemiological studies, and health care planning. Manual coding is time-consuming, labor-intensive, and prone to errors, underscoring the need for automated solutions within the Norwegian health care system. Recent advances in natural language processing (NLP) and transformer-based language models have shown promising results in automating <italic>ICD</italic> (<italic>International Classification of Diseases</italic>) coding in several languages. However, prior work has focused primarily on English and other high-resource languages, leaving a gap in Norwegian-specific clinical NLP research.</p></sec><sec><title>Objective</title><p>This study introduces 2 versions of NorDeClin-BERT (NorDeClin Bidirectional Encoder Representations from Transformers), domain-specific Norwegian BERT-based models pretrained on a large corpus of Norwegian clinical text to enhance their understanding of medical language. Both models were subsequently fine-tuned to predict ICD-10 diagnosis codes. 
We aimed to evaluate the impact of domain-specific pretraining and model size on classification performance and to compare NorDeClin-BERT with general-purpose and cross-lingual BERT models in the context of Norwegian ICD-10 coding.</p></sec><sec sec-type="methods"><title>Methods</title><p>Two versions of NorDeClin-BERT were pretrained on the ClinCode Gastro Corpus, a large-scale dataset comprising 8.8 million deidentified Norwegian clinical notes, to enhance domain-specific language modeling. The base model builds upon NorBERT3-base and was pretrained on a large, relevant subset of the corpus, while the large model builds upon NorBERT3-large and was trained on the full dataset. Both models were benchmarked against SweDeClin-BERT, ScandiBERT, NorBERT3-base, and NorBERT3-large, using standard evaluation metrics: accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score.</p></sec><sec sec-type="results"><title>Results</title><p>The results show that both versions of NorDeClin-BERT outperformed general-purpose Norwegian BERT models and Swedish clinical BERT models in classifying both prevalent and less common <italic>ICD-10</italic> codes. Notably, NorDeClin-BERT-large achieved the highest overall performance across evaluation metrics, demonstrating the impact of domain-specific clinical pretraining in Norwegian. These results highlight that domain-specific pretraining on Norwegian clinical text, combined with model capacity, improves <italic>ICD-10</italic> classification accuracy compared with general-domain Norwegian models and Swedish models pretrained on clinical text. 
Furthermore, while Swedish clinical models demonstrated some transferability to Norwegian, their performance remained suboptimal, emphasizing the necessity of Norwegian-specific clinical pretraining.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This study highlights the potential of NorDeClin-BERT to improve <italic>ICD-10</italic> code classification for the gastroenterology domain in Norway, ultimately streamlining clinical documentation, reporting processes, reducing administrative burden, and enhancing coding accuracy in Norwegian health care institutions. The benchmarking evaluation establishes NorDeClin-BERT as a state-of-the-art model for processing Norwegian clinical text and predicting <italic>ICD-10</italic> coding, establishing a new baseline for future research in Norwegian medical NLP. Future work may explore further domain adaptation techniques, external knowledge integration, and cross-hospital generalizability to enhance <italic>ICD</italic> coding performance across broader clinical settings.</p></sec></abstract><kwd-group><kwd>natural language processing</kwd><kwd>artificial intelligence</kwd><kwd>language model</kwd><kwd>clinical text</kwd><kwd>BERT</kwd><kwd>text mining</kwd><kwd>health care</kwd><kwd>ICD-10 Coding</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The transition to digital health records and the automation of clinical documentation processes represent significant milestones in modern health care management. Central to these advancements is the accurate assignment of the <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>) codes to patient records [<xref ref-type="bibr" rid="ref1">1</xref>]. 
These codes serve multiple critical functions: they streamline billing and insurance claims, play a pivotal role in epidemiological studies, facilitate health care planning, and aid in the management of public health resources. In addition, they serve as a measure of both the quantity and quality of health care provided [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>In Norway, all hospitals record their activity by summarizing patient encounters into <italic>ICD-10</italic> codes. Despite its importance, manually assigning <italic>ICD-10</italic> codes is time-consuming and prone to errors, highlighting the need for an automatic solution [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Several Norwegian studies have highlighted the issues associated with clinical coding [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>], and similar findings from other countries support this nonsatisfactory quality of the manually assigned codes [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>Recent advances in natural language processing (NLP), particularly the development of Bidirectional Encoder Representations from Transformers (BERT) models [<xref ref-type="bibr" rid="ref12">12</xref>], have facilitated novel methodologies for automating complex text data processing. Specifically, the architecture of the BERT transformer facilitates a good understanding of contextual linguistic nuances, making it highly applicable for various clinical tasks, including deidentification [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>] and the prediction of <italic>ICD-10</italic> codes from clinical notes [<xref ref-type="bibr" rid="ref15">15</xref>]. 
Building on these advancements, NorBERT3-base, developed by the Language Technology Group [<xref ref-type="bibr" rid="ref16">16</xref>] at the University of Oslo and available on Hugging Face [<xref ref-type="bibr" rid="ref17">17</xref>], is an advanced, state-of-the-art Norwegian BERT model tailored to understand the complexities of the Norwegian language. It was trained as part of the NorBench initiative, which benchmarks Norwegian language models across various NLP tasks to ensure high performance and robustness. NorBERT3-base is a powerful tool for NLP tasks such as text classification and named entity recognition [<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>Regarding the state-of-the-art automatic <italic>ICD</italic> (<italic>International Classification of Diseases</italic>) coding or computer-assisted coding (CAC) tools, Yan et al [<xref ref-type="bibr" rid="ref19">19</xref>] provided an overview of different approaches to predict <italic>ICD-10</italic> diagnosis codes, describing training datasets in various languages and highlighting issues such as dataset imbalance and explainability. Studies from China [<xref ref-type="bibr" rid="ref20">20</xref>] and Taiwan [<xref ref-type="bibr" rid="ref21">21</xref>] have demonstrated the potential of these tools to improve coding speed and quality.</p><p>In the study by Zhou et al [<xref ref-type="bibr" rid="ref20">20</xref>], a set of regular expressions was written to encode the diagnosis of <italic>ICD-10</italic> automatically. The CAC tool was used for 16 months in 2017&#x2010;2018 and compared with manual diagnosis coding. During this period, 160,000 codes were automatically assigned by the CAC tool and then compared with the manual coding. One of the main findings was that the CAC tool was 100 times faster than manual coding, and the CAC tool could maintain high coding quality. The <italic>F</italic><sub>1</sub>-score of the CAC tool is around 0.6086. 
In another study by Chen et al [<xref ref-type="bibr" rid="ref21">21</xref>], the authors implemented a CAC tool using the BERT model. They trained on patient records from one hospital. A total of 14,602 labels were distributed in the training material that comprised discharge summaries. Note that Chinese and Taiwanese use the same dialect but have different character sets, which are simplified and traditional. The Taiwanese <italic>ICD-10</italic> CAC tool predicts <italic>ICD-10</italic> codes with the best results of <italic>F</italic><sub>1</sub>-score of 0.715 and 0.618, respectively. The tool was also used in a user study that did not decrease coding time; however, the coding quality increased significantly from a median <italic>F</italic><sub>1</sub>-score of 0.832 to 0.922. Ponthongmak et al [<xref ref-type="bibr" rid="ref22">22</xref>] used NLP and discharge summary texts to develop a CAC tool for Thai, achieving an <italic>F</italic><sub>1</sub>-score of 0.7239 using a pretrained language model for automatic <italic>ICD</italic> coding. A systematic literature review of artificial intelligence (AI)&#x2013;based <italic>ICD</italic> coding and classification approaches using discharge summaries can be found in [<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>Several studies have also explored pretraining on clinical text for automatic <italic>ICD</italic> coding, leveraging transformer-based architectures. One such approach is GPsoap, developed by Yang et al [<xref ref-type="bibr" rid="ref24">24</xref>], which transforms <italic>ICD</italic> coding into an autoregressive text generation task. Instead of directly predicting <italic>ICD</italic> codes, GPsoap first generates natural language code descriptions, which are then mapped to <italic>ICD</italic> codes. This approach has shown advantages in few-shot learning and rare code prediction. 
Other studies, such as L&#x00F3;pez-Garc&#x00ED;a et al [<xref ref-type="bibr" rid="ref25">25</xref>], focused on Spanish oncology clinical texts, where a BERT-based model was pretrained on Spanish biomedical literature and further fine-tuned on ICD-O-3 (International Classification of Diseases for Oncology) coding. Similarly, Gao et al [<xref ref-type="bibr" rid="ref26">26</xref>] proposed BNMF, a BERT and Named Entity Recognition-based model for Chinese <italic>ICD</italic> coding, integrating semantic features from clinical text and structured information from <italic>ICD</italic> taxonomies. These studies highlight the importance of domain-specific adaptation when applying language models to clinical coding.</p><p>In contrast, our approach focuses on pretraining a domain-specific clinical BERT model, NorDeClin-BERT, directly on Norwegian clinical text, enabling robust <italic>ICD-10</italic> classification in a multilabel setting. Unlike GPsoap, which generates free-text descriptions, our model directly assigns <italic>ICD-10</italic> codes, aligning more closely with real-world coding workflows. Additionally, our work explores cross-linguistic transfer, evaluating models pretrained on Swedish clinical text for Norwegian <italic>ICD</italic> coding. By fine-tuning various general-domain and domain-specific Scandinavian BERT models, we systematically assess the impact of domain adaptation, model size, and linguistic generalization on <italic>ICD</italic> coding performance.</p><p>Furthermore, our approach provides a comprehensive evaluation of model performance across both domain-adapted and general-purpose pretraining approaches, offering insights into the effectiveness of Norwegian-specific pretraining compared with multilingual and cross-lingual alternatives. 
By investigating how domain-specific pretraining influences <italic>ICD-10</italic> coding accuracy, our study contributes to advancing automatic clinical coding for Norwegian, a language with limited prior research in this area. While model performance is crucial, interpretability is equally important, especially in health care settings where understanding the reasoning behind predictions can impact patient care and trust in the system. Various approaches to model interpretability have been explored in the context of automated <italic>ICD-10</italic> coding. For example, Dolk et al [<xref ref-type="bibr" rid="ref27">27</xref>] evaluated 2 popular interpretability methods, LIME (Local Interpretable Model-agnostic Explanations) and SHAP (Shapley additive explanations), to explain automatic <italic>ICD-10</italic> classifications of Swedish gastrointestinal discharge summaries, where SHAP was considered better than LIME. In our study, we opted for an attention-based analysis instead of LIME or SHAP. This choice is motivated by several factors. Attention mechanisms are inherent to BERT and other Transformer-based models, providing a direct window into the model&#x2019;s decision-making process without requiring post hoc explanations. Furthermore, attention-based interpretability can be extracted during inference, making it more computationally efficient than methods like LIME and SHAP. Attention weights offer fine-grained, token-level insights into which parts of the input text the model focuses on when making predictions, aligning well with the nature of the clinical text and <italic>ICD-10</italic> coding tasks.</p><p>Our research group has previously explored the application of NLP techniques to improve the accuracy and efficiency of <italic>ICD-10</italic> diagnosis coding. 
In a recent study, we developed a BERT-based language model, SweDeClin-BERT, trained on a large open clinical corpus of Swedish discharge summaries, particularly in the gastrointestinal surgery domain [<xref ref-type="bibr" rid="ref13">13</xref>]. This model demonstrated significant potential in assigning <italic>ICD-10</italic> codes to discharge summaries written in Swedish [<xref ref-type="bibr" rid="ref15">15</xref>]. Building on the insights gained from this work, we have extended our focus to the Norwegian clinical context, aiming to develop a specialized language model tailored to the nuances of Norwegian medical texts.</p><p>This study introduces 2 versions of NorDeClin-BERT, BERT-based models specifically developed and fine-tuned for processing Norwegian clinical texts and predicting <italic>ICD-10</italic> codes. We detail the continuous pretraining process of NorDeClin-BERT-base from NorBERT3-base using a large, relevant subset of Norwegian gastroenterological clinical notes, and NorDeClin-BERT-large from NorBERT3-large using the full clinical corpus. By leveraging domain-specific pretraining on Norwegian clinical texts, both models capture the unique linguistic features and domain-specific terminology in Norwegian medical documentation. To assess their effectiveness, we compared the performance of NorDeClin-BERT with other BERT variants, including ScandiBERT [<xref ref-type="bibr" rid="ref28">28</xref>] and NorBERT [<xref ref-type="bibr" rid="ref29">29</xref>]. 
This comparative analysis aims to provide insight into the advantages of a domain-specific, language-tailored model for Norwegian clinical text processing.</p><p>To guide our study, we defined the following research questions (RQs):</p><list list-type="bullet"><list-item><p>RQ1: Does domain-specific pretraining on Norwegian clinical text improve <italic>ICD-10</italic> code classification performance compared with general-domain and cross-lingual models?</p></list-item><list-item><p>RQ2: How does model size impact performance in <italic>ICD-10</italic> coding tasks when combined with clinical domain adaptation?</p></list-item><list-item><p>RQ3: Can a domain-specific base-size model match or outperform larger general-purpose models in a practical clinical classification task?</p></list-item></list></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>This study adopts a structured approach to the continuous pretraining and evaluation of 2 NorDeClin-BERT, new clinical BERT-based models developed for predicting <italic>ICD-10</italic> codes from Norwegian clinical notes, specifically focusing on the gastroenterology domain. This section covers ethical considerations related to data use, the process of data collection and preparation, selection of model architecture and continuous pretraining, fine-tuning, evaluation, and interpretability analysis.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This research was approved by the Norwegian Regional Committees for Medical and Health Research Ethics North, decision number 260972. This study is based on a retrospective analysis of deidentified clinical text. The ethics committee granted a waiver of informed consent in accordance with Norwegian regulations for secondary use of health data in research. All data used in this study were fully deidentified prior to analysis. No personal identifiers were included in the dataset. 
Access to the data was restricted to authorized personnel, and all analyses were conducted in secure computing environments. No compensation was provided to individuals, as the study did not involve direct participation and was conducted on retrospective clinical data. The manuscript does not contain any images or materials in which individual participants or users can be identified.</p></sec><sec id="s2-3"><title>Dataset and Data Processing</title><sec id="s2-3-1"><title>Overview</title><p>The corpus for this study, the ClinCode Gastro Corpus, contains approximately 8.8 million deidentified and pseudonymized clinical notes [<xref ref-type="bibr" rid="ref30">30</xref>] of adult patients treated at the Gastro-Surgical Department of the University Hospital of North Norway, Troms&#x00F8;, from 2017 to 2022. The dataset was subjected to rigorous preprocessing, including deidentification using the NorDeid tool, to ensure patient privacy and data quality [<xref ref-type="bibr" rid="ref30">30</xref>]. The NorDeid tool combines deep learning and rule-based approaches using regular expressions. This tool was adapted for the Norwegian clinical text to address the country&#x2019;s unique format and clinical terminology. 
The process involved identifying and pseudonymizing various protected health information types, such as names, dates, locations, and social security numbers.</p><p>We used the tokenizer associated with each corresponding backbone model during preprocessing: the NorBERT3-base tokenizer for NorDeClin-BERT-base, and the NorBERT3-large tokenizer for NorDeClin-BERT-large. Trained on a general corpus of Norwegian text, these tokenizers effectively handle the linguistic characteristics of the Norwegian language through their subword tokenization technique. Although not specifically constructed for clinical terminology, their subword approach allowed them to manage specialized medical terms and abbreviations present in our dataset during the continuous pretraining phase.</p></sec><sec id="s2-3-2"><title>Data Processing for Continuous Pretraining</title><p>Two configurations of NorDeClin-BERT were pretrained using Norwegian clinical notes, each with a different data selection strategy. For NorDeClin-BERT-base, the dataset was filtered based on clinical relevance and practical feasibility, due to limited computational resources available at the time. Two of the authors of this paper (MATH and TOS) collaborated to identify and agree on the most informative files for the pretraining process. The selection criteria focused on document types containing longer and more meaningful clinical information, ensuring the model was pretrained on the most relevant data. As a result, the final dataset used for pretraining was optimized for both quality and relevance. After removing duplicates, the Norwegian clinical corpus used for the continuous pretraining of NorDeClin-BERT-base consisted of 1,670,464 text files (3.2 GB) from various sources, including discharge summaries, surgery notes, nurses&#x2019; notes, laboratory notes, admission notes, pharmacology notes, and others. 
This dataset is further described in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Document types included in the Norwegian clinical corpus used for the continuous pretraining of NorDeClin-BERT-base (NorDeClin Bidirectional Encoder Representations from Transformers).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Document type</td><td align="left" valign="bottom">Number of files</td><td align="left" valign="bottom">Size</td></tr></thead><tbody><tr><td align="left" valign="top">Anesthesia</td><td align="left" valign="top">46,310</td><td align="left" valign="top">94.8 MB</td></tr><tr><td align="left" valign="top">Treatment</td><td align="left" valign="top">29,919</td><td align="left" valign="top">49.3 MB</td></tr><tr><td align="left" valign="top">Discharge summaries</td><td align="left" valign="top">586,637</td><td align="left" valign="top">1.6 GB</td></tr><tr><td align="left" valign="top">Ergotherapy</td><td align="left" valign="top">33,220</td><td align="left" valign="top">38.4 MB</td></tr><tr><td align="left" valign="top">Pharmacy</td><td align="left" valign="top">3484</td><td align="left" valign="top">4.6 MB</td></tr><tr><td align="left" valign="top">Physiotherapy</td><td align="left" valign="top">69,324</td><td align="left" valign="top">80.4 MB</td></tr><tr><td align="left" valign="top">Individual plan</td><td align="left" valign="top">558</td><td align="left" valign="top">1.4 MB</td></tr><tr><td align="left" valign="top">Admission records</td><td align="left" valign="top">248,208</td><td align="left" valign="top">779.2 MB</td></tr><tr><td align="left" valign="top">Laboratory</td><td align="left" valign="top">66</td><td align="left" valign="top">53.8 kB</td></tr><tr><td align="left" valign="top">Surgery</td><td align="left" valign="top">313,795</td><td align="left" valign="top">446.8 MB</td></tr><tr><td align="left" 
valign="top">Summary records</td><td align="left" valign="top">5710</td><td align="left" valign="top">9.2 MB</td></tr><tr><td align="left" valign="top">Radiology</td><td align="left" valign="top">63,734</td><td align="left" valign="top">30.1 MB</td></tr><tr><td align="left" valign="top">Somatic care</td><td align="left" valign="top">110,248</td><td align="left" valign="top">211.3 MB</td></tr><tr><td align="left" valign="top">Nursing</td><td align="left" valign="top">299,212</td><td align="left" valign="top">220.7 MB</td></tr><tr><td align="left" valign="top">Training dataset (no duplicates)</td><td align="left" valign="top">1,670,464</td><td align="left" valign="top">3.2 GB</td></tr></tbody></table></table-wrap><p>In contrast, NorDeClin-BERT-large was pretrained on the entire ClinCode Gastro Corpus, using updated hardware infrastructure with increased graphics processing unit (GPU) capacity. The only filtering applied at this stage was the removal of very short documents, excluding those with fewer than 50 tokens, to ensure a minimum level of linguistic and contextual content per note. After filtering, the dataset used for NorDeClin-BERT-large consisted of 8,337,664 text files, totaling approximately 13.2 GB. This broader dataset allowed the large model to capture a more comprehensive representation of the Norwegian clinical language used across document types.</p><p>The data processing pipeline for continuous pretraining began with loading and reading the text files from the specified directory. We applied the appropriate tokenizer to convert the text into token IDs while generating the corresponding attention masks. The text was processed in chunks, each constrained to a sequence length of 512 tokens, ensuring compatibility with the model&#x2019;s architecture.</p><p>Several key steps were involved in preparing the text data. Initially, empty lines and whitespace were removed, followed by tokenization without adding special tokens. 
We then introduced separation tokens (&#x003C;/s&#x003E;), as recommended in the RoBERTa paper [<xref ref-type="bibr" rid="ref31">31</xref>], to demarcate the end of individual documents within the text. After concatenating the tokenized text into segments of 510 tokens&#x2014;leaving space for the model&#x2019;s classification (&#x003C;s&#x003E;) and separator (&#x003C;/s&#x003E;) tokens&#x2014;we added these tokens to the beginning and end of each segment, enabling the model to recognize the start and end of sequences effectively. Finally, the processed data was saved to disk in a structured format, ready for continuous pretraining.</p></sec><sec id="s2-3-3"><title>Data Processing for <italic>ICD-10</italic> Fine-Tuning</title><p>The <italic>ICD-10</italic> is a standardized system for coding diseases, signs, symptoms, and other health-related factors. The <italic>ICD-10</italic> is divided into 22 chapters, each representing a broad category of medical conditions. Our study focuses specifically on Chapter XI (K-codes), which covers &#x201C;Diseases of the digestive system.&#x201D; This chapter contains approximately 500 &#x201C;K&#x201D; codes representing various gastrointestinal diseases out of the 38,000 <italic>ICD-10</italic> codes available. The presence of 87,938 discharge summaries with K-codes in our corpus underscores the richness and relevance of our dataset for gastroenterological research and NLP applications in this field. Furthermore, to prevent label leakage during the fine-tuning process, all <italic>ICD-10</italic> codes that match the label for each training sample were systematically removed from the training text. 
This step was essential to ensure the model learns to predict <italic>ICD-10</italic> codes based on clinical content rather than relying on explicitly mentioned codes within the text.</p></sec></sec><sec id="s2-4"><title>Model Continuous Pretraining</title><p>This study presents 2 versions of NorDeClin-BERT, both developed through continuous pretraining on Norwegian clinical text. The first, NorDeClin-BERT-base, is based on the NorBERT3-base architecture, consisting of 12 transformer layers, 12 self-attention heads per layer, and a hidden size of 768 dimensions, resulting in 123 million parameters. The second, NorDeClin-BERT-large, builds upon NorBERT3-large, which features 24 layers, 16 attention heads per layer, a hidden size of 1024 dimensions, and approximately 340 million parameters. These architectures were selected for their proven effectiveness in capturing contextual information and learning rich language representations.</p><p>While both NorDeClin-BERT models retain the architecture of their respective backbone models, they differ in the domain-specific knowledge acquired through further pretraining. NorBERT3-base and NorBERT3-large were originally pretrained on general-domain Norwegian text, whereas we further trained both models on deidentified and pseudonymized Norwegian clinical text to create NorDeClin-BERT-base and NorDeClin-BERT-large, respectively. This continuous pretraining process enhanced the models&#x2019; ability to understand and represent medical language more effectively, making them well-suited for downstream clinical tasks such as <italic>ICD-10</italic> code prediction.</p><p>To further pretrain the NorDeClin-BERT models on our specialized clinical text data, we used a well-structured training pipeline built upon the Hugging Face Transformers library [<xref ref-type="bibr" rid="ref32">32</xref>]. 
The pretraining process was carried out on a Republic of Gamers server running Debian Linux, initially equipped with a single ASUS GeForce RTX 3090 GPU and later expanded to support dual GPUs for training larger configurations. The system has 64 GB of RAM (2&#x00D7;32GB 3200 MHz DDR4), and an 8 TB Gen4&#x00D7;4 M.2 NVMe SSD. The server storage was encrypted and located in a secure server room, accessible only to researchers who were specially authorized to work with the data and had signed confidentiality agreements. The server was not connected to the internet to ensure data security and remained offline throughout the project.</p><p>NorDeClin-BERT was continuously pretrained using the masked language modeling (MLM) objective. In MLM, a portion of the input tokens is randomly masked, and the model is trained to predict the original tokens based on the surrounding context. This approach allows the model to learn robust representations of words and their relationships. Following the findings from the RoBERTa paper [<xref ref-type="bibr" rid="ref31">31</xref>], which indicated that the next-sentence prediction task was unnecessary, we opted to focus exclusively on MLM during the pretraining of both versions of NorDeClin-BERT.</p><p>The tokenized data parts were loaded and concatenated to form a complete training dataset. The dataset was designed to be dynamically masked during training, where tokens were randomly masked at a probability of 15% to train the model on the MLM objective. Training parameters were carefully configured to optimize the model&#x2019;s performance, closely following the RoBERTa paper [<xref ref-type="bibr" rid="ref31">31</xref>]. Both NorDeClin-BERT-base and NorDeClin-BERT-large were pretrained for 40 epochs with a learning rate of 0.0001.</p><p>For NorDeClin-BERT-base, the batch size was configured for 8 sequences per device, with gradient accumulation steps set to 16, effectively simulating a larger batch size of 128 sequences. 
For NorDeClin-BERT-large, the configuration was adapted for dual-GPU training, using a batch size of 16 and accumulation steps of 2 per device, yielding an effective batch size of 64. While not identical, these settings were selected to maintain stable training dynamics under different hardware constraints. Additionally, the training process included a warmup phase (10,000 steps for base, 5000 for large), weight decay of 0.01, and no gradient clipping. The Adam optimizer was used with custom &#x03B5; of 0.000001, &#x03B2;<sub>1</sub> of 0.9, and &#x03B2;<sub>2</sub> of 0.999. During the training process, checkpoints were saved periodically, with retention limits in place to manage disk space efficiently. The training could be resumed from a specific checkpoint if needed.</p></sec><sec id="s2-5"><title>Fine-Tuning</title><p>After continuous pretraining, both NorDeClin-BERT-base and NorDeClin-BERT-large were fine-tuned for <italic>ICD-10</italic> code prediction using 87,938 discharge summaries with K-codes. The dataset was partitioned into training (70,350/87,938, 80%), validation (8794/87,938, 10%), and testing (8794/87,938, 10%) sets. The fine-tuning process began with data preparation, where the discharge summaries and their corresponding <italic>ICD-10</italic> codes were loaded from a CSV file using the Hugging Face datasets library. Each summary&#x2019;s codes were split into a list format for further processing. <xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates this workflow, showing how the NorDeClin-BERT models were pretrained on Norwegian clinical texts and subsequently fine-tuned on the Norwegian <italic>ICD-10</italic> coding task to create the final classification models.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow of the NorDeClin-BERT models. 
The models are initialized from NorBERT3-base and NorBERT3-large and further pretrained on Norwegian clinical texts to create NorDeClin-BERT-base and NorDeClin-BERT-large. Both are then fine-tuned on the Norwegian <italic>ICD-10</italic> coding task, resulting in the specialized classification models NorDeClin-BERT-base-NorICD and NorDeClin-BERT-large-NorICD. BERT: Bidirectional Encoder Representations from Transformers; <italic>ICD-10</italic>: <italic>International Statistical Classification of Diseases, Tenth Revision</italic>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e66153_fig01.png"/></fig><p>A custom preprocessing function was implemented to tokenize the text and prepare the labels. This function tokenized each discharge summary using the model&#x2019;s tokenizer with a maximum sequence length of 512 tokens. Labels were encoded using a multihot encoding scheme, where each unique <italic>ICD-10</italic> code was represented as a binary vector. The NorDeClin-BERT models were then loaded with a classification head adapted for multilabel classification, with the number of output labels set to match the total number of unique <italic>ICD-10</italic> codes in the dataset.</p><p>The training setup used a custom MultilabelTrainer class, extending the HuggingFace Trainer class for multilabel classification. The trainer used a binary cross-entropy loss function BCEWithLogitsLoss and was configured with specific hyperparameters: 40 epochs, a learning rate of 2e-5, and an early stopping patience of 1 epoch. To effectively manage memory constraints and increase the batch size, the training used a batch size of 4 with 16 gradient accumulation steps, resulting in a batch size of 64.</p><p>During the fine-tuning process, the model was trained on the prepared dataset, with evaluation performed on the validation set after each epoch. 
Early stopping was applied to prevent overfitting, and the best model was saved based on validation performance. The training process used a constant learning rate scheduler.</p><p>After training, the models were evaluated on the held-out test set using custom metric functions to compute accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score for multilabel classification. A threshold of 0.5 was applied to the model&#x2019;s output probabilities to determine the final predictions.</p></sec><sec id="s2-6"><title>Evaluation and Benchmarking of the NorDeClin-BERT Models</title><sec id="s2-6-1"><title>Overview</title><p>To benchmark the NorDeClin-BERT models&#x2019; performance, we carefully selected several other BERT-based models for comparison. Each model was chosen to provide specific insights into different aspects of language modeling and transfer learning in the context of Scandinavian languages and clinical text processing. Norwegian and Swedish, as closely related North Germanic languages, share significant lexical, syntactic, and morphological similarities, making cross-linguistic model transfer feasible. Medical terminology is also largely standardized across Scandinavian countries, further supporting the applicability of models trained on one language to another. Given these linguistic and domain-specific similarities, evaluating the NorDeClin-BERT models against models trained on Swedish and general-domain Scandinavian corpora provides valuable insights into how well these models generalize within the Nordic clinical context. 
To better illustrate the methodological differences across models, we provide individual workflow diagrams (<xref ref-type="fig" rid="figure1">Figures 1</xref><xref ref-type="fig" rid="figure2"/><xref ref-type="fig" rid="figure3"/>-<xref ref-type="fig" rid="figure4">4</xref>) and a summary table (<xref ref-type="table" rid="table2">Table 2</xref>), which highlight variations in model pretraining, fine-tuning, and dataset composition, facilitating direct comparison.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Workflow of the SweDeClin-BERT model. The model is initialized from KB-BERT and further pretrained on Swedish clinical texts. It is then fine-tuned separately on Swedish and Norwegian <italic>ICD-10</italic> coding tasks, resulting in 2 specialized versions: SweDeClin-BERT-SweICD and SweDeClin-BERT-NorICD. BERT: Bidirectional Encoder Representations from Transformers; <italic>ICD-10</italic>: <italic>International Statistical Classification of Diseases, Tenth Revision</italic>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e66153_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Workflow of the ScandiBERT model. The model is initialized from ScandiBERT and fine-tuned on the Norwegian <italic>ICD-10</italic> coding task. BERT: Bidirectional Encoder Representations from Transformers; <italic>ICD-10</italic>: <italic>International Statistical Classification of Diseases, Tenth Revision</italic>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e66153_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Workflow of the NorBERT3 models. The base and large variants of NorBERT3 are fine-tuned on the Norwegian <italic>ICD-10</italic> coding task. This results in 2 specialized models: NorBERT3-base-NorICD and NorBERT3-large-NorICD. 
BERT: Bidirectional Encoder Representations from Transformers; <italic>ICD-10</italic>: <italic>International Statistical Classification of Diseases, Tenth Revision</italic>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e66153_fig04.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Overview of the models used in the study. The table presents each model&#x2019;s size, type, pretraining data, and fine-tuning task.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Size</td><td align="left" valign="bottom">Type</td><td align="left" valign="bottom">Pretrained from</td><td align="left" valign="bottom">Pretraining</td><td align="left" valign="bottom">Fine-tuning</td></tr></thead><tbody><tr><td align="left" valign="top">NorDeClin-BERT-base-NorICD<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup><sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Base</td><td align="left" valign="top">Domain-Specific Clinical BERT</td><td align="left" valign="top">NorBERT3-base</td><td align="left" valign="top">Subset of Norwegian clinical texts</td><td align="left" valign="top">Norwegian <italic>ICD-10</italic></td></tr><tr><td align="left" valign="top">SweDeClin-BERT-SweICD</td><td align="left" valign="top">Base</td><td align="left" valign="top">Domain-Specific Clinical BERT</td><td align="left" valign="top">KB-BERT</td><td align="left" valign="top">Swedish clinical texts</td><td align="left" valign="top">Swedish <italic>ICD-10</italic></td></tr><tr><td align="left" valign="top">SweDeClin-BERT-NorICD</td><td align="left" valign="top">Base</td><td align="left" valign="top">Domain-Specific Clinical BERT</td><td align="left" valign="top">KB-BERT</td><td align="left" valign="top">Swedish clinical texts</td><td align="left" valign="top">Norwegian 
<italic>ICD-10</italic></td></tr><tr><td align="left" valign="top">ScandiBERT-NorICD</td><td align="left" valign="top">Base</td><td align="left" valign="top">General-Domain BERT</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">No</td><td align="left" valign="top">Norwegian <italic>ICD-10</italic></td></tr><tr><td align="left" valign="top">NorBERT3-base-NorICD</td><td align="left" valign="top">Base</td><td align="left" valign="top">General-Domain BERT</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">No</td><td align="left" valign="top">Norwegian <italic>ICD-10</italic></td></tr><tr><td align="left" valign="top">NorDeClin-BERT-large-NorICD</td><td align="left" valign="top">Large</td><td align="left" valign="top">Domain-Specific Clinical BERT</td><td align="left" valign="top">NorBERT3-large</td><td align="left" valign="top">Full Norwegian clinical corpus</td><td align="left" valign="top">Norwegian <italic>ICD-10</italic></td></tr><tr><td align="left" valign="top">NorBERT3-large-NorICD</td><td align="left" valign="top">Large</td><td align="left" valign="top">General-Domain BERT</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">No</td><td align="left" valign="top">Norwegian <italic>ICD-10</italic></td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table2fn2"><p><sup>b</sup><italic>ICD-10</italic>: <italic>International Statistical Classification of Diseases, Tenth Revision</italic>.</p></fn><fn id="table2fn3"><p><sup>c</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>The following sections contain a detailed explanation of each model and the rationale behind its inclusion.</p></sec><sec id="s2-6-2"><title>SweDeClin-BERT</title><p>This model originates from KB-BERT [<xref ref-type="bibr" rid="ref33">33</xref>], a 
standard Swedish BERT model, which was further pretrained on Swedish clinical texts [<xref ref-type="bibr" rid="ref13">13</xref>]. It uses the base BERT architecture with 12 layers, 768 hidden units, and 12 attention heads, totaling approximately 110 million parameters. This comparison helps evaluate the importance of language-specific training in clinical NLP tasks. SweDeClin-BERT is represented by 2 variants in our evaluation task: SweDeClin-BERT-SweICD and SweDeClin-BERT-NorICD. SweDeClin-BERT-SweICD is a variant of SweDeClin-BERT, which was further fine-tuned on Swedish datasets for <italic>ICD-10</italic> code classification. SweDeClin-BERT-NorICD represents SweDeClin-BERT further fine-tuned on the Norwegian ClinCode Gastro Corpus. Their inclusion allows us to assess the performance of models specifically designed for clinical text but in a closely related Scandinavian language. By comparing their performance with the NorDeClin-BERT models, we can also determine how well clinical knowledge and <italic>ICD-10</italic> classification capabilities transfer from Swedish to Norwegian. <xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates the training and fine-tuning process of SweDeClin-BERT, highlighting its pretraining on Swedish clinical text and subsequent fine-tuning for <italic>ICD-10</italic> classification in both Swedish and Norwegian.</p></sec><sec id="s2-6-3"><title>ScandiBERT</title><p>A model [<xref ref-type="bibr" rid="ref34">34</xref>] pretrained on a mix of Scandinavian languages designed to capture the linguistic characteristics of the region [<xref ref-type="bibr" rid="ref28">28</xref>]. Its inclusion allows for evaluating the effectiveness of a multilingual model compared with a language-specific model in the <italic>ICD-10</italic> coding prediction task. 
<xref ref-type="fig" rid="figure3">Figure 3</xref> illustrates the fine-tuning process of ScandiBERT, where the model is adapted to Norwegian clinical text using <italic>ICD-10</italic> coding data, resulting in the ScandiBERT-NorICD variant.</p></sec><sec id="s2-6-4"><title>NorBERT3 (Base and Large Variants)</title><p>A model developed for the Norwegian language [<xref ref-type="bibr" rid="ref29">29</xref>]. NorBERT3-base uses a similar architecture to the other base models, while NorBERT3-large uses a larger architecture with 24 layers, 1024 hidden units, and 16 attention heads, totaling approximately 340 million parameters. Including both base and large variants allows assessing the impact of model size on performance. Additionally, comparing these general-domain models with the NorDeClin-BERT-base and NorDeClin-BERT-large models provides a fair assessment of the effects of clinical domain adaptation versus general language pretraining for Norwegian. <xref ref-type="fig" rid="figure4">Figure 4</xref> illustrates the fine-tuning process of NorBERT3-base and NorBERT3-large on Norwegian <italic>ICD-10</italic> coding tasks, resulting in the specialized models NorBERT3-base-NorICD and NorBERT3-large-NorICD.</p></sec><sec id="s2-6-5"><title>Evaluation Metrics</title><sec id="s2-6-5-1"><title>Overview</title><p>Each model was fine-tuned and evaluated using the same training, validation, and testing splits of the dataset. We used a comprehensive evaluation strategy focusing on the following metrics: accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score [<xref ref-type="bibr" rid="ref35">35</xref>]. Accuracy measures the proportion of correct predictions out of the total predictions made, providing an overall effectiveness of the model. Precision indicates the proportion of true positive predictions among all positive predictions, where high precision means that the model has a low false-positive rate. 
The recall represents the proportion of true positive predictions among all actual positives, with high recall indicating the model&#x2019;s ability to identify most of the relevant instances. The <italic>F</italic><sub>1</sub>-score, as the harmonic mean of precision and recall, provides a single metric that balances both concerns, which is particularly useful when the class distribution is imbalanced. These metrics were calculated considering the multilabel nature of the problem using weighted averages. The evaluation was carried out for both the complete set of codes and the top 80% codes that are used the most. We applied these metrics in 2 main evaluation strategies: multilabel evaluation and top-5 evaluation.</p></sec><sec id="s2-6-5-2"><title>Multilabel Evaluation</title><p>Given the multilabel nature of <italic>ICD-10</italic> coding, where multiple codes may apply to a single clinical note, we analyzed model performance in predicting the exact set of relevant codes at the sample level. This was achieved by converting the model&#x2019;s output logits to probability scores and applying a threshold of 0.5 to generate binary predictions, where a label is considered predicted if its probability is greater than or equal to 0.5. These binary predictions were then compared against the true labels to compute accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score, providing a detailed view of the model&#x2019;s ability to handle multiple simultaneous labels correctly.</p></sec><sec id="s2-6-5-3"><title>Top-5 Evaluation</title><p>This evaluation assesses the model&#x2019;s ability to predict the top-5 most probable codes for each clinical note at the sample level, reflecting practical coding scenarios where identifying the most relevant codes quickly is crucial. 
The process involved sorting the probability scores for each sample to identify the top 5 highest scoring labels and converting these indices to their corresponding <italic>ICD-10</italic> codes. The actual labels present in the ground truth were then extracted for each sample. Each actual label was checked to see if it was among the top 5 predicted labels. If the actual label was among the top 5 predicted labels, it was added to both the actual and predicted lists. If not, the actual label was added to the actual list, and the last element in the top-5 predictions was added to the predicted list. Finally, the evaluation metrics, including accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score, were calculated by comparing the predicted and label lists.</p></sec></sec></sec><sec id="s2-7"><title>Model Interpretability</title><p>To provide insights into the decision-making processes of the NorDeClin-BERT models, an attention-based interpretability analysis was conducted. This involved generating a synthetic clinical text using ChatGPT, processing the text through both NorDeClin-BERT-base-NorICD and NorDeClin-BERT-large-NorICD models, extracting attention weights, aggregating and normalizing attention weights across all layers and heads, and visualizing attention distribution across input tokens during <italic>ICD-10</italic> code prediction.</p><p>This methodology allows for a comprehensive evaluation of the NorDeClin-BERT models&#x2019; performance, their comparative advantages over other BERT variants, and insights into their internal decision-making processes, all crucial for assessing their potential in automating <italic>ICD-10</italic> coding in Norwegian health care settings.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Model Performance</title><p>The evaluation of the NorDeClin-BERT models and their comparison with other BERT-based models across 4 critical metrics (accuracy, precision, recall, and 
<italic>F</italic><sub>1</sub>-score) yielded meaningful insights into their performance on <italic>ICD-10</italic> code classification tasks. The analysis was conducted for 2 distinct scenarios: classification performance for all codes and the top 80% most frequently used codes. Performance was further categorized into multilabel and top-5 accuracy.</p></sec><sec id="s3-2"><title>Accuracy</title><p><xref ref-type="table" rid="table3">Table 3</xref> presents the accuracy scores across all evaluated models under 4 evaluation settings. NorDeClin-BERT-large-NorICD achieved the highest accuracy across all scenarios, including multilabel (0.47) and top-5 (0.82) classification of the full <italic>ICD-10</italic> code set, as well as multilabel (0.56) and top-5 (0.88) classification of the top 80% most-used codes. It outperformed all other models, including the larger general-domain NorBERT3-large-NorICD, with the largest margin observed in the all codes multilabel setting (0.47 vs 0.42).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparison of the accuracy of different BERT (Bidirectional Encoder Representations from Transformers) models.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model size and model name</td><td align="left" valign="bottom" colspan="2">All codes, 95% CI</td><td align="left" valign="bottom" colspan="2">Top 80% codes, 95% CI</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Multilabel</td><td align="left" valign="bottom">Top-5</td><td align="left" valign="bottom">Multilabel</td><td align="left" valign="bottom">Top-5</td></tr></thead><tbody><tr><td align="left" valign="top">Base</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorDeClin-BERT-base-NorICD<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="char" char="." valign="top">0.44 (0.43-0.45)</td><td align="char" char="." valign="top">0.81 (0.80-0.81)</td><td align="char" char="." valign="top">0.54 (0.53-0.55)</td><td align="char" char="." valign="top">0.87 (0.86-0.88)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SweDeClin-BERT-SweICD</td><td align="char" char="." valign="top">0.25 (0.24-0.26)</td><td align="char" char="." valign="top">0.59 (0.58-0.60)</td><td align="char" char="." valign="top">0.35 (0.34-0.36)</td><td align="char" char="." valign="top">0.65 (0.63-0.65)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SweDeClin-BERT-NorICD</td><td align="char" char="." valign="top">0.40 (0.39-0.41)</td><td align="char" char="." valign="top">0.78 (0.77-0.79)</td><td align="char" char="." valign="top">0.50 (0.49-0.51)</td><td align="char" char="." valign="top">0.85 (0.84-0.86)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ScandiBERT-NorICD</td><td align="char" char="." valign="top">0.39 (0.38-0.40)</td><td align="char" char="." valign="top">0.78 (0.77-0.79)</td><td align="char" char="." valign="top">0.51 (0.50-0.52)</td><td align="char" char="." valign="top">0.85 (0.84-0.86)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorBERT3-base-NorICD</td><td align="char" char="." valign="top">0.43 (0.42-0.44)</td><td align="char" char="." valign="top">0.80 (0.79-0.81)</td><td align="char" char="." valign="top">0.52 (0.51-0.53)</td><td align="char" char="." 
valign="top">0.86 (0.86-0.87)</td></tr><tr><td align="left" valign="top">Large</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorDeClin-BERT-large-NorICD</td><td align="char" char="." valign="top">0.47 (0.46-0.48)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="char" char="." valign="top">0.82 (0.82-0.83)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="char" char="." valign="top">0.56 (0.55-0.57)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="char" char="." valign="top">0.88 (0.88-0.89)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorBERT3-large-NorICD</td><td align="char" char="." valign="top">0.42 (0.41-0.43)</td><td align="char" char="." valign="top">0.81 (0.80-0.82)</td><td align="char" char="." valign="top">0.53 (0.52-0.54)</td><td align="char" char="." valign="top">0.88 (0.87-0.88)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup><italic>ICD-10</italic>: <italic>International Statistical Classification of Diseases, Tenth Revision</italic>.</p></fn><fn id="table3fn2"><p><sup>b</sup>Highest score for each scenario.</p></fn></table-wrap-foot></table-wrap><p>Among base-sized models, NorDeClin-BERT-base-NorICD also showed strong performance, surpassing SweDeClin-BERT-NorICD, ScandiBERT-NorICD, and SweDeClin-BERT-SweICD in all settings. 
Notably, it matched or exceeded the performance of the larger NorBERT3-large-NorICD in 3 out of 4 scenarios, highlighting the impact of clinical domain adaptation even in smaller models.</p></sec><sec id="s3-3"><title>Precision</title><p><xref ref-type="table" rid="table4">Table 4</xref> presents precision scores across all models and evaluation scenarios. NorDeClin-BERT-large-NorICD achieved the highest precision in 3 out of 4 settings, including all codes multilabel (0.66), top-5 (0.82), and top 80% most-used codes top-5 (0.90). It performed comparably to NorBERT3-large-NorICD in the remaining setting, where NorBERT3-large-NorICD achieved a higher top 80% most-used codes multilabel precision (0.73 vs 0.72).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Comparison of the precision of different BERT (Bidirectional Encoder Representations from Transformers) models.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model size and model name</td><td align="left" valign="bottom" colspan="2">All codes, 95% CI</td><td align="left" valign="bottom" colspan="2">Top 80% codes, 95% CI</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Multilabel</td><td align="left" valign="bottom">Top-5</td><td align="left" valign="bottom">Multilabel</td><td align="left" valign="bottom">Top-5</td></tr></thead><tbody><tr><td align="left" valign="top">Base</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorDeClin-BERT-base-NorICD<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="char" char="." valign="top">0.65 (0.64-0.66)</td><td align="char" char="." valign="top">0.80 (0.79-0.81)</td><td align="char" char="." 
valign="top">0.71 (0.70-0.73)</td><td align="char" char="." valign="top">0.89 (0.88-0.90)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SweDeClin-BERT-SweICD</td><td align="char" char="." valign="top">0.38 (0.36-0.40)</td><td align="char" char="." valign="top">0.61 (0.60-0.62)</td><td align="char" char="." valign="top">0.46 (0.44-0.48)</td><td align="char" char="." valign="top">0.69 (0.67-0.70)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SweDeClin-BERT-NorICD</td><td align="char" char="." valign="top">0.58 (0.56-0.59)</td><td align="char" char="." valign="top">0.77 (0.76-0.78)</td><td align="char" char="." valign="top">0.66 (0.65-0.68)</td><td align="char" char="." valign="top">0.87 (0.86-0.88)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ScandiBERT-NorICD</td><td align="char" char="." valign="top">0.57 (0.55-0.58)</td><td align="char" char="." valign="top">0.77 (0.76-0.78)</td><td align="char" char="." valign="top">0.67 (0.66-0.69)</td><td align="char" char="." valign="top">0.87 (0.87-0.88)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorBERT3-base-NorICD</td><td align="char" char="." valign="top">0.63 (0.61-0.64)</td><td align="char" char="." valign="top">0.79 (0.78-0.80)</td><td align="char" char="." valign="top">0.69 (0.68-0.70)</td><td align="char" char="." 
valign="top">0.88 (0.88-0.89)</td></tr><tr><td align="left" valign="top">Large</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorDeClin-BERT-large-NorICD</td><td align="char" char="." valign="top">0.66 (0.65-0.68)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="char" char="." valign="top">0.82 (0.81-0.82)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="char" char="." valign="top">0.72 (0.71-0.74)</td><td align="char" char="." valign="top">0.90 (0.90-0.91)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorBERT3-large-NorICD</td><td align="char" char="." valign="top">0.65 (0.64-0.67)</td><td align="char" char="." valign="top">0.80 (0.79-0.81)</td><td align="char" char="." valign="top">0.73 (0.72-0.74)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="char" char="." valign="top">0.89 (0.89-0.90)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup><italic>ICD-10</italic>: <italic>International Statistical Classification of Diseases, Tenth Revision</italic>.</p></fn><fn id="table4fn2"><p><sup>b</sup>Highest score for each scenario.</p></fn></table-wrap-foot></table-wrap><p>Among base-sized models, NorDeClin-BERT-base-NorICD outperformed all other base models in every scenario, with precision scores of 0.65 (all codes multilabel), 0.80 (top-5), 0.71 (top 80% most-used codes multilabel), and 0.89 (top 80% most-used codes top-5). 
This performance closely approaches that of the large models, further reinforcing the strength of domain-specific pretraining even with smaller architectures.</p></sec><sec id="s3-4"><title>Recall</title><p><xref ref-type="table" rid="table5">Table 5</xref> reports the recall scores across all models and evaluation scenarios. NorDeClin-BERT-large-NorICD consistently achieved the highest recall across all 4 evaluation settings, with scores of 0.48 (all codes multilabel), 0.82 (all codes top-5), 0.54 (top 80% most-used codes multilabel), and 0.88 (top 80% most-used codes top-5). The largest improvement was observed in the multilabel settings, where it outperformed the general-domain NorBERT3-large-NorICD by 5 percentage points in all codes (0.48 vs 0.43) and 4 percentage points in the top 80% codes (0.54 vs 0.50), underscoring the advantage of domain-specific pretraining at scale.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Comparison of the recall of different BERT (Bidirectional Encoder Representations from Transformers) models.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model size and model name</td><td align="left" valign="bottom" colspan="2">All codes, 95% CI</td><td align="left" valign="bottom" colspan="2">Top 80% codes, 95% CI</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Multilabel</td><td align="left" valign="bottom">Top-5</td><td align="left" valign="bottom">Multilabel</td><td align="left" valign="bottom">Top-5</td></tr></thead><tbody><tr><td align="left" valign="top">Base</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorDeClin-BERT-base-NorICD<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="char" char="." 
valign="top">0.45 (0.44-0.46)</td><td align="char" char="." valign="top">0.81 (0.80-0.81)</td><td align="char" char="." valign="top">0.51 (0.50-0.52)</td><td align="char" char="." valign="top">0.87 (0.86-0.88)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SweDeClin-BERT-SweICD</td><td align="char" char="." valign="top">0.25 (0.24-0.26)</td><td align="char" char="." valign="top">0.59 (0.58-0.60)</td><td align="char" char="." valign="top">0.29 (0.28-0.30)</td><td align="char" char="." valign="top">0.65 (0.63-0.66)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SweDeClin-BERT-NorICD</td><td align="char" char="." valign="top">0.41 (0.40-0.42)</td><td align="char" char="." valign="top">0.78 (0.77-0.79)</td><td align="char" char="." valign="top">0.48 (0.47-0.49)</td><td align="char" char="." valign="top">0.85 (0.84-0.86)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ScandiBERT-NorICD</td><td align="char" char="." valign="top">0.40 (0.39-0.41)</td><td align="char" char="." valign="top">0.78 (0.77-0.79)</td><td align="char" char="." valign="top">0.48 (0.47-0.49)</td><td align="char" char="." valign="top">0.85 (0.84-0.86)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorBERT3-base-NorICD</td><td align="char" char="." valign="top">0.44 (0.43-0.45)</td><td align="char" char="." valign="top">0.80 (0.79-0.81)</td><td align="char" char="." valign="top">0.51 (0.50-0.52)</td><td align="char" char="." 
valign="top">0.86 (0.86-0.87)</td></tr><tr><td align="left" valign="top">Large</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorDeClin-BERT-large-NorICD</td><td align="char" char="." valign="top">0.48 (0.47-0.49)<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="char" char="." valign="top">0.82 (0.82-0.83)<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="char" char="." valign="top">0.54 (0.53-0.55)<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="char" char="." valign="top">0.88 (0.88-0.89)<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorBERT3-large-NorICD</td><td align="char" char="." valign="top">0.43 (0.42-0.44)</td><td align="char" char="." valign="top">0.81 (0.80-0.82)</td><td align="char" char="." valign="top">0.50 (0.49-0.51)</td><td align="char" char="." valign="top">0.88 (0.87-0.88)<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup><italic>ICD-10</italic>: <italic>International Statistical Classification of Diseases, Tenth Revision</italic>.</p></fn><fn id="table5fn2"><p><sup>b</sup>Highest score for each scenario.</p></fn></table-wrap-foot></table-wrap><p>Among the base-sized models, NorDeClin-BERT-base-NorICD also performed strongly, achieving 0.45 recall in all codes multilabel, 0.81 in top-5, 0.51 in top 80% most-used codes multilabel, and 0.87 in top 80% most-used codes top-5. 
It outperformed all general-domain baselines (ScandiBERT and NorBERT3-base), as well as the domain-specific Swedish models (SweDeClin-BERT-SweICD and SweDeClin-BERT-NorICD). Its recall closely matched or exceeded that of the larger NorBERT3-large-NorICD model in 3 of the 4 settings, further supporting the impact of domain-specific pretraining for improving recall in clinical coding tasks.</p></sec><sec id="s3-5"><title><italic>F</italic><sub>1</sub>-Score</title><p><xref ref-type="table" rid="table6">Table 6</xref> summarizes the <italic>F</italic><sub>1</sub>-scores for all models across the 4 evaluation scenarios. NorDeClin-BERT-large-NorICD achieved the highest <italic>F</italic><sub>1</sub>-score in all cases, with 0.54 for all codes multilabel, 0.81 for all codes top-5, 0.60 for the top 80% most-used codes multilabel, and 0.89 for top 80% top-5, consistently outperforming the general-domain NorBERT3-large-NorICD (0.50, 0.79, 0.58, and 0.88, respectively). The largest <italic>F</italic><sub>1</sub>-score margin between the large models was observed in the all codes multilabel setting (0.54 vs 0.50), highlighting the impact of domain adaptation on balancing precision and recall in complex coding tasks.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Comparison of <italic>F</italic><sub>1</sub>-score of different BERT (Bidirectional Encoder Representations from Transformers) models.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model size and model name</td><td align="left" valign="bottom" colspan="2">All codes, 95% CI</td><td align="left" valign="bottom" colspan="2">Top 80% codes, 95% CI</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Multilabel</td><td align="left" valign="bottom">Top-5</td><td align="left" valign="bottom">Multilabel</td><td align="left" valign="bottom">Top-5</td></tr></thead><tbody><tr><td align="left" 
valign="top">Base</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorDeClin-BERT-base-NorICD<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup></td><td align="char" char="." valign="top">0.52 (0.51-0.53)</td><td align="char" char="." valign="top">0.79 (0.79-0.80)</td><td align="char" char="." valign="top">0.58 (0.57-0.59)</td><td align="char" char="." valign="top">0.88 (0.87-0.88)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SweDeClin-BERT-SweICD</td><td align="char" char="." valign="top">0.27 (0.26-0.27)</td><td align="char" char="." valign="top">0.55 (0.54-0.56)</td><td align="char" char="." valign="top">0.31 (0.30-0.32)</td><td align="char" char="." valign="top">0.63 (0.62-0.64)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SweDeClin-BERT-NorICD</td><td align="char" char="." valign="top">0.46 (0.45-0.47)</td><td align="char" char="." valign="top">0.76 (0.75-0.77)</td><td align="char" char="." valign="top">0.54 (0.53-0.55)</td><td align="char" char="." valign="top">0.86 (0.85-0.87)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ScandiBERT-NorICD</td><td align="char" char="." valign="top">0.45 (0.44-0.46)</td><td align="char" char="." valign="top">0.76 (0.75-0.77)</td><td align="char" char="." valign="top">0.54 (0.53-0.55)</td><td align="char" char="." valign="top">0.86 (0.85-0.87)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorBERT3-base-NorICD</td><td align="char" char="." 
valign="top">0.50 (0.49-0.51)</td><td align="char" char="." valign="top">0.78 (0.78-0.79)</td><td align="char" char="." valign="top">0.57 (0.56-0.58)</td><td align="char" char="." valign="top">0.87 (0.86-0.88)</td></tr><tr><td align="left" valign="top">Large</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorDeClin-BERT-large-NorICD</td><td align="char" char="." valign="top">0.54 (0.53-0.55)<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup></td><td align="char" char="." valign="top">0.81 (0.80-0.82)<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup></td><td align="char" char="." valign="top">0.60 (0.60-0.61)<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup></td><td align="char" char="." valign="top">0.89 (0.89-0.90)<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NorBERT3-large-NorICD</td><td align="char" char="." valign="top">0.50 (0.49-0.51)</td><td align="char" char="." valign="top">0.79 (0.78-0.80)</td><td align="char" char="." valign="top">0.58 (0.56-0.59)</td><td align="char" char="." valign="top">0.88 (0.87-0.89)</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup><italic>ICD-10</italic>: <italic>International Statistical Classification of Diseases, Tenth Revision</italic>.</p></fn><fn id="table6fn2"><p><sup>b</sup>Highest score for each scenario.</p></fn></table-wrap-foot></table-wrap><p>Among base-sized models, NorDeClin-BERT-base-NorICD also demonstrated strong performance with <italic>F</italic><sub>1</sub>-scores of 0.52, 0.79, 0.58, and 0.88, respectively. 
It outperformed all other base models across every scenario and matched or exceeded the performance of the larger NorBERT3-large-NorICD in all 4 settings, further validating the strength of clinical domain pretraining even in smaller architectures.</p></sec><sec id="s3-6"><title>Interpretability</title><p><xref ref-type="fig" rid="figure5">Figure 5</xref> illustrates the attention distribution of NorDeClin-BERT-base-NorICD in a synthetic clinical text. The attention appears to be distributed relatively uniformly throughout the clinical description, suggesting that the model focuses on a comprehensive contextual understanding of the text to make predictions. Key medical terms like diar&#x00E9; (diarrhea), bl&#x00F8;dning (bleeding), Crohns sykdom (Crohn disease), and inflammatorisk tarmsykdom (inflammatory bowel disease) receive high attention, indicating their importance in the model&#x2019;s decision-making process. The model&#x2019;s interpretability is based on its attention to clinical descriptions and terminology. This approach provides valuable insights into how the model processes natural language to arrive at its predictions. It is particularly useful in understanding how the model infers <italic>ICD</italic> codes from medical text alone, mimicking the process a human expert might follow when assigning codes based on clinical narratives.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Attention distribution of NorDeClin-BERT-base-NorICD on a synthetic clinical text. A translation of the text is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
BERT: Bidirectional Encoder Representations from Transformers; <italic>ICD-10</italic>: <italic>International Statistical Classification of Diseases, Tenth Revision</italic>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e66153_fig05.png"/></fig><p><xref ref-type="fig" rid="figure6">Figure 6</xref> shows the attention distribution of NorDeClin-BERT-large-NorICD applied to the same synthetic clinical text. Like the base model, it assigns high attention weights to terms such as bl&#x00F8;dning, Crohns sykdom, and inflammatorisk tarmsykdom. However, the large model displays a slightly more focused and confident pattern, with attention concentrated more tightly around diagnostically relevant phrases. This may reflect the benefit of both greater model capacity and pretraining on the full corpus, resulting in more targeted representation learning.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Attention distribution of NorDeClin-BERT-large-NorICD on the same synthetic clinical text. A translation of the text is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. BERT: Bidirectional Encoder Representations from Transformers; <italic>ICD-10</italic>: <italic>International Statistical Classification of Diseases, Tenth Revision</italic>.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e66153_fig06.png"/></fig><p>Comparing the 2 models, both demonstrate strong interpretability by attending to clinically meaningful concepts. NorDeClin-BERT-large-NorICD appears to apply attention more selectively, in line with its superior classification performance. 
These visualizations support the idea that domain-specific pretraining not only improves predictive performance but also enhances transparency and trust in real-world clinical applications.</p><p>The distribution of attention across the text, focusing on key medical terms, suggests that the models have developed a nuanced understanding of clinical language. This method of interpretation allows us to understand which parts of the clinical narrative the models consider most relevant for predicting <italic>ICD</italic> codes. This ability to extract relevant information from various parts of the text indicates a robust and generalizable approach to <italic>ICD</italic> code prediction. It showcases the models&#x2019; capacity to process and understand clinical narratives in a way that aligns with human expert reasoning.</p><p>This interpretability analysis highlights the NorDeClin-BERT models&#x2019; potential to assist health care professionals and improve their trust by providing insight into the reasoning behind the predicted <italic>ICD</italic> codes. The models&#x2019; attention to a broad range of clinical terms and contexts suggests their potential adaptability to various types of medical narrative, which is crucial for real-world applications in diverse health care settings.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The development and evaluation of 2 variants of NorDeClin-BERT for <italic>ICD-10</italic> code classification tasks have yielded insightful results, highlighting their capabilities and potential applications in Norwegian health care settings. Both NorDeClin-BERT-base-NorICD and NorDeClin-BERT-large-NorICD have emerged as frontrunners, demonstrating higher accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-scores across both all codes and the top 80% most-used codes. 
These findings underscore the robustness and efficiency of the models in handling diverse and prevalent code classifications.</p><p>The good performance of the NorDeClin-BERT models, especially in the context of the top 80% most-used codes, suggests that these models have effectively captured the underlying patterns and nuances of the most frequent classifications in Norwegian clinical texts. This capability is critical in practical applications where prioritizing common codes can substantially enhance operational efficiency and accuracy. At the same time, the models, particularly NorDeClin-BERT-large, showed notable improvements in multilabel classification across both full and frequent-code scenarios, outperforming all baseline models in recall and <italic>F</italic><sub>1</sub>-score. Furthermore, the high precision of the NorDeClin-BERT models indicates their utility in scenarios where the cost of false positives is high, making them an ideal choice for critical applications in medical coding and documentation.</p><p>An important aspect of our study is the comparison of models with different sizes and architectures. NorDeClin-BERT was developed in both base and large configurations, with the base model built on the BERT-base architecture (&#x2248;110 million parameters) and the large model using a BERT-large architecture (&#x2248;340 million parameters). NorDeClin-BERT-base consistently outperformed or matched the performance of other models, including the larger general-domain NorBERT3-large model. This finding is particularly noteworthy, as it challenges the common assumption that larger models invariably lead to better performance. 
The success of NorDeClin-BERT-base suggests that, for specialized tasks such as <italic>ICD-10</italic> coding in Norwegian clinical texts, a well-tuned base-size model can be highly effective and potentially more efficient in terms of computational resources and inference time.</p><p>The comparable or better performance of NorDeClin-BERT-base to larger models such as NorBERT3-large also highlights the importance of domain-specific pretraining and fine-tuning. It appears that the targeted approach of training on Norwegian clinical texts has allowed even the smaller NorDeClin-BERT variant to develop a more nuanced understanding of medical terminology and context, compensating for its reduced size. This observation has significant implications for model development in specialized domains, suggesting that carefully curated training data and domain-specific adaptation can be as important as, if not more important than, raw model size.</p><p>Furthermore, the efficiency of a smaller model like NorDeClin-BERT-base has practical advantages in clinical settings. It can be more easily deployed in environments with limited computational resources, potentially allowing for faster inference times and lower hardware requirements. This could facilitate broader adoption across various health care institutions, including those with constrained IT infrastructures.</p><p>The findings of this study have broader implications for the implementation of machine learning in Norwegian clinical settings. The NorDeClin-BERT models can substantially reduce the workload of health care professionals by automating routine coding tasks, allowing them to focus more on patient care and less on administrative duties. 
In addition, the enhanced accuracy and precision of these models can contribute to better patient outcomes by ensuring more accurate reporting and documentation, which, in turn, can lead to more targeted and effective patient care plans in Norwegian hospitals.</p><p>The attention-based interpretability analysis provides valuable insight into NorDeClin-BERT models&#x2019; decision-making process, which could enhance trust and adoption among health care professionals. The models&#x2019; ability to focus on relevant clinical terms when <italic>ICD</italic> codes are not present demonstrates their potential to generalize well to various clinical narratives.</p><p>Our study not only demonstrates the effectiveness of the NorDeClin-BERT models in <italic>ICD-10</italic> coding tasks but also provides valuable insights into the trade-offs between model size, domain-specific training, and performance in specialized NLP tasks. These findings could guide future research and development in clinical NLP, potentially leading to more efficient and effective AI solutions in health care.</p></sec><sec id="s4-2"><title>Broader Implications of <italic>ICD-10</italic> Coding Performance</title><p>While this study focuses on <italic>ICD-10</italic> coding for clinical documentation, structured coding also plays a crucial role in several other domains, including billing, epidemiological research, clinical registries, and decision support systems. In billing and insurance claims, accurate <italic>ICD-10</italic> coding ensures proper reimbursement and minimizes administrative errors. In epidemiological studies, these codes are essential for monitoring disease prevalence and public health trends, where high recall is particularly important to ensure comprehensive case identification and minimize underreporting. 
Similarly, clinical registries rely on structured diagnostic coding to maintain high-quality datasets, where both precision and recall influence the completeness and reliability of registry-based research. Additionally, in clinical decision support systems, <italic>ICD-10</italic> codes are often used to trigger alerts, inform risk assessments, or guide treatment recommendations, where high precision is crucial to avoid false-positive alerts that could contribute to alert fatigue and unnecessary interventions. While our study does not directly evaluate these applications, our findings suggest that models like NorDeClin-BERT-base-NorICD and NorDeClin-BERT-large-NorICD have the potential to improve coding accuracy in such contexts, thereby enhancing the quality of structured health data across multiple domains. Future research could explore domain-specific adaptations to optimize NLP-driven <italic>ICD-10</italic> coding for these different use cases.</p></sec><sec id="s4-3"><title>Limitations and Future Directions</title><p>While the results of this study are promising, several limitations must be acknowledged. First, the performance of the NorDeClin-BERT models might vary with different datasets or coding systems not covered in this study, particularly those outside the gastroenterology domain. This suggests the need for wider validation across various medical specialties and health care institutions in Norway to fully understand the generalizability of the findings.</p><p>Future research should aim to address these limitations by expanding the scope of the datasets and coding systems, potentially including other medical specialties and health care institutions across Norway. 
Exploring the integration of the NorDeClin-BERT models into real-world clinical workflows in Norwegian hospitals and assessing their impact on efficiency and patient care outcomes would provide valuable insights into their practical utility.</p><p>Furthermore, investigating the interpretability of the NorDeClin-BERT models and user trust in automated coding systems represents a crucial research area, as these factors greatly influence the adoption of AI technologies in health care. Developing explainable AI techniques tailored to the Norwegian clinical context could further improve the transparency and trustworthiness of these models, potentially accelerating their integration into Norwegian health care systems.</p></sec><sec id="s4-4"><title>Conclusions</title><p>This study introduced 2 versions of NorDeClin-BERT, domain-specific BERT models specifically developed for automating <italic>ICD-10</italic> code assignments from clinical notes within the Norwegian gastroenterological domain. By benchmarking these models against both general-domain and cross-lingual BERT baselines, we addressed 3 core RQs. First (RQ1), we found that domain-specific pretraining on Norwegian clinical text consistently improved <italic>ICD-10</italic> classification performance across all evaluation metrics, compared with general-domain Norwegian models and Swedish clinical models. Second (RQ2), we showed that scaling the model size from base to large further enhanced performance, most notably in multilabel scenarios, demonstrating that model capacity can amplify the benefits of domain adaptation. 
Third (RQ3), NorDeClin-BERT-base matched or outperformed NorBERT3-large in multiple scenarios, highlighting the value of targeted pretraining even with smaller architectures.</p><p>Compared with previous work on Swedish <italic>ICD-10</italic> classification using SweDeClin-BERT [<xref ref-type="bibr" rid="ref15">15</xref>], our models achieved competitive or superior performance, especially under strict multilabel evaluation, despite differences in language and dataset structure. To our knowledge, this is the first study to develop and evaluate BERT-based models for <italic>ICD-10</italic> coding in the Norwegian language, setting a new benchmark for future clinical NLP research in this area.</p><p>Through detailed analysis of accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score, our findings demonstrate the potential of domain-specific language models to support structured clinical documentation, reduce administrative burden, and enable more accurate downstream analytics in Norwegian health care. The results highlight the NorDeClin-BERT models as superior in terms of accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score for both all codes and the top 80% most-used codes, consistently outperforming other BERT variants, including ScandiBERT, NorBERT3-base, NorBERT3-large, and SweDeClin-BERT. NorDeClin-BERT-large-NorICD demonstrated the highest overall performance, while NorDeClin-BERT-base-NorICD matched or exceeded the performance of larger general-purpose models in multiple scenarios. Both models demonstrate an improved ability to capture the nuances of the Norwegian language and the complexity of medical coding. 
The study also underscores the relevance of language-specific and domain-specific models, as evidenced by NorDeClin-BERT&#x2019;s improved performance compared with models pretrained on general Scandinavian languages.</p><p>The attention-based interpretability analysis provided valuable insight into the NorDeClin-BERT models&#x2019; decision-making processes, demonstrating their ability to focus on relevant clinical terms and adapt to the presence or absence of explicit <italic>ICD</italic> codes in the text. This feature enhances the models&#x2019; potential for generalization and practical application in diverse clinical settings across Norway.</p></sec></sec></body><back><ack><p>The publication charges for this article have been funded by a grant from the publication fund of UiT The Arctic University of Norway. The authors used OpenAI&#x2019;s ChatGPT to assist with language editing and rephrasing portions of the manuscript. All content was reviewed and edited by the authors for accuracy and correctness. ChatGPT was also used to generate the synthetic clinical text used in the interpretability analysis. No generative artificial intelligence was used for data analysis or original scientific content. The research was funded by the Norwegian Research Council under the project ClinCode Computer-Assisted Clinical <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>) Coding for improving efficiency and quality in health care (project number 318098).</p></ack><notes><sec><title>Data Availability</title><p>The dataset used in this study is not publicly available due to patient privacy regulations and institutional data governance policies. 
Access to this data is strictly controlled by the data owner, the University Hospital of North Norway, in accordance with Norwegian health data protection laws.</p></sec></notes><fn-group><fn fn-type="con"><p>PDN and MATH contributed to the conceptualization, data curation, formal analysis, investigation, methodology, software development, and writing of the original draft. TC contributed to the conceptualization, methodology, provision of resources, and review and editing of the manuscript. AB contributed to the conceptualization, data curation, methodology, software development, and review and editing of the manuscript. TOS contributed to data curation, methodology, and review and editing of the manuscript. TT contributed to the conceptualization, methodology, and review and editing of the manuscript. AL contributed to the methodology, software development, and review and editing of the manuscript. HD contributed to the conceptualization, data curation, funding acquisition, methodology, project administration, and review and editing of the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb3">CAC</term><def><p>computer-assisted coding</p></def></def-item><def-item><term id="abb4">GPU</term><def><p>graphics processing unit</p></def></def-item><def-item><term id="abb5"><italic>ICD</italic></term><def><p><italic>International Classification of Diseases</italic></p></def></def-item><def-item><term id="abb6"><italic>ICD-10</italic></term><def><p><italic>International Statistical Classification of Diseases, Tenth Revision</italic></p></def></def-item><def-item><term id="abb7">ICD-O-3</term><def><p>International Classification of Diseases for 
Oncology</p></def></def-item><def-item><term id="abb8">LIME</term><def><p>Local Interpretable Model-agnostic Explanations</p></def></def-item><def-item><term id="abb9">MLM</term><def><p>masked language modeling</p></def></def-item><def-item><term id="abb10">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb11">RQ</term><def><p>research question</p></def></def-item><def-item><term id="abb12">SHAP</term><def><p>Shapley additive explanations</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Moriyama</surname><given-names>IM</given-names> </name></person-group><article-title>History of the statistical classification of diseases and causes of death</article-title><year>2011</year><publisher-name>Department of Health and Human Services Public Health Service</publisher-name></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>Meld. st. 11 (2015&#x2013;2016) [white paper No. 
11 (2015&#x2013;2016)]</article-title><source>Regjeringen.no</source><year>2015</year><access-date>2025-08-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.regjeringen.no/no/dokumenter/meld.-st.-11-20152016/id2462047/">https://www.regjeringen.no/no/dokumenter/meld.-st.-11-20152016/id2462047/</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stanfill</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Hsieh</surname><given-names>KL</given-names> </name><name name-style="western"><surname>Beal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Fenton</surname><given-names>SH</given-names> </name></person-group><article-title>Preparing for ICD-10-CM/PCS implementation: impact on productivity and quality</article-title><source>Perspect Health Inf Manag</source><year>2014</year><month>07</month><day>1</day><volume>11</volume><pub-id pub-id-type="medline">25214823</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stausberg</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lehmann</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kaczmarek</surname><given-names>D</given-names> </name><name name-style="western"><surname>Stein</surname><given-names>M</given-names> </name></person-group><article-title>Reliability of diagnoses coding with ICD-10</article-title><source>Int J Med Inform</source><year>2008</year><month>01</month><volume>77</volume><issue>1</issue><fpage>50</fpage><lpage>57</lpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2006.11.005</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation 
citation-type="web"><article-title>Riksrevisjonens kontroll med forvaltningen av statlige selskaper for 2008 [The Office of the Auditor General&#x2019;s control of the administration of state-owned companies for 2008]</article-title><source>Stortinget</source><access-date>2025-08-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.stortinget.no/globalassets/pdf/dokumentserien/2009-2010/dokument_3_2_2009_2010.pdf">https://www.stortinget.no/globalassets/pdf/dokumentserien/2009-2010/dokument_3_2_2009_2010.pdf</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><article-title>Riksrevisjonens unders&#x00F8;kelse av innsatsstyrt finansiering i somatiske sykehus [The Office of the Auditor General&#x2019;s investigation of activity-based funding in somatic hospitals]</article-title><source>Stortinget</source><access-date>2025-08-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.stortinget.no/globalassets/pdf/dokumentserien/2001-2002/dok_3_6_2001_2002.pdf">https://www.stortinget.no/globalassets/pdf/dokumentserien/2001-2002/dok_3_6_2001_2002.pdf</ext-link></comment></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Mathisen</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Mathisen</surname><given-names>T</given-names> </name></person-group><article-title>Medisinsk koding av sykehusopphold p&#x00E5; oslo universitetssykehus HF, ullev&#x00E5;l: en unders&#x00F8;kelse av kvaliteten p&#x00E5; kodingen og hvordan problemet med ukorrekt koding kan bedres [medical coding of hospital stays at Oslo University Hospital HF, Ullev&#x00E5;l: an investigation of the quality of coding and how the problem of incorrect coding can be improved]</article-title><source>UiT The Arctic University of 
Norway</source><access-date>2025-08-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://munin.uit.no/bitstream/handle/10037/9304/thesis.pdf?sequence=1&#x0026;isAllowed=y">https://munin.uit.no/bitstream/handle/10037/9304/thesis.pdf?sequence=1&#x0026;isAllowed=y</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Jacobsson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Serd&#x00E9;n</surname><given-names>L</given-names> </name></person-group><article-title>Kodningskvalitet i patientregistret ett nytt verktyg f&#x00F6;r att m&#x00E4;ta kvalitet [coding quality in the patient register: a new tool for measuring quality]</article-title><source>Socialstyrelsen [The Swedish National Board of Health and Welfare]</source><access-date>2025-08-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.socialstyrelsen.se/globalassets/sharepoint-dokument/artikelkatalog/statistik/2013-3-10.pdf">https://www.socialstyrelsen.se/globalassets/sharepoint-dokument/artikelkatalog/statistik/2013-3-10.pdf</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schmidt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>SAJ</given-names> </name><name name-style="western"><surname>Sandegaard</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Ehrenstein</surname><given-names>V</given-names> </name><name name-style="western"><surname>Pedersen</surname><given-names>L</given-names> </name><name name-style="western"><surname>S&#x00F8;rensen</surname><given-names>HT</given-names> </name></person-group><article-title>The Danish national patient registry: a review of content, 
data quality, and research potential</article-title><source>CLEP</source><year>2015</year><volume>7</volume><fpage>449</fpage><pub-id pub-id-type="doi">10.2147/CLEP.S91125</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stegman</surname><given-names>MS</given-names> </name></person-group><article-title>Coding &#x0026; billing errors: do they really add up to a $100 billion health care crisis</article-title><source>J Health Care Compliance</source><year>2005</year><volume>7</volume><issue>4</issue><fpage>51</fpage><lpage>55</lpage></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>So</surname><given-names>L</given-names> </name><name name-style="western"><surname>Beck</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Brien</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Chart documentation quality and its relationship to the validity of administrative data discharge records</article-title><source>Health Informatics J</source><year>2010</year><month>06</month><volume>16</volume><issue>2</issue><fpage>101</fpage><lpage>113</lpage><pub-id pub-id-type="doi">10.1177/1460458210364784</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>Bert: pre-training of deep bidirectional transformers for language 
understanding</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 11, 2018</comment><pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Vakili</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lamproudis</surname><given-names>A</given-names> </name><name name-style="western"><surname>Henriksson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dalianis</surname><given-names>H</given-names> </name></person-group><article-title>Downstream task performance of bert models pre-trained using automatically de-identified clinical data</article-title><conf-name>Proceedings of the Thirteenth Language Resources and Evaluation Conference</conf-name><conf-date>Jun 20-25, 2022</conf-date><fpage>4245</fpage><lpage>4252</lpage></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lamproudis</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mora</surname><given-names>S</given-names> </name><name name-style="western"><surname>Olsen Svenning</surname><given-names>T</given-names> </name><etal/></person-group><article-title>De-identifying Norwegian clinical text using resources from swedish and danish</article-title><source>AMIA Annu Symp Proc</source><year>2024</year><month>01</month><day>11</day><pub-id pub-id-type="medline">38222432</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lamproudis</surname><given-names>A</given-names> </name><name name-style="western"><surname>Olsen Svenning</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Torsvik</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Using a large open clinical corpus for improved ICD-10 diagnosis coding</article-title><source>AMIA Annu Symp Proc</source><year>2024</year><month>01</month><day>11</day><pub-id pub-id-type="medline">38222373</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Language technology group (LTG)</article-title><source>University of Oslo</source><access-date>2025-08-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.mn.uio.no/ifi/english/research/groups/ltg/">https://www.mn.uio.no/ifi/english/research/groups/ltg/</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>NorBERT 3 base</article-title><source>Hugging Face</source><access-date>2025-08-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/ltg/norbert3-base">https://huggingface.co/ltg/norbert3-base</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Samuel</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kutuzov</surname><given-names>A</given-names> </name><name name-style="western"><surname>Touileb</surname><given-names>S</given-names> </name><etal/></person-group><article-title>NorBench &#x2013; a benchmark for norwegian language models</article-title><conf-name>Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)</conf-name><conf-date>May 22-24, 2023</conf-date><conf-loc>Faroe Islands</conf-loc><fpage>618</fpage><lpage>633</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://aclanthology.org/2023.nodalida-1.61">https://aclanthology.org/2023.nodalida-1.61</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><etal/></person-group><article-title>A survey of automated international classification of diseases coding: development, challenges, and applications</article-title><source>Intell Med</source><year>2022</year><month>08</month><volume>2</volume><issue>3</issue><fpage>161</fpage><lpage>173</lpage><pub-id pub-id-type="doi">10.1016/j.imed.2022.03.003</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ou</surname><given-names>D</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>H</given-names> </name></person-group><article-title>Construction of a semi-automatic ICD-10 coding system</article-title><source>BMC Med Inform Decis Mak</source><year>2020</year><month>12</month><volume>20</volume><issue>1</issue><fpage>1</fpage><lpage>12</lpage><pub-id pub-id-type="doi">10.1186/s12911-020-1085-4</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>PF</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>SM</given-names> </name><name 
name-style="western"><surname>Liao</surname><given-names>WC</given-names> </name><etal/></person-group><article-title>Automatic ICD-10 coding and training system: deep neural network based on supervised learning</article-title><source>JMIR Med Inform</source><year>2021</year><volume>9</volume><issue>8</issue><fpage>e23230</fpage><pub-id pub-id-type="doi">10.2196/23230</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ponthongmak</surname><given-names>W</given-names> </name><name name-style="western"><surname>Thammasudjarit</surname><given-names>R</given-names> </name><name name-style="western"><surname>McKay</surname><given-names>GJ</given-names> </name><name name-style="western"><surname>Attia</surname><given-names>J</given-names> </name><name name-style="western"><surname>Theera-Ampornpunt</surname><given-names>N</given-names> </name><name name-style="western"><surname>Thakkinstian</surname><given-names>A</given-names> </name></person-group><article-title>Development and external validation of automated ICD-10 coding from discharge summaries using deep learning approaches</article-title><source>Informatics Med Unlock</source><year>2023</year><volume>38</volume><fpage>101227</fpage><pub-id pub-id-type="doi">10.1016/j.imu.2023.101227</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaur</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ginige</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Obst</surname><given-names>O</given-names> </name></person-group><article-title>AI-based ICD coding and classification approaches using discharge summaries: a systematic literature review</article-title><source>Expert Syst 
Appl</source><year>2023</year><month>03</month><volume>213</volume><fpage>118997</fpage><pub-id pub-id-type="doi">10.1016/j.eswa.2022.118997</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Kwon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>H</given-names> </name></person-group><article-title>Multi-label few-shot ICD coding as autoregressive generation with prompt</article-title><source>AAAI</source><year>2023</year><volume>37</volume><issue>4</issue><fpage>5366</fpage><lpage>5374</lpage><pub-id pub-id-type="doi">10.1609/aaai.v37i4.25668</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>L&#x00F3;pez-Garc&#x00ED;a</surname><given-names>G</given-names> </name><name name-style="western"><surname>Jerez</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ribelles</surname><given-names>N</given-names> </name><name name-style="western"><surname>Alba</surname><given-names>E</given-names> </name><name name-style="western"><surname>Veredas</surname><given-names>F</given-names> </name></person-group><article-title>ICB-UMA at CANTEMIST 2020: automatic ICD-o coding in spanish with BERT</article-title><conf-name>IberLEF 2020 - CANTEMIST Track</conf-name><conf-date>Sep 23-25, 2020</conf-date><conf-loc>Spain</conf-loc></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Fu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Farka&#x0161;</surname><given-names>I</given-names> </name><name name-style="western"><surname>Masulli</surname><given-names>P</given-names> </name><name name-style="western"><surname>Otte</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wermter</surname><given-names>S</given-names> </name></person-group><article-title>Multi-features-based automatic clinical coding for Chinese ICD-9-CM-3</article-title><conf-name>Artificial Neural Networks and Machine Learning &#x2013; ICANN 2021: 30th International Conference on Artificial Neural Networks</conf-name><conf-date>Sep 14-17, 2021</conf-date><conf-loc>Bratislava, Slovakia</conf-loc><fpage>473</fpage><lpage>486</lpage><pub-id pub-id-type="doi">10.1007/978-3-030-86383-8_38</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Dolk</surname><given-names>A</given-names> </name><name name-style="western"><surname>Davidsen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Dalianis</surname><given-names>H</given-names> </name><name name-style="western"><surname>Vakili</surname><given-names>T</given-names> </name></person-group><article-title>Evaluation of LIME and SHAP in explaining automatic ICD-10 classifications of swedish gastrointestinal discharge summaries</article-title><conf-name>18th Scandinavian Conference on Health Informatics</conf-name><conf-date>Aug 22-24, 2022</conf-date><conf-loc>Troms&#x00F8;, Norway</conf-loc><fpage>166</fpage><lpage>173</lpage><pub-id 
pub-id-type="doi">10.3384/ecp187028</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sn&#x00E6;bjarnarson</surname><given-names>V</given-names> </name><name name-style="western"><surname>Simonsen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Glava&#x0161;</surname><given-names>G</given-names> </name><name name-style="western"><surname>Vuli&#x0107;</surname><given-names>I</given-names> </name></person-group><article-title>Transfer to a low-resource language via close relatives: the case study on faroese</article-title><conf-name>Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)</conf-name><conf-date>May 22-24, 2023</conf-date><conf-loc>T&#x00F3;rshavn, Faroe Islands</conf-loc></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kutuzov</surname><given-names>A</given-names> </name><name name-style="western"><surname>Barnes</surname><given-names>J</given-names> </name><name name-style="western"><surname>Velldal</surname><given-names>E</given-names> </name><name name-style="western"><surname>&#x00D8;vrelid</surname><given-names>L</given-names> </name><name name-style="western"><surname>Oepen</surname><given-names>S</given-names> </name></person-group><article-title>Large-scale contextualised language modelling for norwegian</article-title><conf-name>Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)</conf-name><conf-date>May 31 to Jun 2, 2021</conf-date><conf-loc>Reykjavik, Iceland</conf-loc></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ngo</surname><given-names>P</given-names> </name><name 
name-style="western"><surname>Tejedor</surname><given-names>M</given-names> </name><name name-style="western"><surname>Svenning</surname><given-names>TO</given-names> </name><name name-style="western"><surname>Chomutare</surname><given-names>T</given-names> </name><name name-style="western"><surname>Budrionis</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dalianis</surname><given-names>H</given-names> </name></person-group><article-title>Deidentifying a norwegian clinical corpus-an effort to create a privacy-preserving norwegian large clinical language model</article-title><conf-name>Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024)</conf-name><conf-date>Mar 21, 2024</conf-date><conf-loc>St Julian&#x2019;s, Malta</conf-loc><fpage>37</fpage><lpage>43</lpage></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ott</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><etal/></person-group><article-title>RoBERTa: a robustly optimized BERT pretraining approach</article-title><source>arXiv</source><comment>Preprint posted online on 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1907.11692</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>Transformers library</article-title><source>Hugging Face</source><access-date>2025-08-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/docs/transformers/en/index">https://huggingface.co/docs/transformers/en/index</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation 
citation-type="web"><article-title>KB/bert-base-swedish-cased</article-title><source>Hugging Face</source><access-date>2025-08-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/KB/bert-base-swedish-cased">https://huggingface.co/KB/bert-base-swedish-cased</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><article-title>Vesteinn/scandibert</article-title><source>Hugging Face</source><access-date>2025-08-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/vesteinn/ScandiBERT">https://huggingface.co/vesteinn/ScandiBERT</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Jurafsky</surname><given-names>D</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>JH</given-names> </name></person-group><source>Speech and language processing: an introduction to natural language processing, computational linguistics, and speech recognition with language models</source><year>2000</year><access-date>2025-08-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://web.stanford.edu/~jurafsky/slp3/ed3book.pdf">https://web.stanford.edu/~jurafsky/slp3/ed3book.pdf</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>English translation of the text presented in <xref ref-type="fig" rid="figure5">Figure 5</xref>.</p><media xlink:href="ai_v4i1e66153_app1.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material></app-group></back></article>