<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR AI</journal-id>
      <journal-title>JMIR AI</journal-title>
      <issn pub-type="epub">2817-1705</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v4i1e58670</article-id>
      <article-id pub-id-type="pmid">39993309</article-id>
      <article-id pub-id-type="doi">10.2196/58670</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Leveraging Medical Knowledge Graphs Into Large Language Models for Diagnosis Prediction: Design and Application Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Liu</surname>
            <given-names>Hongfang</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sheth</surname>
            <given-names>Amit</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhang</surname>
            <given-names>Ningyu</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Hua</surname>
            <given-names>Yining</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Gao</surname>
            <given-names>Yanjun</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Biomedical Informatics</institution>
            <institution>University of Colorado Anschutz Medical Campus</institution>
            <addr-line>1890 N Revere Ct</addr-line>
            <addr-line>Denver, CO, 80045</addr-line>
            <country>United States</country>
            <phone>1 303 724 5375</phone>
            <email>yanjun.gao@cuanschutz.edu</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9341-7360</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Ruizhe</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2512-845X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Croxford</surname>
            <given-names>Emma</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-9117-7009</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Caskey</surname>
            <given-names>John</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5665-524X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Patterson</surname>
            <given-names>Brian W</given-names>
          </name>
          <degrees>MPH, MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4584-3808</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Churpek</surname>
            <given-names>Matthew</given-names>
          </name>
          <degrees>MPH, MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4030-5250</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Miller</surname>
            <given-names>Timothy</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4513-403X</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Dligach</surname>
            <given-names>Dmitriy</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2585-2707</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Afshar</surname>
            <given-names>Majid</given-names>
          </name>
          <degrees>MD, MSCR</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6368-4652</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Biomedical Informatics</institution>
        <institution>University of Colorado Anschutz Medical Campus</institution>
        <addr-line>Denver, CO</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Medicine</institution>
        <institution>University of Wisconsin–Madison</institution>
        <addr-line>Madison, WI</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>University of Aberdeen</institution>
        <addr-line>Aberdeen</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Boston Children's Hospital</institution>
        <institution>Harvard Medical School</institution>
        <addr-line>Boston, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Loyola University Chicago</institution>
        <addr-line>Chicago, IL</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Yanjun Gao <email>yanjun.gao@cuanschutz.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>24</day>
        <month>2</month>
        <year>2025</year>
      </pub-date>
      <volume>4</volume>
      <elocation-id>e58670</elocation-id>
      <history>
        <date date-type="received">
          <day>21</day>
          <month>3</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>17</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>7</day>
          <month>8</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>7</day>
          <month>11</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Yanjun Gao, Ruizhe Li, Emma Croxford, John Caskey, Brian W Patterson, Matthew Churpek, Timothy Miller, Dmitriy Dligach, Majid Afshar. Originally published in JMIR AI (https://ai.jmir.org), 24.02.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on https://www.ai.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ai.jmir.org/2025/1/e58670" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Electronic health records (EHRs) and routine documentation practices play a vital role in patients’ daily care, providing a holistic record of health, diagnoses, and treatment. However, complex and verbose EHR narratives can overwhelm health care providers, increasing the risk of diagnostic inaccuracies. While large language models (LLMs) have showcased their potential in diverse language tasks, their application in health care must prioritize the minimization of diagnostic errors and the prevention of patient harm. Integrating knowledge graphs (KGs) into LLMs offers a promising approach because structured knowledge from KGs could enhance LLMs’ diagnostic reasoning by providing contextually relevant medical information.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study introduces DR.KNOWS (Diagnostic Reasoning Knowledge Graph System), a model that integrates Unified Medical Language System–based KGs with LLMs to improve diagnostic predictions from EHR data by retrieving contextually relevant paths aligned with patient-specific information.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>DR.KNOWS combines a stack graph isomorphism network for node embedding with an attention-based path ranker to identify and rank knowledge paths relevant to a patient’s clinical context. We evaluated DR.KNOWS on 2 real-world EHR datasets from different geographic locations, comparing its performance to baseline models, including QuickUMLS and standard LLMs (Text-to-Text Transfer Transformer and ChatGPT). To assess diagnostic reasoning quality, we designed and implemented a human evaluation framework grounded in clinical safety metrics.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>DR.KNOWS demonstrated notable improvements over baseline models, showing higher accuracy in extracting diagnostic concepts and enhanced diagnostic prediction metrics. Prompt-based fine-tuning of Text-to-Text Transfer Transformer with DR.KNOWS knowledge paths achieved the highest ROUGE-L (Recall-Oriented Understudy for Gisting Evaluation–Longest Common Subsequence) and concept unique identifier <italic>F</italic><sub>1</sub>-scores, highlighting the benefits of KG integration. Human evaluators found the diagnostic rationales of DR.KNOWS to be aligned strongly with correct clinical reasoning, indicating improved abstraction and reasoning. Recognized limitations include potential biases within the KG data, which we addressed by emphasizing case-specific path selection and proposing future bias-mitigation strategies.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>DR.KNOWS offers a robust approach for enhancing diagnostic accuracy and reasoning by integrating structured KG knowledge into LLM-based clinical workflows. Although further work is required to address KG biases and extend generalizability, DR.KNOWS represents progress toward trustworthy artificial intelligence–driven clinical decision support, with a human evaluation framework focused on diagnostic safety and alignment with clinical standards.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>knowledge graph</kwd>
        <kwd>natural language processing</kwd>
        <kwd>machine learning</kwd>
        <kwd>electronic health record</kwd>
        <kwd>large language model</kwd>
        <kwd>diagnosis prediction</kwd>
        <kwd>graph model</kwd>
        <kwd>artificial intelligence</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>The ubiquitous use of electronic health records (EHRs) and the standard documentation practice of daily care notes are integral to the continuity of patient care because these records provide a comprehensive account of the patient’s health trajectory, inclusive of condition status, diagnoses, and treatment plans [<xref ref-type="bibr" rid="ref1">1</xref>]. Nevertheless, the growing complexity and verbosity of EHR clinical narratives, which are often filled with redundant information, can overwhelm health care providers and increase the risk of diagnostic errors [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Physicians often skip sections of lengthy and repetitive notes and rely on decisional shortcuts (ie, decisional heuristics) that can contribute to diagnostic errors [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
        <p>Current efforts at automating diagnosis generation from daily progress notes leverage large language models (LLMs). Gao et al [<xref ref-type="bibr" rid="ref7">7</xref>] introduced a summarization task that takes progress notes as input and generates a summary of active diagnoses. The authors annotated a set of progress notes from the publicly available EHR dataset Medical Information Mart for Intensive Care III (MIMIC-III) [<xref ref-type="bibr" rid="ref8">8</xref>]. The BioNLP 2023 shared task, known as ProbSum, built upon this work by providing additional annotated notes and attracting multiple efforts focused on developing solutions [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. Demonstrating a growing interest in applying LLMs to serve as solutions, these prior studies use language models such as Text-to-Text Transfer Transformer (T5) [<xref ref-type="bibr" rid="ref12">12</xref>], developed by Google Research; and OpenAI’s Generative Pretrained Transformer (GPT) [<xref ref-type="bibr" rid="ref13">13</xref>]. Unlike the conventional language tasks where LLMs have shown promising abilities, automated diagnosis generation is a critical task that requires high accuracy and reliability to ensure patient safety and improve health care outcomes. Concerns regarding the potential misleading and hallucinated information that could result in life-threatening events prevent LLMs from being used for diagnostic prediction [<xref ref-type="bibr" rid="ref14">14</xref>].</p>
        <p>The Unified Medical Language System (UMLS) [<xref ref-type="bibr" rid="ref15">15</xref>], a comprehensive resource developed by the National Library of Medicine in the United States, has been extensively used in natural language processing (NLP) research. The UMLS serves as a medical knowledge repository, facilitating the integration, retrieval, and sharing of biomedical information. It offers concept vocabulary and semantic relationships, enabling the construction of medical knowledge graphs (KGs). Prior studies have leveraged UMLS KGs for tasks such as information extraction [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref19">19</xref>] and question answering [<xref ref-type="bibr" rid="ref17">17</xref>]. Mining relevant knowledge for diagnosis is particularly challenging for 2 reasons: the highly specific factors related to the patient’s complaints, histories, and symptoms documented in the EHR; and the vast search space within a KG containing 4.5 million concepts and 15 million relations for diagnosis determination.</p>
        <p>In this study, we explore the use of KGs as external resources to enhance LLMs for diagnosis generation. Our work is motivated not only by the potential in the NLP field of augmenting LLMs with KGs [<xref ref-type="bibr" rid="ref20">20</xref>] but also by the theoretical exploration in medical education and psychology research, shedding light on the diagnostic decision-making process used by clinicians. Forming a diagnostic decision requires the examination of patient data, retrieving encapsulated medical knowledge, and the formulation and testing of the diagnostic hypothesis, which is also known as clinical diagnostic reasoning [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. We propose a novel graph model, DR.KNOWS (Diagnostic Reasoning Knowledge Graph System), designed to retrieve the top N case-specific knowledge paths related to disease pathology and feed them into foundational LLMs to improve the accuracy of diagnostic predictions (as shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>). Two distinct foundational models are the subject of this study: T5, known for being fine-tunable; and a sandboxed version of ChatGPT, a powerful LLM where we explore zero-shot prompting.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Study overview: we focused on generating diagnoses (text given in red in the “Plan” section) using the SOAP (subjective, objective, assessment, and plan) format progress note with the aid of large language models (LLMs). The input consists of “Subjective,” “Objective,” and “Assessment” sections (the dotted line box below the heading “Patient Progress Note”), and the diagnoses in the “Plan” section are the ground truth. We introduced an innovative knowledge graph (KG) model, namely DR.KNOWS (Diagnostic Reasoning Knowledge Graph System), that identifies and extracts the most relevant knowledge trajectories from the Unified Medical Language System (UMLS) KG. The nodes of the UMLS KG represent concept unique identifiers (CUIs), and the edges denote the semantic relations among the CUIs. We experimented with prompting ChatGPT for diagnosis generation, with and without the knowledge paths predicted by DR.KNOWS. Furthermore, we investigated how this knowledge grounding influences the diagnostic output of LLMs using human evaluation. The underlined text shows the UMLS concepts identified through a concept extractor. EtOH: ethanol; GI: gastrointestinal; REDCap: Research Electronic Data Capture; T5: Text-to-Text Transfer Transformer; UGIB: upper gastrointestinal bleeding.</p>
          </caption>
          <graphic xlink:href="ai_v4i1e58670_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>Our work and contribution are structured into two primary components: (1) designing and evaluating DR.KNOWS, a graph-based model that selects the top N probable diagnoses with explainable paths; and (2) demonstrating the usefulness of DR.KNOWS as an additional module to augment pretrained language models in generating relevant diagnoses. Along with the technical contributions, we propose the first human evaluation framework for LLM-generated diagnoses that adapts a survey instrument designed to evaluate diagnostic safety. Our research poses a new exciting problem that has not been addressed in the realm of NLP for diagnosis generation, that is, harnessing the power of KGs for the controllability and explainability of foundational models. By examining the effects of KG path–based prompts on foundational models on a real-world hospital dataset, we strive to contribute to an explainable artificial intelligence (AI) diagnostic pathway.</p>
        <p>Several studies have focused on the application of clinical note summarization to discharge summaries [<xref ref-type="bibr" rid="ref23">23</xref>], hospital course narratives [<xref ref-type="bibr" rid="ref24">24</xref>], real-time patient visit summaries [<xref ref-type="bibr" rid="ref25">25</xref>], and problem and diagnosis lists [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Our work follows the line of research on problem and diagnosis summarization. The integration of KGs with LLMs has been gaining traction as an emerging trend due to the potential enhancement of factual knowledge [<xref ref-type="bibr" rid="ref20">20</xref>], especially on domain-specific question-answering tasks [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]. Our work stands out by integrating KGs into LLMs for diagnosis prediction, using a novel graph model for path-based prompts.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Problem Formulation</title>
        <sec>
          <title>Daily Progress Notes for Diagnosis Prediction</title>
          <p>Daily progress notes are formatted using the SOAP (subjective, objective, assessment, and plan) format [<xref ref-type="bibr" rid="ref30">30</xref>]. The subjective section of a SOAP daily progress note comprises the patient’s self-reported symptoms, concerns, and medical history. The objective section consists of structural data collected by health care providers during observation or examination, such as vital signs (eg, blood pressure and heart rate), laboratory results, or physical examination findings. The assessment section summarizes the patient’s overall condition, with a focus on the most active problems and diagnoses for that day. Finally, the plan section contains multiple subsections, each outlining a diagnosis or problem and its treatment plan. Our task is to predict the list of problems and diagnoses that are part of the plan section. Our research used the ProbSum dataset, an annotated resource created for the BioNLP 2023 shared task with gold standard diagnoses derived from progress notes [<xref ref-type="bibr" rid="ref27">27</xref>].</p>
        </sec>
        <sec>
          <title>Using UMLS KGs to Find Potential Diagnoses, Given Medical Narratives</title>
          <p>The UMLS concepts vocabulary comprises &#62;180 sources. For our study, we focused on the Systematized Nomenclature of Medicine–Clinical Terms (SNOMED CT). The UMLS vocabulary is a comprehensive, multilingual health terminology and the US national standard for EHRs and health information exchange. Each UMLS medical concept is assigned a SNOMED CT concept unique identifier (CUI) from the clinical terminology system. We used semantic types, networks, and semantic relations from UMLS knowledge sources to categorize concepts based on shared attributes, enabling efficient exploration and supporting semantic understanding and knowledge discovery across various medical vocabularies.</p>
          <p>Given a medical KG where the nodes represent concepts and the edges denote semantic relations along with an input text describing a patient’s problems, we could perform multihop reasoning across the KG and infer the final diagnoses. <xref rid="figure2" ref-type="fig">Figure 2</xref> demonstrates how UMLS semantic relations and concepts can be used to identify potential diagnoses from the evidence provided in a daily care note. The example patient presents with medical conditions of fever, cough, and sepsis, which are the concepts recognized by medical concept extractors (Clinical Text Analysis and Knowledge Extraction System [<xref ref-type="bibr" rid="ref31">31</xref>] and QuickUMLS [<xref ref-type="bibr" rid="ref32">32</xref>]) and the starting concepts for multihop reasoning. Initially, we extracted the direct neighbors for these concepts. Relevant concepts that aligned with the patient’s descriptions were preferred. For precise diagnoses, we chose the top N most relevant nodes at each hop.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Problem formulation: inferring possible diagnoses within 2 hops from a Unified Medical Language System (UMLS) knowledge graph given a patient’s medical description. The UMLS medical concepts are highlighted in the colored boxes (“female,” “sepsis,” etc). Each concept has its own subgraph, where concepts are the vertices, and semantic relations are the edges (owing to space constraints, we have omitted the subgraph for “female” in this graph presentation). On the first hop, we could identify the most relevant neighboring concepts to the input description. The darker the color of the vertices, the more relevant they are to the input description. A second hop could be further performed based on the most relevant nodes, leading to the final diagnoses “Pneumonia and influenza” and “Respiratory distress syndrome.” Of note, we use the preferred text of concept unique identifiers for presentation purposes. The actual UMLS knowledge graph is built on concept unique identifiers rather than preferred text.</p>
            </caption>
            <graphic xlink:href="ai_v4i1e58670_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>The UMLS’s vast repository consists of 270 semantic relations, but not all are crucial for diagnostic reasoning. Adding the nonrelevant relations into a KG introduced substantial complexity in both computation and retrieval processes. A board-certified physician (MA) refined these to identify the 107 most relevant relations for diagnostics, which were then used to build the UMLS KG. This selection, including relations such as “causative agent of” and excluding ones such as “inverse isa,” is vital to maintaining computational efficiency and retrieval accuracy within the KG.</p>
        </sec>
      </sec>
      <sec>
        <title>Data Overview</title>
        <p>We used 2 sets of progress notes from different clinical settings in this study: MIMIC-III and in-house EHR datasets. MIMIC-III is one of the largest publicly available databases containing deidentified health data from patients admitted to intensive care units. It was developed by the Massachusetts Institute of Technology and Beth Israel Deaconess Medical Center. MIMIC-III includes data from &#62;38,000 patients admitted to intensive care units at the Beth Israel Deaconess Medical Center between 2001 and 2012. The second set, namely the in-house EHR data, was a subset of EHRs that included adult patients (aged &#8805;18 years) admitted to the University of Wisconsin health system between 2008 and 2021. In contrast to the MIMIC-III subset, the in-house set covered progress notes from all hospital settings, including the emergency department, general medicine wards, and subspecialty wards. While the 2 datasets originated from separate hospitals and departmental settings and might reflect distinct note-taking practices, both followed the SOAP documentation format for progress notes.</p>
        <p>Gao et al [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>] introduced a subset of 1005 progress notes from MIMIC-III with active diagnoses annotated from the “plan” sections, namely, the ProbSum dataset. Therefore, we applied this dataset for training and evaluation for both graph model intrinsic evaluation and diagnosis summarization. The in-house dataset did not contain human annotation. Even so, by parsing the text with a medical concept extractor that was based on UMLS SNOMED CT vocabulary, we were able to pull out concepts that belonged to the semantic type of “T047 Disease and Syndromes.” We deployed this set of concepts as the ground truth data to train and evaluate the graph model. The final in-house dataset contained 4815 progress notes. We present the descriptive statistics in <xref ref-type="table" rid="table1">Table 1</xref>. When contrasted with MIMIC-III, the in-house dataset exhibited a greater number of CUIs in its input, leading to an extended CUI output. In addition, MIMIC-III encompassed a wider range of abstractive concepts compared to the in-house progress notes.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Average number of concept unique identifiers (CUIs) in the input and output across the 2 electronic health record datasets: Medical Information Mart for Intensive Care III (MIMIC-III) and in-house. Abstractive concepts are those not found in the input but present in the gold standard diagnoses.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Datasets</td>
                <td>Departments</td>
                <td>Input CUIs (n), mean (SD)</td>
                <td>Output CUIs (n), mean (SD)</td>
                <td>Abstractive CUIs (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>MIMIC-III</td>
                <td>ICU<sup>a</sup></td>
                <td>15.95</td>
                <td>3.51</td>
                <td>48.92</td>
              </tr>
              <tr valign="top">
                <td>In-house</td>
                <td>All</td>
                <td>41.43</td>
                <td>5.81</td>
                <td>&#60;1</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>ICU: intensive care unit.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Graph Model Development</title>
        <sec>
          <title>Overview</title>
          <p>This section introduces the architecture design for DR.KNOWS. The DR.KNOWS model is designed to enhance automated diagnostic reasoning by integrating structured clinical knowledge from the UMLS into patient-specific diagnostic predictions. By leveraging a graph-based approach, DR.KNOWS retrieves and ranks relevant knowledge paths from the UMLS, ensuring that only clinically pertinent information is considered. Using a graph neural network, DR.KNOWS incorporates topological information from the UMLS KG into concept representations to better determine each node’s relevance to the patient’s specific conditions.</p>
        </sec>
        <sec>
          <title>Architecture Overview</title>
          <p>As shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>, all identified UMLS concepts with an assigned CUI from the input patient text were used to retrieve 1-hop subgraphs from the constructed large UMLS KG. Each node in this graph represents a CUI; therefore, we use “node” and “concept (CUI)” interchangeably throughout. These 1-hop subgraphs are encoded by a stack graph isomorphism network (SGIN) [<xref ref-type="bibr" rid="ref33">33</xref>], which generates node embeddings that capture both neighboring concept information and pretrained concept embeddings. We chose the SGIN for node embedding because it matches the expressive power of the Weisfeiler-Lehman graph isomorphism test, maximizing the graph neural network’s ability to capture meaningful representations. The resulting node embeddings serve as the basis for path embeddings, which the path encoder further processes.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>DR.KNOWS (Diagnostic Reasoning Knowledge Graph System) model architecture. The input concepts (“female,” “fever,” etc) are represented by concept unique identifiers (CUIs) represented as a combination of letters and numbers (eg, “C0243026” and “C0015967”). SapBERT: Self-alignment Pretrained Bidirectional Encoder Representations from Transformers.</p>
            </caption>
            <graphic xlink:href="ai_v4i1e58670_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <p>The path encoder module then evaluates these 1-hop paths by examining their semantic and logical alignment with the input text and concept representations, assigning a relevance score to each path. The top N scores across these paths, aggregated across each node’s neighboring paths, guide the selection of nodes for the next hop. If no suitable diagnosis node is found, the path exploration terminates by assigning a self-loop to the current node.</p>
          <p>While the dominant technique for retrieval-augmented generation systems relies heavily on vector representations and cosine similarity for retrieving and ranking candidate text, our work goes beyond this by adding 2 extra layers of design. First, we leverage the expressive power of the graph structure to enhance the retrieval process. Second, we select paths not simply based on their embeddings but through an attention network that encodes the path-concept relationships, ensuring a more accurate and contextually relevant selection process. In the following paragraphs, we present details regarding each component in the architecture of DR.KNOWS.</p>
        </sec>
        <sec>
          <title>Contextualized Node Representation</title>
          <p>We define the deterministic UMLS KG <italic>G</italic> = (<italic>V</italic>, <italic>E</italic>) based on SNOMED CT CUIs and semantic relations, where <italic>V</italic> is a set of CUIs, and <italic>E</italic> is a set of semantic relations. Given an input text <italic>x</italic> containing a set of source CUIs <italic>V<sub>src</sub></italic> ⊆ <italic>V</italic> and their 1-hop relations <italic>E<sub>src</sub></italic> ⊆ <italic>E</italic>, we can construct relation paths for each source node <italic>v<sub>src</sub></italic> ∈ <italic>V<sub>src</sub></italic> as <italic>P</italic> = {<italic>p</italic><sub>1</sub>, <italic>p</italic><sub>2</sub>,...<italic>p<sub>j</sub></italic>} such that <italic>p<sub>j</sub></italic> = {<italic>v</italic><sub>1</sub>, <italic>e</italic><sub>1</sub>, <italic>v</italic><sub>2</sub>,...<italic>e<sub>j</sub></italic><sub>−1</sub>, <italic>v<sub>j</sub></italic>}, <italic>j</italic> ≤ <italic>J</italic>, where <italic>J</italic> is the maximum length that a source node <italic>v<sub>src</sub></italic> could reach and is nondeterministic. Relations <italic>e</italic> are encoded as one-hot embeddings. We concatenate all concept names for <italic>v<sub>i</sub></italic> with special tokens such as [SEP] (for “separator”), such that <italic>l<sub>i</sub></italic> = [name 1 [SEP] name 2 [SEP]...] and encode <italic>l<sub>i</sub></italic> using Self-alignment Pretrained Bidirectional Encoder Representations from Transformers (SapBERT) [<xref ref-type="bibr" rid="ref34">34</xref>] to obtain <italic>h<sub>i</sub></italic> as concept representation. This allows the CUI representation to serve as the contextualized representation of its corresponding concept names. We chose SapBERT for its contrastive learning-based training, which discriminates similar concepts and their synonyms. It is evaluated on entity linking tasks and has shown state-of-the-art performance. 
The <italic>h<sub>i</sub></italic> is further updated through topological representation using the SGIN to become node representation:</p>
          <disp-formula>
            <graphic xlink:href="ai_v4i1e58670_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <disp-formula>
            <graphic xlink:href="ai_v4i1e58670_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p><italic>N (v<sub>i</sub>)</italic> represents the set of neighboring nodes of node <italic>v<sub>i</sub></italic>, <inline-graphic xlink:href="ai_v4i1e58670_fig9.png" xlink:type="simple" mimetype="image"/> is the representation of node <italic>v<sub>i</sub></italic> at layer <italic>k</italic>, <italic>ϵ<sup>(k)</sup></italic> is a learnable parameter at layer <italic>k</italic>, and <italic>MLP<sup>(k)</sup></italic> is a multilayer perceptron at layer <italic>k</italic>. GIN iteratively aggregates neighborhood information using graph convolution followed by nonlinearity, modeling interactions among nodes within the set <inline-graphic xlink:href="ai_v4i1e58670_fig10.png" xlink:type="simple" mimetype="image"/>. Furthermore, the stacking mechanism is introduced to combine multiple GIN layers. The final node representation <italic>v<sub>i</sub></italic> at layer <italic>K</italic> (last layer) is computed by stacking the GIN layers, where [...;...] denotes matrix concatenation.</p>
          <p>We empirically observed that some types of CUIs are less likely to lead to useful paths for diseases, for example, the concept “recent” (CUI: C0332185) is a temporal concept, and the neighbors associated with it are less useful to predict diagnoses. We designed a weighting scheme based on term frequency–inverse document frequency to assign higher weights to more relevant CUIs and semantic types:</p>
          <disp-formula>
            <graphic xlink:href="ai_v4i1e58670_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p><italic>W<sub>CUI</sub></italic> are then multiplied by the corresponding <italic>h<sub>i</sub></italic> to assign weighted representations to the concept representation.</p>
        </sec>
        <sec>
          <title>Path Reasoning and Ranking</title>
          <p>For each node representation <italic>h<sub>i</sub></italic>, we use its n-hop <inline-graphic xlink:href="ai_v4i1e58670_fig12.png" xlink:type="simple" mimetype="image"/> of the set neighborhood for <inline-graphic xlink:href="ai_v4i1e58670_fig13.png" xlink:type="simple" mimetype="image"/> for <italic>h<sub>i</sub></italic> and the associated relation edge <inline-graphic xlink:href="ai_v4i1e58670_fig14.png" xlink:type="simple" mimetype="image"/> to generate the corresponding path embeddings, with <italic>t</italic> being the index of the node and its associated neighborhood and relations:</p>
          <disp-formula>
            <italic>p<sub>i</sub></italic> = <italic>h<sub>i</sub></italic>, if <italic>n</italic> = 1; <inline-graphic xlink:href="ai_v4i1e58670_fig15.png" xlink:type="simple" mimetype="image"/>, otherwise
          </disp-formula>
          <disp-formula>
            <graphic xlink:href="ai_v4i1e58670_fig16.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>where “FFN” is the feedforward network, and <italic>n</italic> is the number of hops in the subgraph <italic>G<sub>src</sub></italic>. The path embedding <italic>p<sub>i</sub></italic> is the node embedding itself for the first hop and is recursively aggregated with new nodes and edges as the path extends to the next hop.</p>
          <p>To determine each path’s relevance to the patient’s specific symptoms, we used 2 attention mechanisms—multihead attention (MultiAttn) and trilinear attention (TriAttn)—to compute scores <italic>S</italic> for each path. Both mechanisms use the patient’s input text representation <italic>h<sub>x</sub></italic> and input list of CUIs <italic>h<sub>v</sub></italic>, encoded by SapBERT, to capture explicit and intricate relationships in the input data. MultiAttn was used to explicitly capture relationships between the input text, the list of concepts, and the current path, while TriAttn was used to automatically learn these complex relationships through the inner products of the 3 matrices. As demonstrated in <xref rid="figure2" ref-type="fig">Figure 2</xref>, for each hop the path tries to achieve based on the input patient description, the candidate concept can add relevant information, provide no new information and remain neutral, or contradict the information already present in the context.</p>
          <p>Using MultiAttn, we define the context relevancy matrix <italic>H<sub>i</sub></italic> and the concept relevancy matrix <italic>Z<sub>i</sub></italic> as follows:</p>
          <disp-formula>
            Hi = [hx; pi; hx – pi; hx ⊙ pi]
          </disp-formula>
          <disp-formula>
            Zi = [hv; pi; hv – pi; hv ⊙ pi]
          </disp-formula>
          <disp-formula>
            αi = MultiAttn(Hi ⊙ Zi),
          </disp-formula>
          <disp-formula>
            SMulti = ϕ (Relu(σ(αi)))
          </disp-formula>
          <p>These relevancy matrices are inspired by a prior work on natural language inference [<xref ref-type="bibr" rid="ref35">35</xref>], representing logical relations such as neutrality, contradiction, and entailment via matrix concatenation, difference, and product, respectively. Alternatively, TriAttn learns the intricate relations by 3 attention maps:</p>
          <disp-formula>
            αi = (hx, hv, pi) = Σabc (hx)a (hv)b (pi)c Wabc
          </disp-formula>
          <disp-formula>
          STri = ϕ (Relu(σ(αi)))
          </disp-formula>
          <p><italic>h<sub>x</sub></italic>, <italic>h<sub>v</sub></italic>, and <italic>p<sub>i</sub></italic> have the same dimensionality <italic>D</italic>, and <italic>ϕ</italic> is an MLP layer. Finally, we aggregate the MultiAttn or TriAttn scores on all candidate nodes and select the top N nodes (concepts) <italic>V<sub>N</sub></italic> for the next iteration based on the aggregate attention scores:</p>
          <disp-formula>
            <graphic xlink:href="ai_v4i1e58670_fig17.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <disp-formula>
            <italic>V<sub>N</sub> = argmax<sub>N</sub>(β)</italic>
          </disp-formula>
          <p>By comparing attention scores across candidate paths, the path ranker selects the top N nodes most relevant to each patient’s symptoms, maximizing contextual relevance.</p>
        </sec>
        <sec>
          <title>Loss Function</title>
          <p>Our loss function consists of 2 parts: a CUI prediction loss <italic>L<sub>pred</sub></italic> and a contrastive learning loss <italic>L<sub>CL</sub></italic>:</p>
          <disp-formula>
            <italic>L = L<sub>pred</sub> + L<sub>CL</sub></italic>
          </disp-formula>
          <p>For CUI prediction loss, we use binary cross entropy loss to calculate whether the predicted node <italic>V<sub>N</sub></italic> is in the gold standard label <italic>Y</italic>:</p>
          <disp-formula>
            <graphic xlink:href="ai_v4i1e58670_fig18.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>where <italic>M</italic> is the number of sets of gold labels. For contrastive learning loss <italic>L<sub>CL</sub></italic>, we encourage the model to learn meaningful and discriminative representations through comparison with positive and negative samples:</p>
          <disp-formula>
            <graphic xlink:href="ai_v4i1e58670_fig19.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>where A<sub>i</sub> is the anchor embedding, defined as h<sub>x</sub> ⊙ h<sub>v</sub>, representing the input text and concept representation. Σ<sub>i</sub> indicates a summation over a set of indices <italic>i</italic>, typically representing different training samples or pairs. Inspired by the study by Hu et al [<xref ref-type="bibr" rid="ref29">29</xref>], we construct <italic>cos (A<sub>i</sub>, f<sub>i+</sub>)</italic> and <italic>cos (A<sub>i</sub>, f<sub>i–</sub>)</italic> to calculate cosine similarity between <italic>A<sub>i</sub></italic> and positive feature <italic>f<sub>i+</sub></italic> or negative feature <italic>f<sub>i–</sub></italic>, respectively. A positive feature represents the paths correctly leading to the ground truth concept, while a negative feature embodies the paths that, although starting from the source, culminate in an incorrect concept. This equation measures the loss when the similarity between an anchor and its positive feature is not significantly greater than the similarity between the same anchor and a negative feature, considering a margin for desired separation.</p>
          <p>We designed a training algorithm to iteratively select and rank the most relevant paths to extend. This algorithm helped to reduce the computational requirement because it does not rank all n-hop paths within 1 pass. This algorithm is presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        </sec>
      </sec>
      <sec>
        <title>Selection of Foundational Models and Experiment Setup</title>
        <p>Our study centers around the following question: To what extent does the incorporation of DR.KNOWS as a knowledge path–based prompt provider influence the performance of language models in diagnosis summarization?</p>
        <p>We present results derived from 2 distinct foundational models, varying significantly in their parameter scales, namely T5-Large, which comprises 770 million parameters [<xref ref-type="bibr" rid="ref12">12</xref>]; and GPT-3.5-Turbo, which features 154 billion parameters [<xref ref-type="bibr" rid="ref13">13</xref>]. Specifically, we were granted access to a restricted version of the GPT-3.5-Turbo model, which served as the underlying framework for the highly capable language model, ChatGPT.</p>
        <p>These 2 models represent the prevailing direction in the evolution of language models: smaller models such as T5 that offer easier control and larger models such as GPT that generate text with substantial scale and power. Our investigation focused on evaluating the performance of T5 in fine-tuning scenarios and GPT models in zero-shot settings. Our primary objective was not solely to demonstrate cutting-edge results but also to critically examine the potential influence of incorporating predicted paths, generated by graph models, as auxiliary knowledge contributors.</p>
        <p>We selected 3 distinct T5-Large variants for fine-tuning using the ProbSum summarization dataset. The chosen T5 models encompass the vanilla T5 [<xref ref-type="bibr" rid="ref12">12</xref>], a foundational model that has been extensively used in varied NLP tasks; Flan-T5 [<xref ref-type="bibr" rid="ref36">36</xref>], which has been fine-tuned using an instructional approach; and Clinical-T5 [<xref ref-type="bibr" rid="ref37">37</xref>], which has been specifically trained on the MIMIC dataset.</p>
        <p>Given that our work encompasses a public EHR dataset (MIMIC-III) and a private EHR dataset with protected health information (in-house), we conducted training using 3 distinct computing environments. Specifically, most of the experiments on MIMIC-III were conducted on Google’s cloud computing platform, using 1 to 2 NVIDIA A100 40 GB graphics processing units (GPUs) and a conventional server equipped with 1 RTX 3090 Ti 24 GB GPU. The in-house EHR dataset is stored on a workstation located within a hospital research laboratory. The workstation operates within a Health Insurance Portability and Accountability Act–compliant network, ensuring the confidentiality, integrity, and availability of electronic protected health information, and it is equipped with a single NVIDIA V100 32 GB GPU. To use ChatGPT, we used an in-house ChatGPT-3.5-Turbo version hosted on our local cloud infrastructure. No data were sent to Microsoft or OpenAI. This setup ensured that no data were transmitted to OpenAI or external websites, and we were in strict compliance with the MIMIC data use agreement.</p>
        <p>While GPT can handle 4096 tokens, T5 is limited to 512 tokens. To ensure a fair comparison, we focused on the subjective and assessment sections of progress notes as input. These sections provide physicians’ evaluations of patients’ conditions and fall within T5’s 512-token limit. This differs from the objective sections, which mainly contain numerical values. Detailed information on data preprocessing, T5 model fine-tuning, and GPT zero-shot setting is presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Prompting Foundational Models to Integrate Graph Knowledge</title>
        <p>To incorporate graph model–predicted paths into a prompt, we applied a prompt engineering strategy using domain-independent prompt patterns, as delineated in the study by White et al [<xref ref-type="bibr" rid="ref38">38</xref>]. Our prompt was constructed with 3 primary components: the output customization that specifies the persona; the output format and template; and the context-control patterns, which are directly linked to the input note and the output of DR.KNOWS. In our test set, for the few input EHRs where no paths could be found (&#60;20 instances), we directly fed the input into the LLMs (T5 and ChatGPT) to generate diagnoses.</p>
        <p>Given that our core objective was to assess the extent to which the prompt can bolster the model’s performance, it became imperative to test an array of prompts. Gonen et al [<xref ref-type="bibr" rid="ref39">39</xref>] presented a technique, BETTERPROMPT, which relied on “selecting prompts by estimating language model likelihood.” Essentially, we initiated the process with a set of manual task-specific prompts, subsequently expanding the prompt set via automatic paraphrasing facilitated by ChatGPT and backtranslation. We then ranked these prompts by their perplexity score (averaged over a representative sample of task inputs), ultimately selecting those prompts that exhibited the lowest perplexity. Guided by this framework, we manually crafted 5 sets of prompts to integrate the path input, which are visually represented in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Specifically, the first 3 prompts were designed by a non–medical domain expert (computer scientist), whereas the final 2 sets of prompts were developed by a medical domain expert (a critical care physician and a medical informaticist). We designated the last 2 prompts (with the medical persona) as “subject matter prompts” and the first 3 prompts as “non–subject matter prompts.”</p>
        <p>The chosen final prompt came from a template with minimal perplexity, incorporating predicted knowledge paths from the DR.KNOWS model as part of the input. We explored 2 path representation methods: “structural,” which uses “→” to link source concepts, edges (relation names), and target concepts; and “clause,” which converts paths into clause-style text by directly joining the source and target concepts with their relations. Preliminary experiments showed superior performance with the “structural” representation, leading to its exclusive use in our reported results. The final prompt selected for the foundational models is a paraphrased prompt from the subject matter expert–crafted prompt: “Imagine you are a medical professional equipped with a knowledge graph, and generate the top three direct and indirect diagnoses from the input note. &#60;Input note&#62;…These are knowledge paths: &#60;path 1&#62;; &#60;path 2&#62;…Separate the diagnoses using semicolons, and explain your reasoning starting with &#60;Reasoning&#62;.” For the setup where the input did not contain paths, we simply used the prompt with the medical persona and task description as follows: “Imagine you are a medical professional, and generate the top three direct and indirect diagnoses from the input note. &#60;Input note&#62;...” The manually crafted prompts, their paraphrased versions, and their perplexity scores are presented in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Evaluation Metrics</title>
        <sec>
          <title>Automated Evaluation Metrics for Quantitative Analysis</title>
          <p>We conducted 2 evaluations for the DR.KNOWS models: the first was an intrinsic evaluation to determine how many gold standard concepts the graph model can retrieve. The second evaluation examined whether the retrieved knowledge paths could enhance the LLM’s diagnosis prediction task. Regarding the first evaluation, our primary objective was to evaluate the effectiveness of DR.KNOWS in predicting diagnoses using CUIs. We used a concept extractor to analyze text within the plan section, specifically extracting CUIs classified under the semantic type T047 DISEASE AND SYNDROMES. We only included CUIs that were guaranteed to connect with at least 1 path, having a maximum length of 2 hops between the target and input CUIs. These chosen CUIs constituted the “gold standard” CUI set, used for both training and assessing the model’s performance. As DR.KNOWS predicts the top N CUIs, we measured the Recall@N and Precision@N as follows:</p>
          <disp-formula>
            <graphic xlink:href="ai_v4i1e58670_fig20.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <disp-formula>
            <graphic xlink:href="ai_v4i1e58670_fig21.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>The <italic>F</italic>-score, the harmonic mean between recall and precision, will also be reported.</p>
          <p>To evaluate foundational model performance on EHR diagnosis prediction, we applied the aforementioned evaluation metric as well as Recall-Oriented Understudy for Gisting Evaluation (ROUGE) [<xref ref-type="bibr" rid="ref40">40</xref>]. Specifically, ROUGE is a widely used set of metrics designed for evaluating the quality of machine-generated text by comparing it to reference texts. We used the ROUGE–Longest Common Subsequence (ROUGE-L) variant, which is based on the longest common substring; and the ROUGE-2 variant, which focuses on bigram matching. Both ROUGE metrics were used in the ProbSum shared task.</p>
          <p>For reporting results from automated metrics, we provided the mean scores across all samples in the test set, along with 95% CIs on 1000 bootstrapped samples.</p>
        </sec>
        <sec>
          <title>Human Evaluation for Qualitative Analysis</title>
          <p>Existing evaluation frameworks for AI, such as those used in radiology report generation, do not address diagnosis prediction with LLMs, leaving a significant gap. To address this, our prior work introduced a new human evaluation framework based on the Safer DX Instrument [<xref ref-type="bibr" rid="ref41">41</xref>], aiming to provide a structured approach for assessing LLMs in diagnosis tasks. In this study, we used this framework to assess the impact of knowledge paths on LLM diagnostic predictions, specifically through a qualitative analysis of the “reasoning” output by LLMs, aiming to gauge the depth and accuracy of the models’ diagnostic reasoning processes.</p>
          <p>Specifically, we evaluated the model-generated “reasoning” section on the following aspects: (1) <italic>reading comprehension</italic>, (2) <italic>rationale</italic>, (3) <italic>recall of knowledge</italic>, (4) <italic>omission of diagnostic reasoning</italic>, and (5) <italic>abstraction</italic> and <italic>effective abstraction</italic>. <italic>Reading comprehension</italic> was intended to capture whether a model understood the information in a progress note. <italic>Rationale</italic> was intended to capture the inclusion of incorrect reasoning steps. <italic>Recall of knowledge</italic> was intended to capture the hallucination of incorrect facts as well as the inclusion of irrelevant facts in the output. <italic>Omission</italic> of a diagnosis served the same purpose as noted previously by capturing instances when the model failed to support conclusions or provide evidence for a diagnostic choice. <italic>Abstraction</italic> and <italic>effective abstraction</italic> were intended to evaluate the amount of <italic>abstraction</italic> present in each part of the output. This was to ascertain how the knowledge paths influenced the type of output produced and whether the model was able to use abstraction. <italic>Omission</italic> as well as <italic>abstraction</italic> and <italic>effective abstraction</italic> were formatted as <italic>yes</italic> or <italic>no</italic> questions. <italic>Reading comprehension</italic>, <italic>rationale</italic>, and <italic>recall of knowledge</italic> were assessed on a Likert scale ranging from 1 to 5, with 1 indicating strong agreement with poor quality and 5 indicating strong disagreement (representing high quality).</p>
          <p>We recruited 2 medical professionals to evaluate LLM outputs using human evaluation guidelines developed by us. Full details of the guidelines, evaluation training, and interannotator agreement are reported in a separate publication (currently under review). The evaluation framework used the REDCap (Research Electronic Data Capture; Vanderbilt University) web application to present the evaluators with input notes, gold standard diagnoses, and model-predicted diagnoses. The evaluators, treated as separate arms in a longitudinal framework, assessed models with KG paths and those without across 2 defined events. Detailed step-by-step guidelines were provided for completing the evaluations in REDCap.</p>
          <p>Two senior board-certified clinical informatics physicians served as advisors, pilot testers, and trainers for the 2 medical professionals who completed the human evaluations. The 2 physicians used 5 samples cases to iteratively refine the guidelines provided to the evaluators; these sample evaluations also served as examples for the evaluators to reference during training. The evaluation guidelines consisted of clear descriptions of the meaning of evaluative scores for each aspect of the human evaluation framework as well as a completed example workflow.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Intrinsic Evaluation of DR.KNOWS on Predicting Diagnostic Concepts</title>
        <p>We compared DR.KNOWS with QuickUMLS, which is a concept extractor baseline that identifies medical concepts from raw text. We took input text, parsed it with QuickUMLS, and outputted a list of concepts. <xref ref-type="table" rid="table2">Table 2</xref> presents results from the 2 EHR datasets, MIMIC and in-house. The selection of different top N values was determined by the disparity in text length between the 2 datasets. DR.KNOWS demonstrated superior precision and <italic>F</italic>-scores compared to QuickUMLS across both datasets, with precision scores of 19.10 (95% CI 17.82-20.37) versus 13.59 (95% CI 12.32-14.88) on the MIMIC dataset and 22.88 (95% CI 20.92-24.85) versus 12.38 (95% CI 11.09-13.66) on the in-house dataset. In addition, its <italic>F</italic>-scores of 25.20 (95% CI 23.93-26.48) on the MIMIC dataset and 25.70 (95% CI 24.06-27.37) on the in-house dataset exceeded the comparison scores of 21.13 (95% CI 19.85-22.41) and 20.09 (95% CI 18.81-21.37), respectively, underscoring the effectiveness of DR.KNOWS in accurately predicting diagnostic CUIs. The TriAttn variant of DR.KNOWS consistently outperformed the MultiAttn variant on both datasets, with <italic>F</italic>-scores of 25.20 (95% CI 23.93-26.48) versus 23.10 (95% CI 21.83-24.39) on the MIMIC dataset and 25.70 (95% CI 24.06-27.37) versus 17.69 (95% CI 16.40-18.96) on the in-house dataset. The concept extractor baseline achieved the highest recall scores—56.91 on the MIMIC dataset and 90.11 on the in-house dataset—because it identified all input concepts that overlapped with the reference CUIs, in particular on the in-house dataset, which was largely an extractive dataset. Training the DR.KNOWS model took an average of 2.3 (SD 1.22) hours per epoch on 5000 samples, using 8000 MB of GPU memory.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Performance comparison between concept extraction and 2 variants of DR.KNOWS on target concept unique identifier prediction using the Medical Information Mart for Intensive Care (MIMIC-III) and in-house datasets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="120"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <col width="0"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <thead>
              <tr valign="top">
                <td>Model</td>
                <td colspan="5">MIMIC-III</td>
                <td colspan="4">In-house</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Top N knowledge paths</td>
                <td>Recall score (95% CI)</td>
                <td>Precision score (95% CI)</td>
                <td><italic>F</italic>-score (95% CI)</td>
                <td colspan="2">Top N knowledge paths</td>
                <td>Recall score (95% CI)</td>
                <td>Precision score (95% CI)</td>
                <td><italic>F</italic>-score (95% CI)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Concept extractor</td>
                <td>—<sup>a</sup></td>
                <td>56.91 (55.62-58.18)</td>
                <td>13.59 (12.32-14.88)</td>
                <td>21.13 (19.85-22.41)</td>
                <td colspan="2">—</td>
                <td><italic>90.11</italic><sup>b</sup> (88.84-91.37)</td>
                <td>12.38 (11.09-13.66)</td>
                <td>20.09 (18.81-21.37)</td>
              </tr>
              <tr valign="top">
                <td>MultiAttn<sup>c</sup></td>
                <td>4</td>
                <td>26.91 (25.64-28.19)</td>
                <td><italic>22.79</italic> (21.51-24.06)</td>
                <td>23.10 (21.83-24.39)</td>
                <td colspan="2">6</td>
                <td>24.68 (23.35-25.91)</td>
                <td>15.82 (14.55-17.10)</td>
                <td>17.69 (16.40-18.96)</td>
              </tr>
              <tr valign="top">
                <td>MultiAttn</td>
                <td>6</td>
                <td>29.14 (27.85-30.41)</td>
                <td>16.73 (15.46-18.00)</td>
                <td>19.94 (18.66-21.22)</td>
                <td colspan="2">8</td>
                <td>28.69 (27.43-29.98)</td>
                <td>15.82 (14.55-17.11)</td>
                <td>17.33 (16.06-18.60)</td>
              </tr>
              <tr valign="top">
                <td>TriAttn<sup>d</sup></td>
                <td>4</td>
                <td>29.85 (26.23-33.45)</td>
                <td>17.61 (16.33-18.89)</td>
                <td>20.93 (19.67-22.21)</td>
                <td colspan="2">6</td>
                <td>34.00 (31.04-36.97)</td>
                <td><italic>22.88</italic> (20.92-24.85)</td>
                <td>23.39 (21.71-25.06)</td>
              </tr>
              <tr valign="top">
                <td>TriAttn</td>
                <td>6</td>
                <td>37.06 (35.80-38.33)</td>
                <td>19.10 (17.82-20.37)</td>
                <td><italic>25.20</italic> (23.93-26.48)</td>
                <td colspan="2">8</td>
                <td>44.58 (41.38-47.78)</td>
                <td>22.43 (20.62-24.23)</td>
                <td><italic>25.70</italic> (24.06-27.37)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Not applicable.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>Best performance values are italicized.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>MultiAttn: multihead attention.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>TriAttn: trilinear attention.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Assessing the Impact of DR.KNOWS on Diagnosis Prediction</title>
        <p>The best systems for each foundational model on the ProbSum test set are presented in <xref ref-type="table" rid="table3">Table 3</xref>, including those with predicted paths provided by DR.KNOWS and those without. Overall, the prompt-based fine-tuning of T5 surpassed ChatGPT’s prompt-based zero-shot approach on all metrics, and ChatGPT’s prompt-based few-shot approach showed comparable performance to T5. Notably, models that incorporated paths, particularly for the CUI <italic>F</italic>-score, showed significant improvements. The vanilla T5 model with a path prompt excelled, achieving the highest ROUGE-L score (30.72, 95% CI 30.40-32.44) and CUI <italic>F</italic>-score (27.78, 95% CI 27.09-29.80). This ROUGE-L score could have ranked third on the ProbSum leaderboard [<xref ref-type="bibr" rid="ref27">27</xref>], which is noteworthy considering that the top 2 systems used ensemble methods [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>].</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Best performance on the Medical Information Mart for Intensive Care III (MIMIC III) test set (with annotated active diagnoses) from 3 Text-to-Text Transfer Transformer (T5) variants and ChatGPT across all prompt styles with DR.KNOWS (Diagnostic Reasoning Knowledge Graph System) path prompting and without. To illustrate the performance differences better, we report Recall-Oriented Understudy for Gisting Evaluation-2 (ROUGE-2); ROUGE–Longest Common Subsequence (ROUGE-L); and concept unique identifier (CUI) recall, precision, and F-scores.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="220"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Model</td>
                <td colspan="2">ROUGE-2 score (95% CI)</td>
                <td colspan="2">ROUGE-L score (95% CI)</td>
                <td colspan="2">CUI recall score (95% CI)</td>
                <td colspan="2">CUI precision score (95% CI)</td>
                <td>CUI <italic>F</italic>-score (95% CI)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="12">
                  <bold>Prompt-based fine-tuning setting</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Vanilla T5</td>
                <td colspan="2">12.66 (11.24-13.54)</td>
                <td colspan="2">29.08 (27.52-29.99)</td>
                <td colspan="2">39.17 (37.53-41.56)</td>
                <td colspan="2">22.89 (21.02-23.62)</td>
                <td colspan="2">26.19 (25.31-26.78)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Vanilla T5+path<sup>a</sup></td>
                <td colspan="2">13.13 (12.64-13.88)</td>
                <td colspan="2"><italic>30.72</italic><sup>b</sup> (30.40-32.44<sup>c</sup>)</td>
                <td colspan="2"><italic>40.73</italic> (39.46-42.18)</td>
                <td colspan="2">24.28 (23.49-26.03)</td>
                <td colspan="2"><italic>27.78</italic> (27.08-29.80<sup>c</sup>)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Flan-T5</td>
                <td colspan="2">11.83 (10.51-12.40)</td>
                <td colspan="2">27.02 (25.64-27.80)</td>
                <td colspan="2">38.28 (36.70-39.45)</td>
                <td colspan="2">22.32 (21.81-23.00)</td>
                <td colspan="2">25.32 (24.10-26.34)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Flan-T5+path</td>
                <td colspan="2"><italic>13.30</italic> (12.19-14.44)</td>
                <td colspan="2">30.00 (29.20-32.70<sup>c</sup>)</td>
                <td colspan="2">38.96 (37.48-40.01)</td>
                <td colspan="2"><italic>24.74</italic> (23.35-26.12<sup>c</sup>)</td>
                <td colspan="2">27.38 (26.98-28.68<sup>c</sup>)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Clinical-T5</td>
                <td colspan="2">11.68 (11.06-12.49)</td>
                <td colspan="2">25.84 (23.74-26.15)</td>
                <td colspan="2">30.37 (28.94-30.99)</td>
                <td colspan="2">17.91 (15.46-19.79)</td>
                <td colspan="2">19.61 (16.44-20.03)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Clinical-T5+path</td>
                <td colspan="2">12.06 (10.89-12.48)</td>
                <td colspan="2">25.97 (24.71-26.33)</td>
                <td colspan="2">29.45 (27.65-30.19)</td>
                <td colspan="2">22.78 (21.35-23.59<sup>c</sup>)</td>
                <td colspan="2">23.17 (21.39-23.96<sup>c</sup>)</td>
              </tr>
              <tr valign="top">
                <td colspan="12">
                  <bold>Prompt-based zero-shot setting</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ChatGPT</td>
                <td colspan="2">7.05 (6.54-7.56)</td>
                <td colspan="2">19.77 (19.26-20.28)</td>
                <td colspan="2">23.68 (23.18-24.19)</td>
                <td colspan="2">15.52 (15.00-16.02)</td>
                <td colspan="2">16.04 (15.53-16.55)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ChatGPT+path</td>
                <td colspan="2">5.70 (5.19-6.21)</td>
                <td colspan="2">15.49 (14.98-15.99)</td>
                <td colspan="2">25.33 (24.82-25.84<sup>c</sup>)</td>
                <td colspan="2">17.05 (16.29-17.81<sup>c</sup>)</td>
                <td colspan="2">18.21 (17.46-18.98<sup>c</sup>)</td>
              </tr>
              <tr valign="top">
                <td colspan="12">
                  <bold>Prompt-based few-shot setting</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ChatGPT 3-shot</td>
                <td colspan="2">9.63 (8.32-10.06)</td>
                <td colspan="2">21.84 (19.99-22.09)</td>
                <td colspan="2">22.71 (20.99-23.96)</td>
                <td colspan="2">19.57 (17.23-19.78)</td>
                <td colspan="2">21.02 (20.26-21.79)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ChatGPT 5-shot</td>
                <td colspan="2">9.73 (8.52-10.18)</td>
                <td colspan="2">21.23 (19.58-21.72)</td>
                <td colspan="2">22.45 (20.93-23.80)</td>
                <td colspan="2">19.67 (17.66-20.33)</td>
                <td colspan="2">20.96 (20.19-21.73)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ChatGPT 3-shot+path</td>
                <td colspan="2">10.66 (9.17-10.72)</td>
                <td colspan="2">24.32 (22.44-24.25<sup>c</sup>)</td>
                <td colspan="2">26.48 (25.33-28.36<sup>c</sup>)</td>
                <td colspan="2">24.22 (21.44-24.21<sup>c</sup>)</td>
                <td colspan="2">25.30 (24.52-26.06<sup>c</sup>)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>ChatGPT 5-shot+path</td>
                <td colspan="2">11.73 (10.51-12.25<sup>c</sup>)</td>
                <td colspan="2">25.43 (23.53-25.35<sup>c</sup>)</td>
                <td colspan="2">27.76 (26.56-29.39<sup>c</sup>)</td>
                <td colspan="2">24.56 (22.47-25.12<sup>c</sup>)</td>
                <td colspan="2">26.02 (25.25-26.78<sup>c</sup>)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Prompt styles with DR.KNOWS path prompting.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>Best performance values are italicized.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>95% CIs with a distinct CI for the DR.KNOWS-prompted path compared to no-path scenarios.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>The comparison between ChatGPT with DR.KNOWS and ChatGPT without in the predicted paths scenario provided additional insights. In the few-shot setting, the incorporation of paths led to marked improvements; for instance, in the 3-shot setting, the with-path scenario outperformed the no-path scenario on all metrics, with ROUGE-L score of 24.32 (95% CI 22.44-24.25) compared to ChatGPT 3-shot no-path ROUGE-L score of 21.84 (95% CI 19.99-22.09) and CUI <italic>F</italic>-score of 25.30 (95% CI 24.52-26.06) versus 21.02 (95% CI 20.26-21.79). In the 5-shot setting, ChatGPT with paths achieved a ROUGE-L score of 25.43 (95% CI 23.53-25.35) compared to 21.23 (95% CI 19.58-21.72) for ChatGPT without paths and CUI <italic>F</italic>-score of 26.02 (95% CI 25.25-26.78) versus 20.96 (95% CI 20.19-21.73).</p>
      </sec>
      <sec>
        <title>Human Evaluation Results</title>
        <p>After the annotation procedure, the 2 medical professionals completed a supervised set of evaluations and were considered validated once they achieved a κ coefficient of 0.7 with the physician trainers and each other.</p>
        <p>Although the T5 and ChatGPT models displayed similar performance on automated metrics, their outputs diverged significantly. The T5 models, lacking instruction tuning, failed to respond adequately to prompts requesting the generation of a &#60;Reasoning&#62; section. Consequently, our human evaluation focused exclusively on the outputs produced by ChatGPT. We conducted human evaluation of the top-performing ChatGPT output (5-shot approach), comparing scenarios with the DR.KNOWS knowledge paths (with KG) and without them (without KG). The final evaluation set consisted of 92 input notes and 2 sets of ChatGPT-predicted text.</p>
        <p>The results are reported in <xref rid="figure4" ref-type="fig">Figure 4</xref>. First, there was no significant increase in <italic>omission of diagnoses</italic>, with 16% (15/92) observed with KG as opposed to 10% (9/92) without KG (<italic>P</italic>=.16). Regarding <italic>rationale</italic> (correct reasoning), ChatGPT with KG exhibited stronger agreement with the human evaluators (51/92, 55%) than ChatGPT without KG (46/92, 50%; <italic>P</italic>&#60;.001). In the <italic>abstraction</italic> category (assessing the presence of abstraction in the model output), there was a notable drop from 88% (81/92; without KG) to 78% (71/92; with KG) in the affirmative responses (<italic>P</italic>=.03), indicating that less abstraction was required when KG paths were included. Differences were also noted in <italic>effective abstraction</italic> in favor of the KG paths (<italic>P</italic>=.002).</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Human evaluation of ChatGPT outputs comparing scenarios with (“KG” [knowledge graph]) the DR.KNOWS (Diagnostic Reasoning Knowledge Graph System) knowledge paths and without (“No KG”).</p>
          </caption>
          <graphic xlink:href="ai_v4i1e58670_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Error Analysis</title>
        <p>We discovered 2 primary types of errors in the DR.KNOWS outputs that could result in missed opportunities for improving knowledge grounding. <xref rid="figure5" ref-type="fig">Figure 5</xref> presents an example where ChatGPT did not find the provided knowledge paths useful. In this case, the majority of the provided knowledge paths were highly extractive (“leukocytosis,” “reticular dysgenesis,” and “paraplegia” are the target concepts to which the knowledge paths led, and all are associated with a “self-loop” relationship). On the abstraction paths, the retrieved target concepts “abdomen hernia scrotal” and “chronic neutrophilia” were not relevant to the input patient condition.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>An example of an error in the knowledge paths retrieved by DR.KNOWS (Diagnostic Reasoning Knowledge Graph System). DR.KNOWS retrieved 2 paths leading to irrelevant and misleading diagnoses (marked in red). The counterclockwise gapped circular arrow symbol represents a self-loop.</p>
          </caption>
          <graphic xlink:href="ai_v4i1e58670_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Another error observed occurred when DR.KNOWS selected the source CUIs that were less likely to generate pertinent paths for clinical diagnoses, resulting in ineffective knowledge paths. <xref rid="figure6" ref-type="fig">Figure 6</xref> shows a retrieved path from “consulting with (procedure)” to “consultation-action (qualifier value).” Although some procedure-related concepts such as endoscopy or blood testing were valuable for clinical diagnosis, this specific path of consulting did not contribute meaningfully to the input case. Similarly, another erroneous pathway began with “drug allergy” and led to “allergy to dimetindene (finding),” which is contradictory, given that the input note explicitly states “no known drug allergies.” While the consulting path’s issue was its lack of utility, the “drug allergy” path could introduce the risk of hallucination (misleading or fabricated content) within ChatGPT.</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>An example illustrating ChatGPT’s performance with the knowledge paths extracted by DR.KNOWS (Diagnostic Reasoning Knowledge Graph System). Two paths had source concept unique identifiers (“Consulting with [procedure]” and “Drug allergy”) that were less likely to generate pertinent paths for clinical diagnoses. Of note, the path of “Drug allergy” led to a path contradicting the “No Known Drug Allergies” description in the input. The path of “cirrhosis of liver” represents a correct diagnosis, but ChatGPT failed to include it. The counterclockwise gapped circular arrow symbol represents a self-loop. ESRD: end-stage renal disease.</p>
          </caption>
          <graphic xlink:href="ai_v4i1e58670_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>In addition to the errors in the DR.KNOWS outputs, there were instances where ChatGPT failed to leverage the accurate knowledge paths presented. <xref rid="figure6" ref-type="fig">Figure 6</xref> includes a knowledge path regarding “cirrhosis of liver,” which was the correct diagnosis. However, ChatGPT response did not include this diagnosis.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>DR.KNOWS showed significant advantages over the QuickUMLS concept extractor baseline in extracting correct concepts for diagnoses. On the ProbSum dataset, where the goal was to generate a list of diagnoses given the progress notes, prompt-based fine-tuning of T5 outperformed ChatGPT’s zero-shot approach and showed comparable results to its few-shot approaches, with the inclusion of predicted paths by DR.KNOWS significantly enhancing performance across all metrics. The vanilla T5 with path prompts notably achieved top ROUGE-L and CUI <italic>F</italic>-scores, demonstrating the effectiveness of incorporating paths into the model. Human evaluation of ChatGPT’s reasoning section showed strong agreement with human evaluators in terms of correct <italic>rationale</italic> and enhanced <italic>effective abstraction</italic>, indicating nuanced improvement in reasoning and abstraction quality with KG integration.</p>
        <p>While DR.KNOWS leverages KG paths to enhance diagnosis prediction, it is important to acknowledge the potential biases and limitations inherent in KG data. KGs such as UMLS are comprehensive, but they may reflect biases based on the clinical domains and patient populations from which they were constructed, which could impact the relevance or appropriateness of the retrieved paths. To mitigate this, DR.KNOWS focuses on case-specific path selection, aiming to retrieve only the paths most directly relevant to the patient context. Nonetheless, future iterations could benefit from evaluating path relevance using additional contextual information, such as demographic details, to better align with patient-specific needs and reduce bias.</p>
        <p>Error analysis showed that DR.KNOWS occasionally struggled with identifying knowledge paths unrelated to the patient representation; in addition, the analysis emphasized the importance of selecting accurate starting medical concepts. Currently, DR.KNOWS relies solely on semantic-based ranking on the candidate paths, that is, the cosine similarity between candidate path embeddings and input text, with the embedding quality being crucial for ranking performance. Improving the representation and embedding methods, as well as exploring probabilistic modeling techniques [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>], could enhance path relevance. Furthermore, incorporating a graph reasoning mechanism that enables symbolic chain-of-thought reasoning might compensate for the weaknesses of contextualized embeddings and cosine-similarity metrics [<xref ref-type="bibr" rid="ref44">44</xref>], presenting a valuable future direction. This integration could improve the diagnostic potential of DR.KNOWS, allowing for more nuanced and bias-aware reasoning.</p>
        <p>The error analysis also presented instances where ChatGPT neglected to incorporate certain beneficial knowledge paths. It is important to acknowledge that ChatGPT operates as a black box application programming interface model, with its internal weights and training processes being inaccessible. To enhance the efficacy of the graph-based retrieve-and-augment framework, it would be advantageous to explore the potential of graph prompting and instruction tuning on open-source language models. These methods could refine the model’s ability to use relevant information effectively. Other relevant research also uses advanced prompting techniques, such as self-retrieval–augmented generation [<xref ref-type="bibr" rid="ref45">45</xref>] and step-back prompting [<xref ref-type="bibr" rid="ref46">46</xref>]. The Google Research team recently presented a study investigating multiple ways of encoding graphs into LLM inputs [<xref ref-type="bibr" rid="ref47">47</xref>], which might inform a future direction for this work beyond the typical structural or clause-based path prompting.</p>
        <p>In conclusion, LLMs such as ChatGPT hold promise for generating diagnoses for clinical decision support; however, methods such as graph prompting are needed to guide the model down the correct reasoning paths to avoid hallucinations and provide comprehensive diagnoses. While we show some progress in a graph prompting approach with DR.KNOWS, more work is needed to improve methods that leverage the UMLS knowledge source for grounding to achieve more accurate outputs. Nonetheless, DR.KNOWS represents a step toward trustworthy AI in medicine, providing knowledge grounding to LLMs and potentially reducing factual errors in diagnostic outputs [<xref ref-type="bibr" rid="ref48">48</xref>]. Furthermore, our proposed human evaluation framework, derived from diagnostic safety evaluations used in clinical settings, enables the assessment of LLMs from the perspective of diagnostic safety. It carries strong face validity and reliability to evaluate a model’s strengths and weaknesses as a diagnostic decision support system. This ensures that the models not only perform well on technical metrics but also align with clinical standards of safety and reliability.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Our work on leveraging KGs for LLM diagnosis generation has shown promising results; however, there are notable limitations that must be acknowledged. First, while the UMLS concept extractors (Clinical Text Analysis and Knowledge Extraction System and QuickUMLS) are powerful tools, they are not without flaws. One significant limitation is their inability to accurately identify all relevant concepts, particularly indirect or nuanced medical concepts. These indirect concepts can be crucial for accurate diagnosis generation; yet, the current concept extractors may fail to recognize them, leading to incomplete or less accurate knowledge representation.</p>
        <p>Second, our path selection methodology relies heavily on cosine similarity, a common approach within the retrieval-augmented generation framework. Despite its prevalence, this method has inherent limitations due to its heavy reliance on the quality of embedding representations. If the embeddings do not adequately capture the semantic nuances of medical concepts, the similarity measure may lead to the retrieval of less relevant or noisy knowledge paths. This can ultimately impact the quality and reliability of the diagnostic suggestions generated by the LLM.</p>
        <p>These limitations highlight the need for the continued refinement of both the concept extraction and path selection processes. Future work should explore more sophisticated techniques to enhance concept identification and improve the robustness of embedding representations, thereby reducing the reliance on cosine similarity and increasing the overall accuracy and utility of the KG-based approach.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Data preprocessing, DR.KNOWS (Diagnostic Reasoning Knowledge Graph System) training details, prompt engineering using ChatGPT, and Text-to-Text Transfer Transformer (T5) fine-tuning.</p>
        <media xlink:href="ai_v4i1e58670_app1.docx" xlink:title="DOCX File , 37 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CUI</term>
          <def>
            <p>concept unique identifier</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">DR.KNOWS</term>
          <def>
            <p>Diagnostic Reasoning Knowledge Graph System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">EHR</term>
          <def>
            <p>electronic health record</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">GPT</term>
          <def>
            <p>Generative Pretrained Transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">GPU</term>
          <def>
            <p>graphics processing unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">KG</term>
          <def>
            <p>knowledge graph</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MIMIC-III</term>
          <def>
            <p>Medical Information Mart for Intensive Care III</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">MultiAttn</term>
          <def>
            <p>multihead attention</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">REDCap</term>
          <def>
            <p>Research Electronic Data Capture</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">ROUGE</term>
          <def>
            <p>Recall-Oriented Understudy for Gisting Evaluation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">ROUGE-L</term>
          <def>
            <p>Recall-Oriented Understudy for Gisting Evaluation–Longest Common Subsequence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">SapBERT</term>
          <def>
            <p>Self-alignment Pretrained Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">SGIN</term>
          <def>
            <p>stack graph isomorphism network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">SNOMED CT</term>
          <def>
            <p>Systematized Nomenclature of Medicine–Clinical Terms</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">SOAP</term>
          <def>
            <p>subjective, objective, assessment, and plan</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">T5</term>
          <def>
            <p>Text-to-Text Transfer Transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">TriAttn</term>
          <def>
            <p>trilinear attention</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb20">UMLS</term>
          <def>
            <p>Unified Medical Language System</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work is supported by grants from the National Institutes of Health. Funding was supported by the National Library of Medicine (K99LM014308, R00LM014308: YG; R01LM012973-04: TM and DD); the National Heart, Lung, and Blood Institute (R01HL157262-03: MMC); and the National Institute on Drug Abuse (R01DA051464: MA).</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The source code and knowledge graph generated during this study are available on the GitHub repository [<xref ref-type="bibr" rid="ref49">49</xref>]. Medical Information Mart for Intensive Care III is available from PhysioNet.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>YG was responsible for conceptualization, supervision, methodology, formal analysis, writing (original draft as well as review and editing), validation, visualization, data curation, investigation, project administration, and funding acquisition. RL was responsible for writing (original draft as well as review and editing), methodology, data curation, validation, investigation, conceptualization, and formal analysis. EC was responsible for writing (original draft as well as review and editing), validation, methodology, data curation, investigation, conceptualization, and formal analysis. JRC was responsible for writing (review and editing), formal analysis, investigation, and data curation. BWP was responsible for writing (review and editing), validation, formal analysis, methodology, investigation, and conceptualization. MMC was responsible for writing (review and editing), conceptualization, methodology, and funding acquisition. TM was responsible for writing (review and editing), conceptualization, methodology, and funding acquisition. DD was responsible for writing (review and editing), conceptualization, methodology, and funding acquisition. MA was responsible for conceptualization, supervision, methodology, formal analysis, writing (original draft as well as review and editing), validation, visualization, data curation, investigation, project administration, and funding acquisition.</p>
      </fn>
      <fn fn-type="conflict">
        <p>TM is a consultant for Lavita.ai, a startup that builds NLP tools for medical use cases. All other authors declare no conflicts of interest.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Marquard</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Amster</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Romoser</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Friderici</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Goff</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fisher</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>What do physicians read (and ignore) in electronic progress notes?</article-title>
          <source>Appl Clin Inform</source>
          <year>2017</year>
          <month>12</month>
          <day>21</day>
          <volume>05</volume>
          <issue>02</issue>
          <fpage>430</fpage>
          <lpage>44</lpage>
          <pub-id pub-id-type="doi">10.4338/aci-2014-01-ra-0003</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rule</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bedrick</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chiang</surname>
              <given-names>MF</given-names>
            </name>
            <name name-style="western">
              <surname>Hribar</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>Length and redundancy of outpatient progress notes across a decade at an academic medical center</article-title>
          <source>JAMA Netw Open</source>
          <year>2021</year>
          <month>07</month>
          <day>01</day>
          <volume>4</volume>
          <issue>7</issue>
          <fpage>e2115334</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34279650"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2021.15334</pub-id>
          <pub-id pub-id-type="medline">34279650</pub-id>
          <pub-id pub-id-type="pii">2782054</pub-id>
          <pub-id pub-id-type="pmcid">PMC8290305</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Capurro</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Verspoor</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>"Note Bloat" impacts deep learning-based NLP models for clinical prediction tasks</article-title>
          <source>J Biomed Inform</source>
          <year>2022</year>
          <month>09</month>
          <volume>133</volume>
          <fpage>104149</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(22)00161-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2022.104149</pub-id>
          <pub-id pub-id-type="medline">35878821</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(22)00161-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nijor</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rallis</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lad</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gokcen</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Patient safety issues from information overload in electronic medical records</article-title>
          <source>J Patient Saf</source>
          <year>2022</year>
          <month>09</month>
          <day>01</day>
          <volume>18</volume>
          <issue>6</issue>
          <fpage>e999</fpage>
          <lpage>1003</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35985047"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/PTS.0000000000001002</pub-id>
          <pub-id pub-id-type="medline">35985047</pub-id>
          <pub-id pub-id-type="pii">01209203-202209000-00034</pub-id>
          <pub-id pub-id-type="pmcid">PMC9422765</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Furlow</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Information overload and unsustainable workloads in the era of electronic health records</article-title>
          <source>Lancet Respir Med</source>
          <year>2020</year>
          <month>03</month>
          <volume>8</volume>
          <issue>3</issue>
          <fpage>243</fpage>
          <lpage>4</lpage>
          <pub-id pub-id-type="doi">10.1016/S2213-2600(20)30010-2</pub-id>
          <pub-id pub-id-type="medline">32135094</pub-id>
          <pub-id pub-id-type="pii">S2213-2600(20)30010-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Croskerry</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Henriksen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Battles</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Marks</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Lewin</surname>
              <given-names>DI</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic failure: a cognitive and affective approach</article-title>
          <source>Advances in Patient Safety: From Research to Implementation. Volume 2</source>
          <year>2005</year>
          <publisher-loc>Rockville, MD</publisher-loc>
          <publisher-name>Agency for Healthcare Research and Quality</publisher-name>
          <fpage>241</fpage>
          <lpage>54</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Dligach</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Churpek</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Afshar</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Summarizing patients’ problems from hospital progress notes using pre-trained sequence-to-sequence models</article-title>
          <source>Proceedings of the 29th International Conference on Computational Linguistics</source>
          <year>2022</year>
          <conf-name>COLING '22</conf-name>
          <conf-date>October 12-17, 2022</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>2979</fpage>
          <lpage>91</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LH</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>05</month>
          <day>24</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>160035</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/sdata.2016.35"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Dligach</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Churpek</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Afshar</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Overview of the problem list summarization (ProbSum) 2023 shared task on summarizing patients' active diagnoses and problems from electronic health record progress notes</article-title>
          <source>Proc Conf Assoc Comput Linguist Meet</source>
          <year>2023</year>
          <month>07</month>
          <volume>2023</volume>
          <fpage>461</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37583489"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2023.bionlp-1.43</pub-id>
          <pub-id pub-id-type="medline">37583489</pub-id>
          <pub-id pub-id-type="pmcid">PMC10426335</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Manakul</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Fathullah</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liusie</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Raina</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Raina</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Gales</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>CUED at ProbSum 2023: hierarchical ensemble of summarization models</article-title>
          <source>Proceedings of the 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks</source>
          <year>2023</year>
          <conf-name>BioNLP '23</conf-name>
          <conf-date>July 13, 2023</conf-date>
          <conf-loc>Toronto, ON</conf-loc>
          <fpage>516</fpage>
          <lpage>23</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2023.bionlp-1.51.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2023.bionlp-1.51</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Schlegel</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Batista-Navarro</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>TT</given-names>
            </name>
            <name name-style="western">
              <surname>Kashyap</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Zeng</surname>
              <given-names>XJ</given-names>
            </name>
            <name name-style="western">
              <surname>Beck</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Winkler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nenadic</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Team:PULSAR at ProbSum 2023: PULSAR: pre-training with extracted healthcare terms for summarising patients’ problems and data augmentation with black-box large language models</article-title>
          <source>Proceedings of the 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks</source>
          <year>2023</year>
          <conf-name>BioNLP '23</conf-name>
          <conf-date>July 13, 2023</conf-date>
          <conf-loc>Toronto, ON</conf-loc>
          <fpage>503</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/2023.bionlp-1.49</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raffel</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Narang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Matena</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>PJ</given-names>
            </name>
          </person-group>
          <article-title>Exploring the limits of transfer learning with a unified text-to-text transformer</article-title>
          <source>J Mach Learn Res</source>
          <year>2020</year>
          <volume>21</volume>
          <issue>140</issue>
          <fpage>1</fpage>
          <lpage>67</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jmlr.org/papers/volume21/20-074/20-074.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Floridi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chiriatti</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>GPT-3: its nature, scope, limits, and consequences</article-title>
          <source>Minds Mach</source>
          <year>2020</year>
          <month>11</month>
          <day>01</day>
          <volume>30</volume>
          <issue>4</issue>
          <fpage>681</fpage>
          <lpage>94</lpage>
          <pub-id pub-id-type="doi">10.1007/s11023-020-09548-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baumgartner</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>The potential impact of ChatGPT in clinical and translational medicine</article-title>
          <source>Clin Transl Med</source>
          <year>2023</year>
          <month>03</month>
          <volume>13</volume>
          <issue>3</issue>
          <fpage>e1206</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36854881"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/ctm2.1206</pub-id>
          <pub-id pub-id-type="medline">36854881</pub-id>
          <pub-id pub-id-type="pmcid">PMC9974599</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bodenreider</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>The Unified Medical Language System (UMLS): integrating biomedical terminology</article-title>
          <source>Nucleic Acids Res</source>
          <year>2004</year>
          <month>01</month>
          <day>01</day>
          <volume>32</volume>
          <issue>Database issue</issue>
          <fpage>D267</fpage>
          <lpage>70</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/14681409"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkh061</pub-id>
          <pub-id pub-id-type="medline">14681409</pub-id>
          <pub-id pub-id-type="pii">32/suppl_1/D267</pub-id>
          <pub-id pub-id-type="pmcid">PMC308795</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Biomedical event extraction with hierarchical knowledge graphs</article-title>
          <source>Proceedings of the 2020 Conference on Association for Computational Linguistics</source>
          <year>2020</year>
          <conf-name>EMNLP '20</conf-name>
          <conf-date>November 16-20, 2020</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>1277</fpage>
          <lpage>85</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.findings-emnlp.114.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.findings-emnlp.114</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Dou</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>TH</given-names>
            </name>
          </person-group>
          <article-title>Parameter-efficient domain knowledge integration from multiple sources for biomedical pre-trained language models</article-title>
          <source>Proceedings of the 2021 Conference on the Association for Computational Linguistics</source>
          <year>2021</year>
          <conf-name>EMNLP '21</conf-name>
          <conf-date>November 7-11, 2021</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>3855</fpage>
          <lpage>65</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2021.findings-emnlp.325.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2021.findings-emnlp.325</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aracena</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Villena</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Rojas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dunstan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A knowledge-graph-based intrinsic test for benchmarking medical concept embeddings and pretrained language models</article-title>
          <source>Proceedings of the 13th International Workshop on Health Text Mining and Information Analysis</source>
          <year>2022</year>
          <conf-name>LOUHI '22</conf-name>
          <conf-date>December 7, 2022</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>197</fpage>
          <lpage>206</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2022.louhi-1.22.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2022.louhi-1.22</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>BERT-MK: integrating graph contextualized knowledge into pre-trained language models</article-title>
          <source>Proceedings of the 2020 Conference on Association for Computational Linguistics</source>
          <year>2020</year>
          <conf-name>EMNLP '20</conf-name>
          <conf-date>November 16-20, 2020</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>2281</fpage>
          <lpage>90</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.findings-emnlp.207.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.findings-emnlp.207</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Unifying large language models and knowledge graphs: a roadmap</article-title>
          <source>IEEE Trans Knowl Data Eng</source>
          <year>2024</year>
          <month>07</month>
          <volume>36</volume>
          <issue>7</issue>
          <fpage>3580</fpage>
          <lpage>99</lpage>
          <pub-id pub-id-type="doi">10.1109/tkde.2024.3352100</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bowen</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>Educational strategies to promote clinical diagnostic reasoning</article-title>
          <source>N Engl J Med</source>
          <year>2006</year>
          <month>11</month>
          <day>23</day>
          <volume>355</volume>
          <issue>21</issue>
          <fpage>2217</fpage>
          <lpage>25</lpage>
          <pub-id pub-id-type="doi">10.1056/nejmra054782</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Corazza</surname>
              <given-names>GR</given-names>
            </name>
            <name name-style="western">
              <surname>Lenti</surname>
              <given-names>MV</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic reasoning in internal medicine. Cynefin framework makes sense of clinical complexity</article-title>
          <source>Front Med (Lausanne)</source>
          <year>2021</year>
          <month>04</month>
          <day>22</day>
          <volume>8</volume>
          <fpage>641093</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33968954"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fmed.2021.641093</pub-id>
          <pub-id pub-id-type="medline">33968954</pub-id>
          <pub-id pub-id-type="pmcid">PMC8100038</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kanwal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Rizzo</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Attention-based clinical note summarization</article-title>
          <source>Proceedings of the 37th ACM/SIGAPP Symposium on Applied Computing</source>
          <year>2022</year>
          <conf-name>SAC '22</conf-name>
          <conf-date>April 25-29, 2022</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>813</fpage>
          <lpage>20</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.1145/3477314.3507256"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3477314.3507256</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Adams</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Alsentzer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ketenci</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zucker</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Elhadad</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>What’s in a summary? Laying the groundwork for advances in hospital-course summarization</article-title>
          <source>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2021</year>
          <conf-name>NAACL '21</conf-name>
          <conf-date>June 6-11, 2021</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>4794</fpage>
          <lpage>811</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2021.naacl-main.382.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2021.naacl-main.382</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pivovarov</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Elhadad</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Automated methods for the summarization of electronic health records</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2015</year>
          <month>09</month>
          <volume>22</volume>
          <issue>5</issue>
          <fpage>938</fpage>
          <lpage>47</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/25882031"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocv032</pub-id>
          <pub-id pub-id-type="medline">25882031</pub-id>
          <pub-id pub-id-type="pii">ocv032</pub-id>
          <pub-id pub-id-type="pmcid">PMC4986665</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tsou</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Poddar</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A novel system for extractive clinical note summarization using EHR data</article-title>
          <source>Proceedings of the 2nd Clinical Natural Language Processing Workshop</source>
          <year>2019</year>
          <conf-name>ClinicalNLP '19</conf-name>
          <conf-date>June 7, 2019</conf-date>
          <conf-loc>Minneapolis, MN</conf-loc>
          <fpage>46</fpage>
          <lpage>54</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W19-1906/"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/w19-1906</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Subgraph retrieval enhanced model for multi-hop knowledge base question answering</article-title>
          <source>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics</source>
          <year>2022</year>
          <conf-name>ACL '22</conf-name>
          <conf-date>May 22-27, 2022</conf-date>
          <conf-loc>Dublin, Ireland</conf-loc>
          <fpage>5773</fpage>
          <lpage>84</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2022.acl-long.396.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2022.acl-long.396</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yasunaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bosselut</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Leskovec</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Deep bidirectional language-knowledge graph pretraining</article-title>
          <source>Proceedings of the 36th Annual Conference on Neural Information Processing Systems</source>
          <year>2022</year>
          <conf-name>NeurIPS '22</conf-name>
          <conf-date>November 28-December 9, 2022</conf-date>
          <conf-loc>New Orleans, LA</conf-loc>
          <fpage>37309</fpage>
          <lpage>23</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dl.acm.org/doi/10.5555/3600270.3602974"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Empowering language models with knowledge graph reasoning for open-domain question answering</article-title>
          <source>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2022</year>
          <conf-name>EMNLP '22</conf-name>
          <conf-date>December 7-11, 2022</conf-date>
          <conf-loc>Abu Dhabi, United Arab Emirates</conf-loc>
          <fpage>9562</fpage>
          <lpage>81</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2022.emnlp-main.650.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2022.emnlp-main.650</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weed</surname>
              <given-names>LL</given-names>
            </name>
          </person-group>
          <article-title>Medical records, patient care, and medical education</article-title>
          <source>Ir J Med Sci</source>
          <year>1964</year>
          <month>10</month>
          <day>22</day>
          <volume>39</volume>
          <issue>6</issue>
          <fpage>271</fpage>
          <lpage>82</lpage>
          <pub-id pub-id-type="doi">10.1007/bf02945791</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Masanz</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ogren</surname>
              <given-names>PV</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kipper-Schuler</surname>
              <given-names>KC</given-names>
            </name>
            <name name-style="western">
              <surname>Chute</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>Mayo clinical Text Analysis and Knowledge Extraction System (cTAKES): architecture, component evaluation and applications</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2010</year>
          <month>09</month>
          <day>01</day>
          <volume>17</volume>
          <issue>5</issue>
          <fpage>507</fpage>
          <lpage>13</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/20819853"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/jamia.2009.001560</pub-id>
          <pub-id pub-id-type="medline">20819853</pub-id>
          <pub-id pub-id-type="pii">17/5/507</pub-id>
          <pub-id pub-id-type="pmcid">PMC2995668</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soldaini</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Goharian</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>QuickUMLS: a fast, unsupervised approach for medical concept extraction</article-title>
          <source>Proceedings of the 2016 Conference on Medical Information Retrieval</source>
          <year>2016</year>
          <conf-name>MedIR '16</conf-name>
          <conf-date>July 21, 2016</conf-date>
          <conf-loc>Pisa, Italy</conf-loc>
          <fpage>1</fpage>
          <lpage>4</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ir.cs.georgetown.edu/downloads/quickumls.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>RT</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>MC</given-names>
            </name>
          </person-group>
          <article-title>Measuring and improving the use of graph information in graph neural network</article-title>
          <source>Proceedings of the 8th International Conference on Learning Representations</source>
          <year>2020</year>
          <conf-name>ICLR '20</conf-name>
          <conf-date>April 26-30, 2020</conf-date>
          <conf-loc>Addis Ababa, Ethiopia</conf-loc>
          <fpage>1</fpage>
          <lpage>16</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openreview.net/pdf?id=rkeIIkHKvS"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Shareghi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Meng</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Basaldella</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Collier</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Self-alignment pretraining for biomedical entity representations</article-title>
          <source>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2021</year>
          <conf-name>NAACL '21</conf-name>
          <conf-date>June 6-11, 2021</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>4228</fpage>
          <lpage>38</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2021.naacl-main.334.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2021.naacl-main.334</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Conneau</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kiela</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Schwenk</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Barrault</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bordes</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Supervised learning of universal sentence representations from natural language inference data</article-title>
          <source>Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2017</year>
          <conf-name>EMNLP '17</conf-name>
          <conf-date>September 7-11, 2017</conf-date>
          <conf-loc>Copenhagen, Denmark</conf-loc>
          <fpage>670</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D17-1070.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/d17-1070</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Longpre</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zoph</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Tay</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fedus</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Dehghani</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Brahma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Webson</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Scaling instruction-finetuned language models</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online October 20, 2022</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2210.11416"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Clinical-T5: large language models built using MIMIC clinical text</article-title>
          <source>PhysioNet</source>
          <access-date>2023-01-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.physionet.org/content/clinical-t5/1.0.0/">https://www.physionet.org/content/clinical-t5/1.0.0/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>White</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Hays</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sandborn</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Olea</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Elnashar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Spencer-Smith</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>DC</given-names>
            </name>
          </person-group>
          <article-title>A prompt pattern catalog to enhance prompt engineering with ChatGPT</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online February 21, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2302.11382"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gonen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Iyer</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Blevins</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>NA</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Demystifying prompts in language models via perplexity estimation</article-title>
          <source>Findings of the Association for Computational Linguistics: EMNLP 2023</source>
          <year>2023</year>
          <conf-name>EMNLP '23</conf-name>
          <conf-date>December 6-10, 2023</conf-date>
          <conf-loc>Singapore, Singapore</conf-loc>
          <fpage>10136</fpage>
          <lpage>48</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2023.findings-emnlp.679.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2023.findings-emnlp.679</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>CY</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>CY</given-names>
            </name>
          </person-group>
          <article-title>ROUGE: a package for automatic evaluation of summaries</article-title>
          <source>Text Summarization Branches Out</source>
          <year>2004</year>
          <publisher-loc>Barcelona, Spain</publisher-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>74</fpage>
          <lpage>81</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Khanna</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Spitzmueller</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>AN</given-names>
            </name>
          </person-group>
          <article-title>Recommendations for using the revised safer Dx instrument to help measure and improve diagnostic safety</article-title>
          <source>Diagnosis (Berl)</source>
          <year>2019</year>
          <month>11</month>
          <day>26</day>
          <volume>6</volume>
          <issue>4</issue>
          <fpage>315</fpage>
          <lpage>23</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.degruyter.com/document/doi/10.1515/dx-2019-0012"/>
          </comment>
          <pub-id pub-id-type="doi">10.1515/dx-2019-0012</pub-id>
          <pub-id pub-id-type="medline">31287795</pub-id>
          <pub-id pub-id-type="pii">dx-2019-0012</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rotmensch</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Halpern</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tlimat</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Horng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sontag</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Learning a health knowledge graph from electronic medical records</article-title>
          <source>Sci Rep</source>
          <year>2017</year>
          <month>07</month>
          <day>20</day>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>5994</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-017-05778-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-017-05778-z</pub-id>
          <pub-id pub-id-type="medline">28729710</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-017-05778-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC5519723</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wan</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>GaussianPath: a Bayesian multi-hop reasoning framework for knowledge graph reasoning</article-title>
          <source>AAAI Conf Artif Intell</source>
          <year>2021</year>
          <month>05</month>
          <day>18</day>
          <volume>35</volume>
          <issue>5</issue>
          <fpage>4393</fpage>
          <lpage>401</lpage>
          <pub-id pub-id-type="doi">10.1609/aaai.v35i5.16565</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fei</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Faithful logical reasoning via symbolic chain-of-thought</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online May 28, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2405.18357"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.720</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Asai</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sil</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hajishirzi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Self-RAG: learning to retrieve, generate, and critique through self-reflection</article-title>
          <source>Proceedings of the 12th International Conference on Learning Representations</source>
          <year>2024</year>
          <conf-name>ICLR '24</conf-name>
          <conf-date>May 7-11, 2024</conf-date>
          <conf-loc>Vienna, Austria</conf-loc>
          <fpage>1</fpage>
          <lpage>30</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openreview.net/pdf?id=hSyW5go0v8"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Mishra</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>HT</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>EH</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>QV</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Take a step back: evoking reasoning via abstraction in large language models</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online October 9, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2310.06117"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fatemi</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Halcrow</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Perozzi</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Talk like a graph: encoding graphs for large language models</article-title>
          <source>Proceedings of the 12th International Conference on Learning Representations</source>
          <year>2024</year>
          <conf-name>ICLR '24</conf-name>
          <conf-date>May 7-11, 2024</conf-date>
          <conf-loc>Vienna, Austria</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openreview.net/attachment?id=IuXR1CCrSi&#38;name=supplementary_material"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Schenck</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Faithful AI in medicine: a systematic review with large language models and beyond</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online July 23, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37398329"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2023.04.18.23288752</pub-id>
          <pub-id pub-id-type="medline">37398329</pub-id>
          <pub-id pub-id-type="pii">2023.04.18.23288752</pub-id>
          <pub-id pub-id-type="pmcid">PMC10312867</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="web">
          <article-title>serenayj / DRKnows</article-title>
          <source>GitHub</source>
          <access-date>2024-04-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/serenayj/DRKnows">https://github.com/serenayj/DRKnows</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
