<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="letter" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR AI</journal-id>
      <journal-title>JMIR AI</journal-title>
      <issn pub-type="epub">2817-1705</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v5i1e85221</article-id>
      <article-id pub-id-type="pmid">41667124</article-id>
      <article-id pub-id-type="doi">10.2196/85221</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Research Letter</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Research Letter</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Evaluating Large Language Model–Generated Clinical Summaries Through a Dual-Perspective Framework: Retrospective Observational Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Coristine</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Goldsmith</surname>
            <given-names>Michael</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zahedivash</surname>
            <given-names>Aydin</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Han</surname>
            <given-names>Brian</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Division of Cardiology</institution>
            <institution>Lucile Packard Children’s Hospital</institution>
            <institution>Stanford University School of Medicine</institution>
            <addr-line>750 Welch Road</addr-line>
            <addr-line>Suite 305, MC 5208</addr-line>
            <addr-line>Palo Alto, CA, 94304</addr-line>
            <country>United States</country>
            <phone>1 6507237913</phone>
            <email>brianhan@stanford.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9713-988X</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Barnes</surname>
            <given-names>Traci</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-7417-5183</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Reddy</surname>
            <given-names>Charitha D</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0062-1272</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Shin</surname>
            <given-names>Andrew Y</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6713-0609</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Division of Cardiology</institution>
        <institution>Lucile Packard Children’s Hospital</institution>
        <institution>Stanford University School of Medicine</institution>
        <addr-line>Palo Alto, CA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Brian Han <email>brianhan@stanford.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2026</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>10</day>
        <month>2</month>
        <year>2026</year>
      </pub-date>
      <volume>5</volume>
      <elocation-id>e85221</elocation-id>
      <history>
        <date date-type="received">
          <day>7</day>
          <month>10</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>25</day>
          <month>11</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>31</day>
          <month>1</month>
          <year>2026</year>
        </date>
        <date date-type="accepted">
          <day>2</day>
          <month>2</month>
          <year>2026</year>
        </date>
      </history>
      <copyright-statement>©Brian Han, Traci Barnes, Charitha D Reddy, Andrew Y Shin. Originally published in JMIR AI (https://ai.jmir.org), 10.02.2026.</copyright-statement>
      <copyright-year>2026</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on https://www.ai.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://ai.jmir.org/2026/1/e85221" xlink:type="simple"/>
      <abstract>
        <p>Large language models (LLMs) are increasingly used by patients and families to interpret complex medical documentation, yet most evaluations focus only on clinician-judged accuracy. In this study, 50 pediatric cardiac intensive care unit notes were summarized using GPT-4o mini and reviewed by both physicians and parents, who rated readability, clinical fidelity, and helpfulness. There were important discrepancies between parents and clinicians in the realm of helpfulness, along with important insights by clinicians assessing clinical accuracy and parents assessing readability. This study highlights the need for dual-perspective frameworks that balance clinical precision with patient understanding.</p>
      </abstract>
      <kwd-group>
        <kwd>large language models</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>pediatric cardiology</kwd>
        <kwd>clinical informatics</kwd>
        <kwd>patient advocacy</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>The integration of large language models (LLMs) into clinical medicine [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>] has prompted studies to evaluate their utility in synthesizing clinical information [<xref ref-type="bibr" rid="ref3">3</xref>], assisting with clinical decision-making [<xref ref-type="bibr" rid="ref4">4</xref>], or answering standardized questions [<xref ref-type="bibr" rid="ref5">5</xref>]. However, focusing only on assessments of medical fidelity may not allow appropriate assessments of optimal utility, particularly in one use case: interpretation of medical documentation for patients and their families. While patients are increasingly using LLMs to interpret medical information, systematic assessments of this use remain rare.</p>
      <p>This gap is consequential in pediatric cardiology, where the involvement of a caregiver adds a layer of care complexity. In this setting, parents and other caregivers use LLMs for improved understanding, but clarity and reassurance matter as much as clinical precision. We evaluated LLM-generated summaries of progress notes from two perspectives, clinicians and parents, introducing a 360° framework that captures complementary dimensions of utility.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>We identified 50 patients admitted to the pediatric cardiovascular intensive care unit between July 5, 2024, and July 5, 2025. For each case, two consecutive daily progress notes were selected. Assessment and plan sections, which included relevant clinical data, were deidentified and used as input for a standardized prompt requesting a 6-to-8-sentence summary at a 6th-to-8th-grade reading level (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Outputs were generated using a secure institutional version of GPT-4o mini during July 2025. Records of the children of the parent volunteers were not used as part of the study.</p>
        <p>The generated summaries were divided among 8 pediatric cardiologists and 10 parents of pediatric cardiology patients; 2 cardiologists and 2 parents reviewed each LLM-generated summary alongside the deidentified note for reference. Parents were recruited from a local parental advocacy group and from the inpatient cardiology unit during the week of July 7, 2025. Using a 4-point Likert scale, for each summary, clinical reviewers rated clinical fidelity (accuracy, omission of information, need for revision, and clinical alignment) and helpfulness, while parental reviewers rated readability and helpfulness with separate grading rubrics (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Demographic data and baseline attitudes toward LLMs were also collected.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>The study was reviewed and approved by the institutional review board (protocol 80502). Informed consent was obtained, and no compensation was provided for participation. All identifying information was omitted from patient notes, and every effort was made to preserve privacy, confidentiality, and anonymity throughout the study.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>All participants completed the survey. Demographics and baseline attitudes regarding LLMs are reported in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. Of note, none of the parents had medical backgrounds. The composite Flesch-Kincaid grade level for the responses was 10.6. Interrater reliability (Krippendorff α) was moderate for physician grading (α=0.69) and parental grading (α=0.75). Parents reported greater familiarity and comfort with LLMs and had a stronger belief in their role in medicine than physicians. Parents consistently rated the summaries as clear, easy to understand, and helpful in explaining clinical changes. The 3 questions on helpfulness answered by parents had a Cronbach α of 0.96; the Mann-Whitney <italic>U</italic> test was used to compare the parents’ average scores and the physicians’ scores for 1 question. Physicians rated the summaries lower than the parents did, with a significant difference (<italic>U</italic>=3897; <italic>z</italic>=2.69; <italic>P</italic>=.007). Separately, physicians judged clinical accuracy less favorably than parents (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Ratings of helpfulness, readability, and clinical fidelity of large language model–generated summaries for parents and physicians. All scores ranged from 1 to 4.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="800"/>
          <col width="0"/>
          <col width="170"/>
          <thead>
            <tr valign="top">
              <td colspan="3">Questions and ratings</td>
              <td>Scores, mean (SE)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="4">
                <bold>Perceived helpfulness by parents<sup>a</sup></bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>How helpful was the summary in understanding the changes in the patient’s condition or treatment plan?</td>
              <td colspan="2">3.25 (0.58)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>How helpful would it be to receive this summary while your child was admitted?</td>
              <td colspan="2">3.26 (0.60)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>How helpful would this summary be in addition to the current communication you receive from the medical team?</td>
              <td colspan="2">3.36 (0.62)</td>
            </tr>
            <tr valign="top">
              <td colspan="4">
                <bold>Perceived helpfulness by physicians<sup>a</sup></bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>How helpful would this summary of changes be for a patient’s family?</td>
              <td colspan="2">2.97 (0.57)</td>
            </tr>
            <tr valign="top">
              <td colspan="4">
                <bold>Parent rating of readability</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Readability<sup>b</sup></td>
              <td colspan="2">3.36 (0.75)</td>
            </tr>
            <tr valign="top">
              <td colspan="4">
                <bold>Physician ratings of clinical fidelity</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Clinical accuracy<sup>c</sup></td>
              <td colspan="2">3.19 (0.68)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Clinical completeness<sup>d</sup></td>
              <td colspan="2">3.04 (0.72)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>No need for revision<sup>e</sup></td>
              <td colspan="2">2.96 (0.75)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Clinical alignment<sup>f</sup></td>
              <td colspan="2">2.90 (0.66)</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>Answers ranged from “not helpful” to “extremely helpful.”</p>
          </fn>
          <fn id="table1fn2">
            <p><sup>b</sup>Answers ranged from “hard to read” to “easy to read.”</p>
          </fn>
          <fn id="table1fn3">
            <p><sup>c</sup>Answers ranged from “inaccurate” to “very accurate.”</p>
          </fn>
          <fn id="table1fn4">
            <p><sup>d</sup>Answers ranged from “omitting key information” to “includes all key information.”</p>
          </fn>
          <fn id="table1fn5">
            <p><sup>e</sup>Answers ranged from “extensive revision needed” to “no revision needed.”</p>
          </fn>
          <fn id="table1fn6">
            <p><sup>f</sup>Answers ranged from “not aligned” to “very aligned.”</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>This study introduces a dual-perspective evaluation of LLM-generated medical summaries. While families gave favorable ratings for helpfulness and readability, there were fewer positive scores for clinical fidelity from the clinical experts. Readability scores were favorable despite the Flesch-Kincaid grade level being higher than 6 to 8, as asked for in the prompt. While the physicians still rated the summaries as helpful, their ratings were lower than those of the parents. These findings suggest that when the focus of such an assessment does not include patient and parental input, the actual patient-centered value of such summaries may be underestimated.</p>
      <p>The discrepancies are important to understand. Patients and caregivers are using LLMs, yet validation efforts remain clinician-centric and technical [<xref ref-type="bibr" rid="ref6">6</xref>]. Without evaluation frameworks that incorporate patient perspectives, there is a risk of limiting the potential usefulness of LLMs and our understanding of them as a patient tool [<xref ref-type="bibr" rid="ref7">7</xref>]. For example, there were summaries that clinicians rated as having low helpfulness but that parents perceived as very helpful. It is important for physicians to acknowledge that the use of LLMs continues to grow and that laypersons have a generally positive perception of the technology [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
      <p>Our study has several strengths and weaknesses. It used a single-institution design and a subspecialized patient population, limiting generalizability; nevertheless, it used unaltered clinical notes, enhancing real-world validity compared with curated data. It should also be noted that studies similar to this one are limited in the pediatric population, increasing the significance of this study’s impact. There was potential clustering bias in the survey distribution that was not accounted for in the statistical analyses. Another limitation was that parents reviewed summaries of notes for other children, which removed the emotional connection when evaluating information. Lastly, there was only moderate consensus among raters, which may affect the strength of the conclusions.</p>
      <p>It is also important to acknowledge limitations related to LLM performance. For one, the Flesch-Kincaid grade level of the summaries was much higher than what the prompt dictated, indicating limitations to the simplification of complex medical information. This also limits the impact of the favorable readability ratings, as the findings may not generalize to populations with lower health literacy. Additionally, the prompt mandates a certain format to describe changes, which may force the LLM to hallucinate and overreport a change. While this was not seen in this intensive care unit population, the same prompt may not be generalizable to a more stable population. In addition, while the LLM had access to the medical plan, it processed a physician’s interpretation of objective data rather than raw data, potentially affecting its ultimate accuracy. Both limitations may have negatively affected the perceived clinical fidelity.</p>
      <p>In conclusion, as patients continue to use LLMs, evaluations must evolve to integrate both clinical accuracy and patient experience. A balanced framework that incorporates both physicians and families should be considered to better guide safer and more effective adoption.</p>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Standardized large language model prompt.</p>
        <media xlink:href="ai_v5i1e85221_app1.docx" xlink:title="DOCX File, 15 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Physician and parental grading rubrics.</p>
        <media xlink:href="ai_v5i1e85221_app2.docx" xlink:title="DOCX File, 17 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Demographics and baseline attitudes toward large language models among physicians and parents.</p>
        <media xlink:href="ai_v5i1e85221_app3.docx" xlink:title="DOCX File, 16 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors are grateful to the patient families and caregivers involved in the study.</p>
    </ack>
    <notes>
      <title>Data Availability</title>
      <p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p>
    </notes>
    <notes>
      <title>Funding</title>
      <p>There was no funding for this study.</p>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>Conceptualization: BH (lead), TB (supporting), CDR (supporting)</p>
        <p>Data curation: BH</p>
        <p>Formal analysis: BH</p>
        <p>Investigation: BH</p>
        <p>Methodology: BH</p>
        <p>Project administration: BH (lead), TB (supporting)</p>
        <p>Resources: BH</p>
        <p>Supervision: AYS</p>
        <p>Validation: BH</p>
        <p>Visualization: BH</p>
        <p>Writing—original draft: BH (lead)</p>
        <p>Writing—review and editing: BH (lead), TB (supporting), CDR (supporting), AYS (supporting)</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirunavukarasu</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSJ</given-names>
            </name>
            <name name-style="western">
              <surname>Elangovan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gutierrez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>TF</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSW</given-names>
            </name>
          </person-group>
          <article-title>Large language models in medicine</article-title>
          <source>Nat Med</source>
          <year>2023</year>
          <month>08</month>
          <volume>29</volume>
          <issue>8</issue>
          <fpage>1930</fpage>
          <lpage>1940</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id>
          <pub-id pub-id-type="medline">37460753</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-023-02448-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tripathi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sukumaran</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Efficient healthcare with large language models: optimizing clinical workflow and enhancing patient care</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2024</year>
          <month>05</month>
          <day>20</day>
          <volume>31</volume>
          <issue>6</issue>
          <fpage>1436</fpage>
          <lpage>1440</lpage>
          <pub-id pub-id-type="doi">10.1093/jamia/ocad258</pub-id>
          <pub-id pub-id-type="medline">38273739</pub-id>
          <pub-id pub-id-type="pii">7589687</pub-id>
          <pub-id pub-id-type="pmcid">PMC11105142</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bedi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Orr-Ewing</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Dash</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Koyejo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Callahan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fries</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Wornow</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Swaminathan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lehmann</surname>
              <given-names>LS</given-names>
            </name>
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kashyap</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chaurasia</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NR</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tazbaz</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Milstein</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pfeffer</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
          </person-group>
          <article-title>Testing and evaluation of health care applications of large language models: a systematic review</article-title>
          <source>JAMA</source>
          <year>2025</year>
          <month>01</month>
          <day>28</day>
          <volume>333</volume>
          <issue>4</issue>
          <fpage>319</fpage>
          <lpage>328</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2024.21700</pub-id>
          <pub-id pub-id-type="medline">39405325</pub-id>
          <pub-id pub-id-type="pii">2825147</pub-id>
          <pub-id pub-id-type="pmcid">PMC11480901</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goh</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Gallo</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Strong</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kerman</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Freed</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Cool</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Kanjee</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lane</surname>
              <given-names>KP</given-names>
            </name>
            <name name-style="western">
              <surname>Parsons</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Ahuja</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Horvitz</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Milstein</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Olson</surname>
              <given-names>APJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hom</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Rodman</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 assistance for improvement of physician performance on patient care tasks: a randomized controlled trial</article-title>
          <source>Nat Med</source>
          <year>2025</year>
          <month>04</month>
          <volume>31</volume>
          <issue>4</issue>
          <fpage>1233</fpage>
          <lpage>1238</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-024-03456-y</pub-id>
          <pub-id pub-id-type="medline">39910272</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-024-03456-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC12380382</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hanss</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sarma</surname>
              <given-names>KV</given-names>
            </name>
            <name name-style="western">
              <surname>Glowinski</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Krystal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Saunders</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Halls</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gorrell</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Reilly</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Assessing the accuracy and reliability of large language models in psychiatry using standardized multiple-choice questions: cross-sectional study</article-title>
          <source>J Med Internet Res</source>
          <year>2025</year>
          <month>05</month>
          <day>20</day>
          <volume>27</volume>
          <fpage>e69910</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e69910/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/69910</pub-id>
          <pub-id pub-id-type="medline">40392576</pub-id>
          <pub-id pub-id-type="pii">v27i1e69910</pub-id>
          <pub-id pub-id-type="pmcid">PMC12134693</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johri</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jeong</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Schlessinger</surname>
              <given-names>DI</given-names>
            </name>
            <name name-style="western">
              <surname>Wongvibulsin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Barnes</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>ZR</given-names>
            </name>
            <name name-style="western">
              <surname>Van Allen</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Daneshjou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>An evaluation framework for clinical use of large language models in patient interaction tasks</article-title>
          <source>Nat Med</source>
          <year>2025</year>
          <month>01</month>
          <volume>31</volume>
          <issue>1</issue>
          <fpage>77</fpage>
          <lpage>86</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-024-03328-5</pub-id>
          <pub-id pub-id-type="medline">39747685</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-024-03328-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raghu Subramanian</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Khanna</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Enhancing health care communication with large language models—the role, challenges, and future directions</article-title>
          <source>JAMA Netw Open</source>
          <year>2024</year>
          <month>03</month>
          <day>04</day>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>e240347</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://jamanetwork.com/journals/jamanetworkopen/fullarticle/10.1001/jamanetworkopen.2024.0347"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.0347</pub-id>
          <pub-id pub-id-type="medline">38466311</pub-id>
          <pub-id pub-id-type="pii">2815872</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mendel</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Wiesenfeld</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Nov</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Laypeople's use of and attitudes toward large language models and search engines for health queries: survey study</article-title>
          <source>J Med Internet Res</source>
          <year>2025</year>
          <month>02</month>
          <day>13</day>
          <volume>27</volume>
          <fpage>e64290</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e64290/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/64290</pub-id>
          <pub-id pub-id-type="medline">39946180</pub-id>
          <pub-id pub-id-type="pii">v27i1e64290</pub-id>
          <pub-id pub-id-type="pmcid">PMC11888097</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
