<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e86974</article-id><article-id pub-id-type="doi">10.2196/86974</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Supporting Radiology Resident Education and Clinical Decision-Making With Large Language Models: Comparative Study of Reasoning Models DeepSeek-R1 and ChatGPT-o1</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Eminovic</surname><given-names>Semil</given-names></name><degrees>BSc, MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Schmidt</surname><given-names>Robin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Levita</surname><given-names>Bogdan</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lindholz</surname><given-names>Maximilian</given-names></name><degrees>MSc, 
MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Haack</surname><given-names>Anna-Maria</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Burdenski</surname><given-names>Alina</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bui</surname><given-names>Maurice</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Schobert</surname><given-names>Isabel Theresa</given-names></name><degrees>MD/PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dell&#x2019;Orco</surname><given-names>Andrea</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nawabi</surname><given-names>Jawed</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Penzkofer</surname><given-names>Tobias</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Radiology, Charit&#x00E9; - Universit&#x00E4;tsmedizin Berlin</institution><addr-line>Augustenburger Platz 1</addr-line><addr-line>Berlin</addr-line><country>Germany</country></aff><aff id="aff2"><institution>Berlin Institute of Health, Charit&#x00E9; - Universit&#x00E4;tsmedizin Berlin</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><aff 
id="aff3"><institution>Department of Neuroradiology, Charit&#x00E9; - Universit&#x00E4;tsmedizin Berlin</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Farah</surname><given-names>Andrew</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Fukuzawa</surname><given-names>Fumitoshi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Semil Eminovic, BSc, MD, Department of Radiology, Charit&#x00E9; - Universit&#x00E4;tsmedizin Berlin, Augustenburger Platz 1, Berlin, 13353, Germany, ; <email>semil.eminovic@charite.de</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>26</day><month>6</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e86974</elocation-id><history><date date-type="received"><day>02</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>29</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>30</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Semil Eminovic, Robin Schmidt, Bogdan Levita, Maximilian Lindholz, Anna-Maria Haack, Alina Burdenski, Maurice Bui, Isabel Theresa Schobert, Andrea Dell&#x2019;Orco, Jawed Nawabi, Tobias Penzkofer. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 26.6.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e86974"/><abstract><sec><title>Background</title><p>Radiology trainees require efficient, accurate, and accessible resources to master complex imaging techniques and identify findings that guide clinical decision-making. Large language models (LLMs) are emerging as promising tools for medical education and clinical workflows, offering the potential to enhance learning by providing instant feedback, aiding in diagnostic accuracy, and offering personalized learning experiences. 
However, systematic comparisons of LLMs for radiology education and clinical support remain limited, particularly regarding differences across subspecialties and resident experience levels.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate and compare the response quality of 2 state-of-the-art reasoning-based LLMs, namely DeepSeek-R1 and ChatGPT-o1, as clinical and radiology residency support tools, comparing performance across clinical and didactic dimensions, including text- and image-based responses.</p></sec><sec sec-type="methods"><title>Methods</title><p>Overall, 27 radiology questions covering 9 radiological subspecialties were answered by both LLMs. Additionally, 6 image-based questions were presented only to ChatGPT-o1 due to its image processing capabilities. Responses were independently rated by 7 radiology residents (postgraduate years 2&#x2010;5) across 9 rating criteria grouped into 3 dimensions (factual accuracy, clinical practicality, and didactic value), using a 5-point Likert scale. Statistical analyses compared the 2 LLMs, rater experience levels, and, for ChatGPT-o1, text-based vs image-based responses.</p></sec><sec sec-type="results"><title>Results</title><p>DeepSeek-R1 consistently outperformed ChatGPT-o1 across all rating dimensions, with highly significant differences across all criteria (mean ratings: DeepSeek-R1 4.51, SD 0.73 vs ChatGPT-o1 3.73, SD 0.98; <italic>P</italic>&#x003C;.001). In an exploratory subspecialty-level analysis, DeepSeek-R1 descriptively outperformed ChatGPT-o1 across all subspecialties. With both LLMs pooled, junior residents tended to rate slightly higher than seniors in 7 of 9 criteria, although differences were not statistically significant. However, for ChatGPT-o1, junior residents rated significantly higher in overall score across all criteria (juniors 3.81, SD 0.64 vs seniors 3.63, SD 0.65; <italic>P</italic>=.02). 
Image-based responses by ChatGPT-o1 scored significantly lower than text-based responses (mean 3.19, SD 1.42; <italic>P</italic>=.007), particularly in factual accuracy (mean 2.75, SD 1.45; <italic>P</italic>&#x003C;.001) and clinical practicality (mean 3.11, SD 1.47; <italic>P</italic>=.03).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Both DeepSeek-R1 and ChatGPT-o1 demonstrate promising potential on simulated radiology question sets designed for educational and clinical contexts, with DeepSeek-R1 outperforming ChatGPT-o1 across all evaluated criteria. These results emphasize the value of open-weight models for educational use and provide early evidence that LLMs may support radiology resident training under controlled conditions; however, their real-world educational and clinical effects require further investigation. Future research should prospectively evaluate how LLMs can be integrated into radiology training, assess their impact alongside conventional teaching methods, and investigate multimodal capabilities to better reflect realistic clinical scenarios.</p></sec></abstract><kwd-group><kwd>education</kwd><kwd>medicine</kwd><kwd>radiology</kwd><kwd>artificial intelligence</kwd><kwd>deep learning</kwd><kwd>large language models</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Radiology residents face an exceptionally wide range of organ systems, modalities, and clinical routine and emergency scenarios. This complexity results in a steep learning curve requiring the integration of medical knowledge, image interpretation, and clinical reasoning. Although residents are still in training, these demands collide with the high pace of modern clinical workflows, resulting in an increasing workload that limits direct supervision or attending consultation within clinical routines [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. 
Consequently, trainees must frequently retrieve accurate, domain-specific knowledge independently. Conventional educational tools, such as textbooks or general-purpose online platforms, often provide voluminous or poorly targeted results, which may not be considered appropriately in real-life situations where immediate guidance is required. Ultimately, this may lead to frustration, exacerbate cognitive load, and contribute to burnout, which is a common issue faced by radiology residents [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Recent advances in large language models (LLMs) have prompted growing interest in their potential as educational aids in medicine and radiology [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref11">11</xref>], since they can condense complex topics into concise, context-aware explanations and support iterative follow-up questions that can mimic a human mentor, supporting not only point-of-care decision-making but also dedicated study. Additionally, in terms of radiology-specific education, LLMs can integrate image analysis with text-based reasoning to guide learners in correlating imaging data with clinical context. Furthermore, the user may engage in interactive case-based discussions or simulate tumor boards and simply create learning material. However, despite their versatile use cases, LLMs pose a range of risks such as biased, incomplete, or hallucinated responses, which demand robust validation before clinical deployment. While LLMs have already been shown to perform well on general medical benchmarks, there is an unmet clinical need to explore their potential for educational use in specialty-specific scenarios. 
To address this gap, this study systematically compared 2 state-of-the-art reasoning LLMs, namely OpenAI&#x2019;s ChatGPT-o1 [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>] and DeepSeek-R1 [<xref ref-type="bibr" rid="ref14">14</xref>], using a structured set of clinically relevant questions that commonly arise during residency. Thereby, both image-description&#x2013;based diagnostic prompts and knowledge-based questions were included, covering quick in-shift use cases as well as dedicated study scenarios. The primary comparison in this study focuses on text-based questions that both models could process, whereas the image-based tasks administered only to ChatGPT-o1 were analyzed as an exploratory extension. In summary, this study aims to (1) assess the factual accuracy, clinical practicality, and didactic value of LLM responses and (2) compare performance between models across clinical and didactic criteria for different radiology subspecialties, thereby providing early evidence on how such models perform on simulated tasks that are relevant for radiology residency training and clinical decision-making. The findings are intended to guide future prospective studies evaluating safe and effective integration of such models into clinical and dedicated educational practice.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>General Study Design</title><p>Because the question set was generated by the authors and did not contain patient data, formal approval from the institute&#x2019;s ethics committee was not required. In this prospective controlled evaluation study, a set of 33 questions was derived from frequently encountered clinical scenarios in collaboration with both a senior radiologist (TP) with over 15 years of professional clinical and teaching experience and a neuroradiologist (JN) with over 8 years of professional clinical and teaching experience. 
The generated questions were submitted to the 2 state-of-the-art reasoning LLMs DeepSeek-R1 (DeepSeek, accessed via public web interface on March 28, 2025) and ChatGPT-o1 (full o1 release; OpenAI, accessed via public web interface on March 28, 2025). The LLM-generated responses were independently evaluated by 7 radiology residents (n=4 in their second year of residency: RS, ML, AB, MB; n=1 in third year: AMH; n=2 in fifth year of training: ITS, BL). Because DeepSeek-R1 does not support image inputs, the core model comparison is restricted to text-based questions. The 6 image-based questions presented only to ChatGPT-o1 were therefore treated as an exploratory extension. DeepSeek-R1 and ChatGPT-o1 were selected based on their state-of-the-art performance in both general-purpose reasoning tasks and domain-specific applications [<xref ref-type="bibr" rid="ref15">15</xref>] and, in particular, because they are well-known among the general public, including radiology residents. Both LLMs use &#x201C;chain-of-thought&#x201D; reasoning [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>] and are explicitly positioned as reasoning-oriented models rather than purely instruction-following systems. They further reflect complementary development paradigms, with DeepSeek-R1 offered as an open-weight model and ChatGPT-o1 as a proprietary system, and both are readily accessible through public interfaces, supporting evaluation in realistic educational settings.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study evaluated large language model outputs and did not involve patient data or human participants in research. Based on institutional policy, formal ethics approval was therefore not required. Questions were designed from clinical experience, not individual cases. Raters participated voluntarily, no personal data were collected, and no compensation was provided. 
All images were sourced from Radiopaedia.org, an open-access educational platform providing fully de-identified cases.</p></sec><sec id="s2-3"><title>Question Design and Prompting</title><p>A total of 33 questions (27 text-based and 6 image-based) were prepared in the German language covering 9 radiology subspecialties (thoracic, abdominal, oncological, cardiovascular, emergency, head and neck, and musculoskeletal imaging, neuroradiology, and interventional radiology). Each subspecialty included 1 question assessing declarative knowledge and 2 questions describing imaging findings that require diagnostic reasoning. Only ChatGPT-o1 received the 6 image&#x2010;based prompts, as it was the sole image&#x2010;processing capable model.</p><p>Examples of knowledge-based questions (translated):</p><disp-quote><p>What are five typical signs of active tuberculosis on CT?</p></disp-quote><p>Examples of diagnostic and image-description&#x2013;based questions (translated):</p><disp-quote><p>On magnetic resonance imaging (MRI) there is an extensive bone-marrow edema in the medial tibial plateau without fracture line. There is also no trauma. The patient has load-dependent pain. What is the most likely diagnosis and what are the three most important differential diagnoses?</p></disp-quote><p>Example of image-based questions with the corresponding image attached (translated):</p><disp-quote><p>Please describe the radiological findings on this MRI image. What is the most likely diagnosis? Which three important differential diagnoses should be considered?</p></disp-quote><p>Each question was submitted as an independent, standalone prompt in a newly initiated chat session via the public web interface of each LLM (on March 28, 2025), using identical phrasing to ensure comparability and consistency. A new cache-cleared session was started for each query to prevent any context from prior interactions. 
No few-shot examples were provided, simulating typical resident interactions with LLMs during clinical decision-making and self-directed study. For image-based questions, a corresponding diagnostic image (provided as a static visual reference) was included with each prompt (<xref ref-type="fig" rid="figure1">Figure 1</xref>). For illustrative purposes, green asterisks were added to the images in <xref ref-type="fig" rid="figure1">Figure 1</xref> to indicate the key pathological regions, while the unannotated versions were provided to both the raters and ChatGPT-o1.</p><p>The complete list of questions is available in the supplementary material (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). All corresponding model responses are documented as well and available on request.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Diagnostic images used as input for the 6 image-based questions submitted to model ChatGPT-o1. Green asterisks are added for illustration only and mark the relevant pathological findings: (A) meniscal tear [<xref ref-type="bibr" rid="ref18">18</xref>], (B) pancreatitis [<xref ref-type="bibr" rid="ref19">19</xref>], (C) disc herniation [<xref ref-type="bibr" rid="ref20">20</xref>], (D) hepatocellular carcinoma [<xref ref-type="bibr" rid="ref21">21</xref>], (E) multiple sclerosis [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>], and (F) subarachnoid and subdural hemorrhage [<xref ref-type="bibr" rid="ref24">24</xref>]. 
These images are published under Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported [<xref ref-type="bibr" rid="ref25">25</xref>].</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e86974_fig01.png"/></fig></sec><sec id="s2-4"><title>Response Evaluation</title><p>Responses were rated using a 5-point Likert scale across 9 criteria grouped into 3 dimensions: factual accuracy (correctness, completeness, and precision), clinical practicality (comprehensibility, clinical usefulness, and trustworthiness), and didactic value (explanation depth, structure, and learning facilitation). Each rating dimension represents the average of the scores of its 3 underlying rating criteria (full criteria definitions in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). To support consistent scoring, reference answers were provided for all 9 criteria as content-oriented anchor points, giving raters an expert-validated reference to assess LLM responses across all evaluated dimensions. These were initially drafted by the first author (SE) based on established radiology resources (Core Radiology [<xref ref-type="bibr" rid="ref26">26</xref>], Radiopaedia.org [<xref ref-type="bibr" rid="ref27">27</xref>], Radiology Assistant [<xref ref-type="bibr" rid="ref28">28</xref>]) and were subsequently independently and blindly reviewed by 2 experienced radiologists (TP and JN) with over 15 and 8 years of clinical and teaching experience, respectively; any discrepancies were resolved through consensus meetings. 
Subsequently, all LLM responses were also blind-scored independently by the evaluating residents.</p><p><xref ref-type="fig" rid="figure2">Figure 2</xref> displays exemplary responses from ChatGPT-o1 and DeepSeek-R1 for the classification of acute pancreatitis on computed tomography (CT) imaging.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Example of a response for classifying acute pancreatitis on computed tomography imaging. The displayed prompt and model responses have been translated from German to English for publication purposes. CT: computed tomography.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e86974_fig02.png"/></fig></sec><sec id="s2-5"><title>Statistical Analysis</title><sec id="s2-5-1"><title>Paired Design and LLM Comparison (ChatGPT-o1 vs DeepSeek-R1)</title><p>The study used a repeated-measures (paired) design. Since each rater evaluated both LLMs on the same set of 27 questions, every question received 2 sets of ratings (one per LLM) from the same 7 raters. The 7 ratings were aggregated for each question &#x00D7; criterion combination into a single representative value per LLM by calculating the mean score. Paired differences between the 2 LLMs were tested with a 2-sided Wilcoxon signed-rank test. Holm correction was applied to adjust the significance level for multiple comparisons. As a sensitivity analysis addressing the potential loss of interrater variance through score aggregation, we fitted linear mixed-effects models for each criterion, dimension, and overall score, retaining all individual text-based ratings (n=3402) without aggregation (Table S1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). 
Each model specified LLM as a fixed effect with crossed random intercepts for rater and question, estimated via restricted maximum likelihood, thereby explicitly accounting for systematic differences in rater severity and question difficulty. Holm correction was applied across the 9 per-criterion tests.</p></sec><sec id="s2-5-2"><title>Subspecialty-Level Performance</title><p>With only 3 question pairs per subspecialty, we did not perform significance testing; instead, subspecialty-level analyses were predefined as exploratory, and we report mean (SD) differences descriptively. With 3 question pairs per subspecialty, these findings are preliminary and subject to variance; they should not be interpreted as generalizable conclusions about model performance across individual radiological domains. This exploratory analysis is best understood as hypothesis-generating.</p></sec><sec id="s2-5-3"><title>Rating Differences by Training Level (Junior vs Senior Residents)</title><p>To examine whether rating behavior differed by training stage, raters were stratified into junior (postgraduate year [PGY] 2, n=4) and senior residents (PGY 3&#x2010;5, n=3). We pooled both models&#x2019; responses by group and averaged ratings for each criterion, dimension, and overall. Group comparison was performed via Mann-Whitney <italic>U</italic> tests followed by Holm correction. Additionally, junior-senior comparisons were performed for each model.</p></sec><sec id="s2-5-4"><title>Analysis of Image-Based Question Ratings</title><p>The 6 image-based questions were presented only to ChatGPT-o1 due to its ability to process visual inputs; therefore, no direct comparison with DeepSeek-R1 was possible. 
Given the small sample size of 6 image-based questions, the Mann-Whitney <italic>U</italic> test was applied exclusively across all criteria to compare ChatGPT-o1&#x2019;s mean text and image ratings, using Holm correction.</p></sec><sec id="s2-5-5"><title>Interrater Reliability Analysis (ICC)</title><p>Interrater agreement was quantified with 2 intraclass correlation coefficients (ICC) computed from a 2-way random-effects model. ICC(2,1) reflects the reliability of a single resident&#x2019;s rating, whereas ICC(2,k) (k=7) captures the reliability of the mean score averaged across all residents.</p></sec><sec id="s2-5-6"><title>Statistical Software and Tools</title><p>All analyses were conducted in Python 3.9.13 (Pandas 2.2.2, NumPy 1.23.1, SciPy 1.13.1, pingouin 0.5.5, statsmodels 0.14.4, Matplotlib 3.9.2, Seaborn 0.13.2).</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Analysis of Paired Design and LLM Comparison (ChatGPT-o1 vs DeepSeek-R1)</title><p>DeepSeek-R1 consistently outperformed ChatGPT-o1 across all evaluated questions and all 3 rating dimensions and their 9 underlying rating criteria (mean ratings: DeepSeek-R1 4.51, SD 0.73 vs ChatGPT-o1 3.73, SD 0.98; <italic>P</italic>&#x003C;.001). The difference across all criteria was highly significant (<italic>P</italic>&#x003C;.001) (<xref ref-type="table" rid="table1">Table 1</xref>, <xref ref-type="fig" rid="figure3">Figures 3</xref> and <xref ref-type="fig" rid="figure4">4</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Statistical comparison between DeepSeek-R1 and ChatGPT-o1 across all evaluation criteria for text-based questions. Mean (SD) values are derived from individual raw ratings (n=189 per criterion, reflecting 7 raters &#x00D7; 27 questions; n=3 overarching rating dimensions). 
Wilcoxon signed-rank tests were conducted on aggregated mean scores per question pair (n=27), as specified in the methods (statistical analysis).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Rating criteria</td><td align="left" valign="bottom">DeepSeek-R1</td><td align="left" valign="bottom">ChatGPT-o1</td><td align="left" valign="bottom">Wilcoxon paired test (Holm corrected)</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Overall (all criteria)</td><td align="left" valign="top">4.51 (0.73)</td><td align="left" valign="top">3.73 (0.98)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Factual accuracy</td><td align="left" valign="top">4.55 (0.80)</td><td align="left" valign="top">3.93 (1.02)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Correctness</td><td align="left" valign="top">4.51 (0.88)</td><td align="left" valign="top">4.19 (1.02)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Completeness</td><td align="left" valign="top">4.52 (0.84)</td><td align="left" valign="top">3.85 (1.06)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Precision</td><td align="left" valign="top">4.60 (0.65)</td><td align="left" valign="top">3.74 (0.91)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Clinical practicality</td><td align="left" valign="top">4.44 (0.71)</td><td align="left" valign="top">3.84 (0.88)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Comprehensibility</td><td align="left" valign="top">4.63 (0.57)</td><td align="left" valign="top">4.00 
(0.77)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Clinical usefulness</td><td align="left" valign="top">4.41 (0.76)</td><td align="left" valign="top">3.70 (0.94)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Trustworthiness</td><td align="left" valign="top">4.26 (0.73)</td><td align="left" valign="top">3.81 (0.87)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Didactic value</td><td align="left" valign="top">4.54 (0.66)</td><td align="left" valign="top">3.43 (0.96)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Explanation depth</td><td align="left" valign="top">4.59 (0.62)</td><td align="left" valign="top">3.15 (0.81)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Structure</td><td align="left" valign="top">4.59 (0.62)</td><td align="left" valign="top">3.79 (0.98)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Learning facilitation</td><td align="left" valign="top">4.42 (0.74)</td><td align="left" valign="top">3.37 (0.95)</td><td align="left" valign="top">&#x003C;.001</td></tr></tbody></table></table-wrap><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>The bar chart displays the distribution of cumulative ratings assigned to 2 large language models, DeepSeek-R1 and ChatGPT-o1, based on evaluations for all 27 text-based questions across 9 criteria by all 7 raters. The corresponding absolute rating counts are provided in Table S2 (a) in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. 
LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e86974_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Radar plot comparing mean scores across evaluation criteria for DeepSeek-R1 and ChatGPT-o1 based on evaluations for all 27 text-based questions across 9 criteria by all 7 raters.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e86974_fig04.png"/></fig><p>Overall, ChatGPT-o1 accumulated far more &#x201C;2&#x201D; ratings than DeepSeek-R1 (n=141 vs n=12) and double the &#x201C;1&#x201D; ratings (n=22 vs n=11). Cumulative absolute ratings are provided in <xref ref-type="fig" rid="figure3">Figure 3</xref> and relative ratings per criterion in <xref ref-type="fig" rid="figure5">Figure 5</xref>.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Stacked bar plots show the distribution of Likert ratings across all rated criteria for DeepSeek-R1 and ChatGPT-o1 for text-based questions. For example, for correctness (n=189 ratings per model), DeepSeek-R1 received mostly top ratings (5: 69.8%, 4: 17.5%), while ChatGPT-o1 showed a broader distribution with fewer top scores (5: 51.9%, 4: 24.9%) and more mid- to low ratings. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e86974_fig05.png"/></fig><p>Across both LLMs combined, ratings of &#x201C;1&#x201D; occurred most often for completeness (overall n=9; n1=3 for DeepSeek-R1 vs n2=6 for ChatGPT-o1); ratings of &#x201C;2&#x201D; were concentrated in the didactic domain, particularly for learning facilitation (overall n=35; n1=2 for DeepSeek-R1 vs n2=33 for ChatGPT-o1) and explanation depth (overall n=30; n1=0 for DeepSeek-R1 vs n2=30 for ChatGPT-o1). 
In contrast, ratings of &#x201C;5&#x201D; were mostly given to correctness (overall n=230; n1=132 for DeepSeek-R1 vs n2=98 for ChatGPT-o1). Full frequency distributions are reported in Tables S2 (a)-(d) and S3 (a)-(i) in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. The mixed-effects sensitivity analysis confirmed all primary findings. DeepSeek-R1 scored significantly higher than ChatGPT-o1 across all 9 criteria (all Holm-corrected <italic>P</italic>&#x003C;.001), with the largest effects observed for explanation depth (&#x03B2;=&#x2212;1.44) and learning facilitation (&#x03B2;=&#x2212;1.06). Rater-level and question-level variance were modest relative to residual variance. All 9 significance conclusions were concordant with the aggregated Wilcoxon approach (Table S1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p></sec><sec id="s3-2"><title>Exploratory Analysis of Subspecialty-Level Performance</title><p>In this exploratory subspecialty-level analysis, DeepSeek-R1 consistently outperformed ChatGPT-o1 descriptively across all 9 imaging subspecialties (<xref ref-type="table" rid="table2">Table 2</xref>), with only a few similarly scored criteria for both models, for example, the correctness rating in interventional radiology (<xref ref-type="fig" rid="figure6">Figure 6</xref>). 
With only 3 question pairs per subspecialty, this exploratory analysis should be understood as hypothesis-generating only.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Overall ratings by imaging subspecialty for DeepSeek-R1 and ChatGPT-o1 (n=9 rating criteria, n=3 questions per subspecialty, n=7 raters).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Subspecialty</td><td align="left" valign="bottom">DeepSeek-R1, mean (SD)</td><td align="left" valign="bottom">ChatGPT-o1, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Thoracic imaging</td><td align="left" valign="top">4.54 (0.60)</td><td align="left" valign="top">3.60 (0.89)</td></tr><tr><td align="left" valign="top">Abdominal imaging</td><td align="left" valign="top">4.37 (0.80)</td><td align="left" valign="top">3.53 (0.95)</td></tr><tr><td align="left" valign="top">Oncological imaging</td><td align="left" valign="top">4.48 (0.73)</td><td align="left" valign="top">3.28 (0.99)</td></tr><tr><td align="left" valign="top">Cardiovascular imaging</td><td align="left" valign="top">4.62 (0.59)</td><td align="left" valign="top">4.01 (0.82)</td></tr><tr><td align="left" valign="top">Emergency imaging</td><td align="left" valign="top">4.79 (0.46)</td><td align="left" valign="top">3.92 (0.97)</td></tr><tr><td align="left" valign="top">Head and neck imaging</td><td align="left" valign="top">4.14 (1.13)</td><td align="left" valign="top">3.71 (1.08)</td></tr><tr><td align="left" valign="top">Musculoskeletal imaging</td><td align="left" valign="top">4.49 (0.67)</td><td align="left" valign="top">3.66 (1.06)</td></tr><tr><td align="left" valign="top">Neuroradiology</td><td align="left" valign="top">4.46 (0.66)</td><td align="left" valign="top">3.83 (0.89)</td></tr><tr><td align="left" valign="top">Interventional radiology</td><td align="left" valign="top">4.69 (0.47)</td><td align="left" valign="top">4.11 
(0.80)</td></tr></tbody></table></table-wrap><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Radar plot panel comparing mean values for DeepSeek-R1 and ChatGPT-o1 across 9 radiology subspecialties by all 7 raters.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e86974_fig06.png"/></fig></sec><sec id="s3-3"><title>Analysis of Rating Differences by Training Level (Junior vs Senior Residents)</title><p>No statistically significant differences were observed between junior and senior residents when pooling ratings across both LLMs for any of the 9 rating criteria (<xref ref-type="table" rid="table3">Table 3</xref> and Figure S1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparison of junior and senior resident ratings for both DeepSeek-R1 and ChatGPT-o1. Parameters are accumulated across all evaluation criteria, including Holm-corrected Mann-Whitney <italic>U</italic> test results.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Rating criteria</td><td align="left" valign="bottom">Junior residents (n=4)</td><td align="left" valign="bottom">Senior residents (n=3)</td><td align="left" valign="bottom">Mann-Whitney-<italic>U</italic> test with Holm correction</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Overall (all criteria)</td><td align="left" valign="top">4.15 (0.65)</td><td align="left" valign="top">4.08 (0.71)</td><td align="left" valign="top">.47</td></tr><tr><td align="left" valign="top">Factual accuracy</td><td align="left" valign="top">4.09 (0.65)</td><td align="left" valign="top">3.98 
(0.69)</td><td align="left" valign="top">.47</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Correctness</td><td align="left" valign="top">4.21 (0.61)</td><td align="left" valign="top">4.12 (0.69)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Completeness</td><td align="left" valign="top">3.93 (0.69)</td><td align="left" valign="top">3.85 (0.73)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Precision</td><td align="left" valign="top">4.13 (0.62)</td><td align="left" valign="top">3.96 (0.64)</td><td align="left" valign="top">.73</td></tr><tr><td align="left" valign="top">Clinical practicality</td><td align="left" valign="top">4.11 (0.70)</td><td align="left" valign="top">4.05 (0.82)</td><td align="left" valign="top">.79</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Comprehensibility</td><td align="left" valign="top">3.90 (0.79)</td><td align="left" valign="top">3.83 (0.91)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Clinical usefulness</td><td align="left" valign="top">4.18 (0.74)</td><td align="left" valign="top">4.20 (0.80)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Trustworthiness</td><td align="left" valign="top">4.25 (0.50)</td><td align="left" valign="top">4.11 (0.70)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top">Didactic value</td><td align="left" 
valign="top">4.26 (0.60)</td><td align="left" valign="top">4.21 (0.59)</td><td align="left" valign="top">.68</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Explanation depth</td><td align="left" valign="top">4.31 (0.75)</td><td align="left" valign="top">4.41 (0.63)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Structure</td><td align="left" valign="top">4.13 (0.50)</td><td align="left" valign="top">3.91 (0.58)</td><td align="left" valign="top">.32</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Learning facilitation</td><td align="left" valign="top">4.33 (0.48)</td><td align="left" valign="top">4.30 (0.44)</td><td align="left" valign="top">&#x2265;.99</td></tr></tbody></table></table-wrap><p>However, junior residents assigned descriptively higher scores in 7 out of 9 criteria, including precision (juniors 4.13, SD 0.62 vs seniors 3.96, SD 0.64; <italic>P</italic>=.73), trustworthiness (juniors 4.25, SD 0.50 vs seniors 4.11, SD 0.70; <italic>P</italic>&#x2265;.99), and structure (juniors 4.13, SD 0.50 vs seniors 3.91, SD 0.58; <italic>P</italic>=.32), although none of these differences were statistically significant. 
DeepSeek-R1 displayed no subgroup effects (Table S4 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>), while juniors rated ChatGPT-o1&#x2019;s overall performance (juniors 3.81, SD 0.64 vs seniors 3.63, SD 0.65; <italic>P</italic>=.02) and structure (juniors 4, SD 0.49 vs seniors 3.57, SD 0.51; <italic>P</italic>=.04) significantly higher (Table&#x202F;S5 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p></sec><sec id="s3-4"><title>Analysis of Image-Based Question Ratings</title><p>For ChatGPT-o1, image-based questions were rated markedly lower than text-based questions overall (3.19, SD 1.42 vs 3.73, SD 0.98; <italic>P</italic>=.007), driven by a significant drop in factual accuracy (<italic>P</italic>&#x003C;.001), especially correctness (<italic>P</italic>=.03), and in clinical practicality (<italic>P</italic>=.03), whereas didactic value was rated slightly higher for the image cases (<italic>P</italic>=.04), mostly due to higher scores in explanation depth (<italic>P</italic>=.01) (<xref ref-type="table" rid="table4">Table 4</xref>, <xref ref-type="fig" rid="figure7">Figure 7</xref>).</p><p>Low ratings (1&#x2013;2) accounted for 34.9% of ratings for image-based questions compared to 9.6% for text-based inputs (Table&#x202F;S6 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Comparison of ChatGPT-o1 ratings for text- versus image-based questions. 
Parameters are accumulated across all evaluation criteria using Holm-corrected Mann-Whitney <italic>U</italic> tests.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">ChatGPT-o1</td><td align="left" valign="bottom">Text-based questions (n=27)</td><td align="left" valign="bottom">Image-based questions (n=6)</td><td align="left" valign="bottom">Mann-Whitney-<italic>U</italic> test (Holm corrected)</td></tr><tr><td align="left" valign="bottom">Rating criteria</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Overall (all criteria)</td><td align="left" valign="top">3.73 (0.98)</td><td align="left" valign="top">3.19 (1.42)</td><td align="left" valign="top">.007</td></tr><tr><td align="left" valign="top">Factual accuracy</td><td align="left" valign="top">3.93 (1.02)</td><td align="left" valign="top">2.75 (1.45)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Correctness</td><td align="left" valign="top">4.19 (1.02)</td><td align="left" valign="top">2.26 (1.47)</td><td align="left" valign="top">.03</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Completeness</td><td align="left" valign="top">3.85 (1.06)</td><td align="left" valign="top">2.45 (1.45)</td><td align="left" valign="top">.15</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Precision</td><td align="left" valign="top">3.74 (0.91)</td><td align="left" valign="top">3.55 (1.06)</td><td align="left" valign="top">.97</td></tr><tr><td align="left" valign="top">Clinical practicality</td><td align="left" valign="top">3.84 
(0.88)</td><td align="left" valign="top">3.11 (1.47)</td><td align="left" valign="top">.03</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Comprehensibility</td><td align="left" valign="top">4 (0.77)</td><td align="left" valign="top">4.07 (0.95)</td><td align="left" valign="top">.97</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Clinical usefulness</td><td align="left" valign="top">3.70 (0.94)</td><td align="left" valign="top">2.95 (1.45)</td><td align="left" valign="top">.34</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Trustworthiness</td><td align="left" valign="top">3.81 (0.87)</td><td align="left" valign="top">2.31 (1.39)</td><td align="left" valign="top">.07</td></tr><tr><td align="left" valign="top">Didactic value</td><td align="left" valign="top">3.43 (0.96)</td><td align="left" valign="top">3.71 (1.19)</td><td align="left" valign="top">.04</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Explanation depth</td><td align="left" valign="top">3.15 (0.81)</td><td align="left" valign="top">3.98 (0.90)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Structure</td><td align="left" valign="top">3.79 (0.98)</td><td align="left" valign="top">4.07 (1)</td><td align="left" valign="top">.92</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Learning facilitation</td><td align="left" valign="top">3.37 (0.95)</td><td align="left" valign="top">3.07 (1.37)</td><td align="left" valign="top">.97</td></tr></tbody></table></table-wrap><fig position="float" 
id="figure7"><label>Figure 7.</label><caption><p>Line plot comparing the mean ratings of the ChatGPT-o1 model for text-based versus image-based questions across every evaluation criterion.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e86974_fig07.png"/></fig></sec><sec id="s3-5"><title>Interrater Reliability Analysis (ICC)</title><p>Single-rater agreement varied across criteria, with ICC(2) ranging from 0.216 (comprehensibility) to 0.608 (factual accuracy), reflecting fair-to-moderate reliability at the individual rater level. Mean rater reliability was consistently strong across all criteria (ICC(2,k)=0.659&#x2010;0.916) with an overall ICC(2,k) of 0.89 across all 9 criteria (Table S7 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study investigated the on-demand diagnostic performance as well as the educational use of 2 state-of-the-art reasoning LLMs, namely ChatGPT-o1 and DeepSeek-R1, across clinically relevant radiology questions. The results demonstrated that DeepSeek-R1 consistently and significantly outperformed ChatGPT-o1 across all evaluated domains, including factual accuracy, clinical practicality, and didactic value. Performance differences were particularly pronounced for didactic criteria such as explanation depth and learning facilitation. Subgroup analyses revealed no major effects of resident training level, indicating generalizable usability across experience levels. 
Overall, these findings suggest that reasoning LLMs may serve as useful tools for answering radiology-related clinical questions and supporting resident learning in simulated scenarios.</p></sec><sec id="s4-2"><title>Performance for Text-Based Questions</title><p>While prior studies have examined LLMs for radiology resident training in isolation [<xref ref-type="bibr" rid="ref29">29</xref>] or for other medical specialties and use-cases (eg, diagnostic reasoning or guideline-based classification tasks) [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref32">32</xref>], to our knowledge, this study presents the first comprehensive comparison within radiology education. Prior findings have been heterogeneous: DeepSeek-R1 outperformed ChatGPT-o1 and Llama 3.1-405B in step-wise diagnostic-reasoning explanations, whereas ChatGPT-o1 excelled at United States Medical Licensing Examination-style exams and radiology report summarization [<xref ref-type="bibr" rid="ref30">30</xref>]. Both models performed similarly on text-based clinical cases and CT report&#x2013;based Response Evaluation Criteria In Solid Tumors (RECIST) classifications [<xref ref-type="bibr" rid="ref30">30</xref>]. The performance gap in our study may be partially explained by different training strategies. While ChatGPT-o1&#x2019;s design and training remain largely undisclosed, DeepSeek-R1 uses supervised fine-tuning, reinforcement learning, and iterative refinement [<xref ref-type="bibr" rid="ref14">14</xref>]. This approach may support more structured reasoning and improve factual accuracy, which are important for addressing complex, domain-specific tasks in radiology. 
Furthermore, although ChatGPT-o1 scores slightly higher on the general-purpose massive multitask language understanding-pro benchmark [<xref ref-type="bibr" rid="ref15">15</xref>], our findings suggest that such metrics do not reliably predict performance in highly specialized contexts such as radiology resident education.</p><p>ChatGPT-o1 received nearly twice as many &#x201C;1&#x201D; and over 10 times more &#x201C;2&#x201D; ratings as DeepSeek-R1, mainly for completeness, learning facilitation, and explanation depth. Most &#x201C;5&#x201D; ratings were awarded for correctness, indicating that both models can be accurate but differ markedly in depth and educational value. Rare yet serious gaps in completeness can threaten patient safety, while didactic shortcomings may reduce learning impact. Ten of the 11 &#x201C;1&#x201D; ratings for DeepSeek-R1 originate from a single head-and-neck question describing a T2-hyperintense, well-defined, non-enhancing lesion at the mandibular angle. Both models incorrectly assumed osseous origin, failing to recognize the imaging pattern as consistent with a branchial cleft cyst. While more explicit phrasing specifying a soft-tissue lesion could have guided the models toward the correct diagnosis, the combination of MRI characteristics described represents a recognizable radiological pattern that a clinically experienced reader would be expected to correctly identify. The shared misinterpretation therefore reflects both the sensitivity of LLM outputs to prompt phrasing and a genuine limitation in contextual clinical reasoning, highlighting the importance of precise prompting in clinical and educational applications. Although DeepSeek-R1 outperformed ChatGPT-o1, it still produced 11 &#x201C;insufficient&#x201D; responses, underscoring the need for mandatory human verification. 
Infrequent but dangerous errors outweigh minor flaws, demanding layered safeguards such as automated uncertainty detection, structured review checklists, and human approval for high-risk cases.</p><p>ChatGPT-o1 received a &#x201C;1&#x201D; rating for providing insufficient detail (50 words vs DeepSeek-R1&#x2019;s 157 words). While previous studies have noted DeepSeek&#x2019;s tendency toward lengthy, repetitive responses [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref30">30</xref>], our findings demonstrate that DeepSeek-R1 received its highest rating advantage in explanation depth, reflecting that reviewers found its more detailed answers especially helpful for learning. While residents may favor concise, context-aware explanations for rapid comprehension, such compactness can hinder deeper learning by overlooking essential details and detailed reasoning required to understand complex radiological scenarios. To illustrate these differences in didactic value, 2 representative model outputs from emergency and cardiac imaging are displayed in <xref ref-type="fig" rid="figure8">Figure 8</xref>.</p><p>A potential verbosity bias may have specifically affected the didactic value ratings. Given that DeepSeek-R1 consistently produced longer responses and simultaneously received its greatest rating advantage in explanation depth and learning facilitation, criteria that are inherently susceptible to length-related perception effects, the possibility that raters equated response length with educational quality cannot be excluded. Future evaluations could explore structured rating frameworks that explicitly separate content quality from response length to mitigate this effect. Matching superiority across all questions, DeepSeek-R1 also appears to outperform ChatGPT-o1 in all 9 imaging subspecialties, with minor overlaps (eg, correctness in interventional radiology), likely due to a low number of questions per subspecialty (n=3). 
As noted in the &#x201C;Limitations&#x201D; section, the subspecialty analysis lacks statistical power for domain-specific conclusions. We used standardized single-turn, one-shot batch prompting to enable a strictly paired model comparison and avoid rater-dependent variability, which may render our findings a conservative estimate of real-world performance, where iterative re-prompting and answer refinement can further improve output quality.</p><fig position="float" id="figure8"><label>Figure 8.</label><caption><p>Representative model outputs from emergency (A) and cardiac (B) imaging prompts illustrating differences in explanation depth, reasoning structure, and didactic clarity between ChatGPT-o1 and DeepSeek-R1. The displayed prompts and model responses have been translated from German to English for publication purposes. CT: computed tomography.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e86974_fig08.png"/></fig></sec><sec id="s4-3"><title>ChatGPT-o1&#x2019;s Performance for Image-Based Questions</title><p>ChatGPT-o1&#x2019;s accuracy dropped on image-based cases, driven by declines in factual accuracy and clinical practicality, whereas its didactic scores rose slightly. ChatGPT-o1 failed to identify straightforward MRI findings, for example, mislabeling a lumbar disc herniation as spondylodiscitis and a meniscal tear as a Baker&#x2019;s cyst, echoing studies showing that GPT-4 handles text-only radiology questions far more accurately than those requiring image interpretation [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. This underscores that current large language models excel on text but struggle with visuals, likely due to limited paired image-text supervision and the complexity of visual signs. Several factors may have contributed to ChatGPT-o1&#x2019;s decreasing performance on image-based cases. 
First, our prompts were relatively generic and did not explicitly include specific image descriptions (eg, attenuation and enhancement) or clinical context (eg, patient history, laboratory values, or prior imaging), reducing the model&#x2019;s differential generation reliability. Second, the mismatch between the model&#x2019;s predominantly text-based and generic vision-language pretraining and our specialized radiologic cases (eg, high-field MRI spine sequences) may have impaired its ability to recognize expected imaging features. Third, limitations in image resolution and potential compression artifacts may have obscured subtle findings.</p></sec><sec id="s4-4"><title>Rating Differences by Training Level (Junior vs Senior Residents)</title><p>Junior residents awarded higher scores on 7 of 9 criteria (exceptions were clinical usefulness and explanation depth), yet overall ratings did not differ significantly from seniors. This may reflect (1) a clear, structured scoring rubric that guided consistent ratings across experience levels or (2) difficulties in detecting subtle differences in answer quality, something future studies could address through rater training or reference standards. Interestingly, junior residents rated ChatGPT-o1 significantly higher than seniors in both overall score and structure. This was not observed for DeepSeek-R1, possibly reflecting its more consistent, higher-quality output. In contrast, ChatGPT-o1&#x2019;s more variable style may have appeared more pedagogically accessible to juniors, while advanced residents judged it more critically. These findings do not imply equivalent usability across experience levels. Differences in scoring may instead reflect variation in comfort with artificial intelligence tools, especially LLMs, as well as differing expectations for detail and explanation depth, or greater familiarity of senior residents with typical diagnostic pitfalls and subtle error patterns. 
Further research with larger and more heterogeneous resident groups will be required to understand how LLM performance is perceived across stages of training. Single-rater reliability was moderate (ICC&#x2248;0.6), rising to near-excellent across 7 raters (ICC&#x2248;0.89) [<xref ref-type="bibr" rid="ref35">35</xref>]. In this study, our evaluation primarily reflects how trainees perceive LLM outputs, and future work should also involve attending radiologists to better characterize staff-level expectations and critical appraisal of model performance.</p></sec><sec id="s4-5"><title>Open-Source vs Proprietary Models</title><p>Open-source models such as DeepSeek-R1 prioritize transparency, local deployment, and custom fine-tuning, benefiting health care research and data-sensitive clinical applications, though they require greater technical resources and expertise. DeepSeek-R1&#x2019;s openly released weights and low resource requirements may allow large health care organizations to run the model locally for in-house fine-tuning and deployment. However, the limited information about its training data and the absence of training code raise open questions around data provenance, reproducibility, and responsible model governance [<xref ref-type="bibr" rid="ref16">16</xref>].</p></sec><sec id="s4-6"><title>Responsible Deployment for Global Health Care Equity</title><p>Our findings underscore the need to examine the broader implications of open-source LLM deployment for global health&#x2010;care equity. Although open&#x2010;source, locally deployed LLMs could reduce costs and support under-resourced regions, disparities in high-performance computing and high-quality data risk widening the digital divide [<xref ref-type="bibr" rid="ref36">36</xref>]. Developed regions may refine these technologies more rapidly, widening the gap with resource-constrained areas. 
Future research should extend beyond technical model benchmarking to explore open-source collaboration, local deployment, and low-cost optimizations that ensure equitable global health care delivery.</p><p>Additionally, policy measures, international cooperation, and resource-sharing initiatives must be explored to prevent monopolies and ensure that LLMs strengthen, rather than exacerbate, global health care and educational equity.</p></sec><sec id="s4-7"><title>Future Directions</title><p>Future research should explore how multimodal LLMs, capable of processing radiological images and accompanying text, could support more complex, realistic training scenarios. In addition, prospective studies comparing and combining LLM-assisted learning with conventional teaching methods are needed to assess long-term educational impact, for example through structured training modules with pre- and post-knowledge evaluations, longitudinal follow-up of resident performance, and objective structured clinical examinations incorporating LLM-supported components. LLMs could support resident training through real-time case questions and answers, automated learn card generation, dynamic case simulations (combining patient history and imaging), guided reporting practice and quality assurance (&#x201C;second-reads&#x201D;), and on-demand guideline summaries or quiz creation to keep residents up to date. Further work should optimize prompt strategies, evaluate curriculum integration, and assess user trust, error types, and practical safeguards for clinical use.</p></sec><sec id="s4-8"><title>Limitations</title><p>This study has several limitations. First, the rating scale, while multidimensional, remains subjective and is based on assessments from 7 residents at a single institution, which limits external validity. 
Since model outputs were evaluated solely by radiology residents (PGY 2&#x2010;5), subtle factual errors or hallucinations more readily identified by experienced attending radiologists may have gone undetected, potentially leading to an overestimation of factual accuracy and overly optimistic safety assessments. Future studies should include attending-level evaluation to establish a more rigorous and safety-relevant accuracy benchmark. Expert input came from one senior general radiologist and one neuroradiologist rather than a formal subspecialty panel.</p><p>Second, the small question set (27 text-based and 6 image-based) constrains generalizability, though it was kept intentionally compact given the extensive multicriteria evaluation required per response. With 3 question pairs per subspecialty, findings at this level are preliminary and should not be interpreted as generalizable conclusions across individual radiological domains. Additionally, 1 question included an explicit conciseness instruction that may have been in tension with depth-sensitive evaluation criteria, potentially disadvantaging ChatGPT-o1, the model that adhered more closely to that constraint. Future work should expand the question set and involve multi-institutional and subspecialty expert input.</p><p>Third, blinding integrity may have been partially compromised, as the consistently greater length and detail of DeepSeek-R1 responses compared to ChatGPT-o1 may have allowed raters to infer model identity from stylistic characteristics alone. Should raters have perceived longer responses as more comprehensive, this could have introduced a systematic bias in favor of DeepSeek-R1, potentially contributing to the observed performance gap between the 2 models.</p><p>Fourth, image-based questions were sourced from Radiopaedia.org, a publicly accessible platform potentially indexed during model pretraining, introducing a data contamination risk for the image-based results. 
Similarly, while text-based questions were derived from clinical experience, the underlying declarative medical knowledge was likely present in the models&#x2019; pretraining corpora.</p><p>Fifth, the comparison of ChatGPT-o1&#x2019;s text-based and image-based performance is subject to a notable confound: the 2 conditions involve different clinical cases, and lower image-based scores may in part reflect the specific difficulty of those 6 cases in addition to potential visual processing limitations. A matched design presenting identical cases as both image inputs and text descriptions would be required to fully isolate modality effects.</p><p>Sixth, as all questions were submitted in German, observed performance differences may partly reflect differential multilingual robustness of the 2 models rather than domain-specific reasoning capability alone. While this choice reflects real-world usage conditions for German-speaking residents, generalizability to other language settings remains limited and should be addressed in future studies.</p><p>Seventh, the inherently subjective nature of the didactic and clinical practicality criteria limits the extent to which standardized reference answers can ensure consistent scoring, potentially introducing additional inter-rater variability.</p><p>Eighth, rapid LLM evolution may limit our findings&#x2019; applicability to upcoming model versions.</p></sec><sec id="s4-9"><title>Conclusions</title><p>This study demonstrates that DeepSeek-R1 significantly outperformed ChatGPT-o1 across all 3 dimensions: factual accuracy, clinical practicality, and didactic value for text-based radiology questions, with highly significant differences across all 9 rating criteria. For ChatGPT-o1, image-based performance was significantly lower than text-based performance, particularly in factual accuracy and clinical practicality. 
No statistically significant differences were observed between junior and senior resident raters when pooling both models. These findings provide early, controlled evidence that reasoning LLMs can produce clinically and educationally relevant responses to radiology residency questions, while also demonstrating the limitations of current vision-language capabilities. Both models still produced factually insufficient responses in a subset of cases, underscoring the continued necessity of human expert oversight.</p></sec></sec></body><back><ack><p>The authors used large language models (Claude Sonnet 4.6, Anthropic; ChatGPT-o1 and 5.1, OpenAI; DeepSeek-R1, DeepSeek) for language editing and figure preparation, as well as in the context of the study itself, as described in the manuscript. All AI-assisted content was critically reviewed and approved by the authors.</p></ack><notes><sec><title>Funding</title><p>This research received no specific funding. Any funding held by individual authors is disclosed in the Conflicts of Interest section and had no role in the design, conduct, or publication of this study.</p></sec><sec><title>Data Availability</title><p>The full list of questions is provided in the supplementary material <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>; responses and evaluations are available from the corresponding author upon reasonable request and will be provided without undue restriction.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: SE (lead), TP (equal)</p><p>Data curation: SE (lead), RS (equal), BL (equal), ML (equal), AMH (equal), AB (equal), MB (equal), ITS (equal), JN (equal), TP (equal)</p><p>Formal analysis: SE (lead), AD (supporting)</p><p>Investigation: SE (lead), TP (supporting)</p><p>Methodology: SE (lead), JN (equal), TP (equal)</p><p>Project administration: TP (lead), SE (equal)</p><p>Resources: TP</p><p>Software: SE (lead), TP (equal)</p><p>Supervision: TP (lead), JN 
(equal)</p><p>Validation: SE (lead), TP (supporting)</p><p>Visualization: SE</p><p>Writing &#x2013; original draft: SE</p><p>Writing &#x2013; review and editing: SE (lead), RS (supporting), BL (supporting), ML (supporting), AMH (supporting), AB (supporting), MB (supporting), ITS (supporting), AD (supporting), JN (supporting), TP (supporting)</p></fn><fn fn-type="conflict"><p>The authors have no competing interests that might be perceived to influence the results reported in this paper. TP receives funding from Berlin Institute of Health (Advanced Clinician Scientist Grant, Platform Grant), Ministry of Education and Research (BMBF, 01KX2021 (RACOON), 01KX2121 ("NUM 2.0", RACOON), 01KX2524 (NUM 3.0), 68GX21001A, 01ZZ2315D), German Research Foundation (DFG, SFB 1340/2), European Union (H2020, CHAIMELEON: 952172, DIGITAL, EUCAIM:101100633, ARTEMIS). TP reports research agreements (no personal payments, outside of the submitted work) with AGO Research GmbH, Aravive, Inc, ARCAGY-GINECO, Astellas Pharma Global Development Inc, AstraZeneca AB, AstraZeneca GmbH, Clovis Oncology, EQRx International, Inc, F Hoffmann-La Roche Ltd, GlaxoSmithKline Research &#x0026; Development Limited, Grupo Espa&#x00F1;ol de Investigaci&#x00F3;n en C&#x00E1;ncer de Ovario (GEICO), ImmunoGen Inc, Incyte Corporation, Karyopharm Therapeutics, Mario Negri Gynecology Oncology Group (MaNGO) (111), Merck KGaA, Merck Sharp &#x0026; Dohme Corp, NOGGO eV, Nordic Society of Gynaecological Oncology &#x2013; Clinical Trial Unit (NSGO-CTU), Novartis Pharma GmbH, NovoCure GmbH, Sutro Biopharma, Inc, TESARO Inc, TORL Biotherapeutics, LLC, Tubulis GmbH, Universit&#x00E4;tspoliklinik A Gemelli, and Verastem Inc, as well as fees for a book translation (Elsevier) and speaking engagements (Bayer Healthcare). 
JN receives funding from Berlin Institute of Health (Digital Health Accelerator) and the European Union&#x2019;s Horizon Europe programme (COMFORT, 101079894), and reports personal fees from Eppdata GmbH outside the submitted work.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CT</term><def><p>computed tomography</p></def></def-item><def-item><term id="abb2">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">MRI</term><def><p>magnetic resonance imaging</p></def></def-item><def-item><term id="abb5">PGY</term><def><p>postgraduate year</p></def></def-item><def-item><term id="abb6">RECIST</term><def><p>Response Evaluation Criteria In Solid Tumors</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dan Lantsman</surname><given-names>C</given-names> </name><name name-style="western"><surname>Barash</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name><name name-style="western"><surname>Guranda</surname><given-names>L</given-names> </name><name name-style="western"><surname>Konen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Tau</surname><given-names>N</given-names> </name></person-group><article-title>Trend in radiologist workload compared to number of admissions in the emergency department</article-title><source>Eur J Radiol</source><year>2022</year><month>04</month><volume>149</volume><fpage>110195</fpage><pub-id pub-id-type="doi">10.1016/j.ejrad.2022.110195</pub-id><pub-id pub-id-type="medline">35149337</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Bruls</surname><given-names>RJM</given-names> </name><name name-style="western"><surname>Kwee</surname><given-names>RM</given-names> </name></person-group><article-title>Workload for radiologists during on-call hours: dramatic increase in the past 15 years</article-title><source>Insights Imaging</source><year>2020</year><month>11</month><day>23</day><volume>11</volume><issue>1</issue><fpage>121</fpage><pub-id pub-id-type="doi">10.1186/s13244-020-00925-z</pub-id><pub-id pub-id-type="medline">33226490</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chokshi</surname><given-names>FH</given-names> </name><name name-style="western"><surname>Hughes</surname><given-names>DR</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Mullins</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Hawkins</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Duszak</surname><given-names>R</given-names> </name></person-group><article-title>Diagnostic radiology resident and fellow workloads: a 12-year longitudinal trend analysis using national Medicare aggregate claims data</article-title><source>J Am Coll Radiol</source><year>2015</year><month>07</month><volume>12</volume><issue>7</issue><fpage>664</fpage><lpage>669</lpage><pub-id pub-id-type="doi">10.1016/j.jacr.2015.02.009</pub-id><pub-id pub-id-type="medline">25972250</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Bai</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Burnout among radiology residents: a systematic review and meta-analysis</article-title><source>Eur Radiol</source><year>2024</year><month>02</month><volume>34</volume><issue>2</issue><fpage>1399</fpage><lpage>1407</lpage><pub-id pub-id-type="doi">10.1007/s00330-023-09986-2</pub-id><pub-id pub-id-type="medline">37589905</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chetlen</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>TL</given-names> </name><name name-style="western"><surname>Ballard</surname><given-names>DH</given-names> </name><etal/></person-group><article-title>Addressing burnout in radiologists</article-title><source>Acad Radiol</source><year>2019</year><month>04</month><volume>26</volume><issue>4</issue><fpage>526</fpage><lpage>533</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2018.07.001</pub-id><pub-id pub-id-type="medline">30711406</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bin Dahmash</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alorfi</surname><given-names>FK</given-names> </name><name name-style="western"><surname>Alharbi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Aldayel</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kamel</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Almoaiqel</surname><given-names>M</given-names> </name></person-group><article-title>Burnout phenomenon and its predictors in radiology residents</article-title><source>Acad 
Radiol</source><year>2020</year><month>07</month><volume>27</volume><issue>7</issue><fpage>1033</fpage><lpage>1039</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2019.09.024</pub-id><pub-id pub-id-type="medline">31629625</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lyo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mohan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hassankhani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Noor</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dako</surname><given-names>F</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>T</given-names> </name></person-group><article-title>From revisions to insights: converting radiology report revisions into actionable educational feedback using generative AI models</article-title><source>J Imaging Inform Med</source><year>2025</year><month>04</month><volume>38</volume><issue>2</issue><fpage>1265</fpage><lpage>1279</lpage><pub-id pub-id-type="doi">10.1007/s10278-024-01233-4</pub-id><pub-id pub-id-type="medline">39160366</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ballard</surname><given-names>DH</given-names> </name><name name-style="western"><surname>Antigua-Made</surname><given-names>A</given-names> </name><name name-style="western"><surname>Barre</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Impact of ChatGPT and large language models on radiology education: Association of Academic Radiology-Radiology Research Alliance Task Force white paper</article-title><source>Acad 
Radiol</source><year>2025</year><month>05</month><volume>32</volume><issue>5</issue><fpage>3039</fpage><lpage>3049</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2024.10.023</pub-id><pub-id pub-id-type="medline">39616097</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Me&#x015F;e</surname><given-names>&#x0130;</given-names> </name><name name-style="western"><surname>Alt&#x0131;nta&#x015F; Ta&#x015F;l&#x0131;&#x00E7;ay</surname><given-names>C</given-names> </name><name name-style="western"><surname>Kuzan</surname><given-names>BN</given-names> </name><name name-style="western"><surname>Kuzan</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Sivrio&#x011F;lu</surname><given-names>AK</given-names> </name></person-group><article-title>Educating the next generation of radiologists: a comparative report of ChatGPT and e-learning resources</article-title><source>Diagn Interv Radiol</source><year>2024</year><month>05</month><day>13</day><volume>30</volume><issue>3</issue><fpage>163</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.4274/dir.2023.232496</pub-id><pub-id pub-id-type="medline">38145370</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Sidamon-Eristoff</surname><given-names>AE</given-names> </name><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chartash</surname><given-names>D</given-names> </name></person-group><article-title>The role of large language models in medical education: applications and implications</article-title><source>JMIR Med 
Educ</source><year>2023</year><month>08</month><day>14</day><volume>9</volume><fpage>e50945</fpage><pub-id pub-id-type="doi">10.2196/50945</pub-id><pub-id pub-id-type="medline">37578830</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lucas</surname><given-names>HC</given-names> </name><name name-style="western"><surname>Upperman</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Robinson</surname><given-names>JR</given-names> </name></person-group><article-title>A systematic review of large language models and their implications in medical education</article-title><source>Med Educ</source><year>2024</year><month>11</month><volume>58</volume><issue>11</issue><fpage>1276</fpage><lpage>1285</lpage><pub-id pub-id-type="doi">10.1111/medu.15402</pub-id><pub-id pub-id-type="medline">38639098</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Achiam</surname><given-names>J</given-names></name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 4, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="web"><article-title>ChatGPT-o1 (large language model)</article-title><source>OpenAI</source><year>2024</year><access-date>2025-03-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com">https://openai.com</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><etal/></person-group><article-title>DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 4, 2026</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.12948</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><article-title>MMLU-Pro leaderboard</article-title><source>Hugging Face</source><access-date>2026-06-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro">https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Shih</surname><given-names>G</given-names> </name></person-group><article-title>DeepSeek is open-access and the next AI disrupter for radiology</article-title><source>Radiol Adv</source><year>2025</year><month>01</month><volume>2</volume><issue>1</issue><fpage>umaf009</fpage><pub-id pub-id-type="doi">10.1093/radadv/umaf009</pub-id><pub-id pub-id-type="medline">40837590</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name></person-group><article-title>A tutorial on LLM reasoning: relevant methods behind ChatGPT 
o1</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 15, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.10867</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Gaillard</surname><given-names>F</given-names> </name></person-group><article-title>Horizontal meniscal tear - horizontal, case study</article-title><source>Radiopaedia</source><access-date>2025-03-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.53347/rID-6361">https://doi.org/10.53347/rID-6361</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>N</given-names> </name></person-group><article-title>Acute pancreatitis, case study</article-title><source>Radiopaedia</source><access-date>2025-03-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.53347/rID-11083">https://doi.org/10.53347/rID-11083</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Walizai</surname><given-names>T</given-names> </name></person-group><article-title>Lumbar disc herniation with caudal migration, case study</article-title><source>Radiopaedia</source><access-date>2025-03-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.53347/rID-169524">https://doi.org/10.53347/rID-169524</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Osman</surname><given-names>M</given-names> </name></person-group><article-title>Hepatocellular carcinoma, case 
study</article-title><source>Radiopaedia</source><access-date>2025-03-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.53347/rID-23929">https://doi.org/10.53347/rID-23929</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Gaillard</surname><given-names>F</given-names> </name></person-group><article-title>Multiple sclerosis &#x2013; Dawson&#x2019;s fingers, case study</article-title><source>Radiopaedia</source><access-date>2025-03-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.53347/rID-35916">https://doi.org/10.53347/rID-35916</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Gaillard</surname><given-names>F</given-names> </name></person-group><article-title>Multiple sclerosis, case study</article-title><source>Radiopaedia</source><access-date>2025-03-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.53347/rID-2634">https://doi.org/10.53347/rID-2634</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Puy&#x00F3; Vera</surname><given-names>D</given-names> </name></person-group><article-title>Subarachnoid haemorrhage, case study</article-title><source>Radiopaedia</source><access-date>2025-03-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.53347/rID-22770">https://doi.org/10.53347/rID-22770</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>Attribution-NonCommercial-ShareAlike 3.0 Unported</article-title><source>Creative 
Commons</source><access-date>2026-06-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by-nc-sa/3.0/deed.en">https://creativecommons.org/licenses/by-nc-sa/3.0/deed.en</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="book"><person-group person-group-type="editor"><name name-style="western"><surname>Sun</surname><given-names>EX</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mandell</surname><given-names>JC</given-names> </name></person-group><source>Core Radiology: A Visual Approach to Diagnostic Imaging</source><year>2021</year><publisher-name>Cambridge University Press</publisher-name><pub-id pub-id-type="doi">10.1017/9781108966450</pub-id><pub-id pub-id-type="other">978-1-108-96591-0</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><source>Radiopaedia</source><access-date>2025-03-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://radiopaedia.org">https://radiopaedia.org</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><source>The Radiology Assistant</source><access-date>2025-03-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://radiologyassistant.nl">https://radiologyassistant.nl</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Saliba</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ferrari</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pozzessere</surname><given-names>C</given-names> </name><name name-style="western"><surname>Rotzinger</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Fahrni</surname><given-names>G</given-names> </name></person-group><article-title>Can advanced large language models support radiology training? A performance assessment of DeepSeek R1</article-title><source>European Journal of Radiology Artificial Intelligence</source><year>2025</year><month>09</month><volume>3</volume><fpage>100024</fpage><pub-id pub-id-type="doi">10.1016/j.ejrai.2025.100024</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tordjman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yuce</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparative benchmarking of the DeepSeek large language model on medical tasks and clinical reasoning</article-title><source>Nat Med</source><year>2025</year><month>08</month><volume>31</volume><issue>8</issue><fpage>2550</fpage><lpage>2555</lpage><pub-id pub-id-type="doi">10.1038/s41591-025-03726-3</pub-id><pub-id pub-id-type="medline">40267969</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Faray de Paiva</surname><given-names>L</given-names> </name><name name-style="western"><surname>Luijten</surname><given-names>G</given-names> </name><name name-style="western"><surname>Puladi</surname><given-names>B</given-names> </name><name name-style="western"><surname>Egger</surname><given-names>J</given-names> </name></person-group><article-title>How does DeepSeek-R1 perform on USMLE?</article-title><source>medRxiv</source><comment>Preprint posted online on  Feb 10, 2025</comment><pub-id pub-id-type="doi">10.1101/2025.02.06.25321749</pub-id></nlm-citation></ref><ref 
id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mikhail</surname><given-names>D</given-names> </name><name name-style="western"><surname>Farah</surname><given-names>A</given-names> </name><name name-style="western"><surname>Milad</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Performance of DeepSeek-R1 in ophthalmology: an evaluation of clinical decision-making and cost-effectiveness</article-title><source>Br J Ophthalmol</source><year>2025</year><month>08</month><day>20</day><volume>109</volume><issue>9</issue><fpage>976</fpage><lpage>981</lpage><pub-id pub-id-type="doi">10.1136/bjo-2025-327360</pub-id><pub-id pub-id-type="medline">40701781</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hayden</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gilbert</surname><given-names>S</given-names> </name><name name-style="western"><surname>Poisson</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Griffith</surname><given-names>B</given-names> </name><name name-style="western"><surname>Klochko</surname><given-names>C</given-names> </name></person-group><article-title>Performance of GPT-4 with vision on text- and image-based ACR diagnostic radiology in-training examination questions</article-title><source>Radiology</source><year>2024</year><month>09</month><volume>312</volume><issue>3</issue><fpage>e240153</fpage><pub-id pub-id-type="doi">10.1148/radiol.240153</pub-id><pub-id pub-id-type="medline">39225605</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Payne</surname><given-names>DL</given-names> </name><name 
name-style="western"><surname>Purohit</surname><given-names>K</given-names> </name><name name-style="western"><surname>Borrero</surname><given-names>WM</given-names> </name><etal/></person-group><article-title>Performance of GPT-4 on the American College of Radiology in-training examination: evaluating accuracy, model drift, and fine-tuning</article-title><source>Acad Radiol</source><year>2024</year><month>07</month><volume>31</volume><issue>7</issue><fpage>3046</fpage><lpage>3054</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2024.04.006</pub-id><pub-id pub-id-type="medline">38653599</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Koo</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Li</surname><given-names>MY</given-names> </name></person-group><article-title>A guideline of selecting and reporting intraclass correlation coefficients for reliability research</article-title><source>J Chiropr Med</source><year>2016</year><month>06</month><volume>15</volume><issue>2</issue><fpage>155</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1016/j.jcm.2016.02.012</pub-id><pub-id pub-id-type="medline">27330520</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Meng</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Surge in large language models exacerbates global regional healthcare inequalities</article-title><source>J Transl Med</source><year>2025</year><month>07</month><day>1</day><volume>23</volume><issue>1</issue><fpage>706</fpage><pub-id 
pub-id-type="doi">10.1186/s12967-025-06751-5</pub-id><pub-id pub-id-type="medline">40597368</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Complete list of questions submitted to DeepSeek-R1 and ChatGPT-o1.</p><media xlink:href="ai_v5i1e86974_app1.docx" xlink:title="DOCX File, 19 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Rating criteria definitions and Likert scale anchors.</p><media xlink:href="ai_v5i1e86974_app2.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Additional statistical tables and figures.</p><media xlink:href="ai_v5i1e86974_app3.docx" xlink:title="DOCX File, 225 KB"/></supplementary-material></app-group></back></article>