<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e76372</article-id><article-id pub-id-type="doi">10.2196/76372</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Detection of Medical Misinformation in Hemangioma Patient Education: Comparative Study of ChatGPT-4o and DeepSeek-R1 Large Language Models</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Guoyong</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Ye</given-names></name><degrees>MD, PHD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Weixin</given-names></name><degrees>MD, PHD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhu</surname><given-names>Yingjie</given-names></name><degrees>MD</degrees><xref ref-type="aff" 
rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lu</surname><given-names>Wei</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Chaonan</given-names></name><degrees>MD, PHD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bi</surname><given-names>Hui</given-names></name><degrees>MD, PHD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Yang</surname><given-names>Xiaonan</given-names></name><degrees>MD, PHD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Hemangioma and Vascular Malformation, Plastic Surgery Hospital, Chinese Academy of Medical Sciences and Peking Union Medical College</institution><addr-line>33 Badachu Road, Shijingshan District</addr-line><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Internal Medicine, Plastic Surgery Hospital, Chinese Academy of Medical Sciences and Peking Union Medical College</institution><addr-line>Beijing</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Wang</surname><given-names>Yanshan</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Zhang</surname><given-names>Kaijun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Biswas</surname><given-names>Sandipan</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Zhang</surname><given-names>Yunxuan</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Xiaonan Yang, MD, PHD, Department of Hemangioma and Vascular Malformation, Plastic Surgery Hospital, Chinese Academy of Medical Sciences and Peking Union Medical College, 33 Badachu Road, Shijingshan District, Beijing, 100144, China, 86 18810601889, 86 01053968149; <email>yxnan@aliyun.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>18</day><month>11</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e76372</elocation-id><history><date date-type="received"><day>24</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>31</day><month>08</month><year>2025</year></date><date date-type="accepted"><day>01</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Guoyong Wang, Ye Zhang, Weixin Wang, Yingjie Zhu, Wei Lu, Chaonan Wang, Hui Bi, Xiaonan Yang. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 18.11.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e76372"/><abstract><sec><title>Background</title><p>This study examines the capability of large language models (LLMs) in detecting medical rumors, using hemangioma-related information as an example. It compares the performances of ChatGPT-4o and DeepSeek-R1.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate and compare the accuracy, stability, and expert-rated reliability of 2 LLMs, ChatGPT-4o and DeepSeek-R1, in classifying medical information related to hemangiomas as either &#x201C;rumors&#x201D; or &#x201C;accurate information.&#x201D;</p></sec><sec sec-type="methods"><title>Methods</title><p>We collected 82 publicly available texts from social media platforms, medical education websites, international guidelines, and journals. Of the 82 items, 47/82 (57%) were labeled as &#x201C;rumors,&#x201D; and 35/82 (43%) were labeled as &#x201C;accurate information.&#x201D; Three vascular anomaly specialists with extensive clinical experience independently annotated the texts in a double-blinded manner, and disagreements were resolved by arbitration to ensure labeling reliability. Subsequently, these texts were input into ChatGPT-4o and DeepSeek-R1, with each model generating 2 rounds of results under identical instructions. Output stability was assessed using bidirectional encoder representations from transformers&#x2013;based semantic similarity scores. Classification accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score were calculated to evaluate the performance. Additionally, 2 medical experts independently rated the model outputs using a 5-point scale based on clinical guidelines. 
Statistical analyses included paired <italic>t</italic> tests, Wilcoxon signed-rank tests, and bootstrap resampling to compute confidence intervals.</p></sec><sec sec-type="results"><title>Results</title><p>In terms of semantic stability, the similarity distributions for the 2 models largely overlapped, with no statistically significant difference observed (mean difference=&#x2212;0.003, 95% CI &#x2212;0.011 to 0.005; <italic>P</italic>=.30). Regarding classification performance, DeepSeek-R1 achieved higher accuracy (0.963) compared to ChatGPT-4o (0.910), and also performed better in terms of precision (0.978 vs 0.940), recall (0.957 vs 0.894), and <italic>F</italic><sub>1</sub>-score (0.967 vs 0.916). Expert evaluations revealed that DeepSeek-R1 significantly outperformed ChatGPT-4o on both &#x201C;rumor&#x201D; items (mean difference=0.431; <italic>P</italic>&#x003C;.001; Cohen <italic>d<sub>z</sub></italic>=0.594) and &#x201C;accurate information&#x201D; items (mean difference=0.264; <italic>P</italic>=.045; Cohen <italic>d<sub>z</sub></italic>=0.352), with a particularly pronounced advantage in rumor detection.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>DeepSeek-R1 demonstrated greater accuracy and rationale in detecting medical rumors compared with ChatGPT-4o. 
This study provides empirical support for the application of LLMs and recommends optimizing accuracy and incorporating real-time verification mechanisms to mitigate the harmful impact of misleading information on patient health.</p></sec></abstract><kwd-group><kwd>medical rumors</kwd><kwd>large language models</kwd><kwd>hemangioma</kwd><kwd>semantic similarity</kwd><kwd>classification performance</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>In recent years, artificial intelligence (AI) has drawn considerable attention in detecting medical and health-related rumors [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Some studies have conducted systematic reviews on the application of AI technologies, such as text mining and machine learning, for the automatic identification of health misinformation [<xref ref-type="bibr" rid="ref3">3</xref>]. Nonetheless, recognizing medical rumors remains a challenge due to the scarcity of high-quality specialized datasets and the extensive effort required by medical experts for annotation [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], making it difficult to train highly accurate rumor detection models. Moreover, as conversational AI assistants become increasingly integrated with and partially replace traditional search engine functionalities, more individuals are turning to chatbots for medical information [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. 
However, current large language models (LLMs) lack robust verification mechanisms and often struggle to differentiate genuine from false medical information, frequently producing factually incorrect or imprecise answers&#x2014;commonly known as &#x201C;hallucinations&#x201D; [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. In the medical field, the risks posed by misinformation are particularly severe, as misleading content can undermine trust in health care systems, alter treatment decisions, and even lead patients to delay or reject scientifically validated therapies, opting instead for unsupported and potentially harmful treatments [<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>To ground our investigation concretely, we focused on vascular tumors and malformations&#x2014;a field where rapidly evolving medical classifications often cause significant public confusion and misinformation [<xref ref-type="bibr" rid="ref11">11</xref>]. The International Society for the Study of Vascular Anomalies classification is continuously updated, with the 2025 edition significantly revising its 2018 predecessor by introducing a new category, potentially unique vascular anomaly, incorporating multiple genetic syndromes into the classification framework, and implementing extensive terminology revisions. Such frequent updates complicate both clinical diagnosis and public comprehension [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. A prominent example is the lesion previously termed &#x201C;cavernous hemangioma,&#x201D; which has now been redefined as a subtype of &#x201C;venous malformation.&#x201D; However, outdated terminology persists widely in patient forums and online sources, creating a gap between current medical standards and lay perceptions. 
This misinformation can lead directly to clinical risks, such as misdiagnosis, delayed treatments, or unnecessary interventions, highlighting the critical need to address inaccuracies and outdated information [<xref ref-type="bibr" rid="ref13">13</xref>].</p><p>In this context, our study selected 2 widely adopted conversational AI models&#x2014;OpenAI&#x2019;s ChatGPT-4o and the open-source DeepSeek-R1&#x2014;as research subjects [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. This combination not only represents the 2 primary development trajectories (closed-source versus open-source) of contemporary LLMs but also establishes a baseline task for subsequent benchmarking, allowing future studies to incorporate additional LLMs and facilitate longitudinal comparability. We conducted a classification evaluation of medical statements concerning hemangiomas and vascular malformations, focusing particularly on the models&#x2019; ability to identify incorrect medical claims (rumors). By comparing the performance of these 2 models on relevant statements, our research aims to evaluate the current capabilities and limitations of AI models in verifying medical information and to provide insights for enhancing rumor-detection capabilities in medical AI systems in future work.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Overview</title><p>Our study used publicly available texts from global social media platforms (eg, Reddit, Zhihu, and Weibo); medical education websites (eg, WebMD, Mayo Clinic, and HaoDF or HaoDaifu Online), the International Society for the Study of Vascular Anomalies classification resources, relevant guidelines, and medical journals (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
In total, 82 statements were collected, with 47 (57%) classified as &#x201C;rumors&#x201D; and 35 (43%) as &#x201C;accurate information.&#x201D; These statements covered key educational aspects of patients with hemangiomas and vascular malformations, including (1) nomenclature and classification, (2) pathogenesis and natural history, (3) risk stratification and complications, (4) assessment and referral, (5) treatment and peritreatment issues, and (6) prognosis and follow-up. All texts collected were independently reviewed by medical experts and labeled as either &#x201C;rumors&#x201D; or &#x201C;accurate information,&#x201D; based on guideline-supported factual accuracy. <xref ref-type="fig" rid="figure1">Figure 1</xref> provides an overview of the study workflow.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Research methodology framework. BERT: bidirectional encoder representations from transformers; ISSVA: International Society for the Study of Vascular Anomalies.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e76372_fig01.png"/></fig></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study used only publicly available, nonidentifiable text data and did not involve clinical interventions, access to medical records, or collection of personal identifiers. In accordance with the Measures for the Ethical Review of Life Science and Medical Research Involving Humans, research using lawfully obtained public data or anonymized information may be exempt from ethics review (Article 32). Therefore, an ethics application was not required for this study [<xref ref-type="bibr" rid="ref16">16</xref>]. Since the data were public and nonidentifiable, informed consent was not required. 
No compensation was provided to any individuals in relation to this study.</p></sec><sec id="s2-3"><title>Data Collection and Annotation</title><p>Two medical experts specializing in vascular anomalies (with 5 and 10 y of clinical experience, respectively) independently reviewed and labeled each statement as either &#x201C;rumor&#x201D; or &#x201C;accurate information.&#x201D; To minimize bias, all items were anonymized by removing source identifiers and engagement metrics prior to labeling, and annotators remained double-blinded to each other&#x2019;s decisions. In cases of disagreement, arbitration was conducted by a third medical expert with 15 years of clinical experience, resulting in a unified set of labels and ensuring labeling reliability. Potential biases were mitigated through independent dual review, third-party arbitration, and prespecified labeling guidelines.</p></sec><sec id="s2-4"><title>Model Testing</title><p>After labeling, the texts were input into 2 LLMs&#x2014;ChatGPT-4o and DeepSeek-R1&#x2014;for testing. The process is presented in <xref ref-type="other" rid="box1">Textbox 1</xref>.</p><boxed-text id="box1"><title> Model testing process.</title><list list-type="bullet"><list-item><p>Prompts and outputs: to minimize bias introduced by variations in prompting and to highlight baseline comparability, both models received the identical concise instruction: &#x201C;evaluate the following statement for accuracy and reliability in the context of hemangioma and vascular malformation treatment.&#x201D; Each model classified the texts as either &#x201C;rumor&#x201D; or &#x201C;accurate information,&#x201D; accompanied by a brief rationale (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p></list-item><list-item><p>Multiple rounds of generation: to reduce the effects of random output, each model generated results twice for each text. 
A bidirectional encoder representations from transformers model was then used to compute the semantic similarity of these 2 outputs to assess the stability of the model&#x2019;s performance under identical inputs.</p></list-item></list></boxed-text></sec><sec id="s2-5"><title>Expert Scoring</title><p>In addition to classification results, 2 medical experts independently assessed the compliance of each model&#x2019;s output with clinical guidelines. Evaluations were performed using a 5-point Likert scale (1= highly noncompliant, 5=highly reasonable). The medical experts remained blinded to both the model identities (ChatGPT-4o vs DeepSeek-R1) and each other&#x2019;s scores. Detailed scoring criteria are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p></sec><sec id="s2-6"><title>Statistical Analysis</title><sec id="s2-6-1"><title>Semantic Similarity and Stability</title><p>Semantic stability was assessed by calculating bidirectional encoder representations from transformers (BERT)&#x2013;based similarity scores between 2 independently generated outputs for each statement (see <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> for detailed code). Descriptive statistics, including means, SDs, medians, and IQRs, were reported. Differences between models were compared using paired Wilcoxon signed-rank tests (due to partially nonnormal distributions). Additionally, 95% bias-corrected and accelerated CIs for mean differences were computed via 10,000 bootstrap resamples to ensure robust interval estimation.</p></sec><sec id="s2-6-2"><title>Classification Performance</title><p>Classification accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-scores were calculated based on standard definitions, with error distributions visualized using confusion matrices. 
This approach allows comprehensive evaluation of global and class-specific performance and is particularly suitable for scenarios involving class imbalance.</p></sec><sec id="s2-6-3"><title>Expert Ratings</title><p>Two clinical experts independently provided ratings on a 5-point Likert scale for each of the 82 statements (47 rumors and 35 accurate statements) in 2 separate rounds. The mean rating for each item was computed as the final score. For each model, descriptive statistics such as mean (SD) and 95% CIs were calculated, treating each statement as an independent unit. Between-model comparisons were performed using paired 2-tailed <italic>t</italic> tests (assuming normality of differences) supplemented by Wilcoxon signed-rank tests as a robust alternative, with Cohen <italic>d<sub>z</sub></italic> effect sizes reported. Within-model comparisons between &#x201C;rumors&#x201D; and &#x201C;accurate information&#x201D; were conducted using Welch <italic>t</italic> test to account for unequal sample sizes and potential variance heterogeneity. Reviewer agreement and reliability were assessed using Cronbach &#x03B1; and intraclass correlation coefficients (ICCs), ICC(2,1)/ICC(2,k). All tests were 2-tailed, with statistical significance defined as <italic>P</italic>&#x003C;.05.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>This study systematically compared the performance of ChatGPT-4o and DeepSeek-R1 in classifying statements related to hemangiomas and vascular malformations across three dimensions: (1) the stability of 2 independent outputs, assessed using BERT-based semantic similarity metrics; (2) classification performance, evaluated by accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score; and (3) clinical appropriateness of model outputs as rated by experts on a 5-point scale. 
For expert ratings, statistical inference was conducted using a paired design with Wilcoxon signed-rank tests, effect sizes (r), and 95% bias-corrected and accelerated CIs.</p></sec><sec id="s3-2"><title>Semantic Similarity Analysis</title><p>To evaluate the semantic similarity between the model-generated responses, we used a BERT-based scoring approach (detailed in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref> shows the distribution of the scores for ChatGPT-4o and DeepSeek-R1. Overall, the distributions for both models exhibited substantial overlap, with ChatGPT-4o displaying a slightly narrower distribution, while DeepSeek-R1 showed a marginally wider range. During paired comparisons, 1 pair with identical observations was excluded, resulting in 81 (99%) paired samples for analysis. The Wilcoxon signed-rank test indicated no significant difference in stability between the 2 models (W=1440.5; <italic>z</italic>=&#x2212;1.036; <italic>P</italic>=.30), with a mean difference of only &#x2212;0.003 (95% bootstrap CI &#x2212;0.011 to 0.005, r=&#x2212;0.115) as shown in <xref ref-type="table" rid="table1">Table 1</xref>. 
These findings suggest comparable semantic similarity and stability performance between the 2 models.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Stability comparison between ChatGPT-4o and DeepSeek-R1 based on bidirectional encoder representations from transformers semantic similarity scores.<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model and comparison</td><td align="left" valign="bottom">Mean (SD)</td><td align="left" valign="bottom">Median (IQR)</td><td align="left" valign="bottom">Range</td></tr></thead><tbody><tr><td align="left" valign="top">ChatGPT-4o (N=82)</td><td align="left" valign="top">0.9000 (0.0250)</td><td align="left" valign="top">0.9060 (0.8870&#x2010;0.9180)</td><td align="left" valign="top">0.8250&#x2010;0.9400</td></tr><tr><td align="left" valign="top">DeepSeek-R1 (N=82)</td><td align="left" valign="top">0.8970 (0.0320)</td><td align="left" valign="top">0.9010 (0.8850&#x2010;0.9140)</td><td align="left" valign="top">0.7800&#x2010;1.0000</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Paired difference (DeepSeek-R1 &#x2212; ChatGPT-4o; n=81; of the original 82 pairs, 1 pair with identical values [tie] was excluded automatically during the Wilcoxon test, resulting in an effective sample size of 81): mean difference=&#x2212;0.0030; 95% bias-corrected and accelerated CI &#x2212;0.0110 to 0.005; Wilcoxon W=1440.5000; <italic>z</italic>=&#x2212;1.0360; <italic>P</italic>=.30; <italic>r</italic>=&#x2212;0.1150.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Classification Performance Evaluation</title><p>Classification performance for hemangioma and vascular malformation statements was evaluated by examining confusion matrices (<xref ref-type="fig" rid="figure2">Figure 2A</xref>) and key performance metrics. 
Confusion matrix analyses indicated no substantial differences in misclassification distribution between the 2 models, with overall good stability. In terms of the overall classification accuracy (<xref ref-type="fig" rid="figure2">Figure 2B</xref>), DeepSeek-R1 achieved 0.963, which was notably higher than ChatGPT-4o, which reached approximately 0.910. Additionally, DeepSeek-R1 surpassed ChatGPT-4o in terms of other metrics, including precision, recall, and <italic>F</italic><sub>1</sub>-score. Specifically, DeepSeek-R1 demonstrated a precision of approximately 0.978, recall of 0.957, and an <italic>F</italic><sub>1</sub>-score of 0.967, each marginally higher than the corresponding values for ChatGPT-4o (<xref ref-type="fig" rid="figure2">Figure 2C</xref>). These results highlight the superior classification accuracy of DeepSeek-R1.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>(A) Confusion matrices for vascular lesion classification by ChatGPT-4o and DeepSeek-R1; (B) overall classification accuracy of ChatGPT-4o and DeepSeek-R1; (C) precision, recall, and <italic>F</italic><sub>1</sub>-scores of ChatGPT-4o and DeepSeek-R1.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e76372_fig02.png"/></fig></sec><sec id="s3-4"><title>Expert Rating Analysis</title><p>In qualitative assessments, both models demonstrated strong performance regarding the clinical appropriateness of their outputs, with subtle yet meaningful differences observed. 
Expert ratings (<xref ref-type="supplementary-material" rid="app7">Multimedia Appendices 7</xref> and <xref ref-type="supplementary-material" rid="app8">8</xref>) indicated that for statements classified as &#x201C;rumors&#x201D; (47/82, 57%), DeepSeek-R1 scored significantly higher with a mean (SD) of 4.39 (0.59) and 95% CI 4.21&#x2010;4.56 compared to ChatGPT-4o with a mean of 3.96 (SD 0.81) and 95% CI of 3.72&#x2010;4.20; the mean difference was 0.431 (95% CI 0.218&#x2010;0.644); paired <italic>t</italic><sub>46</sub>=4.071; <italic>P</italic>&#x003C;.001; Wilcoxon <italic>P</italic>&#x003C;.001; and effect size Cohen <italic>d<sub>z</sub></italic>=0.594.</p><p>For statements labeled as &#x201C;accurate information&#x201D; (35/82, 43%), DeepSeek-R1 with a mean of 4.44 (SD 0.37) and 95% CI of 4.32&#x2010;4.57 also significantly outperformed ChatGPT-4o with a mean of 4.18 (SD 0.69) and 95% CI of 3.94&#x2010;4.41; the mean difference was 0.264 (95% CI 0.007&#x2010;0.522); paired t<sub>34</sub>=2.085; <italic>P</italic>=.045; Wilcoxon <italic>P</italic>=.046; and Cohen <italic>d<sub>z</sub></italic>=0.352.</p><p>These findings demonstrate significant superiority of DeepSeek-R1 over ChatGPT-4o in evaluating both &#x201C;rumors&#x201D; and &#x201C;accurate information,&#x201D; with a particularly pronounced advantage in detecting &#x201C;rumors.&#x201D;</p><p>DeepSeek-R1 performed slightly better than ChatGPT-4o across multiple evaluation dimensions, exhibiting higher output stability and classification accuracy. This finding suggests that DeepSeek-R1 holds greater potential for medical information classification tasks.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>This study compared ChatGPT-4o and DeepSeek-R1 in the task of identifying medical rumors, with hemangioma-related misinformation serving as the focal point [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. 
Overall, both models demonstrated robust language comprehension capabilities but differed markedly in their approaches to recognizing inaccurate statements about hemangiomas. DeepSeek-R1 excelled at pinpointing erroneous claims and clearly categorizing them as rumors, showing its strength in explicit rumor detection and confident classification. In contrast, ChatGPT-4o demonstrated superior semantic similarity and exhibited more consistent stability in understanding nuanced language, yet tended to approach rumor identification cautiously, often resorting to ambiguous wording rather than decisively refuting false information. Although these observed differences may stem from variations in training data, model architecture, and fine-tuning strategies, existing evidence from other studies suggests that specialized fine-tuning with medical information could further enhance the capability of LLMs in accurately and effectively detecting medical misinformation [<xref ref-type="bibr" rid="ref19">19</xref>].</p><p>In our task, overly cautious responses&#x2014;specifically, the failure to decisively refute rumors (false negatives)&#x2014;may perpetuate harmful misconceptions, causing caregivers to delay specialist referrals or discontinue evidence-based treatments in favor of unproven remedies. Conversely, overconfidence&#x2014;erroneously labeling accurate guidance as rumors (false positives)&#x2014;may lead to unnecessary anxiety, undermine trust in clinicians, or impede appropriate interventions. In hemangioma treatment, such misclassification could negatively impact decisions regarding timely assessment (eg, ulceration and airway involvement), follow-up intervals, or continuation of guideline-adherent therapies. These risks support the use of conservative safety thresholds, verifiable citations, and escalation of human oversight when model confidence is low. 
One illustrative example is the claim that &#x201C;sun exposure exacerbates hemangiomas,&#x201D; which lacks scientific support [<xref ref-type="bibr" rid="ref20">20</xref>]. Authoritative sources indicate that sun exposure does not directly enlarge or worsen hemangiomas. While moderate sun protection can help safeguard the skin, it does not specifically address pathological changes in hemangiomas [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. In this study, DeepSeek-R1 correctly identified this assertion as a rumor and provided a concise explanation consistent with medical consensus. ChatGPT-4o, in contrast, did not unequivocally refute the claim, instead offering a somewhat reserved answer that did not effectively dispel the misconception. Although both models possess extensive medical knowledge, DeepSeek-R1 displayed a stronger rumor-debunking ability when confronted with evidently incorrect statements, whereas the cautious approach of ChatGPT-4o diluted its capacity to correct misinformation.</p><p>As more users turn to AI assistants for medical information, traditional search engines are gradually being supplemented or even replaced by these systems [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. Unlike search engines that merely provide links, AI chatbots often deliver comprehensive, single-point answers whose perceived authority may lead users to over-rely on them instead of consulting additional information sources [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. Consequently, the adverse impact of inaccurate or ambiguous medical information disseminated by AI could be amplified, posing a considerable risk of misleading patients in their health care decisions. 
Therefore, ensuring higher accuracy in identifying medical rumors is both urgent and critical [<xref ref-type="bibr" rid="ref27">27</xref>].</p><p>Recent research has proposed various methods for leveraging AI to detect medical rumors. For instance, studies comparing GPT-4 with other models trained specifically on health information have shown that specialized models tend to be more accurate in identifying and correcting misinformation [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. These findings underscore that although LLMs have tremendous potential for conveying medical knowledge, they still exhibit shortcomings in fact-checking and real-time verification [<xref ref-type="bibr" rid="ref30">30</xref>]. Incorporating real-time retrieval mechanisms and referencing authoritative data in responses represents a key direction for improving the accuracy of AI-generated medical information [<xref ref-type="bibr" rid="ref28">28</xref>]. Notably, conclusions regarding model superiority depend heavily on the task design, dataset scope, and evaluation criteria. These factors help explain the inconsistencies observed in the existing literature and highlight the novelty of our research, which specifically addresses misinformation related to hemangiomas. The methodological workflow applied in this study&#x2014;consisting of data annotation, multiround generation, BERT similarity assessment, and expert evaluation&#x2014;not only validates the relative advantages of DeepSeek-R1 in our task but also underscores the insufficiency of any single metric for comprehensively assessing model performance. Multidimensional evaluations more effectively reveal nuanced differences between models in stability, accuracy, and clinical appropriateness, thereby offering valuable lessons and standardized protocols for the deployment and further study of large medical language models.</p><p>This study has several limitations. 
First, our data primarily address hemangiomas and vascular malformations, and the limited number and types of examples may not comprehensively encompass all medical rumors. Second, the labeling of rumors relies on expert judgment, introducing an element of subjectivity, and disagreements may arise when experts evaluate borderline cases. Additionally, discrepancies in the 2 AI models&#x2019; training data and knowledge cutoff dates could affect their ability to capture the latest medical information. Finally, we did not evaluate aspects such as explanatory depth, response speed, and user-friendliness. For instance, we did not conduct a formal qualitative or user-centered analysis of explanation quality, which remains an important area for future investigation. For clinical decision support, patient-oriented education, or public health surveillance, LLM-generated outputs should be embedded within regulated workflows that include (1) retrieval-augmented validation from curated vascular anomaly sources, (2) human-in-the-loop review of high-risk recommendations, (3) audit trails and disclaimers clearly delineating accountability, (4) transparent rationales with explicit references to guidelines and clearly marked uncertainties, and (5) postdeployment monitoring for data drift and fairness. These safeguards are prerequisites for mitigating liabilities and improving interpretability and usability in practical applications.</p><p>In conclusion, this research highlights the performance differences between the 2 LLMs in detecting hemangioma-related medical rumors, stressing the urgency of maintaining accurate medical information as AI gradually supplants traditional search engines. DeepSeek-R1 showed higher accuracy and a more decisive approach to rumor detection, whereas the guarded stance of ChatGPT-4o sometimes led to less definitive answers. 
Future studies should optimize AI models&#x2019; fact-checking capabilities, for example, by integrating real-time access to authoritative databases, enhancing domain-specific fine-tuning, and building human-machine collaborative monitoring systems. Continuous improvements in the accuracy and transparency of AI-driven medical communications will better protect patient health and reinforce public trust in evidence-based health care.</p></sec></body><back><ack><p>The authors would like to express their sincere gratitude to the Vascular Anomalies and Vascular Malformations Plastic Surgery Team at the Plastic Surgery Hospital of the Chinese Academy of Medical Sciences for their strong support of this research, the National Clinical Key Specialty Construction Project (23003), and the Plastic Medicine Research Fund of the Chinese Academy of Medical Sciences (2024-ZX-1&#x2010;01). The funders had no role in the design and conduct of the study; collection, management, analysis, and interpretation of the data; preparation, review, or approval of the manuscript; and decision to submit the manuscript for publication.</p></ack><notes><sec><title>Data Availability</title><p>All deidentified data that support the findings of this study are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> (vascular anomaly information sources). Additional deidentified data and analysis materials that were generated during the study are available from the corresponding author on reasonable request for noncommercial purposes. Authors are prepared to provide the underlying (anonymized) data to the journal for inspection or verification upon request.</p></sec></notes><fn-group><fn fn-type="con"><p>GW wrote the main manuscript and designed the research methodology framework. Y Zhu and WW prepared <xref ref-type="fig" rid="figure1">Figures 1</xref> and <xref ref-type="fig" rid="figure2">2</xref> and conducted data evaluation. 
GW, Y Zhang, and HB collected the data. GW and XY designed the study. All authors contributed to the statistical analysis and critically reviewed the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BERT</term><def><p>bidirectional encoder representations from transformers</p></def></def-item><def-item><term id="abb3">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fridman</surname><given-names>I</given-names> </name><name name-style="western"><surname>Boyles</surname><given-names>D</given-names> </name><name name-style="western"><surname>Chheda</surname><given-names>R</given-names> </name><name name-style="western"><surname>Baldwin-SoRelle</surname><given-names>C</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Elston Lafata</surname><given-names>J</given-names> </name></person-group><article-title>Identifying misinformation about unproven cancer treatments on social media using user-friendly linguistic characteristics: content analysis</article-title><source>JMIR Infodemiology</source><year>2025</year><month>02</month><day>12</day><volume>5</volume><fpage>e62703</fpage><pub-id pub-id-type="doi">10.2196/62703</pub-id><pub-id pub-id-type="medline">39938078</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>J</given-names> 
</name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>A</given-names> </name></person-group><article-title>Tackling misinformation in mobile social networks a BERT-LSTM approach for enhancing digital literacy</article-title><source>Sci Rep</source><year>2025</year><month>01</month><day>7</day><volume>15</volume><issue>1</issue><fpage>1118</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-85308-4</pub-id><pub-id pub-id-type="medline">39774143</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schlicht</surname><given-names>IB</given-names> </name><name name-style="western"><surname>Fernandez</surname><given-names>E</given-names> </name><name name-style="western"><surname>Chulvi</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rosso</surname><given-names>P</given-names> </name></person-group><article-title>Automatic detection of health misinformation: a systematic review</article-title><source>J Ambient Intell Humaniz Comput</source><year>2023</year><month>05</month><day>27</day><volume>27</volume><fpage>1</fpage><lpage>13</lpage><pub-id pub-id-type="doi">10.1007/s12652-023-04619-4</pub-id><pub-id pub-id-type="medline">37360776</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rosenbacke</surname><given-names>R</given-names> </name><name name-style="western"><surname>Melhus</surname><given-names>&#x00C5;</given-names> </name><name name-style="western"><surname>Stuckler</surname><given-names>D</given-names> </name></person-group><article-title>False conflict and false confirmation errors are crucial components of AI accuracy in medical decision making</article-title><source>Nat 
Commun</source><year>2024</year><month>08</month><day>13</day><volume>15</volume><issue>1</issue><fpage>6896</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-50952-3</pub-id><pub-id pub-id-type="medline">39138179</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ferber</surname><given-names>D</given-names> </name><name name-style="western"><surname>Rood</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Regev</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kather</surname><given-names>JN</given-names> </name></person-group><article-title>How AI agents will change cancer research and oncology</article-title><source>Nat Cancer</source><year>2024</year><month>12</month><volume>5</volume><issue>12</issue><fpage>1765</fpage><lpage>1767</lpage><pub-id pub-id-type="doi">10.1038/s43018-024-00861-7</pub-id><pub-id pub-id-type="medline">39690222</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Menz</surname><given-names>BD</given-names> </name><name name-style="western"><surname>Modi</surname><given-names>ND</given-names> </name><name name-style="western"><surname>Abuhelwa</surname><given-names>AY</given-names> </name><etal/></person-group><article-title>Generative AI chatbots for reliable cancer information: evaluating web-search, multilingual, and reference capabilities of emerging large language models</article-title><source>Eur J Cancer</source><year>2025</year><month>03</month><day>11</day><volume>218</volume><fpage>115274</fpage><pub-id pub-id-type="doi">10.1016/j.ejca.2025.115274</pub-id><pub-id pub-id-type="medline">39922126</pub-id></nlm-citation></ref><ref 
id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huo</surname><given-names>B</given-names> </name><name name-style="western"><surname>Boyle</surname><given-names>A</given-names> </name><name name-style="western"><surname>Marfo</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Large language models for chatbot health advice studies: a systematic review</article-title><source>JAMA Netw Open</source><year>2025</year><month>02</month><day>3</day><volume>8</volume><issue>2</issue><fpage>e2457879</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.57879</pub-id><pub-id pub-id-type="medline">39903463</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maaz</surname><given-names>S</given-names> </name><name name-style="western"><surname>Palaganas</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Palaganas</surname><given-names>G</given-names> </name><name name-style="western"><surname>Bajwa</surname><given-names>M</given-names> </name></person-group><article-title>A guide to prompt design: foundations and applications for healthcare simulationists</article-title><source>Front Med (Lausanne)</source><year>2024</year><volume>11</volume><fpage>1504532</fpage><pub-id pub-id-type="doi">10.3389/fmed.2024.1504532</pub-id><pub-id pub-id-type="medline">39980724</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meyrowitsch</surname><given-names>DW</given-names> </name><name name-style="western"><surname>Jensen</surname><given-names>AK</given-names> </name><name name-style="western"><surname>S&#x00F8;rensen</surname><given-names>JB</given-names> </name><name 
name-style="western"><surname>Varga</surname><given-names>TV</given-names> </name></person-group><article-title>AI chatbots and (mis)information in public health: impact on vulnerable communities</article-title><source>Front Public Health</source><year>2023</year><volume>11</volume><fpage>1226776</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2023.1226776</pub-id><pub-id pub-id-type="medline">38026315</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Borges do Nascimento</surname><given-names>IJ</given-names> </name><name name-style="western"><surname>Pizarro</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Almeida</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>Infodemics and health misinformation: a systematic review of reviews</article-title><source>Bull World Health Organ</source><year>2022</year><month>09</month><day>1</day><volume>100</volume><issue>9</issue><fpage>544</fpage><lpage>561</lpage><pub-id pub-id-type="doi">10.2471/BLT.21.287654</pub-id><pub-id pub-id-type="medline">36062247</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>Classification of vascular anomalies</article-title><source>International Society for the Study of Vascular Anomalies</source><year>2025</year><access-date>2025-03-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.issva.org/classification">https://www.issva.org/classification</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Su</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Dermatoscopic features differentiating among port wine stain, arteriovenous malformation, and capillary malformation-arteriovenous malformation syndrome: to detect potential fast-flow vascular malformations at an early stage</article-title><source>J Am Acad Dermatol</source><year>2022</year><month>12</month><volume>87</volume><issue>6</issue><fpage>1435</fpage><lpage>1437</lpage><pub-id pub-id-type="doi">10.1016/j.jaad.2022.07.053</pub-id><pub-id pub-id-type="medline">35952834</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Werner</surname><given-names>JA</given-names> </name><name name-style="western"><surname>D&#x00FC;nne</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Lippert</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Folz</surname><given-names>BJ</given-names> </name></person-group><article-title>Optimal treatment of vascular birthmarks</article-title><source>Am J Clin Dermatol</source><year>2003</year><volume>4</volume><issue>11</issue><fpage>745</fpage><lpage>756</lpage><pub-id pub-id-type="doi">10.2165/00128071-200304110-00003</pub-id><pub-id pub-id-type="medline">14572297</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Normile</surname><given-names>D</given-names> </name></person-group><article-title>Chinese firm&#x2019;s large language model makes a splash</article-title><source>Science</source><year>2025</year><month>01</month><day>17</day><volume>387</volume><issue>6731</issue><fpage>238</fpage><pub-id pub-id-type="doi">10.1126/science.adv9836</pub-id><pub-id pub-id-type="medline">39818899</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>National Health Commission</collab><collab>Ministry of Education</collab><collab>Ministry of Science and Technology</collab><collab>National Administration of Traditional Chinese Medicine</collab></person-group><article-title>Measures for the Ethical Review of Life Science and Medical Research Involving Humans [Web page in Chinese]</article-title><source>The State Council of the People&#x2019;s Republic of China</source><access-date>2025-11-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.gov.cn/zhengce/zhengceku/2023-02/28/content_5743658.htm">https://www.gov.cn/zhengce/zhengceku/2023-02/28/content_5743658.htm</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Wang</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>K</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Potential and limitations of ChatGPT 3.5 and 4.0 as a source of COVID-19 information: comprehensive comparative analysis of generative and authoritative information</article-title><source>J Med Internet Res</source><year>2023</year><month>12</month><day>14</day><volume>25</volume><fpage>e49771</fpage><pub-id pub-id-type="doi">10.2196/49771</pub-id><pub-id pub-id-type="medline">38096014</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Campbell</surname><given-names>JP</given-names> </name><etal/></person-group><article-title>Generative artificial intelligence through ChatGPT and other large language models in ophthalmology: clinical applications and challenges</article-title><source>Ophthalmol Sci</source><year>2023</year><month>12</month><volume>3</volume><issue>4</issue><fpage>100394</fpage><pub-id pub-id-type="doi">10.1016/j.xops.2023.100394</pub-id><pub-id pub-id-type="medline">37885755</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><etal/></person-group><article-title>BioBERT: a pre-trained biomedical language representation model 
for biomedical text mining</article-title><source>Bioinformatics</source><year>2020</year><month>02</month><day>15</day><volume>36</volume><issue>4</issue><fpage>1234</fpage><lpage>1240</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btz682</pub-id><pub-id pub-id-type="medline">31501885</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Torrence</surname><given-names>D</given-names> </name><name name-style="western"><surname>Antonescu</surname><given-names>CR</given-names> </name></person-group><article-title>The genetics of vascular tumours: an update</article-title><source>Histopathology</source><year>2022</year><month>01</month><volume>80</volume><issue>1</issue><fpage>19</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1111/his.14458</pub-id><pub-id pub-id-type="medline">34958509</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Krowchuk</surname><given-names>DP</given-names> </name><name name-style="western"><surname>Frieden</surname><given-names>IJ</given-names> </name><name name-style="western"><surname>Mancini</surname><given-names>AJ</given-names> </name><etal/></person-group><article-title>Clinical practice guideline for the management of infantile hemangiomas</article-title><source>Pediatrics</source><year>2019</year><month>01</month><volume>143</volume><issue>1</issue><fpage>e20183475</fpage><pub-id pub-id-type="doi">10.1542/peds.2018-3475</pub-id><pub-id pub-id-type="medline">30584062</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Frenette</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Mendiratta-Lala</surname><given-names>M</given-names> </name><name name-style="western"><surname>Salgia</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Sauer</surname><given-names>BG</given-names> </name><name name-style="western"><surname>Pillai</surname><given-names>A</given-names> </name></person-group><article-title>ACG clinical guideline: focal liver lesions</article-title><source>Am J Gastroenterol</source><year>2024</year><month>07</month><day>1</day><volume>119</volume><issue>7</issue><fpage>1235</fpage><lpage>1271</lpage><pub-id pub-id-type="doi">10.14309/ajg.0000000000002857</pub-id><pub-id pub-id-type="medline">38958301</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fayos De Ariz&#x00F3;n</surname><given-names>L</given-names> </name><name name-style="western"><surname>Viera</surname><given-names>ER</given-names> </name><name name-style="western"><surname>Pilco</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Artificial intelligence: a new field of knowledge for nephrologists?</article-title><source>Clin Kidney J</source><year>2023</year><month>12</month><volume>16</volume><issue>12</issue><fpage>2314</fpage><lpage>2326</lpage><pub-id pub-id-type="doi">10.1093/ckj/sfad182</pub-id><pub-id pub-id-type="medline">38046016</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>TW</given-names> </name></person-group><article-title>Application of artificial intelligence chatbots, including ChatGPT, in education, scholarly work, programming, and content generation and its prospects: a narrative 
review</article-title><source>J Educ Eval Health Prof</source><year>2023</year><volume>20</volume><fpage>38</fpage><pub-id pub-id-type="doi">10.3352/jeehp.2023.20.38</pub-id><pub-id pub-id-type="medline">38148495</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reddy</surname><given-names>S</given-names> </name></person-group><article-title>Generative AI in healthcare: an implementation science informed translational path on application, integration and governance</article-title><source>Implement Sci</source><year>2024</year><month>03</month><day>15</day><volume>19</volume><issue>1</issue><fpage>27</fpage><pub-id pub-id-type="doi">10.1186/s13012-024-01357-9</pub-id><pub-id pub-id-type="medline">38491544</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pugliese</surname><given-names>N</given-names> </name><name name-style="western"><surname>Wai-Sun Wong</surname><given-names>V</given-names> </name><name name-style="western"><surname>Schattenberg</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>Accuracy, reliability, and comprehensibility of ChatGPT-generated medical responses for patients with nonalcoholic fatty liver disease</article-title><source>Clin Gastroenterol Hepatol</source><year>2024</year><month>04</month><volume>22</volume><issue>4</issue><fpage>886</fpage><lpage>889</lpage><pub-id pub-id-type="doi">10.1016/j.cgh.2023.08.033</pub-id><pub-id pub-id-type="medline">37716618</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ismail</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Kbaier</surname><given-names>D</given-names> </name><name name-style="western"><surname>Farrell</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kane</surname><given-names>A</given-names> </name></person-group><article-title>The experience of health professionals with misinformation and its impact on their job practice: qualitative interview study</article-title><source>JMIR Form Res</source><year>2022</year><month>11</month><day>2</day><volume>6</volume><issue>11</issue><fpage>e38794</fpage><pub-id pub-id-type="doi">10.2196/38794</pub-id><pub-id pub-id-type="medline">36252133</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zakka</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chaurasia</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Almanac - retrieval-augmented language models for clinical medicine</article-title><source>NEJM AI</source><year>2024</year><month>02</month><volume>1</volume><issue>2</issue><pub-id pub-id-type="doi">10.1056/aioa2300068</pub-id><pub-id pub-id-type="medline">38343631</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature New 
Biol</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alber</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Alyakin</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Medical large language models are vulnerable to data-poisoning attacks</article-title><source>Nat Med</source><year>2025</year><month>02</month><volume>31</volume><issue>2</issue><fpage>618</fpage><lpage>626</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03445-1</pub-id><pub-id pub-id-type="medline">39779928</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Vascular anomaly information sources.</p><media xlink:href="ai_v4i1e76372_app1.docx" xlink:title="DOCX File, 23 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Large language model (LLM) text classification prompts.</p><media xlink:href="ai_v4i1e76372_app2.docx" xlink:title="DOCX File, 13 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Likert scale for model output assessment.</p><media xlink:href="ai_v4i1e76372_app3.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Bidirectional encoder representations from transformers (BERT) semantic similarity code.</p><media xlink:href="ai_v4i1e76372_app4.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material 
id="app5"><label>Multimedia Appendix 5</label><p>Bidirectional encoder representations from transformers (BERT) similarity scores model comparison.</p><media xlink:href="ai_v4i1e76372_app5.xlsx" xlink:title="XLSX File, 15 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Bidirectional encoder representations from transformers (BERT) similarity ChatGPT-4o versus DeepSeek-R1.</p><media xlink:href="ai_v4i1e76372_app6.png" xlink:title="PNG File, 148 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>YZ ratings for model reasonableness.</p><media xlink:href="ai_v4i1e76372_app7.xlsx" xlink:title="XLSX File, 13 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>WW ratings for model reasonableness.</p><media xlink:href="ai_v4i1e76372_app8.xlsx" xlink:title="XLSX File, 13 KB"/></supplementary-material></app-group></back></article>