<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e87062</article-id><article-id pub-id-type="doi">10.2196/87062</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Multimodal GPT-5 for Predicting Poor Functional Outcomes After Intracerebral Hemorrhage in the Emergency Department: Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Matsumoto</surname><given-names>Koutarou</given-names></name><degrees>MPH, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ishihara</surname><given-names>Kazuaki</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tamba</surname><given-names>Ryota</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Fujiyoshi</surname><given-names>Yusuke</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tokunaga</surname><given-names>Koki</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Matsuda</surname><given-names>Katsuhiko</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nohara</surname><given-names>Yasunobu</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Jenhui</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yamashiro</surname><given-names>Shigeo</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Nakashima</surname><given-names>Naoki</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff10">10</xref><xref ref-type="aff" rid="aff11">11</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kamouchi</surname><given-names>Masahiro</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff12">12</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Health Care Administration and Management, Graduate School of Medical Sciences, Kyushu University</institution><addr-line>3-1-1 Maidashi, Higashi-ku, Fukuoka-shi</addr-line><addr-line>Fukuoka</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Institute for Medical Information Research and 
Analysis, Saiseikai Kumamoto Hospital</institution><addr-line>Kumamoto</addr-line><country>Japan</country></aff><aff id="aff3"><institution>Department of Computer Science and Information Engineering, Chang Gung University</institution><addr-line>Taoyuan</addr-line><country>Taiwan</country></aff><aff id="aff4"><institution>Graduate Degree Program of Applied Data Sciences, Sophia University</institution><addr-line>Tokyo</addr-line><country>Japan</country></aff><aff id="aff5"><institution>Joint Graduate School of Mathematics for Innovation, Kyushu University</institution><addr-line>Fukuoka</addr-line><country>Japan</country></aff><aff id="aff6"><institution>Department of Pharmacy, Saiseikai Kumamoto Hospital</institution><addr-line>Kumamoto</addr-line><country>Japan</country></aff><aff id="aff7"><institution>Department of Radiology, Saiseikai Kumamoto Hospital</institution><addr-line>Kumamoto</addr-line><country>Japan</country></aff><aff id="aff8"><institution>Big Data Science and Technology, Faculty of Advanced Science and Technology, Kumamoto University</institution><addr-line>Kumamoto</addr-line><country>Japan</country></aff><aff id="aff9"><institution>Division of Neurosurgery, Saiseikai Kumamoto Hospital</institution><country>Japan</country></aff><aff id="aff10"><institution>Medical Information Center, Kyushu University Hospital</institution><addr-line>Fukuoka</addr-line><country>Japan</country></aff><aff id="aff11"><institution>Department of Medical Informatics, Graduate School of Medical Sciences, Kyushu University</institution><addr-line>Fukuoka</addr-line><country>Japan</country></aff><aff id="aff12"><institution>Center for Cohort Studies, Graduate School of Medical Sciences, Kyushu University</institution><addr-line>Fukuoka</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib 
contrib-type="reviewer"><name name-style="western"><surname>Zhang</surname><given-names>Jun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lambert</surname><given-names>Pascal</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Koutarou Matsumoto, MPH, PhD, Department of Health Care Administration and Management, Graduate School of Medical Sciences, Kyushu University, 3-1-1 Maidashi, Higashi-ku, Fukuoka-shi, Fukuoka, 812-8582, Japan, 81 0926426960; <email>matsumoto.kotaro.251@m.kyushu-u.ac.jp</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>27</day><month>5</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e87062</elocation-id><history><date date-type="received"><day>03</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>27</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>28</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Koutarou Matsumoto, Kazuaki Ishihara, Ryota Tamba, Yusuke Fujiyoshi, Koki Tokunaga, Katsuhiko Matsuda, Yasunobu Nohara, Jenhui Chen, Shigeo Yamashiro, Naoki Nakashima, Masahiro Kamouchi. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 27.5.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2026/1/e87062"/><abstract><sec><title>Background</title><p>In the emergency department, rapid prognostic assessment of patients with intracerebral hemorrhage (ICH) is essential for guiding early management decisions, particularly when stroke specialists are not immediately available. Recent advances in large language models have generated interest in their potential utility as clinical decision-support tools.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate the predictive performance and potential clinical utility of GPT (OpenAI)-based models for poor functional outcomes after ICH using real-world multimodal data routinely available at emergency department presentation.</p></sec><sec sec-type="methods"><title>Methods</title><p>We analyzed data from patients with ICH admitted to a tertiary hospital. Using routinely collected clinical data and noncontrast computed tomography (CT) images at admission, GPT-4.1 (OpenAI) and GPT-5 (OpenAI)&#x2014;accessed via the Azure OpenAI Service&#x2014;were applied to predict poor functional outcomes, defined as a modified Rankin Scale score of 3&#x2010;6 at discharge. A conventional machine learning (ML) model was developed by combining deep learning&#x2013;extracted imaging features from Digital Imaging and Communications in Medicine CT data with clinical variables using L1-regularized logistic regression. GPT-based models were evaluated using the same clinical dataset and JPEG-format CT images. 
Model performance was assessed in terms of discrimination (area under the receiver operating characteristic curve [AUROC]), overall performance (scaled Brier score and Nagelkerke <italic>R</italic>&#x00B2;), calibration, reproducibility (intraclass correlation coefficient [ICC]), and clinical utility (decision curve analysis).</p></sec><sec sec-type="results"><title>Results</title><p>The ML model achieved an AUROC of 0.85 (95% CI 0.79&#x2010;0.90), a scaled Brier score of 0.23 (95% CI 0.06&#x2010;0.36), and a Nagelkerke <italic>R</italic>&#x00B2; of 0.35 (95% CI 0.18&#x2010;0.48). Zero-shot GPT-4.1 and GPT-5 demonstrated discrimination comparable to the ML model (AUROC 0.84, 95% CI 0.77&#x2010;0.91 and 0.85, 95% CI 0.78&#x2010;0.91, respectively) with high reproducibility (ICC 0.91 and 0.95, respectively) but inferior overall performance, as reflected by lower scaled Brier scores and low or negative Nagelkerke <italic>R</italic>&#x00B2; values. Incorporating ML-derived information into the prompts modestly improved discrimination (AUROC 0.84, 95% CI 0.78&#x2010;0.90 and 0.87, 95% CI 0.81&#x2010;0.92, respectively) and reproducibility (ICC 0.97 and 0.96, respectively). Calibration plots indicated that GPT-based models tended to underestimate the probability of poor functional outcomes, although this bias was partially attenuated after model-informed prompting. Decision curve analysis indicated that GPT-based models provided net benefit only at higher threshold probabilities and did not demonstrate superior clinical utility compared with the ML model.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Zero-shot GPT models achieved discriminatory performance comparable to a conventional ML model but showed limitations in calibration and overall predictive accuracy. 
Rather than replacing established prognostic ML models, GPT-based models may be better positioned as complementary interfaces that translate predictive outputs into clinically interpretable natural language to support decision-making.</p></sec></abstract><kwd-group><kwd>multimodal model</kwd><kwd>zero-shot learning</kwd><kwd>generative pretrained transformer</kwd><kwd>GPT</kwd><kwd>intracerebral hemorrhage</kwd><kwd>generative artificial intelligence</kwd><kwd>machine learning</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Intracerebral hemorrhage (ICH) accounts for approximately 10%&#x2010;30% of all strokes yet carries disproportionately high morbidity and mortality rates [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. It is a life-threatening neurological emergency that demands prompt and accurate decision-making upon hospital arrival. Because ICH onset is unpredictable, stroke specialists are not always available in the emergency department (ED). Therefore, establishing systems that support rapid decision-making by nonspecialists and enable seamless handover to stroke specialists is essential.</p><p>In routine clinical practice, emergency physicians first gather initial clinical information, conduct brain imaging, and consult a stroke specialist once ICH is confirmed. Although nonspecialists can usually recognize ICH on imaging, accurately estimating a patient&#x2019;s functional prognosis, which is crucial for determining the treatment strategy, remains challenging. Effective clinical reasoning in this setting requires integrating multimodal information, including imaging, vital signs, laboratory data, and medical history, into a coherent assessment.</p><p>Recent advances in deep learning have enabled multimodal algorithms that predict ICH prognosis early after presentation [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. 
More recently, large language models (LLMs) have emerged with the capability to process tabular, textual, and imaging inputs. As these models have evolved, foundation models fine-tuned with medical data have been increasingly adapted for diverse health care applications [<xref ref-type="bibr" rid="ref7">7</xref>]. In parallel, closed-source general-purpose LLMs, including OpenAI&#x2019;s GPT, Google Gemini, and Anthropic Claude, have shown strong performance on complex reasoning tasks even without task-specific training.</p><p>Released in August 2025, GPT-5 represents one of the most advanced LLMs to date [<xref ref-type="bibr" rid="ref8">8</xref>]. Early studies have indicated that GPT-5 can integrate multimodal inputs, including medical images and structured data, to perform sophisticated clinical reasoning and achieve high accuracy on standardized multimodal benchmarks [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. However, despite these advances, its prognostic reasoning performance with real-world clinical data has not been evaluated, and no prior study has validated its use for predicting functional outcomes in patients with ICH.</p><p>Therefore, in this study, from the perspective of emergency physicians, we evaluated how accurately GPT models predict poor functional outcomes&#x2014;herein defined as a modified Rankin Scale (mRS) score &#x2265;3&#x2014;after ICH using only routinely available ED clinical data and brain images converted to compact JPEG files suitable for remote transmission. We compared the performance of GPT with that of a conventional machine learning (ML) model trained on the same clinical variables and image features extracted from Digital Imaging and Communications in Medicine (DICOM) computed tomography (CT) scans via deep learning. 
Our specific objectives were to assess the zero-shot predictive performance of GPT-4.1 and GPT-5 using ED clinical data and brain imaging without any additional fine-tuning and determine whether incorporating outputs from a conventional ML model could further enhance GPT performance.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>The study protocol was approved by the Ethics Committee of Saiseikai Kumamoto Hospital (approval number 1199; September 29, 2023) and conducted in accordance with the Declaration of Helsinki. All data were fully anonymized before analysis and retrospectively analyzed; therefore, the requirement for informed consent was formally waived by the institutional Ethics Committee. No compensation was provided to study participants. The paper and supplementary materials include only noncontrast head CT images that do not permit identification of individual participants. Images shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> were obtained from a publicly available anonymized dataset [<xref ref-type="bibr" rid="ref12">12</xref>].</p></sec><sec id="s2-2"><title>Study Design and Participants</title><p>Validation and reporting of GPT-based predictions followed the Transparent Reporting of a Multivariable Model for Individual Prognosis or Diagnosis (TRIPOD)-LLM statement [<xref ref-type="bibr" rid="ref13">13</xref>]. To compare the performance of GPT-based predictions with that of the ML model, we used the same datasets as in our previous studies [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>], which developed multimodal ML-based models for predicting poststroke unfavorable discharge outcomes, including poor functional outcome and in-hospital mortality. In the previous analysis, 527 patients with ICH were included. 
Among them, the derivation cohort comprised 352 patients admitted between April 2019 and December 2020, and the temporal validation cohort included 175 patients admitted between January 2021 and January 2022 (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). In this study, all comparisons and validations of GPT and ML model performance were conducted using this temporal validation cohort.</p></sec><sec id="s2-3"><title>Clinical Outcome</title><p>A poor functional outcome was defined as an mRS score of 3&#x2010;6 at hospital discharge. The mRS score was assessed by the attending physician.</p></sec><sec id="s2-4"><title>Tabular Data</title><p>Clinical data comprised tabular variables combined with noncontrast head CT images that were routinely obtainable by nonspecialists at ED presentation. A detailed list of the tabular variables used in this study, together with their corresponding missing rates, is provided in <xref ref-type="table" rid="table1">Table 1</xref>. Although multiple imputation is generally recommended, directly applying multiple imputation within a GPT-based inference framework is technically challenging. In addition, the proportion of missing values across all variables in this study was low (maximum &#x003C;3.5%), indicating a limited loss of information and an unlikely material impact on the results [<xref ref-type="bibr" rid="ref14">14</xref>]. Therefore, single imputation was performed using the Multivariate Imputation by Chained Equations method implemented in the <italic>mice</italic> package in R. 
To prevent data leakage, missing value imputation was performed after splitting the dataset into the derivation and temporal validation cohorts and was conducted independently within each cohort [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>].</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Tabular clinical variables used for both GPT zero-shot inference and ML<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>-based model<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variable</td><td align="left" valign="bottom">Summary statistics</td><td align="left" valign="bottom">Missing rate</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Demographic</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Age (y), median (IQR)</td><td align="left" valign="top">75 (64&#x2010;83)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female, n (%)</td><td align="left" valign="top">69 (39.4)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top" colspan="3">Risk factor, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hypertension</td><td align="left" valign="top">126 (72.0)</td><td align="left" valign="top">3.4</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Diabetes mellitus</td><td align="left" valign="top">41 (23.4)</td><td align="left" valign="top">3.4</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Dyslipidemia</td><td align="left" valign="top">58 (33.1)</td><td align="left" valign="top">3.4</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Smoking</td><td align="left" valign="top">60 (34.3)</td><td align="left" valign="top">0.6</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Drinking</td><td align="left" valign="top">72 (41.1)</td><td align="left" valign="top">2.9</td></tr><tr><td align="left" valign="top" colspan="3">Prestroke functional status, median (IQR)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Prestroke mRS<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> score</td><td align="left" valign="top">0 (0&#x2010;1)</td><td align="left" valign="top">0.6</td></tr><tr><td align="left" valign="top" colspan="3">Prestroke medication, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Antiplatelets</td><td align="left" valign="top">40 (22.9)</td><td align="left" valign="top">3.4</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Direct oral anticoagulants</td><td align="left" valign="top">24 (13.7)</td><td align="left" valign="top">3.4</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vitamin K antagonists</td><td align="left" valign="top">6 (3.4)</td><td align="left" valign="top">3.4</td></tr><tr><td align="left" valign="top" colspan="3">Onset-to-admission time, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x003C;4 
h</td><td align="left" valign="top">91 (52.0)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4&#x2010;8 h</td><td align="left" valign="top">18 (10.3)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>8&#x2010;24 h</td><td align="left" valign="top">39 (22.3)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>24&#x2010;72 h</td><td align="left" valign="top">24 (13.7)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x003E;72 h</td><td align="left" valign="top">3 (1.7)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top" colspan="3">Transportation method to the hospital, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ambulance use</td><td align="left" valign="top">161 (92.0)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top" colspan="3">Physiological data, median (IQR)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Systolic blood pressure (mm Hg)</td><td align="left" valign="top">180 (162.5&#x2010;202)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Diastolic blood pressure (mm Hg)</td><td align="left" valign="top">106 (93.5&#x2010;121)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Pulse rate (per min)</td><td align="left" valign="top">83 (70&#x2010;95.5)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>SpO<sub>2</sub> (%)</td><td align="left" valign="top">97 (95&#x2010;98)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Respiratory rate (per min)</td><td align="left" valign="top">19 (17&#x2010;22)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Body temperature (&#x2103;)</td><td align="left" valign="top">36.6 (36.4&#x2010;36.9)</td><td align="left" valign="top">1.7</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>BMI (kg/m&#x00B2;)</td><td align="left" valign="top">22.9 (19.5&#x2010;25.6)</td><td align="left" valign="top">1.1</td></tr><tr><td align="left" valign="top" colspan="3">Laboratory data, median (IQR)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sodium (mmol/L)</td><td align="left" valign="top">141 (139&#x2010;142)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Potassium (mmol/L)</td><td align="left" valign="top">3.8 (3.6&#x2010;4.2)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chloride (mmol/L)</td><td align="left" valign="top">104 (101&#x2010;105)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Total protein (g/dL)</td><td align="left" valign="top">7.1 (6.75&#x2010;7.55)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Albumin (g/dL)</td><td align="left" valign="top">4.0 (3.8&#x2010;4.3)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Blood urea nitrogen (mg/dL)</td><td align="left" valign="top">16.1 (13.65&#x2010;19.75)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Creatinine (mg/dL)</td><td align="left" valign="top">0.82 (0.67&#x2010;1.04)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Aspartate aminotransferase (IU/L)</td><td align="left" valign="top">25 (20&#x2010;32.5)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Alanine transaminase (IU/L)</td><td align="left" valign="top">17 (13.5&#x2010;25.5)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Gamma-glutamyl transferase (IU/L)</td><td align="left" valign="top">25 (15&#x2010;48.5)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lactate dehydrogenase (IU/L)</td><td align="left" valign="top">224 (195.5&#x2010;263)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Total bilirubin (mg/dL)</td><td align="left" valign="top">0.8 (0.6&#x2010;1.0)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Glucose (mg/dL)</td><td align="left" valign="top">127 (108&#x2010;161.5)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>White blood cell count (10<sup>3</sup>/&#x03BC;L)</td><td align="left" valign="top">7.60 (6.15&#x2010;10.10)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Red blood cell count (10<sup>6</sup>/&#x03BC;L)</td><td align="left" valign="top">4.45 (3.96&#x2010;4.88)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hemoglobin (g/dL)</td><td align="left" valign="top">13.7 (12.4&#x2010;15.05)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Hematocrit (%)</td><td align="left" valign="top">40.8 (37.2&#x2010;44.5)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Blood platelet count (10<sup>4</sup>/&#x03BC;L)</td><td align="left" valign="top">21.2 (17.05&#x2010;24.6)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>C-reactive protein (mg/dL)</td><td align="left" valign="top">0.10 (0.05&#x2010;0.26)</td><td align="left" 
valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Activated partial thromboplastin time (s)</td><td align="left" valign="top">27.6 (25.4&#x2010;30.4)</td><td align="left" valign="top">0.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Prothrombin time-international normalized ratio, median (IQR)</td><td align="left" valign="top">0.97 (0.91&#x2010;1.03)</td><td align="left" valign="top">0.6</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>ML: machine learning.</p></fn><fn id="table1fn2"><p><sup>b</sup>Data are expressed as median (IQR) or n (%). The rates of missing values (%) are shown for clinical variables.</p></fn><fn id="table1fn3"><p><sup>c</sup>mRS: modified Rankin Scale.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-5"><title>Image Data</title><p>CT images were acquired using a Canon Aquilion Prime SP scanner (Canon Medical Systems Corporation) and stored in DICOM format within the hospital&#x2019;s picture archiving and communication system (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). The image preprocessing procedure for the ML model was identical to that described in our previous report [<xref ref-type="bibr" rid="ref4">4</xref>]. Briefly, noncontrast CT images were adjusted to a window range of 15&#x2010;100 Hounsfield units, followed by min-max normalization. After centering the brain, each image was resized to 256&#x00D7;256 pixels. The number of axial slices was standardized to 22 using spline interpolation, as specified in the study protocol.</p><p>In contrast, for the GPT models, CT images in JPEG format were used as model inputs. 
Representative slices that clearly demonstrated ICH were used owing to the limit of 10 images per chat call when uploading images through the application programming interface (API). For each patient, an experienced radiologic technologist, blinded to discharge mRS scores and all baseline clinical admission data used in the prediction models, selected slices deemed clinically relevant for diagnostic reporting according to predefined criteria: images including the hematoma; images capturing the overall hematoma when large&#x2014;covering the upper margin, central portion (multiple slices if necessary), and lower margin; and images showing intraventricular extension or midline shift when present. The selected images (median 3, IQR 2&#x2010;4 slices; <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>) were converted from DICOM to compact JPEG format to emulate real-world ED workflows, where nonspecialists frequently share lightweight image files for remote consultation. These JPEG images were used to facilitate standardized multimodal input and efficient remote processing during GPT inference.</p><p>This design enabled a direct comparison between a conventional ML model trained on fully standardized DICOM images and GPT models operating on simplified, nonspecialist-accessible JPEG images, thereby reflecting distinct yet complementary clinical use cases.</p></sec><sec id="s2-6"><title>GPT Configuration</title><p>GPT-5 (version 2025-08-07) was accessed via the Azure OpenAI Service using the Responses API [<xref ref-type="bibr" rid="ref15">15</xref>], which supports multimodal input. The model had been trained on data updated until September 2024. 
Based on preliminary analyses showing negligible differences in predictive accuracy between the minimal and high reasoning-effort configurations (internally defined by Azure OpenAI), the reasoning effort was fixed at the minimal level for all experiments (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). The verbosity level (response length control) was set to low to reduce unnecessary variation and ensure concise, reproducible outputs.</p><p>The maximum token limit was not explicitly specified; the provider&#x2019;s default setting was used because all clinical summaries were concise and remained well within this limit. To ensure complete independence of inference across patients, conversation history and system context were cleared before each request, and each case was separately processed. All GPT-5 inference experiments were conducted between August 17 and 23, 2025.</p><p>For comparison, GPT-4.1 (version 2025-04-14) was also evaluated via the Azure OpenAI Service using the Chat Completions API [<xref ref-type="bibr" rid="ref15">15</xref>]. This model had been trained on data updated until April 2024. Unlike the setup used for GPT-5, the Chat Completions API allowed explicit control of the temperature parameter; therefore, the temperature was set to 0 to minimize sampling variability and improve output consistency across runs. Nevertheless, exact reproducibility may not always be achieved, as closed-source LLMs may exhibit nondeterministic behavior due to factors such as floating-point operations and distributed inference. All GPT-4.1 inference experiments were performed during the same period as those for GPT-5 (August 17&#x2010;23, 2025).</p></sec><sec id="s2-7"><title>ML-Based Model</title><p>For comparison with GPT, we used an ML-based model previously developed by our group, with minor modifications where the tabular clinical inputs were replaced with the same variables used for GPT inference (<xref ref-type="table" rid="table1">Table 1</xref>). 
This model adopted a late-fusion architecture that combined imaging features extracted from noncontrast head CT scans with structured tabular variables obtainable by nonspecialists, as listed in <xref ref-type="table" rid="table1">Table 1</xref>.</p><p>Imaging features were extracted using a 3D U-Net pretrained on the publicly available Brain Hemorrhage Segmentation Dataset for ICH segmentation [<xref ref-type="bibr" rid="ref12">12</xref>]. Fully connected layers were added to the encoder output and subsequently fine-tuned using 3D CT data from Saiseikai Kumamoto Hospital to predict discharge outcomes (mRS score &#x2265;3) based on binary cross-entropy loss. This fine-tuning process allowed the pretrained encoder to adapt its high-level feature representations to the target classification task while capturing local imaging characteristics specific to the institutional dataset.</p><p>From the fine-tuned network, a 512-dimensional feature vector was obtained via a global average pooling layer applied to the final encoder output. This vector was concatenated with standardized tabular variables, and an L1-regularized logistic regression model was trained to construct the final late-fusion model. The use of L1 regularization promoted sparsity and facilitated feature selection among the combined multimodal inputs. Model training and fine-tuning were performed on the derivation cohort (n=352), and model performance was assessed on the same independent temporal validation cohort (n=175) as that used for GPT evaluation.</p></sec><sec id="s2-8"><title>Prompt Design and GPT Inference</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> summarizes the 2 types of prompt strategies used in this study. Detailed examples of each prompt type are provided in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref> (Prompts S1 and S2). 
(1) Tabular-image integration prompt: structured tabular variables were combined with noncontrast head CT images selected by a radiologic technologist. The GPT model was instructed to jointly process both modalities and integrate image-derived features with clinical data during reasoning (Prompt S1).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of GPT-based zero-shot inference and the machine learning&#x2013;assisted prompting workflow. CoT: chain-of-thought output; CT: computed tomography; DICOM: Digital Imaging and Communications in Medicine; JSON: JavaScript Object Notation; LR: logistic regression; ML: machine learning; PACS: Picture Archiving and Communication System.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e87062_fig01.png"/></fig><p>(2) Model-informed prompt: outputs from the ML-based model described above&#x2014;including predicted probabilities and standardized regression coefficients (<xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>)&#x2014;were appended to the GPT input. This design allowed GPT to incorporate prior model knowledge, including feature importance, together with patient-specific tabular and imaging data during inference (Prompt S2). The predicted probabilities were provided to support calibration of the GPT-generated probabilities, whereas the standardized regression coefficients were included to explicitly convey the direction of risk contribution for each feature, thereby improving clinical interpretability. Because the predicted probabilities already incorporate information from the model intercept, the intercept itself was not explicitly included.</p><p>Across both prompt types, GPT generated quantitative predictions of the probability of a poor functional outcome (discharge mRS score &#x2265;3) expressed as a percent (0%&#x2010;100%), accompanied by a concise explanation of its reasoning process. 
Representing the outcome as a continuous probability enabled quantitative evaluation of both discrimination and calibration. To enhance interpretability, a zero-shot chain-of-thought prompting approach was used, instructing the model to generate a brief reasoning statement before providing the final probability estimate [<xref ref-type="bibr" rid="ref16">16</xref>]. All outputs were formatted in a unified JavaScript Object Notation structure to ensure consistency across cases and facilitate automated postprocessing and evaluation of performance metrics.</p><p>As a preliminary test of model behavior before applying real patient data, we created synthetic clinical data representing hypothetical patients and asked the GPT models to estimate prognostic outcomes. A stroke specialist (MK) designed these pseudopatient profiles using the same variable structure as the real dataset from Saiseikai Kumamoto Hospital, representing relatively low-risk clinical scenarios (eg, mild neurological severity and smaller hematoma volume). For demonstration purposes, the synthetic cases were paired with brain CT images from the publicly available Brain Hemorrhage Segmentation Dataset [<xref ref-type="bibr" rid="ref12">12</xref>].</p></sec><sec id="s2-9"><title>Performance Evaluation</title><p>The discriminative performance of GPT-based predictions was evaluated using the area under the receiver operating characteristic curve (AUROC), sensitivity, specificity, positive predictive value (PPV), and negative predictive value (NPV). Sensitivity, specificity, the PPV, and the NPV were calculated at the optimal cutoff determined by the Youden index. In this study, the scaled Brier score and Nagelkerke <italic>R</italic>&#x00B2; were calculated as overall predictive accuracy measures. The scaled Brier score was defined as the improvement in the Brier score relative to a noninformative null model that assigns the overall outcome incidence as a constant predicted probability to all individuals. 
Nagelkerke <italic>R</italic>&#x00B2; was calculated using the same null model based on the improvement in log-likelihood. Model calibration was visually assessed using calibration plots. Calibration curves were generated using Locally Estimated Scatterplot Smoothing (LOESS) with a span parameter of 0.80.</p><p>The ML-based model was evaluated using the same temporal cohort as that used for GPT. For the ML-based and GPT-based models, 95% CIs were estimated using 2000 bootstrap resamples based on predicted probabilities from a single representative GPT inference run per patient to assess the stability of the performance metrics. To evaluate reproducibility, GPT inference was repeated 5 times for each case under identical conditions, and the consistency of predicted probabilities across runs was quantified using the intraclass correlation coefficient (ICC; single-score, 1-way random-effects model). Finally, decision curve analysis (DCA) was conducted to assess clinical utility by calculating the net benefit&#x2014;defined as the trade-off between true-positive and false-positive predictions weighted by the relative harm of false-positives&#x2014;across varying probability thresholds derived from GPT predictions [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Decision curves were smoothed using LOESS with a span parameter of 0.20 to reduce variability in the plotted net benefit curves.</p></sec><sec id="s2-10"><title>Image Contribution and Validity of GPT-Estimated Hematoma Volume</title><p>To evaluate the contribution of image inputs to GPT-based inference, we conducted an ablation analysis. Specifically, inference using structured tabular clinical data alone was compared with inference using both noncontrast head CT images and tabular data in terms of discriminative performance. 
To assess the validity of image-derived outputs generated by GPT, we examined the agreement between GPT-estimated hematoma volume and imaging-based reference standards. The measured hematoma volume was used as the reference standard and was calculated using the SYNAPSE VINCENT 3D image analysis system (Fujifilm Medical) by the same radiologic technologist who selected the representative JPEG slices and remained blinded to discharge mRS scores and baseline clinical admission data. GPT-based hematoma volume estimation was performed using the ABC/2 method, and the association between GPT-estimated and measured hematoma volumes was evaluated using the Spearman rank correlation coefficient.</p></sec><sec id="s2-11"><title>Sensitivity Analysis</title><p>To evaluate the impact of a good premorbid functional status, a sensitivity analysis was conducted restricting the cohort to patients who were functionally independent at baseline (premorbid mRS 0&#x2010;1), consistent with common practice in stroke research. The same models and evaluation procedures used in the primary analysis were applied.</p></sec><sec id="s2-12"><title>Software</title><p>Prompt generation and inference using GPT via the Azure OpenAI API were implemented in Python (version 3.13.7; Python Software Foundation). The ML experiments were conducted in Python (version 3.10.11; Python Software Foundation) using PyTorch (version 1.11.0; Meta Platforms). Evaluation of predictive performance and ICC was performed with RStudio (version 2025.05.0; Posit) in R statistical software (version 4.5.0; R Foundation for Statistical Computing). 
All code and implementation details are available in the GitHub repository referenced in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Patient Characteristics</title><p>Among the 175 patients in the temporal validation cohort, 139 (79.4%) had poor functional outcomes (mRS score=3&#x2010;6) at discharge, including those who died (mRS score=6). The distributions and missing rates of tabular variables used as model inputs for the GPT and ML models are summarized in <xref ref-type="table" rid="table1">Table 1</xref>.</p></sec><sec id="s3-2"><title>Preliminary Test of Model Behavior</title><p>The results of GPT-based prognostic inference using synthetic clinical data representing pseudopatients are shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. In this experiment, 2 different CT images were provided as input predictors for the same set of clinical variables. The structured tabular inputs were identical between the 2 cases, whereas the CT images differed in hematoma volume (A: small hematoma; B: large hematoma). GPT-5 produced different prognostic estimates according to hematoma size, assigning a higher risk to the large-hematoma case (B), confirming that the model&#x2019;s outputs appropriately varied in response to image-based prognostic cues.</p></sec><sec id="s3-3"><title>Predictive Performance</title><p>The predictive performance of the clinical risk score, the conventional ML-based model, and the GPT-based models is summarized in <xref ref-type="table" rid="table2">Table 2</xref>. For reference, the performance of the established FUNC score, previously validated in the same cohort in our prior study [<xref ref-type="bibr" rid="ref4">4</xref>], is also reported in <xref ref-type="table" rid="table2">Table 2</xref>. 
Overall, both the conventional ML-based model and the GPT-based models showed discriminative performance comparable to that of the FUNC score. The conventional ML-based model achieved an AUROC of 0.85 (95% CI 0.79&#x2010;0.90), along with high specificity and PPV. The scaled Brier score for the ML model was 0.23 (95% CI 0.06&#x2010;0.36), with a corresponding Nagelkerke <italic>R</italic>&#x00B2; of 0.35 (95% CI 0.18&#x2010;0.48), indicating good overall predictive performance. Among the zero-shot models, GPT-4.1 and GPT-5 achieved AUROCs of 0.84 (95% CI 0.77-0.91) and 0.85 (95% CI 0.78-0.91), respectively, indicating discrimination similar to that of the conventional ML-based model. However, the overall predictive performance of zero-shot GPT-5 was inferior to that of the null model. The scaled Brier score for GPT-5 was low and included negative values (&#x2212;0.17; 95% CI &#x2212;0.58 to 0.10); Nagelkerke <italic>R</italic>&#x00B2; was also negative (&#x2212;0.16; 95% CI &#x2212;0.67 to 0.14). Negative Nagelkerke <italic>R</italic>&#x00B2; values can occur when the model&#x2019;s predicted probabilities produce a log-likelihood worse than that of a noninformative null model, reflecting miscalibration of the probability estimates. For the ML-assisted GPT-4.1 model, AUROC remained largely unchanged, although sensitivity increased relative to the zero-shot GPT-4.1 model. In contrast, the ML-assisted GPT-5 model demonstrated the highest discrimination, with an AUROC of 0.87 (95% CI 0.81-0.92). This model also achieved high specificity and PPV. The scaled Brier score for this model was improved to 0.19 (95% CI &#x2212;0.10 to 0.36), and Nagelkerke <italic>R</italic>&#x00B2; was increased to 0.29 (95% CI &#x2212;0.001 to 0.46), suggesting partial improvement in overall predictive performance, although not surpassing that of the stand-alone ML model. 
The probability thresholds derived using the Youden method were consistently higher for the ML-assisted models than for their zero-shot counterparts. Specifically, the threshold increased from 0.68 (95% CI 0.50-0.73) to 0.82 (95% CI 0.61-0.91) for GPT-4.1 and from 0.46 (95% CI 0.37-0.52) to 0.57 (95% CI 0.50-0.73) for GPT-5. The reproducibility of GPT-based predictions across 5 repeated inference runs is summarized in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>. Both GPT-4.1 and GPT-5 demonstrated high interrun agreement, which was further enhanced by model-informed prompting.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Predictive performance of GPT-based models, an ML<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>-based model, and a clinical risk score.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Risk score or model</td><td align="left" valign="bottom">AUROC<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="bottom">Sensitivity</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom">PPV<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="bottom">NPV<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="bottom">Scaled BS<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="bottom">Nagelkerke <italic>R</italic>&#x00B2;</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="8">Risk score, estimated performance metrics (95% CI)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>FUNC score</td><td align="left" valign="top">0.80 (0.73-0.86)</td><td align="left" valign="top">0.61 (0.53-0.70)</td><td align="left" valign="top">0.94 (0.86-1.00)</td><td align="left" 
valign="top">0.98 (0.94-1.00)</td><td align="left" valign="top">0.39 (0.34-0.45)</td><td align="left" valign="top">&#x2003;</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top" colspan="8">Model type, estimated performance metrics (95% CI)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ML-based model</td><td align="left" valign="top">0.85 (0.79-0.90)</td><td align="left" valign="top">0.72 (0.62-0.85)</td><td align="left" valign="top">0.94 (0.81-1.00)</td><td align="left" valign="top">0.98 (0.94-1.00)</td><td align="left" valign="top">0.47 (0.39-0.59)</td><td align="left" valign="top">0.23 (0.06-0.36)</td><td align="left" valign="top">0.35 (0.18-0.48)</td></tr><tr><td align="left" valign="top" colspan="8">Zero-shot model, estimated performance metrics (95% CI)</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4.1</td><td align="left" valign="top">0.84 (0.77-0.91)</td><td align="left" valign="top">0.65 (0.54-0.89)</td><td align="left" valign="top">0.92 (0.70-1.00)</td><td align="left" valign="top">0.97 (0.91-1.00)</td><td align="left" valign="top">0.41 (0.32-0.66)</td><td align="left" valign="top">0.13 (&#x2212;0.15 to 0.33)</td><td align="left" valign="top">0.21 (&#x2212;0.11 to 0.42)</td></tr><tr><td align="left" valign="top">&#x2003;GPT-5</td><td align="left" valign="top">0.85 (0.78-0.91)</td><td align="left" valign="top">0.68 (0.58-0.85)</td><td align="left" valign="top">0.92 (0.76-1.00)</td><td align="left" valign="top">0.97 (0.92-1.00)</td><td align="left" valign="top">0.42 (0.32-0.60)</td><td align="left" valign="top">&#x2212;0.17 (&#x2212;0.58 to 0.10)</td><td align="left" valign="top">&#x2212;0.16 (&#x2212;0.67 to 0.14)</td></tr><tr><td align="left" valign="top" colspan="8">ML-assisted model, estimated performance metrics (95% CI)</td></tr><tr><td align="left" valign="top">&#x2003;GPT-4.1-assisted by ML</td><td align="left" valign="top">0.84 
(0.78-0.90)</td><td align="left" valign="top">0.71 (0.62-0.85)</td><td align="left" valign="top">0.92 (0.77-1.00)</td><td align="left" valign="top">0.97 (0.93-1.00)</td><td align="left" valign="top">0.45 (0.34-0.59)</td><td align="left" valign="top">0.22 (0.02-0.37)</td><td align="left" valign="top">0.34 (0.14-0.48)</td></tr><tr><td align="left" valign="top">&#x2003;GPT-5-assisted by ML</td><td align="left" valign="top">0.87 (0.81-0.92)</td><td align="left" valign="top">0.72 (0.61-0.82)</td><td align="left" valign="top">0.94 (0.87-1.00)</td><td align="left" valign="top">0.98 (0.96-1.00)</td><td align="left" valign="top">0.47 (0.34-0.59)</td><td align="left" valign="top">0.19 (&#x2212;0.10 to 0.36)</td><td align="left" valign="top">0.29 (&#x2212;0.001 to 0.46)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>ML: machine learning.</p></fn><fn id="table2fn2"><p><sup>b</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table2fn3"><p><sup>c</sup>PPV: positive predictive value.</p></fn><fn id="table2fn4"><p><sup>d</sup>NPV: negative predictive value.</p></fn><fn id="table2fn5"><p><sup>e</sup>BS: Brier score.</p></fn></table-wrap-foot></table-wrap><p>Values are presented as point estimates with 95% CIs in parentheses. All 95% CIs were estimated using 2000 bootstrap resamples based on predicted probabilities from a single representative inference run.</p><p>Calibration plots for the ML model are shown in <xref ref-type="supplementary-material" rid="app10">Multimedia Appendix 10</xref>, and those for the 5 repeated inference runs are presented in <xref ref-type="fig" rid="figure2">Figure 2</xref>. Both GPT-4.1 and GPT-5 initially tended to underestimate predicted probabilities, as indicated by calibration curves lying above the ideal diagonal line. 
After model-informed prompting, this bias appeared to be partially attenuated, with a tendency toward closer agreement between the observed and predicted probabilities across risk levels. The improvement was more apparent for GPT-5 but modest and primarily confined to the higher-risk range for GPT-4.1. Taken together, these findings suggest that model-informed prompting may influence discrimination, reproducibility, and calibration; however, it does not fully address the limitations in predictive reliability.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Calibration of GPT-4.1 and GPT-5 with and without model-informed prompting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e87062_fig02.png"/></fig><p>Calibration plots for (A) GPT-4.1, (B) GPT-5, (C) model-informed GPT-4.1, and (D) model-informed GPT-5 in the validation cohort. The relationship between predicted probabilities (<italic>x</italic>-axis) and observed probabilities (<italic>y</italic>-axis) is shown using a LOESS-smoothed calibration curve. The shaded area represents the pointwise 95% CI estimated by patient-level bootstrap resampling. Predicted probabilities from 5 independent inference runs are overlaid to illustrate interrun variability and reproducibility. The dashed diagonal line indicates perfect calibration.</p></sec><sec id="s3-4"><title>Net Benefit</title><p><xref ref-type="fig" rid="figure3">Figure 3</xref> shows the results of the DCA, defining poor functional outcome as discharge mRS scores 3&#x2010;6. At threshold probabilities up to approximately 0.5, the net benefit was generally similar among the GPT-4.1 model, the ML-assisted GPT-4.1 model, the ML model, and the treat-all strategy (panel A). 
Specifically, at a threshold probability of 0.5, the net benefit was 0.61 (95% CI 0.51&#x2010;0.69) for the GPT-4.1 model and 0.60 (95% CI 0.50&#x2010;0.69) for the ML-assisted GPT-4.1 model, which were broadly comparable to that of the treat-all strategy (0.59, 95% CI 0.47&#x2010;0.69). At higher threshold probabilities, both the GPT-4.1 model and the ML-assisted GPT-4.1 model showed greater net benefit than the treat-all strategy. At a threshold probability of 0.8, the net benefit of the treat-all strategy was &#x2212;0.03 (95% CI &#x2212;0.31 to 0.23), whereas it was 0.35 (95% CI 0.27&#x2010;0.43) for the GPT-4.1 model and 0.49 (95% CI 0.37&#x2010;0.60) for the ML-assisted GPT-4.1 model.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Decision curve analysis of GPT-4.1 and GPT-5 with and without model-informed prompting. ML: machine learning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v5i1e87062_fig03.png"/></fig><p>Decision curves illustrate the relationship between the threshold probability and net benefit for GPT-4.1 (A) and GPT-5 (B). The red line represents the stand-alone GPT model without model-informed prompting, the blue line represents the ML-assisted GPT model incorporating outputs from the ML model, and the green line represents the ML-based model alone. The gray &#x201C;All&#x201D; line indicates the net benefit under the assumption that all patients experienced poor functional outcomes, whereas the black &#x201C;None&#x201D; line indicates the net benefit under the assumption that no patients experienced poor functional outcomes. Net benefit curves were estimated using LOESS smoothing to reduce variability and facilitate visual comparison across models. 
The vertical axis represents the net benefit, and the horizontal axis represents the threshold probability derived from each model&#x2019;s predicted probability.</p><p>In contrast, for the GPT-5 model, the net benefit at a threshold probability of 0.5 was 0.50 (95% CI 0.42&#x2010;0.57), which appeared lower than that of the treat-all strategy (0.59, 95% CI 0.47&#x2010;0.69; panel B). For the ML-assisted GPT-5 model, the net benefit at the same threshold probability was 0.59 (95% CI 0.50&#x2010;0.67), which was broadly comparable to that of the treat-all strategy. At a threshold probability of 0.8, the net benefit was 0.16 (95% CI 0.11&#x2010;0.22) for the GPT-5 model and 0.46 (95% CI 0.38&#x2010;0.54) for the ML-assisted GPT-5 model, both of which appeared greater than that of the treat-all strategy. The ML-assisted GPT-5 model showed consistently higher net benefit than the GPT-5 model alone across the higher threshold range.</p></sec><sec id="s3-5"><title>Effects of Image Input on Discrimination and Volume Estimation</title><p>In the ablation analysis, the discriminative performance of the GPT-5 model using structured tabular clinical data alone yielded an AUROC of 0.79 (95% CI 0.71&#x2010;0.86; <xref ref-type="supplementary-material" rid="app11">Multimedia Appendix 11</xref>). Discriminative performance improved with the addition of noncontrast head CT images, and the increase in AUROC was statistically significant based on 2000 bootstrap resamples (<italic>P</italic>=.02). Because the CT images were provided as JPEG files without spatial calibration or physical scale metadata, the volumes generated by GPT do not represent physically calibrated measurements but rather values inferred from visual proportions within the images. This methodological limitation should be considered when interpreting the results. 
To assess the extent to which GPT could approximate hematoma volume despite this constraint, we examined the correlation between GPT-derived estimates, obtained using the ABC/2 method, and the 3D-measured reference volumes. A strong positive monotonic association was observed between GPT-estimated hematoma volume and the 3D-measured hematoma volume obtained with the SYNAPSE VINCENT system (Spearman &#x03C1;=0.79, <italic>P</italic>&#x003C;.001; <xref ref-type="supplementary-material" rid="app12">Multimedia Appendix 12</xref>).</p></sec><sec id="s3-6"><title>Sensitivity Analysis in Patients Functionally Independent at Baseline</title><p>The analysis restricted to patients with premorbid mRS scores of 0&#x2010;1 produced overall patterns similar to those observed in the primary analysis, although the predictive performance was slightly reduced for all models (<xref ref-type="supplementary-material" rid="app13">Multimedia Appendix 13</xref>). The FUNC score showed an AUROC of 0.75 (95% CI 0.68&#x2010;0.82). The conventional ML-based model achieved an AUROC of 0.80 (95% CI 0.72&#x2010;0.87), whereas GPT-4.1 and GPT-5 showed AUROCs of 0.79 (95% CI 0.71&#x2010;0.87) and 0.80 (95% CI 0.72&#x2010;0.87), respectively. For GPT-5, the scaled Brier score and Nagelkerke <italic>R</italic>&#x00B2; included negative values, indicating suboptimal calibration and overall predictive performance. Incorporating ML-derived information did not result in a clear improvement in discrimination; however, overall performance indices, including the scaled Brier score and Nagelkerke <italic>R</italic>&#x00B2;, improved modestly. Calibration plots (<xref ref-type="supplementary-material" rid="app14">Multimedia Appendix 14</xref>) showed that, consistent with the primary analysis, GPT models tended to underestimate predicted probabilities. The discrepancy between the observed and predicted probabilities was partially attenuated but not fully resolved after incorporating ML-derived information. 
In the DCA (<xref ref-type="supplementary-material" rid="app15">Multimedia Appendix 15</xref>), similar to the primary analysis, the model-based strategies showed only limited net benefit, primarily in higher threshold ranges.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>In this study, even without task-specific training, both GPT-4.1 and GPT-5 demonstrated stable discrimination comparable to that of conventional ML models. Ablation analyses indicated that the inclusion of noncontrast head CT images significantly improved discriminative performance. However, GPT-based models alone exhibited limitations in calibration. Incorporating ML-derived outputs into GPT prompts yielded modest improvements in calibration, reproducibility, and decision-curve net benefit compared with stand-alone GPT models; however, these improvements did not translate into exceeding the performance of the conventional ML model.</p></sec><sec id="s4-2"><title>Prognostic Models Based on Specialist-Derived Variables</title><p>Several clinical scoring systems have been proposed to predict functional outcomes after ICH. Among them, the ICH FUNC score is one of the most widely used tools [<xref ref-type="bibr" rid="ref19">19</xref>]. It estimates the probability of achieving functional independence&#x2014;defined as a Glasgow Outcome Scale score &#x2265;4&#x2014;at 90 days after ICH occurrence based on the following 5 key factors: ICH volume, age, ICH location, Glasgow Coma Scale (GCS) score, and pre-ICH cognitive impairment. In our cohort, patients with poor outcomes showed unfavorable values across these components (<xref ref-type="supplementary-material" rid="app16">Multimedia Appendix 16</xref>). 
Although the FUNC score demonstrated good predictive ability (AUROC 0.80, 95% CI 0.73&#x2010;0.86; <xref ref-type="table" rid="table2">Table 2</xref>) [<xref ref-type="bibr" rid="ref4">4</xref>], its calculation requires specialist assessment, limiting its applicability in nonspecialist emergency settings.</p><p>To improve predictive precision and automation, several ML models have been developed using various variables, achieving AUROC values of 0.80 to 0.90 [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. However, most models rely on expert-interpreted data, including GCS scoring and detailed image analysis, making them difficult to implement in EDs lacking specialist support. This limitation highlights the importance of investigating whether GPT models can achieve comparable predictive performance using only routinely available, nonspecialist data.</p></sec><sec id="s4-3"><title>Automated Prognostic Prediction Based on Nonspecialist Data</title><p>In recent years, automated approaches for predicting poststroke outcomes without reliance on specialist interpretation have been increasingly explored [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]. Among these, end-to-end deep learning models that optimize the entire pipeline&#x2014;from image input to outcome prediction&#x2014;without explicit region-of-interest extraction or manual feature engineering can streamline the inference process and support rapid decision-making under the time constraints of the ED. 
Indeed, end-to-end deep learning models using noncontrast CT images alone have reported discriminative performance with AUROC values of approximately 0.83 [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>In our previous ML-based studies, we developed a fully automated deep learning preprocessing pipeline and achieved high predictive performance by integrating latent feature representations extracted from imaging data with routinely available clinical variables that can be obtained by nonspecialists in the ED [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. However, in this framework, imaging information was encoded as latent feature vectors. Although the overall contribution of these image-derived features could be quantified using variable importance measures, intuitively understanding which specific anatomical regions or local imaging findings contributed to poor outcomes remained challenging. Thus, limitations in model interpretability persisted.</p><p>Against this background, this study explored an approach based on multimodal LLMs, focusing on their ability to convey the meaning of imaging findings and their clinical implications to clinicians through natural language interaction. As illustrated in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, this approach offers a degree of interpretability by enabling natural language explanations of imaging findings and their relevance to clinical decision-making. Even in a zero-shot setting without task-specific fine-tuning, GPT demonstrated discriminative performance within the range reported for widely used clinical scores such as the FUNC score. 
However, direct comparison was not feasible because of differences in outcome assessment timing, and therefore, these findings require cautious interpretation.</p><p>GPT-5 retained overall good discriminative performance but tended to produce predicted probabilities slightly lower than the observed outcome frequencies, indicating persistent challenges in calibration. The model with ML-derived information (such as predicted probabilities and regression coefficients) incorporated into the prompt demonstrated improved calibration compared with GPT alone; however, both the scaled Brier score and Nagelkerke <italic>R</italic>&#x00B2; of this model remained inferior to those of the stand-alone ML model. The latent features provided to the model were high-dimensional representations extracted by a deep learning U-Net, the semantic meaning of which is not directly interpretable by the LLM. Accordingly, explanations generated using these features should be regarded as post hoc rationalizations rather than true mechanistic interpretations and may represent synthetic clinical narratives. In addition, integrating the prior model&#x2019;s predicted probability into the prompt likely introduced anchoring bias, whereby the LLM&#x2019;s estimates are influenced by the supplied numerical value. This anchoring effect may partly explain the convergence between zero-shot and ML-assisted outputs and should be considered a key factor driving the observed performance changes. Routing predictions through GPT may reduce statistical calibration relative to the baseline model and produce a measurable trade-off between natural language interpretability and predictive reliability. 
This approach did not fully resolve the observed reliability limitations; further work is needed to develop robust guardrail mechanisms to ensure the safe clinical use of generative artificial intelligence systems.</p><p>The GPT-based pipeline evaluated in this study relies on a human-curated workflow, incorporating outputs from an ML model and manually selected representative CT slices, and, therefore, does not constitute a fully automated system for use in the ED. Rather than an end-to-end autonomous inference model, this approach should be viewed as a decision-support process that includes human curation. Although the required inputs do not involve assessments that can only be performed by stroke specialists, such as GCS scoring, it should be noted that effective slice selection still requires a certain level of radiological literacy. In addition, when GPT-5 was deployed via an API using the reasoning-minimal setting, inference required approximately 8.6 seconds per case, and token-based usage costs were incurred. In contrast, conventional ML models typically allow rapid inference with negligible per-case inference costs. Accordingly, careful consideration is required when applying large foundation models across all ED cases from an operational and cost perspective.</p></sec><sec id="s4-4"><title>Safety of GPT-Based Clinical Decision Support</title><p>The ablation analysis suggests that adding noncontrast head CT images provides information beyond structured tabular clinical data, contributing to prognostic inference. However, several considerations should be taken into account when interpreting these findings from a safety perspective. 
Previous studies on ICH subtype classification using noncontrast CT have reported that zero-shot multimodal LLMs, such as GPT-4o, Gemini 2.0 Flash, and Claude 3.5 Sonnet V2, do not achieve performance comparable to conventional deep learning models, including ResNet-50 and Vision Transformers, indicating persistent challenges in pixel-level image recognition [<xref ref-type="bibr" rid="ref30">30</xref>]. In addition, the CT images provided to GPT in this study were in the JPEG format and did not explicitly encode physical scale information such as pixel spacing, slice thickness, or field of view. Consequently, absolute hematoma volume cannot be derived from these images. Within the ABC/2 framework, GPT-5 appears to rely on relatively coarse visual features&#x2014;such as the longitudinal extent and overall morphology of the hematoma&#x2014;to approximate relative hematoma burden, and these estimates show some correlation with reference measurements. Nevertheless, such estimates should be interpreted cautiously as imprecise and supplementary indicators rather than reliable quantitative measurements.</p><p>Beyond quantitative performance metrics, we conducted a brief qualitative case review focused on safety-relevant failure modes, particularly instances in which GPT produced high-confidence but incorrect predictions. Most errors were attributable to misinterpretation of CT imaging findings. In a false-positive thalamic hemorrhage case, the model assigned a high probability of mRS score 3&#x2010;6 at discharge despite an actual discharge mRS score of 1. The model appeared to interpret choroid plexus calcification within the lateral ventricle as hemorrhage and to infer ventricular extension, leading to risk overestimation. Conversely, in a false-negative case involving a large pontine hemorrhage, the model assigned a low probability of mRS score 3&#x2010;6 despite an actual discharge mRS score of 6. 
In this case, linear hyperattenuation caused by motion artifact reduced lesion conspicuity, resulting in underestimation of hemorrhage severity. These observations are consistent with previously reported limitations in the image-understanding capabilities of multimodal LLMs, in which judgment can become unstable under visual conditions that strongly depend on lesion contrast and visibility, as well as the presence of hyperattenuating structures or imaging artifacts [<xref ref-type="bibr" rid="ref30">30</xref>]. Taken together, these findings underscore the importance of retaining human oversight and implementing safety guardrails when deploying multimodal LLMs in clinical decision-support settings.</p></sec><sec id="s4-5"><title>Utility in Clinical Decision Support</title><p>The clinical intervention scenario considered in this study involves decision support at ED arrival, specifically whether to initiate early transfer planning or to consider a home-discharge pathway for patients predicted to have poor functional outcomes at discharge (mRS score 3&#x2010;6). Patients with ICH often present with severe clinical features, which may lead clinicians to adopt pessimistic recovery expectations and favor intensive or transfer-oriented management strategies. Because accurately predicting poor functional outcomes (mRS score &#x2265;3) remains challenging in routine practice, precautionary decisions such as early transfer planning may be implemented even for patients who would ultimately achieve meaningful recovery. In this context, a prediction model could, in principle, help identify patients at relatively low risk of poor functional outcome who might be considered for home discharge rather than routine transfer-oriented care. However, the DCA for the present cohort of patients with acute ICH indicated that both the ML and GPT-based models provided net benefit only at relatively high decision thresholds compared with a treat-all strategy. 
Furthermore, the GPT-based models did not demonstrate a meaningful advantage in net benefit over the ML model, suggesting a limited incremental value in this setting. Restricting the analysis to patients who were functionally independent prior to onset did not materially alter these findings.</p><p>In this study, the prevalence of poor functional outcome approached 80%, resulting in a decision-curve pattern in which a treat-all strategy retained net benefit across a wide range of threshold probabilities. Consequently, the clinical utility of any predictive model in this setting is mathematically constrained to higher decision thresholds, because poor outcomes are overwhelmingly common and the default strategy already captures most events. From a decision-analytic perspective, net benefit depends not only on discrimination but also on model calibration [<xref ref-type="bibr" rid="ref31">31</xref>]. The tendency toward risk underestimation observed in the stand-alone GPT models may therefore partly explain their reduced net benefit in certain threshold regions. Future research may benefit from developing prediction frameworks that jointly model outcome severity across multiple mRS strata (eg, mRS score 3&#x2010;4, mRS score 5&#x2010;6, and death). Such approaches could enable clearer differentiation of the clinical consequences of false-positive and false-negative predictions and may support more context-sensitive, interpretable, and clinically actionable decision support in emergency care settings.</p></sec><sec id="s4-6"><title>Limitations</title><p>This study had certain limitations. First, the dataset was derived from a single institution in Japan with a limited sample size, which may restrict the generalizability of the findings. Differences in the prevalence of hypertensive intracerebral hemorrhage and imaging acquisition protocols across institutions and regions may limit applicability to other populations. 
Second, this study focused on discharge mRS score as the prediction target, whereas many prior studies, including those using the FUNC score, evaluated functional outcomes at 90 days; therefore, differences in outcome timing introduce limitations in direct comparisons. Third, for multimodal inference, representative CT slices were manually selected by a radiologic technologist and converted into JPEG format, which may have introduced selection bias. Fourth, because single imputation was used instead of multiple imputation, some degree of bias may remain, even though the extent of missingness was minimal. Fifth, the ML pipeline used an L1-regularized logistic regression model, which is effective for variable selection but can produce biased coefficient estimates due to shrinkage; consequently, predicted probabilities derived from these coefficients may also be biased. Finally, for privacy protection, GPT-based inference should be performed within secure environments, such as Azure OpenAI, which imposes technical, financial, and governance constraints on clinical implementation.</p></sec><sec id="s4-7"><title>Conclusion</title><p>This study showed that zero-shot GPT models achieved discrimination comparable to that of a conventional ML model but did not demonstrate superior predictive performance. GPT-based models alone showed limitations in calibration, and routing predictions through GPT was associated with reduced statistical reliability, suggesting a trade-off between natural language interpretability and predictive robustness. Incorporating outputs from lightweight ML models into the prompts yielded modest improvements, although with remaining reliability concerns. The clinical usefulness of such systems appears to be context-dependent and may be restricted to higher decision-threshold regions in high-prevalence settings. 
Safe clinical deployment will likely require both system-level guardrails and human-in-the-loop oversight to ensure that final decisions remain under clinician control. These findings should also be interpreted in light of study-specific constraints, including nonautomated image input and inherent uncertainties in visual feature interpretation.</p></sec></sec></body><back><ack><p>We would like to express our sincere gratitude to the staff of Saiseikai Kumamoto Hospital for their generous support in data extraction. During manuscript preparation, generative artificial intelligence (AI) was used for English language editing. All AI-assisted text was reviewed and approved by the authors, who take full responsibility for the content.</p></ack><notes><sec><title>Funding</title><p>This study was supported by the Japan Society for the Promotion of Science (JSPS) KAKENHI (grant number JP25H01083). This article is also based on results obtained from a project (JPNP25006) commissioned by the New Energy and Industrial Technology Development Organization (NEDO).</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are not publicly available due to the sensitive nature of the data but are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>KM had full access to all the study data and was responsible for the integrity and accuracy of the data analysis. Concept and design: KM, YN, SY, MK, and NN. Data analysis and interpretation: KM, RT, YF, and KI. Manuscript drafting: KM. Critical manuscript revision for important intellectual content: MK. Statistical analysis: KM. Obtained funding: MK. Technical and material support: SY, KM, and KT. 
Supervision: JC, SY, MK, and NN.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb2">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb3">CT</term><def><p>computed tomography</p></def></def-item><def-item><term id="abb4">DCA</term><def><p>decision curve analysis</p></def></def-item><def-item><term id="abb5">DICOM</term><def><p>Digital Imaging and Communications in Medicine</p></def></def-item><def-item><term id="abb6">ED</term><def><p>emergency department</p></def></def-item><def-item><term id="abb7">GCS</term><def><p>Glasgow Coma Scale</p></def></def-item><def-item><term id="abb8">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb9">ICH</term><def><p>intracerebral hemorrhage</p></def></def-item><def-item><term id="abb10">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb11">LOESS</term><def><p>Locally Estimated Scatterplot Smoothing</p></def></def-item><def-item><term id="abb12">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb13">mRS</term><def><p>modified Rankin Scale</p></def></def-item><def-item><term id="abb14">NPV</term><def><p>negative predictive value</p></def></def-item><def-item><term id="abb15">PPV</term><def><p>positive predictive value</p></def></def-item><def-item><term id="abb16">TRIPOD</term><def><p>Transparent Reporting of a Multivariable Model for Individual Prognosis or Diagnosis</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Magid-Bernstein</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Girard</surname><given-names>R</given-names> </name><name name-style="western"><surname>Polster</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Cerebral hemorrhage: pathophysiology, treatment, and future directions</article-title><source>Circ Res</source><year>2022</year><month>04</month><day>15</day><volume>130</volume><issue>8</issue><fpage>1204</fpage><lpage>1229</lpage><pub-id pub-id-type="doi">10.1161/CIRCRESAHA.121.319949</pub-id><pub-id pub-id-type="medline">35420918</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Feigin</surname><given-names>VL</given-names> </name><name name-style="western"><surname>Abate</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Abate</surname><given-names>YH</given-names> </name><etal/></person-group><article-title>Global, regional, and national burden of stroke and its risk factors, 1990&#x2013;2021: a systematic analysis for the Global Burden of Disease Study 2021</article-title><source>Lancet Neurol</source><year>2024</year><month>10</month><volume>23</volume><issue>10</issue><fpage>973</fpage><lpage>1003</lpage><pub-id pub-id-type="doi">10.1016/S1474-4422(24)00369-7</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Matsumoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ishihara</surname><given-names>K</given-names> </name><name name-style="western"><surname>Matsuda</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Machine learning-based prediction for in-hospital mortality after acute intracerebral hemorrhage using real-world clinical and image data</article-title><source>J Am Heart 
Assoc</source><year>2024</year><month>12</month><day>17</day><volume>13</volume><issue>24</issue><fpage>e036447</fpage><pub-id pub-id-type="doi">10.1161/JAHA.124.036447</pub-id><pub-id pub-id-type="medline">39655759</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Matsumoto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Suzuki</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ishihara</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Performance of multimodal prediction models for intracerebral hemorrhage outcomes using real-world data</article-title><source>Int J Med Inform</source><year>2025</year><month>10</month><volume>202</volume><fpage>105989</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2025.105989</pub-id><pub-id pub-id-type="medline">40412140</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>P&#x00E9;rez Del Barrio</surname><given-names>A</given-names> </name><name name-style="western"><surname>Esteve Dom&#x00ED;nguez</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Men&#x00E9;ndez Fern&#x00E1;ndez-Miranda</surname><given-names>P</given-names> </name><etal/></person-group><article-title>A deep learning model for prognosis prediction after intracranial hemorrhage</article-title><source>J Neuroimaging</source><year>2023</year><month>03</month><volume>33</volume><issue>2</issue><fpage>218</fpage><lpage>226</lpage><pub-id pub-id-type="doi">10.1111/jon.13078</pub-id><pub-id pub-id-type="medline">36585957</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Zhao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>B</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>CT-based deep learning model for predicting hospital discharge outcome in spontaneous intracerebral hemorrhage</article-title><source>Eur Radiol</source><year>2024</year><month>07</month><volume>34</volume><issue>7</issue><fpage>4417</fpage><lpage>4426</lpage><pub-id pub-id-type="doi">10.1007/s00330-023-10505-6</pub-id><pub-id pub-id-type="medline">38127074</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lei</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A survey on medical large language models: technology, application, trustworthiness, and future directions</article-title><source>arXiv</source><comment>Preprint posted online on Jun 6, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.03712</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><article-title>Introducing GPT-5</article-title><source>OpenAI</source><year>2025</year><access-date>2025-08-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="http://openai.com/index/introducing-gpt-5/">openai.com/index/introducing-gpt-5/</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Safari</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Hu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Eidex</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name></person-group><article-title>Performance of GPT&#x2011;5 in brain tumor MRI reasoning</article-title><conf-name>SPIE Medical Imaging 2026</conf-name><conf-date>Feb 15-19, 2026</conf-date><pub-id pub-id-type="doi">10.1117/12.3087328</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Safari</surname><given-names>M</given-names> </name><name name-style="western"><surname>Qiu</surname><given-names>RLJ</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name></person-group><article-title>Capabilities of GPT&#x2011;5 on multimodal medical reasoning</article-title><conf-name>SPIE Medical Imaging 2026</conf-name><conf-date>Feb 15-19, 2026</conf-date><pub-id pub-id-type="doi">10.1117/12.3086794</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Eidex</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Safari</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name></person-group><article-title>Benchmarking GPT&#x2011;5 for zero&#x2011;shot multimodal medical reasoning in radiology and radiation oncology</article-title><conf-name>SPIE Medical Imaging 2026</conf-name><conf-date>Feb 15-19, 2026</conf-date><pub-id pub-id-type="doi">10.1117/12.3087763</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Cao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Rekik</surname><given-names>I</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ouyang</surname><given-names>X</given-names> </name></person-group><article-title>A 3D multi-class brain hemorrhage segmentation dataset</article-title><source>Machine Learning in Medical Imaging MLMI 2023</source><year>2024</year><publisher-name>Springer Nature</publisher-name><pub-id pub-id-type="doi">10.1007/978-3-031-45673-2_15</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gallifant</surname><given-names>J</given-names> </name><name name-style="western"><surname>Afshar</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Ameen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The TRIPOD-LLM reporting guideline for studies using large language models</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>60</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03425-5</pub-id><pub-id pub-id-type="medline">39779929</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>KJ</given-names> </name><name name-style="western"><surname>Tilling</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Cornish</surname><given-names>RP</given-names> </name><etal/></person-group><article-title>Framework for the treatment and reporting of missing data in observational studies: the treatment And reporting of missing data in observational studies framework</article-title><source>J Clin Epidemiol</source><year>2021</year><month>06</month><volume>134</volume><fpage>79</fpage><lpage>88</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2021.01.008</pub-id><pub-id pub-id-type="medline">33539930</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><article-title>Foundry models sold by Azure</article-title><source>Microsoft</source><access-date>2025-08-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/models">https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/models</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>SS</given-names> </name><name 
name-style="western"><surname>Iwasawa</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kojima</surname><given-names>T</given-names> </name><name name-style="western"><surname>Matsuo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Reid</surname><given-names>M</given-names> </name></person-group><article-title>Large language models are zero-shot reasoners</article-title><conf-name>Advances in Neural Information Processing Systems 35</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><conf-loc>New Orleans, LA</conf-loc><fpage>22199</fpage><lpage>22213</lpage><pub-id pub-id-type="doi">10.52202/068431-1613</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vickers</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Elkin</surname><given-names>EB</given-names> </name></person-group><article-title>Decision curve analysis: a novel method for evaluating prediction models</article-title><source>Med Decis Making</source><year>2006</year><volume>26</volume><issue>6</issue><fpage>565</fpage><lpage>574</lpage><pub-id pub-id-type="doi">10.1177/0272989X06295361</pub-id><pub-id pub-id-type="medline">17099194</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vickers</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>van Calster</surname><given-names>B</given-names> </name><name name-style="western"><surname>Steyerberg</surname><given-names>EW</given-names> </name></person-group><article-title>A simple step-by-step guide to interpreting decision curve analysis</article-title><source>Diagn Progn Res</source><year>2019</year><volume>3</volume><issue>1</issue><fpage>18</fpage><pub-id 
pub-id-type="doi">10.1186/s41512-019-0064-7</pub-id><pub-id pub-id-type="medline">31592444</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rost</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>EE</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Prediction of functional outcome in patients with primary intracerebral hemorrhage: the FUNC score</article-title><source>Stroke</source><year>2008</year><month>08</month><volume>39</volume><issue>8</issue><fpage>2304</fpage><lpage>2309</lpage><pub-id pub-id-type="doi">10.1161/STROKEAHA.107.512202</pub-id><pub-id pub-id-type="medline">18556582</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>YF</given-names> </name><name name-style="western"><surname>Song</surname><given-names>XL</given-names> </name><etal/></person-group><article-title>An interpretable artificial intelligence model based on CT for prognosis of intracerebral hemorrhage: a multicenter study</article-title><source>BMC Med Imaging</source><year>2024</year><month>07</month><day>9</day><volume>24</volume><issue>1</issue><fpage>170</fpage><pub-id pub-id-type="doi">10.1186/s12880-024-01352-y</pub-id><pub-id pub-id-type="medline">38982357</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pei</surname><given-names>L</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>T</given-names> 
</name><name name-style="western"><surname>Xu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ni</surname><given-names>C</given-names> </name></person-group><article-title>A radiomics model based on CT images combined with multiple machine learning models to predict the prognosis of spontaneous intracerebral hemorrhage</article-title><source>World Neurosurg</source><year>2024</year><month>01</month><volume>181</volume><fpage>e856</fpage><lpage>e866</lpage><pub-id pub-id-type="doi">10.1016/j.wneu.2023.11.002</pub-id><pub-id pub-id-type="medline">37931880</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nawabi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kniep</surname><given-names>H</given-names> </name><name name-style="western"><surname>Elsayed</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Imaging-based outcome prediction of acute intracerebral hemorrhage</article-title><source>Transl Stroke Res</source><year>2021</year><month>12</month><volume>12</volume><issue>6</issue><fpage>958</fpage><lpage>967</lpage><pub-id pub-id-type="doi">10.1007/s12975-021-00891-8</pub-id><pub-id pub-id-type="medline">33547592</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name 
name-style="western"><surname>Xu</surname><given-names>B</given-names> </name></person-group><article-title>Prognostic prediction of hypertensive intracerebral hemorrhage using CT radiomics and machine learning</article-title><source>Brain Behav</source><year>2021</year><month>05</month><volume>11</volume><issue>5</issue><fpage>e02085</fpage><pub-id pub-id-type="doi">10.1002/brb3.2085</pub-id><pub-id pub-id-type="medline">33624945</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qi</surname><given-names>X</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>C</given-names> </name></person-group><article-title>Machine learning-based perihematomal tissue features to predict clinical outcome after spontaneous intracerebral hemorrhage</article-title><source>J Stroke Cerebrovasc Dis</source><year>2022</year><month>06</month><volume>31</volume><issue>6</issue><fpage>106475</fpage><pub-id pub-id-type="doi">10.1016/j.jstrokecerebrovasdis.2022.106475</pub-id><pub-id pub-id-type="medline">35417846</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lim</surname><given-names>MJR</given-names> </name><name name-style="western"><surname>Quek</surname><given-names>RHC</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>KJ</given-names> </name><etal/></person-group><article-title>Machine learning models prognosticate functional outcomes better than clinical scores in spontaneous intracerebral 
haemorrhage</article-title><source>J Stroke Cerebrovasc Dis</source><year>2022</year><month>02</month><volume>31</volume><issue>2</issue><fpage>106234</fpage><pub-id pub-id-type="doi">10.1016/j.jstrokecerebrovasdis.2021.106234</pub-id><pub-id pub-id-type="medline">34896819</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hall</surname><given-names>AN</given-names> </name><name name-style="western"><surname>Weaver</surname><given-names>B</given-names> </name><name name-style="western"><surname>Liotta</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Identifying modifiable predictors of patient outcomes after intracerebral hemorrhage with machine learning</article-title><source>Neurocrit Care</source><year>2021</year><month>02</month><volume>34</volume><issue>1</issue><fpage>73</fpage><lpage>84</lpage><pub-id pub-id-type="doi">10.1007/s12028-020-00982-8</pub-id><pub-id pub-id-type="medline">32385834</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Geng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Development and validation of a machine learning-based predictive model for assessing the 90-day prognostic outcome of patients with spontaneous intracerebral hemorrhage</article-title><source>J Transl Med</source><year>2024</year><month>03</month><day>4</day><volume>22</volume><issue>1</issue><fpage>236</fpage><pub-id pub-id-type="doi">10.1186/s12967-024-04896-3</pub-id><pub-id pub-id-type="medline">38439097</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Trevisi</surname><given-names>G</given-names> </name><name name-style="western"><surname>Caccavella</surname><given-names>VM</given-names> </name><name name-style="western"><surname>Scerrati</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Machine learning model prediction of 6-month functional outcome in elderly patients with intracerebral hemorrhage</article-title><source>Neurosurg Rev</source><year>2022</year><month>08</month><volume>45</volume><issue>4</issue><fpage>2857</fpage><lpage>2867</lpage><pub-id pub-id-type="doi">10.1007/s10143-022-01802-7</pub-id><pub-id pub-id-type="medline">35522333</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Katsuki</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kakizawa</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Nishikawa</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yamamoto</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Uchiyama</surname><given-names>T</given-names> </name></person-group><article-title>Postsurgical functional outcome prediction model using deep learning framework (Prediction One, Sony Network Communications Inc.) 
for hypertensive intracerebral hemorrhage</article-title><source>Surg Neurol Int</source><year>2021</year><volume>12</volume><fpage>203</fpage><pub-id pub-id-type="doi">10.25259/SNI_222_2021</pub-id><pub-id pub-id-type="medline">34084630</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Meng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>Z</given-names> </name></person-group><article-title>Zero-shot multi-modal large language models vs. supervised deep learning: a comparative study on CT-based intracranial hemorrhage subtyping</article-title><source>Brain Hemorrhages</source><year>2025</year><month>12</month><volume>6</volume><issue>6</issue><fpage>323</fpage><lpage>330</lpage><pub-id pub-id-type="doi">10.1016/j.hest.2025.10.004</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Van Calster</surname><given-names>B</given-names> </name><name name-style="western"><surname>Vickers</surname><given-names>AJ</given-names> </name></person-group><article-title>Calibration of risk prediction models: impact on decision-analytic performance</article-title><source>Med Decis Making</source><year>2015</year><month>02</month><volume>35</volume><issue>2</issue><fpage>162</fpage><lpage>169</lpage><pub-id pub-id-type="doi">10.1177/0272989X14547233</pub-id><pub-id 
pub-id-type="medline">25155798</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Example multimodal GPT-5 outputs using structured and computed tomography image data.</p><media xlink:href="ai_v5i1e87062_app1.docx" xlink:title="DOCX File, 187 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Patient selection for derivation and validation cohorts.</p><media xlink:href="ai_v5i1e87062_app2.docx" xlink:title="DOCX File, 67 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Acquisition parameters of computed tomography images.</p><media xlink:href="ai_v5i1e87062_app3.docx" xlink:title="DOCX File, 43 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Distribution of the number of representative computed tomography slices selected per patient.</p><media xlink:href="ai_v5i1e87062_app4.docx" xlink:title="DOCX File, 52 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Discriminative performance of GPT-5 measured by the area under the receiver operating characteristic curve and inference time.</p><media xlink:href="ai_v5i1e87062_app5.docx" xlink:title="DOCX File, 51 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Prompts used for GPT inference.</p><media xlink:href="ai_v5i1e87062_app6.docx" xlink:title="DOCX File, 21 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Standardized regression coefficients for the machine learning&#x2013;based model.</p><media xlink:href="ai_v5i1e87062_app7.docx" xlink:title="DOCX File, 60 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Programs for data analysis.</p><media 
xlink:href="ai_v5i1e87062_app8.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Predictive performance and reproducibility of GPT-4.1 and GPT-5 with and without machine learning assistance.</p><media xlink:href="ai_v5i1e87062_app9.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app10"><label>Multimedia Appendix 10</label><p>Calibration plot of the machine learning&#x2013;based model.</p><media xlink:href="ai_v5i1e87062_app10.docx" xlink:title="DOCX File, 51 KB"/></supplementary-material><supplementary-material id="app11"><label>Multimedia Appendix 11</label><p>Discriminative performance of GPT-5 zero-shot models with and without imaging inputs.</p><media xlink:href="ai_v5i1e87062_app11.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app12"><label>Multimedia Appendix 12</label><p>Correlation of GPT-5&#x2013;estimated and 3D-measured intracerebral hemorrhage volumes.</p><media xlink:href="ai_v5i1e87062_app12.docx" xlink:title="DOCX File, 81 KB"/></supplementary-material><supplementary-material id="app13"><label>Multimedia Appendix 13</label><p>Predictive performance of GPT-based models, a machine learning&#x2013;based model, and a clinical risk score in patients with premorbid modified Rankin Scale 0 to 1.</p><media xlink:href="ai_v5i1e87062_app13.docx" xlink:title="DOCX File, 20 KB"/></supplementary-material><supplementary-material id="app14"><label>Multimedia Appendix 14</label><p>Calibration of the GPT-4.1 and GPT-5 models with and without model-informed prompting in patients with premorbid modified Rankin Scale 0 to 1.</p><media xlink:href="ai_v5i1e87062_app14.docx" xlink:title="DOCX File, 201 KB"/></supplementary-material><supplementary-material id="app15"><label>Multimedia Appendix 15</label><p>Decision curve analysis of the GPT-4.1 and GPT-5 models with and without model-informed prompting 
in patients with premorbid modified Rankin Scale 0 to 1.</p><media xlink:href="ai_v5i1e87062_app15.docx" xlink:title="DOCX File, 150 KB"/></supplementary-material><supplementary-material id="app16"><label>Multimedia Appendix 16</label><p>Differences in baseline clinical data according to functional outcome.</p><media xlink:href="ai_v5i1e87062_app16.docx" xlink:title="DOCX File, 19 KB"/></supplementary-material></app-group></back></article>