<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e74426</article-id><article-id pub-id-type="doi">10.2196/74426</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Effectiveness of the GPT-4o Model in Interpreting Electrocardiogram Images for Cardiac Diagnostics: Diagnostic Accuracy Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Engelstein</surname><given-names>Haya</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Ramon-Gonen</surname><given-names>Roni</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sabbag</surname><given-names>Avi</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Klang</surname><given-names>Eyal</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sudri</surname><given-names>Karin</given-names></name><degrees>MA</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Cohen-Shelly</surname><given-names>Michal</given-names></name><degrees>BSc, MBA</degrees><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Barbash</surname><given-names>Israel</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff8">8</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Sheba Medical Center</institution><addr-line>Tel Hashomer, Ramat Gan</addr-line><country>Israel</country></aff><aff id="aff2"><institution>The Graduate School of Business Administration, Information Systems Program, Bar-Ilan University</institution><addr-line>Max and Anna Webb St</addr-line><addr-line>Ramat Gan</addr-line><country>Israel</country></aff><aff id="aff3"><institution>Davidai Arrhythmia Center, Sheba Medical Center</institution><addr-line>Tel Hashomer, Ramat Gan</addr-line><country>Israel</country></aff><aff id="aff4"><institution>Faculty of Medicine, Tel Aviv University</institution><addr-line>Tel Aviv</addr-line><country>Israel</country></aff><aff id="aff5"><institution>Division of Data-Driven and Digital Medicine (D3M), Icahn School of Medicine at Mount Sinai</institution><addr-line>New York</addr-line><addr-line>NY</addr-line><country>USA</country></aff><aff id="aff6"><institution>The Charles Bronfman Institute of Personalized 
Medicine, Icahn School of Medicine at Mount Sinai</institution><addr-line>New York</addr-line><addr-line>NY</addr-line><country>USA</country></aff><aff id="aff7"><institution>Sheba ARC, Sagol Big Data and AI Hub, Sheba Medical Center</institution><addr-line>Tel Hashomer, Ramat Gan</addr-line><country>Israel</country></aff><aff id="aff8"><institution>Interventional Cardiology Unit, Leviev Heart Center, Sheba Medical Center</institution><addr-line>Tel Hashomer, Ramat Gan</addr-line><country>Israel</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Huo</surname><given-names>Yuankai</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Lu</surname><given-names>Siqi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Han</surname><given-names>Zhongyi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Roni Ramon-Gonen, PhD, The Graduate School of Business Administration, Information Systems Program, Bar-Ilan University, Max and Anna Webb St, Ramat Gan, 5290002, Israel, 972 3-531-8910; <email>roni.ramon-gonen@biu.ac.il</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>22</day><month>8</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e74426</elocation-id><history><date date-type="received"><day>24</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>25</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>09</day><month>07</month><year>2025</year></date></history><copyright-statement>&#x00A9; Haya Engelstein, Roni Ramon-Gonen, Avi Sabbag, Eyal Klang, Karin Sudri, Michal Cohen-Shelly, Israel Barbash. 
Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 22.8.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e74426"/><abstract><sec><title>Background</title><p>Recent progress has demonstrated the potential of deep learning models in analyzing electrocardiogram (ECG) pathologies. However, this method is intricate, expensive to develop, and designed for specific purposes. Large language models show promise in medical image interpretation, and yet their effectiveness in ECG analysis remains understudied. 
Generative Pretrained Transformer 4 Omni (GPT-4o), a multimodal artificial intelligence model, capable of processing images and text without task-specific training, may offer an accessible alternative.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate GPT-4o&#x2019;s effectiveness in interpreting 12-lead ECGs, assessing classification accuracy, and exploring methods to enhance its performance.</p></sec><sec sec-type="methods"><title>Methods</title><p>A total of 6 common ECG diagnoses were evaluated: normal ECG, ST-segment elevation myocardial infarction, atrial fibrillation, right bundle branch block, left bundle branch block, and paced rhythm, with 30 normal ECGs and 10 of each abnormal pattern, totaling 80 cases. Deidentified ECGs were analyzed using OpenAI&#x2019;s GPT-4o. Our study used both zero-shot and few-shot learning methodologies to investigate three main scenarios: (1) ECG image recognition, (2) binary classification of normal versus abnormal ECGs, and (3) multiclass classification into 6 categories.</p></sec><sec sec-type="results"><title>Results</title><p>The model excelled in recognizing ECG images, achieving an accuracy of 100%. In the classification of normal or abnormal ECG cases, the few-shot learning approach improved GPT-4o&#x2019;s accuracy by 30% from the baseline, reaching 83% (95% CI 81.8%-84.6%). However, multiclass classification for a specific pathology remained limited, achieving only 41% accuracy.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>GPT-4o effectively differentiates normal from abnormal ECGs, suggesting its potential as an accessible artificial intelligence&#x2013;assisted triage tool. 
Although limited in diagnosing specific cardiac conditions, GPT-4o&#x2019;s capability to interpret ECG images without specialized training highlights its potential for preliminary ECG interpretation in clinical and remote settings.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>cardiology</kwd><kwd>decision support systems</kwd><kwd>electrocardiogram</kwd><kwd>large language models</kwd><kwd>LLMs</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Artificial intelligence (AI) in the realm of medicine, including cardiology, has been consistently evolving. A significant recent AI milestone was achieved when a model, specifically ChatGPT by OpenAI, successfully passed the European Exam in Core Cardiology [<xref ref-type="bibr" rid="ref1">1</xref>]. However, this evaluation focused solely on text-based multiple-choice questions, excluding those with audio or visual elements. While this accomplishment is impressive, cardiology heavily relies on image interpretation and visual data for patient assessment [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>Deep learning (DL), which uses neural networks for image-related tasks [<xref ref-type="bibr" rid="ref3">3</xref>], has already demonstrated its significant impact in medical image analysis, including cardiology [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Moreover, it has been proven effective in predicting clinically significant abnormalities in electrocardiograms (ECGs), such as potassium levels and adverse reactions to medications, while also extracting valuable insights beyond human capabilities, such as estimating sex, age, and identifying specific cardiac conditions [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. 
For example, Prifti et al [<xref ref-type="bibr" rid="ref7">7</xref>] trained convolutional neural networks (CNNs) on short ECG recordings to accurately detect early signs of drug-induced cardiac effects and inherited rhythm disorders. In a separate study, Attia et al [<xref ref-type="bibr" rid="ref9">9</xref>] demonstrated that deep CNNs could estimate a person&#x2019;s age and sex solely from the heart&#x2019;s electrical signals, tasks that even experienced cardiologists cannot perform reliably, highlighting AI&#x2019;s ability to uncover hidden insights from routine medical data. However, while DL has shown great promise, developing a DL model requires substantial efforts, including the collection of large, labeled datasets and extensive training for the specific task [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>Large language models (LLMs), such as Generative Pretrained Transformer, specialize in processing human language using artificial neural networks [<xref ref-type="bibr" rid="ref13">13</xref>]. The newly introduced multimodal LLM, GPT-4 Omni (GPT-4o) by OpenAI, advances this even further by seamlessly combining text and image data, presenting substantial potential benefits in the medical domain [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>In emergency rooms, efficient patient triaging based on ECG findings is crucial. An AI model capable of distinguishing between normal and abnormal ECGs, even without offering a specific diagnosis, holds significant promise for improving patient care. 
The concept of &#x201C;ECG triage&#x201D; has the potential to transform how patients are prioritized for cardiology consultations.</p><p>This study aims to evaluate the ability of general purpose LLMs to interpret ECG images using zero-shot and few-shot learning strategies across a range of diagnostic tasks, including ECG recognition, binary classification (normal vs abnormal), and multiclass pathology classification. Our goal is to determine whether GPT-4o can perform these tasks with sufficient accuracy to support its potential role in clinical ECG triage and diagnosis.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Image Collection and Cohort Selection</title><p>The study design is depicted in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Research methodology overview illustrating the research methodology and portraying the high-level design of the method. AF: atrial fibrillation; API: application programming interface; ECG: electrocardiogram; GPT: generative pretrained transformer; LBBB: left bundle branch block; RBBB: right bundle branch block; STEMI: ST-segment elevation myocardial infarction.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e74426_fig01.png"/></fig><p>The study included patients aged 18 years or older who underwent a high-quality ECG recording using the MUSE (GE HealthCare Technologies) system at our institute from August 2010 to February 2024.</p><p>A cohort of 80 arbitrarily chosen 12-lead ECG strips was assembled, covering 6 distinct electrocardiographic presentations. 
This included 30 records of normal ECG strips and an additional 50 ECG strips representing 5 distinct, common diagnoses (10 ECG strips of each different diagnosis): ST-segment elevation myocardial infarction (STEMI), atrial fibrillation (AF), right bundle branch block (RBBB), left bundle branch block (LBBB), and paced rhythm. These pathologies were chosen for their diverse representation of cardiac conditions, each with unique electrocardiographic features [<xref ref-type="bibr" rid="ref19">19</xref>]. All ECG charts were anonymized, removing age and gender identifiers.</p></sec><sec id="s2-2"><title>Data Validation</title><p>Each case underwent thorough validation via electronic medical record review, with ECG findings meticulously interpreted by a board-certified cardiologist. Only those patients with a singular diagnosis for each condition were included to ensure study validity; those with multiple diagnoses or low-quality images were excluded.</p></sec><sec id="s2-3"><title>GPT-4o Prompt Engineering and Study Design</title><p>GPT-4o is a state-of-the-art multimodal model proficient in analyzing both image and text inputs. We used the OpenAI API to test whether it can interpret ECG images and classify them accurately into distinct categories. We tested three main scenarios: (1) Can GPT-4o recognize an ECG image? (2) Can GPT-4o classify an ECG image as normal or abnormal? (3) Can GPT-4o classify an ECG image into 1 of the 6 specific diagnoses: normal ECG, AF, STEMI, LBBB, RBBB, and paced rhythm?</p><sec id="s2-3-1"><title>Learning Techniques</title><p>In scenarios 2 and 3, we evaluated 2 learning approaches&#x2014;zero-shot and few-shot [<xref ref-type="bibr" rid="ref20">20</xref>]. The zero-shot approach involved providing the model with only a textual instruction describing the classification task, without any previous examples. 
In contrast, the few-shot approach included a limited number of ECG images, each labeled with its diagnosis, to serve as training data [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. These examples were intended to guide the model in recognizing diagnostic visual patterns and applying them when analyzing new ECGs. To ensure unbiased testing, the evaluation excluded images used for training. For example, if 6 images were given as examples, 54 images were evaluated. This design optimizes training efficiency.</p></sec><sec id="s2-3-2"><title>Prompt Formats</title><p>In some scenarios, we repeated the same task using three different prompt formats to assess how varying levels of complexity and detail affect model performance. The formats were (1) a basic prompt stating only the classification task, (2) a prompt that included the task along with brief descriptions of each class, and (3) a detailed prompt that combined the task with explicit textual guidance, instructing the model on specific visual features to consider when analyzing the ECG images.</p></sec><sec id="s2-3-3"><title>Experimental Scenarios&#x2019; Processes</title><p>The following section outlines the procedures and objectives of each experimental scenario designed to evaluate GPT-4o&#x2019;s ability to interpret ECG images. 
<xref ref-type="table" rid="table1">Table 1</xref> shows the different experiments conducted across the 3 tested scenarios, and <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provides the exact prompts used in each experiment.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Experiments description<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Experiment</td><td align="left" valign="bottom">Scenario</td><td align="left" valign="bottom">Technique</td><td align="left" valign="bottom">Task</td><td align="left" valign="bottom">Total, N</td><td align="left" valign="bottom">Few-shot training sample</td><td align="left" valign="bottom">Testing sample</td></tr></thead><tbody><tr><td align="char" char="." valign="top">1.1</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Recognize ECG<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">60</td></tr><tr><td align="char" char="." valign="top">1.2</td><td align="char" char="." valign="top">1</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Classify ECG or not ECG</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">60</td></tr><tr><td align="char" char="." valign="top">2.1</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Classify normal or abnormal. No textual guidance.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">60</td></tr><tr><td align="char" char="." 
valign="top">2.2</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Classify normal or abnormal. Minimal textual guidance.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">60</td></tr><tr><td align="char" char="." valign="top">2.3</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Classify normal or abnormal. Textual guidance was provided.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">60</td></tr><tr><td align="char" char="." valign="top">4.2</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">Few-shot</td><td align="left" valign="top">Classify normal or abnormal&#x2014;learn 6 examples. No textual guidance.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">6</td><td align="char" char="." valign="top">54</td></tr><tr><td align="char" char="." valign="top">4.3</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">Few-shot</td><td align="left" valign="top">Classify normal or abnormal&#x2014;learn 6 examples along with added textual guidance.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">6</td><td align="char" char="." valign="top">54</td></tr><tr><td align="char" char="." valign="top">4.4</td><td align="char" char="." valign="top">2</td><td align="left" valign="top">Few-shot</td><td align="left" valign="top">Classify normal or abnormal&#x2014;learn 10 examples along with added textual guidance.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">10</td><td align="char" char="." valign="top">50</td></tr><tr><td align="char" char="." valign="top">3.1</td><td align="char" char="." 
valign="top">3</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Classify into 6 classes (normal and 5 pathologies). No textual guidance.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">60</td></tr><tr><td align="char" char="." valign="top">3.2</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Classify into 6 classes (normal and 5 pathologies). Textual guidance was provided.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">0</td><td align="char" char="." valign="top">60</td></tr><tr><td align="char" char="." valign="top">5.1</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">Few-shot</td><td align="left" valign="top">Classify into 6 classes (normal and 5 pathologies). Examples were provided.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">6</td><td align="char" char="." valign="top">54</td></tr><tr><td align="char" char="." valign="top">5.2</td><td align="char" char="." valign="top">3</td><td align="left" valign="top">Few-shot</td><td align="left" valign="top">Classify into 6 classes (normal and 5 pathologies). Examples were provided along with added textual guidance.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">6</td><td align="char" char="." 
valign="top">54</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>The table summarizes the experimental design, including the scenario, prompting technique, classification task, total number of images used, and the number of examples provided in few-shot learning settings.</p></fn><fn id="table1fn2"><p><sup>b</sup>ECG: electrocardiogram.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s2-4"><title>Scenario 1: ECG Image Identification</title><p>This scenario aimed to evaluate the GPT-4o model&#x2019;s ability to recognize ECG images. The dataset included 60 ECG images, each assessed individually by the model. Two experiments were conducted: the first (experiment 1.1) was a simple test to determine whether GPT-4o could recognize that the image presented was an ECG, using the prompt &#x201C;What is this image? Output one line for the label.&#x201D; The second experiment (experiment 1.2) explicitly asked the model to classify the image as either &#x201C;ECG&#x201D; or &#x201C;not ECG.&#x201D;</p></sec><sec id="s2-5"><title>Scenario 2: Distinguishing ECG Images as Normal or Abnormal</title><p>This scenario aimed to evaluate the GPT-4o model&#x2019;s ability to distinguish between normal and abnormal ECG images. The dataset included 30 normal and 30 abnormal ECGs (6 images from each of the 5 abnormalities). Using the zero-shot approach, ECGs were presented without previous examples or guidance. For few-shot learning, 3 experiments were conducted (4.2, 4.3, and 4.4). Two experiments used a single composite image made up of 6 examples (3 normal and 3 abnormal), with and without textual guidance. In the third experiment, 2 composite images with textual guidance were used, together containing 10 examples (5 normal and 5 abnormal). 
Each file contained a mix of normal and abnormal examples (<xref ref-type="table" rid="table1">Table 1</xref> and <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-6"><title>Scenario 3: Multiclass Classification for a Specific Pathology</title><p>This scenario aimed to assess the GPT-4o model&#x2019;s ability to classify ECG images into specific abnormal categories. The dataset included 60 ECGs, with 10 images from each of 6 pathology classes. Using the zero-shot approach, ECGs were presented without previous examples or guidance (experiments 3.1 and 3.2). In the few-shot learning experiments (experiments 5.1 and 5.2), a single composite image comprising 6 examples (1 from each category) was used, with and without textual guidance. The composite image displaying the 6 pathologies is shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Composite image displaying the 6 electrocardiogram classes used in the multiclass classification few-shot learning approach. AF: atrial fibrillation; LBBB: left bundle branch block; RBBB: right bundle branch block; STEMI: ST-segment elevation myocardial infarction.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e74426_fig02.png"/></fig></sec><sec id="s2-7"><title>Study End Point</title><p>In both the binary (normal or abnormal) and multiclass classification scenarios, GPT-4o&#x2019;s diagnostic output was compared with the reference assessments made by expert cardiologists who manually reviewed each ECG specifically for this study.</p></sec><sec id="s2-8"><title>Evaluation Metrics</title><p>The agreement level between the GPT-4o predictions and the actual labels was evaluated using measures of accuracy, sensitivity, specificity, and <italic>F</italic><sub>1</sub>-score. 
The positive class was defined as abnormal ECG, with sensitivity representing the detection rate of abnormal ECG, and specificity indicating the detection rate of normal ECG. To ensure the robustness of the results, we repeated the best-performing experiment 5 times and reported both the average values of all evaluation metrics and their corresponding confidence intervals across runs.</p></sec><sec id="s2-9"><title>Software and Statistical Analysis</title><p>Python (version 3.10; Python Software Foundation) was used to interface with the GPT-4o API and generate visualizations. Statistical analyses and performance metric calculations were conducted using R (version 4.4.2; R Foundation for Statistical Computing).</p></sec><sec id="s2-10"><title>Sensitivity Analysis</title><p>To assess the robustness of GPT-4o&#x2019;s performance, we conducted a sensitivity analysis using 2 additional models: a pretrained Vision Transformer (ViT) and Gemini 2.0 Flash (Google), the latest stable version of the Gemini model.</p><sec id="s2-10-1"><title>Vision Transformer</title><p>We implemented a pretrained ViT (vit_base_patch16_224, pretrained on ImageNet) using the timm library in PyTorch. The model was fine-tuned on 10 manually labeled ECG plots (classified as normal or abnormal). Only the classification head was trained, while the transformer backbone remained frozen. Training was performed over 7 epochs using the Adam optimizer (learning rate=1e-4). We also experimented with data augmentation techniques (random rotation and horizontal flipping), which did not improve performance in this small data setting. Model evaluation was performed on a held-out test set of 50 ECG images.</p></sec><sec id="s2-10-2"><title>Gemini 2.0 Flash</title><p>We evaluated Gemini 2.0 flash (Gemini-2.0-Flash-001) using the official Vertex AI SDK (vertexai.generative_models) in Python. 
Each ECG image was submitted along with the same prompt used in the GPT-4o experiments (as described in the &#x201C;Methods&#x201D; section) except for the few-shot learning experiments, which were adapted to the structured format supported by the model. The model&#x2019;s textual output was parsed to assign a binary class label (normal or abnormal). We assessed accuracy, sensitivity, specificity, and <italic>F</italic><sub>1</sub>-score using the ground truth labels of the test set. We ran 1 iteration for each experiment and set the temperature parameter to 0.2 for consistency across runs.</p></sec></sec><sec id="s2-11"><title>Ethical Considerations</title><p>Ethical approval was obtained from the institutional ethics committee following standard institutional procedures (SMC-D-0522-23).</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>The cohort consisted of 80 patients, with a median age of 69 (IQR 57.0-78.0) years, of whom 53.8% (43/80) were female, carefully selected to ensure representativeness. 
<xref ref-type="table" rid="table2">Table 2</xref> shows the number of patients in each ECG pathology group, the patients&#x2019; age distribution, gender, and key ECG parameters that reflect the clinical and electrophysiological diversity of the cohort.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Demographic characteristics and electrocardiogram parameters of the cohort patients.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom">Statistics</td></tr></thead><tbody><tr><td align="left" valign="top">Total number of patients</td><td align="left" valign="top">80</td></tr><tr><td align="left" valign="top">Group, n (%)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AF<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">10 (12.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LBBB<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">10 (12.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Normal</td><td align="left" valign="top">30 (37.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Paced</td><td align="left" valign="top">10 (12.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>RBBB<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">10 (12.5)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>STEMI<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">10 (12.5)</td></tr><tr><td align="left" valign="top">Age at ECG<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> (years), median (IQR)</td><td align="left" valign="top">69.0 (57.0-78.0)</td></tr><tr><td align="left" valign="top">Sex (female), n (%)</td><td align="left" valign="top">43 (53.8)</td></tr><tr><td align="left" valign="top">Ventricular rate, median (IQR)</td><td align="left" valign="top">72.0 (66.0-81.2)</td></tr><tr><td align="left" valign="top">QRS duration, median (IQR)</td><td align="left" valign="top">98.0 (84.0-138.0)</td></tr><tr><td align="left" valign="top">R axis, median (IQR)</td><td align="left" valign="top">4.5 (&#x2212;42.8 to 46.2)</td></tr><tr><td align="left" valign="top">T axis, median (IQR)</td><td align="left" valign="top">44.0 (23.2-79.0)</td></tr><tr><td align="left" valign="top">Num QRS complexes, median (IQR)</td><td align="left" valign="top">12.0 (11.0-13.2)</td></tr><tr><td align="left" valign="top">Pacemaker, n (%)</td><td align="left" valign="top">10 (12.5)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup> AF: atrial fibrillation.</p></fn><fn id="table2fn2"><p><sup>b</sup> LBBB: left bundle branch block.</p></fn><fn id="table2fn3"><p><sup>c</sup> RBBB: right bundle branch block.</p></fn><fn id="table2fn4"><p><sup>d</sup> STEMI: ST-segment elevation myocardial infarction.</p></fn><fn id="table2fn5"><p><sup>e</sup> ECG: electrocardiogram.</p></fn></table-wrap-foot></table-wrap><p>As part of a sensitivity analysis, we compared the performance of GPT-4o with Gemini 2.0 Flash and a pretrained ViT model. Since GPT-4o consistently outperformed the alternative models, we report the full sensitivity analysis results in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. 
The following sections present the classification results for each scenario using GPT-4o.</p></sec><sec id="s3-2"><title>Scenario 1: ECG Image Identification</title><p>This scenario assessed the GPT-4o model&#x2019;s ability to recognize whether an image depicted an ECG. In both simple experiments (experiments 1.1 and 1.2), the model demonstrated excellent recognition ability, correctly classifying 100% of the images as ECG. These findings are consistent with previous work showing that the earlier model, GPT-4V, achieved 100% accuracy in recognizing medical modalities such as ultrasonography, computed tomography, and radiography [<xref ref-type="bibr" rid="ref23">23</xref>], further supporting GPT-4o&#x2019;s reliability in fundamental image recognition tasks. However, we did not evaluate its performance in more complex scenarios, such as distinguishing electroencephalograms from ECGs.</p></sec><sec id="s3-3"><title>Scenario 2: Distinguishing ECG Images as Normal or Abnormal</title><p>This scenario evaluated the GPT-4o model&#x2019;s ability to differentiate between normal and abnormal ECGs using both zero-shot and few-shot learning approaches. The zero-shot approach showed moderate to high success in diagnosis, with performance gradually improving with the addition of more auxiliary text: 53% without any text, 57% with minimal text, and 63% with extended text (<xref ref-type="table" rid="table3">Table 3</xref>). The sensitivity in the zero-shot experiments was very high, while the specificity was low, indicating that the model classified most cases as abnormal, including many that were normal. In the initial experiment, where no textual guidance was provided, the specificity was close to zero. Following this, we added the sentence &#x201C;Normal ECG: Look for regular P waves, QRS complexes, and T waves with consistent intervals between them. Absence of significant abnormalities.&#x201D; to the prompt, thereby clarifying the definition of a normal ECG. 
As a result, specificity improved by 26%.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Scenario 2 results.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Experiment</td><td align="left" valign="bottom">Technique</td><td align="left" valign="bottom">Prompt type</td><td align="left" valign="bottom">Testing size</td><td align="left" valign="bottom">Accuracy</td><td align="left" valign="bottom">Sensitivity</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">2.1</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">No textual guidance.</td><td align="left" valign="top">60</td><td align="left" valign="top">0.53</td><td align="left" valign="top">1.0</td><td align="left" valign="top">0.07</td><td align="left" valign="top">0.68</td></tr><tr><td align="left" valign="top">2.2</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Minimal textual guidance.</td><td align="left" valign="top">60</td><td align="left" valign="top">0.57</td><td align="left" valign="top">1.0</td><td align="left" valign="top">0.13</td><td align="left" valign="top">0.7</td></tr><tr><td align="left" valign="top">2.3</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Provide textual guidance.</td><td align="left" valign="top">60</td><td align="left" valign="top">0.63</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.33</td><td align="left" valign="top">0.72</td></tr><tr><td align="left" valign="top">4.2</td><td align="left" valign="top">Few-shot</td><td align="left" valign="top">Learn 6 examples. 
No textual guidance.</td><td align="left" valign="top">54</td><td align="left" valign="top">0.72</td><td align="left" valign="top">0.67</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.71</td></tr><tr><td align="left" valign="top">4.3</td><td align="left" valign="top">Few-shot</td><td align="left" valign="top">Learn 6 examples along with added textual guidance.</td><td align="left" valign="top">54</td><td align="left" valign="top">0.8</td><td align="left" valign="top">0.67</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.77</td></tr><tr><td align="left" valign="top">4.4</td><td align="left" valign="top">Few-shot</td><td align="left" valign="top">Learn 10 examples along with added textual guidance&#x2014;average results across 5 runs.</td><td align="left" valign="top">50</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.7</td><td align="left" valign="top">0.97</td><td align="left" valign="top">0.81</td></tr></tbody></table></table-wrap><p>In contrast, the few-shot approach demonstrated enhanced accuracy, particularly in experiment 4.4. Incorporating 10 learning examples and additional guidance led to the highest classification performance, achieving an average accuracy of 83% (95% CI 81.8%&#x2010;84.6%), sensitivity of 70% (95% CI 62.9%&#x2010;76.3%), and specificity of 97% (95% CI 92.6%&#x2010;100.0%) across 5 runs (<xref ref-type="table" rid="table3">Table 3</xref> and <xref ref-type="fig" rid="figure3">Figure 3</xref>). By adding textual guidance and providing examples, we improved the accuracy by 30% compared with the baseline model (experiment 2.1), indicating a significant improvement. <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> shows 2 examples of the GPT-4o model&#x2019;s reasoning when classifying an image as a normal or abnormal ECG. 
We see from the reason it provides that it considers the R-R intervals, P waves, QRS complex, QRS duration, and T waves. However, the accuracy of these explanations was not formally evaluated in this study.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Experiment 4.4 average confusion matrix across 5 iterations. ECG: electrocardiogram.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e74426_fig03.png"/></fig></sec><sec id="s3-4"><title>Scenario 3: Multiclass Classification for a Specific Pathology</title><p>In identifying a specific pathology, both approaches showed low success. However, few-shot outperformed zero-shot, achieving an accuracy of 41% compared with 28%. In the few-shot scenario, textual guidance also led to improved results compared with the case without it (<xref ref-type="table" rid="table4">Table 4</xref> and <xref ref-type="fig" rid="figure4">Figure 4</xref>). Notably, 89% of normal ECGs were correctly classified as normal. Paced rhythm was the most accurately identified cardiac condition, with an accuracy of 55.5%.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Scenario 3 results.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Experiment</td><td align="left" valign="bottom">Technique</td><td align="left" valign="bottom">Prompt type</td><td align="left" valign="bottom">Testing size</td><td align="left" valign="bottom">Accuracy</td></tr></thead><tbody><tr><td align="char" char="." valign="top">3.1</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">No textual guidance.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">0.28</td></tr><tr><td align="char" char="." 
valign="top">3.2</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Textual guidance was provided.</td><td align="char" char="." valign="top">60</td><td align="char" char="." valign="top">0.28</td></tr><tr><td align="char" char="." valign="top">5.1</td><td align="left" valign="top">Few-shot</td><td align="left" valign="top">Six examples were provided.</td><td align="char" char="." valign="top">54</td><td align="char" char="." valign="top">0.31</td></tr><tr><td align="char" char="." valign="top">5.2</td><td align="left" valign="top">Few-shot</td><td align="left" valign="top">Six examples were provided along with added textual guidance.</td><td align="char" char="." valign="top">54</td><td align="char" char="." valign="top">0.41</td></tr></tbody></table></table-wrap><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Experiment 5.2 confusion matrix. Multiclass classification, few-shot learning. ECG: electrocardiogram.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e74426_fig04.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study assesses the image analysis capabilities of GPT-4o for interpreting ECG tests. The main findings reveal that GPT-4o&#x2019;s capabilities in recognizing and understanding ECG images can be significantly improved with prompt engineering and learning examples. In our case, accuracy improved by 30%. GPT-4o effectively identified the images as ECGs and demonstrated a solid theoretical understanding of ECG components and pathologies. Its performance in distinguishing normal from abnormal ECGs was moderate to high, with an average accuracy of 83% (95% CI 81.8%&#x2010;84.6%) across 5 repeated runs on the same 50 ECG examples, reflecting consistent performance. 
However, the model struggled with more granular classification tasks, achieving only 41% accuracy when identifying specific diagnoses. Furthermore, the study showed that few-shot learning surpassed zero-shot learning, and combining textual instructions with image examples led to better outcomes, achieving moderate to high accuracy and high specificity improvement compared with the baseline model. As part of a sensitivity analysis to contextualize GPT-4o&#x2019;s performance, we also evaluated Gemini 2.0 Flash and a pretrained ViT model; however, neither outperformed GPT-4o in this task.</p><p>Previous studies [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref31">31</xref>] extensively investigated DL AI models&#x2019; diagnostic capabilities for classifying ECGs, achieving higher accuracy rates compared with our study, which explored the performance of LLMs in zero-shot and few-shot learning contexts. While previous studies have reported superior accuracy using specialized DL models (eg, CNNs and LCNNs), these approaches require substantial computational resources and model-specific training, limiting their accessibility in routine clinical practice. In contrast, multimodal LLMs such as GPT-4o provide a low-barrier alternative that could support medical professionals without specialized AI expertise.</p><p>Our findings also align with recent research on the robustness of multimodal models to domain shifts, such as ECG images, which differ substantially from the natural images seen during model pretraining. Previous work has shown that performance under such shifts can be improved through in-context learning strategies such as few-shot learning, as demonstrated in studies evaluating GPT-4V and other vision-language models [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref35">35</xref>]. 
In our study, this was evident in the improved performance observed with few-shot learning when distinguishing normal from abnormal ECGs. However, the model continued to struggle with identifying specific pathologies, as seen in scenario 3. Several factors likely contributed to this limitation. Certain cardiac conditions are inherently difficult to detect, as their features may be masked by noise, artifacts, or subtle waveform variations [<xref ref-type="bibr" rid="ref16">16</xref>]. These factors can mislead the model, especially with incomplete or atypical ECGs that do not match the patterns it learned during training [<xref ref-type="bibr" rid="ref36">36</xref>], situations in which multimodal LMMs often fail to generalize effectively. Furthermore, the absence of clinical context may further constrain performance, as incorporating patient symptoms or medical history has been shown to enhance diagnostic accuracy [<xref ref-type="bibr" rid="ref37">37</xref>]. Together, these factors likely contributed to the model&#x2019;s limited ability to accurately identify specific abnormalities.</p><p>When comparing our study with those investigating AI&#x2019;s diagnostic performance, a distinct contrast emerges. These studies, using DL models trained on large ECG datasets for specific diagnosis tasks ranging from arrhythmia to STEMI detection, consistently report high diagnostic accuracy rates, often exceeding 90% [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. Conversely, compared with the studies focusing on binary classification of ECGs (normal vs abnormal) [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>], our study achieved a moderate to high accuracy of 83% despite minimal training, and by that, highlighting the potential of accessible AI models for cardiac diagnostics. 
</p><p>In addition to its potential in cardiology, GPT-4o&#x2019;s image interpretation capabilities find relevance in various medical domains, such as radiology, neurology, and ophthalmology. Research in these fields indicates that while GPT-4o can identify imaging modalities and tackle intricate diagnostic tasks, its current success rates remain modest [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref33">33</xref>].</p><p>Consistent with these findings, our results suggest that although GPT-4o shows promise in medical image interpretation, it remains best suited as a supplementary tool to support, rather than replace, clinical expertise [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. This is especially important given the risk of hallucinations and overconfident misclassifications that LLMs may produce when faced with ambiguous or unfamiliar inputs [<xref ref-type="bibr" rid="ref38">38</xref>]. As multimodal AI models continue to evolve, further research is needed to refine their integration into diagnostic workflows and optimize their clinical use.</p></sec><sec id="s4-2"><title>Limitations and Future Research</title><p>The current findings rely on a small retrospective sample of 80 patients. While this limited sample size constrains the statistical robustness of the findings, it was sufficient to support a focused proof-of-concept evaluation of GPT-4o&#x2019;s capabilities in ECG interpretation. 
The sample, although small, demonstrated consistent performance across repeated runs and helped highlight key challenges and opportunities in applying multimodal LLMs to ECG analysis. Moreover, our study acknowledges the documented potential impact of prompt wording variations on GPT-4o&#x2019;s responses [<xref ref-type="bibr" rid="ref39">39</xref>]. Minor changes in prompts can significantly affect language models such as GPT-4o. Finally, our study solely evaluated GPT-4o with ECG recordings, excluding the patient&#x2019;s medical history, a departure from typical clinical practice, where attending physicians have access to comprehensive patient information. We hypothesize that incorporating these contextual data into the model could enhance diagnostic accuracy.</p><p>Future research could assess custom GPT-4o performance when it is enhanced with specific knowledge sources, such as cardiology textbooks, rather than solely instructions. Furthermore, to address the challenges observed in multiclass classification of specific diagnoses (scenario 3, <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>), future studies should explore few-shot learning setups that include multiple examples for each diagnostic class and test on a larger sample. As demonstrated in previous work, this approach can improve performance under domain shift conditions by enabling the model to generalize more effectively across diverse pathology patterns [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. 
Finally, future work should consider evaluating more advanced models such as Gemini 2.5, which, while not yet part of a stable public release, has demonstrated strong performance in multimodal tasks and may offer improved capabilities for clinical ECG interpretation.</p></sec><sec id="s4-3"><title>Conclusions</title><p>The current version of GPT-4o exhibits moderate to high proficiency in distinguishing between normal and abnormal ECG readings. However, its ability to diagnose specific cardiac conditions remains limited. Our findings suggest that GPT-4o&#x2019;s performance can be enhanced through prompt engineering and few-shot learning, highlighting its potential as a supplementary decision support system in clinical practice. Future improvements to the algorithm, particularly in fine-tuning its diagnostic capabilities, could further expand its use in medical image analysis.</p></sec></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AF</term><def><p>atrial fibrillation</p></def></def-item><def-item><term id="abb2">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb3">CNN</term><def><p>convolutional neural network</p></def></def-item><def-item><term id="abb4">DL</term><def><p>deep learning</p></def></def-item><def-item><term id="abb5">ECG</term><def><p>electrocardiogram</p></def></def-item><def-item><term id="abb6">GPT-4o</term><def><p>Generative Pre-trained Transformer 4 Omni</p></def></def-item><def-item><term id="abb7">LBBB</term><def><p>left bundle branch block</p></def></def-item><def-item><term id="abb8">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb9">RBBB</term><def><p>right bundle branch block</p></def></def-item><def-item><term id="abb10">STEMI</term><def><p>ST-segment elevation myocardial infarction</p></def></def-item><def-item><term id="abb11">ViT</term><def><p>Vision 
Transformer</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Skalidis</surname><given-names>I</given-names> </name><name name-style="western"><surname>Cagnina</surname><given-names>A</given-names> </name><name name-style="western"><surname>Luangphiphat</surname><given-names>W</given-names> </name><etal/></person-group><article-title>ChatGPT takes on the European exam in core cardiology: an artificial intelligence success story?</article-title><source>Eur Heart J Digit Health</source><year>2023</year><month>05</month><volume>4</volume><issue>3</issue><fpage>279</fpage><lpage>281</lpage><pub-id pub-id-type="doi">10.1093/ehjdh/ztad029</pub-id><pub-id pub-id-type="medline">37265864</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Niederer</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Lumens</surname><given-names>J</given-names> </name><name name-style="western"><surname>Trayanova</surname><given-names>NA</given-names> </name></person-group><article-title>Computational models in cardiology</article-title><source>Nat Rev Cardiol</source><year>2019</year><month>02</month><volume>16</volume><issue>2</issue><fpage>100</fpage><lpage>111</lpage><pub-id pub-id-type="doi">10.1038/s41569-018-0104-y</pub-id><pub-id pub-id-type="medline">30361497</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Getty</surname><given-names>N</given-names> </name><name name-style="western"><surname>Brettin</surname><given-names>T</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Stevens</surname><given-names>R</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>F</given-names> </name></person-group><article-title>Deep medical image analysis with representation learning and neuromorphic computing</article-title><source>Interface Focus</source><year>2021</year><month>02</month><day>6</day><volume>11</volume><issue>1</issue><fpage>20190122</fpage><pub-id pub-id-type="doi">10.1098/rsfs.2019.0122</pub-id><pub-id pub-id-type="medline">33343872</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kamaleswaran</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mahajan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Akbilgic</surname><given-names>O</given-names> </name></person-group><article-title>A robust deep convolutional neural network for the classification of abnormal cardiac rhythm using single lead electrocardiograms of variable length</article-title><source>Physiol Meas</source><year>2018</year><month>03</month><day>27</day><volume>39</volume><issue>3</issue><fpage>035006</fpage><pub-id pub-id-type="doi">10.1088/1361-6579/aaaa9d</pub-id><pub-id pub-id-type="medline">29369044</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oke</surname><given-names>OA</given-names> </name><name name-style="western"><surname>Cavus</surname><given-names>N</given-names> </name></person-group><article-title>A systematic review on the impact of artificial intelligence on electrocardiograms in cardiology</article-title><source>Int J Med Inform</source><year>2025</year><month>03</month><volume>195</volume><fpage>105753</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2024.105753</pub-id><pub-id 
pub-id-type="medline">39674006</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Galloway</surname><given-names>CD</given-names> </name><name name-style="western"><surname>Valys</surname><given-names>AV</given-names> </name><name name-style="western"><surname>Shreibati</surname><given-names>JB</given-names> </name><etal/></person-group><article-title>Development and validation of a deep-learning model to screen for hyperkalemia from the electrocardiogram</article-title><source>JAMA Cardiol</source><year>2019</year><month>05</month><day>1</day><volume>4</volume><issue>5</issue><fpage>428</fpage><lpage>436</lpage><pub-id pub-id-type="doi">10.1001/jamacardio.2019.0640</pub-id><pub-id pub-id-type="medline">30942845</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Prifti</surname><given-names>E</given-names> </name><name name-style="western"><surname>Fall</surname><given-names>A</given-names> </name><name name-style="western"><surname>Davogustto</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Deep learning analysis of electrocardiogram for risk prediction of drug-induced arrhythmias and diagnosis of long QT syndrome</article-title><source>Eur Heart J</source><year>2021</year><month>10</month><day>7</day><volume>42</volume><issue>38</issue><fpage>3948</fpage><lpage>3961</lpage><pub-id pub-id-type="doi">10.1093/eurheartj/ehab588</pub-id><pub-id pub-id-type="medline">34468739</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen-Shelly</surname><given-names>M</given-names> </name><name name-style="western"><surname>Attia</surname><given-names>ZI</given-names> 
</name><name name-style="western"><surname>Friedman</surname><given-names>PA</given-names> </name><etal/></person-group><article-title>Electrocardiogram screening for aortic valve stenosis using artificial intelligence</article-title><source>Eur Heart J</source><year>2021</year><month>08</month><day>7</day><volume>42</volume><issue>30</issue><fpage>2885</fpage><lpage>2896</lpage><pub-id pub-id-type="doi">10.1093/eurheartj/ehab153</pub-id><pub-id pub-id-type="medline">33748852</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Attia</surname><given-names>ZI</given-names> </name><name name-style="western"><surname>Friedman</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Noseworthy</surname><given-names>PA</given-names> </name><etal/></person-group><article-title>Age and sex estimation using artificial intelligence from standard 12-lead ECGs</article-title><source>Circ Arrhythm Electrophysiol</source><year>2019</year><month>09</month><volume>12</volume><issue>9</issue><fpage>e007284</fpage><pub-id pub-id-type="doi">10.1161/CIRCEP.119.007284</pub-id><pub-id pub-id-type="medline">31450977</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jahan</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Mansourvar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Puthusserypady</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wiil</surname><given-names>UK</given-names> </name><name name-style="western"><surname>Peimankar</surname><given-names>A</given-names> </name></person-group><article-title>Short-term atrial fibrillation detection using electrocardiograms: a comparison of machine learning 
approaches</article-title><source>Int J Med Inform</source><year>2022</year><month>07</month><volume>163</volume><fpage>104790</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2022.104790</pub-id><pub-id pub-id-type="medline">35552189</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bochinski</surname><given-names>E</given-names> </name><name name-style="western"><surname>Eiselein</surname><given-names>V</given-names> </name><name name-style="western"><surname>Sikora</surname><given-names>T</given-names> </name></person-group><article-title>Training a convolutional neural network for multi-class object detection using solely virtual world data</article-title><conf-name>2016 13th IEEE International Conference on Advanced Video and Signal Based Surveillance (AVSS)</conf-name><conf-date>Aug 23-26, 2016</conf-date><conf-loc>Colorado Springs, CO, USA</conf-loc><pub-id pub-id-type="doi">10.1109/AVSS.2016.7738056</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>LeCun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bengio</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hinton</surname><given-names>G</given-names> </name></person-group><article-title>Deep learning</article-title><source>Nature New Biol</source><year>2015</year><month>05</month><day>28</day><volume>521</volume><issue>7553</issue><fpage>436</fpage><lpage>444</lpage><pub-id pub-id-type="doi">10.1038/nature14539</pub-id><pub-id pub-id-type="medline">26017442</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Galke</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Ram</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Raviv</surname><given-names>L</given-names> </name></person-group><article-title>Deep neural networks and humans both benefit from compositional language structure</article-title><source>Nat Commun</source><year>2024</year><month>12</month><day>30</day><volume>15</volume><issue>1</issue><fpage>10816</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-55158-1</pub-id><pub-id pub-id-type="medline">39738033</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x00D6;zt&#x00FC;rk</surname><given-names>A</given-names> </name><name name-style="western"><surname>G&#x00FC;nay</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ate&#x015F;</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yi&#x011F;it Yavuz Yigit</surname><given-names>Y</given-names> </name></person-group><article-title>Can GPT-4o accurately diagnose trauma x-rays? 
A comparative study with expert evaluations</article-title><source>J Emerg Med</source><year>2025</year><month>06</month><volume>73</volume><fpage>71</fpage><lpage>79</lpage><pub-id pub-id-type="doi">10.1016/j.jemermed.2024.12.010</pub-id><pub-id pub-id-type="medline">40348690</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kanzawa</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kurokawa</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kaiume</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Evaluating the role of GPT-4 and GPT-4o in the detectability of chest radiography reports requiring further assessment</article-title><source>Cureus</source><year>2024</year><month>12</month><volume>16</volume><issue>12</issue><fpage>e75532</fpage><pub-id pub-id-type="doi">10.7759/cureus.75532</pub-id><pub-id pub-id-type="medline">39803046</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Avidan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tabachnikov</surname><given-names>V</given-names> </name><name name-style="western"><surname>Court</surname><given-names>OB</given-names> </name><name name-style="western"><surname>Khoury</surname><given-names>R</given-names> </name><name name-style="western"><surname>Aker</surname><given-names>A</given-names> </name></person-group><article-title>In the face of confounders: atrial fibrillation detection&#x2014;practitioners vs. 
ChatGPT</article-title><source>J Electrocardiol</source><year>2025</year><volume>88</volume><fpage>153851</fpage><pub-id pub-id-type="doi">10.1016/j.jelectrocard.2024.153851</pub-id><pub-id pub-id-type="medline">39667153</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sozer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sahin</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Sozer</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Do LLMs have &#x201C;the Eye&#x201D; for MRI? Evaluating GPT-4o, Grok, and Gemini on brain MRI performance: first evaluation of Grok in medical imaging and a comparative analysis</article-title><source>Diagnostics (Basel)</source><year>2025</year><month>05</month><day>24</day><volume>15</volume><issue>11</issue><fpage>1320</fpage><pub-id pub-id-type="doi">10.3390/diagnostics15111320</pub-id><pub-id pub-id-type="medline">40506892</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Be&#x015F;ler</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Oleaga</surname><given-names>L</given-names> </name><name name-style="western"><surname>Junquero</surname><given-names>V</given-names> </name><name name-style="western"><surname>Merino</surname><given-names>C</given-names> </name></person-group><article-title>Evaluating GPT-4o&#x2019;s performance in the official European board of radiology exam: a comprehensive assessment</article-title><source>Acad Radiol</source><year>2024</year><month>11</month><volume>31</volume><issue>11</issue><fpage>4365</fpage><lpage>4371</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2024.09.005</pub-id><pub-id 
pub-id-type="medline">39294055</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Hampton</surname><given-names>J</given-names> </name></person-group><source>The ECG Made Easy</source><year>2019</year><edition>9</edition><publisher-name>Elsevier</publisher-name><pub-id pub-id-type="other">0702074578</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kadam</surname><given-names>S</given-names> </name><name name-style="western"><surname>Vaidya</surname><given-names>V</given-names> </name></person-group><article-title>Review and analysis of zero, one and few shot learning approaches</article-title><source>Advances in Intelligent Systems and Computing</source><year>2019</year><publisher-name>Springer, Cham</publisher-name><fpage>100</fpage><lpage>112</lpage><pub-id pub-id-type="doi">10.1007/978-3-030-16657-1_10</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>AI concepts</article-title><source>LastMile AI Docs</source><year>2023</year><access-date>2025-08-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://lastmile-ai.gitbook.io/lastmile-ai-docs/getting-started/ai-concepts">https://lastmile-ai.gitbook.io/lastmile-ai-docs/getting-started/ai-concepts</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><name name-style="western"><surname>Subbiah</surname><given-names>M</given-names> 
</name><name name-style="western"><surname>Kaplan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dhariwal</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><source>arXiv</source><comment>Preprint posted online on 2020</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2005.14165</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Barash</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Assessing GPT-4 multimodal performance in radiological image analysis</article-title><source>Eur Radiol</source><year>2025</year><month>04</month><volume>35</volume><issue>4</issue><fpage>1959</fpage><lpage>1965</lpage><pub-id pub-id-type="doi">10.1007/s00330-024-11035-5</pub-id><pub-id pub-id-type="medline">39214893</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Natarajan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mariani</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A wide and deep transformer neural network for 12-lead ECG classification</article-title><conf-name>2020 Computing in Cardiology Conference</conf-name><conf-date>Sep 13-16, 2020</conf-date><conf-loc>Rimini, Italy</conf-loc><pub-id pub-id-type="doi">10.22489/CinC.2020.107</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Somani</surname><given-names>S</given-names> </name><name name-style="western"><surname>Russak</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Richter</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Deep learning and the electrocardiogram: review of the current state-of-the-art</article-title><source>Europace</source><year>2021</year><month>08</month><day>6</day><volume>23</volume><issue>8</issue><fpage>1179</fpage><lpage>1191</lpage><pub-id pub-id-type="doi">10.1093/europace/euaa377</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>HY</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>W</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>GH</given-names> </name><etal/></person-group><article-title>Diagnostic accuracy of the deep learning model for the detection of ST elevation myocardial infarction on electrocardiogram</article-title><source>J Pers Med</source><year>2022</year><month>02</month><day>23</day><volume>12</volume><issue>3</issue><fpage>336</fpage><pub-id pub-id-type="doi">10.3390/jpm12030336</pub-id><pub-id pub-id-type="medline">35330336</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hwan Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Whan Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Seop Kim</surname><given-names>K</given-names> </name></person-group><article-title>Classification of cardiac arrhythmias using deep 
learning</article-title><source>IJET</source><year>2018</year><volume>7</volume><issue>3.3</issue><fpage>401</fpage><pub-id pub-id-type="doi">10.14419/ijet.v7i2.33.14195</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eltrass</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Tayel</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Ammar</surname><given-names>AI</given-names> </name></person-group><article-title>Automated ECG multi-class classification system based on combining deep learning features with HRV and ECG measures</article-title><source>Neural Comput Appl</source><year>2022</year><month>06</month><volume>34</volume><issue>11</issue><fpage>8755</fpage><lpage>8775</lpage><pub-id pub-id-type="doi">10.1007/s00521-022-06889-z</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lui</surname><given-names>HW</given-names> </name><name name-style="western"><surname>Chow</surname><given-names>KL</given-names> </name></person-group><article-title>Multiclass classification of myocardial infarction with convolutional and recurrent neural networks for portable ECG devices</article-title><source>Inform Med Unlocked</source><year>2018</year><volume>13</volume><fpage>26</fpage><lpage>33</lpage><pub-id pub-id-type="doi">10.1016/j.imu.2018.08.002</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>J</given-names> </name></person-group><article-title>Normal versus abnormal ECG classification by the aid of deep 
learning</article-title><source>Artificial Intelligence&#x2014;Emerging Trends and Applications</source><year>2018</year><publisher-name>InTechOpen</publisher-name><fpage>295</fpage><lpage>315</lpage><pub-id pub-id-type="doi">10.5772/intechopen.75546</pub-id><pub-id pub-id-type="other">978-1-78923-364-3</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lv</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kong</surname><given-names>D</given-names> </name></person-group><article-title>CNN-FWS: a model for the diagnosis of normal and abnormal ECG with feature adaptive</article-title><source>Entropy (Basel)</source><year>2022</year><month>03</month><day>28</day><volume>24</volume><issue>4</issue><fpage>471</fpage><pub-id pub-id-type="doi">10.3390/e24040471</pub-id><pub-id pub-id-type="medline">35455133</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yoo</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>JY</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>HK</given-names> </name></person-group><article-title>Feasibility study to improve deep learning in OCT diagnosis of rare retinal diseases with few-shot classification</article-title><source>Med Biol Eng Comput</source><year>2021</year><month>02</month><volume>59</volume><issue>2</issue><fpage>401</fpage><lpage>415</lpage><pub-id pub-id-type="doi">10.1007/s11517-021-02321-1</pub-id><pub-id pub-id-type="medline">33492598</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Agbareia</surname><given-names>R</given-names> </name><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zloto</surname><given-names>O</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>GN</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>Multimodal LLMs for retinal disease diagnosis via OCT: few-shot versus single-shot learning</article-title><source>Ther Adv Ophthalmol</source><year>2025</year><volume>17</volume><fpage>25158414251340569</fpage><pub-id pub-id-type="doi">10.1177/25158414251340569</pub-id><pub-id pub-id-type="medline">40400723</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>G</given-names> </name><name name-style="western"><surname>Han</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Adapting large multimodal models to distribution shifts: the role of in-context learning</article-title><source>arXiv</source><comment>Preprint posted online on 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.12217</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Han</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>G</given-names> </name><name name-style="western"><surname>He</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Yin</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>How well does GPT-4V(ision) adapt to distribution shifts? a preliminary investigation</article-title><comment>Preprint posted online on 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2312.07424</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>On the out-of-distribution generalization of multimodal large language models</article-title><source>arXiv</source><comment>Preprint posted online on 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.06599</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhenzhu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Jingfeng</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Wei</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Jianjun</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yinshui</surname><given-names>X</given-names> </name></person-group><article-title>GPT-agents based on medical guidelines can improve the responsiveness and explainability of outcomes for traumatic brain injury rehabilitation</article-title><source>Sci Rep</source><year>2024</year><month>04</month><day>1</day><volume>14</volume><issue>1</issue><fpage>7626</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-58514-9</pub-id><pub-id pub-id-type="medline">38561445</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ji</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>N</given-names> </name><name name-style="western"><surname>Frieske</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Survey of hallucination in natural language generation</article-title><source>ACM Comput Surv</source><year>2023</year><month>12</month><day>31</day><volume>55</volume><issue>12</issue><fpage>1</fpage><lpage>38</lpage><pub-id pub-id-type="doi">10.1145/3571730</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Al Zubaer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Granitzer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mitrovi&#x0107;</surname><given-names>J</given-names> </name></person-group><article-title>Performance analysis of large language models in the domain of legal argument mining</article-title><source>Front Artif 
Intell</source><year>2023</year><volume>6</volume><fpage>1278796</fpage><pub-id pub-id-type="doi">10.3389/frai.2023.1278796</pub-id><pub-id pub-id-type="medline">38045763</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pachetti</surname><given-names>E</given-names> </name><name name-style="western"><surname>Colantonio</surname><given-names>S</given-names> </name></person-group><article-title>A systematic review of few-shot learning in medical imaging</article-title><source>Artif Intell Med</source><year>2024</year><month>10</month><volume>156</volume><fpage>102949</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2024.102949</pub-id><pub-id pub-id-type="medline">39178621</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>The prompt of each experiment.</p><media xlink:href="ai_v4i1e74426_app1.docx" xlink:title="DOCX File, 20 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Sensitivity analysis.</p><media xlink:href="ai_v4i1e74426_app2.docx" xlink:title="DOCX File, 19 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Examples of the GPT-4o reasoning when deciding whether an electrocardiogram is normal or abnormal.</p><media xlink:href="ai_v4i1e74426_app3.docx" xlink:title="DOCX File, 418 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Illustration of the challenges in classifying specific pathologies within a few-shot learning setup.</p><media xlink:href="ai_v4i1e74426_app4.docx" xlink:title="DOCX File, 619 KB"/></supplementary-material></app-group></back></article>