<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR AI</journal-id><journal-id journal-id-type="publisher-id">ai</journal-id><journal-id journal-id-type="index">41</journal-id><journal-title>JMIR AI</journal-title><abbrev-journal-title>JMIR AI</abbrev-journal-title><issn pub-type="epub">2817-1705</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v4i1e75910</article-id><article-id pub-id-type="doi">10.2196/75910</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Comparative Diagnostic Performance of a Multimodal Large Language Model Versus a Dedicated Electrocardiogram AI in Detecting Myocardial Infarction From Electrocardiogram Images: Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Haemin</given-names></name><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yoo</surname><given-names>Sooyoung</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kim</surname><given-names>Joonghee</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Cho</surname><given-names>Youngjin</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Suh</surname><given-names>Dongbum</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Lee</surname><given-names>Keehyuck</given-names></name><degrees>MD, MBA</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Emergency Medicine, Seoul National University Bundang Hospital</institution><addr-line>Seongnam-si, Gyeonggi-do</addr-line><country>Republic of Korea</country></aff><aff id="aff2"><institution>ARPI Inc</institution><addr-line>Seongnam-si, Gyeonggi-do</addr-line><country>Republic of Korea</country></aff><aff id="aff3"><institution>Office of eHealth Research and Businesses, Seoul National University Bundang Hospital</institution><addr-line>Seongnam-si, Gyeonggi-do</addr-line><country>Republic of Korea</country></aff><aff id="aff4"><institution>Department of Internal Medicine, Cardiovascular Center, Seoul National University Bundang Hospital</institution><addr-line>Seongnam-si, Gyeonggi-do</addr-line><country>Republic of Korea</country></aff><aff id="aff5"><institution>Department of Family Medicine, Seoul National University Bundang Hospital</institution><addr-line>82 Gumi-ro 173 beon-gil, Bundang-gu</addr-line><addr-line>Seongnam-si, Gyeonggi-do</addr-line><country>Republic of Korea</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Luo</surname><given-names>Gang</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Arafat</surname><given-names>Amr A</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Seisyou</surname><given-names>Kou</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Keehyuck Lee, MD, MBA, Department of Family Medicine, Seoul National University Bundang Hospital, 82 Gumi-ro 173 beon-gil, Bundang-gu, Seongnam-si, Gyeonggi-do, 13620, Republic of Korea, 82 10-2514-7428; <email>chrisruga@snubh.org</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>17</day><month>9</month><year>2025</year></pub-date><volume>4</volume><elocation-id>e75910</elocation-id><history><date date-type="received"><day>18</day><month>04</month><year>2025</year></date><date date-type="rev-recd"><day>08</day><month>07</month><year>2025</year></date><date date-type="accepted"><day>17</day><month>07</month><year>2025</year></date></history><copyright-statement>&#x00A9; Haemin Lee, Sooyoung Yoo, Joonghee Kim, Youngjin Cho, Dongbum Suh, Keehyuck Lee. Originally published in JMIR AI (<ext-link ext-link-type="uri" xlink:href="https://ai.jmir.org">https://ai.jmir.org</ext-link>), 17.9.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR AI, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.ai.jmir.org/">https://www.ai.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://ai.jmir.org/2025/1/e75910"/><abstract><sec><title>Background</title><p>Accurate and timely electrocardiogram (ECG) interpretation is critical for diagnosing myocardial infarction (MI) in emergency settings. Recent advances in multimodal large language models (LLMs), such as ChatGPT (OpenAI) and Gemini (Google DeepMind), have shown promise in clinical interpretation for medical imaging. 
However, whether these models analyze waveform patterns or simply rely on text cues remains unclear, underscoring the need for direct comparisons with dedicated ECG artificial intelligence (AI) tools.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate the diagnostic performance of ChatGPT and Gemini, general-purpose LLMs, in detecting MI from ECG images and to compare their performance with that of ECG Buddy (ARPI Inc), a dedicated AI-driven ECG analysis tool.</p></sec><sec sec-type="methods"><title>Methods</title><p>This retrospective study evaluated and compared AI models for classifying MI using a publicly available 12-lead ECG dataset from Pakistan, categorizing cases into MI-positive (239 images) and MI-negative (689 images). ChatGPT (GPT-4o, version November 20, 2024) and Gemini (Gemini 2.5 Pro) were queried with 5 MI confidence options, whereas ECG Buddy for Microsoft Windows analyzed the images based on ST-elevation MI, acute coronary syndrome, and myocardial injury biomarkers.</p></sec><sec sec-type="results"><title>Results</title><p>Among 928 ECG recordings (239/928, 25.8% MI-positive), ChatGPT achieved an accuracy of 65.95% (95% CI 62.80&#x2010;69.00), area under the curve (AUC) of 57.34% (95% CI 53.44&#x2010;61.24), sensitivity of 36.40% (95% CI 30.30&#x2010;42.85), and specificity of 76.20% (95% CI 72.84&#x2010;79.33). With Gemini 2.5 Pro, accuracy dropped to 29.63% (95% CI 26.71&#x2010;32.69), AUC to 51.63% (95% CI 50.22&#x2010;53.04), and sensitivity rose to 97.07% (95% CI 94.06&#x2010;98.81), but specificity fell sharply to 6.24% (95% CI 4.55&#x2010;8.31). In contrast, ECG Buddy reached an accuracy of 96.98% (95% CI 95.67&#x2010;97.99), AUC of 98.87% (95% CI 98.30&#x2010;99.43), sensitivity of 96.65% (95% CI 93.51&#x2010;98.54), and specificity of 97.10% (95% CI 95.55&#x2010;98.22). The DeLong test confirmed that ECG Buddy significantly outperformed ChatGPT (all <italic>P</italic>&#x003C;.001). 
In a qualitative error analysis of LLMs&#x2019; diagnostic explanations, GPT-4o produced fully accurate explanations in only 5% of cases (2/40), was partially accurate in 38% (15/40), and completely inaccurate in 58% (23/40). By contrast, Gemini 2.5 Pro yielded fully accurate explanations in 32% of cases (12/37), was partially accurate in 14% (5/37), and completely inaccurate in 54% (20/37).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>LLMs, such as ChatGPT and Gemini, underperform relative to specialized tools such as ECG Buddy in ECG image&#x2013;based MI diagnosis. Further training may improve LLMs; however, domain-specific AI remains essential for clinical accuracy. The high performance of ECG Buddy underscores the importance of specialized models for achieving reliable and robust diagnostic outcomes.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>myocardial infarction</kwd><kwd>ECG</kwd><kwd>LLMs</kwd><kwd>large language models</kwd><kwd>electrocardiogram</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Electrocardiogram (ECG) interpretation is a fundamental skill in cardiovascular medicine, playing a crucial role in diagnosing conditions such as ST-elevation myocardial infarction (STEMI), arrhythmias, and electrolyte imbalances [<xref ref-type="bibr" rid="ref1">1</xref>]. 
Accurate and timely ECG analysis is critical in clinical decision-making, particularly in emergency settings where rapid interventions impact patient outcomes.</p><p>With advancements in artificial intelligence (AI), researchers have explored using various deep learning techniques, including convolutional neural networks and transformer-based models, to automate ECG interpretation by extracting clinically relevant features from ECG signal or image data [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>Recently, multimodal large language models (LLMs) trained on textual and imaging data have gained attention in the medical field [<xref ref-type="bibr" rid="ref6">6</xref>]. These models have demonstrated the ability to generate diagnostic reports, highlighting their potential for medical image interpretation [<xref ref-type="bibr" rid="ref7">7</xref>]. As LLMs have become increasingly sophisticated, the interest in applying similar multimodal architectures to ECG interpretation has also grown.</p><p>General-purpose LLMs, such as ChatGPT (OpenAI), have recently demonstrated some capabilities in assisting with image interpretation and text-based medical assessments [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Unlike traditional AI models specifically trained for ECG signal processing, these models leverage extensive general knowledge and are now being considered for processing visual medical data, including ECG images. For example, Zaboli et al [<xref ref-type="bibr" rid="ref10">10</xref>] investigated the ECG interpretation ability and outcome prediction of ChatGPT in the emergency department and found moderate agreement with cardiologists, but with notable discrepancies in major adverse cardiac event risk assessment. Zhu et al [<xref ref-type="bibr" rid="ref11">11</xref>] reported that GPT-4 achieved approximately 83% accuracy in multiple-choice ECG diagnostic questions. 
G&#x00FC;nay et al [<xref ref-type="bibr" rid="ref12">12</xref>] compared GPT-4, GPT-4o, and Gemini Advanced (Google DeepMind) against cardiologists and emergency medicine specialists using routine and challenging ECG cases. Although all LLMs underperformed compared to cardiologists, GPT-4o showed relatively better accuracy and moderate agreement, suggesting potential as a supportive tool in clinical settings. Similarly, Avidan et al [<xref ref-type="bibr" rid="ref13">13</xref>] examined the ability of GPT-4o to detect atrial fibrillation in ECGs with confounding factors. Their findings indicated that while the overall accuracy of GPT-4o was comparable to that of internists and primary care physicians, it fell short of cardiologists&#x2019; performance, particularly in challenging scenarios. In contrast, G&#x00FC;nay et al [<xref ref-type="bibr" rid="ref14">14</xref>] reported that GPT-4 outperformed emergency medicine specialists in interpreting everyday ECG cases and performed on par with cardiologists when facing more complex ECG challenges. However, a key limitation of their study is that ECG descriptions rather than actual ECG images were evaluated by GPT-4, potentially limiting its applicability in clinical settings.</p><p>Collectively, these studies highlight that although LLM-based approaches in ECG interpretation hold promise, their reliability in complex cases remains limited. Moreover, it remains unclear whether these models truly analyze waveform patterns or simply rely on text-based cues, such as machine-readable annotations. This raises concerns about the reproducibility of the models&#x2019; interpretations when presented with raw ECG images alone.</p><p>To date, no study has systematically compared the performance of LLMs against specialized ECG diagnostic AI tools. This comparison is becoming increasingly relevant, as general-purpose LLMs are not specifically designed for cardiovascular medicine. 
However, speculation about their potential applications in ECG interpretation is already widespread. Thus, a comparative evaluation with dedicated ECG AI software is necessary to determine the feasibility of LLM-based ECG interpretation in clinical practice.</p><p>Recent studies comparing ChatGPT-4o with Gemini on ECG interpretation tasks identify both models as suitable reference LLMs, so we included them in our evaluation [<xref ref-type="bibr" rid="ref12">12</xref>]. We then benchmarked their performance against ECG Buddy (ARPI Inc.), a commercially available, domain-specific AI tool for ECG analysis.</p><p>ECG Buddy is approved by the Korean Ministry of Food and Drug Safety, South Korea&#x2019;s regulatory agency for medical device oversight, responsible for thorough examinations and continuous supervision, and is currently in routine clinical use at multiple hospitals, including tertiary care centers. ECG Buddy has been validated in multiple studies, demonstrating superior diagnostic accuracy to clinical experts in detecting conditions such as myocardial infarction (MI), hyperkalemia, and right ventricular (RV) dysfunction [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>This study aimed to evaluate the diagnostic performance of ChatGPT and Gemini relative to a dedicated ECG AI (ECG Buddy) in analyzing ECG images for MI detection. MI interpretation is one of the most essential aspects of ECG analysis. Through this comparative study, we aimed to determine whether LLMs could currently be used for ECG interpretation in clinical practice.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Data Preparation</title><p>In this retrospective study, we evaluated the performance of ChatGPT, Gemini, and ECG Buddy in classifying MI from ECG images. A publicly available 12-lead ECG image dataset compiled by the Ch. 
Pervaiz Elahi Institute of Cardiology in Multan, Pakistan, was used [<xref ref-type="bibr" rid="ref19">19</xref>]. The dataset includes ECG images categorized into the following 4 groups: patients with MI (239 images), patients with abnormal heartbeats (233 images), patients with a history of MI (172 images), and healthy controls (284 images). This publicly available, fully deidentified ECG image dataset was chosen to enable reproducible benchmarking without privacy constraints and is frequently referenced in prior studies. The dataset does not provide additional patient information beyond these labels. It lacks metadata such as infarct territory or cardiac biomarker data needed to differentiate STEMI from non-STEMI (NSTEMI). Therefore, further analyses by infarct location or NSTEMI status were not possible.</p><p>This study was designed and reported in accordance with the TRIPOD-LLM (Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis Or Diagnosis specifically tailored for LLM) guidelines, a comprehensive reporting framework for studies involving LLMs in health care, to ensure that every step, from data processing and image-to-text conversion to AI querying and performance evaluation, was transparently and reproducibly documented [<xref ref-type="bibr" rid="ref20">20</xref>]. To ensure consistency in data processing, extraneous areas of the images, including any supplementary text not related to patient information or diagnosis, were cropped, retaining only the waveform regions. No raster-to-signal conversion was applied, and all analyses were performed directly on the image data. For classification, only the images labeled as &#x201C;patients with myocardial infarction&#x201D; were designated as MI-positive, representing active MI cases. 
The remaining 689 images, comprising abnormal heartbeats, history of MI, and healthy cases, were classified as MI-negative.</p></sec><sec id="s2-2"><title>AI Query and Output (ChatGPT and Gemini)</title><p>The identical workflow was applied to 2 multimodal LLMs&#x2014;GPT-4o (OpenAI, version November 20, 2024) and Gemini 2.5 Pro (Google, May 2025 release). To assess the ability of LLMs to classify MI from ECG images, we designed a structured prompt aimed at systematically capturing the model&#x2019;s diagnostic rationale and confidence in detecting MI. Before querying, ECG images were converted into base64 format, a lossless binary-to-text format required by both application programming interfaces (APIs) and one that preserves every pixel and ensures no loss of ECG signal fidelity.</p><p>Each LLM received the base64-encoded images and was prompted explicitly to analyze them, determine the likelihood of MI, and select from 5 predefined response categories&#x2014;definitely not, unlikely, possible, probable, and definite&#x2014;representing increasing diagnostic certainty. Both LLMs were queried with the prompt shown in <xref ref-type="other" rid="box1">Textbox 1</xref>, which presents the ChatGPT-4o version; the Gemini 2.5 Pro prompt was identical, differing only in that the model name &#x201C;gpt-4o&#x201D; was replaced with the corresponding Gemini 2.5 Pro model name.</p><boxed-text id="box1"><title> <bold>Textbox 1.</bold> GPT-4o prompt design.</title><p>SYSTEM_TEXT = (</p><p>&#x201C;Analyze the provided ECG image in base64 format and assess the likelihood of Myocardial Infarction (MI).&#x201D;</p><p>&#x201C;Based on your analysis, select the most appropriate confidence level regarding the presence of MI and provide a detailed explanation for your choice.&#x201D;</p><p>&#x201C;Specify which leads exhibit abnormalities and describe the observed changes (eg, ST-segment elevation, T-wave inversion, pathological Q waves).&#x201D;</p><p>&#x201C;If no abnormalities are present, explain why the ECG appears normal. 
If the findings are ambiguous, discuss potential differential diagnoses.&#x201D;</p><p>)</p><p>CONFIDENCE_TEXT = (</p><p>&#x201C;Confidence Levels for MI Diagnosis:\n&#x201D;</p><p>&#x201C;Definitely Not &#x2013; No ECG evidence or extremely low probability of MI; clearly normal waveform patterns.\n&#x201D;</p><p>&#x201C;Unlikely &#x2013; Minimal or questionable evidence making MI improbable; non-specific changes.\n&#x201D;</p><p>&#x201C;Possible &#x2013; Moderate suspicion with mixed findings; abnormalities with alternative explanations possible.\n&#x201D;</p><p>&#x201C;Probable &#x2013; Strongly suggestive findings (ST-segment elevation, pathological Q waves, T-wave inversion).\n&#x201D;</p><p>&#x201C;Definite &#x2013; Conclusive evidence: clear ST elevation in contiguous leads, significant Q waves, reciprocal changes.\n\n&#x201D;</p><p>&#x201C;Respond in JSON with keys: confidence, explanation, abnormal_leads.&#x201D;</p><p>)</p><p>data = []</p><p>for idx, (file_name, base64_image) in enumerate(tqdm(image_results, desc=&#x201C;ECG&#x201D;)):</p><p>try:</p><p>messages = [</p><p>{&#x201C;role&#x201D;: &#x201C;system&#x201D;, &#x201C;content&#x201D;: SYSTEM_TEXT},</p><p>{</p><p>&#x201C;role&#x201D;: &#x201C;user&#x201D;,</p><p>&#x201C;content&#x201D;: [</p><p>{</p><p>&#x201C;type&#x201D;: &#x201C;image_url&#x201D;,</p><p>&#x201C;image_url&#x201D;: {</p><p>&#x201C;url&#x201D;: f"data:image/png;base64,{base64_image}",</p><p>&#x201C;detail&#x201D;: &#x201C;high&#x201D;</p><p>},</p><p>},</p><p>{</p><p>&#x201C;type&#x201D;: &#x201C;text&#x201D;,</p><p>&#x201C;text&#x201D;: CONFIDENCE_TEXT,</p><p>},</p><p>],</p><p>},</p><p>]</p><p>response = client.chat.completions.create(</p><p>model=&#x201C;gpt-4o&#x201D;,</p><p>messages=messages,</p><p>temperature=0.0,</p><p>max_tokens=300,</p><p>)</p><p>chatgpt_result = response.choices[0].message.content.strip()</p><p>except Exception as e:</p><p>print(f"[Error] {file_name}: {e}")</p><p>chatgpt_result = 
None</p><p>data.append({&#x201C;File Name&#x201D;: file_name, &#x201C;ChatGPT Result&#x201D;: chatgpt_result})</p></boxed-text><p>To define a positive MI diagnosis based on this likelihood measure, the Youden index was applied to determine an optimal cutoff. Specifically, ChatGPT was instructed to identify and specify which ECG leads exhibited abnormalities, describe the changes observed (such as ST-segment elevation, T-wave inversion, or pathological Q waves), and provide detailed reasoning supporting its diagnostic conclusion. In cases where no abnormalities were noted or the ECG findings were ambiguous, the model was prompted to discuss potential alternative diagnoses or clearly explain why the ECG appeared normal. All queries were conducted using GPT-4o via the ChatGPT API. An example of a typical ChatGPT response is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>. All inferences were conducted as single-turn prompts, since each case consisted only of a deidentified ECG image with no ancillary clinical or serial ECG data that could support further interaction, and our study aimed to evaluate the models&#x2019; final diagnostic performance.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Example electrocardiogram images and ChatGPT model output for myocardial infarction detection. ECG: electrocardiogram.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e75910_fig01.png"/></fig></sec><sec id="s2-3"><title>Qualitative Assessment of Diagnostic Explanations&#x2013;ChatGPT and Gemini</title><p>In addition, a qualitative assessment of the diagnostic explanations of LLMs was performed to further evaluate the ability to accurately interpret ECG images. For GPT-4o, 40 cases were randomly selected: 10 true-positive, 10 true-negative, 10 false-positive, and 10 false-negative. 
For Gemini 2.5 Pro, the same procedure yielded 37 cases (10 true-positive, 10 true-negative, 10 false-positive, and 7 false-negative) owing to the model&#x2019;s smaller false-negative pool. Two board-certified clinicians, an emergency medicine specialist and a cardiologist, each with more than 10 years of ECG interpretation experience, independently reviewed the diagnostic explanations to assess whether the model provided clinically appropriate rationales for its classification. They then reconciled any discrepancies by consensus, and the final consensus ratings, together with per-reviewer tallies, are reported in the Results section.</p></sec><sec id="s2-4"><title>AI-Powered Image Analysis (ECG Buddy)</title><p>ECG Buddy is a deep learning&#x2013;based ECG analysis platform designed for 12-lead ECG image interpretation. The software is available for both smartphones and Microsoft Windows&#x2013;based desktop personal computers. In this study, ECG Buddy for Microsoft Windows [<xref ref-type="bibr" rid="ref21">21</xref>] was used to perform bulk analysis of ECG data (<xref ref-type="fig" rid="figure2">Figure 2</xref>). It is approved by the Korean Ministry of Food and Drug Safety, is freely available for download in Korean app stores, and can analyze 12-lead ECGs from pictures taken of ECG outputs to produce 10 digital biomarkers. The software automatically detects the ECG image displayed on the desktop and provides the analysis results within 10&#x2010;15 seconds. <xref ref-type="fig" rid="figure2">Figure 2A</xref> shows the operating screen of ECG Buddy for Microsoft Windows, while <xref ref-type="fig" rid="figure2">Figure 2B</xref> shows the ECG image analysis output. 
ECG Buddy generates 10 digital biomarkers that assess a range of cardiac conditions, including STEMI, acute coronary syndrome (ACS), myocardial injury (MyoInj), critical condition, pulmonary edema, pericardial effusion, left ventricular dysfunction, RV dysfunction, pulmonary hypertension, and severe hyperkalemia. This study analyzed only the STEMI, ACS, and MyoInj biomarkers owing to their direct relevance to MI classification.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The operating screen of ECG Buddy for Microsoft Windows. (A) ECG input image and (B) ECG image analysis result.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e75910_fig02.png"/></fig></sec><sec id="s2-5"><title>Statistical Analysis</title><p>Model performance was evaluated using accuracy, sensitivity, specificity, positive predictive value (PPV), and negative predictive value (NPV). The Youden index was used to determine optimal classification thresholds. In addition, model performance was evaluated using the area under the receiver operating characteristic curve (AUROC), and the AUC values were compared using the DeLong method, with statistical significance set at <italic>P</italic>&#x003C;.05. All analyses were conducted using R software version 4.1.0 (RStudio) [<xref ref-type="bibr" rid="ref22">22</xref>], with ChatGPT API responses obtained using Python (Python Software Foundation).</p></sec><sec id="s2-6"><title>Ethical Considerations</title><p>The study design was approved by the Institutional Review Board of Seoul National University Bundang Hospital (IRBX-2504-966-902). 
Given the public availability of the dataset, the Institutional Review Board of Seoul National University Bundang Hospital granted a waiver for the requirement of informed consent.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Performance of ChatGPT, Gemini, and ECG Buddy</title><p>In total, 928 ECG recordings (239/928, 25.8% MI-positive cases) were analyzed, and all were successfully processed by all 3 AI models. ChatGPT demonstrated limited discriminative ability in MI detection, achieving an AUC of 57.34% (95% CI 53.44&#x2010;61.24). Using the Youden index, the optimal cutoff was determined as the category &#x201C;definite.&#x201D; At this cutoff, the model&#x2019;s sensitivity, specificity, PPV, and NPV were 36.40% (95% CI 30.30&#x2010;42.85), 76.20% (95% CI 72.84&#x2010;79.33), 34.66% (95% CI 28.79&#x2010;40.90), and 77.55% (95% CI 74.21&#x2010;80.64), respectively (<xref ref-type="fig" rid="figure3">Figure 3</xref>A and <xref ref-type="table" rid="table1">Table 1</xref>). Gemini 2.5 Pro showed even weaker overall discrimination, with an AUC of 51.63% (95% CI 50.22&#x2010;53.04). Applying the same Youden index procedure, the optimal cutoff corresponded to the &#x201C;definite&#x201D; category. At this threshold, sensitivity rose to 97.07% (95% CI 94.06&#x2010;98.81) but at the expense of specificity, which fell to 6.24% (95% CI 4.55&#x2010;8.31); the resulting PPV and NPV were 26.42% (95% CI 23.53&#x2010;29.47) and 86.00% (95% CI 73.26&#x2010;94.18), respectively (<xref ref-type="fig" rid="figure3">Figure 3</xref>A and <xref ref-type="table" rid="table1">Table 1</xref>).</p><p>The dedicated ECG AI software ECG Buddy exhibited highly accurate MI classification across the STEMI, ACS, and MyoInj markers. 
The AUC for detecting MI-positive cases for the STEMI biomarker was 98.87% (95% CI 98.30&#x2010;99.43), for the ACS biomarker was 98.78% (95% CI 98.05&#x2010;99.50), and for the MyoInj biomarker was 98.88% (95% CI 98.24&#x2010;99.51). Using the STEMI biomarker, ECG Buddy achieved the best accuracy of 96.98% (95% CI 95.67&#x2010;97.99), with a sensitivity of 96.65% (95% CI 93.51&#x2010;98.54), specificity of 97.10% (95% CI 95.55&#x2010;98.22), and <italic>F</italic><sub>1</sub>-score of 94.27% (95% CI 91.86&#x2010;96.28). The DeLong test confirmed that ChatGPT (AUC 57.34%) performed significantly worse than ECG Buddy across all biomarkers (all <italic>P</italic>&#x003C;.001; <xref ref-type="fig" rid="figure3">Figure 3</xref>B and <xref ref-type="table" rid="table1">Table 1</xref>).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Diagnostic performance of large language models and ECG Buddy. (A) ChatGPT-4o and Gemini 2.5 Pro and (B) ECG Buddy. ACS: acute coronary syndrome; MYOINJ: myocardial injury; STEMI: ST-elevation myocardial infarction.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="ai_v4i1e75910_fig03.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Diagnostic performance of ChatGPT-4o, Gemini 2.5 Pro, and ECG Buddy. 
Data are expressed as values and 95% CI.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Metrics</td><td align="left" valign="bottom">ChatGPT-4o</td><td align="left" valign="bottom">Gemini 2.5 Pro</td><td align="left" valign="bottom" colspan="3">ECG Buddy</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">ST-elevation myocardial infarction</td><td align="left" valign="top">Acute coronary syndrome</td><td align="left" valign="top">Myocardial injury</td></tr></thead><tbody><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top">Sensitivity (95% CI)</td><td align="left" valign="top">36.40 (30.30&#x2010;42.85)</td><td align="left" valign="top">97.07 (94.06&#x2010;98.81)</td><td align="left" valign="top">96.65 (93.51&#x2010;98.54)</td><td align="left" valign="top">96.65 (93.51&#x2010;98.54)</td><td align="left" valign="top">96.65 (93.51&#x2010;98.54)</td></tr><tr><td align="left" valign="top">Specificity (95% CI)</td><td align="left" valign="top">76.20 (72.84&#x2010;79.33)</td><td align="left" valign="top">6.24 (4.55&#x2010;8.31)</td><td align="left" valign="top">97.10 (95.55&#x2010;98.22)</td><td align="left" valign="top">96.66 (95.03&#x2010;97.87)</td><td align="left" valign="top">97.24 (95.73&#x2010;98.33)</td></tr><tr><td align="left" valign="top">PPV<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (95% CI)</td><td align="left" valign="top">34.66 (28.79&#x2010;40.90)</td><td align="left" valign="top">26.42 (23.53&#x2010;29.47)</td><td align="left" valign="top">92.03 (87.96&#x2010;95.07)</td><td align="left" valign="top">90.94 (86.72&#x2010;94.17)</td><td align="left" valign="top">92.40 (88.39&#x2010;95.36)</td></tr><tr><td align="left" 
valign="top">NPV<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> (95% CI)</td><td align="left" valign="top">77.55 (74.21&#x2010;80.64)</td><td align="left" valign="top">86.00 (73.26&#x2010;94.18)</td><td align="left" valign="top">98.82 (97.68&#x2010;99.49)</td><td align="left" valign="top">98.81 (97.67&#x2010;99.49)</td><td align="left" valign="top">98.82 (97.69&#x2010;99.49)</td></tr><tr><td align="left" valign="top">AUROC<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> (95% CI)</td><td align="left" valign="top">57.34 (53.44&#x2010;61.24)</td><td align="left" valign="top">51.63 (50.22&#x2010;53.04)</td><td align="left" valign="top">98.87 (98.30&#x2010;99.43)</td><td align="left" valign="top">98.78 (98.05&#x2010;99.50)</td><td align="left" valign="top">98.88 (98.24&#x2010;99.51)</td></tr><tr><td align="left" valign="top">Accuracy (95% CI)</td><td align="left" valign="top">65.95 (62.80&#x2010;69.00)</td><td align="left" valign="top">29.63 (26.71&#x2010;32.69)</td><td align="left" valign="top">96.98 (95.67&#x2010;97.99)</td><td align="left" valign="top">96.66 (95.29&#x2010;97.72)</td><td align="left" valign="top">97.09 (95.79&#x2010;98.07)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>PPV: positive predictive value.</p></fn><fn id="table1fn2"><p><sup>b</sup>NPV: negative predictive value.</p></fn><fn id="table1fn3"><p><sup>c</sup>AUROC: area under the receiver operating characteristic curve.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Qualitative Assessment of Diagnostic Explanations of ChatGPT and Gemini</title><p>Two board-certified clinicians independently reviewed every explanation generated by the 2 LLMs. For GPT-4o, reviewer 1 judged 5% (2/40) of explanations fully correct, 40% (16/40) partially correct, and 55% (22/40) completely incorrect. Reviewer 2 judged 5% (2/40) fully correct, 37.5% (15/40) partially correct, and 57.5% (23/40) completely incorrect. 
The reviewers concurred in 87.5% of GPT-4o cases (35/40; weighted &#x03BA;=0.76). After consensus, GPT-4o explanations were fully correct in 5% (2/40), partially correct in 37.5% (15/40), and completely incorrect in 57.5% (23/40; <xref ref-type="table" rid="table2">Table 2</xref>).</p><p>For Gemini 2.5 Pro, the 2 clinicians showed high interrater agreement (91.9%, 34/37; weighted &#x03BA; &#x2248; 0.67) while following the identical review procedure. Consensus ratings were 32.4% (12/37) fully correct, 13.5% (5/37) partially correct, and 54.1% (20/37) completely incorrect. Although Gemini produced a higher proportion of fully correct statements than GPT-4o, more than half of its explanations remained completely inaccurate, underscoring the need for expert oversight. The detailed per-rater counts are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Consensus qualitative assessment of large language model diagnostic explanations. 
Interrater agreement before consensus: &#x03BA;=0.76 (GPT-4o) and 0.67 (Gemini).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Correct, n (%)</td><td align="left" valign="bottom">Partially correct, n (%)</td><td align="left" valign="bottom">Completely incorrect, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">GPT-4o (n=40)</td><td align="left" valign="top">2 (5)</td><td align="left" valign="top">15 (37.5)</td><td align="left" valign="top">23 (57.5)</td></tr><tr><td align="left" valign="top">Gemini 2.5 Pro (n=37)</td><td align="left" valign="top">12 (32.4)</td><td align="left" valign="top">5 (13.5)</td><td align="left" valign="top">20 (54.1)</td></tr></tbody></table></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>This study directly compared ChatGPT and Gemini, general-purpose multimodal LLMs, with ECG Buddy, a specialized deep-learning tool for ECG analysis. While ECG Buddy achieved high accuracy in detecting MI from ECG images, ChatGPT&#x2019;s performance was significantly inferior. Gemini 2.5 Pro, the latest vision-language model from Google, showed even lower overall accuracy than GPT-4o, reinforcing the conclusion that current general-purpose LLMs remain unsuitable for primary ECG interpretation.</p><p>The considerable performance gap between the dedicated ECG Buddy and LLMs underscores a difference in their architecture and training methodologies. LLMs are primarily optimized for textual understanding and general visual recognition tasks and lack the specific training necessary for detailed ECG waveform interpretation. As a result, they may generate contextually plausible yet inaccurate responses, which could lead to potentially dangerous diagnostic errors if relied upon in clinical settings. 
Moreover, the performance of LLMs is highly sensitive to prompt design and the specific model version used, resulting in inconsistent outcomes. Clinical studies have reported that while LLMs perform moderately well in common clinical cases, they deviate significantly from evaluations in critical scenarios [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. Conversely, ECG Buddy is optimized through targeted domain-specific training, resulting in superior performance and reliability compared to LLMs.</p><p>The findings of this study indicate that current general-purpose multimodal LLM architectures may primarily rely on textual annotations or explicit labels rather than on directly analyzing waveform patterns. Due to these structural limitations, current LLMs cannot be considered reliable as primary diagnostic tools for detecting STEMI. Accordingly, LLMs should be confined to a supplementary decision-support role, where they can supply guideline-based contextual information, augment the interpretations generated by specialized tools such as ECG Buddy, and propose clinically appropriate follow-up options. Although recent advances in LLMs have broadened the applications of medical AI, domain-specific models remain indispensable, as the inherent limitations of general-purpose LLMs still compromise clinical utility and reproducibility.</p></sec><sec id="s4-2"><title>Limitations</title><p>This study has some limitations. First, the study was conducted retrospectively using a publicly available ECG dataset, which lacked detailed clinical context or patient demographic information. The absence of comprehensive clinical data may limit the generalizability of these findings to diverse patient populations or different clinical settings. In addition, because the dataset comprised only deidentified ECG images lacking infarct-territory labels and lab results, location-specific and STEMI or NSTEMI analyses were not possible. 
Second, the qualitative assessments were performed independently by 2 board-certified clinicians&#x2014;1 emergency physician and 1 cardiologist. While these assessments provide useful insight, interpretations by multiple clinicians across various specialties might yield different assessments of diagnostic appropriateness or accuracy. To address this limitation and confirm the clinical utility of ECG Buddy, we plan to initiate prospective validation studies that will evaluate its diagnostic accuracy and workflow integration in emergency department settings across hospitals.</p></sec><sec id="s4-3"><title>Comparison With Prior Work</title><p>Our findings are consistent with those of prior research, highlighting that ECG-specialized AI models regularly outperform general-purpose models. Previous studies have demonstrated that deep learning models trained extensively on ECG-specific datasets accurately detect subtle waveform changes indicative of asymptomatic ventricular dysfunction and cirrhosis [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. Similar to these specialized ECG models, ECG Buddy undergoes targeted optimization tailored specifically to ECG images, ensuring consistent predictive performance and stable error margins essential for clinical reliability. Notably, users only need to provide an ECG image or screenshot; both the smartphone and desktop versions feature an intuitive interface that requires no extra training or expertise. In addition to MI, ECG Buddy demonstrates robust diagnostic capabilities across diverse cardiac conditions, including STEMI, hyperkalemia, and RV dysfunction, validating its efficacy in various clinical settings [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. 
Moreover, it has outperformed human experts in diagnosing STEMI and hyperkalemia.</p></sec><sec id="s4-4"><title>Conclusions</title><p>To our knowledge, this study provides the first direct comparative assessment between ChatGPT, Gemini, and ECG Buddy for detecting MI from ECG images. Our findings reveal that, despite the accelerating use of LLM-based AI, current LLMs do not meet the clinical performance and accuracy requirements for ECG interpretation. Herein, the ability of a general-purpose multimodal LLM (ChatGPT and Gemini) to detect ECG abnormalities fell short of that achieved by board-certified emergency physicians, rendering it insufficient for use in interpreting critical ECG readings in clinical practice. In contrast, the specialized ECG AI tool (ECG Buddy), which has been trained and validated, achieved high diagnostic accuracy and reproducibility, suggesting its utility in clinical settings. These results, consistent with those of several other studies, underscore the superiority of medical domain-specific AI programs over general-purpose LLMs for ECG analysis and interpretation and emphasize the importance of specialized models in the field of medical AI development.</p></sec></sec></body><back><ack><p>This research was partly supported by a grant from the Technological Innovation Research &#x0026; Development Program (SCALEUP TIPS), funded by the Ministry of SMEs and Startups (grant number: RS-2024&#x2010;00415492), and the Medical AI Clinic Program through the National IT Industry Promotion Agency, funded by the Ministry of Science Information Communication and Technology (grant number H0904-24-1002).</p></ack><notes><sec><title>Data Availability</title><p>All data generated or analyzed during this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: KL, DS, JK, YC</p><p>Data curation: HL, SY</p><p>Formal analysis: SY, HL</p><p>Funding acquisition: JK, 
YC</p><p>Investigation: SY, HL</p><p>Methodology: SY, HL</p><p>Project administration: KL, DS</p><p>Resources: JK, YC</p><p>Supervision: KL, DS</p><p>Validation: HL</p><p>Visualization: HL</p><p>Writing &#x2013; original draft: HL, JK</p><p>Writing &#x2013; review &#x0026; editing: KL, DS, SY</p></fn><fn fn-type="conflict"><p>JK developed the algorithm. He also founded a start-up company, ARPI Inc., where he serves as the CEO. YC and DS work for the company as research directors. HL works for the company as a data scientist.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ACS</term><def><p>acute coronary syndrome</p></def></def-item><def-item><term id="abb2">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb3">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb4">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb5">ECG</term><def><p>electrocardiogram</p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb7">MI</term><def><p>myocardial infarction</p></def></def-item><def-item><term id="abb8">MyoInj</term><def><p>myocardial injury</p></def></def-item><def-item><term id="abb9">NPV</term><def><p>negative predictive value</p></def></def-item><def-item><term id="abb10">NSTEMI</term><def><p>non&#x2013;ST-elevation myocardial infarction</p></def></def-item><def-item><term id="abb11">PPV</term><def><p>positive predictive value</p></def></def-item><def-item><term id="abb12">RV</term><def><p>right ventricular</p></def></def-item><def-item><term id="abb13">STEMI</term><def><p>ST-elevation myocardial infarction</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Zimetbaum</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Josephson</surname><given-names>ME</given-names> </name></person-group><article-title>Use of the electrocardiogram in acute myocardial infarction</article-title><source>N Engl J Med</source><year>2003</year><month>03</month><day>6</day><volume>348</volume><issue>10</issue><fpage>933</fpage><lpage>940</lpage><pub-id pub-id-type="doi">10.1056/NEJMra022700</pub-id><pub-id pub-id-type="medline">12621138</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hannun</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Rajpurkar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Haghpanahi</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Cardiologist-level arrhythmia detection and classification in ambulatory electrocardiograms using a deep neural network</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>65</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0268-3</pub-id><pub-id pub-id-type="medline">30617320</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Attia</surname><given-names>ZI</given-names> </name><name name-style="western"><surname>Kapa</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lopez-Jimenez</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Screening for cardiac contractile dysfunction using an artificial intelligence-enabled electrocardiogram</article-title><source>Nat 
Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>70</fpage><lpage>74</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0240-2</pub-id><pub-id pub-id-type="medline">30617318</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yin</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Automatic multilabel electrocardiogram diagnosis of heart rhythm or conduction abnormalities with deep learning: a cohort study</article-title><source>Lancet Digit Health</source><year>2020</year><month>07</month><volume>2</volume><issue>7</issue><fpage>e348</fpage><lpage>e357</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(20)30107-2</pub-id><pub-id pub-id-type="medline">33328094</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vaid</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sawant</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A foundational vision transformer improves diagnostic performance for electrocardiograms</article-title><source>NPJ Digit Med</source><year>2023</year><month>06</month><day>6</day><volume>6</volume><issue>1</issue><fpage>108</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00840-9</pub-id><pub-id pub-id-type="medline">37280346</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Mesk&#x00F3;</surname><given-names>B</given-names> </name></person-group><article-title>The impact of multimodal large language models on health care&#x2019;s future</article-title><source>J Med Internet Res</source><year>2023</year><month>11</month><day>2</day><volume>25</volume><issue>25</issue><fpage>e52865</fpage><pub-id pub-id-type="doi">10.2196/52865</pub-id><pub-id pub-id-type="medline">37917126</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meddeb</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ebert</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bressem</surname><given-names>KK</given-names> </name><etal/></person-group><article-title>Evaluating local open-source large language models for data extraction from unstructured reports on mechanical thrombectomy in patients with ischemic stroke</article-title><source>J Neurointerv Surg</source><year>2025</year><month>08</month><day>13</day><volume>17</volume><issue>9</issue><fpage>986</fpage><lpage>991</lpage><pub-id pub-id-type="doi">10.1136/jnis-2024-022078</pub-id><pub-id pub-id-type="medline">39095085</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><source>ChatGPT</source><year>2023</year><access-date>2025-08-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/blog/chatgpt/">https://openai.com/blog/chatgpt/</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><source>GPT-4o c2024</source><year>2024</year><access-date>2025-08-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/hello-gpt-4o/">https://openai.com/index/hello-gpt-4o/</ext-link></comment></nlm-citation></ref><ref 
id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zaboli</surname><given-names>A</given-names> </name><name name-style="western"><surname>Brigo</surname><given-names>F</given-names> </name><name name-style="western"><surname>Ziller</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Exploring ChatGPT&#x2019;s potential in ECG interpretation and outcome prediction in emergency department</article-title><source>Am J Emerg Med</source><year>2025</year><month>02</month><volume>88</volume><issue>88</issue><fpage>7</fpage><lpage>11</lpage><pub-id pub-id-type="doi">10.1016/j.ajem.2024.11.023</pub-id><pub-id pub-id-type="medline">39566376</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Multimodal ChatGPT-4V for electrocardiogram interpretation: promise and limitations</article-title><source>J Med Internet Res</source><year>2024</year><month>06</month><day>26</day><volume>26</volume><issue>26</issue><fpage>e54607</fpage><pub-id pub-id-type="doi">10.2196/54607</pub-id><pub-id pub-id-type="medline">38764297</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>G&#x00FC;nay</surname><given-names>S</given-names> </name><name name-style="western"><surname>&#x00D6;zt&#x00FC;rk</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yi&#x011F;it</surname><given-names>Y</given-names> </name></person-group><article-title>The accuracy of 
Gemini, GPT-4, and GPT-4o in ECG analysis: a comparison with cardiologists and emergency medicine specialists</article-title><source>Am J Emerg Med</source><year>2024</year><month>10</month><volume>84</volume><issue>84</issue><fpage>68</fpage><lpage>73</lpage><pub-id pub-id-type="doi">10.1016/j.ajem.2024.07.043</pub-id><pub-id pub-id-type="medline">39096711</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Avidan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tabachnikov</surname><given-names>V</given-names> </name><name name-style="western"><surname>Court</surname><given-names>OB</given-names> </name><name name-style="western"><surname>Khoury</surname><given-names>R</given-names> </name><name name-style="western"><surname>Aker</surname><given-names>A</given-names> </name></person-group><article-title>In the face of confounders: atrial fibrillation detection - practitioners vs. 
ChatGPT</article-title><source>J Electrocardiol</source><year>2025</year><volume>88</volume><issue>88</issue><fpage>153851</fpage><pub-id pub-id-type="doi">10.1016/j.jelectrocard.2024.153851</pub-id><pub-id pub-id-type="medline">39667153</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>G&#x00FC;nay</surname><given-names>S</given-names> </name><name name-style="western"><surname>&#x00D6;zt&#x00FC;rk</surname><given-names>A</given-names> </name><name name-style="western"><surname>&#x00D6;zerol</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yi&#x011F;it</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Erenler</surname><given-names>AK</given-names> </name></person-group><article-title>Comparison of emergency medicine specialist, cardiologist, and chat-GPT in electrocardiography assessment</article-title><source>Am J Emerg Med</source><year>2024</year><month>06</month><volume>80</volume><issue>80</issue><fpage>51</fpage><lpage>60</lpage><pub-id pub-id-type="doi">10.1016/j.ajem.2024.03.017</pub-id><pub-id pub-id-type="medline">38507847</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hwang</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A retrospective clinical evaluation of an artificial intelligence screening method for early detection of STEMI in the emergency department</article-title><source>J Korean Med Sci</source><year>2022</year><month>03</month><day>14</day><volume>37</volume><issue>10</issue><fpage>e81</fpage><pub-id 
pub-id-type="doi">10.3346/jkms.2022.37.e81</pub-id><pub-id pub-id-type="medline">35289140</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>D</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Hyperkalemia detection in emergency departments using initial ECGs: a smartphone AI ECG analyzer vs. board-certified physicians</article-title><source>J Korean Med Sci</source><year>2023</year><month>11</month><day>20</day><volume>38</volume><issue>45</issue><fpage>e322</fpage><pub-id pub-id-type="doi">10.3346/jkms.2023.38.e322</pub-id><pub-id pub-id-type="medline">37987103</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Park</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Screening for RV dysfunction using smartphone ECG analysis app: validation study with acute pulmonary embolism patients</article-title><source>J Clin Med</source><year>2024</year><month>08</month><day>14</day><volume>13</volume><issue>16</issue><fpage>4792</fpage><pub-id pub-id-type="doi">10.3390/jcm13164792</pub-id><pub-id pub-id-type="medline">39200934</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choi</surname><given-names>YJ</given-names> </name><name 
name-style="western"><surname>Park</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Ko</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Artificial intelligence versus physicians on interpretation of printed ECG images: Diagnostic performance of ST-elevation myocardial infarction on electrocardiography</article-title><source>Int J Cardiol</source><year>2022</year><month>09</month><day>15</day><volume>363</volume><issue>363</issue><fpage>6</fpage><lpage>10</lpage><pub-id pub-id-type="doi">10.1016/j.ijcard.2022.06.012</pub-id><pub-id pub-id-type="medline">35691440</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Khan</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Hussain</surname><given-names>M</given-names> </name><name name-style="western"><surname>Malik</surname><given-names>MK</given-names> </name></person-group><article-title>ECG images dataset of cardiac patients</article-title><year>2021</year><month>02</month><publisher-name>Mendeley Data (version 2)</publisher-name><pub-id pub-id-type="doi">10.1016/j.dib.2021.106762</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gallifant</surname><given-names>J</given-names> </name><name name-style="western"><surname>Afshar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ameen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The TRIPOD-LLM reporting guideline for studies using large language models</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>60</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03425-5</pub-id><pub-id 
pub-id-type="medline">39779929</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>ECGBuddy</article-title><source>Microsoft Store</source><year>2021</year><access-date>2025-08-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://apps.microsoft.com/detail/xpffzhzlvj2tbj?hl=ko-kr&#x0026;gl=KR">https://apps.microsoft.com/detail/xpffzhzlvj2tbj?hl=ko-kr&#x0026;gl=KR</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>R Project for Statistical Computing</article-title><source>R Project</source><access-date>2025-08-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.R-project.org/">https://www.R-project.org/</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vaid</surname><given-names>A</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Badgeley</surname><given-names>MA</given-names> </name><etal/></person-group><article-title>Using deep-learning algorithms to simultaneously identify right and left ventricular dysfunction from the electrocardiogram</article-title><source>JACC Cardiovasc Imaging</source><year>2022</year><month>03</month><volume>15</volume><issue>3</issue><fpage>395</fpage><lpage>410</lpage><pub-id pub-id-type="doi">10.1016/j.jcmg.2021.08.004</pub-id><pub-id pub-id-type="medline">34656465</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ahn</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Attia</surname><given-names>ZI</given-names> </name><name 
name-style="western"><surname>Rattan</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Development of the AI-cirrhosis-ECG score: an electrocardiogram-based deep learning model in cirrhosis</article-title><source>Am J Gastroenterol</source><year>2022</year><month>03</month><day>1</day><volume>117</volume><issue>3</issue><fpage>424</fpage><lpage>432</lpage><pub-id pub-id-type="doi">10.14309/ajg.0000000000001617</pub-id><pub-id pub-id-type="medline">35029163</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Qualitative evaluation of large language model diagnostic explanations.</p><media xlink:href="ai_v4i1e75910_app1.docx" xlink:title="DOCX File, 11 KB"/></supplementary-material></app-group></back></article>